1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc -march=amdgcn -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS %s 3; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX89,GFX8 %s 4; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX89,GFX9 %s 5; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX1064 %s 6; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX1032 %s 7 8declare i32 @llvm.amdgcn.workitem.id.x() 9 10; Show what the atomic optimization pass will do for global pointers. 
11 12define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out, i32 addrspace(1)* %inout) { 13; GFX7LESS-LABEL: add_i32_constant: 14; GFX7LESS: ; %bb.0: ; %entry 15; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec 16; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 17; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 18; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s7, v0 19; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 20; GFX7LESS-NEXT: ; implicit-def: $vgpr1 21; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc 22; GFX7LESS-NEXT: s_cbranch_execz .LBB0_2 23; GFX7LESS-NEXT: ; %bb.1: 24; GFX7LESS-NEXT: s_mov_b32 s11, 0xf000 25; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 26; GFX7LESS-NEXT: s_mul_i32 s6, s6, 5 27; GFX7LESS-NEXT: s_mov_b32 s10, -1 28; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 29; GFX7LESS-NEXT: s_mov_b32 s8, s2 30; GFX7LESS-NEXT: s_mov_b32 s9, s3 31; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6 32; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 33; GFX7LESS-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc 34; GFX7LESS-NEXT: s_waitcnt vmcnt(0) 35; GFX7LESS-NEXT: buffer_wbinvl1 36; GFX7LESS-NEXT: .LBB0_2: 37; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] 38; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 39; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 40; GFX7LESS-NEXT: s_mov_b32 s2, -1 41; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1 42; GFX7LESS-NEXT: v_mad_u32_u24 v0, v0, 5, s4 43; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 44; GFX7LESS-NEXT: s_endpgm 45; 46; GFX89-LABEL: add_i32_constant: 47; GFX89: ; %bb.0: ; %entry 48; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 49; GFX89-NEXT: s_mov_b64 s[6:7], exec 50; GFX89-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 51; GFX89-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 52; GFX89-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 53; GFX89-NEXT: ; implicit-def: $vgpr1 54; GFX89-NEXT: s_and_saveexec_b64 s[4:5], vcc 55; GFX89-NEXT: s_cbranch_execz .LBB0_2 56; GFX89-NEXT: ; %bb.1: 57; GFX89-NEXT: s_waitcnt lgkmcnt(0) 58; GFX89-NEXT: s_mov_b32 s8, s2 59; GFX89-NEXT: s_bcnt1_i32_b64 s2, 
s[6:7] 60; GFX89-NEXT: s_mul_i32 s2, s2, 5 61; GFX89-NEXT: s_mov_b32 s11, 0xf000 62; GFX89-NEXT: s_mov_b32 s10, -1 63; GFX89-NEXT: s_mov_b32 s9, s3 64; GFX89-NEXT: v_mov_b32_e32 v1, s2 65; GFX89-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 66; GFX89-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc 67; GFX89-NEXT: s_waitcnt vmcnt(0) 68; GFX89-NEXT: buffer_wbinvl1_vol 69; GFX89-NEXT: .LBB0_2: 70; GFX89-NEXT: s_or_b64 exec, exec, s[4:5] 71; GFX89-NEXT: v_readfirstlane_b32 s4, v1 72; GFX89-NEXT: s_waitcnt lgkmcnt(0) 73; GFX89-NEXT: s_mov_b32 s3, 0xf000 74; GFX89-NEXT: s_mov_b32 s2, -1 75; GFX89-NEXT: v_mad_u32_u24 v0, v0, 5, s4 76; GFX89-NEXT: buffer_store_dword v0, off, s[0:3], 0 77; GFX89-NEXT: s_endpgm 78; 79; GFX1064-LABEL: add_i32_constant: 80; GFX1064: ; %bb.0: ; %entry 81; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 82; GFX1064-NEXT: s_mov_b64 s[6:7], exec 83; GFX1064-NEXT: ; implicit-def: $vgpr1 84; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 85; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 86; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 87; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 88; GFX1064-NEXT: s_cbranch_execz .LBB0_2 89; GFX1064-NEXT: ; %bb.1: 90; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 91; GFX1064-NEXT: s_mov_b32 s11, 0x31016000 92; GFX1064-NEXT: s_mul_i32 s6, s6, 5 93; GFX1064-NEXT: s_mov_b32 s10, -1 94; GFX1064-NEXT: v_mov_b32_e32 v1, s6 95; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 96; GFX1064-NEXT: s_mov_b32 s8, s2 97; GFX1064-NEXT: s_mov_b32 s9, s3 98; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 99; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 100; GFX1064-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc 101; GFX1064-NEXT: s_waitcnt vmcnt(0) 102; GFX1064-NEXT: buffer_gl0_inv 103; GFX1064-NEXT: buffer_gl1_inv 104; GFX1064-NEXT: .LBB0_2: 105; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 106; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 107; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 108; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 109; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 110; 
GFX1064-NEXT: v_mad_u32_u24 v0, v0, 5, s2 111; GFX1064-NEXT: s_mov_b32 s2, -1 112; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 113; GFX1064-NEXT: s_endpgm 114; 115; GFX1032-LABEL: add_i32_constant: 116; GFX1032: ; %bb.0: ; %entry 117; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 118; GFX1032-NEXT: s_mov_b32 s5, exec_lo 119; GFX1032-NEXT: ; implicit-def: $vgpr1 120; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 121; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 122; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo 123; GFX1032-NEXT: s_cbranch_execz .LBB0_2 124; GFX1032-NEXT: ; %bb.1: 125; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s5 126; GFX1032-NEXT: s_mov_b32 s11, 0x31016000 127; GFX1032-NEXT: s_mul_i32 s5, s5, 5 128; GFX1032-NEXT: s_mov_b32 s10, -1 129; GFX1032-NEXT: v_mov_b32_e32 v1, s5 130; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 131; GFX1032-NEXT: s_mov_b32 s8, s2 132; GFX1032-NEXT: s_mov_b32 s9, s3 133; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 134; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 135; GFX1032-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc 136; GFX1032-NEXT: s_waitcnt vmcnt(0) 137; GFX1032-NEXT: buffer_gl0_inv 138; GFX1032-NEXT: buffer_gl1_inv 139; GFX1032-NEXT: .LBB0_2: 140; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 141; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 142; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 143; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 144; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 145; GFX1032-NEXT: v_mad_u32_u24 v0, v0, 5, s2 146; GFX1032-NEXT: s_mov_b32 s2, -1 147; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 148; GFX1032-NEXT: s_endpgm 149entry: 150 %old = atomicrmw add i32 addrspace(1)* %inout, i32 5 acq_rel 151 store i32 %old, i32 addrspace(1)* %out 152 ret void 153} 154 155define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 addrspace(1)* %inout, i32 %additive) { 156; GFX7LESS-LABEL: add_i32_uniform: 157; GFX7LESS: ; %bb.0: ; %entry 158; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec 159; GFX7LESS-NEXT: s_load_dwordx4 s[4:7], 
s[0:1], 0x9 160; GFX7LESS-NEXT: s_load_dword s8, s[0:1], 0xd 161; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 162; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 163; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 164; GFX7LESS-NEXT: ; implicit-def: $vgpr1 165; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc 166; GFX7LESS-NEXT: s_cbranch_execz .LBB1_2 167; GFX7LESS-NEXT: ; %bb.1: 168; GFX7LESS-NEXT: s_mov_b32 s15, 0xf000 169; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 170; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 171; GFX7LESS-NEXT: s_mul_i32 s2, s8, s2 172; GFX7LESS-NEXT: s_mov_b32 s14, -1 173; GFX7LESS-NEXT: s_mov_b32 s12, s6 174; GFX7LESS-NEXT: s_mov_b32 s13, s7 175; GFX7LESS-NEXT: v_mov_b32_e32 v1, s2 176; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 177; GFX7LESS-NEXT: buffer_atomic_add v1, off, s[12:15], 0 glc 178; GFX7LESS-NEXT: s_waitcnt vmcnt(0) 179; GFX7LESS-NEXT: buffer_wbinvl1 180; GFX7LESS-NEXT: .LBB1_2: 181; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1] 182; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 183; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 184; GFX7LESS-NEXT: s_mov_b32 s6, -1 185; GFX7LESS-NEXT: v_readfirstlane_b32 s0, v1 186; GFX7LESS-NEXT: v_mul_lo_u32 v0, s8, v0 187; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s0, v0 188; GFX7LESS-NEXT: buffer_store_dword v0, off, s[4:7], 0 189; GFX7LESS-NEXT: s_endpgm 190; 191; GFX8-LABEL: add_i32_uniform: 192; GFX8: ; %bb.0: ; %entry 193; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 194; GFX8-NEXT: s_load_dword s8, s[0:1], 0x34 195; GFX8-NEXT: s_mov_b64 s[2:3], exec 196; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 197; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 198; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 199; GFX8-NEXT: ; implicit-def: $vgpr1 200; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc 201; GFX8-NEXT: s_cbranch_execz .LBB1_2 202; GFX8-NEXT: ; %bb.1: 203; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 204; GFX8-NEXT: s_waitcnt lgkmcnt(0) 205; GFX8-NEXT: s_mul_i32 s2, s8, s2 206; GFX8-NEXT: s_mov_b32 s15, 0xf000 207; GFX8-NEXT: s_mov_b32 s14, -1 
208; GFX8-NEXT: s_mov_b32 s12, s6 209; GFX8-NEXT: s_mov_b32 s13, s7 210; GFX8-NEXT: v_mov_b32_e32 v1, s2 211; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 212; GFX8-NEXT: buffer_atomic_add v1, off, s[12:15], 0 glc 213; GFX8-NEXT: s_waitcnt vmcnt(0) 214; GFX8-NEXT: buffer_wbinvl1_vol 215; GFX8-NEXT: .LBB1_2: 216; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] 217; GFX8-NEXT: s_waitcnt lgkmcnt(0) 218; GFX8-NEXT: v_mul_lo_u32 v0, s8, v0 219; GFX8-NEXT: v_readfirstlane_b32 s0, v1 220; GFX8-NEXT: s_mov_b32 s7, 0xf000 221; GFX8-NEXT: s_mov_b32 s6, -1 222; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 223; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0 224; GFX8-NEXT: s_endpgm 225; 226; GFX9-LABEL: add_i32_uniform: 227; GFX9: ; %bb.0: ; %entry 228; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 229; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34 230; GFX9-NEXT: s_mov_b64 s[2:3], exec 231; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 232; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 233; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 234; GFX9-NEXT: ; implicit-def: $vgpr1 235; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc 236; GFX9-NEXT: s_cbranch_execz .LBB1_2 237; GFX9-NEXT: ; %bb.1: 238; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 239; GFX9-NEXT: s_waitcnt lgkmcnt(0) 240; GFX9-NEXT: s_mul_i32 s2, s8, s2 241; GFX9-NEXT: s_mov_b32 s15, 0xf000 242; GFX9-NEXT: s_mov_b32 s14, -1 243; GFX9-NEXT: s_mov_b32 s12, s6 244; GFX9-NEXT: s_mov_b32 s13, s7 245; GFX9-NEXT: v_mov_b32_e32 v1, s2 246; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 247; GFX9-NEXT: buffer_atomic_add v1, off, s[12:15], 0 glc 248; GFX9-NEXT: s_waitcnt vmcnt(0) 249; GFX9-NEXT: buffer_wbinvl1_vol 250; GFX9-NEXT: .LBB1_2: 251; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] 252; GFX9-NEXT: s_waitcnt lgkmcnt(0) 253; GFX9-NEXT: v_mul_lo_u32 v0, s8, v0 254; GFX9-NEXT: v_readfirstlane_b32 s0, v1 255; GFX9-NEXT: s_mov_b32 s7, 0xf000 256; GFX9-NEXT: s_mov_b32 s6, -1 257; GFX9-NEXT: v_add_u32_e32 v0, s0, v0 258; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 259; GFX9-NEXT: 
s_endpgm 260; 261; GFX1064-LABEL: add_i32_uniform: 262; GFX1064: ; %bb.0: ; %entry 263; GFX1064-NEXT: s_clause 0x1 264; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 265; GFX1064-NEXT: s_load_dword s8, s[0:1], 0x34 266; GFX1064-NEXT: s_mov_b64 s[2:3], exec 267; GFX1064-NEXT: ; implicit-def: $vgpr1 268; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 269; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 270; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 271; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc 272; GFX1064-NEXT: s_cbranch_execz .LBB1_2 273; GFX1064-NEXT: ; %bb.1: 274; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 275; GFX1064-NEXT: s_mov_b32 s15, 0x31016000 276; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 277; GFX1064-NEXT: s_mul_i32 s2, s8, s2 278; GFX1064-NEXT: s_mov_b32 s14, -1 279; GFX1064-NEXT: v_mov_b32_e32 v1, s2 280; GFX1064-NEXT: s_mov_b32 s12, s6 281; GFX1064-NEXT: s_mov_b32 s13, s7 282; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 283; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 284; GFX1064-NEXT: buffer_atomic_add v1, off, s[12:15], 0 glc 285; GFX1064-NEXT: s_waitcnt vmcnt(0) 286; GFX1064-NEXT: buffer_gl0_inv 287; GFX1064-NEXT: buffer_gl1_inv 288; GFX1064-NEXT: .LBB1_2: 289; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 290; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] 291; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 292; GFX1064-NEXT: v_mul_lo_u32 v0, s8, v0 293; GFX1064-NEXT: v_readfirstlane_b32 s0, v1 294; GFX1064-NEXT: s_mov_b32 s7, 0x31016000 295; GFX1064-NEXT: s_mov_b32 s6, -1 296; GFX1064-NEXT: v_add_nc_u32_e32 v0, s0, v0 297; GFX1064-NEXT: buffer_store_dword v0, off, s[4:7], 0 298; GFX1064-NEXT: s_endpgm 299; 300; GFX1032-LABEL: add_i32_uniform: 301; GFX1032: ; %bb.0: ; %entry 302; GFX1032-NEXT: s_clause 0x1 303; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 304; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x34 305; GFX1032-NEXT: s_mov_b32 s3, exec_lo 306; GFX1032-NEXT: ; implicit-def: $vgpr1 307; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 308; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, 
v0 309; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo 310; GFX1032-NEXT: s_cbranch_execz .LBB1_2 311; GFX1032-NEXT: ; %bb.1: 312; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s3 313; GFX1032-NEXT: s_mov_b32 s11, 0x31016000 314; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 315; GFX1032-NEXT: s_mul_i32 s1, s2, s1 316; GFX1032-NEXT: s_mov_b32 s10, -1 317; GFX1032-NEXT: v_mov_b32_e32 v1, s1 318; GFX1032-NEXT: s_mov_b32 s8, s6 319; GFX1032-NEXT: s_mov_b32 s9, s7 320; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 321; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 322; GFX1032-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc 323; GFX1032-NEXT: s_waitcnt vmcnt(0) 324; GFX1032-NEXT: buffer_gl0_inv 325; GFX1032-NEXT: buffer_gl1_inv 326; GFX1032-NEXT: .LBB1_2: 327; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 328; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 329; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 330; GFX1032-NEXT: v_mul_lo_u32 v0, s2, v0 331; GFX1032-NEXT: v_readfirstlane_b32 s0, v1 332; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 333; GFX1032-NEXT: s_mov_b32 s6, -1 334; GFX1032-NEXT: v_add_nc_u32_e32 v0, s0, v0 335; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0 336; GFX1032-NEXT: s_endpgm 337entry: 338 %old = atomicrmw add i32 addrspace(1)* %inout, i32 %additive acq_rel 339 store i32 %old, i32 addrspace(1)* %out 340 ret void 341} 342 343define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out, i32 addrspace(1)* %inout) { 344; GFX7LESS-LABEL: add_i32_varying: 345; GFX7LESS: ; %bb.0: ; %entry 346; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 347; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 348; GFX7LESS-NEXT: s_mov_b32 s6, -1 349; GFX7LESS-NEXT: s_mov_b32 s10, s6 350; GFX7LESS-NEXT: s_mov_b32 s11, s7 351; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 352; GFX7LESS-NEXT: s_mov_b32 s8, s2 353; GFX7LESS-NEXT: s_mov_b32 s9, s3 354; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 355; GFX7LESS-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc 356; GFX7LESS-NEXT: s_waitcnt vmcnt(0) 357; GFX7LESS-NEXT: buffer_wbinvl1 358; 
GFX7LESS-NEXT: s_mov_b32 s4, s0 359; GFX7LESS-NEXT: s_mov_b32 s5, s1 360; GFX7LESS-NEXT: buffer_store_dword v0, off, s[4:7], 0 361; GFX7LESS-NEXT: s_endpgm 362; 363; GFX8-LABEL: add_i32_varying: 364; GFX8: ; %bb.0: ; %entry 365; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 366; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 367; GFX8-NEXT: v_mov_b32_e32 v1, 0 368; GFX8-NEXT: s_mov_b64 exec, s[4:5] 369; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 370; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 371; GFX8-NEXT: v_mov_b32_e32 v2, v0 372; GFX8-NEXT: s_not_b64 exec, exec 373; GFX8-NEXT: v_mov_b32_e32 v2, 0 374; GFX8-NEXT: s_not_b64 exec, exec 375; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 376; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 377; GFX8-NEXT: s_nop 1 378; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 379; GFX8-NEXT: s_nop 1 380; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 381; GFX8-NEXT: s_nop 1 382; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 383; GFX8-NEXT: s_nop 1 384; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 385; GFX8-NEXT: s_nop 1 386; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 387; GFX8-NEXT: v_readlane_b32 s6, v2, 63 388; GFX8-NEXT: s_nop 0 389; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 390; GFX8-NEXT: s_mov_b64 exec, s[4:5] 391; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 392; GFX8-NEXT: ; implicit-def: $vgpr0 393; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 394; GFX8-NEXT: s_cbranch_execz .LBB2_2 395; GFX8-NEXT: ; %bb.1: 396; GFX8-NEXT: s_mov_b32 s11, 0xf000 397; GFX8-NEXT: s_mov_b32 s10, -1 398; GFX8-NEXT: s_waitcnt lgkmcnt(0) 399; GFX8-NEXT: s_mov_b32 s8, s2 400; GFX8-NEXT: s_mov_b32 s9, s3 401; GFX8-NEXT: v_mov_b32_e32 v0, s6 402; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 
403; GFX8-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc 404; GFX8-NEXT: s_waitcnt vmcnt(0) 405; GFX8-NEXT: buffer_wbinvl1_vol 406; GFX8-NEXT: .LBB2_2: 407; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 408; GFX8-NEXT: v_readfirstlane_b32 s4, v0 409; GFX8-NEXT: v_mov_b32_e32 v0, v1 410; GFX8-NEXT: s_waitcnt lgkmcnt(0) 411; GFX8-NEXT: s_mov_b32 s3, 0xf000 412; GFX8-NEXT: s_mov_b32 s2, -1 413; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v0 414; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 415; GFX8-NEXT: s_endpgm 416; 417; GFX9-LABEL: add_i32_varying: 418; GFX9: ; %bb.0: ; %entry 419; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 420; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 421; GFX9-NEXT: v_mov_b32_e32 v1, 0 422; GFX9-NEXT: s_mov_b64 exec, s[4:5] 423; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 424; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 425; GFX9-NEXT: v_mov_b32_e32 v2, v0 426; GFX9-NEXT: s_not_b64 exec, exec 427; GFX9-NEXT: v_mov_b32_e32 v2, 0 428; GFX9-NEXT: s_not_b64 exec, exec 429; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 430; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 431; GFX9-NEXT: s_nop 1 432; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 433; GFX9-NEXT: s_nop 1 434; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 435; GFX9-NEXT: s_nop 1 436; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 437; GFX9-NEXT: s_nop 1 438; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 439; GFX9-NEXT: s_nop 1 440; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 441; GFX9-NEXT: v_readlane_b32 s6, v2, 63 442; GFX9-NEXT: s_nop 0 443; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 444; GFX9-NEXT: s_mov_b64 exec, s[4:5] 445; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 446; GFX9-NEXT: ; implicit-def: $vgpr0 447; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 
448; GFX9-NEXT: s_cbranch_execz .LBB2_2 449; GFX9-NEXT: ; %bb.1: 450; GFX9-NEXT: s_mov_b32 s11, 0xf000 451; GFX9-NEXT: s_mov_b32 s10, -1 452; GFX9-NEXT: s_waitcnt lgkmcnt(0) 453; GFX9-NEXT: s_mov_b32 s8, s2 454; GFX9-NEXT: s_mov_b32 s9, s3 455; GFX9-NEXT: v_mov_b32_e32 v0, s6 456; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 457; GFX9-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc 458; GFX9-NEXT: s_waitcnt vmcnt(0) 459; GFX9-NEXT: buffer_wbinvl1_vol 460; GFX9-NEXT: .LBB2_2: 461; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 462; GFX9-NEXT: v_readfirstlane_b32 s4, v0 463; GFX9-NEXT: v_mov_b32_e32 v0, v1 464; GFX9-NEXT: s_waitcnt lgkmcnt(0) 465; GFX9-NEXT: s_mov_b32 s3, 0xf000 466; GFX9-NEXT: s_mov_b32 s2, -1 467; GFX9-NEXT: v_add_u32_e32 v0, s4, v0 468; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 469; GFX9-NEXT: s_endpgm 470; 471; GFX1064-LABEL: add_i32_varying: 472; GFX1064: ; %bb.0: ; %entry 473; GFX1064-NEXT: v_mov_b32_e32 v1, v0 474; GFX1064-NEXT: s_not_b64 exec, exec 475; GFX1064-NEXT: v_mov_b32_e32 v1, 0 476; GFX1064-NEXT: s_not_b64 exec, exec 477; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 478; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 479; GFX1064-NEXT: v_mov_b32_e32 v3, 0 480; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 481; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 482; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 483; GFX1064-NEXT: v_mov_b32_e32 v2, v1 484; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 485; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 486; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 487; GFX1064-NEXT: v_mov_b32_e32 v2, s4 488; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 489; GFX1064-NEXT: v_readlane_b32 s6, v1, 15 490; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 
row_mask:0xf bank_mask:0xf 491; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 492; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 493; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 494; GFX1064-NEXT: v_readlane_b32 s7, v1, 31 495; GFX1064-NEXT: v_writelane_b32 v3, s6, 16 496; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 497; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 498; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 499; GFX1064-NEXT: v_readlane_b32 s8, v1, 47 500; GFX1064-NEXT: v_readlane_b32 s9, v1, 63 501; GFX1064-NEXT: v_writelane_b32 v3, s7, 32 502; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 503; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 504; GFX1064-NEXT: s_or_saveexec_b64 s[6:7], -1 505; GFX1064-NEXT: s_mov_b32 s4, s9 506; GFX1064-NEXT: v_writelane_b32 v3, s8, 48 507; GFX1064-NEXT: s_mov_b64 exec, s[6:7] 508; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 509; GFX1064-NEXT: s_mov_b32 s6, -1 510; GFX1064-NEXT: ; implicit-def: $vgpr0 511; GFX1064-NEXT: s_and_saveexec_b64 s[8:9], vcc 512; GFX1064-NEXT: s_cbranch_execz .LBB2_2 513; GFX1064-NEXT: ; %bb.1: 514; GFX1064-NEXT: v_mov_b32_e32 v0, s4 515; GFX1064-NEXT: s_mov_b32 s7, 0x31016000 516; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 517; GFX1064-NEXT: s_mov_b32 s4, s2 518; GFX1064-NEXT: s_mov_b32 s5, s3 519; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 520; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 521; GFX1064-NEXT: buffer_atomic_add v0, off, s[4:7], 0 glc 522; GFX1064-NEXT: s_waitcnt vmcnt(0) 523; GFX1064-NEXT: buffer_gl0_inv 524; GFX1064-NEXT: buffer_gl1_inv 525; GFX1064-NEXT: .LBB2_2: 526; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 527; GFX1064-NEXT: s_or_b64 exec, exec, s[8:9] 528; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 529; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 530; GFX1064-NEXT: v_mov_b32_e32 v0, v3 531; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 532; GFX1064-NEXT: v_add_nc_u32_e32 v0, s2, v0 533; GFX1064-NEXT: s_mov_b32 s2, s6 534; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 535; GFX1064-NEXT: s_endpgm 536; 537; GFX1032-LABEL: 
add_i32_varying: 538; GFX1032: ; %bb.0: ; %entry 539; GFX1032-NEXT: v_mov_b32_e32 v1, v0 540; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 541; GFX1032-NEXT: v_mov_b32_e32 v1, 0 542; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 543; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 544; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 545; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 546; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 547; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 548; GFX1032-NEXT: v_mov_b32_e32 v2, v1 549; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 550; GFX1032-NEXT: s_mov_b32 exec_lo, s2 551; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 552; GFX1032-NEXT: s_or_saveexec_b32 s4, -1 553; GFX1032-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 554; GFX1032-NEXT: v_mov_b32_e32 v3, 0 555; GFX1032-NEXT: v_readlane_b32 s5, v1, 15 556; GFX1032-NEXT: v_readlane_b32 s6, v1, 31 557; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 558; GFX1032-NEXT: s_mov_b32 exec_lo, s4 559; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 560; GFX1032-NEXT: s_or_saveexec_b32 s4, -1 561; GFX1032-NEXT: v_writelane_b32 v3, s5, 16 562; GFX1032-NEXT: s_mov_b32 exec_lo, s4 563; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 564; GFX1032-NEXT: s_mov_b32 s4, s6 565; GFX1032-NEXT: s_mov_b32 s6, -1 566; GFX1032-NEXT: ; implicit-def: $vgpr0 567; GFX1032-NEXT: s_and_saveexec_b32 s8, vcc_lo 568; GFX1032-NEXT: s_cbranch_execz .LBB2_2 569; GFX1032-NEXT: ; %bb.1: 570; GFX1032-NEXT: v_mov_b32_e32 v0, s4 571; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 572; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 573; GFX1032-NEXT: s_mov_b32 s4, s2 574; GFX1032-NEXT: s_mov_b32 s5, s3 575; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 576; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 577; 
GFX1032-NEXT: buffer_atomic_add v0, off, s[4:7], 0 glc 578; GFX1032-NEXT: s_waitcnt vmcnt(0) 579; GFX1032-NEXT: buffer_gl0_inv 580; GFX1032-NEXT: buffer_gl1_inv 581; GFX1032-NEXT: .LBB2_2: 582; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 583; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s8 584; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 585; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 586; GFX1032-NEXT: v_mov_b32_e32 v0, v3 587; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 588; GFX1032-NEXT: v_add_nc_u32_e32 v0, s2, v0 589; GFX1032-NEXT: s_mov_b32 s2, s6 590; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 591; GFX1032-NEXT: s_endpgm 592entry: 593 %lane = call i32 @llvm.amdgcn.workitem.id.x() 594 %old = atomicrmw add i32 addrspace(1)* %inout, i32 %lane acq_rel 595 store i32 %old, i32 addrspace(1)* %out 596 ret void 597} 598 599define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out, i64 addrspace(1)* %inout) { 600; GFX7LESS-LABEL: add_i64_constant: 601; GFX7LESS: ; %bb.0: ; %entry 602; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec 603; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 604; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 605; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s7, v0 606; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 607; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 608; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc 609; GFX7LESS-NEXT: s_cbranch_execz .LBB3_2 610; GFX7LESS-NEXT: ; %bb.1: 611; GFX7LESS-NEXT: s_mov_b32 s11, 0xf000 612; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 613; GFX7LESS-NEXT: s_mul_i32 s6, s6, 5 614; GFX7LESS-NEXT: s_mov_b32 s10, -1 615; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 616; GFX7LESS-NEXT: s_mov_b32 s8, s2 617; GFX7LESS-NEXT: s_mov_b32 s9, s3 618; GFX7LESS-NEXT: v_mov_b32_e32 v0, s6 619; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 620; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 621; GFX7LESS-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc 622; GFX7LESS-NEXT: s_waitcnt vmcnt(0) 623; GFX7LESS-NEXT: buffer_wbinvl1 624; GFX7LESS-NEXT: .LBB3_2: 625; 
GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] 626; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 627; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 628; GFX7LESS-NEXT: s_mov_b32 s2, -1 629; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 630; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 631; GFX7LESS-NEXT: s_waitcnt expcnt(0) 632; GFX7LESS-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 633; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v2 634; GFX7LESS-NEXT: v_mov_b32_e32 v2, s5 635; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s4, v0 636; GFX7LESS-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc 637; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 638; GFX7LESS-NEXT: s_endpgm 639; 640; GFX89-LABEL: add_i64_constant: 641; GFX89: ; %bb.0: ; %entry 642; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 643; GFX89-NEXT: s_mov_b64 s[6:7], exec 644; GFX89-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 645; GFX89-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 646; GFX89-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 647; GFX89-NEXT: ; implicit-def: $vgpr0_vgpr1 648; GFX89-NEXT: s_and_saveexec_b64 s[4:5], vcc 649; GFX89-NEXT: s_cbranch_execz .LBB3_2 650; GFX89-NEXT: ; %bb.1: 651; GFX89-NEXT: s_waitcnt lgkmcnt(0) 652; GFX89-NEXT: s_mov_b32 s8, s2 653; GFX89-NEXT: s_bcnt1_i32_b64 s2, s[6:7] 654; GFX89-NEXT: s_mul_i32 s2, s2, 5 655; GFX89-NEXT: s_mov_b32 s11, 0xf000 656; GFX89-NEXT: s_mov_b32 s10, -1 657; GFX89-NEXT: s_mov_b32 s9, s3 658; GFX89-NEXT: v_mov_b32_e32 v0, s2 659; GFX89-NEXT: v_mov_b32_e32 v1, 0 660; GFX89-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 661; GFX89-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc 662; GFX89-NEXT: s_waitcnt vmcnt(0) 663; GFX89-NEXT: buffer_wbinvl1_vol 664; GFX89-NEXT: .LBB3_2: 665; GFX89-NEXT: s_or_b64 exec, exec, s[4:5] 666; GFX89-NEXT: s_waitcnt lgkmcnt(0) 667; GFX89-NEXT: v_readfirstlane_b32 s2, v0 668; GFX89-NEXT: v_readfirstlane_b32 s3, v1 669; GFX89-NEXT: v_mov_b32_e32 v0, s2 670; GFX89-NEXT: v_mov_b32_e32 v1, s3 671; GFX89-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v2, 5, v[0:1] 672; GFX89-NEXT: s_mov_b32 s3, 0xf000 673; 
GFX89-NEXT: s_mov_b32 s2, -1 674; GFX89-NEXT: s_nop 2 675; GFX89-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 676; GFX89-NEXT: s_endpgm 677; 678; GFX1064-LABEL: add_i64_constant: 679; GFX1064: ; %bb.0: ; %entry 680; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 681; GFX1064-NEXT: s_mov_b64 s[6:7], exec 682; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 683; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 684; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 685; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 686; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 687; GFX1064-NEXT: s_cbranch_execz .LBB3_2 688; GFX1064-NEXT: ; %bb.1: 689; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 690; GFX1064-NEXT: v_mov_b32_e32 v1, 0 691; GFX1064-NEXT: s_mul_i32 s6, s6, 5 692; GFX1064-NEXT: s_mov_b32 s11, 0x31016000 693; GFX1064-NEXT: v_mov_b32_e32 v0, s6 694; GFX1064-NEXT: s_mov_b32 s10, -1 695; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 696; GFX1064-NEXT: s_mov_b32 s8, s2 697; GFX1064-NEXT: s_mov_b32 s9, s3 698; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 699; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 700; GFX1064-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc 701; GFX1064-NEXT: s_waitcnt vmcnt(0) 702; GFX1064-NEXT: buffer_gl0_inv 703; GFX1064-NEXT: buffer_gl1_inv 704; GFX1064-NEXT: .LBB3_2: 705; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 706; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 707; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 708; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 709; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 710; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v2, 5, s[2:3] 711; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 712; GFX1064-NEXT: s_mov_b32 s2, -1 713; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 714; GFX1064-NEXT: s_endpgm 715; 716; GFX1032-LABEL: add_i64_constant: 717; GFX1032: ; %bb.0: ; %entry 718; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 719; GFX1032-NEXT: s_mov_b32 s5, exec_lo 720; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 721; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, 
s5, 0 722; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 723; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo 724; GFX1032-NEXT: s_cbranch_execz .LBB3_2 725; GFX1032-NEXT: ; %bb.1: 726; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s5 727; GFX1032-NEXT: v_mov_b32_e32 v1, 0 728; GFX1032-NEXT: s_mul_i32 s5, s5, 5 729; GFX1032-NEXT: s_mov_b32 s11, 0x31016000 730; GFX1032-NEXT: v_mov_b32_e32 v0, s5 731; GFX1032-NEXT: s_mov_b32 s10, -1 732; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 733; GFX1032-NEXT: s_mov_b32 s8, s2 734; GFX1032-NEXT: s_mov_b32 s9, s3 735; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 736; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 737; GFX1032-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc 738; GFX1032-NEXT: s_waitcnt vmcnt(0) 739; GFX1032-NEXT: buffer_gl0_inv 740; GFX1032-NEXT: buffer_gl1_inv 741; GFX1032-NEXT: .LBB3_2: 742; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 743; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 744; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 745; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 746; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 747; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s2, v2, 5, s[2:3] 748; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 749; GFX1032-NEXT: s_mov_b32 s2, -1 750; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 751; GFX1032-NEXT: s_endpgm 752entry: 753 %old = atomicrmw add i64 addrspace(1)* %inout, i64 5 acq_rel 754 store i64 %old, i64 addrspace(1)* %out 755 ret void 756} 757 758define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 addrspace(1)* %inout, i64 %additive) { 759; GFX7LESS-LABEL: add_i64_uniform: 760; GFX7LESS: ; %bb.0: ; %entry 761; GFX7LESS-NEXT: s_mov_b64 s[8:9], exec 762; GFX7LESS-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 763; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 764; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s8, 0 765; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s9, v0 766; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 767; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 768; GFX7LESS-NEXT: s_and_saveexec_b64 
s[2:3], vcc 769; GFX7LESS-NEXT: s_cbranch_execz .LBB4_2 770; GFX7LESS-NEXT: ; %bb.1: 771; GFX7LESS-NEXT: s_mov_b32 s15, 0xf000 772; GFX7LESS-NEXT: s_mov_b32 s14, -1 773; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 774; GFX7LESS-NEXT: s_mov_b32 s12, s6 775; GFX7LESS-NEXT: s_mov_b32 s13, s7 776; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[8:9] 777; GFX7LESS-NEXT: s_mul_i32 s7, s1, s6 778; GFX7LESS-NEXT: v_mov_b32_e32 v0, s6 779; GFX7LESS-NEXT: v_mul_hi_u32 v0, s0, v0 780; GFX7LESS-NEXT: s_mul_i32 s6, s0, s6 781; GFX7LESS-NEXT: v_add_i32_e32 v1, vcc, s7, v0 782; GFX7LESS-NEXT: v_mov_b32_e32 v0, s6 783; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 784; GFX7LESS-NEXT: buffer_atomic_add_x2 v[0:1], off, s[12:15], 0 glc 785; GFX7LESS-NEXT: s_waitcnt vmcnt(0) 786; GFX7LESS-NEXT: buffer_wbinvl1 787; GFX7LESS-NEXT: .LBB4_2: 788; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] 789; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 790; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 791; GFX7LESS-NEXT: s_mov_b32 s6, -1 792; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v0 793; GFX7LESS-NEXT: v_readfirstlane_b32 s3, v1 794; GFX7LESS-NEXT: s_waitcnt expcnt(0) 795; GFX7LESS-NEXT: v_mul_lo_u32 v0, s1, v2 796; GFX7LESS-NEXT: v_mul_hi_u32 v1, s0, v2 797; GFX7LESS-NEXT: v_mul_lo_u32 v2, s0, v2 798; GFX7LESS-NEXT: v_add_i32_e32 v1, vcc, v1, v0 799; GFX7LESS-NEXT: v_mov_b32_e32 v3, s3 800; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s2, v2 801; GFX7LESS-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc 802; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 803; GFX7LESS-NEXT: s_endpgm 804; 805; GFX8-LABEL: add_i64_uniform: 806; GFX8: ; %bb.0: ; %entry 807; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 808; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 809; GFX8-NEXT: s_mov_b64 s[8:9], exec 810; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 811; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0 812; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 813; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 814; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 815; GFX8-NEXT: 
s_cbranch_execz .LBB4_2 816; GFX8-NEXT: ; %bb.1: 817; GFX8-NEXT: s_waitcnt lgkmcnt(0) 818; GFX8-NEXT: s_mov_b32 s12, s6 819; GFX8-NEXT: s_bcnt1_i32_b64 s6, s[8:9] 820; GFX8-NEXT: v_mov_b32_e32 v0, s6 821; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[8:9], s0, v0, 0 822; GFX8-NEXT: s_mul_i32 s6, s1, s6 823; GFX8-NEXT: s_mov_b32 s15, 0xf000 824; GFX8-NEXT: s_mov_b32 s14, -1 825; GFX8-NEXT: s_mov_b32 s13, s7 826; GFX8-NEXT: v_add_u32_e32 v1, vcc, s6, v1 827; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 828; GFX8-NEXT: buffer_atomic_add_x2 v[0:1], off, s[12:15], 0 glc 829; GFX8-NEXT: s_waitcnt vmcnt(0) 830; GFX8-NEXT: buffer_wbinvl1_vol 831; GFX8-NEXT: .LBB4_2: 832; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 833; GFX8-NEXT: s_waitcnt lgkmcnt(0) 834; GFX8-NEXT: v_mul_lo_u32 v4, s1, v2 835; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s0, v2, 0 836; GFX8-NEXT: v_readfirstlane_b32 s0, v0 837; GFX8-NEXT: v_readfirstlane_b32 s1, v1 838; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v4 839; GFX8-NEXT: v_mov_b32_e32 v3, s1 840; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 841; GFX8-NEXT: s_mov_b32 s7, 0xf000 842; GFX8-NEXT: s_mov_b32 s6, -1 843; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc 844; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 845; GFX8-NEXT: s_endpgm 846; 847; GFX9-LABEL: add_i64_uniform: 848; GFX9: ; %bb.0: ; %entry 849; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 850; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 851; GFX9-NEXT: s_mov_b64 s[8:9], exec 852; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 853; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0 854; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 855; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 856; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc 857; GFX9-NEXT: s_cbranch_execz .LBB4_2 858; GFX9-NEXT: ; %bb.1: 859; GFX9-NEXT: s_waitcnt lgkmcnt(0) 860; GFX9-NEXT: s_mov_b32 s12, s6 861; GFX9-NEXT: s_bcnt1_i32_b64 s6, s[8:9] 862; GFX9-NEXT: s_mov_b32 s13, s7 863; GFX9-NEXT: s_mul_i32 s7, s3, s6 864; GFX9-NEXT: s_mul_hi_u32 s8, s2, s6 865; GFX9-NEXT: 
s_add_i32 s8, s8, s7 866; GFX9-NEXT: s_mul_i32 s6, s2, s6 867; GFX9-NEXT: s_mov_b32 s15, 0xf000 868; GFX9-NEXT: s_mov_b32 s14, -1 869; GFX9-NEXT: v_mov_b32_e32 v0, s6 870; GFX9-NEXT: v_mov_b32_e32 v1, s8 871; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 872; GFX9-NEXT: buffer_atomic_add_x2 v[0:1], off, s[12:15], 0 glc 873; GFX9-NEXT: s_waitcnt vmcnt(0) 874; GFX9-NEXT: buffer_wbinvl1_vol 875; GFX9-NEXT: .LBB4_2: 876; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] 877; GFX9-NEXT: s_waitcnt lgkmcnt(0) 878; GFX9-NEXT: v_mul_lo_u32 v4, s3, v2 879; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v2, 0 880; GFX9-NEXT: v_readfirstlane_b32 s0, v0 881; GFX9-NEXT: v_readfirstlane_b32 s1, v1 882; GFX9-NEXT: v_add_u32_e32 v1, v3, v4 883; GFX9-NEXT: v_mov_b32_e32 v3, s1 884; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 885; GFX9-NEXT: s_mov_b32 s7, 0xf000 886; GFX9-NEXT: s_mov_b32 s6, -1 887; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc 888; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 889; GFX9-NEXT: s_endpgm 890; 891; GFX1064-LABEL: add_i64_uniform: 892; GFX1064: ; %bb.0: ; %entry 893; GFX1064-NEXT: s_clause 0x1 894; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 895; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 896; GFX1064-NEXT: s_mov_b64 s[8:9], exec 897; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 898; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0 899; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 900; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 901; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc 902; GFX1064-NEXT: s_cbranch_execz .LBB4_2 903; GFX1064-NEXT: ; %bb.1: 904; GFX1064-NEXT: s_bcnt1_i32_b64 s8, s[8:9] 905; GFX1064-NEXT: s_mov_b32 s11, 0x31016000 906; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 907; GFX1064-NEXT: s_mul_i32 s9, s3, s8 908; GFX1064-NEXT: s_mul_hi_u32 s10, s2, s8 909; GFX1064-NEXT: s_mul_i32 s8, s2, s8 910; GFX1064-NEXT: s_add_i32 s10, s10, s9 911; GFX1064-NEXT: v_mov_b32_e32 v0, s8 912; GFX1064-NEXT: v_mov_b32_e32 v1, s10 913; GFX1064-NEXT: s_mov_b32 s10, -1 
914; GFX1064-NEXT: s_mov_b32 s8, s6 915; GFX1064-NEXT: s_mov_b32 s9, s7 916; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 917; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 918; GFX1064-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc 919; GFX1064-NEXT: s_waitcnt vmcnt(0) 920; GFX1064-NEXT: buffer_gl0_inv 921; GFX1064-NEXT: buffer_gl1_inv 922; GFX1064-NEXT: .LBB4_2: 923; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 924; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] 925; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 926; GFX1064-NEXT: v_mul_lo_u32 v4, s3, v2 927; GFX1064-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v2, 0 928; GFX1064-NEXT: v_readfirstlane_b32 s0, v0 929; GFX1064-NEXT: v_readfirstlane_b32 s1, v1 930; GFX1064-NEXT: s_mov_b32 s7, 0x31016000 931; GFX1064-NEXT: s_mov_b32 s6, -1 932; GFX1064-NEXT: v_add_nc_u32_e32 v1, v3, v4 933; GFX1064-NEXT: v_add_co_u32 v0, vcc, s0, v2 934; GFX1064-NEXT: v_add_co_ci_u32_e32 v1, vcc, s1, v1, vcc 935; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 936; GFX1064-NEXT: s_endpgm 937; 938; GFX1032-LABEL: add_i64_uniform: 939; GFX1032: ; %bb.0: ; %entry 940; GFX1032-NEXT: s_clause 0x1 941; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 942; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 943; GFX1032-NEXT: s_mov_b32 s8, exec_lo 944; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 945; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s8, 0 946; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 947; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo 948; GFX1032-NEXT: s_cbranch_execz .LBB4_2 949; GFX1032-NEXT: ; %bb.1: 950; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s8 951; GFX1032-NEXT: s_mov_b32 s11, 0x31016000 952; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 953; GFX1032-NEXT: s_mul_i32 s8, s3, s1 954; GFX1032-NEXT: s_mul_hi_u32 s9, s2, s1 955; GFX1032-NEXT: s_mul_i32 s1, s2, s1 956; GFX1032-NEXT: s_add_i32 s9, s9, s8 957; GFX1032-NEXT: v_mov_b32_e32 v0, s1 958; GFX1032-NEXT: v_mov_b32_e32 v1, s9 959; GFX1032-NEXT: s_mov_b32 s10, -1 960; GFX1032-NEXT: s_mov_b32 s8, s6 961; 
GFX1032-NEXT: s_mov_b32 s9, s7 962; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 963; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 964; GFX1032-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc 965; GFX1032-NEXT: s_waitcnt vmcnt(0) 966; GFX1032-NEXT: buffer_gl0_inv 967; GFX1032-NEXT: buffer_gl1_inv 968; GFX1032-NEXT: .LBB4_2: 969; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 970; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 971; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 972; GFX1032-NEXT: v_mul_lo_u32 v4, s3, v2 973; GFX1032-NEXT: v_mad_u64_u32 v[2:3], s0, s2, v2, 0 974; GFX1032-NEXT: v_readfirstlane_b32 s0, v0 975; GFX1032-NEXT: v_readfirstlane_b32 s1, v1 976; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 977; GFX1032-NEXT: s_mov_b32 s6, -1 978; GFX1032-NEXT: v_add_nc_u32_e32 v1, v3, v4 979; GFX1032-NEXT: v_add_co_u32 v0, vcc_lo, s0, v2 980; GFX1032-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo 981; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 982; GFX1032-NEXT: s_endpgm 983entry: 984 %old = atomicrmw add i64 addrspace(1)* %inout, i64 %additive acq_rel 985 store i64 %old, i64 addrspace(1)* %out 986 ret void 987} 988 989define amdgpu_kernel void @add_i64_varying(i64 addrspace(1)* %out, i64 addrspace(1)* %inout) { 990; GFX7LESS-LABEL: add_i64_varying: 991; GFX7LESS: ; %bb.0: ; %entry 992; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 993; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 994; GFX7LESS-NEXT: s_mov_b32 s6, -1 995; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 996; GFX7LESS-NEXT: s_mov_b32 s10, s6 997; GFX7LESS-NEXT: s_mov_b32 s11, s7 998; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 999; GFX7LESS-NEXT: s_mov_b32 s8, s2 1000; GFX7LESS-NEXT: s_mov_b32 s9, s3 1001; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1002; GFX7LESS-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc 1003; GFX7LESS-NEXT: s_waitcnt vmcnt(0) 1004; GFX7LESS-NEXT: buffer_wbinvl1 1005; GFX7LESS-NEXT: s_mov_b32 s4, s0 1006; GFX7LESS-NEXT: s_mov_b32 s5, s1 1007; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, 
s[4:7], 0 1008; GFX7LESS-NEXT: s_endpgm 1009; 1010; GFX89-LABEL: add_i64_varying: 1011; GFX89: ; %bb.0: ; %entry 1012; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1013; GFX89-NEXT: s_mov_b32 s7, 0xf000 1014; GFX89-NEXT: s_mov_b32 s6, -1 1015; GFX89-NEXT: s_mov_b32 s10, s6 1016; GFX89-NEXT: s_mov_b32 s11, s7 1017; GFX89-NEXT: s_waitcnt lgkmcnt(0) 1018; GFX89-NEXT: s_mov_b32 s8, s2 1019; GFX89-NEXT: s_mov_b32 s9, s3 1020; GFX89-NEXT: v_mov_b32_e32 v1, 0 1021; GFX89-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1022; GFX89-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc 1023; GFX89-NEXT: s_waitcnt vmcnt(0) 1024; GFX89-NEXT: buffer_wbinvl1_vol 1025; GFX89-NEXT: s_mov_b32 s4, s0 1026; GFX89-NEXT: s_mov_b32 s5, s1 1027; GFX89-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1028; GFX89-NEXT: s_endpgm 1029; 1030; GFX10-LABEL: add_i64_varying: 1031; GFX10: ; %bb.0: ; %entry 1032; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1033; GFX10-NEXT: v_mov_b32_e32 v1, 0 1034; GFX10-NEXT: s_mov_b32 s7, 0x31016000 1035; GFX10-NEXT: s_mov_b32 s6, -1 1036; GFX10-NEXT: s_mov_b32 s11, s7 1037; GFX10-NEXT: s_mov_b32 s10, s6 1038; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1039; GFX10-NEXT: s_mov_b32 s8, s2 1040; GFX10-NEXT: s_mov_b32 s9, s3 1041; GFX10-NEXT: s_mov_b32 s4, s0 1042; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1043; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1044; GFX10-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc 1045; GFX10-NEXT: s_waitcnt vmcnt(0) 1046; GFX10-NEXT: buffer_gl0_inv 1047; GFX10-NEXT: buffer_gl1_inv 1048; GFX10-NEXT: s_mov_b32 s5, s1 1049; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1050; GFX10-NEXT: s_endpgm 1051entry: 1052 %lane = call i32 @llvm.amdgcn.workitem.id.x() 1053 %zext = zext i32 %lane to i64 1054 %old = atomicrmw add i64 addrspace(1)* %inout, i64 %zext acq_rel 1055 store i64 %old, i64 addrspace(1)* %out 1056 ret void 1057} 1058 1059define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out, i32 addrspace(1)* %inout) { 1060; 
GFX7LESS-LABEL: sub_i32_constant: 1061; GFX7LESS: ; %bb.0: ; %entry 1062; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec 1063; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 1064; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 1065; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s7, v0 1066; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1067; GFX7LESS-NEXT: ; implicit-def: $vgpr1 1068; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc 1069; GFX7LESS-NEXT: s_cbranch_execz .LBB6_2 1070; GFX7LESS-NEXT: ; %bb.1: 1071; GFX7LESS-NEXT: s_mov_b32 s11, 0xf000 1072; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 1073; GFX7LESS-NEXT: s_mul_i32 s6, s6, 5 1074; GFX7LESS-NEXT: s_mov_b32 s10, -1 1075; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1076; GFX7LESS-NEXT: s_mov_b32 s8, s2 1077; GFX7LESS-NEXT: s_mov_b32 s9, s3 1078; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6 1079; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1080; GFX7LESS-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc 1081; GFX7LESS-NEXT: s_waitcnt vmcnt(0) 1082; GFX7LESS-NEXT: buffer_wbinvl1 1083; GFX7LESS-NEXT: .LBB6_2: 1084; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] 1085; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1086; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 1087; GFX7LESS-NEXT: s_mov_b32 s2, -1 1088; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1 1089; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1090; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 1091; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 1092; GFX7LESS-NEXT: s_endpgm 1093; 1094; GFX8-LABEL: sub_i32_constant: 1095; GFX8: ; %bb.0: ; %entry 1096; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1097; GFX8-NEXT: s_mov_b64 s[6:7], exec 1098; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 1099; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 1100; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1101; GFX8-NEXT: ; implicit-def: $vgpr1 1102; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 1103; GFX8-NEXT: s_cbranch_execz .LBB6_2 1104; GFX8-NEXT: ; %bb.1: 1105; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1106; GFX8-NEXT: s_mov_b32 s8, s2 1107; 
GFX8-NEXT: s_bcnt1_i32_b64 s2, s[6:7] 1108; GFX8-NEXT: s_mul_i32 s2, s2, 5 1109; GFX8-NEXT: s_mov_b32 s11, 0xf000 1110; GFX8-NEXT: s_mov_b32 s10, -1 1111; GFX8-NEXT: s_mov_b32 s9, s3 1112; GFX8-NEXT: v_mov_b32_e32 v1, s2 1113; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1114; GFX8-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc 1115; GFX8-NEXT: s_waitcnt vmcnt(0) 1116; GFX8-NEXT: buffer_wbinvl1_vol 1117; GFX8-NEXT: .LBB6_2: 1118; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 1119; GFX8-NEXT: v_readfirstlane_b32 s4, v1 1120; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1121; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1122; GFX8-NEXT: s_mov_b32 s3, 0xf000 1123; GFX8-NEXT: s_mov_b32 s2, -1 1124; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s4, v0 1125; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 1126; GFX8-NEXT: s_endpgm 1127; 1128; GFX9-LABEL: sub_i32_constant: 1129; GFX9: ; %bb.0: ; %entry 1130; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1131; GFX9-NEXT: s_mov_b64 s[6:7], exec 1132; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 1133; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 1134; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1135; GFX9-NEXT: ; implicit-def: $vgpr1 1136; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 1137; GFX9-NEXT: s_cbranch_execz .LBB6_2 1138; GFX9-NEXT: ; %bb.1: 1139; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1140; GFX9-NEXT: s_mov_b32 s8, s2 1141; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[6:7] 1142; GFX9-NEXT: s_mul_i32 s2, s2, 5 1143; GFX9-NEXT: s_mov_b32 s11, 0xf000 1144; GFX9-NEXT: s_mov_b32 s10, -1 1145; GFX9-NEXT: s_mov_b32 s9, s3 1146; GFX9-NEXT: v_mov_b32_e32 v1, s2 1147; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1148; GFX9-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc 1149; GFX9-NEXT: s_waitcnt vmcnt(0) 1150; GFX9-NEXT: buffer_wbinvl1_vol 1151; GFX9-NEXT: .LBB6_2: 1152; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 1153; GFX9-NEXT: v_readfirstlane_b32 s4, v1 1154; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1155; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1156; GFX9-NEXT: s_mov_b32 s3, 0xf000 1157; GFX9-NEXT: s_mov_b32 s2, 
-1 1158; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 1159; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 1160; GFX9-NEXT: s_endpgm 1161; 1162; GFX1064-LABEL: sub_i32_constant: 1163; GFX1064: ; %bb.0: ; %entry 1164; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1165; GFX1064-NEXT: s_mov_b64 s[6:7], exec 1166; GFX1064-NEXT: ; implicit-def: $vgpr1 1167; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 1168; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 1169; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1170; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 1171; GFX1064-NEXT: s_cbranch_execz .LBB6_2 1172; GFX1064-NEXT: ; %bb.1: 1173; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 1174; GFX1064-NEXT: s_mov_b32 s11, 0x31016000 1175; GFX1064-NEXT: s_mul_i32 s6, s6, 5 1176; GFX1064-NEXT: s_mov_b32 s10, -1 1177; GFX1064-NEXT: v_mov_b32_e32 v1, s6 1178; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1179; GFX1064-NEXT: s_mov_b32 s8, s2 1180; GFX1064-NEXT: s_mov_b32 s9, s3 1181; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1182; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 1183; GFX1064-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc 1184; GFX1064-NEXT: s_waitcnt vmcnt(0) 1185; GFX1064-NEXT: buffer_gl0_inv 1186; GFX1064-NEXT: buffer_gl1_inv 1187; GFX1064-NEXT: .LBB6_2: 1188; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 1189; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 1190; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1191; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 1192; GFX1064-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1193; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 1194; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s2, v0 1195; GFX1064-NEXT: s_mov_b32 s2, -1 1196; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 1197; GFX1064-NEXT: s_endpgm 1198; 1199; GFX1032-LABEL: sub_i32_constant: 1200; GFX1032: ; %bb.0: ; %entry 1201; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1202; GFX1032-NEXT: s_mov_b32 s5, exec_lo 1203; GFX1032-NEXT: ; implicit-def: $vgpr1 1204; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 1205; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 
0, v0 1206; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo 1207; GFX1032-NEXT: s_cbranch_execz .LBB6_2 1208; GFX1032-NEXT: ; %bb.1: 1209; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s5 1210; GFX1032-NEXT: s_mov_b32 s11, 0x31016000 1211; GFX1032-NEXT: s_mul_i32 s5, s5, 5 1212; GFX1032-NEXT: s_mov_b32 s10, -1 1213; GFX1032-NEXT: v_mov_b32_e32 v1, s5 1214; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1215; GFX1032-NEXT: s_mov_b32 s8, s2 1216; GFX1032-NEXT: s_mov_b32 s9, s3 1217; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1218; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 1219; GFX1032-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc 1220; GFX1032-NEXT: s_waitcnt vmcnt(0) 1221; GFX1032-NEXT: buffer_gl0_inv 1222; GFX1032-NEXT: buffer_gl1_inv 1223; GFX1032-NEXT: .LBB6_2: 1224; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 1225; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 1226; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1227; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 1228; GFX1032-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1229; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 1230; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s2, v0 1231; GFX1032-NEXT: s_mov_b32 s2, -1 1232; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 1233; GFX1032-NEXT: s_endpgm 1234entry: 1235 %old = atomicrmw sub i32 addrspace(1)* %inout, i32 5 acq_rel 1236 store i32 %old, i32 addrspace(1)* %out 1237 ret void 1238} 1239 1240define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 addrspace(1)* %inout, i32 %subitive) { 1241; GFX7LESS-LABEL: sub_i32_uniform: 1242; GFX7LESS: ; %bb.0: ; %entry 1243; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec 1244; GFX7LESS-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 1245; GFX7LESS-NEXT: s_load_dword s8, s[0:1], 0xd 1246; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 1247; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 1248; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1249; GFX7LESS-NEXT: ; implicit-def: $vgpr1 1250; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc 1251; GFX7LESS-NEXT: s_cbranch_execz .LBB7_2 1252; GFX7LESS-NEXT: ; 
%bb.1: 1253; GFX7LESS-NEXT: s_mov_b32 s15, 0xf000 1254; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1255; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1256; GFX7LESS-NEXT: s_mul_i32 s2, s8, s2 1257; GFX7LESS-NEXT: s_mov_b32 s14, -1 1258; GFX7LESS-NEXT: s_mov_b32 s12, s6 1259; GFX7LESS-NEXT: s_mov_b32 s13, s7 1260; GFX7LESS-NEXT: v_mov_b32_e32 v1, s2 1261; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1262; GFX7LESS-NEXT: buffer_atomic_sub v1, off, s[12:15], 0 glc 1263; GFX7LESS-NEXT: s_waitcnt vmcnt(0) 1264; GFX7LESS-NEXT: buffer_wbinvl1 1265; GFX7LESS-NEXT: .LBB7_2: 1266; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1] 1267; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1268; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 1269; GFX7LESS-NEXT: s_mov_b32 s6, -1 1270; GFX7LESS-NEXT: v_readfirstlane_b32 s0, v1 1271; GFX7LESS-NEXT: v_mul_lo_u32 v0, s8, v0 1272; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 1273; GFX7LESS-NEXT: buffer_store_dword v0, off, s[4:7], 0 1274; GFX7LESS-NEXT: s_endpgm 1275; 1276; GFX8-LABEL: sub_i32_uniform: 1277; GFX8: ; %bb.0: ; %entry 1278; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1279; GFX8-NEXT: s_load_dword s8, s[0:1], 0x34 1280; GFX8-NEXT: s_mov_b64 s[2:3], exec 1281; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1282; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 1283; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1284; GFX8-NEXT: ; implicit-def: $vgpr1 1285; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc 1286; GFX8-NEXT: s_cbranch_execz .LBB7_2 1287; GFX8-NEXT: ; %bb.1: 1288; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1289; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1290; GFX8-NEXT: s_mul_i32 s2, s8, s2 1291; GFX8-NEXT: s_mov_b32 s15, 0xf000 1292; GFX8-NEXT: s_mov_b32 s14, -1 1293; GFX8-NEXT: s_mov_b32 s12, s6 1294; GFX8-NEXT: s_mov_b32 s13, s7 1295; GFX8-NEXT: v_mov_b32_e32 v1, s2 1296; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1297; GFX8-NEXT: buffer_atomic_sub v1, off, s[12:15], 0 glc 1298; GFX8-NEXT: s_waitcnt vmcnt(0) 1299; GFX8-NEXT: buffer_wbinvl1_vol 1300; GFX8-NEXT: .LBB7_2: 1301; GFX8-NEXT: 
s_or_b64 exec, exec, s[0:1] 1302; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1303; GFX8-NEXT: v_mul_lo_u32 v0, s8, v0 1304; GFX8-NEXT: v_readfirstlane_b32 s0, v1 1305; GFX8-NEXT: s_mov_b32 s7, 0xf000 1306; GFX8-NEXT: s_mov_b32 s6, -1 1307; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s0, v0 1308; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0 1309; GFX8-NEXT: s_endpgm 1310; 1311; GFX9-LABEL: sub_i32_uniform: 1312; GFX9: ; %bb.0: ; %entry 1313; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1314; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34 1315; GFX9-NEXT: s_mov_b64 s[2:3], exec 1316; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1317; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 1318; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1319; GFX9-NEXT: ; implicit-def: $vgpr1 1320; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc 1321; GFX9-NEXT: s_cbranch_execz .LBB7_2 1322; GFX9-NEXT: ; %bb.1: 1323; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1324; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1325; GFX9-NEXT: s_mul_i32 s2, s8, s2 1326; GFX9-NEXT: s_mov_b32 s15, 0xf000 1327; GFX9-NEXT: s_mov_b32 s14, -1 1328; GFX9-NEXT: s_mov_b32 s12, s6 1329; GFX9-NEXT: s_mov_b32 s13, s7 1330; GFX9-NEXT: v_mov_b32_e32 v1, s2 1331; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1332; GFX9-NEXT: buffer_atomic_sub v1, off, s[12:15], 0 glc 1333; GFX9-NEXT: s_waitcnt vmcnt(0) 1334; GFX9-NEXT: buffer_wbinvl1_vol 1335; GFX9-NEXT: .LBB7_2: 1336; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] 1337; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1338; GFX9-NEXT: v_mul_lo_u32 v0, s8, v0 1339; GFX9-NEXT: v_readfirstlane_b32 s0, v1 1340; GFX9-NEXT: s_mov_b32 s7, 0xf000 1341; GFX9-NEXT: s_mov_b32 s6, -1 1342; GFX9-NEXT: v_sub_u32_e32 v0, s0, v0 1343; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 1344; GFX9-NEXT: s_endpgm 1345; 1346; GFX1064-LABEL: sub_i32_uniform: 1347; GFX1064: ; %bb.0: ; %entry 1348; GFX1064-NEXT: s_clause 0x1 1349; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1350; GFX1064-NEXT: s_load_dword s8, s[0:1], 0x34 1351; GFX1064-NEXT: s_mov_b64 s[2:3], exec 1352; 
GFX1064-NEXT: ; implicit-def: $vgpr1 1353; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1354; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 1355; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1356; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc 1357; GFX1064-NEXT: s_cbranch_execz .LBB7_2 1358; GFX1064-NEXT: ; %bb.1: 1359; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1360; GFX1064-NEXT: s_mov_b32 s15, 0x31016000 1361; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1362; GFX1064-NEXT: s_mul_i32 s2, s8, s2 1363; GFX1064-NEXT: s_mov_b32 s14, -1 1364; GFX1064-NEXT: v_mov_b32_e32 v1, s2 1365; GFX1064-NEXT: s_mov_b32 s12, s6 1366; GFX1064-NEXT: s_mov_b32 s13, s7 1367; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1368; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 1369; GFX1064-NEXT: buffer_atomic_sub v1, off, s[12:15], 0 glc 1370; GFX1064-NEXT: s_waitcnt vmcnt(0) 1371; GFX1064-NEXT: buffer_gl0_inv 1372; GFX1064-NEXT: buffer_gl1_inv 1373; GFX1064-NEXT: .LBB7_2: 1374; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 1375; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] 1376; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1377; GFX1064-NEXT: v_mul_lo_u32 v0, s8, v0 1378; GFX1064-NEXT: v_readfirstlane_b32 s0, v1 1379; GFX1064-NEXT: s_mov_b32 s7, 0x31016000 1380; GFX1064-NEXT: s_mov_b32 s6, -1 1381; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s0, v0 1382; GFX1064-NEXT: buffer_store_dword v0, off, s[4:7], 0 1383; GFX1064-NEXT: s_endpgm 1384; 1385; GFX1032-LABEL: sub_i32_uniform: 1386; GFX1032: ; %bb.0: ; %entry 1387; GFX1032-NEXT: s_clause 0x1 1388; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1389; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x34 1390; GFX1032-NEXT: s_mov_b32 s3, exec_lo 1391; GFX1032-NEXT: ; implicit-def: $vgpr1 1392; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 1393; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 1394; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo 1395; GFX1032-NEXT: s_cbranch_execz .LBB7_2 1396; GFX1032-NEXT: ; %bb.1: 1397; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s3 1398; GFX1032-NEXT: s_mov_b32 s11, 0x31016000 1399; 
GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1400; GFX1032-NEXT: s_mul_i32 s1, s2, s1 1401; GFX1032-NEXT: s_mov_b32 s10, -1 1402; GFX1032-NEXT: v_mov_b32_e32 v1, s1 1403; GFX1032-NEXT: s_mov_b32 s8, s6 1404; GFX1032-NEXT: s_mov_b32 s9, s7 1405; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1406; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 1407; GFX1032-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc 1408; GFX1032-NEXT: s_waitcnt vmcnt(0) 1409; GFX1032-NEXT: buffer_gl0_inv 1410; GFX1032-NEXT: buffer_gl1_inv 1411; GFX1032-NEXT: .LBB7_2: 1412; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 1413; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 1414; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1415; GFX1032-NEXT: v_mul_lo_u32 v0, s2, v0 1416; GFX1032-NEXT: v_readfirstlane_b32 s0, v1 1417; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 1418; GFX1032-NEXT: s_mov_b32 s6, -1 1419; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s0, v0 1420; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0 1421; GFX1032-NEXT: s_endpgm 1422entry: 1423 %old = atomicrmw sub i32 addrspace(1)* %inout, i32 %subitive acq_rel 1424 store i32 %old, i32 addrspace(1)* %out 1425 ret void 1426} 1427 1428define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out, i32 addrspace(1)* %inout) { 1429; GFX7LESS-LABEL: sub_i32_varying: 1430; GFX7LESS: ; %bb.0: ; %entry 1431; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 1432; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 1433; GFX7LESS-NEXT: s_mov_b32 s6, -1 1434; GFX7LESS-NEXT: s_mov_b32 s10, s6 1435; GFX7LESS-NEXT: s_mov_b32 s11, s7 1436; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1437; GFX7LESS-NEXT: s_mov_b32 s8, s2 1438; GFX7LESS-NEXT: s_mov_b32 s9, s3 1439; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1440; GFX7LESS-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc 1441; GFX7LESS-NEXT: s_waitcnt vmcnt(0) 1442; GFX7LESS-NEXT: buffer_wbinvl1 1443; GFX7LESS-NEXT: s_mov_b32 s4, s0 1444; GFX7LESS-NEXT: s_mov_b32 s5, s1 1445; GFX7LESS-NEXT: buffer_store_dword v0, off, s[4:7], 0 1446; GFX7LESS-NEXT: s_endpgm 1447; 1448; 
GFX8-LABEL: sub_i32_varying: 1449; GFX8: ; %bb.0: ; %entry 1450; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1451; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 1452; GFX8-NEXT: v_mov_b32_e32 v1, 0 1453; GFX8-NEXT: s_mov_b64 exec, s[4:5] 1454; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 1455; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 1456; GFX8-NEXT: v_mov_b32_e32 v2, v0 1457; GFX8-NEXT: s_not_b64 exec, exec 1458; GFX8-NEXT: v_mov_b32_e32 v2, 0 1459; GFX8-NEXT: s_not_b64 exec, exec 1460; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 1461; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 1462; GFX8-NEXT: s_nop 1 1463; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 1464; GFX8-NEXT: s_nop 1 1465; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 1466; GFX8-NEXT: s_nop 1 1467; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 1468; GFX8-NEXT: s_nop 1 1469; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 1470; GFX8-NEXT: s_nop 1 1471; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 1472; GFX8-NEXT: v_readlane_b32 s6, v2, 63 1473; GFX8-NEXT: s_nop 0 1474; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 1475; GFX8-NEXT: s_mov_b64 exec, s[4:5] 1476; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 1477; GFX8-NEXT: ; implicit-def: $vgpr0 1478; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 1479; GFX8-NEXT: s_cbranch_execz .LBB8_2 1480; GFX8-NEXT: ; %bb.1: 1481; GFX8-NEXT: s_mov_b32 s11, 0xf000 1482; GFX8-NEXT: s_mov_b32 s10, -1 1483; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1484; GFX8-NEXT: s_mov_b32 s8, s2 1485; GFX8-NEXT: s_mov_b32 s9, s3 1486; GFX8-NEXT: v_mov_b32_e32 v0, s6 1487; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1488; GFX8-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc 1489; GFX8-NEXT: s_waitcnt vmcnt(0) 1490; GFX8-NEXT: 
buffer_wbinvl1_vol 1491; GFX8-NEXT: .LBB8_2: 1492; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 1493; GFX8-NEXT: v_readfirstlane_b32 s4, v0 1494; GFX8-NEXT: v_mov_b32_e32 v0, v1 1495; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1496; GFX8-NEXT: s_mov_b32 s3, 0xf000 1497; GFX8-NEXT: s_mov_b32 s2, -1 1498; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s4, v0 1499; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 1500; GFX8-NEXT: s_endpgm 1501; 1502; GFX9-LABEL: sub_i32_varying: 1503; GFX9: ; %bb.0: ; %entry 1504; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1505; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 1506; GFX9-NEXT: v_mov_b32_e32 v1, 0 1507; GFX9-NEXT: s_mov_b64 exec, s[4:5] 1508; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 1509; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 1510; GFX9-NEXT: v_mov_b32_e32 v2, v0 1511; GFX9-NEXT: s_not_b64 exec, exec 1512; GFX9-NEXT: v_mov_b32_e32 v2, 0 1513; GFX9-NEXT: s_not_b64 exec, exec 1514; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 1515; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 1516; GFX9-NEXT: s_nop 1 1517; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 1518; GFX9-NEXT: s_nop 1 1519; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 1520; GFX9-NEXT: s_nop 1 1521; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 1522; GFX9-NEXT: s_nop 1 1523; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 1524; GFX9-NEXT: s_nop 1 1525; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 1526; GFX9-NEXT: v_readlane_b32 s6, v2, 63 1527; GFX9-NEXT: s_nop 0 1528; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 1529; GFX9-NEXT: s_mov_b64 exec, s[4:5] 1530; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 1531; GFX9-NEXT: ; implicit-def: $vgpr0 1532; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 1533; GFX9-NEXT: s_cbranch_execz .LBB8_2 1534; GFX9-NEXT: ; %bb.1: 
1535; GFX9-NEXT: s_mov_b32 s11, 0xf000 1536; GFX9-NEXT: s_mov_b32 s10, -1 1537; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1538; GFX9-NEXT: s_mov_b32 s8, s2 1539; GFX9-NEXT: s_mov_b32 s9, s3 1540; GFX9-NEXT: v_mov_b32_e32 v0, s6 1541; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1542; GFX9-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc 1543; GFX9-NEXT: s_waitcnt vmcnt(0) 1544; GFX9-NEXT: buffer_wbinvl1_vol 1545; GFX9-NEXT: .LBB8_2: 1546; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 1547; GFX9-NEXT: v_readfirstlane_b32 s4, v0 1548; GFX9-NEXT: v_mov_b32_e32 v0, v1 1549; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1550; GFX9-NEXT: s_mov_b32 s3, 0xf000 1551; GFX9-NEXT: s_mov_b32 s2, -1 1552; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 1553; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 1554; GFX9-NEXT: s_endpgm 1555; 1556; GFX1064-LABEL: sub_i32_varying: 1557; GFX1064: ; %bb.0: ; %entry 1558; GFX1064-NEXT: v_mov_b32_e32 v1, v0 1559; GFX1064-NEXT: s_not_b64 exec, exec 1560; GFX1064-NEXT: v_mov_b32_e32 v1, 0 1561; GFX1064-NEXT: s_not_b64 exec, exec 1562; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 1563; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 1564; GFX1064-NEXT: v_mov_b32_e32 v3, 0 1565; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 1566; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 1567; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 1568; GFX1064-NEXT: v_mov_b32_e32 v2, v1 1569; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 1570; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 1571; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 1572; GFX1064-NEXT: v_mov_b32_e32 v2, s4 1573; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 1574; GFX1064-NEXT: v_readlane_b32 s6, v1, 15 1575; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 
1576; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 1577; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1578; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 1579; GFX1064-NEXT: v_readlane_b32 s7, v1, 31 1580; GFX1064-NEXT: v_writelane_b32 v3, s6, 16 1581; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 1582; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 1583; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 1584; GFX1064-NEXT: v_readlane_b32 s8, v1, 47 1585; GFX1064-NEXT: v_readlane_b32 s9, v1, 63 1586; GFX1064-NEXT: v_writelane_b32 v3, s7, 32 1587; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 1588; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 1589; GFX1064-NEXT: s_or_saveexec_b64 s[6:7], -1 1590; GFX1064-NEXT: s_mov_b32 s4, s9 1591; GFX1064-NEXT: v_writelane_b32 v3, s8, 48 1592; GFX1064-NEXT: s_mov_b64 exec, s[6:7] 1593; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1594; GFX1064-NEXT: s_mov_b32 s6, -1 1595; GFX1064-NEXT: ; implicit-def: $vgpr0 1596; GFX1064-NEXT: s_and_saveexec_b64 s[8:9], vcc 1597; GFX1064-NEXT: s_cbranch_execz .LBB8_2 1598; GFX1064-NEXT: ; %bb.1: 1599; GFX1064-NEXT: v_mov_b32_e32 v0, s4 1600; GFX1064-NEXT: s_mov_b32 s7, 0x31016000 1601; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1602; GFX1064-NEXT: s_mov_b32 s4, s2 1603; GFX1064-NEXT: s_mov_b32 s5, s3 1604; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1605; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 1606; GFX1064-NEXT: buffer_atomic_sub v0, off, s[4:7], 0 glc 1607; GFX1064-NEXT: s_waitcnt vmcnt(0) 1608; GFX1064-NEXT: buffer_gl0_inv 1609; GFX1064-NEXT: buffer_gl1_inv 1610; GFX1064-NEXT: .LBB8_2: 1611; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 1612; GFX1064-NEXT: s_or_b64 exec, exec, s[8:9] 1613; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1614; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 1615; GFX1064-NEXT: v_mov_b32_e32 v0, v3 1616; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 1617; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s2, v0 1618; GFX1064-NEXT: s_mov_b32 s2, s6 1619; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 1620; GFX1064-NEXT: s_endpgm 1621; 1622; 
GFX1032-LABEL: sub_i32_varying: 1623; GFX1032: ; %bb.0: ; %entry 1624; GFX1032-NEXT: v_mov_b32_e32 v1, v0 1625; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 1626; GFX1032-NEXT: v_mov_b32_e32 v1, 0 1627; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 1628; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 1629; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 1630; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 1631; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 1632; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 1633; GFX1032-NEXT: v_mov_b32_e32 v2, v1 1634; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 1635; GFX1032-NEXT: s_mov_b32 exec_lo, s2 1636; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1637; GFX1032-NEXT: s_or_saveexec_b32 s4, -1 1638; GFX1032-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 1639; GFX1032-NEXT: v_mov_b32_e32 v3, 0 1640; GFX1032-NEXT: v_readlane_b32 s5, v1, 15 1641; GFX1032-NEXT: v_readlane_b32 s6, v1, 31 1642; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 1643; GFX1032-NEXT: s_mov_b32 exec_lo, s4 1644; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 1645; GFX1032-NEXT: s_or_saveexec_b32 s4, -1 1646; GFX1032-NEXT: v_writelane_b32 v3, s5, 16 1647; GFX1032-NEXT: s_mov_b32 exec_lo, s4 1648; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 1649; GFX1032-NEXT: s_mov_b32 s4, s6 1650; GFX1032-NEXT: s_mov_b32 s6, -1 1651; GFX1032-NEXT: ; implicit-def: $vgpr0 1652; GFX1032-NEXT: s_and_saveexec_b32 s8, vcc_lo 1653; GFX1032-NEXT: s_cbranch_execz .LBB8_2 1654; GFX1032-NEXT: ; %bb.1: 1655; GFX1032-NEXT: v_mov_b32_e32 v0, s4 1656; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 1657; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1658; GFX1032-NEXT: s_mov_b32 s4, s2 1659; GFX1032-NEXT: s_mov_b32 s5, s3 1660; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1661; 
GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 1662; GFX1032-NEXT: buffer_atomic_sub v0, off, s[4:7], 0 glc 1663; GFX1032-NEXT: s_waitcnt vmcnt(0) 1664; GFX1032-NEXT: buffer_gl0_inv 1665; GFX1032-NEXT: buffer_gl1_inv 1666; GFX1032-NEXT: .LBB8_2: 1667; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 1668; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s8 1669; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1670; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 1671; GFX1032-NEXT: v_mov_b32_e32 v0, v3 1672; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 1673; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s2, v0 1674; GFX1032-NEXT: s_mov_b32 s2, s6 1675; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 1676; GFX1032-NEXT: s_endpgm 1677entry: 1678 %lane = call i32 @llvm.amdgcn.workitem.id.x() 1679 %old = atomicrmw sub i32 addrspace(1)* %inout, i32 %lane acq_rel 1680 store i32 %old, i32 addrspace(1)* %out 1681 ret void 1682} 1683
; sub_i64_constant -- 64-bit `atomicrmw sub` of the constant 5 on a global
; (addrspace(1)) pointer with acq_rel ordering; the returned old value is
; stored to %out.
; In every llc configuration the expectations below show the same shape:
;   * a copy of exec is taken and mbcnt is computed from it; only the lane
;     whose mbcnt result is 0 enters the block that issues the atomic;
;   * that lane issues one buffer_atomic_sub_x2 whose low dword is
;     s_bcnt1(saved exec) * 5 and whose high dword is 0;
;   * after exec is restored, each lane rebuilds its own result as
;     readfirstlane(old) - 5 * mbcnt using a sub / sub-with-borrow pair.
; The assertions are autogenerated by utils/update_llc_test_checks.py;
; regenerate them with that script instead of editing them by hand.
1684define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out, i64 addrspace(1)* %inout) { 1685; GFX7LESS-LABEL: sub_i64_constant: 1686; GFX7LESS: ; %bb.0: ; %entry 1687; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec 1688; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 1689; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 1690; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s7, v0 1691; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 1692; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 1693; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc 1694; GFX7LESS-NEXT: s_cbranch_execz .LBB9_2 1695; GFX7LESS-NEXT: ; %bb.1: 1696; GFX7LESS-NEXT: s_mov_b32 s11, 0xf000 1697; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 1698; GFX7LESS-NEXT: s_mul_i32 s6, s6, 5 1699; GFX7LESS-NEXT: s_mov_b32 s10, -1 1700; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1701; GFX7LESS-NEXT: s_mov_b32 s8, s2 1702; GFX7LESS-NEXT: s_mov_b32 s9, s3 1703; GFX7LESS-NEXT: v_mov_b32_e32 v0, s6 1704; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 1705; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1706; GFX7LESS-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc 1707;
GFX7LESS-NEXT: s_waitcnt vmcnt(0) 1708; GFX7LESS-NEXT: buffer_wbinvl1 1709; GFX7LESS-NEXT: .LBB9_2: 1710; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] 1711; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1712; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 1713; GFX7LESS-NEXT: s_mov_b32 s2, -1 1714; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 1715; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 1716; GFX7LESS-NEXT: s_waitcnt expcnt(0) 1717; GFX7LESS-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 1718; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v2 1719; GFX7LESS-NEXT: v_mov_b32_e32 v2, s5 1720; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 1721; GFX7LESS-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc 1722; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1723; GFX7LESS-NEXT: s_endpgm 1724; 1725; GFX8-LABEL: sub_i64_constant: 1726; GFX8: ; %bb.0: ; %entry 1727; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1728; GFX8-NEXT: s_mov_b64 s[6:7], exec 1729; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 1730; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 1731; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 1732; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 1733; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 1734; GFX8-NEXT: s_cbranch_execz .LBB9_2 1735; GFX8-NEXT: ; %bb.1: 1736; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1737; GFX8-NEXT: s_mov_b32 s8, s2 1738; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[6:7] 1739; GFX8-NEXT: s_mul_i32 s2, s2, 5 1740; GFX8-NEXT: s_mov_b32 s11, 0xf000 1741; GFX8-NEXT: s_mov_b32 s10, -1 1742; GFX8-NEXT: s_mov_b32 s9, s3 1743; GFX8-NEXT: v_mov_b32_e32 v0, s2 1744; GFX8-NEXT: v_mov_b32_e32 v1, 0 1745; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1746; GFX8-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc 1747; GFX8-NEXT: s_waitcnt vmcnt(0) 1748; GFX8-NEXT: buffer_wbinvl1_vol 1749; GFX8-NEXT: .LBB9_2: 1750; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 1751; GFX8-NEXT: v_readfirstlane_b32 s4, v0 1752; GFX8-NEXT: v_readfirstlane_b32 s5, v1 1753; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v2 1754; GFX8-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 1755;
GFX8-NEXT: v_mov_b32_e32 v2, s5 1756; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s4, v0 1757; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1758; GFX8-NEXT: s_mov_b32 s3, 0xf000 1759; GFX8-NEXT: s_mov_b32 s2, -1 1760; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc 1761; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1762; GFX8-NEXT: s_endpgm 1763; 1764; GFX9-LABEL: sub_i64_constant: 1765; GFX9: ; %bb.0: ; %entry 1766; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1767; GFX9-NEXT: s_mov_b64 s[6:7], exec 1768; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 1769; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 1770; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 1771; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 1772; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 1773; GFX9-NEXT: s_cbranch_execz .LBB9_2 1774; GFX9-NEXT: ; %bb.1: 1775; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1776; GFX9-NEXT: s_mov_b32 s8, s2 1777; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[6:7] 1778; GFX9-NEXT: s_mul_i32 s2, s2, 5 1779; GFX9-NEXT: s_mov_b32 s11, 0xf000 1780; GFX9-NEXT: s_mov_b32 s10, -1 1781; GFX9-NEXT: s_mov_b32 s9, s3 1782; GFX9-NEXT: v_mov_b32_e32 v0, s2 1783; GFX9-NEXT: v_mov_b32_e32 v1, 0 1784; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1785; GFX9-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc 1786; GFX9-NEXT: s_waitcnt vmcnt(0) 1787; GFX9-NEXT: buffer_wbinvl1_vol 1788; GFX9-NEXT: .LBB9_2: 1789; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 1790; GFX9-NEXT: v_readfirstlane_b32 s4, v0 1791; GFX9-NEXT: v_readfirstlane_b32 s5, v1 1792; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v2 1793; GFX9-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 1794; GFX9-NEXT: v_mov_b32_e32 v2, s5 1795; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s4, v0 1796; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1797; GFX9-NEXT: s_mov_b32 s3, 0xf000 1798; GFX9-NEXT: s_mov_b32 s2, -1 1799; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc 1800; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1801; GFX9-NEXT: s_endpgm 1802; 1803; GFX1064-LABEL: sub_i64_constant: 1804; GFX1064: ; %bb.0: ; %entry 1805;
GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1806; GFX1064-NEXT: s_mov_b64 s[6:7], exec 1807; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 1808; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 1809; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 1810; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 1811; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 1812; GFX1064-NEXT: s_cbranch_execz .LBB9_2 1813; GFX1064-NEXT: ; %bb.1: 1814; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 1815; GFX1064-NEXT: v_mov_b32_e32 v1, 0 1816; GFX1064-NEXT: s_mul_i32 s6, s6, 5 1817; GFX1064-NEXT: s_mov_b32 s11, 0x31016000 1818; GFX1064-NEXT: v_mov_b32_e32 v0, s6 1819; GFX1064-NEXT: s_mov_b32 s10, -1 1820; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1821; GFX1064-NEXT: s_mov_b32 s8, s2 1822; GFX1064-NEXT: s_mov_b32 s9, s3 1823; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1824; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 1825; GFX1064-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc 1826; GFX1064-NEXT: s_waitcnt vmcnt(0) 1827; GFX1064-NEXT: buffer_gl0_inv 1828; GFX1064-NEXT: buffer_gl1_inv 1829; GFX1064-NEXT: .LBB9_2: 1830; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 1831; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 1832; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1833; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 1834; GFX1064-NEXT: v_mul_u32_u24_e32 v0, 5, v2 1835; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 1836; GFX1064-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 1837; GFX1064-NEXT: v_sub_co_u32 v0, vcc, s2, v0 1838; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc 1839; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 1840; GFX1064-NEXT: s_mov_b32 s2, -1 1841; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1842; GFX1064-NEXT: s_endpgm 1843; 1844; GFX1032-LABEL: sub_i64_constant: 1845; GFX1032: ; %bb.0: ; %entry 1846; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1847; GFX1032-NEXT: s_mov_b32 s5, exec_lo 1848; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 1849; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0 1850; GFX1032-NEXT:
v_cmp_eq_u32_e32 vcc_lo, 0, v2 1851; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo 1852; GFX1032-NEXT: s_cbranch_execz .LBB9_2 1853; GFX1032-NEXT: ; %bb.1: 1854; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s5 1855; GFX1032-NEXT: v_mov_b32_e32 v1, 0 1856; GFX1032-NEXT: s_mul_i32 s5, s5, 5 1857; GFX1032-NEXT: s_mov_b32 s11, 0x31016000 1858; GFX1032-NEXT: v_mov_b32_e32 v0, s5 1859; GFX1032-NEXT: s_mov_b32 s10, -1 1860; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1861; GFX1032-NEXT: s_mov_b32 s8, s2 1862; GFX1032-NEXT: s_mov_b32 s9, s3 1863; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1864; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 1865; GFX1032-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc 1866; GFX1032-NEXT: s_waitcnt vmcnt(0) 1867; GFX1032-NEXT: buffer_gl0_inv 1868; GFX1032-NEXT: buffer_gl1_inv 1869; GFX1032-NEXT: .LBB9_2: 1870; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 1871; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 1872; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1873; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 1874; GFX1032-NEXT: v_mul_u32_u24_e32 v0, 5, v2 1875; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 1876; GFX1032-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 1877; GFX1032-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v0 1878; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo 1879; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 1880; GFX1032-NEXT: s_mov_b32 s2, -1 1881; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1882; GFX1032-NEXT: s_endpgm 1883entry: 1884 %old = atomicrmw sub i64 addrspace(1)* %inout, i64 5 acq_rel 1885 store i64 %old, i64 addrspace(1)* %out 1886 ret void 1887} 1888
; sub_i64_uniform -- 64-bit `atomicrmw sub` on a global (addrspace(1)) pointer
; whose operand is the i64 kernel argument %subitive, with acq_rel ordering;
; the returned old value is stored to %out.
; In every llc configuration the expectations below show the same shape:
;   * mbcnt is computed from a copy of exec and only the lane whose mbcnt
;     result is 0 enters the block that issues the atomic;
;   * that lane issues one buffer_atomic_sub_x2 whose 64-bit operand is
;     %subitive multiplied by s_bcnt1(saved exec);
;   * after exec is restored, each lane computes
;     readfirstlane(old) - %subitive * mbcnt with a 64-bit multiply followed
;     by a sub / sub-with-borrow pair, and stores that value.
; The assertions are autogenerated by utils/update_llc_test_checks.py;
; regenerate them with that script instead of editing them by hand.
1889define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 addrspace(1)* %inout, i64 %subitive) { 1890; GFX7LESS-LABEL: sub_i64_uniform: 1891; GFX7LESS: ; %bb.0: ; %entry 1892; GFX7LESS-NEXT: s_mov_b64 s[8:9], exec 1893; GFX7LESS-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 1894; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 1895; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s8, 0 1896;
GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s9, v0 1897; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 1898; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 1899; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc 1900; GFX7LESS-NEXT: s_cbranch_execz .LBB10_2 1901; GFX7LESS-NEXT: ; %bb.1: 1902; GFX7LESS-NEXT: s_mov_b32 s15, 0xf000 1903; GFX7LESS-NEXT: s_mov_b32 s14, -1 1904; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1905; GFX7LESS-NEXT: s_mov_b32 s12, s6 1906; GFX7LESS-NEXT: s_mov_b32 s13, s7 1907; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[8:9] 1908; GFX7LESS-NEXT: s_mul_i32 s7, s1, s6 1909; GFX7LESS-NEXT: v_mov_b32_e32 v0, s6 1910; GFX7LESS-NEXT: v_mul_hi_u32 v0, s0, v0 1911; GFX7LESS-NEXT: s_mul_i32 s6, s0, s6 1912; GFX7LESS-NEXT: v_add_i32_e32 v1, vcc, s7, v0 1913; GFX7LESS-NEXT: v_mov_b32_e32 v0, s6 1914; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1915; GFX7LESS-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[12:15], 0 glc 1916; GFX7LESS-NEXT: s_waitcnt vmcnt(0) 1917; GFX7LESS-NEXT: buffer_wbinvl1 1918; GFX7LESS-NEXT: .LBB10_2: 1919; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] 1920; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1921; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 1922; GFX7LESS-NEXT: s_mov_b32 s6, -1 1923; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v0 1924; GFX7LESS-NEXT: v_readfirstlane_b32 s3, v1 1925; GFX7LESS-NEXT: s_waitcnt expcnt(0) 1926; GFX7LESS-NEXT: v_mul_lo_u32 v0, s1, v2 1927; GFX7LESS-NEXT: v_mul_hi_u32 v1, s0, v2 1928; GFX7LESS-NEXT: v_mul_lo_u32 v2, s0, v2 1929; GFX7LESS-NEXT: v_add_i32_e32 v1, vcc, v1, v0 1930; GFX7LESS-NEXT: v_mov_b32_e32 v3, s3 1931; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s2, v2 1932; GFX7LESS-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc 1933; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1934; GFX7LESS-NEXT: s_endpgm 1935; 1936; GFX8-LABEL: sub_i64_uniform: 1937; GFX8: ; %bb.0: ; %entry 1938; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1939; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 1940; GFX8-NEXT: s_mov_b64 s[8:9], exec 1941; GFX8-NEXT:
v_mbcnt_lo_u32_b32 v0, s8, 0 1942; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0 1943; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 1944; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 1945; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 1946; GFX8-NEXT: s_cbranch_execz .LBB10_2 1947; GFX8-NEXT: ; %bb.1: 1948; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1949; GFX8-NEXT: s_mov_b32 s12, s6 1950; GFX8-NEXT: s_bcnt1_i32_b64 s6, s[8:9] 1951; GFX8-NEXT: v_mov_b32_e32 v0, s6 1952; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[8:9], s0, v0, 0 1953; GFX8-NEXT: s_mul_i32 s6, s1, s6 1954; GFX8-NEXT: s_mov_b32 s15, 0xf000 1955; GFX8-NEXT: s_mov_b32 s14, -1 1956; GFX8-NEXT: s_mov_b32 s13, s7 1957; GFX8-NEXT: v_add_u32_e32 v1, vcc, s6, v1 1958; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1959; GFX8-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[12:15], 0 glc 1960; GFX8-NEXT: s_waitcnt vmcnt(0) 1961; GFX8-NEXT: buffer_wbinvl1_vol 1962; GFX8-NEXT: .LBB10_2: 1963; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 1964; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1965; GFX8-NEXT: v_mul_lo_u32 v4, s1, v2 1966; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s0, v2, 0 1967; GFX8-NEXT: v_readfirstlane_b32 s0, v0 1968; GFX8-NEXT: v_readfirstlane_b32 s1, v1 1969; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v4 1970; GFX8-NEXT: v_mov_b32_e32 v3, s1 1971; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s0, v2 1972; GFX8-NEXT: s_mov_b32 s7, 0xf000 1973; GFX8-NEXT: s_mov_b32 s6, -1 1974; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc 1975; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1976; GFX8-NEXT: s_endpgm 1977; 1978; GFX9-LABEL: sub_i64_uniform: 1979; GFX9: ; %bb.0: ; %entry 1980; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1981; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 1982; GFX9-NEXT: s_mov_b64 s[8:9], exec 1983; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 1984; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0 1985; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 1986; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 1987; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc 1988; GFX9-NEXT: s_cbranch_execz
.LBB10_2 1989; GFX9-NEXT: ; %bb.1: 1990; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1991; GFX9-NEXT: s_mov_b32 s12, s6 1992; GFX9-NEXT: s_bcnt1_i32_b64 s6, s[8:9] 1993; GFX9-NEXT: s_mov_b32 s13, s7 1994; GFX9-NEXT: s_mul_i32 s7, s3, s6 1995; GFX9-NEXT: s_mul_hi_u32 s8, s2, s6 1996; GFX9-NEXT: s_add_i32 s8, s8, s7 1997; GFX9-NEXT: s_mul_i32 s6, s2, s6 1998; GFX9-NEXT: s_mov_b32 s15, 0xf000 1999; GFX9-NEXT: s_mov_b32 s14, -1 2000; GFX9-NEXT: v_mov_b32_e32 v0, s6 2001; GFX9-NEXT: v_mov_b32_e32 v1, s8 2002; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2003; GFX9-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[12:15], 0 glc 2004; GFX9-NEXT: s_waitcnt vmcnt(0) 2005; GFX9-NEXT: buffer_wbinvl1_vol 2006; GFX9-NEXT: .LBB10_2: 2007; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] 2008; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2009; GFX9-NEXT: v_mul_lo_u32 v4, s3, v2 2010; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v2, 0 2011; GFX9-NEXT: v_readfirstlane_b32 s0, v0 2012; GFX9-NEXT: v_readfirstlane_b32 s1, v1 2013; GFX9-NEXT: v_add_u32_e32 v1, v3, v4 2014; GFX9-NEXT: v_mov_b32_e32 v3, s1 2015; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s0, v2 2016; GFX9-NEXT: s_mov_b32 s7, 0xf000 2017; GFX9-NEXT: s_mov_b32 s6, -1 2018; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v1, vcc 2019; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 2020; GFX9-NEXT: s_endpgm 2021; 2022; GFX1064-LABEL: sub_i64_uniform: 2023; GFX1064: ; %bb.0: ; %entry 2024; GFX1064-NEXT: s_clause 0x1 2025; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 2026; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 2027; GFX1064-NEXT: s_mov_b64 s[8:9], exec 2028; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 2029; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0 2030; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 2031; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 2032; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc 2033; GFX1064-NEXT: s_cbranch_execz .LBB10_2 2034; GFX1064-NEXT: ; %bb.1: 2035; GFX1064-NEXT: s_bcnt1_i32_b64 s8, s[8:9] 2036; GFX1064-NEXT: s_mov_b32 s11, 0x31016000
2037; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2038; GFX1064-NEXT: s_mul_i32 s9, s3, s8 2039; GFX1064-NEXT: s_mul_hi_u32 s10, s2, s8 2040; GFX1064-NEXT: s_mul_i32 s8, s2, s8 2041; GFX1064-NEXT: s_add_i32 s10, s10, s9 2042; GFX1064-NEXT: v_mov_b32_e32 v0, s8 2043; GFX1064-NEXT: v_mov_b32_e32 v1, s10 2044; GFX1064-NEXT: s_mov_b32 s10, -1 2045; GFX1064-NEXT: s_mov_b32 s8, s6 2046; GFX1064-NEXT: s_mov_b32 s9, s7 2047; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2048; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 2049; GFX1064-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc 2050; GFX1064-NEXT: s_waitcnt vmcnt(0) 2051; GFX1064-NEXT: buffer_gl0_inv 2052; GFX1064-NEXT: buffer_gl1_inv 2053; GFX1064-NEXT: .LBB10_2: 2054; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 2055; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] 2056; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2057; GFX1064-NEXT: v_mul_lo_u32 v4, s3, v2 2058; GFX1064-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v2, 0 2059; GFX1064-NEXT: v_readfirstlane_b32 s0, v0 2060; GFX1064-NEXT: v_readfirstlane_b32 s1, v1 2061; GFX1064-NEXT: s_mov_b32 s7, 0x31016000 2062; GFX1064-NEXT: s_mov_b32 s6, -1 2063; GFX1064-NEXT: v_add_nc_u32_e32 v1, v3, v4 2064; GFX1064-NEXT: v_sub_co_u32 v0, vcc, s0, v2 2065; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s1, v1, vcc 2066; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 2067; GFX1064-NEXT: s_endpgm 2068; 2069; GFX1032-LABEL: sub_i64_uniform: 2070; GFX1032: ; %bb.0: ; %entry 2071; GFX1032-NEXT: s_clause 0x1 2072; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 2073; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 2074; GFX1032-NEXT: s_mov_b32 s8, exec_lo 2075; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 2076; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s8, 0 2077; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 2078; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo 2079; GFX1032-NEXT: s_cbranch_execz .LBB10_2 2080; GFX1032-NEXT: ; %bb.1: 2081; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s8 2082; GFX1032-NEXT: s_mov_b32 s11, 0x31016000
2083; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2084; GFX1032-NEXT: s_mul_i32 s8, s3, s1 2085; GFX1032-NEXT: s_mul_hi_u32 s9, s2, s1 2086; GFX1032-NEXT: s_mul_i32 s1, s2, s1 2087; GFX1032-NEXT: s_add_i32 s9, s9, s8 2088; GFX1032-NEXT: v_mov_b32_e32 v0, s1 2089; GFX1032-NEXT: v_mov_b32_e32 v1, s9 2090; GFX1032-NEXT: s_mov_b32 s10, -1 2091; GFX1032-NEXT: s_mov_b32 s8, s6 2092; GFX1032-NEXT: s_mov_b32 s9, s7 2093; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2094; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 2095; GFX1032-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc 2096; GFX1032-NEXT: s_waitcnt vmcnt(0) 2097; GFX1032-NEXT: buffer_gl0_inv 2098; GFX1032-NEXT: buffer_gl1_inv 2099; GFX1032-NEXT: .LBB10_2: 2100; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 2101; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 2102; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2103; GFX1032-NEXT: v_mul_lo_u32 v4, s3, v2 2104; GFX1032-NEXT: v_mad_u64_u32 v[2:3], s0, s2, v2, 0 2105; GFX1032-NEXT: v_readfirstlane_b32 s0, v0 2106; GFX1032-NEXT: v_readfirstlane_b32 s1, v1 2107; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 2108; GFX1032-NEXT: s_mov_b32 s6, -1 2109; GFX1032-NEXT: v_add_nc_u32_e32 v1, v3, v4 2110; GFX1032-NEXT: v_sub_co_u32 v0, vcc_lo, s0, v2 2111; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo 2112; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 2113; GFX1032-NEXT: s_endpgm 2114entry: 2115 %old = atomicrmw sub i64 addrspace(1)* %inout, i64 %subitive acq_rel 2116 store i64 %old, i64 addrspace(1)* %out 2117 ret void 2118} 2119
; sub_i64_varying -- 64-bit `atomicrmw sub` on a global (addrspace(1)) pointer
; whose operand is the zero-extended result of llvm.amdgcn.workitem.id.x, with
; acq_rel ordering; the returned old value is stored to %out.
; The expectations below contain no mbcnt, exec-masking branch or cross-lane
; reduction sequence: the code issues buffer_atomic_sub_x2 directly on v[0:1]
; (v1 is set to 0 beforehand) and stores the returned pair unchanged.
; Only three prefix groups are used for this function (GFX7LESS, GFX89 and
; GFX10), i.e. the GFX8/GFX9 and the two GFX10 wave-size configurations are
; each covered by their shared prefix.
; The assertions are autogenerated by utils/update_llc_test_checks.py;
; regenerate them with that script instead of editing them by hand.
2120define amdgpu_kernel void @sub_i64_varying(i64 addrspace(1)* %out, i64 addrspace(1)* %inout) { 2121; GFX7LESS-LABEL: sub_i64_varying: 2122; GFX7LESS: ; %bb.0: ; %entry 2123; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 2124; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 2125; GFX7LESS-NEXT: s_mov_b32 s6, -1 2126; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 2127; GFX7LESS-NEXT: s_mov_b32 s10, s6 2128; GFX7LESS-NEXT: s_mov_b32 s11, s7 2129; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
2130; GFX7LESS-NEXT: s_mov_b32 s8, s2 2131; GFX7LESS-NEXT: s_mov_b32 s9, s3 2132; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2133; GFX7LESS-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc 2134; GFX7LESS-NEXT: s_waitcnt vmcnt(0) 2135; GFX7LESS-NEXT: buffer_wbinvl1 2136; GFX7LESS-NEXT: s_mov_b32 s4, s0 2137; GFX7LESS-NEXT: s_mov_b32 s5, s1 2138; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 2139; GFX7LESS-NEXT: s_endpgm 2140; 2141; GFX89-LABEL: sub_i64_varying: 2142; GFX89: ; %bb.0: ; %entry 2143; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 2144; GFX89-NEXT: s_mov_b32 s7, 0xf000 2145; GFX89-NEXT: s_mov_b32 s6, -1 2146; GFX89-NEXT: s_mov_b32 s10, s6 2147; GFX89-NEXT: s_mov_b32 s11, s7 2148; GFX89-NEXT: s_waitcnt lgkmcnt(0) 2149; GFX89-NEXT: s_mov_b32 s8, s2 2150; GFX89-NEXT: s_mov_b32 s9, s3 2151; GFX89-NEXT: v_mov_b32_e32 v1, 0 2152; GFX89-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2153; GFX89-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc 2154; GFX89-NEXT: s_waitcnt vmcnt(0) 2155; GFX89-NEXT: buffer_wbinvl1_vol 2156; GFX89-NEXT: s_mov_b32 s4, s0 2157; GFX89-NEXT: s_mov_b32 s5, s1 2158; GFX89-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 2159; GFX89-NEXT: s_endpgm 2160; 2161; GFX10-LABEL: sub_i64_varying: 2162; GFX10: ; %bb.0: ; %entry 2163; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 2164; GFX10-NEXT: v_mov_b32_e32 v1, 0 2165; GFX10-NEXT: s_mov_b32 s7, 0x31016000 2166; GFX10-NEXT: s_mov_b32 s6, -1 2167; GFX10-NEXT: s_mov_b32 s11, s7 2168; GFX10-NEXT: s_mov_b32 s10, s6 2169; GFX10-NEXT: s_waitcnt lgkmcnt(0) 2170; GFX10-NEXT: s_mov_b32 s8, s2 2171; GFX10-NEXT: s_mov_b32 s9, s3 2172; GFX10-NEXT: s_mov_b32 s4, s0 2173; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2174; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 2175; GFX10-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc 2176; GFX10-NEXT: s_waitcnt vmcnt(0) 2177; GFX10-NEXT: buffer_gl0_inv 2178; GFX10-NEXT: buffer_gl1_inv 2179; GFX10-NEXT: s_mov_b32 s5, s1 2180; GFX10-NEXT:
buffer_store_dwordx2 v[0:1], off, s[4:7], 0 2181; GFX10-NEXT: s_endpgm 2182entry: 2183 %lane = call i32 @llvm.amdgcn.workitem.id.x() 2184 %zext = zext i32 %lane to i64 2185 %old = atomicrmw sub i64 addrspace(1)* %inout, i64 %zext acq_rel 2186 store i64 %old, i64 addrspace(1)* %out 2187 ret void 2188} 2189