1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc -march=amdgcn -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX6 %s 3; RUN: llc -march=amdgcn -mcpu=tonga -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s 4; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s 5; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W64 %s 6; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W32 %s 7 8declare i32 @llvm.amdgcn.workitem.id.x() 9declare i32 @llvm.amdgcn.raw.buffer.atomic.add(i32, <4 x i32>, i32, i32, i32) 10declare i32 @llvm.amdgcn.raw.buffer.atomic.sub(i32, <4 x i32>, i32, i32, i32) 11 12; Show what the atomic optimization pass (-amdgpu-atomic-optimizations) does to raw buffer atomic add/sub when the data operand is constant, uniform, or varying per lane, and when the offset is varying per lane. 
13 14define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out, <4 x i32> %inout) { 15; GFX6-LABEL: add_i32_constant: 16; GFX6: ; %bb.0: ; %entry 17; GFX6-NEXT: s_mov_b64 s[2:3], exec 18; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 19; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 20; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 21; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 22; GFX6-NEXT: ; implicit-def: $vgpr1 23; GFX6-NEXT: s_and_saveexec_b64 s[6:7], vcc 24; GFX6-NEXT: s_cbranch_execz .LBB0_2 25; GFX6-NEXT: ; %bb.1: 26; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd 27; GFX6-NEXT: s_bcnt1_i32_b64 s0, s[2:3] 28; GFX6-NEXT: s_mul_i32 s0, s0, 5 29; GFX6-NEXT: v_mov_b32_e32 v1, s0 30; GFX6-NEXT: s_waitcnt lgkmcnt(0) 31; GFX6-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc 32; GFX6-NEXT: .LBB0_2: 33; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] 34; GFX6-NEXT: s_waitcnt vmcnt(0) 35; GFX6-NEXT: v_readfirstlane_b32 s0, v1 36; GFX6-NEXT: s_mov_b32 s7, 0xf000 37; GFX6-NEXT: v_mad_u32_u24 v0, v0, 5, s0 38; GFX6-NEXT: s_mov_b32 s6, -1 39; GFX6-NEXT: s_waitcnt lgkmcnt(0) 40; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 41; GFX6-NEXT: s_endpgm 42; 43; GFX8-LABEL: add_i32_constant: 44; GFX8: ; %bb.0: ; %entry 45; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 46; GFX8-NEXT: s_mov_b64 s[6:7], exec 47; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 48; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 49; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 50; GFX8-NEXT: ; implicit-def: $vgpr1 51; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 52; GFX8-NEXT: s_cbranch_execz .LBB0_2 53; GFX8-NEXT: ; %bb.1: 54; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 55; GFX8-NEXT: s_bcnt1_i32_b64 s0, s[6:7] 56; GFX8-NEXT: s_mul_i32 s0, s0, 5 57; GFX8-NEXT: v_mov_b32_e32 v1, s0 58; GFX8-NEXT: s_waitcnt lgkmcnt(0) 59; GFX8-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc 60; GFX8-NEXT: .LBB0_2: 61; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 62; GFX8-NEXT: s_waitcnt vmcnt(0) 63; GFX8-NEXT: v_readfirstlane_b32 s0, v1 64; 
GFX8-NEXT: v_mad_u32_u24 v2, v0, 5, s0 65; GFX8-NEXT: s_waitcnt lgkmcnt(0) 66; GFX8-NEXT: v_mov_b32_e32 v0, s2 67; GFX8-NEXT: v_mov_b32_e32 v1, s3 68; GFX8-NEXT: flat_store_dword v[0:1], v2 69; GFX8-NEXT: s_endpgm 70; 71; GFX9-LABEL: add_i32_constant: 72; GFX9: ; %bb.0: ; %entry 73; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 74; GFX9-NEXT: s_mov_b64 s[6:7], exec 75; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 76; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 77; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 78; GFX9-NEXT: ; implicit-def: $vgpr1 79; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 80; GFX9-NEXT: s_cbranch_execz .LBB0_2 81; GFX9-NEXT: ; %bb.1: 82; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 83; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[6:7] 84; GFX9-NEXT: s_mul_i32 s0, s0, 5 85; GFX9-NEXT: v_mov_b32_e32 v1, s0 86; GFX9-NEXT: s_waitcnt lgkmcnt(0) 87; GFX9-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc 88; GFX9-NEXT: .LBB0_2: 89; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 90; GFX9-NEXT: s_waitcnt vmcnt(0) 91; GFX9-NEXT: v_readfirstlane_b32 s0, v1 92; GFX9-NEXT: v_mad_u32_u24 v0, v0, 5, s0 93; GFX9-NEXT: v_mov_b32_e32 v1, 0 94; GFX9-NEXT: s_waitcnt lgkmcnt(0) 95; GFX9-NEXT: global_store_dword v1, v0, s[2:3] 96; GFX9-NEXT: s_endpgm 97; 98; GFX10W64-LABEL: add_i32_constant: 99; GFX10W64: ; %bb.0: ; %entry 100; GFX10W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 101; GFX10W64-NEXT: s_mov_b64 s[6:7], exec 102; GFX10W64-NEXT: ; implicit-def: $vgpr1 103; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 104; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 105; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 106; GFX10W64-NEXT: s_and_saveexec_b64 s[4:5], vcc 107; GFX10W64-NEXT: s_cbranch_execz .LBB0_2 108; GFX10W64-NEXT: ; %bb.1: 109; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 110; GFX10W64-NEXT: s_bcnt1_i32_b64 s0, s[6:7] 111; GFX10W64-NEXT: s_mul_i32 s0, s0, 5 112; GFX10W64-NEXT: v_mov_b32_e32 v1, s0 113; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) 114; GFX10W64-NEXT: buffer_atomic_add v1, off, 
s[8:11], 0 glc 115; GFX10W64-NEXT: .LBB0_2: 116; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 117; GFX10W64-NEXT: s_or_b64 exec, exec, s[4:5] 118; GFX10W64-NEXT: s_waitcnt vmcnt(0) 119; GFX10W64-NEXT: v_readfirstlane_b32 s0, v1 120; GFX10W64-NEXT: v_mov_b32_e32 v1, 0 121; GFX10W64-NEXT: v_mad_u32_u24 v0, v0, 5, s0 122; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) 123; GFX10W64-NEXT: global_store_dword v1, v0, s[2:3] 124; GFX10W64-NEXT: s_endpgm 125; 126; GFX10W32-LABEL: add_i32_constant: 127; GFX10W32: ; %bb.0: ; %entry 128; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 129; GFX10W32-NEXT: s_mov_b32 s5, exec_lo 130; GFX10W32-NEXT: ; implicit-def: $vgpr1 131; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 132; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 133; GFX10W32-NEXT: s_and_saveexec_b32 s4, vcc_lo 134; GFX10W32-NEXT: s_cbranch_execz .LBB0_2 135; GFX10W32-NEXT: ; %bb.1: 136; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 137; GFX10W32-NEXT: s_bcnt1_i32_b32 s0, s5 138; GFX10W32-NEXT: s_mul_i32 s0, s0, 5 139; GFX10W32-NEXT: v_mov_b32_e32 v1, s0 140; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) 141; GFX10W32-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc 142; GFX10W32-NEXT: .LBB0_2: 143; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 144; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s4 145; GFX10W32-NEXT: s_waitcnt vmcnt(0) 146; GFX10W32-NEXT: v_readfirstlane_b32 s0, v1 147; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 148; GFX10W32-NEXT: v_mad_u32_u24 v0, v0, 5, s0 149; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) 150; GFX10W32-NEXT: global_store_dword v1, v0, s[2:3] 151; GFX10W32-NEXT: s_endpgm 152entry: 153 %old = call i32 @llvm.amdgcn.raw.buffer.atomic.add(i32 5, <4 x i32> %inout, i32 0, i32 0, i32 0) 154 store i32 %old, i32 addrspace(1)* %out 155 ret void 156} 157 158define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, <4 x i32> %inout, i32 %additive) { 159; GFX6-LABEL: add_i32_uniform: 160; GFX6: ; %bb.0: ; %entry 161; GFX6-NEXT: s_mov_b64 s[2:3], exec 162; GFX6-NEXT: s_load_dwordx2 
s[4:5], s[0:1], 0x9 163; GFX6-NEXT: s_load_dword s8, s[0:1], 0x11 164; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 165; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 166; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 167; GFX6-NEXT: ; implicit-def: $vgpr1 168; GFX6-NEXT: s_and_saveexec_b64 s[6:7], vcc 169; GFX6-NEXT: s_cbranch_execz .LBB1_2 170; GFX6-NEXT: ; %bb.1: 171; GFX6-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0xd 172; GFX6-NEXT: s_bcnt1_i32_b64 s0, s[2:3] 173; GFX6-NEXT: s_waitcnt lgkmcnt(0) 174; GFX6-NEXT: s_mul_i32 s0, s8, s0 175; GFX6-NEXT: v_mov_b32_e32 v1, s0 176; GFX6-NEXT: buffer_atomic_add v1, off, s[12:15], 0 glc 177; GFX6-NEXT: .LBB1_2: 178; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] 179; GFX6-NEXT: s_waitcnt vmcnt(0) 180; GFX6-NEXT: v_readfirstlane_b32 s0, v1 181; GFX6-NEXT: s_waitcnt lgkmcnt(0) 182; GFX6-NEXT: v_mul_lo_u32 v0, s8, v0 183; GFX6-NEXT: s_mov_b32 s7, 0xf000 184; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v0 185; GFX6-NEXT: s_mov_b32 s6, -1 186; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 187; GFX6-NEXT: s_endpgm 188; 189; GFX8-LABEL: add_i32_uniform: 190; GFX8: ; %bb.0: ; %entry 191; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 192; GFX8-NEXT: s_load_dword s8, s[0:1], 0x44 193; GFX8-NEXT: s_mov_b64 s[4:5], exec 194; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 195; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 196; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 197; GFX8-NEXT: ; implicit-def: $vgpr1 198; GFX8-NEXT: s_and_saveexec_b64 s[6:7], vcc 199; GFX8-NEXT: s_cbranch_execz .LBB1_2 200; GFX8-NEXT: ; %bb.1: 201; GFX8-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x34 202; GFX8-NEXT: s_bcnt1_i32_b64 s0, s[4:5] 203; GFX8-NEXT: s_waitcnt lgkmcnt(0) 204; GFX8-NEXT: s_mul_i32 s0, s8, s0 205; GFX8-NEXT: v_mov_b32_e32 v1, s0 206; GFX8-NEXT: buffer_atomic_add v1, off, s[12:15], 0 glc 207; GFX8-NEXT: .LBB1_2: 208; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] 209; GFX8-NEXT: s_waitcnt lgkmcnt(0) 210; GFX8-NEXT: v_mul_lo_u32 v0, s8, v0 211; GFX8-NEXT: s_waitcnt vmcnt(0) 212; GFX8-NEXT: 
v_readfirstlane_b32 s0, v1 213; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v0 214; GFX8-NEXT: v_mov_b32_e32 v0, s2 215; GFX8-NEXT: v_mov_b32_e32 v1, s3 216; GFX8-NEXT: flat_store_dword v[0:1], v2 217; GFX8-NEXT: s_endpgm 218; 219; GFX9-LABEL: add_i32_uniform: 220; GFX9: ; %bb.0: ; %entry 221; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 222; GFX9-NEXT: s_load_dword s8, s[0:1], 0x44 223; GFX9-NEXT: s_mov_b64 s[4:5], exec 224; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 225; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 226; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 227; GFX9-NEXT: ; implicit-def: $vgpr1 228; GFX9-NEXT: s_and_saveexec_b64 s[6:7], vcc 229; GFX9-NEXT: s_cbranch_execz .LBB1_2 230; GFX9-NEXT: ; %bb.1: 231; GFX9-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x34 232; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[4:5] 233; GFX9-NEXT: s_waitcnt lgkmcnt(0) 234; GFX9-NEXT: s_mul_i32 s0, s8, s0 235; GFX9-NEXT: v_mov_b32_e32 v1, s0 236; GFX9-NEXT: buffer_atomic_add v1, off, s[12:15], 0 glc 237; GFX9-NEXT: .LBB1_2: 238; GFX9-NEXT: s_or_b64 exec, exec, s[6:7] 239; GFX9-NEXT: s_waitcnt lgkmcnt(0) 240; GFX9-NEXT: v_mul_lo_u32 v0, s8, v0 241; GFX9-NEXT: s_waitcnt vmcnt(0) 242; GFX9-NEXT: v_readfirstlane_b32 s0, v1 243; GFX9-NEXT: v_mov_b32_e32 v1, 0 244; GFX9-NEXT: v_add_u32_e32 v0, s0, v0 245; GFX9-NEXT: global_store_dword v1, v0, s[2:3] 246; GFX9-NEXT: s_endpgm 247; 248; GFX10W64-LABEL: add_i32_uniform: 249; GFX10W64: ; %bb.0: ; %entry 250; GFX10W64-NEXT: s_clause 0x1 251; GFX10W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 252; GFX10W64-NEXT: s_load_dword s8, s[0:1], 0x44 253; GFX10W64-NEXT: s_mov_b64 s[4:5], exec 254; GFX10W64-NEXT: ; implicit-def: $vgpr1 255; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 256; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 257; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 258; GFX10W64-NEXT: s_and_saveexec_b64 s[6:7], vcc 259; GFX10W64-NEXT: s_cbranch_execz .LBB1_2 260; GFX10W64-NEXT: ; %bb.1: 261; GFX10W64-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x34 262; GFX10W64-NEXT: 
s_bcnt1_i32_b64 s0, s[4:5] 263; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) 264; GFX10W64-NEXT: s_mul_i32 s0, s8, s0 265; GFX10W64-NEXT: v_mov_b32_e32 v1, s0 266; GFX10W64-NEXT: buffer_atomic_add v1, off, s[12:15], 0 glc 267; GFX10W64-NEXT: .LBB1_2: 268; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 269; GFX10W64-NEXT: s_or_b64 exec, exec, s[6:7] 270; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) 271; GFX10W64-NEXT: v_mul_lo_u32 v0, s8, v0 272; GFX10W64-NEXT: s_waitcnt vmcnt(0) 273; GFX10W64-NEXT: v_readfirstlane_b32 s0, v1 274; GFX10W64-NEXT: v_mov_b32_e32 v1, 0 275; GFX10W64-NEXT: v_add_nc_u32_e32 v0, s0, v0 276; GFX10W64-NEXT: global_store_dword v1, v0, s[2:3] 277; GFX10W64-NEXT: s_endpgm 278; 279; GFX10W32-LABEL: add_i32_uniform: 280; GFX10W32: ; %bb.0: ; %entry 281; GFX10W32-NEXT: s_clause 0x1 282; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 283; GFX10W32-NEXT: s_load_dword s4, s[0:1], 0x44 284; GFX10W32-NEXT: s_mov_b32 s6, exec_lo 285; GFX10W32-NEXT: ; implicit-def: $vgpr1 286; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 287; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 288; GFX10W32-NEXT: s_and_saveexec_b32 s5, vcc_lo 289; GFX10W32-NEXT: s_cbranch_execz .LBB1_2 290; GFX10W32-NEXT: ; %bb.1: 291; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 292; GFX10W32-NEXT: s_bcnt1_i32_b32 s0, s6 293; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) 294; GFX10W32-NEXT: s_mul_i32 s0, s4, s0 295; GFX10W32-NEXT: v_mov_b32_e32 v1, s0 296; GFX10W32-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc 297; GFX10W32-NEXT: .LBB1_2: 298; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 299; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s5 300; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) 301; GFX10W32-NEXT: v_mul_lo_u32 v0, s4, v0 302; GFX10W32-NEXT: s_waitcnt vmcnt(0) 303; GFX10W32-NEXT: v_readfirstlane_b32 s0, v1 304; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 305; GFX10W32-NEXT: v_add_nc_u32_e32 v0, s0, v0 306; GFX10W32-NEXT: global_store_dword v1, v0, s[2:3] 307; GFX10W32-NEXT: s_endpgm 308entry: 309 %old = call i32 
@llvm.amdgcn.raw.buffer.atomic.add(i32 %additive, <4 x i32> %inout, i32 0, i32 0, i32 0) 310 store i32 %old, i32 addrspace(1)* %out 311 ret void 312} 313 314define amdgpu_kernel void @add_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i32> %inout) { 315; GFX6-LABEL: add_i32_varying_vdata: 316; GFX6: ; %bb.0: ; %entry 317; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd 318; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 319; GFX6-NEXT: s_waitcnt lgkmcnt(0) 320; GFX6-NEXT: buffer_atomic_add v0, off, s[4:7], 0 glc 321; GFX6-NEXT: s_mov_b32 s3, 0xf000 322; GFX6-NEXT: s_mov_b32 s2, -1 323; GFX6-NEXT: s_waitcnt vmcnt(0) 324; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 325; GFX6-NEXT: s_endpgm 326; 327; GFX8-LABEL: add_i32_varying_vdata: 328; GFX8: ; %bb.0: ; %entry 329; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 330; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 331; GFX8-NEXT: v_mov_b32_e32 v1, 0 332; GFX8-NEXT: s_mov_b64 exec, s[4:5] 333; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 334; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 335; GFX8-NEXT: v_mov_b32_e32 v2, v0 336; GFX8-NEXT: s_not_b64 exec, exec 337; GFX8-NEXT: v_mov_b32_e32 v2, 0 338; GFX8-NEXT: s_not_b64 exec, exec 339; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 340; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 341; GFX8-NEXT: s_nop 1 342; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 343; GFX8-NEXT: s_nop 1 344; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 345; GFX8-NEXT: s_nop 1 346; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 347; GFX8-NEXT: s_nop 1 348; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 349; GFX8-NEXT: s_nop 1 350; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 351; GFX8-NEXT: v_readlane_b32 s6, v2, 63 352; GFX8-NEXT: s_nop 0 353; GFX8-NEXT: 
v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 354; GFX8-NEXT: s_mov_b64 exec, s[4:5] 355; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 356; GFX8-NEXT: ; implicit-def: $vgpr0 357; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 358; GFX8-NEXT: s_cbranch_execz .LBB2_2 359; GFX8-NEXT: ; %bb.1: 360; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 361; GFX8-NEXT: v_mov_b32_e32 v0, s6 362; GFX8-NEXT: s_waitcnt lgkmcnt(0) 363; GFX8-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc 364; GFX8-NEXT: .LBB2_2: 365; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 366; GFX8-NEXT: s_waitcnt vmcnt(0) 367; GFX8-NEXT: v_readfirstlane_b32 s0, v0 368; GFX8-NEXT: v_mov_b32_e32 v0, v1 369; GFX8-NEXT: s_waitcnt lgkmcnt(0) 370; GFX8-NEXT: v_mov_b32_e32 v4, s3 371; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 372; GFX8-NEXT: v_mov_b32_e32 v3, s2 373; GFX8-NEXT: flat_store_dword v[3:4], v0 374; GFX8-NEXT: s_endpgm 375; 376; GFX9-LABEL: add_i32_varying_vdata: 377; GFX9: ; %bb.0: ; %entry 378; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 379; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 380; GFX9-NEXT: v_mov_b32_e32 v1, 0 381; GFX9-NEXT: s_mov_b64 exec, s[4:5] 382; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 383; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 384; GFX9-NEXT: v_mov_b32_e32 v2, v0 385; GFX9-NEXT: s_not_b64 exec, exec 386; GFX9-NEXT: v_mov_b32_e32 v2, 0 387; GFX9-NEXT: s_not_b64 exec, exec 388; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 389; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 390; GFX9-NEXT: s_nop 1 391; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 392; GFX9-NEXT: s_nop 1 393; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 394; GFX9-NEXT: s_nop 1 395; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 396; GFX9-NEXT: s_nop 1 397; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 398; GFX9-NEXT: s_nop 1 399; 
GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 400; GFX9-NEXT: v_readlane_b32 s6, v2, 63 401; GFX9-NEXT: s_nop 0 402; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 403; GFX9-NEXT: s_mov_b64 exec, s[4:5] 404; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 405; GFX9-NEXT: ; implicit-def: $vgpr0 406; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 407; GFX9-NEXT: s_cbranch_execz .LBB2_2 408; GFX9-NEXT: ; %bb.1: 409; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 410; GFX9-NEXT: v_mov_b32_e32 v0, s6 411; GFX9-NEXT: s_waitcnt lgkmcnt(0) 412; GFX9-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc 413; GFX9-NEXT: .LBB2_2: 414; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 415; GFX9-NEXT: s_waitcnt vmcnt(0) 416; GFX9-NEXT: v_readfirstlane_b32 s0, v0 417; GFX9-NEXT: v_mov_b32_e32 v0, v1 418; GFX9-NEXT: v_add_u32_e32 v0, s0, v0 419; GFX9-NEXT: v_mov_b32_e32 v3, 0 420; GFX9-NEXT: s_waitcnt lgkmcnt(0) 421; GFX9-NEXT: global_store_dword v3, v0, s[2:3] 422; GFX9-NEXT: s_endpgm 423; 424; GFX10W64-LABEL: add_i32_varying_vdata: 425; GFX10W64: ; %bb.0: ; %entry 426; GFX10W64-NEXT: v_mov_b32_e32 v1, v0 427; GFX10W64-NEXT: s_not_b64 exec, exec 428; GFX10W64-NEXT: v_mov_b32_e32 v1, 0 429; GFX10W64-NEXT: s_not_b64 exec, exec 430; GFX10W64-NEXT: s_or_saveexec_b64 s[2:3], -1 431; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 432; GFX10W64-NEXT: v_mov_b32_e32 v3, 0 433; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 434; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 435; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 436; GFX10W64-NEXT: v_mov_b32_e32 v2, v1 437; GFX10W64-NEXT: v_permlanex16_b32 v2, v2, -1, -1 438; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 439; GFX10W64-NEXT: v_readlane_b32 s4, v1, 31 440; GFX10W64-NEXT: 
v_mov_b32_e32 v2, s4 441; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 442; GFX10W64-NEXT: v_readlane_b32 s6, v1, 15 443; GFX10W64-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 444; GFX10W64-NEXT: s_mov_b64 exec, s[2:3] 445; GFX10W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 446; GFX10W64-NEXT: s_or_saveexec_b64 s[4:5], -1 447; GFX10W64-NEXT: v_readlane_b32 s7, v1, 31 448; GFX10W64-NEXT: v_writelane_b32 v3, s6, 16 449; GFX10W64-NEXT: s_mov_b64 exec, s[4:5] 450; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 451; GFX10W64-NEXT: s_or_saveexec_b64 s[4:5], -1 452; GFX10W64-NEXT: v_readlane_b32 s6, v1, 63 453; GFX10W64-NEXT: v_readlane_b32 s8, v1, 47 454; GFX10W64-NEXT: v_writelane_b32 v3, s7, 32 455; GFX10W64-NEXT: s_mov_b64 exec, s[4:5] 456; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 457; GFX10W64-NEXT: s_or_saveexec_b64 s[4:5], -1 458; GFX10W64-NEXT: v_writelane_b32 v3, s8, 48 459; GFX10W64-NEXT: s_mov_b64 exec, s[4:5] 460; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 461; GFX10W64-NEXT: ; implicit-def: $vgpr0 462; GFX10W64-NEXT: s_and_saveexec_b64 s[4:5], vcc 463; GFX10W64-NEXT: s_cbranch_execz .LBB2_2 464; GFX10W64-NEXT: ; %bb.1: 465; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 466; GFX10W64-NEXT: v_mov_b32_e32 v0, s6 467; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) 468; GFX10W64-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc 469; GFX10W64-NEXT: .LBB2_2: 470; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 471; GFX10W64-NEXT: s_or_b64 exec, exec, s[4:5] 472; GFX10W64-NEXT: s_waitcnt vmcnt(0) 473; GFX10W64-NEXT: v_readfirstlane_b32 s0, v0 474; GFX10W64-NEXT: v_mov_b32_e32 v0, v3 475; GFX10W64-NEXT: v_mov_b32_e32 v4, 0 476; GFX10W64-NEXT: v_add_nc_u32_e32 v0, s0, v0 477; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) 478; GFX10W64-NEXT: global_store_dword v4, v0, s[2:3] 479; GFX10W64-NEXT: s_endpgm 480; 481; GFX10W32-LABEL: add_i32_varying_vdata: 482; GFX10W32: ; %bb.0: ; %entry 483; GFX10W32-NEXT: v_mov_b32_e32 v1, 
v0 484; GFX10W32-NEXT: s_not_b32 exec_lo, exec_lo 485; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 486; GFX10W32-NEXT: s_not_b32 exec_lo, exec_lo 487; GFX10W32-NEXT: s_or_saveexec_b32 s2, -1 488; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 489; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 490; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 491; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 492; GFX10W32-NEXT: v_mov_b32_e32 v2, v1 493; GFX10W32-NEXT: v_permlanex16_b32 v2, v2, -1, -1 494; GFX10W32-NEXT: s_mov_b32 exec_lo, s2 495; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 496; GFX10W32-NEXT: s_or_saveexec_b32 s4, -1 497; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 498; GFX10W32-NEXT: v_mov_b32_e32 v3, 0 499; GFX10W32-NEXT: v_readlane_b32 s6, v1, 31 500; GFX10W32-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 501; GFX10W32-NEXT: v_readlane_b32 s5, v1, 15 502; GFX10W32-NEXT: s_mov_b32 exec_lo, s4 503; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 504; GFX10W32-NEXT: s_or_saveexec_b32 s4, -1 505; GFX10W32-NEXT: v_writelane_b32 v3, s5, 16 506; GFX10W32-NEXT: s_mov_b32 exec_lo, s4 507; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 508; GFX10W32-NEXT: ; implicit-def: $vgpr0 509; GFX10W32-NEXT: s_and_saveexec_b32 s4, vcc_lo 510; GFX10W32-NEXT: s_cbranch_execz .LBB2_2 511; GFX10W32-NEXT: ; %bb.1: 512; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 513; GFX10W32-NEXT: v_mov_b32_e32 v0, s6 514; GFX10W32-NEXT: s_mov_b32 s5, s6 515; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) 516; GFX10W32-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc 517; GFX10W32-NEXT: .LBB2_2: 518; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 519; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s4 520; GFX10W32-NEXT: s_waitcnt vmcnt(0) 521; GFX10W32-NEXT: 
v_readfirstlane_b32 s0, v0 522; GFX10W32-NEXT: v_mov_b32_e32 v0, v3 523; GFX10W32-NEXT: v_mov_b32_e32 v4, 0 524; GFX10W32-NEXT: v_add_nc_u32_e32 v0, s0, v0 525; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) 526; GFX10W32-NEXT: global_store_dword v4, v0, s[2:3] 527; GFX10W32-NEXT: s_endpgm 528entry: 529 %lane = call i32 @llvm.amdgcn.workitem.id.x() 530 %old = call i32 @llvm.amdgcn.raw.buffer.atomic.add(i32 %lane, <4 x i32> %inout, i32 0, i32 0, i32 0) 531 store i32 %old, i32 addrspace(1)* %out 532 ret void 533} 534 535define amdgpu_kernel void @add_i32_varying_offset(i32 addrspace(1)* %out, <4 x i32> %inout) { 536; GFX6-LABEL: add_i32_varying_offset: 537; GFX6: ; %bb.0: ; %entry 538; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd 539; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 540; GFX6-NEXT: v_mov_b32_e32 v1, 1 541; GFX6-NEXT: s_waitcnt lgkmcnt(0) 542; GFX6-NEXT: buffer_atomic_add v1, v0, s[4:7], 0 offen glc 543; GFX6-NEXT: s_mov_b32 s3, 0xf000 544; GFX6-NEXT: s_mov_b32 s2, -1 545; GFX6-NEXT: s_waitcnt vmcnt(0) 546; GFX6-NEXT: buffer_store_dword v1, off, s[0:3], 0 547; GFX6-NEXT: s_endpgm 548; 549; GFX8-LABEL: add_i32_varying_offset: 550; GFX8: ; %bb.0: ; %entry 551; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 552; GFX8-NEXT: v_mov_b32_e32 v2, 1 553; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 554; GFX8-NEXT: s_waitcnt lgkmcnt(0) 555; GFX8-NEXT: buffer_atomic_add v2, v0, s[4:7], 0 offen glc 556; GFX8-NEXT: v_mov_b32_e32 v0, s0 557; GFX8-NEXT: v_mov_b32_e32 v1, s1 558; GFX8-NEXT: s_waitcnt vmcnt(0) 559; GFX8-NEXT: flat_store_dword v[0:1], v2 560; GFX8-NEXT: s_endpgm 561; 562; GFX9-LABEL: add_i32_varying_offset: 563; GFX9: ; %bb.0: ; %entry 564; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 565; GFX9-NEXT: v_mov_b32_e32 v1, 1 566; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 567; GFX9-NEXT: s_waitcnt lgkmcnt(0) 568; GFX9-NEXT: buffer_atomic_add v1, v0, s[4:7], 0 offen glc 569; GFX9-NEXT: v_mov_b32_e32 v0, 0 570; GFX9-NEXT: s_waitcnt vmcnt(0) 571; GFX9-NEXT: 
global_store_dword v0, v1, s[0:1] 572; GFX9-NEXT: s_endpgm 573; 574; GFX10-LABEL: add_i32_varying_offset: 575; GFX10: ; %bb.0: ; %entry 576; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 577; GFX10-NEXT: v_mov_b32_e32 v1, 1 578; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 579; GFX10-NEXT: s_waitcnt lgkmcnt(0) 580; GFX10-NEXT: buffer_atomic_add v1, v0, s[4:7], 0 offen glc 581; GFX10-NEXT: v_mov_b32_e32 v0, 0 582; GFX10-NEXT: s_waitcnt vmcnt(0) 583; GFX10-NEXT: global_store_dword v0, v1, s[0:1] 584; GFX10-NEXT: s_endpgm 585entry: 586 %lane = call i32 @llvm.amdgcn.workitem.id.x() 587 %old = call i32 @llvm.amdgcn.raw.buffer.atomic.add(i32 1, <4 x i32> %inout, i32 %lane, i32 0, i32 0) 588 store i32 %old, i32 addrspace(1)* %out 589 ret void 590} 591 592define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out, <4 x i32> %inout) { 593; GFX6-LABEL: sub_i32_constant: 594; GFX6: ; %bb.0: ; %entry 595; GFX6-NEXT: s_mov_b64 s[2:3], exec 596; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 597; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 598; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 599; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 600; GFX6-NEXT: ; implicit-def: $vgpr1 601; GFX6-NEXT: s_and_saveexec_b64 s[6:7], vcc 602; GFX6-NEXT: s_cbranch_execz .LBB4_2 603; GFX6-NEXT: ; %bb.1: 604; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd 605; GFX6-NEXT: s_bcnt1_i32_b64 s0, s[2:3] 606; GFX6-NEXT: s_mul_i32 s0, s0, 5 607; GFX6-NEXT: v_mov_b32_e32 v1, s0 608; GFX6-NEXT: s_waitcnt lgkmcnt(0) 609; GFX6-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc 610; GFX6-NEXT: .LBB4_2: 611; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] 612; GFX6-NEXT: s_waitcnt vmcnt(0) 613; GFX6-NEXT: v_readfirstlane_b32 s0, v1 614; GFX6-NEXT: v_mul_u32_u24_e32 v0, 5, v0 615; GFX6-NEXT: s_mov_b32 s7, 0xf000 616; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 617; GFX6-NEXT: s_mov_b32 s6, -1 618; GFX6-NEXT: s_waitcnt lgkmcnt(0) 619; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 620; GFX6-NEXT: s_endpgm 621; 622; 
GFX8-LABEL: sub_i32_constant: 623; GFX8: ; %bb.0: ; %entry 624; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 625; GFX8-NEXT: s_mov_b64 s[6:7], exec 626; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 627; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 628; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 629; GFX8-NEXT: ; implicit-def: $vgpr1 630; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 631; GFX8-NEXT: s_cbranch_execz .LBB4_2 632; GFX8-NEXT: ; %bb.1: 633; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 634; GFX8-NEXT: s_bcnt1_i32_b64 s0, s[6:7] 635; GFX8-NEXT: s_mul_i32 s0, s0, 5 636; GFX8-NEXT: v_mov_b32_e32 v1, s0 637; GFX8-NEXT: s_waitcnt lgkmcnt(0) 638; GFX8-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc 639; GFX8-NEXT: .LBB4_2: 640; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 641; GFX8-NEXT: s_waitcnt vmcnt(0) 642; GFX8-NEXT: v_readfirstlane_b32 s0, v1 643; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v0 644; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s0, v0 645; GFX8-NEXT: s_waitcnt lgkmcnt(0) 646; GFX8-NEXT: v_mov_b32_e32 v0, s2 647; GFX8-NEXT: v_mov_b32_e32 v1, s3 648; GFX8-NEXT: flat_store_dword v[0:1], v2 649; GFX8-NEXT: s_endpgm 650; 651; GFX9-LABEL: sub_i32_constant: 652; GFX9: ; %bb.0: ; %entry 653; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 654; GFX9-NEXT: s_mov_b64 s[6:7], exec 655; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 656; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 657; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 658; GFX9-NEXT: ; implicit-def: $vgpr1 659; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 660; GFX9-NEXT: s_cbranch_execz .LBB4_2 661; GFX9-NEXT: ; %bb.1: 662; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 663; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[6:7] 664; GFX9-NEXT: s_mul_i32 s0, s0, 5 665; GFX9-NEXT: v_mov_b32_e32 v1, s0 666; GFX9-NEXT: s_waitcnt lgkmcnt(0) 667; GFX9-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc 668; GFX9-NEXT: .LBB4_2: 669; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 670; GFX9-NEXT: s_waitcnt vmcnt(0) 671; GFX9-NEXT: v_readfirstlane_b32 s0, v1 672; GFX9-NEXT: 
v_mul_u32_u24_e32 v0, 5, v0 673; GFX9-NEXT: v_sub_u32_e32 v0, s0, v0 674; GFX9-NEXT: v_mov_b32_e32 v1, 0 675; GFX9-NEXT: s_waitcnt lgkmcnt(0) 676; GFX9-NEXT: global_store_dword v1, v0, s[2:3] 677; GFX9-NEXT: s_endpgm 678; 679; GFX10W64-LABEL: sub_i32_constant: 680; GFX10W64: ; %bb.0: ; %entry 681; GFX10W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 682; GFX10W64-NEXT: s_mov_b64 s[6:7], exec 683; GFX10W64-NEXT: ; implicit-def: $vgpr1 684; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 685; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 686; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 687; GFX10W64-NEXT: s_and_saveexec_b64 s[4:5], vcc 688; GFX10W64-NEXT: s_cbranch_execz .LBB4_2 689; GFX10W64-NEXT: ; %bb.1: 690; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 691; GFX10W64-NEXT: s_bcnt1_i32_b64 s0, s[6:7] 692; GFX10W64-NEXT: s_mul_i32 s0, s0, 5 693; GFX10W64-NEXT: v_mov_b32_e32 v1, s0 694; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) 695; GFX10W64-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc 696; GFX10W64-NEXT: .LBB4_2: 697; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 698; GFX10W64-NEXT: s_or_b64 exec, exec, s[4:5] 699; GFX10W64-NEXT: s_waitcnt vmcnt(0) 700; GFX10W64-NEXT: v_readfirstlane_b32 s0, v1 701; GFX10W64-NEXT: v_mul_u32_u24_e32 v0, 5, v0 702; GFX10W64-NEXT: v_mov_b32_e32 v1, 0 703; GFX10W64-NEXT: v_sub_nc_u32_e32 v0, s0, v0 704; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) 705; GFX10W64-NEXT: global_store_dword v1, v0, s[2:3] 706; GFX10W64-NEXT: s_endpgm 707; 708; GFX10W32-LABEL: sub_i32_constant: 709; GFX10W32: ; %bb.0: ; %entry 710; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 711; GFX10W32-NEXT: s_mov_b32 s5, exec_lo 712; GFX10W32-NEXT: ; implicit-def: $vgpr1 713; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 714; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 715; GFX10W32-NEXT: s_and_saveexec_b32 s4, vcc_lo 716; GFX10W32-NEXT: s_cbranch_execz .LBB4_2 717; GFX10W32-NEXT: ; %bb.1: 718; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 719; GFX10W32-NEXT: s_bcnt1_i32_b32 s0, 
s5 720; GFX10W32-NEXT: s_mul_i32 s0, s0, 5 721; GFX10W32-NEXT: v_mov_b32_e32 v1, s0 722; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) 723; GFX10W32-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc 724; GFX10W32-NEXT: .LBB4_2: 725; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 726; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s4 727; GFX10W32-NEXT: s_waitcnt vmcnt(0) 728; GFX10W32-NEXT: v_readfirstlane_b32 s0, v1 729; GFX10W32-NEXT: v_mul_u32_u24_e32 v0, 5, v0 730; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 731; GFX10W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0 732; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) 733; GFX10W32-NEXT: global_store_dword v1, v0, s[2:3] 734; GFX10W32-NEXT: s_endpgm 735entry: 736 %old = call i32 @llvm.amdgcn.raw.buffer.atomic.sub(i32 5, <4 x i32> %inout, i32 0, i32 0, i32 0) 737 store i32 %old, i32 addrspace(1)* %out 738 ret void 739} 740 741define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, <4 x i32> %inout, i32 %subitive) { 742; GFX6-LABEL: sub_i32_uniform: 743; GFX6: ; %bb.0: ; %entry 744; GFX6-NEXT: s_mov_b64 s[2:3], exec 745; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 746; GFX6-NEXT: s_load_dword s8, s[0:1], 0x11 747; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 748; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 749; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 750; GFX6-NEXT: ; implicit-def: $vgpr1 751; GFX6-NEXT: s_and_saveexec_b64 s[6:7], vcc 752; GFX6-NEXT: s_cbranch_execz .LBB5_2 753; GFX6-NEXT: ; %bb.1: 754; GFX6-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0xd 755; GFX6-NEXT: s_bcnt1_i32_b64 s0, s[2:3] 756; GFX6-NEXT: s_waitcnt lgkmcnt(0) 757; GFX6-NEXT: s_mul_i32 s0, s8, s0 758; GFX6-NEXT: v_mov_b32_e32 v1, s0 759; GFX6-NEXT: buffer_atomic_sub v1, off, s[12:15], 0 glc 760; GFX6-NEXT: .LBB5_2: 761; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] 762; GFX6-NEXT: s_waitcnt vmcnt(0) 763; GFX6-NEXT: v_readfirstlane_b32 s0, v1 764; GFX6-NEXT: s_waitcnt lgkmcnt(0) 765; GFX6-NEXT: v_mul_lo_u32 v0, s8, v0 766; GFX6-NEXT: s_mov_b32 s7, 0xf000 767; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s0, 
v0 768; GFX6-NEXT: s_mov_b32 s6, -1 769; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 770; GFX6-NEXT: s_endpgm 771; 772; GFX8-LABEL: sub_i32_uniform: 773; GFX8: ; %bb.0: ; %entry 774; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 775; GFX8-NEXT: s_load_dword s8, s[0:1], 0x44 776; GFX8-NEXT: s_mov_b64 s[4:5], exec 777; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 778; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 779; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 780; GFX8-NEXT: ; implicit-def: $vgpr1 781; GFX8-NEXT: s_and_saveexec_b64 s[6:7], vcc 782; GFX8-NEXT: s_cbranch_execz .LBB5_2 783; GFX8-NEXT: ; %bb.1: 784; GFX8-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x34 785; GFX8-NEXT: s_bcnt1_i32_b64 s0, s[4:5] 786; GFX8-NEXT: s_waitcnt lgkmcnt(0) 787; GFX8-NEXT: s_mul_i32 s0, s8, s0 788; GFX8-NEXT: v_mov_b32_e32 v1, s0 789; GFX8-NEXT: buffer_atomic_sub v1, off, s[12:15], 0 glc 790; GFX8-NEXT: .LBB5_2: 791; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] 792; GFX8-NEXT: s_waitcnt lgkmcnt(0) 793; GFX8-NEXT: v_mul_lo_u32 v0, s8, v0 794; GFX8-NEXT: s_waitcnt vmcnt(0) 795; GFX8-NEXT: v_readfirstlane_b32 s0, v1 796; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s0, v0 797; GFX8-NEXT: v_mov_b32_e32 v0, s2 798; GFX8-NEXT: v_mov_b32_e32 v1, s3 799; GFX8-NEXT: flat_store_dword v[0:1], v2 800; GFX8-NEXT: s_endpgm 801; 802; GFX9-LABEL: sub_i32_uniform: 803; GFX9: ; %bb.0: ; %entry 804; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 805; GFX9-NEXT: s_load_dword s8, s[0:1], 0x44 806; GFX9-NEXT: s_mov_b64 s[4:5], exec 807; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 808; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 809; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 810; GFX9-NEXT: ; implicit-def: $vgpr1 811; GFX9-NEXT: s_and_saveexec_b64 s[6:7], vcc 812; GFX9-NEXT: s_cbranch_execz .LBB5_2 813; GFX9-NEXT: ; %bb.1: 814; GFX9-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x34 815; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[4:5] 816; GFX9-NEXT: s_waitcnt lgkmcnt(0) 817; GFX9-NEXT: s_mul_i32 s0, s8, s0 818; GFX9-NEXT: v_mov_b32_e32 v1, s0 819; 
GFX9-NEXT: buffer_atomic_sub v1, off, s[12:15], 0 glc 820; GFX9-NEXT: .LBB5_2: 821; GFX9-NEXT: s_or_b64 exec, exec, s[6:7] 822; GFX9-NEXT: s_waitcnt lgkmcnt(0) 823; GFX9-NEXT: v_mul_lo_u32 v0, s8, v0 824; GFX9-NEXT: s_waitcnt vmcnt(0) 825; GFX9-NEXT: v_readfirstlane_b32 s0, v1 826; GFX9-NEXT: v_mov_b32_e32 v1, 0 827; GFX9-NEXT: v_sub_u32_e32 v0, s0, v0 828; GFX9-NEXT: global_store_dword v1, v0, s[2:3] 829; GFX9-NEXT: s_endpgm 830; 831; GFX10W64-LABEL: sub_i32_uniform: 832; GFX10W64: ; %bb.0: ; %entry 833; GFX10W64-NEXT: s_clause 0x1 834; GFX10W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 835; GFX10W64-NEXT: s_load_dword s8, s[0:1], 0x44 836; GFX10W64-NEXT: s_mov_b64 s[4:5], exec 837; GFX10W64-NEXT: ; implicit-def: $vgpr1 838; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 839; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 840; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 841; GFX10W64-NEXT: s_and_saveexec_b64 s[6:7], vcc 842; GFX10W64-NEXT: s_cbranch_execz .LBB5_2 843; GFX10W64-NEXT: ; %bb.1: 844; GFX10W64-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x34 845; GFX10W64-NEXT: s_bcnt1_i32_b64 s0, s[4:5] 846; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) 847; GFX10W64-NEXT: s_mul_i32 s0, s8, s0 848; GFX10W64-NEXT: v_mov_b32_e32 v1, s0 849; GFX10W64-NEXT: buffer_atomic_sub v1, off, s[12:15], 0 glc 850; GFX10W64-NEXT: .LBB5_2: 851; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 852; GFX10W64-NEXT: s_or_b64 exec, exec, s[6:7] 853; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) 854; GFX10W64-NEXT: v_mul_lo_u32 v0, s8, v0 855; GFX10W64-NEXT: s_waitcnt vmcnt(0) 856; GFX10W64-NEXT: v_readfirstlane_b32 s0, v1 857; GFX10W64-NEXT: v_mov_b32_e32 v1, 0 858; GFX10W64-NEXT: v_sub_nc_u32_e32 v0, s0, v0 859; GFX10W64-NEXT: global_store_dword v1, v0, s[2:3] 860; GFX10W64-NEXT: s_endpgm 861; 862; GFX10W32-LABEL: sub_i32_uniform: 863; GFX10W32: ; %bb.0: ; %entry 864; GFX10W32-NEXT: s_clause 0x1 865; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 866; GFX10W32-NEXT: s_load_dword s4, s[0:1], 0x44 867; GFX10W32-NEXT: 
s_mov_b32 s6, exec_lo 868; GFX10W32-NEXT: ; implicit-def: $vgpr1 869; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 870; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 871; GFX10W32-NEXT: s_and_saveexec_b32 s5, vcc_lo 872; GFX10W32-NEXT: s_cbranch_execz .LBB5_2 873; GFX10W32-NEXT: ; %bb.1: 874; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 875; GFX10W32-NEXT: s_bcnt1_i32_b32 s0, s6 876; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) 877; GFX10W32-NEXT: s_mul_i32 s0, s4, s0 878; GFX10W32-NEXT: v_mov_b32_e32 v1, s0 879; GFX10W32-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc 880; GFX10W32-NEXT: .LBB5_2: 881; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 882; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s5 883; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) 884; GFX10W32-NEXT: v_mul_lo_u32 v0, s4, v0 885; GFX10W32-NEXT: s_waitcnt vmcnt(0) 886; GFX10W32-NEXT: v_readfirstlane_b32 s0, v1 887; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 888; GFX10W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0 889; GFX10W32-NEXT: global_store_dword v1, v0, s[2:3] 890; GFX10W32-NEXT: s_endpgm
; Test sub_i32_uniform - the value subtracted is the kernel argument %subitive,
; i.e. one scalar shared by all lanes. For every check prefix the expected
; code issues a single buffer_atomic_sub inside an s_and_saveexec region that
; is entered only by the lane whose mbcnt result compares equal to 0. The
; atomic operand is a loaded scalar (presumably %subitive) multiplied by the
; s_bcnt1 population count of the saved exec mask. After the region each lane
; rebuilds its own return value as readfirstlane(atomic result) minus
; (that scalar * the lane's mbcnt value).
; The check lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
891entry: 892 %old = call i32 @llvm.amdgcn.raw.buffer.atomic.sub(i32 %subitive, <4 x i32> %inout, i32 0, i32 0, i32 0) 893 store i32 %old, i32 addrspace(1)* %out 894 ret void 895} 896 897define amdgpu_kernel void @sub_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i32> %inout) { 898; GFX6-LABEL: sub_i32_varying_vdata: 899; GFX6: ; %bb.0: ; %entry 900; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd 901; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 902; GFX6-NEXT: s_waitcnt lgkmcnt(0) 903; GFX6-NEXT: buffer_atomic_sub v0, off, s[4:7], 0 glc 904; GFX6-NEXT: s_mov_b32 s3, 0xf000 905; GFX6-NEXT: s_mov_b32 s2, -1 906; GFX6-NEXT: s_waitcnt vmcnt(0) 907; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 908; GFX6-NEXT: s_endpgm 909; 910; GFX8-LABEL: sub_i32_varying_vdata: 911; GFX8: ; %bb.0: ; %entry 912; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 913; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 914; GFX8-NEXT: v_mov_b32_e32 v1, 0 915; GFX8-NEXT: s_mov_b64 
exec, s[4:5] 916; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 917; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 918; GFX8-NEXT: v_mov_b32_e32 v2, v0 919; GFX8-NEXT: s_not_b64 exec, exec 920; GFX8-NEXT: v_mov_b32_e32 v2, 0 921; GFX8-NEXT: s_not_b64 exec, exec 922; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 923; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 924; GFX8-NEXT: s_nop 1 925; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 926; GFX8-NEXT: s_nop 1 927; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 928; GFX8-NEXT: s_nop 1 929; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 930; GFX8-NEXT: s_nop 1 931; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 932; GFX8-NEXT: s_nop 1 933; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 934; GFX8-NEXT: v_readlane_b32 s6, v2, 63 935; GFX8-NEXT: s_nop 0 936; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 937; GFX8-NEXT: s_mov_b64 exec, s[4:5] 938; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 939; GFX8-NEXT: ; implicit-def: $vgpr0 940; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 941; GFX8-NEXT: s_cbranch_execz .LBB6_2 942; GFX8-NEXT: ; %bb.1: 943; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 944; GFX8-NEXT: v_mov_b32_e32 v0, s6 945; GFX8-NEXT: s_waitcnt lgkmcnt(0) 946; GFX8-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc 947; GFX8-NEXT: .LBB6_2: 948; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 949; GFX8-NEXT: s_waitcnt vmcnt(0) 950; GFX8-NEXT: v_readfirstlane_b32 s0, v0 951; GFX8-NEXT: v_mov_b32_e32 v0, v1 952; GFX8-NEXT: s_waitcnt lgkmcnt(0) 953; GFX8-NEXT: v_mov_b32_e32 v4, s3 954; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s0, v0 955; GFX8-NEXT: v_mov_b32_e32 v3, s2 956; GFX8-NEXT: flat_store_dword v[3:4], v0 957; GFX8-NEXT: s_endpgm 958; 959; GFX9-LABEL: sub_i32_varying_vdata: 
960; GFX9: ; %bb.0: ; %entry 961; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 962; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 963; GFX9-NEXT: v_mov_b32_e32 v1, 0 964; GFX9-NEXT: s_mov_b64 exec, s[4:5] 965; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 966; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 967; GFX9-NEXT: v_mov_b32_e32 v2, v0 968; GFX9-NEXT: s_not_b64 exec, exec 969; GFX9-NEXT: v_mov_b32_e32 v2, 0 970; GFX9-NEXT: s_not_b64 exec, exec 971; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 972; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 973; GFX9-NEXT: s_nop 1 974; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 975; GFX9-NEXT: s_nop 1 976; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 977; GFX9-NEXT: s_nop 1 978; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 979; GFX9-NEXT: s_nop 1 980; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 981; GFX9-NEXT: s_nop 1 982; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 983; GFX9-NEXT: v_readlane_b32 s6, v2, 63 984; GFX9-NEXT: s_nop 0 985; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 986; GFX9-NEXT: s_mov_b64 exec, s[4:5] 987; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 988; GFX9-NEXT: ; implicit-def: $vgpr0 989; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 990; GFX9-NEXT: s_cbranch_execz .LBB6_2 991; GFX9-NEXT: ; %bb.1: 992; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 993; GFX9-NEXT: v_mov_b32_e32 v0, s6 994; GFX9-NEXT: s_waitcnt lgkmcnt(0) 995; GFX9-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc 996; GFX9-NEXT: .LBB6_2: 997; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 998; GFX9-NEXT: s_waitcnt vmcnt(0) 999; GFX9-NEXT: v_readfirstlane_b32 s0, v0 1000; GFX9-NEXT: v_mov_b32_e32 v0, v1 1001; GFX9-NEXT: v_sub_u32_e32 v0, s0, v0 1002; GFX9-NEXT: v_mov_b32_e32 v3, 0 1003; GFX9-NEXT: s_waitcnt 
lgkmcnt(0) 1004; GFX9-NEXT: global_store_dword v3, v0, s[2:3] 1005; GFX9-NEXT: s_endpgm 1006; 1007; GFX10W64-LABEL: sub_i32_varying_vdata: 1008; GFX10W64: ; %bb.0: ; %entry 1009; GFX10W64-NEXT: v_mov_b32_e32 v1, v0 1010; GFX10W64-NEXT: s_not_b64 exec, exec 1011; GFX10W64-NEXT: v_mov_b32_e32 v1, 0 1012; GFX10W64-NEXT: s_not_b64 exec, exec 1013; GFX10W64-NEXT: s_or_saveexec_b64 s[2:3], -1 1014; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 1015; GFX10W64-NEXT: v_mov_b32_e32 v3, 0 1016; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 1017; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 1018; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 1019; GFX10W64-NEXT: v_mov_b32_e32 v2, v1 1020; GFX10W64-NEXT: v_permlanex16_b32 v2, v2, -1, -1 1021; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 1022; GFX10W64-NEXT: v_readlane_b32 s4, v1, 31 1023; GFX10W64-NEXT: v_mov_b32_e32 v2, s4 1024; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 1025; GFX10W64-NEXT: v_readlane_b32 s6, v1, 15 1026; GFX10W64-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 1027; GFX10W64-NEXT: s_mov_b64 exec, s[2:3] 1028; GFX10W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 1029; GFX10W64-NEXT: s_or_saveexec_b64 s[4:5], -1 1030; GFX10W64-NEXT: v_readlane_b32 s7, v1, 31 1031; GFX10W64-NEXT: v_writelane_b32 v3, s6, 16 1032; GFX10W64-NEXT: s_mov_b64 exec, s[4:5] 1033; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 1034; GFX10W64-NEXT: s_or_saveexec_b64 s[4:5], -1 1035; GFX10W64-NEXT: v_readlane_b32 s6, v1, 63 1036; GFX10W64-NEXT: v_readlane_b32 s8, v1, 47 1037; GFX10W64-NEXT: v_writelane_b32 v3, s7, 32 1038; GFX10W64-NEXT: s_mov_b64 exec, s[4:5] 1039; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 1040; GFX10W64-NEXT: 
s_or_saveexec_b64 s[4:5], -1 1041; GFX10W64-NEXT: v_writelane_b32 v3, s8, 48 1042; GFX10W64-NEXT: s_mov_b64 exec, s[4:5] 1043; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1044; GFX10W64-NEXT: ; implicit-def: $vgpr0 1045; GFX10W64-NEXT: s_and_saveexec_b64 s[4:5], vcc 1046; GFX10W64-NEXT: s_cbranch_execz .LBB6_2 1047; GFX10W64-NEXT: ; %bb.1: 1048; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 1049; GFX10W64-NEXT: v_mov_b32_e32 v0, s6 1050; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) 1051; GFX10W64-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc 1052; GFX10W64-NEXT: .LBB6_2: 1053; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 1054; GFX10W64-NEXT: s_or_b64 exec, exec, s[4:5] 1055; GFX10W64-NEXT: s_waitcnt vmcnt(0) 1056; GFX10W64-NEXT: v_readfirstlane_b32 s0, v0 1057; GFX10W64-NEXT: v_mov_b32_e32 v0, v3 1058; GFX10W64-NEXT: v_mov_b32_e32 v4, 0 1059; GFX10W64-NEXT: v_sub_nc_u32_e32 v0, s0, v0 1060; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) 1061; GFX10W64-NEXT: global_store_dword v4, v0, s[2:3] 1062; GFX10W64-NEXT: s_endpgm 1063; 1064; GFX10W32-LABEL: sub_i32_varying_vdata: 1065; GFX10W32: ; %bb.0: ; %entry 1066; GFX10W32-NEXT: v_mov_b32_e32 v1, v0 1067; GFX10W32-NEXT: s_not_b32 exec_lo, exec_lo 1068; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 1069; GFX10W32-NEXT: s_not_b32 exec_lo, exec_lo 1070; GFX10W32-NEXT: s_or_saveexec_b32 s2, -1 1071; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 1072; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 1073; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 1074; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 1075; GFX10W32-NEXT: v_mov_b32_e32 v2, v1 1076; GFX10W32-NEXT: v_permlanex16_b32 v2, v2, -1, -1 1077; GFX10W32-NEXT: s_mov_b32 exec_lo, s2 1078; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 1079; GFX10W32-NEXT: s_or_saveexec_b32 s4, -1 1080; GFX10W32-NEXT: 
v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 1081; GFX10W32-NEXT: v_mov_b32_e32 v3, 0 1082; GFX10W32-NEXT: v_readlane_b32 s6, v1, 31 1083; GFX10W32-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 1084; GFX10W32-NEXT: v_readlane_b32 s5, v1, 15 1085; GFX10W32-NEXT: s_mov_b32 exec_lo, s4 1086; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 1087; GFX10W32-NEXT: s_or_saveexec_b32 s4, -1 1088; GFX10W32-NEXT: v_writelane_b32 v3, s5, 16 1089; GFX10W32-NEXT: s_mov_b32 exec_lo, s4 1090; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 1091; GFX10W32-NEXT: ; implicit-def: $vgpr0 1092; GFX10W32-NEXT: s_and_saveexec_b32 s4, vcc_lo 1093; GFX10W32-NEXT: s_cbranch_execz .LBB6_2 1094; GFX10W32-NEXT: ; %bb.1: 1095; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 1096; GFX10W32-NEXT: v_mov_b32_e32 v0, s6 1097; GFX10W32-NEXT: s_mov_b32 s5, s6 1098; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) 1099; GFX10W32-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc 1100; GFX10W32-NEXT: .LBB6_2: 1101; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 1102; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s4 1103; GFX10W32-NEXT: s_waitcnt vmcnt(0) 1104; GFX10W32-NEXT: v_readfirstlane_b32 s0, v0 1105; GFX10W32-NEXT: v_mov_b32_e32 v0, v3 1106; GFX10W32-NEXT: v_mov_b32_e32 v4, 0 1107; GFX10W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0 1108; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) 1109; GFX10W32-NEXT: global_store_dword v4, v0, s[2:3] 1110; GFX10W32-NEXT: s_endpgm
; Test sub_i32_varying_vdata - the value subtracted is the result of
; llvm.amdgcn.workitem.id.x, so it differs from lane to lane.
; The GFX6 checks expect an unmodified buffer_atomic_sub with no reduction
; sequence around it. The GFX8, GFX9 and GFX10 checks expect a cross-lane
; add sequence built from DPP adds (row_shr steps, then row_bcast on
; GFX8/GFX9 or v_permlanex16 / v_readlane / v_writelane on GFX10), a single
; buffer_atomic_sub inside an s_and_saveexec region whose operand is a
; v_readlane of the accumulated register, and finally a per-lane subtract of
; the shifted accumulated value from the readfirstlane of the atomic result.
; The check lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
1111entry: 1112 %lane = call i32 @llvm.amdgcn.workitem.id.x() 1113 %old = call i32 @llvm.amdgcn.raw.buffer.atomic.sub(i32 %lane, <4 x i32> %inout, i32 0, i32 0, i32 0) 1114 store i32 %old, i32 addrspace(1)* %out 1115 ret void 1116} 1117 1118define amdgpu_kernel void @sub_i32_varying_offset(i32 addrspace(1)* %out, <4 x i32> %inout) { 1119; GFX6-LABEL: sub_i32_varying_offset: 1120; GFX6: ; %bb.0: ; %entry 1121; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd 1122; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1123; GFX6-NEXT: 
v_mov_b32_e32 v1, 1 1124; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1125; GFX6-NEXT: buffer_atomic_sub v1, v0, s[4:7], 0 offen glc 1126; GFX6-NEXT: s_mov_b32 s3, 0xf000 1127; GFX6-NEXT: s_mov_b32 s2, -1 1128; GFX6-NEXT: s_waitcnt vmcnt(0) 1129; GFX6-NEXT: buffer_store_dword v1, off, s[0:3], 0 1130; GFX6-NEXT: s_endpgm 1131; 1132; GFX8-LABEL: sub_i32_varying_offset: 1133; GFX8: ; %bb.0: ; %entry 1134; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 1135; GFX8-NEXT: v_mov_b32_e32 v2, 1 1136; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1137; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1138; GFX8-NEXT: buffer_atomic_sub v2, v0, s[4:7], 0 offen glc 1139; GFX8-NEXT: v_mov_b32_e32 v0, s0 1140; GFX8-NEXT: v_mov_b32_e32 v1, s1 1141; GFX8-NEXT: s_waitcnt vmcnt(0) 1142; GFX8-NEXT: flat_store_dword v[0:1], v2 1143; GFX8-NEXT: s_endpgm 1144; 1145; GFX9-LABEL: sub_i32_varying_offset: 1146; GFX9: ; %bb.0: ; %entry 1147; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 1148; GFX9-NEXT: v_mov_b32_e32 v1, 1 1149; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1150; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1151; GFX9-NEXT: buffer_atomic_sub v1, v0, s[4:7], 0 offen glc 1152; GFX9-NEXT: v_mov_b32_e32 v0, 0 1153; GFX9-NEXT: s_waitcnt vmcnt(0) 1154; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 1155; GFX9-NEXT: s_endpgm 1156; 1157; GFX10-LABEL: sub_i32_varying_offset: 1158; GFX10: ; %bb.0: ; %entry 1159; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 1160; GFX10-NEXT: v_mov_b32_e32 v1, 1 1161; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1162; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1163; GFX10-NEXT: buffer_atomic_sub v1, v0, s[4:7], 0 offen glc 1164; GFX10-NEXT: v_mov_b32_e32 v0, 0 1165; GFX10-NEXT: s_waitcnt vmcnt(0) 1166; GFX10-NEXT: global_store_dword v0, v1, s[0:1] 1167; GFX10-NEXT: s_endpgm
; Test sub_i32_varying_offset - the value subtracted is the constant 1, but
; the third intrinsic operand (the one this test's name calls the offset) is
; the per-lane result of llvm.amdgcn.workitem.id.x. Every check prefix,
; including the shared GFX10 one, expects an unmodified
; "buffer_atomic_sub ... offen glc" fed by v0, with no mbcnt / s_and_saveexec
; sequence around it, and the atomic result stored directly to %out.
; The check lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
1168entry: 1169 %lane = call i32 @llvm.amdgcn.workitem.id.x() 1170 %old = call i32 @llvm.amdgcn.raw.buffer.atomic.sub(i32 1, <4 x i32> %inout, i32 %lane, i32 0, i32 0) 1171 store i32 %old, i32 addrspace(1)* %out 1172 ret void 
1173} 1174