1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --force-update 2; RUN: llc -march=amdgcn -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX7LESS %s 3; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s 4; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s 5; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX1064 %s 6; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX1032 %s 7; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX1164 %s 8; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX1132 %s 9 10declare i32 @llvm.amdgcn.workitem.id.x() 11 12@local_var32 = addrspace(3) global i32 undef, align 4 13@local_var64 = addrspace(3) global i64 undef, align 8 14 15; Show what the atomic optimization pass will do for local pointers. 
16 17define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out) { 18; 19; 20; GFX7LESS-LABEL: add_i32_constant: 21; GFX7LESS: ; %bb.0: ; %entry 22; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec 23; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 24; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 25; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 26; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 27; GFX7LESS-NEXT: ; implicit-def: $vgpr1 28; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc 29; GFX7LESS-NEXT: s_cbranch_execz .LBB0_2 30; GFX7LESS-NEXT: ; %bb.1: 31; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 32; GFX7LESS-NEXT: s_mul_i32 s2, s2, 5 33; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 34; GFX7LESS-NEXT: v_mov_b32_e32 v2, s2 35; GFX7LESS-NEXT: s_mov_b32 m0, -1 36; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 37; GFX7LESS-NEXT: ds_add_rtn_u32 v1, v1, v2 38; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 39; GFX7LESS-NEXT: .LBB0_2: 40; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] 41; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 42; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v1 43; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 44; GFX7LESS-NEXT: v_mad_u32_u24 v0, v0, 5, s2 45; GFX7LESS-NEXT: s_mov_b32 s2, -1 46; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 47; GFX7LESS-NEXT: s_endpgm 48; 49; GFX8-LABEL: add_i32_constant: 50; GFX8: ; %bb.0: ; %entry 51; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 52; GFX8-NEXT: s_mov_b64 s[2:3], exec 53; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 54; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 55; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 56; GFX8-NEXT: ; implicit-def: $vgpr1 57; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 58; GFX8-NEXT: s_cbranch_execz .LBB0_2 59; GFX8-NEXT: ; %bb.1: 60; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 61; GFX8-NEXT: s_mul_i32 s2, s2, 5 62; GFX8-NEXT: v_mov_b32_e32 v1, 0 63; GFX8-NEXT: v_mov_b32_e32 v2, s2 64; GFX8-NEXT: s_mov_b32 m0, -1 65; GFX8-NEXT: s_waitcnt lgkmcnt(0) 66; GFX8-NEXT: ds_add_rtn_u32 v1, v1, v2 67; GFX8-NEXT: s_waitcnt lgkmcnt(0) 68; GFX8-NEXT: 
.LBB0_2: 69; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 70; GFX8-NEXT: s_waitcnt lgkmcnt(0) 71; GFX8-NEXT: v_readfirstlane_b32 s2, v1 72; GFX8-NEXT: v_mad_u32_u24 v0, v0, 5, s2 73; GFX8-NEXT: s_mov_b32 s3, 0xf000 74; GFX8-NEXT: s_mov_b32 s2, -1 75; GFX8-NEXT: s_nop 1 76; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 77; GFX8-NEXT: s_endpgm 78; 79; GFX9-LABEL: add_i32_constant: 80; GFX9: ; %bb.0: ; %entry 81; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 82; GFX9-NEXT: s_mov_b64 s[2:3], exec 83; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 84; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 85; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 86; GFX9-NEXT: ; implicit-def: $vgpr1 87; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 88; GFX9-NEXT: s_cbranch_execz .LBB0_2 89; GFX9-NEXT: ; %bb.1: 90; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 91; GFX9-NEXT: s_mul_i32 s2, s2, 5 92; GFX9-NEXT: v_mov_b32_e32 v1, 0 93; GFX9-NEXT: v_mov_b32_e32 v2, s2 94; GFX9-NEXT: s_waitcnt lgkmcnt(0) 95; GFX9-NEXT: ds_add_rtn_u32 v1, v1, v2 96; GFX9-NEXT: s_waitcnt lgkmcnt(0) 97; GFX9-NEXT: .LBB0_2: 98; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 99; GFX9-NEXT: s_waitcnt lgkmcnt(0) 100; GFX9-NEXT: v_readfirstlane_b32 s2, v1 101; GFX9-NEXT: v_mad_u32_u24 v0, v0, 5, s2 102; GFX9-NEXT: s_mov_b32 s3, 0xf000 103; GFX9-NEXT: s_mov_b32 s2, -1 104; GFX9-NEXT: s_nop 1 105; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 106; GFX9-NEXT: s_endpgm 107; 108; GFX1064-LABEL: add_i32_constant: 109; GFX1064: ; %bb.0: ; %entry 110; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 111; GFX1064-NEXT: s_mov_b64 s[2:3], exec 112; GFX1064-NEXT: ; implicit-def: $vgpr1 113; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 114; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 115; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 116; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 117; GFX1064-NEXT: s_cbranch_execz .LBB0_2 118; GFX1064-NEXT: ; %bb.1: 119; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 120; GFX1064-NEXT: v_mov_b32_e32 v1, 0 121; GFX1064-NEXT: s_mul_i32 s2, s2, 
5 122; GFX1064-NEXT: v_mov_b32_e32 v2, s2 123; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 124; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 125; GFX1064-NEXT: ds_add_rtn_u32 v1, v1, v2 126; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 127; GFX1064-NEXT: buffer_gl0_inv 128; GFX1064-NEXT: .LBB0_2: 129; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 130; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 131; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 132; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 133; GFX1064-NEXT: v_mad_u32_u24 v0, v0, 5, s2 134; GFX1064-NEXT: s_mov_b32 s2, -1 135; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 136; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 137; GFX1064-NEXT: s_endpgm 138; 139; GFX1032-LABEL: add_i32_constant: 140; GFX1032: ; %bb.0: ; %entry 141; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 142; GFX1032-NEXT: s_mov_b32 s3, exec_lo 143; GFX1032-NEXT: ; implicit-def: $vgpr1 144; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 145; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 146; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo 147; GFX1032-NEXT: s_cbranch_execz .LBB0_2 148; GFX1032-NEXT: ; %bb.1: 149; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 150; GFX1032-NEXT: v_mov_b32_e32 v1, 0 151; GFX1032-NEXT: s_mul_i32 s3, s3, 5 152; GFX1032-NEXT: v_mov_b32_e32 v2, s3 153; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 154; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 155; GFX1032-NEXT: ds_add_rtn_u32 v1, v1, v2 156; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 157; GFX1032-NEXT: buffer_gl0_inv 158; GFX1032-NEXT: .LBB0_2: 159; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 160; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 161; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 162; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 163; GFX1032-NEXT: v_mad_u32_u24 v0, v0, 5, s2 164; GFX1032-NEXT: s_mov_b32 s2, -1 165; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 166; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 167; GFX1032-NEXT: s_endpgm 168; 169; GFX1164-LABEL: add_i32_constant: 170; GFX1164: ; %bb.0: ; %entry 171; GFX1164-NEXT: s_load_b64 
s[0:1], s[0:1], 0x24 172; GFX1164-NEXT: s_mov_b64 s[2:3], exec 173; GFX1164-NEXT: s_mov_b64 s[4:5], exec 174; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 175; GFX1164-NEXT: ; implicit-def: $vgpr1 176; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 177; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 178; GFX1164-NEXT: s_cbranch_execz .LBB0_2 179; GFX1164-NEXT: ; %bb.1: 180; GFX1164-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 181; GFX1164-NEXT: v_mov_b32_e32 v1, 0 182; GFX1164-NEXT: s_mul_i32 s2, s2, 5 183; GFX1164-NEXT: v_mov_b32_e32 v2, s2 184; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 185; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 186; GFX1164-NEXT: ds_add_rtn_u32 v1, v1, v2 187; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 188; GFX1164-NEXT: buffer_gl0_inv 189; GFX1164-NEXT: .LBB0_2: 190; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] 191; GFX1164-NEXT: v_readfirstlane_b32 s2, v1 192; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 193; GFX1164-NEXT: v_mad_u32_u24 v0, v0, 5, s2 194; GFX1164-NEXT: s_mov_b32 s2, -1 195; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 196; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 197; GFX1164-NEXT: s_endpgm 198; 199; GFX1132-LABEL: add_i32_constant: 200; GFX1132: ; %bb.0: ; %entry 201; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 202; GFX1132-NEXT: s_mov_b32 s3, exec_lo 203; GFX1132-NEXT: s_mov_b32 s2, exec_lo 204; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 205; GFX1132-NEXT: ; implicit-def: $vgpr1 206; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 207; GFX1132-NEXT: s_cbranch_execz .LBB0_2 208; GFX1132-NEXT: ; %bb.1: 209; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s3 210; GFX1132-NEXT: v_mov_b32_e32 v1, 0 211; GFX1132-NEXT: s_mul_i32 s3, s3, 5 212; GFX1132-NEXT: v_mov_b32_e32 v2, s3 213; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 214; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 215; GFX1132-NEXT: ds_add_rtn_u32 v1, v1, v2 216; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 217; GFX1132-NEXT: buffer_gl0_inv 218; GFX1132-NEXT: .LBB0_2: 219; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2 220; GFX1132-NEXT: 
v_readfirstlane_b32 s2, v1 221; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 222; GFX1132-NEXT: v_mad_u32_u24 v0, v0, 5, s2 223; GFX1132-NEXT: s_mov_b32 s2, -1 224; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 225; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 226; GFX1132-NEXT: s_endpgm 227entry: 228 %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 5 acq_rel 229 store i32 %old, i32 addrspace(1)* %out 230 ret void 231} 232 233define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 %additive) { 234; 235; 236; GFX7LESS-LABEL: add_i32_uniform: 237; GFX7LESS: ; %bb.0: ; %entry 238; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec 239; GFX7LESS-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 240; GFX7LESS-NEXT: s_load_dword s6, s[0:1], 0xb 241; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 242; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 243; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 244; GFX7LESS-NEXT: ; implicit-def: $vgpr1 245; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc 246; GFX7LESS-NEXT: s_cbranch_execz .LBB1_2 247; GFX7LESS-NEXT: ; %bb.1: 248; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 249; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 250; GFX7LESS-NEXT: s_mul_i32 s2, s6, s2 251; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 252; GFX7LESS-NEXT: v_mov_b32_e32 v2, s2 253; GFX7LESS-NEXT: s_mov_b32 m0, -1 254; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 255; GFX7LESS-NEXT: ds_add_rtn_u32 v1, v1, v2 256; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 257; GFX7LESS-NEXT: .LBB1_2: 258; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1] 259; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 260; GFX7LESS-NEXT: v_readfirstlane_b32 s0, v1 261; GFX7LESS-NEXT: v_mul_lo_u32 v0, s6, v0 262; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 263; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s0, v0 264; GFX7LESS-NEXT: s_mov_b32 s6, -1 265; GFX7LESS-NEXT: buffer_store_dword v0, off, s[4:7], 0 266; GFX7LESS-NEXT: s_endpgm 267; 268; GFX8-LABEL: add_i32_uniform: 269; GFX8: ; %bb.0: ; %entry 270; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 271; GFX8-NEXT: 
s_load_dword s6, s[0:1], 0x2c 272; GFX8-NEXT: s_mov_b64 s[2:3], exec 273; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 274; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 275; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 276; GFX8-NEXT: ; implicit-def: $vgpr1 277; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc 278; GFX8-NEXT: s_cbranch_execz .LBB1_2 279; GFX8-NEXT: ; %bb.1: 280; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 281; GFX8-NEXT: s_waitcnt lgkmcnt(0) 282; GFX8-NEXT: s_mul_i32 s2, s6, s2 283; GFX8-NEXT: v_mov_b32_e32 v1, 0 284; GFX8-NEXT: v_mov_b32_e32 v2, s2 285; GFX8-NEXT: s_mov_b32 m0, -1 286; GFX8-NEXT: s_waitcnt lgkmcnt(0) 287; GFX8-NEXT: ds_add_rtn_u32 v1, v1, v2 288; GFX8-NEXT: s_waitcnt lgkmcnt(0) 289; GFX8-NEXT: .LBB1_2: 290; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] 291; GFX8-NEXT: s_waitcnt lgkmcnt(0) 292; GFX8-NEXT: v_mul_lo_u32 v0, s6, v0 293; GFX8-NEXT: v_readfirstlane_b32 s0, v1 294; GFX8-NEXT: s_mov_b32 s7, 0xf000 295; GFX8-NEXT: s_mov_b32 s6, -1 296; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 297; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0 298; GFX8-NEXT: s_endpgm 299; 300; GFX9-LABEL: add_i32_uniform: 301; GFX9: ; %bb.0: ; %entry 302; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 303; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c 304; GFX9-NEXT: s_mov_b64 s[2:3], exec 305; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 306; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 307; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 308; GFX9-NEXT: ; implicit-def: $vgpr1 309; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc 310; GFX9-NEXT: s_cbranch_execz .LBB1_2 311; GFX9-NEXT: ; %bb.1: 312; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 313; GFX9-NEXT: s_waitcnt lgkmcnt(0) 314; GFX9-NEXT: s_mul_i32 s2, s6, s2 315; GFX9-NEXT: v_mov_b32_e32 v1, 0 316; GFX9-NEXT: v_mov_b32_e32 v2, s2 317; GFX9-NEXT: s_waitcnt lgkmcnt(0) 318; GFX9-NEXT: ds_add_rtn_u32 v1, v1, v2 319; GFX9-NEXT: s_waitcnt lgkmcnt(0) 320; GFX9-NEXT: .LBB1_2: 321; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] 322; GFX9-NEXT: s_waitcnt lgkmcnt(0) 323; GFX9-NEXT: 
v_mul_lo_u32 v0, s6, v0 324; GFX9-NEXT: v_readfirstlane_b32 s0, v1 325; GFX9-NEXT: s_mov_b32 s7, 0xf000 326; GFX9-NEXT: s_mov_b32 s6, -1 327; GFX9-NEXT: v_add_u32_e32 v0, s0, v0 328; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 329; GFX9-NEXT: s_endpgm 330; 331; GFX1064-LABEL: add_i32_uniform: 332; GFX1064: ; %bb.0: ; %entry 333; GFX1064-NEXT: s_clause 0x1 334; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 335; GFX1064-NEXT: s_load_dword s6, s[0:1], 0x2c 336; GFX1064-NEXT: s_mov_b64 s[2:3], exec 337; GFX1064-NEXT: ; implicit-def: $vgpr1 338; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 339; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 340; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 341; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc 342; GFX1064-NEXT: s_cbranch_execz .LBB1_2 343; GFX1064-NEXT: ; %bb.1: 344; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 345; GFX1064-NEXT: v_mov_b32_e32 v1, 0 346; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 347; GFX1064-NEXT: s_mul_i32 s2, s6, s2 348; GFX1064-NEXT: v_mov_b32_e32 v2, s2 349; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 350; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 351; GFX1064-NEXT: ds_add_rtn_u32 v1, v1, v2 352; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 353; GFX1064-NEXT: buffer_gl0_inv 354; GFX1064-NEXT: .LBB1_2: 355; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 356; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] 357; GFX1064-NEXT: v_readfirstlane_b32 s0, v1 358; GFX1064-NEXT: s_mov_b32 s7, 0x31016000 359; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 360; GFX1064-NEXT: v_mad_u64_u32 v[0:1], null, s6, v0, s[0:1] 361; GFX1064-NEXT: s_mov_b32 s6, -1 362; GFX1064-NEXT: buffer_store_dword v0, off, s[4:7], 0 363; GFX1064-NEXT: s_endpgm 364; 365; GFX1032-LABEL: add_i32_uniform: 366; GFX1032: ; %bb.0: ; %entry 367; GFX1032-NEXT: s_clause 0x1 368; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 369; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x2c 370; GFX1032-NEXT: s_mov_b32 s3, exec_lo 371; GFX1032-NEXT: ; implicit-def: $vgpr1 372; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, 
s3, 0 373; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 374; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo 375; GFX1032-NEXT: s_cbranch_execz .LBB1_2 376; GFX1032-NEXT: ; %bb.1: 377; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s3 378; GFX1032-NEXT: v_mov_b32_e32 v1, 0 379; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 380; GFX1032-NEXT: s_mul_i32 s1, s2, s1 381; GFX1032-NEXT: v_mov_b32_e32 v2, s1 382; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 383; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 384; GFX1032-NEXT: ds_add_rtn_u32 v1, v1, v2 385; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 386; GFX1032-NEXT: buffer_gl0_inv 387; GFX1032-NEXT: .LBB1_2: 388; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 389; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 390; GFX1032-NEXT: v_readfirstlane_b32 s0, v1 391; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 392; GFX1032-NEXT: s_mov_b32 s6, -1 393; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 394; GFX1032-NEXT: v_mad_u64_u32 v[0:1], null, s2, v0, s[0:1] 395; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0 396; GFX1032-NEXT: s_endpgm 397; 398; GFX1164-LABEL: add_i32_uniform: 399; GFX1164: ; %bb.0: ; %entry 400; GFX1164-NEXT: s_clause 0x1 401; GFX1164-NEXT: s_load_b64 s[4:5], s[0:1], 0x24 402; GFX1164-NEXT: s_load_b32 s6, s[0:1], 0x2c 403; GFX1164-NEXT: s_mov_b64 s[2:3], exec 404; GFX1164-NEXT: s_mov_b64 s[0:1], exec 405; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 406; GFX1164-NEXT: ; implicit-def: $vgpr1 407; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 408; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 409; GFX1164-NEXT: s_cbranch_execz .LBB1_2 410; GFX1164-NEXT: ; %bb.1: 411; GFX1164-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 412; GFX1164-NEXT: v_mov_b32_e32 v1, 0 413; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 414; GFX1164-NEXT: s_mul_i32 s2, s6, s2 415; GFX1164-NEXT: v_mov_b32_e32 v2, s2 416; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 417; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 418; GFX1164-NEXT: ds_add_rtn_u32 v1, v1, v2 419; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 420; GFX1164-NEXT: buffer_gl0_inv 421; 
GFX1164-NEXT: .LBB1_2: 422; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] 423; GFX1164-NEXT: v_readfirstlane_b32 s0, v1 424; GFX1164-NEXT: s_mov_b32 s7, 0x31016000 425; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 426; GFX1164-NEXT: v_mad_u64_u32 v[1:2], null, s6, v0, s[0:1] 427; GFX1164-NEXT: s_mov_b32 s6, -1 428; GFX1164-NEXT: buffer_store_b32 v1, off, s[4:7], 0 429; GFX1164-NEXT: s_endpgm 430; 431; GFX1132-LABEL: add_i32_uniform: 432; GFX1132: ; %bb.0: ; %entry 433; GFX1132-NEXT: s_clause 0x1 434; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x24 435; GFX1132-NEXT: s_load_b32 s0, s[0:1], 0x2c 436; GFX1132-NEXT: s_mov_b32 s2, exec_lo 437; GFX1132-NEXT: s_mov_b32 s1, exec_lo 438; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 439; GFX1132-NEXT: ; implicit-def: $vgpr1 440; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 441; GFX1132-NEXT: s_cbranch_execz .LBB1_2 442; GFX1132-NEXT: ; %bb.1: 443; GFX1132-NEXT: s_bcnt1_i32_b32 s2, s2 444; GFX1132-NEXT: v_mov_b32_e32 v1, 0 445; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 446; GFX1132-NEXT: s_mul_i32 s2, s0, s2 447; GFX1132-NEXT: v_mov_b32_e32 v2, s2 448; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 449; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 450; GFX1132-NEXT: ds_add_rtn_u32 v1, v1, v2 451; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 452; GFX1132-NEXT: buffer_gl0_inv 453; GFX1132-NEXT: .LBB1_2: 454; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s1 455; GFX1132-NEXT: v_readfirstlane_b32 s2, v1 456; GFX1132-NEXT: s_mov_b32 s7, 0x31016000 457; GFX1132-NEXT: s_mov_b32 s6, -1 458; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 459; GFX1132-NEXT: v_mad_u64_u32 v[1:2], null, s0, v0, s[2:3] 460; GFX1132-NEXT: buffer_store_b32 v1, off, s[4:7], 0 461; GFX1132-NEXT: s_endpgm 462entry: 463 %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %additive acq_rel 464 store i32 %old, i32 addrspace(1)* %out 465 ret void 466} 467 468define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out) { 469; 470; 471; GFX7LESS-LABEL: add_i32_varying: 472; GFX7LESS: ; %bb.0: ; %entry 473; GFX7LESS-NEXT: 
s_load_dwordx2 s[0:1], s[0:1], 0x9 474; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 475; GFX7LESS-NEXT: s_mov_b32 m0, -1 476; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 477; GFX7LESS-NEXT: ds_add_rtn_u32 v0, v1, v0 478; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 479; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 480; GFX7LESS-NEXT: s_mov_b32 s2, -1 481; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 482; GFX7LESS-NEXT: s_endpgm 483; 484; GFX8-LABEL: add_i32_varying: 485; GFX8: ; %bb.0: ; %entry 486; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 487; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 488; GFX8-NEXT: v_mov_b32_e32 v1, 0 489; GFX8-NEXT: s_mov_b64 exec, s[2:3] 490; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 491; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 492; GFX8-NEXT: v_mov_b32_e32 v2, v0 493; GFX8-NEXT: s_not_b64 exec, exec 494; GFX8-NEXT: v_mov_b32_e32 v2, 0 495; GFX8-NEXT: s_not_b64 exec, exec 496; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 497; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 498; GFX8-NEXT: s_nop 1 499; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 500; GFX8-NEXT: s_nop 1 501; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 502; GFX8-NEXT: s_nop 1 503; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 504; GFX8-NEXT: s_nop 1 505; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 506; GFX8-NEXT: s_nop 1 507; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 508; GFX8-NEXT: v_readlane_b32 s4, v2, 63 509; GFX8-NEXT: s_nop 0 510; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 511; GFX8-NEXT: s_mov_b64 exec, s[2:3] 512; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 513; GFX8-NEXT: ; implicit-def: $vgpr0 514; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 515; GFX8-NEXT: s_cbranch_execz .LBB2_2 516; GFX8-NEXT: ; %bb.1: 517; 
GFX8-NEXT: v_mov_b32_e32 v0, 0 518; GFX8-NEXT: v_mov_b32_e32 v3, s4 519; GFX8-NEXT: s_mov_b32 m0, -1 520; GFX8-NEXT: s_waitcnt lgkmcnt(0) 521; GFX8-NEXT: ds_add_rtn_u32 v0, v0, v3 522; GFX8-NEXT: s_waitcnt lgkmcnt(0) 523; GFX8-NEXT: .LBB2_2: 524; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 525; GFX8-NEXT: s_waitcnt lgkmcnt(0) 526; GFX8-NEXT: v_readfirstlane_b32 s2, v0 527; GFX8-NEXT: v_mov_b32_e32 v0, v1 528; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0 529; GFX8-NEXT: s_mov_b32 s3, 0xf000 530; GFX8-NEXT: s_mov_b32 s2, -1 531; GFX8-NEXT: s_nop 0 532; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 533; GFX8-NEXT: s_endpgm 534; 535; GFX9-LABEL: add_i32_varying: 536; GFX9: ; %bb.0: ; %entry 537; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 538; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 539; GFX9-NEXT: v_mov_b32_e32 v1, 0 540; GFX9-NEXT: s_mov_b64 exec, s[2:3] 541; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 542; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 543; GFX9-NEXT: v_mov_b32_e32 v2, v0 544; GFX9-NEXT: s_not_b64 exec, exec 545; GFX9-NEXT: v_mov_b32_e32 v2, 0 546; GFX9-NEXT: s_not_b64 exec, exec 547; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 548; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 549; GFX9-NEXT: s_nop 1 550; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 551; GFX9-NEXT: s_nop 1 552; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 553; GFX9-NEXT: s_nop 1 554; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 555; GFX9-NEXT: s_nop 1 556; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 557; GFX9-NEXT: s_nop 1 558; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 559; GFX9-NEXT: v_readlane_b32 s4, v2, 63 560; GFX9-NEXT: s_nop 0 561; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 562; GFX9-NEXT: s_mov_b64 exec, s[2:3] 563; GFX9-NEXT: 
v_cmp_eq_u32_e32 vcc, 0, v3 564; GFX9-NEXT: ; implicit-def: $vgpr0 565; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 566; GFX9-NEXT: s_cbranch_execz .LBB2_2 567; GFX9-NEXT: ; %bb.1: 568; GFX9-NEXT: v_mov_b32_e32 v0, 0 569; GFX9-NEXT: v_mov_b32_e32 v3, s4 570; GFX9-NEXT: s_waitcnt lgkmcnt(0) 571; GFX9-NEXT: ds_add_rtn_u32 v0, v0, v3 572; GFX9-NEXT: s_waitcnt lgkmcnt(0) 573; GFX9-NEXT: .LBB2_2: 574; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 575; GFX9-NEXT: s_waitcnt lgkmcnt(0) 576; GFX9-NEXT: v_readfirstlane_b32 s2, v0 577; GFX9-NEXT: v_mov_b32_e32 v0, v1 578; GFX9-NEXT: v_add_u32_e32 v0, s2, v0 579; GFX9-NEXT: s_mov_b32 s3, 0xf000 580; GFX9-NEXT: s_mov_b32 s2, -1 581; GFX9-NEXT: s_nop 0 582; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 583; GFX9-NEXT: s_endpgm 584; 585; GFX1064-LABEL: add_i32_varying: 586; GFX1064: ; %bb.0: ; %entry 587; GFX1064-NEXT: v_mov_b32_e32 v1, v0 588; GFX1064-NEXT: s_not_b64 exec, exec 589; GFX1064-NEXT: v_mov_b32_e32 v1, 0 590; GFX1064-NEXT: s_not_b64 exec, exec 591; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 592; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 593; GFX1064-NEXT: v_mov_b32_e32 v3, 0 594; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 595; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 596; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 597; GFX1064-NEXT: v_mov_b32_e32 v2, v1 598; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 599; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 600; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 601; GFX1064-NEXT: v_mov_b32_e32 v2, s4 602; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 603; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 604; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 605; GFX1064-NEXT: s_mov_b64 exec, 
s[2:3] 606; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 607; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 608; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 609; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 610; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 611; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 612; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 613; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 614; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 615; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 616; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 617; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 618; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 619; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 620; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 621; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 622; GFX1064-NEXT: s_mov_b32 s2, -1 623; GFX1064-NEXT: ; implicit-def: $vgpr0 624; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 625; GFX1064-NEXT: s_cbranch_execz .LBB2_2 626; GFX1064-NEXT: ; %bb.1: 627; GFX1064-NEXT: v_mov_b32_e32 v0, 0 628; GFX1064-NEXT: v_mov_b32_e32 v4, s7 629; GFX1064-NEXT: s_mov_b32 s3, s7 630; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 631; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 632; GFX1064-NEXT: ds_add_rtn_u32 v0, v0, v4 633; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 634; GFX1064-NEXT: buffer_gl0_inv 635; GFX1064-NEXT: .LBB2_2: 636; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 637; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 638; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 639; GFX1064-NEXT: v_mov_b32_e32 v0, v3 640; GFX1064-NEXT: v_add_nc_u32_e32 v0, s3, v0 641; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 642; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 643; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 644; GFX1064-NEXT: s_endpgm 645; 646; GFX1032-LABEL: add_i32_varying: 647; GFX1032: ; %bb.0: ; %entry 648; GFX1032-NEXT: v_mov_b32_e32 v1, v0 649; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 650; GFX1032-NEXT: v_mov_b32_e32 v1, 0 651; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 652; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 653; 
GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 654; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 655; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 656; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 657; GFX1032-NEXT: v_mov_b32_e32 v2, v1 658; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 659; GFX1032-NEXT: s_mov_b32 exec_lo, s2 660; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 661; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 662; GFX1032-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 663; GFX1032-NEXT: v_mov_b32_e32 v3, 0 664; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 665; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 666; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 667; GFX1032-NEXT: s_mov_b32 exec_lo, s2 668; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 669; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 670; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 671; GFX1032-NEXT: s_mov_b32 exec_lo, s2 672; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 673; GFX1032-NEXT: s_mov_b32 s2, -1 674; GFX1032-NEXT: ; implicit-def: $vgpr0 675; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 676; GFX1032-NEXT: s_cbranch_execz .LBB2_2 677; GFX1032-NEXT: ; %bb.1: 678; GFX1032-NEXT: v_mov_b32_e32 v0, 0 679; GFX1032-NEXT: v_mov_b32_e32 v4, s4 680; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 681; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 682; GFX1032-NEXT: ds_add_rtn_u32 v0, v0, v4 683; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 684; GFX1032-NEXT: buffer_gl0_inv 685; GFX1032-NEXT: .LBB2_2: 686; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 687; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 688; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 689; GFX1032-NEXT: v_mov_b32_e32 v0, v3 690; GFX1032-NEXT: v_add_nc_u32_e32 v0, s3, v0 691; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 692; GFX1032-NEXT: 
s_waitcnt lgkmcnt(0) 693; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 694; GFX1032-NEXT: s_endpgm 695; 696; GFX1164-LABEL: add_i32_varying: 697; GFX1164: ; %bb.0: ; %entry 698; GFX1164-NEXT: v_mov_b32_e32 v1, v0 699; GFX1164-NEXT: s_not_b64 exec, exec 700; GFX1164-NEXT: v_mov_b32_e32 v1, 0 701; GFX1164-NEXT: s_not_b64 exec, exec 702; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 703; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 704; GFX1164-NEXT: v_mov_b32_e32 v3, 0 705; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 706; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 707; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 708; GFX1164-NEXT: v_mov_b32_e32 v2, v1 709; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1 710; GFX1164-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 711; GFX1164-NEXT: v_readlane_b32 s4, v1, 31 712; GFX1164-NEXT: v_mov_b32_e32 v2, s4 713; GFX1164-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 714; GFX1164-NEXT: v_readlane_b32 s4, v1, 15 715; GFX1164-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 716; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 717; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 718; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 719; GFX1164-NEXT: v_readlane_b32 s5, v1, 31 720; GFX1164-NEXT: v_writelane_b32 v3, s4, 16 721; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 722; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 723; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 724; GFX1164-NEXT: v_readlane_b32 s7, v1, 63 725; GFX1164-NEXT: v_readlane_b32 s6, v1, 47 726; GFX1164-NEXT: v_writelane_b32 v3, s5, 32 727; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 728; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 729; GFX1164-NEXT: s_or_saveexec_b64 s[4:5], -1 730; GFX1164-NEXT: v_writelane_b32 v3, s6, 
48 731; GFX1164-NEXT: s_mov_b64 exec, s[4:5] 732; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 733; GFX1164-NEXT: s_mov_b32 s2, -1 734; GFX1164-NEXT: ; implicit-def: $vgpr0 735; GFX1164-NEXT: s_and_saveexec_b64 s[4:5], vcc 736; GFX1164-NEXT: s_cbranch_execz .LBB2_2 737; GFX1164-NEXT: ; %bb.1: 738; GFX1164-NEXT: v_mov_b32_e32 v0, 0 739; GFX1164-NEXT: v_mov_b32_e32 v4, s7 740; GFX1164-NEXT: s_mov_b32 s3, s7 741; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 742; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 743; GFX1164-NEXT: ds_add_rtn_u32 v0, v0, v4 744; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 745; GFX1164-NEXT: buffer_gl0_inv 746; GFX1164-NEXT: .LBB2_2: 747; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] 748; GFX1164-NEXT: v_readfirstlane_b32 s3, v0 749; GFX1164-NEXT: v_mov_b32_e32 v0, v3 750; GFX1164-NEXT: v_add_nc_u32_e32 v0, s3, v0 751; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 752; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 753; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 754; GFX1164-NEXT: s_endpgm 755; 756; GFX1132-LABEL: add_i32_varying: 757; GFX1132: ; %bb.0: ; %entry 758; GFX1132-NEXT: v_mov_b32_e32 v1, v0 759; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo 760; GFX1132-NEXT: v_mov_b32_e32 v1, 0 761; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo 762; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 763; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 764; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 765; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 766; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 767; GFX1132-NEXT: v_mov_b32_e32 v2, v1 768; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1 769; GFX1132-NEXT: s_mov_b32 exec_lo, s2 770; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 771; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 772; GFX1132-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 773; 
GFX1132-NEXT: v_mov_b32_e32 v3, 0 774; GFX1132-NEXT: v_readlane_b32 s3, v1, 15 775; GFX1132-NEXT: v_readlane_b32 s4, v1, 31 776; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 777; GFX1132-NEXT: s_mov_b32 exec_lo, s2 778; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 779; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 780; GFX1132-NEXT: v_writelane_b32 v3, s3, 16 781; GFX1132-NEXT: s_mov_b32 exec_lo, s2 782; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 783; GFX1132-NEXT: s_mov_b32 s2, -1 784; GFX1132-NEXT: ; implicit-def: $vgpr0 785; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo 786; GFX1132-NEXT: s_cbranch_execz .LBB2_2 787; GFX1132-NEXT: ; %bb.1: 788; GFX1132-NEXT: v_mov_b32_e32 v0, 0 789; GFX1132-NEXT: v_mov_b32_e32 v4, s4 790; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 791; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 792; GFX1132-NEXT: ds_add_rtn_u32 v0, v0, v4 793; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 794; GFX1132-NEXT: buffer_gl0_inv 795; GFX1132-NEXT: .LBB2_2: 796; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 797; GFX1132-NEXT: v_readfirstlane_b32 s3, v0 798; GFX1132-NEXT: v_mov_b32_e32 v0, v3 799; GFX1132-NEXT: v_add_nc_u32_e32 v0, s3, v0 800; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 801; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 802; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 803; GFX1132-NEXT: s_endpgm 804entry: 805 %lane = call i32 @llvm.amdgcn.workitem.id.x() 806 %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %lane acq_rel 807 store i32 %old, i32 addrspace(1)* %out 808 ret void 809} 810 811define amdgpu_kernel void @add_i32_varying_nouse() { 812; GFX7LESS-LABEL: add_i32_varying_nouse: 813; GFX7LESS: ; %bb.0: ; %entry 814; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 815; GFX7LESS-NEXT: s_mov_b32 m0, -1 816; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 817; GFX7LESS-NEXT: ds_add_u32 v1, v0 818; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 819; GFX7LESS-NEXT: s_endpgm 820; 821; GFX8-LABEL: add_i32_varying_nouse: 822; GFX8: ; %bb.0: ; %entry 823; GFX8-NEXT: 
v_mbcnt_lo_u32_b32 v2, exec_lo, 0 824; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 825; GFX8-NEXT: v_mov_b32_e32 v1, v0 826; GFX8-NEXT: s_not_b64 exec, exec 827; GFX8-NEXT: v_mov_b32_e32 v1, 0 828; GFX8-NEXT: s_not_b64 exec, exec 829; GFX8-NEXT: s_or_saveexec_b64 s[0:1], -1 830; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 831; GFX8-NEXT: s_nop 1 832; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 833; GFX8-NEXT: s_nop 1 834; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 835; GFX8-NEXT: s_nop 1 836; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 837; GFX8-NEXT: s_nop 1 838; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf 839; GFX8-NEXT: s_nop 1 840; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf 841; GFX8-NEXT: v_readlane_b32 s2, v1, 63 842; GFX8-NEXT: s_mov_b64 exec, s[0:1] 843; GFX8-NEXT: s_mov_b32 s0, s2 844; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 845; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 846; GFX8-NEXT: s_cbranch_execz .LBB3_2 847; GFX8-NEXT: ; %bb.1: 848; GFX8-NEXT: v_mov_b32_e32 v0, 0 849; GFX8-NEXT: v_mov_b32_e32 v2, s0 850; GFX8-NEXT: s_mov_b32 m0, -1 851; GFX8-NEXT: s_waitcnt lgkmcnt(0) 852; GFX8-NEXT: ds_add_u32 v0, v2 853; GFX8-NEXT: s_waitcnt lgkmcnt(0) 854; GFX8-NEXT: .LBB3_2: 855; GFX8-NEXT: s_endpgm 856; 857; GFX9-LABEL: add_i32_varying_nouse: 858; GFX9: ; %bb.0: ; %entry 859; GFX9-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 860; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 861; GFX9-NEXT: v_mov_b32_e32 v1, v0 862; GFX9-NEXT: s_not_b64 exec, exec 863; GFX9-NEXT: v_mov_b32_e32 v1, 0 864; GFX9-NEXT: s_not_b64 exec, exec 865; GFX9-NEXT: s_or_saveexec_b64 s[0:1], -1 866; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 867; GFX9-NEXT: s_nop 1 868; GFX9-NEXT: v_add_u32_dpp 
v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 869; GFX9-NEXT: s_nop 1 870; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 871; GFX9-NEXT: s_nop 1 872; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 873; GFX9-NEXT: s_nop 1 874; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf 875; GFX9-NEXT: s_nop 1 876; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf 877; GFX9-NEXT: v_readlane_b32 s2, v1, 63 878; GFX9-NEXT: s_mov_b64 exec, s[0:1] 879; GFX9-NEXT: s_mov_b32 s0, s2 880; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 881; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 882; GFX9-NEXT: s_cbranch_execz .LBB3_2 883; GFX9-NEXT: ; %bb.1: 884; GFX9-NEXT: v_mov_b32_e32 v0, 0 885; GFX9-NEXT: v_mov_b32_e32 v2, s0 886; GFX9-NEXT: s_waitcnt lgkmcnt(0) 887; GFX9-NEXT: ds_add_u32 v0, v2 888; GFX9-NEXT: s_waitcnt lgkmcnt(0) 889; GFX9-NEXT: .LBB3_2: 890; GFX9-NEXT: s_endpgm 891; 892; GFX1064-LABEL: add_i32_varying_nouse: 893; GFX1064: ; %bb.0: ; %entry 894; GFX1064-NEXT: v_mov_b32_e32 v1, v0 895; GFX1064-NEXT: s_not_b64 exec, exec 896; GFX1064-NEXT: v_mov_b32_e32 v1, 0 897; GFX1064-NEXT: s_not_b64 exec, exec 898; GFX1064-NEXT: s_or_saveexec_b64 s[0:1], -1 899; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 900; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 901; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 902; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 903; GFX1064-NEXT: v_mov_b32_e32 v2, v1 904; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 905; GFX1064-NEXT: v_add_nc_u32_e32 v1, v1, v2 906; GFX1064-NEXT: s_mov_b64 exec, s[0:1] 907; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 908; GFX1064-NEXT: s_or_saveexec_b64 s[0:1], -1 909; GFX1064-NEXT: 
v_readlane_b32 s2, v1, 0 910; GFX1064-NEXT: v_readlane_b32 s3, v1, 32 911; GFX1064-NEXT: s_mov_b64 exec, s[0:1] 912; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 913; GFX1064-NEXT: s_add_i32 s0, s2, s3 914; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 915; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc 916; GFX1064-NEXT: s_cbranch_execz .LBB3_2 917; GFX1064-NEXT: ; %bb.1: 918; GFX1064-NEXT: v_mov_b32_e32 v0, 0 919; GFX1064-NEXT: v_mov_b32_e32 v3, s0 920; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 921; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 922; GFX1064-NEXT: ds_add_u32 v0, v3 923; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 924; GFX1064-NEXT: buffer_gl0_inv 925; GFX1064-NEXT: .LBB3_2: 926; GFX1064-NEXT: s_endpgm 927; 928; GFX1032-LABEL: add_i32_varying_nouse: 929; GFX1032: ; %bb.0: ; %entry 930; GFX1032-NEXT: v_mov_b32_e32 v1, v0 931; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 932; GFX1032-NEXT: v_mov_b32_e32 v1, 0 933; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 934; GFX1032-NEXT: s_or_saveexec_b32 s0, -1 935; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 936; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 937; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 938; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 939; GFX1032-NEXT: v_mov_b32_e32 v2, v1 940; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 941; GFX1032-NEXT: v_add_nc_u32_e32 v1, v1, v2 942; GFX1032-NEXT: s_mov_b32 exec_lo, s0 943; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 944; GFX1032-NEXT: v_mov_b32_e32 v0, v1 945; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 946; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo 947; GFX1032-NEXT: s_cbranch_execz .LBB3_2 948; GFX1032-NEXT: ; %bb.1: 949; GFX1032-NEXT: v_mov_b32_e32 v3, 0 950; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 951; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 952; GFX1032-NEXT: 
ds_add_u32 v3, v0 953; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 954; GFX1032-NEXT: buffer_gl0_inv 955; GFX1032-NEXT: .LBB3_2: 956; GFX1032-NEXT: s_endpgm 957; 958; GFX1164-LABEL: add_i32_varying_nouse: 959; GFX1164: ; %bb.0: ; %entry 960; GFX1164-NEXT: v_mov_b32_e32 v1, v0 961; GFX1164-NEXT: s_not_b64 exec, exec 962; GFX1164-NEXT: v_mov_b32_e32 v1, 0 963; GFX1164-NEXT: s_not_b64 exec, exec 964; GFX1164-NEXT: s_or_saveexec_b64 s[0:1], -1 965; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 966; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 967; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 968; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 969; GFX1164-NEXT: v_mov_b32_e32 v2, v1 970; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1 971; GFX1164-NEXT: v_add_nc_u32_e32 v1, v1, v2 972; GFX1164-NEXT: v_permlane64_b32 v2, v1 973; GFX1164-NEXT: s_mov_b64 exec, s[0:1] 974; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 975; GFX1164-NEXT: s_or_saveexec_b64 s[0:1], -1 976; GFX1164-NEXT: v_add_nc_u32_e32 v1, v1, v2 977; GFX1164-NEXT: s_mov_b64 exec, s[0:1] 978; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v0 979; GFX1164-NEXT: v_mov_b32_e32 v0, v1 980; GFX1164-NEXT: s_mov_b64 s[0:1], exec 981; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v3 982; GFX1164-NEXT: s_cbranch_execz .LBB3_2 983; GFX1164-NEXT: ; %bb.1: 984; GFX1164-NEXT: v_mov_b32_e32 v3, 0 985; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 986; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 987; GFX1164-NEXT: ds_add_u32 v3, v0 988; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 989; GFX1164-NEXT: buffer_gl0_inv 990; GFX1164-NEXT: .LBB3_2: 991; GFX1164-NEXT: s_endpgm 992; 993; GFX1132-LABEL: add_i32_varying_nouse: 994; GFX1132: ; %bb.0: ; %entry 995; GFX1132-NEXT: v_mov_b32_e32 v1, v0 996; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo 997; GFX1132-NEXT: v_mov_b32_e32 v1, 0 
998; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo 999; GFX1132-NEXT: s_or_saveexec_b32 s0, -1 1000; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 1001; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 1002; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 1003; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 1004; GFX1132-NEXT: v_mov_b32_e32 v2, v1 1005; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1 1006; GFX1132-NEXT: v_add_nc_u32_e32 v1, v1, v2 1007; GFX1132-NEXT: s_mov_b32 exec_lo, s0 1008; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 1009; GFX1132-NEXT: v_mov_b32_e32 v0, v1 1010; GFX1132-NEXT: s_mov_b32 s0, exec_lo 1011; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v3 1012; GFX1132-NEXT: s_cbranch_execz .LBB3_2 1013; GFX1132-NEXT: ; %bb.1: 1014; GFX1132-NEXT: v_mov_b32_e32 v3, 0 1015; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1016; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 1017; GFX1132-NEXT: ds_add_u32 v3, v0 1018; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 1019; GFX1132-NEXT: buffer_gl0_inv 1020; GFX1132-NEXT: .LBB3_2: 1021; GFX1132-NEXT: s_endpgm 1022entry: 1023 %lane = call i32 @llvm.amdgcn.workitem.id.x() 1024 %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %lane acq_rel 1025 ret void 1026} 1027 1028define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) { 1029; 1030; 1031; GFX7LESS-LABEL: add_i64_constant: 1032; GFX7LESS: ; %bb.0: ; %entry 1033; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec 1034; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1035; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 1036; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s5, v0 1037; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 1038; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 1039; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc 1040; GFX7LESS-NEXT: s_cbranch_execz .LBB4_2 1041; GFX7LESS-NEXT: ; %bb.1: 1042; 
GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 1043; GFX7LESS-NEXT: s_mul_i32 s4, s4, 5 1044; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 1045; GFX7LESS-NEXT: v_mov_b32_e32 v0, s4 1046; GFX7LESS-NEXT: s_mov_b32 m0, -1 1047; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1048; GFX7LESS-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] 1049; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1050; GFX7LESS-NEXT: .LBB4_2: 1051; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] 1052; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1053; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v0 1054; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1 1055; GFX7LESS-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 1056; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v2 1057; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 1058; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4 1059; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s2, v0 1060; GFX7LESS-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc 1061; GFX7LESS-NEXT: s_mov_b32 s2, -1 1062; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1063; GFX7LESS-NEXT: s_endpgm 1064; 1065; GFX8-LABEL: add_i64_constant: 1066; GFX8: ; %bb.0: ; %entry 1067; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1068; GFX8-NEXT: s_mov_b64 s[4:5], exec 1069; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 1070; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 1071; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 1072; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 1073; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 1074; GFX8-NEXT: s_cbranch_execz .LBB4_2 1075; GFX8-NEXT: ; %bb.1: 1076; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 1077; GFX8-NEXT: s_mul_i32 s4, s4, 5 1078; GFX8-NEXT: v_mov_b32_e32 v0, s4 1079; GFX8-NEXT: v_mov_b32_e32 v1, 0 1080; GFX8-NEXT: s_mov_b32 m0, -1 1081; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1082; GFX8-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] 1083; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1084; GFX8-NEXT: .LBB4_2: 1085; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 1086; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1087; GFX8-NEXT: v_readfirstlane_b32 s2, v0 1088; GFX8-NEXT: v_readfirstlane_b32 s3, v1 1089; GFX8-NEXT: v_mov_b32_e32 v0, 
s2 1090; GFX8-NEXT: v_mov_b32_e32 v1, s3 1091; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v2, 5, v[0:1] 1092; GFX8-NEXT: s_mov_b32 s3, 0xf000 1093; GFX8-NEXT: s_mov_b32 s2, -1 1094; GFX8-NEXT: s_nop 2 1095; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1096; GFX8-NEXT: s_endpgm 1097; 1098; GFX9-LABEL: add_i64_constant: 1099; GFX9: ; %bb.0: ; %entry 1100; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1101; GFX9-NEXT: s_mov_b64 s[4:5], exec 1102; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 1103; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 1104; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 1105; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 1106; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 1107; GFX9-NEXT: s_cbranch_execz .LBB4_2 1108; GFX9-NEXT: ; %bb.1: 1109; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 1110; GFX9-NEXT: s_mul_i32 s4, s4, 5 1111; GFX9-NEXT: v_mov_b32_e32 v0, s4 1112; GFX9-NEXT: v_mov_b32_e32 v1, 0 1113; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1114; GFX9-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] 1115; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1116; GFX9-NEXT: .LBB4_2: 1117; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 1118; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1119; GFX9-NEXT: v_readfirstlane_b32 s2, v0 1120; GFX9-NEXT: v_readfirstlane_b32 s3, v1 1121; GFX9-NEXT: v_mov_b32_e32 v0, s2 1122; GFX9-NEXT: v_mov_b32_e32 v1, s3 1123; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v2, 5, v[0:1] 1124; GFX9-NEXT: s_mov_b32 s3, 0xf000 1125; GFX9-NEXT: s_mov_b32 s2, -1 1126; GFX9-NEXT: s_nop 2 1127; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1128; GFX9-NEXT: s_endpgm 1129; 1130; GFX1064-LABEL: add_i64_constant: 1131; GFX1064: ; %bb.0: ; %entry 1132; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1133; GFX1064-NEXT: s_mov_b64 s[4:5], exec 1134; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 1135; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 1136; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 1137; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 1138; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc 1139; GFX1064-NEXT: 
s_cbranch_execz .LBB4_2 1140; GFX1064-NEXT: ; %bb.1: 1141; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 1142; GFX1064-NEXT: v_mov_b32_e32 v1, 0 1143; GFX1064-NEXT: s_mul_i32 s4, s4, 5 1144; GFX1064-NEXT: v_mov_b32_e32 v0, s4 1145; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1146; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 1147; GFX1064-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] 1148; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1149; GFX1064-NEXT: buffer_gl0_inv 1150; GFX1064-NEXT: .LBB4_2: 1151; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 1152; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] 1153; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 1154; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 1155; GFX1064-NEXT: v_mad_u64_u32 v[0:1], null, v2, 5, s[2:3] 1156; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 1157; GFX1064-NEXT: s_mov_b32 s2, -1 1158; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1159; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1160; GFX1064-NEXT: s_endpgm 1161; 1162; GFX1032-LABEL: add_i64_constant: 1163; GFX1032: ; %bb.0: ; %entry 1164; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1165; GFX1032-NEXT: s_mov_b32 s3, exec_lo 1166; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 1167; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s3, 0 1168; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 1169; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo 1170; GFX1032-NEXT: s_cbranch_execz .LBB4_2 1171; GFX1032-NEXT: ; %bb.1: 1172; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 1173; GFX1032-NEXT: v_mov_b32_e32 v1, 0 1174; GFX1032-NEXT: s_mul_i32 s3, s3, 5 1175; GFX1032-NEXT: v_mov_b32_e32 v0, s3 1176; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1177; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 1178; GFX1032-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] 1179; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1180; GFX1032-NEXT: buffer_gl0_inv 1181; GFX1032-NEXT: .LBB4_2: 1182; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 1183; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 1184; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 1185; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 
1186; GFX1032-NEXT: v_mad_u64_u32 v[0:1], null, v2, 5, s[2:3] 1187; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 1188; GFX1032-NEXT: s_mov_b32 s2, -1 1189; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1190; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1191; GFX1032-NEXT: s_endpgm 1192; 1193; GFX1164-LABEL: add_i64_constant: 1194; GFX1164: ; %bb.0: ; %entry 1195; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 1196; GFX1164-NEXT: s_mov_b64 s[4:5], exec 1197; GFX1164-NEXT: s_mov_b64 s[2:3], exec 1198; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 1199; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 1200; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 1201; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 1202; GFX1164-NEXT: s_cbranch_execz .LBB4_2 1203; GFX1164-NEXT: ; %bb.1: 1204; GFX1164-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 1205; GFX1164-NEXT: v_mov_b32_e32 v1, 0 1206; GFX1164-NEXT: s_mul_i32 s4, s4, 5 1207; GFX1164-NEXT: v_mov_b32_e32 v0, s4 1208; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1209; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 1210; GFX1164-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] 1211; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 1212; GFX1164-NEXT: buffer_gl0_inv 1213; GFX1164-NEXT: .LBB4_2: 1214; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] 1215; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 1216; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 1217; GFX1164-NEXT: v_mad_u64_u32 v[0:1], null, v2, 5, s[2:3] 1218; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 1219; GFX1164-NEXT: s_mov_b32 s2, -1 1220; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 1221; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 1222; GFX1164-NEXT: s_endpgm 1223; 1224; GFX1132-LABEL: add_i64_constant: 1225; GFX1132: ; %bb.0: ; %entry 1226; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 1227; GFX1132-NEXT: s_mov_b32 s3, exec_lo 1228; GFX1132-NEXT: s_mov_b32 s2, exec_lo 1229; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s3, 0 1230; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 1231; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 1232; GFX1132-NEXT: s_cbranch_execz .LBB4_2 
1233; GFX1132-NEXT: ; %bb.1: 1234; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s3 1235; GFX1132-NEXT: v_mov_b32_e32 v1, 0 1236; GFX1132-NEXT: s_mul_i32 s3, s3, 5 1237; GFX1132-NEXT: v_mov_b32_e32 v0, s3 1238; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1239; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 1240; GFX1132-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] 1241; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 1242; GFX1132-NEXT: buffer_gl0_inv 1243; GFX1132-NEXT: .LBB4_2: 1244; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2 1245; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 1246; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 1247; GFX1132-NEXT: v_mad_u64_u32 v[0:1], null, v2, 5, s[2:3] 1248; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 1249; GFX1132-NEXT: s_mov_b32 s2, -1 1250; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 1251; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 1252; GFX1132-NEXT: s_endpgm 1253entry: 1254 %old = atomicrmw add i64 addrspace(3)* @local_var64, i64 5 acq_rel 1255 store i64 %old, i64 addrspace(1)* %out 1256 ret void 1257} 1258 1259define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 %additive) { 1260; 1261; 1262; GFX7LESS-LABEL: add_i64_uniform: 1263; GFX7LESS: ; %bb.0: ; %entry 1264; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec 1265; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 1266; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 1267; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s7, v0 1268; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 1269; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 1270; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc 1271; GFX7LESS-NEXT: s_cbranch_execz .LBB5_2 1272; GFX7LESS-NEXT: ; %bb.1: 1273; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 1274; GFX7LESS-NEXT: v_mov_b32_e32 v3, 0 1275; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1276; GFX7LESS-NEXT: s_mul_i32 s7, s3, s6 1277; GFX7LESS-NEXT: v_mov_b32_e32 v0, s6 1278; GFX7LESS-NEXT: v_mul_hi_u32 v0, s2, v0 1279; GFX7LESS-NEXT: s_mul_i32 s6, s2, s6 1280; GFX7LESS-NEXT: v_add_i32_e32 v1, vcc, s7, v0 1281; GFX7LESS-NEXT: 
v_mov_b32_e32 v0, s6 1282; GFX7LESS-NEXT: s_mov_b32 m0, -1 1283; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1284; GFX7LESS-NEXT: ds_add_rtn_u64 v[0:1], v3, v[0:1] 1285; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1286; GFX7LESS-NEXT: .LBB5_2: 1287; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] 1288; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 1289; GFX7LESS-NEXT: s_mov_b32 s6, -1 1290; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1291; GFX7LESS-NEXT: s_mov_b32 s4, s0 1292; GFX7LESS-NEXT: s_mov_b32 s5, s1 1293; GFX7LESS-NEXT: v_readfirstlane_b32 s0, v0 1294; GFX7LESS-NEXT: v_readfirstlane_b32 s1, v1 1295; GFX7LESS-NEXT: v_mul_lo_u32 v0, s3, v2 1296; GFX7LESS-NEXT: v_mul_hi_u32 v1, s2, v2 1297; GFX7LESS-NEXT: v_mul_lo_u32 v2, s2, v2 1298; GFX7LESS-NEXT: v_add_i32_e32 v1, vcc, v1, v0 1299; GFX7LESS-NEXT: v_mov_b32_e32 v3, s1 1300; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s0, v2 1301; GFX7LESS-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc 1302; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1303; GFX7LESS-NEXT: s_endpgm 1304; 1305; GFX8-LABEL: add_i64_uniform: 1306; GFX8: ; %bb.0: ; %entry 1307; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1308; GFX8-NEXT: s_mov_b64 s[6:7], exec 1309; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 1310; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 1311; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 1312; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 1313; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 1314; GFX8-NEXT: s_cbranch_execz .LBB5_2 1315; GFX8-NEXT: ; %bb.1: 1316; GFX8-NEXT: s_bcnt1_i32_b64 s8, s[6:7] 1317; GFX8-NEXT: v_mov_b32_e32 v0, s8 1318; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1319; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[6:7], s2, v0, 0 1320; GFX8-NEXT: s_mul_i32 s6, s3, s8 1321; GFX8-NEXT: v_mov_b32_e32 v3, 0 1322; GFX8-NEXT: v_add_u32_e32 v1, vcc, s6, v1 1323; GFX8-NEXT: s_mov_b32 m0, -1 1324; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1325; GFX8-NEXT: ds_add_rtn_u64 v[0:1], v3, v[0:1] 1326; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1327; GFX8-NEXT: .LBB5_2: 1328; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 
1329; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1330; GFX8-NEXT: v_readfirstlane_b32 s4, v0 1331; GFX8-NEXT: v_readfirstlane_b32 s5, v1 1332; GFX8-NEXT: v_mov_b32_e32 v0, s4 1333; GFX8-NEXT: v_mov_b32_e32 v1, s5 1334; GFX8-NEXT: v_mul_lo_u32 v3, s3, v2 1335; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[2:3], s2, v2, v[0:1] 1336; GFX8-NEXT: s_mov_b32 s7, 0xf000 1337; GFX8-NEXT: s_mov_b32 s6, -1 1338; GFX8-NEXT: s_mov_b32 s4, s0 1339; GFX8-NEXT: s_mov_b32 s5, s1 1340; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 1341; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1342; GFX8-NEXT: s_endpgm 1343; 1344; GFX9-LABEL: add_i64_uniform: 1345; GFX9: ; %bb.0: ; %entry 1346; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1347; GFX9-NEXT: s_mov_b64 s[6:7], exec 1348; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 1349; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 1350; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 1351; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 1352; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 1353; GFX9-NEXT: s_cbranch_execz .LBB5_2 1354; GFX9-NEXT: ; %bb.1: 1355; GFX9-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 1356; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1357; GFX9-NEXT: s_mul_i32 s7, s3, s6 1358; GFX9-NEXT: s_mul_hi_u32 s8, s2, s6 1359; GFX9-NEXT: s_add_i32 s8, s8, s7 1360; GFX9-NEXT: s_mul_i32 s6, s2, s6 1361; GFX9-NEXT: v_mov_b32_e32 v0, s6 1362; GFX9-NEXT: v_mov_b32_e32 v1, s8 1363; GFX9-NEXT: v_mov_b32_e32 v3, 0 1364; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1365; GFX9-NEXT: ds_add_rtn_u64 v[0:1], v3, v[0:1] 1366; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1367; GFX9-NEXT: .LBB5_2: 1368; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 1369; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1370; GFX9-NEXT: v_readfirstlane_b32 s4, v0 1371; GFX9-NEXT: v_readfirstlane_b32 s5, v1 1372; GFX9-NEXT: v_mov_b32_e32 v0, s4 1373; GFX9-NEXT: v_mov_b32_e32 v1, s5 1374; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s2, v2, v[0:1] 1375; GFX9-NEXT: s_mov_b32 s7, 0xf000 1376; GFX9-NEXT: s_mov_b32 s6, -1 1377; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[2:3], s3, v2, v[1:2] 1378; 
GFX9-NEXT: s_mov_b32 s4, s0 1379; GFX9-NEXT: s_mov_b32 s5, s1 1380; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1381; GFX9-NEXT: s_endpgm 1382; 1383; GFX1064-LABEL: add_i64_uniform: 1384; GFX1064: ; %bb.0: ; %entry 1385; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1386; GFX1064-NEXT: s_mov_b64 s[6:7], exec 1387; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 1388; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 1389; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 1390; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 1391; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 1392; GFX1064-NEXT: s_cbranch_execz .LBB5_2 1393; GFX1064-NEXT: ; %bb.1: 1394; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 1395; GFX1064-NEXT: v_mov_b32_e32 v3, 0 1396; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1397; GFX1064-NEXT: s_mul_i32 s7, s3, s6 1398; GFX1064-NEXT: s_mul_hi_u32 s8, s2, s6 1399; GFX1064-NEXT: s_mul_i32 s6, s2, s6 1400; GFX1064-NEXT: s_add_i32 s8, s8, s7 1401; GFX1064-NEXT: v_mov_b32_e32 v0, s6 1402; GFX1064-NEXT: v_mov_b32_e32 v1, s8 1403; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1404; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 1405; GFX1064-NEXT: ds_add_rtn_u64 v[0:1], v3, v[0:1] 1406; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1407; GFX1064-NEXT: buffer_gl0_inv 1408; GFX1064-NEXT: .LBB5_2: 1409; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 1410; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 1411; GFX1064-NEXT: v_readfirstlane_b32 s4, v0 1412; GFX1064-NEXT: v_readfirstlane_b32 s5, v1 1413; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1414; GFX1064-NEXT: v_mad_u64_u32 v[0:1], null, s2, v2, s[4:5] 1415; GFX1064-NEXT: s_mov_b32 s2, -1 1416; GFX1064-NEXT: v_mad_u64_u32 v[1:2], null, s3, v2, v[1:2] 1417; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 1418; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1419; GFX1064-NEXT: s_endpgm 1420; 1421; GFX1032-LABEL: add_i64_uniform: 1422; GFX1032: ; %bb.0: ; %entry 1423; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1424; GFX1032-NEXT: s_mov_b32 s5, exec_lo 1425; GFX1032-NEXT: 
; implicit-def: $vgpr0_vgpr1 1426; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0 1427; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 1428; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo 1429; GFX1032-NEXT: s_cbranch_execz .LBB5_2 1430; GFX1032-NEXT: ; %bb.1: 1431; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s5 1432; GFX1032-NEXT: v_mov_b32_e32 v3, 0 1433; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1434; GFX1032-NEXT: s_mul_i32 s6, s3, s5 1435; GFX1032-NEXT: s_mul_hi_u32 s7, s2, s5 1436; GFX1032-NEXT: s_mul_i32 s5, s2, s5 1437; GFX1032-NEXT: s_add_i32 s7, s7, s6 1438; GFX1032-NEXT: v_mov_b32_e32 v0, s5 1439; GFX1032-NEXT: v_mov_b32_e32 v1, s7 1440; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1441; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 1442; GFX1032-NEXT: ds_add_rtn_u64 v[0:1], v3, v[0:1] 1443; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1444; GFX1032-NEXT: buffer_gl0_inv 1445; GFX1032-NEXT: .LBB5_2: 1446; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 1447; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 1448; GFX1032-NEXT: v_readfirstlane_b32 s4, v0 1449; GFX1032-NEXT: v_readfirstlane_b32 s5, v1 1450; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1451; GFX1032-NEXT: v_mad_u64_u32 v[0:1], null, s2, v2, s[4:5] 1452; GFX1032-NEXT: s_mov_b32 s2, -1 1453; GFX1032-NEXT: v_mad_u64_u32 v[1:2], null, s3, v2, v[1:2] 1454; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 1455; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1456; GFX1032-NEXT: s_endpgm 1457; 1458; GFX1164-LABEL: add_i64_uniform: 1459; GFX1164: ; %bb.0: ; %entry 1460; GFX1164-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 1461; GFX1164-NEXT: s_mov_b64 s[6:7], exec 1462; GFX1164-NEXT: s_mov_b64 s[4:5], exec 1463; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 1464; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 1465; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 1466; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 1467; GFX1164-NEXT: s_cbranch_execz .LBB5_2 1468; GFX1164-NEXT: ; %bb.1: 1469; GFX1164-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 1470; GFX1164-NEXT: v_mov_b32_e32 v3, 0 1471; GFX1164-NEXT: 
s_waitcnt lgkmcnt(0) 1472; GFX1164-NEXT: s_mul_i32 s7, s3, s6 1473; GFX1164-NEXT: s_mul_hi_u32 s8, s2, s6 1474; GFX1164-NEXT: s_mul_i32 s6, s2, s6 1475; GFX1164-NEXT: s_add_i32 s8, s8, s7 1476; GFX1164-NEXT: v_mov_b32_e32 v0, s6 1477; GFX1164-NEXT: v_mov_b32_e32 v1, s8 1478; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1479; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 1480; GFX1164-NEXT: ds_add_rtn_u64 v[0:1], v3, v[0:1] 1481; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 1482; GFX1164-NEXT: buffer_gl0_inv 1483; GFX1164-NEXT: .LBB5_2: 1484; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] 1485; GFX1164-NEXT: v_readfirstlane_b32 s4, v0 1486; GFX1164-NEXT: v_readfirstlane_b32 s5, v1 1487; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 1488; GFX1164-NEXT: v_mad_u64_u32 v[0:1], null, s2, v2, s[4:5] 1489; GFX1164-NEXT: s_mov_b32 s2, -1 1490; GFX1164-NEXT: v_mad_u64_u32 v[3:4], null, s3, v2, v[1:2] 1491; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 1492; GFX1164-NEXT: v_mov_b32_e32 v1, v3 1493; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 1494; GFX1164-NEXT: s_endpgm 1495; 1496; GFX1132-LABEL: add_i64_uniform: 1497; GFX1132: ; %bb.0: ; %entry 1498; GFX1132-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 1499; GFX1132-NEXT: s_mov_b32 s5, exec_lo 1500; GFX1132-NEXT: s_mov_b32 s4, exec_lo 1501; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0 1502; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 1503; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 1504; GFX1132-NEXT: s_cbranch_execz .LBB5_2 1505; GFX1132-NEXT: ; %bb.1: 1506; GFX1132-NEXT: s_bcnt1_i32_b32 s5, s5 1507; GFX1132-NEXT: v_mov_b32_e32 v3, 0 1508; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 1509; GFX1132-NEXT: s_mul_i32 s6, s3, s5 1510; GFX1132-NEXT: s_mul_hi_u32 s7, s2, s5 1511; GFX1132-NEXT: s_mul_i32 s5, s2, s5 1512; GFX1132-NEXT: s_add_i32 s7, s7, s6 1513; GFX1132-NEXT: v_mov_b32_e32 v0, s5 1514; GFX1132-NEXT: v_mov_b32_e32 v1, s7 1515; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1516; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 1517; GFX1132-NEXT: ds_add_rtn_u64 v[0:1], v3, v[0:1] 
1518; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 1519; GFX1132-NEXT: buffer_gl0_inv 1520; GFX1132-NEXT: .LBB5_2: 1521; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s4 1522; GFX1132-NEXT: v_readfirstlane_b32 s4, v0 1523; GFX1132-NEXT: v_readfirstlane_b32 s5, v1 1524; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 1525; GFX1132-NEXT: v_mad_u64_u32 v[0:1], null, s2, v2, s[4:5] 1526; GFX1132-NEXT: s_mov_b32 s2, -1 1527; GFX1132-NEXT: v_mad_u64_u32 v[3:4], null, s3, v2, v[1:2] 1528; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 1529; GFX1132-NEXT: v_mov_b32_e32 v1, v3 1530; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 1531; GFX1132-NEXT: s_endpgm 1532entry: 1533 %old = atomicrmw add i64 addrspace(3)* @local_var64, i64 %additive acq_rel 1534 store i64 %old, i64 addrspace(1)* %out 1535 ret void 1536} 1537 1538define amdgpu_kernel void @add_i64_varying(i64 addrspace(1)* %out) { 1539; 1540; 1541; GFX7LESS-LABEL: add_i64_varying: 1542; GFX7LESS: ; %bb.0: ; %entry 1543; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1544; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 1545; GFX7LESS-NEXT: s_mov_b32 m0, -1 1546; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1547; GFX7LESS-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] 1548; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1549; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 1550; GFX7LESS-NEXT: s_mov_b32 s2, -1 1551; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1552; GFX7LESS-NEXT: s_endpgm 1553; 1554; GFX8-LABEL: add_i64_varying: 1555; GFX8: ; %bb.0: ; %entry 1556; GFX8-NEXT: v_mov_b32_e32 v1, 0 1557; GFX8-NEXT: s_mov_b32 m0, -1 1558; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1559; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1560; GFX8-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] 1561; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1562; GFX8-NEXT: s_mov_b32 s3, 0xf000 1563; GFX8-NEXT: s_mov_b32 s2, -1 1564; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1565; GFX8-NEXT: s_endpgm 1566; 1567; GFX9-LABEL: add_i64_varying: 1568; GFX9: ; %bb.0: ; %entry 1569; GFX9-NEXT: v_mov_b32_e32 v1, 0 1570; 
GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1571; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1572; GFX9-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] 1573; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1574; GFX9-NEXT: s_mov_b32 s3, 0xf000 1575; GFX9-NEXT: s_mov_b32 s2, -1 1576; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1577; GFX9-NEXT: s_endpgm 1578; 1579; GFX10-LABEL: add_i64_varying: 1580; GFX10: ; %bb.0: ; %entry 1581; GFX10-NEXT: v_mov_b32_e32 v1, 0 1582; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1583; GFX10-NEXT: s_mov_b32 s3, 0x31016000 1584; GFX10-NEXT: s_mov_b32 s2, -1 1585; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1586; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1587; GFX10-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] 1588; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1589; GFX10-NEXT: buffer_gl0_inv 1590; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1591; GFX10-NEXT: s_endpgm 1592; 1593; GFX11-LABEL: add_i64_varying: 1594; GFX11: ; %bb.0: ; %entry 1595; GFX11-NEXT: v_mov_b32_e32 v1, 0 1596; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 1597; GFX11-NEXT: s_mov_b32 s3, 0x31016000 1598; GFX11-NEXT: s_mov_b32 s2, -1 1599; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1600; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 1601; GFX11-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] 1602; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1603; GFX11-NEXT: buffer_gl0_inv 1604; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 1605; GFX11-NEXT: s_endpgm 1606entry: 1607 %lane = call i32 @llvm.amdgcn.workitem.id.x() 1608 %zext = zext i32 %lane to i64 1609 %old = atomicrmw add i64 addrspace(3)* @local_var64, i64 %zext acq_rel 1610 store i64 %old, i64 addrspace(1)* %out 1611 ret void 1612} 1613 1614define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out) { 1615; Uniform constant operand: atomicrmw sub of the literal 5 from @local_var32 (acq_rel); the returned old value is stored to %out. 1616; Expected lowering in the checks below: only the lane whose mbcnt result is 0 issues ds_sub_rtn_u32, with popcount(exec) * 5 as the operand; every lane then computes readfirstlane(result) - 5 * mbcnt. 1617; GFX7LESS-LABEL: sub_i32_constant: 1618; GFX7LESS: ; %bb.0: ; %entry 1619; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec 1620; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1621; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 1622; GFX7LESS-NEXT:
v_mbcnt_hi_u32_b32_e32 v0, s3, v0 1623; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1624; GFX7LESS-NEXT: ; implicit-def: $vgpr1 1625; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc 1626; GFX7LESS-NEXT: s_cbranch_execz .LBB7_2 1627; GFX7LESS-NEXT: ; %bb.1: 1628; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1629; GFX7LESS-NEXT: s_mul_i32 s2, s2, 5 1630; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 1631; GFX7LESS-NEXT: v_mov_b32_e32 v2, s2 1632; GFX7LESS-NEXT: s_mov_b32 m0, -1 1633; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1634; GFX7LESS-NEXT: ds_sub_rtn_u32 v1, v1, v2 1635; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1636; GFX7LESS-NEXT: .LBB7_2: 1637; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] 1638; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1639; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v1 1640; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1641; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 1642; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 1643; GFX7LESS-NEXT: s_mov_b32 s2, -1 1644; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 1645; GFX7LESS-NEXT: s_endpgm 1646; 1647; GFX8-LABEL: sub_i32_constant: 1648; GFX8: ; %bb.0: ; %entry 1649; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1650; GFX8-NEXT: s_mov_b64 s[2:3], exec 1651; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1652; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 1653; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1654; GFX8-NEXT: ; implicit-def: $vgpr1 1655; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 1656; GFX8-NEXT: s_cbranch_execz .LBB7_2 1657; GFX8-NEXT: ; %bb.1: 1658; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1659; GFX8-NEXT: s_mul_i32 s2, s2, 5 1660; GFX8-NEXT: v_mov_b32_e32 v1, 0 1661; GFX8-NEXT: v_mov_b32_e32 v2, s2 1662; GFX8-NEXT: s_mov_b32 m0, -1 1663; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1664; GFX8-NEXT: ds_sub_rtn_u32 v1, v1, v2 1665; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1666; GFX8-NEXT: .LBB7_2: 1667; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 1668; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1669; GFX8-NEXT: v_readfirstlane_b32 s2, v1 1670; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v0 
1671; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s2, v0 1672; GFX8-NEXT: s_mov_b32 s3, 0xf000 1673; GFX8-NEXT: s_mov_b32 s2, -1 1674; GFX8-NEXT: s_nop 0 1675; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 1676; GFX8-NEXT: s_endpgm 1677; 1678; GFX9-LABEL: sub_i32_constant: 1679; GFX9: ; %bb.0: ; %entry 1680; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1681; GFX9-NEXT: s_mov_b64 s[2:3], exec 1682; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1683; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 1684; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1685; GFX9-NEXT: ; implicit-def: $vgpr1 1686; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 1687; GFX9-NEXT: s_cbranch_execz .LBB7_2 1688; GFX9-NEXT: ; %bb.1: 1689; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1690; GFX9-NEXT: s_mul_i32 s2, s2, 5 1691; GFX9-NEXT: v_mov_b32_e32 v1, 0 1692; GFX9-NEXT: v_mov_b32_e32 v2, s2 1693; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1694; GFX9-NEXT: ds_sub_rtn_u32 v1, v1, v2 1695; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1696; GFX9-NEXT: .LBB7_2: 1697; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 1698; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1699; GFX9-NEXT: v_readfirstlane_b32 s2, v1 1700; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1701; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 1702; GFX9-NEXT: s_mov_b32 s3, 0xf000 1703; GFX9-NEXT: s_mov_b32 s2, -1 1704; GFX9-NEXT: s_nop 0 1705; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 1706; GFX9-NEXT: s_endpgm 1707; 1708; GFX1064-LABEL: sub_i32_constant: 1709; GFX1064: ; %bb.0: ; %entry 1710; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1711; GFX1064-NEXT: s_mov_b64 s[2:3], exec 1712; GFX1064-NEXT: ; implicit-def: $vgpr1 1713; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1714; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 1715; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1716; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 1717; GFX1064-NEXT: s_cbranch_execz .LBB7_2 1718; GFX1064-NEXT: ; %bb.1: 1719; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1720; GFX1064-NEXT: v_mov_b32_e32 v1, 0 1721; GFX1064-NEXT: s_mul_i32 s2, s2, 5 1722; 
GFX1064-NEXT: v_mov_b32_e32 v2, s2 1723; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1724; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 1725; GFX1064-NEXT: ds_sub_rtn_u32 v1, v1, v2 1726; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1727; GFX1064-NEXT: buffer_gl0_inv 1728; GFX1064-NEXT: .LBB7_2: 1729; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 1730; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 1731; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 1732; GFX1064-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1733; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 1734; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s2, v0 1735; GFX1064-NEXT: s_mov_b32 s2, -1 1736; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1737; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 1738; GFX1064-NEXT: s_endpgm 1739; 1740; GFX1032-LABEL: sub_i32_constant: 1741; GFX1032: ; %bb.0: ; %entry 1742; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1743; GFX1032-NEXT: s_mov_b32 s3, exec_lo 1744; GFX1032-NEXT: ; implicit-def: $vgpr1 1745; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 1746; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 1747; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo 1748; GFX1032-NEXT: s_cbranch_execz .LBB7_2 1749; GFX1032-NEXT: ; %bb.1: 1750; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 1751; GFX1032-NEXT: v_mov_b32_e32 v1, 0 1752; GFX1032-NEXT: s_mul_i32 s3, s3, 5 1753; GFX1032-NEXT: v_mov_b32_e32 v2, s3 1754; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1755; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 1756; GFX1032-NEXT: ds_sub_rtn_u32 v1, v1, v2 1757; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1758; GFX1032-NEXT: buffer_gl0_inv 1759; GFX1032-NEXT: .LBB7_2: 1760; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 1761; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 1762; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 1763; GFX1032-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1764; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 1765; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s2, v0 1766; GFX1032-NEXT: s_mov_b32 s2, -1 1767; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1768; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 
1769; GFX1032-NEXT: s_endpgm 1770; 1771; GFX1164-LABEL: sub_i32_constant: 1772; GFX1164: ; %bb.0: ; %entry 1773; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 1774; GFX1164-NEXT: s_mov_b64 s[2:3], exec 1775; GFX1164-NEXT: s_mov_b64 s[4:5], exec 1776; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1777; GFX1164-NEXT: ; implicit-def: $vgpr1 1778; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 1779; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 1780; GFX1164-NEXT: s_cbranch_execz .LBB7_2 1781; GFX1164-NEXT: ; %bb.1: 1782; GFX1164-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1783; GFX1164-NEXT: v_mov_b32_e32 v1, 0 1784; GFX1164-NEXT: s_mul_i32 s2, s2, 5 1785; GFX1164-NEXT: v_mov_b32_e32 v2, s2 1786; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1787; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 1788; GFX1164-NEXT: ds_sub_rtn_u32 v1, v1, v2 1789; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 1790; GFX1164-NEXT: buffer_gl0_inv 1791; GFX1164-NEXT: .LBB7_2: 1792; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] 1793; GFX1164-NEXT: v_readfirstlane_b32 s2, v1 1794; GFX1164-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1795; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 1796; GFX1164-NEXT: v_sub_nc_u32_e32 v0, s2, v0 1797; GFX1164-NEXT: s_mov_b32 s2, -1 1798; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 1799; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 1800; GFX1164-NEXT: s_endpgm 1801; 1802; GFX1132-LABEL: sub_i32_constant: 1803; GFX1132: ; %bb.0: ; %entry 1804; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 1805; GFX1132-NEXT: s_mov_b32 s3, exec_lo 1806; GFX1132-NEXT: s_mov_b32 s2, exec_lo 1807; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 1808; GFX1132-NEXT: ; implicit-def: $vgpr1 1809; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 1810; GFX1132-NEXT: s_cbranch_execz .LBB7_2 1811; GFX1132-NEXT: ; %bb.1: 1812; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s3 1813; GFX1132-NEXT: v_mov_b32_e32 v1, 0 1814; GFX1132-NEXT: s_mul_i32 s3, s3, 5 1815; GFX1132-NEXT: v_mov_b32_e32 v2, s3 1816; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1817; GFX1132-NEXT: s_waitcnt_vscnt null, 
0x0 1818; GFX1132-NEXT: ds_sub_rtn_u32 v1, v1, v2 1819; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 1820; GFX1132-NEXT: buffer_gl0_inv 1821; GFX1132-NEXT: .LBB7_2: 1822; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2 1823; GFX1132-NEXT: v_readfirstlane_b32 s2, v1 1824; GFX1132-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1825; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 1826; GFX1132-NEXT: v_sub_nc_u32_e32 v0, s2, v0 1827; GFX1132-NEXT: s_mov_b32 s2, -1 1828; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 1829; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 1830; GFX1132-NEXT: s_endpgm 1831entry: 1832 %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 5 acq_rel 1833 store i32 %old, i32 addrspace(1)* %out 1834 ret void 1835} 1836 1837define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 %subitive) { 1838; Uniform kernel-argument operand: atomicrmw sub of %subitive from @local_var32 (acq_rel); the returned old value is stored to %out. 1839; Expected lowering in the checks below: the lane whose mbcnt result is 0 issues ds_sub_rtn_u32 with %subitive * popcount(exec) (s_mul_i32); every lane then computes readfirstlane(result) - %subitive * mbcnt (v_mul_lo_u32). 1840; GFX7LESS-LABEL: sub_i32_uniform: 1841; GFX7LESS: ; %bb.0: ; %entry 1842; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec 1843; GFX7LESS-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 1844; GFX7LESS-NEXT: s_load_dword s6, s[0:1], 0xb 1845; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 1846; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 1847; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1848; GFX7LESS-NEXT: ; implicit-def: $vgpr1 1849; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc 1850; GFX7LESS-NEXT: s_cbranch_execz .LBB8_2 1851; GFX7LESS-NEXT: ; %bb.1: 1852; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1853; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1854; GFX7LESS-NEXT: s_mul_i32 s2, s6, s2 1855; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 1856; GFX7LESS-NEXT: v_mov_b32_e32 v2, s2 1857; GFX7LESS-NEXT: s_mov_b32 m0, -1 1858; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1859; GFX7LESS-NEXT: ds_sub_rtn_u32 v1, v1, v2 1860; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1861; GFX7LESS-NEXT: .LBB8_2: 1862; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1] 1863; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1864; GFX7LESS-NEXT: v_readfirstlane_b32 s0, v1 1865; GFX7LESS-NEXT: v_mul_lo_u32 v0, s6, v0 1866; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000
1867; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 1868; GFX7LESS-NEXT: s_mov_b32 s6, -1 1869; GFX7LESS-NEXT: buffer_store_dword v0, off, s[4:7], 0 1870; GFX7LESS-NEXT: s_endpgm 1871; 1872; GFX8-LABEL: sub_i32_uniform: 1873; GFX8: ; %bb.0: ; %entry 1874; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 1875; GFX8-NEXT: s_load_dword s6, s[0:1], 0x2c 1876; GFX8-NEXT: s_mov_b64 s[2:3], exec 1877; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1878; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 1879; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1880; GFX8-NEXT: ; implicit-def: $vgpr1 1881; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc 1882; GFX8-NEXT: s_cbranch_execz .LBB8_2 1883; GFX8-NEXT: ; %bb.1: 1884; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1885; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1886; GFX8-NEXT: s_mul_i32 s2, s6, s2 1887; GFX8-NEXT: v_mov_b32_e32 v1, 0 1888; GFX8-NEXT: v_mov_b32_e32 v2, s2 1889; GFX8-NEXT: s_mov_b32 m0, -1 1890; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1891; GFX8-NEXT: ds_sub_rtn_u32 v1, v1, v2 1892; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1893; GFX8-NEXT: .LBB8_2: 1894; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] 1895; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1896; GFX8-NEXT: v_mul_lo_u32 v0, s6, v0 1897; GFX8-NEXT: v_readfirstlane_b32 s0, v1 1898; GFX8-NEXT: s_mov_b32 s7, 0xf000 1899; GFX8-NEXT: s_mov_b32 s6, -1 1900; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s0, v0 1901; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0 1902; GFX8-NEXT: s_endpgm 1903; 1904; GFX9-LABEL: sub_i32_uniform: 1905; GFX9: ; %bb.0: ; %entry 1906; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 1907; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c 1908; GFX9-NEXT: s_mov_b64 s[2:3], exec 1909; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1910; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 1911; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1912; GFX9-NEXT: ; implicit-def: $vgpr1 1913; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc 1914; GFX9-NEXT: s_cbranch_execz .LBB8_2 1915; GFX9-NEXT: ; %bb.1: 1916; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1917; GFX9-NEXT: s_waitcnt 
lgkmcnt(0) 1918; GFX9-NEXT: s_mul_i32 s2, s6, s2 1919; GFX9-NEXT: v_mov_b32_e32 v1, 0 1920; GFX9-NEXT: v_mov_b32_e32 v2, s2 1921; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1922; GFX9-NEXT: ds_sub_rtn_u32 v1, v1, v2 1923; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1924; GFX9-NEXT: .LBB8_2: 1925; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] 1926; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1927; GFX9-NEXT: v_mul_lo_u32 v0, s6, v0 1928; GFX9-NEXT: v_readfirstlane_b32 s0, v1 1929; GFX9-NEXT: s_mov_b32 s7, 0xf000 1930; GFX9-NEXT: s_mov_b32 s6, -1 1931; GFX9-NEXT: v_sub_u32_e32 v0, s0, v0 1932; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 1933; GFX9-NEXT: s_endpgm 1934; 1935; GFX1064-LABEL: sub_i32_uniform: 1936; GFX1064: ; %bb.0: ; %entry 1937; GFX1064-NEXT: s_clause 0x1 1938; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 1939; GFX1064-NEXT: s_load_dword s6, s[0:1], 0x2c 1940; GFX1064-NEXT: s_mov_b64 s[2:3], exec 1941; GFX1064-NEXT: ; implicit-def: $vgpr1 1942; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1943; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 1944; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1945; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc 1946; GFX1064-NEXT: s_cbranch_execz .LBB8_2 1947; GFX1064-NEXT: ; %bb.1: 1948; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1949; GFX1064-NEXT: v_mov_b32_e32 v1, 0 1950; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1951; GFX1064-NEXT: s_mul_i32 s2, s6, s2 1952; GFX1064-NEXT: v_mov_b32_e32 v2, s2 1953; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1954; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 1955; GFX1064-NEXT: ds_sub_rtn_u32 v1, v1, v2 1956; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1957; GFX1064-NEXT: buffer_gl0_inv 1958; GFX1064-NEXT: .LBB8_2: 1959; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 1960; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] 1961; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1962; GFX1064-NEXT: v_mul_lo_u32 v0, s6, v0 1963; GFX1064-NEXT: v_readfirstlane_b32 s0, v1 1964; GFX1064-NEXT: s_mov_b32 s7, 0x31016000 1965; GFX1064-NEXT: s_mov_b32 s6, -1 1966; GFX1064-NEXT: 
v_sub_nc_u32_e32 v0, s0, v0 1967; GFX1064-NEXT: buffer_store_dword v0, off, s[4:7], 0 1968; GFX1064-NEXT: s_endpgm 1969; 1970; GFX1032-LABEL: sub_i32_uniform: 1971; GFX1032: ; %bb.0: ; %entry 1972; GFX1032-NEXT: s_clause 0x1 1973; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 1974; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x2c 1975; GFX1032-NEXT: s_mov_b32 s3, exec_lo 1976; GFX1032-NEXT: ; implicit-def: $vgpr1 1977; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 1978; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 1979; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo 1980; GFX1032-NEXT: s_cbranch_execz .LBB8_2 1981; GFX1032-NEXT: ; %bb.1: 1982; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s3 1983; GFX1032-NEXT: v_mov_b32_e32 v1, 0 1984; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1985; GFX1032-NEXT: s_mul_i32 s1, s2, s1 1986; GFX1032-NEXT: v_mov_b32_e32 v2, s1 1987; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1988; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 1989; GFX1032-NEXT: ds_sub_rtn_u32 v1, v1, v2 1990; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1991; GFX1032-NEXT: buffer_gl0_inv 1992; GFX1032-NEXT: .LBB8_2: 1993; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 1994; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 1995; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1996; GFX1032-NEXT: v_mul_lo_u32 v0, s2, v0 1997; GFX1032-NEXT: v_readfirstlane_b32 s0, v1 1998; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 1999; GFX1032-NEXT: s_mov_b32 s6, -1 2000; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s0, v0 2001; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0 2002; GFX1032-NEXT: s_endpgm 2003; 2004; GFX1164-LABEL: sub_i32_uniform: 2005; GFX1164: ; %bb.0: ; %entry 2006; GFX1164-NEXT: s_clause 0x1 2007; GFX1164-NEXT: s_load_b64 s[4:5], s[0:1], 0x24 2008; GFX1164-NEXT: s_load_b32 s6, s[0:1], 0x2c 2009; GFX1164-NEXT: s_mov_b64 s[2:3], exec 2010; GFX1164-NEXT: s_mov_b64 s[0:1], exec 2011; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 2012; GFX1164-NEXT: ; implicit-def: $vgpr1 2013; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 2014; GFX1164-NEXT: 
v_cmpx_eq_u32_e32 0, v0 2015; GFX1164-NEXT: s_cbranch_execz .LBB8_2 2016; GFX1164-NEXT: ; %bb.1: 2017; GFX1164-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 2018; GFX1164-NEXT: v_mov_b32_e32 v1, 0 2019; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 2020; GFX1164-NEXT: s_mul_i32 s2, s6, s2 2021; GFX1164-NEXT: v_mov_b32_e32 v2, s2 2022; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2023; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 2024; GFX1164-NEXT: ds_sub_rtn_u32 v1, v1, v2 2025; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 2026; GFX1164-NEXT: buffer_gl0_inv 2027; GFX1164-NEXT: .LBB8_2: 2028; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] 2029; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 2030; GFX1164-NEXT: v_mul_lo_u32 v0, s6, v0 2031; GFX1164-NEXT: v_readfirstlane_b32 s0, v1 2032; GFX1164-NEXT: s_mov_b32 s7, 0x31016000 2033; GFX1164-NEXT: s_mov_b32 s6, -1 2034; GFX1164-NEXT: v_sub_nc_u32_e32 v0, s0, v0 2035; GFX1164-NEXT: buffer_store_b32 v0, off, s[4:7], 0 2036; GFX1164-NEXT: s_endpgm 2037; 2038; GFX1132-LABEL: sub_i32_uniform: 2039; GFX1132: ; %bb.0: ; %entry 2040; GFX1132-NEXT: s_clause 0x1 2041; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x24 2042; GFX1132-NEXT: s_load_b32 s0, s[0:1], 0x2c 2043; GFX1132-NEXT: s_mov_b32 s2, exec_lo 2044; GFX1132-NEXT: s_mov_b32 s1, exec_lo 2045; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 2046; GFX1132-NEXT: ; implicit-def: $vgpr1 2047; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 2048; GFX1132-NEXT: s_cbranch_execz .LBB8_2 2049; GFX1132-NEXT: ; %bb.1: 2050; GFX1132-NEXT: s_bcnt1_i32_b32 s2, s2 2051; GFX1132-NEXT: v_mov_b32_e32 v1, 0 2052; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 2053; GFX1132-NEXT: s_mul_i32 s2, s0, s2 2054; GFX1132-NEXT: v_mov_b32_e32 v2, s2 2055; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2056; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 2057; GFX1132-NEXT: ds_sub_rtn_u32 v1, v1, v2 2058; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 2059; GFX1132-NEXT: buffer_gl0_inv 2060; GFX1132-NEXT: .LBB8_2: 2061; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s1 2062; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 
2063; GFX1132-NEXT: v_mul_lo_u32 v0, s0, v0 2064; GFX1132-NEXT: v_readfirstlane_b32 s0, v1 2065; GFX1132-NEXT: s_mov_b32 s7, 0x31016000 2066; GFX1132-NEXT: s_mov_b32 s6, -1 2067; GFX1132-NEXT: v_sub_nc_u32_e32 v0, s0, v0 2068; GFX1132-NEXT: buffer_store_b32 v0, off, s[4:7], 0 2069; GFX1132-NEXT: s_endpgm 2070entry: 2071 %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 %subitive acq_rel 2072 store i32 %old, i32 addrspace(1)* %out 2073 ret void 2074} 2075 2076define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) { 2077; Divergent operand: each lane subtracts its workitem id from @local_var32 (acq_rel) and stores the returned old value to %out. 2078; Expected lowering in the checks below: the run line without -mcpu keeps a plain per-lane ds_sub_rtn_u32; the other run lines build a DPP add scan, let the lane whose mbcnt result is 0 issue one ds_sub_rtn_u32 with the wave total, and subtract each lane's shifted scan value from readfirstlane(result). 2079; GFX7LESS-LABEL: sub_i32_varying: 2080; GFX7LESS: ; %bb.0: ; %entry 2081; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 2082; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 2083; GFX7LESS-NEXT: s_mov_b32 m0, -1 2084; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2085; GFX7LESS-NEXT: ds_sub_rtn_u32 v0, v1, v0 2086; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2087; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 2088; GFX7LESS-NEXT: s_mov_b32 s2, -1 2089; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 2090; GFX7LESS-NEXT: s_endpgm 2091; 2092; GFX8-LABEL: sub_i32_varying: 2093; GFX8: ; %bb.0: ; %entry 2094; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2095; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 2096; GFX8-NEXT: v_mov_b32_e32 v1, 0 2097; GFX8-NEXT: s_mov_b64 exec, s[2:3] 2098; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 2099; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 2100; GFX8-NEXT: v_mov_b32_e32 v2, v0 2101; GFX8-NEXT: s_not_b64 exec, exec 2102; GFX8-NEXT: v_mov_b32_e32 v2, 0 2103; GFX8-NEXT: s_not_b64 exec, exec 2104; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 2105; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 2106; GFX8-NEXT: s_nop 1 2107; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 2108; GFX8-NEXT: s_nop 1 2109; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 2110; GFX8-NEXT: s_nop 1 2111; GFX8-NEXT:
v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 2112; GFX8-NEXT: s_nop 1 2113; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 2114; GFX8-NEXT: s_nop 1 2115; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 2116; GFX8-NEXT: v_readlane_b32 s4, v2, 63 2117; GFX8-NEXT: s_nop 0 2118; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 2119; GFX8-NEXT: s_mov_b64 exec, s[2:3] 2120; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 2121; GFX8-NEXT: ; implicit-def: $vgpr0 2122; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 2123; GFX8-NEXT: s_cbranch_execz .LBB9_2 2124; GFX8-NEXT: ; %bb.1: 2125; GFX8-NEXT: v_mov_b32_e32 v0, 0 2126; GFX8-NEXT: v_mov_b32_e32 v3, s4 2127; GFX8-NEXT: s_mov_b32 m0, -1 2128; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2129; GFX8-NEXT: ds_sub_rtn_u32 v0, v0, v3 2130; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2131; GFX8-NEXT: .LBB9_2: 2132; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 2133; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2134; GFX8-NEXT: v_readfirstlane_b32 s2, v0 2135; GFX8-NEXT: v_mov_b32_e32 v0, v1 2136; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s2, v0 2137; GFX8-NEXT: s_mov_b32 s3, 0xf000 2138; GFX8-NEXT: s_mov_b32 s2, -1 2139; GFX8-NEXT: s_nop 0 2140; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 2141; GFX8-NEXT: s_endpgm 2142; 2143; GFX9-LABEL: sub_i32_varying: 2144; GFX9: ; %bb.0: ; %entry 2145; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2146; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 2147; GFX9-NEXT: v_mov_b32_e32 v1, 0 2148; GFX9-NEXT: s_mov_b64 exec, s[2:3] 2149; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 2150; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 2151; GFX9-NEXT: v_mov_b32_e32 v2, v0 2152; GFX9-NEXT: s_not_b64 exec, exec 2153; GFX9-NEXT: v_mov_b32_e32 v2, 0 2154; GFX9-NEXT: s_not_b64 exec, exec 2155; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 2156; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 2157; GFX9-NEXT: s_nop 1 
2158; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 2159; GFX9-NEXT: s_nop 1 2160; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 2161; GFX9-NEXT: s_nop 1 2162; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 2163; GFX9-NEXT: s_nop 1 2164; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 2165; GFX9-NEXT: s_nop 1 2166; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 2167; GFX9-NEXT: v_readlane_b32 s4, v2, 63 2168; GFX9-NEXT: s_nop 0 2169; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 2170; GFX9-NEXT: s_mov_b64 exec, s[2:3] 2171; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 2172; GFX9-NEXT: ; implicit-def: $vgpr0 2173; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 2174; GFX9-NEXT: s_cbranch_execz .LBB9_2 2175; GFX9-NEXT: ; %bb.1: 2176; GFX9-NEXT: v_mov_b32_e32 v0, 0 2177; GFX9-NEXT: v_mov_b32_e32 v3, s4 2178; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2179; GFX9-NEXT: ds_sub_rtn_u32 v0, v0, v3 2180; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2181; GFX9-NEXT: .LBB9_2: 2182; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 2183; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2184; GFX9-NEXT: v_readfirstlane_b32 s2, v0 2185; GFX9-NEXT: v_mov_b32_e32 v0, v1 2186; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 2187; GFX9-NEXT: s_mov_b32 s3, 0xf000 2188; GFX9-NEXT: s_mov_b32 s2, -1 2189; GFX9-NEXT: s_nop 0 2190; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 2191; GFX9-NEXT: s_endpgm 2192; 2193; GFX1064-LABEL: sub_i32_varying: 2194; GFX1064: ; %bb.0: ; %entry 2195; GFX1064-NEXT: v_mov_b32_e32 v1, v0 2196; GFX1064-NEXT: s_not_b64 exec, exec 2197; GFX1064-NEXT: v_mov_b32_e32 v1, 0 2198; GFX1064-NEXT: s_not_b64 exec, exec 2199; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 2200; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 2201; GFX1064-NEXT: v_mov_b32_e32 v3, 0 2202; GFX1064-NEXT: v_add_nc_u32_dpp v1, 
v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 2203; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 2204; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 2205; GFX1064-NEXT: v_mov_b32_e32 v2, v1 2206; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 2207; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 2208; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 2209; GFX1064-NEXT: v_mov_b32_e32 v2, s4 2210; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 2211; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 2212; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 2213; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 2214; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2215; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 2216; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 2217; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 2218; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 2219; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 2220; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 2221; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 2222; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 2223; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 2224; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 2225; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 2226; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 2227; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 2228; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 2229; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2230; GFX1064-NEXT: s_mov_b32 s2, -1 2231; GFX1064-NEXT: ; implicit-def: $vgpr0 2232; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 2233; GFX1064-NEXT: s_cbranch_execz .LBB9_2 2234; GFX1064-NEXT: ; %bb.1: 2235; GFX1064-NEXT: v_mov_b32_e32 v0, 0 2236; GFX1064-NEXT: v_mov_b32_e32 v4, s7 2237; GFX1064-NEXT: s_mov_b32 s3, s7 2238; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2239; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 2240; GFX1064-NEXT: 
ds_sub_rtn_u32 v0, v0, v4 2241; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2242; GFX1064-NEXT: buffer_gl0_inv 2243; GFX1064-NEXT: .LBB9_2: 2244; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 2245; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 2246; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 2247; GFX1064-NEXT: v_mov_b32_e32 v0, v3 2248; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s3, v0 2249; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 2250; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2251; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 2252; GFX1064-NEXT: s_endpgm 2253; 2254; GFX1032-LABEL: sub_i32_varying: 2255; GFX1032: ; %bb.0: ; %entry 2256; GFX1032-NEXT: v_mov_b32_e32 v1, v0 2257; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 2258; GFX1032-NEXT: v_mov_b32_e32 v1, 0 2259; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 2260; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 2261; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 2262; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 2263; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 2264; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 2265; GFX1032-NEXT: v_mov_b32_e32 v2, v1 2266; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 2267; GFX1032-NEXT: s_mov_b32 exec_lo, s2 2268; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2269; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 2270; GFX1032-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 2271; GFX1032-NEXT: v_mov_b32_e32 v3, 0 2272; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 2273; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 2274; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 2275; GFX1032-NEXT: s_mov_b32 exec_lo, s2 2276; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 2277; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 2278; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 2279; GFX1032-NEXT: s_mov_b32 exec_lo, s2 2280; 
GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 2281; GFX1032-NEXT: s_mov_b32 s2, -1 2282; GFX1032-NEXT: ; implicit-def: $vgpr0 2283; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 2284; GFX1032-NEXT: s_cbranch_execz .LBB9_2 2285; GFX1032-NEXT: ; %bb.1: 2286; GFX1032-NEXT: v_mov_b32_e32 v0, 0 2287; GFX1032-NEXT: v_mov_b32_e32 v4, s4 2288; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2289; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 2290; GFX1032-NEXT: ds_sub_rtn_u32 v0, v0, v4 2291; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2292; GFX1032-NEXT: buffer_gl0_inv 2293; GFX1032-NEXT: .LBB9_2: 2294; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 2295; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 2296; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 2297; GFX1032-NEXT: v_mov_b32_e32 v0, v3 2298; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s3, v0 2299; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 2300; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2301; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 2302; GFX1032-NEXT: s_endpgm 2303; 2304; GFX1164-LABEL: sub_i32_varying: 2305; GFX1164: ; %bb.0: ; %entry 2306; GFX1164-NEXT: v_mov_b32_e32 v1, v0 2307; GFX1164-NEXT: s_not_b64 exec, exec 2308; GFX1164-NEXT: v_mov_b32_e32 v1, 0 2309; GFX1164-NEXT: s_not_b64 exec, exec 2310; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 2311; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 2312; GFX1164-NEXT: v_mov_b32_e32 v3, 0 2313; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 2314; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 2315; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 2316; GFX1164-NEXT: v_mov_b32_e32 v2, v1 2317; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1 2318; GFX1164-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 2319; GFX1164-NEXT: v_readlane_b32 s4, v1, 31 2320; GFX1164-NEXT: v_mov_b32_e32 v2, s4 2321; GFX1164-NEXT: 
v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 2322; GFX1164-NEXT: v_readlane_b32 s4, v1, 15 2323; GFX1164-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 2324; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 2325; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 2326; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 2327; GFX1164-NEXT: v_readlane_b32 s5, v1, 31 2328; GFX1164-NEXT: v_writelane_b32 v3, s4, 16 2329; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 2330; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 2331; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 2332; GFX1164-NEXT: v_readlane_b32 s7, v1, 63 2333; GFX1164-NEXT: v_readlane_b32 s6, v1, 47 2334; GFX1164-NEXT: v_writelane_b32 v3, s5, 32 2335; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 2336; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 2337; GFX1164-NEXT: s_or_saveexec_b64 s[4:5], -1 2338; GFX1164-NEXT: v_writelane_b32 v3, s6, 48 2339; GFX1164-NEXT: s_mov_b64 exec, s[4:5] 2340; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2341; GFX1164-NEXT: s_mov_b32 s2, -1 2342; GFX1164-NEXT: ; implicit-def: $vgpr0 2343; GFX1164-NEXT: s_and_saveexec_b64 s[4:5], vcc 2344; GFX1164-NEXT: s_cbranch_execz .LBB9_2 2345; GFX1164-NEXT: ; %bb.1: 2346; GFX1164-NEXT: v_mov_b32_e32 v0, 0 2347; GFX1164-NEXT: v_mov_b32_e32 v4, s7 2348; GFX1164-NEXT: s_mov_b32 s3, s7 2349; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2350; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 2351; GFX1164-NEXT: ds_sub_rtn_u32 v0, v0, v4 2352; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 2353; GFX1164-NEXT: buffer_gl0_inv 2354; GFX1164-NEXT: .LBB9_2: 2355; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] 2356; GFX1164-NEXT: v_readfirstlane_b32 s3, v0 2357; GFX1164-NEXT: v_mov_b32_e32 v0, v3 2358; GFX1164-NEXT: v_sub_nc_u32_e32 v0, s3, v0 2359; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 2360; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 2361; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 2362; GFX1164-NEXT: s_endpgm 2363; 2364; GFX1132-LABEL: sub_i32_varying: 2365; GFX1132: ; %bb.0: ; 
%entry 2366; GFX1132-NEXT: v_mov_b32_e32 v1, v0 2367; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo 2368; GFX1132-NEXT: v_mov_b32_e32 v1, 0 2369; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo 2370; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 2371; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 2372; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 2373; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 2374; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 2375; GFX1132-NEXT: v_mov_b32_e32 v2, v1 2376; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1 2377; GFX1132-NEXT: s_mov_b32 exec_lo, s2 2378; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 2379; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 2380; GFX1132-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 2381; GFX1132-NEXT: v_mov_b32_e32 v3, 0 2382; GFX1132-NEXT: v_readlane_b32 s3, v1, 15 2383; GFX1132-NEXT: v_readlane_b32 s4, v1, 31 2384; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 2385; GFX1132-NEXT: s_mov_b32 exec_lo, s2 2386; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 2387; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 2388; GFX1132-NEXT: v_writelane_b32 v3, s3, 16 2389; GFX1132-NEXT: s_mov_b32 exec_lo, s2 2390; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 2391; GFX1132-NEXT: s_mov_b32 s2, -1 2392; GFX1132-NEXT: ; implicit-def: $vgpr0 2393; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo 2394; GFX1132-NEXT: s_cbranch_execz .LBB9_2 2395; GFX1132-NEXT: ; %bb.1: 2396; GFX1132-NEXT: v_mov_b32_e32 v0, 0 2397; GFX1132-NEXT: v_mov_b32_e32 v4, s4 2398; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2399; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 2400; GFX1132-NEXT: ds_sub_rtn_u32 v0, v0, v4 2401; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 2402; GFX1132-NEXT: buffer_gl0_inv 2403; GFX1132-NEXT: .LBB9_2: 2404; GFX1132-NEXT: s_or_b32 
exec_lo, exec_lo, s3 2405; GFX1132-NEXT: v_readfirstlane_b32 s3, v0 2406; GFX1132-NEXT: v_mov_b32_e32 v0, v3 2407; GFX1132-NEXT: v_sub_nc_u32_e32 v0, s3, v0 2408; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 2409; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 2410; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 2411; GFX1132-NEXT: s_endpgm 2412entry: 2413 %lane = call i32 @llvm.amdgcn.workitem.id.x() 2414 %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 %lane acq_rel 2415 store i32 %old, i32 addrspace(1)* %out 2416 ret void 2417} 2418 2419define amdgpu_kernel void @sub_i32_varying_nouse() { 2420; GFX7LESS-LABEL: sub_i32_varying_nouse: 2421; GFX7LESS: ; %bb.0: ; %entry 2422; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 2423; GFX7LESS-NEXT: s_mov_b32 m0, -1 2424; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2425; GFX7LESS-NEXT: ds_sub_u32 v1, v0 2426; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2427; GFX7LESS-NEXT: s_endpgm 2428; 2429; GFX8-LABEL: sub_i32_varying_nouse: 2430; GFX8: ; %bb.0: ; %entry 2431; GFX8-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 2432; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 2433; GFX8-NEXT: v_mov_b32_e32 v1, v0 2434; GFX8-NEXT: s_not_b64 exec, exec 2435; GFX8-NEXT: v_mov_b32_e32 v1, 0 2436; GFX8-NEXT: s_not_b64 exec, exec 2437; GFX8-NEXT: s_or_saveexec_b64 s[0:1], -1 2438; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 2439; GFX8-NEXT: s_nop 1 2440; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 2441; GFX8-NEXT: s_nop 1 2442; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 2443; GFX8-NEXT: s_nop 1 2444; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 2445; GFX8-NEXT: s_nop 1 2446; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf 2447; GFX8-NEXT: s_nop 1 2448; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf 2449; GFX8-NEXT: v_readlane_b32 
s2, v1, 63 2450; GFX8-NEXT: s_mov_b64 exec, s[0:1] 2451; GFX8-NEXT: s_mov_b32 s0, s2 2452; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 2453; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 2454; GFX8-NEXT: s_cbranch_execz .LBB10_2 2455; GFX8-NEXT: ; %bb.1: 2456; GFX8-NEXT: v_mov_b32_e32 v0, 0 2457; GFX8-NEXT: v_mov_b32_e32 v2, s0 2458; GFX8-NEXT: s_mov_b32 m0, -1 2459; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2460; GFX8-NEXT: ds_sub_u32 v0, v2 2461; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2462; GFX8-NEXT: .LBB10_2: 2463; GFX8-NEXT: s_endpgm 2464; 2465; GFX9-LABEL: sub_i32_varying_nouse: 2466; GFX9: ; %bb.0: ; %entry 2467; GFX9-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 2468; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 2469; GFX9-NEXT: v_mov_b32_e32 v1, v0 2470; GFX9-NEXT: s_not_b64 exec, exec 2471; GFX9-NEXT: v_mov_b32_e32 v1, 0 2472; GFX9-NEXT: s_not_b64 exec, exec 2473; GFX9-NEXT: s_or_saveexec_b64 s[0:1], -1 2474; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 2475; GFX9-NEXT: s_nop 1 2476; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 2477; GFX9-NEXT: s_nop 1 2478; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 2479; GFX9-NEXT: s_nop 1 2480; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 2481; GFX9-NEXT: s_nop 1 2482; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf 2483; GFX9-NEXT: s_nop 1 2484; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf 2485; GFX9-NEXT: v_readlane_b32 s2, v1, 63 2486; GFX9-NEXT: s_mov_b64 exec, s[0:1] 2487; GFX9-NEXT: s_mov_b32 s0, s2 2488; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 2489; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 2490; GFX9-NEXT: s_cbranch_execz .LBB10_2 2491; GFX9-NEXT: ; %bb.1: 2492; GFX9-NEXT: v_mov_b32_e32 v0, 0 2493; GFX9-NEXT: v_mov_b32_e32 v2, s0 2494; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2495; GFX9-NEXT: ds_sub_u32 v0, v2 2496; GFX9-NEXT: 
s_waitcnt lgkmcnt(0) 2497; GFX9-NEXT: .LBB10_2: 2498; GFX9-NEXT: s_endpgm 2499; 2500; GFX1064-LABEL: sub_i32_varying_nouse: 2501; GFX1064: ; %bb.0: ; %entry 2502; GFX1064-NEXT: v_mov_b32_e32 v1, v0 2503; GFX1064-NEXT: s_not_b64 exec, exec 2504; GFX1064-NEXT: v_mov_b32_e32 v1, 0 2505; GFX1064-NEXT: s_not_b64 exec, exec 2506; GFX1064-NEXT: s_or_saveexec_b64 s[0:1], -1 2507; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 2508; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 2509; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 2510; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 2511; GFX1064-NEXT: v_mov_b32_e32 v2, v1 2512; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 2513; GFX1064-NEXT: v_add_nc_u32_e32 v1, v1, v2 2514; GFX1064-NEXT: s_mov_b64 exec, s[0:1] 2515; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 2516; GFX1064-NEXT: s_or_saveexec_b64 s[0:1], -1 2517; GFX1064-NEXT: v_readlane_b32 s2, v1, 0 2518; GFX1064-NEXT: v_readlane_b32 s3, v1, 32 2519; GFX1064-NEXT: s_mov_b64 exec, s[0:1] 2520; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 2521; GFX1064-NEXT: s_add_i32 s0, s2, s3 2522; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2523; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc 2524; GFX1064-NEXT: s_cbranch_execz .LBB10_2 2525; GFX1064-NEXT: ; %bb.1: 2526; GFX1064-NEXT: v_mov_b32_e32 v0, 0 2527; GFX1064-NEXT: v_mov_b32_e32 v3, s0 2528; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2529; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 2530; GFX1064-NEXT: ds_sub_u32 v0, v3 2531; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2532; GFX1064-NEXT: buffer_gl0_inv 2533; GFX1064-NEXT: .LBB10_2: 2534; GFX1064-NEXT: s_endpgm 2535; 2536; GFX1032-LABEL: sub_i32_varying_nouse: 2537; GFX1032: ; %bb.0: ; %entry 2538; GFX1032-NEXT: v_mov_b32_e32 v1, v0 2539; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 2540; 
GFX1032-NEXT: v_mov_b32_e32 v1, 0 2541; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 2542; GFX1032-NEXT: s_or_saveexec_b32 s0, -1 2543; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 2544; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 2545; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 2546; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 2547; GFX1032-NEXT: v_mov_b32_e32 v2, v1 2548; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 2549; GFX1032-NEXT: v_add_nc_u32_e32 v1, v1, v2 2550; GFX1032-NEXT: s_mov_b32 exec_lo, s0 2551; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 2552; GFX1032-NEXT: v_mov_b32_e32 v0, v1 2553; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 2554; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo 2555; GFX1032-NEXT: s_cbranch_execz .LBB10_2 2556; GFX1032-NEXT: ; %bb.1: 2557; GFX1032-NEXT: v_mov_b32_e32 v3, 0 2558; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2559; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 2560; GFX1032-NEXT: ds_sub_u32 v3, v0 2561; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2562; GFX1032-NEXT: buffer_gl0_inv 2563; GFX1032-NEXT: .LBB10_2: 2564; GFX1032-NEXT: s_endpgm 2565; 2566; GFX1164-LABEL: sub_i32_varying_nouse: 2567; GFX1164: ; %bb.0: ; %entry 2568; GFX1164-NEXT: v_mov_b32_e32 v1, v0 2569; GFX1164-NEXT: s_not_b64 exec, exec 2570; GFX1164-NEXT: v_mov_b32_e32 v1, 0 2571; GFX1164-NEXT: s_not_b64 exec, exec 2572; GFX1164-NEXT: s_or_saveexec_b64 s[0:1], -1 2573; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 2574; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 2575; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 2576; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 2577; 
GFX1164-NEXT: v_mov_b32_e32 v2, v1 2578; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1 2579; GFX1164-NEXT: v_add_nc_u32_e32 v1, v1, v2 2580; GFX1164-NEXT: v_permlane64_b32 v2, v1 2581; GFX1164-NEXT: s_mov_b64 exec, s[0:1] 2582; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 2583; GFX1164-NEXT: s_or_saveexec_b64 s[0:1], -1 2584; GFX1164-NEXT: v_add_nc_u32_e32 v1, v1, v2 2585; GFX1164-NEXT: s_mov_b64 exec, s[0:1] 2586; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v0 2587; GFX1164-NEXT: v_mov_b32_e32 v0, v1 2588; GFX1164-NEXT: s_mov_b64 s[0:1], exec 2589; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v3 2590; GFX1164-NEXT: s_cbranch_execz .LBB10_2 2591; GFX1164-NEXT: ; %bb.1: 2592; GFX1164-NEXT: v_mov_b32_e32 v3, 0 2593; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2594; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 2595; GFX1164-NEXT: ds_sub_u32 v3, v0 2596; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 2597; GFX1164-NEXT: buffer_gl0_inv 2598; GFX1164-NEXT: .LBB10_2: 2599; GFX1164-NEXT: s_endpgm 2600; 2601; GFX1132-LABEL: sub_i32_varying_nouse: 2602; GFX1132: ; %bb.0: ; %entry 2603; GFX1132-NEXT: v_mov_b32_e32 v1, v0 2604; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo 2605; GFX1132-NEXT: v_mov_b32_e32 v1, 0 2606; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo 2607; GFX1132-NEXT: s_or_saveexec_b32 s0, -1 2608; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 2609; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 2610; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 2611; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 2612; GFX1132-NEXT: v_mov_b32_e32 v2, v1 2613; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1 2614; GFX1132-NEXT: v_add_nc_u32_e32 v1, v1, v2 2615; GFX1132-NEXT: s_mov_b32 exec_lo, s0 2616; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 2617; GFX1132-NEXT: v_mov_b32_e32 v0, v1 2618; GFX1132-NEXT: s_mov_b32 s0, 
exec_lo 2619; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v3 2620; GFX1132-NEXT: s_cbranch_execz .LBB10_2 2621; GFX1132-NEXT: ; %bb.1: 2622; GFX1132-NEXT: v_mov_b32_e32 v3, 0 2623; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2624; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 2625; GFX1132-NEXT: ds_sub_u32 v3, v0 2626; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 2627; GFX1132-NEXT: buffer_gl0_inv 2628; GFX1132-NEXT: .LBB10_2: 2629; GFX1132-NEXT: s_endpgm 2630entry: 2631 %lane = call i32 @llvm.amdgcn.workitem.id.x() 2632 %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 %lane acq_rel 2633 ret void 2634} 2635 2636define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) { 2637; 2638; 2639; GFX7LESS-LABEL: sub_i64_constant: 2640; GFX7LESS: ; %bb.0: ; %entry 2641; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec 2642; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 2643; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 2644; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s5, v0 2645; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 2646; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 2647; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc 2648; GFX7LESS-NEXT: s_cbranch_execz .LBB11_2 2649; GFX7LESS-NEXT: ; %bb.1: 2650; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 2651; GFX7LESS-NEXT: s_mul_i32 s4, s4, 5 2652; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 2653; GFX7LESS-NEXT: v_mov_b32_e32 v0, s4 2654; GFX7LESS-NEXT: s_mov_b32 m0, -1 2655; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2656; GFX7LESS-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] 2657; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2658; GFX7LESS-NEXT: .LBB11_2: 2659; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] 2660; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2661; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v0 2662; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1 2663; GFX7LESS-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 2664; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v2 2665; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 2666; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4 2667; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 
2668; GFX7LESS-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc 2669; GFX7LESS-NEXT: s_mov_b32 s2, -1 2670; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2671; GFX7LESS-NEXT: s_endpgm 2672; 2673; GFX8-LABEL: sub_i64_constant: 2674; GFX8: ; %bb.0: ; %entry 2675; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2676; GFX8-NEXT: s_mov_b64 s[4:5], exec 2677; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 2678; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 2679; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 2680; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 2681; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 2682; GFX8-NEXT: s_cbranch_execz .LBB11_2 2683; GFX8-NEXT: ; %bb.1: 2684; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 2685; GFX8-NEXT: s_mul_i32 s4, s4, 5 2686; GFX8-NEXT: v_mov_b32_e32 v0, s4 2687; GFX8-NEXT: v_mov_b32_e32 v1, 0 2688; GFX8-NEXT: s_mov_b32 m0, -1 2689; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2690; GFX8-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] 2691; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2692; GFX8-NEXT: .LBB11_2: 2693; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 2694; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2695; GFX8-NEXT: v_readfirstlane_b32 s2, v0 2696; GFX8-NEXT: v_readfirstlane_b32 s3, v1 2697; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v2 2698; GFX8-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 2699; GFX8-NEXT: v_mov_b32_e32 v2, s3 2700; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s2, v0 2701; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc 2702; GFX8-NEXT: s_mov_b32 s3, 0xf000 2703; GFX8-NEXT: s_mov_b32 s2, -1 2704; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2705; GFX8-NEXT: s_endpgm 2706; 2707; GFX9-LABEL: sub_i64_constant: 2708; GFX9: ; %bb.0: ; %entry 2709; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2710; GFX9-NEXT: s_mov_b64 s[4:5], exec 2711; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 2712; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 2713; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 2714; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 2715; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 2716; GFX9-NEXT: s_cbranch_execz .LBB11_2 
2717; GFX9-NEXT: ; %bb.1: 2718; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 2719; GFX9-NEXT: s_mul_i32 s4, s4, 5 2720; GFX9-NEXT: v_mov_b32_e32 v0, s4 2721; GFX9-NEXT: v_mov_b32_e32 v1, 0 2722; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2723; GFX9-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] 2724; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2725; GFX9-NEXT: .LBB11_2: 2726; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 2727; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2728; GFX9-NEXT: v_readfirstlane_b32 s2, v0 2729; GFX9-NEXT: v_readfirstlane_b32 s3, v1 2730; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v2 2731; GFX9-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 2732; GFX9-NEXT: v_mov_b32_e32 v2, s3 2733; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s2, v0 2734; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc 2735; GFX9-NEXT: s_mov_b32 s3, 0xf000 2736; GFX9-NEXT: s_mov_b32 s2, -1 2737; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2738; GFX9-NEXT: s_endpgm 2739; 2740; GFX1064-LABEL: sub_i64_constant: 2741; GFX1064: ; %bb.0: ; %entry 2742; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2743; GFX1064-NEXT: s_mov_b64 s[4:5], exec 2744; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 2745; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 2746; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 2747; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 2748; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc 2749; GFX1064-NEXT: s_cbranch_execz .LBB11_2 2750; GFX1064-NEXT: ; %bb.1: 2751; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 2752; GFX1064-NEXT: v_mov_b32_e32 v1, 0 2753; GFX1064-NEXT: s_mul_i32 s4, s4, 5 2754; GFX1064-NEXT: v_mov_b32_e32 v0, s4 2755; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2756; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 2757; GFX1064-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] 2758; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2759; GFX1064-NEXT: buffer_gl0_inv 2760; GFX1064-NEXT: .LBB11_2: 2761; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 2762; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] 2763; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 2764; GFX1064-NEXT: 
v_mul_u32_u24_e32 v0, 5, v2 2765; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 2766; GFX1064-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 2767; GFX1064-NEXT: v_sub_co_u32 v0, vcc, s2, v0 2768; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc 2769; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 2770; GFX1064-NEXT: s_mov_b32 s2, -1 2771; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2772; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2773; GFX1064-NEXT: s_endpgm 2774; 2775; GFX1032-LABEL: sub_i64_constant: 2776; GFX1032: ; %bb.0: ; %entry 2777; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2778; GFX1032-NEXT: s_mov_b32 s3, exec_lo 2779; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 2780; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s3, 0 2781; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 2782; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo 2783; GFX1032-NEXT: s_cbranch_execz .LBB11_2 2784; GFX1032-NEXT: ; %bb.1: 2785; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 2786; GFX1032-NEXT: v_mov_b32_e32 v1, 0 2787; GFX1032-NEXT: s_mul_i32 s3, s3, 5 2788; GFX1032-NEXT: v_mov_b32_e32 v0, s3 2789; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2790; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 2791; GFX1032-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] 2792; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2793; GFX1032-NEXT: buffer_gl0_inv 2794; GFX1032-NEXT: .LBB11_2: 2795; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 2796; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 2797; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 2798; GFX1032-NEXT: v_mul_u32_u24_e32 v0, 5, v2 2799; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 2800; GFX1032-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 2801; GFX1032-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v0 2802; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo 2803; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 2804; GFX1032-NEXT: s_mov_b32 s2, -1 2805; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2806; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2807; GFX1032-NEXT: s_endpgm 2808; 2809; GFX1164-LABEL: sub_i64_constant: 2810; 
GFX1164: ; %bb.0: ; %entry 2811; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 2812; GFX1164-NEXT: s_mov_b64 s[4:5], exec 2813; GFX1164-NEXT: s_mov_b64 s[2:3], exec 2814; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 2815; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 2816; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 2817; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 2818; GFX1164-NEXT: s_cbranch_execz .LBB11_2 2819; GFX1164-NEXT: ; %bb.1: 2820; GFX1164-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 2821; GFX1164-NEXT: v_mov_b32_e32 v1, 0 2822; GFX1164-NEXT: s_mul_i32 s4, s4, 5 2823; GFX1164-NEXT: v_mov_b32_e32 v0, s4 2824; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2825; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 2826; GFX1164-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] 2827; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 2828; GFX1164-NEXT: buffer_gl0_inv 2829; GFX1164-NEXT: .LBB11_2: 2830; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] 2831; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 2832; GFX1164-NEXT: v_mul_u32_u24_e32 v0, 5, v2 2833; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 2834; GFX1164-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 2835; GFX1164-NEXT: v_sub_co_u32 v0, vcc, s2, v0 2836; GFX1164-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc 2837; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 2838; GFX1164-NEXT: s_mov_b32 s2, -1 2839; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 2840; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 2841; GFX1164-NEXT: s_endpgm 2842; 2843; GFX1132-LABEL: sub_i64_constant: 2844; GFX1132: ; %bb.0: ; %entry 2845; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 2846; GFX1132-NEXT: s_mov_b32 s3, exec_lo 2847; GFX1132-NEXT: s_mov_b32 s2, exec_lo 2848; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s3, 0 2849; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 2850; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 2851; GFX1132-NEXT: s_cbranch_execz .LBB11_2 2852; GFX1132-NEXT: ; %bb.1: 2853; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s3 2854; GFX1132-NEXT: v_mov_b32_e32 v1, 0 2855; GFX1132-NEXT: s_mul_i32 s3, s3, 5 2856; GFX1132-NEXT: 
v_mov_b32_e32 v0, s3 2857; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2858; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 2859; GFX1132-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] 2860; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 2861; GFX1132-NEXT: buffer_gl0_inv 2862; GFX1132-NEXT: .LBB11_2: 2863; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2 2864; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 2865; GFX1132-NEXT: v_mul_u32_u24_e32 v0, 5, v2 2866; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 2867; GFX1132-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 2868; GFX1132-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v0 2869; GFX1132-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo 2870; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 2871; GFX1132-NEXT: s_mov_b32 s2, -1 2872; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 2873; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 2874; GFX1132-NEXT: s_endpgm 2875entry: 2876 %old = atomicrmw sub i64 addrspace(3)* @local_var64, i64 5 acq_rel 2877 store i64 %old, i64 addrspace(1)* %out 2878 ret void 2879} 2880 2881define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 %subitive) { 2882; 2883; 2884; GFX7LESS-LABEL: sub_i64_uniform: 2885; GFX7LESS: ; %bb.0: ; %entry 2886; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec 2887; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 2888; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 2889; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s7, v0 2890; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 2891; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 2892; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc 2893; GFX7LESS-NEXT: s_cbranch_execz .LBB12_2 2894; GFX7LESS-NEXT: ; %bb.1: 2895; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 2896; GFX7LESS-NEXT: v_mov_b32_e32 v3, 0 2897; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2898; GFX7LESS-NEXT: s_mul_i32 s7, s3, s6 2899; GFX7LESS-NEXT: v_mov_b32_e32 v0, s6 2900; GFX7LESS-NEXT: v_mul_hi_u32 v0, s2, v0 2901; GFX7LESS-NEXT: s_mul_i32 s6, s2, s6 2902; GFX7LESS-NEXT: v_add_i32_e32 v1, vcc, s7, v0 2903; GFX7LESS-NEXT: v_mov_b32_e32 
v0, s6 2904; GFX7LESS-NEXT: s_mov_b32 m0, -1 2905; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2906; GFX7LESS-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1] 2907; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2908; GFX7LESS-NEXT: .LBB12_2: 2909; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] 2910; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 2911; GFX7LESS-NEXT: s_mov_b32 s6, -1 2912; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2913; GFX7LESS-NEXT: s_mov_b32 s4, s0 2914; GFX7LESS-NEXT: s_mov_b32 s5, s1 2915; GFX7LESS-NEXT: v_readfirstlane_b32 s0, v0 2916; GFX7LESS-NEXT: v_readfirstlane_b32 s1, v1 2917; GFX7LESS-NEXT: v_mul_lo_u32 v0, s3, v2 2918; GFX7LESS-NEXT: v_mul_hi_u32 v1, s2, v2 2919; GFX7LESS-NEXT: v_mul_lo_u32 v2, s2, v2 2920; GFX7LESS-NEXT: v_add_i32_e32 v1, vcc, v1, v0 2921; GFX7LESS-NEXT: v_mov_b32_e32 v3, s1 2922; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s0, v2 2923; GFX7LESS-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc 2924; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 2925; GFX7LESS-NEXT: s_endpgm 2926; 2927; GFX8-LABEL: sub_i64_uniform: 2928; GFX8: ; %bb.0: ; %entry 2929; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 2930; GFX8-NEXT: s_mov_b64 s[6:7], exec 2931; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 2932; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 2933; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 2934; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 2935; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 2936; GFX8-NEXT: s_cbranch_execz .LBB12_2 2937; GFX8-NEXT: ; %bb.1: 2938; GFX8-NEXT: s_bcnt1_i32_b64 s8, s[6:7] 2939; GFX8-NEXT: v_mov_b32_e32 v0, s8 2940; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2941; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[6:7], s2, v0, 0 2942; GFX8-NEXT: s_mul_i32 s6, s3, s8 2943; GFX8-NEXT: v_mov_b32_e32 v3, 0 2944; GFX8-NEXT: v_add_u32_e32 v1, vcc, s6, v1 2945; GFX8-NEXT: s_mov_b32 m0, -1 2946; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2947; GFX8-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1] 2948; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2949; GFX8-NEXT: .LBB12_2: 2950; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 2951; 
GFX8-NEXT: s_waitcnt lgkmcnt(0) 2952; GFX8-NEXT: s_mov_b32 s4, s0 2953; GFX8-NEXT: s_mov_b32 s5, s1 2954; GFX8-NEXT: v_mul_lo_u32 v4, s3, v2 2955; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v2, 0 2956; GFX8-NEXT: v_readfirstlane_b32 s0, v0 2957; GFX8-NEXT: v_readfirstlane_b32 s1, v1 2958; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v4 2959; GFX8-NEXT: v_mov_b32_e32 v3, s1 2960; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s0, v2 2961; GFX8-NEXT: s_mov_b32 s7, 0xf000 2962; GFX8-NEXT: s_mov_b32 s6, -1 2963; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc 2964; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 2965; GFX8-NEXT: s_endpgm 2966; 2967; GFX9-LABEL: sub_i64_uniform: 2968; GFX9: ; %bb.0: ; %entry 2969; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 2970; GFX9-NEXT: s_mov_b64 s[6:7], exec 2971; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 2972; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 2973; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 2974; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 2975; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 2976; GFX9-NEXT: s_cbranch_execz .LBB12_2 2977; GFX9-NEXT: ; %bb.1: 2978; GFX9-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 2979; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2980; GFX9-NEXT: s_mul_i32 s7, s3, s6 2981; GFX9-NEXT: s_mul_hi_u32 s8, s2, s6 2982; GFX9-NEXT: s_add_i32 s8, s8, s7 2983; GFX9-NEXT: s_mul_i32 s6, s2, s6 2984; GFX9-NEXT: v_mov_b32_e32 v0, s6 2985; GFX9-NEXT: v_mov_b32_e32 v1, s8 2986; GFX9-NEXT: v_mov_b32_e32 v3, 0 2987; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2988; GFX9-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1] 2989; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2990; GFX9-NEXT: .LBB12_2: 2991; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 2992; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2993; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], s2, v2, 0 2994; GFX9-NEXT: s_mov_b32 s4, s0 2995; GFX9-NEXT: s_mov_b32 s5, s1 2996; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[2:3], s3, v2, v[4:5] 2997; GFX9-NEXT: v_readfirstlane_b32 s0, v0 2998; GFX9-NEXT: v_readfirstlane_b32 s1, v1 2999; GFX9-NEXT: v_mov_b32_e32 v1, v4 3000; 
GFX9-NEXT: v_mov_b32_e32 v2, s1 3001; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s0, v3 3002; GFX9-NEXT: s_mov_b32 s7, 0xf000 3003; GFX9-NEXT: s_mov_b32 s6, -1 3004; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc 3005; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 3006; GFX9-NEXT: s_endpgm 3007; 3008; GFX1064-LABEL: sub_i64_uniform: 3009; GFX1064: ; %bb.0: ; %entry 3010; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 3011; GFX1064-NEXT: s_mov_b64 s[6:7], exec 3012; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 3013; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 3014; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 3015; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 3016; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 3017; GFX1064-NEXT: s_cbranch_execz .LBB12_2 3018; GFX1064-NEXT: ; %bb.1: 3019; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 3020; GFX1064-NEXT: v_mov_b32_e32 v3, 0 3021; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3022; GFX1064-NEXT: s_mul_i32 s7, s3, s6 3023; GFX1064-NEXT: s_mul_hi_u32 s8, s2, s6 3024; GFX1064-NEXT: s_mul_i32 s6, s2, s6 3025; GFX1064-NEXT: s_add_i32 s8, s8, s7 3026; GFX1064-NEXT: v_mov_b32_e32 v0, s6 3027; GFX1064-NEXT: v_mov_b32_e32 v1, s8 3028; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3029; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 3030; GFX1064-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1] 3031; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3032; GFX1064-NEXT: buffer_gl0_inv 3033; GFX1064-NEXT: .LBB12_2: 3034; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 3035; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 3036; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3037; GFX1064-NEXT: v_mad_u64_u32 v[3:4], null, s2, v2, 0 3038; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 3039; GFX1064-NEXT: v_readfirstlane_b32 s4, v1 3040; GFX1064-NEXT: v_mad_u64_u32 v[4:5], null, s3, v2, v[4:5] 3041; GFX1064-NEXT: v_sub_co_u32 v0, vcc, s2, v3 3042; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 3043; GFX1064-NEXT: s_mov_b32 s2, -1 3044; GFX1064-NEXT: v_mov_b32_e32 v1, v4 3045; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s4, 
v1, vcc 3046; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 3047; GFX1064-NEXT: s_endpgm 3048; 3049; GFX1032-LABEL: sub_i64_uniform: 3050; GFX1032: ; %bb.0: ; %entry 3051; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 3052; GFX1032-NEXT: s_mov_b32 s5, exec_lo 3053; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 3054; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0 3055; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 3056; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo 3057; GFX1032-NEXT: s_cbranch_execz .LBB12_2 3058; GFX1032-NEXT: ; %bb.1: 3059; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s5 3060; GFX1032-NEXT: v_mov_b32_e32 v3, 0 3061; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3062; GFX1032-NEXT: s_mul_i32 s6, s3, s5 3063; GFX1032-NEXT: s_mul_hi_u32 s7, s2, s5 3064; GFX1032-NEXT: s_mul_i32 s5, s2, s5 3065; GFX1032-NEXT: s_add_i32 s7, s7, s6 3066; GFX1032-NEXT: v_mov_b32_e32 v0, s5 3067; GFX1032-NEXT: v_mov_b32_e32 v1, s7 3068; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3069; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 3070; GFX1032-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1] 3071; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3072; GFX1032-NEXT: buffer_gl0_inv 3073; GFX1032-NEXT: .LBB12_2: 3074; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 3075; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 3076; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3077; GFX1032-NEXT: v_mad_u64_u32 v[3:4], null, s2, v2, 0 3078; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 3079; GFX1032-NEXT: v_readfirstlane_b32 s4, v1 3080; GFX1032-NEXT: v_mad_u64_u32 v[4:5], null, s3, v2, v[4:5] 3081; GFX1032-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v3 3082; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 3083; GFX1032-NEXT: s_mov_b32 s2, -1 3084; GFX1032-NEXT: v_mov_b32_e32 v1, v4 3085; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s4, v1, vcc_lo 3086; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 3087; GFX1032-NEXT: s_endpgm 3088; 3089; GFX1164-LABEL: sub_i64_uniform: 3090; GFX1164: ; %bb.0: ; %entry 3091; GFX1164-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 3092; 
GFX1164-NEXT: s_mov_b64 s[6:7], exec 3093; GFX1164-NEXT: s_mov_b64 s[4:5], exec 3094; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 3095; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 3096; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 3097; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 3098; GFX1164-NEXT: s_cbranch_execz .LBB12_2 3099; GFX1164-NEXT: ; %bb.1: 3100; GFX1164-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 3101; GFX1164-NEXT: v_mov_b32_e32 v3, 0 3102; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 3103; GFX1164-NEXT: s_mul_i32 s7, s3, s6 3104; GFX1164-NEXT: s_mul_hi_u32 s8, s2, s6 3105; GFX1164-NEXT: s_mul_i32 s6, s2, s6 3106; GFX1164-NEXT: s_add_i32 s8, s8, s7 3107; GFX1164-NEXT: v_mov_b32_e32 v0, s6 3108; GFX1164-NEXT: v_mov_b32_e32 v1, s8 3109; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3110; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 3111; GFX1164-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1] 3112; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 3113; GFX1164-NEXT: buffer_gl0_inv 3114; GFX1164-NEXT: .LBB12_2: 3115; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] 3116; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 3117; GFX1164-NEXT: v_mad_u64_u32 v[3:4], null, s2, v2, 0 3118; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 3119; GFX1164-NEXT: v_readfirstlane_b32 s4, v1 3120; GFX1164-NEXT: v_mad_u64_u32 v[5:6], null, s3, v2, v[4:5] 3121; GFX1164-NEXT: v_sub_co_u32 v0, vcc, s2, v3 3122; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 3123; GFX1164-NEXT: s_mov_b32 s2, -1 3124; GFX1164-NEXT: v_mov_b32_e32 v1, v5 3125; GFX1164-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s4, v1, vcc 3126; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 3127; GFX1164-NEXT: s_endpgm 3128; 3129; GFX1132-LABEL: sub_i64_uniform: 3130; GFX1132: ; %bb.0: ; %entry 3131; GFX1132-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 3132; GFX1132-NEXT: s_mov_b32 s5, exec_lo 3133; GFX1132-NEXT: s_mov_b32 s4, exec_lo 3134; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0 3135; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 3136; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 3137; GFX1132-NEXT: s_cbranch_execz 
.LBB12_2 3138; GFX1132-NEXT: ; %bb.1: 3139; GFX1132-NEXT: s_bcnt1_i32_b32 s5, s5 3140; GFX1132-NEXT: v_mov_b32_e32 v3, 0 3141; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 3142; GFX1132-NEXT: s_mul_i32 s6, s3, s5 3143; GFX1132-NEXT: s_mul_hi_u32 s7, s2, s5 3144; GFX1132-NEXT: s_mul_i32 s5, s2, s5 3145; GFX1132-NEXT: s_add_i32 s7, s7, s6 3146; GFX1132-NEXT: v_mov_b32_e32 v0, s5 3147; GFX1132-NEXT: v_mov_b32_e32 v1, s7 3148; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3149; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 3150; GFX1132-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1] 3151; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 3152; GFX1132-NEXT: buffer_gl0_inv 3153; GFX1132-NEXT: .LBB12_2: 3154; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s4 3155; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 3156; GFX1132-NEXT: v_mad_u64_u32 v[3:4], null, s2, v2, 0 3157; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 3158; GFX1132-NEXT: v_readfirstlane_b32 s4, v1 3159; GFX1132-NEXT: v_mad_u64_u32 v[5:6], null, s3, v2, v[4:5] 3160; GFX1132-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v3 3161; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 3162; GFX1132-NEXT: s_mov_b32 s2, -1 3163; GFX1132-NEXT: v_mov_b32_e32 v1, v5 3164; GFX1132-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s4, v1, vcc_lo 3165; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 3166; GFX1132-NEXT: s_endpgm 3167entry: 3168 %old = atomicrmw sub i64 addrspace(3)* @local_var64, i64 %subitive acq_rel 3169 store i64 %old, i64 addrspace(1)* %out 3170 ret void 3171} 3172 3173define amdgpu_kernel void @sub_i64_varying(i64 addrspace(1)* %out) { 3174; 3175; 3176; GFX7LESS-LABEL: sub_i64_varying: 3177; GFX7LESS: ; %bb.0: ; %entry 3178; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 3179; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 3180; GFX7LESS-NEXT: s_mov_b32 m0, -1 3181; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3182; GFX7LESS-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] 3183; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3184; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 3185; GFX7LESS-NEXT: s_mov_b32 s2, -1 3186; GFX7LESS-NEXT: 
buffer_store_dwordx2 v[0:1], off, s[0:3], 0 3187; GFX7LESS-NEXT: s_endpgm 3188; 3189; GFX8-LABEL: sub_i64_varying: 3190; GFX8: ; %bb.0: ; %entry 3191; GFX8-NEXT: v_mov_b32_e32 v1, 0 3192; GFX8-NEXT: s_mov_b32 m0, -1 3193; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3194; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3195; GFX8-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] 3196; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3197; GFX8-NEXT: s_mov_b32 s3, 0xf000 3198; GFX8-NEXT: s_mov_b32 s2, -1 3199; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 3200; GFX8-NEXT: s_endpgm 3201; 3202; GFX9-LABEL: sub_i64_varying: 3203; GFX9: ; %bb.0: ; %entry 3204; GFX9-NEXT: v_mov_b32_e32 v1, 0 3205; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3206; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3207; GFX9-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] 3208; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3209; GFX9-NEXT: s_mov_b32 s3, 0xf000 3210; GFX9-NEXT: s_mov_b32 s2, -1 3211; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 3212; GFX9-NEXT: s_endpgm 3213; 3214; GFX10-LABEL: sub_i64_varying: 3215; GFX10: ; %bb.0: ; %entry 3216; GFX10-NEXT: v_mov_b32_e32 v1, 0 3217; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3218; GFX10-NEXT: s_mov_b32 s3, 0x31016000 3219; GFX10-NEXT: s_mov_b32 s2, -1 3220; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3221; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 3222; GFX10-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] 3223; GFX10-NEXT: s_waitcnt lgkmcnt(0) 3224; GFX10-NEXT: buffer_gl0_inv 3225; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 3226; GFX10-NEXT: s_endpgm 3227; 3228; GFX11-LABEL: sub_i64_varying: 3229; GFX11: ; %bb.0: ; %entry 3230; GFX11-NEXT: v_mov_b32_e32 v1, 0 3231; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 3232; GFX11-NEXT: s_mov_b32 s3, 0x31016000 3233; GFX11-NEXT: s_mov_b32 s2, -1 3234; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3235; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 3236; GFX11-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] 3237; GFX11-NEXT: s_waitcnt lgkmcnt(0) 3238; GFX11-NEXT: 
buffer_gl0_inv 3239; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 3240; GFX11-NEXT: s_endpgm 3241entry: 3242 %lane = call i32 @llvm.amdgcn.workitem.id.x() 3243 %zext = zext i32 %lane to i64 3244 %old = atomicrmw sub i64 addrspace(3)* @local_var64, i64 %zext acq_rel 3245 store i64 %old, i64 addrspace(1)* %out 3246 ret void 3247} 3248 3249define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) { 3250; 3251; 3252; GFX7LESS-LABEL: and_i32_varying: 3253; GFX7LESS: ; %bb.0: ; %entry 3254; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 3255; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 3256; GFX7LESS-NEXT: s_mov_b32 m0, -1 3257; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3258; GFX7LESS-NEXT: ds_and_rtn_b32 v0, v1, v0 3259; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3260; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 3261; GFX7LESS-NEXT: s_mov_b32 s2, -1 3262; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 3263; GFX7LESS-NEXT: s_endpgm 3264; 3265; GFX8-LABEL: and_i32_varying: 3266; GFX8: ; %bb.0: ; %entry 3267; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3268; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 3269; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 3270; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 3271; GFX8-NEXT: v_mov_b32_e32 v1, -1 3272; GFX8-NEXT: s_mov_b64 exec, s[2:3] 3273; GFX8-NEXT: v_mov_b32_e32 v2, v0 3274; GFX8-NEXT: s_not_b64 exec, exec 3275; GFX8-NEXT: v_mov_b32_e32 v2, -1 3276; GFX8-NEXT: s_not_b64 exec, exec 3277; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 3278; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3279; GFX8-NEXT: s_nop 1 3280; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 3281; GFX8-NEXT: s_nop 1 3282; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 3283; GFX8-NEXT: s_nop 1 3284; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 3285; GFX8-NEXT: s_nop 1 3286; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 3287; GFX8-NEXT: 
s_nop 1 3288; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 3289; GFX8-NEXT: v_readlane_b32 s4, v2, 63 3290; GFX8-NEXT: s_nop 0 3291; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 3292; GFX8-NEXT: s_mov_b64 exec, s[2:3] 3293; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 3294; GFX8-NEXT: ; implicit-def: $vgpr0 3295; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 3296; GFX8-NEXT: s_cbranch_execz .LBB14_2 3297; GFX8-NEXT: ; %bb.1: 3298; GFX8-NEXT: v_mov_b32_e32 v0, 0 3299; GFX8-NEXT: v_mov_b32_e32 v3, s4 3300; GFX8-NEXT: s_mov_b32 m0, -1 3301; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3302; GFX8-NEXT: ds_and_rtn_b32 v0, v0, v3 3303; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3304; GFX8-NEXT: .LBB14_2: 3305; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 3306; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3307; GFX8-NEXT: v_readfirstlane_b32 s2, v0 3308; GFX8-NEXT: v_mov_b32_e32 v0, v1 3309; GFX8-NEXT: v_and_b32_e32 v0, s2, v0 3310; GFX8-NEXT: s_mov_b32 s3, 0xf000 3311; GFX8-NEXT: s_mov_b32 s2, -1 3312; GFX8-NEXT: s_nop 0 3313; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 3314; GFX8-NEXT: s_endpgm 3315; 3316; GFX9-LABEL: and_i32_varying: 3317; GFX9: ; %bb.0: ; %entry 3318; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3319; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 3320; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 3321; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 3322; GFX9-NEXT: v_mov_b32_e32 v1, -1 3323; GFX9-NEXT: s_mov_b64 exec, s[2:3] 3324; GFX9-NEXT: v_mov_b32_e32 v2, v0 3325; GFX9-NEXT: s_not_b64 exec, exec 3326; GFX9-NEXT: v_mov_b32_e32 v2, -1 3327; GFX9-NEXT: s_not_b64 exec, exec 3328; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 3329; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3330; GFX9-NEXT: s_nop 1 3331; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 3332; GFX9-NEXT: s_nop 1 3333; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 3334; GFX9-NEXT: s_nop 1 3335; GFX9-NEXT: 
v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 3336; GFX9-NEXT: s_nop 1 3337; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 3338; GFX9-NEXT: s_nop 1 3339; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 3340; GFX9-NEXT: v_readlane_b32 s4, v2, 63 3341; GFX9-NEXT: s_nop 0 3342; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 3343; GFX9-NEXT: s_mov_b64 exec, s[2:3] 3344; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 3345; GFX9-NEXT: ; implicit-def: $vgpr0 3346; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 3347; GFX9-NEXT: s_cbranch_execz .LBB14_2 3348; GFX9-NEXT: ; %bb.1: 3349; GFX9-NEXT: v_mov_b32_e32 v0, 0 3350; GFX9-NEXT: v_mov_b32_e32 v3, s4 3351; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3352; GFX9-NEXT: ds_and_rtn_b32 v0, v0, v3 3353; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3354; GFX9-NEXT: .LBB14_2: 3355; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 3356; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3357; GFX9-NEXT: v_readfirstlane_b32 s2, v0 3358; GFX9-NEXT: v_mov_b32_e32 v0, v1 3359; GFX9-NEXT: v_and_b32_e32 v0, s2, v0 3360; GFX9-NEXT: s_mov_b32 s3, 0xf000 3361; GFX9-NEXT: s_mov_b32 s2, -1 3362; GFX9-NEXT: s_nop 0 3363; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 3364; GFX9-NEXT: s_endpgm 3365; 3366; GFX1064-LABEL: and_i32_varying: 3367; GFX1064: ; %bb.0: ; %entry 3368; GFX1064-NEXT: v_mov_b32_e32 v1, v0 3369; GFX1064-NEXT: s_not_b64 exec, exec 3370; GFX1064-NEXT: v_mov_b32_e32 v1, -1 3371; GFX1064-NEXT: s_not_b64 exec, exec 3372; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 3373; GFX1064-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 3374; GFX1064-NEXT: v_mov_b32_e32 v3, -1 3375; GFX1064-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 3376; GFX1064-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 3377; GFX1064-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 3378; GFX1064-NEXT: v_mov_b32_e32 v2, v1 3379; GFX1064-NEXT: 
v_permlanex16_b32 v2, v2, -1, -1 3380; GFX1064-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 3381; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 3382; GFX1064-NEXT: v_mov_b32_e32 v2, s4 3383; GFX1064-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 3384; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 3385; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 3386; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 3387; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3388; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 3389; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 3390; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 3391; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 3392; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3393; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 3394; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 3395; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 3396; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 3397; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 3398; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 3399; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 3400; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 3401; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 3402; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3403; GFX1064-NEXT: s_mov_b32 s2, -1 3404; GFX1064-NEXT: ; implicit-def: $vgpr0 3405; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 3406; GFX1064-NEXT: s_cbranch_execz .LBB14_2 3407; GFX1064-NEXT: ; %bb.1: 3408; GFX1064-NEXT: v_mov_b32_e32 v0, 0 3409; GFX1064-NEXT: v_mov_b32_e32 v4, s7 3410; GFX1064-NEXT: s_mov_b32 s3, s7 3411; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3412; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 3413; GFX1064-NEXT: ds_and_rtn_b32 v0, v0, v4 3414; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3415; GFX1064-NEXT: buffer_gl0_inv 3416; GFX1064-NEXT: .LBB14_2: 3417; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 3418; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 3419; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 3420; GFX1064-NEXT: v_mov_b32_e32 v0, v3 3421; 
GFX1064-NEXT: v_and_b32_e32 v0, s3, v0 3422; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 3423; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3424; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 3425; GFX1064-NEXT: s_endpgm 3426; 3427; GFX1032-LABEL: and_i32_varying: 3428; GFX1032: ; %bb.0: ; %entry 3429; GFX1032-NEXT: v_mov_b32_e32 v1, v0 3430; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 3431; GFX1032-NEXT: v_mov_b32_e32 v1, -1 3432; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 3433; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3434; GFX1032-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 3435; GFX1032-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 3436; GFX1032-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 3437; GFX1032-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 3438; GFX1032-NEXT: v_mov_b32_e32 v2, v1 3439; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 3440; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3441; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3442; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3443; GFX1032-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 3444; GFX1032-NEXT: v_mov_b32_e32 v3, -1 3445; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 3446; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 3447; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 3448; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3449; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3450; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3451; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 3452; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3453; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 3454; GFX1032-NEXT: s_mov_b32 s2, -1 3455; GFX1032-NEXT: ; implicit-def: $vgpr0 3456; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 3457; GFX1032-NEXT: s_cbranch_execz .LBB14_2 3458; GFX1032-NEXT: ; %bb.1: 3459; GFX1032-NEXT: v_mov_b32_e32 v0, 0 3460; GFX1032-NEXT: v_mov_b32_e32 v4, s4 3461; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3462; 
GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 3463; GFX1032-NEXT: ds_and_rtn_b32 v0, v0, v4 3464; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3465; GFX1032-NEXT: buffer_gl0_inv 3466; GFX1032-NEXT: .LBB14_2: 3467; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 3468; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 3469; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 3470; GFX1032-NEXT: v_mov_b32_e32 v0, v3 3471; GFX1032-NEXT: v_and_b32_e32 v0, s3, v0 3472; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 3473; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3474; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 3475; GFX1032-NEXT: s_endpgm 3476; 3477; GFX1164-LABEL: and_i32_varying: 3478; GFX1164: ; %bb.0: ; %entry 3479; GFX1164-NEXT: v_mov_b32_e32 v1, v0 3480; GFX1164-NEXT: s_not_b64 exec, exec 3481; GFX1164-NEXT: v_mov_b32_e32 v1, -1 3482; GFX1164-NEXT: s_not_b64 exec, exec 3483; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 3484; GFX1164-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 3485; GFX1164-NEXT: v_mov_b32_e32 v3, -1 3486; GFX1164-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 3487; GFX1164-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 3488; GFX1164-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 3489; GFX1164-NEXT: v_mov_b32_e32 v2, v1 3490; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1 3491; GFX1164-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 3492; GFX1164-NEXT: v_readlane_b32 s4, v1, 31 3493; GFX1164-NEXT: v_mov_b32_e32 v2, s4 3494; GFX1164-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 3495; GFX1164-NEXT: v_readlane_b32 s4, v1, 15 3496; GFX1164-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 3497; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 3498; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 3499; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 3500; GFX1164-NEXT: v_readlane_b32 s5, v1, 31 3501; GFX1164-NEXT: v_writelane_b32 v3, s4, 16 3502; GFX1164-NEXT: 
s_mov_b64 exec, s[2:3] 3503; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3504; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 3505; GFX1164-NEXT: v_readlane_b32 s7, v1, 63 3506; GFX1164-NEXT: v_readlane_b32 s6, v1, 47 3507; GFX1164-NEXT: v_writelane_b32 v3, s5, 32 3508; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 3509; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 3510; GFX1164-NEXT: s_or_saveexec_b64 s[4:5], -1 3511; GFX1164-NEXT: v_writelane_b32 v3, s6, 48 3512; GFX1164-NEXT: s_mov_b64 exec, s[4:5] 3513; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3514; GFX1164-NEXT: s_mov_b32 s2, -1 3515; GFX1164-NEXT: ; implicit-def: $vgpr0 3516; GFX1164-NEXT: s_and_saveexec_b64 s[4:5], vcc 3517; GFX1164-NEXT: s_cbranch_execz .LBB14_2 3518; GFX1164-NEXT: ; %bb.1: 3519; GFX1164-NEXT: v_mov_b32_e32 v0, 0 3520; GFX1164-NEXT: v_mov_b32_e32 v4, s7 3521; GFX1164-NEXT: s_mov_b32 s3, s7 3522; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3523; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 3524; GFX1164-NEXT: ds_and_rtn_b32 v0, v0, v4 3525; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 3526; GFX1164-NEXT: buffer_gl0_inv 3527; GFX1164-NEXT: .LBB14_2: 3528; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] 3529; GFX1164-NEXT: v_readfirstlane_b32 s3, v0 3530; GFX1164-NEXT: v_mov_b32_e32 v0, v3 3531; GFX1164-NEXT: v_and_b32_e32 v0, s3, v0 3532; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 3533; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 3534; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 3535; GFX1164-NEXT: s_endpgm 3536; 3537; GFX1132-LABEL: and_i32_varying: 3538; GFX1132: ; %bb.0: ; %entry 3539; GFX1132-NEXT: v_mov_b32_e32 v1, v0 3540; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo 3541; GFX1132-NEXT: v_mov_b32_e32 v1, -1 3542; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo 3543; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 3544; GFX1132-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 3545; GFX1132-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 3546; GFX1132-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 
row_mask:0xf bank_mask:0xf 3547; GFX1132-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 3548; GFX1132-NEXT: v_mov_b32_e32 v2, v1 3549; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1 3550; GFX1132-NEXT: s_mov_b32 exec_lo, s2 3551; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 3552; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 3553; GFX1132-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 3554; GFX1132-NEXT: v_mov_b32_e32 v3, -1 3555; GFX1132-NEXT: v_readlane_b32 s3, v1, 15 3556; GFX1132-NEXT: v_readlane_b32 s4, v1, 31 3557; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 3558; GFX1132-NEXT: s_mov_b32 exec_lo, s2 3559; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3560; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 3561; GFX1132-NEXT: v_writelane_b32 v3, s3, 16 3562; GFX1132-NEXT: s_mov_b32 exec_lo, s2 3563; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 3564; GFX1132-NEXT: s_mov_b32 s2, -1 3565; GFX1132-NEXT: ; implicit-def: $vgpr0 3566; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo 3567; GFX1132-NEXT: s_cbranch_execz .LBB14_2 3568; GFX1132-NEXT: ; %bb.1: 3569; GFX1132-NEXT: v_mov_b32_e32 v0, 0 3570; GFX1132-NEXT: v_mov_b32_e32 v4, s4 3571; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3572; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 3573; GFX1132-NEXT: ds_and_rtn_b32 v0, v0, v4 3574; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 3575; GFX1132-NEXT: buffer_gl0_inv 3576; GFX1132-NEXT: .LBB14_2: 3577; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 3578; GFX1132-NEXT: v_readfirstlane_b32 s3, v0 3579; GFX1132-NEXT: v_mov_b32_e32 v0, v3 3580; GFX1132-NEXT: v_and_b32_e32 v0, s3, v0 3581; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 3582; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 3583; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 3584; GFX1132-NEXT: s_endpgm 3585entry: 3586 %lane = call i32 @llvm.amdgcn.workitem.id.x() 3587 %old = atomicrmw and i32 addrspace(3)* @local_var32, i32 %lane acq_rel 3588 store i32 %old, i32 addrspace(1)* %out 3589 
ret void 3590} 3591 3592define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) { 3593; 3594; 3595; GFX7LESS-LABEL: or_i32_varying: 3596; GFX7LESS: ; %bb.0: ; %entry 3597; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 3598; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 3599; GFX7LESS-NEXT: s_mov_b32 m0, -1 3600; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3601; GFX7LESS-NEXT: ds_or_rtn_b32 v0, v1, v0 3602; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3603; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 3604; GFX7LESS-NEXT: s_mov_b32 s2, -1 3605; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 3606; GFX7LESS-NEXT: s_endpgm 3607; 3608; GFX8-LABEL: or_i32_varying: 3609; GFX8: ; %bb.0: ; %entry 3610; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3611; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 3612; GFX8-NEXT: v_mov_b32_e32 v1, 0 3613; GFX8-NEXT: s_mov_b64 exec, s[2:3] 3614; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 3615; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 3616; GFX8-NEXT: v_mov_b32_e32 v2, v0 3617; GFX8-NEXT: s_not_b64 exec, exec 3618; GFX8-NEXT: v_mov_b32_e32 v2, 0 3619; GFX8-NEXT: s_not_b64 exec, exec 3620; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 3621; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 3622; GFX8-NEXT: s_nop 1 3623; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 3624; GFX8-NEXT: s_nop 1 3625; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 3626; GFX8-NEXT: s_nop 1 3627; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 3628; GFX8-NEXT: s_nop 1 3629; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 3630; GFX8-NEXT: s_nop 1 3631; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 3632; GFX8-NEXT: v_readlane_b32 s4, v2, 63 3633; GFX8-NEXT: s_nop 0 3634; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 3635; GFX8-NEXT: s_mov_b64 exec, s[2:3] 3636; 
GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 3637; GFX8-NEXT: ; implicit-def: $vgpr0 3638; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 3639; GFX8-NEXT: s_cbranch_execz .LBB15_2 3640; GFX8-NEXT: ; %bb.1: 3641; GFX8-NEXT: v_mov_b32_e32 v0, 0 3642; GFX8-NEXT: v_mov_b32_e32 v3, s4 3643; GFX8-NEXT: s_mov_b32 m0, -1 3644; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3645; GFX8-NEXT: ds_or_rtn_b32 v0, v0, v3 3646; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3647; GFX8-NEXT: .LBB15_2: 3648; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 3649; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3650; GFX8-NEXT: v_readfirstlane_b32 s2, v0 3651; GFX8-NEXT: v_mov_b32_e32 v0, v1 3652; GFX8-NEXT: v_or_b32_e32 v0, s2, v0 3653; GFX8-NEXT: s_mov_b32 s3, 0xf000 3654; GFX8-NEXT: s_mov_b32 s2, -1 3655; GFX8-NEXT: s_nop 0 3656; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 3657; GFX8-NEXT: s_endpgm 3658; 3659; GFX9-LABEL: or_i32_varying: 3660; GFX9: ; %bb.0: ; %entry 3661; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3662; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 3663; GFX9-NEXT: v_mov_b32_e32 v1, 0 3664; GFX9-NEXT: s_mov_b64 exec, s[2:3] 3665; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 3666; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 3667; GFX9-NEXT: v_mov_b32_e32 v2, v0 3668; GFX9-NEXT: s_not_b64 exec, exec 3669; GFX9-NEXT: v_mov_b32_e32 v2, 0 3670; GFX9-NEXT: s_not_b64 exec, exec 3671; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 3672; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 3673; GFX9-NEXT: s_nop 1 3674; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 3675; GFX9-NEXT: s_nop 1 3676; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 3677; GFX9-NEXT: s_nop 1 3678; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 3679; GFX9-NEXT: s_nop 1 3680; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 3681; GFX9-NEXT: s_nop 1 3682; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 
row_bcast:31 row_mask:0xc bank_mask:0xf 3683; GFX9-NEXT: v_readlane_b32 s4, v2, 63 3684; GFX9-NEXT: s_nop 0 3685; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 3686; GFX9-NEXT: s_mov_b64 exec, s[2:3] 3687; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 3688; GFX9-NEXT: ; implicit-def: $vgpr0 3689; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 3690; GFX9-NEXT: s_cbranch_execz .LBB15_2 3691; GFX9-NEXT: ; %bb.1: 3692; GFX9-NEXT: v_mov_b32_e32 v0, 0 3693; GFX9-NEXT: v_mov_b32_e32 v3, s4 3694; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3695; GFX9-NEXT: ds_or_rtn_b32 v0, v0, v3 3696; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3697; GFX9-NEXT: .LBB15_2: 3698; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 3699; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3700; GFX9-NEXT: v_readfirstlane_b32 s2, v0 3701; GFX9-NEXT: v_mov_b32_e32 v0, v1 3702; GFX9-NEXT: v_or_b32_e32 v0, s2, v0 3703; GFX9-NEXT: s_mov_b32 s3, 0xf000 3704; GFX9-NEXT: s_mov_b32 s2, -1 3705; GFX9-NEXT: s_nop 0 3706; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 3707; GFX9-NEXT: s_endpgm 3708; 3709; GFX1064-LABEL: or_i32_varying: 3710; GFX1064: ; %bb.0: ; %entry 3711; GFX1064-NEXT: v_mov_b32_e32 v1, v0 3712; GFX1064-NEXT: s_not_b64 exec, exec 3713; GFX1064-NEXT: v_mov_b32_e32 v1, 0 3714; GFX1064-NEXT: s_not_b64 exec, exec 3715; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 3716; GFX1064-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 3717; GFX1064-NEXT: v_mov_b32_e32 v3, 0 3718; GFX1064-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 3719; GFX1064-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 3720; GFX1064-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 3721; GFX1064-NEXT: v_mov_b32_e32 v2, v1 3722; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 3723; GFX1064-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 3724; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 3725; GFX1064-NEXT: v_mov_b32_e32 
v2, s4 3726; GFX1064-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 3727; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 3728; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 3729; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 3730; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3731; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 3732; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 3733; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 3734; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 3735; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3736; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 3737; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 3738; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 3739; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 3740; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 3741; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 3742; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 3743; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 3744; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 3745; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3746; GFX1064-NEXT: s_mov_b32 s2, -1 3747; GFX1064-NEXT: ; implicit-def: $vgpr0 3748; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 3749; GFX1064-NEXT: s_cbranch_execz .LBB15_2 3750; GFX1064-NEXT: ; %bb.1: 3751; GFX1064-NEXT: v_mov_b32_e32 v0, 0 3752; GFX1064-NEXT: v_mov_b32_e32 v4, s7 3753; GFX1064-NEXT: s_mov_b32 s3, s7 3754; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3755; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 3756; GFX1064-NEXT: ds_or_rtn_b32 v0, v0, v4 3757; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3758; GFX1064-NEXT: buffer_gl0_inv 3759; GFX1064-NEXT: .LBB15_2: 3760; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 3761; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 3762; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 3763; GFX1064-NEXT: v_mov_b32_e32 v0, v3 3764; GFX1064-NEXT: v_or_b32_e32 v0, s3, v0 3765; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 3766; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3767; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 3768; GFX1064-NEXT: s_endpgm 
3769; 3770; GFX1032-LABEL: or_i32_varying: 3771; GFX1032: ; %bb.0: ; %entry 3772; GFX1032-NEXT: v_mov_b32_e32 v1, v0 3773; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 3774; GFX1032-NEXT: v_mov_b32_e32 v1, 0 3775; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 3776; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3777; GFX1032-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 3778; GFX1032-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 3779; GFX1032-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 3780; GFX1032-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 3781; GFX1032-NEXT: v_mov_b32_e32 v2, v1 3782; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 3783; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3784; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3785; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3786; GFX1032-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 3787; GFX1032-NEXT: v_mov_b32_e32 v3, 0 3788; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 3789; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 3790; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 3791; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3792; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3793; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3794; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 3795; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3796; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 3797; GFX1032-NEXT: s_mov_b32 s2, -1 3798; GFX1032-NEXT: ; implicit-def: $vgpr0 3799; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 3800; GFX1032-NEXT: s_cbranch_execz .LBB15_2 3801; GFX1032-NEXT: ; %bb.1: 3802; GFX1032-NEXT: v_mov_b32_e32 v0, 0 3803; GFX1032-NEXT: v_mov_b32_e32 v4, s4 3804; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3805; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 3806; GFX1032-NEXT: ds_or_rtn_b32 v0, v0, v4 3807; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3808; GFX1032-NEXT: buffer_gl0_inv 3809; 
GFX1032-NEXT: .LBB15_2: 3810; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 3811; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 3812; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 3813; GFX1032-NEXT: v_mov_b32_e32 v0, v3 3814; GFX1032-NEXT: v_or_b32_e32 v0, s3, v0 3815; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 3816; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3817; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 3818; GFX1032-NEXT: s_endpgm 3819; 3820; GFX1164-LABEL: or_i32_varying: 3821; GFX1164: ; %bb.0: ; %entry 3822; GFX1164-NEXT: v_mov_b32_e32 v1, v0 3823; GFX1164-NEXT: s_not_b64 exec, exec 3824; GFX1164-NEXT: v_mov_b32_e32 v1, 0 3825; GFX1164-NEXT: s_not_b64 exec, exec 3826; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 3827; GFX1164-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 3828; GFX1164-NEXT: v_mov_b32_e32 v3, 0 3829; GFX1164-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 3830; GFX1164-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 3831; GFX1164-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 3832; GFX1164-NEXT: v_mov_b32_e32 v2, v1 3833; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1 3834; GFX1164-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 3835; GFX1164-NEXT: v_readlane_b32 s4, v1, 31 3836; GFX1164-NEXT: v_mov_b32_e32 v2, s4 3837; GFX1164-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 3838; GFX1164-NEXT: v_readlane_b32 s4, v1, 15 3839; GFX1164-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 3840; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 3841; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 3842; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 3843; GFX1164-NEXT: v_readlane_b32 s5, v1, 31 3844; GFX1164-NEXT: v_writelane_b32 v3, s4, 16 3845; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 3846; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3847; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 
3848; GFX1164-NEXT: v_readlane_b32 s7, v1, 63 3849; GFX1164-NEXT: v_readlane_b32 s6, v1, 47 3850; GFX1164-NEXT: v_writelane_b32 v3, s5, 32 3851; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 3852; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 3853; GFX1164-NEXT: s_or_saveexec_b64 s[4:5], -1 3854; GFX1164-NEXT: v_writelane_b32 v3, s6, 48 3855; GFX1164-NEXT: s_mov_b64 exec, s[4:5] 3856; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3857; GFX1164-NEXT: s_mov_b32 s2, -1 3858; GFX1164-NEXT: ; implicit-def: $vgpr0 3859; GFX1164-NEXT: s_and_saveexec_b64 s[4:5], vcc 3860; GFX1164-NEXT: s_cbranch_execz .LBB15_2 3861; GFX1164-NEXT: ; %bb.1: 3862; GFX1164-NEXT: v_mov_b32_e32 v0, 0 3863; GFX1164-NEXT: v_mov_b32_e32 v4, s7 3864; GFX1164-NEXT: s_mov_b32 s3, s7 3865; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3866; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 3867; GFX1164-NEXT: ds_or_rtn_b32 v0, v0, v4 3868; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 3869; GFX1164-NEXT: buffer_gl0_inv 3870; GFX1164-NEXT: .LBB15_2: 3871; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] 3872; GFX1164-NEXT: v_readfirstlane_b32 s3, v0 3873; GFX1164-NEXT: v_mov_b32_e32 v0, v3 3874; GFX1164-NEXT: v_or_b32_e32 v0, s3, v0 3875; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 3876; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 3877; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 3878; GFX1164-NEXT: s_endpgm 3879; 3880; GFX1132-LABEL: or_i32_varying: 3881; GFX1132: ; %bb.0: ; %entry 3882; GFX1132-NEXT: v_mov_b32_e32 v1, v0 3883; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo 3884; GFX1132-NEXT: v_mov_b32_e32 v1, 0 3885; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo 3886; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 3887; GFX1132-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 3888; GFX1132-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 3889; GFX1132-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 3890; GFX1132-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf 
bank_mask:0xf bound_ctrl:1 3891; GFX1132-NEXT: v_mov_b32_e32 v2, v1 3892; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1 3893; GFX1132-NEXT: s_mov_b32 exec_lo, s2 3894; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 3895; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 3896; GFX1132-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 3897; GFX1132-NEXT: v_mov_b32_e32 v3, 0 3898; GFX1132-NEXT: v_readlane_b32 s3, v1, 15 3899; GFX1132-NEXT: v_readlane_b32 s4, v1, 31 3900; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 3901; GFX1132-NEXT: s_mov_b32 exec_lo, s2 3902; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3903; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 3904; GFX1132-NEXT: v_writelane_b32 v3, s3, 16 3905; GFX1132-NEXT: s_mov_b32 exec_lo, s2 3906; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 3907; GFX1132-NEXT: s_mov_b32 s2, -1 3908; GFX1132-NEXT: ; implicit-def: $vgpr0 3909; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo 3910; GFX1132-NEXT: s_cbranch_execz .LBB15_2 3911; GFX1132-NEXT: ; %bb.1: 3912; GFX1132-NEXT: v_mov_b32_e32 v0, 0 3913; GFX1132-NEXT: v_mov_b32_e32 v4, s4 3914; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3915; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 3916; GFX1132-NEXT: ds_or_rtn_b32 v0, v0, v4 3917; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 3918; GFX1132-NEXT: buffer_gl0_inv 3919; GFX1132-NEXT: .LBB15_2: 3920; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 3921; GFX1132-NEXT: v_readfirstlane_b32 s3, v0 3922; GFX1132-NEXT: v_mov_b32_e32 v0, v3 3923; GFX1132-NEXT: v_or_b32_e32 v0, s3, v0 3924; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 3925; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 3926; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 3927; GFX1132-NEXT: s_endpgm 3928entry: 3929 %lane = call i32 @llvm.amdgcn.workitem.id.x() 3930 %old = atomicrmw or i32 addrspace(3)* @local_var32, i32 %lane acq_rel 3931 store i32 %old, i32 addrspace(1)* %out 3932 ret void 3933} 3934 3935define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* 
%out) { 3936; 3937; 3938; GFX7LESS-LABEL: xor_i32_varying: 3939; GFX7LESS: ; %bb.0: ; %entry 3940; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 3941; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 3942; GFX7LESS-NEXT: s_mov_b32 m0, -1 3943; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3944; GFX7LESS-NEXT: ds_xor_rtn_b32 v0, v1, v0 3945; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3946; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 3947; GFX7LESS-NEXT: s_mov_b32 s2, -1 3948; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 3949; GFX7LESS-NEXT: s_endpgm 3950; 3951; GFX8-LABEL: xor_i32_varying: 3952; GFX8: ; %bb.0: ; %entry 3953; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3954; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 3955; GFX8-NEXT: v_mov_b32_e32 v1, 0 3956; GFX8-NEXT: s_mov_b64 exec, s[2:3] 3957; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 3958; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 3959; GFX8-NEXT: v_mov_b32_e32 v2, v0 3960; GFX8-NEXT: s_not_b64 exec, exec 3961; GFX8-NEXT: v_mov_b32_e32 v2, 0 3962; GFX8-NEXT: s_not_b64 exec, exec 3963; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 3964; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 3965; GFX8-NEXT: s_nop 1 3966; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 3967; GFX8-NEXT: s_nop 1 3968; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 3969; GFX8-NEXT: s_nop 1 3970; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 3971; GFX8-NEXT: s_nop 1 3972; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 3973; GFX8-NEXT: s_nop 1 3974; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 3975; GFX8-NEXT: v_readlane_b32 s4, v2, 63 3976; GFX8-NEXT: s_nop 0 3977; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 3978; GFX8-NEXT: s_mov_b64 exec, s[2:3] 3979; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 3980; GFX8-NEXT: ; implicit-def: 
$vgpr0 3981; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 3982; GFX8-NEXT: s_cbranch_execz .LBB16_2 3983; GFX8-NEXT: ; %bb.1: 3984; GFX8-NEXT: v_mov_b32_e32 v0, 0 3985; GFX8-NEXT: v_mov_b32_e32 v3, s4 3986; GFX8-NEXT: s_mov_b32 m0, -1 3987; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3988; GFX8-NEXT: ds_xor_rtn_b32 v0, v0, v3 3989; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3990; GFX8-NEXT: .LBB16_2: 3991; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 3992; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3993; GFX8-NEXT: v_readfirstlane_b32 s2, v0 3994; GFX8-NEXT: v_mov_b32_e32 v0, v1 3995; GFX8-NEXT: v_xor_b32_e32 v0, s2, v0 3996; GFX8-NEXT: s_mov_b32 s3, 0xf000 3997; GFX8-NEXT: s_mov_b32 s2, -1 3998; GFX8-NEXT: s_nop 0 3999; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 4000; GFX8-NEXT: s_endpgm 4001; 4002; GFX9-LABEL: xor_i32_varying: 4003; GFX9: ; %bb.0: ; %entry 4004; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4005; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 4006; GFX9-NEXT: v_mov_b32_e32 v1, 0 4007; GFX9-NEXT: s_mov_b64 exec, s[2:3] 4008; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 4009; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 4010; GFX9-NEXT: v_mov_b32_e32 v2, v0 4011; GFX9-NEXT: s_not_b64 exec, exec 4012; GFX9-NEXT: v_mov_b32_e32 v2, 0 4013; GFX9-NEXT: s_not_b64 exec, exec 4014; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 4015; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 4016; GFX9-NEXT: s_nop 1 4017; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 4018; GFX9-NEXT: s_nop 1 4019; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 4020; GFX9-NEXT: s_nop 1 4021; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 4022; GFX9-NEXT: s_nop 1 4023; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 4024; GFX9-NEXT: s_nop 1 4025; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 4026; GFX9-NEXT: v_readlane_b32 
s4, v2, 63 4027; GFX9-NEXT: s_nop 0 4028; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 4029; GFX9-NEXT: s_mov_b64 exec, s[2:3] 4030; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 4031; GFX9-NEXT: ; implicit-def: $vgpr0 4032; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 4033; GFX9-NEXT: s_cbranch_execz .LBB16_2 4034; GFX9-NEXT: ; %bb.1: 4035; GFX9-NEXT: v_mov_b32_e32 v0, 0 4036; GFX9-NEXT: v_mov_b32_e32 v3, s4 4037; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4038; GFX9-NEXT: ds_xor_rtn_b32 v0, v0, v3 4039; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4040; GFX9-NEXT: .LBB16_2: 4041; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 4042; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4043; GFX9-NEXT: v_readfirstlane_b32 s2, v0 4044; GFX9-NEXT: v_mov_b32_e32 v0, v1 4045; GFX9-NEXT: v_xor_b32_e32 v0, s2, v0 4046; GFX9-NEXT: s_mov_b32 s3, 0xf000 4047; GFX9-NEXT: s_mov_b32 s2, -1 4048; GFX9-NEXT: s_nop 0 4049; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 4050; GFX9-NEXT: s_endpgm 4051; 4052; GFX1064-LABEL: xor_i32_varying: 4053; GFX1064: ; %bb.0: ; %entry 4054; GFX1064-NEXT: v_mov_b32_e32 v1, v0 4055; GFX1064-NEXT: s_not_b64 exec, exec 4056; GFX1064-NEXT: v_mov_b32_e32 v1, 0 4057; GFX1064-NEXT: s_not_b64 exec, exec 4058; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 4059; GFX1064-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 4060; GFX1064-NEXT: v_mov_b32_e32 v3, 0 4061; GFX1064-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 4062; GFX1064-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 4063; GFX1064-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 4064; GFX1064-NEXT: v_mov_b32_e32 v2, v1 4065; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 4066; GFX1064-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 4067; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 4068; GFX1064-NEXT: v_mov_b32_e32 v2, s4 4069; GFX1064-NEXT: v_xor_b32_dpp v1, v2, v1 
quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 4070; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 4071; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 4072; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 4073; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4074; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 4075; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 4076; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 4077; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 4078; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4079; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 4080; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 4081; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 4082; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 4083; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 4084; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 4085; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 4086; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 4087; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 4088; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4089; GFX1064-NEXT: s_mov_b32 s2, -1 4090; GFX1064-NEXT: ; implicit-def: $vgpr0 4091; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 4092; GFX1064-NEXT: s_cbranch_execz .LBB16_2 4093; GFX1064-NEXT: ; %bb.1: 4094; GFX1064-NEXT: v_mov_b32_e32 v0, 0 4095; GFX1064-NEXT: v_mov_b32_e32 v4, s7 4096; GFX1064-NEXT: s_mov_b32 s3, s7 4097; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4098; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 4099; GFX1064-NEXT: ds_xor_rtn_b32 v0, v0, v4 4100; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 4101; GFX1064-NEXT: buffer_gl0_inv 4102; GFX1064-NEXT: .LBB16_2: 4103; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 4104; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 4105; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 4106; GFX1064-NEXT: v_mov_b32_e32 v0, v3 4107; GFX1064-NEXT: v_xor_b32_e32 v0, s3, v0 4108; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 4109; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 4110; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 4111; GFX1064-NEXT: s_endpgm 4112; 4113; GFX1032-LABEL: xor_i32_varying: 
4114; GFX1032: ; %bb.0: ; %entry 4115; GFX1032-NEXT: v_mov_b32_e32 v1, v0 4116; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 4117; GFX1032-NEXT: v_mov_b32_e32 v1, 0 4118; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 4119; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 4120; GFX1032-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 4121; GFX1032-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 4122; GFX1032-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 4123; GFX1032-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 4124; GFX1032-NEXT: v_mov_b32_e32 v2, v1 4125; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 4126; GFX1032-NEXT: s_mov_b32 exec_lo, s2 4127; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4128; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 4129; GFX1032-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 4130; GFX1032-NEXT: v_mov_b32_e32 v3, 0 4131; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 4132; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 4133; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 4134; GFX1032-NEXT: s_mov_b32 exec_lo, s2 4135; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4136; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 4137; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 4138; GFX1032-NEXT: s_mov_b32 exec_lo, s2 4139; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 4140; GFX1032-NEXT: s_mov_b32 s2, -1 4141; GFX1032-NEXT: ; implicit-def: $vgpr0 4142; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 4143; GFX1032-NEXT: s_cbranch_execz .LBB16_2 4144; GFX1032-NEXT: ; %bb.1: 4145; GFX1032-NEXT: v_mov_b32_e32 v0, 0 4146; GFX1032-NEXT: v_mov_b32_e32 v4, s4 4147; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4148; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 4149; GFX1032-NEXT: ds_xor_rtn_b32 v0, v0, v4 4150; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 4151; GFX1032-NEXT: buffer_gl0_inv 4152; GFX1032-NEXT: .LBB16_2: 4153; 
GFX1032-NEXT: s_waitcnt_depctr 0xffe3 4154; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 4155; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 4156; GFX1032-NEXT: v_mov_b32_e32 v0, v3 4157; GFX1032-NEXT: v_xor_b32_e32 v0, s3, v0 4158; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 4159; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 4160; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 4161; GFX1032-NEXT: s_endpgm 4162; 4163; GFX1164-LABEL: xor_i32_varying: 4164; GFX1164: ; %bb.0: ; %entry 4165; GFX1164-NEXT: v_mov_b32_e32 v1, v0 4166; GFX1164-NEXT: s_not_b64 exec, exec 4167; GFX1164-NEXT: v_mov_b32_e32 v1, 0 4168; GFX1164-NEXT: s_not_b64 exec, exec 4169; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 4170; GFX1164-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 4171; GFX1164-NEXT: v_mov_b32_e32 v3, 0 4172; GFX1164-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 4173; GFX1164-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 4174; GFX1164-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 4175; GFX1164-NEXT: v_mov_b32_e32 v2, v1 4176; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1 4177; GFX1164-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 4178; GFX1164-NEXT: v_readlane_b32 s4, v1, 31 4179; GFX1164-NEXT: v_mov_b32_e32 v2, s4 4180; GFX1164-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 4181; GFX1164-NEXT: v_readlane_b32 s4, v1, 15 4182; GFX1164-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 4183; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 4184; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 4185; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 4186; GFX1164-NEXT: v_readlane_b32 s5, v1, 31 4187; GFX1164-NEXT: v_writelane_b32 v3, s4, 16 4188; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 4189; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4190; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 4191; GFX1164-NEXT: 
v_readlane_b32 s7, v1, 63 4192; GFX1164-NEXT: v_readlane_b32 s6, v1, 47 4193; GFX1164-NEXT: v_writelane_b32 v3, s5, 32 4194; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 4195; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 4196; GFX1164-NEXT: s_or_saveexec_b64 s[4:5], -1 4197; GFX1164-NEXT: v_writelane_b32 v3, s6, 48 4198; GFX1164-NEXT: s_mov_b64 exec, s[4:5] 4199; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4200; GFX1164-NEXT: s_mov_b32 s2, -1 4201; GFX1164-NEXT: ; implicit-def: $vgpr0 4202; GFX1164-NEXT: s_and_saveexec_b64 s[4:5], vcc 4203; GFX1164-NEXT: s_cbranch_execz .LBB16_2 4204; GFX1164-NEXT: ; %bb.1: 4205; GFX1164-NEXT: v_mov_b32_e32 v0, 0 4206; GFX1164-NEXT: v_mov_b32_e32 v4, s7 4207; GFX1164-NEXT: s_mov_b32 s3, s7 4208; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4209; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 4210; GFX1164-NEXT: ds_xor_rtn_b32 v0, v0, v4 4211; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 4212; GFX1164-NEXT: buffer_gl0_inv 4213; GFX1164-NEXT: .LBB16_2: 4214; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] 4215; GFX1164-NEXT: v_readfirstlane_b32 s3, v0 4216; GFX1164-NEXT: v_mov_b32_e32 v0, v3 4217; GFX1164-NEXT: v_xor_b32_e32 v0, s3, v0 4218; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 4219; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 4220; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 4221; GFX1164-NEXT: s_endpgm 4222; 4223; GFX1132-LABEL: xor_i32_varying: 4224; GFX1132: ; %bb.0: ; %entry 4225; GFX1132-NEXT: v_mov_b32_e32 v1, v0 4226; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo 4227; GFX1132-NEXT: v_mov_b32_e32 v1, 0 4228; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo 4229; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 4230; GFX1132-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 4231; GFX1132-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 4232; GFX1132-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 4233; GFX1132-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 
bound_ctrl:1 4234; GFX1132-NEXT: v_mov_b32_e32 v2, v1 4235; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1 4236; GFX1132-NEXT: s_mov_b32 exec_lo, s2 4237; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 4238; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 4239; GFX1132-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 4240; GFX1132-NEXT: v_mov_b32_e32 v3, 0 4241; GFX1132-NEXT: v_readlane_b32 s3, v1, 15 4242; GFX1132-NEXT: v_readlane_b32 s4, v1, 31 4243; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 4244; GFX1132-NEXT: s_mov_b32 exec_lo, s2 4245; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4246; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 4247; GFX1132-NEXT: v_writelane_b32 v3, s3, 16 4248; GFX1132-NEXT: s_mov_b32 exec_lo, s2 4249; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 4250; GFX1132-NEXT: s_mov_b32 s2, -1 4251; GFX1132-NEXT: ; implicit-def: $vgpr0 4252; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo 4253; GFX1132-NEXT: s_cbranch_execz .LBB16_2 4254; GFX1132-NEXT: ; %bb.1: 4255; GFX1132-NEXT: v_mov_b32_e32 v0, 0 4256; GFX1132-NEXT: v_mov_b32_e32 v4, s4 4257; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4258; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 4259; GFX1132-NEXT: ds_xor_rtn_b32 v0, v0, v4 4260; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 4261; GFX1132-NEXT: buffer_gl0_inv 4262; GFX1132-NEXT: .LBB16_2: 4263; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 4264; GFX1132-NEXT: v_readfirstlane_b32 s3, v0 4265; GFX1132-NEXT: v_mov_b32_e32 v0, v3 4266; GFX1132-NEXT: v_xor_b32_e32 v0, s3, v0 4267; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 4268; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 4269; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 4270; GFX1132-NEXT: s_endpgm 4271entry: 4272 %lane = call i32 @llvm.amdgcn.workitem.id.x() 4273 %old = atomicrmw xor i32 addrspace(3)* @local_var32, i32 %lane acq_rel 4274 store i32 %old, i32 addrspace(1)* %out 4275 ret void 4276} 4277 4278define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) { 
4279; 4280; 4281; GFX7LESS-LABEL: max_i32_varying: 4282; GFX7LESS: ; %bb.0: ; %entry 4283; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 4284; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 4285; GFX7LESS-NEXT: s_mov_b32 m0, -1 4286; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 4287; GFX7LESS-NEXT: ds_max_rtn_i32 v0, v1, v0 4288; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 4289; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 4290; GFX7LESS-NEXT: s_mov_b32 s2, -1 4291; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 4292; GFX7LESS-NEXT: s_endpgm 4293; 4294; GFX8-LABEL: max_i32_varying: 4295; GFX8: ; %bb.0: ; %entry 4296; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4297; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 4298; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 4299; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 4300; GFX8-NEXT: v_bfrev_b32_e32 v1, 1 4301; GFX8-NEXT: s_mov_b64 exec, s[2:3] 4302; GFX8-NEXT: v_mov_b32_e32 v2, v0 4303; GFX8-NEXT: s_not_b64 exec, exec 4304; GFX8-NEXT: v_bfrev_b32_e32 v2, 1 4305; GFX8-NEXT: s_not_b64 exec, exec 4306; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 4307; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 4308; GFX8-NEXT: s_nop 1 4309; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 4310; GFX8-NEXT: s_nop 1 4311; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 4312; GFX8-NEXT: s_nop 1 4313; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 4314; GFX8-NEXT: s_nop 1 4315; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 4316; GFX8-NEXT: s_nop 1 4317; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 4318; GFX8-NEXT: v_readlane_b32 s4, v2, 63 4319; GFX8-NEXT: s_nop 0 4320; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 4321; GFX8-NEXT: s_mov_b64 exec, s[2:3] 4322; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 4323; GFX8-NEXT: ; implicit-def: $vgpr0 4324; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 4325; 
GFX8-NEXT: s_cbranch_execz .LBB17_2 4326; GFX8-NEXT: ; %bb.1: 4327; GFX8-NEXT: v_mov_b32_e32 v0, 0 4328; GFX8-NEXT: v_mov_b32_e32 v3, s4 4329; GFX8-NEXT: s_mov_b32 m0, -1 4330; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4331; GFX8-NEXT: ds_max_rtn_i32 v0, v0, v3 4332; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4333; GFX8-NEXT: .LBB17_2: 4334; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 4335; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4336; GFX8-NEXT: v_readfirstlane_b32 s2, v0 4337; GFX8-NEXT: v_mov_b32_e32 v0, v1 4338; GFX8-NEXT: v_max_i32_e32 v0, s2, v0 4339; GFX8-NEXT: s_mov_b32 s3, 0xf000 4340; GFX8-NEXT: s_mov_b32 s2, -1 4341; GFX8-NEXT: s_nop 0 4342; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 4343; GFX8-NEXT: s_endpgm 4344; 4345; GFX9-LABEL: max_i32_varying: 4346; GFX9: ; %bb.0: ; %entry 4347; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4348; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 4349; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 4350; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 4351; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 4352; GFX9-NEXT: s_mov_b64 exec, s[2:3] 4353; GFX9-NEXT: v_mov_b32_e32 v2, v0 4354; GFX9-NEXT: s_not_b64 exec, exec 4355; GFX9-NEXT: v_bfrev_b32_e32 v2, 1 4356; GFX9-NEXT: s_not_b64 exec, exec 4357; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 4358; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 4359; GFX9-NEXT: s_nop 1 4360; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 4361; GFX9-NEXT: s_nop 1 4362; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 4363; GFX9-NEXT: s_nop 1 4364; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 4365; GFX9-NEXT: s_nop 1 4366; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 4367; GFX9-NEXT: s_nop 1 4368; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 4369; GFX9-NEXT: v_readlane_b32 s4, v2, 63 4370; GFX9-NEXT: s_nop 0 4371; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf 
bank_mask:0xf 4372; GFX9-NEXT: s_mov_b64 exec, s[2:3] 4373; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 4374; GFX9-NEXT: ; implicit-def: $vgpr0 4375; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 4376; GFX9-NEXT: s_cbranch_execz .LBB17_2 4377; GFX9-NEXT: ; %bb.1: 4378; GFX9-NEXT: v_mov_b32_e32 v0, 0 4379; GFX9-NEXT: v_mov_b32_e32 v3, s4 4380; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4381; GFX9-NEXT: ds_max_rtn_i32 v0, v0, v3 4382; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4383; GFX9-NEXT: .LBB17_2: 4384; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 4385; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4386; GFX9-NEXT: v_readfirstlane_b32 s2, v0 4387; GFX9-NEXT: v_mov_b32_e32 v0, v1 4388; GFX9-NEXT: v_max_i32_e32 v0, s2, v0 4389; GFX9-NEXT: s_mov_b32 s3, 0xf000 4390; GFX9-NEXT: s_mov_b32 s2, -1 4391; GFX9-NEXT: s_nop 0 4392; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 4393; GFX9-NEXT: s_endpgm 4394; 4395; GFX1064-LABEL: max_i32_varying: 4396; GFX1064: ; %bb.0: ; %entry 4397; GFX1064-NEXT: v_mov_b32_e32 v1, v0 4398; GFX1064-NEXT: s_not_b64 exec, exec 4399; GFX1064-NEXT: v_bfrev_b32_e32 v1, 1 4400; GFX1064-NEXT: s_not_b64 exec, exec 4401; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 4402; GFX1064-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 4403; GFX1064-NEXT: v_bfrev_b32_e32 v3, 1 4404; GFX1064-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 4405; GFX1064-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 4406; GFX1064-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 4407; GFX1064-NEXT: v_mov_b32_e32 v2, v1 4408; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 4409; GFX1064-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 4410; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 4411; GFX1064-NEXT: v_mov_b32_e32 v2, s4 4412; GFX1064-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 4413; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 4414; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf 
bank_mask:0xf 4415; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 4416; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4417; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 4418; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 4419; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 4420; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 4421; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4422; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 4423; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 4424; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 4425; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 4426; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 4427; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 4428; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 4429; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 4430; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 4431; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4432; GFX1064-NEXT: s_mov_b32 s2, -1 4433; GFX1064-NEXT: ; implicit-def: $vgpr0 4434; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 4435; GFX1064-NEXT: s_cbranch_execz .LBB17_2 4436; GFX1064-NEXT: ; %bb.1: 4437; GFX1064-NEXT: v_mov_b32_e32 v0, 0 4438; GFX1064-NEXT: v_mov_b32_e32 v4, s7 4439; GFX1064-NEXT: s_mov_b32 s3, s7 4440; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4441; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 4442; GFX1064-NEXT: ds_max_rtn_i32 v0, v0, v4 4443; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 4444; GFX1064-NEXT: buffer_gl0_inv 4445; GFX1064-NEXT: .LBB17_2: 4446; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 4447; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 4448; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 4449; GFX1064-NEXT: v_mov_b32_e32 v0, v3 4450; GFX1064-NEXT: v_max_i32_e32 v0, s3, v0 4451; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 4452; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 4453; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 4454; GFX1064-NEXT: s_endpgm 4455; 4456; GFX1032-LABEL: max_i32_varying: 4457; GFX1032: ; %bb.0: ; %entry 4458; GFX1032-NEXT: v_mov_b32_e32 v1, v0 4459; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 4460; GFX1032-NEXT: v_bfrev_b32_e32 v1, 
1 4461; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 4462; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 4463; GFX1032-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 4464; GFX1032-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 4465; GFX1032-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 4466; GFX1032-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 4467; GFX1032-NEXT: v_mov_b32_e32 v2, v1 4468; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 4469; GFX1032-NEXT: s_mov_b32 exec_lo, s2 4470; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4471; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 4472; GFX1032-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 4473; GFX1032-NEXT: v_bfrev_b32_e32 v3, 1 4474; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 4475; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 4476; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 4477; GFX1032-NEXT: s_mov_b32 exec_lo, s2 4478; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4479; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 4480; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 4481; GFX1032-NEXT: s_mov_b32 exec_lo, s2 4482; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 4483; GFX1032-NEXT: s_mov_b32 s2, -1 4484; GFX1032-NEXT: ; implicit-def: $vgpr0 4485; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 4486; GFX1032-NEXT: s_cbranch_execz .LBB17_2 4487; GFX1032-NEXT: ; %bb.1: 4488; GFX1032-NEXT: v_mov_b32_e32 v0, 0 4489; GFX1032-NEXT: v_mov_b32_e32 v4, s4 4490; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4491; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 4492; GFX1032-NEXT: ds_max_rtn_i32 v0, v0, v4 4493; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 4494; GFX1032-NEXT: buffer_gl0_inv 4495; GFX1032-NEXT: .LBB17_2: 4496; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 4497; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 4498; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 4499; GFX1032-NEXT: v_mov_b32_e32 v0, v3 4500; GFX1032-NEXT: v_max_i32_e32 v0, 
s3, v0 4501; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 4502; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 4503; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 4504; GFX1032-NEXT: s_endpgm 4505; 4506; GFX1164-LABEL: max_i32_varying: 4507; GFX1164: ; %bb.0: ; %entry 4508; GFX1164-NEXT: v_mov_b32_e32 v1, v0 4509; GFX1164-NEXT: s_not_b64 exec, exec 4510; GFX1164-NEXT: v_bfrev_b32_e32 v1, 1 4511; GFX1164-NEXT: s_not_b64 exec, exec 4512; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 4513; GFX1164-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 4514; GFX1164-NEXT: v_bfrev_b32_e32 v3, 1 4515; GFX1164-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 4516; GFX1164-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 4517; GFX1164-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 4518; GFX1164-NEXT: v_mov_b32_e32 v2, v1 4519; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1 4520; GFX1164-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 4521; GFX1164-NEXT: v_readlane_b32 s4, v1, 31 4522; GFX1164-NEXT: v_mov_b32_e32 v2, s4 4523; GFX1164-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 4524; GFX1164-NEXT: v_readlane_b32 s4, v1, 15 4525; GFX1164-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 4526; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 4527; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 4528; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 4529; GFX1164-NEXT: v_readlane_b32 s5, v1, 31 4530; GFX1164-NEXT: v_writelane_b32 v3, s4, 16 4531; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 4532; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4533; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 4534; GFX1164-NEXT: v_readlane_b32 s7, v1, 63 4535; GFX1164-NEXT: v_readlane_b32 s6, v1, 47 4536; GFX1164-NEXT: v_writelane_b32 v3, s5, 32 4537; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 4538; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 4539; GFX1164-NEXT: s_or_saveexec_b64 s[4:5], -1 
4540; GFX1164-NEXT: v_writelane_b32 v3, s6, 48 4541; GFX1164-NEXT: s_mov_b64 exec, s[4:5] 4542; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4543; GFX1164-NEXT: s_mov_b32 s2, -1 4544; GFX1164-NEXT: ; implicit-def: $vgpr0 4545; GFX1164-NEXT: s_and_saveexec_b64 s[4:5], vcc 4546; GFX1164-NEXT: s_cbranch_execz .LBB17_2 4547; GFX1164-NEXT: ; %bb.1: 4548; GFX1164-NEXT: v_mov_b32_e32 v0, 0 4549; GFX1164-NEXT: v_mov_b32_e32 v4, s7 4550; GFX1164-NEXT: s_mov_b32 s3, s7 4551; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4552; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 4553; GFX1164-NEXT: ds_max_rtn_i32 v0, v0, v4 4554; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 4555; GFX1164-NEXT: buffer_gl0_inv 4556; GFX1164-NEXT: .LBB17_2: 4557; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] 4558; GFX1164-NEXT: v_readfirstlane_b32 s3, v0 4559; GFX1164-NEXT: v_mov_b32_e32 v0, v3 4560; GFX1164-NEXT: v_max_i32_e32 v0, s3, v0 4561; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 4562; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 4563; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 4564; GFX1164-NEXT: s_endpgm 4565; 4566; GFX1132-LABEL: max_i32_varying: 4567; GFX1132: ; %bb.0: ; %entry 4568; GFX1132-NEXT: v_mov_b32_e32 v1, v0 4569; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo 4570; GFX1132-NEXT: v_bfrev_b32_e32 v1, 1 4571; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo 4572; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 4573; GFX1132-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 4574; GFX1132-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 4575; GFX1132-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 4576; GFX1132-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 4577; GFX1132-NEXT: v_mov_b32_e32 v2, v1 4578; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1 4579; GFX1132-NEXT: s_mov_b32 exec_lo, s2 4580; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 4581; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 4582; GFX1132-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa 
bank_mask:0xf 4583; GFX1132-NEXT: v_bfrev_b32_e32 v3, 1 4584; GFX1132-NEXT: v_readlane_b32 s3, v1, 15 4585; GFX1132-NEXT: v_readlane_b32 s4, v1, 31 4586; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 4587; GFX1132-NEXT: s_mov_b32 exec_lo, s2 4588; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4589; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 4590; GFX1132-NEXT: v_writelane_b32 v3, s3, 16 4591; GFX1132-NEXT: s_mov_b32 exec_lo, s2 4592; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 4593; GFX1132-NEXT: s_mov_b32 s2, -1 4594; GFX1132-NEXT: ; implicit-def: $vgpr0 4595; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo 4596; GFX1132-NEXT: s_cbranch_execz .LBB17_2 4597; GFX1132-NEXT: ; %bb.1: 4598; GFX1132-NEXT: v_mov_b32_e32 v0, 0 4599; GFX1132-NEXT: v_mov_b32_e32 v4, s4 4600; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4601; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 4602; GFX1132-NEXT: ds_max_rtn_i32 v0, v0, v4 4603; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 4604; GFX1132-NEXT: buffer_gl0_inv 4605; GFX1132-NEXT: .LBB17_2: 4606; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 4607; GFX1132-NEXT: v_readfirstlane_b32 s3, v0 4608; GFX1132-NEXT: v_mov_b32_e32 v0, v3 4609; GFX1132-NEXT: v_max_i32_e32 v0, s3, v0 4610; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 4611; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 4612; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 4613; GFX1132-NEXT: s_endpgm 4614entry: 4615 %lane = call i32 @llvm.amdgcn.workitem.id.x() 4616 %old = atomicrmw max i32 addrspace(3)* @local_var32, i32 %lane acq_rel 4617 store i32 %old, i32 addrspace(1)* %out 4618 ret void 4619} 4620 4621define amdgpu_kernel void @max_i64_constant(i64 addrspace(1)* %out) { 4622; 4623; 4624; GFX7LESS-LABEL: max_i64_constant: 4625; GFX7LESS: ; %bb.0: ; %entry 4626; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 4627; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 4628; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 4629; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4630; 
GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 4631; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc 4632; GFX7LESS-NEXT: s_cbranch_execz .LBB18_2 4633; GFX7LESS-NEXT: ; %bb.1: 4634; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0 4635; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5 4636; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 4637; GFX7LESS-NEXT: s_mov_b32 m0, -1 4638; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 4639; GFX7LESS-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] 4640; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 4641; GFX7LESS-NEXT: .LBB18_2: 4642; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] 4643; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 4644; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 4645; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 4646; GFX7LESS-NEXT: v_bfrev_b32_e32 v1, 1 4647; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 4648; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 4649; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc 4650; GFX7LESS-NEXT: v_mov_b32_e32 v2, s5 4651; GFX7LESS-NEXT: v_mov_b32_e32 v3, s4 4652; GFX7LESS-NEXT: v_cmp_gt_i64_e32 vcc, s[4:5], v[0:1] 4653; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 4654; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 4655; GFX7LESS-NEXT: s_mov_b32 s2, -1 4656; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4657; GFX7LESS-NEXT: s_endpgm 4658; 4659; GFX8-LABEL: max_i64_constant: 4660; GFX8: ; %bb.0: ; %entry 4661; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4662; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4663; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 4664; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4665; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 4666; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 4667; GFX8-NEXT: s_cbranch_execz .LBB18_2 4668; GFX8-NEXT: ; %bb.1: 4669; GFX8-NEXT: v_mov_b32_e32 v0, 5 4670; GFX8-NEXT: v_mov_b32_e32 v2, 0 4671; GFX8-NEXT: v_mov_b32_e32 v1, 0 4672; GFX8-NEXT: s_mov_b32 m0, -1 4673; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4674; GFX8-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] 4675; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4676; GFX8-NEXT: 
.LBB18_2: 4677; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 4678; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4679; GFX8-NEXT: v_readfirstlane_b32 s2, v0 4680; GFX8-NEXT: v_bfrev_b32_e32 v0, 1 4681; GFX8-NEXT: v_readfirstlane_b32 s3, v1 4682; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc 4683; GFX8-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 4684; GFX8-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1] 4685; GFX8-NEXT: v_mov_b32_e32 v2, s3 4686; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 4687; GFX8-NEXT: v_mov_b32_e32 v2, s2 4688; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 4689; GFX8-NEXT: s_mov_b32 s3, 0xf000 4690; GFX8-NEXT: s_mov_b32 s2, -1 4691; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4692; GFX8-NEXT: s_endpgm 4693; 4694; GFX9-LABEL: max_i64_constant: 4695; GFX9: ; %bb.0: ; %entry 4696; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4697; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4698; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 4699; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4700; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 4701; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 4702; GFX9-NEXT: s_cbranch_execz .LBB18_2 4703; GFX9-NEXT: ; %bb.1: 4704; GFX9-NEXT: v_mov_b32_e32 v0, 5 4705; GFX9-NEXT: v_mov_b32_e32 v1, 0 4706; GFX9-NEXT: v_mov_b32_e32 v2, 0 4707; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4708; GFX9-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] 4709; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4710; GFX9-NEXT: .LBB18_2: 4711; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 4712; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4713; GFX9-NEXT: v_readfirstlane_b32 s2, v0 4714; GFX9-NEXT: v_bfrev_b32_e32 v0, 1 4715; GFX9-NEXT: v_readfirstlane_b32 s3, v1 4716; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc 4717; GFX9-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 4718; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1] 4719; GFX9-NEXT: v_mov_b32_e32 v2, s3 4720; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 4721; GFX9-NEXT: v_mov_b32_e32 v2, s2 4722; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 4723; GFX9-NEXT: s_mov_b32 s3, 0xf000 4724; 
GFX9-NEXT: s_mov_b32 s2, -1 4725; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4726; GFX9-NEXT: s_endpgm 4727; 4728; GFX1064-LABEL: max_i64_constant: 4729; GFX1064: ; %bb.0: ; %entry 4730; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4731; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4732; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 4733; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4734; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 4735; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc 4736; GFX1064-NEXT: s_cbranch_execz .LBB18_2 4737; GFX1064-NEXT: ; %bb.1: 4738; GFX1064-NEXT: v_mov_b32_e32 v0, 5 4739; GFX1064-NEXT: v_mov_b32_e32 v1, 0 4740; GFX1064-NEXT: v_mov_b32_e32 v2, 0 4741; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4742; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 4743; GFX1064-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] 4744; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 4745; GFX1064-NEXT: buffer_gl0_inv 4746; GFX1064-NEXT: .LBB18_2: 4747; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 4748; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] 4749; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 4750; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 4751; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc 4752; GFX1064-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 4753; GFX1064-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1] 4754; GFX1064-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc 4755; GFX1064-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc 4756; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 4757; GFX1064-NEXT: s_mov_b32 s2, -1 4758; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 4759; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4760; GFX1064-NEXT: s_endpgm 4761; 4762; GFX1032-LABEL: max_i64_constant: 4763; GFX1032: ; %bb.0: ; %entry 4764; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4765; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4766; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 4767; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 4768; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo 4769; GFX1032-NEXT: 
s_cbranch_execz .LBB18_2 4770; GFX1032-NEXT: ; %bb.1: 4771; GFX1032-NEXT: v_mov_b32_e32 v0, 5 4772; GFX1032-NEXT: v_mov_b32_e32 v1, 0 4773; GFX1032-NEXT: v_mov_b32_e32 v2, 0 4774; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4775; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 4776; GFX1032-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] 4777; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 4778; GFX1032-NEXT: buffer_gl0_inv 4779; GFX1032-NEXT: .LBB18_2: 4780; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 4781; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 4782; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 4783; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 4784; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc_lo 4785; GFX1032-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc_lo 4786; GFX1032-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[2:3], v[0:1] 4787; GFX1032-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo 4788; GFX1032-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo 4789; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 4790; GFX1032-NEXT: s_mov_b32 s2, -1 4791; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 4792; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4793; GFX1032-NEXT: s_endpgm 4794; 4795; GFX1164-LABEL: max_i64_constant: 4796; GFX1164: ; %bb.0: ; %entry 4797; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 4798; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4799; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 4800; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4801; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 4802; GFX1164-NEXT: s_and_saveexec_b64 s[2:3], vcc 4803; GFX1164-NEXT: s_cbranch_execz .LBB18_2 4804; GFX1164-NEXT: ; %bb.1: 4805; GFX1164-NEXT: v_mov_b32_e32 v0, 5 4806; GFX1164-NEXT: v_mov_b32_e32 v1, 0 4807; GFX1164-NEXT: v_mov_b32_e32 v2, 0 4808; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4809; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 4810; GFX1164-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] 4811; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 4812; GFX1164-NEXT: buffer_gl0_inv 4813; GFX1164-NEXT: .LBB18_2: 4814; GFX1164-NEXT: s_or_b64 exec, 
exec, s[2:3] 4815; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 4816; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 4817; GFX1164-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc 4818; GFX1164-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 4819; GFX1164-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1] 4820; GFX1164-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc 4821; GFX1164-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc 4822; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 4823; GFX1164-NEXT: s_mov_b32 s2, -1 4824; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 4825; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 4826; GFX1164-NEXT: s_endpgm 4827; 4828; GFX1132-LABEL: max_i64_constant: 4829; GFX1132: ; %bb.0: ; %entry 4830; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 4831; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4832; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 4833; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 4834; GFX1132-NEXT: s_and_saveexec_b32 s2, vcc_lo 4835; GFX1132-NEXT: s_cbranch_execz .LBB18_2 4836; GFX1132-NEXT: ; %bb.1: 4837; GFX1132-NEXT: v_mov_b32_e32 v0, 5 4838; GFX1132-NEXT: v_mov_b32_e32 v1, 0 4839; GFX1132-NEXT: v_mov_b32_e32 v2, 0 4840; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4841; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 4842; GFX1132-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] 4843; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 4844; GFX1132-NEXT: buffer_gl0_inv 4845; GFX1132-NEXT: .LBB18_2: 4846; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2 4847; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 4848; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 4849; GFX1132-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc_lo 4850; GFX1132-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc_lo 4851; GFX1132-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[2:3], v[0:1] 4852; GFX1132-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo 4853; GFX1132-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo 4854; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 4855; GFX1132-NEXT: s_mov_b32 s2, -1 4856; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 4857; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 
0 4858; GFX1132-NEXT: s_endpgm 4859entry: 4860 %old = atomicrmw max i64 addrspace(3)* @local_var64, i64 5 acq_rel 4861 store i64 %old, i64 addrspace(1)* %out 4862 ret void 4863} 4864 4865define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) { 4866; 4867; 4868; GFX7LESS-LABEL: min_i32_varying: 4869; GFX7LESS: ; %bb.0: ; %entry 4870; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 4871; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 4872; GFX7LESS-NEXT: s_mov_b32 m0, -1 4873; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 4874; GFX7LESS-NEXT: ds_min_rtn_i32 v0, v1, v0 4875; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 4876; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 4877; GFX7LESS-NEXT: s_mov_b32 s2, -1 4878; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 4879; GFX7LESS-NEXT: s_endpgm 4880; 4881; GFX8-LABEL: min_i32_varying: 4882; GFX8: ; %bb.0: ; %entry 4883; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4884; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 4885; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 4886; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 4887; GFX8-NEXT: v_bfrev_b32_e32 v1, -2 4888; GFX8-NEXT: s_mov_b64 exec, s[2:3] 4889; GFX8-NEXT: v_mov_b32_e32 v2, v0 4890; GFX8-NEXT: s_not_b64 exec, exec 4891; GFX8-NEXT: v_bfrev_b32_e32 v2, -2 4892; GFX8-NEXT: s_not_b64 exec, exec 4893; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 4894; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 4895; GFX8-NEXT: s_nop 1 4896; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 4897; GFX8-NEXT: s_nop 1 4898; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 4899; GFX8-NEXT: s_nop 1 4900; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 4901; GFX8-NEXT: s_nop 1 4902; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 4903; GFX8-NEXT: s_nop 1 4904; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 4905; GFX8-NEXT: v_readlane_b32 s4, v2, 63 4906; GFX8-NEXT: s_nop 
0 4907; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 4908; GFX8-NEXT: s_mov_b64 exec, s[2:3] 4909; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 4910; GFX8-NEXT: ; implicit-def: $vgpr0 4911; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 4912; GFX8-NEXT: s_cbranch_execz .LBB19_2 4913; GFX8-NEXT: ; %bb.1: 4914; GFX8-NEXT: v_mov_b32_e32 v0, 0 4915; GFX8-NEXT: v_mov_b32_e32 v3, s4 4916; GFX8-NEXT: s_mov_b32 m0, -1 4917; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4918; GFX8-NEXT: ds_min_rtn_i32 v0, v0, v3 4919; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4920; GFX8-NEXT: .LBB19_2: 4921; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 4922; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4923; GFX8-NEXT: v_readfirstlane_b32 s2, v0 4924; GFX8-NEXT: v_mov_b32_e32 v0, v1 4925; GFX8-NEXT: v_min_i32_e32 v0, s2, v0 4926; GFX8-NEXT: s_mov_b32 s3, 0xf000 4927; GFX8-NEXT: s_mov_b32 s2, -1 4928; GFX8-NEXT: s_nop 0 4929; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 4930; GFX8-NEXT: s_endpgm 4931; 4932; GFX9-LABEL: min_i32_varying: 4933; GFX9: ; %bb.0: ; %entry 4934; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4935; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 4936; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 4937; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 4938; GFX9-NEXT: v_bfrev_b32_e32 v1, -2 4939; GFX9-NEXT: s_mov_b64 exec, s[2:3] 4940; GFX9-NEXT: v_mov_b32_e32 v2, v0 4941; GFX9-NEXT: s_not_b64 exec, exec 4942; GFX9-NEXT: v_bfrev_b32_e32 v2, -2 4943; GFX9-NEXT: s_not_b64 exec, exec 4944; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 4945; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 4946; GFX9-NEXT: s_nop 1 4947; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 4948; GFX9-NEXT: s_nop 1 4949; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 4950; GFX9-NEXT: s_nop 1 4951; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 4952; GFX9-NEXT: s_nop 1 4953; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa 
bank_mask:0xf 4954; GFX9-NEXT: s_nop 1 4955; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 4956; GFX9-NEXT: v_readlane_b32 s4, v2, 63 4957; GFX9-NEXT: s_nop 0 4958; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 4959; GFX9-NEXT: s_mov_b64 exec, s[2:3] 4960; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 4961; GFX9-NEXT: ; implicit-def: $vgpr0 4962; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 4963; GFX9-NEXT: s_cbranch_execz .LBB19_2 4964; GFX9-NEXT: ; %bb.1: 4965; GFX9-NEXT: v_mov_b32_e32 v0, 0 4966; GFX9-NEXT: v_mov_b32_e32 v3, s4 4967; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4968; GFX9-NEXT: ds_min_rtn_i32 v0, v0, v3 4969; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4970; GFX9-NEXT: .LBB19_2: 4971; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 4972; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4973; GFX9-NEXT: v_readfirstlane_b32 s2, v0 4974; GFX9-NEXT: v_mov_b32_e32 v0, v1 4975; GFX9-NEXT: v_min_i32_e32 v0, s2, v0 4976; GFX9-NEXT: s_mov_b32 s3, 0xf000 4977; GFX9-NEXT: s_mov_b32 s2, -1 4978; GFX9-NEXT: s_nop 0 4979; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 4980; GFX9-NEXT: s_endpgm 4981; 4982; GFX1064-LABEL: min_i32_varying: 4983; GFX1064: ; %bb.0: ; %entry 4984; GFX1064-NEXT: v_mov_b32_e32 v1, v0 4985; GFX1064-NEXT: s_not_b64 exec, exec 4986; GFX1064-NEXT: v_bfrev_b32_e32 v1, -2 4987; GFX1064-NEXT: s_not_b64 exec, exec 4988; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 4989; GFX1064-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 4990; GFX1064-NEXT: v_bfrev_b32_e32 v3, -2 4991; GFX1064-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 4992; GFX1064-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 4993; GFX1064-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 4994; GFX1064-NEXT: v_mov_b32_e32 v2, v1 4995; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 4996; GFX1064-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 4997; GFX1064-NEXT: v_readlane_b32 s4, 
v1, 31 4998; GFX1064-NEXT: v_mov_b32_e32 v2, s4 4999; GFX1064-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 5000; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 5001; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 5002; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 5003; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 5004; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 5005; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 5006; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 5007; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 5008; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 5009; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 5010; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 5011; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 5012; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 5013; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 5014; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 5015; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 5016; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 5017; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 5018; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 5019; GFX1064-NEXT: s_mov_b32 s2, -1 5020; GFX1064-NEXT: ; implicit-def: $vgpr0 5021; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 5022; GFX1064-NEXT: s_cbranch_execz .LBB19_2 5023; GFX1064-NEXT: ; %bb.1: 5024; GFX1064-NEXT: v_mov_b32_e32 v0, 0 5025; GFX1064-NEXT: v_mov_b32_e32 v4, s7 5026; GFX1064-NEXT: s_mov_b32 s3, s7 5027; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5028; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 5029; GFX1064-NEXT: ds_min_rtn_i32 v0, v0, v4 5030; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 5031; GFX1064-NEXT: buffer_gl0_inv 5032; GFX1064-NEXT: .LBB19_2: 5033; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 5034; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 5035; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 5036; GFX1064-NEXT: v_mov_b32_e32 v0, v3 5037; GFX1064-NEXT: v_min_i32_e32 v0, s3, v0 5038; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 5039; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 5040; GFX1064-NEXT: buffer_store_dword v0, 
off, s[0:3], 0 5041; GFX1064-NEXT: s_endpgm 5042; 5043; GFX1032-LABEL: min_i32_varying: 5044; GFX1032: ; %bb.0: ; %entry 5045; GFX1032-NEXT: v_mov_b32_e32 v1, v0 5046; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 5047; GFX1032-NEXT: v_bfrev_b32_e32 v1, -2 5048; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 5049; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 5050; GFX1032-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 5051; GFX1032-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 5052; GFX1032-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 5053; GFX1032-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 5054; GFX1032-NEXT: v_mov_b32_e32 v2, v1 5055; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 5056; GFX1032-NEXT: s_mov_b32 exec_lo, s2 5057; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 5058; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 5059; GFX1032-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 5060; GFX1032-NEXT: v_bfrev_b32_e32 v3, -2 5061; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 5062; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 5063; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 5064; GFX1032-NEXT: s_mov_b32 exec_lo, s2 5065; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 5066; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 5067; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 5068; GFX1032-NEXT: s_mov_b32 exec_lo, s2 5069; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 5070; GFX1032-NEXT: s_mov_b32 s2, -1 5071; GFX1032-NEXT: ; implicit-def: $vgpr0 5072; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 5073; GFX1032-NEXT: s_cbranch_execz .LBB19_2 5074; GFX1032-NEXT: ; %bb.1: 5075; GFX1032-NEXT: v_mov_b32_e32 v0, 0 5076; GFX1032-NEXT: v_mov_b32_e32 v4, s4 5077; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5078; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 5079; GFX1032-NEXT: ds_min_rtn_i32 v0, v0, v4 5080; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 5081; GFX1032-NEXT: buffer_gl0_inv 
5082; GFX1032-NEXT: .LBB19_2: 5083; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 5084; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 5085; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 5086; GFX1032-NEXT: v_mov_b32_e32 v0, v3 5087; GFX1032-NEXT: v_min_i32_e32 v0, s3, v0 5088; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 5089; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 5090; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 5091; GFX1032-NEXT: s_endpgm 5092; 5093; GFX1164-LABEL: min_i32_varying: 5094; GFX1164: ; %bb.0: ; %entry 5095; GFX1164-NEXT: v_mov_b32_e32 v1, v0 5096; GFX1164-NEXT: s_not_b64 exec, exec 5097; GFX1164-NEXT: v_bfrev_b32_e32 v1, -2 5098; GFX1164-NEXT: s_not_b64 exec, exec 5099; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 5100; GFX1164-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 5101; GFX1164-NEXT: v_bfrev_b32_e32 v3, -2 5102; GFX1164-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 5103; GFX1164-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 5104; GFX1164-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 5105; GFX1164-NEXT: v_mov_b32_e32 v2, v1 5106; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1 5107; GFX1164-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 5108; GFX1164-NEXT: v_readlane_b32 s4, v1, 31 5109; GFX1164-NEXT: v_mov_b32_e32 v2, s4 5110; GFX1164-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 5111; GFX1164-NEXT: v_readlane_b32 s4, v1, 15 5112; GFX1164-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 5113; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 5114; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 5115; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 5116; GFX1164-NEXT: v_readlane_b32 s5, v1, 31 5117; GFX1164-NEXT: v_writelane_b32 v3, s4, 16 5118; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 5119; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 5120; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 5121; GFX1164-NEXT: v_readlane_b32 
s7, v1, 63 5122; GFX1164-NEXT: v_readlane_b32 s6, v1, 47 5123; GFX1164-NEXT: v_writelane_b32 v3, s5, 32 5124; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 5125; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 5126; GFX1164-NEXT: s_or_saveexec_b64 s[4:5], -1 5127; GFX1164-NEXT: v_writelane_b32 v3, s6, 48 5128; GFX1164-NEXT: s_mov_b64 exec, s[4:5] 5129; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 5130; GFX1164-NEXT: s_mov_b32 s2, -1 5131; GFX1164-NEXT: ; implicit-def: $vgpr0 5132; GFX1164-NEXT: s_and_saveexec_b64 s[4:5], vcc 5133; GFX1164-NEXT: s_cbranch_execz .LBB19_2 5134; GFX1164-NEXT: ; %bb.1: 5135; GFX1164-NEXT: v_mov_b32_e32 v0, 0 5136; GFX1164-NEXT: v_mov_b32_e32 v4, s7 5137; GFX1164-NEXT: s_mov_b32 s3, s7 5138; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5139; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 5140; GFX1164-NEXT: ds_min_rtn_i32 v0, v0, v4 5141; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 5142; GFX1164-NEXT: buffer_gl0_inv 5143; GFX1164-NEXT: .LBB19_2: 5144; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] 5145; GFX1164-NEXT: v_readfirstlane_b32 s3, v0 5146; GFX1164-NEXT: v_mov_b32_e32 v0, v3 5147; GFX1164-NEXT: v_min_i32_e32 v0, s3, v0 5148; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 5149; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 5150; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 5151; GFX1164-NEXT: s_endpgm 5152; 5153; GFX1132-LABEL: min_i32_varying: 5154; GFX1132: ; %bb.0: ; %entry 5155; GFX1132-NEXT: v_mov_b32_e32 v1, v0 5156; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo 5157; GFX1132-NEXT: v_bfrev_b32_e32 v1, -2 5158; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo 5159; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 5160; GFX1132-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 5161; GFX1132-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 5162; GFX1132-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 5163; GFX1132-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 5164; GFX1132-NEXT: v_mov_b32_e32 v2, v1 5165; 
GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1 5166; GFX1132-NEXT: s_mov_b32 exec_lo, s2 5167; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 5168; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 5169; GFX1132-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 5170; GFX1132-NEXT: v_bfrev_b32_e32 v3, -2 5171; GFX1132-NEXT: v_readlane_b32 s3, v1, 15 5172; GFX1132-NEXT: v_readlane_b32 s4, v1, 31 5173; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 5174; GFX1132-NEXT: s_mov_b32 exec_lo, s2 5175; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 5176; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 5177; GFX1132-NEXT: v_writelane_b32 v3, s3, 16 5178; GFX1132-NEXT: s_mov_b32 exec_lo, s2 5179; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 5180; GFX1132-NEXT: s_mov_b32 s2, -1 5181; GFX1132-NEXT: ; implicit-def: $vgpr0 5182; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo 5183; GFX1132-NEXT: s_cbranch_execz .LBB19_2 5184; GFX1132-NEXT: ; %bb.1: 5185; GFX1132-NEXT: v_mov_b32_e32 v0, 0 5186; GFX1132-NEXT: v_mov_b32_e32 v4, s4 5187; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5188; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 5189; GFX1132-NEXT: ds_min_rtn_i32 v0, v0, v4 5190; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 5191; GFX1132-NEXT: buffer_gl0_inv 5192; GFX1132-NEXT: .LBB19_2: 5193; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 5194; GFX1132-NEXT: v_readfirstlane_b32 s3, v0 5195; GFX1132-NEXT: v_mov_b32_e32 v0, v3 5196; GFX1132-NEXT: v_min_i32_e32 v0, s3, v0 5197; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 5198; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 5199; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 5200; GFX1132-NEXT: s_endpgm 5201entry: 5202 %lane = call i32 @llvm.amdgcn.workitem.id.x() 5203 %old = atomicrmw min i32 addrspace(3)* @local_var32, i32 %lane acq_rel 5204 store i32 %old, i32 addrspace(1)* %out 5205 ret void 5206} 5207 5208define amdgpu_kernel void @min_i64_constant(i64 addrspace(1)* %out) { 5209; 5210; 5211; GFX7LESS-LABEL: min_i64_constant: 5212; 
GFX7LESS: ; %bb.0: ; %entry 5213; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 5214; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 5215; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 5216; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 5217; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 5218; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc 5219; GFX7LESS-NEXT: s_cbranch_execz .LBB20_2 5220; GFX7LESS-NEXT: ; %bb.1: 5221; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0 5222; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5 5223; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 5224; GFX7LESS-NEXT: s_mov_b32 m0, -1 5225; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 5226; GFX7LESS-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] 5227; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 5228; GFX7LESS-NEXT: .LBB20_2: 5229; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] 5230; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 5231; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 5232; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 5233; GFX7LESS-NEXT: v_bfrev_b32_e32 v1, -2 5234; GFX7LESS-NEXT: s_mov_b32 s2, -1 5235; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 5236; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc 5237; GFX7LESS-NEXT: v_mov_b32_e32 v2, s5 5238; GFX7LESS-NEXT: v_mov_b32_e32 v3, s4 5239; GFX7LESS-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] 5240; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 5241; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 5242; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 5243; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 5244; GFX7LESS-NEXT: s_endpgm 5245; 5246; GFX8-LABEL: min_i64_constant: 5247; GFX8: ; %bb.0: ; %entry 5248; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 5249; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 5250; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 5251; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 5252; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 5253; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 5254; GFX8-NEXT: s_cbranch_execz .LBB20_2 5255; GFX8-NEXT: ; %bb.1: 5256; GFX8-NEXT: v_mov_b32_e32 v0, 5 
5257; GFX8-NEXT: v_mov_b32_e32 v2, 0 5258; GFX8-NEXT: v_mov_b32_e32 v1, 0 5259; GFX8-NEXT: s_mov_b32 m0, -1 5260; GFX8-NEXT: s_waitcnt lgkmcnt(0) 5261; GFX8-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] 5262; GFX8-NEXT: s_waitcnt lgkmcnt(0) 5263; GFX8-NEXT: .LBB20_2: 5264; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 5265; GFX8-NEXT: s_waitcnt lgkmcnt(0) 5266; GFX8-NEXT: v_readfirstlane_b32 s4, v0 5267; GFX8-NEXT: v_bfrev_b32_e32 v0, -2 5268; GFX8-NEXT: v_readfirstlane_b32 s5, v1 5269; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc 5270; GFX8-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 5271; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] 5272; GFX8-NEXT: v_mov_b32_e32 v2, s5 5273; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 5274; GFX8-NEXT: v_mov_b32_e32 v2, s4 5275; GFX8-NEXT: s_mov_b32 s2, -1 5276; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 5277; GFX8-NEXT: s_mov_b32 s3, 0xf000 5278; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 5279; GFX8-NEXT: s_endpgm 5280; 5281; GFX9-LABEL: min_i64_constant: 5282; GFX9: ; %bb.0: ; %entry 5283; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 5284; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 5285; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 5286; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 5287; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 5288; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 5289; GFX9-NEXT: s_cbranch_execz .LBB20_2 5290; GFX9-NEXT: ; %bb.1: 5291; GFX9-NEXT: v_mov_b32_e32 v0, 5 5292; GFX9-NEXT: v_mov_b32_e32 v1, 0 5293; GFX9-NEXT: v_mov_b32_e32 v2, 0 5294; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5295; GFX9-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] 5296; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5297; GFX9-NEXT: .LBB20_2: 5298; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 5299; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5300; GFX9-NEXT: v_readfirstlane_b32 s4, v0 5301; GFX9-NEXT: v_bfrev_b32_e32 v0, -2 5302; GFX9-NEXT: v_readfirstlane_b32 s5, v1 5303; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc 5304; GFX9-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 5305; GFX9-NEXT: 
v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] 5306; GFX9-NEXT: v_mov_b32_e32 v2, s5 5307; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 5308; GFX9-NEXT: v_mov_b32_e32 v2, s4 5309; GFX9-NEXT: s_mov_b32 s2, -1 5310; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 5311; GFX9-NEXT: s_mov_b32 s3, 0xf000 5312; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 5313; GFX9-NEXT: s_endpgm 5314; 5315; GFX1064-LABEL: min_i64_constant: 5316; GFX1064: ; %bb.0: ; %entry 5317; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 5318; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 5319; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 5320; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 5321; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 5322; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc 5323; GFX1064-NEXT: s_cbranch_execz .LBB20_2 5324; GFX1064-NEXT: ; %bb.1: 5325; GFX1064-NEXT: v_mov_b32_e32 v0, 5 5326; GFX1064-NEXT: v_mov_b32_e32 v1, 0 5327; GFX1064-NEXT: v_mov_b32_e32 v2, 0 5328; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5329; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 5330; GFX1064-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] 5331; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 5332; GFX1064-NEXT: buffer_gl0_inv 5333; GFX1064-NEXT: .LBB20_2: 5334; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 5335; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] 5336; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 5337; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 5338; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc 5339; GFX1064-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 5340; GFX1064-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1] 5341; GFX1064-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc 5342; GFX1064-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc 5343; GFX1064-NEXT: s_mov_b32 s2, -1 5344; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 5345; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 5346; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 5347; GFX1064-NEXT: s_endpgm 5348; 5349; GFX1032-LABEL: min_i64_constant: 5350; GFX1032: ; %bb.0: ; %entry 5351; GFX1032-NEXT: 
s_load_dwordx2 s[0:1], s[0:1], 0x24 5352; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 5353; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 5354; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 5355; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo 5356; GFX1032-NEXT: s_cbranch_execz .LBB20_2 5357; GFX1032-NEXT: ; %bb.1: 5358; GFX1032-NEXT: v_mov_b32_e32 v0, 5 5359; GFX1032-NEXT: v_mov_b32_e32 v1, 0 5360; GFX1032-NEXT: v_mov_b32_e32 v2, 0 5361; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5362; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 5363; GFX1032-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] 5364; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 5365; GFX1032-NEXT: buffer_gl0_inv 5366; GFX1032-NEXT: .LBB20_2: 5367; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 5368; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 5369; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 5370; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 5371; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc_lo 5372; GFX1032-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc_lo 5373; GFX1032-NEXT: v_cmp_lt_i64_e32 vcc_lo, s[2:3], v[0:1] 5374; GFX1032-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo 5375; GFX1032-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo 5376; GFX1032-NEXT: s_mov_b32 s2, -1 5377; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 5378; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 5379; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 5380; GFX1032-NEXT: s_endpgm 5381; 5382; GFX1164-LABEL: min_i64_constant: 5383; GFX1164: ; %bb.0: ; %entry 5384; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 5385; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 5386; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 5387; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 5388; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 5389; GFX1164-NEXT: s_and_saveexec_b64 s[2:3], vcc 5390; GFX1164-NEXT: s_cbranch_execz .LBB20_2 5391; GFX1164-NEXT: ; %bb.1: 5392; GFX1164-NEXT: v_mov_b32_e32 v0, 5 5393; GFX1164-NEXT: v_mov_b32_e32 v1, 0 5394; GFX1164-NEXT: v_mov_b32_e32 v2, 0 5395; GFX1164-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) 5396; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 5397; GFX1164-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] 5398; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 5399; GFX1164-NEXT: buffer_gl0_inv 5400; GFX1164-NEXT: .LBB20_2: 5401; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] 5402; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 5403; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 5404; GFX1164-NEXT: v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc 5405; GFX1164-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 5406; GFX1164-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1] 5407; GFX1164-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc 5408; GFX1164-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc 5409; GFX1164-NEXT: s_mov_b32 s2, -1 5410; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 5411; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 5412; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 5413; GFX1164-NEXT: s_endpgm 5414; 5415; GFX1132-LABEL: min_i64_constant: 5416; GFX1132: ; %bb.0: ; %entry 5417; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 5418; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 5419; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 5420; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 5421; GFX1132-NEXT: s_and_saveexec_b32 s2, vcc_lo 5422; GFX1132-NEXT: s_cbranch_execz .LBB20_2 5423; GFX1132-NEXT: ; %bb.1: 5424; GFX1132-NEXT: v_mov_b32_e32 v0, 5 5425; GFX1132-NEXT: v_mov_b32_e32 v1, 0 5426; GFX1132-NEXT: v_mov_b32_e32 v2, 0 5427; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5428; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 5429; GFX1132-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] 5430; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 5431; GFX1132-NEXT: buffer_gl0_inv 5432; GFX1132-NEXT: .LBB20_2: 5433; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2 5434; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 5435; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 5436; GFX1132-NEXT: v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc_lo 5437; GFX1132-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc_lo 5438; GFX1132-NEXT: v_cmp_lt_i64_e32 vcc_lo, s[2:3], v[0:1] 5439; GFX1132-NEXT: 
v_cndmask_b32_e64 v1, v1, s3, vcc_lo 5440; GFX1132-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo 5441; GFX1132-NEXT: s_mov_b32 s2, -1 5442; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 5443; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 5444; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 5445; GFX1132-NEXT: s_endpgm 5446entry: 5447 %old = atomicrmw min i64 addrspace(3)* @local_var64, i64 5 acq_rel 5448 store i64 %old, i64 addrspace(1)* %out 5449 ret void 5450} 5451; Divergent 32-bit case - each lane passes its workitem id to an acq_rel atomicrmw umax on @local_var32 and stores the returned old value to %out. 5452define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) { 5453; On the runs that pass -mcpu, the checks show a v_max_u32_dpp reduction with inactive lanes seeded to 0, a single ds_max_rtn_u32 guarded by a v_mbcnt result of 0, and a final v_max_u32 against the v_readfirstlane_b32 broadcast. 5454; The run without -mcpu keeps a plain ds_max_rtn_u32 on the per-lane value with no such reduction. 5455; GFX7LESS-LABEL: umax_i32_varying: 5456; GFX7LESS: ; %bb.0: ; %entry 5457; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 5458; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 5459; GFX7LESS-NEXT: s_mov_b32 m0, -1 5460; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 5461; GFX7LESS-NEXT: ds_max_rtn_u32 v0, v1, v0 5462; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 5463; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 5464; GFX7LESS-NEXT: s_mov_b32 s2, -1 5465; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 5466; GFX7LESS-NEXT: s_endpgm 5467; 5468; GFX8-LABEL: umax_i32_varying: 5469; GFX8: ; %bb.0: ; %entry 5470; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 5471; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 5472; GFX8-NEXT: v_mov_b32_e32 v1, 0 5473; GFX8-NEXT: s_mov_b64 exec, s[2:3] 5474; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 5475; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 5476; GFX8-NEXT: v_mov_b32_e32 v2, v0 5477; GFX8-NEXT: s_not_b64 exec, exec 5478; GFX8-NEXT: v_mov_b32_e32 v2, 0 5479; GFX8-NEXT: s_not_b64 exec, exec 5480; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 5481; GFX8-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 5482; GFX8-NEXT: s_nop 1 5483; GFX8-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 5484; GFX8-NEXT: s_nop 1 5485; GFX8-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 5486; GFX8-NEXT: s_nop 1 5487; GFX8-NEXT: v_max_u32_dpp v2, v2, v2
row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 5488; GFX8-NEXT: s_nop 1 5489; GFX8-NEXT: v_max_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 5490; GFX8-NEXT: s_nop 1 5491; GFX8-NEXT: v_max_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 5492; GFX8-NEXT: v_readlane_b32 s4, v2, 63 5493; GFX8-NEXT: s_nop 0 5494; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 5495; GFX8-NEXT: s_mov_b64 exec, s[2:3] 5496; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 5497; GFX8-NEXT: ; implicit-def: $vgpr0 5498; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 5499; GFX8-NEXT: s_cbranch_execz .LBB21_2 5500; GFX8-NEXT: ; %bb.1: 5501; GFX8-NEXT: v_mov_b32_e32 v0, 0 5502; GFX8-NEXT: v_mov_b32_e32 v3, s4 5503; GFX8-NEXT: s_mov_b32 m0, -1 5504; GFX8-NEXT: s_waitcnt lgkmcnt(0) 5505; GFX8-NEXT: ds_max_rtn_u32 v0, v0, v3 5506; GFX8-NEXT: s_waitcnt lgkmcnt(0) 5507; GFX8-NEXT: .LBB21_2: 5508; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 5509; GFX8-NEXT: s_waitcnt lgkmcnt(0) 5510; GFX8-NEXT: v_readfirstlane_b32 s2, v0 5511; GFX8-NEXT: v_mov_b32_e32 v0, v1 5512; GFX8-NEXT: v_max_u32_e32 v0, s2, v0 5513; GFX8-NEXT: s_mov_b32 s3, 0xf000 5514; GFX8-NEXT: s_mov_b32 s2, -1 5515; GFX8-NEXT: s_nop 0 5516; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 5517; GFX8-NEXT: s_endpgm 5518; 5519; GFX9-LABEL: umax_i32_varying: 5520; GFX9: ; %bb.0: ; %entry 5521; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 5522; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 5523; GFX9-NEXT: v_mov_b32_e32 v1, 0 5524; GFX9-NEXT: s_mov_b64 exec, s[2:3] 5525; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 5526; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 5527; GFX9-NEXT: v_mov_b32_e32 v2, v0 5528; GFX9-NEXT: s_not_b64 exec, exec 5529; GFX9-NEXT: v_mov_b32_e32 v2, 0 5530; GFX9-NEXT: s_not_b64 exec, exec 5531; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 5532; GFX9-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 5533; GFX9-NEXT: s_nop 1 5534; GFX9-NEXT: v_max_u32_dpp v2, v2, v2 
row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 5535; GFX9-NEXT: s_nop 1 5536; GFX9-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 5537; GFX9-NEXT: s_nop 1 5538; GFX9-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 5539; GFX9-NEXT: s_nop 1 5540; GFX9-NEXT: v_max_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 5541; GFX9-NEXT: s_nop 1 5542; GFX9-NEXT: v_max_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 5543; GFX9-NEXT: v_readlane_b32 s4, v2, 63 5544; GFX9-NEXT: s_nop 0 5545; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 5546; GFX9-NEXT: s_mov_b64 exec, s[2:3] 5547; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 5548; GFX9-NEXT: ; implicit-def: $vgpr0 5549; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 5550; GFX9-NEXT: s_cbranch_execz .LBB21_2 5551; GFX9-NEXT: ; %bb.1: 5552; GFX9-NEXT: v_mov_b32_e32 v0, 0 5553; GFX9-NEXT: v_mov_b32_e32 v3, s4 5554; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5555; GFX9-NEXT: ds_max_rtn_u32 v0, v0, v3 5556; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5557; GFX9-NEXT: .LBB21_2: 5558; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 5559; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5560; GFX9-NEXT: v_readfirstlane_b32 s2, v0 5561; GFX9-NEXT: v_mov_b32_e32 v0, v1 5562; GFX9-NEXT: v_max_u32_e32 v0, s2, v0 5563; GFX9-NEXT: s_mov_b32 s3, 0xf000 5564; GFX9-NEXT: s_mov_b32 s2, -1 5565; GFX9-NEXT: s_nop 0 5566; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 5567; GFX9-NEXT: s_endpgm 5568; 5569; GFX1064-LABEL: umax_i32_varying: 5570; GFX1064: ; %bb.0: ; %entry 5571; GFX1064-NEXT: v_mov_b32_e32 v1, v0 5572; GFX1064-NEXT: s_not_b64 exec, exec 5573; GFX1064-NEXT: v_mov_b32_e32 v1, 0 5574; GFX1064-NEXT: s_not_b64 exec, exec 5575; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 5576; GFX1064-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 5577; GFX1064-NEXT: v_mov_b32_e32 v3, 0 5578; GFX1064-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 
bound_ctrl:1 5579; GFX1064-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 5580; GFX1064-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 5581; GFX1064-NEXT: v_mov_b32_e32 v2, v1 5582; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 5583; GFX1064-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 5584; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 5585; GFX1064-NEXT: v_mov_b32_e32 v2, s4 5586; GFX1064-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 5587; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 5588; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 5589; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 5590; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 5591; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 5592; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 5593; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 5594; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 5595; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 5596; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 5597; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 5598; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 5599; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 5600; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 5601; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 5602; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 5603; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 5604; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 5605; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 5606; GFX1064-NEXT: s_mov_b32 s2, -1 5607; GFX1064-NEXT: ; implicit-def: $vgpr0 5608; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 5609; GFX1064-NEXT: s_cbranch_execz .LBB21_2 5610; GFX1064-NEXT: ; %bb.1: 5611; GFX1064-NEXT: v_mov_b32_e32 v0, 0 5612; GFX1064-NEXT: v_mov_b32_e32 v4, s7 5613; GFX1064-NEXT: s_mov_b32 s3, s7 5614; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5615; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 5616; GFX1064-NEXT: ds_max_rtn_u32 v0, v0, v4 5617; GFX1064-NEXT: s_waitcnt 
lgkmcnt(0) 5618; GFX1064-NEXT: buffer_gl0_inv 5619; GFX1064-NEXT: .LBB21_2: 5620; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 5621; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 5622; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 5623; GFX1064-NEXT: v_mov_b32_e32 v0, v3 5624; GFX1064-NEXT: v_max_u32_e32 v0, s3, v0 5625; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 5626; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 5627; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 5628; GFX1064-NEXT: s_endpgm 5629; 5630; GFX1032-LABEL: umax_i32_varying: 5631; GFX1032: ; %bb.0: ; %entry 5632; GFX1032-NEXT: v_mov_b32_e32 v1, v0 5633; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 5634; GFX1032-NEXT: v_mov_b32_e32 v1, 0 5635; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 5636; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 5637; GFX1032-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 5638; GFX1032-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 5639; GFX1032-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 5640; GFX1032-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 5641; GFX1032-NEXT: v_mov_b32_e32 v2, v1 5642; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 5643; GFX1032-NEXT: s_mov_b32 exec_lo, s2 5644; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 5645; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 5646; GFX1032-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 5647; GFX1032-NEXT: v_mov_b32_e32 v3, 0 5648; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 5649; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 5650; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 5651; GFX1032-NEXT: s_mov_b32 exec_lo, s2 5652; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 5653; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 5654; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 5655; GFX1032-NEXT: s_mov_b32 exec_lo, s2 5656; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 5657; GFX1032-NEXT: 
s_mov_b32 s2, -1 5658; GFX1032-NEXT: ; implicit-def: $vgpr0 5659; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 5660; GFX1032-NEXT: s_cbranch_execz .LBB21_2 5661; GFX1032-NEXT: ; %bb.1: 5662; GFX1032-NEXT: v_mov_b32_e32 v0, 0 5663; GFX1032-NEXT: v_mov_b32_e32 v4, s4 5664; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5665; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 5666; GFX1032-NEXT: ds_max_rtn_u32 v0, v0, v4 5667; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 5668; GFX1032-NEXT: buffer_gl0_inv 5669; GFX1032-NEXT: .LBB21_2: 5670; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 5671; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 5672; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 5673; GFX1032-NEXT: v_mov_b32_e32 v0, v3 5674; GFX1032-NEXT: v_max_u32_e32 v0, s3, v0 5675; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 5676; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 5677; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 5678; GFX1032-NEXT: s_endpgm 5679; 5680; GFX1164-LABEL: umax_i32_varying: 5681; GFX1164: ; %bb.0: ; %entry 5682; GFX1164-NEXT: v_mov_b32_e32 v1, v0 5683; GFX1164-NEXT: s_not_b64 exec, exec 5684; GFX1164-NEXT: v_mov_b32_e32 v1, 0 5685; GFX1164-NEXT: s_not_b64 exec, exec 5686; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 5687; GFX1164-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 5688; GFX1164-NEXT: v_mov_b32_e32 v3, 0 5689; GFX1164-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 5690; GFX1164-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 5691; GFX1164-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 5692; GFX1164-NEXT: v_mov_b32_e32 v2, v1 5693; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1 5694; GFX1164-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 5695; GFX1164-NEXT: v_readlane_b32 s4, v1, 31 5696; GFX1164-NEXT: v_mov_b32_e32 v2, s4 5697; GFX1164-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 5698; 
GFX1164-NEXT: v_readlane_b32 s4, v1, 15 5699; GFX1164-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 5700; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 5701; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 5702; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 5703; GFX1164-NEXT: v_readlane_b32 s5, v1, 31 5704; GFX1164-NEXT: v_writelane_b32 v3, s4, 16 5705; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 5706; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 5707; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 5708; GFX1164-NEXT: v_readlane_b32 s7, v1, 63 5709; GFX1164-NEXT: v_readlane_b32 s6, v1, 47 5710; GFX1164-NEXT: v_writelane_b32 v3, s5, 32 5711; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 5712; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 5713; GFX1164-NEXT: s_or_saveexec_b64 s[4:5], -1 5714; GFX1164-NEXT: v_writelane_b32 v3, s6, 48 5715; GFX1164-NEXT: s_mov_b64 exec, s[4:5] 5716; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 5717; GFX1164-NEXT: s_mov_b32 s2, -1 5718; GFX1164-NEXT: ; implicit-def: $vgpr0 5719; GFX1164-NEXT: s_and_saveexec_b64 s[4:5], vcc 5720; GFX1164-NEXT: s_cbranch_execz .LBB21_2 5721; GFX1164-NEXT: ; %bb.1: 5722; GFX1164-NEXT: v_mov_b32_e32 v0, 0 5723; GFX1164-NEXT: v_mov_b32_e32 v4, s7 5724; GFX1164-NEXT: s_mov_b32 s3, s7 5725; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5726; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 5727; GFX1164-NEXT: ds_max_rtn_u32 v0, v0, v4 5728; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 5729; GFX1164-NEXT: buffer_gl0_inv 5730; GFX1164-NEXT: .LBB21_2: 5731; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] 5732; GFX1164-NEXT: v_readfirstlane_b32 s3, v0 5733; GFX1164-NEXT: v_mov_b32_e32 v0, v3 5734; GFX1164-NEXT: v_max_u32_e32 v0, s3, v0 5735; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 5736; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 5737; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 5738; GFX1164-NEXT: s_endpgm 5739; 5740; GFX1132-LABEL: umax_i32_varying: 5741; GFX1132: ; %bb.0: ; %entry 5742; GFX1132-NEXT: v_mov_b32_e32 v1, v0 5743; GFX1132-NEXT: s_not_b32 
exec_lo, exec_lo 5744; GFX1132-NEXT: v_mov_b32_e32 v1, 0 5745; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo 5746; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 5747; GFX1132-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 5748; GFX1132-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 5749; GFX1132-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 5750; GFX1132-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 5751; GFX1132-NEXT: v_mov_b32_e32 v2, v1 5752; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1 5753; GFX1132-NEXT: s_mov_b32 exec_lo, s2 5754; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 5755; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 5756; GFX1132-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 5757; GFX1132-NEXT: v_mov_b32_e32 v3, 0 5758; GFX1132-NEXT: v_readlane_b32 s3, v1, 15 5759; GFX1132-NEXT: v_readlane_b32 s4, v1, 31 5760; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 5761; GFX1132-NEXT: s_mov_b32 exec_lo, s2 5762; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 5763; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 5764; GFX1132-NEXT: v_writelane_b32 v3, s3, 16 5765; GFX1132-NEXT: s_mov_b32 exec_lo, s2 5766; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 5767; GFX1132-NEXT: s_mov_b32 s2, -1 5768; GFX1132-NEXT: ; implicit-def: $vgpr0 5769; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo 5770; GFX1132-NEXT: s_cbranch_execz .LBB21_2 5771; GFX1132-NEXT: ; %bb.1: 5772; GFX1132-NEXT: v_mov_b32_e32 v0, 0 5773; GFX1132-NEXT: v_mov_b32_e32 v4, s4 5774; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5775; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 5776; GFX1132-NEXT: ds_max_rtn_u32 v0, v0, v4 5777; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 5778; GFX1132-NEXT: buffer_gl0_inv 5779; GFX1132-NEXT: .LBB21_2: 5780; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 5781; GFX1132-NEXT: v_readfirstlane_b32 s3, v0 5782; GFX1132-NEXT: 
v_mov_b32_e32 v0, v3 5783; GFX1132-NEXT: v_max_u32_e32 v0, s3, v0 5784; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 5785; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 5786; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 5787; GFX1132-NEXT: s_endpgm 5788entry: 5789 %lane = call i32 @llvm.amdgcn.workitem.id.x() 5790 %old = atomicrmw umax i32 addrspace(3)* @local_var32, i32 %lane acq_rel 5791 store i32 %old, i32 addrspace(1)* %out 5792 ret void 5793} 5794; Uniform 64-bit case - an acq_rel atomicrmw umax of the constant 5 on @local_var64, with the returned old value stored to %out. 5795define amdgpu_kernel void @umax_i64_constant(i64 addrspace(1)* %out) { 5796; Every run's checks keep a single ds_max_rtn_u64 executed only where the v_mbcnt result is 0, and its result is broadcast with v_readfirstlane_b32. 5797; Each lane then picks 0 or 5 with v_cndmask on that same condition and merges it with the broadcast value through v_cmp_gt_u64 and a further v_cndmask pair. 5798; GFX7LESS-LABEL: umax_i64_constant: 5799; GFX7LESS: ; %bb.0: ; %entry 5800; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 5801; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 5802; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 5803; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 5804; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 5805; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc 5806; GFX7LESS-NEXT: s_cbranch_execz .LBB22_2 5807; GFX7LESS-NEXT: ; %bb.1: 5808; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0 5809; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5 5810; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 5811; GFX7LESS-NEXT: s_mov_b32 m0, -1 5812; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 5813; GFX7LESS-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] 5814; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 5815; GFX7LESS-NEXT: .LBB22_2: 5816; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] 5817; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 5818; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 5819; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 5820; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 5821; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 5822; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 5823; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4 5824; GFX7LESS-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[0:1] 5825; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 5826; GFX7LESS-NEXT: v_mov_b32_e32 v1, s5 5827; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc 5828; GFX7LESS-NEXT: s_mov_b32 s2, -1 5829; GFX7LESS-NEXT: buffer_store_dwordx2
v[0:1], off, s[0:3], 0 5830; GFX7LESS-NEXT: s_endpgm 5831; 5832; GFX8-LABEL: umax_i64_constant: 5833; GFX8: ; %bb.0: ; %entry 5834; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 5835; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 5836; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 5837; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 5838; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 5839; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 5840; GFX8-NEXT: s_cbranch_execz .LBB22_2 5841; GFX8-NEXT: ; %bb.1: 5842; GFX8-NEXT: v_mov_b32_e32 v0, 5 5843; GFX8-NEXT: v_mov_b32_e32 v2, 0 5844; GFX8-NEXT: v_mov_b32_e32 v1, 0 5845; GFX8-NEXT: s_mov_b32 m0, -1 5846; GFX8-NEXT: s_waitcnt lgkmcnt(0) 5847; GFX8-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] 5848; GFX8-NEXT: s_waitcnt lgkmcnt(0) 5849; GFX8-NEXT: .LBB22_2: 5850; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 5851; GFX8-NEXT: s_waitcnt lgkmcnt(0) 5852; GFX8-NEXT: v_readfirstlane_b32 s2, v0 5853; GFX8-NEXT: v_readfirstlane_b32 s3, v1 5854; GFX8-NEXT: v_mov_b32_e32 v1, 0 5855; GFX8-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 5856; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1] 5857; GFX8-NEXT: v_mov_b32_e32 v2, s2 5858; GFX8-NEXT: v_mov_b32_e32 v1, s3 5859; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 5860; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc 5861; GFX8-NEXT: s_mov_b32 s3, 0xf000 5862; GFX8-NEXT: s_mov_b32 s2, -1 5863; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 5864; GFX8-NEXT: s_endpgm 5865; 5866; GFX9-LABEL: umax_i64_constant: 5867; GFX9: ; %bb.0: ; %entry 5868; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 5869; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 5870; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 5871; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 5872; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 5873; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 5874; GFX9-NEXT: s_cbranch_execz .LBB22_2 5875; GFX9-NEXT: ; %bb.1: 5876; GFX9-NEXT: v_mov_b32_e32 v0, 5 5877; GFX9-NEXT: v_mov_b32_e32 v1, 0 5878; GFX9-NEXT: v_mov_b32_e32 v2, 0 5879; GFX9-NEXT: 
s_waitcnt lgkmcnt(0) 5880; GFX9-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] 5881; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5882; GFX9-NEXT: .LBB22_2: 5883; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 5884; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5885; GFX9-NEXT: v_readfirstlane_b32 s2, v0 5886; GFX9-NEXT: v_readfirstlane_b32 s3, v1 5887; GFX9-NEXT: v_mov_b32_e32 v1, 0 5888; GFX9-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 5889; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1] 5890; GFX9-NEXT: v_mov_b32_e32 v2, s2 5891; GFX9-NEXT: v_mov_b32_e32 v1, s3 5892; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 5893; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc 5894; GFX9-NEXT: s_mov_b32 s3, 0xf000 5895; GFX9-NEXT: s_mov_b32 s2, -1 5896; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 5897; GFX9-NEXT: s_endpgm 5898; 5899; GFX1064-LABEL: umax_i64_constant: 5900; GFX1064: ; %bb.0: ; %entry 5901; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 5902; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 5903; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 5904; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 5905; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 5906; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc 5907; GFX1064-NEXT: s_cbranch_execz .LBB22_2 5908; GFX1064-NEXT: ; %bb.1: 5909; GFX1064-NEXT: v_mov_b32_e32 v0, 5 5910; GFX1064-NEXT: v_mov_b32_e32 v1, 0 5911; GFX1064-NEXT: v_mov_b32_e32 v2, 0 5912; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5913; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 5914; GFX1064-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] 5915; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 5916; GFX1064-NEXT: buffer_gl0_inv 5917; GFX1064-NEXT: .LBB22_2: 5918; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 5919; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] 5920; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 5921; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 5922; GFX1064-NEXT: v_mov_b32_e32 v1, 0 5923; GFX1064-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 5924; GFX1064-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1] 5925; GFX1064-NEXT: v_cndmask_b32_e64 
v0, v0, s2, vcc 5926; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, s3, vcc 5927; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 5928; GFX1064-NEXT: s_mov_b32 s2, -1 5929; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 5930; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 5931; GFX1064-NEXT: s_endpgm 5932; 5933; GFX1032-LABEL: umax_i64_constant: 5934; GFX1032: ; %bb.0: ; %entry 5935; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 5936; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 5937; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 5938; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 5939; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo 5940; GFX1032-NEXT: s_cbranch_execz .LBB22_2 5941; GFX1032-NEXT: ; %bb.1: 5942; GFX1032-NEXT: v_mov_b32_e32 v0, 5 5943; GFX1032-NEXT: v_mov_b32_e32 v1, 0 5944; GFX1032-NEXT: v_mov_b32_e32 v2, 0 5945; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5946; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 5947; GFX1032-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] 5948; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 5949; GFX1032-NEXT: buffer_gl0_inv 5950; GFX1032-NEXT: .LBB22_2: 5951; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 5952; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 5953; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 5954; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 5955; GFX1032-NEXT: v_mov_b32_e32 v1, 0 5956; GFX1032-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc_lo 5957; GFX1032-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[2:3], v[0:1] 5958; GFX1032-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo 5959; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, s3, vcc_lo 5960; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 5961; GFX1032-NEXT: s_mov_b32 s2, -1 5962; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 5963; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 5964; GFX1032-NEXT: s_endpgm 5965; 5966; GFX1164-LABEL: umax_i64_constant: 5967; GFX1164: ; %bb.0: ; %entry 5968; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 5969; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 5970; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 5971; 
GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 5972; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 5973; GFX1164-NEXT: s_and_saveexec_b64 s[2:3], vcc 5974; GFX1164-NEXT: s_cbranch_execz .LBB22_2 5975; GFX1164-NEXT: ; %bb.1: 5976; GFX1164-NEXT: v_mov_b32_e32 v0, 5 5977; GFX1164-NEXT: v_mov_b32_e32 v1, 0 5978; GFX1164-NEXT: v_mov_b32_e32 v2, 0 5979; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5980; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 5981; GFX1164-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] 5982; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 5983; GFX1164-NEXT: buffer_gl0_inv 5984; GFX1164-NEXT: .LBB22_2: 5985; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] 5986; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 5987; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 5988; GFX1164-NEXT: v_mov_b32_e32 v1, 0 5989; GFX1164-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 5990; GFX1164-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1] 5991; GFX1164-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc 5992; GFX1164-NEXT: v_cndmask_b32_e64 v1, 0, s3, vcc 5993; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 5994; GFX1164-NEXT: s_mov_b32 s2, -1 5995; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 5996; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 5997; GFX1164-NEXT: s_endpgm 5998; 5999; GFX1132-LABEL: umax_i64_constant: 6000; GFX1132: ; %bb.0: ; %entry 6001; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 6002; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 6003; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 6004; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 6005; GFX1132-NEXT: s_and_saveexec_b32 s2, vcc_lo 6006; GFX1132-NEXT: s_cbranch_execz .LBB22_2 6007; GFX1132-NEXT: ; %bb.1: 6008; GFX1132-NEXT: v_mov_b32_e32 v0, 5 6009; GFX1132-NEXT: v_mov_b32_e32 v1, 0 6010; GFX1132-NEXT: v_mov_b32_e32 v2, 0 6011; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6012; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 6013; GFX1132-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] 6014; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 6015; GFX1132-NEXT: buffer_gl0_inv 6016; GFX1132-NEXT: .LBB22_2: 6017; 
GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2 6018; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 6019; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 6020; GFX1132-NEXT: v_mov_b32_e32 v1, 0 6021; GFX1132-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc_lo 6022; GFX1132-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[2:3], v[0:1] 6023; GFX1132-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo 6024; GFX1132-NEXT: v_cndmask_b32_e64 v1, 0, s3, vcc_lo 6025; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 6026; GFX1132-NEXT: s_mov_b32 s2, -1 6027; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 6028; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 6029; GFX1132-NEXT: s_endpgm 6030entry: 6031 %old = atomicrmw umax i64 addrspace(3)* @local_var64, i64 5 acq_rel 6032 store i64 %old, i64 addrspace(1)* %out 6033 ret void 6034} 6035 6036define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) { 6037; 6038; 6039; GFX7LESS-LABEL: umin_i32_varying: 6040; GFX7LESS: ; %bb.0: ; %entry 6041; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 6042; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 6043; GFX7LESS-NEXT: s_mov_b32 m0, -1 6044; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 6045; GFX7LESS-NEXT: ds_min_rtn_u32 v0, v1, v0 6046; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 6047; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 6048; GFX7LESS-NEXT: s_mov_b32 s2, -1 6049; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 6050; GFX7LESS-NEXT: s_endpgm 6051; 6052; GFX8-LABEL: umin_i32_varying: 6053; GFX8: ; %bb.0: ; %entry 6054; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 6055; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 6056; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 6057; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 6058; GFX8-NEXT: v_mov_b32_e32 v1, -1 6059; GFX8-NEXT: s_mov_b64 exec, s[2:3] 6060; GFX8-NEXT: v_mov_b32_e32 v2, v0 6061; GFX8-NEXT: s_not_b64 exec, exec 6062; GFX8-NEXT: v_mov_b32_e32 v2, -1 6063; GFX8-NEXT: s_not_b64 exec, exec 6064; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 6065; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 
6066; GFX8-NEXT: s_nop 1 6067; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 6068; GFX8-NEXT: s_nop 1 6069; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 6070; GFX8-NEXT: s_nop 1 6071; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 6072; GFX8-NEXT: s_nop 1 6073; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 6074; GFX8-NEXT: s_nop 1 6075; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 6076; GFX8-NEXT: v_readlane_b32 s4, v2, 63 6077; GFX8-NEXT: s_nop 0 6078; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 6079; GFX8-NEXT: s_mov_b64 exec, s[2:3] 6080; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 6081; GFX8-NEXT: ; implicit-def: $vgpr0 6082; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 6083; GFX8-NEXT: s_cbranch_execz .LBB23_2 6084; GFX8-NEXT: ; %bb.1: 6085; GFX8-NEXT: v_mov_b32_e32 v0, 0 6086; GFX8-NEXT: v_mov_b32_e32 v3, s4 6087; GFX8-NEXT: s_mov_b32 m0, -1 6088; GFX8-NEXT: s_waitcnt lgkmcnt(0) 6089; GFX8-NEXT: ds_min_rtn_u32 v0, v0, v3 6090; GFX8-NEXT: s_waitcnt lgkmcnt(0) 6091; GFX8-NEXT: .LBB23_2: 6092; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 6093; GFX8-NEXT: s_waitcnt lgkmcnt(0) 6094; GFX8-NEXT: v_readfirstlane_b32 s2, v0 6095; GFX8-NEXT: v_mov_b32_e32 v0, v1 6096; GFX8-NEXT: v_min_u32_e32 v0, s2, v0 6097; GFX8-NEXT: s_mov_b32 s3, 0xf000 6098; GFX8-NEXT: s_mov_b32 s2, -1 6099; GFX8-NEXT: s_nop 0 6100; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 6101; GFX8-NEXT: s_endpgm 6102; 6103; GFX9-LABEL: umin_i32_varying: 6104; GFX9: ; %bb.0: ; %entry 6105; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 6106; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 6107; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 6108; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 6109; GFX9-NEXT: v_mov_b32_e32 v1, -1 6110; GFX9-NEXT: s_mov_b64 exec, s[2:3] 6111; GFX9-NEXT: v_mov_b32_e32 v2, v0 6112; GFX9-NEXT: s_not_b64 exec, exec 6113; GFX9-NEXT: 
v_mov_b32_e32 v2, -1 6114; GFX9-NEXT: s_not_b64 exec, exec 6115; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 6116; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 6117; GFX9-NEXT: s_nop 1 6118; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 6119; GFX9-NEXT: s_nop 1 6120; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 6121; GFX9-NEXT: s_nop 1 6122; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 6123; GFX9-NEXT: s_nop 1 6124; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 6125; GFX9-NEXT: s_nop 1 6126; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 6127; GFX9-NEXT: v_readlane_b32 s4, v2, 63 6128; GFX9-NEXT: s_nop 0 6129; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 6130; GFX9-NEXT: s_mov_b64 exec, s[2:3] 6131; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 6132; GFX9-NEXT: ; implicit-def: $vgpr0 6133; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 6134; GFX9-NEXT: s_cbranch_execz .LBB23_2 6135; GFX9-NEXT: ; %bb.1: 6136; GFX9-NEXT: v_mov_b32_e32 v0, 0 6137; GFX9-NEXT: v_mov_b32_e32 v3, s4 6138; GFX9-NEXT: s_waitcnt lgkmcnt(0) 6139; GFX9-NEXT: ds_min_rtn_u32 v0, v0, v3 6140; GFX9-NEXT: s_waitcnt lgkmcnt(0) 6141; GFX9-NEXT: .LBB23_2: 6142; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 6143; GFX9-NEXT: s_waitcnt lgkmcnt(0) 6144; GFX9-NEXT: v_readfirstlane_b32 s2, v0 6145; GFX9-NEXT: v_mov_b32_e32 v0, v1 6146; GFX9-NEXT: v_min_u32_e32 v0, s2, v0 6147; GFX9-NEXT: s_mov_b32 s3, 0xf000 6148; GFX9-NEXT: s_mov_b32 s2, -1 6149; GFX9-NEXT: s_nop 0 6150; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 6151; GFX9-NEXT: s_endpgm 6152; 6153; GFX1064-LABEL: umin_i32_varying: 6154; GFX1064: ; %bb.0: ; %entry 6155; GFX1064-NEXT: v_mov_b32_e32 v1, v0 6156; GFX1064-NEXT: s_not_b64 exec, exec 6157; GFX1064-NEXT: v_mov_b32_e32 v1, -1 6158; GFX1064-NEXT: s_not_b64 exec, exec 6159; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 6160; 
GFX1064-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 6161; GFX1064-NEXT: v_mov_b32_e32 v3, -1 6162; GFX1064-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 6163; GFX1064-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 6164; GFX1064-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 6165; GFX1064-NEXT: v_mov_b32_e32 v2, v1 6166; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 6167; GFX1064-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 6168; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 6169; GFX1064-NEXT: v_mov_b32_e32 v2, s4 6170; GFX1064-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 6171; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 6172; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 6173; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 6174; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 6175; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 6176; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 6177; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 6178; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 6179; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 6180; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 6181; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 6182; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 6183; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 6184; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 6185; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 6186; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 6187; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 6188; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 6189; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 6190; GFX1064-NEXT: s_mov_b32 s2, -1 6191; GFX1064-NEXT: ; implicit-def: $vgpr0 6192; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 6193; GFX1064-NEXT: s_cbranch_execz .LBB23_2 6194; GFX1064-NEXT: ; %bb.1: 6195; GFX1064-NEXT: v_mov_b32_e32 v0, 0 6196; GFX1064-NEXT: v_mov_b32_e32 v4, s7 6197; GFX1064-NEXT: s_mov_b32 s3, s7 6198; 
GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6199; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 6200; GFX1064-NEXT: ds_min_rtn_u32 v0, v0, v4 6201; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 6202; GFX1064-NEXT: buffer_gl0_inv 6203; GFX1064-NEXT: .LBB23_2: 6204; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 6205; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 6206; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 6207; GFX1064-NEXT: v_mov_b32_e32 v0, v3 6208; GFX1064-NEXT: v_min_u32_e32 v0, s3, v0 6209; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 6210; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 6211; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 6212; GFX1064-NEXT: s_endpgm 6213; 6214; GFX1032-LABEL: umin_i32_varying: 6215; GFX1032: ; %bb.0: ; %entry 6216; GFX1032-NEXT: v_mov_b32_e32 v1, v0 6217; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 6218; GFX1032-NEXT: v_mov_b32_e32 v1, -1 6219; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 6220; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 6221; GFX1032-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 6222; GFX1032-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 6223; GFX1032-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 6224; GFX1032-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 6225; GFX1032-NEXT: v_mov_b32_e32 v2, v1 6226; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 6227; GFX1032-NEXT: s_mov_b32 exec_lo, s2 6228; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 6229; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 6230; GFX1032-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 6231; GFX1032-NEXT: v_mov_b32_e32 v3, -1 6232; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 6233; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 6234; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 6235; GFX1032-NEXT: s_mov_b32 exec_lo, s2 6236; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 6237; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 6238; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 
6239; GFX1032-NEXT: s_mov_b32 exec_lo, s2 6240; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 6241; GFX1032-NEXT: s_mov_b32 s2, -1 6242; GFX1032-NEXT: ; implicit-def: $vgpr0 6243; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 6244; GFX1032-NEXT: s_cbranch_execz .LBB23_2 6245; GFX1032-NEXT: ; %bb.1: 6246; GFX1032-NEXT: v_mov_b32_e32 v0, 0 6247; GFX1032-NEXT: v_mov_b32_e32 v4, s4 6248; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6249; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 6250; GFX1032-NEXT: ds_min_rtn_u32 v0, v0, v4 6251; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 6252; GFX1032-NEXT: buffer_gl0_inv 6253; GFX1032-NEXT: .LBB23_2: 6254; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 6255; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 6256; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 6257; GFX1032-NEXT: v_mov_b32_e32 v0, v3 6258; GFX1032-NEXT: v_min_u32_e32 v0, s3, v0 6259; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 6260; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 6261; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 6262; GFX1032-NEXT: s_endpgm 6263; 6264; GFX1164-LABEL: umin_i32_varying: 6265; GFX1164: ; %bb.0: ; %entry 6266; GFX1164-NEXT: v_mov_b32_e32 v1, v0 6267; GFX1164-NEXT: s_not_b64 exec, exec 6268; GFX1164-NEXT: v_mov_b32_e32 v1, -1 6269; GFX1164-NEXT: s_not_b64 exec, exec 6270; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 6271; GFX1164-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 6272; GFX1164-NEXT: v_mov_b32_e32 v3, -1 6273; GFX1164-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 6274; GFX1164-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 6275; GFX1164-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 6276; GFX1164-NEXT: v_mov_b32_e32 v2, v1 6277; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1 6278; GFX1164-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 6279; GFX1164-NEXT: v_readlane_b32 s4, v1, 31 6280; GFX1164-NEXT: v_mov_b32_e32 v2, s4 6281; GFX1164-NEXT: v_min_u32_dpp v1, 
v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 6282; GFX1164-NEXT: v_readlane_b32 s4, v1, 15 6283; GFX1164-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 6284; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 6285; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 6286; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 6287; GFX1164-NEXT: v_readlane_b32 s5, v1, 31 6288; GFX1164-NEXT: v_writelane_b32 v3, s4, 16 6289; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 6290; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 6291; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 6292; GFX1164-NEXT: v_readlane_b32 s7, v1, 63 6293; GFX1164-NEXT: v_readlane_b32 s6, v1, 47 6294; GFX1164-NEXT: v_writelane_b32 v3, s5, 32 6295; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 6296; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 6297; GFX1164-NEXT: s_or_saveexec_b64 s[4:5], -1 6298; GFX1164-NEXT: v_writelane_b32 v3, s6, 48 6299; GFX1164-NEXT: s_mov_b64 exec, s[4:5] 6300; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 6301; GFX1164-NEXT: s_mov_b32 s2, -1 6302; GFX1164-NEXT: ; implicit-def: $vgpr0 6303; GFX1164-NEXT: s_and_saveexec_b64 s[4:5], vcc 6304; GFX1164-NEXT: s_cbranch_execz .LBB23_2 6305; GFX1164-NEXT: ; %bb.1: 6306; GFX1164-NEXT: v_mov_b32_e32 v0, 0 6307; GFX1164-NEXT: v_mov_b32_e32 v4, s7 6308; GFX1164-NEXT: s_mov_b32 s3, s7 6309; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6310; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 6311; GFX1164-NEXT: ds_min_rtn_u32 v0, v0, v4 6312; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 6313; GFX1164-NEXT: buffer_gl0_inv 6314; GFX1164-NEXT: .LBB23_2: 6315; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] 6316; GFX1164-NEXT: v_readfirstlane_b32 s3, v0 6317; GFX1164-NEXT: v_mov_b32_e32 v0, v3 6318; GFX1164-NEXT: v_min_u32_e32 v0, s3, v0 6319; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 6320; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 6321; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 6322; GFX1164-NEXT: s_endpgm 6323; 6324; GFX1132-LABEL: umin_i32_varying: 6325; GFX1132: ; %bb.0: ; %entry 6326; 
GFX1132-NEXT: v_mov_b32_e32 v1, v0 6327; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo 6328; GFX1132-NEXT: v_mov_b32_e32 v1, -1 6329; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo 6330; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 6331; GFX1132-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 6332; GFX1132-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 6333; GFX1132-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 6334; GFX1132-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 6335; GFX1132-NEXT: v_mov_b32_e32 v2, v1 6336; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1 6337; GFX1132-NEXT: s_mov_b32 exec_lo, s2 6338; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 6339; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 6340; GFX1132-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 6341; GFX1132-NEXT: v_mov_b32_e32 v3, -1 6342; GFX1132-NEXT: v_readlane_b32 s3, v1, 15 6343; GFX1132-NEXT: v_readlane_b32 s4, v1, 31 6344; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 6345; GFX1132-NEXT: s_mov_b32 exec_lo, s2 6346; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 6347; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 6348; GFX1132-NEXT: v_writelane_b32 v3, s3, 16 6349; GFX1132-NEXT: s_mov_b32 exec_lo, s2 6350; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 6351; GFX1132-NEXT: s_mov_b32 s2, -1 6352; GFX1132-NEXT: ; implicit-def: $vgpr0 6353; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo 6354; GFX1132-NEXT: s_cbranch_execz .LBB23_2 6355; GFX1132-NEXT: ; %bb.1: 6356; GFX1132-NEXT: v_mov_b32_e32 v0, 0 6357; GFX1132-NEXT: v_mov_b32_e32 v4, s4 6358; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6359; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 6360; GFX1132-NEXT: ds_min_rtn_u32 v0, v0, v4 6361; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 6362; GFX1132-NEXT: buffer_gl0_inv 6363; GFX1132-NEXT: .LBB23_2: 6364; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 6365; GFX1132-NEXT: v_readfirstlane_b32 s3, v0 6366; 
GFX1132-NEXT: v_mov_b32_e32 v0, v3 6367; GFX1132-NEXT: v_min_u32_e32 v0, s3, v0 6368; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 6369; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 6370; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 6371; GFX1132-NEXT: s_endpgm 6372entry: 6373 %lane = call i32 @llvm.amdgcn.workitem.id.x() 6374 %old = atomicrmw umin i32 addrspace(3)* @local_var32, i32 %lane acq_rel 6375 store i32 %old, i32 addrspace(1)* %out 6376 ret void 6377} 6378 6379define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) { 6380; Constant-operand case: atomicrmw umin of i64 5 on @local_var64 (acq_rel); the returned old value is stored to %out. 6381; Every target's checks expect one ds_min_rtn_u64 guarded by a v_mbcnt == 0 test, then v_readfirstlane plus v_cmp_lt_u64/v_cndmask before the store. 6382; GFX7LESS-LABEL: umin_i64_constant: 6383; GFX7LESS: ; %bb.0: ; %entry 6384; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 6385; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 6386; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 6387; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 6388; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 6389; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc 6390; GFX7LESS-NEXT: s_cbranch_execz .LBB24_2 6391; GFX7LESS-NEXT: ; %bb.1: 6392; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0 6393; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5 6394; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 6395; GFX7LESS-NEXT: s_mov_b32 m0, -1 6396; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 6397; GFX7LESS-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1] 6398; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 6399; GFX7LESS-NEXT: .LBB24_2: 6400; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] 6401; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 6402; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 6403; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 6404; GFX7LESS-NEXT: s_mov_b32 s2, -1 6405; GFX7LESS-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc 6406; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 6407; GFX7LESS-NEXT: v_mov_b32_e32 v2, s5 6408; GFX7LESS-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1] 6409; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 6410; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4 6411; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 6412; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 6413; 
GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 6414; GFX7LESS-NEXT: s_endpgm 6415; 6416; GFX8-LABEL: umin_i64_constant: 6417; GFX8: ; %bb.0: ; %entry 6418; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 6419; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 6420; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 6421; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 6422; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 6423; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 6424; GFX8-NEXT: s_cbranch_execz .LBB24_2 6425; GFX8-NEXT: ; %bb.1: 6426; GFX8-NEXT: v_mov_b32_e32 v0, 5 6427; GFX8-NEXT: v_mov_b32_e32 v2, 0 6428; GFX8-NEXT: v_mov_b32_e32 v1, 0 6429; GFX8-NEXT: s_mov_b32 m0, -1 6430; GFX8-NEXT: s_waitcnt lgkmcnt(0) 6431; GFX8-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1] 6432; GFX8-NEXT: s_waitcnt lgkmcnt(0) 6433; GFX8-NEXT: .LBB24_2: 6434; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 6435; GFX8-NEXT: s_waitcnt lgkmcnt(0) 6436; GFX8-NEXT: v_readfirstlane_b32 s4, v0 6437; GFX8-NEXT: v_readfirstlane_b32 s5, v1 6438; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc 6439; GFX8-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 6440; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1] 6441; GFX8-NEXT: v_mov_b32_e32 v2, s5 6442; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 6443; GFX8-NEXT: v_mov_b32_e32 v2, s4 6444; GFX8-NEXT: s_mov_b32 s2, -1 6445; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 6446; GFX8-NEXT: s_mov_b32 s3, 0xf000 6447; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 6448; GFX8-NEXT: s_endpgm 6449; 6450; GFX9-LABEL: umin_i64_constant: 6451; GFX9: ; %bb.0: ; %entry 6452; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 6453; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 6454; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 6455; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 6456; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 6457; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 6458; GFX9-NEXT: s_cbranch_execz .LBB24_2 6459; GFX9-NEXT: ; %bb.1: 6460; GFX9-NEXT: v_mov_b32_e32 v0, 5 6461; GFX9-NEXT: v_mov_b32_e32 v1, 0 
6462; GFX9-NEXT: v_mov_b32_e32 v2, 0 6463; GFX9-NEXT: s_waitcnt lgkmcnt(0) 6464; GFX9-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1] 6465; GFX9-NEXT: s_waitcnt lgkmcnt(0) 6466; GFX9-NEXT: .LBB24_2: 6467; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 6468; GFX9-NEXT: s_waitcnt lgkmcnt(0) 6469; GFX9-NEXT: v_readfirstlane_b32 s4, v0 6470; GFX9-NEXT: v_readfirstlane_b32 s5, v1 6471; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc 6472; GFX9-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 6473; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1] 6474; GFX9-NEXT: v_mov_b32_e32 v2, s5 6475; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 6476; GFX9-NEXT: v_mov_b32_e32 v2, s4 6477; GFX9-NEXT: s_mov_b32 s2, -1 6478; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 6479; GFX9-NEXT: s_mov_b32 s3, 0xf000 6480; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 6481; GFX9-NEXT: s_endpgm 6482; 6483; GFX1064-LABEL: umin_i64_constant: 6484; GFX1064: ; %bb.0: ; %entry 6485; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 6486; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 6487; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 6488; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 6489; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 6490; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc 6491; GFX1064-NEXT: s_cbranch_execz .LBB24_2 6492; GFX1064-NEXT: ; %bb.1: 6493; GFX1064-NEXT: v_mov_b32_e32 v0, 5 6494; GFX1064-NEXT: v_mov_b32_e32 v1, 0 6495; GFX1064-NEXT: v_mov_b32_e32 v2, 0 6496; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6497; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 6498; GFX1064-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1] 6499; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 6500; GFX1064-NEXT: buffer_gl0_inv 6501; GFX1064-NEXT: .LBB24_2: 6502; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 6503; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] 6504; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 6505; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 6506; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc 6507; GFX1064-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 6508; 
GFX1064-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1] 6509; GFX1064-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc 6510; GFX1064-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc 6511; GFX1064-NEXT: s_mov_b32 s2, -1 6512; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 6513; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 6514; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 6515; GFX1064-NEXT: s_endpgm 6516; 6517; GFX1032-LABEL: umin_i64_constant: 6518; GFX1032: ; %bb.0: ; %entry 6519; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 6520; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 6521; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 6522; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 6523; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo 6524; GFX1032-NEXT: s_cbranch_execz .LBB24_2 6525; GFX1032-NEXT: ; %bb.1: 6526; GFX1032-NEXT: v_mov_b32_e32 v0, 5 6527; GFX1032-NEXT: v_mov_b32_e32 v1, 0 6528; GFX1032-NEXT: v_mov_b32_e32 v2, 0 6529; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6530; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 6531; GFX1032-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1] 6532; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 6533; GFX1032-NEXT: buffer_gl0_inv 6534; GFX1032-NEXT: .LBB24_2: 6535; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 6536; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 6537; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 6538; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 6539; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo 6540; GFX1032-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc_lo 6541; GFX1032-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[2:3], v[0:1] 6542; GFX1032-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo 6543; GFX1032-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo 6544; GFX1032-NEXT: s_mov_b32 s2, -1 6545; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 6546; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 6547; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 6548; GFX1032-NEXT: s_endpgm 6549; 6550; GFX1164-LABEL: umin_i64_constant: 6551; GFX1164: ; %bb.0: ; %entry 6552; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 6553; 
GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 6554; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 6555; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 6556; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 6557; GFX1164-NEXT: s_and_saveexec_b64 s[2:3], vcc 6558; GFX1164-NEXT: s_cbranch_execz .LBB24_2 6559; GFX1164-NEXT: ; %bb.1: 6560; GFX1164-NEXT: v_mov_b32_e32 v0, 5 6561; GFX1164-NEXT: v_mov_b32_e32 v1, 0 6562; GFX1164-NEXT: v_mov_b32_e32 v2, 0 6563; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6564; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 6565; GFX1164-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1] 6566; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 6567; GFX1164-NEXT: buffer_gl0_inv 6568; GFX1164-NEXT: .LBB24_2: 6569; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] 6570; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 6571; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 6572; GFX1164-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc 6573; GFX1164-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 6574; GFX1164-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1] 6575; GFX1164-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc 6576; GFX1164-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc 6577; GFX1164-NEXT: s_mov_b32 s2, -1 6578; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 6579; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 6580; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 6581; GFX1164-NEXT: s_endpgm 6582; 6583; GFX1132-LABEL: umin_i64_constant: 6584; GFX1132: ; %bb.0: ; %entry 6585; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 6586; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 6587; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 6588; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 6589; GFX1132-NEXT: s_and_saveexec_b32 s2, vcc_lo 6590; GFX1132-NEXT: s_cbranch_execz .LBB24_2 6591; GFX1132-NEXT: ; %bb.1: 6592; GFX1132-NEXT: v_mov_b32_e32 v0, 5 6593; GFX1132-NEXT: v_mov_b32_e32 v1, 0 6594; GFX1132-NEXT: v_mov_b32_e32 v2, 0 6595; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6596; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 6597; GFX1132-NEXT: ds_min_rtn_u64 v[0:1], v2, 
v[0:1] 6598; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 6599; GFX1132-NEXT: buffer_gl0_inv 6600; GFX1132-NEXT: .LBB24_2: 6601; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2 6602; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 6603; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 6604; GFX1132-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo 6605; GFX1132-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc_lo 6606; GFX1132-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[2:3], v[0:1] 6607; GFX1132-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo 6608; GFX1132-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo 6609; GFX1132-NEXT: s_mov_b32 s2, -1 6610; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 6611; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 6612; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 6613; GFX1132-NEXT: s_endpgm 6614entry: 6615 %old = atomicrmw umin i64 addrspace(3)* @local_var64, i64 5 acq_rel 6616 store i64 %old, i64 addrspace(1)* %out 6617 ret void 6618} 6619