; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --force-update
; RUN: llc -march=amdgcn -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX7LESS %s
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX1064 %s
; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX1032 %s
; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX1164 %s
; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX1132 %s

; Work-item id intrinsic; the *_varying tests call it to obtain an atomic
; operand that differs per lane.
declare i32 @llvm.amdgcn.workitem.id.x()

; Targets of the atomicrmw operations, both in addrspace(3). The 32-bit
; variable is used by the add_i32_* kernels in this file; the 64-bit variable
; is presumably used by i64 variants further down the file.
@local_var32 = addrspace(3) global i32 undef, align 4
@local_var64 = addrspace(3) global i64 undef, align 8

; Show what the atomic optimization pass will do for local pointers.
16 17define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out) { 18; 19; 20; GFX7LESS-LABEL: add_i32_constant: 21; GFX7LESS: ; %bb.0: ; %entry 22; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec 23; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 24; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 25; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 26; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 27; GFX7LESS-NEXT: ; implicit-def: $vgpr1 28; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc 29; GFX7LESS-NEXT: s_cbranch_execz .LBB0_2 30; GFX7LESS-NEXT: ; %bb.1: 31; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 32; GFX7LESS-NEXT: s_mul_i32 s2, s2, 5 33; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 34; GFX7LESS-NEXT: v_mov_b32_e32 v2, s2 35; GFX7LESS-NEXT: s_mov_b32 m0, -1 36; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 37; GFX7LESS-NEXT: ds_add_rtn_u32 v1, v1, v2 38; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 39; GFX7LESS-NEXT: .LBB0_2: 40; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] 41; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 42; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v1 43; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 44; GFX7LESS-NEXT: v_mad_u32_u24 v0, v0, 5, s2 45; GFX7LESS-NEXT: s_mov_b32 s2, -1 46; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 47; GFX7LESS-NEXT: s_endpgm 48; 49; GFX8-LABEL: add_i32_constant: 50; GFX8: ; %bb.0: ; %entry 51; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 52; GFX8-NEXT: s_mov_b64 s[2:3], exec 53; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 54; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 55; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 56; GFX8-NEXT: ; implicit-def: $vgpr1 57; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 58; GFX8-NEXT: s_cbranch_execz .LBB0_2 59; GFX8-NEXT: ; %bb.1: 60; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 61; GFX8-NEXT: s_mul_i32 s2, s2, 5 62; GFX8-NEXT: v_mov_b32_e32 v1, 0 63; GFX8-NEXT: v_mov_b32_e32 v2, s2 64; GFX8-NEXT: s_mov_b32 m0, -1 65; GFX8-NEXT: s_waitcnt lgkmcnt(0) 66; GFX8-NEXT: ds_add_rtn_u32 v1, v1, v2 67; GFX8-NEXT: s_waitcnt lgkmcnt(0) 68; GFX8-NEXT: 
.LBB0_2: 69; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 70; GFX8-NEXT: s_waitcnt lgkmcnt(0) 71; GFX8-NEXT: v_readfirstlane_b32 s2, v1 72; GFX8-NEXT: v_mad_u32_u24 v0, v0, 5, s2 73; GFX8-NEXT: s_mov_b32 s3, 0xf000 74; GFX8-NEXT: s_mov_b32 s2, -1 75; GFX8-NEXT: s_nop 1 76; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 77; GFX8-NEXT: s_endpgm 78; 79; GFX9-LABEL: add_i32_constant: 80; GFX9: ; %bb.0: ; %entry 81; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 82; GFX9-NEXT: s_mov_b64 s[2:3], exec 83; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 84; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 85; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 86; GFX9-NEXT: ; implicit-def: $vgpr1 87; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 88; GFX9-NEXT: s_cbranch_execz .LBB0_2 89; GFX9-NEXT: ; %bb.1: 90; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 91; GFX9-NEXT: s_mul_i32 s2, s2, 5 92; GFX9-NEXT: v_mov_b32_e32 v1, 0 93; GFX9-NEXT: v_mov_b32_e32 v2, s2 94; GFX9-NEXT: s_waitcnt lgkmcnt(0) 95; GFX9-NEXT: ds_add_rtn_u32 v1, v1, v2 96; GFX9-NEXT: s_waitcnt lgkmcnt(0) 97; GFX9-NEXT: .LBB0_2: 98; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 99; GFX9-NEXT: s_waitcnt lgkmcnt(0) 100; GFX9-NEXT: v_readfirstlane_b32 s2, v1 101; GFX9-NEXT: v_mad_u32_u24 v0, v0, 5, s2 102; GFX9-NEXT: s_mov_b32 s3, 0xf000 103; GFX9-NEXT: s_mov_b32 s2, -1 104; GFX9-NEXT: s_nop 1 105; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 106; GFX9-NEXT: s_endpgm 107; 108; GFX1064-LABEL: add_i32_constant: 109; GFX1064: ; %bb.0: ; %entry 110; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 111; GFX1064-NEXT: s_mov_b64 s[2:3], exec 112; GFX1064-NEXT: ; implicit-def: $vgpr1 113; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 114; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 115; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 116; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 117; GFX1064-NEXT: s_cbranch_execz .LBB0_2 118; GFX1064-NEXT: ; %bb.1: 119; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 120; GFX1064-NEXT: v_mov_b32_e32 v1, 0 121; GFX1064-NEXT: s_mul_i32 s2, s2, 
5 122; GFX1064-NEXT: v_mov_b32_e32 v2, s2 123; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 124; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 125; GFX1064-NEXT: ds_add_rtn_u32 v1, v1, v2 126; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 127; GFX1064-NEXT: buffer_gl0_inv 128; GFX1064-NEXT: .LBB0_2: 129; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 130; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 131; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 132; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 133; GFX1064-NEXT: v_mad_u32_u24 v0, v0, 5, s2 134; GFX1064-NEXT: s_mov_b32 s2, -1 135; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 136; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 137; GFX1064-NEXT: s_endpgm 138; 139; GFX1032-LABEL: add_i32_constant: 140; GFX1032: ; %bb.0: ; %entry 141; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 142; GFX1032-NEXT: s_mov_b32 s3, exec_lo 143; GFX1032-NEXT: ; implicit-def: $vgpr1 144; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 145; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 146; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo 147; GFX1032-NEXT: s_cbranch_execz .LBB0_2 148; GFX1032-NEXT: ; %bb.1: 149; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 150; GFX1032-NEXT: v_mov_b32_e32 v1, 0 151; GFX1032-NEXT: s_mul_i32 s3, s3, 5 152; GFX1032-NEXT: v_mov_b32_e32 v2, s3 153; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 154; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 155; GFX1032-NEXT: ds_add_rtn_u32 v1, v1, v2 156; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 157; GFX1032-NEXT: buffer_gl0_inv 158; GFX1032-NEXT: .LBB0_2: 159; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 160; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 161; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 162; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 163; GFX1032-NEXT: v_mad_u32_u24 v0, v0, 5, s2 164; GFX1032-NEXT: s_mov_b32 s2, -1 165; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 166; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 167; GFX1032-NEXT: s_endpgm 168; 169; GFX1164-LABEL: add_i32_constant: 170; GFX1164: ; %bb.0: ; %entry 171; GFX1164-NEXT: s_load_b64 
s[0:1], s[0:1], 0x24 172; GFX1164-NEXT: s_mov_b64 s[2:3], exec 173; GFX1164-NEXT: s_mov_b64 s[4:5], exec 174; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 175; GFX1164-NEXT: ; implicit-def: $vgpr1 176; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 177; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 178; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 179; GFX1164-NEXT: s_cbranch_execz .LBB0_2 180; GFX1164-NEXT: ; %bb.1: 181; GFX1164-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 182; GFX1164-NEXT: v_mov_b32_e32 v1, 0 183; GFX1164-NEXT: s_mul_i32 s2, s2, 5 184; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 185; GFX1164-NEXT: v_mov_b32_e32 v2, s2 186; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 187; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 188; GFX1164-NEXT: ds_add_rtn_u32 v1, v1, v2 189; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 190; GFX1164-NEXT: buffer_gl0_inv 191; GFX1164-NEXT: .LBB0_2: 192; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] 193; GFX1164-NEXT: v_readfirstlane_b32 s2, v1 194; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 195; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) 196; GFX1164-NEXT: v_mad_u32_u24 v0, v0, 5, s2 197; GFX1164-NEXT: s_mov_b32 s2, -1 198; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 199; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 200; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 201; GFX1164-NEXT: s_endpgm 202; 203; GFX1132-LABEL: add_i32_constant: 204; GFX1132: ; %bb.0: ; %entry 205; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 206; GFX1132-NEXT: s_mov_b32 s3, exec_lo 207; GFX1132-NEXT: s_mov_b32 s2, exec_lo 208; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 209; GFX1132-NEXT: ; implicit-def: $vgpr1 210; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) 211; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 212; GFX1132-NEXT: s_cbranch_execz .LBB0_2 213; GFX1132-NEXT: ; %bb.1: 214; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s3 215; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) 216; GFX1132-NEXT: s_mul_i32 s3, s3, 5 217; 
GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s3 218; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 219; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 220; GFX1132-NEXT: ds_add_rtn_u32 v1, v1, v2 221; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 222; GFX1132-NEXT: buffer_gl0_inv 223; GFX1132-NEXT: .LBB0_2: 224; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2 225; GFX1132-NEXT: v_readfirstlane_b32 s2, v1 226; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 227; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) 228; GFX1132-NEXT: v_mad_u32_u24 v0, v0, 5, s2 229; GFX1132-NEXT: s_mov_b32 s2, -1 230; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 231; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 232; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 233; GFX1132-NEXT: s_endpgm 234entry: 235 %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 5 acq_rel 236 store i32 %old, i32 addrspace(1)* %out 237 ret void 238} 239 240define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 %additive) { 241; 242; 243; GFX7LESS-LABEL: add_i32_uniform: 244; GFX7LESS: ; %bb.0: ; %entry 245; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec 246; GFX7LESS-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 247; GFX7LESS-NEXT: s_load_dword s6, s[0:1], 0xb 248; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 249; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 250; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 251; GFX7LESS-NEXT: ; implicit-def: $vgpr1 252; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc 253; GFX7LESS-NEXT: s_cbranch_execz .LBB1_2 254; GFX7LESS-NEXT: ; %bb.1: 255; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 256; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 257; GFX7LESS-NEXT: s_mul_i32 s2, s6, s2 258; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 259; GFX7LESS-NEXT: v_mov_b32_e32 v2, s2 260; GFX7LESS-NEXT: s_mov_b32 m0, -1 261; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 262; GFX7LESS-NEXT: ds_add_rtn_u32 v1, v1, v2 263; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 264; GFX7LESS-NEXT: .LBB1_2: 265; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1] 266; 
GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 267; GFX7LESS-NEXT: v_readfirstlane_b32 s0, v1 268; GFX7LESS-NEXT: v_mul_lo_u32 v0, s6, v0 269; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 270; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s0, v0 271; GFX7LESS-NEXT: s_mov_b32 s6, -1 272; GFX7LESS-NEXT: buffer_store_dword v0, off, s[4:7], 0 273; GFX7LESS-NEXT: s_endpgm 274; 275; GFX8-LABEL: add_i32_uniform: 276; GFX8: ; %bb.0: ; %entry 277; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 278; GFX8-NEXT: s_load_dword s6, s[0:1], 0x2c 279; GFX8-NEXT: s_mov_b64 s[2:3], exec 280; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 281; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 282; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 283; GFX8-NEXT: ; implicit-def: $vgpr1 284; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc 285; GFX8-NEXT: s_cbranch_execz .LBB1_2 286; GFX8-NEXT: ; %bb.1: 287; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 288; GFX8-NEXT: s_waitcnt lgkmcnt(0) 289; GFX8-NEXT: s_mul_i32 s2, s6, s2 290; GFX8-NEXT: v_mov_b32_e32 v1, 0 291; GFX8-NEXT: v_mov_b32_e32 v2, s2 292; GFX8-NEXT: s_mov_b32 m0, -1 293; GFX8-NEXT: s_waitcnt lgkmcnt(0) 294; GFX8-NEXT: ds_add_rtn_u32 v1, v1, v2 295; GFX8-NEXT: s_waitcnt lgkmcnt(0) 296; GFX8-NEXT: .LBB1_2: 297; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] 298; GFX8-NEXT: s_waitcnt lgkmcnt(0) 299; GFX8-NEXT: v_mul_lo_u32 v0, s6, v0 300; GFX8-NEXT: v_readfirstlane_b32 s0, v1 301; GFX8-NEXT: s_mov_b32 s7, 0xf000 302; GFX8-NEXT: s_mov_b32 s6, -1 303; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 304; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0 305; GFX8-NEXT: s_endpgm 306; 307; GFX9-LABEL: add_i32_uniform: 308; GFX9: ; %bb.0: ; %entry 309; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 310; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c 311; GFX9-NEXT: s_mov_b64 s[2:3], exec 312; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 313; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 314; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 315; GFX9-NEXT: ; implicit-def: $vgpr1 316; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc 317; GFX9-NEXT: 
s_cbranch_execz .LBB1_2 318; GFX9-NEXT: ; %bb.1: 319; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 320; GFX9-NEXT: s_waitcnt lgkmcnt(0) 321; GFX9-NEXT: s_mul_i32 s2, s6, s2 322; GFX9-NEXT: v_mov_b32_e32 v1, 0 323; GFX9-NEXT: v_mov_b32_e32 v2, s2 324; GFX9-NEXT: s_waitcnt lgkmcnt(0) 325; GFX9-NEXT: ds_add_rtn_u32 v1, v1, v2 326; GFX9-NEXT: s_waitcnt lgkmcnt(0) 327; GFX9-NEXT: .LBB1_2: 328; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] 329; GFX9-NEXT: s_waitcnt lgkmcnt(0) 330; GFX9-NEXT: v_mul_lo_u32 v0, s6, v0 331; GFX9-NEXT: v_readfirstlane_b32 s0, v1 332; GFX9-NEXT: s_mov_b32 s7, 0xf000 333; GFX9-NEXT: s_mov_b32 s6, -1 334; GFX9-NEXT: v_add_u32_e32 v0, s0, v0 335; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 336; GFX9-NEXT: s_endpgm 337; 338; GFX1064-LABEL: add_i32_uniform: 339; GFX1064: ; %bb.0: ; %entry 340; GFX1064-NEXT: s_clause 0x1 341; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 342; GFX1064-NEXT: s_load_dword s6, s[0:1], 0x2c 343; GFX1064-NEXT: s_mov_b64 s[2:3], exec 344; GFX1064-NEXT: ; implicit-def: $vgpr1 345; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 346; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 347; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 348; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc 349; GFX1064-NEXT: s_cbranch_execz .LBB1_2 350; GFX1064-NEXT: ; %bb.1: 351; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 352; GFX1064-NEXT: v_mov_b32_e32 v1, 0 353; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 354; GFX1064-NEXT: s_mul_i32 s2, s6, s2 355; GFX1064-NEXT: v_mov_b32_e32 v2, s2 356; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 357; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 358; GFX1064-NEXT: ds_add_rtn_u32 v1, v1, v2 359; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 360; GFX1064-NEXT: buffer_gl0_inv 361; GFX1064-NEXT: .LBB1_2: 362; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 363; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] 364; GFX1064-NEXT: v_readfirstlane_b32 s0, v1 365; GFX1064-NEXT: s_mov_b32 s7, 0x31016000 366; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 367; GFX1064-NEXT: v_mad_u64_u32 v[0:1], 
s[0:1], s6, v0, s[0:1] 368; GFX1064-NEXT: s_mov_b32 s6, -1 369; GFX1064-NEXT: buffer_store_dword v0, off, s[4:7], 0 370; GFX1064-NEXT: s_endpgm 371; 372; GFX1032-LABEL: add_i32_uniform: 373; GFX1032: ; %bb.0: ; %entry 374; GFX1032-NEXT: s_clause 0x1 375; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 376; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x2c 377; GFX1032-NEXT: s_mov_b32 s3, exec_lo 378; GFX1032-NEXT: ; implicit-def: $vgpr1 379; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 380; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 381; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo 382; GFX1032-NEXT: s_cbranch_execz .LBB1_2 383; GFX1032-NEXT: ; %bb.1: 384; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s3 385; GFX1032-NEXT: v_mov_b32_e32 v1, 0 386; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 387; GFX1032-NEXT: s_mul_i32 s1, s2, s1 388; GFX1032-NEXT: v_mov_b32_e32 v2, s1 389; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 390; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 391; GFX1032-NEXT: ds_add_rtn_u32 v1, v1, v2 392; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 393; GFX1032-NEXT: buffer_gl0_inv 394; GFX1032-NEXT: .LBB1_2: 395; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 396; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 397; GFX1032-NEXT: v_readfirstlane_b32 s0, v1 398; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 399; GFX1032-NEXT: s_mov_b32 s6, -1 400; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 401; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s0, s2, v0, s[0:1] 402; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0 403; GFX1032-NEXT: s_endpgm 404; 405; GFX1164-LABEL: add_i32_uniform: 406; GFX1164: ; %bb.0: ; %entry 407; GFX1164-NEXT: s_clause 0x1 408; GFX1164-NEXT: s_load_b64 s[4:5], s[0:1], 0x24 409; GFX1164-NEXT: s_load_b32 s6, s[0:1], 0x2c 410; GFX1164-NEXT: s_mov_b64 s[2:3], exec 411; GFX1164-NEXT: s_mov_b64 s[0:1], exec 412; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 413; GFX1164-NEXT: ; implicit-def: $vgpr1 414; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 415; GFX1164-NEXT: 
v_mbcnt_hi_u32_b32 v0, s3, v0 416; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 417; GFX1164-NEXT: s_cbranch_execz .LBB1_2 418; GFX1164-NEXT: ; %bb.1: 419; GFX1164-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 420; GFX1164-NEXT: v_mov_b32_e32 v1, 0 421; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 422; GFX1164-NEXT: s_mul_i32 s2, s6, s2 423; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 424; GFX1164-NEXT: v_mov_b32_e32 v2, s2 425; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 426; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 427; GFX1164-NEXT: ds_add_rtn_u32 v1, v1, v2 428; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 429; GFX1164-NEXT: buffer_gl0_inv 430; GFX1164-NEXT: .LBB1_2: 431; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] 432; GFX1164-NEXT: v_readfirstlane_b32 s0, v1 433; GFX1164-NEXT: s_mov_b32 s7, 0x31016000 434; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 435; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) 436; GFX1164-NEXT: v_mad_u64_u32 v[1:2], null, s6, v0, s[0:1] 437; GFX1164-NEXT: s_mov_b32 s6, -1 438; GFX1164-NEXT: buffer_store_b32 v1, off, s[4:7], 0 439; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 440; GFX1164-NEXT: s_endpgm 441; 442; GFX1132-LABEL: add_i32_uniform: 443; GFX1132: ; %bb.0: ; %entry 444; GFX1132-NEXT: s_clause 0x1 445; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x24 446; GFX1132-NEXT: s_load_b32 s0, s[0:1], 0x2c 447; GFX1132-NEXT: s_mov_b32 s2, exec_lo 448; GFX1132-NEXT: s_mov_b32 s1, exec_lo 449; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 450; GFX1132-NEXT: ; implicit-def: $vgpr1 451; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) 452; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 453; GFX1132-NEXT: s_cbranch_execz .LBB1_2 454; GFX1132-NEXT: ; %bb.1: 455; GFX1132-NEXT: s_bcnt1_i32_b32 s2, s2 456; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 457; GFX1132-NEXT: s_mul_i32 s2, s0, s2 458; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 459; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s2 460; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 461; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 462; 
GFX1132-NEXT: ds_add_rtn_u32 v1, v1, v2 463; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 464; GFX1132-NEXT: buffer_gl0_inv 465; GFX1132-NEXT: .LBB1_2: 466; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s1 467; GFX1132-NEXT: v_readfirstlane_b32 s2, v1 468; GFX1132-NEXT: s_mov_b32 s7, 0x31016000 469; GFX1132-NEXT: s_mov_b32 s6, -1 470; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 471; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) 472; GFX1132-NEXT: v_mad_u64_u32 v[1:2], null, s0, v0, s[2:3] 473; GFX1132-NEXT: buffer_store_b32 v1, off, s[4:7], 0 474; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 475; GFX1132-NEXT: s_endpgm 476entry: 477 %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %additive acq_rel 478 store i32 %old, i32 addrspace(1)* %out 479 ret void 480} 481 482define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out) { 483; 484; 485; GFX7LESS-LABEL: add_i32_varying: 486; GFX7LESS: ; %bb.0: ; %entry 487; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 488; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 489; GFX7LESS-NEXT: s_mov_b32 m0, -1 490; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 491; GFX7LESS-NEXT: ds_add_rtn_u32 v0, v1, v0 492; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 493; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 494; GFX7LESS-NEXT: s_mov_b32 s2, -1 495; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 496; GFX7LESS-NEXT: s_endpgm 497; 498; GFX8-LABEL: add_i32_varying: 499; GFX8: ; %bb.0: ; %entry 500; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 501; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 502; GFX8-NEXT: v_mov_b32_e32 v1, 0 503; GFX8-NEXT: s_mov_b64 exec, s[2:3] 504; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 505; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 506; GFX8-NEXT: v_mov_b32_e32 v2, v0 507; GFX8-NEXT: s_not_b64 exec, exec 508; GFX8-NEXT: v_mov_b32_e32 v2, 0 509; GFX8-NEXT: s_not_b64 exec, exec 510; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 511; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 512; GFX8-NEXT: s_nop 1 513; 
GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 514; GFX8-NEXT: s_nop 1 515; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 516; GFX8-NEXT: s_nop 1 517; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 518; GFX8-NEXT: s_nop 1 519; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 520; GFX8-NEXT: s_nop 1 521; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 522; GFX8-NEXT: v_readlane_b32 s4, v2, 63 523; GFX8-NEXT: s_nop 0 524; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 525; GFX8-NEXT: s_mov_b64 exec, s[2:3] 526; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 527; GFX8-NEXT: ; implicit-def: $vgpr0 528; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 529; GFX8-NEXT: s_cbranch_execz .LBB2_2 530; GFX8-NEXT: ; %bb.1: 531; GFX8-NEXT: v_mov_b32_e32 v0, 0 532; GFX8-NEXT: v_mov_b32_e32 v3, s4 533; GFX8-NEXT: s_mov_b32 m0, -1 534; GFX8-NEXT: s_waitcnt lgkmcnt(0) 535; GFX8-NEXT: ds_add_rtn_u32 v0, v0, v3 536; GFX8-NEXT: s_waitcnt lgkmcnt(0) 537; GFX8-NEXT: .LBB2_2: 538; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 539; GFX8-NEXT: s_waitcnt lgkmcnt(0) 540; GFX8-NEXT: v_readfirstlane_b32 s2, v0 541; GFX8-NEXT: v_mov_b32_e32 v0, v1 542; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0 543; GFX8-NEXT: s_mov_b32 s3, 0xf000 544; GFX8-NEXT: s_mov_b32 s2, -1 545; GFX8-NEXT: s_nop 0 546; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 547; GFX8-NEXT: s_endpgm 548; 549; GFX9-LABEL: add_i32_varying: 550; GFX9: ; %bb.0: ; %entry 551; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 552; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 553; GFX9-NEXT: v_mov_b32_e32 v1, 0 554; GFX9-NEXT: s_mov_b64 exec, s[2:3] 555; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 556; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 557; GFX9-NEXT: v_mov_b32_e32 v2, v0 558; GFX9-NEXT: s_not_b64 exec, exec 559; GFX9-NEXT: v_mov_b32_e32 
v2, 0 560; GFX9-NEXT: s_not_b64 exec, exec 561; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 562; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 563; GFX9-NEXT: s_nop 1 564; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 565; GFX9-NEXT: s_nop 1 566; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 567; GFX9-NEXT: s_nop 1 568; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 569; GFX9-NEXT: s_nop 1 570; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 571; GFX9-NEXT: s_nop 1 572; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 573; GFX9-NEXT: v_readlane_b32 s4, v2, 63 574; GFX9-NEXT: s_nop 0 575; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 576; GFX9-NEXT: s_mov_b64 exec, s[2:3] 577; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 578; GFX9-NEXT: ; implicit-def: $vgpr0 579; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 580; GFX9-NEXT: s_cbranch_execz .LBB2_2 581; GFX9-NEXT: ; %bb.1: 582; GFX9-NEXT: v_mov_b32_e32 v0, 0 583; GFX9-NEXT: v_mov_b32_e32 v3, s4 584; GFX9-NEXT: s_waitcnt lgkmcnt(0) 585; GFX9-NEXT: ds_add_rtn_u32 v0, v0, v3 586; GFX9-NEXT: s_waitcnt lgkmcnt(0) 587; GFX9-NEXT: .LBB2_2: 588; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 589; GFX9-NEXT: s_waitcnt lgkmcnt(0) 590; GFX9-NEXT: v_readfirstlane_b32 s2, v0 591; GFX9-NEXT: v_mov_b32_e32 v0, v1 592; GFX9-NEXT: v_add_u32_e32 v0, s2, v0 593; GFX9-NEXT: s_mov_b32 s3, 0xf000 594; GFX9-NEXT: s_mov_b32 s2, -1 595; GFX9-NEXT: s_nop 0 596; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 597; GFX9-NEXT: s_endpgm 598; 599; GFX1064-LABEL: add_i32_varying: 600; GFX1064: ; %bb.0: ; %entry 601; GFX1064-NEXT: v_mov_b32_e32 v1, v0 602; GFX1064-NEXT: s_not_b64 exec, exec 603; GFX1064-NEXT: v_mov_b32_e32 v1, 0 604; GFX1064-NEXT: s_not_b64 exec, exec 605; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 606; GFX1064-NEXT: 
v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 607; GFX1064-NEXT: v_mov_b32_e32 v3, 0 608; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 609; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 610; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 611; GFX1064-NEXT: v_mov_b32_e32 v2, v1 612; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 613; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 614; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 615; GFX1064-NEXT: v_mov_b32_e32 v2, s4 616; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 617; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 618; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 619; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 620; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 621; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 622; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 623; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 624; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 625; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 626; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 627; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 628; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 629; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 630; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 631; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 632; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 633; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 634; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 635; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 636; GFX1064-NEXT: s_mov_b32 s2, -1 637; GFX1064-NEXT: ; implicit-def: $vgpr0 638; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 639; GFX1064-NEXT: s_cbranch_execz .LBB2_2 640; GFX1064-NEXT: ; %bb.1: 641; GFX1064-NEXT: v_mov_b32_e32 v0, 0 642; GFX1064-NEXT: v_mov_b32_e32 v4, s7 643; GFX1064-NEXT: s_mov_b32 s3, 
s7 644; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 645; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 646; GFX1064-NEXT: ds_add_rtn_u32 v0, v0, v4 647; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 648; GFX1064-NEXT: buffer_gl0_inv 649; GFX1064-NEXT: .LBB2_2: 650; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 651; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 652; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 653; GFX1064-NEXT: v_mov_b32_e32 v0, v3 654; GFX1064-NEXT: v_add_nc_u32_e32 v0, s3, v0 655; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 656; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 657; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 658; GFX1064-NEXT: s_endpgm 659; 660; GFX1032-LABEL: add_i32_varying: 661; GFX1032: ; %bb.0: ; %entry 662; GFX1032-NEXT: v_mov_b32_e32 v1, v0 663; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 664; GFX1032-NEXT: v_mov_b32_e32 v1, 0 665; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 666; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 667; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 668; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 669; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 670; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 671; GFX1032-NEXT: v_mov_b32_e32 v2, v1 672; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 673; GFX1032-NEXT: s_mov_b32 exec_lo, s2 674; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 675; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 676; GFX1032-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 677; GFX1032-NEXT: v_mov_b32_e32 v3, 0 678; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 679; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 680; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 681; GFX1032-NEXT: s_mov_b32 exec_lo, s2 682; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 683; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 684; 
GFX1032-NEXT: v_writelane_b32 v3, s3, 16 685; GFX1032-NEXT: s_mov_b32 exec_lo, s2 686; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 687; GFX1032-NEXT: s_mov_b32 s2, -1 688; GFX1032-NEXT: ; implicit-def: $vgpr0 689; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 690; GFX1032-NEXT: s_cbranch_execz .LBB2_2 691; GFX1032-NEXT: ; %bb.1: 692; GFX1032-NEXT: v_mov_b32_e32 v0, 0 693; GFX1032-NEXT: v_mov_b32_e32 v4, s4 694; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 695; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 696; GFX1032-NEXT: ds_add_rtn_u32 v0, v0, v4 697; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 698; GFX1032-NEXT: buffer_gl0_inv 699; GFX1032-NEXT: .LBB2_2: 700; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 701; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 702; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 703; GFX1032-NEXT: v_mov_b32_e32 v0, v3 704; GFX1032-NEXT: v_add_nc_u32_e32 v0, s3, v0 705; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 706; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 707; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 708; GFX1032-NEXT: s_endpgm 709; 710; GFX1164-LABEL: add_i32_varying: 711; GFX1164: ; %bb.0: ; %entry 712; GFX1164-NEXT: v_mov_b32_e32 v1, v0 713; GFX1164-NEXT: s_not_b64 exec, exec 714; GFX1164-NEXT: v_mov_b32_e32 v1, 0 715; GFX1164-NEXT: s_not_b64 exec, exec 716; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 717; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) 718; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 719; GFX1164-NEXT: v_mov_b32_e32 v3, 0 720; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 721; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 722; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 723; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 724; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(VALU_DEP_1) 725; GFX1164-NEXT: v_mov_b32_e32 v2, v1 726; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1 727; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 728; GFX1164-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 729; GFX1164-NEXT: v_readlane_b32 s4, v1, 31 730; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 731; GFX1164-NEXT: v_mov_b32_e32 v2, s4 732; GFX1164-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 733; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) 734; GFX1164-NEXT: v_readlane_b32 s4, v1, 15 735; GFX1164-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 736; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 737; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 738; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 739; GFX1164-NEXT: v_readlane_b32 s5, v1, 31 740; GFX1164-NEXT: v_writelane_b32 v3, s4, 16 741; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 742; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 743; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 744; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 745; GFX1164-NEXT: v_readlane_b32 s7, v1, 63 746; GFX1164-NEXT: v_readlane_b32 s6, v1, 47 747; GFX1164-NEXT: v_writelane_b32 v3, s5, 32 748; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 749; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) 750; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 751; GFX1164-NEXT: s_or_saveexec_b64 s[4:5], -1 752; GFX1164-NEXT: v_writelane_b32 v3, s6, 48 753; GFX1164-NEXT: s_mov_b64 exec, s[4:5] 754; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 755; GFX1164-NEXT: s_mov_b32 s2, -1 756; GFX1164-NEXT: ; implicit-def: $vgpr0 757; GFX1164-NEXT: s_and_saveexec_b64 s[4:5], vcc 758; GFX1164-NEXT: s_cbranch_execz .LBB2_2 759; GFX1164-NEXT: ; %bb.1: 760; GFX1164-NEXT: v_mov_b32_e32 v0, 0 761; GFX1164-NEXT: v_mov_b32_e32 v4, s7 762; GFX1164-NEXT: s_mov_b32 
s3, s7 763; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 764; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 765; GFX1164-NEXT: ds_add_rtn_u32 v0, v0, v4 766; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 767; GFX1164-NEXT: buffer_gl0_inv 768; GFX1164-NEXT: .LBB2_2: 769; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] 770; GFX1164-NEXT: v_readfirstlane_b32 s3, v0 771; GFX1164-NEXT: v_mov_b32_e32 v0, v3 772; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) 773; GFX1164-NEXT: v_add_nc_u32_e32 v0, s3, v0 774; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 775; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 776; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 777; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 778; GFX1164-NEXT: s_endpgm 779; 780; GFX1132-LABEL: add_i32_varying: 781; GFX1132: ; %bb.0: ; %entry 782; GFX1132-NEXT: v_mov_b32_e32 v1, v0 783; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo 784; GFX1132-NEXT: v_mov_b32_e32 v1, 0 785; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo 786; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 787; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 788; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 789; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 790; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 791; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 792; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 793; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 794; GFX1132-NEXT: v_mov_b32_e32 v2, v1 795; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1 796; GFX1132-NEXT: s_mov_b32 exec_lo, s2 797; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 798; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 799; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) 800; GFX1132-NEXT: 
v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 801; GFX1132-NEXT: v_mov_b32_e32 v3, 0 802; GFX1132-NEXT: v_readlane_b32 s3, v1, 15 803; GFX1132-NEXT: v_readlane_b32 s4, v1, 31 804; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) 805; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 806; GFX1132-NEXT: s_mov_b32 exec_lo, s2 807; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 808; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 809; GFX1132-NEXT: v_writelane_b32 v3, s3, 16 810; GFX1132-NEXT: s_mov_b32 exec_lo, s2 811; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) 812; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 813; GFX1132-NEXT: s_mov_b32 s2, -1 814; GFX1132-NEXT: ; implicit-def: $vgpr0 815; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo 816; GFX1132-NEXT: s_cbranch_execz .LBB2_2 817; GFX1132-NEXT: ; %bb.1: 818; GFX1132-NEXT: v_mov_b32_e32 v0, 0 819; GFX1132-NEXT: v_mov_b32_e32 v4, s4 820; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 821; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 822; GFX1132-NEXT: ds_add_rtn_u32 v0, v0, v4 823; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 824; GFX1132-NEXT: buffer_gl0_inv 825; GFX1132-NEXT: .LBB2_2: 826; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 827; GFX1132-NEXT: v_readfirstlane_b32 s3, v0 828; GFX1132-NEXT: v_mov_b32_e32 v0, v3 829; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) 830; GFX1132-NEXT: v_add_nc_u32_e32 v0, s3, v0 831; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 832; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 833; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 834; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 835; GFX1132-NEXT: s_endpgm 836entry: 837 %lane = call i32 @llvm.amdgcn.workitem.id.x() 838 %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %lane acq_rel 839 store i32 %old, i32 addrspace(1)* %out 840 ret void 841} 842 843define amdgpu_kernel void @add_i32_varying_nouse() { 844; GFX7LESS-LABEL: add_i32_varying_nouse: 845; GFX7LESS: ; %bb.0: 
; %entry 846; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 847; GFX7LESS-NEXT: s_mov_b32 m0, -1 848; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 849; GFX7LESS-NEXT: ds_add_u32 v1, v0 850; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 851; GFX7LESS-NEXT: s_endpgm 852; 853; GFX8-LABEL: add_i32_varying_nouse: 854; GFX8: ; %bb.0: ; %entry 855; GFX8-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 856; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 857; GFX8-NEXT: v_mov_b32_e32 v1, v0 858; GFX8-NEXT: s_not_b64 exec, exec 859; GFX8-NEXT: v_mov_b32_e32 v1, 0 860; GFX8-NEXT: s_not_b64 exec, exec 861; GFX8-NEXT: s_or_saveexec_b64 s[0:1], -1 862; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 863; GFX8-NEXT: s_nop 1 864; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 865; GFX8-NEXT: s_nop 1 866; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 867; GFX8-NEXT: s_nop 1 868; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 869; GFX8-NEXT: s_nop 1 870; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf 871; GFX8-NEXT: s_nop 1 872; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf 873; GFX8-NEXT: v_readlane_b32 s2, v1, 63 874; GFX8-NEXT: s_mov_b64 exec, s[0:1] 875; GFX8-NEXT: s_mov_b32 s0, s2 876; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 877; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 878; GFX8-NEXT: s_cbranch_execz .LBB3_2 879; GFX8-NEXT: ; %bb.1: 880; GFX8-NEXT: v_mov_b32_e32 v0, 0 881; GFX8-NEXT: v_mov_b32_e32 v2, s0 882; GFX8-NEXT: s_mov_b32 m0, -1 883; GFX8-NEXT: s_waitcnt lgkmcnt(0) 884; GFX8-NEXT: ds_add_u32 v0, v2 885; GFX8-NEXT: s_waitcnt lgkmcnt(0) 886; GFX8-NEXT: .LBB3_2: 887; GFX8-NEXT: s_endpgm 888; 889; GFX9-LABEL: add_i32_varying_nouse: 890; GFX9: ; %bb.0: ; %entry 891; GFX9-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 892; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 893; 
GFX9-NEXT: v_mov_b32_e32 v1, v0 894; GFX9-NEXT: s_not_b64 exec, exec 895; GFX9-NEXT: v_mov_b32_e32 v1, 0 896; GFX9-NEXT: s_not_b64 exec, exec 897; GFX9-NEXT: s_or_saveexec_b64 s[0:1], -1 898; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 899; GFX9-NEXT: s_nop 1 900; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 901; GFX9-NEXT: s_nop 1 902; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 903; GFX9-NEXT: s_nop 1 904; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 905; GFX9-NEXT: s_nop 1 906; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf 907; GFX9-NEXT: s_nop 1 908; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf 909; GFX9-NEXT: v_readlane_b32 s2, v1, 63 910; GFX9-NEXT: s_mov_b64 exec, s[0:1] 911; GFX9-NEXT: s_mov_b32 s0, s2 912; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 913; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 914; GFX9-NEXT: s_cbranch_execz .LBB3_2 915; GFX9-NEXT: ; %bb.1: 916; GFX9-NEXT: v_mov_b32_e32 v0, 0 917; GFX9-NEXT: v_mov_b32_e32 v2, s0 918; GFX9-NEXT: s_waitcnt lgkmcnt(0) 919; GFX9-NEXT: ds_add_u32 v0, v2 920; GFX9-NEXT: s_waitcnt lgkmcnt(0) 921; GFX9-NEXT: .LBB3_2: 922; GFX9-NEXT: s_endpgm 923; 924; GFX1064-LABEL: add_i32_varying_nouse: 925; GFX1064: ; %bb.0: ; %entry 926; GFX1064-NEXT: v_mov_b32_e32 v1, v0 927; GFX1064-NEXT: s_not_b64 exec, exec 928; GFX1064-NEXT: v_mov_b32_e32 v1, 0 929; GFX1064-NEXT: s_not_b64 exec, exec 930; GFX1064-NEXT: s_or_saveexec_b64 s[0:1], -1 931; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 932; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 933; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 934; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf 
bank_mask:0xf bound_ctrl:1 935; GFX1064-NEXT: v_mov_b32_e32 v2, v1 936; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 937; GFX1064-NEXT: v_add_nc_u32_e32 v1, v1, v2 938; GFX1064-NEXT: s_mov_b64 exec, s[0:1] 939; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 940; GFX1064-NEXT: s_or_saveexec_b64 s[0:1], -1 941; GFX1064-NEXT: v_readlane_b32 s2, v1, 0 942; GFX1064-NEXT: v_readlane_b32 s3, v1, 32 943; GFX1064-NEXT: s_mov_b64 exec, s[0:1] 944; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 945; GFX1064-NEXT: s_add_i32 s0, s2, s3 946; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 947; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc 948; GFX1064-NEXT: s_cbranch_execz .LBB3_2 949; GFX1064-NEXT: ; %bb.1: 950; GFX1064-NEXT: v_mov_b32_e32 v0, 0 951; GFX1064-NEXT: v_mov_b32_e32 v3, s0 952; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 953; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 954; GFX1064-NEXT: ds_add_u32 v0, v3 955; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 956; GFX1064-NEXT: buffer_gl0_inv 957; GFX1064-NEXT: .LBB3_2: 958; GFX1064-NEXT: s_endpgm 959; 960; GFX1032-LABEL: add_i32_varying_nouse: 961; GFX1032: ; %bb.0: ; %entry 962; GFX1032-NEXT: v_mov_b32_e32 v1, v0 963; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 964; GFX1032-NEXT: v_mov_b32_e32 v1, 0 965; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 966; GFX1032-NEXT: s_or_saveexec_b32 s0, -1 967; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 968; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 969; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 970; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 971; GFX1032-NEXT: v_mov_b32_e32 v2, v1 972; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 973; GFX1032-NEXT: v_add_nc_u32_e32 v1, v1, v2 974; GFX1032-NEXT: s_mov_b32 exec_lo, s0 975; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 976; GFX1032-NEXT: v_mov_b32_e32 
v0, v1 977; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 978; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo 979; GFX1032-NEXT: s_cbranch_execz .LBB3_2 980; GFX1032-NEXT: ; %bb.1: 981; GFX1032-NEXT: v_mov_b32_e32 v3, 0 982; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 983; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 984; GFX1032-NEXT: ds_add_u32 v3, v0 985; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 986; GFX1032-NEXT: buffer_gl0_inv 987; GFX1032-NEXT: .LBB3_2: 988; GFX1032-NEXT: s_endpgm 989; 990; GFX1164-LABEL: add_i32_varying_nouse: 991; GFX1164: ; %bb.0: ; %entry 992; GFX1164-NEXT: v_mov_b32_e32 v1, v0 993; GFX1164-NEXT: s_not_b64 exec, exec 994; GFX1164-NEXT: v_mov_b32_e32 v1, 0 995; GFX1164-NEXT: s_not_b64 exec, exec 996; GFX1164-NEXT: s_or_saveexec_b64 s[0:1], -1 997; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 998; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 999; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 1000; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1001; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 1002; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 1003; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1004; GFX1164-NEXT: v_mov_b32_e32 v2, v1 1005; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1 1006; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1007; GFX1164-NEXT: v_add_nc_u32_e32 v1, v1, v2 1008; GFX1164-NEXT: v_permlane64_b32 v2, v1 1009; GFX1164-NEXT: s_mov_b64 exec, s[0:1] 1010; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) 1011; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 1012; GFX1164-NEXT: s_or_saveexec_b64 s[0:1], -1 1013; GFX1164-NEXT: v_add_nc_u32_e32 v1, v1, v2 
1014; GFX1164-NEXT: s_mov_b64 exec, s[0:1] 1015; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) 1016; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v0 1017; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) 1018; GFX1164-NEXT: v_mov_b32_e32 v0, v1 1019; GFX1164-NEXT: s_mov_b64 s[0:1], exec 1020; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v3 1021; GFX1164-NEXT: s_cbranch_execz .LBB3_2 1022; GFX1164-NEXT: ; %bb.1: 1023; GFX1164-NEXT: v_mov_b32_e32 v3, 0 1024; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1025; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 1026; GFX1164-NEXT: ds_add_u32 v3, v0 1027; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 1028; GFX1164-NEXT: buffer_gl0_inv 1029; GFX1164-NEXT: .LBB3_2: 1030; GFX1164-NEXT: s_endpgm 1031; 1032; GFX1132-LABEL: add_i32_varying_nouse: 1033; GFX1132: ; %bb.0: ; %entry 1034; GFX1132-NEXT: v_mov_b32_e32 v1, v0 1035; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo 1036; GFX1132-NEXT: v_mov_b32_e32 v1, 0 1037; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo 1038; GFX1132-NEXT: s_or_saveexec_b32 s0, -1 1039; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1040; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 1041; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 1042; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1043; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 1044; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 1045; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1046; GFX1132-NEXT: v_mov_b32_e32 v2, v1 1047; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1 1048; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) 1049; GFX1132-NEXT: v_add_nc_u32_e32 v1, v1, v2 1050; 
GFX1132-NEXT: s_mov_b32 exec_lo, s0 1051; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 1052; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) 1053; GFX1132-NEXT: v_mov_b32_e32 v0, v1 1054; GFX1132-NEXT: s_mov_b32 s0, exec_lo 1055; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v3 1056; GFX1132-NEXT: s_cbranch_execz .LBB3_2 1057; GFX1132-NEXT: ; %bb.1: 1058; GFX1132-NEXT: v_mov_b32_e32 v3, 0 1059; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1060; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 1061; GFX1132-NEXT: ds_add_u32 v3, v0 1062; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 1063; GFX1132-NEXT: buffer_gl0_inv 1064; GFX1132-NEXT: .LBB3_2: 1065; GFX1132-NEXT: s_endpgm 1066entry: 1067 %lane = call i32 @llvm.amdgcn.workitem.id.x() 1068 %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %lane acq_rel 1069 ret void 1070} 1071 1072define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) { 1073; 1074; 1075; GFX7LESS-LABEL: add_i64_constant: 1076; GFX7LESS: ; %bb.0: ; %entry 1077; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec 1078; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1079; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 1080; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s5, v0 1081; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 1082; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 1083; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc 1084; GFX7LESS-NEXT: s_cbranch_execz .LBB4_2 1085; GFX7LESS-NEXT: ; %bb.1: 1086; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 1087; GFX7LESS-NEXT: s_mul_i32 s4, s4, 5 1088; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 1089; GFX7LESS-NEXT: v_mov_b32_e32 v0, s4 1090; GFX7LESS-NEXT: s_mov_b32 m0, -1 1091; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1092; GFX7LESS-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] 1093; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1094; GFX7LESS-NEXT: .LBB4_2: 1095; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] 1096; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1097; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v0 1098; GFX7LESS-NEXT: 
v_readfirstlane_b32 s4, v1 1099; GFX7LESS-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 1100; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v2 1101; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 1102; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4 1103; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s2, v0 1104; GFX7LESS-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc 1105; GFX7LESS-NEXT: s_mov_b32 s2, -1 1106; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1107; GFX7LESS-NEXT: s_endpgm 1108; 1109; GFX8-LABEL: add_i64_constant: 1110; GFX8: ; %bb.0: ; %entry 1111; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1112; GFX8-NEXT: s_mov_b64 s[4:5], exec 1113; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 1114; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 1115; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 1116; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 1117; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 1118; GFX8-NEXT: s_cbranch_execz .LBB4_2 1119; GFX8-NEXT: ; %bb.1: 1120; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 1121; GFX8-NEXT: s_mul_i32 s4, s4, 5 1122; GFX8-NEXT: v_mov_b32_e32 v0, s4 1123; GFX8-NEXT: v_mov_b32_e32 v1, 0 1124; GFX8-NEXT: s_mov_b32 m0, -1 1125; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1126; GFX8-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] 1127; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1128; GFX8-NEXT: .LBB4_2: 1129; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 1130; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1131; GFX8-NEXT: v_readfirstlane_b32 s2, v0 1132; GFX8-NEXT: v_readfirstlane_b32 s3, v1 1133; GFX8-NEXT: v_mov_b32_e32 v0, s2 1134; GFX8-NEXT: v_mov_b32_e32 v1, s3 1135; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v2, 5, v[0:1] 1136; GFX8-NEXT: s_mov_b32 s3, 0xf000 1137; GFX8-NEXT: s_mov_b32 s2, -1 1138; GFX8-NEXT: s_nop 2 1139; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1140; GFX8-NEXT: s_endpgm 1141; 1142; GFX9-LABEL: add_i64_constant: 1143; GFX9: ; %bb.0: ; %entry 1144; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1145; GFX9-NEXT: s_mov_b64 s[4:5], exec 1146; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 1147; GFX9-NEXT: v_mbcnt_hi_u32_b32 
v2, s5, v0 1148; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 1149; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 1150; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 1151; GFX9-NEXT: s_cbranch_execz .LBB4_2 1152; GFX9-NEXT: ; %bb.1: 1153; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 1154; GFX9-NEXT: s_mul_i32 s4, s4, 5 1155; GFX9-NEXT: v_mov_b32_e32 v0, s4 1156; GFX9-NEXT: v_mov_b32_e32 v1, 0 1157; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1158; GFX9-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] 1159; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1160; GFX9-NEXT: .LBB4_2: 1161; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 1162; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1163; GFX9-NEXT: v_readfirstlane_b32 s2, v0 1164; GFX9-NEXT: v_readfirstlane_b32 s3, v1 1165; GFX9-NEXT: v_mov_b32_e32 v0, s2 1166; GFX9-NEXT: v_mov_b32_e32 v1, s3 1167; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v2, 5, v[0:1] 1168; GFX9-NEXT: s_mov_b32 s3, 0xf000 1169; GFX9-NEXT: s_mov_b32 s2, -1 1170; GFX9-NEXT: s_nop 2 1171; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1172; GFX9-NEXT: s_endpgm 1173; 1174; GFX1064-LABEL: add_i64_constant: 1175; GFX1064: ; %bb.0: ; %entry 1176; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1177; GFX1064-NEXT: s_mov_b64 s[4:5], exec 1178; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 1179; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 1180; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 1181; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 1182; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc 1183; GFX1064-NEXT: s_cbranch_execz .LBB4_2 1184; GFX1064-NEXT: ; %bb.1: 1185; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 1186; GFX1064-NEXT: v_mov_b32_e32 v1, 0 1187; GFX1064-NEXT: s_mul_i32 s4, s4, 5 1188; GFX1064-NEXT: v_mov_b32_e32 v0, s4 1189; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1190; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 1191; GFX1064-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] 1192; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1193; GFX1064-NEXT: buffer_gl0_inv 1194; GFX1064-NEXT: .LBB4_2: 1195; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 1196; 
GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] 1197; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 1198; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 1199; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v2, 5, s[2:3] 1200; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 1201; GFX1064-NEXT: s_mov_b32 s2, -1 1202; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1203; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1204; GFX1064-NEXT: s_endpgm 1205; 1206; GFX1032-LABEL: add_i64_constant: 1207; GFX1032: ; %bb.0: ; %entry 1208; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1209; GFX1032-NEXT: s_mov_b32 s3, exec_lo 1210; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 1211; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s3, 0 1212; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 1213; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo 1214; GFX1032-NEXT: s_cbranch_execz .LBB4_2 1215; GFX1032-NEXT: ; %bb.1: 1216; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 1217; GFX1032-NEXT: v_mov_b32_e32 v1, 0 1218; GFX1032-NEXT: s_mul_i32 s3, s3, 5 1219; GFX1032-NEXT: v_mov_b32_e32 v0, s3 1220; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1221; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 1222; GFX1032-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] 1223; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1224; GFX1032-NEXT: buffer_gl0_inv 1225; GFX1032-NEXT: .LBB4_2: 1226; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 1227; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 1228; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 1229; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 1230; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s2, v2, 5, s[2:3] 1231; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 1232; GFX1032-NEXT: s_mov_b32 s2, -1 1233; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1234; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1235; GFX1032-NEXT: s_endpgm 1236; 1237; GFX1164-LABEL: add_i64_constant: 1238; GFX1164: ; %bb.0: ; %entry 1239; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 1240; GFX1164-NEXT: s_mov_b64 s[4:5], exec 1241; GFX1164-NEXT: s_mov_b64 s[2:3], exec 1242; GFX1164-NEXT: 
v_mbcnt_lo_u32_b32 v0, s4, 0 1243; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1244; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 1245; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 1246; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 1247; GFX1164-NEXT: s_cbranch_execz .LBB4_2 1248; GFX1164-NEXT: ; %bb.1: 1249; GFX1164-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 1250; GFX1164-NEXT: v_mov_b32_e32 v1, 0 1251; GFX1164-NEXT: s_mul_i32 s4, s4, 5 1252; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 1253; GFX1164-NEXT: v_mov_b32_e32 v0, s4 1254; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1255; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 1256; GFX1164-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] 1257; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 1258; GFX1164-NEXT: buffer_gl0_inv 1259; GFX1164-NEXT: .LBB4_2: 1260; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] 1261; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 1262; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 1263; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) 1264; GFX1164-NEXT: v_mad_u64_u32 v[0:1], null, v2, 5, s[2:3] 1265; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 1266; GFX1164-NEXT: s_mov_b32 s2, -1 1267; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 1268; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 1269; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 1270; GFX1164-NEXT: s_endpgm 1271; 1272; GFX1132-LABEL: add_i64_constant: 1273; GFX1132: ; %bb.0: ; %entry 1274; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 1275; GFX1132-NEXT: s_mov_b32 s3, exec_lo 1276; GFX1132-NEXT: s_mov_b32 s2, exec_lo 1277; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s3, 0 1278; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 1279; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) 1280; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 1281; GFX1132-NEXT: s_cbranch_execz .LBB4_2 1282; GFX1132-NEXT: ; %bb.1: 1283; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s3 1284; GFX1132-NEXT: v_mov_b32_e32 v1, 0 1285; GFX1132-NEXT: s_mul_i32 s3, s3, 5 1286; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 
1287; GFX1132-NEXT: v_mov_b32_e32 v0, s3 1288; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1289; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 1290; GFX1132-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] 1291; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 1292; GFX1132-NEXT: buffer_gl0_inv 1293; GFX1132-NEXT: .LBB4_2: 1294; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2 1295; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 1296; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 1297; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) 1298; GFX1132-NEXT: v_mad_u64_u32 v[0:1], null, v2, 5, s[2:3] 1299; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 1300; GFX1132-NEXT: s_mov_b32 s2, -1 1301; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 1302; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 1303; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 1304; GFX1132-NEXT: s_endpgm 1305entry: 1306 %old = atomicrmw add i64 addrspace(3)* @local_var64, i64 5 acq_rel 1307 store i64 %old, i64 addrspace(1)* %out 1308 ret void 1309} 1310 1311define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 %additive) { 1312; 1313; 1314; GFX7LESS-LABEL: add_i64_uniform: 1315; GFX7LESS: ; %bb.0: ; %entry 1316; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec 1317; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 1318; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 1319; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s7, v0 1320; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 1321; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 1322; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc 1323; GFX7LESS-NEXT: s_cbranch_execz .LBB5_2 1324; GFX7LESS-NEXT: ; %bb.1: 1325; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 1326; GFX7LESS-NEXT: v_mov_b32_e32 v3, 0 1327; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1328; GFX7LESS-NEXT: s_mul_i32 s7, s3, s6 1329; GFX7LESS-NEXT: v_mov_b32_e32 v0, s6 1330; GFX7LESS-NEXT: v_mul_hi_u32 v0, s2, v0 1331; GFX7LESS-NEXT: s_mul_i32 s6, s2, s6 1332; GFX7LESS-NEXT: v_add_i32_e32 v1, vcc, s7, v0 1333; GFX7LESS-NEXT: v_mov_b32_e32 v0, s6 1334; GFX7LESS-NEXT: 
s_mov_b32 m0, -1 1335; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1336; GFX7LESS-NEXT: ds_add_rtn_u64 v[0:1], v3, v[0:1] 1337; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1338; GFX7LESS-NEXT: .LBB5_2: 1339; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] 1340; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 1341; GFX7LESS-NEXT: s_mov_b32 s6, -1 1342; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1343; GFX7LESS-NEXT: s_mov_b32 s4, s0 1344; GFX7LESS-NEXT: s_mov_b32 s5, s1 1345; GFX7LESS-NEXT: v_readfirstlane_b32 s0, v0 1346; GFX7LESS-NEXT: v_readfirstlane_b32 s1, v1 1347; GFX7LESS-NEXT: v_mul_lo_u32 v0, s3, v2 1348; GFX7LESS-NEXT: v_mul_hi_u32 v1, s2, v2 1349; GFX7LESS-NEXT: v_mul_lo_u32 v2, s2, v2 1350; GFX7LESS-NEXT: v_add_i32_e32 v1, vcc, v1, v0 1351; GFX7LESS-NEXT: v_mov_b32_e32 v3, s1 1352; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s0, v2 1353; GFX7LESS-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc 1354; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1355; GFX7LESS-NEXT: s_endpgm 1356; 1357; GFX8-LABEL: add_i64_uniform: 1358; GFX8: ; %bb.0: ; %entry 1359; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1360; GFX8-NEXT: s_mov_b64 s[6:7], exec 1361; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 1362; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 1363; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 1364; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 1365; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 1366; GFX8-NEXT: s_cbranch_execz .LBB5_2 1367; GFX8-NEXT: ; %bb.1: 1368; GFX8-NEXT: s_bcnt1_i32_b64 s8, s[6:7] 1369; GFX8-NEXT: v_mov_b32_e32 v0, s8 1370; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1371; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[6:7], s2, v0, 0 1372; GFX8-NEXT: s_mul_i32 s6, s3, s8 1373; GFX8-NEXT: v_mov_b32_e32 v3, 0 1374; GFX8-NEXT: v_add_u32_e32 v1, vcc, s6, v1 1375; GFX8-NEXT: s_mov_b32 m0, -1 1376; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1377; GFX8-NEXT: ds_add_rtn_u64 v[0:1], v3, v[0:1] 1378; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1379; GFX8-NEXT: .LBB5_2: 1380; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 1381; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1382; 
GFX8-NEXT: v_readfirstlane_b32 s4, v0 1383; GFX8-NEXT: v_readfirstlane_b32 s5, v1 1384; GFX8-NEXT: v_mov_b32_e32 v0, s4 1385; GFX8-NEXT: v_mov_b32_e32 v1, s5 1386; GFX8-NEXT: v_mul_lo_u32 v3, s3, v2 1387; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[2:3], s2, v2, v[0:1] 1388; GFX8-NEXT: s_mov_b32 s7, 0xf000 1389; GFX8-NEXT: s_mov_b32 s6, -1 1390; GFX8-NEXT: s_mov_b32 s4, s0 1391; GFX8-NEXT: s_mov_b32 s5, s1 1392; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 1393; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1394; GFX8-NEXT: s_endpgm 1395; 1396; GFX9-LABEL: add_i64_uniform: 1397; GFX9: ; %bb.0: ; %entry 1398; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1399; GFX9-NEXT: s_mov_b64 s[6:7], exec 1400; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 1401; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 1402; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 1403; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 1404; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 1405; GFX9-NEXT: s_cbranch_execz .LBB5_2 1406; GFX9-NEXT: ; %bb.1: 1407; GFX9-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 1408; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1409; GFX9-NEXT: s_mul_i32 s7, s3, s6 1410; GFX9-NEXT: s_mul_hi_u32 s8, s2, s6 1411; GFX9-NEXT: s_add_i32 s8, s8, s7 1412; GFX9-NEXT: s_mul_i32 s6, s2, s6 1413; GFX9-NEXT: v_mov_b32_e32 v0, s6 1414; GFX9-NEXT: v_mov_b32_e32 v1, s8 1415; GFX9-NEXT: v_mov_b32_e32 v3, 0 1416; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1417; GFX9-NEXT: ds_add_rtn_u64 v[0:1], v3, v[0:1] 1418; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1419; GFX9-NEXT: .LBB5_2: 1420; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 1421; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1422; GFX9-NEXT: v_readfirstlane_b32 s4, v0 1423; GFX9-NEXT: v_readfirstlane_b32 s5, v1 1424; GFX9-NEXT: v_mov_b32_e32 v0, s4 1425; GFX9-NEXT: v_mov_b32_e32 v1, s5 1426; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s2, v2, v[0:1] 1427; GFX9-NEXT: s_mov_b32 s7, 0xf000 1428; GFX9-NEXT: s_mov_b32 s6, -1 1429; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[2:3], s3, v2, v[1:2] 1430; GFX9-NEXT: s_mov_b32 s4, s0 1431; GFX9-NEXT: 
s_mov_b32 s5, s1 1432; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1433; GFX9-NEXT: s_endpgm 1434; 1435; GFX1064-LABEL: add_i64_uniform: 1436; GFX1064: ; %bb.0: ; %entry 1437; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1438; GFX1064-NEXT: s_mov_b64 s[6:7], exec 1439; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 1440; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 1441; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 1442; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 1443; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 1444; GFX1064-NEXT: s_cbranch_execz .LBB5_2 1445; GFX1064-NEXT: ; %bb.1: 1446; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 1447; GFX1064-NEXT: v_mov_b32_e32 v3, 0 1448; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1449; GFX1064-NEXT: s_mul_i32 s7, s3, s6 1450; GFX1064-NEXT: s_mul_hi_u32 s8, s2, s6 1451; GFX1064-NEXT: s_mul_i32 s6, s2, s6 1452; GFX1064-NEXT: s_add_i32 s8, s8, s7 1453; GFX1064-NEXT: v_mov_b32_e32 v0, s6 1454; GFX1064-NEXT: v_mov_b32_e32 v1, s8 1455; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1456; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 1457; GFX1064-NEXT: ds_add_rtn_u64 v[0:1], v3, v[0:1] 1458; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1459; GFX1064-NEXT: buffer_gl0_inv 1460; GFX1064-NEXT: .LBB5_2: 1461; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 1462; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 1463; GFX1064-NEXT: v_readfirstlane_b32 s4, v0 1464; GFX1064-NEXT: v_readfirstlane_b32 s5, v1 1465; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1466; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s2, v2, s[4:5] 1467; GFX1064-NEXT: v_mad_u64_u32 v[1:2], s[2:3], s3, v2, v[1:2] 1468; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 1469; GFX1064-NEXT: s_mov_b32 s2, -1 1470; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1471; GFX1064-NEXT: s_endpgm 1472; 1473; GFX1032-LABEL: add_i64_uniform: 1474; GFX1032: ; %bb.0: ; %entry 1475; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1476; GFX1032-NEXT: s_mov_b32 s5, exec_lo 1477; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 1478; 
GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0 1479; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 1480; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo 1481; GFX1032-NEXT: s_cbranch_execz .LBB5_2 1482; GFX1032-NEXT: ; %bb.1: 1483; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s5 1484; GFX1032-NEXT: v_mov_b32_e32 v3, 0 1485; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1486; GFX1032-NEXT: s_mul_i32 s6, s3, s5 1487; GFX1032-NEXT: s_mul_hi_u32 s7, s2, s5 1488; GFX1032-NEXT: s_mul_i32 s5, s2, s5 1489; GFX1032-NEXT: s_add_i32 s7, s7, s6 1490; GFX1032-NEXT: v_mov_b32_e32 v0, s5 1491; GFX1032-NEXT: v_mov_b32_e32 v1, s7 1492; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1493; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 1494; GFX1032-NEXT: ds_add_rtn_u64 v[0:1], v3, v[0:1] 1495; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1496; GFX1032-NEXT: buffer_gl0_inv 1497; GFX1032-NEXT: .LBB5_2: 1498; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 1499; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 1500; GFX1032-NEXT: v_readfirstlane_b32 s4, v0 1501; GFX1032-NEXT: v_readfirstlane_b32 s5, v1 1502; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1503; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s2, s2, v2, s[4:5] 1504; GFX1032-NEXT: v_mad_u64_u32 v[1:2], s2, s3, v2, v[1:2] 1505; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 1506; GFX1032-NEXT: s_mov_b32 s2, -1 1507; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1508; GFX1032-NEXT: s_endpgm 1509; 1510; GFX1164-LABEL: add_i64_uniform: 1511; GFX1164: ; %bb.0: ; %entry 1512; GFX1164-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 1513; GFX1164-NEXT: s_mov_b64 s[6:7], exec 1514; GFX1164-NEXT: s_mov_b64 s[4:5], exec 1515; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 1516; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1517; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 1518; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 1519; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 1520; GFX1164-NEXT: s_cbranch_execz .LBB5_2 1521; GFX1164-NEXT: ; %bb.1: 1522; GFX1164-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 1523; 
GFX1164-NEXT: v_mov_b32_e32 v3, 0 1524; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 1525; GFX1164-NEXT: s_mul_i32 s7, s3, s6 1526; GFX1164-NEXT: s_mul_hi_u32 s8, s2, s6 1527; GFX1164-NEXT: s_mul_i32 s6, s2, s6 1528; GFX1164-NEXT: s_add_i32 s8, s8, s7 1529; GFX1164-NEXT: v_mov_b32_e32 v0, s6 1530; GFX1164-NEXT: v_mov_b32_e32 v1, s8 1531; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1532; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 1533; GFX1164-NEXT: ds_add_rtn_u64 v[0:1], v3, v[0:1] 1534; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 1535; GFX1164-NEXT: buffer_gl0_inv 1536; GFX1164-NEXT: .LBB5_2: 1537; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] 1538; GFX1164-NEXT: v_readfirstlane_b32 s4, v0 1539; GFX1164-NEXT: v_readfirstlane_b32 s5, v1 1540; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 1541; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) 1542; GFX1164-NEXT: v_mad_u64_u32 v[0:1], null, s2, v2, s[4:5] 1543; GFX1164-NEXT: s_mov_b32 s2, -1 1544; GFX1164-NEXT: v_mad_u64_u32 v[3:4], null, s3, v2, v[1:2] 1545; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 1546; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) 1547; GFX1164-NEXT: v_mov_b32_e32 v1, v3 1548; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 1549; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 1550; GFX1164-NEXT: s_endpgm 1551; 1552; GFX1132-LABEL: add_i64_uniform: 1553; GFX1132: ; %bb.0: ; %entry 1554; GFX1132-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 1555; GFX1132-NEXT: s_mov_b32 s5, exec_lo 1556; GFX1132-NEXT: s_mov_b32 s4, exec_lo 1557; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0 1558; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 1559; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) 1560; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 1561; GFX1132-NEXT: s_cbranch_execz .LBB5_2 1562; GFX1132-NEXT: ; %bb.1: 1563; GFX1132-NEXT: s_bcnt1_i32_b32 s5, s5 1564; GFX1132-NEXT: v_mov_b32_e32 v3, 0 1565; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 1566; GFX1132-NEXT: s_mul_i32 s6, s3, s5 1567; GFX1132-NEXT: s_mul_hi_u32 s7, s2, s5 1568; 
GFX1132-NEXT: s_mul_i32 s5, s2, s5 1569; GFX1132-NEXT: s_add_i32 s7, s7, s6 1570; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 1571; GFX1132-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s7 1572; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1573; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 1574; GFX1132-NEXT: ds_add_rtn_u64 v[0:1], v3, v[0:1] 1575; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 1576; GFX1132-NEXT: buffer_gl0_inv 1577; GFX1132-NEXT: .LBB5_2: 1578; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s4 1579; GFX1132-NEXT: v_readfirstlane_b32 s4, v0 1580; GFX1132-NEXT: v_readfirstlane_b32 s5, v1 1581; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 1582; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) 1583; GFX1132-NEXT: v_mad_u64_u32 v[0:1], null, s2, v2, s[4:5] 1584; GFX1132-NEXT: s_mov_b32 s2, -1 1585; GFX1132-NEXT: v_mad_u64_u32 v[3:4], null, s3, v2, v[1:2] 1586; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 1587; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) 1588; GFX1132-NEXT: v_mov_b32_e32 v1, v3 1589; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 1590; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 1591; GFX1132-NEXT: s_endpgm 1592entry: 1593 %old = atomicrmw add i64 addrspace(3)* @local_var64, i64 %additive acq_rel 1594 store i64 %old, i64 addrspace(1)* %out 1595 ret void 1596} 1597 1598define amdgpu_kernel void @add_i64_varying(i64 addrspace(1)* %out) { 1599; The addend is the zero-extended workitem id, so it differs per lane. Every check block below expects one plain ds_add_rtn_u64 with no exec masking, mbcnt or readfirstlane sequence around it. NOTE(review) presumably the atomic optimization is not applied to 64-bit divergent values - confirm against the pass. 1600; 1601; GFX7LESS-LABEL: add_i64_varying: 1602; GFX7LESS: ; %bb.0: ; %entry 1603; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1604; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 1605; GFX7LESS-NEXT: s_mov_b32 m0, -1 1606; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1607; GFX7LESS-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] 1608; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1609; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 1610; GFX7LESS-NEXT: s_mov_b32 s2, -1 1611; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1612; GFX7LESS-NEXT: s_endpgm 1613; 1614; GFX8-LABEL: add_i64_varying: 1615; GFX8: ; 
%bb.0: ; %entry 1616; GFX8-NEXT: v_mov_b32_e32 v1, 0 1617; GFX8-NEXT: s_mov_b32 m0, -1 1618; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1619; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1620; GFX8-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] 1621; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1622; GFX8-NEXT: s_mov_b32 s3, 0xf000 1623; GFX8-NEXT: s_mov_b32 s2, -1 1624; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1625; GFX8-NEXT: s_endpgm 1626; 1627; GFX9-LABEL: add_i64_varying: 1628; GFX9: ; %bb.0: ; %entry 1629; GFX9-NEXT: v_mov_b32_e32 v1, 0 1630; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1631; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1632; GFX9-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] 1633; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1634; GFX9-NEXT: s_mov_b32 s3, 0xf000 1635; GFX9-NEXT: s_mov_b32 s2, -1 1636; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1637; GFX9-NEXT: s_endpgm 1638; 1639; GFX10-LABEL: add_i64_varying: 1640; GFX10: ; %bb.0: ; %entry 1641; GFX10-NEXT: v_mov_b32_e32 v1, 0 1642; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1643; GFX10-NEXT: s_mov_b32 s3, 0x31016000 1644; GFX10-NEXT: s_mov_b32 s2, -1 1645; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1646; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1647; GFX10-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] 1648; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1649; GFX10-NEXT: buffer_gl0_inv 1650; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1651; GFX10-NEXT: s_endpgm 1652; 1653; GFX11-LABEL: add_i64_varying: 1654; GFX11: ; %bb.0: ; %entry 1655; GFX11-NEXT: v_mov_b32_e32 v1, 0 1656; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 1657; GFX11-NEXT: s_mov_b32 s3, 0x31016000 1658; GFX11-NEXT: s_mov_b32 s2, -1 1659; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1660; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 1661; GFX11-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] 1662; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1663; GFX11-NEXT: buffer_gl0_inv 1664; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 1665; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 1666; 
GFX11-NEXT: s_endpgm 1667entry: 1668 %lane = call i32 @llvm.amdgcn.workitem.id.x() 1669 %zext = zext i32 %lane to i64 1670 %old = atomicrmw add i64 addrspace(3)* @local_var64, i64 %zext acq_rel 1671 store i64 %old, i64 addrspace(1)* %out 1672 ret void 1673} 1674 1675define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out) { 1676; Subtracts the constant 5 from @local_var32. Expected code in every check block below - mbcnt over a copy of exec gives each lane a count, only the lane whose count is 0 issues a single ds_sub_rtn_u32 of 5 times the popcount of that exec copy, and each lane then computes its stored value as readfirstlane of the returned value minus 5 times its own count. 1677; 1678; GFX7LESS-LABEL: sub_i32_constant: 1679; GFX7LESS: ; %bb.0: ; %entry 1680; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec 1681; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1682; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 1683; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 1684; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1685; GFX7LESS-NEXT: ; implicit-def: $vgpr1 1686; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc 1687; GFX7LESS-NEXT: s_cbranch_execz .LBB7_2 1688; GFX7LESS-NEXT: ; %bb.1: 1689; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1690; GFX7LESS-NEXT: s_mul_i32 s2, s2, 5 1691; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 1692; GFX7LESS-NEXT: v_mov_b32_e32 v2, s2 1693; GFX7LESS-NEXT: s_mov_b32 m0, -1 1694; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1695; GFX7LESS-NEXT: ds_sub_rtn_u32 v1, v1, v2 1696; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1697; GFX7LESS-NEXT: .LBB7_2: 1698; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] 1699; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1700; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v1 1701; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1702; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 1703; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 1704; GFX7LESS-NEXT: s_mov_b32 s2, -1 1705; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 1706; GFX7LESS-NEXT: s_endpgm 1707; 1708; GFX8-LABEL: sub_i32_constant: 1709; GFX8: ; %bb.0: ; %entry 1710; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1711; GFX8-NEXT: s_mov_b64 s[2:3], exec 1712; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1713; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 1714; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1715; GFX8-NEXT: ; implicit-def: $vgpr1 1716; GFX8-NEXT: 
s_and_saveexec_b64 s[4:5], vcc 1717; GFX8-NEXT: s_cbranch_execz .LBB7_2 1718; GFX8-NEXT: ; %bb.1: 1719; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1720; GFX8-NEXT: s_mul_i32 s2, s2, 5 1721; GFX8-NEXT: v_mov_b32_e32 v1, 0 1722; GFX8-NEXT: v_mov_b32_e32 v2, s2 1723; GFX8-NEXT: s_mov_b32 m0, -1 1724; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1725; GFX8-NEXT: ds_sub_rtn_u32 v1, v1, v2 1726; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1727; GFX8-NEXT: .LBB7_2: 1728; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 1729; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1730; GFX8-NEXT: v_readfirstlane_b32 s2, v1 1731; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1732; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s2, v0 1733; GFX8-NEXT: s_mov_b32 s3, 0xf000 1734; GFX8-NEXT: s_mov_b32 s2, -1 1735; GFX8-NEXT: s_nop 0 1736; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 1737; GFX8-NEXT: s_endpgm 1738; 1739; GFX9-LABEL: sub_i32_constant: 1740; GFX9: ; %bb.0: ; %entry 1741; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1742; GFX9-NEXT: s_mov_b64 s[2:3], exec 1743; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1744; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 1745; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1746; GFX9-NEXT: ; implicit-def: $vgpr1 1747; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 1748; GFX9-NEXT: s_cbranch_execz .LBB7_2 1749; GFX9-NEXT: ; %bb.1: 1750; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1751; GFX9-NEXT: s_mul_i32 s2, s2, 5 1752; GFX9-NEXT: v_mov_b32_e32 v1, 0 1753; GFX9-NEXT: v_mov_b32_e32 v2, s2 1754; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1755; GFX9-NEXT: ds_sub_rtn_u32 v1, v1, v2 1756; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1757; GFX9-NEXT: .LBB7_2: 1758; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 1759; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1760; GFX9-NEXT: v_readfirstlane_b32 s2, v1 1761; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1762; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 1763; GFX9-NEXT: s_mov_b32 s3, 0xf000 1764; GFX9-NEXT: s_mov_b32 s2, -1 1765; GFX9-NEXT: s_nop 0 1766; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 1767; GFX9-NEXT: s_endpgm 1768; 1769; 
GFX1064-LABEL: sub_i32_constant: 1770; GFX1064: ; %bb.0: ; %entry 1771; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1772; GFX1064-NEXT: s_mov_b64 s[2:3], exec 1773; GFX1064-NEXT: ; implicit-def: $vgpr1 1774; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1775; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 1776; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1777; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 1778; GFX1064-NEXT: s_cbranch_execz .LBB7_2 1779; GFX1064-NEXT: ; %bb.1: 1780; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1781; GFX1064-NEXT: v_mov_b32_e32 v1, 0 1782; GFX1064-NEXT: s_mul_i32 s2, s2, 5 1783; GFX1064-NEXT: v_mov_b32_e32 v2, s2 1784; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1785; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 1786; GFX1064-NEXT: ds_sub_rtn_u32 v1, v1, v2 1787; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1788; GFX1064-NEXT: buffer_gl0_inv 1789; GFX1064-NEXT: .LBB7_2: 1790; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 1791; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 1792; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 1793; GFX1064-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1794; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 1795; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s2, v0 1796; GFX1064-NEXT: s_mov_b32 s2, -1 1797; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1798; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 1799; GFX1064-NEXT: s_endpgm 1800; 1801; GFX1032-LABEL: sub_i32_constant: 1802; GFX1032: ; %bb.0: ; %entry 1803; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1804; GFX1032-NEXT: s_mov_b32 s3, exec_lo 1805; GFX1032-NEXT: ; implicit-def: $vgpr1 1806; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 1807; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 1808; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo 1809; GFX1032-NEXT: s_cbranch_execz .LBB7_2 1810; GFX1032-NEXT: ; %bb.1: 1811; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 1812; GFX1032-NEXT: v_mov_b32_e32 v1, 0 1813; GFX1032-NEXT: s_mul_i32 s3, s3, 5 1814; GFX1032-NEXT: v_mov_b32_e32 v2, s3 1815; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 
1816; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 1817; GFX1032-NEXT: ds_sub_rtn_u32 v1, v1, v2 1818; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1819; GFX1032-NEXT: buffer_gl0_inv 1820; GFX1032-NEXT: .LBB7_2: 1821; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 1822; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 1823; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 1824; GFX1032-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1825; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 1826; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s2, v0 1827; GFX1032-NEXT: s_mov_b32 s2, -1 1828; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1829; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 1830; GFX1032-NEXT: s_endpgm 1831; 1832; GFX1164-LABEL: sub_i32_constant: 1833; GFX1164: ; %bb.0: ; %entry 1834; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 1835; GFX1164-NEXT: s_mov_b64 s[2:3], exec 1836; GFX1164-NEXT: s_mov_b64 s[4:5], exec 1837; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1838; GFX1164-NEXT: ; implicit-def: $vgpr1 1839; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1840; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 1841; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 1842; GFX1164-NEXT: s_cbranch_execz .LBB7_2 1843; GFX1164-NEXT: ; %bb.1: 1844; GFX1164-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1845; GFX1164-NEXT: v_mov_b32_e32 v1, 0 1846; GFX1164-NEXT: s_mul_i32 s2, s2, 5 1847; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 1848; GFX1164-NEXT: v_mov_b32_e32 v2, s2 1849; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1850; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 1851; GFX1164-NEXT: ds_sub_rtn_u32 v1, v1, v2 1852; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 1853; GFX1164-NEXT: buffer_gl0_inv 1854; GFX1164-NEXT: .LBB7_2: 1855; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] 1856; GFX1164-NEXT: v_readfirstlane_b32 s2, v1 1857; GFX1164-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1858; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 1859; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) 1860; GFX1164-NEXT: v_sub_nc_u32_e32 v0, s2, v0 1861; GFX1164-NEXT: s_mov_b32 
s2, -1 1862; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 1863; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 1864; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 1865; GFX1164-NEXT: s_endpgm 1866; 1867; GFX1132-LABEL: sub_i32_constant: 1868; GFX1132: ; %bb.0: ; %entry 1869; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 1870; GFX1132-NEXT: s_mov_b32 s3, exec_lo 1871; GFX1132-NEXT: s_mov_b32 s2, exec_lo 1872; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 1873; GFX1132-NEXT: ; implicit-def: $vgpr1 1874; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) 1875; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 1876; GFX1132-NEXT: s_cbranch_execz .LBB7_2 1877; GFX1132-NEXT: ; %bb.1: 1878; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s3 1879; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) 1880; GFX1132-NEXT: s_mul_i32 s3, s3, 5 1881; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s3 1882; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1883; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 1884; GFX1132-NEXT: ds_sub_rtn_u32 v1, v1, v2 1885; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 1886; GFX1132-NEXT: buffer_gl0_inv 1887; GFX1132-NEXT: .LBB7_2: 1888; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2 1889; GFX1132-NEXT: v_readfirstlane_b32 s2, v1 1890; GFX1132-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1891; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 1892; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) 1893; GFX1132-NEXT: v_sub_nc_u32_e32 v0, s2, v0 1894; GFX1132-NEXT: s_mov_b32 s2, -1 1895; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 1896; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 1897; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 1898; GFX1132-NEXT: s_endpgm 1899entry: 1900 %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 5 acq_rel 1901 store i32 %old, i32 addrspace(1)* %out 1902 ret void 1903} 1904 1905define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 %subitive) { 1906; Subtracts the kernel argument %subitive from @local_var32. Expected code in every check block below - the argument is fetched with a scalar load, the lane whose mbcnt result is 0 issues a single ds_sub_rtn_u32 of the argument times the popcount of the saved exec copy (s_mul_i32), and each lane then computes its stored value as readfirstlane of the returned value minus the argument times its own mbcnt result (v_mul_lo_u32). 1907; 1908; GFX7LESS-LABEL: sub_i32_uniform: 1909; GFX7LESS: ; %bb.0: ; %entry 1910; 
GFX7LESS-NEXT: s_mov_b64 s[2:3], exec 1911; GFX7LESS-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 1912; GFX7LESS-NEXT: s_load_dword s6, s[0:1], 0xb 1913; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 1914; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 1915; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1916; GFX7LESS-NEXT: ; implicit-def: $vgpr1 1917; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc 1918; GFX7LESS-NEXT: s_cbranch_execz .LBB8_2 1919; GFX7LESS-NEXT: ; %bb.1: 1920; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1921; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1922; GFX7LESS-NEXT: s_mul_i32 s2, s6, s2 1923; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 1924; GFX7LESS-NEXT: v_mov_b32_e32 v2, s2 1925; GFX7LESS-NEXT: s_mov_b32 m0, -1 1926; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1927; GFX7LESS-NEXT: ds_sub_rtn_u32 v1, v1, v2 1928; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1929; GFX7LESS-NEXT: .LBB8_2: 1930; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1] 1931; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1932; GFX7LESS-NEXT: v_readfirstlane_b32 s0, v1 1933; GFX7LESS-NEXT: v_mul_lo_u32 v0, s6, v0 1934; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 1935; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 1936; GFX7LESS-NEXT: s_mov_b32 s6, -1 1937; GFX7LESS-NEXT: buffer_store_dword v0, off, s[4:7], 0 1938; GFX7LESS-NEXT: s_endpgm 1939; 1940; GFX8-LABEL: sub_i32_uniform: 1941; GFX8: ; %bb.0: ; %entry 1942; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 1943; GFX8-NEXT: s_load_dword s6, s[0:1], 0x2c 1944; GFX8-NEXT: s_mov_b64 s[2:3], exec 1945; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1946; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 1947; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1948; GFX8-NEXT: ; implicit-def: $vgpr1 1949; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc 1950; GFX8-NEXT: s_cbranch_execz .LBB8_2 1951; GFX8-NEXT: ; %bb.1: 1952; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1953; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1954; GFX8-NEXT: s_mul_i32 s2, s6, s2 1955; GFX8-NEXT: v_mov_b32_e32 v1, 0 1956; GFX8-NEXT: v_mov_b32_e32 v2, s2 1957; 
GFX8-NEXT: s_mov_b32 m0, -1 1958; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1959; GFX8-NEXT: ds_sub_rtn_u32 v1, v1, v2 1960; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1961; GFX8-NEXT: .LBB8_2: 1962; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] 1963; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1964; GFX8-NEXT: v_mul_lo_u32 v0, s6, v0 1965; GFX8-NEXT: v_readfirstlane_b32 s0, v1 1966; GFX8-NEXT: s_mov_b32 s7, 0xf000 1967; GFX8-NEXT: s_mov_b32 s6, -1 1968; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s0, v0 1969; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0 1970; GFX8-NEXT: s_endpgm 1971; 1972; GFX9-LABEL: sub_i32_uniform: 1973; GFX9: ; %bb.0: ; %entry 1974; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 1975; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c 1976; GFX9-NEXT: s_mov_b64 s[2:3], exec 1977; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1978; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 1979; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1980; GFX9-NEXT: ; implicit-def: $vgpr1 1981; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc 1982; GFX9-NEXT: s_cbranch_execz .LBB8_2 1983; GFX9-NEXT: ; %bb.1: 1984; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1985; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1986; GFX9-NEXT: s_mul_i32 s2, s6, s2 1987; GFX9-NEXT: v_mov_b32_e32 v1, 0 1988; GFX9-NEXT: v_mov_b32_e32 v2, s2 1989; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1990; GFX9-NEXT: ds_sub_rtn_u32 v1, v1, v2 1991; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1992; GFX9-NEXT: .LBB8_2: 1993; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] 1994; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1995; GFX9-NEXT: v_mul_lo_u32 v0, s6, v0 1996; GFX9-NEXT: v_readfirstlane_b32 s0, v1 1997; GFX9-NEXT: s_mov_b32 s7, 0xf000 1998; GFX9-NEXT: s_mov_b32 s6, -1 1999; GFX9-NEXT: v_sub_u32_e32 v0, s0, v0 2000; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 2001; GFX9-NEXT: s_endpgm 2002; 2003; GFX1064-LABEL: sub_i32_uniform: 2004; GFX1064: ; %bb.0: ; %entry 2005; GFX1064-NEXT: s_clause 0x1 2006; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 2007; GFX1064-NEXT: s_load_dword s6, s[0:1], 0x2c 2008; GFX1064-NEXT: s_mov_b64 
s[2:3], exec 2009; GFX1064-NEXT: ; implicit-def: $vgpr1 2010; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 2011; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 2012; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2013; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc 2014; GFX1064-NEXT: s_cbranch_execz .LBB8_2 2015; GFX1064-NEXT: ; %bb.1: 2016; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 2017; GFX1064-NEXT: v_mov_b32_e32 v1, 0 2018; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2019; GFX1064-NEXT: s_mul_i32 s2, s6, s2 2020; GFX1064-NEXT: v_mov_b32_e32 v2, s2 2021; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2022; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 2023; GFX1064-NEXT: ds_sub_rtn_u32 v1, v1, v2 2024; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2025; GFX1064-NEXT: buffer_gl0_inv 2026; GFX1064-NEXT: .LBB8_2: 2027; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 2028; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] 2029; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2030; GFX1064-NEXT: v_mul_lo_u32 v0, s6, v0 2031; GFX1064-NEXT: v_readfirstlane_b32 s0, v1 2032; GFX1064-NEXT: s_mov_b32 s7, 0x31016000 2033; GFX1064-NEXT: s_mov_b32 s6, -1 2034; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s0, v0 2035; GFX1064-NEXT: buffer_store_dword v0, off, s[4:7], 0 2036; GFX1064-NEXT: s_endpgm 2037; 2038; GFX1032-LABEL: sub_i32_uniform: 2039; GFX1032: ; %bb.0: ; %entry 2040; GFX1032-NEXT: s_clause 0x1 2041; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 2042; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x2c 2043; GFX1032-NEXT: s_mov_b32 s3, exec_lo 2044; GFX1032-NEXT: ; implicit-def: $vgpr1 2045; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 2046; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 2047; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo 2048; GFX1032-NEXT: s_cbranch_execz .LBB8_2 2049; GFX1032-NEXT: ; %bb.1: 2050; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s3 2051; GFX1032-NEXT: v_mov_b32_e32 v1, 0 2052; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2053; GFX1032-NEXT: s_mul_i32 s1, s2, s1 2054; GFX1032-NEXT: v_mov_b32_e32 v2, s1 2055; GFX1032-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) 2056; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 2057; GFX1032-NEXT: ds_sub_rtn_u32 v1, v1, v2 2058; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2059; GFX1032-NEXT: buffer_gl0_inv 2060; GFX1032-NEXT: .LBB8_2: 2061; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 2062; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 2063; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2064; GFX1032-NEXT: v_mul_lo_u32 v0, s2, v0 2065; GFX1032-NEXT: v_readfirstlane_b32 s0, v1 2066; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 2067; GFX1032-NEXT: s_mov_b32 s6, -1 2068; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s0, v0 2069; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0 2070; GFX1032-NEXT: s_endpgm 2071; 2072; GFX1164-LABEL: sub_i32_uniform: 2073; GFX1164: ; %bb.0: ; %entry 2074; GFX1164-NEXT: s_clause 0x1 2075; GFX1164-NEXT: s_load_b64 s[4:5], s[0:1], 0x24 2076; GFX1164-NEXT: s_load_b32 s6, s[0:1], 0x2c 2077; GFX1164-NEXT: s_mov_b64 s[2:3], exec 2078; GFX1164-NEXT: s_mov_b64 s[0:1], exec 2079; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 2080; GFX1164-NEXT: ; implicit-def: $vgpr1 2081; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2082; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 2083; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 2084; GFX1164-NEXT: s_cbranch_execz .LBB8_2 2085; GFX1164-NEXT: ; %bb.1: 2086; GFX1164-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 2087; GFX1164-NEXT: v_mov_b32_e32 v1, 0 2088; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 2089; GFX1164-NEXT: s_mul_i32 s2, s6, s2 2090; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 2091; GFX1164-NEXT: v_mov_b32_e32 v2, s2 2092; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2093; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 2094; GFX1164-NEXT: ds_sub_rtn_u32 v1, v1, v2 2095; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 2096; GFX1164-NEXT: buffer_gl0_inv 2097; GFX1164-NEXT: .LBB8_2: 2098; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] 2099; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 2100; GFX1164-NEXT: v_mul_lo_u32 v0, s6, v0 2101; GFX1164-NEXT: v_readfirstlane_b32 s0, v1 2102; 
GFX1164-NEXT: s_mov_b32 s7, 0x31016000 2103; GFX1164-NEXT: s_mov_b32 s6, -1 2104; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) 2105; GFX1164-NEXT: v_sub_nc_u32_e32 v0, s0, v0 2106; GFX1164-NEXT: buffer_store_b32 v0, off, s[4:7], 0 2107; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 2108; GFX1164-NEXT: s_endpgm 2109; 2110; GFX1132-LABEL: sub_i32_uniform: 2111; GFX1132: ; %bb.0: ; %entry 2112; GFX1132-NEXT: s_clause 0x1 2113; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x24 2114; GFX1132-NEXT: s_load_b32 s0, s[0:1], 0x2c 2115; GFX1132-NEXT: s_mov_b32 s2, exec_lo 2116; GFX1132-NEXT: s_mov_b32 s1, exec_lo 2117; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 2118; GFX1132-NEXT: ; implicit-def: $vgpr1 2119; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) 2120; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 2121; GFX1132-NEXT: s_cbranch_execz .LBB8_2 2122; GFX1132-NEXT: ; %bb.1: 2123; GFX1132-NEXT: s_bcnt1_i32_b32 s2, s2 2124; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 2125; GFX1132-NEXT: s_mul_i32 s2, s0, s2 2126; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 2127; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s2 2128; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2129; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 2130; GFX1132-NEXT: ds_sub_rtn_u32 v1, v1, v2 2131; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 2132; GFX1132-NEXT: buffer_gl0_inv 2133; GFX1132-NEXT: .LBB8_2: 2134; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s1 2135; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 2136; GFX1132-NEXT: v_mul_lo_u32 v0, s0, v0 2137; GFX1132-NEXT: v_readfirstlane_b32 s0, v1 2138; GFX1132-NEXT: s_mov_b32 s7, 0x31016000 2139; GFX1132-NEXT: s_mov_b32 s6, -1 2140; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) 2141; GFX1132-NEXT: v_sub_nc_u32_e32 v0, s0, v0 2142; GFX1132-NEXT: buffer_store_b32 v0, off, s[4:7], 0 2143; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 2144; GFX1132-NEXT: s_endpgm 2145entry: 2146 %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 %subitive acq_rel 2147 store i32 %old, i32 
addrspace(1)* %out 2148 ret void 2149} 2150 2151define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) { 2152; 2153; 2154; GFX7LESS-LABEL: sub_i32_varying: 2155; GFX7LESS: ; %bb.0: ; %entry 2156; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 2157; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 2158; GFX7LESS-NEXT: s_mov_b32 m0, -1 2159; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2160; GFX7LESS-NEXT: ds_sub_rtn_u32 v0, v1, v0 2161; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2162; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 2163; GFX7LESS-NEXT: s_mov_b32 s2, -1 2164; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 2165; GFX7LESS-NEXT: s_endpgm 2166; 2167; GFX8-LABEL: sub_i32_varying: 2168; GFX8: ; %bb.0: ; %entry 2169; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2170; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 2171; GFX8-NEXT: v_mov_b32_e32 v1, 0 2172; GFX8-NEXT: s_mov_b64 exec, s[2:3] 2173; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 2174; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 2175; GFX8-NEXT: v_mov_b32_e32 v2, v0 2176; GFX8-NEXT: s_not_b64 exec, exec 2177; GFX8-NEXT: v_mov_b32_e32 v2, 0 2178; GFX8-NEXT: s_not_b64 exec, exec 2179; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 2180; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 2181; GFX8-NEXT: s_nop 1 2182; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 2183; GFX8-NEXT: s_nop 1 2184; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 2185; GFX8-NEXT: s_nop 1 2186; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 2187; GFX8-NEXT: s_nop 1 2188; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 2189; GFX8-NEXT: s_nop 1 2190; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 2191; GFX8-NEXT: v_readlane_b32 s4, v2, 63 2192; GFX8-NEXT: s_nop 0 2193; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 
row_mask:0xf bank_mask:0xf 2194; GFX8-NEXT: s_mov_b64 exec, s[2:3] 2195; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 2196; GFX8-NEXT: ; implicit-def: $vgpr0 2197; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 2198; GFX8-NEXT: s_cbranch_execz .LBB9_2 2199; GFX8-NEXT: ; %bb.1: 2200; GFX8-NEXT: v_mov_b32_e32 v0, 0 2201; GFX8-NEXT: v_mov_b32_e32 v3, s4 2202; GFX8-NEXT: s_mov_b32 m0, -1 2203; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2204; GFX8-NEXT: ds_sub_rtn_u32 v0, v0, v3 2205; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2206; GFX8-NEXT: .LBB9_2: 2207; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 2208; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2209; GFX8-NEXT: v_readfirstlane_b32 s2, v0 2210; GFX8-NEXT: v_mov_b32_e32 v0, v1 2211; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s2, v0 2212; GFX8-NEXT: s_mov_b32 s3, 0xf000 2213; GFX8-NEXT: s_mov_b32 s2, -1 2214; GFX8-NEXT: s_nop 0 2215; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 2216; GFX8-NEXT: s_endpgm 2217; 2218; GFX9-LABEL: sub_i32_varying: 2219; GFX9: ; %bb.0: ; %entry 2220; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2221; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 2222; GFX9-NEXT: v_mov_b32_e32 v1, 0 2223; GFX9-NEXT: s_mov_b64 exec, s[2:3] 2224; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 2225; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 2226; GFX9-NEXT: v_mov_b32_e32 v2, v0 2227; GFX9-NEXT: s_not_b64 exec, exec 2228; GFX9-NEXT: v_mov_b32_e32 v2, 0 2229; GFX9-NEXT: s_not_b64 exec, exec 2230; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 2231; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 2232; GFX9-NEXT: s_nop 1 2233; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 2234; GFX9-NEXT: s_nop 1 2235; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 2236; GFX9-NEXT: s_nop 1 2237; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 2238; GFX9-NEXT: s_nop 1 2239; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa 
bank_mask:0xf 2240; GFX9-NEXT: s_nop 1 2241; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 2242; GFX9-NEXT: v_readlane_b32 s4, v2, 63 2243; GFX9-NEXT: s_nop 0 2244; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 2245; GFX9-NEXT: s_mov_b64 exec, s[2:3] 2246; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 2247; GFX9-NEXT: ; implicit-def: $vgpr0 2248; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 2249; GFX9-NEXT: s_cbranch_execz .LBB9_2 2250; GFX9-NEXT: ; %bb.1: 2251; GFX9-NEXT: v_mov_b32_e32 v0, 0 2252; GFX9-NEXT: v_mov_b32_e32 v3, s4 2253; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2254; GFX9-NEXT: ds_sub_rtn_u32 v0, v0, v3 2255; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2256; GFX9-NEXT: .LBB9_2: 2257; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 2258; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2259; GFX9-NEXT: v_readfirstlane_b32 s2, v0 2260; GFX9-NEXT: v_mov_b32_e32 v0, v1 2261; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 2262; GFX9-NEXT: s_mov_b32 s3, 0xf000 2263; GFX9-NEXT: s_mov_b32 s2, -1 2264; GFX9-NEXT: s_nop 0 2265; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 2266; GFX9-NEXT: s_endpgm 2267; 2268; GFX1064-LABEL: sub_i32_varying: 2269; GFX1064: ; %bb.0: ; %entry 2270; GFX1064-NEXT: v_mov_b32_e32 v1, v0 2271; GFX1064-NEXT: s_not_b64 exec, exec 2272; GFX1064-NEXT: v_mov_b32_e32 v1, 0 2273; GFX1064-NEXT: s_not_b64 exec, exec 2274; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 2275; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 2276; GFX1064-NEXT: v_mov_b32_e32 v3, 0 2277; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 2278; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 2279; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 2280; GFX1064-NEXT: v_mov_b32_e32 v2, v1 2281; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 2282; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] 
row_mask:0xa bank_mask:0xf 2283; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 2284; GFX1064-NEXT: v_mov_b32_e32 v2, s4 2285; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 2286; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 2287; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 2288; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 2289; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2290; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 2291; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 2292; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 2293; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 2294; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 2295; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 2296; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 2297; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 2298; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 2299; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 2300; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 2301; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 2302; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 2303; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 2304; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2305; GFX1064-NEXT: s_mov_b32 s2, -1 2306; GFX1064-NEXT: ; implicit-def: $vgpr0 2307; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 2308; GFX1064-NEXT: s_cbranch_execz .LBB9_2 2309; GFX1064-NEXT: ; %bb.1: 2310; GFX1064-NEXT: v_mov_b32_e32 v0, 0 2311; GFX1064-NEXT: v_mov_b32_e32 v4, s7 2312; GFX1064-NEXT: s_mov_b32 s3, s7 2313; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2314; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 2315; GFX1064-NEXT: ds_sub_rtn_u32 v0, v0, v4 2316; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2317; GFX1064-NEXT: buffer_gl0_inv 2318; GFX1064-NEXT: .LBB9_2: 2319; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 2320; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 2321; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 2322; GFX1064-NEXT: v_mov_b32_e32 v0, v3 2323; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s3, v0 2324; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 2325; 
GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2326; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 2327; GFX1064-NEXT: s_endpgm 2328; 2329; GFX1032-LABEL: sub_i32_varying: 2330; GFX1032: ; %bb.0: ; %entry 2331; GFX1032-NEXT: v_mov_b32_e32 v1, v0 2332; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 2333; GFX1032-NEXT: v_mov_b32_e32 v1, 0 2334; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 2335; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 2336; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 2337; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 2338; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 2339; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 2340; GFX1032-NEXT: v_mov_b32_e32 v2, v1 2341; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 2342; GFX1032-NEXT: s_mov_b32 exec_lo, s2 2343; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2344; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 2345; GFX1032-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 2346; GFX1032-NEXT: v_mov_b32_e32 v3, 0 2347; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 2348; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 2349; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 2350; GFX1032-NEXT: s_mov_b32 exec_lo, s2 2351; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 2352; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 2353; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 2354; GFX1032-NEXT: s_mov_b32 exec_lo, s2 2355; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 2356; GFX1032-NEXT: s_mov_b32 s2, -1 2357; GFX1032-NEXT: ; implicit-def: $vgpr0 2358; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 2359; GFX1032-NEXT: s_cbranch_execz .LBB9_2 2360; GFX1032-NEXT: ; %bb.1: 2361; GFX1032-NEXT: v_mov_b32_e32 v0, 0 2362; GFX1032-NEXT: v_mov_b32_e32 v4, s4 2363; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2364; GFX1032-NEXT: 
s_waitcnt_vscnt null, 0x0 2365; GFX1032-NEXT: ds_sub_rtn_u32 v0, v0, v4 2366; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2367; GFX1032-NEXT: buffer_gl0_inv 2368; GFX1032-NEXT: .LBB9_2: 2369; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 2370; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 2371; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 2372; GFX1032-NEXT: v_mov_b32_e32 v0, v3 2373; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s3, v0 2374; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 2375; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2376; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 2377; GFX1032-NEXT: s_endpgm 2378; 2379; GFX1164-LABEL: sub_i32_varying: 2380; GFX1164: ; %bb.0: ; %entry 2381; GFX1164-NEXT: v_mov_b32_e32 v1, v0 2382; GFX1164-NEXT: s_not_b64 exec, exec 2383; GFX1164-NEXT: v_mov_b32_e32 v1, 0 2384; GFX1164-NEXT: s_not_b64 exec, exec 2385; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 2386; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) 2387; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 2388; GFX1164-NEXT: v_mov_b32_e32 v3, 0 2389; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 2390; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2391; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 2392; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 2393; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2394; GFX1164-NEXT: v_mov_b32_e32 v2, v1 2395; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1 2396; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2397; GFX1164-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 2398; GFX1164-NEXT: v_readlane_b32 s4, v1, 31 2399; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2400; 
GFX1164-NEXT: v_mov_b32_e32 v2, s4 2401; GFX1164-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 2402; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) 2403; GFX1164-NEXT: v_readlane_b32 s4, v1, 15 2404; GFX1164-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 2405; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 2406; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 2407; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 2408; GFX1164-NEXT: v_readlane_b32 s5, v1, 31 2409; GFX1164-NEXT: v_writelane_b32 v3, s4, 16 2410; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 2411; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 2412; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 2413; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 2414; GFX1164-NEXT: v_readlane_b32 s7, v1, 63 2415; GFX1164-NEXT: v_readlane_b32 s6, v1, 47 2416; GFX1164-NEXT: v_writelane_b32 v3, s5, 32 2417; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 2418; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) 2419; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 2420; GFX1164-NEXT: s_or_saveexec_b64 s[4:5], -1 2421; GFX1164-NEXT: v_writelane_b32 v3, s6, 48 2422; GFX1164-NEXT: s_mov_b64 exec, s[4:5] 2423; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2424; GFX1164-NEXT: s_mov_b32 s2, -1 2425; GFX1164-NEXT: ; implicit-def: $vgpr0 2426; GFX1164-NEXT: s_and_saveexec_b64 s[4:5], vcc 2427; GFX1164-NEXT: s_cbranch_execz .LBB9_2 2428; GFX1164-NEXT: ; %bb.1: 2429; GFX1164-NEXT: v_mov_b32_e32 v0, 0 2430; GFX1164-NEXT: v_mov_b32_e32 v4, s7 2431; GFX1164-NEXT: s_mov_b32 s3, s7 2432; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2433; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 2434; GFX1164-NEXT: ds_sub_rtn_u32 v0, v0, v4 2435; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 2436; GFX1164-NEXT: buffer_gl0_inv 2437; GFX1164-NEXT: .LBB9_2: 2438; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] 2439; GFX1164-NEXT: v_readfirstlane_b32 s3, v0 2440; GFX1164-NEXT: v_mov_b32_e32 v0, v3 2441; GFX1164-NEXT: s_delay_alu 
instid0(VALU_DEP_1) 2442; GFX1164-NEXT: v_sub_nc_u32_e32 v0, s3, v0 2443; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 2444; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 2445; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 2446; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 2447; GFX1164-NEXT: s_endpgm 2448; 2449; GFX1132-LABEL: sub_i32_varying: 2450; GFX1132: ; %bb.0: ; %entry 2451; GFX1132-NEXT: v_mov_b32_e32 v1, v0 2452; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo 2453; GFX1132-NEXT: v_mov_b32_e32 v1, 0 2454; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo 2455; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 2456; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2457; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 2458; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 2459; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2460; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 2461; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 2462; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2463; GFX1132-NEXT: v_mov_b32_e32 v2, v1 2464; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1 2465; GFX1132-NEXT: s_mov_b32 exec_lo, s2 2466; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 2467; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 2468; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) 2469; GFX1132-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 2470; GFX1132-NEXT: v_mov_b32_e32 v3, 0 2471; GFX1132-NEXT: v_readlane_b32 s3, v1, 15 2472; GFX1132-NEXT: v_readlane_b32 s4, v1, 31 2473; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) 2474; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 2475; 
GFX1132-NEXT: s_mov_b32 exec_lo, s2 2476; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 2477; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 2478; GFX1132-NEXT: v_writelane_b32 v3, s3, 16 2479; GFX1132-NEXT: s_mov_b32 exec_lo, s2 2480; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) 2481; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 2482; GFX1132-NEXT: s_mov_b32 s2, -1 2483; GFX1132-NEXT: ; implicit-def: $vgpr0 2484; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo 2485; GFX1132-NEXT: s_cbranch_execz .LBB9_2 2486; GFX1132-NEXT: ; %bb.1: 2487; GFX1132-NEXT: v_mov_b32_e32 v0, 0 2488; GFX1132-NEXT: v_mov_b32_e32 v4, s4 2489; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2490; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 2491; GFX1132-NEXT: ds_sub_rtn_u32 v0, v0, v4 2492; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 2493; GFX1132-NEXT: buffer_gl0_inv 2494; GFX1132-NEXT: .LBB9_2: 2495; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 2496; GFX1132-NEXT: v_readfirstlane_b32 s3, v0 2497; GFX1132-NEXT: v_mov_b32_e32 v0, v3 2498; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) 2499; GFX1132-NEXT: v_sub_nc_u32_e32 v0, s3, v0 2500; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 2501; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 2502; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 2503; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 2504; GFX1132-NEXT: s_endpgm 2505entry: 2506 %lane = call i32 @llvm.amdgcn.workitem.id.x() 2507 %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 %lane acq_rel 2508 store i32 %old, i32 addrspace(1)* %out 2509 ret void 2510} 2511; Varying operand, result unused - every lane subtracts its workitem.id.x from @local_var32 (acq_rel) and %old is never read. The checks below expect a plain ds_sub_u32 of v0 for GFX7LESS and, for the other prefixes, a cross-lane add reduction feeding one non-returning ds_sub_u32 guarded by a compare of the mbcnt result against 0. 2512define amdgpu_kernel void @sub_i32_varying_nouse() { 2513; GFX7LESS-LABEL: sub_i32_varying_nouse: 2514; GFX7LESS: ; %bb.0: ; %entry 2515; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 2516; GFX7LESS-NEXT: s_mov_b32 m0, -1 2517; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2518; GFX7LESS-NEXT: ds_sub_u32 v1, v0 2519; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2520; GFX7LESS-NEXT: s_endpgm 2521; 2522; GFX8-LABEL: sub_i32_varying_nouse: 2523; GFX8: ; %bb.0: ; %entry 2524; GFX8-NEXT: 
v_mbcnt_lo_u32_b32 v2, exec_lo, 0 2525; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 2526; GFX8-NEXT: v_mov_b32_e32 v1, v0 2527; GFX8-NEXT: s_not_b64 exec, exec 2528; GFX8-NEXT: v_mov_b32_e32 v1, 0 2529; GFX8-NEXT: s_not_b64 exec, exec 2530; GFX8-NEXT: s_or_saveexec_b64 s[0:1], -1 2531; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 2532; GFX8-NEXT: s_nop 1 2533; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 2534; GFX8-NEXT: s_nop 1 2535; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 2536; GFX8-NEXT: s_nop 1 2537; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 2538; GFX8-NEXT: s_nop 1 2539; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf 2540; GFX8-NEXT: s_nop 1 2541; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf 2542; GFX8-NEXT: v_readlane_b32 s2, v1, 63 2543; GFX8-NEXT: s_mov_b64 exec, s[0:1] 2544; GFX8-NEXT: s_mov_b32 s0, s2 2545; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 2546; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 2547; GFX8-NEXT: s_cbranch_execz .LBB10_2 2548; GFX8-NEXT: ; %bb.1: 2549; GFX8-NEXT: v_mov_b32_e32 v0, 0 2550; GFX8-NEXT: v_mov_b32_e32 v2, s0 2551; GFX8-NEXT: s_mov_b32 m0, -1 2552; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2553; GFX8-NEXT: ds_sub_u32 v0, v2 2554; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2555; GFX8-NEXT: .LBB10_2: 2556; GFX8-NEXT: s_endpgm 2557; 2558; GFX9-LABEL: sub_i32_varying_nouse: 2559; GFX9: ; %bb.0: ; %entry 2560; GFX9-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 2561; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 2562; GFX9-NEXT: v_mov_b32_e32 v1, v0 2563; GFX9-NEXT: s_not_b64 exec, exec 2564; GFX9-NEXT: v_mov_b32_e32 v1, 0 2565; GFX9-NEXT: s_not_b64 exec, exec 2566; GFX9-NEXT: s_or_saveexec_b64 s[0:1], -1 2567; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 2568; 
GFX9-NEXT: s_nop 1 2569; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 2570; GFX9-NEXT: s_nop 1 2571; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 2572; GFX9-NEXT: s_nop 1 2573; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 2574; GFX9-NEXT: s_nop 1 2575; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf 2576; GFX9-NEXT: s_nop 1 2577; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf 2578; GFX9-NEXT: v_readlane_b32 s2, v1, 63 2579; GFX9-NEXT: s_mov_b64 exec, s[0:1] 2580; GFX9-NEXT: s_mov_b32 s0, s2 2581; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 2582; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 2583; GFX9-NEXT: s_cbranch_execz .LBB10_2 2584; GFX9-NEXT: ; %bb.1: 2585; GFX9-NEXT: v_mov_b32_e32 v0, 0 2586; GFX9-NEXT: v_mov_b32_e32 v2, s0 2587; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2588; GFX9-NEXT: ds_sub_u32 v0, v2 2589; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2590; GFX9-NEXT: .LBB10_2: 2591; GFX9-NEXT: s_endpgm 2592; 2593; GFX1064-LABEL: sub_i32_varying_nouse: 2594; GFX1064: ; %bb.0: ; %entry 2595; GFX1064-NEXT: v_mov_b32_e32 v1, v0 2596; GFX1064-NEXT: s_not_b64 exec, exec 2597; GFX1064-NEXT: v_mov_b32_e32 v1, 0 2598; GFX1064-NEXT: s_not_b64 exec, exec 2599; GFX1064-NEXT: s_or_saveexec_b64 s[0:1], -1 2600; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 2601; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 2602; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 2603; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 2604; GFX1064-NEXT: v_mov_b32_e32 v2, v1 2605; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 2606; GFX1064-NEXT: v_add_nc_u32_e32 v1, v1, v2 2607; GFX1064-NEXT: s_mov_b64 exec, s[0:1] 2608; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, 
exec_lo, 0 2609; GFX1064-NEXT: s_or_saveexec_b64 s[0:1], -1 2610; GFX1064-NEXT: v_readlane_b32 s2, v1, 0 2611; GFX1064-NEXT: v_readlane_b32 s3, v1, 32 2612; GFX1064-NEXT: s_mov_b64 exec, s[0:1] 2613; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 2614; GFX1064-NEXT: s_add_i32 s0, s2, s3 2615; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2616; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc 2617; GFX1064-NEXT: s_cbranch_execz .LBB10_2 2618; GFX1064-NEXT: ; %bb.1: 2619; GFX1064-NEXT: v_mov_b32_e32 v0, 0 2620; GFX1064-NEXT: v_mov_b32_e32 v3, s0 2621; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2622; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 2623; GFX1064-NEXT: ds_sub_u32 v0, v3 2624; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2625; GFX1064-NEXT: buffer_gl0_inv 2626; GFX1064-NEXT: .LBB10_2: 2627; GFX1064-NEXT: s_endpgm 2628; 2629; GFX1032-LABEL: sub_i32_varying_nouse: 2630; GFX1032: ; %bb.0: ; %entry 2631; GFX1032-NEXT: v_mov_b32_e32 v1, v0 2632; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 2633; GFX1032-NEXT: v_mov_b32_e32 v1, 0 2634; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 2635; GFX1032-NEXT: s_or_saveexec_b32 s0, -1 2636; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 2637; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 2638; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 2639; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 2640; GFX1032-NEXT: v_mov_b32_e32 v2, v1 2641; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 2642; GFX1032-NEXT: v_add_nc_u32_e32 v1, v1, v2 2643; GFX1032-NEXT: s_mov_b32 exec_lo, s0 2644; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 2645; GFX1032-NEXT: v_mov_b32_e32 v0, v1 2646; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 2647; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo 2648; GFX1032-NEXT: s_cbranch_execz .LBB10_2 2649; GFX1032-NEXT: ; %bb.1: 2650; GFX1032-NEXT: 
v_mov_b32_e32 v3, 0 2651; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2652; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 2653; GFX1032-NEXT: ds_sub_u32 v3, v0 2654; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2655; GFX1032-NEXT: buffer_gl0_inv 2656; GFX1032-NEXT: .LBB10_2: 2657; GFX1032-NEXT: s_endpgm 2658; 2659; GFX1164-LABEL: sub_i32_varying_nouse: 2660; GFX1164: ; %bb.0: ; %entry 2661; GFX1164-NEXT: v_mov_b32_e32 v1, v0 2662; GFX1164-NEXT: s_not_b64 exec, exec 2663; GFX1164-NEXT: v_mov_b32_e32 v1, 0 2664; GFX1164-NEXT: s_not_b64 exec, exec 2665; GFX1164-NEXT: s_or_saveexec_b64 s[0:1], -1 2666; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2667; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 2668; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 2669; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2670; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 2671; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 2672; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2673; GFX1164-NEXT: v_mov_b32_e32 v2, v1 2674; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1 2675; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2676; GFX1164-NEXT: v_add_nc_u32_e32 v1, v1, v2 2677; GFX1164-NEXT: v_permlane64_b32 v2, v1 2678; GFX1164-NEXT: s_mov_b64 exec, s[0:1] 2679; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) 2680; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 2681; GFX1164-NEXT: s_or_saveexec_b64 s[0:1], -1 2682; GFX1164-NEXT: v_add_nc_u32_e32 v1, v1, v2 2683; GFX1164-NEXT: s_mov_b64 exec, s[0:1] 2684; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) 2685; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v0 
2686; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) 2687; GFX1164-NEXT: v_mov_b32_e32 v0, v1 2688; GFX1164-NEXT: s_mov_b64 s[0:1], exec 2689; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v3 2690; GFX1164-NEXT: s_cbranch_execz .LBB10_2 2691; GFX1164-NEXT: ; %bb.1: 2692; GFX1164-NEXT: v_mov_b32_e32 v3, 0 2693; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2694; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 2695; GFX1164-NEXT: ds_sub_u32 v3, v0 2696; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 2697; GFX1164-NEXT: buffer_gl0_inv 2698; GFX1164-NEXT: .LBB10_2: 2699; GFX1164-NEXT: s_endpgm 2700; 2701; GFX1132-LABEL: sub_i32_varying_nouse: 2702; GFX1132: ; %bb.0: ; %entry 2703; GFX1132-NEXT: v_mov_b32_e32 v1, v0 2704; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo 2705; GFX1132-NEXT: v_mov_b32_e32 v1, 0 2706; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo 2707; GFX1132-NEXT: s_or_saveexec_b32 s0, -1 2708; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2709; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 2710; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 2711; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2712; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 2713; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 2714; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2715; GFX1132-NEXT: v_mov_b32_e32 v2, v1 2716; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1 2717; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) 2718; GFX1132-NEXT: v_add_nc_u32_e32 v1, v1, v2 2719; GFX1132-NEXT: s_mov_b32 exec_lo, s0 2720; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 2721; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | 
instid1(VALU_DEP_2) 2722; GFX1132-NEXT: v_mov_b32_e32 v0, v1 2723; GFX1132-NEXT: s_mov_b32 s0, exec_lo 2724; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v3 2725; GFX1132-NEXT: s_cbranch_execz .LBB10_2 2726; GFX1132-NEXT: ; %bb.1: 2727; GFX1132-NEXT: v_mov_b32_e32 v3, 0 2728; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2729; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 2730; GFX1132-NEXT: ds_sub_u32 v3, v0 2731; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 2732; GFX1132-NEXT: buffer_gl0_inv 2733; GFX1132-NEXT: .LBB10_2: 2734; GFX1132-NEXT: s_endpgm 2735entry: 2736 %lane = call i32 @llvm.amdgcn.workitem.id.x() 2737 %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 %lane acq_rel 2738 ret void 2739} 2740; Constant i64 operand, result used - atomicrmw sub of 5 on @local_var64 (acq_rel) with %old stored to %out. The checks below expect s_bcnt1 of the saved exec mask multiplied by 5 as the operand of a single ds_sub_rtn_u64, then readfirstlane of the returned value minus 5 times the mbcnt result in each lane. 2741define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) { 2742; 2743; 2744; GFX7LESS-LABEL: sub_i64_constant: 2745; GFX7LESS: ; %bb.0: ; %entry 2746; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec 2747; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 2748; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 2749; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s5, v0 2750; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 2751; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 2752; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc 2753; GFX7LESS-NEXT: s_cbranch_execz .LBB11_2 2754; GFX7LESS-NEXT: ; %bb.1: 2755; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 2756; GFX7LESS-NEXT: s_mul_i32 s4, s4, 5 2757; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 2758; GFX7LESS-NEXT: v_mov_b32_e32 v0, s4 2759; GFX7LESS-NEXT: s_mov_b32 m0, -1 2760; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2761; GFX7LESS-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] 2762; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2763; GFX7LESS-NEXT: .LBB11_2: 2764; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] 2765; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2766; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v0 2767; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1 2768; GFX7LESS-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 2769; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v2 2770; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 
2771; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4 2772; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 2773; GFX7LESS-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc 2774; GFX7LESS-NEXT: s_mov_b32 s2, -1 2775; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2776; GFX7LESS-NEXT: s_endpgm 2777; 2778; GFX8-LABEL: sub_i64_constant: 2779; GFX8: ; %bb.0: ; %entry 2780; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2781; GFX8-NEXT: s_mov_b64 s[4:5], exec 2782; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 2783; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 2784; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 2785; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 2786; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 2787; GFX8-NEXT: s_cbranch_execz .LBB11_2 2788; GFX8-NEXT: ; %bb.1: 2789; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 2790; GFX8-NEXT: s_mul_i32 s4, s4, 5 2791; GFX8-NEXT: v_mov_b32_e32 v0, s4 2792; GFX8-NEXT: v_mov_b32_e32 v1, 0 2793; GFX8-NEXT: s_mov_b32 m0, -1 2794; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2795; GFX8-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] 2796; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2797; GFX8-NEXT: .LBB11_2: 2798; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 2799; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2800; GFX8-NEXT: v_readfirstlane_b32 s2, v0 2801; GFX8-NEXT: v_readfirstlane_b32 s3, v1 2802; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v2 2803; GFX8-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 2804; GFX8-NEXT: v_mov_b32_e32 v2, s3 2805; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s2, v0 2806; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc 2807; GFX8-NEXT: s_mov_b32 s3, 0xf000 2808; GFX8-NEXT: s_mov_b32 s2, -1 2809; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2810; GFX8-NEXT: s_endpgm 2811; 2812; GFX9-LABEL: sub_i64_constant: 2813; GFX9: ; %bb.0: ; %entry 2814; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2815; GFX9-NEXT: s_mov_b64 s[4:5], exec 2816; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 2817; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 2818; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 2819; GFX9-NEXT: ; implicit-def: 
$vgpr0_vgpr1 2820; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 2821; GFX9-NEXT: s_cbranch_execz .LBB11_2 2822; GFX9-NEXT: ; %bb.1: 2823; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 2824; GFX9-NEXT: s_mul_i32 s4, s4, 5 2825; GFX9-NEXT: v_mov_b32_e32 v0, s4 2826; GFX9-NEXT: v_mov_b32_e32 v1, 0 2827; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2828; GFX9-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] 2829; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2830; GFX9-NEXT: .LBB11_2: 2831; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 2832; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2833; GFX9-NEXT: v_readfirstlane_b32 s2, v0 2834; GFX9-NEXT: v_readfirstlane_b32 s3, v1 2835; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v2 2836; GFX9-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 2837; GFX9-NEXT: v_mov_b32_e32 v2, s3 2838; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s2, v0 2839; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc 2840; GFX9-NEXT: s_mov_b32 s3, 0xf000 2841; GFX9-NEXT: s_mov_b32 s2, -1 2842; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2843; GFX9-NEXT: s_endpgm 2844; 2845; GFX1064-LABEL: sub_i64_constant: 2846; GFX1064: ; %bb.0: ; %entry 2847; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2848; GFX1064-NEXT: s_mov_b64 s[4:5], exec 2849; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 2850; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 2851; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 2852; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 2853; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc 2854; GFX1064-NEXT: s_cbranch_execz .LBB11_2 2855; GFX1064-NEXT: ; %bb.1: 2856; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 2857; GFX1064-NEXT: v_mov_b32_e32 v1, 0 2858; GFX1064-NEXT: s_mul_i32 s4, s4, 5 2859; GFX1064-NEXT: v_mov_b32_e32 v0, s4 2860; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2861; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 2862; GFX1064-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] 2863; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2864; GFX1064-NEXT: buffer_gl0_inv 2865; GFX1064-NEXT: .LBB11_2: 2866; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 2867; GFX1064-NEXT: 
s_or_b64 exec, exec, s[2:3] 2868; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 2869; GFX1064-NEXT: v_mul_u32_u24_e32 v0, 5, v2 2870; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 2871; GFX1064-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 2872; GFX1064-NEXT: v_sub_co_u32 v0, vcc, s2, v0 2873; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc 2874; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 2875; GFX1064-NEXT: s_mov_b32 s2, -1 2876; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2877; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2878; GFX1064-NEXT: s_endpgm 2879; 2880; GFX1032-LABEL: sub_i64_constant: 2881; GFX1032: ; %bb.0: ; %entry 2882; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2883; GFX1032-NEXT: s_mov_b32 s3, exec_lo 2884; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 2885; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s3, 0 2886; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 2887; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo 2888; GFX1032-NEXT: s_cbranch_execz .LBB11_2 2889; GFX1032-NEXT: ; %bb.1: 2890; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 2891; GFX1032-NEXT: v_mov_b32_e32 v1, 0 2892; GFX1032-NEXT: s_mul_i32 s3, s3, 5 2893; GFX1032-NEXT: v_mov_b32_e32 v0, s3 2894; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2895; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 2896; GFX1032-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] 2897; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2898; GFX1032-NEXT: buffer_gl0_inv 2899; GFX1032-NEXT: .LBB11_2: 2900; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 2901; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 2902; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 2903; GFX1032-NEXT: v_mul_u32_u24_e32 v0, 5, v2 2904; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 2905; GFX1032-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 2906; GFX1032-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v0 2907; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo 2908; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 2909; GFX1032-NEXT: s_mov_b32 s2, -1 2910; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2911; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, 
s[0:3], 0 2912; GFX1032-NEXT: s_endpgm 2913; 2914; GFX1164-LABEL: sub_i64_constant: 2915; GFX1164: ; %bb.0: ; %entry 2916; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 2917; GFX1164-NEXT: s_mov_b64 s[4:5], exec 2918; GFX1164-NEXT: s_mov_b64 s[2:3], exec 2919; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 2920; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2921; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 2922; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 2923; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 2924; GFX1164-NEXT: s_cbranch_execz .LBB11_2 2925; GFX1164-NEXT: ; %bb.1: 2926; GFX1164-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 2927; GFX1164-NEXT: v_mov_b32_e32 v1, 0 2928; GFX1164-NEXT: s_mul_i32 s4, s4, 5 2929; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 2930; GFX1164-NEXT: v_mov_b32_e32 v0, s4 2931; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2932; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 2933; GFX1164-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] 2934; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 2935; GFX1164-NEXT: buffer_gl0_inv 2936; GFX1164-NEXT: .LBB11_2: 2937; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] 2938; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 2939; GFX1164-NEXT: v_mul_u32_u24_e32 v0, 5, v2 2940; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 2941; GFX1164-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 2942; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) 2943; GFX1164-NEXT: v_sub_co_u32 v0, vcc, s2, v0 2944; GFX1164-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc 2945; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 2946; GFX1164-NEXT: s_mov_b32 s2, -1 2947; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 2948; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 2949; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 2950; GFX1164-NEXT: s_endpgm 2951; 2952; GFX1132-LABEL: sub_i64_constant: 2953; GFX1132: ; %bb.0: ; %entry 2954; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 2955; GFX1132-NEXT: s_mov_b32 s3, exec_lo 2956; GFX1132-NEXT: 
s_mov_b32 s2, exec_lo 2957; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s3, 0 2958; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 2959; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) 2960; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 2961; GFX1132-NEXT: s_cbranch_execz .LBB11_2 2962; GFX1132-NEXT: ; %bb.1: 2963; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s3 2964; GFX1132-NEXT: v_mov_b32_e32 v1, 0 2965; GFX1132-NEXT: s_mul_i32 s3, s3, 5 2966; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 2967; GFX1132-NEXT: v_mov_b32_e32 v0, s3 2968; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2969; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 2970; GFX1132-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] 2971; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 2972; GFX1132-NEXT: buffer_gl0_inv 2973; GFX1132-NEXT: .LBB11_2: 2974; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2 2975; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 2976; GFX1132-NEXT: v_mul_u32_u24_e32 v0, 5, v2 2977; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 2978; GFX1132-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 2979; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) 2980; GFX1132-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v0 2981; GFX1132-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo 2982; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 2983; GFX1132-NEXT: s_mov_b32 s2, -1 2984; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 2985; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 2986; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 2987; GFX1132-NEXT: s_endpgm 2988entry: 2989 %old = atomicrmw sub i64 addrspace(3)* @local_var64, i64 5 acq_rel 2990 store i64 %old, i64 addrspace(1)* %out 2991 ret void 2992} 2993 2994define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 %subitive) { 2995; 2996; 2997; GFX7LESS-LABEL: sub_i64_uniform: 2998; GFX7LESS: ; %bb.0: ; %entry 2999; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec 3000; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 3001; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 3002; GFX7LESS-NEXT: 
v_mbcnt_hi_u32_b32_e32 v2, s7, v0 3003; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 3004; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 3005; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc 3006; GFX7LESS-NEXT: s_cbranch_execz .LBB12_2 3007; GFX7LESS-NEXT: ; %bb.1: 3008; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 3009; GFX7LESS-NEXT: v_mov_b32_e32 v3, 0 3010; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3011; GFX7LESS-NEXT: s_mul_i32 s7, s3, s6 3012; GFX7LESS-NEXT: v_mov_b32_e32 v0, s6 3013; GFX7LESS-NEXT: v_mul_hi_u32 v0, s2, v0 3014; GFX7LESS-NEXT: s_mul_i32 s6, s2, s6 3015; GFX7LESS-NEXT: v_add_i32_e32 v1, vcc, s7, v0 3016; GFX7LESS-NEXT: v_mov_b32_e32 v0, s6 3017; GFX7LESS-NEXT: s_mov_b32 m0, -1 3018; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3019; GFX7LESS-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1] 3020; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3021; GFX7LESS-NEXT: .LBB12_2: 3022; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] 3023; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 3024; GFX7LESS-NEXT: s_mov_b32 s6, -1 3025; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3026; GFX7LESS-NEXT: s_mov_b32 s4, s0 3027; GFX7LESS-NEXT: s_mov_b32 s5, s1 3028; GFX7LESS-NEXT: v_readfirstlane_b32 s0, v0 3029; GFX7LESS-NEXT: v_readfirstlane_b32 s1, v1 3030; GFX7LESS-NEXT: v_mul_lo_u32 v0, s3, v2 3031; GFX7LESS-NEXT: v_mul_hi_u32 v1, s2, v2 3032; GFX7LESS-NEXT: v_mul_lo_u32 v2, s2, v2 3033; GFX7LESS-NEXT: v_add_i32_e32 v1, vcc, v1, v0 3034; GFX7LESS-NEXT: v_mov_b32_e32 v3, s1 3035; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s0, v2 3036; GFX7LESS-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc 3037; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 3038; GFX7LESS-NEXT: s_endpgm 3039; 3040; GFX8-LABEL: sub_i64_uniform: 3041; GFX8: ; %bb.0: ; %entry 3042; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 3043; GFX8-NEXT: s_mov_b64 s[6:7], exec 3044; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 3045; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 3046; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 3047; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 3048; 
GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 3049; GFX8-NEXT: s_cbranch_execz .LBB12_2 3050; GFX8-NEXT: ; %bb.1: 3051; GFX8-NEXT: s_bcnt1_i32_b64 s8, s[6:7] 3052; GFX8-NEXT: v_mov_b32_e32 v0, s8 3053; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3054; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[6:7], s2, v0, 0 3055; GFX8-NEXT: s_mul_i32 s6, s3, s8 3056; GFX8-NEXT: v_mov_b32_e32 v3, 0 3057; GFX8-NEXT: v_add_u32_e32 v1, vcc, s6, v1 3058; GFX8-NEXT: s_mov_b32 m0, -1 3059; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3060; GFX8-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1] 3061; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3062; GFX8-NEXT: .LBB12_2: 3063; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 3064; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3065; GFX8-NEXT: s_mov_b32 s4, s0 3066; GFX8-NEXT: s_mov_b32 s5, s1 3067; GFX8-NEXT: v_mul_lo_u32 v4, s3, v2 3068; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v2, 0 3069; GFX8-NEXT: v_readfirstlane_b32 s0, v0 3070; GFX8-NEXT: v_readfirstlane_b32 s1, v1 3071; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v4 3072; GFX8-NEXT: v_mov_b32_e32 v3, s1 3073; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s0, v2 3074; GFX8-NEXT: s_mov_b32 s7, 0xf000 3075; GFX8-NEXT: s_mov_b32 s6, -1 3076; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc 3077; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 3078; GFX8-NEXT: s_endpgm 3079; 3080; GFX9-LABEL: sub_i64_uniform: 3081; GFX9: ; %bb.0: ; %entry 3082; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 3083; GFX9-NEXT: s_mov_b64 s[6:7], exec 3084; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 3085; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 3086; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 3087; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 3088; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 3089; GFX9-NEXT: s_cbranch_execz .LBB12_2 3090; GFX9-NEXT: ; %bb.1: 3091; GFX9-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 3092; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3093; GFX9-NEXT: s_mul_i32 s7, s3, s6 3094; GFX9-NEXT: s_mul_hi_u32 s8, s2, s6 3095; GFX9-NEXT: s_add_i32 s8, s8, s7 3096; GFX9-NEXT: s_mul_i32 s6, s2, s6 3097; GFX9-NEXT: 
v_mov_b32_e32 v0, s6 3098; GFX9-NEXT: v_mov_b32_e32 v1, s8 3099; GFX9-NEXT: v_mov_b32_e32 v3, 0 3100; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3101; GFX9-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1] 3102; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3103; GFX9-NEXT: .LBB12_2: 3104; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 3105; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3106; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], s2, v2, 0 3107; GFX9-NEXT: s_mov_b32 s4, s0 3108; GFX9-NEXT: s_mov_b32 s5, s1 3109; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[2:3], s3, v2, v[4:5] 3110; GFX9-NEXT: v_readfirstlane_b32 s0, v0 3111; GFX9-NEXT: v_readfirstlane_b32 s1, v1 3112; GFX9-NEXT: v_mov_b32_e32 v1, v4 3113; GFX9-NEXT: v_mov_b32_e32 v2, s1 3114; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s0, v3 3115; GFX9-NEXT: s_mov_b32 s7, 0xf000 3116; GFX9-NEXT: s_mov_b32 s6, -1 3117; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc 3118; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 3119; GFX9-NEXT: s_endpgm 3120; 3121; GFX1064-LABEL: sub_i64_uniform: 3122; GFX1064: ; %bb.0: ; %entry 3123; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 3124; GFX1064-NEXT: s_mov_b64 s[6:7], exec 3125; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 3126; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 3127; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 3128; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 3129; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 3130; GFX1064-NEXT: s_cbranch_execz .LBB12_2 3131; GFX1064-NEXT: ; %bb.1: 3132; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 3133; GFX1064-NEXT: v_mov_b32_e32 v3, 0 3134; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3135; GFX1064-NEXT: s_mul_i32 s7, s3, s6 3136; GFX1064-NEXT: s_mul_hi_u32 s8, s2, s6 3137; GFX1064-NEXT: s_mul_i32 s6, s2, s6 3138; GFX1064-NEXT: s_add_i32 s8, s8, s7 3139; GFX1064-NEXT: v_mov_b32_e32 v0, s6 3140; GFX1064-NEXT: v_mov_b32_e32 v1, s8 3141; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3142; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 3143; GFX1064-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1] 3144; GFX1064-NEXT: 
s_waitcnt lgkmcnt(0) 3145; GFX1064-NEXT: buffer_gl0_inv 3146; GFX1064-NEXT: .LBB12_2: 3147; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 3148; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 3149; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3150; GFX1064-NEXT: v_mad_u64_u32 v[3:4], s[4:5], s2, v2, 0 3151; GFX1064-NEXT: v_readfirstlane_b32 s4, v1 3152; GFX1064-NEXT: v_mad_u64_u32 v[4:5], s[2:3], s3, v2, v[4:5] 3153; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 3154; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 3155; GFX1064-NEXT: v_sub_co_u32 v0, vcc, s2, v3 3156; GFX1064-NEXT: v_mov_b32_e32 v1, v4 3157; GFX1064-NEXT: s_mov_b32 s2, -1 3158; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s4, v1, vcc 3159; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 3160; GFX1064-NEXT: s_endpgm 3161; 3162; GFX1032-LABEL: sub_i64_uniform: 3163; GFX1032: ; %bb.0: ; %entry 3164; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 3165; GFX1032-NEXT: s_mov_b32 s5, exec_lo 3166; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 3167; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0 3168; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 3169; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo 3170; GFX1032-NEXT: s_cbranch_execz .LBB12_2 3171; GFX1032-NEXT: ; %bb.1: 3172; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s5 3173; GFX1032-NEXT: v_mov_b32_e32 v3, 0 3174; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3175; GFX1032-NEXT: s_mul_i32 s6, s3, s5 3176; GFX1032-NEXT: s_mul_hi_u32 s7, s2, s5 3177; GFX1032-NEXT: s_mul_i32 s5, s2, s5 3178; GFX1032-NEXT: s_add_i32 s7, s7, s6 3179; GFX1032-NEXT: v_mov_b32_e32 v0, s5 3180; GFX1032-NEXT: v_mov_b32_e32 v1, s7 3181; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3182; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 3183; GFX1032-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1] 3184; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3185; GFX1032-NEXT: buffer_gl0_inv 3186; GFX1032-NEXT: .LBB12_2: 3187; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 3188; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 3189; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3190; GFX1032-NEXT: 
v_mad_u64_u32 v[3:4], s2, s2, v2, 0 3191; GFX1032-NEXT: v_readfirstlane_b32 s4, v1 3192; GFX1032-NEXT: v_mad_u64_u32 v[4:5], s2, s3, v2, v[4:5] 3193; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 3194; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 3195; GFX1032-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v3 3196; GFX1032-NEXT: v_mov_b32_e32 v1, v4 3197; GFX1032-NEXT: s_mov_b32 s2, -1 3198; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s4, v1, vcc_lo 3199; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 3200; GFX1032-NEXT: s_endpgm 3201; 3202; GFX1164-LABEL: sub_i64_uniform: 3203; GFX1164: ; %bb.0: ; %entry 3204; GFX1164-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 3205; GFX1164-NEXT: s_mov_b64 s[6:7], exec 3206; GFX1164-NEXT: s_mov_b64 s[4:5], exec 3207; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 3208; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 3209; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 3210; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 3211; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 3212; GFX1164-NEXT: s_cbranch_execz .LBB12_2 3213; GFX1164-NEXT: ; %bb.1: 3214; GFX1164-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 3215; GFX1164-NEXT: v_mov_b32_e32 v3, 0 3216; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 3217; GFX1164-NEXT: s_mul_i32 s7, s3, s6 3218; GFX1164-NEXT: s_mul_hi_u32 s8, s2, s6 3219; GFX1164-NEXT: s_mul_i32 s6, s2, s6 3220; GFX1164-NEXT: s_add_i32 s8, s8, s7 3221; GFX1164-NEXT: v_mov_b32_e32 v0, s6 3222; GFX1164-NEXT: v_mov_b32_e32 v1, s8 3223; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3224; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 3225; GFX1164-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1] 3226; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 3227; GFX1164-NEXT: buffer_gl0_inv 3228; GFX1164-NEXT: .LBB12_2: 3229; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] 3230; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 3231; GFX1164-NEXT: v_mad_u64_u32 v[3:4], null, s2, v2, 0 3232; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 3233; GFX1164-NEXT: v_readfirstlane_b32 s4, v1 3234; GFX1164-NEXT: 
s_waitcnt_depctr 0xfff 3235; GFX1164-NEXT: v_mad_u64_u32 v[5:6], null, s3, v2, v[4:5] 3236; GFX1164-NEXT: v_sub_co_u32 v0, vcc, s2, v3 3237; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 3238; GFX1164-NEXT: s_mov_b32 s2, -1 3239; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) 3240; GFX1164-NEXT: v_mov_b32_e32 v1, v5 3241; GFX1164-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s4, v1, vcc 3242; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 3243; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 3244; GFX1164-NEXT: s_endpgm 3245; 3246; GFX1132-LABEL: sub_i64_uniform: 3247; GFX1132: ; %bb.0: ; %entry 3248; GFX1132-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 3249; GFX1132-NEXT: s_mov_b32 s5, exec_lo 3250; GFX1132-NEXT: s_mov_b32 s4, exec_lo 3251; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0 3252; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 3253; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) 3254; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 3255; GFX1132-NEXT: s_cbranch_execz .LBB12_2 3256; GFX1132-NEXT: ; %bb.1: 3257; GFX1132-NEXT: s_bcnt1_i32_b32 s5, s5 3258; GFX1132-NEXT: v_mov_b32_e32 v3, 0 3259; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 3260; GFX1132-NEXT: s_mul_i32 s6, s3, s5 3261; GFX1132-NEXT: s_mul_hi_u32 s7, s2, s5 3262; GFX1132-NEXT: s_mul_i32 s5, s2, s5 3263; GFX1132-NEXT: s_add_i32 s7, s7, s6 3264; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 3265; GFX1132-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s7 3266; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3267; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 3268; GFX1132-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1] 3269; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 3270; GFX1132-NEXT: buffer_gl0_inv 3271; GFX1132-NEXT: .LBB12_2: 3272; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s4 3273; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 3274; GFX1132-NEXT: v_mad_u64_u32 v[3:4], null, s2, v2, 0 3275; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 3276; GFX1132-NEXT: v_readfirstlane_b32 s4, v1 3277; GFX1132-NEXT: s_delay_alu 
instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) 3278; GFX1132-NEXT: v_mad_u64_u32 v[5:6], null, s3, v2, v[4:5] 3279; GFX1132-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v3 3280; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 3281; GFX1132-NEXT: s_mov_b32 s2, -1 3282; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) 3283; GFX1132-NEXT: v_mov_b32_e32 v1, v5 3284; GFX1132-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s4, v1, vcc_lo 3285; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 3286; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 3287; GFX1132-NEXT: s_endpgm 3288entry: 3289 %old = atomicrmw sub i64 addrspace(3)* @local_var64, i64 %subitive acq_rel 3290 store i64 %old, i64 addrspace(1)* %out 3291 ret void 3292} 3293 3294define amdgpu_kernel void @sub_i64_varying(i64 addrspace(1)* %out) { 3295; 3296; 3297; GFX7LESS-LABEL: sub_i64_varying: 3298; GFX7LESS: ; %bb.0: ; %entry 3299; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 3300; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 3301; GFX7LESS-NEXT: s_mov_b32 m0, -1 3302; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3303; GFX7LESS-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] 3304; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3305; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 3306; GFX7LESS-NEXT: s_mov_b32 s2, -1 3307; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 3308; GFX7LESS-NEXT: s_endpgm 3309; 3310; GFX8-LABEL: sub_i64_varying: 3311; GFX8: ; %bb.0: ; %entry 3312; GFX8-NEXT: v_mov_b32_e32 v1, 0 3313; GFX8-NEXT: s_mov_b32 m0, -1 3314; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3315; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3316; GFX8-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] 3317; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3318; GFX8-NEXT: s_mov_b32 s3, 0xf000 3319; GFX8-NEXT: s_mov_b32 s2, -1 3320; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 3321; GFX8-NEXT: s_endpgm 3322; 3323; GFX9-LABEL: sub_i64_varying: 3324; GFX9: ; %bb.0: ; %entry 3325; GFX9-NEXT: v_mov_b32_e32 v1, 0 3326; GFX9-NEXT: s_load_dwordx2 s[0:1], 
s[0:1], 0x24 3327; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3328; GFX9-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] 3329; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3330; GFX9-NEXT: s_mov_b32 s3, 0xf000 3331; GFX9-NEXT: s_mov_b32 s2, -1 3332; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 3333; GFX9-NEXT: s_endpgm 3334; 3335; GFX10-LABEL: sub_i64_varying: 3336; GFX10: ; %bb.0: ; %entry 3337; GFX10-NEXT: v_mov_b32_e32 v1, 0 3338; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3339; GFX10-NEXT: s_mov_b32 s3, 0x31016000 3340; GFX10-NEXT: s_mov_b32 s2, -1 3341; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3342; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 3343; GFX10-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] 3344; GFX10-NEXT: s_waitcnt lgkmcnt(0) 3345; GFX10-NEXT: buffer_gl0_inv 3346; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 3347; GFX10-NEXT: s_endpgm 3348; 3349; GFX11-LABEL: sub_i64_varying: 3350; GFX11: ; %bb.0: ; %entry 3351; GFX11-NEXT: v_mov_b32_e32 v1, 0 3352; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 3353; GFX11-NEXT: s_mov_b32 s3, 0x31016000 3354; GFX11-NEXT: s_mov_b32 s2, -1 3355; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3356; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 3357; GFX11-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] 3358; GFX11-NEXT: s_waitcnt lgkmcnt(0) 3359; GFX11-NEXT: buffer_gl0_inv 3360; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 3361; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 3362; GFX11-NEXT: s_endpgm 3363entry: 3364 %lane = call i32 @llvm.amdgcn.workitem.id.x() 3365 %zext = zext i32 %lane to i64 3366 %old = atomicrmw sub i64 addrspace(3)* @local_var64, i64 %zext acq_rel 3367 store i64 %old, i64 addrspace(1)* %out 3368 ret void 3369} 3370 3371define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) { 3372; 3373; 3374; GFX7LESS-LABEL: and_i32_varying: 3375; GFX7LESS: ; %bb.0: ; %entry 3376; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 3377; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 3378; GFX7LESS-NEXT: s_mov_b32 m0, -1 3379; GFX7LESS-NEXT: 
s_waitcnt lgkmcnt(0) 3380; GFX7LESS-NEXT: ds_and_rtn_b32 v0, v1, v0 3381; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3382; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 3383; GFX7LESS-NEXT: s_mov_b32 s2, -1 3384; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 3385; GFX7LESS-NEXT: s_endpgm 3386; 3387; GFX8-LABEL: and_i32_varying: 3388; GFX8: ; %bb.0: ; %entry 3389; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3390; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 3391; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 3392; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 3393; GFX8-NEXT: v_mov_b32_e32 v1, -1 3394; GFX8-NEXT: s_mov_b64 exec, s[2:3] 3395; GFX8-NEXT: v_mov_b32_e32 v2, v0 3396; GFX8-NEXT: s_not_b64 exec, exec 3397; GFX8-NEXT: v_mov_b32_e32 v2, -1 3398; GFX8-NEXT: s_not_b64 exec, exec 3399; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 3400; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3401; GFX8-NEXT: s_nop 1 3402; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 3403; GFX8-NEXT: s_nop 1 3404; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 3405; GFX8-NEXT: s_nop 1 3406; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 3407; GFX8-NEXT: s_nop 1 3408; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 3409; GFX8-NEXT: s_nop 1 3410; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 3411; GFX8-NEXT: v_readlane_b32 s4, v2, 63 3412; GFX8-NEXT: s_nop 0 3413; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 3414; GFX8-NEXT: s_mov_b64 exec, s[2:3] 3415; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 3416; GFX8-NEXT: ; implicit-def: $vgpr0 3417; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 3418; GFX8-NEXT: s_cbranch_execz .LBB14_2 3419; GFX8-NEXT: ; %bb.1: 3420; GFX8-NEXT: v_mov_b32_e32 v0, 0 3421; GFX8-NEXT: v_mov_b32_e32 v3, s4 3422; GFX8-NEXT: s_mov_b32 m0, -1 3423; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3424; GFX8-NEXT: ds_and_rtn_b32 
v0, v0, v3 3425; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3426; GFX8-NEXT: .LBB14_2: 3427; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 3428; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3429; GFX8-NEXT: v_readfirstlane_b32 s2, v0 3430; GFX8-NEXT: v_mov_b32_e32 v0, v1 3431; GFX8-NEXT: v_and_b32_e32 v0, s2, v0 3432; GFX8-NEXT: s_mov_b32 s3, 0xf000 3433; GFX8-NEXT: s_mov_b32 s2, -1 3434; GFX8-NEXT: s_nop 0 3435; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 3436; GFX8-NEXT: s_endpgm 3437; 3438; GFX9-LABEL: and_i32_varying: 3439; GFX9: ; %bb.0: ; %entry 3440; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3441; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 3442; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 3443; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 3444; GFX9-NEXT: v_mov_b32_e32 v1, -1 3445; GFX9-NEXT: s_mov_b64 exec, s[2:3] 3446; GFX9-NEXT: v_mov_b32_e32 v2, v0 3447; GFX9-NEXT: s_not_b64 exec, exec 3448; GFX9-NEXT: v_mov_b32_e32 v2, -1 3449; GFX9-NEXT: s_not_b64 exec, exec 3450; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 3451; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3452; GFX9-NEXT: s_nop 1 3453; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 3454; GFX9-NEXT: s_nop 1 3455; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 3456; GFX9-NEXT: s_nop 1 3457; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 3458; GFX9-NEXT: s_nop 1 3459; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 3460; GFX9-NEXT: s_nop 1 3461; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 3462; GFX9-NEXT: v_readlane_b32 s4, v2, 63 3463; GFX9-NEXT: s_nop 0 3464; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 3465; GFX9-NEXT: s_mov_b64 exec, s[2:3] 3466; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 3467; GFX9-NEXT: ; implicit-def: $vgpr0 3468; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 3469; GFX9-NEXT: s_cbranch_execz .LBB14_2 3470; GFX9-NEXT: ; %bb.1: 
3471; GFX9-NEXT: v_mov_b32_e32 v0, 0 3472; GFX9-NEXT: v_mov_b32_e32 v3, s4 3473; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3474; GFX9-NEXT: ds_and_rtn_b32 v0, v0, v3 3475; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3476; GFX9-NEXT: .LBB14_2: 3477; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 3478; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3479; GFX9-NEXT: v_readfirstlane_b32 s2, v0 3480; GFX9-NEXT: v_mov_b32_e32 v0, v1 3481; GFX9-NEXT: v_and_b32_e32 v0, s2, v0 3482; GFX9-NEXT: s_mov_b32 s3, 0xf000 3483; GFX9-NEXT: s_mov_b32 s2, -1 3484; GFX9-NEXT: s_nop 0 3485; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 3486; GFX9-NEXT: s_endpgm 3487; 3488; GFX1064-LABEL: and_i32_varying: 3489; GFX1064: ; %bb.0: ; %entry 3490; GFX1064-NEXT: v_mov_b32_e32 v1, v0 3491; GFX1064-NEXT: s_not_b64 exec, exec 3492; GFX1064-NEXT: v_mov_b32_e32 v1, -1 3493; GFX1064-NEXT: s_not_b64 exec, exec 3494; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 3495; GFX1064-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 3496; GFX1064-NEXT: v_mov_b32_e32 v3, -1 3497; GFX1064-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 3498; GFX1064-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 3499; GFX1064-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 3500; GFX1064-NEXT: v_mov_b32_e32 v2, v1 3501; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 3502; GFX1064-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 3503; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 3504; GFX1064-NEXT: v_mov_b32_e32 v2, s4 3505; GFX1064-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 3506; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 3507; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 3508; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 3509; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3510; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 3511; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 3512; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 
3513; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 3514; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3515; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 3516; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 3517; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 3518; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 3519; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 3520; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 3521; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 3522; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 3523; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 3524; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3525; GFX1064-NEXT: s_mov_b32 s2, -1 3526; GFX1064-NEXT: ; implicit-def: $vgpr0 3527; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 3528; GFX1064-NEXT: s_cbranch_execz .LBB14_2 3529; GFX1064-NEXT: ; %bb.1: 3530; GFX1064-NEXT: v_mov_b32_e32 v0, 0 3531; GFX1064-NEXT: v_mov_b32_e32 v4, s7 3532; GFX1064-NEXT: s_mov_b32 s3, s7 3533; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3534; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 3535; GFX1064-NEXT: ds_and_rtn_b32 v0, v0, v4 3536; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3537; GFX1064-NEXT: buffer_gl0_inv 3538; GFX1064-NEXT: .LBB14_2: 3539; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 3540; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 3541; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 3542; GFX1064-NEXT: v_mov_b32_e32 v0, v3 3543; GFX1064-NEXT: v_and_b32_e32 v0, s3, v0 3544; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 3545; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3546; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 3547; GFX1064-NEXT: s_endpgm 3548; 3549; GFX1032-LABEL: and_i32_varying: 3550; GFX1032: ; %bb.0: ; %entry 3551; GFX1032-NEXT: v_mov_b32_e32 v1, v0 3552; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 3553; GFX1032-NEXT: v_mov_b32_e32 v1, -1 3554; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 3555; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3556; GFX1032-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 3557; GFX1032-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf 
bank_mask:0xf 3558; GFX1032-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 3559; GFX1032-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 3560; GFX1032-NEXT: v_mov_b32_e32 v2, v1 3561; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 3562; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3563; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3564; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3565; GFX1032-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 3566; GFX1032-NEXT: v_mov_b32_e32 v3, -1 3567; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 3568; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 3569; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 3570; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3571; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3572; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3573; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 3574; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3575; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 3576; GFX1032-NEXT: s_mov_b32 s2, -1 3577; GFX1032-NEXT: ; implicit-def: $vgpr0 3578; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 3579; GFX1032-NEXT: s_cbranch_execz .LBB14_2 3580; GFX1032-NEXT: ; %bb.1: 3581; GFX1032-NEXT: v_mov_b32_e32 v0, 0 3582; GFX1032-NEXT: v_mov_b32_e32 v4, s4 3583; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3584; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 3585; GFX1032-NEXT: ds_and_rtn_b32 v0, v0, v4 3586; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3587; GFX1032-NEXT: buffer_gl0_inv 3588; GFX1032-NEXT: .LBB14_2: 3589; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 3590; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 3591; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 3592; GFX1032-NEXT: v_mov_b32_e32 v0, v3 3593; GFX1032-NEXT: v_and_b32_e32 v0, s3, v0 3594; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 3595; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3596; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 3597; GFX1032-NEXT: s_endpgm 3598; 3599; GFX1164-LABEL: and_i32_varying: 3600; GFX1164: ; %bb.0: 
; %entry 3601; GFX1164-NEXT: v_mov_b32_e32 v1, v0 3602; GFX1164-NEXT: s_not_b64 exec, exec 3603; GFX1164-NEXT: v_mov_b32_e32 v1, -1 3604; GFX1164-NEXT: s_not_b64 exec, exec 3605; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 3606; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) 3607; GFX1164-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 3608; GFX1164-NEXT: v_mov_b32_e32 v3, -1 3609; GFX1164-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 3610; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 3611; GFX1164-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 3612; GFX1164-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 3613; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 3614; GFX1164-NEXT: v_mov_b32_e32 v2, v1 3615; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1 3616; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 3617; GFX1164-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 3618; GFX1164-NEXT: v_readlane_b32 s4, v1, 31 3619; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 3620; GFX1164-NEXT: v_mov_b32_e32 v2, s4 3621; GFX1164-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 3622; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) 3623; GFX1164-NEXT: v_readlane_b32 s4, v1, 15 3624; GFX1164-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 3625; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 3626; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 3627; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 3628; GFX1164-NEXT: v_readlane_b32 s5, v1, 31 3629; GFX1164-NEXT: v_writelane_b32 v3, s4, 16 3630; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 3631; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 3632; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3633; GFX1164-NEXT: 
s_or_saveexec_b64 s[2:3], -1 3634; GFX1164-NEXT: v_readlane_b32 s7, v1, 63 3635; GFX1164-NEXT: v_readlane_b32 s6, v1, 47 3636; GFX1164-NEXT: v_writelane_b32 v3, s5, 32 3637; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 3638; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) 3639; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 3640; GFX1164-NEXT: s_or_saveexec_b64 s[4:5], -1 3641; GFX1164-NEXT: v_writelane_b32 v3, s6, 48 3642; GFX1164-NEXT: s_mov_b64 exec, s[4:5] 3643; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3644; GFX1164-NEXT: s_mov_b32 s2, -1 3645; GFX1164-NEXT: ; implicit-def: $vgpr0 3646; GFX1164-NEXT: s_and_saveexec_b64 s[4:5], vcc 3647; GFX1164-NEXT: s_cbranch_execz .LBB14_2 3648; GFX1164-NEXT: ; %bb.1: 3649; GFX1164-NEXT: v_mov_b32_e32 v0, 0 3650; GFX1164-NEXT: v_mov_b32_e32 v4, s7 3651; GFX1164-NEXT: s_mov_b32 s3, s7 3652; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3653; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 3654; GFX1164-NEXT: ds_and_rtn_b32 v0, v0, v4 3655; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 3656; GFX1164-NEXT: buffer_gl0_inv 3657; GFX1164-NEXT: .LBB14_2: 3658; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] 3659; GFX1164-NEXT: v_readfirstlane_b32 s3, v0 3660; GFX1164-NEXT: v_mov_b32_e32 v0, v3 3661; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) 3662; GFX1164-NEXT: v_and_b32_e32 v0, s3, v0 3663; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 3664; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 3665; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 3666; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 3667; GFX1164-NEXT: s_endpgm 3668; 3669; GFX1132-LABEL: and_i32_varying: 3670; GFX1132: ; %bb.0: ; %entry 3671; GFX1132-NEXT: v_mov_b32_e32 v1, v0 3672; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo 3673; GFX1132-NEXT: v_mov_b32_e32 v1, -1 3674; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo 3675; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 3676; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 3677; GFX1132-NEXT: 
v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 3678; GFX1132-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 3679; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 3680; GFX1132-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 3681; GFX1132-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 3682; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 3683; GFX1132-NEXT: v_mov_b32_e32 v2, v1 3684; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1 3685; GFX1132-NEXT: s_mov_b32 exec_lo, s2 3686; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 3687; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 3688; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) 3689; GFX1132-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 3690; GFX1132-NEXT: v_mov_b32_e32 v3, -1 3691; GFX1132-NEXT: v_readlane_b32 s3, v1, 15 3692; GFX1132-NEXT: v_readlane_b32 s4, v1, 31 3693; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) 3694; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 3695; GFX1132-NEXT: s_mov_b32 exec_lo, s2 3696; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3697; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 3698; GFX1132-NEXT: v_writelane_b32 v3, s3, 16 3699; GFX1132-NEXT: s_mov_b32 exec_lo, s2 3700; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) 3701; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 3702; GFX1132-NEXT: s_mov_b32 s2, -1 3703; GFX1132-NEXT: ; implicit-def: $vgpr0 3704; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo 3705; GFX1132-NEXT: s_cbranch_execz .LBB14_2 3706; GFX1132-NEXT: ; %bb.1: 3707; GFX1132-NEXT: v_mov_b32_e32 v0, 0 3708; GFX1132-NEXT: v_mov_b32_e32 v4, s4 3709; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3710; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 3711; GFX1132-NEXT: ds_and_rtn_b32 v0, v0, v4 3712; GFX1132-NEXT: 
s_waitcnt lgkmcnt(0) 3713; GFX1132-NEXT: buffer_gl0_inv 3714; GFX1132-NEXT: .LBB14_2: 3715; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 3716; GFX1132-NEXT: v_readfirstlane_b32 s3, v0 3717; GFX1132-NEXT: v_mov_b32_e32 v0, v3 3718; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) 3719; GFX1132-NEXT: v_and_b32_e32 v0, s3, v0 3720; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 3721; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 3722; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 3723; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 3724; GFX1132-NEXT: s_endpgm 3725entry: 3726 %lane = call i32 @llvm.amdgcn.workitem.id.x() 3727 %old = atomicrmw and i32 addrspace(3)* @local_var32, i32 %lane acq_rel 3728 store i32 %old, i32 addrspace(1)* %out 3729 ret void 3730} 3731 3732define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) { 3733; 3734; 3735; GFX7LESS-LABEL: or_i32_varying: 3736; GFX7LESS: ; %bb.0: ; %entry 3737; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 3738; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 3739; GFX7LESS-NEXT: s_mov_b32 m0, -1 3740; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3741; GFX7LESS-NEXT: ds_or_rtn_b32 v0, v1, v0 3742; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3743; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 3744; GFX7LESS-NEXT: s_mov_b32 s2, -1 3745; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 3746; GFX7LESS-NEXT: s_endpgm 3747; 3748; GFX8-LABEL: or_i32_varying: 3749; GFX8: ; %bb.0: ; %entry 3750; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3751; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 3752; GFX8-NEXT: v_mov_b32_e32 v1, 0 3753; GFX8-NEXT: s_mov_b64 exec, s[2:3] 3754; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 3755; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 3756; GFX8-NEXT: v_mov_b32_e32 v2, v0 3757; GFX8-NEXT: s_not_b64 exec, exec 3758; GFX8-NEXT: v_mov_b32_e32 v2, 0 3759; GFX8-NEXT: s_not_b64 exec, exec 3760; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 3761; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 3762; GFX8-NEXT: 
s_nop 1 3763; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 3764; GFX8-NEXT: s_nop 1 3765; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 3766; GFX8-NEXT: s_nop 1 3767; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 3768; GFX8-NEXT: s_nop 1 3769; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 3770; GFX8-NEXT: s_nop 1 3771; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 3772; GFX8-NEXT: v_readlane_b32 s4, v2, 63 3773; GFX8-NEXT: s_nop 0 3774; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 3775; GFX8-NEXT: s_mov_b64 exec, s[2:3] 3776; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 3777; GFX8-NEXT: ; implicit-def: $vgpr0 3778; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 3779; GFX8-NEXT: s_cbranch_execz .LBB15_2 3780; GFX8-NEXT: ; %bb.1: 3781; GFX8-NEXT: v_mov_b32_e32 v0, 0 3782; GFX8-NEXT: v_mov_b32_e32 v3, s4 3783; GFX8-NEXT: s_mov_b32 m0, -1 3784; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3785; GFX8-NEXT: ds_or_rtn_b32 v0, v0, v3 3786; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3787; GFX8-NEXT: .LBB15_2: 3788; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 3789; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3790; GFX8-NEXT: v_readfirstlane_b32 s2, v0 3791; GFX8-NEXT: v_mov_b32_e32 v0, v1 3792; GFX8-NEXT: v_or_b32_e32 v0, s2, v0 3793; GFX8-NEXT: s_mov_b32 s3, 0xf000 3794; GFX8-NEXT: s_mov_b32 s2, -1 3795; GFX8-NEXT: s_nop 0 3796; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 3797; GFX8-NEXT: s_endpgm 3798; 3799; GFX9-LABEL: or_i32_varying: 3800; GFX9: ; %bb.0: ; %entry 3801; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3802; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 3803; GFX9-NEXT: v_mov_b32_e32 v1, 0 3804; GFX9-NEXT: s_mov_b64 exec, s[2:3] 3805; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 3806; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 3807; GFX9-NEXT: v_mov_b32_e32 v2, v0 3808; GFX9-NEXT: s_not_b64 exec, exec 3809; 
GFX9-NEXT: v_mov_b32_e32 v2, 0 3810; GFX9-NEXT: s_not_b64 exec, exec 3811; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 3812; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 3813; GFX9-NEXT: s_nop 1 3814; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 3815; GFX9-NEXT: s_nop 1 3816; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 3817; GFX9-NEXT: s_nop 1 3818; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 3819; GFX9-NEXT: s_nop 1 3820; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 3821; GFX9-NEXT: s_nop 1 3822; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 3823; GFX9-NEXT: v_readlane_b32 s4, v2, 63 3824; GFX9-NEXT: s_nop 0 3825; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 3826; GFX9-NEXT: s_mov_b64 exec, s[2:3] 3827; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 3828; GFX9-NEXT: ; implicit-def: $vgpr0 3829; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 3830; GFX9-NEXT: s_cbranch_execz .LBB15_2 3831; GFX9-NEXT: ; %bb.1: 3832; GFX9-NEXT: v_mov_b32_e32 v0, 0 3833; GFX9-NEXT: v_mov_b32_e32 v3, s4 3834; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3835; GFX9-NEXT: ds_or_rtn_b32 v0, v0, v3 3836; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3837; GFX9-NEXT: .LBB15_2: 3838; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 3839; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3840; GFX9-NEXT: v_readfirstlane_b32 s2, v0 3841; GFX9-NEXT: v_mov_b32_e32 v0, v1 3842; GFX9-NEXT: v_or_b32_e32 v0, s2, v0 3843; GFX9-NEXT: s_mov_b32 s3, 0xf000 3844; GFX9-NEXT: s_mov_b32 s2, -1 3845; GFX9-NEXT: s_nop 0 3846; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 3847; GFX9-NEXT: s_endpgm 3848; 3849; GFX1064-LABEL: or_i32_varying: 3850; GFX1064: ; %bb.0: ; %entry 3851; GFX1064-NEXT: v_mov_b32_e32 v1, v0 3852; GFX1064-NEXT: s_not_b64 exec, exec 3853; GFX1064-NEXT: v_mov_b32_e32 v1, 0 3854; GFX1064-NEXT: s_not_b64 exec, exec 3855; 
GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 3856; GFX1064-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 3857; GFX1064-NEXT: v_mov_b32_e32 v3, 0 3858; GFX1064-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 3859; GFX1064-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 3860; GFX1064-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 3861; GFX1064-NEXT: v_mov_b32_e32 v2, v1 3862; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 3863; GFX1064-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 3864; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 3865; GFX1064-NEXT: v_mov_b32_e32 v2, s4 3866; GFX1064-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 3867; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 3868; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 3869; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 3870; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3871; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 3872; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 3873; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 3874; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 3875; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3876; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 3877; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 3878; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 3879; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 3880; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 3881; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 3882; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 3883; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 3884; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 3885; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3886; GFX1064-NEXT: s_mov_b32 s2, -1 3887; GFX1064-NEXT: ; implicit-def: $vgpr0 3888; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 3889; GFX1064-NEXT: s_cbranch_execz .LBB15_2 3890; GFX1064-NEXT: ; %bb.1: 3891; GFX1064-NEXT: v_mov_b32_e32 v0, 
0 3892; GFX1064-NEXT: v_mov_b32_e32 v4, s7 3893; GFX1064-NEXT: s_mov_b32 s3, s7 3894; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3895; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 3896; GFX1064-NEXT: ds_or_rtn_b32 v0, v0, v4 3897; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3898; GFX1064-NEXT: buffer_gl0_inv 3899; GFX1064-NEXT: .LBB15_2: 3900; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 3901; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 3902; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 3903; GFX1064-NEXT: v_mov_b32_e32 v0, v3 3904; GFX1064-NEXT: v_or_b32_e32 v0, s3, v0 3905; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 3906; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3907; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 3908; GFX1064-NEXT: s_endpgm 3909; 3910; GFX1032-LABEL: or_i32_varying: 3911; GFX1032: ; %bb.0: ; %entry 3912; GFX1032-NEXT: v_mov_b32_e32 v1, v0 3913; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 3914; GFX1032-NEXT: v_mov_b32_e32 v1, 0 3915; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 3916; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3917; GFX1032-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 3918; GFX1032-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 3919; GFX1032-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 3920; GFX1032-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 3921; GFX1032-NEXT: v_mov_b32_e32 v2, v1 3922; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 3923; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3924; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3925; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3926; GFX1032-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 3927; GFX1032-NEXT: v_mov_b32_e32 v3, 0 3928; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 3929; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 3930; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 3931; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3932; GFX1032-NEXT: 
v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3933; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3934; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 3935; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3936; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 3937; GFX1032-NEXT: s_mov_b32 s2, -1 3938; GFX1032-NEXT: ; implicit-def: $vgpr0 3939; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 3940; GFX1032-NEXT: s_cbranch_execz .LBB15_2 3941; GFX1032-NEXT: ; %bb.1: 3942; GFX1032-NEXT: v_mov_b32_e32 v0, 0 3943; GFX1032-NEXT: v_mov_b32_e32 v4, s4 3944; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3945; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 3946; GFX1032-NEXT: ds_or_rtn_b32 v0, v0, v4 3947; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3948; GFX1032-NEXT: buffer_gl0_inv 3949; GFX1032-NEXT: .LBB15_2: 3950; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 3951; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 3952; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 3953; GFX1032-NEXT: v_mov_b32_e32 v0, v3 3954; GFX1032-NEXT: v_or_b32_e32 v0, s3, v0 3955; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 3956; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3957; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 3958; GFX1032-NEXT: s_endpgm 3959; 3960; GFX1164-LABEL: or_i32_varying: 3961; GFX1164: ; %bb.0: ; %entry 3962; GFX1164-NEXT: v_mov_b32_e32 v1, v0 3963; GFX1164-NEXT: s_not_b64 exec, exec 3964; GFX1164-NEXT: v_mov_b32_e32 v1, 0 3965; GFX1164-NEXT: s_not_b64 exec, exec 3966; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 3967; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) 3968; GFX1164-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 3969; GFX1164-NEXT: v_mov_b32_e32 v3, 0 3970; GFX1164-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 3971; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 3972; GFX1164-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 3973; GFX1164-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 
row_mask:0xf bank_mask:0xf bound_ctrl:1 3974; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 3975; GFX1164-NEXT: v_mov_b32_e32 v2, v1 3976; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1 3977; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 3978; GFX1164-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 3979; GFX1164-NEXT: v_readlane_b32 s4, v1, 31 3980; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 3981; GFX1164-NEXT: v_mov_b32_e32 v2, s4 3982; GFX1164-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 3983; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) 3984; GFX1164-NEXT: v_readlane_b32 s4, v1, 15 3985; GFX1164-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 3986; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 3987; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 3988; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 3989; GFX1164-NEXT: v_readlane_b32 s5, v1, 31 3990; GFX1164-NEXT: v_writelane_b32 v3, s4, 16 3991; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 3992; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 3993; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3994; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 3995; GFX1164-NEXT: v_readlane_b32 s7, v1, 63 3996; GFX1164-NEXT: v_readlane_b32 s6, v1, 47 3997; GFX1164-NEXT: v_writelane_b32 v3, s5, 32 3998; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 3999; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) 4000; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 4001; GFX1164-NEXT: s_or_saveexec_b64 s[4:5], -1 4002; GFX1164-NEXT: v_writelane_b32 v3, s6, 48 4003; GFX1164-NEXT: s_mov_b64 exec, s[4:5] 4004; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4005; GFX1164-NEXT: s_mov_b32 s2, -1 4006; GFX1164-NEXT: ; implicit-def: $vgpr0 4007; GFX1164-NEXT: s_and_saveexec_b64 s[4:5], vcc 4008; GFX1164-NEXT: s_cbranch_execz .LBB15_2 4009; 
GFX1164-NEXT: ; %bb.1: 4010; GFX1164-NEXT: v_mov_b32_e32 v0, 0 4011; GFX1164-NEXT: v_mov_b32_e32 v4, s7 4012; GFX1164-NEXT: s_mov_b32 s3, s7 4013; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4014; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 4015; GFX1164-NEXT: ds_or_rtn_b32 v0, v0, v4 4016; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 4017; GFX1164-NEXT: buffer_gl0_inv 4018; GFX1164-NEXT: .LBB15_2: 4019; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] 4020; GFX1164-NEXT: v_readfirstlane_b32 s3, v0 4021; GFX1164-NEXT: v_mov_b32_e32 v0, v3 4022; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) 4023; GFX1164-NEXT: v_or_b32_e32 v0, s3, v0 4024; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 4025; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 4026; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 4027; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 4028; GFX1164-NEXT: s_endpgm 4029; 4030; GFX1132-LABEL: or_i32_varying: 4031; GFX1132: ; %bb.0: ; %entry 4032; GFX1132-NEXT: v_mov_b32_e32 v1, v0 4033; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo 4034; GFX1132-NEXT: v_mov_b32_e32 v1, 0 4035; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo 4036; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 4037; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 4038; GFX1132-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 4039; GFX1132-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 4040; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 4041; GFX1132-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 4042; GFX1132-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 4043; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 4044; GFX1132-NEXT: v_mov_b32_e32 v2, v1 4045; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1 4046; GFX1132-NEXT: s_mov_b32 exec_lo, s2 4047; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 4048; 
GFX1132-NEXT: s_or_saveexec_b32 s2, -1 4049; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) 4050; GFX1132-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 4051; GFX1132-NEXT: v_mov_b32_e32 v3, 0 4052; GFX1132-NEXT: v_readlane_b32 s3, v1, 15 4053; GFX1132-NEXT: v_readlane_b32 s4, v1, 31 4054; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) 4055; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 4056; GFX1132-NEXT: s_mov_b32 exec_lo, s2 4057; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4058; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 4059; GFX1132-NEXT: v_writelane_b32 v3, s3, 16 4060; GFX1132-NEXT: s_mov_b32 exec_lo, s2 4061; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) 4062; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 4063; GFX1132-NEXT: s_mov_b32 s2, -1 4064; GFX1132-NEXT: ; implicit-def: $vgpr0 4065; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo 4066; GFX1132-NEXT: s_cbranch_execz .LBB15_2 4067; GFX1132-NEXT: ; %bb.1: 4068; GFX1132-NEXT: v_mov_b32_e32 v0, 0 4069; GFX1132-NEXT: v_mov_b32_e32 v4, s4 4070; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4071; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 4072; GFX1132-NEXT: ds_or_rtn_b32 v0, v0, v4 4073; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 4074; GFX1132-NEXT: buffer_gl0_inv 4075; GFX1132-NEXT: .LBB15_2: 4076; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 4077; GFX1132-NEXT: v_readfirstlane_b32 s3, v0 4078; GFX1132-NEXT: v_mov_b32_e32 v0, v3 4079; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) 4080; GFX1132-NEXT: v_or_b32_e32 v0, s3, v0 4081; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 4082; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 4083; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 4084; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 4085; GFX1132-NEXT: s_endpgm 4086entry: 4087 %lane = call i32 @llvm.amdgcn.workitem.id.x() 4088 %old = atomicrmw or i32 addrspace(3)* @local_var32, i32 %lane acq_rel 4089 
store i32 %old, i32 addrspace(1)* %out 4090 ret void 4091} 4092 4093define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) { 4094; 4095; 4096; GFX7LESS-LABEL: xor_i32_varying: 4097; GFX7LESS: ; %bb.0: ; %entry 4098; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 4099; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 4100; GFX7LESS-NEXT: s_mov_b32 m0, -1 4101; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 4102; GFX7LESS-NEXT: ds_xor_rtn_b32 v0, v1, v0 4103; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 4104; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 4105; GFX7LESS-NEXT: s_mov_b32 s2, -1 4106; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 4107; GFX7LESS-NEXT: s_endpgm 4108; 4109; GFX8-LABEL: xor_i32_varying: 4110; GFX8: ; %bb.0: ; %entry 4111; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4112; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 4113; GFX8-NEXT: v_mov_b32_e32 v1, 0 4114; GFX8-NEXT: s_mov_b64 exec, s[2:3] 4115; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 4116; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 4117; GFX8-NEXT: v_mov_b32_e32 v2, v0 4118; GFX8-NEXT: s_not_b64 exec, exec 4119; GFX8-NEXT: v_mov_b32_e32 v2, 0 4120; GFX8-NEXT: s_not_b64 exec, exec 4121; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 4122; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 4123; GFX8-NEXT: s_nop 1 4124; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 4125; GFX8-NEXT: s_nop 1 4126; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 4127; GFX8-NEXT: s_nop 1 4128; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 4129; GFX8-NEXT: s_nop 1 4130; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 4131; GFX8-NEXT: s_nop 1 4132; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 4133; GFX8-NEXT: v_readlane_b32 s4, v2, 63 4134; GFX8-NEXT: s_nop 0 4135; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf 
bank_mask:0xf 4136; GFX8-NEXT: s_mov_b64 exec, s[2:3] 4137; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 4138; GFX8-NEXT: ; implicit-def: $vgpr0 4139; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 4140; GFX8-NEXT: s_cbranch_execz .LBB16_2 4141; GFX8-NEXT: ; %bb.1: 4142; GFX8-NEXT: v_mov_b32_e32 v0, 0 4143; GFX8-NEXT: v_mov_b32_e32 v3, s4 4144; GFX8-NEXT: s_mov_b32 m0, -1 4145; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4146; GFX8-NEXT: ds_xor_rtn_b32 v0, v0, v3 4147; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4148; GFX8-NEXT: .LBB16_2: 4149; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 4150; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4151; GFX8-NEXT: v_readfirstlane_b32 s2, v0 4152; GFX8-NEXT: v_mov_b32_e32 v0, v1 4153; GFX8-NEXT: v_xor_b32_e32 v0, s2, v0 4154; GFX8-NEXT: s_mov_b32 s3, 0xf000 4155; GFX8-NEXT: s_mov_b32 s2, -1 4156; GFX8-NEXT: s_nop 0 4157; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 4158; GFX8-NEXT: s_endpgm 4159; 4160; GFX9-LABEL: xor_i32_varying: 4161; GFX9: ; %bb.0: ; %entry 4162; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4163; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 4164; GFX9-NEXT: v_mov_b32_e32 v1, 0 4165; GFX9-NEXT: s_mov_b64 exec, s[2:3] 4166; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 4167; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 4168; GFX9-NEXT: v_mov_b32_e32 v2, v0 4169; GFX9-NEXT: s_not_b64 exec, exec 4170; GFX9-NEXT: v_mov_b32_e32 v2, 0 4171; GFX9-NEXT: s_not_b64 exec, exec 4172; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 4173; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 4174; GFX9-NEXT: s_nop 1 4175; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 4176; GFX9-NEXT: s_nop 1 4177; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 4178; GFX9-NEXT: s_nop 1 4179; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 4180; GFX9-NEXT: s_nop 1 4181; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 4182; 
GFX9-NEXT: s_nop 1 4183; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 4184; GFX9-NEXT: v_readlane_b32 s4, v2, 63 4185; GFX9-NEXT: s_nop 0 4186; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 4187; GFX9-NEXT: s_mov_b64 exec, s[2:3] 4188; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 4189; GFX9-NEXT: ; implicit-def: $vgpr0 4190; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 4191; GFX9-NEXT: s_cbranch_execz .LBB16_2 4192; GFX9-NEXT: ; %bb.1: 4193; GFX9-NEXT: v_mov_b32_e32 v0, 0 4194; GFX9-NEXT: v_mov_b32_e32 v3, s4 4195; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4196; GFX9-NEXT: ds_xor_rtn_b32 v0, v0, v3 4197; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4198; GFX9-NEXT: .LBB16_2: 4199; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 4200; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4201; GFX9-NEXT: v_readfirstlane_b32 s2, v0 4202; GFX9-NEXT: v_mov_b32_e32 v0, v1 4203; GFX9-NEXT: v_xor_b32_e32 v0, s2, v0 4204; GFX9-NEXT: s_mov_b32 s3, 0xf000 4205; GFX9-NEXT: s_mov_b32 s2, -1 4206; GFX9-NEXT: s_nop 0 4207; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 4208; GFX9-NEXT: s_endpgm 4209; 4210; GFX1064-LABEL: xor_i32_varying: 4211; GFX1064: ; %bb.0: ; %entry 4212; GFX1064-NEXT: v_mov_b32_e32 v1, v0 4213; GFX1064-NEXT: s_not_b64 exec, exec 4214; GFX1064-NEXT: v_mov_b32_e32 v1, 0 4215; GFX1064-NEXT: s_not_b64 exec, exec 4216; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 4217; GFX1064-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 4218; GFX1064-NEXT: v_mov_b32_e32 v3, 0 4219; GFX1064-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 4220; GFX1064-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 4221; GFX1064-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 4222; GFX1064-NEXT: v_mov_b32_e32 v2, v1 4223; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 4224; GFX1064-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 4225; 
GFX1064-NEXT: v_readlane_b32 s4, v1, 31 4226; GFX1064-NEXT: v_mov_b32_e32 v2, s4 4227; GFX1064-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 4228; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 4229; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 4230; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 4231; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4232; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 4233; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 4234; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 4235; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 4236; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4237; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 4238; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 4239; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 4240; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 4241; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 4242; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 4243; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 4244; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 4245; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 4246; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4247; GFX1064-NEXT: s_mov_b32 s2, -1 4248; GFX1064-NEXT: ; implicit-def: $vgpr0 4249; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 4250; GFX1064-NEXT: s_cbranch_execz .LBB16_2 4251; GFX1064-NEXT: ; %bb.1: 4252; GFX1064-NEXT: v_mov_b32_e32 v0, 0 4253; GFX1064-NEXT: v_mov_b32_e32 v4, s7 4254; GFX1064-NEXT: s_mov_b32 s3, s7 4255; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4256; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 4257; GFX1064-NEXT: ds_xor_rtn_b32 v0, v0, v4 4258; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 4259; GFX1064-NEXT: buffer_gl0_inv 4260; GFX1064-NEXT: .LBB16_2: 4261; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 4262; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 4263; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 4264; GFX1064-NEXT: v_mov_b32_e32 v0, v3 4265; GFX1064-NEXT: v_xor_b32_e32 v0, s3, v0 4266; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 4267; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 4268; 
GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 4269; GFX1064-NEXT: s_endpgm 4270; 4271; GFX1032-LABEL: xor_i32_varying: 4272; GFX1032: ; %bb.0: ; %entry 4273; GFX1032-NEXT: v_mov_b32_e32 v1, v0 4274; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 4275; GFX1032-NEXT: v_mov_b32_e32 v1, 0 4276; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 4277; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 4278; GFX1032-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 4279; GFX1032-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 4280; GFX1032-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 4281; GFX1032-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 4282; GFX1032-NEXT: v_mov_b32_e32 v2, v1 4283; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 4284; GFX1032-NEXT: s_mov_b32 exec_lo, s2 4285; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4286; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 4287; GFX1032-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 4288; GFX1032-NEXT: v_mov_b32_e32 v3, 0 4289; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 4290; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 4291; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 4292; GFX1032-NEXT: s_mov_b32 exec_lo, s2 4293; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4294; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 4295; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 4296; GFX1032-NEXT: s_mov_b32 exec_lo, s2 4297; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 4298; GFX1032-NEXT: s_mov_b32 s2, -1 4299; GFX1032-NEXT: ; implicit-def: $vgpr0 4300; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 4301; GFX1032-NEXT: s_cbranch_execz .LBB16_2 4302; GFX1032-NEXT: ; %bb.1: 4303; GFX1032-NEXT: v_mov_b32_e32 v0, 0 4304; GFX1032-NEXT: v_mov_b32_e32 v4, s4 4305; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4306; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 4307; GFX1032-NEXT: ds_xor_rtn_b32 v0, 
v0, v4 4308; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 4309; GFX1032-NEXT: buffer_gl0_inv 4310; GFX1032-NEXT: .LBB16_2: 4311; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 4312; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 4313; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 4314; GFX1032-NEXT: v_mov_b32_e32 v0, v3 4315; GFX1032-NEXT: v_xor_b32_e32 v0, s3, v0 4316; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 4317; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 4318; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 4319; GFX1032-NEXT: s_endpgm 4320; 4321; GFX1164-LABEL: xor_i32_varying: 4322; GFX1164: ; %bb.0: ; %entry 4323; GFX1164-NEXT: v_mov_b32_e32 v1, v0 4324; GFX1164-NEXT: s_not_b64 exec, exec 4325; GFX1164-NEXT: v_mov_b32_e32 v1, 0 4326; GFX1164-NEXT: s_not_b64 exec, exec 4327; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 4328; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) 4329; GFX1164-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 4330; GFX1164-NEXT: v_mov_b32_e32 v3, 0 4331; GFX1164-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 4332; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 4333; GFX1164-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 4334; GFX1164-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 4335; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 4336; GFX1164-NEXT: v_mov_b32_e32 v2, v1 4337; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1 4338; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 4339; GFX1164-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 4340; GFX1164-NEXT: v_readlane_b32 s4, v1, 31 4341; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 4342; GFX1164-NEXT: v_mov_b32_e32 v2, s4 4343; GFX1164-NEXT: v_xor_b32_dpp v1, v2, v1 
quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 4344; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) 4345; GFX1164-NEXT: v_readlane_b32 s4, v1, 15 4346; GFX1164-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 4347; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 4348; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 4349; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 4350; GFX1164-NEXT: v_readlane_b32 s5, v1, 31 4351; GFX1164-NEXT: v_writelane_b32 v3, s4, 16 4352; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 4353; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 4354; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4355; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 4356; GFX1164-NEXT: v_readlane_b32 s7, v1, 63 4357; GFX1164-NEXT: v_readlane_b32 s6, v1, 47 4358; GFX1164-NEXT: v_writelane_b32 v3, s5, 32 4359; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 4360; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) 4361; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 4362; GFX1164-NEXT: s_or_saveexec_b64 s[4:5], -1 4363; GFX1164-NEXT: v_writelane_b32 v3, s6, 48 4364; GFX1164-NEXT: s_mov_b64 exec, s[4:5] 4365; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4366; GFX1164-NEXT: s_mov_b32 s2, -1 4367; GFX1164-NEXT: ; implicit-def: $vgpr0 4368; GFX1164-NEXT: s_and_saveexec_b64 s[4:5], vcc 4369; GFX1164-NEXT: s_cbranch_execz .LBB16_2 4370; GFX1164-NEXT: ; %bb.1: 4371; GFX1164-NEXT: v_mov_b32_e32 v0, 0 4372; GFX1164-NEXT: v_mov_b32_e32 v4, s7 4373; GFX1164-NEXT: s_mov_b32 s3, s7 4374; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4375; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 4376; GFX1164-NEXT: ds_xor_rtn_b32 v0, v0, v4 4377; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 4378; GFX1164-NEXT: buffer_gl0_inv 4379; GFX1164-NEXT: .LBB16_2: 4380; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] 4381; GFX1164-NEXT: v_readfirstlane_b32 s3, v0 4382; GFX1164-NEXT: v_mov_b32_e32 v0, v3 4383; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) 4384; GFX1164-NEXT: v_xor_b32_e32 v0, s3, v0 4385; 
GFX1164-NEXT: s_mov_b32 s3, 0x31016000 4386; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 4387; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 4388; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 4389; GFX1164-NEXT: s_endpgm 4390; 4391; GFX1132-LABEL: xor_i32_varying: 4392; GFX1132: ; %bb.0: ; %entry 4393; GFX1132-NEXT: v_mov_b32_e32 v1, v0 4394; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo 4395; GFX1132-NEXT: v_mov_b32_e32 v1, 0 4396; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo 4397; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 4398; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 4399; GFX1132-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 4400; GFX1132-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 4401; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 4402; GFX1132-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 4403; GFX1132-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 4404; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 4405; GFX1132-NEXT: v_mov_b32_e32 v2, v1 4406; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1 4407; GFX1132-NEXT: s_mov_b32 exec_lo, s2 4408; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 4409; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 4410; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) 4411; GFX1132-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 4412; GFX1132-NEXT: v_mov_b32_e32 v3, 0 4413; GFX1132-NEXT: v_readlane_b32 s3, v1, 15 4414; GFX1132-NEXT: v_readlane_b32 s4, v1, 31 4415; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) 4416; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 4417; GFX1132-NEXT: s_mov_b32 exec_lo, s2 4418; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4419; 
GFX1132-NEXT: s_or_saveexec_b32 s2, -1 4420; GFX1132-NEXT: v_writelane_b32 v3, s3, 16 4421; GFX1132-NEXT: s_mov_b32 exec_lo, s2 4422; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) 4423; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 4424; GFX1132-NEXT: s_mov_b32 s2, -1 4425; GFX1132-NEXT: ; implicit-def: $vgpr0 4426; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo 4427; GFX1132-NEXT: s_cbranch_execz .LBB16_2 4428; GFX1132-NEXT: ; %bb.1: 4429; GFX1132-NEXT: v_mov_b32_e32 v0, 0 4430; GFX1132-NEXT: v_mov_b32_e32 v4, s4 4431; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4432; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 4433; GFX1132-NEXT: ds_xor_rtn_b32 v0, v0, v4 4434; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 4435; GFX1132-NEXT: buffer_gl0_inv 4436; GFX1132-NEXT: .LBB16_2: 4437; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 4438; GFX1132-NEXT: v_readfirstlane_b32 s3, v0 4439; GFX1132-NEXT: v_mov_b32_e32 v0, v3 4440; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) 4441; GFX1132-NEXT: v_xor_b32_e32 v0, s3, v0 4442; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 4443; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 4444; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 4445; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 4446; GFX1132-NEXT: s_endpgm 4447entry: 4448 %lane = call i32 @llvm.amdgcn.workitem.id.x() 4449 %old = atomicrmw xor i32 addrspace(3)* @local_var32, i32 %lane acq_rel 4450 store i32 %old, i32 addrspace(1)* %out 4451 ret void 4452} 4453 4454define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) { 4455; 4456; 4457; GFX7LESS-LABEL: max_i32_varying: 4458; GFX7LESS: ; %bb.0: ; %entry 4459; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 4460; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 4461; GFX7LESS-NEXT: s_mov_b32 m0, -1 4462; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 4463; GFX7LESS-NEXT: ds_max_rtn_i32 v0, v1, v0 4464; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 4465; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 4466; GFX7LESS-NEXT: s_mov_b32 s2, -1 4467; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 
4468; GFX7LESS-NEXT: s_endpgm 4469; 4470; GFX8-LABEL: max_i32_varying: 4471; GFX8: ; %bb.0: ; %entry 4472; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4473; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 4474; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 4475; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 4476; GFX8-NEXT: v_bfrev_b32_e32 v1, 1 4477; GFX8-NEXT: s_mov_b64 exec, s[2:3] 4478; GFX8-NEXT: v_mov_b32_e32 v2, v0 4479; GFX8-NEXT: s_not_b64 exec, exec 4480; GFX8-NEXT: v_bfrev_b32_e32 v2, 1 4481; GFX8-NEXT: s_not_b64 exec, exec 4482; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 4483; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 4484; GFX8-NEXT: s_nop 1 4485; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 4486; GFX8-NEXT: s_nop 1 4487; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 4488; GFX8-NEXT: s_nop 1 4489; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 4490; GFX8-NEXT: s_nop 1 4491; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 4492; GFX8-NEXT: s_nop 1 4493; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 4494; GFX8-NEXT: v_readlane_b32 s4, v2, 63 4495; GFX8-NEXT: s_nop 0 4496; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 4497; GFX8-NEXT: s_mov_b64 exec, s[2:3] 4498; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 4499; GFX8-NEXT: ; implicit-def: $vgpr0 4500; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 4501; GFX8-NEXT: s_cbranch_execz .LBB17_2 4502; GFX8-NEXT: ; %bb.1: 4503; GFX8-NEXT: v_mov_b32_e32 v0, 0 4504; GFX8-NEXT: v_mov_b32_e32 v3, s4 4505; GFX8-NEXT: s_mov_b32 m0, -1 4506; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4507; GFX8-NEXT: ds_max_rtn_i32 v0, v0, v3 4508; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4509; GFX8-NEXT: .LBB17_2: 4510; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 4511; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4512; GFX8-NEXT: v_readfirstlane_b32 s2, v0 4513; GFX8-NEXT: v_mov_b32_e32 v0, v1 4514; 
GFX8-NEXT: v_max_i32_e32 v0, s2, v0 4515; GFX8-NEXT: s_mov_b32 s3, 0xf000 4516; GFX8-NEXT: s_mov_b32 s2, -1 4517; GFX8-NEXT: s_nop 0 4518; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 4519; GFX8-NEXT: s_endpgm 4520; 4521; GFX9-LABEL: max_i32_varying: 4522; GFX9: ; %bb.0: ; %entry 4523; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4524; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 4525; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 4526; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 4527; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 4528; GFX9-NEXT: s_mov_b64 exec, s[2:3] 4529; GFX9-NEXT: v_mov_b32_e32 v2, v0 4530; GFX9-NEXT: s_not_b64 exec, exec 4531; GFX9-NEXT: v_bfrev_b32_e32 v2, 1 4532; GFX9-NEXT: s_not_b64 exec, exec 4533; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 4534; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 4535; GFX9-NEXT: s_nop 1 4536; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 4537; GFX9-NEXT: s_nop 1 4538; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 4539; GFX9-NEXT: s_nop 1 4540; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 4541; GFX9-NEXT: s_nop 1 4542; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 4543; GFX9-NEXT: s_nop 1 4544; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 4545; GFX9-NEXT: v_readlane_b32 s4, v2, 63 4546; GFX9-NEXT: s_nop 0 4547; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 4548; GFX9-NEXT: s_mov_b64 exec, s[2:3] 4549; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 4550; GFX9-NEXT: ; implicit-def: $vgpr0 4551; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 4552; GFX9-NEXT: s_cbranch_execz .LBB17_2 4553; GFX9-NEXT: ; %bb.1: 4554; GFX9-NEXT: v_mov_b32_e32 v0, 0 4555; GFX9-NEXT: v_mov_b32_e32 v3, s4 4556; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4557; GFX9-NEXT: ds_max_rtn_i32 v0, v0, v3 4558; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4559; GFX9-NEXT: .LBB17_2: 4560; GFX9-NEXT: 
s_or_b64 exec, exec, s[2:3] 4561; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4562; GFX9-NEXT: v_readfirstlane_b32 s2, v0 4563; GFX9-NEXT: v_mov_b32_e32 v0, v1 4564; GFX9-NEXT: v_max_i32_e32 v0, s2, v0 4565; GFX9-NEXT: s_mov_b32 s3, 0xf000 4566; GFX9-NEXT: s_mov_b32 s2, -1 4567; GFX9-NEXT: s_nop 0 4568; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 4569; GFX9-NEXT: s_endpgm 4570; 4571; GFX1064-LABEL: max_i32_varying: 4572; GFX1064: ; %bb.0: ; %entry 4573; GFX1064-NEXT: v_mov_b32_e32 v1, v0 4574; GFX1064-NEXT: s_not_b64 exec, exec 4575; GFX1064-NEXT: v_bfrev_b32_e32 v1, 1 4576; GFX1064-NEXT: s_not_b64 exec, exec 4577; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 4578; GFX1064-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 4579; GFX1064-NEXT: v_bfrev_b32_e32 v3, 1 4580; GFX1064-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 4581; GFX1064-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 4582; GFX1064-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 4583; GFX1064-NEXT: v_mov_b32_e32 v2, v1 4584; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 4585; GFX1064-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 4586; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 4587; GFX1064-NEXT: v_mov_b32_e32 v2, s4 4588; GFX1064-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 4589; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 4590; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 4591; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 4592; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4593; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 4594; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 4595; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 4596; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 4597; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4598; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 4599; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 4600; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 
4601; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 4602; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 4603; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 4604; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 4605; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 4606; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 4607; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4608; GFX1064-NEXT: s_mov_b32 s2, -1 4609; GFX1064-NEXT: ; implicit-def: $vgpr0 4610; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 4611; GFX1064-NEXT: s_cbranch_execz .LBB17_2 4612; GFX1064-NEXT: ; %bb.1: 4613; GFX1064-NEXT: v_mov_b32_e32 v0, 0 4614; GFX1064-NEXT: v_mov_b32_e32 v4, s7 4615; GFX1064-NEXT: s_mov_b32 s3, s7 4616; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4617; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 4618; GFX1064-NEXT: ds_max_rtn_i32 v0, v0, v4 4619; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 4620; GFX1064-NEXT: buffer_gl0_inv 4621; GFX1064-NEXT: .LBB17_2: 4622; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 4623; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 4624; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 4625; GFX1064-NEXT: v_mov_b32_e32 v0, v3 4626; GFX1064-NEXT: v_max_i32_e32 v0, s3, v0 4627; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 4628; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 4629; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 4630; GFX1064-NEXT: s_endpgm 4631; 4632; GFX1032-LABEL: max_i32_varying: 4633; GFX1032: ; %bb.0: ; %entry 4634; GFX1032-NEXT: v_mov_b32_e32 v1, v0 4635; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 4636; GFX1032-NEXT: v_bfrev_b32_e32 v1, 1 4637; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 4638; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 4639; GFX1032-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 4640; GFX1032-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 4641; GFX1032-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 4642; GFX1032-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 4643; GFX1032-NEXT: v_mov_b32_e32 v2, v1 4644; GFX1032-NEXT: 
v_permlanex16_b32 v2, v2, -1, -1 4645; GFX1032-NEXT: s_mov_b32 exec_lo, s2 4646; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4647; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 4648; GFX1032-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 4649; GFX1032-NEXT: v_bfrev_b32_e32 v3, 1 4650; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 4651; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 4652; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 4653; GFX1032-NEXT: s_mov_b32 exec_lo, s2 4654; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4655; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 4656; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 4657; GFX1032-NEXT: s_mov_b32 exec_lo, s2 4658; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 4659; GFX1032-NEXT: s_mov_b32 s2, -1 4660; GFX1032-NEXT: ; implicit-def: $vgpr0 4661; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 4662; GFX1032-NEXT: s_cbranch_execz .LBB17_2 4663; GFX1032-NEXT: ; %bb.1: 4664; GFX1032-NEXT: v_mov_b32_e32 v0, 0 4665; GFX1032-NEXT: v_mov_b32_e32 v4, s4 4666; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4667; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 4668; GFX1032-NEXT: ds_max_rtn_i32 v0, v0, v4 4669; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 4670; GFX1032-NEXT: buffer_gl0_inv 4671; GFX1032-NEXT: .LBB17_2: 4672; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 4673; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 4674; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 4675; GFX1032-NEXT: v_mov_b32_e32 v0, v3 4676; GFX1032-NEXT: v_max_i32_e32 v0, s3, v0 4677; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 4678; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 4679; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 4680; GFX1032-NEXT: s_endpgm 4681; 4682; GFX1164-LABEL: max_i32_varying: 4683; GFX1164: ; %bb.0: ; %entry 4684; GFX1164-NEXT: v_mov_b32_e32 v1, v0 4685; GFX1164-NEXT: s_not_b64 exec, exec 4686; GFX1164-NEXT: v_bfrev_b32_e32 v1, 1 4687; GFX1164-NEXT: s_not_b64 exec, exec 4688; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 4689; 
GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) 4690; GFX1164-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 4691; GFX1164-NEXT: v_bfrev_b32_e32 v3, 1 4692; GFX1164-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 4693; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 4694; GFX1164-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 4695; GFX1164-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 4696; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 4697; GFX1164-NEXT: v_mov_b32_e32 v2, v1 4698; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1 4699; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 4700; GFX1164-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 4701; GFX1164-NEXT: v_readlane_b32 s4, v1, 31 4702; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 4703; GFX1164-NEXT: v_mov_b32_e32 v2, s4 4704; GFX1164-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 4705; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) 4706; GFX1164-NEXT: v_readlane_b32 s4, v1, 15 4707; GFX1164-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 4708; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 4709; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 4710; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 4711; GFX1164-NEXT: v_readlane_b32 s5, v1, 31 4712; GFX1164-NEXT: v_writelane_b32 v3, s4, 16 4713; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 4714; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 4715; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4716; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 4717; GFX1164-NEXT: v_readlane_b32 s7, v1, 63 4718; GFX1164-NEXT: v_readlane_b32 s6, v1, 47 4719; GFX1164-NEXT: v_writelane_b32 v3, s5, 32 4720; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 4721; 
GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) 4722; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 4723; GFX1164-NEXT: s_or_saveexec_b64 s[4:5], -1 4724; GFX1164-NEXT: v_writelane_b32 v3, s6, 48 4725; GFX1164-NEXT: s_mov_b64 exec, s[4:5] 4726; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4727; GFX1164-NEXT: s_mov_b32 s2, -1 4728; GFX1164-NEXT: ; implicit-def: $vgpr0 4729; GFX1164-NEXT: s_and_saveexec_b64 s[4:5], vcc 4730; GFX1164-NEXT: s_cbranch_execz .LBB17_2 4731; GFX1164-NEXT: ; %bb.1: 4732; GFX1164-NEXT: v_mov_b32_e32 v0, 0 4733; GFX1164-NEXT: v_mov_b32_e32 v4, s7 4734; GFX1164-NEXT: s_mov_b32 s3, s7 4735; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4736; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 4737; GFX1164-NEXT: ds_max_rtn_i32 v0, v0, v4 4738; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 4739; GFX1164-NEXT: buffer_gl0_inv 4740; GFX1164-NEXT: .LBB17_2: 4741; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] 4742; GFX1164-NEXT: v_readfirstlane_b32 s3, v0 4743; GFX1164-NEXT: v_mov_b32_e32 v0, v3 4744; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) 4745; GFX1164-NEXT: v_max_i32_e32 v0, s3, v0 4746; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 4747; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 4748; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 4749; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 4750; GFX1164-NEXT: s_endpgm 4751; 4752; GFX1132-LABEL: max_i32_varying: 4753; GFX1132: ; %bb.0: ; %entry 4754; GFX1132-NEXT: v_mov_b32_e32 v1, v0 4755; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo 4756; GFX1132-NEXT: v_bfrev_b32_e32 v1, 1 4757; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo 4758; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 4759; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 4760; GFX1132-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 4761; GFX1132-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 4762; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) 4763; GFX1132-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 4764; GFX1132-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 4765; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 4766; GFX1132-NEXT: v_mov_b32_e32 v2, v1 4767; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1 4768; GFX1132-NEXT: s_mov_b32 exec_lo, s2 4769; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 4770; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 4771; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) 4772; GFX1132-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 4773; GFX1132-NEXT: v_bfrev_b32_e32 v3, 1 4774; GFX1132-NEXT: v_readlane_b32 s3, v1, 15 4775; GFX1132-NEXT: v_readlane_b32 s4, v1, 31 4776; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) 4777; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 4778; GFX1132-NEXT: s_mov_b32 exec_lo, s2 4779; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4780; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 4781; GFX1132-NEXT: v_writelane_b32 v3, s3, 16 4782; GFX1132-NEXT: s_mov_b32 exec_lo, s2 4783; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) 4784; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 4785; GFX1132-NEXT: s_mov_b32 s2, -1 4786; GFX1132-NEXT: ; implicit-def: $vgpr0 4787; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo 4788; GFX1132-NEXT: s_cbranch_execz .LBB17_2 4789; GFX1132-NEXT: ; %bb.1: 4790; GFX1132-NEXT: v_mov_b32_e32 v0, 0 4791; GFX1132-NEXT: v_mov_b32_e32 v4, s4 4792; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4793; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 4794; GFX1132-NEXT: ds_max_rtn_i32 v0, v0, v4 4795; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 4796; GFX1132-NEXT: buffer_gl0_inv 4797; GFX1132-NEXT: .LBB17_2: 4798; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 4799; GFX1132-NEXT: v_readfirstlane_b32 s3, v0 4800; GFX1132-NEXT: v_mov_b32_e32 
v0, v3 4801; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) 4802; GFX1132-NEXT: v_max_i32_e32 v0, s3, v0 4803; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 4804; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 4805; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 4806; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 4807; GFX1132-NEXT: s_endpgm 4808entry: 4809 %lane = call i32 @llvm.amdgcn.workitem.id.x() 4810 %old = atomicrmw max i32 addrspace(3)* @local_var32, i32 %lane acq_rel 4811 store i32 %old, i32 addrspace(1)* %out 4812 ret void 4813} 4814 4815define amdgpu_kernel void @max_i64_constant(i64 addrspace(1)* %out) { 4816; 4817; 4818; GFX7LESS-LABEL: max_i64_constant: 4819; GFX7LESS: ; %bb.0: ; %entry 4820; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 4821; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 4822; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 4823; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4824; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 4825; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc 4826; GFX7LESS-NEXT: s_cbranch_execz .LBB18_2 4827; GFX7LESS-NEXT: ; %bb.1: 4828; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0 4829; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5 4830; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 4831; GFX7LESS-NEXT: s_mov_b32 m0, -1 4832; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 4833; GFX7LESS-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] 4834; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 4835; GFX7LESS-NEXT: .LBB18_2: 4836; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] 4837; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 4838; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 4839; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 4840; GFX7LESS-NEXT: v_bfrev_b32_e32 v1, 1 4841; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 4842; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 4843; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc 4844; GFX7LESS-NEXT: v_mov_b32_e32 v2, s5 4845; GFX7LESS-NEXT: v_mov_b32_e32 v3, s4 4846; GFX7LESS-NEXT: v_cmp_gt_i64_e32 vcc, s[4:5], v[0:1] 4847; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, v1, 
v2, vcc 4848; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 4849; GFX7LESS-NEXT: s_mov_b32 s2, -1 4850; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4851; GFX7LESS-NEXT: s_endpgm 4852; 4853; GFX8-LABEL: max_i64_constant: 4854; GFX8: ; %bb.0: ; %entry 4855; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4856; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4857; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 4858; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4859; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 4860; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 4861; GFX8-NEXT: s_cbranch_execz .LBB18_2 4862; GFX8-NEXT: ; %bb.1: 4863; GFX8-NEXT: v_mov_b32_e32 v0, 5 4864; GFX8-NEXT: v_mov_b32_e32 v2, 0 4865; GFX8-NEXT: v_mov_b32_e32 v1, 0 4866; GFX8-NEXT: s_mov_b32 m0, -1 4867; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4868; GFX8-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] 4869; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4870; GFX8-NEXT: .LBB18_2: 4871; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 4872; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4873; GFX8-NEXT: v_readfirstlane_b32 s2, v0 4874; GFX8-NEXT: v_bfrev_b32_e32 v0, 1 4875; GFX8-NEXT: v_readfirstlane_b32 s3, v1 4876; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc 4877; GFX8-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 4878; GFX8-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1] 4879; GFX8-NEXT: v_mov_b32_e32 v2, s3 4880; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 4881; GFX8-NEXT: v_mov_b32_e32 v2, s2 4882; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 4883; GFX8-NEXT: s_mov_b32 s3, 0xf000 4884; GFX8-NEXT: s_mov_b32 s2, -1 4885; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4886; GFX8-NEXT: s_endpgm 4887; 4888; GFX9-LABEL: max_i64_constant: 4889; GFX9: ; %bb.0: ; %entry 4890; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4891; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4892; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 4893; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4894; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 4895; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 
4896; GFX9-NEXT: s_cbranch_execz .LBB18_2 4897; GFX9-NEXT: ; %bb.1: 4898; GFX9-NEXT: v_mov_b32_e32 v0, 5 4899; GFX9-NEXT: v_mov_b32_e32 v1, 0 4900; GFX9-NEXT: v_mov_b32_e32 v2, 0 4901; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4902; GFX9-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] 4903; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4904; GFX9-NEXT: .LBB18_2: 4905; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 4906; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4907; GFX9-NEXT: v_readfirstlane_b32 s2, v0 4908; GFX9-NEXT: v_bfrev_b32_e32 v0, 1 4909; GFX9-NEXT: v_readfirstlane_b32 s3, v1 4910; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc 4911; GFX9-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 4912; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1] 4913; GFX9-NEXT: v_mov_b32_e32 v2, s3 4914; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 4915; GFX9-NEXT: v_mov_b32_e32 v2, s2 4916; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 4917; GFX9-NEXT: s_mov_b32 s3, 0xf000 4918; GFX9-NEXT: s_mov_b32 s2, -1 4919; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4920; GFX9-NEXT: s_endpgm 4921; 4922; GFX1064-LABEL: max_i64_constant: 4923; GFX1064: ; %bb.0: ; %entry 4924; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4925; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4926; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 4927; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4928; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 4929; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc 4930; GFX1064-NEXT: s_cbranch_execz .LBB18_2 4931; GFX1064-NEXT: ; %bb.1: 4932; GFX1064-NEXT: v_mov_b32_e32 v0, 5 4933; GFX1064-NEXT: v_mov_b32_e32 v1, 0 4934; GFX1064-NEXT: v_mov_b32_e32 v2, 0 4935; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4936; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 4937; GFX1064-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] 4938; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 4939; GFX1064-NEXT: buffer_gl0_inv 4940; GFX1064-NEXT: .LBB18_2: 4941; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 4942; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] 4943; GFX1064-NEXT: 
v_readfirstlane_b32 s2, v0 4944; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 4945; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc 4946; GFX1064-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 4947; GFX1064-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1] 4948; GFX1064-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc 4949; GFX1064-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc 4950; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 4951; GFX1064-NEXT: s_mov_b32 s2, -1 4952; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 4953; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4954; GFX1064-NEXT: s_endpgm 4955; 4956; GFX1032-LABEL: max_i64_constant: 4957; GFX1032: ; %bb.0: ; %entry 4958; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4959; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4960; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 4961; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 4962; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo 4963; GFX1032-NEXT: s_cbranch_execz .LBB18_2 4964; GFX1032-NEXT: ; %bb.1: 4965; GFX1032-NEXT: v_mov_b32_e32 v0, 5 4966; GFX1032-NEXT: v_mov_b32_e32 v1, 0 4967; GFX1032-NEXT: v_mov_b32_e32 v2, 0 4968; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4969; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 4970; GFX1032-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] 4971; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 4972; GFX1032-NEXT: buffer_gl0_inv 4973; GFX1032-NEXT: .LBB18_2: 4974; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 4975; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 4976; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 4977; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 4978; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc_lo 4979; GFX1032-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc_lo 4980; GFX1032-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[2:3], v[0:1] 4981; GFX1032-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo 4982; GFX1032-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo 4983; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 4984; GFX1032-NEXT: s_mov_b32 s2, -1 4985; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 4986; GFX1032-NEXT: 
buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4987; GFX1032-NEXT: s_endpgm 4988; 4989; GFX1164-LABEL: max_i64_constant: 4990; GFX1164: ; %bb.0: ; %entry 4991; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 4992; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4993; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 4994; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 4995; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4996; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 4997; GFX1164-NEXT: s_and_saveexec_b64 s[2:3], vcc 4998; GFX1164-NEXT: s_cbranch_execz .LBB18_2 4999; GFX1164-NEXT: ; %bb.1: 5000; GFX1164-NEXT: v_mov_b32_e32 v0, 5 5001; GFX1164-NEXT: v_mov_b32_e32 v1, 0 5002; GFX1164-NEXT: v_mov_b32_e32 v2, 0 5003; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5004; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 5005; GFX1164-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] 5006; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 5007; GFX1164-NEXT: buffer_gl0_inv 5008; GFX1164-NEXT: .LBB18_2: 5009; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] 5010; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 5011; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 5012; GFX1164-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc 5013; GFX1164-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 5014; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) 5015; GFX1164-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1] 5016; GFX1164-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc 5017; GFX1164-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc 5018; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 5019; GFX1164-NEXT: s_mov_b32 s2, -1 5020; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 5021; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 5022; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 5023; GFX1164-NEXT: s_endpgm 5024; 5025; GFX1132-LABEL: max_i64_constant: 5026; GFX1132: ; %bb.0: ; %entry 5027; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 5028; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 5029; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) 5030; GFX1132-NEXT: 
v_cmp_eq_u32_e32 vcc_lo, 0, v0 5031; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 5032; GFX1132-NEXT: s_and_saveexec_b32 s2, vcc_lo 5033; GFX1132-NEXT: s_cbranch_execz .LBB18_2 5034; GFX1132-NEXT: ; %bb.1: 5035; GFX1132-NEXT: v_mov_b32_e32 v0, 5 5036; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 5037; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5038; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 5039; GFX1132-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] 5040; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 5041; GFX1132-NEXT: buffer_gl0_inv 5042; GFX1132-NEXT: .LBB18_2: 5043; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2 5044; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 5045; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 5046; GFX1132-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc_lo 5047; GFX1132-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc_lo 5048; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) 5049; GFX1132-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[2:3], v[0:1] 5050; GFX1132-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo 5051; GFX1132-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo 5052; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 5053; GFX1132-NEXT: s_mov_b32 s2, -1 5054; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 5055; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 5056; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 5057; GFX1132-NEXT: s_endpgm 5058entry: 5059 %old = atomicrmw max i64 addrspace(3)* @local_var64, i64 5 acq_rel 5060 store i64 %old, i64 addrspace(1)* %out 5061 ret void 5062} 5063 5064define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) { 5065; 5066; 5067; GFX7LESS-LABEL: min_i32_varying: 5068; GFX7LESS: ; %bb.0: ; %entry 5069; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 5070; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 5071; GFX7LESS-NEXT: s_mov_b32 m0, -1 5072; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 5073; GFX7LESS-NEXT: ds_min_rtn_i32 v0, v1, v0 5074; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 5075; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 5076; GFX7LESS-NEXT: s_mov_b32 s2, -1 5077; GFX7LESS-NEXT: 
buffer_store_dword v0, off, s[0:3], 0 5078; GFX7LESS-NEXT: s_endpgm 5079; 5080; GFX8-LABEL: min_i32_varying: 5081; GFX8: ; %bb.0: ; %entry 5082; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 5083; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 5084; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 5085; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 5086; GFX8-NEXT: v_bfrev_b32_e32 v1, -2 5087; GFX8-NEXT: s_mov_b64 exec, s[2:3] 5088; GFX8-NEXT: v_mov_b32_e32 v2, v0 5089; GFX8-NEXT: s_not_b64 exec, exec 5090; GFX8-NEXT: v_bfrev_b32_e32 v2, -2 5091; GFX8-NEXT: s_not_b64 exec, exec 5092; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 5093; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 5094; GFX8-NEXT: s_nop 1 5095; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 5096; GFX8-NEXT: s_nop 1 5097; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 5098; GFX8-NEXT: s_nop 1 5099; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 5100; GFX8-NEXT: s_nop 1 5101; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 5102; GFX8-NEXT: s_nop 1 5103; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 5104; GFX8-NEXT: v_readlane_b32 s4, v2, 63 5105; GFX8-NEXT: s_nop 0 5106; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 5107; GFX8-NEXT: s_mov_b64 exec, s[2:3] 5108; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 5109; GFX8-NEXT: ; implicit-def: $vgpr0 5110; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 5111; GFX8-NEXT: s_cbranch_execz .LBB19_2 5112; GFX8-NEXT: ; %bb.1: 5113; GFX8-NEXT: v_mov_b32_e32 v0, 0 5114; GFX8-NEXT: v_mov_b32_e32 v3, s4 5115; GFX8-NEXT: s_mov_b32 m0, -1 5116; GFX8-NEXT: s_waitcnt lgkmcnt(0) 5117; GFX8-NEXT: ds_min_rtn_i32 v0, v0, v3 5118; GFX8-NEXT: s_waitcnt lgkmcnt(0) 5119; GFX8-NEXT: .LBB19_2: 5120; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 5121; GFX8-NEXT: s_waitcnt lgkmcnt(0) 5122; GFX8-NEXT: v_readfirstlane_b32 s2, v0 5123; 
GFX8-NEXT: v_mov_b32_e32 v0, v1 5124; GFX8-NEXT: v_min_i32_e32 v0, s2, v0 5125; GFX8-NEXT: s_mov_b32 s3, 0xf000 5126; GFX8-NEXT: s_mov_b32 s2, -1 5127; GFX8-NEXT: s_nop 0 5128; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 5129; GFX8-NEXT: s_endpgm 5130; 5131; GFX9-LABEL: min_i32_varying: 5132; GFX9: ; %bb.0: ; %entry 5133; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 5134; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 5135; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 5136; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 5137; GFX9-NEXT: v_bfrev_b32_e32 v1, -2 5138; GFX9-NEXT: s_mov_b64 exec, s[2:3] 5139; GFX9-NEXT: v_mov_b32_e32 v2, v0 5140; GFX9-NEXT: s_not_b64 exec, exec 5141; GFX9-NEXT: v_bfrev_b32_e32 v2, -2 5142; GFX9-NEXT: s_not_b64 exec, exec 5143; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 5144; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 5145; GFX9-NEXT: s_nop 1 5146; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 5147; GFX9-NEXT: s_nop 1 5148; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 5149; GFX9-NEXT: s_nop 1 5150; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 5151; GFX9-NEXT: s_nop 1 5152; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 5153; GFX9-NEXT: s_nop 1 5154; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 5155; GFX9-NEXT: v_readlane_b32 s4, v2, 63 5156; GFX9-NEXT: s_nop 0 5157; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 5158; GFX9-NEXT: s_mov_b64 exec, s[2:3] 5159; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 5160; GFX9-NEXT: ; implicit-def: $vgpr0 5161; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 5162; GFX9-NEXT: s_cbranch_execz .LBB19_2 5163; GFX9-NEXT: ; %bb.1: 5164; GFX9-NEXT: v_mov_b32_e32 v0, 0 5165; GFX9-NEXT: v_mov_b32_e32 v3, s4 5166; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5167; GFX9-NEXT: ds_min_rtn_i32 v0, v0, v3 5168; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5169; 
GFX9-NEXT: .LBB19_2: 5170; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 5171; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5172; GFX9-NEXT: v_readfirstlane_b32 s2, v0 5173; GFX9-NEXT: v_mov_b32_e32 v0, v1 5174; GFX9-NEXT: v_min_i32_e32 v0, s2, v0 5175; GFX9-NEXT: s_mov_b32 s3, 0xf000 5176; GFX9-NEXT: s_mov_b32 s2, -1 5177; GFX9-NEXT: s_nop 0 5178; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 5179; GFX9-NEXT: s_endpgm 5180; 5181; GFX1064-LABEL: min_i32_varying: 5182; GFX1064: ; %bb.0: ; %entry 5183; GFX1064-NEXT: v_mov_b32_e32 v1, v0 5184; GFX1064-NEXT: s_not_b64 exec, exec 5185; GFX1064-NEXT: v_bfrev_b32_e32 v1, -2 5186; GFX1064-NEXT: s_not_b64 exec, exec 5187; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 5188; GFX1064-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 5189; GFX1064-NEXT: v_bfrev_b32_e32 v3, -2 5190; GFX1064-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 5191; GFX1064-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 5192; GFX1064-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 5193; GFX1064-NEXT: v_mov_b32_e32 v2, v1 5194; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 5195; GFX1064-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 5196; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 5197; GFX1064-NEXT: v_mov_b32_e32 v2, s4 5198; GFX1064-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 5199; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 5200; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 5201; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 5202; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 5203; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 5204; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 5205; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 5206; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 5207; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 5208; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 5209; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 5210; 
GFX1064-NEXT: v_readlane_b32 s6, v1, 47 5211; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 5212; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 5213; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 5214; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 5215; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 5216; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 5217; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 5218; GFX1064-NEXT: s_mov_b32 s2, -1 5219; GFX1064-NEXT: ; implicit-def: $vgpr0 5220; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 5221; GFX1064-NEXT: s_cbranch_execz .LBB19_2 5222; GFX1064-NEXT: ; %bb.1: 5223; GFX1064-NEXT: v_mov_b32_e32 v0, 0 5224; GFX1064-NEXT: v_mov_b32_e32 v4, s7 5225; GFX1064-NEXT: s_mov_b32 s3, s7 5226; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5227; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 5228; GFX1064-NEXT: ds_min_rtn_i32 v0, v0, v4 5229; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 5230; GFX1064-NEXT: buffer_gl0_inv 5231; GFX1064-NEXT: .LBB19_2: 5232; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 5233; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 5234; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 5235; GFX1064-NEXT: v_mov_b32_e32 v0, v3 5236; GFX1064-NEXT: v_min_i32_e32 v0, s3, v0 5237; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 5238; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 5239; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 5240; GFX1064-NEXT: s_endpgm 5241; 5242; GFX1032-LABEL: min_i32_varying: 5243; GFX1032: ; %bb.0: ; %entry 5244; GFX1032-NEXT: v_mov_b32_e32 v1, v0 5245; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 5246; GFX1032-NEXT: v_bfrev_b32_e32 v1, -2 5247; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 5248; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 5249; GFX1032-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 5250; GFX1032-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 5251; GFX1032-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 5252; GFX1032-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 5253; GFX1032-NEXT: 
v_mov_b32_e32 v2, v1 5254; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 5255; GFX1032-NEXT: s_mov_b32 exec_lo, s2 5256; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 5257; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 5258; GFX1032-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 5259; GFX1032-NEXT: v_bfrev_b32_e32 v3, -2 5260; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 5261; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 5262; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 5263; GFX1032-NEXT: s_mov_b32 exec_lo, s2 5264; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 5265; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 5266; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 5267; GFX1032-NEXT: s_mov_b32 exec_lo, s2 5268; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 5269; GFX1032-NEXT: s_mov_b32 s2, -1 5270; GFX1032-NEXT: ; implicit-def: $vgpr0 5271; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 5272; GFX1032-NEXT: s_cbranch_execz .LBB19_2 5273; GFX1032-NEXT: ; %bb.1: 5274; GFX1032-NEXT: v_mov_b32_e32 v0, 0 5275; GFX1032-NEXT: v_mov_b32_e32 v4, s4 5276; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5277; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 5278; GFX1032-NEXT: ds_min_rtn_i32 v0, v0, v4 5279; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 5280; GFX1032-NEXT: buffer_gl0_inv 5281; GFX1032-NEXT: .LBB19_2: 5282; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 5283; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 5284; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 5285; GFX1032-NEXT: v_mov_b32_e32 v0, v3 5286; GFX1032-NEXT: v_min_i32_e32 v0, s3, v0 5287; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 5288; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 5289; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 5290; GFX1032-NEXT: s_endpgm 5291; 5292; GFX1164-LABEL: min_i32_varying: 5293; GFX1164: ; %bb.0: ; %entry 5294; GFX1164-NEXT: v_mov_b32_e32 v1, v0 5295; GFX1164-NEXT: s_not_b64 exec, exec 5296; GFX1164-NEXT: v_bfrev_b32_e32 v1, -2 5297; GFX1164-NEXT: s_not_b64 exec, exec 5298; GFX1164-NEXT: 
s_or_saveexec_b64 s[2:3], -1 5299; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) 5300; GFX1164-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 5301; GFX1164-NEXT: v_bfrev_b32_e32 v3, -2 5302; GFX1164-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 5303; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 5304; GFX1164-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 5305; GFX1164-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 5306; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 5307; GFX1164-NEXT: v_mov_b32_e32 v2, v1 5308; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1 5309; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 5310; GFX1164-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 5311; GFX1164-NEXT: v_readlane_b32 s4, v1, 31 5312; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 5313; GFX1164-NEXT: v_mov_b32_e32 v2, s4 5314; GFX1164-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 5315; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) 5316; GFX1164-NEXT: v_readlane_b32 s4, v1, 15 5317; GFX1164-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 5318; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 5319; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 5320; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 5321; GFX1164-NEXT: v_readlane_b32 s5, v1, 31 5322; GFX1164-NEXT: v_writelane_b32 v3, s4, 16 5323; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 5324; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 5325; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 5326; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 5327; GFX1164-NEXT: v_readlane_b32 s7, v1, 63 5328; GFX1164-NEXT: v_readlane_b32 s6, v1, 47 5329; GFX1164-NEXT: v_writelane_b32 v3, s5, 32 5330; GFX1164-NEXT: 
s_mov_b64 exec, s[2:3] 5331; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) 5332; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 5333; GFX1164-NEXT: s_or_saveexec_b64 s[4:5], -1 5334; GFX1164-NEXT: v_writelane_b32 v3, s6, 48 5335; GFX1164-NEXT: s_mov_b64 exec, s[4:5] 5336; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 5337; GFX1164-NEXT: s_mov_b32 s2, -1 5338; GFX1164-NEXT: ; implicit-def: $vgpr0 5339; GFX1164-NEXT: s_and_saveexec_b64 s[4:5], vcc 5340; GFX1164-NEXT: s_cbranch_execz .LBB19_2 5341; GFX1164-NEXT: ; %bb.1: 5342; GFX1164-NEXT: v_mov_b32_e32 v0, 0 5343; GFX1164-NEXT: v_mov_b32_e32 v4, s7 5344; GFX1164-NEXT: s_mov_b32 s3, s7 5345; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5346; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 5347; GFX1164-NEXT: ds_min_rtn_i32 v0, v0, v4 5348; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 5349; GFX1164-NEXT: buffer_gl0_inv 5350; GFX1164-NEXT: .LBB19_2: 5351; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] 5352; GFX1164-NEXT: v_readfirstlane_b32 s3, v0 5353; GFX1164-NEXT: v_mov_b32_e32 v0, v3 5354; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) 5355; GFX1164-NEXT: v_min_i32_e32 v0, s3, v0 5356; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 5357; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 5358; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 5359; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 5360; GFX1164-NEXT: s_endpgm 5361; 5362; GFX1132-LABEL: min_i32_varying: 5363; GFX1132: ; %bb.0: ; %entry 5364; GFX1132-NEXT: v_mov_b32_e32 v1, v0 5365; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo 5366; GFX1132-NEXT: v_bfrev_b32_e32 v1, -2 5367; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo 5368; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 5369; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 5370; GFX1132-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 5371; GFX1132-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 5372; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) 
| instskip(NEXT) | instid1(VALU_DEP_1) 5373; GFX1132-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 5374; GFX1132-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 5375; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 5376; GFX1132-NEXT: v_mov_b32_e32 v2, v1 5377; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1 5378; GFX1132-NEXT: s_mov_b32 exec_lo, s2 5379; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 5380; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 5381; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) 5382; GFX1132-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 5383; GFX1132-NEXT: v_bfrev_b32_e32 v3, -2 5384; GFX1132-NEXT: v_readlane_b32 s3, v1, 15 5385; GFX1132-NEXT: v_readlane_b32 s4, v1, 31 5386; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) 5387; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 5388; GFX1132-NEXT: s_mov_b32 exec_lo, s2 5389; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 5390; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 5391; GFX1132-NEXT: v_writelane_b32 v3, s3, 16 5392; GFX1132-NEXT: s_mov_b32 exec_lo, s2 5393; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) 5394; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 5395; GFX1132-NEXT: s_mov_b32 s2, -1 5396; GFX1132-NEXT: ; implicit-def: $vgpr0 5397; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo 5398; GFX1132-NEXT: s_cbranch_execz .LBB19_2 5399; GFX1132-NEXT: ; %bb.1: 5400; GFX1132-NEXT: v_mov_b32_e32 v0, 0 5401; GFX1132-NEXT: v_mov_b32_e32 v4, s4 5402; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5403; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 5404; GFX1132-NEXT: ds_min_rtn_i32 v0, v0, v4 5405; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 5406; GFX1132-NEXT: buffer_gl0_inv 5407; GFX1132-NEXT: .LBB19_2: 5408; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 5409; GFX1132-NEXT: v_readfirstlane_b32 s3, v0 5410; 
GFX1132-NEXT: v_mov_b32_e32 v0, v3 5411; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) 5412; GFX1132-NEXT: v_min_i32_e32 v0, s3, v0 5413; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 5414; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 5415; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 5416; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 5417; GFX1132-NEXT: s_endpgm 5418entry: 5419 %lane = call i32 @llvm.amdgcn.workitem.id.x() 5420 %old = atomicrmw min i32 addrspace(3)* @local_var32, i32 %lane acq_rel 5421 store i32 %old, i32 addrspace(1)* %out 5422 ret void 5423} 5424 5425define amdgpu_kernel void @min_i64_constant(i64 addrspace(1)* %out) { 5426; Constant-operand case - atomicrmw min (signed) of the constant 5 on the i64 5427; addrspace(3) global @local_var64, acq_rel. The returned old value is stored to %out. 5428; GFX7LESS-LABEL: min_i64_constant: 5429; GFX7LESS: ; %bb.0: ; %entry 5430; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 5431; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 5432; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 5433; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 5434; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 5435; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc 5436; GFX7LESS-NEXT: s_cbranch_execz .LBB20_2 5437; GFX7LESS-NEXT: ; %bb.1: 5438; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0 5439; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5 5440; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 5441; GFX7LESS-NEXT: s_mov_b32 m0, -1 5442; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 5443; GFX7LESS-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] 5444; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 5445; GFX7LESS-NEXT: .LBB20_2: 5446; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] 5447; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 5448; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 5449; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 5450; GFX7LESS-NEXT: v_bfrev_b32_e32 v1, -2 5451; GFX7LESS-NEXT: s_mov_b32 s2, -1 5452; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 5453; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc 5454; GFX7LESS-NEXT: v_mov_b32_e32 v2, s5 5455; GFX7LESS-NEXT: v_mov_b32_e32 v3, s4 5456; GFX7LESS-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] 5457; GFX7LESS-NEXT: 
v_cndmask_b32_e32 v1, v1, v2, vcc 5458; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 5459; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 5460; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 5461; GFX7LESS-NEXT: s_endpgm 5462; 5463; GFX8-LABEL: min_i64_constant: 5464; GFX8: ; %bb.0: ; %entry 5465; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 5466; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 5467; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 5468; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 5469; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 5470; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 5471; GFX8-NEXT: s_cbranch_execz .LBB20_2 5472; GFX8-NEXT: ; %bb.1: 5473; GFX8-NEXT: v_mov_b32_e32 v0, 5 5474; GFX8-NEXT: v_mov_b32_e32 v2, 0 5475; GFX8-NEXT: v_mov_b32_e32 v1, 0 5476; GFX8-NEXT: s_mov_b32 m0, -1 5477; GFX8-NEXT: s_waitcnt lgkmcnt(0) 5478; GFX8-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] 5479; GFX8-NEXT: s_waitcnt lgkmcnt(0) 5480; GFX8-NEXT: .LBB20_2: 5481; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 5482; GFX8-NEXT: s_waitcnt lgkmcnt(0) 5483; GFX8-NEXT: v_readfirstlane_b32 s4, v0 5484; GFX8-NEXT: v_bfrev_b32_e32 v0, -2 5485; GFX8-NEXT: v_readfirstlane_b32 s5, v1 5486; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc 5487; GFX8-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 5488; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] 5489; GFX8-NEXT: v_mov_b32_e32 v2, s5 5490; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 5491; GFX8-NEXT: v_mov_b32_e32 v2, s4 5492; GFX8-NEXT: s_mov_b32 s2, -1 5493; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 5494; GFX8-NEXT: s_mov_b32 s3, 0xf000 5495; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 5496; GFX8-NEXT: s_endpgm 5497; 5498; GFX9-LABEL: min_i64_constant: 5499; GFX9: ; %bb.0: ; %entry 5500; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 5501; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 5502; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 5503; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 5504; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 5505; GFX9-NEXT: 
s_and_saveexec_b64 s[2:3], vcc 5506; GFX9-NEXT: s_cbranch_execz .LBB20_2 5507; GFX9-NEXT: ; %bb.1: 5508; GFX9-NEXT: v_mov_b32_e32 v0, 5 5509; GFX9-NEXT: v_mov_b32_e32 v1, 0 5510; GFX9-NEXT: v_mov_b32_e32 v2, 0 5511; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5512; GFX9-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] 5513; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5514; GFX9-NEXT: .LBB20_2: 5515; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 5516; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5517; GFX9-NEXT: v_readfirstlane_b32 s4, v0 5518; GFX9-NEXT: v_bfrev_b32_e32 v0, -2 5519; GFX9-NEXT: v_readfirstlane_b32 s5, v1 5520; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc 5521; GFX9-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 5522; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] 5523; GFX9-NEXT: v_mov_b32_e32 v2, s5 5524; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 5525; GFX9-NEXT: v_mov_b32_e32 v2, s4 5526; GFX9-NEXT: s_mov_b32 s2, -1 5527; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 5528; GFX9-NEXT: s_mov_b32 s3, 0xf000 5529; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 5530; GFX9-NEXT: s_endpgm 5531; 5532; GFX1064-LABEL: min_i64_constant: 5533; GFX1064: ; %bb.0: ; %entry 5534; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 5535; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 5536; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 5537; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 5538; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 5539; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc 5540; GFX1064-NEXT: s_cbranch_execz .LBB20_2 5541; GFX1064-NEXT: ; %bb.1: 5542; GFX1064-NEXT: v_mov_b32_e32 v0, 5 5543; GFX1064-NEXT: v_mov_b32_e32 v1, 0 5544; GFX1064-NEXT: v_mov_b32_e32 v2, 0 5545; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5546; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 5547; GFX1064-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] 5548; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 5549; GFX1064-NEXT: buffer_gl0_inv 5550; GFX1064-NEXT: .LBB20_2: 5551; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 5552; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] 
5553; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 5554; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 5555; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc 5556; GFX1064-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 5557; GFX1064-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1] 5558; GFX1064-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc 5559; GFX1064-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc 5560; GFX1064-NEXT: s_mov_b32 s2, -1 5561; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 5562; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 5563; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 5564; GFX1064-NEXT: s_endpgm 5565; 5566; GFX1032-LABEL: min_i64_constant: 5567; GFX1032: ; %bb.0: ; %entry 5568; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 5569; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 5570; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 5571; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 5572; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo 5573; GFX1032-NEXT: s_cbranch_execz .LBB20_2 5574; GFX1032-NEXT: ; %bb.1: 5575; GFX1032-NEXT: v_mov_b32_e32 v0, 5 5576; GFX1032-NEXT: v_mov_b32_e32 v1, 0 5577; GFX1032-NEXT: v_mov_b32_e32 v2, 0 5578; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5579; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 5580; GFX1032-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] 5581; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 5582; GFX1032-NEXT: buffer_gl0_inv 5583; GFX1032-NEXT: .LBB20_2: 5584; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 5585; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 5586; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 5587; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 5588; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc_lo 5589; GFX1032-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc_lo 5590; GFX1032-NEXT: v_cmp_lt_i64_e32 vcc_lo, s[2:3], v[0:1] 5591; GFX1032-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo 5592; GFX1032-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo 5593; GFX1032-NEXT: s_mov_b32 s2, -1 5594; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 5595; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 5596; 
GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 5597; GFX1032-NEXT: s_endpgm 5598; 5599; GFX1164-LABEL: min_i64_constant: 5600; GFX1164: ; %bb.0: ; %entry 5601; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 5602; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 5603; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 5604; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 5605; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 5606; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 5607; GFX1164-NEXT: s_and_saveexec_b64 s[2:3], vcc 5608; GFX1164-NEXT: s_cbranch_execz .LBB20_2 5609; GFX1164-NEXT: ; %bb.1: 5610; GFX1164-NEXT: v_mov_b32_e32 v0, 5 5611; GFX1164-NEXT: v_mov_b32_e32 v1, 0 5612; GFX1164-NEXT: v_mov_b32_e32 v2, 0 5613; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5614; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 5615; GFX1164-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] 5616; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 5617; GFX1164-NEXT: buffer_gl0_inv 5618; GFX1164-NEXT: .LBB20_2: 5619; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] 5620; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 5621; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 5622; GFX1164-NEXT: v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc 5623; GFX1164-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 5624; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) 5625; GFX1164-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1] 5626; GFX1164-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc 5627; GFX1164-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc 5628; GFX1164-NEXT: s_mov_b32 s2, -1 5629; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 5630; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 5631; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 5632; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 5633; GFX1164-NEXT: s_endpgm 5634; 5635; GFX1132-LABEL: min_i64_constant: 5636; GFX1132: ; %bb.0: ; %entry 5637; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 5638; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 5639; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) 5640; 
GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 5641; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 5642; GFX1132-NEXT: s_and_saveexec_b32 s2, vcc_lo 5643; GFX1132-NEXT: s_cbranch_execz .LBB20_2 5644; GFX1132-NEXT: ; %bb.1: 5645; GFX1132-NEXT: v_mov_b32_e32 v0, 5 5646; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 5647; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5648; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 5649; GFX1132-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] 5650; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 5651; GFX1132-NEXT: buffer_gl0_inv 5652; GFX1132-NEXT: .LBB20_2: 5653; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2 5654; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 5655; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 5656; GFX1132-NEXT: v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc_lo 5657; GFX1132-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc_lo 5658; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) 5659; GFX1132-NEXT: v_cmp_lt_i64_e32 vcc_lo, s[2:3], v[0:1] 5660; GFX1132-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo 5661; GFX1132-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo 5662; GFX1132-NEXT: s_mov_b32 s2, -1 5663; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 5664; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 5665; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 5666; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 5667; GFX1132-NEXT: s_endpgm 5668entry: 5669 %old = atomicrmw min i64 addrspace(3)* @local_var64, i64 5 acq_rel 5670 store i64 %old, i64 addrspace(1)* %out 5671 ret void 5672} 5673 5674define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) { 5675; Varying-operand case - atomicrmw umax on the i32 addrspace(3) global @local_var32 5676; with the workitem id as operand, acq_rel. The returned old value is stored to %out. 5677; GFX7LESS-LABEL: umax_i32_varying: 5678; GFX7LESS: ; %bb.0: ; %entry 5679; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 5680; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 5681; GFX7LESS-NEXT: s_mov_b32 m0, -1 5682; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 5683; GFX7LESS-NEXT: ds_max_rtn_u32 v0, v1, v0 5684; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 5685; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 5686; GFX7LESS-NEXT: s_mov_b32 s2, -1 
5687; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 5688; GFX7LESS-NEXT: s_endpgm 5689; 5690; GFX8-LABEL: umax_i32_varying: 5691; GFX8: ; %bb.0: ; %entry 5692; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 5693; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 5694; GFX8-NEXT: v_mov_b32_e32 v1, 0 5695; GFX8-NEXT: s_mov_b64 exec, s[2:3] 5696; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 5697; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 5698; GFX8-NEXT: v_mov_b32_e32 v2, v0 5699; GFX8-NEXT: s_not_b64 exec, exec 5700; GFX8-NEXT: v_mov_b32_e32 v2, 0 5701; GFX8-NEXT: s_not_b64 exec, exec 5702; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 5703; GFX8-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 5704; GFX8-NEXT: s_nop 1 5705; GFX8-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 5706; GFX8-NEXT: s_nop 1 5707; GFX8-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 5708; GFX8-NEXT: s_nop 1 5709; GFX8-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 5710; GFX8-NEXT: s_nop 1 5711; GFX8-NEXT: v_max_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 5712; GFX8-NEXT: s_nop 1 5713; GFX8-NEXT: v_max_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 5714; GFX8-NEXT: v_readlane_b32 s4, v2, 63 5715; GFX8-NEXT: s_nop 0 5716; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 5717; GFX8-NEXT: s_mov_b64 exec, s[2:3] 5718; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 5719; GFX8-NEXT: ; implicit-def: $vgpr0 5720; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 5721; GFX8-NEXT: s_cbranch_execz .LBB21_2 5722; GFX8-NEXT: ; %bb.1: 5723; GFX8-NEXT: v_mov_b32_e32 v0, 0 5724; GFX8-NEXT: v_mov_b32_e32 v3, s4 5725; GFX8-NEXT: s_mov_b32 m0, -1 5726; GFX8-NEXT: s_waitcnt lgkmcnt(0) 5727; GFX8-NEXT: ds_max_rtn_u32 v0, v0, v3 5728; GFX8-NEXT: s_waitcnt lgkmcnt(0) 5729; GFX8-NEXT: .LBB21_2: 5730; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 5731; GFX8-NEXT: 
s_waitcnt lgkmcnt(0) 5732; GFX8-NEXT: v_readfirstlane_b32 s2, v0 5733; GFX8-NEXT: v_mov_b32_e32 v0, v1 5734; GFX8-NEXT: v_max_u32_e32 v0, s2, v0 5735; GFX8-NEXT: s_mov_b32 s3, 0xf000 5736; GFX8-NEXT: s_mov_b32 s2, -1 5737; GFX8-NEXT: s_nop 0 5738; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 5739; GFX8-NEXT: s_endpgm 5740; 5741; GFX9-LABEL: umax_i32_varying: 5742; GFX9: ; %bb.0: ; %entry 5743; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 5744; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 5745; GFX9-NEXT: v_mov_b32_e32 v1, 0 5746; GFX9-NEXT: s_mov_b64 exec, s[2:3] 5747; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 5748; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 5749; GFX9-NEXT: v_mov_b32_e32 v2, v0 5750; GFX9-NEXT: s_not_b64 exec, exec 5751; GFX9-NEXT: v_mov_b32_e32 v2, 0 5752; GFX9-NEXT: s_not_b64 exec, exec 5753; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 5754; GFX9-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 5755; GFX9-NEXT: s_nop 1 5756; GFX9-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 5757; GFX9-NEXT: s_nop 1 5758; GFX9-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 5759; GFX9-NEXT: s_nop 1 5760; GFX9-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 5761; GFX9-NEXT: s_nop 1 5762; GFX9-NEXT: v_max_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 5763; GFX9-NEXT: s_nop 1 5764; GFX9-NEXT: v_max_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 5765; GFX9-NEXT: v_readlane_b32 s4, v2, 63 5766; GFX9-NEXT: s_nop 0 5767; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 5768; GFX9-NEXT: s_mov_b64 exec, s[2:3] 5769; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 5770; GFX9-NEXT: ; implicit-def: $vgpr0 5771; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 5772; GFX9-NEXT: s_cbranch_execz .LBB21_2 5773; GFX9-NEXT: ; %bb.1: 5774; GFX9-NEXT: v_mov_b32_e32 v0, 0 5775; GFX9-NEXT: v_mov_b32_e32 v3, s4 5776; 
GFX9-NEXT: s_waitcnt lgkmcnt(0) 5777; GFX9-NEXT: ds_max_rtn_u32 v0, v0, v3 5778; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5779; GFX9-NEXT: .LBB21_2: 5780; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 5781; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5782; GFX9-NEXT: v_readfirstlane_b32 s2, v0 5783; GFX9-NEXT: v_mov_b32_e32 v0, v1 5784; GFX9-NEXT: v_max_u32_e32 v0, s2, v0 5785; GFX9-NEXT: s_mov_b32 s3, 0xf000 5786; GFX9-NEXT: s_mov_b32 s2, -1 5787; GFX9-NEXT: s_nop 0 5788; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 5789; GFX9-NEXT: s_endpgm 5790; 5791; GFX1064-LABEL: umax_i32_varying: 5792; GFX1064: ; %bb.0: ; %entry 5793; GFX1064-NEXT: v_mov_b32_e32 v1, v0 5794; GFX1064-NEXT: s_not_b64 exec, exec 5795; GFX1064-NEXT: v_mov_b32_e32 v1, 0 5796; GFX1064-NEXT: s_not_b64 exec, exec 5797; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 5798; GFX1064-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 5799; GFX1064-NEXT: v_mov_b32_e32 v3, 0 5800; GFX1064-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 5801; GFX1064-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 5802; GFX1064-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 5803; GFX1064-NEXT: v_mov_b32_e32 v2, v1 5804; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 5805; GFX1064-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 5806; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 5807; GFX1064-NEXT: v_mov_b32_e32 v2, s4 5808; GFX1064-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 5809; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 5810; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 5811; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 5812; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 5813; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 5814; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 5815; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 5816; GFX1064-NEXT: s_mov_b64 
exec, s[2:3] 5817; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 5818; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 5819; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 5820; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 5821; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 5822; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 5823; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 5824; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 5825; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 5826; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 5827; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 5828; GFX1064-NEXT: s_mov_b32 s2, -1 5829; GFX1064-NEXT: ; implicit-def: $vgpr0 5830; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 5831; GFX1064-NEXT: s_cbranch_execz .LBB21_2 5832; GFX1064-NEXT: ; %bb.1: 5833; GFX1064-NEXT: v_mov_b32_e32 v0, 0 5834; GFX1064-NEXT: v_mov_b32_e32 v4, s7 5835; GFX1064-NEXT: s_mov_b32 s3, s7 5836; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5837; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 5838; GFX1064-NEXT: ds_max_rtn_u32 v0, v0, v4 5839; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 5840; GFX1064-NEXT: buffer_gl0_inv 5841; GFX1064-NEXT: .LBB21_2: 5842; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 5843; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 5844; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 5845; GFX1064-NEXT: v_mov_b32_e32 v0, v3 5846; GFX1064-NEXT: v_max_u32_e32 v0, s3, v0 5847; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 5848; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 5849; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 5850; GFX1064-NEXT: s_endpgm 5851; 5852; GFX1032-LABEL: umax_i32_varying: 5853; GFX1032: ; %bb.0: ; %entry 5854; GFX1032-NEXT: v_mov_b32_e32 v1, v0 5855; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 5856; GFX1032-NEXT: v_mov_b32_e32 v1, 0 5857; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 5858; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 5859; GFX1032-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 5860; GFX1032-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 
5861; GFX1032-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 5862; GFX1032-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 5863; GFX1032-NEXT: v_mov_b32_e32 v2, v1 5864; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 5865; GFX1032-NEXT: s_mov_b32 exec_lo, s2 5866; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 5867; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 5868; GFX1032-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 5869; GFX1032-NEXT: v_mov_b32_e32 v3, 0 5870; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 5871; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 5872; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 5873; GFX1032-NEXT: s_mov_b32 exec_lo, s2 5874; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 5875; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 5876; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 5877; GFX1032-NEXT: s_mov_b32 exec_lo, s2 5878; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 5879; GFX1032-NEXT: s_mov_b32 s2, -1 5880; GFX1032-NEXT: ; implicit-def: $vgpr0 5881; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 5882; GFX1032-NEXT: s_cbranch_execz .LBB21_2 5883; GFX1032-NEXT: ; %bb.1: 5884; GFX1032-NEXT: v_mov_b32_e32 v0, 0 5885; GFX1032-NEXT: v_mov_b32_e32 v4, s4 5886; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5887; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 5888; GFX1032-NEXT: ds_max_rtn_u32 v0, v0, v4 5889; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 5890; GFX1032-NEXT: buffer_gl0_inv 5891; GFX1032-NEXT: .LBB21_2: 5892; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 5893; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 5894; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 5895; GFX1032-NEXT: v_mov_b32_e32 v0, v3 5896; GFX1032-NEXT: v_max_u32_e32 v0, s3, v0 5897; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 5898; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 5899; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 5900; GFX1032-NEXT: s_endpgm 5901; 5902; GFX1164-LABEL: umax_i32_varying: 5903; 
GFX1164: ; %bb.0: ; %entry 5904; GFX1164-NEXT: v_mov_b32_e32 v1, v0 5905; GFX1164-NEXT: s_not_b64 exec, exec 5906; GFX1164-NEXT: v_mov_b32_e32 v1, 0 5907; GFX1164-NEXT: s_not_b64 exec, exec 5908; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 5909; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) 5910; GFX1164-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 5911; GFX1164-NEXT: v_mov_b32_e32 v3, 0 5912; GFX1164-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 5913; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 5914; GFX1164-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 5915; GFX1164-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 5916; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 5917; GFX1164-NEXT: v_mov_b32_e32 v2, v1 5918; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1 5919; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 5920; GFX1164-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 5921; GFX1164-NEXT: v_readlane_b32 s4, v1, 31 5922; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 5923; GFX1164-NEXT: v_mov_b32_e32 v2, s4 5924; GFX1164-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 5925; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) 5926; GFX1164-NEXT: v_readlane_b32 s4, v1, 15 5927; GFX1164-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 5928; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 5929; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 5930; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 5931; GFX1164-NEXT: v_readlane_b32 s5, v1, 31 5932; GFX1164-NEXT: v_writelane_b32 v3, s4, 16 5933; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 5934; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 5935; 
GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 5936; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 5937; GFX1164-NEXT: v_readlane_b32 s7, v1, 63 5938; GFX1164-NEXT: v_readlane_b32 s6, v1, 47 5939; GFX1164-NEXT: v_writelane_b32 v3, s5, 32 5940; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 5941; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) 5942; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 5943; GFX1164-NEXT: s_or_saveexec_b64 s[4:5], -1 5944; GFX1164-NEXT: v_writelane_b32 v3, s6, 48 5945; GFX1164-NEXT: s_mov_b64 exec, s[4:5] 5946; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 5947; GFX1164-NEXT: s_mov_b32 s2, -1 5948; GFX1164-NEXT: ; implicit-def: $vgpr0 5949; GFX1164-NEXT: s_and_saveexec_b64 s[4:5], vcc 5950; GFX1164-NEXT: s_cbranch_execz .LBB21_2 5951; GFX1164-NEXT: ; %bb.1: 5952; GFX1164-NEXT: v_mov_b32_e32 v0, 0 5953; GFX1164-NEXT: v_mov_b32_e32 v4, s7 5954; GFX1164-NEXT: s_mov_b32 s3, s7 5955; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5956; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 5957; GFX1164-NEXT: ds_max_rtn_u32 v0, v0, v4 5958; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 5959; GFX1164-NEXT: buffer_gl0_inv 5960; GFX1164-NEXT: .LBB21_2: 5961; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] 5962; GFX1164-NEXT: v_readfirstlane_b32 s3, v0 5963; GFX1164-NEXT: v_mov_b32_e32 v0, v3 5964; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) 5965; GFX1164-NEXT: v_max_u32_e32 v0, s3, v0 5966; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 5967; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 5968; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 5969; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 5970; GFX1164-NEXT: s_endpgm 5971; 5972; GFX1132-LABEL: umax_i32_varying: 5973; GFX1132: ; %bb.0: ; %entry 5974; GFX1132-NEXT: v_mov_b32_e32 v1, v0 5975; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo 5976; GFX1132-NEXT: v_mov_b32_e32 v1, 0 5977; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo 5978; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 5979; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(VALU_DEP_1) 5980; GFX1132-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 5981; GFX1132-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 5982; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 5983; GFX1132-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 5984; GFX1132-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 5985; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 5986; GFX1132-NEXT: v_mov_b32_e32 v2, v1 5987; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1 5988; GFX1132-NEXT: s_mov_b32 exec_lo, s2 5989; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 5990; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 5991; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) 5992; GFX1132-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 5993; GFX1132-NEXT: v_mov_b32_e32 v3, 0 5994; GFX1132-NEXT: v_readlane_b32 s3, v1, 15 5995; GFX1132-NEXT: v_readlane_b32 s4, v1, 31 5996; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) 5997; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 5998; GFX1132-NEXT: s_mov_b32 exec_lo, s2 5999; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 6000; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 6001; GFX1132-NEXT: v_writelane_b32 v3, s3, 16 6002; GFX1132-NEXT: s_mov_b32 exec_lo, s2 6003; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) 6004; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 6005; GFX1132-NEXT: s_mov_b32 s2, -1 6006; GFX1132-NEXT: ; implicit-def: $vgpr0 6007; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo 6008; GFX1132-NEXT: s_cbranch_execz .LBB21_2 6009; GFX1132-NEXT: ; %bb.1: 6010; GFX1132-NEXT: v_mov_b32_e32 v0, 0 6011; GFX1132-NEXT: v_mov_b32_e32 v4, s4 6012; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6013; 
GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 6014; GFX1132-NEXT: ds_max_rtn_u32 v0, v0, v4 6015; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 6016; GFX1132-NEXT: buffer_gl0_inv 6017; GFX1132-NEXT: .LBB21_2: 6018; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 6019; GFX1132-NEXT: v_readfirstlane_b32 s3, v0 6020; GFX1132-NEXT: v_mov_b32_e32 v0, v3 6021; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) 6022; GFX1132-NEXT: v_max_u32_e32 v0, s3, v0 6023; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 6024; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 6025; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 6026; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 6027; GFX1132-NEXT: s_endpgm 6028entry: 6029 %lane = call i32 @llvm.amdgcn.workitem.id.x() 6030 %old = atomicrmw umax i32 addrspace(3)* @local_var32, i32 %lane acq_rel 6031 store i32 %old, i32 addrspace(1)* %out 6032 ret void 6033} 6034 6035define amdgpu_kernel void @umax_i64_constant(i64 addrspace(1)* %out) { 6036; 6037; 6038; GFX7LESS-LABEL: umax_i64_constant: 6039; GFX7LESS: ; %bb.0: ; %entry 6040; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 6041; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 6042; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 6043; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 6044; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 6045; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc 6046; GFX7LESS-NEXT: s_cbranch_execz .LBB22_2 6047; GFX7LESS-NEXT: ; %bb.1: 6048; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0 6049; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5 6050; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 6051; GFX7LESS-NEXT: s_mov_b32 m0, -1 6052; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 6053; GFX7LESS-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] 6054; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 6055; GFX7LESS-NEXT: .LBB22_2: 6056; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] 6057; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 6058; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 6059; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 6060; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 6061; 
GFX7LESS-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 6062; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 6063; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4 6064; GFX7LESS-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[0:1] 6065; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 6066; GFX7LESS-NEXT: v_mov_b32_e32 v1, s5 6067; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc 6068; GFX7LESS-NEXT: s_mov_b32 s2, -1 6069; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 6070; GFX7LESS-NEXT: s_endpgm 6071; 6072; GFX8-LABEL: umax_i64_constant: 6073; GFX8: ; %bb.0: ; %entry 6074; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 6075; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 6076; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 6077; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 6078; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 6079; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 6080; GFX8-NEXT: s_cbranch_execz .LBB22_2 6081; GFX8-NEXT: ; %bb.1: 6082; GFX8-NEXT: v_mov_b32_e32 v0, 5 6083; GFX8-NEXT: v_mov_b32_e32 v2, 0 6084; GFX8-NEXT: v_mov_b32_e32 v1, 0 6085; GFX8-NEXT: s_mov_b32 m0, -1 6086; GFX8-NEXT: s_waitcnt lgkmcnt(0) 6087; GFX8-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] 6088; GFX8-NEXT: s_waitcnt lgkmcnt(0) 6089; GFX8-NEXT: .LBB22_2: 6090; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 6091; GFX8-NEXT: s_waitcnt lgkmcnt(0) 6092; GFX8-NEXT: v_readfirstlane_b32 s2, v0 6093; GFX8-NEXT: v_readfirstlane_b32 s3, v1 6094; GFX8-NEXT: v_mov_b32_e32 v1, 0 6095; GFX8-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 6096; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1] 6097; GFX8-NEXT: v_mov_b32_e32 v2, s2 6098; GFX8-NEXT: v_mov_b32_e32 v1, s3 6099; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 6100; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc 6101; GFX8-NEXT: s_mov_b32 s3, 0xf000 6102; GFX8-NEXT: s_mov_b32 s2, -1 6103; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 6104; GFX8-NEXT: s_endpgm 6105; 6106; GFX9-LABEL: umax_i64_constant: 6107; GFX9: ; %bb.0: ; %entry 6108; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 6109; GFX9-NEXT: 
v_mbcnt_lo_u32_b32 v0, exec_lo, 0 6110; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 6111; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 6112; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 6113; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 6114; GFX9-NEXT: s_cbranch_execz .LBB22_2 6115; GFX9-NEXT: ; %bb.1: 6116; GFX9-NEXT: v_mov_b32_e32 v0, 5 6117; GFX9-NEXT: v_mov_b32_e32 v1, 0 6118; GFX9-NEXT: v_mov_b32_e32 v2, 0 6119; GFX9-NEXT: s_waitcnt lgkmcnt(0) 6120; GFX9-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] 6121; GFX9-NEXT: s_waitcnt lgkmcnt(0) 6122; GFX9-NEXT: .LBB22_2: 6123; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 6124; GFX9-NEXT: s_waitcnt lgkmcnt(0) 6125; GFX9-NEXT: v_readfirstlane_b32 s2, v0 6126; GFX9-NEXT: v_readfirstlane_b32 s3, v1 6127; GFX9-NEXT: v_mov_b32_e32 v1, 0 6128; GFX9-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 6129; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1] 6130; GFX9-NEXT: v_mov_b32_e32 v2, s2 6131; GFX9-NEXT: v_mov_b32_e32 v1, s3 6132; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 6133; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc 6134; GFX9-NEXT: s_mov_b32 s3, 0xf000 6135; GFX9-NEXT: s_mov_b32 s2, -1 6136; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 6137; GFX9-NEXT: s_endpgm 6138; 6139; GFX1064-LABEL: umax_i64_constant: 6140; GFX1064: ; %bb.0: ; %entry 6141; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 6142; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 6143; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 6144; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 6145; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 6146; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc 6147; GFX1064-NEXT: s_cbranch_execz .LBB22_2 6148; GFX1064-NEXT: ; %bb.1: 6149; GFX1064-NEXT: v_mov_b32_e32 v0, 5 6150; GFX1064-NEXT: v_mov_b32_e32 v1, 0 6151; GFX1064-NEXT: v_mov_b32_e32 v2, 0 6152; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6153; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 6154; GFX1064-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] 6155; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 6156; 
GFX1064-NEXT: buffer_gl0_inv 6157; GFX1064-NEXT: .LBB22_2: 6158; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 6159; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] 6160; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 6161; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 6162; GFX1064-NEXT: v_mov_b32_e32 v1, 0 6163; GFX1064-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 6164; GFX1064-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1] 6165; GFX1064-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc 6166; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, s3, vcc 6167; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 6168; GFX1064-NEXT: s_mov_b32 s2, -1 6169; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 6170; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 6171; GFX1064-NEXT: s_endpgm 6172; 6173; GFX1032-LABEL: umax_i64_constant: 6174; GFX1032: ; %bb.0: ; %entry 6175; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 6176; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 6177; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 6178; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 6179; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo 6180; GFX1032-NEXT: s_cbranch_execz .LBB22_2 6181; GFX1032-NEXT: ; %bb.1: 6182; GFX1032-NEXT: v_mov_b32_e32 v0, 5 6183; GFX1032-NEXT: v_mov_b32_e32 v1, 0 6184; GFX1032-NEXT: v_mov_b32_e32 v2, 0 6185; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6186; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 6187; GFX1032-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] 6188; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 6189; GFX1032-NEXT: buffer_gl0_inv 6190; GFX1032-NEXT: .LBB22_2: 6191; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 6192; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 6193; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 6194; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 6195; GFX1032-NEXT: v_mov_b32_e32 v1, 0 6196; GFX1032-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc_lo 6197; GFX1032-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[2:3], v[0:1] 6198; GFX1032-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo 6199; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, s3, vcc_lo 6200; GFX1032-NEXT: s_mov_b32 s3, 
0x31016000 6201; GFX1032-NEXT: s_mov_b32 s2, -1 6202; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 6203; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 6204; GFX1032-NEXT: s_endpgm 6205; 6206; GFX1164-LABEL: umax_i64_constant: 6207; GFX1164: ; %bb.0: ; %entry 6208; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 6209; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 6210; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 6211; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 6212; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 6213; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 6214; GFX1164-NEXT: s_and_saveexec_b64 s[2:3], vcc 6215; GFX1164-NEXT: s_cbranch_execz .LBB22_2 6216; GFX1164-NEXT: ; %bb.1: 6217; GFX1164-NEXT: v_mov_b32_e32 v0, 5 6218; GFX1164-NEXT: v_mov_b32_e32 v1, 0 6219; GFX1164-NEXT: v_mov_b32_e32 v2, 0 6220; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6221; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 6222; GFX1164-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] 6223; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 6224; GFX1164-NEXT: buffer_gl0_inv 6225; GFX1164-NEXT: .LBB22_2: 6226; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] 6227; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 6228; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 6229; GFX1164-NEXT: v_mov_b32_e32 v1, 0 6230; GFX1164-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 6231; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) 6232; GFX1164-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1] 6233; GFX1164-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc 6234; GFX1164-NEXT: v_cndmask_b32_e64 v1, 0, s3, vcc 6235; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 6236; GFX1164-NEXT: s_mov_b32 s2, -1 6237; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 6238; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 6239; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 6240; GFX1164-NEXT: s_endpgm 6241; 6242; GFX1132-LABEL: umax_i64_constant: 6243; GFX1132: ; %bb.0: ; %entry 6244; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 6245; GFX1132-NEXT: v_mbcnt_lo_u32_b32 
v0, exec_lo, 0 6246; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) 6247; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 6248; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 6249; GFX1132-NEXT: s_and_saveexec_b32 s2, vcc_lo 6250; GFX1132-NEXT: s_cbranch_execz .LBB22_2 6251; GFX1132-NEXT: ; %bb.1: 6252; GFX1132-NEXT: v_mov_b32_e32 v0, 5 6253; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 6254; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6255; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 6256; GFX1132-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] 6257; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 6258; GFX1132-NEXT: buffer_gl0_inv 6259; GFX1132-NEXT: .LBB22_2: 6260; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2 6261; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 6262; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 6263; GFX1132-NEXT: v_mov_b32_e32 v1, 0 6264; GFX1132-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc_lo 6265; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) 6266; GFX1132-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[2:3], v[0:1] 6267; GFX1132-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo 6268; GFX1132-NEXT: v_cndmask_b32_e64 v1, 0, s3, vcc_lo 6269; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 6270; GFX1132-NEXT: s_mov_b32 s2, -1 6271; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 6272; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 6273; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 6274; GFX1132-NEXT: s_endpgm 6275entry: 6276 %old = atomicrmw umax i64 addrspace(3)* @local_var64, i64 5 acq_rel 6277 store i64 %old, i64 addrspace(1)* %out 6278 ret void 6279} 6280 6281define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) { 6282; 6283; 6284; GFX7LESS-LABEL: umin_i32_varying: 6285; GFX7LESS: ; %bb.0: ; %entry 6286; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 6287; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 6288; GFX7LESS-NEXT: s_mov_b32 m0, -1 6289; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 6290; GFX7LESS-NEXT: ds_min_rtn_u32 v0, v1, v0 6291; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 6292; GFX7LESS-NEXT: s_mov_b32 s3, 
0xf000 6293; GFX7LESS-NEXT: s_mov_b32 s2, -1 6294; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 6295; GFX7LESS-NEXT: s_endpgm 6296; 6297; GFX8-LABEL: umin_i32_varying: 6298; GFX8: ; %bb.0: ; %entry 6299; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 6300; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 6301; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 6302; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 6303; GFX8-NEXT: v_mov_b32_e32 v1, -1 6304; GFX8-NEXT: s_mov_b64 exec, s[2:3] 6305; GFX8-NEXT: v_mov_b32_e32 v2, v0 6306; GFX8-NEXT: s_not_b64 exec, exec 6307; GFX8-NEXT: v_mov_b32_e32 v2, -1 6308; GFX8-NEXT: s_not_b64 exec, exec 6309; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 6310; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 6311; GFX8-NEXT: s_nop 1 6312; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 6313; GFX8-NEXT: s_nop 1 6314; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 6315; GFX8-NEXT: s_nop 1 6316; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 6317; GFX8-NEXT: s_nop 1 6318; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 6319; GFX8-NEXT: s_nop 1 6320; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 6321; GFX8-NEXT: v_readlane_b32 s4, v2, 63 6322; GFX8-NEXT: s_nop 0 6323; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 6324; GFX8-NEXT: s_mov_b64 exec, s[2:3] 6325; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 6326; GFX8-NEXT: ; implicit-def: $vgpr0 6327; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 6328; GFX8-NEXT: s_cbranch_execz .LBB23_2 6329; GFX8-NEXT: ; %bb.1: 6330; GFX8-NEXT: v_mov_b32_e32 v0, 0 6331; GFX8-NEXT: v_mov_b32_e32 v3, s4 6332; GFX8-NEXT: s_mov_b32 m0, -1 6333; GFX8-NEXT: s_waitcnt lgkmcnt(0) 6334; GFX8-NEXT: ds_min_rtn_u32 v0, v0, v3 6335; GFX8-NEXT: s_waitcnt lgkmcnt(0) 6336; GFX8-NEXT: .LBB23_2: 6337; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 6338; GFX8-NEXT: 
s_waitcnt lgkmcnt(0) 6339; GFX8-NEXT: v_readfirstlane_b32 s2, v0 6340; GFX8-NEXT: v_mov_b32_e32 v0, v1 6341; GFX8-NEXT: v_min_u32_e32 v0, s2, v0 6342; GFX8-NEXT: s_mov_b32 s3, 0xf000 6343; GFX8-NEXT: s_mov_b32 s2, -1 6344; GFX8-NEXT: s_nop 0 6345; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 6346; GFX8-NEXT: s_endpgm 6347; 6348; GFX9-LABEL: umin_i32_varying: 6349; GFX9: ; %bb.0: ; %entry 6350; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 6351; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 6352; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 6353; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 6354; GFX9-NEXT: v_mov_b32_e32 v1, -1 6355; GFX9-NEXT: s_mov_b64 exec, s[2:3] 6356; GFX9-NEXT: v_mov_b32_e32 v2, v0 6357; GFX9-NEXT: s_not_b64 exec, exec 6358; GFX9-NEXT: v_mov_b32_e32 v2, -1 6359; GFX9-NEXT: s_not_b64 exec, exec 6360; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 6361; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 6362; GFX9-NEXT: s_nop 1 6363; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 6364; GFX9-NEXT: s_nop 1 6365; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 6366; GFX9-NEXT: s_nop 1 6367; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 6368; GFX9-NEXT: s_nop 1 6369; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 6370; GFX9-NEXT: s_nop 1 6371; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 6372; GFX9-NEXT: v_readlane_b32 s4, v2, 63 6373; GFX9-NEXT: s_nop 0 6374; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 6375; GFX9-NEXT: s_mov_b64 exec, s[2:3] 6376; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 6377; GFX9-NEXT: ; implicit-def: $vgpr0 6378; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 6379; GFX9-NEXT: s_cbranch_execz .LBB23_2 6380; GFX9-NEXT: ; %bb.1: 6381; GFX9-NEXT: v_mov_b32_e32 v0, 0 6382; GFX9-NEXT: v_mov_b32_e32 v3, s4 6383; GFX9-NEXT: s_waitcnt lgkmcnt(0) 6384; GFX9-NEXT: 
ds_min_rtn_u32 v0, v0, v3 6385; GFX9-NEXT: s_waitcnt lgkmcnt(0) 6386; GFX9-NEXT: .LBB23_2: 6387; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 6388; GFX9-NEXT: s_waitcnt lgkmcnt(0) 6389; GFX9-NEXT: v_readfirstlane_b32 s2, v0 6390; GFX9-NEXT: v_mov_b32_e32 v0, v1 6391; GFX9-NEXT: v_min_u32_e32 v0, s2, v0 6392; GFX9-NEXT: s_mov_b32 s3, 0xf000 6393; GFX9-NEXT: s_mov_b32 s2, -1 6394; GFX9-NEXT: s_nop 0 6395; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 6396; GFX9-NEXT: s_endpgm 6397; 6398; GFX1064-LABEL: umin_i32_varying: 6399; GFX1064: ; %bb.0: ; %entry 6400; GFX1064-NEXT: v_mov_b32_e32 v1, v0 6401; GFX1064-NEXT: s_not_b64 exec, exec 6402; GFX1064-NEXT: v_mov_b32_e32 v1, -1 6403; GFX1064-NEXT: s_not_b64 exec, exec 6404; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 6405; GFX1064-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 6406; GFX1064-NEXT: v_mov_b32_e32 v3, -1 6407; GFX1064-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 6408; GFX1064-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 6409; GFX1064-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 6410; GFX1064-NEXT: v_mov_b32_e32 v2, v1 6411; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 6412; GFX1064-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 6413; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 6414; GFX1064-NEXT: v_mov_b32_e32 v2, s4 6415; GFX1064-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 6416; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 6417; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 6418; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 6419; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 6420; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 6421; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 6422; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 6423; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 6424; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 6425; GFX1064-NEXT: 
s_or_saveexec_b64 s[2:3], -1 6426; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 6427; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 6428; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 6429; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 6430; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 6431; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 6432; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 6433; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 6434; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 6435; GFX1064-NEXT: s_mov_b32 s2, -1 6436; GFX1064-NEXT: ; implicit-def: $vgpr0 6437; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 6438; GFX1064-NEXT: s_cbranch_execz .LBB23_2 6439; GFX1064-NEXT: ; %bb.1: 6440; GFX1064-NEXT: v_mov_b32_e32 v0, 0 6441; GFX1064-NEXT: v_mov_b32_e32 v4, s7 6442; GFX1064-NEXT: s_mov_b32 s3, s7 6443; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6444; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 6445; GFX1064-NEXT: ds_min_rtn_u32 v0, v0, v4 6446; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 6447; GFX1064-NEXT: buffer_gl0_inv 6448; GFX1064-NEXT: .LBB23_2: 6449; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 6450; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 6451; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 6452; GFX1064-NEXT: v_mov_b32_e32 v0, v3 6453; GFX1064-NEXT: v_min_u32_e32 v0, s3, v0 6454; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 6455; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 6456; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 6457; GFX1064-NEXT: s_endpgm 6458; 6459; GFX1032-LABEL: umin_i32_varying: 6460; GFX1032: ; %bb.0: ; %entry 6461; GFX1032-NEXT: v_mov_b32_e32 v1, v0 6462; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 6463; GFX1032-NEXT: v_mov_b32_e32 v1, -1 6464; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 6465; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 6466; GFX1032-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 6467; GFX1032-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 6468; GFX1032-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 6469; GFX1032-NEXT: 
v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 6470; GFX1032-NEXT: v_mov_b32_e32 v2, v1 6471; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 6472; GFX1032-NEXT: s_mov_b32 exec_lo, s2 6473; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 6474; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 6475; GFX1032-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 6476; GFX1032-NEXT: v_mov_b32_e32 v3, -1 6477; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 6478; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 6479; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 6480; GFX1032-NEXT: s_mov_b32 exec_lo, s2 6481; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 6482; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 6483; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 6484; GFX1032-NEXT: s_mov_b32 exec_lo, s2 6485; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 6486; GFX1032-NEXT: s_mov_b32 s2, -1 6487; GFX1032-NEXT: ; implicit-def: $vgpr0 6488; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 6489; GFX1032-NEXT: s_cbranch_execz .LBB23_2 6490; GFX1032-NEXT: ; %bb.1: 6491; GFX1032-NEXT: v_mov_b32_e32 v0, 0 6492; GFX1032-NEXT: v_mov_b32_e32 v4, s4 6493; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6494; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 6495; GFX1032-NEXT: ds_min_rtn_u32 v0, v0, v4 6496; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 6497; GFX1032-NEXT: buffer_gl0_inv 6498; GFX1032-NEXT: .LBB23_2: 6499; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 6500; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 6501; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 6502; GFX1032-NEXT: v_mov_b32_e32 v0, v3 6503; GFX1032-NEXT: v_min_u32_e32 v0, s3, v0 6504; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 6505; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 6506; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 6507; GFX1032-NEXT: s_endpgm 6508; 6509; GFX1164-LABEL: umin_i32_varying: 6510; GFX1164: ; %bb.0: ; %entry 6511; GFX1164-NEXT: v_mov_b32_e32 v1, v0 6512; GFX1164-NEXT: s_not_b64 exec, exec 6513; GFX1164-NEXT: 
v_mov_b32_e32 v1, -1 6514; GFX1164-NEXT: s_not_b64 exec, exec 6515; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 6516; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) 6517; GFX1164-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 6518; GFX1164-NEXT: v_mov_b32_e32 v3, -1 6519; GFX1164-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 6520; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 6521; GFX1164-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 6522; GFX1164-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 6523; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 6524; GFX1164-NEXT: v_mov_b32_e32 v2, v1 6525; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1 6526; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 6527; GFX1164-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 6528; GFX1164-NEXT: v_readlane_b32 s4, v1, 31 6529; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 6530; GFX1164-NEXT: v_mov_b32_e32 v2, s4 6531; GFX1164-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 6532; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) 6533; GFX1164-NEXT: v_readlane_b32 s4, v1, 15 6534; GFX1164-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 6535; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 6536; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 6537; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 6538; GFX1164-NEXT: v_readlane_b32 s5, v1, 31 6539; GFX1164-NEXT: v_writelane_b32 v3, s4, 16 6540; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 6541; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 6542; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 6543; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 6544; GFX1164-NEXT: v_readlane_b32 s7, v1, 63 6545; GFX1164-NEXT: v_readlane_b32 s6, 
v1, 47 6546; GFX1164-NEXT: v_writelane_b32 v3, s5, 32 6547; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 6548; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) 6549; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 6550; GFX1164-NEXT: s_or_saveexec_b64 s[4:5], -1 6551; GFX1164-NEXT: v_writelane_b32 v3, s6, 48 6552; GFX1164-NEXT: s_mov_b64 exec, s[4:5] 6553; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 6554; GFX1164-NEXT: s_mov_b32 s2, -1 6555; GFX1164-NEXT: ; implicit-def: $vgpr0 6556; GFX1164-NEXT: s_and_saveexec_b64 s[4:5], vcc 6557; GFX1164-NEXT: s_cbranch_execz .LBB23_2 6558; GFX1164-NEXT: ; %bb.1: 6559; GFX1164-NEXT: v_mov_b32_e32 v0, 0 6560; GFX1164-NEXT: v_mov_b32_e32 v4, s7 6561; GFX1164-NEXT: s_mov_b32 s3, s7 6562; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6563; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 6564; GFX1164-NEXT: ds_min_rtn_u32 v0, v0, v4 6565; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 6566; GFX1164-NEXT: buffer_gl0_inv 6567; GFX1164-NEXT: .LBB23_2: 6568; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] 6569; GFX1164-NEXT: v_readfirstlane_b32 s3, v0 6570; GFX1164-NEXT: v_mov_b32_e32 v0, v3 6571; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) 6572; GFX1164-NEXT: v_min_u32_e32 v0, s3, v0 6573; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 6574; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 6575; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 6576; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 6577; GFX1164-NEXT: s_endpgm 6578; 6579; GFX1132-LABEL: umin_i32_varying: 6580; GFX1132: ; %bb.0: ; %entry 6581; GFX1132-NEXT: v_mov_b32_e32 v1, v0 6582; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo 6583; GFX1132-NEXT: v_mov_b32_e32 v1, -1 6584; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo 6585; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 6586; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 6587; GFX1132-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 6588; GFX1132-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:2 
row_mask:0xf bank_mask:0xf 6589; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 6590; GFX1132-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 6591; GFX1132-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 6592; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 6593; GFX1132-NEXT: v_mov_b32_e32 v2, v1 6594; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1 6595; GFX1132-NEXT: s_mov_b32 exec_lo, s2 6596; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 6597; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 6598; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) 6599; GFX1132-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 6600; GFX1132-NEXT: v_mov_b32_e32 v3, -1 6601; GFX1132-NEXT: v_readlane_b32 s3, v1, 15 6602; GFX1132-NEXT: v_readlane_b32 s4, v1, 31 6603; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) 6604; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 6605; GFX1132-NEXT: s_mov_b32 exec_lo, s2 6606; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 6607; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 6608; GFX1132-NEXT: v_writelane_b32 v3, s3, 16 6609; GFX1132-NEXT: s_mov_b32 exec_lo, s2 6610; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) 6611; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 6612; GFX1132-NEXT: s_mov_b32 s2, -1 6613; GFX1132-NEXT: ; implicit-def: $vgpr0 6614; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo 6615; GFX1132-NEXT: s_cbranch_execz .LBB23_2 6616; GFX1132-NEXT: ; %bb.1: 6617; GFX1132-NEXT: v_mov_b32_e32 v0, 0 6618; GFX1132-NEXT: v_mov_b32_e32 v4, s4 6619; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6620; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 6621; GFX1132-NEXT: ds_min_rtn_u32 v0, v0, v4 6622; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 6623; GFX1132-NEXT: buffer_gl0_inv 6624; GFX1132-NEXT: .LBB23_2: 6625; GFX1132-NEXT: s_or_b32 
exec_lo, exec_lo, s3 6626; GFX1132-NEXT: v_readfirstlane_b32 s3, v0 6627; GFX1132-NEXT: v_mov_b32_e32 v0, v3 6628; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) 6629; GFX1132-NEXT: v_min_u32_e32 v0, s3, v0 6630; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 6631; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 6632; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 6633; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 6634; GFX1132-NEXT: s_endpgm 6635entry: 6636 %lane = call i32 @llvm.amdgcn.workitem.id.x() 6637 %old = atomicrmw umin i32 addrspace(3)* @local_var32, i32 %lane acq_rel 6638 store i32 %old, i32 addrspace(1)* %out 6639 ret void 6640} 6641

; Kernel under test -- issues `atomicrmw umin` with the constant i64 5 on the
; addrspace(3) global @local_var64 (acq_rel ordering) and stores the returned
; old value through %out.
;
; Per the assertions below, every target is expected to:
;   1. compute v_mbcnt_* over exec and compare the result with 0, so that
;      only the lane whose count is 0 passes s_and_saveexec,
;   2. have that lane issue a single ds_min_rtn_u64 with the operand 5,
;   3. broadcast the returned 64-bit value with two v_readfirstlane_b32,
;   4. in each lane, select -1 (the lane that issued the atomic) or 5 (all
;      other lanes) and keep the unsigned minimum of that and the broadcast
;      value before storing it.
; The GFX1032 and GFX1132 variants omit v_mbcnt_hi and use the 32-bit
; exec_lo/vcc_lo forms.
;
; The assertion lines are autogenerated (see the NOTE on the first line of the
; file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) {
;
;
; GFX7LESS-LABEL: umin_i64_constant:
; GFX7LESS: ; %bb.0: ; %entry
; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX7LESS-NEXT: s_cbranch_execz .LBB24_2
; GFX7LESS-NEXT: ; %bb.1:
; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0
; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5
; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0
; GFX7LESS-NEXT: s_mov_b32 m0, -1
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1]
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: .LBB24_2:
; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0
; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1
; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc
; GFX7LESS-NEXT: v_mov_b32_e32 v2, s5
; GFX7LESS-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1]
; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4
; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX7LESS-NEXT: s_endpgm
;
; GFX8-LABEL: umin_i64_constant:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX8-NEXT: s_cbranch_execz .LBB24_2
; GFX8-NEXT: ; %bb.1:
; GFX8-NEXT: v_mov_b32_e32 v0, 5
; GFX8-NEXT: v_mov_b32_e32 v2, 0
; GFX8-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NEXT: s_mov_b32 m0, -1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1]
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: .LBB24_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s4, v0
; GFX8-NEXT: v_readfirstlane_b32 s5, v1
; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc
; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1]
; GFX8-NEXT: v_mov_b32_e32 v2, s5
; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
; GFX8-NEXT: v_mov_b32_e32 v2, s4
; GFX8-NEXT: s_mov_b32 s2, -1
; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: umin_i64_constant:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT: s_cbranch_execz .LBB24_2
; GFX9-NEXT: ; %bb.1:
; GFX9-NEXT: v_mov_b32_e32 v0, 5
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: .LBB24_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s4, v0
; GFX9-NEXT: v_readfirstlane_b32 s5, v1
; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc
; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1]
; GFX9-NEXT: v_mov_b32_e32 v2, s5
; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
; GFX9-NEXT: v_mov_b32_e32 v2, s4
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: umin_i64_constant:
; GFX1064: ; %bb.0: ; %entry
; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB24_2
; GFX1064-NEXT: ; %bb.1:
; GFX1064-NEXT: v_mov_b32_e32 v0, 5
; GFX1064-NEXT: v_mov_b32_e32 v1, 0
; GFX1064-NEXT: v_mov_b32_e32 v2, 0
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1064-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1]
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: .LBB24_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX1064-NEXT: v_readfirstlane_b32 s2, v0
; GFX1064-NEXT: v_readfirstlane_b32 s3, v1
; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
; GFX1064-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc
; GFX1064-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1]
; GFX1064-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc
; GFX1064-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc
; GFX1064-NEXT: s_mov_b32 s2, -1
; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: umin_i64_constant:
; GFX1032: ; %bb.0: ; %entry
; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB24_2
; GFX1032-NEXT: ; %bb.1:
; GFX1032-NEXT: v_mov_b32_e32 v0, 5
; GFX1032-NEXT: v_mov_b32_e32 v1, 0
; GFX1032-NEXT: v_mov_b32_e32 v2, 0
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1032-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1]
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: .LBB24_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2
; GFX1032-NEXT: v_readfirstlane_b32 s2, v0
; GFX1032-NEXT: v_readfirstlane_b32 s3, v1
; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
; GFX1032-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc_lo
; GFX1032-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[2:3], v[0:1]
; GFX1032-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo
; GFX1032-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo
; GFX1032-NEXT: s_mov_b32 s2, -1
; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: umin_i64_constant:
; GFX1164: ; %bb.0: ; %entry
; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1164-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1164-NEXT: s_cbranch_execz .LBB24_2
; GFX1164-NEXT: ; %bb.1:
; GFX1164-NEXT: v_mov_b32_e32 v0, 5
; GFX1164-NEXT: v_mov_b32_e32 v1, 0
; GFX1164-NEXT: v_mov_b32_e32 v2, 0
; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1164-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1]
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB24_2:
; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX1164-NEXT: v_readfirstlane_b32 s2, v0
; GFX1164-NEXT: v_readfirstlane_b32 s3, v1
; GFX1164-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
; GFX1164-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1]
; GFX1164-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc
; GFX1164-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc
; GFX1164-NEXT: s_mov_b32 s2, -1
; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: umin_i64_constant:
; GFX1132: ; %bb.0: ; %entry
; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1132-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1132-NEXT: s_cbranch_execz .LBB24_2
; GFX1132-NEXT: ; %bb.1:
; GFX1132-NEXT: v_mov_b32_e32 v0, 5
; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0
; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1132-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1]
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: .LBB24_2:
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2
; GFX1132-NEXT: v_readfirstlane_b32 s2, v0
; GFX1132-NEXT: v_readfirstlane_b32 s3, v1
; GFX1132-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
; GFX1132-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc_lo
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[2:3], v[0:1]
; GFX1132-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo
; GFX1132-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo
; GFX1132-NEXT: s_mov_b32 s2, -1
; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT: s_endpgm
entry:
  %old = atomicrmw umin i64 addrspace(3)* @local_var64, i64 5 acq_rel
  store i64 %old, i64 addrspace(1)* %out
  ret void
}