; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --force-update
; RUN: llc -march=amdgcn -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX7LESS %s
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX1064 %s
; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX1032 %s
; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX1164 %s
; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX1132 %s

declare i32 @llvm.amdgcn.workitem.id.x()

; Targets of the atomics below; both live in address space 3 and are left
; uninitialised (undef).
@local_var32 = addrspace(3) global i32 undef, align 4
@local_var64 = addrspace(3) global i64 undef, align 8

; Show what the atomic optimization pass will do for local pointers.
; atomicrmw add of the constant 5 to @local_var32; the returned old value is
; stored to %out. In the expected output below a single lane (the one whose
; mbcnt result is 0) issues one ds_add_rtn_u32 whose operand is
; s_bcnt1(exec) * 5, and every lane then rebuilds its own value from
; v_readfirstlane plus mbcnt * 5 (v_mad_u32_u24).
define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out) {
;
;
; GFX7LESS-LABEL: add_i32_constant:
; GFX7LESS:       ; %bb.0: ; %entry
; GFX7LESS-NEXT:    s_mov_b64 s[2:3], exec
; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s3, v0
; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX7LESS-NEXT:    ; implicit-def: $vgpr1
; GFX7LESS-NEXT:    s_and_saveexec_b64 s[4:5], vcc
; GFX7LESS-NEXT:    s_cbranch_execz .LBB0_2
; GFX7LESS-NEXT:  ; %bb.1:
; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
; GFX7LESS-NEXT:    s_mul_i32 s2, s2, 5
; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s2
; GFX7LESS-NEXT:    s_mov_b32 m0, -1
; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT:    ds_add_rtn_u32 v1, v1, v2
; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT:  .LBB0_2:
; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[4:5]
; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT:    v_readfirstlane_b32 s2, v1
; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
; GFX7LESS-NEXT:    s_mov_b32 s2, -1
; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX7LESS-NEXT:    s_endpgm
;
; GFX8-LABEL: add_i32_constant:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT:    s_mov_b64 s[2:3], exec
; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX8-NEXT:    ; implicit-def: $vgpr1
; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
; GFX8-NEXT:    s_cbranch_execz .LBB0_2
; GFX8-NEXT:  ; %bb.1:
; GFX8-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
; GFX8-NEXT:    s_mul_i32 s2, s2, 5
; GFX8-NEXT:    v_mov_b32_e32 v1, 0
; GFX8-NEXT:    v_mov_b32_e32 v2, s2
; GFX8-NEXT:    s_mov_b32 m0, -1
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    ds_add_rtn_u32 v1, v1, v2
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:  .LBB0_2:
; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_readfirstlane_b32 s2, v1
; GFX8-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
; GFX8-NEXT:    s_mov_b32 s3, 0xf000
; GFX8-NEXT:    s_mov_b32 s2, -1
; GFX8-NEXT:    s_nop 1
; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: add_i32_constant:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT:    s_mov_b64 s[2:3], exec
; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT:    ; implicit-def: $vgpr1
; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT:    s_cbranch_execz .LBB0_2
; GFX9-NEXT:  ; %bb.1:
; GFX9-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
; GFX9-NEXT:    s_mul_i32 s2, s2, 5
; GFX9-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-NEXT:    v_mov_b32_e32 v2, s2
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    ds_add_rtn_u32 v1, v1, v2
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:  .LBB0_2:
; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_readfirstlane_b32 s2, v1
; GFX9-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
; GFX9-NEXT:    s_mov_b32 s3, 0xf000
; GFX9-NEXT:    s_mov_b32 s2, -1
; GFX9-NEXT:    s_nop 1
; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT:    s_endpgm
;
; GFX1064-LABEL: add_i32_constant:
; GFX1064:       ; %bb.0: ; %entry
; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1064-NEXT:    s_mov_b64 s[2:3], exec
; GFX1064-NEXT:    ; implicit-def: $vgpr1
; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
; GFX1064-NEXT:    s_cbranch_execz .LBB0_2
; GFX1064-NEXT:  ; %bb.1:
; GFX1064-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
; GFX1064-NEXT:    s_mul_i32 s2, s2, 5
; GFX1064-NEXT:    v_mov_b32_e32 v2, s2
; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1064-NEXT:    ds_add_rtn_u32 v1, v1, v2
; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1064-NEXT:    buffer_gl0_inv
; GFX1064-NEXT:  .LBB0_2:
; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
; GFX1064-NEXT:    v_readfirstlane_b32 s2, v1
; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
; GFX1064-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
; GFX1064-NEXT:    s_mov_b32 s2, -1
; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX1064-NEXT:    s_endpgm
;
; GFX1032-LABEL: add_i32_constant:
; GFX1032:       ; %bb.0: ; %entry
; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1032-NEXT:    s_mov_b32 s3, exec_lo
; GFX1032-NEXT:    ; implicit-def: $vgpr1
; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, s3, 0
; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT:    s_and_saveexec_b32 s2, vcc_lo
; GFX1032-NEXT:    s_cbranch_execz .LBB0_2
; GFX1032-NEXT:  ; %bb.1:
; GFX1032-NEXT:    s_bcnt1_i32_b32 s3, s3
; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
; GFX1032-NEXT:    s_mul_i32 s3, s3, 5
; GFX1032-NEXT:    v_mov_b32_e32 v2, s3
; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1032-NEXT:    ds_add_rtn_u32 v1, v1, v2
; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1032-NEXT:    buffer_gl0_inv
; GFX1032-NEXT:  .LBB0_2:
; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
; GFX1032-NEXT:    v_readfirstlane_b32 s2, v1
; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
; GFX1032-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
; GFX1032-NEXT:    s_mov_b32 s2, -1
; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX1032-NEXT:    s_endpgm
;
; GFX1164-LABEL: add_i32_constant:
; GFX1164:       ; %bb.0: ; %entry
; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
; GFX1164-NEXT:    s_mov_b64 s[2:3], exec
; GFX1164-NEXT:    s_mov_b64 s[4:5], exec
; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1164-NEXT:    ; implicit-def: $vgpr1
; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v0
; GFX1164-NEXT:    s_cbranch_execz .LBB0_2
; GFX1164-NEXT:  ; %bb.1:
; GFX1164-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
; GFX1164-NEXT:    s_mul_i32 s2, s2, 5
; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164-NEXT:    v_mov_b32_e32 v2, s2
; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1164-NEXT:    ds_add_rtn_u32 v1, v1, v2
; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1164-NEXT:    buffer_gl0_inv
; GFX1164-NEXT:  .LBB0_2:
; GFX1164-NEXT:    s_or_b64 exec, exec, s[4:5]
; GFX1164-NEXT:    v_readfirstlane_b32 s2, v1
; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
; GFX1164-NEXT:    s_mov_b32 s2, -1
; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1164-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
; GFX1164-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-NEXT:    s_endpgm
;
; GFX1132-LABEL: add_i32_constant:
; GFX1132:       ; %bb.0: ; %entry
; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
; GFX1132-NEXT:    s_mov_b32 s3, exec_lo
; GFX1132-NEXT:    s_mov_b32 s2, exec_lo
; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, s3, 0
; GFX1132-NEXT:    ; implicit-def: $vgpr1
; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v0
; GFX1132-NEXT:    s_cbranch_execz .LBB0_2
; GFX1132-NEXT:  ; %bb.1:
; GFX1132-NEXT:    s_bcnt1_i32_b32 s3, s3
; GFX1132-NEXT:    v_mov_b32_e32 v1, 0
; GFX1132-NEXT:    s_mul_i32 s3, s3, 5
; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT:    v_mov_b32_e32 v2, s3
; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1132-NEXT:    ds_add_rtn_u32 v1, v1, v2
; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1132-NEXT:    buffer_gl0_inv
; GFX1132-NEXT:  .LBB0_2:
; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s2
; GFX1132-NEXT:    v_readfirstlane_b32 s2, v1
; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
; GFX1132-NEXT:    s_mov_b32 s2, -1
; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1132-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT:    s_endpgm
entry:
  %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 5 acq_rel
  store i32 %old, i32 addrspace(1)* %out
  ret void
}

; atomicrmw add of the kernel argument %additive to @local_var32; the returned
; old value is stored to %out. The expected output has the same single-lane
; shape as add_i32_constant, except that the combined operand is
; %additive * s_bcnt1(exec) (s_mul_i32) and the per-lane value is rebuilt with
; v_mul_lo_u32 plus an add on the GFX7LESS, GFX8 and GFX9 runs, or with
; v_mad_u64_u32 on the gfx1010 and gfx1100 runs.
define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 %additive) {
;
;
; GFX7LESS-LABEL: add_i32_uniform:
; GFX7LESS:       ; %bb.0: ; %entry
; GFX7LESS-NEXT:    s_mov_b64 s[2:3], exec
; GFX7LESS-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; GFX7LESS-NEXT:    s_load_dword s6, s[0:1], 0xb
; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s3, v0
; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX7LESS-NEXT:    ; implicit-def: $vgpr1
; GFX7LESS-NEXT:    s_and_saveexec_b64 s[0:1], vcc
; GFX7LESS-NEXT:    s_cbranch_execz .LBB1_2
; GFX7LESS-NEXT:  ; %bb.1:
; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT:    s_mul_i32 s2, s6, s2
; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s2
; GFX7LESS-NEXT:    s_mov_b32 m0, -1
; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT:    ds_add_rtn_u32 v1, v1, v2
; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT:  .LBB1_2:
; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[0:1]
; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT:    v_readfirstlane_b32 s0, v1
; GFX7LESS-NEXT:    v_mul_lo_u32 v0, s6, v0
; GFX7LESS-NEXT:    s_mov_b32 s7, 0xf000
; GFX7LESS-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
; GFX7LESS-NEXT:    s_mov_b32 s6, -1
; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX7LESS-NEXT:    s_endpgm
;
; GFX8-LABEL: add_i32_uniform:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX8-NEXT:    s_load_dword s6, s[0:1], 0x2c
; GFX8-NEXT:    s_mov_b64 s[2:3], exec
; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX8-NEXT:    ; implicit-def: $vgpr1
; GFX8-NEXT:    s_and_saveexec_b64 s[0:1], vcc
; GFX8-NEXT:    s_cbranch_execz .LBB1_2
; GFX8-NEXT:  ; %bb.1:
; GFX8-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    s_mul_i32 s2, s6, s2
; GFX8-NEXT:    v_mov_b32_e32 v1, 0
; GFX8-NEXT:    v_mov_b32_e32 v2, s2
; GFX8-NEXT:    s_mov_b32 m0, -1
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    ds_add_rtn_u32 v1, v1, v2
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:  .LBB1_2:
; GFX8-NEXT:    s_or_b64 exec, exec, s[0:1]
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mul_lo_u32 v0, s6, v0
; GFX8-NEXT:    v_readfirstlane_b32 s0, v1
; GFX8-NEXT:    s_mov_b32 s7, 0xf000
; GFX8-NEXT:    s_mov_b32 s6, -1
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
; GFX8-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: add_i32_uniform:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX9-NEXT:    s_load_dword s6, s[0:1], 0x2c
; GFX9-NEXT:    s_mov_b64 s[2:3], exec
; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT:    ; implicit-def: $vgpr1
; GFX9-NEXT:    s_and_saveexec_b64 s[0:1], vcc
; GFX9-NEXT:    s_cbranch_execz .LBB1_2
; GFX9-NEXT:  ; %bb.1:
; GFX9-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_mul_i32 s2, s6, s2
; GFX9-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-NEXT:    v_mov_b32_e32 v2, s2
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    ds_add_rtn_u32 v1, v1, v2
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:  .LBB1_2:
; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mul_lo_u32 v0, s6, v0
; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
; GFX9-NEXT:    s_mov_b32 s7, 0xf000
; GFX9-NEXT:    s_mov_b32 s6, -1
; GFX9-NEXT:    v_add_u32_e32 v0, s0, v0
; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT:    s_endpgm
;
; GFX1064-LABEL: add_i32_uniform:
; GFX1064:       ; %bb.0: ; %entry
; GFX1064-NEXT:    s_clause 0x1
; GFX1064-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX1064-NEXT:    s_load_dword s6, s[0:1], 0x2c
; GFX1064-NEXT:    s_mov_b64 s[2:3], exec
; GFX1064-NEXT:    ; implicit-def: $vgpr1
; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT:    s_and_saveexec_b64 s[0:1], vcc
; GFX1064-NEXT:    s_cbranch_execz .LBB1_2
; GFX1064-NEXT:  ; %bb.1:
; GFX1064-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1064-NEXT:    s_mul_i32 s2, s6, s2
; GFX1064-NEXT:    v_mov_b32_e32 v2, s2
; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1064-NEXT:    ds_add_rtn_u32 v1, v1, v2
; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1064-NEXT:    buffer_gl0_inv
; GFX1064-NEXT:  .LBB1_2:
; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
; GFX1064-NEXT:    s_or_b64 exec, exec, s[0:1]
; GFX1064-NEXT:    v_readfirstlane_b32 s0, v1
; GFX1064-NEXT:    s_mov_b32 s7, 0x31016000
; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1064-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], s6, v0, s[0:1]
; GFX1064-NEXT:    s_mov_b32 s6, -1
; GFX1064-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX1064-NEXT:    s_endpgm
;
; GFX1032-LABEL: add_i32_uniform:
; GFX1032:       ; %bb.0: ; %entry
; GFX1032-NEXT:    s_clause 0x1
; GFX1032-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX1032-NEXT:    s_load_dword s2, s[0:1], 0x2c
; GFX1032-NEXT:    s_mov_b32 s3, exec_lo
; GFX1032-NEXT:    ; implicit-def: $vgpr1
; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, s3, 0
; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
; GFX1032-NEXT:    s_cbranch_execz .LBB1_2
; GFX1032-NEXT:  ; %bb.1:
; GFX1032-NEXT:    s_bcnt1_i32_b32 s1, s3
; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1032-NEXT:    s_mul_i32 s1, s2, s1
; GFX1032-NEXT:    v_mov_b32_e32 v2, s1
; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1032-NEXT:    ds_add_rtn_u32 v1, v1, v2
; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1032-NEXT:    buffer_gl0_inv
; GFX1032-NEXT:  .LBB1_2:
; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s0
; GFX1032-NEXT:    v_readfirstlane_b32 s0, v1
; GFX1032-NEXT:    s_mov_b32 s7, 0x31016000
; GFX1032-NEXT:    s_mov_b32 s6, -1
; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1032-NEXT:    v_mad_u64_u32 v[0:1], s0, s2, v0, s[0:1]
; GFX1032-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX1032-NEXT:    s_endpgm
;
; GFX1164-LABEL: add_i32_uniform:
; GFX1164:       ; %bb.0: ; %entry
; GFX1164-NEXT:    s_clause 0x1
; GFX1164-NEXT:    s_load_b64 s[4:5], s[0:1], 0x24
; GFX1164-NEXT:    s_load_b32 s6, s[0:1], 0x2c
; GFX1164-NEXT:    s_mov_b64 s[2:3], exec
; GFX1164-NEXT:    s_mov_b64 s[0:1], exec
; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1164-NEXT:    ; implicit-def: $vgpr1
; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v0
; GFX1164-NEXT:    s_cbranch_execz .LBB1_2
; GFX1164-NEXT:  ; %bb.1:
; GFX1164-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1164-NEXT:    s_mul_i32 s2, s6, s2
; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164-NEXT:    v_mov_b32_e32 v2, s2
; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1164-NEXT:    ds_add_rtn_u32 v1, v1, v2
; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1164-NEXT:    buffer_gl0_inv
; GFX1164-NEXT:  .LBB1_2:
; GFX1164-NEXT:    s_or_b64 exec, exec, s[0:1]
; GFX1164-NEXT:    v_readfirstlane_b32 s0, v1
; GFX1164-NEXT:    s_mov_b32 s7, 0x31016000
; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT:    v_mad_u64_u32 v[1:2], null, s6, v0, s[0:1]
; GFX1164-NEXT:    s_mov_b32 s6, -1
; GFX1164-NEXT:    buffer_store_b32 v1, off, s[4:7], 0
; GFX1164-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-NEXT:    s_endpgm
;
; GFX1132-LABEL: add_i32_uniform:
; GFX1132:       ; %bb.0: ; %entry
; GFX1132-NEXT:    s_clause 0x1
; GFX1132-NEXT:    s_load_b64 s[4:5], s[0:1], 0x24
; GFX1132-NEXT:    s_load_b32 s0, s[0:1], 0x2c
; GFX1132-NEXT:    s_mov_b32 s2, exec_lo
; GFX1132-NEXT:    s_mov_b32 s1, exec_lo
; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1132-NEXT:    ; implicit-def: $vgpr1
; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v0
; GFX1132-NEXT:    s_cbranch_execz .LBB1_2
; GFX1132-NEXT:  ; %bb.1:
; GFX1132-NEXT:    s_bcnt1_i32_b32 s2, s2
; GFX1132-NEXT:    v_mov_b32_e32 v1, 0
; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1132-NEXT:    s_mul_i32 s2, s0, s2
; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT:    v_mov_b32_e32 v2, s2
; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1132-NEXT:    ds_add_rtn_u32 v1, v1, v2
; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1132-NEXT:    buffer_gl0_inv
; GFX1132-NEXT:  .LBB1_2:
; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s1
; GFX1132-NEXT:    v_readfirstlane_b32 s2, v1
; GFX1132-NEXT:    s_mov_b32 s7, 0x31016000
; GFX1132-NEXT:    s_mov_b32 s6, -1
; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT:    v_mad_u64_u32 v[1:2], null, s0, v0, s[2:3]
; GFX1132-NEXT:    buffer_store_b32 v1, off, s[4:7], 0
; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT:    s_endpgm
entry:
  %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %additive acq_rel
  store i32 %old, i32 addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out) {
;
;
; GFX7LESS-LABEL: add_i32_varying:
; GFX7LESS:       ; %bb.0: ; %entry
; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
; GFX7LESS-NEXT:    s_mov_b32 m0, -1
; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT:    ds_add_rtn_u32 v0, v1, v0
; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT:    s_mov_b32 s2, -1
; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX7LESS-NEXT:    s_endpgm
;
; GFX8-LABEL: add_i32_varying:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
; GFX8-NEXT:    v_mov_b32_e32 v1, 0
; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
; GFX8-NEXT:    v_mov_b32_e32 v2, v0
; GFX8-NEXT:    s_not_b64 exec, exec
; GFX8-NEXT:    v_mov_b32_e32 v2, 0
; GFX8-NEXT:    s_not_b64 exec, exec
; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX8-NEXT:    s_nop 1
; GFX8-NEXT:    v_add_u32_dpp v2, vcc,
v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 516; GFX8-NEXT: s_nop 1 517; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 518; GFX8-NEXT: s_nop 1 519; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 520; GFX8-NEXT: s_nop 1 521; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 522; GFX8-NEXT: s_nop 1 523; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 524; GFX8-NEXT: v_readlane_b32 s4, v2, 63 525; GFX8-NEXT: s_nop 0 526; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 527; GFX8-NEXT: s_mov_b64 exec, s[2:3] 528; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 529; GFX8-NEXT: ; implicit-def: $vgpr0 530; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 531; GFX8-NEXT: s_cbranch_execz .LBB2_2 532; GFX8-NEXT: ; %bb.1: 533; GFX8-NEXT: v_mov_b32_e32 v0, 0 534; GFX8-NEXT: v_mov_b32_e32 v3, s4 535; GFX8-NEXT: s_mov_b32 m0, -1 536; GFX8-NEXT: s_waitcnt lgkmcnt(0) 537; GFX8-NEXT: ds_add_rtn_u32 v0, v0, v3 538; GFX8-NEXT: s_waitcnt lgkmcnt(0) 539; GFX8-NEXT: .LBB2_2: 540; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 541; GFX8-NEXT: s_waitcnt lgkmcnt(0) 542; GFX8-NEXT: v_readfirstlane_b32 s2, v0 543; GFX8-NEXT: v_mov_b32_e32 v0, v1 544; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0 545; GFX8-NEXT: s_mov_b32 s3, 0xf000 546; GFX8-NEXT: s_mov_b32 s2, -1 547; GFX8-NEXT: s_nop 0 548; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 549; GFX8-NEXT: s_endpgm 550; 551; GFX9-LABEL: add_i32_varying: 552; GFX9: ; %bb.0: ; %entry 553; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 554; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 555; GFX9-NEXT: v_mov_b32_e32 v1, 0 556; GFX9-NEXT: s_mov_b64 exec, s[2:3] 557; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 558; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 559; GFX9-NEXT: v_mov_b32_e32 v2, v0 560; GFX9-NEXT: s_not_b64 exec, exec 561; GFX9-NEXT: v_mov_b32_e32 v2, 0 562; GFX9-NEXT: s_not_b64 
exec, exec 563; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 564; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 565; GFX9-NEXT: s_nop 1 566; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 567; GFX9-NEXT: s_nop 1 568; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 569; GFX9-NEXT: s_nop 1 570; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 571; GFX9-NEXT: s_nop 1 572; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 573; GFX9-NEXT: s_nop 1 574; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 575; GFX9-NEXT: v_readlane_b32 s4, v2, 63 576; GFX9-NEXT: s_nop 0 577; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 578; GFX9-NEXT: s_mov_b64 exec, s[2:3] 579; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 580; GFX9-NEXT: ; implicit-def: $vgpr0 581; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 582; GFX9-NEXT: s_cbranch_execz .LBB2_2 583; GFX9-NEXT: ; %bb.1: 584; GFX9-NEXT: v_mov_b32_e32 v0, 0 585; GFX9-NEXT: v_mov_b32_e32 v3, s4 586; GFX9-NEXT: s_waitcnt lgkmcnt(0) 587; GFX9-NEXT: ds_add_rtn_u32 v0, v0, v3 588; GFX9-NEXT: s_waitcnt lgkmcnt(0) 589; GFX9-NEXT: .LBB2_2: 590; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 591; GFX9-NEXT: s_waitcnt lgkmcnt(0) 592; GFX9-NEXT: v_readfirstlane_b32 s2, v0 593; GFX9-NEXT: v_mov_b32_e32 v0, v1 594; GFX9-NEXT: v_add_u32_e32 v0, s2, v0 595; GFX9-NEXT: s_mov_b32 s3, 0xf000 596; GFX9-NEXT: s_mov_b32 s2, -1 597; GFX9-NEXT: s_nop 0 598; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 599; GFX9-NEXT: s_endpgm 600; 601; GFX1064-LABEL: add_i32_varying: 602; GFX1064: ; %bb.0: ; %entry 603; GFX1064-NEXT: v_mov_b32_e32 v1, v0 604; GFX1064-NEXT: s_not_b64 exec, exec 605; GFX1064-NEXT: v_mov_b32_e32 v1, 0 606; GFX1064-NEXT: s_not_b64 exec, exec 607; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 608; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 
row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 609; GFX1064-NEXT: v_mov_b32_e32 v3, 0 610; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 611; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 612; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 613; GFX1064-NEXT: v_mov_b32_e32 v2, v1 614; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 615; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 616; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 617; GFX1064-NEXT: v_mov_b32_e32 v2, s4 618; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 619; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 620; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 621; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 622; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 623; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 624; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 625; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 626; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 627; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 628; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 629; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 630; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 631; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 632; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 633; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 634; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 635; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 636; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 637; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 638; GFX1064-NEXT: s_mov_b32 s2, -1 639; GFX1064-NEXT: ; implicit-def: $vgpr0 640; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 641; GFX1064-NEXT: s_cbranch_execz .LBB2_2 642; GFX1064-NEXT: ; %bb.1: 643; GFX1064-NEXT: v_mov_b32_e32 v0, 0 644; GFX1064-NEXT: v_mov_b32_e32 v4, s7 645; GFX1064-NEXT: s_mov_b32 s3, s7 646; GFX1064-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) 647; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 648; GFX1064-NEXT: ds_add_rtn_u32 v0, v0, v4 649; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 650; GFX1064-NEXT: buffer_gl0_inv 651; GFX1064-NEXT: .LBB2_2: 652; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 653; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 654; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 655; GFX1064-NEXT: v_mov_b32_e32 v0, v3 656; GFX1064-NEXT: v_add_nc_u32_e32 v0, s3, v0 657; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 658; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 659; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 660; GFX1064-NEXT: s_endpgm 661; 662; GFX1032-LABEL: add_i32_varying: 663; GFX1032: ; %bb.0: ; %entry 664; GFX1032-NEXT: v_mov_b32_e32 v1, v0 665; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 666; GFX1032-NEXT: v_mov_b32_e32 v1, 0 667; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 668; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 669; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 670; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 671; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 672; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 673; GFX1032-NEXT: v_mov_b32_e32 v2, v1 674; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 675; GFX1032-NEXT: s_mov_b32 exec_lo, s2 676; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 677; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 678; GFX1032-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 679; GFX1032-NEXT: v_mov_b32_e32 v3, 0 680; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 681; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 682; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 683; GFX1032-NEXT: s_mov_b32 exec_lo, s2 684; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 685; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 686; GFX1032-NEXT: v_writelane_b32 v3, 
s3, 16 687; GFX1032-NEXT: s_mov_b32 exec_lo, s2 688; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 689; GFX1032-NEXT: s_mov_b32 s2, -1 690; GFX1032-NEXT: ; implicit-def: $vgpr0 691; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 692; GFX1032-NEXT: s_cbranch_execz .LBB2_2 693; GFX1032-NEXT: ; %bb.1: 694; GFX1032-NEXT: v_mov_b32_e32 v0, 0 695; GFX1032-NEXT: v_mov_b32_e32 v4, s4 696; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 697; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 698; GFX1032-NEXT: ds_add_rtn_u32 v0, v0, v4 699; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 700; GFX1032-NEXT: buffer_gl0_inv 701; GFX1032-NEXT: .LBB2_2: 702; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 703; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 704; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 705; GFX1032-NEXT: v_mov_b32_e32 v0, v3 706; GFX1032-NEXT: v_add_nc_u32_e32 v0, s3, v0 707; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 708; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 709; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 710; GFX1032-NEXT: s_endpgm 711; 712; GFX1164-LABEL: add_i32_varying: 713; GFX1164: ; %bb.0: ; %entry 714; GFX1164-NEXT: v_mov_b32_e32 v1, v0 715; GFX1164-NEXT: s_not_b64 exec, exec 716; GFX1164-NEXT: v_mov_b32_e32 v1, 0 717; GFX1164-NEXT: s_not_b64 exec, exec 718; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 719; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) 720; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 721; GFX1164-NEXT: v_mov_b32_e32 v3, 0 722; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 723; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 724; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 725; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 726; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 727; 
GFX1164-NEXT: v_mov_b32_e32 v2, v1 728; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1 729; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 730; GFX1164-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 731; GFX1164-NEXT: v_readlane_b32 s4, v1, 31 732; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 733; GFX1164-NEXT: v_mov_b32_e32 v2, s4 734; GFX1164-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 735; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) 736; GFX1164-NEXT: v_readlane_b32 s4, v1, 15 737; GFX1164-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 738; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 739; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 740; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 741; GFX1164-NEXT: v_readlane_b32 s5, v1, 31 742; GFX1164-NEXT: v_writelane_b32 v3, s4, 16 743; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 744; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 745; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 746; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 747; GFX1164-NEXT: v_readlane_b32 s7, v1, 63 748; GFX1164-NEXT: v_readlane_b32 s6, v1, 47 749; GFX1164-NEXT: v_writelane_b32 v3, s5, 32 750; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 751; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) 752; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 753; GFX1164-NEXT: s_or_saveexec_b64 s[4:5], -1 754; GFX1164-NEXT: v_writelane_b32 v3, s6, 48 755; GFX1164-NEXT: s_mov_b64 exec, s[4:5] 756; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 757; GFX1164-NEXT: s_mov_b32 s2, -1 758; GFX1164-NEXT: ; implicit-def: $vgpr0 759; GFX1164-NEXT: s_and_saveexec_b64 s[4:5], vcc 760; GFX1164-NEXT: s_cbranch_execz .LBB2_2 761; GFX1164-NEXT: ; %bb.1: 762; GFX1164-NEXT: v_mov_b32_e32 v0, 0 763; GFX1164-NEXT: v_mov_b32_e32 v4, s7 764; GFX1164-NEXT: s_mov_b32 s3, s7 765; GFX1164-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) 766; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 767; GFX1164-NEXT: ds_add_rtn_u32 v0, v0, v4 768; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 769; GFX1164-NEXT: buffer_gl0_inv 770; GFX1164-NEXT: .LBB2_2: 771; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] 772; GFX1164-NEXT: v_readfirstlane_b32 s3, v0 773; GFX1164-NEXT: v_mov_b32_e32 v0, v3 774; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) 775; GFX1164-NEXT: v_add_nc_u32_e32 v0, s3, v0 776; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 777; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 778; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 779; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 780; GFX1164-NEXT: s_endpgm 781; 782; GFX1132-LABEL: add_i32_varying: 783; GFX1132: ; %bb.0: ; %entry 784; GFX1132-NEXT: v_mov_b32_e32 v1, v0 785; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo 786; GFX1132-NEXT: v_mov_b32_e32 v1, 0 787; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo 788; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 789; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 790; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 791; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 792; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 793; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 794; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 795; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 796; GFX1132-NEXT: v_mov_b32_e32 v2, v1 797; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1 798; GFX1132-NEXT: s_mov_b32 exec_lo, s2 799; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 800; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 801; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) 802; GFX1132-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] 
row_mask:0xa bank_mask:0xf 803; GFX1132-NEXT: v_mov_b32_e32 v3, 0 804; GFX1132-NEXT: v_readlane_b32 s3, v1, 15 805; GFX1132-NEXT: v_readlane_b32 s4, v1, 31 806; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) 807; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 808; GFX1132-NEXT: s_mov_b32 exec_lo, s2 809; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 810; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 811; GFX1132-NEXT: v_writelane_b32 v3, s3, 16 812; GFX1132-NEXT: s_mov_b32 exec_lo, s2 813; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) 814; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 815; GFX1132-NEXT: s_mov_b32 s2, -1 816; GFX1132-NEXT: ; implicit-def: $vgpr0 817; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo 818; GFX1132-NEXT: s_cbranch_execz .LBB2_2 819; GFX1132-NEXT: ; %bb.1: 820; GFX1132-NEXT: v_mov_b32_e32 v0, 0 821; GFX1132-NEXT: v_mov_b32_e32 v4, s4 822; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 823; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 824; GFX1132-NEXT: ds_add_rtn_u32 v0, v0, v4 825; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 826; GFX1132-NEXT: buffer_gl0_inv 827; GFX1132-NEXT: .LBB2_2: 828; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 829; GFX1132-NEXT: v_readfirstlane_b32 s3, v0 830; GFX1132-NEXT: v_mov_b32_e32 v0, v3 831; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) 832; GFX1132-NEXT: v_add_nc_u32_e32 v0, s3, v0 833; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 834; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 835; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 836; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 837; GFX1132-NEXT: s_endpgm 838entry: 839 %lane = call i32 @llvm.amdgcn.workitem.id.x() 840 %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %lane acq_rel 841 store i32 %old, i32 addrspace(1)* %out 842 ret void 843} 844 845define amdgpu_kernel void @add_i32_varying_nouse() { 846; GFX7LESS-LABEL: add_i32_varying_nouse: 847; GFX7LESS: ; %bb.0: ; %entry 848; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 
849; GFX7LESS-NEXT: s_mov_b32 m0, -1 850; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 851; GFX7LESS-NEXT: ds_add_u32 v1, v0 852; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 853; GFX7LESS-NEXT: s_endpgm 854; 855; GFX8-LABEL: add_i32_varying_nouse: 856; GFX8: ; %bb.0: ; %entry 857; GFX8-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 858; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 859; GFX8-NEXT: v_mov_b32_e32 v1, v0 860; GFX8-NEXT: s_not_b64 exec, exec 861; GFX8-NEXT: v_mov_b32_e32 v1, 0 862; GFX8-NEXT: s_not_b64 exec, exec 863; GFX8-NEXT: s_or_saveexec_b64 s[0:1], -1 864; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 865; GFX8-NEXT: s_nop 1 866; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 867; GFX8-NEXT: s_nop 1 868; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 869; GFX8-NEXT: s_nop 1 870; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 871; GFX8-NEXT: s_nop 1 872; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf 873; GFX8-NEXT: s_nop 1 874; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf 875; GFX8-NEXT: v_readlane_b32 s2, v1, 63 876; GFX8-NEXT: s_mov_b64 exec, s[0:1] 877; GFX8-NEXT: s_mov_b32 s0, s2 878; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 879; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 880; GFX8-NEXT: s_cbranch_execz .LBB3_2 881; GFX8-NEXT: ; %bb.1: 882; GFX8-NEXT: v_mov_b32_e32 v0, 0 883; GFX8-NEXT: v_mov_b32_e32 v2, s0 884; GFX8-NEXT: s_mov_b32 m0, -1 885; GFX8-NEXT: s_waitcnt lgkmcnt(0) 886; GFX8-NEXT: ds_add_u32 v0, v2 887; GFX8-NEXT: s_waitcnt lgkmcnt(0) 888; GFX8-NEXT: .LBB3_2: 889; GFX8-NEXT: s_endpgm 890; 891; GFX9-LABEL: add_i32_varying_nouse: 892; GFX9: ; %bb.0: ; %entry 893; GFX9-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 894; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 895; GFX9-NEXT: v_mov_b32_e32 v1, v0 896; GFX9-NEXT: 
s_not_b64 exec, exec 897; GFX9-NEXT: v_mov_b32_e32 v1, 0 898; GFX9-NEXT: s_not_b64 exec, exec 899; GFX9-NEXT: s_or_saveexec_b64 s[0:1], -1 900; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 901; GFX9-NEXT: s_nop 1 902; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 903; GFX9-NEXT: s_nop 1 904; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 905; GFX9-NEXT: s_nop 1 906; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 907; GFX9-NEXT: s_nop 1 908; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf 909; GFX9-NEXT: s_nop 1 910; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf 911; GFX9-NEXT: v_readlane_b32 s2, v1, 63 912; GFX9-NEXT: s_mov_b64 exec, s[0:1] 913; GFX9-NEXT: s_mov_b32 s0, s2 914; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 915; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 916; GFX9-NEXT: s_cbranch_execz .LBB3_2 917; GFX9-NEXT: ; %bb.1: 918; GFX9-NEXT: v_mov_b32_e32 v0, 0 919; GFX9-NEXT: v_mov_b32_e32 v2, s0 920; GFX9-NEXT: s_waitcnt lgkmcnt(0) 921; GFX9-NEXT: ds_add_u32 v0, v2 922; GFX9-NEXT: s_waitcnt lgkmcnt(0) 923; GFX9-NEXT: .LBB3_2: 924; GFX9-NEXT: s_endpgm 925; 926; GFX1064-LABEL: add_i32_varying_nouse: 927; GFX1064: ; %bb.0: ; %entry 928; GFX1064-NEXT: v_mov_b32_e32 v1, v0 929; GFX1064-NEXT: s_not_b64 exec, exec 930; GFX1064-NEXT: v_mov_b32_e32 v1, 0 931; GFX1064-NEXT: s_not_b64 exec, exec 932; GFX1064-NEXT: s_or_saveexec_b64 s[0:1], -1 933; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 934; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 935; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 936; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 937; GFX1064-NEXT: 
v_mov_b32_e32 v2, v1 938; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 939; GFX1064-NEXT: v_add_nc_u32_e32 v1, v1, v2 940; GFX1064-NEXT: s_mov_b64 exec, s[0:1] 941; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 942; GFX1064-NEXT: s_or_saveexec_b64 s[0:1], -1 943; GFX1064-NEXT: v_readlane_b32 s2, v1, 0 944; GFX1064-NEXT: v_readlane_b32 s3, v1, 32 945; GFX1064-NEXT: s_mov_b64 exec, s[0:1] 946; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 947; GFX1064-NEXT: s_add_i32 s0, s2, s3 948; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 949; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc 950; GFX1064-NEXT: s_cbranch_execz .LBB3_2 951; GFX1064-NEXT: ; %bb.1: 952; GFX1064-NEXT: v_mov_b32_e32 v0, 0 953; GFX1064-NEXT: v_mov_b32_e32 v3, s0 954; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 955; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 956; GFX1064-NEXT: ds_add_u32 v0, v3 957; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 958; GFX1064-NEXT: buffer_gl0_inv 959; GFX1064-NEXT: .LBB3_2: 960; GFX1064-NEXT: s_endpgm 961; 962; GFX1032-LABEL: add_i32_varying_nouse: 963; GFX1032: ; %bb.0: ; %entry 964; GFX1032-NEXT: v_mov_b32_e32 v1, v0 965; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 966; GFX1032-NEXT: v_mov_b32_e32 v1, 0 967; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 968; GFX1032-NEXT: s_or_saveexec_b32 s0, -1 969; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 970; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 971; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 972; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 973; GFX1032-NEXT: v_mov_b32_e32 v2, v1 974; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 975; GFX1032-NEXT: v_add_nc_u32_e32 v1, v1, v2 976; GFX1032-NEXT: s_mov_b32 exec_lo, s0 977; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 978; GFX1032-NEXT: v_mov_b32_e32 v0, v1 979; GFX1032-NEXT: v_cmp_eq_u32_e32 
vcc_lo, 0, v3 980; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo 981; GFX1032-NEXT: s_cbranch_execz .LBB3_2 982; GFX1032-NEXT: ; %bb.1: 983; GFX1032-NEXT: v_mov_b32_e32 v3, 0 984; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 985; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 986; GFX1032-NEXT: ds_add_u32 v3, v0 987; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 988; GFX1032-NEXT: buffer_gl0_inv 989; GFX1032-NEXT: .LBB3_2: 990; GFX1032-NEXT: s_endpgm 991; 992; GFX1164-LABEL: add_i32_varying_nouse: 993; GFX1164: ; %bb.0: ; %entry 994; GFX1164-NEXT: v_mov_b32_e32 v1, v0 995; GFX1164-NEXT: s_not_b64 exec, exec 996; GFX1164-NEXT: v_mov_b32_e32 v1, 0 997; GFX1164-NEXT: s_not_b64 exec, exec 998; GFX1164-NEXT: s_or_saveexec_b64 s[0:1], -1 999; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1000; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 1001; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 1002; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1003; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 1004; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 1005; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1006; GFX1164-NEXT: v_mov_b32_e32 v2, v1 1007; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1 1008; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1009; GFX1164-NEXT: v_add_nc_u32_e32 v1, v1, v2 1010; GFX1164-NEXT: v_permlane64_b32 v2, v1 1011; GFX1164-NEXT: s_mov_b64 exec, s[0:1] 1012; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) 1013; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 1014; GFX1164-NEXT: s_or_saveexec_b64 s[0:1], -1 1015; GFX1164-NEXT: v_add_nc_u32_e32 v1, v1, v2 1016; GFX1164-NEXT: s_mov_b64 exec, 
s[0:1] 1017; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) 1018; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v0 1019; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) 1020; GFX1164-NEXT: v_mov_b32_e32 v0, v1 1021; GFX1164-NEXT: s_mov_b64 s[0:1], exec 1022; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v3 1023; GFX1164-NEXT: s_cbranch_execz .LBB3_2 1024; GFX1164-NEXT: ; %bb.1: 1025; GFX1164-NEXT: v_mov_b32_e32 v3, 0 1026; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1027; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 1028; GFX1164-NEXT: ds_add_u32 v3, v0 1029; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 1030; GFX1164-NEXT: buffer_gl0_inv 1031; GFX1164-NEXT: .LBB3_2: 1032; GFX1164-NEXT: s_endpgm 1033; 1034; GFX1132-LABEL: add_i32_varying_nouse: 1035; GFX1132: ; %bb.0: ; %entry 1036; GFX1132-NEXT: v_mov_b32_e32 v1, v0 1037; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo 1038; GFX1132-NEXT: v_mov_b32_e32 v1, 0 1039; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo 1040; GFX1132-NEXT: s_or_saveexec_b32 s0, -1 1041; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1042; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 1043; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 1044; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1045; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 1046; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 1047; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1048; GFX1132-NEXT: v_mov_b32_e32 v2, v1 1049; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1 1050; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) 1051; GFX1132-NEXT: v_add_nc_u32_e32 v1, v1, v2 1052; GFX1132-NEXT: s_mov_b32 exec_lo, s0 
1053; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 1054; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) 1055; GFX1132-NEXT: v_mov_b32_e32 v0, v1 1056; GFX1132-NEXT: s_mov_b32 s0, exec_lo 1057; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v3 1058; GFX1132-NEXT: s_cbranch_execz .LBB3_2 1059; GFX1132-NEXT: ; %bb.1: 1060; GFX1132-NEXT: v_mov_b32_e32 v3, 0 1061; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1062; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 1063; GFX1132-NEXT: ds_add_u32 v3, v0 1064; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 1065; GFX1132-NEXT: buffer_gl0_inv 1066; GFX1132-NEXT: .LBB3_2: 1067; GFX1132-NEXT: s_endpgm 1068entry: 1069 %lane = call i32 @llvm.amdgcn.workitem.id.x() 1070 %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %lane acq_rel 1071 ret void 1072} 1073 1074define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) { 1075; 1076; 1077; GFX7LESS-LABEL: add_i64_constant: 1078; GFX7LESS: ; %bb.0: ; %entry 1079; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec 1080; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1081; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 1082; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s5, v0 1083; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 1084; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 1085; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc 1086; GFX7LESS-NEXT: s_cbranch_execz .LBB4_2 1087; GFX7LESS-NEXT: ; %bb.1: 1088; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 1089; GFX7LESS-NEXT: s_mul_i32 s4, s4, 5 1090; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 1091; GFX7LESS-NEXT: v_mov_b32_e32 v0, s4 1092; GFX7LESS-NEXT: s_mov_b32 m0, -1 1093; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1094; GFX7LESS-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] 1095; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1096; GFX7LESS-NEXT: .LBB4_2: 1097; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] 1098; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1099; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v0 1100; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1 1101; GFX7LESS-NEXT: 
v_mul_hi_u32_u24_e32 v1, 5, v2 1102; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v2 1103; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 1104; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4 1105; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s2, v0 1106; GFX7LESS-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc 1107; GFX7LESS-NEXT: s_mov_b32 s2, -1 1108; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1109; GFX7LESS-NEXT: s_endpgm 1110; 1111; GFX8-LABEL: add_i64_constant: 1112; GFX8: ; %bb.0: ; %entry 1113; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1114; GFX8-NEXT: s_mov_b64 s[4:5], exec 1115; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 1116; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 1117; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 1118; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 1119; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 1120; GFX8-NEXT: s_cbranch_execz .LBB4_2 1121; GFX8-NEXT: ; %bb.1: 1122; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 1123; GFX8-NEXT: s_mul_i32 s4, s4, 5 1124; GFX8-NEXT: v_mov_b32_e32 v0, s4 1125; GFX8-NEXT: v_mov_b32_e32 v1, 0 1126; GFX8-NEXT: s_mov_b32 m0, -1 1127; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1128; GFX8-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] 1129; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1130; GFX8-NEXT: .LBB4_2: 1131; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 1132; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1133; GFX8-NEXT: v_readfirstlane_b32 s2, v0 1134; GFX8-NEXT: v_readfirstlane_b32 s3, v1 1135; GFX8-NEXT: v_mov_b32_e32 v0, s2 1136; GFX8-NEXT: v_mov_b32_e32 v1, s3 1137; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v2, 5, v[0:1] 1138; GFX8-NEXT: s_mov_b32 s3, 0xf000 1139; GFX8-NEXT: s_mov_b32 s2, -1 1140; GFX8-NEXT: s_nop 2 1141; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1142; GFX8-NEXT: s_endpgm 1143; 1144; GFX9-LABEL: add_i64_constant: 1145; GFX9: ; %bb.0: ; %entry 1146; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1147; GFX9-NEXT: s_mov_b64 s[4:5], exec 1148; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 1149; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 1150; GFX9-NEXT: v_cmp_eq_u32_e32 
vcc, 0, v2 1151; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 1152; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 1153; GFX9-NEXT: s_cbranch_execz .LBB4_2 1154; GFX9-NEXT: ; %bb.1: 1155; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 1156; GFX9-NEXT: s_mul_i32 s4, s4, 5 1157; GFX9-NEXT: v_mov_b32_e32 v0, s4 1158; GFX9-NEXT: v_mov_b32_e32 v1, 0 1159; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1160; GFX9-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] 1161; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1162; GFX9-NEXT: .LBB4_2: 1163; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 1164; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1165; GFX9-NEXT: v_readfirstlane_b32 s2, v0 1166; GFX9-NEXT: v_readfirstlane_b32 s3, v1 1167; GFX9-NEXT: v_mov_b32_e32 v0, s2 1168; GFX9-NEXT: v_mov_b32_e32 v1, s3 1169; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v2, 5, v[0:1] 1170; GFX9-NEXT: s_mov_b32 s3, 0xf000 1171; GFX9-NEXT: s_mov_b32 s2, -1 1172; GFX9-NEXT: s_nop 2 1173; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1174; GFX9-NEXT: s_endpgm 1175; 1176; GFX1064-LABEL: add_i64_constant: 1177; GFX1064: ; %bb.0: ; %entry 1178; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1179; GFX1064-NEXT: s_mov_b64 s[4:5], exec 1180; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 1181; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 1182; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 1183; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 1184; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc 1185; GFX1064-NEXT: s_cbranch_execz .LBB4_2 1186; GFX1064-NEXT: ; %bb.1: 1187; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 1188; GFX1064-NEXT: v_mov_b32_e32 v1, 0 1189; GFX1064-NEXT: s_mul_i32 s4, s4, 5 1190; GFX1064-NEXT: v_mov_b32_e32 v0, s4 1191; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1192; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 1193; GFX1064-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] 1194; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1195; GFX1064-NEXT: buffer_gl0_inv 1196; GFX1064-NEXT: .LBB4_2: 1197; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 1198; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] 1199; 
GFX1064-NEXT: v_readfirstlane_b32 s2, v0 1200; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 1201; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v2, 5, s[2:3] 1202; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 1203; GFX1064-NEXT: s_mov_b32 s2, -1 1204; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1205; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1206; GFX1064-NEXT: s_endpgm 1207; 1208; GFX1032-LABEL: add_i64_constant: 1209; GFX1032: ; %bb.0: ; %entry 1210; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1211; GFX1032-NEXT: s_mov_b32 s3, exec_lo 1212; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 1213; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s3, 0 1214; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 1215; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo 1216; GFX1032-NEXT: s_cbranch_execz .LBB4_2 1217; GFX1032-NEXT: ; %bb.1: 1218; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 1219; GFX1032-NEXT: v_mov_b32_e32 v1, 0 1220; GFX1032-NEXT: s_mul_i32 s3, s3, 5 1221; GFX1032-NEXT: v_mov_b32_e32 v0, s3 1222; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1223; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 1224; GFX1032-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] 1225; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1226; GFX1032-NEXT: buffer_gl0_inv 1227; GFX1032-NEXT: .LBB4_2: 1228; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 1229; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 1230; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 1231; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 1232; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s2, v2, 5, s[2:3] 1233; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 1234; GFX1032-NEXT: s_mov_b32 s2, -1 1235; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1236; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1237; GFX1032-NEXT: s_endpgm 1238; 1239; GFX1164-LABEL: add_i64_constant: 1240; GFX1164: ; %bb.0: ; %entry 1241; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 1242; GFX1164-NEXT: s_mov_b64 s[4:5], exec 1243; GFX1164-NEXT: s_mov_b64 s[2:3], exec 1244; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 1245; GFX1164-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1246; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 1247; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 1248; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 1249; GFX1164-NEXT: s_cbranch_execz .LBB4_2 1250; GFX1164-NEXT: ; %bb.1: 1251; GFX1164-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 1252; GFX1164-NEXT: v_mov_b32_e32 v1, 0 1253; GFX1164-NEXT: s_mul_i32 s4, s4, 5 1254; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 1255; GFX1164-NEXT: v_mov_b32_e32 v0, s4 1256; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1257; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 1258; GFX1164-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] 1259; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 1260; GFX1164-NEXT: buffer_gl0_inv 1261; GFX1164-NEXT: .LBB4_2: 1262; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] 1263; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 1264; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 1265; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) 1266; GFX1164-NEXT: v_mad_u64_u32 v[0:1], null, v2, 5, s[2:3] 1267; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 1268; GFX1164-NEXT: s_mov_b32 s2, -1 1269; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 1270; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 1271; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 1272; GFX1164-NEXT: s_endpgm 1273; 1274; GFX1132-LABEL: add_i64_constant: 1275; GFX1132: ; %bb.0: ; %entry 1276; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 1277; GFX1132-NEXT: s_mov_b32 s3, exec_lo 1278; GFX1132-NEXT: s_mov_b32 s2, exec_lo 1279; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s3, 0 1280; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 1281; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) 1282; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 1283; GFX1132-NEXT: s_cbranch_execz .LBB4_2 1284; GFX1132-NEXT: ; %bb.1: 1285; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s3 1286; GFX1132-NEXT: v_mov_b32_e32 v1, 0 1287; GFX1132-NEXT: s_mul_i32 s3, s3, 5 1288; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 1289; GFX1132-NEXT: v_mov_b32_e32 v0, s3 1290; GFX1132-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) 1291; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 1292; GFX1132-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] 1293; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 1294; GFX1132-NEXT: buffer_gl0_inv 1295; GFX1132-NEXT: .LBB4_2: 1296; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2 1297; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 1298; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 1299; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) 1300; GFX1132-NEXT: v_mad_u64_u32 v[0:1], null, v2, 5, s[2:3] 1301; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 1302; GFX1132-NEXT: s_mov_b32 s2, -1 1303; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 1304; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 1305; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 1306; GFX1132-NEXT: s_endpgm 1307entry: 1308 %old = atomicrmw add i64 addrspace(3)* @local_var64, i64 5 acq_rel 1309 store i64 %old, i64 addrspace(1)* %out 1310 ret void 1311} 1312 1313define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 %additive) { 1314; 1315; 1316; GFX7LESS-LABEL: add_i64_uniform: 1317; GFX7LESS: ; %bb.0: ; %entry 1318; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec 1319; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 1320; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 1321; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s7, v0 1322; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 1323; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 1324; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc 1325; GFX7LESS-NEXT: s_cbranch_execz .LBB5_2 1326; GFX7LESS-NEXT: ; %bb.1: 1327; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 1328; GFX7LESS-NEXT: v_mov_b32_e32 v3, 0 1329; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1330; GFX7LESS-NEXT: s_mul_i32 s7, s3, s6 1331; GFX7LESS-NEXT: v_mov_b32_e32 v0, s6 1332; GFX7LESS-NEXT: v_mul_hi_u32 v0, s2, v0 1333; GFX7LESS-NEXT: s_mul_i32 s6, s2, s6 1334; GFX7LESS-NEXT: v_add_i32_e32 v1, vcc, s7, v0 1335; GFX7LESS-NEXT: v_mov_b32_e32 v0, s6 1336; GFX7LESS-NEXT: s_mov_b32 m0, -1 1337; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 
1338; GFX7LESS-NEXT: ds_add_rtn_u64 v[0:1], v3, v[0:1] 1339; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1340; GFX7LESS-NEXT: .LBB5_2: 1341; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] 1342; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 1343; GFX7LESS-NEXT: s_mov_b32 s6, -1 1344; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1345; GFX7LESS-NEXT: s_mov_b32 s4, s0 1346; GFX7LESS-NEXT: s_mov_b32 s5, s1 1347; GFX7LESS-NEXT: v_readfirstlane_b32 s0, v0 1348; GFX7LESS-NEXT: v_readfirstlane_b32 s1, v1 1349; GFX7LESS-NEXT: v_mul_lo_u32 v0, s3, v2 1350; GFX7LESS-NEXT: v_mul_hi_u32 v1, s2, v2 1351; GFX7LESS-NEXT: v_mul_lo_u32 v2, s2, v2 1352; GFX7LESS-NEXT: v_add_i32_e32 v1, vcc, v1, v0 1353; GFX7LESS-NEXT: v_mov_b32_e32 v3, s1 1354; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s0, v2 1355; GFX7LESS-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc 1356; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1357; GFX7LESS-NEXT: s_endpgm 1358; 1359; GFX8-LABEL: add_i64_uniform: 1360; GFX8: ; %bb.0: ; %entry 1361; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1362; GFX8-NEXT: s_mov_b64 s[6:7], exec 1363; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 1364; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 1365; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 1366; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 1367; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 1368; GFX8-NEXT: s_cbranch_execz .LBB5_2 1369; GFX8-NEXT: ; %bb.1: 1370; GFX8-NEXT: s_bcnt1_i32_b64 s8, s[6:7] 1371; GFX8-NEXT: v_mov_b32_e32 v0, s8 1372; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1373; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[6:7], s2, v0, 0 1374; GFX8-NEXT: s_mul_i32 s6, s3, s8 1375; GFX8-NEXT: v_mov_b32_e32 v3, 0 1376; GFX8-NEXT: v_add_u32_e32 v1, vcc, s6, v1 1377; GFX8-NEXT: s_mov_b32 m0, -1 1378; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1379; GFX8-NEXT: ds_add_rtn_u64 v[0:1], v3, v[0:1] 1380; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1381; GFX8-NEXT: .LBB5_2: 1382; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 1383; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1384; GFX8-NEXT: v_readfirstlane_b32 s4, v0 1385; GFX8-NEXT: 
v_readfirstlane_b32 s5, v1 1386; GFX8-NEXT: v_mov_b32_e32 v0, s4 1387; GFX8-NEXT: v_mov_b32_e32 v1, s5 1388; GFX8-NEXT: v_mul_lo_u32 v3, s3, v2 1389; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[2:3], s2, v2, v[0:1] 1390; GFX8-NEXT: s_mov_b32 s7, 0xf000 1391; GFX8-NEXT: s_mov_b32 s6, -1 1392; GFX8-NEXT: s_mov_b32 s4, s0 1393; GFX8-NEXT: s_mov_b32 s5, s1 1394; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 1395; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1396; GFX8-NEXT: s_endpgm 1397; 1398; GFX9-LABEL: add_i64_uniform: 1399; GFX9: ; %bb.0: ; %entry 1400; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1401; GFX9-NEXT: s_mov_b64 s[6:7], exec 1402; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 1403; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 1404; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 1405; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 1406; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 1407; GFX9-NEXT: s_cbranch_execz .LBB5_2 1408; GFX9-NEXT: ; %bb.1: 1409; GFX9-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 1410; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1411; GFX9-NEXT: s_mul_i32 s7, s3, s6 1412; GFX9-NEXT: s_mul_hi_u32 s8, s2, s6 1413; GFX9-NEXT: s_add_i32 s8, s8, s7 1414; GFX9-NEXT: s_mul_i32 s6, s2, s6 1415; GFX9-NEXT: v_mov_b32_e32 v0, s6 1416; GFX9-NEXT: v_mov_b32_e32 v1, s8 1417; GFX9-NEXT: v_mov_b32_e32 v3, 0 1418; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1419; GFX9-NEXT: ds_add_rtn_u64 v[0:1], v3, v[0:1] 1420; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1421; GFX9-NEXT: .LBB5_2: 1422; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 1423; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1424; GFX9-NEXT: v_readfirstlane_b32 s4, v0 1425; GFX9-NEXT: v_readfirstlane_b32 s5, v1 1426; GFX9-NEXT: v_mov_b32_e32 v0, s4 1427; GFX9-NEXT: v_mov_b32_e32 v1, s5 1428; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s2, v2, v[0:1] 1429; GFX9-NEXT: s_mov_b32 s7, 0xf000 1430; GFX9-NEXT: s_mov_b32 s6, -1 1431; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[2:3], s3, v2, v[1:2] 1432; GFX9-NEXT: s_mov_b32 s4, s0 1433; GFX9-NEXT: s_mov_b32 s5, s1 1434; GFX9-NEXT: buffer_store_dwordx2 
v[0:1], off, s[4:7], 0 1435; GFX9-NEXT: s_endpgm 1436; 1437; GFX1064-LABEL: add_i64_uniform: 1438; GFX1064: ; %bb.0: ; %entry 1439; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1440; GFX1064-NEXT: s_mov_b64 s[6:7], exec 1441; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 1442; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 1443; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 1444; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 1445; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 1446; GFX1064-NEXT: s_cbranch_execz .LBB5_2 1447; GFX1064-NEXT: ; %bb.1: 1448; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 1449; GFX1064-NEXT: v_mov_b32_e32 v3, 0 1450; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1451; GFX1064-NEXT: s_mul_i32 s7, s3, s6 1452; GFX1064-NEXT: s_mul_hi_u32 s8, s2, s6 1453; GFX1064-NEXT: s_mul_i32 s6, s2, s6 1454; GFX1064-NEXT: s_add_i32 s8, s8, s7 1455; GFX1064-NEXT: v_mov_b32_e32 v0, s6 1456; GFX1064-NEXT: v_mov_b32_e32 v1, s8 1457; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1458; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 1459; GFX1064-NEXT: ds_add_rtn_u64 v[0:1], v3, v[0:1] 1460; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1461; GFX1064-NEXT: buffer_gl0_inv 1462; GFX1064-NEXT: .LBB5_2: 1463; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 1464; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 1465; GFX1064-NEXT: v_readfirstlane_b32 s4, v0 1466; GFX1064-NEXT: v_readfirstlane_b32 s5, v1 1467; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1468; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s2, v2, s[4:5] 1469; GFX1064-NEXT: v_mad_u64_u32 v[1:2], s[2:3], s3, v2, v[1:2] 1470; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 1471; GFX1064-NEXT: s_mov_b32 s2, -1 1472; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1473; GFX1064-NEXT: s_endpgm 1474; 1475; GFX1032-LABEL: add_i64_uniform: 1476; GFX1032: ; %bb.0: ; %entry 1477; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1478; GFX1032-NEXT: s_mov_b32 s5, exec_lo 1479; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 1480; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0 1481; 
GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 1482; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo 1483; GFX1032-NEXT: s_cbranch_execz .LBB5_2 1484; GFX1032-NEXT: ; %bb.1: 1485; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s5 1486; GFX1032-NEXT: v_mov_b32_e32 v3, 0 1487; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1488; GFX1032-NEXT: s_mul_i32 s6, s3, s5 1489; GFX1032-NEXT: s_mul_hi_u32 s7, s2, s5 1490; GFX1032-NEXT: s_mul_i32 s5, s2, s5 1491; GFX1032-NEXT: s_add_i32 s7, s7, s6 1492; GFX1032-NEXT: v_mov_b32_e32 v0, s5 1493; GFX1032-NEXT: v_mov_b32_e32 v1, s7 1494; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1495; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 1496; GFX1032-NEXT: ds_add_rtn_u64 v[0:1], v3, v[0:1] 1497; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1498; GFX1032-NEXT: buffer_gl0_inv 1499; GFX1032-NEXT: .LBB5_2: 1500; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 1501; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 1502; GFX1032-NEXT: v_readfirstlane_b32 s4, v0 1503; GFX1032-NEXT: v_readfirstlane_b32 s5, v1 1504; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1505; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s2, s2, v2, s[4:5] 1506; GFX1032-NEXT: v_mad_u64_u32 v[1:2], s2, s3, v2, v[1:2] 1507; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 1508; GFX1032-NEXT: s_mov_b32 s2, -1 1509; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1510; GFX1032-NEXT: s_endpgm 1511; 1512; GFX1164-LABEL: add_i64_uniform: 1513; GFX1164: ; %bb.0: ; %entry 1514; GFX1164-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 1515; GFX1164-NEXT: s_mov_b64 s[6:7], exec 1516; GFX1164-NEXT: s_mov_b64 s[4:5], exec 1517; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 1518; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1519; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 1520; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 1521; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 1522; GFX1164-NEXT: s_cbranch_execz .LBB5_2 1523; GFX1164-NEXT: ; %bb.1: 1524; GFX1164-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 1525; GFX1164-NEXT: v_mov_b32_e32 v3, 0 1526; 
GFX1164-NEXT: s_waitcnt lgkmcnt(0) 1527; GFX1164-NEXT: s_mul_i32 s7, s3, s6 1528; GFX1164-NEXT: s_mul_hi_u32 s8, s2, s6 1529; GFX1164-NEXT: s_mul_i32 s6, s2, s6 1530; GFX1164-NEXT: s_add_i32 s8, s8, s7 1531; GFX1164-NEXT: v_mov_b32_e32 v0, s6 1532; GFX1164-NEXT: v_mov_b32_e32 v1, s8 1533; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1534; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 1535; GFX1164-NEXT: ds_add_rtn_u64 v[0:1], v3, v[0:1] 1536; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 1537; GFX1164-NEXT: buffer_gl0_inv 1538; GFX1164-NEXT: .LBB5_2: 1539; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] 1540; GFX1164-NEXT: v_readfirstlane_b32 s4, v0 1541; GFX1164-NEXT: v_readfirstlane_b32 s5, v1 1542; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 1543; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) 1544; GFX1164-NEXT: v_mad_u64_u32 v[0:1], null, s2, v2, s[4:5] 1545; GFX1164-NEXT: s_mov_b32 s2, -1 1546; GFX1164-NEXT: v_mad_u64_u32 v[3:4], null, s3, v2, v[1:2] 1547; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 1548; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) 1549; GFX1164-NEXT: v_mov_b32_e32 v1, v3 1550; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 1551; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 1552; GFX1164-NEXT: s_endpgm 1553; 1554; GFX1132-LABEL: add_i64_uniform: 1555; GFX1132: ; %bb.0: ; %entry 1556; GFX1132-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 1557; GFX1132-NEXT: s_mov_b32 s5, exec_lo 1558; GFX1132-NEXT: s_mov_b32 s4, exec_lo 1559; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0 1560; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 1561; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) 1562; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 1563; GFX1132-NEXT: s_cbranch_execz .LBB5_2 1564; GFX1132-NEXT: ; %bb.1: 1565; GFX1132-NEXT: s_bcnt1_i32_b32 s5, s5 1566; GFX1132-NEXT: v_mov_b32_e32 v3, 0 1567; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 1568; GFX1132-NEXT: s_mul_i32 s6, s3, s5 1569; GFX1132-NEXT: s_mul_hi_u32 s7, s2, s5 1570; GFX1132-NEXT: s_mul_i32 s5, s2, s5 1571; 
GFX1132-NEXT: s_add_i32 s7, s7, s6 1572; GFX1132-NEXT: v_mov_b32_e32 v0, s5 1573; GFX1132-NEXT: v_mov_b32_e32 v1, s7 1574; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1575; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 1576; GFX1132-NEXT: ds_add_rtn_u64 v[0:1], v3, v[0:1] 1577; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 1578; GFX1132-NEXT: buffer_gl0_inv 1579; GFX1132-NEXT: .LBB5_2: 1580; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s4 1581; GFX1132-NEXT: v_readfirstlane_b32 s4, v0 1582; GFX1132-NEXT: v_readfirstlane_b32 s5, v1 1583; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 1584; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) 1585; GFX1132-NEXT: v_mad_u64_u32 v[0:1], null, s2, v2, s[4:5] 1586; GFX1132-NEXT: s_mov_b32 s2, -1 1587; GFX1132-NEXT: v_mad_u64_u32 v[3:4], null, s3, v2, v[1:2] 1588; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 1589; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) 1590; GFX1132-NEXT: v_mov_b32_e32 v1, v3 1591; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 1592; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 1593; GFX1132-NEXT: s_endpgm 1594entry: 1595 %old = atomicrmw add i64 addrspace(3)* @local_var64, i64 %additive acq_rel 1596 store i64 %old, i64 addrspace(1)* %out 1597 ret void 1598} 1599 1600define amdgpu_kernel void @add_i64_varying(i64 addrspace(1)* %out) { 1601; 1602; 1603; GFX7LESS-LABEL: add_i64_varying: 1604; GFX7LESS: ; %bb.0: ; %entry 1605; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1606; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 1607; GFX7LESS-NEXT: s_mov_b32 m0, -1 1608; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1609; GFX7LESS-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] 1610; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1611; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 1612; GFX7LESS-NEXT: s_mov_b32 s2, -1 1613; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1614; GFX7LESS-NEXT: s_endpgm 1615; 1616; GFX8-LABEL: add_i64_varying: 1617; GFX8: ; %bb.0: ; %entry 1618; GFX8-NEXT: v_mov_b32_e32 v1, 0 1619; GFX8-NEXT: s_mov_b32 
m0, -1 1620; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1621; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1622; GFX8-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] 1623; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1624; GFX8-NEXT: s_mov_b32 s3, 0xf000 1625; GFX8-NEXT: s_mov_b32 s2, -1 1626; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1627; GFX8-NEXT: s_endpgm 1628; 1629; GFX9-LABEL: add_i64_varying: 1630; GFX9: ; %bb.0: ; %entry 1631; GFX9-NEXT: v_mov_b32_e32 v1, 0 1632; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1633; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1634; GFX9-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] 1635; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1636; GFX9-NEXT: s_mov_b32 s3, 0xf000 1637; GFX9-NEXT: s_mov_b32 s2, -1 1638; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1639; GFX9-NEXT: s_endpgm 1640; 1641; GFX10-LABEL: add_i64_varying: 1642; GFX10: ; %bb.0: ; %entry 1643; GFX10-NEXT: v_mov_b32_e32 v1, 0 1644; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1645; GFX10-NEXT: s_mov_b32 s3, 0x31016000 1646; GFX10-NEXT: s_mov_b32 s2, -1 1647; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1648; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1649; GFX10-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] 1650; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1651; GFX10-NEXT: buffer_gl0_inv 1652; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1653; GFX10-NEXT: s_endpgm 1654; 1655; GFX11-LABEL: add_i64_varying: 1656; GFX11: ; %bb.0: ; %entry 1657; GFX11-NEXT: v_mov_b32_e32 v1, 0 1658; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 1659; GFX11-NEXT: s_mov_b32 s3, 0x31016000 1660; GFX11-NEXT: s_mov_b32 s2, -1 1661; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1662; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 1663; GFX11-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] 1664; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1665; GFX11-NEXT: buffer_gl0_inv 1666; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 1667; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 1668; GFX11-NEXT: s_endpgm 1669entry: 1670 %lane = call i32 @llvm.amdgcn.workitem.id.x() 
1671 %zext = zext i32 %lane to i64 1672 %old = atomicrmw add i64 addrspace(3)* @local_var64, i64 %zext acq_rel 1673 store i64 %old, i64 addrspace(1)* %out 1674 ret void 1675} 1676 1677define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out) { 1678; 1679; 1680; GFX7LESS-LABEL: sub_i32_constant: 1681; GFX7LESS: ; %bb.0: ; %entry 1682; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec 1683; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1684; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 1685; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 1686; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1687; GFX7LESS-NEXT: ; implicit-def: $vgpr1 1688; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc 1689; GFX7LESS-NEXT: s_cbranch_execz .LBB7_2 1690; GFX7LESS-NEXT: ; %bb.1: 1691; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1692; GFX7LESS-NEXT: s_mul_i32 s2, s2, 5 1693; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 1694; GFX7LESS-NEXT: v_mov_b32_e32 v2, s2 1695; GFX7LESS-NEXT: s_mov_b32 m0, -1 1696; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1697; GFX7LESS-NEXT: ds_sub_rtn_u32 v1, v1, v2 1698; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1699; GFX7LESS-NEXT: .LBB7_2: 1700; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] 1701; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1702; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v1 1703; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1704; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 1705; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 1706; GFX7LESS-NEXT: s_mov_b32 s2, -1 1707; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 1708; GFX7LESS-NEXT: s_endpgm 1709; 1710; GFX8-LABEL: sub_i32_constant: 1711; GFX8: ; %bb.0: ; %entry 1712; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1713; GFX8-NEXT: s_mov_b64 s[2:3], exec 1714; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1715; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 1716; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1717; GFX8-NEXT: ; implicit-def: $vgpr1 1718; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 1719; GFX8-NEXT: s_cbranch_execz .LBB7_2 1720; GFX8-NEXT: ; 
%bb.1: 1721; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1722; GFX8-NEXT: s_mul_i32 s2, s2, 5 1723; GFX8-NEXT: v_mov_b32_e32 v1, 0 1724; GFX8-NEXT: v_mov_b32_e32 v2, s2 1725; GFX8-NEXT: s_mov_b32 m0, -1 1726; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1727; GFX8-NEXT: ds_sub_rtn_u32 v1, v1, v2 1728; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1729; GFX8-NEXT: .LBB7_2: 1730; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 1731; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1732; GFX8-NEXT: v_readfirstlane_b32 s2, v1 1733; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1734; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s2, v0 1735; GFX8-NEXT: s_mov_b32 s3, 0xf000 1736; GFX8-NEXT: s_mov_b32 s2, -1 1737; GFX8-NEXT: s_nop 0 1738; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 1739; GFX8-NEXT: s_endpgm 1740; 1741; GFX9-LABEL: sub_i32_constant: 1742; GFX9: ; %bb.0: ; %entry 1743; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1744; GFX9-NEXT: s_mov_b64 s[2:3], exec 1745; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1746; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 1747; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1748; GFX9-NEXT: ; implicit-def: $vgpr1 1749; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 1750; GFX9-NEXT: s_cbranch_execz .LBB7_2 1751; GFX9-NEXT: ; %bb.1: 1752; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1753; GFX9-NEXT: s_mul_i32 s2, s2, 5 1754; GFX9-NEXT: v_mov_b32_e32 v1, 0 1755; GFX9-NEXT: v_mov_b32_e32 v2, s2 1756; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1757; GFX9-NEXT: ds_sub_rtn_u32 v1, v1, v2 1758; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1759; GFX9-NEXT: .LBB7_2: 1760; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 1761; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1762; GFX9-NEXT: v_readfirstlane_b32 s2, v1 1763; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1764; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 1765; GFX9-NEXT: s_mov_b32 s3, 0xf000 1766; GFX9-NEXT: s_mov_b32 s2, -1 1767; GFX9-NEXT: s_nop 0 1768; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 1769; GFX9-NEXT: s_endpgm 1770; 1771; GFX1064-LABEL: sub_i32_constant: 1772; GFX1064: ; %bb.0: ; %entry 1773; GFX1064-NEXT: 
s_load_dwordx2 s[0:1], s[0:1], 0x24 1774; GFX1064-NEXT: s_mov_b64 s[2:3], exec 1775; GFX1064-NEXT: ; implicit-def: $vgpr1 1776; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1777; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 1778; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1779; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 1780; GFX1064-NEXT: s_cbranch_execz .LBB7_2 1781; GFX1064-NEXT: ; %bb.1: 1782; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1783; GFX1064-NEXT: v_mov_b32_e32 v1, 0 1784; GFX1064-NEXT: s_mul_i32 s2, s2, 5 1785; GFX1064-NEXT: v_mov_b32_e32 v2, s2 1786; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1787; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 1788; GFX1064-NEXT: ds_sub_rtn_u32 v1, v1, v2 1789; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1790; GFX1064-NEXT: buffer_gl0_inv 1791; GFX1064-NEXT: .LBB7_2: 1792; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 1793; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 1794; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 1795; GFX1064-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1796; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 1797; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s2, v0 1798; GFX1064-NEXT: s_mov_b32 s2, -1 1799; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1800; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 1801; GFX1064-NEXT: s_endpgm 1802; 1803; GFX1032-LABEL: sub_i32_constant: 1804; GFX1032: ; %bb.0: ; %entry 1805; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1806; GFX1032-NEXT: s_mov_b32 s3, exec_lo 1807; GFX1032-NEXT: ; implicit-def: $vgpr1 1808; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 1809; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 1810; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo 1811; GFX1032-NEXT: s_cbranch_execz .LBB7_2 1812; GFX1032-NEXT: ; %bb.1: 1813; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 1814; GFX1032-NEXT: v_mov_b32_e32 v1, 0 1815; GFX1032-NEXT: s_mul_i32 s3, s3, 5 1816; GFX1032-NEXT: v_mov_b32_e32 v2, s3 1817; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1818; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 1819; GFX1032-NEXT: ds_sub_rtn_u32 v1, v1, 
v2 1820; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1821; GFX1032-NEXT: buffer_gl0_inv 1822; GFX1032-NEXT: .LBB7_2: 1823; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 1824; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 1825; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 1826; GFX1032-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1827; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 1828; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s2, v0 1829; GFX1032-NEXT: s_mov_b32 s2, -1 1830; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1831; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 1832; GFX1032-NEXT: s_endpgm 1833; 1834; GFX1164-LABEL: sub_i32_constant: 1835; GFX1164: ; %bb.0: ; %entry 1836; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 1837; GFX1164-NEXT: s_mov_b64 s[2:3], exec 1838; GFX1164-NEXT: s_mov_b64 s[4:5], exec 1839; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1840; GFX1164-NEXT: ; implicit-def: $vgpr1 1841; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1842; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 1843; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 1844; GFX1164-NEXT: s_cbranch_execz .LBB7_2 1845; GFX1164-NEXT: ; %bb.1: 1846; GFX1164-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1847; GFX1164-NEXT: v_mov_b32_e32 v1, 0 1848; GFX1164-NEXT: s_mul_i32 s2, s2, 5 1849; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 1850; GFX1164-NEXT: v_mov_b32_e32 v2, s2 1851; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1852; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 1853; GFX1164-NEXT: ds_sub_rtn_u32 v1, v1, v2 1854; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 1855; GFX1164-NEXT: buffer_gl0_inv 1856; GFX1164-NEXT: .LBB7_2: 1857; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] 1858; GFX1164-NEXT: v_readfirstlane_b32 s2, v1 1859; GFX1164-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1860; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 1861; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) 1862; GFX1164-NEXT: v_sub_nc_u32_e32 v0, s2, v0 1863; GFX1164-NEXT: s_mov_b32 s2, -1 1864; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 1865; GFX1164-NEXT: buffer_store_b32 v0, 
off, s[0:3], 0 1866; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 1867; GFX1164-NEXT: s_endpgm 1868; 1869; GFX1132-LABEL: sub_i32_constant: 1870; GFX1132: ; %bb.0: ; %entry 1871; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 1872; GFX1132-NEXT: s_mov_b32 s3, exec_lo 1873; GFX1132-NEXT: s_mov_b32 s2, exec_lo 1874; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 1875; GFX1132-NEXT: ; implicit-def: $vgpr1 1876; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) 1877; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 1878; GFX1132-NEXT: s_cbranch_execz .LBB7_2 1879; GFX1132-NEXT: ; %bb.1: 1880; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s3 1881; GFX1132-NEXT: v_mov_b32_e32 v1, 0 1882; GFX1132-NEXT: s_mul_i32 s3, s3, 5 1883; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 1884; GFX1132-NEXT: v_mov_b32_e32 v2, s3 1885; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1886; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 1887; GFX1132-NEXT: ds_sub_rtn_u32 v1, v1, v2 1888; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 1889; GFX1132-NEXT: buffer_gl0_inv 1890; GFX1132-NEXT: .LBB7_2: 1891; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2 1892; GFX1132-NEXT: v_readfirstlane_b32 s2, v1 1893; GFX1132-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1894; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 1895; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) 1896; GFX1132-NEXT: v_sub_nc_u32_e32 v0, s2, v0 1897; GFX1132-NEXT: s_mov_b32 s2, -1 1898; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 1899; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 1900; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 1901; GFX1132-NEXT: s_endpgm 1902entry: 1903 %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 5 acq_rel 1904 store i32 %old, i32 addrspace(1)* %out 1905 ret void 1906} 1907 1908define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 %subitive) { 1909; 1910; 1911; GFX7LESS-LABEL: sub_i32_uniform: 1912; GFX7LESS: ; %bb.0: ; %entry 1913; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec 1914; GFX7LESS-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 1915; GFX7LESS-NEXT: 
s_load_dword s6, s[0:1], 0xb 1916; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 1917; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 1918; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1919; GFX7LESS-NEXT: ; implicit-def: $vgpr1 1920; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc 1921; GFX7LESS-NEXT: s_cbranch_execz .LBB8_2 1922; GFX7LESS-NEXT: ; %bb.1: 1923; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1924; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1925; GFX7LESS-NEXT: s_mul_i32 s2, s6, s2 1926; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 1927; GFX7LESS-NEXT: v_mov_b32_e32 v2, s2 1928; GFX7LESS-NEXT: s_mov_b32 m0, -1 1929; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1930; GFX7LESS-NEXT: ds_sub_rtn_u32 v1, v1, v2 1931; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1932; GFX7LESS-NEXT: .LBB8_2: 1933; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1] 1934; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1935; GFX7LESS-NEXT: v_readfirstlane_b32 s0, v1 1936; GFX7LESS-NEXT: v_mul_lo_u32 v0, s6, v0 1937; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 1938; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 1939; GFX7LESS-NEXT: s_mov_b32 s6, -1 1940; GFX7LESS-NEXT: buffer_store_dword v0, off, s[4:7], 0 1941; GFX7LESS-NEXT: s_endpgm 1942; 1943; GFX8-LABEL: sub_i32_uniform: 1944; GFX8: ; %bb.0: ; %entry 1945; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 1946; GFX8-NEXT: s_load_dword s6, s[0:1], 0x2c 1947; GFX8-NEXT: s_mov_b64 s[2:3], exec 1948; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1949; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 1950; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1951; GFX8-NEXT: ; implicit-def: $vgpr1 1952; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc 1953; GFX8-NEXT: s_cbranch_execz .LBB8_2 1954; GFX8-NEXT: ; %bb.1: 1955; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1956; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1957; GFX8-NEXT: s_mul_i32 s2, s6, s2 1958; GFX8-NEXT: v_mov_b32_e32 v1, 0 1959; GFX8-NEXT: v_mov_b32_e32 v2, s2 1960; GFX8-NEXT: s_mov_b32 m0, -1 1961; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1962; GFX8-NEXT: ds_sub_rtn_u32 v1, v1, v2 1963; 
GFX8-NEXT: s_waitcnt lgkmcnt(0) 1964; GFX8-NEXT: .LBB8_2: 1965; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] 1966; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1967; GFX8-NEXT: v_mul_lo_u32 v0, s6, v0 1968; GFX8-NEXT: v_readfirstlane_b32 s0, v1 1969; GFX8-NEXT: s_mov_b32 s7, 0xf000 1970; GFX8-NEXT: s_mov_b32 s6, -1 1971; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s0, v0 1972; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0 1973; GFX8-NEXT: s_endpgm 1974; 1975; GFX9-LABEL: sub_i32_uniform: 1976; GFX9: ; %bb.0: ; %entry 1977; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 1978; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c 1979; GFX9-NEXT: s_mov_b64 s[2:3], exec 1980; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1981; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 1982; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1983; GFX9-NEXT: ; implicit-def: $vgpr1 1984; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc 1985; GFX9-NEXT: s_cbranch_execz .LBB8_2 1986; GFX9-NEXT: ; %bb.1: 1987; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1988; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1989; GFX9-NEXT: s_mul_i32 s2, s6, s2 1990; GFX9-NEXT: v_mov_b32_e32 v1, 0 1991; GFX9-NEXT: v_mov_b32_e32 v2, s2 1992; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1993; GFX9-NEXT: ds_sub_rtn_u32 v1, v1, v2 1994; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1995; GFX9-NEXT: .LBB8_2: 1996; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] 1997; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1998; GFX9-NEXT: v_mul_lo_u32 v0, s6, v0 1999; GFX9-NEXT: v_readfirstlane_b32 s0, v1 2000; GFX9-NEXT: s_mov_b32 s7, 0xf000 2001; GFX9-NEXT: s_mov_b32 s6, -1 2002; GFX9-NEXT: v_sub_u32_e32 v0, s0, v0 2003; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 2004; GFX9-NEXT: s_endpgm 2005; 2006; GFX1064-LABEL: sub_i32_uniform: 2007; GFX1064: ; %bb.0: ; %entry 2008; GFX1064-NEXT: s_clause 0x1 2009; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 2010; GFX1064-NEXT: s_load_dword s6, s[0:1], 0x2c 2011; GFX1064-NEXT: s_mov_b64 s[2:3], exec 2012; GFX1064-NEXT: ; implicit-def: $vgpr1 2013; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 2014; 
GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 2015; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2016; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc 2017; GFX1064-NEXT: s_cbranch_execz .LBB8_2 2018; GFX1064-NEXT: ; %bb.1: 2019; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 2020; GFX1064-NEXT: v_mov_b32_e32 v1, 0 2021; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2022; GFX1064-NEXT: s_mul_i32 s2, s6, s2 2023; GFX1064-NEXT: v_mov_b32_e32 v2, s2 2024; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2025; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 2026; GFX1064-NEXT: ds_sub_rtn_u32 v1, v1, v2 2027; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2028; GFX1064-NEXT: buffer_gl0_inv 2029; GFX1064-NEXT: .LBB8_2: 2030; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 2031; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] 2032; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2033; GFX1064-NEXT: v_mul_lo_u32 v0, s6, v0 2034; GFX1064-NEXT: v_readfirstlane_b32 s0, v1 2035; GFX1064-NEXT: s_mov_b32 s7, 0x31016000 2036; GFX1064-NEXT: s_mov_b32 s6, -1 2037; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s0, v0 2038; GFX1064-NEXT: buffer_store_dword v0, off, s[4:7], 0 2039; GFX1064-NEXT: s_endpgm 2040; 2041; GFX1032-LABEL: sub_i32_uniform: 2042; GFX1032: ; %bb.0: ; %entry 2043; GFX1032-NEXT: s_clause 0x1 2044; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 2045; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x2c 2046; GFX1032-NEXT: s_mov_b32 s3, exec_lo 2047; GFX1032-NEXT: ; implicit-def: $vgpr1 2048; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 2049; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 2050; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo 2051; GFX1032-NEXT: s_cbranch_execz .LBB8_2 2052; GFX1032-NEXT: ; %bb.1: 2053; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s3 2054; GFX1032-NEXT: v_mov_b32_e32 v1, 0 2055; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2056; GFX1032-NEXT: s_mul_i32 s1, s2, s1 2057; GFX1032-NEXT: v_mov_b32_e32 v2, s1 2058; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2059; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 2060; GFX1032-NEXT: ds_sub_rtn_u32 v1, v1, v2 2061; 
GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2062; GFX1032-NEXT: buffer_gl0_inv 2063; GFX1032-NEXT: .LBB8_2: 2064; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 2065; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 2066; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2067; GFX1032-NEXT: v_mul_lo_u32 v0, s2, v0 2068; GFX1032-NEXT: v_readfirstlane_b32 s0, v1 2069; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 2070; GFX1032-NEXT: s_mov_b32 s6, -1 2071; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s0, v0 2072; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0 2073; GFX1032-NEXT: s_endpgm 2074; 2075; GFX1164-LABEL: sub_i32_uniform: 2076; GFX1164: ; %bb.0: ; %entry 2077; GFX1164-NEXT: s_clause 0x1 2078; GFX1164-NEXT: s_load_b64 s[4:5], s[0:1], 0x24 2079; GFX1164-NEXT: s_load_b32 s6, s[0:1], 0x2c 2080; GFX1164-NEXT: s_mov_b64 s[2:3], exec 2081; GFX1164-NEXT: s_mov_b64 s[0:1], exec 2082; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 2083; GFX1164-NEXT: ; implicit-def: $vgpr1 2084; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2085; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 2086; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 2087; GFX1164-NEXT: s_cbranch_execz .LBB8_2 2088; GFX1164-NEXT: ; %bb.1: 2089; GFX1164-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 2090; GFX1164-NEXT: v_mov_b32_e32 v1, 0 2091; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 2092; GFX1164-NEXT: s_mul_i32 s2, s6, s2 2093; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 2094; GFX1164-NEXT: v_mov_b32_e32 v2, s2 2095; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2096; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 2097; GFX1164-NEXT: ds_sub_rtn_u32 v1, v1, v2 2098; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 2099; GFX1164-NEXT: buffer_gl0_inv 2100; GFX1164-NEXT: .LBB8_2: 2101; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] 2102; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 2103; GFX1164-NEXT: v_mul_lo_u32 v0, s6, v0 2104; GFX1164-NEXT: v_readfirstlane_b32 s0, v1 2105; GFX1164-NEXT: s_mov_b32 s7, 0x31016000 2106; GFX1164-NEXT: s_mov_b32 s6, -1 2107; GFX1164-NEXT: s_delay_alu 
instid0(VALU_DEP_1) 2108; GFX1164-NEXT: v_sub_nc_u32_e32 v0, s0, v0 2109; GFX1164-NEXT: buffer_store_b32 v0, off, s[4:7], 0 2110; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 2111; GFX1164-NEXT: s_endpgm 2112; 2113; GFX1132-LABEL: sub_i32_uniform: 2114; GFX1132: ; %bb.0: ; %entry 2115; GFX1132-NEXT: s_clause 0x1 2116; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x24 2117; GFX1132-NEXT: s_load_b32 s0, s[0:1], 0x2c 2118; GFX1132-NEXT: s_mov_b32 s2, exec_lo 2119; GFX1132-NEXT: s_mov_b32 s1, exec_lo 2120; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 2121; GFX1132-NEXT: ; implicit-def: $vgpr1 2122; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) 2123; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 2124; GFX1132-NEXT: s_cbranch_execz .LBB8_2 2125; GFX1132-NEXT: ; %bb.1: 2126; GFX1132-NEXT: s_bcnt1_i32_b32 s2, s2 2127; GFX1132-NEXT: v_mov_b32_e32 v1, 0 2128; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 2129; GFX1132-NEXT: s_mul_i32 s2, s0, s2 2130; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 2131; GFX1132-NEXT: v_mov_b32_e32 v2, s2 2132; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2133; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 2134; GFX1132-NEXT: ds_sub_rtn_u32 v1, v1, v2 2135; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 2136; GFX1132-NEXT: buffer_gl0_inv 2137; GFX1132-NEXT: .LBB8_2: 2138; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s1 2139; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 2140; GFX1132-NEXT: v_mul_lo_u32 v0, s0, v0 2141; GFX1132-NEXT: v_readfirstlane_b32 s0, v1 2142; GFX1132-NEXT: s_mov_b32 s7, 0x31016000 2143; GFX1132-NEXT: s_mov_b32 s6, -1 2144; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) 2145; GFX1132-NEXT: v_sub_nc_u32_e32 v0, s0, v0 2146; GFX1132-NEXT: buffer_store_b32 v0, off, s[4:7], 0 2147; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 2148; GFX1132-NEXT: s_endpgm 2149entry: 2150 %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 %subitive acq_rel 2151 store i32 %old, i32 addrspace(1)* %out 2152 ret void 2153} 2154 2155define amdgpu_kernel void @sub_i32_varying(i32 
addrspace(1)* %out) { 2156; 2157; 2158; GFX7LESS-LABEL: sub_i32_varying: 2159; GFX7LESS: ; %bb.0: ; %entry 2160; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 2161; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 2162; GFX7LESS-NEXT: s_mov_b32 m0, -1 2163; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2164; GFX7LESS-NEXT: ds_sub_rtn_u32 v0, v1, v0 2165; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2166; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 2167; GFX7LESS-NEXT: s_mov_b32 s2, -1 2168; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 2169; GFX7LESS-NEXT: s_endpgm 2170; 2171; GFX8-LABEL: sub_i32_varying: 2172; GFX8: ; %bb.0: ; %entry 2173; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2174; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 2175; GFX8-NEXT: v_mov_b32_e32 v1, 0 2176; GFX8-NEXT: s_mov_b64 exec, s[2:3] 2177; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 2178; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 2179; GFX8-NEXT: v_mov_b32_e32 v2, v0 2180; GFX8-NEXT: s_not_b64 exec, exec 2181; GFX8-NEXT: v_mov_b32_e32 v2, 0 2182; GFX8-NEXT: s_not_b64 exec, exec 2183; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 2184; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 2185; GFX8-NEXT: s_nop 1 2186; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 2187; GFX8-NEXT: s_nop 1 2188; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 2189; GFX8-NEXT: s_nop 1 2190; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 2191; GFX8-NEXT: s_nop 1 2192; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 2193; GFX8-NEXT: s_nop 1 2194; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 2195; GFX8-NEXT: v_readlane_b32 s4, v2, 63 2196; GFX8-NEXT: s_nop 0 2197; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 2198; GFX8-NEXT: s_mov_b64 exec, s[2:3] 2199; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 
0, v3 2200; GFX8-NEXT: ; implicit-def: $vgpr0 2201; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 2202; GFX8-NEXT: s_cbranch_execz .LBB9_2 2203; GFX8-NEXT: ; %bb.1: 2204; GFX8-NEXT: v_mov_b32_e32 v0, 0 2205; GFX8-NEXT: v_mov_b32_e32 v3, s4 2206; GFX8-NEXT: s_mov_b32 m0, -1 2207; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2208; GFX8-NEXT: ds_sub_rtn_u32 v0, v0, v3 2209; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2210; GFX8-NEXT: .LBB9_2: 2211; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 2212; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2213; GFX8-NEXT: v_readfirstlane_b32 s2, v0 2214; GFX8-NEXT: v_mov_b32_e32 v0, v1 2215; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s2, v0 2216; GFX8-NEXT: s_mov_b32 s3, 0xf000 2217; GFX8-NEXT: s_mov_b32 s2, -1 2218; GFX8-NEXT: s_nop 0 2219; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 2220; GFX8-NEXT: s_endpgm 2221; 2222; GFX9-LABEL: sub_i32_varying: 2223; GFX9: ; %bb.0: ; %entry 2224; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2225; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 2226; GFX9-NEXT: v_mov_b32_e32 v1, 0 2227; GFX9-NEXT: s_mov_b64 exec, s[2:3] 2228; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 2229; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 2230; GFX9-NEXT: v_mov_b32_e32 v2, v0 2231; GFX9-NEXT: s_not_b64 exec, exec 2232; GFX9-NEXT: v_mov_b32_e32 v2, 0 2233; GFX9-NEXT: s_not_b64 exec, exec 2234; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 2235; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 2236; GFX9-NEXT: s_nop 1 2237; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 2238; GFX9-NEXT: s_nop 1 2239; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 2240; GFX9-NEXT: s_nop 1 2241; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 2242; GFX9-NEXT: s_nop 1 2243; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 2244; GFX9-NEXT: s_nop 1 2245; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc 
bank_mask:0xf 2246; GFX9-NEXT: v_readlane_b32 s4, v2, 63 2247; GFX9-NEXT: s_nop 0 2248; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 2249; GFX9-NEXT: s_mov_b64 exec, s[2:3] 2250; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 2251; GFX9-NEXT: ; implicit-def: $vgpr0 2252; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 2253; GFX9-NEXT: s_cbranch_execz .LBB9_2 2254; GFX9-NEXT: ; %bb.1: 2255; GFX9-NEXT: v_mov_b32_e32 v0, 0 2256; GFX9-NEXT: v_mov_b32_e32 v3, s4 2257; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2258; GFX9-NEXT: ds_sub_rtn_u32 v0, v0, v3 2259; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2260; GFX9-NEXT: .LBB9_2: 2261; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 2262; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2263; GFX9-NEXT: v_readfirstlane_b32 s2, v0 2264; GFX9-NEXT: v_mov_b32_e32 v0, v1 2265; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 2266; GFX9-NEXT: s_mov_b32 s3, 0xf000 2267; GFX9-NEXT: s_mov_b32 s2, -1 2268; GFX9-NEXT: s_nop 0 2269; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 2270; GFX9-NEXT: s_endpgm 2271; 2272; GFX1064-LABEL: sub_i32_varying: 2273; GFX1064: ; %bb.0: ; %entry 2274; GFX1064-NEXT: v_mov_b32_e32 v1, v0 2275; GFX1064-NEXT: s_not_b64 exec, exec 2276; GFX1064-NEXT: v_mov_b32_e32 v1, 0 2277; GFX1064-NEXT: s_not_b64 exec, exec 2278; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 2279; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 2280; GFX1064-NEXT: v_mov_b32_e32 v3, 0 2281; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 2282; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 2283; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 2284; GFX1064-NEXT: v_mov_b32_e32 v2, v1 2285; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 2286; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 2287; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 2288; GFX1064-NEXT: v_mov_b32_e32 v2, s4 
2289; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 2290; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 2291; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 2292; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 2293; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2294; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 2295; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 2296; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 2297; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 2298; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 2299; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 2300; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 2301; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 2302; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 2303; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 2304; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 2305; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 2306; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 2307; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 2308; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2309; GFX1064-NEXT: s_mov_b32 s2, -1 2310; GFX1064-NEXT: ; implicit-def: $vgpr0 2311; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 2312; GFX1064-NEXT: s_cbranch_execz .LBB9_2 2313; GFX1064-NEXT: ; %bb.1: 2314; GFX1064-NEXT: v_mov_b32_e32 v0, 0 2315; GFX1064-NEXT: v_mov_b32_e32 v4, s7 2316; GFX1064-NEXT: s_mov_b32 s3, s7 2317; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2318; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 2319; GFX1064-NEXT: ds_sub_rtn_u32 v0, v0, v4 2320; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2321; GFX1064-NEXT: buffer_gl0_inv 2322; GFX1064-NEXT: .LBB9_2: 2323; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 2324; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 2325; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 2326; GFX1064-NEXT: v_mov_b32_e32 v0, v3 2327; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s3, v0 2328; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 2329; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2330; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 2331; GFX1064-NEXT: s_endpgm 
2332; 2333; GFX1032-LABEL: sub_i32_varying: 2334; GFX1032: ; %bb.0: ; %entry 2335; GFX1032-NEXT: v_mov_b32_e32 v1, v0 2336; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 2337; GFX1032-NEXT: v_mov_b32_e32 v1, 0 2338; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 2339; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 2340; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 2341; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 2342; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 2343; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 2344; GFX1032-NEXT: v_mov_b32_e32 v2, v1 2345; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 2346; GFX1032-NEXT: s_mov_b32 exec_lo, s2 2347; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2348; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 2349; GFX1032-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 2350; GFX1032-NEXT: v_mov_b32_e32 v3, 0 2351; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 2352; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 2353; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 2354; GFX1032-NEXT: s_mov_b32 exec_lo, s2 2355; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 2356; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 2357; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 2358; GFX1032-NEXT: s_mov_b32 exec_lo, s2 2359; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 2360; GFX1032-NEXT: s_mov_b32 s2, -1 2361; GFX1032-NEXT: ; implicit-def: $vgpr0 2362; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 2363; GFX1032-NEXT: s_cbranch_execz .LBB9_2 2364; GFX1032-NEXT: ; %bb.1: 2365; GFX1032-NEXT: v_mov_b32_e32 v0, 0 2366; GFX1032-NEXT: v_mov_b32_e32 v4, s4 2367; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2368; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 2369; GFX1032-NEXT: ds_sub_rtn_u32 v0, v0, v4 2370; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2371; GFX1032-NEXT: 
buffer_gl0_inv 2372; GFX1032-NEXT: .LBB9_2: 2373; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 2374; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 2375; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 2376; GFX1032-NEXT: v_mov_b32_e32 v0, v3 2377; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s3, v0 2378; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 2379; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2380; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 2381; GFX1032-NEXT: s_endpgm 2382; 2383; GFX1164-LABEL: sub_i32_varying: 2384; GFX1164: ; %bb.0: ; %entry 2385; GFX1164-NEXT: v_mov_b32_e32 v1, v0 2386; GFX1164-NEXT: s_not_b64 exec, exec 2387; GFX1164-NEXT: v_mov_b32_e32 v1, 0 2388; GFX1164-NEXT: s_not_b64 exec, exec 2389; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 2390; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) 2391; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 2392; GFX1164-NEXT: v_mov_b32_e32 v3, 0 2393; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 2394; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2395; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 2396; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 2397; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2398; GFX1164-NEXT: v_mov_b32_e32 v2, v1 2399; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1 2400; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2401; GFX1164-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 2402; GFX1164-NEXT: v_readlane_b32 s4, v1, 31 2403; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2404; GFX1164-NEXT: v_mov_b32_e32 v2, s4 2405; GFX1164-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 
2406; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) 2407; GFX1164-NEXT: v_readlane_b32 s4, v1, 15 2408; GFX1164-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 2409; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 2410; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 2411; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 2412; GFX1164-NEXT: v_readlane_b32 s5, v1, 31 2413; GFX1164-NEXT: v_writelane_b32 v3, s4, 16 2414; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 2415; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 2416; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 2417; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 2418; GFX1164-NEXT: v_readlane_b32 s7, v1, 63 2419; GFX1164-NEXT: v_readlane_b32 s6, v1, 47 2420; GFX1164-NEXT: v_writelane_b32 v3, s5, 32 2421; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 2422; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) 2423; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 2424; GFX1164-NEXT: s_or_saveexec_b64 s[4:5], -1 2425; GFX1164-NEXT: v_writelane_b32 v3, s6, 48 2426; GFX1164-NEXT: s_mov_b64 exec, s[4:5] 2427; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2428; GFX1164-NEXT: s_mov_b32 s2, -1 2429; GFX1164-NEXT: ; implicit-def: $vgpr0 2430; GFX1164-NEXT: s_and_saveexec_b64 s[4:5], vcc 2431; GFX1164-NEXT: s_cbranch_execz .LBB9_2 2432; GFX1164-NEXT: ; %bb.1: 2433; GFX1164-NEXT: v_mov_b32_e32 v0, 0 2434; GFX1164-NEXT: v_mov_b32_e32 v4, s7 2435; GFX1164-NEXT: s_mov_b32 s3, s7 2436; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2437; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 2438; GFX1164-NEXT: ds_sub_rtn_u32 v0, v0, v4 2439; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 2440; GFX1164-NEXT: buffer_gl0_inv 2441; GFX1164-NEXT: .LBB9_2: 2442; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] 2443; GFX1164-NEXT: v_readfirstlane_b32 s3, v0 2444; GFX1164-NEXT: v_mov_b32_e32 v0, v3 2445; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) 2446; GFX1164-NEXT: v_sub_nc_u32_e32 v0, s3, v0 2447; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 2448; 
GFX1164-NEXT: s_waitcnt lgkmcnt(0) 2449; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 2450; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 2451; GFX1164-NEXT: s_endpgm 2452; 2453; GFX1132-LABEL: sub_i32_varying: 2454; GFX1132: ; %bb.0: ; %entry 2455; GFX1132-NEXT: v_mov_b32_e32 v1, v0 2456; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo 2457; GFX1132-NEXT: v_mov_b32_e32 v1, 0 2458; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo 2459; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 2460; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2461; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 2462; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 2463; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2464; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 2465; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 2466; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2467; GFX1132-NEXT: v_mov_b32_e32 v2, v1 2468; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1 2469; GFX1132-NEXT: s_mov_b32 exec_lo, s2 2470; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 2471; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 2472; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) 2473; GFX1132-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 2474; GFX1132-NEXT: v_mov_b32_e32 v3, 0 2475; GFX1132-NEXT: v_readlane_b32 s3, v1, 15 2476; GFX1132-NEXT: v_readlane_b32 s4, v1, 31 2477; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) 2478; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 2479; GFX1132-NEXT: s_mov_b32 exec_lo, s2 2480; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 2481; GFX1132-NEXT: s_or_saveexec_b32 
s2, -1 2482; GFX1132-NEXT: v_writelane_b32 v3, s3, 16 2483; GFX1132-NEXT: s_mov_b32 exec_lo, s2 2484; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) 2485; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 2486; GFX1132-NEXT: s_mov_b32 s2, -1 2487; GFX1132-NEXT: ; implicit-def: $vgpr0 2488; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo 2489; GFX1132-NEXT: s_cbranch_execz .LBB9_2 2490; GFX1132-NEXT: ; %bb.1: 2491; GFX1132-NEXT: v_mov_b32_e32 v0, 0 2492; GFX1132-NEXT: v_mov_b32_e32 v4, s4 2493; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2494; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 2495; GFX1132-NEXT: ds_sub_rtn_u32 v0, v0, v4 2496; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 2497; GFX1132-NEXT: buffer_gl0_inv 2498; GFX1132-NEXT: .LBB9_2: 2499; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 2500; GFX1132-NEXT: v_readfirstlane_b32 s3, v0 2501; GFX1132-NEXT: v_mov_b32_e32 v0, v3 2502; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) 2503; GFX1132-NEXT: v_sub_nc_u32_e32 v0, s3, v0 2504; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 2505; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 2506; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 2507; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 2508; GFX1132-NEXT: s_endpgm 2509entry: 2510 %lane = call i32 @llvm.amdgcn.workitem.id.x() 2511 %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 %lane acq_rel 2512 store i32 %old, i32 addrspace(1)* %out 2513 ret void 2514} 2515 2516define amdgpu_kernel void @sub_i32_varying_nouse() { 2517; GFX7LESS-LABEL: sub_i32_varying_nouse: 2518; GFX7LESS: ; %bb.0: ; %entry 2519; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 2520; GFX7LESS-NEXT: s_mov_b32 m0, -1 2521; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2522; GFX7LESS-NEXT: ds_sub_u32 v1, v0 2523; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2524; GFX7LESS-NEXT: s_endpgm 2525; 2526; GFX8-LABEL: sub_i32_varying_nouse: 2527; GFX8: ; %bb.0: ; %entry 2528; GFX8-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 2529; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 2530; GFX8-NEXT: v_mov_b32_e32 v1, v0 2531; 
GFX8-NEXT: s_not_b64 exec, exec 2532; GFX8-NEXT: v_mov_b32_e32 v1, 0 2533; GFX8-NEXT: s_not_b64 exec, exec 2534; GFX8-NEXT: s_or_saveexec_b64 s[0:1], -1 2535; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 2536; GFX8-NEXT: s_nop 1 2537; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 2538; GFX8-NEXT: s_nop 1 2539; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 2540; GFX8-NEXT: s_nop 1 2541; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 2542; GFX8-NEXT: s_nop 1 2543; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf 2544; GFX8-NEXT: s_nop 1 2545; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf 2546; GFX8-NEXT: v_readlane_b32 s2, v1, 63 2547; GFX8-NEXT: s_mov_b64 exec, s[0:1] 2548; GFX8-NEXT: s_mov_b32 s0, s2 2549; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 2550; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 2551; GFX8-NEXT: s_cbranch_execz .LBB10_2 2552; GFX8-NEXT: ; %bb.1: 2553; GFX8-NEXT: v_mov_b32_e32 v0, 0 2554; GFX8-NEXT: v_mov_b32_e32 v2, s0 2555; GFX8-NEXT: s_mov_b32 m0, -1 2556; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2557; GFX8-NEXT: ds_sub_u32 v0, v2 2558; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2559; GFX8-NEXT: .LBB10_2: 2560; GFX8-NEXT: s_endpgm 2561; 2562; GFX9-LABEL: sub_i32_varying_nouse: 2563; GFX9: ; %bb.0: ; %entry 2564; GFX9-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 2565; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 2566; GFX9-NEXT: v_mov_b32_e32 v1, v0 2567; GFX9-NEXT: s_not_b64 exec, exec 2568; GFX9-NEXT: v_mov_b32_e32 v1, 0 2569; GFX9-NEXT: s_not_b64 exec, exec 2570; GFX9-NEXT: s_or_saveexec_b64 s[0:1], -1 2571; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 2572; GFX9-NEXT: s_nop 1 2573; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 2574; GFX9-NEXT: s_nop 
1 2575; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 2576; GFX9-NEXT: s_nop 1 2577; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 2578; GFX9-NEXT: s_nop 1 2579; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf 2580; GFX9-NEXT: s_nop 1 2581; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf 2582; GFX9-NEXT: v_readlane_b32 s2, v1, 63 2583; GFX9-NEXT: s_mov_b64 exec, s[0:1] 2584; GFX9-NEXT: s_mov_b32 s0, s2 2585; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 2586; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 2587; GFX9-NEXT: s_cbranch_execz .LBB10_2 2588; GFX9-NEXT: ; %bb.1: 2589; GFX9-NEXT: v_mov_b32_e32 v0, 0 2590; GFX9-NEXT: v_mov_b32_e32 v2, s0 2591; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2592; GFX9-NEXT: ds_sub_u32 v0, v2 2593; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2594; GFX9-NEXT: .LBB10_2: 2595; GFX9-NEXT: s_endpgm 2596; 2597; GFX1064-LABEL: sub_i32_varying_nouse: 2598; GFX1064: ; %bb.0: ; %entry 2599; GFX1064-NEXT: v_mov_b32_e32 v1, v0 2600; GFX1064-NEXT: s_not_b64 exec, exec 2601; GFX1064-NEXT: v_mov_b32_e32 v1, 0 2602; GFX1064-NEXT: s_not_b64 exec, exec 2603; GFX1064-NEXT: s_or_saveexec_b64 s[0:1], -1 2604; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 2605; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 2606; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 2607; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 2608; GFX1064-NEXT: v_mov_b32_e32 v2, v1 2609; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 2610; GFX1064-NEXT: v_add_nc_u32_e32 v1, v1, v2 2611; GFX1064-NEXT: s_mov_b64 exec, s[0:1] 2612; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 2613; GFX1064-NEXT: s_or_saveexec_b64 s[0:1], -1 2614; GFX1064-NEXT: v_readlane_b32 s2, v1, 0 2615; GFX1064-NEXT: 
v_readlane_b32 s3, v1, 32 2616; GFX1064-NEXT: s_mov_b64 exec, s[0:1] 2617; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 2618; GFX1064-NEXT: s_add_i32 s0, s2, s3 2619; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2620; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc 2621; GFX1064-NEXT: s_cbranch_execz .LBB10_2 2622; GFX1064-NEXT: ; %bb.1: 2623; GFX1064-NEXT: v_mov_b32_e32 v0, 0 2624; GFX1064-NEXT: v_mov_b32_e32 v3, s0 2625; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2626; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 2627; GFX1064-NEXT: ds_sub_u32 v0, v3 2628; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2629; GFX1064-NEXT: buffer_gl0_inv 2630; GFX1064-NEXT: .LBB10_2: 2631; GFX1064-NEXT: s_endpgm 2632; 2633; GFX1032-LABEL: sub_i32_varying_nouse: 2634; GFX1032: ; %bb.0: ; %entry 2635; GFX1032-NEXT: v_mov_b32_e32 v1, v0 2636; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 2637; GFX1032-NEXT: v_mov_b32_e32 v1, 0 2638; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 2639; GFX1032-NEXT: s_or_saveexec_b32 s0, -1 2640; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 2641; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 2642; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 2643; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 2644; GFX1032-NEXT: v_mov_b32_e32 v2, v1 2645; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 2646; GFX1032-NEXT: v_add_nc_u32_e32 v1, v1, v2 2647; GFX1032-NEXT: s_mov_b32 exec_lo, s0 2648; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 2649; GFX1032-NEXT: v_mov_b32_e32 v0, v1 2650; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 2651; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo 2652; GFX1032-NEXT: s_cbranch_execz .LBB10_2 2653; GFX1032-NEXT: ; %bb.1: 2654; GFX1032-NEXT: v_mov_b32_e32 v3, 0 2655; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2656; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 2657; 
GFX1032-NEXT: ds_sub_u32 v3, v0 2658; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2659; GFX1032-NEXT: buffer_gl0_inv 2660; GFX1032-NEXT: .LBB10_2: 2661; GFX1032-NEXT: s_endpgm 2662; 2663; GFX1164-LABEL: sub_i32_varying_nouse: 2664; GFX1164: ; %bb.0: ; %entry 2665; GFX1164-NEXT: v_mov_b32_e32 v1, v0 2666; GFX1164-NEXT: s_not_b64 exec, exec 2667; GFX1164-NEXT: v_mov_b32_e32 v1, 0 2668; GFX1164-NEXT: s_not_b64 exec, exec 2669; GFX1164-NEXT: s_or_saveexec_b64 s[0:1], -1 2670; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2671; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 2672; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 2673; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2674; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 2675; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 2676; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2677; GFX1164-NEXT: v_mov_b32_e32 v2, v1 2678; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1 2679; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2680; GFX1164-NEXT: v_add_nc_u32_e32 v1, v1, v2 2681; GFX1164-NEXT: v_permlane64_b32 v2, v1 2682; GFX1164-NEXT: s_mov_b64 exec, s[0:1] 2683; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) 2684; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 2685; GFX1164-NEXT: s_or_saveexec_b64 s[0:1], -1 2686; GFX1164-NEXT: v_add_nc_u32_e32 v1, v1, v2 2687; GFX1164-NEXT: s_mov_b64 exec, s[0:1] 2688; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) 2689; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v0 2690; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) 2691; GFX1164-NEXT: 
v_mov_b32_e32 v0, v1 2692; GFX1164-NEXT: s_mov_b64 s[0:1], exec 2693; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v3 2694; GFX1164-NEXT: s_cbranch_execz .LBB10_2 2695; GFX1164-NEXT: ; %bb.1: 2696; GFX1164-NEXT: v_mov_b32_e32 v3, 0 2697; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2698; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 2699; GFX1164-NEXT: ds_sub_u32 v3, v0 2700; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 2701; GFX1164-NEXT: buffer_gl0_inv 2702; GFX1164-NEXT: .LBB10_2: 2703; GFX1164-NEXT: s_endpgm 2704; 2705; GFX1132-LABEL: sub_i32_varying_nouse: 2706; GFX1132: ; %bb.0: ; %entry 2707; GFX1132-NEXT: v_mov_b32_e32 v1, v0 2708; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo 2709; GFX1132-NEXT: v_mov_b32_e32 v1, 0 2710; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo 2711; GFX1132-NEXT: s_or_saveexec_b32 s0, -1 2712; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2713; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 2714; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 2715; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2716; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 2717; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 2718; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2719; GFX1132-NEXT: v_mov_b32_e32 v2, v1 2720; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1 2721; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) 2722; GFX1132-NEXT: v_add_nc_u32_e32 v1, v1, v2 2723; GFX1132-NEXT: s_mov_b32 exec_lo, s0 2724; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 2725; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) 2726; GFX1132-NEXT: v_mov_b32_e32 v0, v1 2727; GFX1132-NEXT: s_mov_b32 s0, exec_lo 2728; GFX1132-NEXT: 
v_cmpx_eq_u32_e32 0, v3 2729; GFX1132-NEXT: s_cbranch_execz .LBB10_2 2730; GFX1132-NEXT: ; %bb.1: 2731; GFX1132-NEXT: v_mov_b32_e32 v3, 0 2732; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2733; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 2734; GFX1132-NEXT: ds_sub_u32 v3, v0 2735; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 2736; GFX1132-NEXT: buffer_gl0_inv 2737; GFX1132-NEXT: .LBB10_2: 2738; GFX1132-NEXT: s_endpgm 2739entry: 2740 %lane = call i32 @llvm.amdgcn.workitem.id.x() 2741 %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 %lane acq_rel 2742 ret void 2743} 2744 2745define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) { 2746; 2747; 2748; GFX7LESS-LABEL: sub_i64_constant: 2749; GFX7LESS: ; %bb.0: ; %entry 2750; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec 2751; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 2752; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 2753; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s5, v0 2754; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 2755; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 2756; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc 2757; GFX7LESS-NEXT: s_cbranch_execz .LBB11_2 2758; GFX7LESS-NEXT: ; %bb.1: 2759; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 2760; GFX7LESS-NEXT: s_mul_i32 s4, s4, 5 2761; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 2762; GFX7LESS-NEXT: v_mov_b32_e32 v0, s4 2763; GFX7LESS-NEXT: s_mov_b32 m0, -1 2764; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2765; GFX7LESS-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] 2766; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2767; GFX7LESS-NEXT: .LBB11_2: 2768; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] 2769; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2770; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v0 2771; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1 2772; GFX7LESS-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 2773; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v2 2774; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 2775; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4 2776; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 2777; GFX7LESS-NEXT: 
v_subb_u32_e32 v1, vcc, v2, v1, vcc 2778; GFX7LESS-NEXT: s_mov_b32 s2, -1 2779; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2780; GFX7LESS-NEXT: s_endpgm 2781; 2782; GFX8-LABEL: sub_i64_constant: 2783; GFX8: ; %bb.0: ; %entry 2784; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2785; GFX8-NEXT: s_mov_b64 s[4:5], exec 2786; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 2787; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 2788; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 2789; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 2790; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 2791; GFX8-NEXT: s_cbranch_execz .LBB11_2 2792; GFX8-NEXT: ; %bb.1: 2793; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 2794; GFX8-NEXT: s_mul_i32 s4, s4, 5 2795; GFX8-NEXT: v_mov_b32_e32 v0, s4 2796; GFX8-NEXT: v_mov_b32_e32 v1, 0 2797; GFX8-NEXT: s_mov_b32 m0, -1 2798; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2799; GFX8-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] 2800; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2801; GFX8-NEXT: .LBB11_2: 2802; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 2803; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2804; GFX8-NEXT: v_readfirstlane_b32 s2, v0 2805; GFX8-NEXT: v_readfirstlane_b32 s3, v1 2806; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v2 2807; GFX8-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 2808; GFX8-NEXT: v_mov_b32_e32 v2, s3 2809; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s2, v0 2810; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc 2811; GFX8-NEXT: s_mov_b32 s3, 0xf000 2812; GFX8-NEXT: s_mov_b32 s2, -1 2813; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2814; GFX8-NEXT: s_endpgm 2815; 2816; GFX9-LABEL: sub_i64_constant: 2817; GFX9: ; %bb.0: ; %entry 2818; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2819; GFX9-NEXT: s_mov_b64 s[4:5], exec 2820; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 2821; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 2822; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 2823; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 2824; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 2825; GFX9-NEXT: s_cbranch_execz .LBB11_2 2826; GFX9-NEXT: ; 
%bb.1: 2827; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 2828; GFX9-NEXT: s_mul_i32 s4, s4, 5 2829; GFX9-NEXT: v_mov_b32_e32 v0, s4 2830; GFX9-NEXT: v_mov_b32_e32 v1, 0 2831; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2832; GFX9-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] 2833; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2834; GFX9-NEXT: .LBB11_2: 2835; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 2836; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2837; GFX9-NEXT: v_readfirstlane_b32 s2, v0 2838; GFX9-NEXT: v_readfirstlane_b32 s3, v1 2839; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v2 2840; GFX9-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 2841; GFX9-NEXT: v_mov_b32_e32 v2, s3 2842; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s2, v0 2843; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc 2844; GFX9-NEXT: s_mov_b32 s3, 0xf000 2845; GFX9-NEXT: s_mov_b32 s2, -1 2846; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2847; GFX9-NEXT: s_endpgm 2848; 2849; GFX1064-LABEL: sub_i64_constant: 2850; GFX1064: ; %bb.0: ; %entry 2851; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2852; GFX1064-NEXT: s_mov_b64 s[4:5], exec 2853; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 2854; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 2855; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 2856; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 2857; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc 2858; GFX1064-NEXT: s_cbranch_execz .LBB11_2 2859; GFX1064-NEXT: ; %bb.1: 2860; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 2861; GFX1064-NEXT: v_mov_b32_e32 v1, 0 2862; GFX1064-NEXT: s_mul_i32 s4, s4, 5 2863; GFX1064-NEXT: v_mov_b32_e32 v0, s4 2864; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2865; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 2866; GFX1064-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] 2867; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2868; GFX1064-NEXT: buffer_gl0_inv 2869; GFX1064-NEXT: .LBB11_2: 2870; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 2871; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] 2872; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 2873; GFX1064-NEXT: v_mul_u32_u24_e32 v0, 5, v2 2874; 
GFX1064-NEXT: v_readfirstlane_b32 s3, v1 2875; GFX1064-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 2876; GFX1064-NEXT: v_sub_co_u32 v0, vcc, s2, v0 2877; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc 2878; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 2879; GFX1064-NEXT: s_mov_b32 s2, -1 2880; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2881; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2882; GFX1064-NEXT: s_endpgm 2883; 2884; GFX1032-LABEL: sub_i64_constant: 2885; GFX1032: ; %bb.0: ; %entry 2886; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2887; GFX1032-NEXT: s_mov_b32 s3, exec_lo 2888; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 2889; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s3, 0 2890; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 2891; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo 2892; GFX1032-NEXT: s_cbranch_execz .LBB11_2 2893; GFX1032-NEXT: ; %bb.1: 2894; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 2895; GFX1032-NEXT: v_mov_b32_e32 v1, 0 2896; GFX1032-NEXT: s_mul_i32 s3, s3, 5 2897; GFX1032-NEXT: v_mov_b32_e32 v0, s3 2898; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2899; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 2900; GFX1032-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] 2901; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2902; GFX1032-NEXT: buffer_gl0_inv 2903; GFX1032-NEXT: .LBB11_2: 2904; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 2905; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 2906; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 2907; GFX1032-NEXT: v_mul_u32_u24_e32 v0, 5, v2 2908; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 2909; GFX1032-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 2910; GFX1032-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v0 2911; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo 2912; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 2913; GFX1032-NEXT: s_mov_b32 s2, -1 2914; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2915; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2916; GFX1032-NEXT: s_endpgm 2917; 2918; GFX1164-LABEL: sub_i64_constant: 2919; GFX1164: ; %bb.0: ; %entry 2920; 
GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 2921; GFX1164-NEXT: s_mov_b64 s[4:5], exec 2922; GFX1164-NEXT: s_mov_b64 s[2:3], exec 2923; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 2924; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2925; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 2926; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 2927; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 2928; GFX1164-NEXT: s_cbranch_execz .LBB11_2 2929; GFX1164-NEXT: ; %bb.1: 2930; GFX1164-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 2931; GFX1164-NEXT: v_mov_b32_e32 v1, 0 2932; GFX1164-NEXT: s_mul_i32 s4, s4, 5 2933; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 2934; GFX1164-NEXT: v_mov_b32_e32 v0, s4 2935; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2936; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 2937; GFX1164-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] 2938; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 2939; GFX1164-NEXT: buffer_gl0_inv 2940; GFX1164-NEXT: .LBB11_2: 2941; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] 2942; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 2943; GFX1164-NEXT: v_mul_u32_u24_e32 v0, 5, v2 2944; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 2945; GFX1164-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 2946; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) 2947; GFX1164-NEXT: v_sub_co_u32 v0, vcc, s2, v0 2948; GFX1164-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc 2949; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 2950; GFX1164-NEXT: s_mov_b32 s2, -1 2951; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 2952; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 2953; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 2954; GFX1164-NEXT: s_endpgm 2955; 2956; GFX1132-LABEL: sub_i64_constant: 2957; GFX1132: ; %bb.0: ; %entry 2958; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 2959; GFX1132-NEXT: s_mov_b32 s3, exec_lo 2960; GFX1132-NEXT: s_mov_b32 s2, exec_lo 2961; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s3, 0 2962; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 2963; 
GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) 2964; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 2965; GFX1132-NEXT: s_cbranch_execz .LBB11_2 2966; GFX1132-NEXT: ; %bb.1: 2967; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s3 2968; GFX1132-NEXT: v_mov_b32_e32 v1, 0 2969; GFX1132-NEXT: s_mul_i32 s3, s3, 5 2970; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 2971; GFX1132-NEXT: v_mov_b32_e32 v0, s3 2972; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2973; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 2974; GFX1132-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] 2975; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 2976; GFX1132-NEXT: buffer_gl0_inv 2977; GFX1132-NEXT: .LBB11_2: 2978; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2 2979; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 2980; GFX1132-NEXT: v_mul_u32_u24_e32 v0, 5, v2 2981; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 2982; GFX1132-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 2983; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) 2984; GFX1132-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v0 2985; GFX1132-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo 2986; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 2987; GFX1132-NEXT: s_mov_b32 s2, -1 2988; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 2989; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 2990; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 2991; GFX1132-NEXT: s_endpgm 2992entry: 2993 %old = atomicrmw sub i64 addrspace(3)* @local_var64, i64 5 acq_rel 2994 store i64 %old, i64 addrspace(1)* %out 2995 ret void 2996} 2997 2998define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 %subitive) { 2999; 3000; 3001; GFX7LESS-LABEL: sub_i64_uniform: 3002; GFX7LESS: ; %bb.0: ; %entry 3003; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec 3004; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 3005; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 3006; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s7, v0 3007; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 3008; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 3009; 
GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc 3010; GFX7LESS-NEXT: s_cbranch_execz .LBB12_2 3011; GFX7LESS-NEXT: ; %bb.1: 3012; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 3013; GFX7LESS-NEXT: v_mov_b32_e32 v3, 0 3014; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3015; GFX7LESS-NEXT: s_mul_i32 s7, s3, s6 3016; GFX7LESS-NEXT: v_mov_b32_e32 v0, s6 3017; GFX7LESS-NEXT: v_mul_hi_u32 v0, s2, v0 3018; GFX7LESS-NEXT: s_mul_i32 s6, s2, s6 3019; GFX7LESS-NEXT: v_add_i32_e32 v1, vcc, s7, v0 3020; GFX7LESS-NEXT: v_mov_b32_e32 v0, s6 3021; GFX7LESS-NEXT: s_mov_b32 m0, -1 3022; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3023; GFX7LESS-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1] 3024; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3025; GFX7LESS-NEXT: .LBB12_2: 3026; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] 3027; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 3028; GFX7LESS-NEXT: s_mov_b32 s6, -1 3029; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3030; GFX7LESS-NEXT: s_mov_b32 s4, s0 3031; GFX7LESS-NEXT: s_mov_b32 s5, s1 3032; GFX7LESS-NEXT: v_readfirstlane_b32 s0, v0 3033; GFX7LESS-NEXT: v_readfirstlane_b32 s1, v1 3034; GFX7LESS-NEXT: v_mul_lo_u32 v0, s3, v2 3035; GFX7LESS-NEXT: v_mul_hi_u32 v1, s2, v2 3036; GFX7LESS-NEXT: v_mul_lo_u32 v2, s2, v2 3037; GFX7LESS-NEXT: v_add_i32_e32 v1, vcc, v1, v0 3038; GFX7LESS-NEXT: v_mov_b32_e32 v3, s1 3039; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s0, v2 3040; GFX7LESS-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc 3041; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 3042; GFX7LESS-NEXT: s_endpgm 3043; 3044; GFX8-LABEL: sub_i64_uniform: 3045; GFX8: ; %bb.0: ; %entry 3046; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 3047; GFX8-NEXT: s_mov_b64 s[6:7], exec 3048; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 3049; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 3050; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 3051; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 3052; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 3053; GFX8-NEXT: s_cbranch_execz .LBB12_2 3054; GFX8-NEXT: ; %bb.1: 3055; GFX8-NEXT: s_bcnt1_i32_b64 s8, 
s[6:7] 3056; GFX8-NEXT: v_mov_b32_e32 v0, s8 3057; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3058; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[6:7], s2, v0, 0 3059; GFX8-NEXT: s_mul_i32 s6, s3, s8 3060; GFX8-NEXT: v_mov_b32_e32 v3, 0 3061; GFX8-NEXT: v_add_u32_e32 v1, vcc, s6, v1 3062; GFX8-NEXT: s_mov_b32 m0, -1 3063; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3064; GFX8-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1] 3065; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3066; GFX8-NEXT: .LBB12_2: 3067; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 3068; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3069; GFX8-NEXT: s_mov_b32 s4, s0 3070; GFX8-NEXT: s_mov_b32 s5, s1 3071; GFX8-NEXT: v_mul_lo_u32 v4, s3, v2 3072; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v2, 0 3073; GFX8-NEXT: v_readfirstlane_b32 s0, v0 3074; GFX8-NEXT: v_readfirstlane_b32 s1, v1 3075; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v4 3076; GFX8-NEXT: v_mov_b32_e32 v3, s1 3077; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s0, v2 3078; GFX8-NEXT: s_mov_b32 s7, 0xf000 3079; GFX8-NEXT: s_mov_b32 s6, -1 3080; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc 3081; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 3082; GFX8-NEXT: s_endpgm 3083; 3084; GFX9-LABEL: sub_i64_uniform: 3085; GFX9: ; %bb.0: ; %entry 3086; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 3087; GFX9-NEXT: s_mov_b64 s[6:7], exec 3088; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 3089; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 3090; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 3091; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 3092; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 3093; GFX9-NEXT: s_cbranch_execz .LBB12_2 3094; GFX9-NEXT: ; %bb.1: 3095; GFX9-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 3096; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3097; GFX9-NEXT: s_mul_i32 s7, s3, s6 3098; GFX9-NEXT: s_mul_hi_u32 s8, s2, s6 3099; GFX9-NEXT: s_add_i32 s8, s8, s7 3100; GFX9-NEXT: s_mul_i32 s6, s2, s6 3101; GFX9-NEXT: v_mov_b32_e32 v0, s6 3102; GFX9-NEXT: v_mov_b32_e32 v1, s8 3103; GFX9-NEXT: v_mov_b32_e32 v3, 0 3104; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3105; GFX9-NEXT: 
ds_sub_rtn_u64 v[0:1], v3, v[0:1] 3106; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3107; GFX9-NEXT: .LBB12_2: 3108; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 3109; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3110; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], s2, v2, 0 3111; GFX9-NEXT: s_mov_b32 s4, s0 3112; GFX9-NEXT: s_mov_b32 s5, s1 3113; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[2:3], s3, v2, v[4:5] 3114; GFX9-NEXT: v_readfirstlane_b32 s0, v0 3115; GFX9-NEXT: v_readfirstlane_b32 s1, v1 3116; GFX9-NEXT: v_mov_b32_e32 v1, v4 3117; GFX9-NEXT: v_mov_b32_e32 v2, s1 3118; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s0, v3 3119; GFX9-NEXT: s_mov_b32 s7, 0xf000 3120; GFX9-NEXT: s_mov_b32 s6, -1 3121; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc 3122; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 3123; GFX9-NEXT: s_endpgm 3124; 3125; GFX1064-LABEL: sub_i64_uniform: 3126; GFX1064: ; %bb.0: ; %entry 3127; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 3128; GFX1064-NEXT: s_mov_b64 s[6:7], exec 3129; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 3130; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 3131; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 3132; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 3133; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 3134; GFX1064-NEXT: s_cbranch_execz .LBB12_2 3135; GFX1064-NEXT: ; %bb.1: 3136; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 3137; GFX1064-NEXT: v_mov_b32_e32 v3, 0 3138; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3139; GFX1064-NEXT: s_mul_i32 s7, s3, s6 3140; GFX1064-NEXT: s_mul_hi_u32 s8, s2, s6 3141; GFX1064-NEXT: s_mul_i32 s6, s2, s6 3142; GFX1064-NEXT: s_add_i32 s8, s8, s7 3143; GFX1064-NEXT: v_mov_b32_e32 v0, s6 3144; GFX1064-NEXT: v_mov_b32_e32 v1, s8 3145; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3146; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 3147; GFX1064-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1] 3148; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3149; GFX1064-NEXT: buffer_gl0_inv 3150; GFX1064-NEXT: .LBB12_2: 3151; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 3152; GFX1064-NEXT: 
s_or_b64 exec, exec, s[4:5] 3153; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3154; GFX1064-NEXT: v_mad_u64_u32 v[3:4], s[4:5], s2, v2, 0 3155; GFX1064-NEXT: v_readfirstlane_b32 s4, v1 3156; GFX1064-NEXT: v_mad_u64_u32 v[4:5], s[2:3], s3, v2, v[4:5] 3157; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 3158; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 3159; GFX1064-NEXT: v_sub_co_u32 v0, vcc, s2, v3 3160; GFX1064-NEXT: v_mov_b32_e32 v1, v4 3161; GFX1064-NEXT: s_mov_b32 s2, -1 3162; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s4, v1, vcc 3163; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 3164; GFX1064-NEXT: s_endpgm 3165; 3166; GFX1032-LABEL: sub_i64_uniform: 3167; GFX1032: ; %bb.0: ; %entry 3168; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 3169; GFX1032-NEXT: s_mov_b32 s5, exec_lo 3170; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 3171; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0 3172; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 3173; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo 3174; GFX1032-NEXT: s_cbranch_execz .LBB12_2 3175; GFX1032-NEXT: ; %bb.1: 3176; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s5 3177; GFX1032-NEXT: v_mov_b32_e32 v3, 0 3178; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3179; GFX1032-NEXT: s_mul_i32 s6, s3, s5 3180; GFX1032-NEXT: s_mul_hi_u32 s7, s2, s5 3181; GFX1032-NEXT: s_mul_i32 s5, s2, s5 3182; GFX1032-NEXT: s_add_i32 s7, s7, s6 3183; GFX1032-NEXT: v_mov_b32_e32 v0, s5 3184; GFX1032-NEXT: v_mov_b32_e32 v1, s7 3185; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3186; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 3187; GFX1032-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1] 3188; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3189; GFX1032-NEXT: buffer_gl0_inv 3190; GFX1032-NEXT: .LBB12_2: 3191; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 3192; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 3193; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3194; GFX1032-NEXT: v_mad_u64_u32 v[3:4], s2, s2, v2, 0 3195; GFX1032-NEXT: v_readfirstlane_b32 s4, v1 3196; GFX1032-NEXT: v_mad_u64_u32 v[4:5], s2, s3, v2, v[4:5] 3197; 
GFX1032-NEXT: v_readfirstlane_b32 s2, v0 3198; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 3199; GFX1032-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v3 3200; GFX1032-NEXT: v_mov_b32_e32 v1, v4 3201; GFX1032-NEXT: s_mov_b32 s2, -1 3202; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s4, v1, vcc_lo 3203; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 3204; GFX1032-NEXT: s_endpgm 3205; 3206; GFX1164-LABEL: sub_i64_uniform: 3207; GFX1164: ; %bb.0: ; %entry 3208; GFX1164-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 3209; GFX1164-NEXT: s_mov_b64 s[6:7], exec 3210; GFX1164-NEXT: s_mov_b64 s[4:5], exec 3211; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 3212; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 3213; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 3214; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 3215; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 3216; GFX1164-NEXT: s_cbranch_execz .LBB12_2 3217; GFX1164-NEXT: ; %bb.1: 3218; GFX1164-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 3219; GFX1164-NEXT: v_mov_b32_e32 v3, 0 3220; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 3221; GFX1164-NEXT: s_mul_i32 s7, s3, s6 3222; GFX1164-NEXT: s_mul_hi_u32 s8, s2, s6 3223; GFX1164-NEXT: s_mul_i32 s6, s2, s6 3224; GFX1164-NEXT: s_add_i32 s8, s8, s7 3225; GFX1164-NEXT: v_mov_b32_e32 v0, s6 3226; GFX1164-NEXT: v_mov_b32_e32 v1, s8 3227; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3228; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 3229; GFX1164-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1] 3230; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 3231; GFX1164-NEXT: buffer_gl0_inv 3232; GFX1164-NEXT: .LBB12_2: 3233; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] 3234; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 3235; GFX1164-NEXT: v_mad_u64_u32 v[3:4], null, s2, v2, 0 3236; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 3237; GFX1164-NEXT: v_readfirstlane_b32 s4, v1 3238; GFX1164-NEXT: s_waitcnt_depctr 0xfff 3239; GFX1164-NEXT: v_mad_u64_u32 v[5:6], null, s3, v2, v[4:5] 3240; GFX1164-NEXT: v_sub_co_u32 v0, vcc, s2, v3 3241; GFX1164-NEXT: 
s_mov_b32 s3, 0x31016000 3242; GFX1164-NEXT: s_mov_b32 s2, -1 3243; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) 3244; GFX1164-NEXT: v_mov_b32_e32 v1, v5 3245; GFX1164-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s4, v1, vcc 3246; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 3247; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 3248; GFX1164-NEXT: s_endpgm 3249; 3250; GFX1132-LABEL: sub_i64_uniform: 3251; GFX1132: ; %bb.0: ; %entry 3252; GFX1132-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 3253; GFX1132-NEXT: s_mov_b32 s5, exec_lo 3254; GFX1132-NEXT: s_mov_b32 s4, exec_lo 3255; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0 3256; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 3257; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) 3258; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 3259; GFX1132-NEXT: s_cbranch_execz .LBB12_2 3260; GFX1132-NEXT: ; %bb.1: 3261; GFX1132-NEXT: s_bcnt1_i32_b32 s5, s5 3262; GFX1132-NEXT: v_mov_b32_e32 v3, 0 3263; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 3264; GFX1132-NEXT: s_mul_i32 s6, s3, s5 3265; GFX1132-NEXT: s_mul_hi_u32 s7, s2, s5 3266; GFX1132-NEXT: s_mul_i32 s5, s2, s5 3267; GFX1132-NEXT: s_add_i32 s7, s7, s6 3268; GFX1132-NEXT: v_mov_b32_e32 v0, s5 3269; GFX1132-NEXT: v_mov_b32_e32 v1, s7 3270; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3271; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 3272; GFX1132-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1] 3273; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 3274; GFX1132-NEXT: buffer_gl0_inv 3275; GFX1132-NEXT: .LBB12_2: 3276; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s4 3277; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 3278; GFX1132-NEXT: v_mad_u64_u32 v[3:4], null, s2, v2, 0 3279; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 3280; GFX1132-NEXT: v_readfirstlane_b32 s4, v1 3281; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) 3282; GFX1132-NEXT: v_mad_u64_u32 v[5:6], null, s3, v2, v[4:5] 3283; GFX1132-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v3 3284; GFX1132-NEXT: s_mov_b32 s3, 
0x31016000
; GFX1132-NEXT: s_mov_b32 s2, -1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_mov_b32_e32 v1, v5
; GFX1132-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s4, v1, vcc_lo
; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT: s_endpgm
entry:
  %old = atomicrmw sub i64 addrspace(3)* @local_var64, i64 %subitive acq_rel
  store i64 %old, i64 addrspace(1)* %out
  ret void
}

; 64-bit atomic subtract with a divergent operand: every lane subtracts its own
; zero-extended workitem id from @local_var64 and stores the value returned by
; the atomic to %out.
; The expected output for every run line contains a single ds_sub_rtn_u64 fed
; straight from v[0:1], with no mbcnt / exec-mask / dpp sequence around it, so
; this test pins the per-lane (non-reduced) form for this case.
; NOTE(review): presumably the pass does not handle divergent 64-bit values at
; this revision - confirm before relying on that.
define amdgpu_kernel void @sub_i64_varying(i64 addrspace(1)* %out) {
;
;
; GFX7LESS-LABEL: sub_i64_varying:
; GFX7LESS: ; %bb.0: ; %entry
; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0
; GFX7LESS-NEXT: s_mov_b32 m0, -1
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1]
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX7LESS-NEXT: s_endpgm
;
; GFX8-LABEL: sub_i64_varying:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NEXT: s_mov_b32 m0, -1
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1]
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: sub_i64_varying:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: sub_i64_varying:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: s_mov_b32 s3, 0x31016000
; GFX10-NEXT: s_mov_b32 s2, -1
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1]
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: sub_i64_varying:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-NEXT: s_mov_b32 s2, -1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1]
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
entry:
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %zext = zext i32 %lane to i64
  %old = atomicrmw sub i64 addrspace(3)* @local_var64, i64 %zext acq_rel
  store i64 %old, i64 addrspace(1)* %out
  ret void
}

; 32-bit atomic AND with a divergent operand: every lane ANDs its workitem id
; into @local_var32 and stores the value returned by the atomic to %out.
; What the expected output below shows, per run line:
;  * the run without -mcpu keeps one plain per-lane ds_and_rtn_b32;
;  * gfx8 / gfx9 seed inactive lanes with -1 (the AND identity), run a dpp
;    row_shr / row_bcast scan, read the wave-wide value from lane 63, shift the
;    scan by one lane (wave_shr:1), let only the lane whose mbcnt result is 0
;    issue the ds_and_rtn_b32, and then rebuild each lane's result with
;    v_readfirstlane + v_and;
;  * gfx10 / gfx11 follow the same scheme but use v_permlanex16 together with
;    v_readlane / v_writelane where gfx8 / gfx9 use row_bcast and wave_shr; the
;    wave32 runs feed the atomic from lane 31, the wave64 runs from lane 63.
define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) {
;
;
; GFX7LESS-LABEL: and_i32_varying:
; GFX7LESS: ; %bb.0: ; %entry
; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0
; GFX7LESS-NEXT: s_mov_b32 m0, -1
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: ds_and_rtn_b32 v0, v1, v0
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX7LESS-NEXT: s_endpgm
;
; GFX8-LABEL: and_i32_varying:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3
; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX8-NEXT: v_mov_b32_e32 v1, -1
; GFX8-NEXT: s_mov_b64 exec, s[2:3]
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_not_b64 exec, exec
; GFX8-NEXT: v_mov_b32_e32 v2, -1
; GFX8-NEXT: s_not_b64 exec, exec
; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX8-NEXT: s_nop 1
; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX8-NEXT: s_nop 1
; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX8-NEXT: s_nop 1
; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX8-NEXT: s_nop 1
; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
; GFX8-NEXT: s_nop 1
; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
; GFX8-NEXT: v_readlane_b32 s4, v2, 63
; GFX8-NEXT: s_nop 0
; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX8-NEXT: s_mov_b64 exec, s[2:3]
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
; GFX8-NEXT: ; implicit-def: $vgpr0
; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX8-NEXT: s_cbranch_execz .LBB14_2
; GFX8-NEXT: ; %bb.1:
; GFX8-NEXT: v_mov_b32_e32 v0, 0
; GFX8-NEXT: v_mov_b32_e32 v3, s4
; GFX8-NEXT: s_mov_b32 m0, -1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: ds_and_rtn_b32 v0, v0, v3
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: .LBB14_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s2, v0
; GFX8-NEXT: v_mov_b32_e32 v0, v1
; GFX8-NEXT: v_and_b32_e32 v0, s2, v0
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
; GFX8-NEXT: s_nop 0
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: and_i32_varying:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3
; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX9-NEXT: v_mov_b32_e32 v1, -1
; GFX9-NEXT: s_mov_b64 exec, s[2:3]
; GFX9-NEXT: v_mov_b32_e32 v2, v0
; GFX9-NEXT: s_not_b64 exec, exec
; GFX9-NEXT: v_mov_b32_e32 v2, -1
; GFX9-NEXT: s_not_b64 exec, exec
; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX9-NEXT: s_nop 1
; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX9-NEXT: s_nop 1
; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX9-NEXT: s_nop 1
; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX9-NEXT: s_nop 1
; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
; GFX9-NEXT: s_nop 1
; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
; GFX9-NEXT: v_readlane_b32 s4, v2, 63
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX9-NEXT: s_mov_b64 exec, s[2:3]
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
; GFX9-NEXT: ; implicit-def: $vgpr0
; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT: s_cbranch_execz .LBB14_2
; GFX9-NEXT: ; %bb.1:
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v3, s4
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: ds_and_rtn_b32 v0, v0, v3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: .LBB14_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s2, v0
; GFX9-NEXT: v_mov_b32_e32 v0, v1
; GFX9-NEXT: v_and_b32_e32 v0, s2, v0
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: and_i32_varying:
; GFX1064: ; %bb.0: ; %entry
; GFX1064-NEXT: v_mov_b32_e32 v1, v0
; GFX1064-NEXT: s_not_b64 exec, exec
; GFX1064-NEXT: v_mov_b32_e32 v1, -1
; GFX1064-NEXT: s_not_b64 exec, exec
; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX1064-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1064-NEXT: v_mov_b32_e32 v3, -1
; GFX1064-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1064-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1064-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX1064-NEXT: v_mov_b32_e32 v2, v1
; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1
; GFX1064-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1064-NEXT: v_readlane_b32 s4, v1, 31
; GFX1064-NEXT: v_mov_b32_e32 v2, s4
; GFX1064-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
; GFX1064-NEXT: v_readlane_b32 s4, v1, 15
; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1064-NEXT: s_mov_b64 exec, s[2:3]
; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX1064-NEXT: v_readlane_b32 s5, v1, 31
; GFX1064-NEXT: v_writelane_b32 v3, s4, 16
; GFX1064-NEXT: s_mov_b64 exec, s[2:3]
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX1064-NEXT: v_readlane_b32 s7, v1, 63
; GFX1064-NEXT: v_readlane_b32 s6, v1, 47
; GFX1064-NEXT: v_writelane_b32 v3, s5, 32
; GFX1064-NEXT: s_mov_b64 exec, s[2:3]
; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX1064-NEXT: v_writelane_b32 v3, s6, 48
; GFX1064-NEXT: s_mov_b64 exec, s[4:5]
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT: s_mov_b32 s2, -1
; GFX1064-NEXT: ; implicit-def: $vgpr0
; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB14_2
; GFX1064-NEXT: ; %bb.1:
; GFX1064-NEXT: v_mov_b32_e32 v0, 0
; GFX1064-NEXT: v_mov_b32_e32 v4, s7
; GFX1064-NEXT: s_mov_b32 s3, s7
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1064-NEXT: ds_and_rtn_b32 v0, v0, v4
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: .LBB14_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX1064-NEXT: v_readfirstlane_b32 s3, v0
; GFX1064-NEXT: v_mov_b32_e32 v0, v3
; GFX1064-NEXT: v_and_b32_e32 v0, s3, v0
; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: and_i32_varying:
; GFX1032: ; %bb.0: ; %entry
; GFX1032-NEXT: v_mov_b32_e32 v1, v0
; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1032-NEXT: v_mov_b32_e32 v1, -1
; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1032-NEXT: s_or_saveexec_b32 s2, -1
; GFX1032-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT: v_mov_b32_e32 v2, v1
; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1
; GFX1032-NEXT: s_mov_b32 exec_lo, s2
; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1032-NEXT: s_or_saveexec_b32 s2, -1
; GFX1032-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1032-NEXT: v_mov_b32_e32 v3, -1
; GFX1032-NEXT: v_readlane_b32 s3, v1, 15
; GFX1032-NEXT: v_readlane_b32 s4, v1, 31
; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT: s_mov_b32 exec_lo, s2
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1032-NEXT: s_or_saveexec_b32 s2, -1
; GFX1032-NEXT: v_writelane_b32 v3, s3, 16
; GFX1032-NEXT: s_mov_b32 exec_lo, s2
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: s_mov_b32 s2, -1
; GFX1032-NEXT: ; implicit-def: $vgpr0
; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB14_2
; GFX1032-NEXT: ; %bb.1:
; GFX1032-NEXT: v_mov_b32_e32 v0, 0
; GFX1032-NEXT: v_mov_b32_e32 v4, s4
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1032-NEXT: ds_and_rtn_b32 v0, v0, v4
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: .LBB14_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3
; GFX1032-NEXT: v_readfirstlane_b32 s3, v0
; GFX1032-NEXT: v_mov_b32_e32 v0, v3
; GFX1032-NEXT: v_and_b32_e32 v0, s3, v0
; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: and_i32_varying:
; GFX1164: ; %bb.0: ; %entry
; GFX1164-NEXT: v_mov_b32_e32 v1, v0
; GFX1164-NEXT: s_not_b64 exec, exec
; GFX1164-NEXT: v_mov_b32_e32 v1, -1
; GFX1164-NEXT: s_not_b64 exec, exec
; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1164-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1164-NEXT: v_mov_b32_e32 v3, -1
; GFX1164-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1164-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_mov_b32_e32 v2, v1
; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1164-NEXT: v_readlane_b32 s4, v1, 31
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_mov_b32_e32 v2, s4
; GFX1164-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_readlane_b32 s4, v1, 15
; GFX1164-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1164-NEXT: s_mov_b64 exec, s[2:3]
; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX1164-NEXT: v_readlane_b32 s5, v1, 31
; GFX1164-NEXT: v_writelane_b32 v3, s4, 16
; GFX1164-NEXT: s_mov_b64 exec, s[2:3]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX1164-NEXT: v_readlane_b32 s7, v1, 63
; GFX1164-NEXT: v_readlane_b32 s6, v1, 47
; GFX1164-NEXT: v_writelane_b32 v3, s5, 32
; GFX1164-NEXT: s_mov_b64 exec, s[2:3]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1164-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX1164-NEXT: v_writelane_b32 v3, s6, 48
; GFX1164-NEXT: s_mov_b64 exec, s[4:5]
; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1164-NEXT: s_mov_b32 s2, -1
; GFX1164-NEXT: ; implicit-def: $vgpr0
; GFX1164-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1164-NEXT: s_cbranch_execz .LBB14_2
; GFX1164-NEXT: ; %bb.1:
; GFX1164-NEXT: v_mov_b32_e32 v0, 0
; GFX1164-NEXT: v_mov_b32_e32 v4, s7
; GFX1164-NEXT: s_mov_b32 s3, s7
; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1164-NEXT: ds_and_rtn_b32 v0, v0, v4
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB14_2:
; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX1164-NEXT: v_readfirstlane_b32 s3, v0
; GFX1164-NEXT: v_mov_b32_e32 v0, v3
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_and_b32_e32 v0, s3, v0
; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: and_i32_varying:
; GFX1132: ; %bb.0: ; %entry
; GFX1132-NEXT: v_mov_b32_e32 v1, v0
; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1132-NEXT: v_mov_b32_e32 v1, -1
; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1132-NEXT: s_or_saveexec_b32 s2, -1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1132-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_mov_b32_e32 v2, v1
; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1
; GFX1132-NEXT: s_mov_b32 exec_lo, s2
; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX1132-NEXT: s_or_saveexec_b32 s2, -1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1132-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1132-NEXT: v_mov_b32_e32 v3, -1
; GFX1132-NEXT: v_readlane_b32 s3, v1, 15
; GFX1132-NEXT: v_readlane_b32 s4, v1, 31
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132-NEXT: s_mov_b32 exec_lo, s2
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1132-NEXT: s_or_saveexec_b32 s2, -1
; GFX1132-NEXT: v_writelane_b32 v3, s3, 16
; GFX1132-NEXT: s_mov_b32 exec_lo, s2
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1132-NEXT: s_mov_b32 s2, -1
; GFX1132-NEXT: ; implicit-def: $vgpr0
; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo
; GFX1132-NEXT: s_cbranch_execz .LBB14_2
; GFX1132-NEXT: ; %bb.1:
; GFX1132-NEXT: v_mov_b32_e32 v0, 0
; GFX1132-NEXT: v_mov_b32_e32 v4, s4
; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1132-NEXT: ds_and_rtn_b32 v0, v0, v4
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: .LBB14_2:
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3
; GFX1132-NEXT: v_readfirstlane_b32 s3, v0
; GFX1132-NEXT: v_mov_b32_e32 v0, v3
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_and_b32_e32 v0, s3, v0
; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT: s_endpgm
entry:
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %old = atomicrmw and i32 addrspace(3)* @local_var32, i32 %lane acq_rel
  store i32 %old, i32 addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) {
;
;
; GFX7LESS-LABEL: or_i32_varying:
; GFX7LESS: ; %bb.0: ; %entry
; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0
; GFX7LESS-NEXT: s_mov_b32 m0, -1
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: ds_or_rtn_b32 v0, v1, v0
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX7LESS-NEXT: s_endpgm
;
; GFX8-LABEL: or_i32_varying:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX8-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NEXT: s_mov_b64 exec, s[2:3]
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_not_b64 exec, exec
; GFX8-NEXT: v_mov_b32_e32 v2, 0
; GFX8-NEXT: s_not_b64 exec, exec
; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX8-NEXT: s_nop 1
; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX8-NEXT: s_nop 1
; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
bound_ctrl:1 3770; GFX8-NEXT: s_nop 1 3771; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 3772; GFX8-NEXT: s_nop 1 3773; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 3774; GFX8-NEXT: s_nop 1 3775; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 3776; GFX8-NEXT: v_readlane_b32 s4, v2, 63 3777; GFX8-NEXT: s_nop 0 3778; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 3779; GFX8-NEXT: s_mov_b64 exec, s[2:3] 3780; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 3781; GFX8-NEXT: ; implicit-def: $vgpr0 3782; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 3783; GFX8-NEXT: s_cbranch_execz .LBB15_2 3784; GFX8-NEXT: ; %bb.1: 3785; GFX8-NEXT: v_mov_b32_e32 v0, 0 3786; GFX8-NEXT: v_mov_b32_e32 v3, s4 3787; GFX8-NEXT: s_mov_b32 m0, -1 3788; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3789; GFX8-NEXT: ds_or_rtn_b32 v0, v0, v3 3790; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3791; GFX8-NEXT: .LBB15_2: 3792; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 3793; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3794; GFX8-NEXT: v_readfirstlane_b32 s2, v0 3795; GFX8-NEXT: v_mov_b32_e32 v0, v1 3796; GFX8-NEXT: v_or_b32_e32 v0, s2, v0 3797; GFX8-NEXT: s_mov_b32 s3, 0xf000 3798; GFX8-NEXT: s_mov_b32 s2, -1 3799; GFX8-NEXT: s_nop 0 3800; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 3801; GFX8-NEXT: s_endpgm 3802; 3803; GFX9-LABEL: or_i32_varying: 3804; GFX9: ; %bb.0: ; %entry 3805; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3806; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 3807; GFX9-NEXT: v_mov_b32_e32 v1, 0 3808; GFX9-NEXT: s_mov_b64 exec, s[2:3] 3809; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 3810; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 3811; GFX9-NEXT: v_mov_b32_e32 v2, v0 3812; GFX9-NEXT: s_not_b64 exec, exec 3813; GFX9-NEXT: v_mov_b32_e32 v2, 0 3814; GFX9-NEXT: s_not_b64 exec, exec 3815; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 3816; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 
bound_ctrl:1 3817; GFX9-NEXT: s_nop 1 3818; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 3819; GFX9-NEXT: s_nop 1 3820; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 3821; GFX9-NEXT: s_nop 1 3822; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 3823; GFX9-NEXT: s_nop 1 3824; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 3825; GFX9-NEXT: s_nop 1 3826; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 3827; GFX9-NEXT: v_readlane_b32 s4, v2, 63 3828; GFX9-NEXT: s_nop 0 3829; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 3830; GFX9-NEXT: s_mov_b64 exec, s[2:3] 3831; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 3832; GFX9-NEXT: ; implicit-def: $vgpr0 3833; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 3834; GFX9-NEXT: s_cbranch_execz .LBB15_2 3835; GFX9-NEXT: ; %bb.1: 3836; GFX9-NEXT: v_mov_b32_e32 v0, 0 3837; GFX9-NEXT: v_mov_b32_e32 v3, s4 3838; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3839; GFX9-NEXT: ds_or_rtn_b32 v0, v0, v3 3840; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3841; GFX9-NEXT: .LBB15_2: 3842; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 3843; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3844; GFX9-NEXT: v_readfirstlane_b32 s2, v0 3845; GFX9-NEXT: v_mov_b32_e32 v0, v1 3846; GFX9-NEXT: v_or_b32_e32 v0, s2, v0 3847; GFX9-NEXT: s_mov_b32 s3, 0xf000 3848; GFX9-NEXT: s_mov_b32 s2, -1 3849; GFX9-NEXT: s_nop 0 3850; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 3851; GFX9-NEXT: s_endpgm 3852; 3853; GFX1064-LABEL: or_i32_varying: 3854; GFX1064: ; %bb.0: ; %entry 3855; GFX1064-NEXT: v_mov_b32_e32 v1, v0 3856; GFX1064-NEXT: s_not_b64 exec, exec 3857; GFX1064-NEXT: v_mov_b32_e32 v1, 0 3858; GFX1064-NEXT: s_not_b64 exec, exec 3859; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 3860; GFX1064-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 3861; GFX1064-NEXT: v_mov_b32_e32 v3, 0 3862; 
GFX1064-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 3863; GFX1064-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 3864; GFX1064-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 3865; GFX1064-NEXT: v_mov_b32_e32 v2, v1 3866; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 3867; GFX1064-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 3868; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 3869; GFX1064-NEXT: v_mov_b32_e32 v2, s4 3870; GFX1064-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 3871; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 3872; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 3873; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 3874; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3875; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 3876; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 3877; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 3878; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 3879; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3880; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 3881; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 3882; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 3883; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 3884; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 3885; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 3886; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 3887; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 3888; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 3889; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3890; GFX1064-NEXT: s_mov_b32 s2, -1 3891; GFX1064-NEXT: ; implicit-def: $vgpr0 3892; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 3893; GFX1064-NEXT: s_cbranch_execz .LBB15_2 3894; GFX1064-NEXT: ; %bb.1: 3895; GFX1064-NEXT: v_mov_b32_e32 v0, 0 3896; GFX1064-NEXT: v_mov_b32_e32 v4, s7 3897; GFX1064-NEXT: s_mov_b32 s3, s7 3898; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3899; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 3900; 
GFX1064-NEXT: ds_or_rtn_b32 v0, v0, v4 3901; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3902; GFX1064-NEXT: buffer_gl0_inv 3903; GFX1064-NEXT: .LBB15_2: 3904; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 3905; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 3906; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 3907; GFX1064-NEXT: v_mov_b32_e32 v0, v3 3908; GFX1064-NEXT: v_or_b32_e32 v0, s3, v0 3909; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 3910; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3911; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 3912; GFX1064-NEXT: s_endpgm 3913; 3914; GFX1032-LABEL: or_i32_varying: 3915; GFX1032: ; %bb.0: ; %entry 3916; GFX1032-NEXT: v_mov_b32_e32 v1, v0 3917; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 3918; GFX1032-NEXT: v_mov_b32_e32 v1, 0 3919; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 3920; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3921; GFX1032-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 3922; GFX1032-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 3923; GFX1032-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 3924; GFX1032-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 3925; GFX1032-NEXT: v_mov_b32_e32 v2, v1 3926; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 3927; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3928; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3929; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3930; GFX1032-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 3931; GFX1032-NEXT: v_mov_b32_e32 v3, 0 3932; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 3933; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 3934; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 3935; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3936; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3937; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3938; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 3939; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3940; 
GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 3941; GFX1032-NEXT: s_mov_b32 s2, -1 3942; GFX1032-NEXT: ; implicit-def: $vgpr0 3943; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 3944; GFX1032-NEXT: s_cbranch_execz .LBB15_2 3945; GFX1032-NEXT: ; %bb.1: 3946; GFX1032-NEXT: v_mov_b32_e32 v0, 0 3947; GFX1032-NEXT: v_mov_b32_e32 v4, s4 3948; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3949; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 3950; GFX1032-NEXT: ds_or_rtn_b32 v0, v0, v4 3951; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3952; GFX1032-NEXT: buffer_gl0_inv 3953; GFX1032-NEXT: .LBB15_2: 3954; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 3955; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 3956; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 3957; GFX1032-NEXT: v_mov_b32_e32 v0, v3 3958; GFX1032-NEXT: v_or_b32_e32 v0, s3, v0 3959; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 3960; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3961; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 3962; GFX1032-NEXT: s_endpgm 3963; 3964; GFX1164-LABEL: or_i32_varying: 3965; GFX1164: ; %bb.0: ; %entry 3966; GFX1164-NEXT: v_mov_b32_e32 v1, v0 3967; GFX1164-NEXT: s_not_b64 exec, exec 3968; GFX1164-NEXT: v_mov_b32_e32 v1, 0 3969; GFX1164-NEXT: s_not_b64 exec, exec 3970; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 3971; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) 3972; GFX1164-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 3973; GFX1164-NEXT: v_mov_b32_e32 v3, 0 3974; GFX1164-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 3975; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 3976; GFX1164-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 3977; GFX1164-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 3978; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 3979; GFX1164-NEXT: v_mov_b32_e32 v2, v1 
3980; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1 3981; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 3982; GFX1164-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 3983; GFX1164-NEXT: v_readlane_b32 s4, v1, 31 3984; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 3985; GFX1164-NEXT: v_mov_b32_e32 v2, s4 3986; GFX1164-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 3987; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) 3988; GFX1164-NEXT: v_readlane_b32 s4, v1, 15 3989; GFX1164-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 3990; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 3991; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 3992; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 3993; GFX1164-NEXT: v_readlane_b32 s5, v1, 31 3994; GFX1164-NEXT: v_writelane_b32 v3, s4, 16 3995; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 3996; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 3997; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3998; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 3999; GFX1164-NEXT: v_readlane_b32 s7, v1, 63 4000; GFX1164-NEXT: v_readlane_b32 s6, v1, 47 4001; GFX1164-NEXT: v_writelane_b32 v3, s5, 32 4002; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 4003; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) 4004; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 4005; GFX1164-NEXT: s_or_saveexec_b64 s[4:5], -1 4006; GFX1164-NEXT: v_writelane_b32 v3, s6, 48 4007; GFX1164-NEXT: s_mov_b64 exec, s[4:5] 4008; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4009; GFX1164-NEXT: s_mov_b32 s2, -1 4010; GFX1164-NEXT: ; implicit-def: $vgpr0 4011; GFX1164-NEXT: s_and_saveexec_b64 s[4:5], vcc 4012; GFX1164-NEXT: s_cbranch_execz .LBB15_2 4013; GFX1164-NEXT: ; %bb.1: 4014; GFX1164-NEXT: v_mov_b32_e32 v0, 0 4015; GFX1164-NEXT: v_mov_b32_e32 v4, s7 4016; GFX1164-NEXT: s_mov_b32 s3, s7 4017; GFX1164-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) 4018; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 4019; GFX1164-NEXT: ds_or_rtn_b32 v0, v0, v4 4020; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 4021; GFX1164-NEXT: buffer_gl0_inv 4022; GFX1164-NEXT: .LBB15_2: 4023; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] 4024; GFX1164-NEXT: v_readfirstlane_b32 s3, v0 4025; GFX1164-NEXT: v_mov_b32_e32 v0, v3 4026; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) 4027; GFX1164-NEXT: v_or_b32_e32 v0, s3, v0 4028; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 4029; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 4030; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 4031; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 4032; GFX1164-NEXT: s_endpgm 4033; 4034; GFX1132-LABEL: or_i32_varying: 4035; GFX1132: ; %bb.0: ; %entry 4036; GFX1132-NEXT: v_mov_b32_e32 v1, v0 4037; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo 4038; GFX1132-NEXT: v_mov_b32_e32 v1, 0 4039; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo 4040; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 4041; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 4042; GFX1132-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 4043; GFX1132-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 4044; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 4045; GFX1132-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 4046; GFX1132-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 4047; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 4048; GFX1132-NEXT: v_mov_b32_e32 v2, v1 4049; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1 4050; GFX1132-NEXT: s_mov_b32 exec_lo, s2 4051; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 4052; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 4053; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) 4054; GFX1132-NEXT: v_or_b32_dpp v1, v2, v1 
quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 4055; GFX1132-NEXT: v_mov_b32_e32 v3, 0 4056; GFX1132-NEXT: v_readlane_b32 s3, v1, 15 4057; GFX1132-NEXT: v_readlane_b32 s4, v1, 31 4058; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) 4059; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 4060; GFX1132-NEXT: s_mov_b32 exec_lo, s2 4061; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4062; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 4063; GFX1132-NEXT: v_writelane_b32 v3, s3, 16 4064; GFX1132-NEXT: s_mov_b32 exec_lo, s2 4065; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) 4066; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 4067; GFX1132-NEXT: s_mov_b32 s2, -1 4068; GFX1132-NEXT: ; implicit-def: $vgpr0 4069; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo 4070; GFX1132-NEXT: s_cbranch_execz .LBB15_2 4071; GFX1132-NEXT: ; %bb.1: 4072; GFX1132-NEXT: v_mov_b32_e32 v0, 0 4073; GFX1132-NEXT: v_mov_b32_e32 v4, s4 4074; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4075; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 4076; GFX1132-NEXT: ds_or_rtn_b32 v0, v0, v4 4077; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 4078; GFX1132-NEXT: buffer_gl0_inv 4079; GFX1132-NEXT: .LBB15_2: 4080; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 4081; GFX1132-NEXT: v_readfirstlane_b32 s3, v0 4082; GFX1132-NEXT: v_mov_b32_e32 v0, v3 4083; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) 4084; GFX1132-NEXT: v_or_b32_e32 v0, s3, v0 4085; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 4086; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 4087; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 4088; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 4089; GFX1132-NEXT: s_endpgm 4090entry: 4091 %lane = call i32 @llvm.amdgcn.workitem.id.x() 4092 %old = atomicrmw or i32 addrspace(3)* @local_var32, i32 %lane acq_rel 4093 store i32 %old, i32 addrspace(1)* %out 4094 ret void 4095} 4096 4097define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) { 4098; 4099; 4100; GFX7LESS-LABEL: 
xor_i32_varying: 4101; GFX7LESS: ; %bb.0: ; %entry 4102; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 4103; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 4104; GFX7LESS-NEXT: s_mov_b32 m0, -1 4105; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 4106; GFX7LESS-NEXT: ds_xor_rtn_b32 v0, v1, v0 4107; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 4108; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 4109; GFX7LESS-NEXT: s_mov_b32 s2, -1 4110; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 4111; GFX7LESS-NEXT: s_endpgm 4112; 4113; GFX8-LABEL: xor_i32_varying: 4114; GFX8: ; %bb.0: ; %entry 4115; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4116; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 4117; GFX8-NEXT: v_mov_b32_e32 v1, 0 4118; GFX8-NEXT: s_mov_b64 exec, s[2:3] 4119; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 4120; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 4121; GFX8-NEXT: v_mov_b32_e32 v2, v0 4122; GFX8-NEXT: s_not_b64 exec, exec 4123; GFX8-NEXT: v_mov_b32_e32 v2, 0 4124; GFX8-NEXT: s_not_b64 exec, exec 4125; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 4126; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 4127; GFX8-NEXT: s_nop 1 4128; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 4129; GFX8-NEXT: s_nop 1 4130; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 4131; GFX8-NEXT: s_nop 1 4132; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 4133; GFX8-NEXT: s_nop 1 4134; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 4135; GFX8-NEXT: s_nop 1 4136; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 4137; GFX8-NEXT: v_readlane_b32 s4, v2, 63 4138; GFX8-NEXT: s_nop 0 4139; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 4140; GFX8-NEXT: s_mov_b64 exec, s[2:3] 4141; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 4142; GFX8-NEXT: ; implicit-def: $vgpr0 4143; GFX8-NEXT: s_and_saveexec_b64 
s[2:3], vcc 4144; GFX8-NEXT: s_cbranch_execz .LBB16_2 4145; GFX8-NEXT: ; %bb.1: 4146; GFX8-NEXT: v_mov_b32_e32 v0, 0 4147; GFX8-NEXT: v_mov_b32_e32 v3, s4 4148; GFX8-NEXT: s_mov_b32 m0, -1 4149; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4150; GFX8-NEXT: ds_xor_rtn_b32 v0, v0, v3 4151; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4152; GFX8-NEXT: .LBB16_2: 4153; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 4154; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4155; GFX8-NEXT: v_readfirstlane_b32 s2, v0 4156; GFX8-NEXT: v_mov_b32_e32 v0, v1 4157; GFX8-NEXT: v_xor_b32_e32 v0, s2, v0 4158; GFX8-NEXT: s_mov_b32 s3, 0xf000 4159; GFX8-NEXT: s_mov_b32 s2, -1 4160; GFX8-NEXT: s_nop 0 4161; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 4162; GFX8-NEXT: s_endpgm 4163; 4164; GFX9-LABEL: xor_i32_varying: 4165; GFX9: ; %bb.0: ; %entry 4166; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4167; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 4168; GFX9-NEXT: v_mov_b32_e32 v1, 0 4169; GFX9-NEXT: s_mov_b64 exec, s[2:3] 4170; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 4171; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 4172; GFX9-NEXT: v_mov_b32_e32 v2, v0 4173; GFX9-NEXT: s_not_b64 exec, exec 4174; GFX9-NEXT: v_mov_b32_e32 v2, 0 4175; GFX9-NEXT: s_not_b64 exec, exec 4176; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 4177; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 4178; GFX9-NEXT: s_nop 1 4179; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 4180; GFX9-NEXT: s_nop 1 4181; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 4182; GFX9-NEXT: s_nop 1 4183; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 4184; GFX9-NEXT: s_nop 1 4185; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 4186; GFX9-NEXT: s_nop 1 4187; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 4188; GFX9-NEXT: v_readlane_b32 s4, v2, 63 4189; GFX9-NEXT: s_nop 0 4190; 
GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 4191; GFX9-NEXT: s_mov_b64 exec, s[2:3] 4192; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 4193; GFX9-NEXT: ; implicit-def: $vgpr0 4194; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 4195; GFX9-NEXT: s_cbranch_execz .LBB16_2 4196; GFX9-NEXT: ; %bb.1: 4197; GFX9-NEXT: v_mov_b32_e32 v0, 0 4198; GFX9-NEXT: v_mov_b32_e32 v3, s4 4199; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4200; GFX9-NEXT: ds_xor_rtn_b32 v0, v0, v3 4201; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4202; GFX9-NEXT: .LBB16_2: 4203; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 4204; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4205; GFX9-NEXT: v_readfirstlane_b32 s2, v0 4206; GFX9-NEXT: v_mov_b32_e32 v0, v1 4207; GFX9-NEXT: v_xor_b32_e32 v0, s2, v0 4208; GFX9-NEXT: s_mov_b32 s3, 0xf000 4209; GFX9-NEXT: s_mov_b32 s2, -1 4210; GFX9-NEXT: s_nop 0 4211; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 4212; GFX9-NEXT: s_endpgm 4213; 4214; GFX1064-LABEL: xor_i32_varying: 4215; GFX1064: ; %bb.0: ; %entry 4216; GFX1064-NEXT: v_mov_b32_e32 v1, v0 4217; GFX1064-NEXT: s_not_b64 exec, exec 4218; GFX1064-NEXT: v_mov_b32_e32 v1, 0 4219; GFX1064-NEXT: s_not_b64 exec, exec 4220; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 4221; GFX1064-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 4222; GFX1064-NEXT: v_mov_b32_e32 v3, 0 4223; GFX1064-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 4224; GFX1064-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 4225; GFX1064-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 4226; GFX1064-NEXT: v_mov_b32_e32 v2, v1 4227; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 4228; GFX1064-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 4229; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 4230; GFX1064-NEXT: v_mov_b32_e32 v2, s4 4231; GFX1064-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 4232; 
GFX1064-NEXT: v_readlane_b32 s4, v1, 15 4233; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 4234; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 4235; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4236; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 4237; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 4238; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 4239; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 4240; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4241; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 4242; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 4243; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 4244; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 4245; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 4246; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 4247; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 4248; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 4249; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 4250; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4251; GFX1064-NEXT: s_mov_b32 s2, -1 4252; GFX1064-NEXT: ; implicit-def: $vgpr0 4253; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 4254; GFX1064-NEXT: s_cbranch_execz .LBB16_2 4255; GFX1064-NEXT: ; %bb.1: 4256; GFX1064-NEXT: v_mov_b32_e32 v0, 0 4257; GFX1064-NEXT: v_mov_b32_e32 v4, s7 4258; GFX1064-NEXT: s_mov_b32 s3, s7 4259; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4260; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 4261; GFX1064-NEXT: ds_xor_rtn_b32 v0, v0, v4 4262; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 4263; GFX1064-NEXT: buffer_gl0_inv 4264; GFX1064-NEXT: .LBB16_2: 4265; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 4266; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 4267; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 4268; GFX1064-NEXT: v_mov_b32_e32 v0, v3 4269; GFX1064-NEXT: v_xor_b32_e32 v0, s3, v0 4270; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 4271; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 4272; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 4273; GFX1064-NEXT: s_endpgm 4274; 4275; GFX1032-LABEL: xor_i32_varying: 4276; GFX1032: ; %bb.0: ; %entry 4277; GFX1032-NEXT: 
v_mov_b32_e32 v1, v0 4278; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 4279; GFX1032-NEXT: v_mov_b32_e32 v1, 0 4280; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 4281; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 4282; GFX1032-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 4283; GFX1032-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 4284; GFX1032-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 4285; GFX1032-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 4286; GFX1032-NEXT: v_mov_b32_e32 v2, v1 4287; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 4288; GFX1032-NEXT: s_mov_b32 exec_lo, s2 4289; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4290; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 4291; GFX1032-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 4292; GFX1032-NEXT: v_mov_b32_e32 v3, 0 4293; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 4294; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 4295; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 4296; GFX1032-NEXT: s_mov_b32 exec_lo, s2 4297; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4298; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 4299; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 4300; GFX1032-NEXT: s_mov_b32 exec_lo, s2 4301; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 4302; GFX1032-NEXT: s_mov_b32 s2, -1 4303; GFX1032-NEXT: ; implicit-def: $vgpr0 4304; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 4305; GFX1032-NEXT: s_cbranch_execz .LBB16_2 4306; GFX1032-NEXT: ; %bb.1: 4307; GFX1032-NEXT: v_mov_b32_e32 v0, 0 4308; GFX1032-NEXT: v_mov_b32_e32 v4, s4 4309; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4310; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 4311; GFX1032-NEXT: ds_xor_rtn_b32 v0, v0, v4 4312; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 4313; GFX1032-NEXT: buffer_gl0_inv 4314; GFX1032-NEXT: .LBB16_2: 4315; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 4316; GFX1032-NEXT: 
s_or_b32 exec_lo, exec_lo, s3 4317; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 4318; GFX1032-NEXT: v_mov_b32_e32 v0, v3 4319; GFX1032-NEXT: v_xor_b32_e32 v0, s3, v0 4320; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 4321; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 4322; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 4323; GFX1032-NEXT: s_endpgm 4324; 4325; GFX1164-LABEL: xor_i32_varying: 4326; GFX1164: ; %bb.0: ; %entry 4327; GFX1164-NEXT: v_mov_b32_e32 v1, v0 4328; GFX1164-NEXT: s_not_b64 exec, exec 4329; GFX1164-NEXT: v_mov_b32_e32 v1, 0 4330; GFX1164-NEXT: s_not_b64 exec, exec 4331; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 4332; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) 4333; GFX1164-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 4334; GFX1164-NEXT: v_mov_b32_e32 v3, 0 4335; GFX1164-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 4336; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 4337; GFX1164-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 4338; GFX1164-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 4339; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 4340; GFX1164-NEXT: v_mov_b32_e32 v2, v1 4341; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1 4342; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 4343; GFX1164-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 4344; GFX1164-NEXT: v_readlane_b32 s4, v1, 31 4345; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 4346; GFX1164-NEXT: v_mov_b32_e32 v2, s4 4347; GFX1164-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 4348; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) 4349; GFX1164-NEXT: v_readlane_b32 s4, v1, 15 4350; GFX1164-NEXT: v_mov_b32_dpp 
v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 4351; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 4352; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 4353; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 4354; GFX1164-NEXT: v_readlane_b32 s5, v1, 31 4355; GFX1164-NEXT: v_writelane_b32 v3, s4, 16 4356; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 4357; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 4358; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4359; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 4360; GFX1164-NEXT: v_readlane_b32 s7, v1, 63 4361; GFX1164-NEXT: v_readlane_b32 s6, v1, 47 4362; GFX1164-NEXT: v_writelane_b32 v3, s5, 32 4363; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 4364; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) 4365; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 4366; GFX1164-NEXT: s_or_saveexec_b64 s[4:5], -1 4367; GFX1164-NEXT: v_writelane_b32 v3, s6, 48 4368; GFX1164-NEXT: s_mov_b64 exec, s[4:5] 4369; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4370; GFX1164-NEXT: s_mov_b32 s2, -1 4371; GFX1164-NEXT: ; implicit-def: $vgpr0 4372; GFX1164-NEXT: s_and_saveexec_b64 s[4:5], vcc 4373; GFX1164-NEXT: s_cbranch_execz .LBB16_2 4374; GFX1164-NEXT: ; %bb.1: 4375; GFX1164-NEXT: v_mov_b32_e32 v0, 0 4376; GFX1164-NEXT: v_mov_b32_e32 v4, s7 4377; GFX1164-NEXT: s_mov_b32 s3, s7 4378; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4379; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 4380; GFX1164-NEXT: ds_xor_rtn_b32 v0, v0, v4 4381; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 4382; GFX1164-NEXT: buffer_gl0_inv 4383; GFX1164-NEXT: .LBB16_2: 4384; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] 4385; GFX1164-NEXT: v_readfirstlane_b32 s3, v0 4386; GFX1164-NEXT: v_mov_b32_e32 v0, v3 4387; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) 4388; GFX1164-NEXT: v_xor_b32_e32 v0, s3, v0 4389; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 4390; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 4391; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 4392; GFX1164-NEXT: s_sendmsg 
sendmsg(MSG_DEALLOC_VGPRS) 4393; GFX1164-NEXT: s_endpgm 4394; 4395; GFX1132-LABEL: xor_i32_varying: 4396; GFX1132: ; %bb.0: ; %entry 4397; GFX1132-NEXT: v_mov_b32_e32 v1, v0 4398; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo 4399; GFX1132-NEXT: v_mov_b32_e32 v1, 0 4400; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo 4401; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 4402; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 4403; GFX1132-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 4404; GFX1132-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 4405; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 4406; GFX1132-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 4407; GFX1132-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 4408; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 4409; GFX1132-NEXT: v_mov_b32_e32 v2, v1 4410; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1 4411; GFX1132-NEXT: s_mov_b32 exec_lo, s2 4412; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 4413; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 4414; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) 4415; GFX1132-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 4416; GFX1132-NEXT: v_mov_b32_e32 v3, 0 4417; GFX1132-NEXT: v_readlane_b32 s3, v1, 15 4418; GFX1132-NEXT: v_readlane_b32 s4, v1, 31 4419; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) 4420; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 4421; GFX1132-NEXT: s_mov_b32 exec_lo, s2 4422; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4423; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 4424; GFX1132-NEXT: v_writelane_b32 v3, s3, 16 4425; GFX1132-NEXT: s_mov_b32 exec_lo, s2 4426; GFX1132-NEXT: s_delay_alu 
instid0(VALU_DEP_2) 4427; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 4428; GFX1132-NEXT: s_mov_b32 s2, -1 4429; GFX1132-NEXT: ; implicit-def: $vgpr0 4430; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo 4431; GFX1132-NEXT: s_cbranch_execz .LBB16_2 4432; GFX1132-NEXT: ; %bb.1: 4433; GFX1132-NEXT: v_mov_b32_e32 v0, 0 4434; GFX1132-NEXT: v_mov_b32_e32 v4, s4 4435; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4436; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 4437; GFX1132-NEXT: ds_xor_rtn_b32 v0, v0, v4 4438; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 4439; GFX1132-NEXT: buffer_gl0_inv 4440; GFX1132-NEXT: .LBB16_2: 4441; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 4442; GFX1132-NEXT: v_readfirstlane_b32 s3, v0 4443; GFX1132-NEXT: v_mov_b32_e32 v0, v3 4444; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) 4445; GFX1132-NEXT: v_xor_b32_e32 v0, s3, v0 4446; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 4447; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 4448; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 4449; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 4450; GFX1132-NEXT: s_endpgm 4451entry: 4452 %lane = call i32 @llvm.amdgcn.workitem.id.x() 4453 %old = atomicrmw xor i32 addrspace(3)* @local_var32, i32 %lane acq_rel 4454 store i32 %old, i32 addrspace(1)* %out 4455 ret void 4456} 4457 4458define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) { 4459; 4460; 4461; GFX7LESS-LABEL: max_i32_varying: 4462; GFX7LESS: ; %bb.0: ; %entry 4463; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 4464; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 4465; GFX7LESS-NEXT: s_mov_b32 m0, -1 4466; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 4467; GFX7LESS-NEXT: ds_max_rtn_i32 v0, v1, v0 4468; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 4469; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 4470; GFX7LESS-NEXT: s_mov_b32 s2, -1 4471; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 4472; GFX7LESS-NEXT: s_endpgm 4473; 4474; GFX8-LABEL: max_i32_varying: 4475; GFX8: ; %bb.0: ; %entry 4476; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4477; 
GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 4478; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 4479; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 4480; GFX8-NEXT: v_bfrev_b32_e32 v1, 1 4481; GFX8-NEXT: s_mov_b64 exec, s[2:3] 4482; GFX8-NEXT: v_mov_b32_e32 v2, v0 4483; GFX8-NEXT: s_not_b64 exec, exec 4484; GFX8-NEXT: v_bfrev_b32_e32 v2, 1 4485; GFX8-NEXT: s_not_b64 exec, exec 4486; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 4487; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 4488; GFX8-NEXT: s_nop 1 4489; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 4490; GFX8-NEXT: s_nop 1 4491; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 4492; GFX8-NEXT: s_nop 1 4493; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 4494; GFX8-NEXT: s_nop 1 4495; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 4496; GFX8-NEXT: s_nop 1 4497; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 4498; GFX8-NEXT: v_readlane_b32 s4, v2, 63 4499; GFX8-NEXT: s_nop 0 4500; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 4501; GFX8-NEXT: s_mov_b64 exec, s[2:3] 4502; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 4503; GFX8-NEXT: ; implicit-def: $vgpr0 4504; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 4505; GFX8-NEXT: s_cbranch_execz .LBB17_2 4506; GFX8-NEXT: ; %bb.1: 4507; GFX8-NEXT: v_mov_b32_e32 v0, 0 4508; GFX8-NEXT: v_mov_b32_e32 v3, s4 4509; GFX8-NEXT: s_mov_b32 m0, -1 4510; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4511; GFX8-NEXT: ds_max_rtn_i32 v0, v0, v3 4512; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4513; GFX8-NEXT: .LBB17_2: 4514; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 4515; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4516; GFX8-NEXT: v_readfirstlane_b32 s2, v0 4517; GFX8-NEXT: v_mov_b32_e32 v0, v1 4518; GFX8-NEXT: v_max_i32_e32 v0, s2, v0 4519; GFX8-NEXT: s_mov_b32 s3, 0xf000 4520; GFX8-NEXT: s_mov_b32 s2, -1 4521; GFX8-NEXT: s_nop 0 4522; GFX8-NEXT: 
buffer_store_dword v0, off, s[0:3], 0 4523; GFX8-NEXT: s_endpgm 4524; 4525; GFX9-LABEL: max_i32_varying: 4526; GFX9: ; %bb.0: ; %entry 4527; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4528; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 4529; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 4530; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 4531; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 4532; GFX9-NEXT: s_mov_b64 exec, s[2:3] 4533; GFX9-NEXT: v_mov_b32_e32 v2, v0 4534; GFX9-NEXT: s_not_b64 exec, exec 4535; GFX9-NEXT: v_bfrev_b32_e32 v2, 1 4536; GFX9-NEXT: s_not_b64 exec, exec 4537; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 4538; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 4539; GFX9-NEXT: s_nop 1 4540; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 4541; GFX9-NEXT: s_nop 1 4542; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 4543; GFX9-NEXT: s_nop 1 4544; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 4545; GFX9-NEXT: s_nop 1 4546; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 4547; GFX9-NEXT: s_nop 1 4548; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 4549; GFX9-NEXT: v_readlane_b32 s4, v2, 63 4550; GFX9-NEXT: s_nop 0 4551; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 4552; GFX9-NEXT: s_mov_b64 exec, s[2:3] 4553; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 4554; GFX9-NEXT: ; implicit-def: $vgpr0 4555; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 4556; GFX9-NEXT: s_cbranch_execz .LBB17_2 4557; GFX9-NEXT: ; %bb.1: 4558; GFX9-NEXT: v_mov_b32_e32 v0, 0 4559; GFX9-NEXT: v_mov_b32_e32 v3, s4 4560; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4561; GFX9-NEXT: ds_max_rtn_i32 v0, v0, v3 4562; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4563; GFX9-NEXT: .LBB17_2: 4564; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 4565; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4566; GFX9-NEXT: v_readfirstlane_b32 s2, v0 4567; GFX9-NEXT: v_mov_b32_e32 v0, v1 4568; 
GFX9-NEXT: v_max_i32_e32 v0, s2, v0 4569; GFX9-NEXT: s_mov_b32 s3, 0xf000 4570; GFX9-NEXT: s_mov_b32 s2, -1 4571; GFX9-NEXT: s_nop 0 4572; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 4573; GFX9-NEXT: s_endpgm 4574; 4575; GFX1064-LABEL: max_i32_varying: 4576; GFX1064: ; %bb.0: ; %entry 4577; GFX1064-NEXT: v_mov_b32_e32 v1, v0 4578; GFX1064-NEXT: s_not_b64 exec, exec 4579; GFX1064-NEXT: v_bfrev_b32_e32 v1, 1 4580; GFX1064-NEXT: s_not_b64 exec, exec 4581; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 4582; GFX1064-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 4583; GFX1064-NEXT: v_bfrev_b32_e32 v3, 1 4584; GFX1064-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 4585; GFX1064-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 4586; GFX1064-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 4587; GFX1064-NEXT: v_mov_b32_e32 v2, v1 4588; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 4589; GFX1064-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 4590; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 4591; GFX1064-NEXT: v_mov_b32_e32 v2, s4 4592; GFX1064-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 4593; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 4594; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 4595; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 4596; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4597; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 4598; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 4599; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 4600; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 4601; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4602; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 4603; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 4604; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 4605; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 4606; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 4607; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 4608; 
GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 4609; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 4610; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 4611; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4612; GFX1064-NEXT: s_mov_b32 s2, -1 4613; GFX1064-NEXT: ; implicit-def: $vgpr0 4614; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 4615; GFX1064-NEXT: s_cbranch_execz .LBB17_2 4616; GFX1064-NEXT: ; %bb.1: 4617; GFX1064-NEXT: v_mov_b32_e32 v0, 0 4618; GFX1064-NEXT: v_mov_b32_e32 v4, s7 4619; GFX1064-NEXT: s_mov_b32 s3, s7 4620; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4621; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 4622; GFX1064-NEXT: ds_max_rtn_i32 v0, v0, v4 4623; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 4624; GFX1064-NEXT: buffer_gl0_inv 4625; GFX1064-NEXT: .LBB17_2: 4626; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 4627; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 4628; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 4629; GFX1064-NEXT: v_mov_b32_e32 v0, v3 4630; GFX1064-NEXT: v_max_i32_e32 v0, s3, v0 4631; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 4632; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 4633; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 4634; GFX1064-NEXT: s_endpgm 4635; 4636; GFX1032-LABEL: max_i32_varying: 4637; GFX1032: ; %bb.0: ; %entry 4638; GFX1032-NEXT: v_mov_b32_e32 v1, v0 4639; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 4640; GFX1032-NEXT: v_bfrev_b32_e32 v1, 1 4641; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 4642; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 4643; GFX1032-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 4644; GFX1032-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 4645; GFX1032-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 4646; GFX1032-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 4647; GFX1032-NEXT: v_mov_b32_e32 v2, v1 4648; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 4649; GFX1032-NEXT: s_mov_b32 exec_lo, s2 4650; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4651; GFX1032-NEXT: 
s_or_saveexec_b32 s2, -1 4652; GFX1032-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 4653; GFX1032-NEXT: v_bfrev_b32_e32 v3, 1 4654; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 4655; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 4656; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 4657; GFX1032-NEXT: s_mov_b32 exec_lo, s2 4658; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4659; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 4660; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 4661; GFX1032-NEXT: s_mov_b32 exec_lo, s2 4662; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 4663; GFX1032-NEXT: s_mov_b32 s2, -1 4664; GFX1032-NEXT: ; implicit-def: $vgpr0 4665; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 4666; GFX1032-NEXT: s_cbranch_execz .LBB17_2 4667; GFX1032-NEXT: ; %bb.1: 4668; GFX1032-NEXT: v_mov_b32_e32 v0, 0 4669; GFX1032-NEXT: v_mov_b32_e32 v4, s4 4670; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4671; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 4672; GFX1032-NEXT: ds_max_rtn_i32 v0, v0, v4 4673; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 4674; GFX1032-NEXT: buffer_gl0_inv 4675; GFX1032-NEXT: .LBB17_2: 4676; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 4677; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 4678; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 4679; GFX1032-NEXT: v_mov_b32_e32 v0, v3 4680; GFX1032-NEXT: v_max_i32_e32 v0, s3, v0 4681; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 4682; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 4683; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 4684; GFX1032-NEXT: s_endpgm 4685; 4686; GFX1164-LABEL: max_i32_varying: 4687; GFX1164: ; %bb.0: ; %entry 4688; GFX1164-NEXT: v_mov_b32_e32 v1, v0 4689; GFX1164-NEXT: s_not_b64 exec, exec 4690; GFX1164-NEXT: v_bfrev_b32_e32 v1, 1 4691; GFX1164-NEXT: s_not_b64 exec, exec 4692; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 4693; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) 4694; GFX1164-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf 
bank_mask:0xf 4695; GFX1164-NEXT: v_bfrev_b32_e32 v3, 1 4696; GFX1164-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 4697; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 4698; GFX1164-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 4699; GFX1164-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 4700; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 4701; GFX1164-NEXT: v_mov_b32_e32 v2, v1 4702; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1 4703; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 4704; GFX1164-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 4705; GFX1164-NEXT: v_readlane_b32 s4, v1, 31 4706; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 4707; GFX1164-NEXT: v_mov_b32_e32 v2, s4 4708; GFX1164-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 4709; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) 4710; GFX1164-NEXT: v_readlane_b32 s4, v1, 15 4711; GFX1164-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 4712; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 4713; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 4714; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 4715; GFX1164-NEXT: v_readlane_b32 s5, v1, 31 4716; GFX1164-NEXT: v_writelane_b32 v3, s4, 16 4717; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 4718; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 4719; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4720; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 4721; GFX1164-NEXT: v_readlane_b32 s7, v1, 63 4722; GFX1164-NEXT: v_readlane_b32 s6, v1, 47 4723; GFX1164-NEXT: v_writelane_b32 v3, s5, 32 4724; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 4725; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) 4726; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 4727; GFX1164-NEXT: 
s_or_saveexec_b64 s[4:5], -1 4728; GFX1164-NEXT: v_writelane_b32 v3, s6, 48 4729; GFX1164-NEXT: s_mov_b64 exec, s[4:5] 4730; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4731; GFX1164-NEXT: s_mov_b32 s2, -1 4732; GFX1164-NEXT: ; implicit-def: $vgpr0 4733; GFX1164-NEXT: s_and_saveexec_b64 s[4:5], vcc 4734; GFX1164-NEXT: s_cbranch_execz .LBB17_2 4735; GFX1164-NEXT: ; %bb.1: 4736; GFX1164-NEXT: v_mov_b32_e32 v0, 0 4737; GFX1164-NEXT: v_mov_b32_e32 v4, s7 4738; GFX1164-NEXT: s_mov_b32 s3, s7 4739; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4740; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 4741; GFX1164-NEXT: ds_max_rtn_i32 v0, v0, v4 4742; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 4743; GFX1164-NEXT: buffer_gl0_inv 4744; GFX1164-NEXT: .LBB17_2: 4745; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] 4746; GFX1164-NEXT: v_readfirstlane_b32 s3, v0 4747; GFX1164-NEXT: v_mov_b32_e32 v0, v3 4748; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) 4749; GFX1164-NEXT: v_max_i32_e32 v0, s3, v0 4750; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 4751; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 4752; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 4753; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 4754; GFX1164-NEXT: s_endpgm 4755; 4756; GFX1132-LABEL: max_i32_varying: 4757; GFX1132: ; %bb.0: ; %entry 4758; GFX1132-NEXT: v_mov_b32_e32 v1, v0 4759; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo 4760; GFX1132-NEXT: v_bfrev_b32_e32 v1, 1 4761; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo 4762; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 4763; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 4764; GFX1132-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 4765; GFX1132-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 4766; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 4767; GFX1132-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 4768; GFX1132-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf 
bank_mask:0xf 4769; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 4770; GFX1132-NEXT: v_mov_b32_e32 v2, v1 4771; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1 4772; GFX1132-NEXT: s_mov_b32 exec_lo, s2 4773; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 4774; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 4775; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) 4776; GFX1132-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 4777; GFX1132-NEXT: v_bfrev_b32_e32 v3, 1 4778; GFX1132-NEXT: v_readlane_b32 s3, v1, 15 4779; GFX1132-NEXT: v_readlane_b32 s4, v1, 31 4780; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) 4781; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 4782; GFX1132-NEXT: s_mov_b32 exec_lo, s2 4783; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4784; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 4785; GFX1132-NEXT: v_writelane_b32 v3, s3, 16 4786; GFX1132-NEXT: s_mov_b32 exec_lo, s2 4787; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) 4788; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 4789; GFX1132-NEXT: s_mov_b32 s2, -1 4790; GFX1132-NEXT: ; implicit-def: $vgpr0 4791; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo 4792; GFX1132-NEXT: s_cbranch_execz .LBB17_2 4793; GFX1132-NEXT: ; %bb.1: 4794; GFX1132-NEXT: v_mov_b32_e32 v0, 0 4795; GFX1132-NEXT: v_mov_b32_e32 v4, s4 4796; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4797; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 4798; GFX1132-NEXT: ds_max_rtn_i32 v0, v0, v4 4799; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 4800; GFX1132-NEXT: buffer_gl0_inv 4801; GFX1132-NEXT: .LBB17_2: 4802; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 4803; GFX1132-NEXT: v_readfirstlane_b32 s3, v0 4804; GFX1132-NEXT: v_mov_b32_e32 v0, v3 4805; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) 4806; GFX1132-NEXT: v_max_i32_e32 v0, s3, v0 4807; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 4808; GFX1132-NEXT: 
s_waitcnt lgkmcnt(0) 4809; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 4810; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 4811; GFX1132-NEXT: s_endpgm 4812entry: 4813 %lane = call i32 @llvm.amdgcn.workitem.id.x() 4814 %old = atomicrmw max i32 addrspace(3)* @local_var32, i32 %lane acq_rel 4815 store i32 %old, i32 addrspace(1)* %out 4816 ret void 4817} 4818 4819define amdgpu_kernel void @max_i64_constant(i64 addrspace(1)* %out) { 4820; 4821; 4822; GFX7LESS-LABEL: max_i64_constant: 4823; GFX7LESS: ; %bb.0: ; %entry 4824; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 4825; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 4826; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 4827; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4828; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 4829; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc 4830; GFX7LESS-NEXT: s_cbranch_execz .LBB18_2 4831; GFX7LESS-NEXT: ; %bb.1: 4832; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0 4833; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5 4834; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 4835; GFX7LESS-NEXT: s_mov_b32 m0, -1 4836; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 4837; GFX7LESS-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] 4838; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 4839; GFX7LESS-NEXT: .LBB18_2: 4840; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] 4841; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 4842; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 4843; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 4844; GFX7LESS-NEXT: v_bfrev_b32_e32 v1, 1 4845; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 4846; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 4847; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc 4848; GFX7LESS-NEXT: v_mov_b32_e32 v2, s5 4849; GFX7LESS-NEXT: v_mov_b32_e32 v3, s4 4850; GFX7LESS-NEXT: v_cmp_gt_i64_e32 vcc, s[4:5], v[0:1] 4851; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 4852; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 4853; GFX7LESS-NEXT: s_mov_b32 s2, -1 4854; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 
4855; GFX7LESS-NEXT: s_endpgm 4856; 4857; GFX8-LABEL: max_i64_constant: 4858; GFX8: ; %bb.0: ; %entry 4859; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4860; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4861; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 4862; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4863; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 4864; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 4865; GFX8-NEXT: s_cbranch_execz .LBB18_2 4866; GFX8-NEXT: ; %bb.1: 4867; GFX8-NEXT: v_mov_b32_e32 v0, 5 4868; GFX8-NEXT: v_mov_b32_e32 v2, 0 4869; GFX8-NEXT: v_mov_b32_e32 v1, 0 4870; GFX8-NEXT: s_mov_b32 m0, -1 4871; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4872; GFX8-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] 4873; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4874; GFX8-NEXT: .LBB18_2: 4875; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 4876; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4877; GFX8-NEXT: v_readfirstlane_b32 s2, v0 4878; GFX8-NEXT: v_bfrev_b32_e32 v0, 1 4879; GFX8-NEXT: v_readfirstlane_b32 s3, v1 4880; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc 4881; GFX8-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 4882; GFX8-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1] 4883; GFX8-NEXT: v_mov_b32_e32 v2, s3 4884; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 4885; GFX8-NEXT: v_mov_b32_e32 v2, s2 4886; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 4887; GFX8-NEXT: s_mov_b32 s3, 0xf000 4888; GFX8-NEXT: s_mov_b32 s2, -1 4889; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4890; GFX8-NEXT: s_endpgm 4891; 4892; GFX9-LABEL: max_i64_constant: 4893; GFX9: ; %bb.0: ; %entry 4894; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4895; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4896; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 4897; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4898; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 4899; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 4900; GFX9-NEXT: s_cbranch_execz .LBB18_2 4901; GFX9-NEXT: ; %bb.1: 4902; GFX9-NEXT: v_mov_b32_e32 v0, 5 4903; GFX9-NEXT: v_mov_b32_e32 v1, 0 4904; GFX9-NEXT: 
v_mov_b32_e32 v2, 0 4905; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4906; GFX9-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] 4907; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4908; GFX9-NEXT: .LBB18_2: 4909; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 4910; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4911; GFX9-NEXT: v_readfirstlane_b32 s2, v0 4912; GFX9-NEXT: v_bfrev_b32_e32 v0, 1 4913; GFX9-NEXT: v_readfirstlane_b32 s3, v1 4914; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc 4915; GFX9-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 4916; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1] 4917; GFX9-NEXT: v_mov_b32_e32 v2, s3 4918; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 4919; GFX9-NEXT: v_mov_b32_e32 v2, s2 4920; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 4921; GFX9-NEXT: s_mov_b32 s3, 0xf000 4922; GFX9-NEXT: s_mov_b32 s2, -1 4923; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4924; GFX9-NEXT: s_endpgm 4925; 4926; GFX1064-LABEL: max_i64_constant: 4927; GFX1064: ; %bb.0: ; %entry 4928; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4929; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4930; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 4931; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4932; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 4933; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc 4934; GFX1064-NEXT: s_cbranch_execz .LBB18_2 4935; GFX1064-NEXT: ; %bb.1: 4936; GFX1064-NEXT: v_mov_b32_e32 v0, 5 4937; GFX1064-NEXT: v_mov_b32_e32 v1, 0 4938; GFX1064-NEXT: v_mov_b32_e32 v2, 0 4939; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4940; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 4941; GFX1064-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] 4942; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 4943; GFX1064-NEXT: buffer_gl0_inv 4944; GFX1064-NEXT: .LBB18_2: 4945; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 4946; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] 4947; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 4948; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 4949; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc 4950; GFX1064-NEXT: v_cndmask_b32_e64 
v0, 5, 0, vcc 4951; GFX1064-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1] 4952; GFX1064-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc 4953; GFX1064-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc 4954; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 4955; GFX1064-NEXT: s_mov_b32 s2, -1 4956; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 4957; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4958; GFX1064-NEXT: s_endpgm 4959; 4960; GFX1032-LABEL: max_i64_constant: 4961; GFX1032: ; %bb.0: ; %entry 4962; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4963; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4964; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 4965; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 4966; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo 4967; GFX1032-NEXT: s_cbranch_execz .LBB18_2 4968; GFX1032-NEXT: ; %bb.1: 4969; GFX1032-NEXT: v_mov_b32_e32 v0, 5 4970; GFX1032-NEXT: v_mov_b32_e32 v1, 0 4971; GFX1032-NEXT: v_mov_b32_e32 v2, 0 4972; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4973; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 4974; GFX1032-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] 4975; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 4976; GFX1032-NEXT: buffer_gl0_inv 4977; GFX1032-NEXT: .LBB18_2: 4978; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 4979; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 4980; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 4981; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 4982; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc_lo 4983; GFX1032-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc_lo 4984; GFX1032-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[2:3], v[0:1] 4985; GFX1032-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo 4986; GFX1032-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo 4987; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 4988; GFX1032-NEXT: s_mov_b32 s2, -1 4989; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 4990; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4991; GFX1032-NEXT: s_endpgm 4992; 4993; GFX1164-LABEL: max_i64_constant: 4994; GFX1164: ; %bb.0: ; %entry 4995; GFX1164-NEXT: s_load_b64 s[0:1], 
s[0:1], 0x24 4996; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4997; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 4998; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 4999; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 5000; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 5001; GFX1164-NEXT: s_and_saveexec_b64 s[2:3], vcc 5002; GFX1164-NEXT: s_cbranch_execz .LBB18_2 5003; GFX1164-NEXT: ; %bb.1: 5004; GFX1164-NEXT: v_mov_b32_e32 v0, 5 5005; GFX1164-NEXT: v_mov_b32_e32 v1, 0 5006; GFX1164-NEXT: v_mov_b32_e32 v2, 0 5007; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5008; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 5009; GFX1164-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] 5010; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 5011; GFX1164-NEXT: buffer_gl0_inv 5012; GFX1164-NEXT: .LBB18_2: 5013; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] 5014; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 5015; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 5016; GFX1164-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc 5017; GFX1164-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 5018; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) 5019; GFX1164-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1] 5020; GFX1164-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc 5021; GFX1164-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc 5022; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 5023; GFX1164-NEXT: s_mov_b32 s2, -1 5024; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 5025; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 5026; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 5027; GFX1164-NEXT: s_endpgm 5028; 5029; GFX1132-LABEL: max_i64_constant: 5030; GFX1132: ; %bb.0: ; %entry 5031; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 5032; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 5033; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) 5034; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 5035; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 5036; GFX1132-NEXT: s_and_saveexec_b32 s2, vcc_lo 5037; GFX1132-NEXT: s_cbranch_execz .LBB18_2 5038; 
GFX1132-NEXT: ; %bb.1: 5039; GFX1132-NEXT: v_mov_b32_e32 v0, 5 5040; GFX1132-NEXT: v_mov_b32_e32 v1, 0 5041; GFX1132-NEXT: v_mov_b32_e32 v2, 0 5042; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5043; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 5044; GFX1132-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] 5045; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 5046; GFX1132-NEXT: buffer_gl0_inv 5047; GFX1132-NEXT: .LBB18_2: 5048; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2 5049; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 5050; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 5051; GFX1132-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc_lo 5052; GFX1132-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc_lo 5053; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) 5054; GFX1132-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[2:3], v[0:1] 5055; GFX1132-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo 5056; GFX1132-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo 5057; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 5058; GFX1132-NEXT: s_mov_b32 s2, -1 5059; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 5060; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 5061; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 5062; GFX1132-NEXT: s_endpgm 5063entry: 5064 %old = atomicrmw max i64 addrspace(3)* @local_var64, i64 5 acq_rel 5065 store i64 %old, i64 addrspace(1)* %out 5066 ret void 5067} 5068 5069define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) { 5070; 5071; 5072; GFX7LESS-LABEL: min_i32_varying: 5073; GFX7LESS: ; %bb.0: ; %entry 5074; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 5075; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 5076; GFX7LESS-NEXT: s_mov_b32 m0, -1 5077; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 5078; GFX7LESS-NEXT: ds_min_rtn_i32 v0, v1, v0 5079; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 5080; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 5081; GFX7LESS-NEXT: s_mov_b32 s2, -1 5082; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 5083; GFX7LESS-NEXT: s_endpgm 5084; 5085; GFX8-LABEL: min_i32_varying: 5086; GFX8: ; %bb.0: ; %entry 5087; GFX8-NEXT: 
s_load_dwordx2 s[0:1], s[0:1], 0x24 5088; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 5089; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 5090; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 5091; GFX8-NEXT: v_bfrev_b32_e32 v1, -2 5092; GFX8-NEXT: s_mov_b64 exec, s[2:3] 5093; GFX8-NEXT: v_mov_b32_e32 v2, v0 5094; GFX8-NEXT: s_not_b64 exec, exec 5095; GFX8-NEXT: v_bfrev_b32_e32 v2, -2 5096; GFX8-NEXT: s_not_b64 exec, exec 5097; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 5098; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 5099; GFX8-NEXT: s_nop 1 5100; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 5101; GFX8-NEXT: s_nop 1 5102; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 5103; GFX8-NEXT: s_nop 1 5104; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 5105; GFX8-NEXT: s_nop 1 5106; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 5107; GFX8-NEXT: s_nop 1 5108; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 5109; GFX8-NEXT: v_readlane_b32 s4, v2, 63 5110; GFX8-NEXT: s_nop 0 5111; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 5112; GFX8-NEXT: s_mov_b64 exec, s[2:3] 5113; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 5114; GFX8-NEXT: ; implicit-def: $vgpr0 5115; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 5116; GFX8-NEXT: s_cbranch_execz .LBB19_2 5117; GFX8-NEXT: ; %bb.1: 5118; GFX8-NEXT: v_mov_b32_e32 v0, 0 5119; GFX8-NEXT: v_mov_b32_e32 v3, s4 5120; GFX8-NEXT: s_mov_b32 m0, -1 5121; GFX8-NEXT: s_waitcnt lgkmcnt(0) 5122; GFX8-NEXT: ds_min_rtn_i32 v0, v0, v3 5123; GFX8-NEXT: s_waitcnt lgkmcnt(0) 5124; GFX8-NEXT: .LBB19_2: 5125; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 5126; GFX8-NEXT: s_waitcnt lgkmcnt(0) 5127; GFX8-NEXT: v_readfirstlane_b32 s2, v0 5128; GFX8-NEXT: v_mov_b32_e32 v0, v1 5129; GFX8-NEXT: v_min_i32_e32 v0, s2, v0 5130; GFX8-NEXT: s_mov_b32 s3, 0xf000 5131; GFX8-NEXT: s_mov_b32 s2, -1 5132; 
GFX8-NEXT: s_nop 0 5133; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 5134; GFX8-NEXT: s_endpgm 5135; 5136; GFX9-LABEL: min_i32_varying: 5137; GFX9: ; %bb.0: ; %entry 5138; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 5139; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 5140; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 5141; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 5142; GFX9-NEXT: v_bfrev_b32_e32 v1, -2 5143; GFX9-NEXT: s_mov_b64 exec, s[2:3] 5144; GFX9-NEXT: v_mov_b32_e32 v2, v0 5145; GFX9-NEXT: s_not_b64 exec, exec 5146; GFX9-NEXT: v_bfrev_b32_e32 v2, -2 5147; GFX9-NEXT: s_not_b64 exec, exec 5148; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 5149; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 5150; GFX9-NEXT: s_nop 1 5151; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 5152; GFX9-NEXT: s_nop 1 5153; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 5154; GFX9-NEXT: s_nop 1 5155; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 5156; GFX9-NEXT: s_nop 1 5157; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 5158; GFX9-NEXT: s_nop 1 5159; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 5160; GFX9-NEXT: v_readlane_b32 s4, v2, 63 5161; GFX9-NEXT: s_nop 0 5162; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 5163; GFX9-NEXT: s_mov_b64 exec, s[2:3] 5164; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 5165; GFX9-NEXT: ; implicit-def: $vgpr0 5166; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 5167; GFX9-NEXT: s_cbranch_execz .LBB19_2 5168; GFX9-NEXT: ; %bb.1: 5169; GFX9-NEXT: v_mov_b32_e32 v0, 0 5170; GFX9-NEXT: v_mov_b32_e32 v3, s4 5171; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5172; GFX9-NEXT: ds_min_rtn_i32 v0, v0, v3 5173; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5174; GFX9-NEXT: .LBB19_2: 5175; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 5176; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5177; GFX9-NEXT: v_readfirstlane_b32 s2, v0 5178; 
GFX9-NEXT: v_mov_b32_e32 v0, v1 5179; GFX9-NEXT: v_min_i32_e32 v0, s2, v0 5180; GFX9-NEXT: s_mov_b32 s3, 0xf000 5181; GFX9-NEXT: s_mov_b32 s2, -1 5182; GFX9-NEXT: s_nop 0 5183; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 5184; GFX9-NEXT: s_endpgm 5185; 5186; GFX1064-LABEL: min_i32_varying: 5187; GFX1064: ; %bb.0: ; %entry 5188; GFX1064-NEXT: v_mov_b32_e32 v1, v0 5189; GFX1064-NEXT: s_not_b64 exec, exec 5190; GFX1064-NEXT: v_bfrev_b32_e32 v1, -2 5191; GFX1064-NEXT: s_not_b64 exec, exec 5192; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 5193; GFX1064-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 5194; GFX1064-NEXT: v_bfrev_b32_e32 v3, -2 5195; GFX1064-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 5196; GFX1064-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 5197; GFX1064-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 5198; GFX1064-NEXT: v_mov_b32_e32 v2, v1 5199; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 5200; GFX1064-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 5201; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 5202; GFX1064-NEXT: v_mov_b32_e32 v2, s4 5203; GFX1064-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 5204; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 5205; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 5206; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 5207; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 5208; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 5209; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 5210; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 5211; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 5212; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 5213; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 5214; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 5215; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 5216; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 5217; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 5218; GFX1064-NEXT: 
v_mbcnt_hi_u32_b32 v0, exec_hi, v0 5219; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 5220; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 5221; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 5222; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 5223; GFX1064-NEXT: s_mov_b32 s2, -1 5224; GFX1064-NEXT: ; implicit-def: $vgpr0 5225; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 5226; GFX1064-NEXT: s_cbranch_execz .LBB19_2 5227; GFX1064-NEXT: ; %bb.1: 5228; GFX1064-NEXT: v_mov_b32_e32 v0, 0 5229; GFX1064-NEXT: v_mov_b32_e32 v4, s7 5230; GFX1064-NEXT: s_mov_b32 s3, s7 5231; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5232; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 5233; GFX1064-NEXT: ds_min_rtn_i32 v0, v0, v4 5234; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 5235; GFX1064-NEXT: buffer_gl0_inv 5236; GFX1064-NEXT: .LBB19_2: 5237; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 5238; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 5239; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 5240; GFX1064-NEXT: v_mov_b32_e32 v0, v3 5241; GFX1064-NEXT: v_min_i32_e32 v0, s3, v0 5242; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 5243; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 5244; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 5245; GFX1064-NEXT: s_endpgm 5246; 5247; GFX1032-LABEL: min_i32_varying: 5248; GFX1032: ; %bb.0: ; %entry 5249; GFX1032-NEXT: v_mov_b32_e32 v1, v0 5250; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 5251; GFX1032-NEXT: v_bfrev_b32_e32 v1, -2 5252; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 5253; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 5254; GFX1032-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 5255; GFX1032-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 5256; GFX1032-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 5257; GFX1032-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 5258; GFX1032-NEXT: v_mov_b32_e32 v2, v1 5259; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 5260; GFX1032-NEXT: s_mov_b32 exec_lo, s2 5261; GFX1032-NEXT: s_load_dwordx2 s[0:1], 
s[0:1], 0x24 5262; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 5263; GFX1032-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 5264; GFX1032-NEXT: v_bfrev_b32_e32 v3, -2 5265; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 5266; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 5267; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 5268; GFX1032-NEXT: s_mov_b32 exec_lo, s2 5269; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 5270; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 5271; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 5272; GFX1032-NEXT: s_mov_b32 exec_lo, s2 5273; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 5274; GFX1032-NEXT: s_mov_b32 s2, -1 5275; GFX1032-NEXT: ; implicit-def: $vgpr0 5276; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 5277; GFX1032-NEXT: s_cbranch_execz .LBB19_2 5278; GFX1032-NEXT: ; %bb.1: 5279; GFX1032-NEXT: v_mov_b32_e32 v0, 0 5280; GFX1032-NEXT: v_mov_b32_e32 v4, s4 5281; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5282; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 5283; GFX1032-NEXT: ds_min_rtn_i32 v0, v0, v4 5284; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 5285; GFX1032-NEXT: buffer_gl0_inv 5286; GFX1032-NEXT: .LBB19_2: 5287; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 5288; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 5289; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 5290; GFX1032-NEXT: v_mov_b32_e32 v0, v3 5291; GFX1032-NEXT: v_min_i32_e32 v0, s3, v0 5292; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 5293; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 5294; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 5295; GFX1032-NEXT: s_endpgm 5296; 5297; GFX1164-LABEL: min_i32_varying: 5298; GFX1164: ; %bb.0: ; %entry 5299; GFX1164-NEXT: v_mov_b32_e32 v1, v0 5300; GFX1164-NEXT: s_not_b64 exec, exec 5301; GFX1164-NEXT: v_bfrev_b32_e32 v1, -2 5302; GFX1164-NEXT: s_not_b64 exec, exec 5303; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 5304; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) 5305; GFX1164-NEXT: v_min_i32_dpp v1, 
v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 5306; GFX1164-NEXT: v_bfrev_b32_e32 v3, -2 5307; GFX1164-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 5308; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 5309; GFX1164-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 5310; GFX1164-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 5311; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 5312; GFX1164-NEXT: v_mov_b32_e32 v2, v1 5313; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1 5314; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 5315; GFX1164-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 5316; GFX1164-NEXT: v_readlane_b32 s4, v1, 31 5317; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 5318; GFX1164-NEXT: v_mov_b32_e32 v2, s4 5319; GFX1164-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 5320; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) 5321; GFX1164-NEXT: v_readlane_b32 s4, v1, 15 5322; GFX1164-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 5323; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 5324; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 5325; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 5326; GFX1164-NEXT: v_readlane_b32 s5, v1, 31 5327; GFX1164-NEXT: v_writelane_b32 v3, s4, 16 5328; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 5329; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 5330; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 5331; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 5332; GFX1164-NEXT: v_readlane_b32 s7, v1, 63 5333; GFX1164-NEXT: v_readlane_b32 s6, v1, 47 5334; GFX1164-NEXT: v_writelane_b32 v3, s5, 32 5335; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 5336; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) 5337; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, 
exec_hi, v0 5338; GFX1164-NEXT: s_or_saveexec_b64 s[4:5], -1 5339; GFX1164-NEXT: v_writelane_b32 v3, s6, 48 5340; GFX1164-NEXT: s_mov_b64 exec, s[4:5] 5341; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 5342; GFX1164-NEXT: s_mov_b32 s2, -1 5343; GFX1164-NEXT: ; implicit-def: $vgpr0 5344; GFX1164-NEXT: s_and_saveexec_b64 s[4:5], vcc 5345; GFX1164-NEXT: s_cbranch_execz .LBB19_2 5346; GFX1164-NEXT: ; %bb.1: 5347; GFX1164-NEXT: v_mov_b32_e32 v0, 0 5348; GFX1164-NEXT: v_mov_b32_e32 v4, s7 5349; GFX1164-NEXT: s_mov_b32 s3, s7 5350; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5351; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 5352; GFX1164-NEXT: ds_min_rtn_i32 v0, v0, v4 5353; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 5354; GFX1164-NEXT: buffer_gl0_inv 5355; GFX1164-NEXT: .LBB19_2: 5356; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] 5357; GFX1164-NEXT: v_readfirstlane_b32 s3, v0 5358; GFX1164-NEXT: v_mov_b32_e32 v0, v3 5359; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) 5360; GFX1164-NEXT: v_min_i32_e32 v0, s3, v0 5361; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 5362; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 5363; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 5364; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 5365; GFX1164-NEXT: s_endpgm 5366; 5367; GFX1132-LABEL: min_i32_varying: 5368; GFX1132: ; %bb.0: ; %entry 5369; GFX1132-NEXT: v_mov_b32_e32 v1, v0 5370; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo 5371; GFX1132-NEXT: v_bfrev_b32_e32 v1, -2 5372; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo 5373; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 5374; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 5375; GFX1132-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 5376; GFX1132-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 5377; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 5378; GFX1132-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 5379; GFX1132-NEXT: v_min_i32_dpp v1, 
v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 5380; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 5381; GFX1132-NEXT: v_mov_b32_e32 v2, v1 5382; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1 5383; GFX1132-NEXT: s_mov_b32 exec_lo, s2 5384; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 5385; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 5386; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) 5387; GFX1132-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 5388; GFX1132-NEXT: v_bfrev_b32_e32 v3, -2 5389; GFX1132-NEXT: v_readlane_b32 s3, v1, 15 5390; GFX1132-NEXT: v_readlane_b32 s4, v1, 31 5391; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) 5392; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 5393; GFX1132-NEXT: s_mov_b32 exec_lo, s2 5394; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 5395; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 5396; GFX1132-NEXT: v_writelane_b32 v3, s3, 16 5397; GFX1132-NEXT: s_mov_b32 exec_lo, s2 5398; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) 5399; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 5400; GFX1132-NEXT: s_mov_b32 s2, -1 5401; GFX1132-NEXT: ; implicit-def: $vgpr0 5402; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo 5403; GFX1132-NEXT: s_cbranch_execz .LBB19_2 5404; GFX1132-NEXT: ; %bb.1: 5405; GFX1132-NEXT: v_mov_b32_e32 v0, 0 5406; GFX1132-NEXT: v_mov_b32_e32 v4, s4 5407; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5408; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 5409; GFX1132-NEXT: ds_min_rtn_i32 v0, v0, v4 5410; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 5411; GFX1132-NEXT: buffer_gl0_inv 5412; GFX1132-NEXT: .LBB19_2: 5413; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 5414; GFX1132-NEXT: v_readfirstlane_b32 s3, v0 5415; GFX1132-NEXT: v_mov_b32_e32 v0, v3 5416; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) 5417; GFX1132-NEXT: v_min_i32_e32 v0, s3, v0 5418; GFX1132-NEXT: s_mov_b32 s3, 
0x31016000 5419; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 5420; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 5421; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 5422; GFX1132-NEXT: s_endpgm 5423entry: 5424 %lane = call i32 @llvm.amdgcn.workitem.id.x() 5425 %old = atomicrmw min i32 addrspace(3)* @local_var32, i32 %lane acq_rel 5426 store i32 %old, i32 addrspace(1)* %out 5427 ret void 5428} 5429 5430define amdgpu_kernel void @min_i64_constant(i64 addrspace(1)* %out) { 5431; 5432; 5433; GFX7LESS-LABEL: min_i64_constant: 5434; GFX7LESS: ; %bb.0: ; %entry 5435; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 5436; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 5437; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 5438; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 5439; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 5440; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc 5441; GFX7LESS-NEXT: s_cbranch_execz .LBB20_2 5442; GFX7LESS-NEXT: ; %bb.1: 5443; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0 5444; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5 5445; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 5446; GFX7LESS-NEXT: s_mov_b32 m0, -1 5447; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 5448; GFX7LESS-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] 5449; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 5450; GFX7LESS-NEXT: .LBB20_2: 5451; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] 5452; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 5453; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 5454; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 5455; GFX7LESS-NEXT: v_bfrev_b32_e32 v1, -2 5456; GFX7LESS-NEXT: s_mov_b32 s2, -1 5457; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 5458; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc 5459; GFX7LESS-NEXT: v_mov_b32_e32 v2, s5 5460; GFX7LESS-NEXT: v_mov_b32_e32 v3, s4 5461; GFX7LESS-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] 5462; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 5463; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 5464; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 5465; GFX7LESS-NEXT: 
buffer_store_dwordx2 v[0:1], off, s[0:3], 0 5466; GFX7LESS-NEXT: s_endpgm 5467; 5468; GFX8-LABEL: min_i64_constant: 5469; GFX8: ; %bb.0: ; %entry 5470; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 5471; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 5472; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 5473; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 5474; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 5475; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 5476; GFX8-NEXT: s_cbranch_execz .LBB20_2 5477; GFX8-NEXT: ; %bb.1: 5478; GFX8-NEXT: v_mov_b32_e32 v0, 5 5479; GFX8-NEXT: v_mov_b32_e32 v2, 0 5480; GFX8-NEXT: v_mov_b32_e32 v1, 0 5481; GFX8-NEXT: s_mov_b32 m0, -1 5482; GFX8-NEXT: s_waitcnt lgkmcnt(0) 5483; GFX8-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] 5484; GFX8-NEXT: s_waitcnt lgkmcnt(0) 5485; GFX8-NEXT: .LBB20_2: 5486; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 5487; GFX8-NEXT: s_waitcnt lgkmcnt(0) 5488; GFX8-NEXT: v_readfirstlane_b32 s4, v0 5489; GFX8-NEXT: v_bfrev_b32_e32 v0, -2 5490; GFX8-NEXT: v_readfirstlane_b32 s5, v1 5491; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc 5492; GFX8-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 5493; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] 5494; GFX8-NEXT: v_mov_b32_e32 v2, s5 5495; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 5496; GFX8-NEXT: v_mov_b32_e32 v2, s4 5497; GFX8-NEXT: s_mov_b32 s2, -1 5498; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 5499; GFX8-NEXT: s_mov_b32 s3, 0xf000 5500; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 5501; GFX8-NEXT: s_endpgm 5502; 5503; GFX9-LABEL: min_i64_constant: 5504; GFX9: ; %bb.0: ; %entry 5505; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 5506; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 5507; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 5508; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 5509; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 5510; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 5511; GFX9-NEXT: s_cbranch_execz .LBB20_2 5512; GFX9-NEXT: ; %bb.1: 5513; GFX9-NEXT: v_mov_b32_e32 v0, 5 5514; GFX9-NEXT: 
v_mov_b32_e32 v1, 0 5515; GFX9-NEXT: v_mov_b32_e32 v2, 0 5516; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5517; GFX9-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] 5518; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5519; GFX9-NEXT: .LBB20_2: 5520; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 5521; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5522; GFX9-NEXT: v_readfirstlane_b32 s4, v0 5523; GFX9-NEXT: v_bfrev_b32_e32 v0, -2 5524; GFX9-NEXT: v_readfirstlane_b32 s5, v1 5525; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc 5526; GFX9-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 5527; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] 5528; GFX9-NEXT: v_mov_b32_e32 v2, s5 5529; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 5530; GFX9-NEXT: v_mov_b32_e32 v2, s4 5531; GFX9-NEXT: s_mov_b32 s2, -1 5532; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 5533; GFX9-NEXT: s_mov_b32 s3, 0xf000 5534; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 5535; GFX9-NEXT: s_endpgm 5536; 5537; GFX1064-LABEL: min_i64_constant: 5538; GFX1064: ; %bb.0: ; %entry 5539; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 5540; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 5541; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 5542; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 5543; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 5544; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc 5545; GFX1064-NEXT: s_cbranch_execz .LBB20_2 5546; GFX1064-NEXT: ; %bb.1: 5547; GFX1064-NEXT: v_mov_b32_e32 v0, 5 5548; GFX1064-NEXT: v_mov_b32_e32 v1, 0 5549; GFX1064-NEXT: v_mov_b32_e32 v2, 0 5550; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5551; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 5552; GFX1064-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] 5553; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 5554; GFX1064-NEXT: buffer_gl0_inv 5555; GFX1064-NEXT: .LBB20_2: 5556; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 5557; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] 5558; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 5559; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 5560; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, 0x7fffffff, 
vcc 5561; GFX1064-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 5562; GFX1064-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1] 5563; GFX1064-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc 5564; GFX1064-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc 5565; GFX1064-NEXT: s_mov_b32 s2, -1 5566; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 5567; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 5568; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 5569; GFX1064-NEXT: s_endpgm 5570; 5571; GFX1032-LABEL: min_i64_constant: 5572; GFX1032: ; %bb.0: ; %entry 5573; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 5574; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 5575; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 5576; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 5577; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo 5578; GFX1032-NEXT: s_cbranch_execz .LBB20_2 5579; GFX1032-NEXT: ; %bb.1: 5580; GFX1032-NEXT: v_mov_b32_e32 v0, 5 5581; GFX1032-NEXT: v_mov_b32_e32 v1, 0 5582; GFX1032-NEXT: v_mov_b32_e32 v2, 0 5583; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5584; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 5585; GFX1032-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] 5586; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 5587; GFX1032-NEXT: buffer_gl0_inv 5588; GFX1032-NEXT: .LBB20_2: 5589; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 5590; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 5591; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 5592; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 5593; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc_lo 5594; GFX1032-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc_lo 5595; GFX1032-NEXT: v_cmp_lt_i64_e32 vcc_lo, s[2:3], v[0:1] 5596; GFX1032-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo 5597; GFX1032-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo 5598; GFX1032-NEXT: s_mov_b32 s2, -1 5599; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 5600; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 5601; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 5602; GFX1032-NEXT: s_endpgm 5603; 5604; GFX1164-LABEL: min_i64_constant: 5605; GFX1164: ; %bb.0: ; 
%entry 5606; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 5607; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 5608; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 5609; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 5610; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 5611; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 5612; GFX1164-NEXT: s_and_saveexec_b64 s[2:3], vcc 5613; GFX1164-NEXT: s_cbranch_execz .LBB20_2 5614; GFX1164-NEXT: ; %bb.1: 5615; GFX1164-NEXT: v_mov_b32_e32 v0, 5 5616; GFX1164-NEXT: v_mov_b32_e32 v1, 0 5617; GFX1164-NEXT: v_mov_b32_e32 v2, 0 5618; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5619; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 5620; GFX1164-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] 5621; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 5622; GFX1164-NEXT: buffer_gl0_inv 5623; GFX1164-NEXT: .LBB20_2: 5624; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] 5625; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 5626; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 5627; GFX1164-NEXT: v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc 5628; GFX1164-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 5629; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) 5630; GFX1164-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1] 5631; GFX1164-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc 5632; GFX1164-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc 5633; GFX1164-NEXT: s_mov_b32 s2, -1 5634; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 5635; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 5636; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 5637; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 5638; GFX1164-NEXT: s_endpgm 5639; 5640; GFX1132-LABEL: min_i64_constant: 5641; GFX1132: ; %bb.0: ; %entry 5642; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 5643; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 5644; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) 5645; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 5646; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 5647; GFX1132-NEXT: s_and_saveexec_b32 s2, vcc_lo 5648; 
GFX1132-NEXT: s_cbranch_execz .LBB20_2 5649; GFX1132-NEXT: ; %bb.1: 5650; GFX1132-NEXT: v_mov_b32_e32 v0, 5 5651; GFX1132-NEXT: v_mov_b32_e32 v1, 0 5652; GFX1132-NEXT: v_mov_b32_e32 v2, 0 5653; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5654; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 5655; GFX1132-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] 5656; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 5657; GFX1132-NEXT: buffer_gl0_inv 5658; GFX1132-NEXT: .LBB20_2: 5659; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2 5660; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 5661; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 5662; GFX1132-NEXT: v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc_lo 5663; GFX1132-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc_lo 5664; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) 5665; GFX1132-NEXT: v_cmp_lt_i64_e32 vcc_lo, s[2:3], v[0:1] 5666; GFX1132-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo 5667; GFX1132-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo 5668; GFX1132-NEXT: s_mov_b32 s2, -1 5669; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 5670; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 5671; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 5672; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 5673; GFX1132-NEXT: s_endpgm 5674entry: 5675 %old = atomicrmw min i64 addrspace(3)* @local_var64, i64 5 acq_rel 5676 store i64 %old, i64 addrspace(1)* %out 5677 ret void 5678} 5679 5680define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) { 5681; 5682; 5683; GFX7LESS-LABEL: umax_i32_varying: 5684; GFX7LESS: ; %bb.0: ; %entry 5685; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 5686; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 5687; GFX7LESS-NEXT: s_mov_b32 m0, -1 5688; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 5689; GFX7LESS-NEXT: ds_max_rtn_u32 v0, v1, v0 5690; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 5691; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 5692; GFX7LESS-NEXT: s_mov_b32 s2, -1 5693; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 5694; GFX7LESS-NEXT: s_endpgm 5695; 5696; GFX8-LABEL: umax_i32_varying: 5697; 
GFX8: ; %bb.0: ; %entry 5698; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 5699; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 5700; GFX8-NEXT: v_mov_b32_e32 v1, 0 5701; GFX8-NEXT: s_mov_b64 exec, s[2:3] 5702; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 5703; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 5704; GFX8-NEXT: v_mov_b32_e32 v2, v0 5705; GFX8-NEXT: s_not_b64 exec, exec 5706; GFX8-NEXT: v_mov_b32_e32 v2, 0 5707; GFX8-NEXT: s_not_b64 exec, exec 5708; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 5709; GFX8-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 5710; GFX8-NEXT: s_nop 1 5711; GFX8-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 5712; GFX8-NEXT: s_nop 1 5713; GFX8-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 5714; GFX8-NEXT: s_nop 1 5715; GFX8-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 5716; GFX8-NEXT: s_nop 1 5717; GFX8-NEXT: v_max_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 5718; GFX8-NEXT: s_nop 1 5719; GFX8-NEXT: v_max_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 5720; GFX8-NEXT: v_readlane_b32 s4, v2, 63 5721; GFX8-NEXT: s_nop 0 5722; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 5723; GFX8-NEXT: s_mov_b64 exec, s[2:3] 5724; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 5725; GFX8-NEXT: ; implicit-def: $vgpr0 5726; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 5727; GFX8-NEXT: s_cbranch_execz .LBB21_2 5728; GFX8-NEXT: ; %bb.1: 5729; GFX8-NEXT: v_mov_b32_e32 v0, 0 5730; GFX8-NEXT: v_mov_b32_e32 v3, s4 5731; GFX8-NEXT: s_mov_b32 m0, -1 5732; GFX8-NEXT: s_waitcnt lgkmcnt(0) 5733; GFX8-NEXT: ds_max_rtn_u32 v0, v0, v3 5734; GFX8-NEXT: s_waitcnt lgkmcnt(0) 5735; GFX8-NEXT: .LBB21_2: 5736; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 5737; GFX8-NEXT: s_waitcnt lgkmcnt(0) 5738; GFX8-NEXT: v_readfirstlane_b32 s2, v0 5739; GFX8-NEXT: v_mov_b32_e32 v0, v1 5740; GFX8-NEXT: v_max_u32_e32 v0, 
s2, v0 5741; GFX8-NEXT: s_mov_b32 s3, 0xf000 5742; GFX8-NEXT: s_mov_b32 s2, -1 5743; GFX8-NEXT: s_nop 0 5744; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 5745; GFX8-NEXT: s_endpgm 5746; 5747; GFX9-LABEL: umax_i32_varying: 5748; GFX9: ; %bb.0: ; %entry 5749; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 5750; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 5751; GFX9-NEXT: v_mov_b32_e32 v1, 0 5752; GFX9-NEXT: s_mov_b64 exec, s[2:3] 5753; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 5754; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 5755; GFX9-NEXT: v_mov_b32_e32 v2, v0 5756; GFX9-NEXT: s_not_b64 exec, exec 5757; GFX9-NEXT: v_mov_b32_e32 v2, 0 5758; GFX9-NEXT: s_not_b64 exec, exec 5759; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 5760; GFX9-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 5761; GFX9-NEXT: s_nop 1 5762; GFX9-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 5763; GFX9-NEXT: s_nop 1 5764; GFX9-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 5765; GFX9-NEXT: s_nop 1 5766; GFX9-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 5767; GFX9-NEXT: s_nop 1 5768; GFX9-NEXT: v_max_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 5769; GFX9-NEXT: s_nop 1 5770; GFX9-NEXT: v_max_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 5771; GFX9-NEXT: v_readlane_b32 s4, v2, 63 5772; GFX9-NEXT: s_nop 0 5773; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 5774; GFX9-NEXT: s_mov_b64 exec, s[2:3] 5775; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 5776; GFX9-NEXT: ; implicit-def: $vgpr0 5777; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 5778; GFX9-NEXT: s_cbranch_execz .LBB21_2 5779; GFX9-NEXT: ; %bb.1: 5780; GFX9-NEXT: v_mov_b32_e32 v0, 0 5781; GFX9-NEXT: v_mov_b32_e32 v3, s4 5782; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5783; GFX9-NEXT: ds_max_rtn_u32 v0, v0, v3 5784; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5785; GFX9-NEXT: .LBB21_2: 
5786; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 5787; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5788; GFX9-NEXT: v_readfirstlane_b32 s2, v0 5789; GFX9-NEXT: v_mov_b32_e32 v0, v1 5790; GFX9-NEXT: v_max_u32_e32 v0, s2, v0 5791; GFX9-NEXT: s_mov_b32 s3, 0xf000 5792; GFX9-NEXT: s_mov_b32 s2, -1 5793; GFX9-NEXT: s_nop 0 5794; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 5795; GFX9-NEXT: s_endpgm 5796; 5797; GFX1064-LABEL: umax_i32_varying: 5798; GFX1064: ; %bb.0: ; %entry 5799; GFX1064-NEXT: v_mov_b32_e32 v1, v0 5800; GFX1064-NEXT: s_not_b64 exec, exec 5801; GFX1064-NEXT: v_mov_b32_e32 v1, 0 5802; GFX1064-NEXT: s_not_b64 exec, exec 5803; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 5804; GFX1064-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 5805; GFX1064-NEXT: v_mov_b32_e32 v3, 0 5806; GFX1064-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 5807; GFX1064-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 5808; GFX1064-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 5809; GFX1064-NEXT: v_mov_b32_e32 v2, v1 5810; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 5811; GFX1064-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 5812; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 5813; GFX1064-NEXT: v_mov_b32_e32 v2, s4 5814; GFX1064-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 5815; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 5816; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 5817; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 5818; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 5819; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 5820; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 5821; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 5822; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 5823; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 5824; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 5825; GFX1064-NEXT: 
v_readlane_b32 s7, v1, 63 5826; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 5827; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 5828; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 5829; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 5830; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 5831; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 5832; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 5833; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 5834; GFX1064-NEXT: s_mov_b32 s2, -1 5835; GFX1064-NEXT: ; implicit-def: $vgpr0 5836; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 5837; GFX1064-NEXT: s_cbranch_execz .LBB21_2 5838; GFX1064-NEXT: ; %bb.1: 5839; GFX1064-NEXT: v_mov_b32_e32 v0, 0 5840; GFX1064-NEXT: v_mov_b32_e32 v4, s7 5841; GFX1064-NEXT: s_mov_b32 s3, s7 5842; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5843; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 5844; GFX1064-NEXT: ds_max_rtn_u32 v0, v0, v4 5845; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 5846; GFX1064-NEXT: buffer_gl0_inv 5847; GFX1064-NEXT: .LBB21_2: 5848; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 5849; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 5850; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 5851; GFX1064-NEXT: v_mov_b32_e32 v0, v3 5852; GFX1064-NEXT: v_max_u32_e32 v0, s3, v0 5853; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 5854; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 5855; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 5856; GFX1064-NEXT: s_endpgm 5857; 5858; GFX1032-LABEL: umax_i32_varying: 5859; GFX1032: ; %bb.0: ; %entry 5860; GFX1032-NEXT: v_mov_b32_e32 v1, v0 5861; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 5862; GFX1032-NEXT: v_mov_b32_e32 v1, 0 5863; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 5864; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 5865; GFX1032-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 5866; GFX1032-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 5867; GFX1032-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 5868; GFX1032-NEXT: v_max_u32_dpp v1, v1, 
v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 5869; GFX1032-NEXT: v_mov_b32_e32 v2, v1 5870; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 5871; GFX1032-NEXT: s_mov_b32 exec_lo, s2 5872; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 5873; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 5874; GFX1032-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 5875; GFX1032-NEXT: v_mov_b32_e32 v3, 0 5876; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 5877; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 5878; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 5879; GFX1032-NEXT: s_mov_b32 exec_lo, s2 5880; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 5881; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 5882; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 5883; GFX1032-NEXT: s_mov_b32 exec_lo, s2 5884; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 5885; GFX1032-NEXT: s_mov_b32 s2, -1 5886; GFX1032-NEXT: ; implicit-def: $vgpr0 5887; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 5888; GFX1032-NEXT: s_cbranch_execz .LBB21_2 5889; GFX1032-NEXT: ; %bb.1: 5890; GFX1032-NEXT: v_mov_b32_e32 v0, 0 5891; GFX1032-NEXT: v_mov_b32_e32 v4, s4 5892; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5893; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 5894; GFX1032-NEXT: ds_max_rtn_u32 v0, v0, v4 5895; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 5896; GFX1032-NEXT: buffer_gl0_inv 5897; GFX1032-NEXT: .LBB21_2: 5898; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 5899; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 5900; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 5901; GFX1032-NEXT: v_mov_b32_e32 v0, v3 5902; GFX1032-NEXT: v_max_u32_e32 v0, s3, v0 5903; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 5904; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 5905; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 5906; GFX1032-NEXT: s_endpgm 5907; 5908; GFX1164-LABEL: umax_i32_varying: 5909; GFX1164: ; %bb.0: ; %entry 5910; GFX1164-NEXT: v_mov_b32_e32 v1, v0 5911; GFX1164-NEXT: s_not_b64 exec, exec 5912; GFX1164-NEXT: v_mov_b32_e32 
v1, 0 5913; GFX1164-NEXT: s_not_b64 exec, exec 5914; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 5915; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) 5916; GFX1164-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 5917; GFX1164-NEXT: v_mov_b32_e32 v3, 0 5918; GFX1164-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 5919; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 5920; GFX1164-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 5921; GFX1164-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 5922; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 5923; GFX1164-NEXT: v_mov_b32_e32 v2, v1 5924; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1 5925; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 5926; GFX1164-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 5927; GFX1164-NEXT: v_readlane_b32 s4, v1, 31 5928; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 5929; GFX1164-NEXT: v_mov_b32_e32 v2, s4 5930; GFX1164-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 5931; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) 5932; GFX1164-NEXT: v_readlane_b32 s4, v1, 15 5933; GFX1164-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 5934; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 5935; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 5936; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 5937; GFX1164-NEXT: v_readlane_b32 s5, v1, 31 5938; GFX1164-NEXT: v_writelane_b32 v3, s4, 16 5939; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 5940; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 5941; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 5942; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 5943; GFX1164-NEXT: v_readlane_b32 s7, v1, 63 
5944; GFX1164-NEXT: v_readlane_b32 s6, v1, 47 5945; GFX1164-NEXT: v_writelane_b32 v3, s5, 32 5946; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 5947; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) 5948; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 5949; GFX1164-NEXT: s_or_saveexec_b64 s[4:5], -1 5950; GFX1164-NEXT: v_writelane_b32 v3, s6, 48 5951; GFX1164-NEXT: s_mov_b64 exec, s[4:5] 5952; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 5953; GFX1164-NEXT: s_mov_b32 s2, -1 5954; GFX1164-NEXT: ; implicit-def: $vgpr0 5955; GFX1164-NEXT: s_and_saveexec_b64 s[4:5], vcc 5956; GFX1164-NEXT: s_cbranch_execz .LBB21_2 5957; GFX1164-NEXT: ; %bb.1: 5958; GFX1164-NEXT: v_mov_b32_e32 v0, 0 5959; GFX1164-NEXT: v_mov_b32_e32 v4, s7 5960; GFX1164-NEXT: s_mov_b32 s3, s7 5961; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5962; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 5963; GFX1164-NEXT: ds_max_rtn_u32 v0, v0, v4 5964; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 5965; GFX1164-NEXT: buffer_gl0_inv 5966; GFX1164-NEXT: .LBB21_2: 5967; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] 5968; GFX1164-NEXT: v_readfirstlane_b32 s3, v0 5969; GFX1164-NEXT: v_mov_b32_e32 v0, v3 5970; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) 5971; GFX1164-NEXT: v_max_u32_e32 v0, s3, v0 5972; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 5973; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 5974; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 5975; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 5976; GFX1164-NEXT: s_endpgm 5977; 5978; GFX1132-LABEL: umax_i32_varying: 5979; GFX1132: ; %bb.0: ; %entry 5980; GFX1132-NEXT: v_mov_b32_e32 v1, v0 5981; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo 5982; GFX1132-NEXT: v_mov_b32_e32 v1, 0 5983; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo 5984; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 5985; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 5986; GFX1132-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 5987; 
GFX1132-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 5988; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 5989; GFX1132-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 5990; GFX1132-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 5991; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 5992; GFX1132-NEXT: v_mov_b32_e32 v2, v1 5993; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1 5994; GFX1132-NEXT: s_mov_b32 exec_lo, s2 5995; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 5996; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 5997; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) 5998; GFX1132-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 5999; GFX1132-NEXT: v_mov_b32_e32 v3, 0 6000; GFX1132-NEXT: v_readlane_b32 s3, v1, 15 6001; GFX1132-NEXT: v_readlane_b32 s4, v1, 31 6002; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) 6003; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 6004; GFX1132-NEXT: s_mov_b32 exec_lo, s2 6005; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 6006; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 6007; GFX1132-NEXT: v_writelane_b32 v3, s3, 16 6008; GFX1132-NEXT: s_mov_b32 exec_lo, s2 6009; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) 6010; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 6011; GFX1132-NEXT: s_mov_b32 s2, -1 6012; GFX1132-NEXT: ; implicit-def: $vgpr0 6013; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo 6014; GFX1132-NEXT: s_cbranch_execz .LBB21_2 6015; GFX1132-NEXT: ; %bb.1: 6016; GFX1132-NEXT: v_mov_b32_e32 v0, 0 6017; GFX1132-NEXT: v_mov_b32_e32 v4, s4 6018; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6019; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 6020; GFX1132-NEXT: ds_max_rtn_u32 v0, v0, v4 6021; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 6022; 
GFX1132-NEXT: buffer_gl0_inv 6023; GFX1132-NEXT: .LBB21_2: 6024; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 6025; GFX1132-NEXT: v_readfirstlane_b32 s3, v0 6026; GFX1132-NEXT: v_mov_b32_e32 v0, v3 6027; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) 6028; GFX1132-NEXT: v_max_u32_e32 v0, s3, v0 6029; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 6030; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 6031; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 6032; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 6033; GFX1132-NEXT: s_endpgm 6034entry: 6035 %lane = call i32 @llvm.amdgcn.workitem.id.x() 6036 %old = atomicrmw umax i32 addrspace(3)* @local_var32, i32 %lane acq_rel 6037 store i32 %old, i32 addrspace(1)* %out 6038 ret void 6039} 6040; 64-bit unsigned max on the addrspace(3) global @local_var64 with a uniform constant operand (i64 5) - the checks below expect only the lane whose mbcnt result is zero to issue one ds_max_rtn_u64, after which the returned value is read back with v_readfirstlane_b32 and combined per lane before the store to %out. 6041define amdgpu_kernel void @umax_i64_constant(i64 addrspace(1)* %out) { 6042; 6043; 6044; GFX7LESS-LABEL: umax_i64_constant: 6045; GFX7LESS: ; %bb.0: ; %entry 6046; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 6047; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 6048; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 6049; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 6050; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 6051; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc 6052; GFX7LESS-NEXT: s_cbranch_execz .LBB22_2 6053; GFX7LESS-NEXT: ; %bb.1: 6054; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0 6055; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5 6056; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 6057; GFX7LESS-NEXT: s_mov_b32 m0, -1 6058; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 6059; GFX7LESS-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] 6060; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 6061; GFX7LESS-NEXT: .LBB22_2: 6062; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] 6063; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 6064; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 6065; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 6066; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 6067; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 6068; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 6069; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4 6070; 
GFX7LESS-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[0:1] 6071; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 6072; GFX7LESS-NEXT: v_mov_b32_e32 v1, s5 6073; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc 6074; GFX7LESS-NEXT: s_mov_b32 s2, -1 6075; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 6076; GFX7LESS-NEXT: s_endpgm 6077; 6078; GFX8-LABEL: umax_i64_constant: 6079; GFX8: ; %bb.0: ; %entry 6080; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 6081; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 6082; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 6083; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 6084; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 6085; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 6086; GFX8-NEXT: s_cbranch_execz .LBB22_2 6087; GFX8-NEXT: ; %bb.1: 6088; GFX8-NEXT: v_mov_b32_e32 v0, 5 6089; GFX8-NEXT: v_mov_b32_e32 v2, 0 6090; GFX8-NEXT: v_mov_b32_e32 v1, 0 6091; GFX8-NEXT: s_mov_b32 m0, -1 6092; GFX8-NEXT: s_waitcnt lgkmcnt(0) 6093; GFX8-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] 6094; GFX8-NEXT: s_waitcnt lgkmcnt(0) 6095; GFX8-NEXT: .LBB22_2: 6096; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 6097; GFX8-NEXT: s_waitcnt lgkmcnt(0) 6098; GFX8-NEXT: v_readfirstlane_b32 s2, v0 6099; GFX8-NEXT: v_readfirstlane_b32 s3, v1 6100; GFX8-NEXT: v_mov_b32_e32 v1, 0 6101; GFX8-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 6102; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1] 6103; GFX8-NEXT: v_mov_b32_e32 v2, s2 6104; GFX8-NEXT: v_mov_b32_e32 v1, s3 6105; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 6106; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc 6107; GFX8-NEXT: s_mov_b32 s3, 0xf000 6108; GFX8-NEXT: s_mov_b32 s2, -1 6109; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 6110; GFX8-NEXT: s_endpgm 6111; 6112; GFX9-LABEL: umax_i64_constant: 6113; GFX9: ; %bb.0: ; %entry 6114; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 6115; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 6116; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 6117; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 6118; 
GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 6119; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 6120; GFX9-NEXT: s_cbranch_execz .LBB22_2 6121; GFX9-NEXT: ; %bb.1: 6122; GFX9-NEXT: v_mov_b32_e32 v0, 5 6123; GFX9-NEXT: v_mov_b32_e32 v1, 0 6124; GFX9-NEXT: v_mov_b32_e32 v2, 0 6125; GFX9-NEXT: s_waitcnt lgkmcnt(0) 6126; GFX9-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] 6127; GFX9-NEXT: s_waitcnt lgkmcnt(0) 6128; GFX9-NEXT: .LBB22_2: 6129; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 6130; GFX9-NEXT: s_waitcnt lgkmcnt(0) 6131; GFX9-NEXT: v_readfirstlane_b32 s2, v0 6132; GFX9-NEXT: v_readfirstlane_b32 s3, v1 6133; GFX9-NEXT: v_mov_b32_e32 v1, 0 6134; GFX9-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 6135; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1] 6136; GFX9-NEXT: v_mov_b32_e32 v2, s2 6137; GFX9-NEXT: v_mov_b32_e32 v1, s3 6138; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 6139; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc 6140; GFX9-NEXT: s_mov_b32 s3, 0xf000 6141; GFX9-NEXT: s_mov_b32 s2, -1 6142; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 6143; GFX9-NEXT: s_endpgm 6144; 6145; GFX1064-LABEL: umax_i64_constant: 6146; GFX1064: ; %bb.0: ; %entry 6147; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 6148; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 6149; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 6150; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 6151; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 6152; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc 6153; GFX1064-NEXT: s_cbranch_execz .LBB22_2 6154; GFX1064-NEXT: ; %bb.1: 6155; GFX1064-NEXT: v_mov_b32_e32 v0, 5 6156; GFX1064-NEXT: v_mov_b32_e32 v1, 0 6157; GFX1064-NEXT: v_mov_b32_e32 v2, 0 6158; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6159; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 6160; GFX1064-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] 6161; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 6162; GFX1064-NEXT: buffer_gl0_inv 6163; GFX1064-NEXT: .LBB22_2: 6164; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 6165; GFX1064-NEXT: s_or_b64 exec, exec, 
s[2:3] 6166; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 6167; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 6168; GFX1064-NEXT: v_mov_b32_e32 v1, 0 6169; GFX1064-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 6170; GFX1064-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1] 6171; GFX1064-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc 6172; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, s3, vcc 6173; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 6174; GFX1064-NEXT: s_mov_b32 s2, -1 6175; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 6176; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 6177; GFX1064-NEXT: s_endpgm 6178; 6179; GFX1032-LABEL: umax_i64_constant: 6180; GFX1032: ; %bb.0: ; %entry 6181; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 6182; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 6183; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 6184; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 6185; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo 6186; GFX1032-NEXT: s_cbranch_execz .LBB22_2 6187; GFX1032-NEXT: ; %bb.1: 6188; GFX1032-NEXT: v_mov_b32_e32 v0, 5 6189; GFX1032-NEXT: v_mov_b32_e32 v1, 0 6190; GFX1032-NEXT: v_mov_b32_e32 v2, 0 6191; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6192; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 6193; GFX1032-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] 6194; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 6195; GFX1032-NEXT: buffer_gl0_inv 6196; GFX1032-NEXT: .LBB22_2: 6197; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 6198; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 6199; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 6200; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 6201; GFX1032-NEXT: v_mov_b32_e32 v1, 0 6202; GFX1032-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc_lo 6203; GFX1032-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[2:3], v[0:1] 6204; GFX1032-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo 6205; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, s3, vcc_lo 6206; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 6207; GFX1032-NEXT: s_mov_b32 s2, -1 6208; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 6209; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, 
s[0:3], 0 6210; GFX1032-NEXT: s_endpgm 6211; 6212; GFX1164-LABEL: umax_i64_constant: 6213; GFX1164: ; %bb.0: ; %entry 6214; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 6215; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 6216; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 6217; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 6218; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 6219; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 6220; GFX1164-NEXT: s_and_saveexec_b64 s[2:3], vcc 6221; GFX1164-NEXT: s_cbranch_execz .LBB22_2 6222; GFX1164-NEXT: ; %bb.1: 6223; GFX1164-NEXT: v_mov_b32_e32 v0, 5 6224; GFX1164-NEXT: v_mov_b32_e32 v1, 0 6225; GFX1164-NEXT: v_mov_b32_e32 v2, 0 6226; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6227; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 6228; GFX1164-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] 6229; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 6230; GFX1164-NEXT: buffer_gl0_inv 6231; GFX1164-NEXT: .LBB22_2: 6232; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] 6233; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 6234; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 6235; GFX1164-NEXT: v_mov_b32_e32 v1, 0 6236; GFX1164-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 6237; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) 6238; GFX1164-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1] 6239; GFX1164-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc 6240; GFX1164-NEXT: v_cndmask_b32_e64 v1, 0, s3, vcc 6241; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 6242; GFX1164-NEXT: s_mov_b32 s2, -1 6243; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 6244; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 6245; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 6246; GFX1164-NEXT: s_endpgm 6247; 6248; GFX1132-LABEL: umax_i64_constant: 6249; GFX1132: ; %bb.0: ; %entry 6250; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 6251; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 6252; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) 6253; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 6254; GFX1132-NEXT: ; 
implicit-def: $vgpr0_vgpr1 6255; GFX1132-NEXT: s_and_saveexec_b32 s2, vcc_lo 6256; GFX1132-NEXT: s_cbranch_execz .LBB22_2 6257; GFX1132-NEXT: ; %bb.1: 6258; GFX1132-NEXT: v_mov_b32_e32 v0, 5 6259; GFX1132-NEXT: v_mov_b32_e32 v1, 0 6260; GFX1132-NEXT: v_mov_b32_e32 v2, 0 6261; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6262; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 6263; GFX1132-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] 6264; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 6265; GFX1132-NEXT: buffer_gl0_inv 6266; GFX1132-NEXT: .LBB22_2: 6267; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2 6268; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 6269; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 6270; GFX1132-NEXT: v_mov_b32_e32 v1, 0 6271; GFX1132-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc_lo 6272; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) 6273; GFX1132-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[2:3], v[0:1] 6274; GFX1132-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo 6275; GFX1132-NEXT: v_cndmask_b32_e64 v1, 0, s3, vcc_lo 6276; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 6277; GFX1132-NEXT: s_mov_b32 s2, -1 6278; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 6279; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 6280; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 6281; GFX1132-NEXT: s_endpgm 6282entry: 6283 %old = atomicrmw umax i64 addrspace(3)* @local_var64, i64 5 acq_rel 6284 store i64 %old, i64 addrspace(1)* %out 6285 ret void 6286} 6287; 32-bit unsigned min on the addrspace(3) global @local_var32 whose operand is the per-lane workitem id - the GFX7LESS checks show a plain per-lane ds_min_rtn_u32 with no cross-lane reduction, while the GFX8 and newer checks show a v_min_u32_dpp scan (with -1 written under the inverted exec mask) feeding a single ds_min_rtn_u32 issued by the lane whose mbcnt result is zero. 6288define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) { 6289; 6290; 6291; GFX7LESS-LABEL: umin_i32_varying: 6292; GFX7LESS: ; %bb.0: ; %entry 6293; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 6294; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 6295; GFX7LESS-NEXT: s_mov_b32 m0, -1 6296; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 6297; GFX7LESS-NEXT: ds_min_rtn_u32 v0, v1, v0 6298; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 6299; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 6300; GFX7LESS-NEXT: s_mov_b32 s2, -1 6301; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 6302; GFX7LESS-NEXT: 
s_endpgm 6303; 6304; GFX8-LABEL: umin_i32_varying: 6305; GFX8: ; %bb.0: ; %entry 6306; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 6307; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 6308; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 6309; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 6310; GFX8-NEXT: v_mov_b32_e32 v1, -1 6311; GFX8-NEXT: s_mov_b64 exec, s[2:3] 6312; GFX8-NEXT: v_mov_b32_e32 v2, v0 6313; GFX8-NEXT: s_not_b64 exec, exec 6314; GFX8-NEXT: v_mov_b32_e32 v2, -1 6315; GFX8-NEXT: s_not_b64 exec, exec 6316; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 6317; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 6318; GFX8-NEXT: s_nop 1 6319; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 6320; GFX8-NEXT: s_nop 1 6321; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 6322; GFX8-NEXT: s_nop 1 6323; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 6324; GFX8-NEXT: s_nop 1 6325; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 6326; GFX8-NEXT: s_nop 1 6327; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 6328; GFX8-NEXT: v_readlane_b32 s4, v2, 63 6329; GFX8-NEXT: s_nop 0 6330; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 6331; GFX8-NEXT: s_mov_b64 exec, s[2:3] 6332; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 6333; GFX8-NEXT: ; implicit-def: $vgpr0 6334; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 6335; GFX8-NEXT: s_cbranch_execz .LBB23_2 6336; GFX8-NEXT: ; %bb.1: 6337; GFX8-NEXT: v_mov_b32_e32 v0, 0 6338; GFX8-NEXT: v_mov_b32_e32 v3, s4 6339; GFX8-NEXT: s_mov_b32 m0, -1 6340; GFX8-NEXT: s_waitcnt lgkmcnt(0) 6341; GFX8-NEXT: ds_min_rtn_u32 v0, v0, v3 6342; GFX8-NEXT: s_waitcnt lgkmcnt(0) 6343; GFX8-NEXT: .LBB23_2: 6344; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 6345; GFX8-NEXT: s_waitcnt lgkmcnt(0) 6346; GFX8-NEXT: v_readfirstlane_b32 s2, v0 6347; GFX8-NEXT: v_mov_b32_e32 v0, v1 6348; GFX8-NEXT: v_min_u32_e32 
v0, s2, v0 6349; GFX8-NEXT: s_mov_b32 s3, 0xf000 6350; GFX8-NEXT: s_mov_b32 s2, -1 6351; GFX8-NEXT: s_nop 0 6352; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 6353; GFX8-NEXT: s_endpgm 6354; 6355; GFX9-LABEL: umin_i32_varying: 6356; GFX9: ; %bb.0: ; %entry 6357; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 6358; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 6359; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 6360; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 6361; GFX9-NEXT: v_mov_b32_e32 v1, -1 6362; GFX9-NEXT: s_mov_b64 exec, s[2:3] 6363; GFX9-NEXT: v_mov_b32_e32 v2, v0 6364; GFX9-NEXT: s_not_b64 exec, exec 6365; GFX9-NEXT: v_mov_b32_e32 v2, -1 6366; GFX9-NEXT: s_not_b64 exec, exec 6367; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 6368; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 6369; GFX9-NEXT: s_nop 1 6370; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 6371; GFX9-NEXT: s_nop 1 6372; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 6373; GFX9-NEXT: s_nop 1 6374; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 6375; GFX9-NEXT: s_nop 1 6376; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 6377; GFX9-NEXT: s_nop 1 6378; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 6379; GFX9-NEXT: v_readlane_b32 s4, v2, 63 6380; GFX9-NEXT: s_nop 0 6381; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 6382; GFX9-NEXT: s_mov_b64 exec, s[2:3] 6383; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 6384; GFX9-NEXT: ; implicit-def: $vgpr0 6385; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 6386; GFX9-NEXT: s_cbranch_execz .LBB23_2 6387; GFX9-NEXT: ; %bb.1: 6388; GFX9-NEXT: v_mov_b32_e32 v0, 0 6389; GFX9-NEXT: v_mov_b32_e32 v3, s4 6390; GFX9-NEXT: s_waitcnt lgkmcnt(0) 6391; GFX9-NEXT: ds_min_rtn_u32 v0, v0, v3 6392; GFX9-NEXT: s_waitcnt lgkmcnt(0) 6393; GFX9-NEXT: .LBB23_2: 6394; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 
6395; GFX9-NEXT: s_waitcnt lgkmcnt(0) 6396; GFX9-NEXT: v_readfirstlane_b32 s2, v0 6397; GFX9-NEXT: v_mov_b32_e32 v0, v1 6398; GFX9-NEXT: v_min_u32_e32 v0, s2, v0 6399; GFX9-NEXT: s_mov_b32 s3, 0xf000 6400; GFX9-NEXT: s_mov_b32 s2, -1 6401; GFX9-NEXT: s_nop 0 6402; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 6403; GFX9-NEXT: s_endpgm 6404; 6405; GFX1064-LABEL: umin_i32_varying: 6406; GFX1064: ; %bb.0: ; %entry 6407; GFX1064-NEXT: v_mov_b32_e32 v1, v0 6408; GFX1064-NEXT: s_not_b64 exec, exec 6409; GFX1064-NEXT: v_mov_b32_e32 v1, -1 6410; GFX1064-NEXT: s_not_b64 exec, exec 6411; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 6412; GFX1064-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 6413; GFX1064-NEXT: v_mov_b32_e32 v3, -1 6414; GFX1064-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 6415; GFX1064-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 6416; GFX1064-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 6417; GFX1064-NEXT: v_mov_b32_e32 v2, v1 6418; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 6419; GFX1064-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 6420; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 6421; GFX1064-NEXT: v_mov_b32_e32 v2, s4 6422; GFX1064-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 6423; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 6424; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 6425; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 6426; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 6427; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 6428; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 6429; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 6430; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 6431; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 6432; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 6433; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 6434; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 6435; GFX1064-NEXT: 
v_writelane_b32 v3, s5, 32 6436; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 6437; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 6438; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 6439; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 6440; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 6441; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 6442; GFX1064-NEXT: s_mov_b32 s2, -1 6443; GFX1064-NEXT: ; implicit-def: $vgpr0 6444; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 6445; GFX1064-NEXT: s_cbranch_execz .LBB23_2 6446; GFX1064-NEXT: ; %bb.1: 6447; GFX1064-NEXT: v_mov_b32_e32 v0, 0 6448; GFX1064-NEXT: v_mov_b32_e32 v4, s7 6449; GFX1064-NEXT: s_mov_b32 s3, s7 6450; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6451; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 6452; GFX1064-NEXT: ds_min_rtn_u32 v0, v0, v4 6453; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 6454; GFX1064-NEXT: buffer_gl0_inv 6455; GFX1064-NEXT: .LBB23_2: 6456; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 6457; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 6458; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 6459; GFX1064-NEXT: v_mov_b32_e32 v0, v3 6460; GFX1064-NEXT: v_min_u32_e32 v0, s3, v0 6461; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 6462; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 6463; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 6464; GFX1064-NEXT: s_endpgm 6465; 6466; GFX1032-LABEL: umin_i32_varying: 6467; GFX1032: ; %bb.0: ; %entry 6468; GFX1032-NEXT: v_mov_b32_e32 v1, v0 6469; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 6470; GFX1032-NEXT: v_mov_b32_e32 v1, -1 6471; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 6472; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 6473; GFX1032-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 6474; GFX1032-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 6475; GFX1032-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 6476; GFX1032-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 6477; GFX1032-NEXT: v_mov_b32_e32 v2, v1 6478; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, 
-1 6479; GFX1032-NEXT: s_mov_b32 exec_lo, s2 6480; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 6481; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 6482; GFX1032-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 6483; GFX1032-NEXT: v_mov_b32_e32 v3, -1 6484; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 6485; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 6486; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 6487; GFX1032-NEXT: s_mov_b32 exec_lo, s2 6488; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 6489; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 6490; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 6491; GFX1032-NEXT: s_mov_b32 exec_lo, s2 6492; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 6493; GFX1032-NEXT: s_mov_b32 s2, -1 6494; GFX1032-NEXT: ; implicit-def: $vgpr0 6495; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 6496; GFX1032-NEXT: s_cbranch_execz .LBB23_2 6497; GFX1032-NEXT: ; %bb.1: 6498; GFX1032-NEXT: v_mov_b32_e32 v0, 0 6499; GFX1032-NEXT: v_mov_b32_e32 v4, s4 6500; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6501; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 6502; GFX1032-NEXT: ds_min_rtn_u32 v0, v0, v4 6503; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 6504; GFX1032-NEXT: buffer_gl0_inv 6505; GFX1032-NEXT: .LBB23_2: 6506; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 6507; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 6508; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 6509; GFX1032-NEXT: v_mov_b32_e32 v0, v3 6510; GFX1032-NEXT: v_min_u32_e32 v0, s3, v0 6511; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 6512; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 6513; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 6514; GFX1032-NEXT: s_endpgm 6515; 6516; GFX1164-LABEL: umin_i32_varying: 6517; GFX1164: ; %bb.0: ; %entry 6518; GFX1164-NEXT: v_mov_b32_e32 v1, v0 6519; GFX1164-NEXT: s_not_b64 exec, exec 6520; GFX1164-NEXT: v_mov_b32_e32 v1, -1 6521; GFX1164-NEXT: s_not_b64 exec, exec 6522; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 6523; GFX1164-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) 6524; GFX1164-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 6525; GFX1164-NEXT: v_mov_b32_e32 v3, -1 6526; GFX1164-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 6527; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 6528; GFX1164-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 6529; GFX1164-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 6530; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 6531; GFX1164-NEXT: v_mov_b32_e32 v2, v1 6532; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1 6533; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 6534; GFX1164-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 6535; GFX1164-NEXT: v_readlane_b32 s4, v1, 31 6536; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 6537; GFX1164-NEXT: v_mov_b32_e32 v2, s4 6538; GFX1164-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 6539; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) 6540; GFX1164-NEXT: v_readlane_b32 s4, v1, 15 6541; GFX1164-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 6542; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 6543; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 6544; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 6545; GFX1164-NEXT: v_readlane_b32 s5, v1, 31 6546; GFX1164-NEXT: v_writelane_b32 v3, s4, 16 6547; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 6548; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 6549; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 6550; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 6551; GFX1164-NEXT: v_readlane_b32 s7, v1, 63 6552; GFX1164-NEXT: v_readlane_b32 s6, v1, 47 6553; GFX1164-NEXT: v_writelane_b32 v3, s5, 32 6554; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 6555; GFX1164-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) 6556; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 6557; GFX1164-NEXT: s_or_saveexec_b64 s[4:5], -1 6558; GFX1164-NEXT: v_writelane_b32 v3, s6, 48 6559; GFX1164-NEXT: s_mov_b64 exec, s[4:5] 6560; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 6561; GFX1164-NEXT: s_mov_b32 s2, -1 6562; GFX1164-NEXT: ; implicit-def: $vgpr0 6563; GFX1164-NEXT: s_and_saveexec_b64 s[4:5], vcc 6564; GFX1164-NEXT: s_cbranch_execz .LBB23_2 6565; GFX1164-NEXT: ; %bb.1: 6566; GFX1164-NEXT: v_mov_b32_e32 v0, 0 6567; GFX1164-NEXT: v_mov_b32_e32 v4, s7 6568; GFX1164-NEXT: s_mov_b32 s3, s7 6569; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6570; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 6571; GFX1164-NEXT: ds_min_rtn_u32 v0, v0, v4 6572; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 6573; GFX1164-NEXT: buffer_gl0_inv 6574; GFX1164-NEXT: .LBB23_2: 6575; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] 6576; GFX1164-NEXT: v_readfirstlane_b32 s3, v0 6577; GFX1164-NEXT: v_mov_b32_e32 v0, v3 6578; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) 6579; GFX1164-NEXT: v_min_u32_e32 v0, s3, v0 6580; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 6581; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 6582; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 6583; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 6584; GFX1164-NEXT: s_endpgm 6585; 6586; GFX1132-LABEL: umin_i32_varying: 6587; GFX1132: ; %bb.0: ; %entry 6588; GFX1132-NEXT: v_mov_b32_e32 v1, v0 6589; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo 6590; GFX1132-NEXT: v_mov_b32_e32 v1, -1 6591; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo 6592; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 6593; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 6594; GFX1132-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 6595; GFX1132-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 6596; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 6597; 
GFX1132-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 6598; GFX1132-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 6599; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 6600; GFX1132-NEXT: v_mov_b32_e32 v2, v1 6601; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1 6602; GFX1132-NEXT: s_mov_b32 exec_lo, s2 6603; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 6604; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 6605; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) 6606; GFX1132-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 6607; GFX1132-NEXT: v_mov_b32_e32 v3, -1 6608; GFX1132-NEXT: v_readlane_b32 s3, v1, 15 6609; GFX1132-NEXT: v_readlane_b32 s4, v1, 31 6610; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) 6611; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 6612; GFX1132-NEXT: s_mov_b32 exec_lo, s2 6613; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 6614; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 6615; GFX1132-NEXT: v_writelane_b32 v3, s3, 16 6616; GFX1132-NEXT: s_mov_b32 exec_lo, s2 6617; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) 6618; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 6619; GFX1132-NEXT: s_mov_b32 s2, -1 6620; GFX1132-NEXT: ; implicit-def: $vgpr0 6621; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo 6622; GFX1132-NEXT: s_cbranch_execz .LBB23_2 6623; GFX1132-NEXT: ; %bb.1: 6624; GFX1132-NEXT: v_mov_b32_e32 v0, 0 6625; GFX1132-NEXT: v_mov_b32_e32 v4, s4 6626; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6627; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 6628; GFX1132-NEXT: ds_min_rtn_u32 v0, v0, v4 6629; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 6630; GFX1132-NEXT: buffer_gl0_inv 6631; GFX1132-NEXT: .LBB23_2: 6632; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 6633; GFX1132-NEXT: v_readfirstlane_b32 s3, v0 6634; GFX1132-NEXT: v_mov_b32_e32 v0, v3 6635; GFX1132-NEXT: 
s_delay_alu instid0(VALU_DEP_1) 6636; GFX1132-NEXT: v_min_u32_e32 v0, s3, v0 6637; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 6638; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 6639; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 6640; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 6641; GFX1132-NEXT: s_endpgm 6642entry: 6643 %lane = call i32 @llvm.amdgcn.workitem.id.x() 6644 %old = atomicrmw umin i32 addrspace(3)* @local_var32, i32 %lane acq_rel 6645 store i32 %old, i32 addrspace(1)* %out 6646 ret void 6647} 6648 6649define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) { 6650; 6651; 6652; GFX7LESS-LABEL: umin_i64_constant: 6653; GFX7LESS: ; %bb.0: ; %entry 6654; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 6655; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 6656; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 6657; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 6658; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 6659; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc 6660; GFX7LESS-NEXT: s_cbranch_execz .LBB24_2 6661; GFX7LESS-NEXT: ; %bb.1: 6662; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0 6663; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5 6664; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 6665; GFX7LESS-NEXT: s_mov_b32 m0, -1 6666; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 6667; GFX7LESS-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1] 6668; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 6669; GFX7LESS-NEXT: .LBB24_2: 6670; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] 6671; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 6672; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 6673; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 6674; GFX7LESS-NEXT: s_mov_b32 s2, -1 6675; GFX7LESS-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc 6676; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 6677; GFX7LESS-NEXT: v_mov_b32_e32 v2, s5 6678; GFX7LESS-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1] 6679; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 6680; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4 6681; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 6682; 
GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 6683; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 6684; GFX7LESS-NEXT: s_endpgm 6685; 6686; GFX8-LABEL: umin_i64_constant: 6687; GFX8: ; %bb.0: ; %entry 6688; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 6689; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 6690; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 6691; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 6692; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 6693; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 6694; GFX8-NEXT: s_cbranch_execz .LBB24_2 6695; GFX8-NEXT: ; %bb.1: 6696; GFX8-NEXT: v_mov_b32_e32 v0, 5 6697; GFX8-NEXT: v_mov_b32_e32 v2, 0 6698; GFX8-NEXT: v_mov_b32_e32 v1, 0 6699; GFX8-NEXT: s_mov_b32 m0, -1 6700; GFX8-NEXT: s_waitcnt lgkmcnt(0) 6701; GFX8-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1] 6702; GFX8-NEXT: s_waitcnt lgkmcnt(0) 6703; GFX8-NEXT: .LBB24_2: 6704; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 6705; GFX8-NEXT: s_waitcnt lgkmcnt(0) 6706; GFX8-NEXT: v_readfirstlane_b32 s4, v0 6707; GFX8-NEXT: v_readfirstlane_b32 s5, v1 6708; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc 6709; GFX8-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 6710; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1] 6711; GFX8-NEXT: v_mov_b32_e32 v2, s5 6712; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 6713; GFX8-NEXT: v_mov_b32_e32 v2, s4 6714; GFX8-NEXT: s_mov_b32 s2, -1 6715; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 6716; GFX8-NEXT: s_mov_b32 s3, 0xf000 6717; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 6718; GFX8-NEXT: s_endpgm 6719; 6720; GFX9-LABEL: umin_i64_constant: 6721; GFX9: ; %bb.0: ; %entry 6722; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 6723; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 6724; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 6725; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 6726; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 6727; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 6728; GFX9-NEXT: s_cbranch_execz .LBB24_2 6729; GFX9-NEXT: ; %bb.1: 6730; GFX9-NEXT: v_mov_b32_e32 v0, 
5 6731; GFX9-NEXT: v_mov_b32_e32 v1, 0 6732; GFX9-NEXT: v_mov_b32_e32 v2, 0 6733; GFX9-NEXT: s_waitcnt lgkmcnt(0) 6734; GFX9-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1] 6735; GFX9-NEXT: s_waitcnt lgkmcnt(0) 6736; GFX9-NEXT: .LBB24_2: 6737; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 6738; GFX9-NEXT: s_waitcnt lgkmcnt(0) 6739; GFX9-NEXT: v_readfirstlane_b32 s4, v0 6740; GFX9-NEXT: v_readfirstlane_b32 s5, v1 6741; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc 6742; GFX9-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 6743; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1] 6744; GFX9-NEXT: v_mov_b32_e32 v2, s5 6745; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 6746; GFX9-NEXT: v_mov_b32_e32 v2, s4 6747; GFX9-NEXT: s_mov_b32 s2, -1 6748; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 6749; GFX9-NEXT: s_mov_b32 s3, 0xf000 6750; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 6751; GFX9-NEXT: s_endpgm 6752; 6753; GFX1064-LABEL: umin_i64_constant: 6754; GFX1064: ; %bb.0: ; %entry 6755; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 6756; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 6757; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 6758; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 6759; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 6760; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc 6761; GFX1064-NEXT: s_cbranch_execz .LBB24_2 6762; GFX1064-NEXT: ; %bb.1: 6763; GFX1064-NEXT: v_mov_b32_e32 v0, 5 6764; GFX1064-NEXT: v_mov_b32_e32 v1, 0 6765; GFX1064-NEXT: v_mov_b32_e32 v2, 0 6766; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6767; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 6768; GFX1064-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1] 6769; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 6770; GFX1064-NEXT: buffer_gl0_inv 6771; GFX1064-NEXT: .LBB24_2: 6772; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 6773; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] 6774; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 6775; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 6776; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc 6777; GFX1064-NEXT: 
v_cndmask_b32_e64 v0, 5, -1, vcc 6778; GFX1064-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1] 6779; GFX1064-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc 6780; GFX1064-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc 6781; GFX1064-NEXT: s_mov_b32 s2, -1 6782; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 6783; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 6784; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 6785; GFX1064-NEXT: s_endpgm 6786; 6787; GFX1032-LABEL: umin_i64_constant: 6788; GFX1032: ; %bb.0: ; %entry 6789; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 6790; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 6791; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 6792; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 6793; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo 6794; GFX1032-NEXT: s_cbranch_execz .LBB24_2 6795; GFX1032-NEXT: ; %bb.1: 6796; GFX1032-NEXT: v_mov_b32_e32 v0, 5 6797; GFX1032-NEXT: v_mov_b32_e32 v1, 0 6798; GFX1032-NEXT: v_mov_b32_e32 v2, 0 6799; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6800; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 6801; GFX1032-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1] 6802; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 6803; GFX1032-NEXT: buffer_gl0_inv 6804; GFX1032-NEXT: .LBB24_2: 6805; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 6806; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 6807; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 6808; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 6809; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo 6810; GFX1032-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc_lo 6811; GFX1032-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[2:3], v[0:1] 6812; GFX1032-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo 6813; GFX1032-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo 6814; GFX1032-NEXT: s_mov_b32 s2, -1 6815; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 6816; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 6817; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 6818; GFX1032-NEXT: s_endpgm 6819; 6820; GFX1164-LABEL: umin_i64_constant: 6821; GFX1164: ; %bb.0: ; %entry 6822; GFX1164-NEXT: 
s_load_b64 s[0:1], s[0:1], 0x24 6823; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 6824; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 6825; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 6826; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 6827; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 6828; GFX1164-NEXT: s_and_saveexec_b64 s[2:3], vcc 6829; GFX1164-NEXT: s_cbranch_execz .LBB24_2 6830; GFX1164-NEXT: ; %bb.1: 6831; GFX1164-NEXT: v_mov_b32_e32 v0, 5 6832; GFX1164-NEXT: v_mov_b32_e32 v1, 0 6833; GFX1164-NEXT: v_mov_b32_e32 v2, 0 6834; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6835; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 6836; GFX1164-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1] 6837; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 6838; GFX1164-NEXT: buffer_gl0_inv 6839; GFX1164-NEXT: .LBB24_2: 6840; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] 6841; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 6842; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 6843; GFX1164-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc 6844; GFX1164-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 6845; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) 6846; GFX1164-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1] 6847; GFX1164-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc 6848; GFX1164-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc 6849; GFX1164-NEXT: s_mov_b32 s2, -1 6850; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 6851; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 6852; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 6853; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 6854; GFX1164-NEXT: s_endpgm 6855; 6856; GFX1132-LABEL: umin_i64_constant: 6857; GFX1132: ; %bb.0: ; %entry 6858; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 6859; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 6860; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) 6861; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 6862; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 6863; GFX1132-NEXT: s_and_saveexec_b32 s2, vcc_lo 6864; GFX1132-NEXT: s_cbranch_execz .LBB24_2 
6865; GFX1132-NEXT: ; %bb.1: 6866; GFX1132-NEXT: v_mov_b32_e32 v0, 5 6867; GFX1132-NEXT: v_mov_b32_e32 v1, 0 6868; GFX1132-NEXT: v_mov_b32_e32 v2, 0 6869; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6870; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 6871; GFX1132-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1] 6872; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 6873; GFX1132-NEXT: buffer_gl0_inv 6874; GFX1132-NEXT: .LBB24_2: 6875; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2 6876; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 6877; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 6878; GFX1132-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo 6879; GFX1132-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc_lo 6880; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) 6881; GFX1132-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[2:3], v[0:1] 6882; GFX1132-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo 6883; GFX1132-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo 6884; GFX1132-NEXT: s_mov_b32 s2, -1 6885; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 6886; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 6887; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 6888; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 6889; GFX1132-NEXT: s_endpgm 6890entry: 6891 %old = atomicrmw umin i64 addrspace(3)* @local_var64, i64 5 acq_rel 6892 store i64 %old, i64 addrspace(1)* %out 6893 ret void 6894} 6895