1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --force-update 2; RUN: llc -march=amdgcn -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX7LESS %s 3; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s 4; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s 5; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX1064 %s 6; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX1032 %s 7; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX1164 %s 8; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX1132 %s 9 10declare i32 @llvm.amdgcn.workitem.id.x() 11 12@local_var32 = addrspace(3) global i32 undef, align 4 13@local_var64 = addrspace(3) global i64 undef, align 8 14 15; Show what the atomic optimization pass will do for local pointers. 
16 17define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out) { 18; 19; 20; GFX7LESS-LABEL: add_i32_constant: 21; GFX7LESS: ; %bb.0: ; %entry 22; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec 23; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 24; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 25; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 26; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 27; GFX7LESS-NEXT: ; implicit-def: $vgpr1 28; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc 29; GFX7LESS-NEXT: s_cbranch_execz .LBB0_2 30; GFX7LESS-NEXT: ; %bb.1: 31; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 32; GFX7LESS-NEXT: s_mul_i32 s2, s2, 5 33; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 34; GFX7LESS-NEXT: v_mov_b32_e32 v2, s2 35; GFX7LESS-NEXT: s_mov_b32 m0, -1 36; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 37; GFX7LESS-NEXT: ds_add_rtn_u32 v1, v1, v2 38; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 39; GFX7LESS-NEXT: .LBB0_2: 40; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] 41; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 42; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v1 43; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 44; GFX7LESS-NEXT: v_mad_u32_u24 v0, v0, 5, s2 45; GFX7LESS-NEXT: s_mov_b32 s2, -1 46; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 47; GFX7LESS-NEXT: s_endpgm 48; 49; GFX8-LABEL: add_i32_constant: 50; GFX8: ; %bb.0: ; %entry 51; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 52; GFX8-NEXT: s_mov_b64 s[2:3], exec 53; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 54; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 55; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 56; GFX8-NEXT: ; implicit-def: $vgpr1 57; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 58; GFX8-NEXT: s_cbranch_execz .LBB0_2 59; GFX8-NEXT: ; %bb.1: 60; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 61; GFX8-NEXT: s_mul_i32 s2, s2, 5 62; GFX8-NEXT: v_mov_b32_e32 v1, 0 63; GFX8-NEXT: v_mov_b32_e32 v2, s2 64; GFX8-NEXT: s_mov_b32 m0, -1 65; GFX8-NEXT: s_waitcnt lgkmcnt(0) 66; GFX8-NEXT: ds_add_rtn_u32 v1, v1, v2 67; GFX8-NEXT: s_waitcnt lgkmcnt(0) 68; GFX8-NEXT: 
.LBB0_2: 69; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 70; GFX8-NEXT: s_waitcnt lgkmcnt(0) 71; GFX8-NEXT: v_readfirstlane_b32 s2, v1 72; GFX8-NEXT: v_mad_u32_u24 v0, v0, 5, s2 73; GFX8-NEXT: s_mov_b32 s3, 0xf000 74; GFX8-NEXT: s_mov_b32 s2, -1 75; GFX8-NEXT: s_nop 1 76; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 77; GFX8-NEXT: s_endpgm 78; 79; GFX9-LABEL: add_i32_constant: 80; GFX9: ; %bb.0: ; %entry 81; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 82; GFX9-NEXT: s_mov_b64 s[2:3], exec 83; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 84; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 85; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 86; GFX9-NEXT: ; implicit-def: $vgpr1 87; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 88; GFX9-NEXT: s_cbranch_execz .LBB0_2 89; GFX9-NEXT: ; %bb.1: 90; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 91; GFX9-NEXT: s_mul_i32 s2, s2, 5 92; GFX9-NEXT: v_mov_b32_e32 v1, 0 93; GFX9-NEXT: v_mov_b32_e32 v2, s2 94; GFX9-NEXT: s_waitcnt lgkmcnt(0) 95; GFX9-NEXT: ds_add_rtn_u32 v1, v1, v2 96; GFX9-NEXT: s_waitcnt lgkmcnt(0) 97; GFX9-NEXT: .LBB0_2: 98; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 99; GFX9-NEXT: s_waitcnt lgkmcnt(0) 100; GFX9-NEXT: v_readfirstlane_b32 s2, v1 101; GFX9-NEXT: v_mad_u32_u24 v0, v0, 5, s2 102; GFX9-NEXT: s_mov_b32 s3, 0xf000 103; GFX9-NEXT: s_mov_b32 s2, -1 104; GFX9-NEXT: s_nop 1 105; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 106; GFX9-NEXT: s_endpgm 107; 108; GFX1064-LABEL: add_i32_constant: 109; GFX1064: ; %bb.0: ; %entry 110; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 111; GFX1064-NEXT: s_mov_b64 s[2:3], exec 112; GFX1064-NEXT: ; implicit-def: $vgpr1 113; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 114; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 115; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 116; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 117; GFX1064-NEXT: s_cbranch_execz .LBB0_2 118; GFX1064-NEXT: ; %bb.1: 119; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 120; GFX1064-NEXT: v_mov_b32_e32 v1, 0 121; GFX1064-NEXT: s_mul_i32 s2, s2, 
5 122; GFX1064-NEXT: v_mov_b32_e32 v2, s2 123; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 124; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 125; GFX1064-NEXT: ds_add_rtn_u32 v1, v1, v2 126; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 127; GFX1064-NEXT: buffer_gl0_inv 128; GFX1064-NEXT: .LBB0_2: 129; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 130; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 131; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 132; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 133; GFX1064-NEXT: v_mad_u32_u24 v0, v0, 5, s2 134; GFX1064-NEXT: s_mov_b32 s2, -1 135; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 136; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 137; GFX1064-NEXT: s_endpgm 138; 139; GFX1032-LABEL: add_i32_constant: 140; GFX1032: ; %bb.0: ; %entry 141; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 142; GFX1032-NEXT: s_mov_b32 s3, exec_lo 143; GFX1032-NEXT: ; implicit-def: $vgpr1 144; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 145; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 146; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo 147; GFX1032-NEXT: s_cbranch_execz .LBB0_2 148; GFX1032-NEXT: ; %bb.1: 149; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 150; GFX1032-NEXT: v_mov_b32_e32 v1, 0 151; GFX1032-NEXT: s_mul_i32 s3, s3, 5 152; GFX1032-NEXT: v_mov_b32_e32 v2, s3 153; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 154; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 155; GFX1032-NEXT: ds_add_rtn_u32 v1, v1, v2 156; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 157; GFX1032-NEXT: buffer_gl0_inv 158; GFX1032-NEXT: .LBB0_2: 159; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 160; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 161; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 162; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 163; GFX1032-NEXT: v_mad_u32_u24 v0, v0, 5, s2 164; GFX1032-NEXT: s_mov_b32 s2, -1 165; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 166; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 167; GFX1032-NEXT: s_endpgm 168; 169; GFX1164-LABEL: add_i32_constant: 170; GFX1164: ; %bb.0: ; %entry 171; GFX1164-NEXT: s_load_b64 
s[0:1], s[0:1], 0x24 172; GFX1164-NEXT: s_mov_b64 s[2:3], exec 173; GFX1164-NEXT: s_mov_b64 s[4:5], exec 174; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 175; GFX1164-NEXT: ; implicit-def: $vgpr1 176; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 177; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 178; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 179; GFX1164-NEXT: s_cbranch_execz .LBB0_2 180; GFX1164-NEXT: ; %bb.1: 181; GFX1164-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 182; GFX1164-NEXT: v_mov_b32_e32 v1, 0 183; GFX1164-NEXT: s_mul_i32 s2, s2, 5 184; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 185; GFX1164-NEXT: v_mov_b32_e32 v2, s2 186; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 187; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 188; GFX1164-NEXT: ds_add_rtn_u32 v1, v1, v2 189; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 190; GFX1164-NEXT: buffer_gl0_inv 191; GFX1164-NEXT: .LBB0_2: 192; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] 193; GFX1164-NEXT: v_readfirstlane_b32 s2, v1 194; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 195; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) 196; GFX1164-NEXT: v_mad_u32_u24 v0, v0, 5, s2 197; GFX1164-NEXT: s_mov_b32 s2, -1 198; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 199; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 200; GFX1164-NEXT: s_endpgm 201; 202; GFX1132-LABEL: add_i32_constant: 203; GFX1132: ; %bb.0: ; %entry 204; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 205; GFX1132-NEXT: s_mov_b32 s3, exec_lo 206; GFX1132-NEXT: s_mov_b32 s2, exec_lo 207; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 208; GFX1132-NEXT: ; implicit-def: $vgpr1 209; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) 210; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 211; GFX1132-NEXT: s_cbranch_execz .LBB0_2 212; GFX1132-NEXT: ; %bb.1: 213; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s3 214; GFX1132-NEXT: v_mov_b32_e32 v1, 0 215; GFX1132-NEXT: s_mul_i32 s3, s3, 5 216; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 217; GFX1132-NEXT: v_mov_b32_e32 v2, s3 218; GFX1132-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) 219; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 220; GFX1132-NEXT: ds_add_rtn_u32 v1, v1, v2 221; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 222; GFX1132-NEXT: buffer_gl0_inv 223; GFX1132-NEXT: .LBB0_2: 224; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2 225; GFX1132-NEXT: v_readfirstlane_b32 s2, v1 226; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 227; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) 228; GFX1132-NEXT: v_mad_u32_u24 v0, v0, 5, s2 229; GFX1132-NEXT: s_mov_b32 s2, -1 230; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 231; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 232; GFX1132-NEXT: s_endpgm 233entry: 234 %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 5 acq_rel 235 store i32 %old, i32 addrspace(1)* %out 236 ret void 237} 238 239define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 %additive) { 240; 241; 242; GFX7LESS-LABEL: add_i32_uniform: 243; GFX7LESS: ; %bb.0: ; %entry 244; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec 245; GFX7LESS-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 246; GFX7LESS-NEXT: s_load_dword s6, s[0:1], 0xb 247; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 248; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 249; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 250; GFX7LESS-NEXT: ; implicit-def: $vgpr1 251; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc 252; GFX7LESS-NEXT: s_cbranch_execz .LBB1_2 253; GFX7LESS-NEXT: ; %bb.1: 254; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 255; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 256; GFX7LESS-NEXT: s_mul_i32 s2, s6, s2 257; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 258; GFX7LESS-NEXT: v_mov_b32_e32 v2, s2 259; GFX7LESS-NEXT: s_mov_b32 m0, -1 260; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 261; GFX7LESS-NEXT: ds_add_rtn_u32 v1, v1, v2 262; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 263; GFX7LESS-NEXT: .LBB1_2: 264; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1] 265; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 266; GFX7LESS-NEXT: v_readfirstlane_b32 s0, v1 267; GFX7LESS-NEXT: v_mul_lo_u32 v0, s6, v0 268; 
GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 269; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s0, v0 270; GFX7LESS-NEXT: s_mov_b32 s6, -1 271; GFX7LESS-NEXT: buffer_store_dword v0, off, s[4:7], 0 272; GFX7LESS-NEXT: s_endpgm 273; 274; GFX8-LABEL: add_i32_uniform: 275; GFX8: ; %bb.0: ; %entry 276; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 277; GFX8-NEXT: s_load_dword s6, s[0:1], 0x2c 278; GFX8-NEXT: s_mov_b64 s[2:3], exec 279; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 280; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 281; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 282; GFX8-NEXT: ; implicit-def: $vgpr1 283; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc 284; GFX8-NEXT: s_cbranch_execz .LBB1_2 285; GFX8-NEXT: ; %bb.1: 286; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 287; GFX8-NEXT: s_waitcnt lgkmcnt(0) 288; GFX8-NEXT: s_mul_i32 s2, s6, s2 289; GFX8-NEXT: v_mov_b32_e32 v1, 0 290; GFX8-NEXT: v_mov_b32_e32 v2, s2 291; GFX8-NEXT: s_mov_b32 m0, -1 292; GFX8-NEXT: s_waitcnt lgkmcnt(0) 293; GFX8-NEXT: ds_add_rtn_u32 v1, v1, v2 294; GFX8-NEXT: s_waitcnt lgkmcnt(0) 295; GFX8-NEXT: .LBB1_2: 296; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] 297; GFX8-NEXT: s_waitcnt lgkmcnt(0) 298; GFX8-NEXT: v_mul_lo_u32 v0, s6, v0 299; GFX8-NEXT: v_readfirstlane_b32 s0, v1 300; GFX8-NEXT: s_mov_b32 s7, 0xf000 301; GFX8-NEXT: s_mov_b32 s6, -1 302; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 303; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0 304; GFX8-NEXT: s_endpgm 305; 306; GFX9-LABEL: add_i32_uniform: 307; GFX9: ; %bb.0: ; %entry 308; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 309; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c 310; GFX9-NEXT: s_mov_b64 s[2:3], exec 311; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 312; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 313; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 314; GFX9-NEXT: ; implicit-def: $vgpr1 315; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc 316; GFX9-NEXT: s_cbranch_execz .LBB1_2 317; GFX9-NEXT: ; %bb.1: 318; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 319; GFX9-NEXT: s_waitcnt lgkmcnt(0) 320; 
GFX9-NEXT: s_mul_i32 s2, s6, s2 321; GFX9-NEXT: v_mov_b32_e32 v1, 0 322; GFX9-NEXT: v_mov_b32_e32 v2, s2 323; GFX9-NEXT: s_waitcnt lgkmcnt(0) 324; GFX9-NEXT: ds_add_rtn_u32 v1, v1, v2 325; GFX9-NEXT: s_waitcnt lgkmcnt(0) 326; GFX9-NEXT: .LBB1_2: 327; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] 328; GFX9-NEXT: s_waitcnt lgkmcnt(0) 329; GFX9-NEXT: v_mul_lo_u32 v0, s6, v0 330; GFX9-NEXT: v_readfirstlane_b32 s0, v1 331; GFX9-NEXT: s_mov_b32 s7, 0xf000 332; GFX9-NEXT: s_mov_b32 s6, -1 333; GFX9-NEXT: v_add_u32_e32 v0, s0, v0 334; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 335; GFX9-NEXT: s_endpgm 336; 337; GFX1064-LABEL: add_i32_uniform: 338; GFX1064: ; %bb.0: ; %entry 339; GFX1064-NEXT: s_clause 0x1 340; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 341; GFX1064-NEXT: s_load_dword s6, s[0:1], 0x2c 342; GFX1064-NEXT: s_mov_b64 s[2:3], exec 343; GFX1064-NEXT: ; implicit-def: $vgpr1 344; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 345; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 346; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 347; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc 348; GFX1064-NEXT: s_cbranch_execz .LBB1_2 349; GFX1064-NEXT: ; %bb.1: 350; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 351; GFX1064-NEXT: v_mov_b32_e32 v1, 0 352; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 353; GFX1064-NEXT: s_mul_i32 s2, s6, s2 354; GFX1064-NEXT: v_mov_b32_e32 v2, s2 355; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 356; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 357; GFX1064-NEXT: ds_add_rtn_u32 v1, v1, v2 358; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 359; GFX1064-NEXT: buffer_gl0_inv 360; GFX1064-NEXT: .LBB1_2: 361; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 362; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] 363; GFX1064-NEXT: v_readfirstlane_b32 s0, v1 364; GFX1064-NEXT: s_mov_b32 s7, 0x31016000 365; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 366; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s6, v0, s[0:1] 367; GFX1064-NEXT: s_mov_b32 s6, -1 368; GFX1064-NEXT: buffer_store_dword v0, off, s[4:7], 0 369; 
GFX1064-NEXT: s_endpgm 370; 371; GFX1032-LABEL: add_i32_uniform: 372; GFX1032: ; %bb.0: ; %entry 373; GFX1032-NEXT: s_clause 0x1 374; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 375; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x2c 376; GFX1032-NEXT: s_mov_b32 s3, exec_lo 377; GFX1032-NEXT: ; implicit-def: $vgpr1 378; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 379; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 380; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo 381; GFX1032-NEXT: s_cbranch_execz .LBB1_2 382; GFX1032-NEXT: ; %bb.1: 383; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s3 384; GFX1032-NEXT: v_mov_b32_e32 v1, 0 385; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 386; GFX1032-NEXT: s_mul_i32 s1, s2, s1 387; GFX1032-NEXT: v_mov_b32_e32 v2, s1 388; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 389; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 390; GFX1032-NEXT: ds_add_rtn_u32 v1, v1, v2 391; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 392; GFX1032-NEXT: buffer_gl0_inv 393; GFX1032-NEXT: .LBB1_2: 394; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 395; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 396; GFX1032-NEXT: v_readfirstlane_b32 s0, v1 397; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 398; GFX1032-NEXT: s_mov_b32 s6, -1 399; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 400; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s0, s2, v0, s[0:1] 401; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0 402; GFX1032-NEXT: s_endpgm 403; 404; GFX1164-LABEL: add_i32_uniform: 405; GFX1164: ; %bb.0: ; %entry 406; GFX1164-NEXT: s_clause 0x1 407; GFX1164-NEXT: s_load_b64 s[4:5], s[0:1], 0x24 408; GFX1164-NEXT: s_load_b32 s6, s[0:1], 0x2c 409; GFX1164-NEXT: s_mov_b64 s[2:3], exec 410; GFX1164-NEXT: s_mov_b64 s[0:1], exec 411; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 412; GFX1164-NEXT: ; implicit-def: $vgpr1 413; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 414; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 415; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 416; GFX1164-NEXT: s_cbranch_execz .LBB1_2 417; 
GFX1164-NEXT: ; %bb.1: 418; GFX1164-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 419; GFX1164-NEXT: v_mov_b32_e32 v1, 0 420; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 421; GFX1164-NEXT: s_mul_i32 s2, s6, s2 422; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 423; GFX1164-NEXT: v_mov_b32_e32 v2, s2 424; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 425; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 426; GFX1164-NEXT: ds_add_rtn_u32 v1, v1, v2 427; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 428; GFX1164-NEXT: buffer_gl0_inv 429; GFX1164-NEXT: .LBB1_2: 430; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] 431; GFX1164-NEXT: v_readfirstlane_b32 s0, v1 432; GFX1164-NEXT: s_mov_b32 s7, 0x31016000 433; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 434; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) 435; GFX1164-NEXT: v_mad_u64_u32 v[1:2], null, s6, v0, s[0:1] 436; GFX1164-NEXT: s_mov_b32 s6, -1 437; GFX1164-NEXT: buffer_store_b32 v1, off, s[4:7], 0 438; GFX1164-NEXT: s_endpgm 439; 440; GFX1132-LABEL: add_i32_uniform: 441; GFX1132: ; %bb.0: ; %entry 442; GFX1132-NEXT: s_clause 0x1 443; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x24 444; GFX1132-NEXT: s_load_b32 s0, s[0:1], 0x2c 445; GFX1132-NEXT: s_mov_b32 s2, exec_lo 446; GFX1132-NEXT: s_mov_b32 s1, exec_lo 447; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 448; GFX1132-NEXT: ; implicit-def: $vgpr1 449; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) 450; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 451; GFX1132-NEXT: s_cbranch_execz .LBB1_2 452; GFX1132-NEXT: ; %bb.1: 453; GFX1132-NEXT: s_bcnt1_i32_b32 s2, s2 454; GFX1132-NEXT: v_mov_b32_e32 v1, 0 455; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 456; GFX1132-NEXT: s_mul_i32 s2, s0, s2 457; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 458; GFX1132-NEXT: v_mov_b32_e32 v2, s2 459; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 460; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 461; GFX1132-NEXT: ds_add_rtn_u32 v1, v1, v2 462; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 463; GFX1132-NEXT: buffer_gl0_inv 464; GFX1132-NEXT: .LBB1_2: 465; GFX1132-NEXT: s_or_b32 
exec_lo, exec_lo, s1 466; GFX1132-NEXT: v_readfirstlane_b32 s2, v1 467; GFX1132-NEXT: s_mov_b32 s7, 0x31016000 468; GFX1132-NEXT: s_mov_b32 s6, -1 469; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 470; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) 471; GFX1132-NEXT: v_mad_u64_u32 v[1:2], null, s0, v0, s[2:3] 472; GFX1132-NEXT: buffer_store_b32 v1, off, s[4:7], 0 473; GFX1132-NEXT: s_endpgm 474entry: 475 %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %additive acq_rel 476 store i32 %old, i32 addrspace(1)* %out 477 ret void 478} 479 480define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out) { 481; 482; 483; GFX7LESS-LABEL: add_i32_varying: 484; GFX7LESS: ; %bb.0: ; %entry 485; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 486; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 487; GFX7LESS-NEXT: s_mov_b32 m0, -1 488; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 489; GFX7LESS-NEXT: ds_add_rtn_u32 v0, v1, v0 490; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 491; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 492; GFX7LESS-NEXT: s_mov_b32 s2, -1 493; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 494; GFX7LESS-NEXT: s_endpgm 495; 496; GFX8-LABEL: add_i32_varying: 497; GFX8: ; %bb.0: ; %entry 498; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 499; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 500; GFX8-NEXT: v_mov_b32_e32 v1, 0 501; GFX8-NEXT: s_mov_b64 exec, s[2:3] 502; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 503; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 504; GFX8-NEXT: v_mov_b32_e32 v2, v0 505; GFX8-NEXT: s_not_b64 exec, exec 506; GFX8-NEXT: v_mov_b32_e32 v2, 0 507; GFX8-NEXT: s_not_b64 exec, exec 508; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 509; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 510; GFX8-NEXT: s_nop 1 511; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 512; GFX8-NEXT: s_nop 1 513; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 514; GFX8-NEXT: 
s_nop 1 515; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 516; GFX8-NEXT: s_nop 1 517; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 518; GFX8-NEXT: s_nop 1 519; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 520; GFX8-NEXT: v_readlane_b32 s4, v2, 63 521; GFX8-NEXT: s_nop 0 522; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 523; GFX8-NEXT: s_mov_b64 exec, s[2:3] 524; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 525; GFX8-NEXT: ; implicit-def: $vgpr0 526; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 527; GFX8-NEXT: s_cbranch_execz .LBB2_2 528; GFX8-NEXT: ; %bb.1: 529; GFX8-NEXT: v_mov_b32_e32 v0, 0 530; GFX8-NEXT: v_mov_b32_e32 v3, s4 531; GFX8-NEXT: s_mov_b32 m0, -1 532; GFX8-NEXT: s_waitcnt lgkmcnt(0) 533; GFX8-NEXT: ds_add_rtn_u32 v0, v0, v3 534; GFX8-NEXT: s_waitcnt lgkmcnt(0) 535; GFX8-NEXT: .LBB2_2: 536; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 537; GFX8-NEXT: s_waitcnt lgkmcnt(0) 538; GFX8-NEXT: v_readfirstlane_b32 s2, v0 539; GFX8-NEXT: v_mov_b32_e32 v0, v1 540; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0 541; GFX8-NEXT: s_mov_b32 s3, 0xf000 542; GFX8-NEXT: s_mov_b32 s2, -1 543; GFX8-NEXT: s_nop 0 544; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 545; GFX8-NEXT: s_endpgm 546; 547; GFX9-LABEL: add_i32_varying: 548; GFX9: ; %bb.0: ; %entry 549; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 550; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 551; GFX9-NEXT: v_mov_b32_e32 v1, 0 552; GFX9-NEXT: s_mov_b64 exec, s[2:3] 553; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 554; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 555; GFX9-NEXT: v_mov_b32_e32 v2, v0 556; GFX9-NEXT: s_not_b64 exec, exec 557; GFX9-NEXT: v_mov_b32_e32 v2, 0 558; GFX9-NEXT: s_not_b64 exec, exec 559; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 560; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 561; GFX9-NEXT: s_nop 1 562; GFX9-NEXT: 
v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 563; GFX9-NEXT: s_nop 1 564; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 565; GFX9-NEXT: s_nop 1 566; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 567; GFX9-NEXT: s_nop 1 568; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 569; GFX9-NEXT: s_nop 1 570; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 571; GFX9-NEXT: v_readlane_b32 s4, v2, 63 572; GFX9-NEXT: s_nop 0 573; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 574; GFX9-NEXT: s_mov_b64 exec, s[2:3] 575; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 576; GFX9-NEXT: ; implicit-def: $vgpr0 577; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 578; GFX9-NEXT: s_cbranch_execz .LBB2_2 579; GFX9-NEXT: ; %bb.1: 580; GFX9-NEXT: v_mov_b32_e32 v0, 0 581; GFX9-NEXT: v_mov_b32_e32 v3, s4 582; GFX9-NEXT: s_waitcnt lgkmcnt(0) 583; GFX9-NEXT: ds_add_rtn_u32 v0, v0, v3 584; GFX9-NEXT: s_waitcnt lgkmcnt(0) 585; GFX9-NEXT: .LBB2_2: 586; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 587; GFX9-NEXT: s_waitcnt lgkmcnt(0) 588; GFX9-NEXT: v_readfirstlane_b32 s2, v0 589; GFX9-NEXT: v_mov_b32_e32 v0, v1 590; GFX9-NEXT: v_add_u32_e32 v0, s2, v0 591; GFX9-NEXT: s_mov_b32 s3, 0xf000 592; GFX9-NEXT: s_mov_b32 s2, -1 593; GFX9-NEXT: s_nop 0 594; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 595; GFX9-NEXT: s_endpgm 596; 597; GFX1064-LABEL: add_i32_varying: 598; GFX1064: ; %bb.0: ; %entry 599; GFX1064-NEXT: v_mov_b32_e32 v1, v0 600; GFX1064-NEXT: s_not_b64 exec, exec 601; GFX1064-NEXT: v_mov_b32_e32 v1, 0 602; GFX1064-NEXT: s_not_b64 exec, exec 603; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 604; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 605; GFX1064-NEXT: v_mov_b32_e32 v3, 0 606; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 607; 
GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 608; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 609; GFX1064-NEXT: v_mov_b32_e32 v2, v1 610; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 611; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 612; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 613; GFX1064-NEXT: v_mov_b32_e32 v2, s4 614; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 615; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 616; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 617; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 618; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 619; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 620; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 621; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 622; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 623; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 624; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 625; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 626; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 627; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 628; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 629; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 630; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 631; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 632; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 633; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 634; GFX1064-NEXT: s_mov_b32 s2, -1 635; GFX1064-NEXT: ; implicit-def: $vgpr0 636; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 637; GFX1064-NEXT: s_cbranch_execz .LBB2_2 638; GFX1064-NEXT: ; %bb.1: 639; GFX1064-NEXT: v_mov_b32_e32 v0, 0 640; GFX1064-NEXT: v_mov_b32_e32 v4, s7 641; GFX1064-NEXT: s_mov_b32 s3, s7 642; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 643; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 644; GFX1064-NEXT: ds_add_rtn_u32 v0, v0, v4 645; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 646; GFX1064-NEXT: buffer_gl0_inv 
647; GFX1064-NEXT: .LBB2_2: 648; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 649; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 650; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 651; GFX1064-NEXT: v_mov_b32_e32 v0, v3 652; GFX1064-NEXT: v_add_nc_u32_e32 v0, s3, v0 653; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 654; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 655; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 656; GFX1064-NEXT: s_endpgm 657; 658; GFX1032-LABEL: add_i32_varying: 659; GFX1032: ; %bb.0: ; %entry 660; GFX1032-NEXT: v_mov_b32_e32 v1, v0 661; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 662; GFX1032-NEXT: v_mov_b32_e32 v1, 0 663; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 664; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 665; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 666; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 667; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 668; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 669; GFX1032-NEXT: v_mov_b32_e32 v2, v1 670; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 671; GFX1032-NEXT: s_mov_b32 exec_lo, s2 672; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 673; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 674; GFX1032-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 675; GFX1032-NEXT: v_mov_b32_e32 v3, 0 676; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 677; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 678; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 679; GFX1032-NEXT: s_mov_b32 exec_lo, s2 680; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 681; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 682; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 683; GFX1032-NEXT: s_mov_b32 exec_lo, s2 684; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 685; GFX1032-NEXT: s_mov_b32 s2, -1 686; GFX1032-NEXT: ; implicit-def: $vgpr0 687; 
GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 688; GFX1032-NEXT: s_cbranch_execz .LBB2_2 689; GFX1032-NEXT: ; %bb.1: 690; GFX1032-NEXT: v_mov_b32_e32 v0, 0 691; GFX1032-NEXT: v_mov_b32_e32 v4, s4 692; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 693; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 694; GFX1032-NEXT: ds_add_rtn_u32 v0, v0, v4 695; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 696; GFX1032-NEXT: buffer_gl0_inv 697; GFX1032-NEXT: .LBB2_2: 698; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 699; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 700; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 701; GFX1032-NEXT: v_mov_b32_e32 v0, v3 702; GFX1032-NEXT: v_add_nc_u32_e32 v0, s3, v0 703; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 704; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 705; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 706; GFX1032-NEXT: s_endpgm 707; 708; GFX1164-LABEL: add_i32_varying: 709; GFX1164: ; %bb.0: ; %entry 710; GFX1164-NEXT: v_mov_b32_e32 v1, v0 711; GFX1164-NEXT: s_not_b64 exec, exec 712; GFX1164-NEXT: v_mov_b32_e32 v1, 0 713; GFX1164-NEXT: s_not_b64 exec, exec 714; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 715; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) 716; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 717; GFX1164-NEXT: v_mov_b32_e32 v3, 0 718; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 719; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 720; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 721; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 722; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 723; GFX1164-NEXT: v_mov_b32_e32 v2, v1 724; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1 725; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 726; 
GFX1164-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 727; GFX1164-NEXT: v_readlane_b32 s4, v1, 31 728; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 729; GFX1164-NEXT: v_mov_b32_e32 v2, s4 730; GFX1164-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 731; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) 732; GFX1164-NEXT: v_readlane_b32 s4, v1, 15 733; GFX1164-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 734; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 735; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 736; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 737; GFX1164-NEXT: v_readlane_b32 s5, v1, 31 738; GFX1164-NEXT: v_writelane_b32 v3, s4, 16 739; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 740; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 741; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 742; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 743; GFX1164-NEXT: v_readlane_b32 s7, v1, 63 744; GFX1164-NEXT: v_readlane_b32 s6, v1, 47 745; GFX1164-NEXT: v_writelane_b32 v3, s5, 32 746; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 747; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) 748; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 749; GFX1164-NEXT: s_or_saveexec_b64 s[4:5], -1 750; GFX1164-NEXT: v_writelane_b32 v3, s6, 48 751; GFX1164-NEXT: s_mov_b64 exec, s[4:5] 752; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 753; GFX1164-NEXT: s_mov_b32 s2, -1 754; GFX1164-NEXT: ; implicit-def: $vgpr0 755; GFX1164-NEXT: s_and_saveexec_b64 s[4:5], vcc 756; GFX1164-NEXT: s_cbranch_execz .LBB2_2 757; GFX1164-NEXT: ; %bb.1: 758; GFX1164-NEXT: v_mov_b32_e32 v0, 0 759; GFX1164-NEXT: v_mov_b32_e32 v4, s7 760; GFX1164-NEXT: s_mov_b32 s3, s7 761; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 762; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 763; GFX1164-NEXT: ds_add_rtn_u32 v0, v0, v4 764; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 765; GFX1164-NEXT: buffer_gl0_inv 766; 
GFX1164-NEXT: .LBB2_2: 767; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] 768; GFX1164-NEXT: v_readfirstlane_b32 s3, v0 769; GFX1164-NEXT: v_mov_b32_e32 v0, v3 770; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) 771; GFX1164-NEXT: v_add_nc_u32_e32 v0, s3, v0 772; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 773; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 774; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 775; GFX1164-NEXT: s_endpgm 776; 777; GFX1132-LABEL: add_i32_varying: 778; GFX1132: ; %bb.0: ; %entry 779; GFX1132-NEXT: v_mov_b32_e32 v1, v0 780; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo 781; GFX1132-NEXT: v_mov_b32_e32 v1, 0 782; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo 783; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 784; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 785; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 786; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 787; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 788; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 789; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 790; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 791; GFX1132-NEXT: v_mov_b32_e32 v2, v1 792; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1 793; GFX1132-NEXT: s_mov_b32 exec_lo, s2 794; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 795; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 796; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) 797; GFX1132-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 798; GFX1132-NEXT: v_mov_b32_e32 v3, 0 799; GFX1132-NEXT: v_readlane_b32 s3, v1, 15 800; GFX1132-NEXT: v_readlane_b32 s4, v1, 31 801; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | 
instid1(SALU_CYCLE_1) 802; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 803; GFX1132-NEXT: s_mov_b32 exec_lo, s2 804; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 805; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 806; GFX1132-NEXT: v_writelane_b32 v3, s3, 16 807; GFX1132-NEXT: s_mov_b32 exec_lo, s2 808; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) 809; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 810; GFX1132-NEXT: s_mov_b32 s2, -1 811; GFX1132-NEXT: ; implicit-def: $vgpr0 812; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo 813; GFX1132-NEXT: s_cbranch_execz .LBB2_2 814; GFX1132-NEXT: ; %bb.1: 815; GFX1132-NEXT: v_mov_b32_e32 v0, 0 816; GFX1132-NEXT: v_mov_b32_e32 v4, s4 817; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 818; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 819; GFX1132-NEXT: ds_add_rtn_u32 v0, v0, v4 820; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 821; GFX1132-NEXT: buffer_gl0_inv 822; GFX1132-NEXT: .LBB2_2: 823; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 824; GFX1132-NEXT: v_readfirstlane_b32 s3, v0 825; GFX1132-NEXT: v_mov_b32_e32 v0, v3 826; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) 827; GFX1132-NEXT: v_add_nc_u32_e32 v0, s3, v0 828; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 829; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 830; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 831; GFX1132-NEXT: s_endpgm 832entry: 833 %lane = call i32 @llvm.amdgcn.workitem.id.x() 834 %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %lane acq_rel 835 store i32 %old, i32 addrspace(1)* %out 836 ret void 837} 838 839define amdgpu_kernel void @add_i32_varying_nouse() { 840; GFX7LESS-LABEL: add_i32_varying_nouse: 841; GFX7LESS: ; %bb.0: ; %entry 842; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 843; GFX7LESS-NEXT: s_mov_b32 m0, -1 844; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 845; GFX7LESS-NEXT: ds_add_u32 v1, v0 846; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 847; GFX7LESS-NEXT: s_endpgm 848; 849; GFX8-LABEL: add_i32_varying_nouse: 850; GFX8: ; %bb.0: ; %entry 851; GFX8-NEXT: 
v_mbcnt_lo_u32_b32 v2, exec_lo, 0 852; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 853; GFX8-NEXT: v_mov_b32_e32 v1, v0 854; GFX8-NEXT: s_not_b64 exec, exec 855; GFX8-NEXT: v_mov_b32_e32 v1, 0 856; GFX8-NEXT: s_not_b64 exec, exec 857; GFX8-NEXT: s_or_saveexec_b64 s[0:1], -1 858; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 859; GFX8-NEXT: s_nop 1 860; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 861; GFX8-NEXT: s_nop 1 862; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 863; GFX8-NEXT: s_nop 1 864; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 865; GFX8-NEXT: s_nop 1 866; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf 867; GFX8-NEXT: s_nop 1 868; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf 869; GFX8-NEXT: v_readlane_b32 s2, v1, 63 870; GFX8-NEXT: s_mov_b64 exec, s[0:1] 871; GFX8-NEXT: s_mov_b32 s0, s2 872; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 873; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 874; GFX8-NEXT: s_cbranch_execz .LBB3_2 875; GFX8-NEXT: ; %bb.1: 876; GFX8-NEXT: v_mov_b32_e32 v0, 0 877; GFX8-NEXT: v_mov_b32_e32 v2, s0 878; GFX8-NEXT: s_mov_b32 m0, -1 879; GFX8-NEXT: s_waitcnt lgkmcnt(0) 880; GFX8-NEXT: ds_add_u32 v0, v2 881; GFX8-NEXT: s_waitcnt lgkmcnt(0) 882; GFX8-NEXT: .LBB3_2: 883; GFX8-NEXT: s_endpgm 884; 885; GFX9-LABEL: add_i32_varying_nouse: 886; GFX9: ; %bb.0: ; %entry 887; GFX9-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 888; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 889; GFX9-NEXT: v_mov_b32_e32 v1, v0 890; GFX9-NEXT: s_not_b64 exec, exec 891; GFX9-NEXT: v_mov_b32_e32 v1, 0 892; GFX9-NEXT: s_not_b64 exec, exec 893; GFX9-NEXT: s_or_saveexec_b64 s[0:1], -1 894; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 895; GFX9-NEXT: s_nop 1 896; GFX9-NEXT: v_add_u32_dpp 
v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 897; GFX9-NEXT: s_nop 1 898; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 899; GFX9-NEXT: s_nop 1 900; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 901; GFX9-NEXT: s_nop 1 902; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf 903; GFX9-NEXT: s_nop 1 904; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf 905; GFX9-NEXT: v_readlane_b32 s2, v1, 63 906; GFX9-NEXT: s_mov_b64 exec, s[0:1] 907; GFX9-NEXT: s_mov_b32 s0, s2 908; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 909; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 910; GFX9-NEXT: s_cbranch_execz .LBB3_2 911; GFX9-NEXT: ; %bb.1: 912; GFX9-NEXT: v_mov_b32_e32 v0, 0 913; GFX9-NEXT: v_mov_b32_e32 v2, s0 914; GFX9-NEXT: s_waitcnt lgkmcnt(0) 915; GFX9-NEXT: ds_add_u32 v0, v2 916; GFX9-NEXT: s_waitcnt lgkmcnt(0) 917; GFX9-NEXT: .LBB3_2: 918; GFX9-NEXT: s_endpgm 919; 920; GFX1064-LABEL: add_i32_varying_nouse: 921; GFX1064: ; %bb.0: ; %entry 922; GFX1064-NEXT: v_mov_b32_e32 v1, v0 923; GFX1064-NEXT: s_not_b64 exec, exec 924; GFX1064-NEXT: v_mov_b32_e32 v1, 0 925; GFX1064-NEXT: s_not_b64 exec, exec 926; GFX1064-NEXT: s_or_saveexec_b64 s[0:1], -1 927; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 928; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 929; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 930; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 931; GFX1064-NEXT: v_mov_b32_e32 v2, v1 932; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 933; GFX1064-NEXT: v_add_nc_u32_e32 v1, v1, v2 934; GFX1064-NEXT: s_mov_b64 exec, s[0:1] 935; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 936; GFX1064-NEXT: s_or_saveexec_b64 s[0:1], -1 937; GFX1064-NEXT: 
v_readlane_b32 s2, v1, 0 938; GFX1064-NEXT: v_readlane_b32 s3, v1, 32 939; GFX1064-NEXT: s_mov_b64 exec, s[0:1] 940; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 941; GFX1064-NEXT: s_add_i32 s0, s2, s3 942; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 943; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc 944; GFX1064-NEXT: s_cbranch_execz .LBB3_2 945; GFX1064-NEXT: ; %bb.1: 946; GFX1064-NEXT: v_mov_b32_e32 v0, 0 947; GFX1064-NEXT: v_mov_b32_e32 v3, s0 948; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 949; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 950; GFX1064-NEXT: ds_add_u32 v0, v3 951; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 952; GFX1064-NEXT: buffer_gl0_inv 953; GFX1064-NEXT: .LBB3_2: 954; GFX1064-NEXT: s_endpgm 955; 956; GFX1032-LABEL: add_i32_varying_nouse: 957; GFX1032: ; %bb.0: ; %entry 958; GFX1032-NEXT: v_mov_b32_e32 v1, v0 959; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 960; GFX1032-NEXT: v_mov_b32_e32 v1, 0 961; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 962; GFX1032-NEXT: s_or_saveexec_b32 s0, -1 963; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 964; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 965; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 966; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 967; GFX1032-NEXT: v_mov_b32_e32 v2, v1 968; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 969; GFX1032-NEXT: v_add_nc_u32_e32 v1, v1, v2 970; GFX1032-NEXT: s_mov_b32 exec_lo, s0 971; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 972; GFX1032-NEXT: v_mov_b32_e32 v0, v1 973; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 974; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo 975; GFX1032-NEXT: s_cbranch_execz .LBB3_2 976; GFX1032-NEXT: ; %bb.1: 977; GFX1032-NEXT: v_mov_b32_e32 v3, 0 978; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 979; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 980; GFX1032-NEXT: 
ds_add_u32 v3, v0 981; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 982; GFX1032-NEXT: buffer_gl0_inv 983; GFX1032-NEXT: .LBB3_2: 984; GFX1032-NEXT: s_endpgm 985; 986; GFX1164-LABEL: add_i32_varying_nouse: 987; GFX1164: ; %bb.0: ; %entry 988; GFX1164-NEXT: v_mov_b32_e32 v1, v0 989; GFX1164-NEXT: s_not_b64 exec, exec 990; GFX1164-NEXT: v_mov_b32_e32 v1, 0 991; GFX1164-NEXT: s_not_b64 exec, exec 992; GFX1164-NEXT: s_or_saveexec_b64 s[0:1], -1 993; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 994; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 995; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 996; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 997; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 998; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 999; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1000; GFX1164-NEXT: v_mov_b32_e32 v2, v1 1001; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1 1002; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1003; GFX1164-NEXT: v_add_nc_u32_e32 v1, v1, v2 1004; GFX1164-NEXT: v_permlane64_b32 v2, v1 1005; GFX1164-NEXT: s_mov_b64 exec, s[0:1] 1006; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) 1007; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 1008; GFX1164-NEXT: s_or_saveexec_b64 s[0:1], -1 1009; GFX1164-NEXT: v_add_nc_u32_e32 v1, v1, v2 1010; GFX1164-NEXT: s_mov_b64 exec, s[0:1] 1011; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) 1012; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v0 1013; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) 1014; GFX1164-NEXT: v_mov_b32_e32 v0, v1 1015; GFX1164-NEXT: 
s_mov_b64 s[0:1], exec 1016; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v3 1017; GFX1164-NEXT: s_cbranch_execz .LBB3_2 1018; GFX1164-NEXT: ; %bb.1: 1019; GFX1164-NEXT: v_mov_b32_e32 v3, 0 1020; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1021; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 1022; GFX1164-NEXT: ds_add_u32 v3, v0 1023; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 1024; GFX1164-NEXT: buffer_gl0_inv 1025; GFX1164-NEXT: .LBB3_2: 1026; GFX1164-NEXT: s_endpgm 1027; 1028; GFX1132-LABEL: add_i32_varying_nouse: 1029; GFX1132: ; %bb.0: ; %entry 1030; GFX1132-NEXT: v_mov_b32_e32 v1, v0 1031; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo 1032; GFX1132-NEXT: v_mov_b32_e32 v1, 0 1033; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo 1034; GFX1132-NEXT: s_or_saveexec_b32 s0, -1 1035; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1036; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 1037; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 1038; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1039; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 1040; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 1041; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1042; GFX1132-NEXT: v_mov_b32_e32 v2, v1 1043; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1 1044; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) 1045; GFX1132-NEXT: v_add_nc_u32_e32 v1, v1, v2 1046; GFX1132-NEXT: s_mov_b32 exec_lo, s0 1047; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 1048; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) 1049; GFX1132-NEXT: v_mov_b32_e32 v0, v1 1050; GFX1132-NEXT: s_mov_b32 s0, exec_lo 1051; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v3 1052; GFX1132-NEXT: 
s_cbranch_execz .LBB3_2 1053; GFX1132-NEXT: ; %bb.1: 1054; GFX1132-NEXT: v_mov_b32_e32 v3, 0 1055; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1056; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 1057; GFX1132-NEXT: ds_add_u32 v3, v0 1058; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 1059; GFX1132-NEXT: buffer_gl0_inv 1060; GFX1132-NEXT: .LBB3_2: 1061; GFX1132-NEXT: s_endpgm 1062entry: 1063 %lane = call i32 @llvm.amdgcn.workitem.id.x() 1064 %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %lane acq_rel 1065 ret void 1066} 1067 1068define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) { 1069; 1070; 1071; GFX7LESS-LABEL: add_i64_constant: 1072; GFX7LESS: ; %bb.0: ; %entry 1073; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec 1074; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1075; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 1076; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s5, v0 1077; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 1078; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 1079; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc 1080; GFX7LESS-NEXT: s_cbranch_execz .LBB4_2 1081; GFX7LESS-NEXT: ; %bb.1: 1082; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 1083; GFX7LESS-NEXT: s_mul_i32 s4, s4, 5 1084; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 1085; GFX7LESS-NEXT: v_mov_b32_e32 v0, s4 1086; GFX7LESS-NEXT: s_mov_b32 m0, -1 1087; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1088; GFX7LESS-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] 1089; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1090; GFX7LESS-NEXT: .LBB4_2: 1091; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] 1092; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1093; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v0 1094; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1 1095; GFX7LESS-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 1096; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v2 1097; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 1098; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4 1099; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s2, v0 1100; GFX7LESS-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc 1101; GFX7LESS-NEXT: 
s_mov_b32 s2, -1 1102; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1103; GFX7LESS-NEXT: s_endpgm 1104; 1105; GFX8-LABEL: add_i64_constant: 1106; GFX8: ; %bb.0: ; %entry 1107; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1108; GFX8-NEXT: s_mov_b64 s[4:5], exec 1109; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 1110; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 1111; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 1112; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 1113; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 1114; GFX8-NEXT: s_cbranch_execz .LBB4_2 1115; GFX8-NEXT: ; %bb.1: 1116; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 1117; GFX8-NEXT: s_mul_i32 s4, s4, 5 1118; GFX8-NEXT: v_mov_b32_e32 v0, s4 1119; GFX8-NEXT: v_mov_b32_e32 v1, 0 1120; GFX8-NEXT: s_mov_b32 m0, -1 1121; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1122; GFX8-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] 1123; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1124; GFX8-NEXT: .LBB4_2: 1125; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 1126; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1127; GFX8-NEXT: v_readfirstlane_b32 s2, v0 1128; GFX8-NEXT: v_readfirstlane_b32 s3, v1 1129; GFX8-NEXT: v_mov_b32_e32 v0, s2 1130; GFX8-NEXT: v_mov_b32_e32 v1, s3 1131; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v2, 5, v[0:1] 1132; GFX8-NEXT: s_mov_b32 s3, 0xf000 1133; GFX8-NEXT: s_mov_b32 s2, -1 1134; GFX8-NEXT: s_nop 2 1135; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1136; GFX8-NEXT: s_endpgm 1137; 1138; GFX9-LABEL: add_i64_constant: 1139; GFX9: ; %bb.0: ; %entry 1140; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1141; GFX9-NEXT: s_mov_b64 s[4:5], exec 1142; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 1143; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 1144; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 1145; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 1146; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 1147; GFX9-NEXT: s_cbranch_execz .LBB4_2 1148; GFX9-NEXT: ; %bb.1: 1149; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 1150; GFX9-NEXT: s_mul_i32 s4, s4, 5 1151; GFX9-NEXT: v_mov_b32_e32 v0, s4 1152; 
GFX9-NEXT: v_mov_b32_e32 v1, 0 1153; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1154; GFX9-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] 1155; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1156; GFX9-NEXT: .LBB4_2: 1157; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 1158; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1159; GFX9-NEXT: v_readfirstlane_b32 s2, v0 1160; GFX9-NEXT: v_readfirstlane_b32 s3, v1 1161; GFX9-NEXT: v_mov_b32_e32 v0, s2 1162; GFX9-NEXT: v_mov_b32_e32 v1, s3 1163; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v2, 5, v[0:1] 1164; GFX9-NEXT: s_mov_b32 s3, 0xf000 1165; GFX9-NEXT: s_mov_b32 s2, -1 1166; GFX9-NEXT: s_nop 2 1167; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1168; GFX9-NEXT: s_endpgm 1169; 1170; GFX1064-LABEL: add_i64_constant: 1171; GFX1064: ; %bb.0: ; %entry 1172; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1173; GFX1064-NEXT: s_mov_b64 s[4:5], exec 1174; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 1175; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 1176; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 1177; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 1178; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc 1179; GFX1064-NEXT: s_cbranch_execz .LBB4_2 1180; GFX1064-NEXT: ; %bb.1: 1181; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 1182; GFX1064-NEXT: v_mov_b32_e32 v1, 0 1183; GFX1064-NEXT: s_mul_i32 s4, s4, 5 1184; GFX1064-NEXT: v_mov_b32_e32 v0, s4 1185; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1186; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 1187; GFX1064-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] 1188; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1189; GFX1064-NEXT: buffer_gl0_inv 1190; GFX1064-NEXT: .LBB4_2: 1191; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 1192; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] 1193; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 1194; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 1195; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v2, 5, s[2:3] 1196; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 1197; GFX1064-NEXT: s_mov_b32 s2, -1 1198; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1199; GFX1064-NEXT: 
buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1200; GFX1064-NEXT: s_endpgm 1201; 1202; GFX1032-LABEL: add_i64_constant: 1203; GFX1032: ; %bb.0: ; %entry 1204; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1205; GFX1032-NEXT: s_mov_b32 s3, exec_lo 1206; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 1207; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s3, 0 1208; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 1209; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo 1210; GFX1032-NEXT: s_cbranch_execz .LBB4_2 1211; GFX1032-NEXT: ; %bb.1: 1212; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 1213; GFX1032-NEXT: v_mov_b32_e32 v1, 0 1214; GFX1032-NEXT: s_mul_i32 s3, s3, 5 1215; GFX1032-NEXT: v_mov_b32_e32 v0, s3 1216; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1217; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 1218; GFX1032-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] 1219; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1220; GFX1032-NEXT: buffer_gl0_inv 1221; GFX1032-NEXT: .LBB4_2: 1222; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 1223; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 1224; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 1225; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 1226; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s2, v2, 5, s[2:3] 1227; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 1228; GFX1032-NEXT: s_mov_b32 s2, -1 1229; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1230; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1231; GFX1032-NEXT: s_endpgm 1232; 1233; GFX1164-LABEL: add_i64_constant: 1234; GFX1164: ; %bb.0: ; %entry 1235; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 1236; GFX1164-NEXT: s_mov_b64 s[4:5], exec 1237; GFX1164-NEXT: s_mov_b64 s[2:3], exec 1238; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 1239; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1240; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 1241; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 1242; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 1243; GFX1164-NEXT: s_cbranch_execz .LBB4_2 1244; GFX1164-NEXT: ; %bb.1: 1245; GFX1164-NEXT: 
s_bcnt1_i32_b64 s4, s[4:5] 1246; GFX1164-NEXT: v_mov_b32_e32 v1, 0 1247; GFX1164-NEXT: s_mul_i32 s4, s4, 5 1248; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 1249; GFX1164-NEXT: v_mov_b32_e32 v0, s4 1250; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1251; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 1252; GFX1164-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] 1253; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 1254; GFX1164-NEXT: buffer_gl0_inv 1255; GFX1164-NEXT: .LBB4_2: 1256; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] 1257; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 1258; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 1259; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) 1260; GFX1164-NEXT: v_mad_u64_u32 v[0:1], null, v2, 5, s[2:3] 1261; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 1262; GFX1164-NEXT: s_mov_b32 s2, -1 1263; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 1264; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 1265; GFX1164-NEXT: s_endpgm 1266; 1267; GFX1132-LABEL: add_i64_constant: 1268; GFX1132: ; %bb.0: ; %entry 1269; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 1270; GFX1132-NEXT: s_mov_b32 s3, exec_lo 1271; GFX1132-NEXT: s_mov_b32 s2, exec_lo 1272; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s3, 0 1273; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 1274; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) 1275; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 1276; GFX1132-NEXT: s_cbranch_execz .LBB4_2 1277; GFX1132-NEXT: ; %bb.1: 1278; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s3 1279; GFX1132-NEXT: v_mov_b32_e32 v1, 0 1280; GFX1132-NEXT: s_mul_i32 s3, s3, 5 1281; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 1282; GFX1132-NEXT: v_mov_b32_e32 v0, s3 1283; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1284; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 1285; GFX1132-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] 1286; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 1287; GFX1132-NEXT: buffer_gl0_inv 1288; GFX1132-NEXT: .LBB4_2: 1289; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2 1290; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 1291; GFX1132-NEXT: 
v_readfirstlane_b32 s3, v1 1292; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) 1293; GFX1132-NEXT: v_mad_u64_u32 v[0:1], null, v2, 5, s[2:3] 1294; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 1295; GFX1132-NEXT: s_mov_b32 s2, -1 1296; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 1297; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 1298; GFX1132-NEXT: s_endpgm 1299entry: 1300 %old = atomicrmw add i64 addrspace(3)* @local_var64, i64 5 acq_rel 1301 store i64 %old, i64 addrspace(1)* %out 1302 ret void 1303} 1304 1305define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 %additive) { 1306; 1307; 1308; GFX7LESS-LABEL: add_i64_uniform: 1309; GFX7LESS: ; %bb.0: ; %entry 1310; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec 1311; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 1312; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 1313; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s7, v0 1314; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 1315; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 1316; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc 1317; GFX7LESS-NEXT: s_cbranch_execz .LBB5_2 1318; GFX7LESS-NEXT: ; %bb.1: 1319; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 1320; GFX7LESS-NEXT: v_mov_b32_e32 v3, 0 1321; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1322; GFX7LESS-NEXT: s_mul_i32 s7, s3, s6 1323; GFX7LESS-NEXT: v_mov_b32_e32 v0, s6 1324; GFX7LESS-NEXT: v_mul_hi_u32 v0, s2, v0 1325; GFX7LESS-NEXT: s_mul_i32 s6, s2, s6 1326; GFX7LESS-NEXT: v_add_i32_e32 v1, vcc, s7, v0 1327; GFX7LESS-NEXT: v_mov_b32_e32 v0, s6 1328; GFX7LESS-NEXT: s_mov_b32 m0, -1 1329; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1330; GFX7LESS-NEXT: ds_add_rtn_u64 v[0:1], v3, v[0:1] 1331; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1332; GFX7LESS-NEXT: .LBB5_2: 1333; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] 1334; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 1335; GFX7LESS-NEXT: s_mov_b32 s6, -1 1336; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1337; GFX7LESS-NEXT: s_mov_b32 s4, s0 1338; GFX7LESS-NEXT: s_mov_b32 s5, s1 1339; GFX7LESS-NEXT: 
v_readfirstlane_b32 s0, v0 1340; GFX7LESS-NEXT: v_readfirstlane_b32 s1, v1 1341; GFX7LESS-NEXT: v_mul_lo_u32 v0, s3, v2 1342; GFX7LESS-NEXT: v_mul_hi_u32 v1, s2, v2 1343; GFX7LESS-NEXT: v_mul_lo_u32 v2, s2, v2 1344; GFX7LESS-NEXT: v_add_i32_e32 v1, vcc, v1, v0 1345; GFX7LESS-NEXT: v_mov_b32_e32 v3, s1 1346; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s0, v2 1347; GFX7LESS-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc 1348; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1349; GFX7LESS-NEXT: s_endpgm 1350; 1351; GFX8-LABEL: add_i64_uniform: 1352; GFX8: ; %bb.0: ; %entry 1353; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1354; GFX8-NEXT: s_mov_b64 s[6:7], exec 1355; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 1356; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 1357; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 1358; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 1359; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 1360; GFX8-NEXT: s_cbranch_execz .LBB5_2 1361; GFX8-NEXT: ; %bb.1: 1362; GFX8-NEXT: s_bcnt1_i32_b64 s8, s[6:7] 1363; GFX8-NEXT: v_mov_b32_e32 v0, s8 1364; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1365; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[6:7], s2, v0, 0 1366; GFX8-NEXT: s_mul_i32 s6, s3, s8 1367; GFX8-NEXT: v_mov_b32_e32 v3, 0 1368; GFX8-NEXT: v_add_u32_e32 v1, vcc, s6, v1 1369; GFX8-NEXT: s_mov_b32 m0, -1 1370; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1371; GFX8-NEXT: ds_add_rtn_u64 v[0:1], v3, v[0:1] 1372; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1373; GFX8-NEXT: .LBB5_2: 1374; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 1375; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1376; GFX8-NEXT: v_readfirstlane_b32 s4, v0 1377; GFX8-NEXT: v_readfirstlane_b32 s5, v1 1378; GFX8-NEXT: v_mov_b32_e32 v0, s4 1379; GFX8-NEXT: v_mov_b32_e32 v1, s5 1380; GFX8-NEXT: v_mul_lo_u32 v3, s3, v2 1381; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[2:3], s2, v2, v[0:1] 1382; GFX8-NEXT: s_mov_b32 s7, 0xf000 1383; GFX8-NEXT: s_mov_b32 s6, -1 1384; GFX8-NEXT: s_mov_b32 s4, s0 1385; GFX8-NEXT: s_mov_b32 s5, s1 1386; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 1387; 
GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1388; GFX8-NEXT: s_endpgm 1389; 1390; GFX9-LABEL: add_i64_uniform: 1391; GFX9: ; %bb.0: ; %entry 1392; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1393; GFX9-NEXT: s_mov_b64 s[6:7], exec 1394; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 1395; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 1396; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 1397; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 1398; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 1399; GFX9-NEXT: s_cbranch_execz .LBB5_2 1400; GFX9-NEXT: ; %bb.1: 1401; GFX9-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 1402; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1403; GFX9-NEXT: s_mul_i32 s7, s3, s6 1404; GFX9-NEXT: s_mul_hi_u32 s8, s2, s6 1405; GFX9-NEXT: s_add_i32 s8, s8, s7 1406; GFX9-NEXT: s_mul_i32 s6, s2, s6 1407; GFX9-NEXT: v_mov_b32_e32 v0, s6 1408; GFX9-NEXT: v_mov_b32_e32 v1, s8 1409; GFX9-NEXT: v_mov_b32_e32 v3, 0 1410; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1411; GFX9-NEXT: ds_add_rtn_u64 v[0:1], v3, v[0:1] 1412; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1413; GFX9-NEXT: .LBB5_2: 1414; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 1415; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1416; GFX9-NEXT: v_readfirstlane_b32 s4, v0 1417; GFX9-NEXT: v_readfirstlane_b32 s5, v1 1418; GFX9-NEXT: v_mov_b32_e32 v0, s4 1419; GFX9-NEXT: v_mov_b32_e32 v1, s5 1420; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s2, v2, v[0:1] 1421; GFX9-NEXT: s_mov_b32 s7, 0xf000 1422; GFX9-NEXT: s_mov_b32 s6, -1 1423; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[2:3], s3, v2, v[1:2] 1424; GFX9-NEXT: s_mov_b32 s4, s0 1425; GFX9-NEXT: s_mov_b32 s5, s1 1426; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1427; GFX9-NEXT: s_endpgm 1428; 1429; GFX1064-LABEL: add_i64_uniform: 1430; GFX1064: ; %bb.0: ; %entry 1431; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1432; GFX1064-NEXT: s_mov_b64 s[6:7], exec 1433; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 1434; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 1435; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 1436; GFX1064-NEXT: 
v_cmp_eq_u32_e32 vcc, 0, v2 1437; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 1438; GFX1064-NEXT: s_cbranch_execz .LBB5_2 1439; GFX1064-NEXT: ; %bb.1: 1440; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 1441; GFX1064-NEXT: v_mov_b32_e32 v3, 0 1442; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1443; GFX1064-NEXT: s_mul_i32 s7, s3, s6 1444; GFX1064-NEXT: s_mul_hi_u32 s8, s2, s6 1445; GFX1064-NEXT: s_mul_i32 s6, s2, s6 1446; GFX1064-NEXT: s_add_i32 s8, s8, s7 1447; GFX1064-NEXT: v_mov_b32_e32 v0, s6 1448; GFX1064-NEXT: v_mov_b32_e32 v1, s8 1449; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1450; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 1451; GFX1064-NEXT: ds_add_rtn_u64 v[0:1], v3, v[0:1] 1452; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1453; GFX1064-NEXT: buffer_gl0_inv 1454; GFX1064-NEXT: .LBB5_2: 1455; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 1456; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 1457; GFX1064-NEXT: v_readfirstlane_b32 s4, v0 1458; GFX1064-NEXT: v_readfirstlane_b32 s5, v1 1459; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1460; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s2, v2, s[4:5] 1461; GFX1064-NEXT: v_mad_u64_u32 v[1:2], s[2:3], s3, v2, v[1:2] 1462; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 1463; GFX1064-NEXT: s_mov_b32 s2, -1 1464; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1465; GFX1064-NEXT: s_endpgm 1466; 1467; GFX1032-LABEL: add_i64_uniform: 1468; GFX1032: ; %bb.0: ; %entry 1469; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1470; GFX1032-NEXT: s_mov_b32 s5, exec_lo 1471; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 1472; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0 1473; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 1474; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo 1475; GFX1032-NEXT: s_cbranch_execz .LBB5_2 1476; GFX1032-NEXT: ; %bb.1: 1477; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s5 1478; GFX1032-NEXT: v_mov_b32_e32 v3, 0 1479; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1480; GFX1032-NEXT: s_mul_i32 s6, s3, s5 1481; GFX1032-NEXT: s_mul_hi_u32 s7, s2, s5 1482; GFX1032-NEXT: 
s_mul_i32 s5, s2, s5 1483; GFX1032-NEXT: s_add_i32 s7, s7, s6 1484; GFX1032-NEXT: v_mov_b32_e32 v0, s5 1485; GFX1032-NEXT: v_mov_b32_e32 v1, s7 1486; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1487; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 1488; GFX1032-NEXT: ds_add_rtn_u64 v[0:1], v3, v[0:1] 1489; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1490; GFX1032-NEXT: buffer_gl0_inv 1491; GFX1032-NEXT: .LBB5_2: 1492; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 1493; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 1494; GFX1032-NEXT: v_readfirstlane_b32 s4, v0 1495; GFX1032-NEXT: v_readfirstlane_b32 s5, v1 1496; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1497; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s2, s2, v2, s[4:5] 1498; GFX1032-NEXT: v_mad_u64_u32 v[1:2], s2, s3, v2, v[1:2] 1499; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 1500; GFX1032-NEXT: s_mov_b32 s2, -1 1501; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1502; GFX1032-NEXT: s_endpgm 1503; 1504; GFX1164-LABEL: add_i64_uniform: 1505; GFX1164: ; %bb.0: ; %entry 1506; GFX1164-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 1507; GFX1164-NEXT: s_mov_b64 s[6:7], exec 1508; GFX1164-NEXT: s_mov_b64 s[4:5], exec 1509; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 1510; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1511; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 1512; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 1513; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 1514; GFX1164-NEXT: s_cbranch_execz .LBB5_2 1515; GFX1164-NEXT: ; %bb.1: 1516; GFX1164-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 1517; GFX1164-NEXT: v_mov_b32_e32 v3, 0 1518; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 1519; GFX1164-NEXT: s_mul_i32 s7, s3, s6 1520; GFX1164-NEXT: s_mul_hi_u32 s8, s2, s6 1521; GFX1164-NEXT: s_mul_i32 s6, s2, s6 1522; GFX1164-NEXT: s_add_i32 s8, s8, s7 1523; GFX1164-NEXT: v_mov_b32_e32 v0, s6 1524; GFX1164-NEXT: v_mov_b32_e32 v1, s8 1525; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1526; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 1527; GFX1164-NEXT: 
ds_add_rtn_u64 v[0:1], v3, v[0:1] 1528; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 1529; GFX1164-NEXT: buffer_gl0_inv 1530; GFX1164-NEXT: .LBB5_2: 1531; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] 1532; GFX1164-NEXT: v_readfirstlane_b32 s4, v0 1533; GFX1164-NEXT: v_readfirstlane_b32 s5, v1 1534; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 1535; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) 1536; GFX1164-NEXT: v_mad_u64_u32 v[0:1], null, s2, v2, s[4:5] 1537; GFX1164-NEXT: s_mov_b32 s2, -1 1538; GFX1164-NEXT: v_mad_u64_u32 v[3:4], null, s3, v2, v[1:2] 1539; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 1540; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) 1541; GFX1164-NEXT: v_mov_b32_e32 v1, v3 1542; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 1543; GFX1164-NEXT: s_endpgm 1544; 1545; GFX1132-LABEL: add_i64_uniform: 1546; GFX1132: ; %bb.0: ; %entry 1547; GFX1132-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 1548; GFX1132-NEXT: s_mov_b32 s5, exec_lo 1549; GFX1132-NEXT: s_mov_b32 s4, exec_lo 1550; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0 1551; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 1552; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) 1553; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 1554; GFX1132-NEXT: s_cbranch_execz .LBB5_2 1555; GFX1132-NEXT: ; %bb.1: 1556; GFX1132-NEXT: s_bcnt1_i32_b32 s5, s5 1557; GFX1132-NEXT: v_mov_b32_e32 v3, 0 1558; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 1559; GFX1132-NEXT: s_mul_i32 s6, s3, s5 1560; GFX1132-NEXT: s_mul_hi_u32 s7, s2, s5 1561; GFX1132-NEXT: s_mul_i32 s5, s2, s5 1562; GFX1132-NEXT: s_add_i32 s7, s7, s6 1563; GFX1132-NEXT: v_mov_b32_e32 v0, s5 1564; GFX1132-NEXT: v_mov_b32_e32 v1, s7 1565; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1566; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 1567; GFX1132-NEXT: ds_add_rtn_u64 v[0:1], v3, v[0:1] 1568; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 1569; GFX1132-NEXT: buffer_gl0_inv 1570; GFX1132-NEXT: .LBB5_2: 1571; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s4 1572; GFX1132-NEXT: 
; @add_i64_varying (its define appears later in the following line): acq_rel
; atomicrmw add on @local_var64 whose operand is the zero-extended workitem id,
; with the old value stored to %out.
; In the expected output for every prefix the atomic stays a plain per-lane
; ds_add_rtn_u64 on v[0:1]; there is no v_mbcnt / s_and_saveexec /
; v_readfirstlane sequence like the one checked for @add_i64_uniform.
; The assertions are autogenerated (see the NOTE at the top of this file), so
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
v_readfirstlane_b32 s4, v0 1573; GFX1132-NEXT: v_readfirstlane_b32 s5, v1 1574; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 1575; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) 1576; GFX1132-NEXT: v_mad_u64_u32 v[0:1], null, s2, v2, s[4:5] 1577; GFX1132-NEXT: s_mov_b32 s2, -1 1578; GFX1132-NEXT: v_mad_u64_u32 v[3:4], null, s3, v2, v[1:2] 1579; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 1580; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) 1581; GFX1132-NEXT: v_mov_b32_e32 v1, v3 1582; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 1583; GFX1132-NEXT: s_endpgm 1584entry: 1585 %old = atomicrmw add i64 addrspace(3)* @local_var64, i64 %additive acq_rel 1586 store i64 %old, i64 addrspace(1)* %out 1587 ret void 1588} 1589 1590define amdgpu_kernel void @add_i64_varying(i64 addrspace(1)* %out) { 1591; 1592; 1593; GFX7LESS-LABEL: add_i64_varying: 1594; GFX7LESS: ; %bb.0: ; %entry 1595; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1596; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 1597; GFX7LESS-NEXT: s_mov_b32 m0, -1 1598; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1599; GFX7LESS-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] 1600; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1601; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 1602; GFX7LESS-NEXT: s_mov_b32 s2, -1 1603; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1604; GFX7LESS-NEXT: s_endpgm 1605; 1606; GFX8-LABEL: add_i64_varying: 1607; GFX8: ; %bb.0: ; %entry 1608; GFX8-NEXT: v_mov_b32_e32 v1, 0 1609; GFX8-NEXT: s_mov_b32 m0, -1 1610; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1611; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1612; GFX8-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] 1613; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1614; GFX8-NEXT: s_mov_b32 s3, 0xf000 1615; GFX8-NEXT: s_mov_b32 s2, -1 1616; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1617; GFX8-NEXT: s_endpgm 1618; 1619; GFX9-LABEL: add_i64_varying: 1620; GFX9: ; %bb.0: ; %entry 1621; GFX9-NEXT: v_mov_b32_e32 v1, 0 1622; GFX9-NEXT: s_load_dwordx2 s[0:1], 
; @sub_i32_constant (its define appears later in the following line): acq_rel
; atomicrmw sub of the constant 5 from @local_var32, old value stored to %out.
; Every prefix expects the same shape: v_mbcnt on a copy of exec followed by a
; compare against 0 selects the lane(s) that run %bb.1, where s_bcnt1 of the
; exec copy is multiplied by 5 and issued as a single ds_sub_rtn_u32. After exec
; is restored, the stored value is the v_readfirstlane of the DS result minus
; v_mul_u32_u24(5, mbcnt result).
; The wave32 runs (GFX1032, GFX1132) use the 32-bit exec_lo forms and only
; v_mbcnt_lo. The assertions are autogenerated (see the NOTE at the top of this
; file); regenerate them with utils/update_llc_test_checks.py rather than
; editing by hand.
s[0:1], 0x24 1623; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1624; GFX9-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] 1625; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1626; GFX9-NEXT: s_mov_b32 s3, 0xf000 1627; GFX9-NEXT: s_mov_b32 s2, -1 1628; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1629; GFX9-NEXT: s_endpgm 1630; 1631; GFX10-LABEL: add_i64_varying: 1632; GFX10: ; %bb.0: ; %entry 1633; GFX10-NEXT: v_mov_b32_e32 v1, 0 1634; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1635; GFX10-NEXT: s_mov_b32 s3, 0x31016000 1636; GFX10-NEXT: s_mov_b32 s2, -1 1637; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1638; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1639; GFX10-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] 1640; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1641; GFX10-NEXT: buffer_gl0_inv 1642; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1643; GFX10-NEXT: s_endpgm 1644; 1645; GFX11-LABEL: add_i64_varying: 1646; GFX11: ; %bb.0: ; %entry 1647; GFX11-NEXT: v_mov_b32_e32 v1, 0 1648; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 1649; GFX11-NEXT: s_mov_b32 s3, 0x31016000 1650; GFX11-NEXT: s_mov_b32 s2, -1 1651; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1652; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 1653; GFX11-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] 1654; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1655; GFX11-NEXT: buffer_gl0_inv 1656; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 1657; GFX11-NEXT: s_endpgm 1658entry: 1659 %lane = call i32 @llvm.amdgcn.workitem.id.x() 1660 %zext = zext i32 %lane to i64 1661 %old = atomicrmw add i64 addrspace(3)* @local_var64, i64 %zext acq_rel 1662 store i64 %old, i64 addrspace(1)* %out 1663 ret void 1664} 1665 1666define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out) { 1667; 1668; 1669; GFX7LESS-LABEL: sub_i32_constant: 1670; GFX7LESS: ; %bb.0: ; %entry 1671; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec 1672; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1673; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 1674; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 
1675; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1676; GFX7LESS-NEXT: ; implicit-def: $vgpr1 1677; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc 1678; GFX7LESS-NEXT: s_cbranch_execz .LBB7_2 1679; GFX7LESS-NEXT: ; %bb.1: 1680; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1681; GFX7LESS-NEXT: s_mul_i32 s2, s2, 5 1682; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 1683; GFX7LESS-NEXT: v_mov_b32_e32 v2, s2 1684; GFX7LESS-NEXT: s_mov_b32 m0, -1 1685; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1686; GFX7LESS-NEXT: ds_sub_rtn_u32 v1, v1, v2 1687; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1688; GFX7LESS-NEXT: .LBB7_2: 1689; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] 1690; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1691; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v1 1692; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1693; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 1694; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 1695; GFX7LESS-NEXT: s_mov_b32 s2, -1 1696; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 1697; GFX7LESS-NEXT: s_endpgm 1698; 1699; GFX8-LABEL: sub_i32_constant: 1700; GFX8: ; %bb.0: ; %entry 1701; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1702; GFX8-NEXT: s_mov_b64 s[2:3], exec 1703; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1704; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 1705; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1706; GFX8-NEXT: ; implicit-def: $vgpr1 1707; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 1708; GFX8-NEXT: s_cbranch_execz .LBB7_2 1709; GFX8-NEXT: ; %bb.1: 1710; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1711; GFX8-NEXT: s_mul_i32 s2, s2, 5 1712; GFX8-NEXT: v_mov_b32_e32 v1, 0 1713; GFX8-NEXT: v_mov_b32_e32 v2, s2 1714; GFX8-NEXT: s_mov_b32 m0, -1 1715; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1716; GFX8-NEXT: ds_sub_rtn_u32 v1, v1, v2 1717; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1718; GFX8-NEXT: .LBB7_2: 1719; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 1720; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1721; GFX8-NEXT: v_readfirstlane_b32 s2, v1 1722; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1723; GFX8-NEXT: v_sub_u32_e32 v0, 
vcc, s2, v0 1724; GFX8-NEXT: s_mov_b32 s3, 0xf000 1725; GFX8-NEXT: s_mov_b32 s2, -1 1726; GFX8-NEXT: s_nop 0 1727; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 1728; GFX8-NEXT: s_endpgm 1729; 1730; GFX9-LABEL: sub_i32_constant: 1731; GFX9: ; %bb.0: ; %entry 1732; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1733; GFX9-NEXT: s_mov_b64 s[2:3], exec 1734; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1735; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 1736; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1737; GFX9-NEXT: ; implicit-def: $vgpr1 1738; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 1739; GFX9-NEXT: s_cbranch_execz .LBB7_2 1740; GFX9-NEXT: ; %bb.1: 1741; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1742; GFX9-NEXT: s_mul_i32 s2, s2, 5 1743; GFX9-NEXT: v_mov_b32_e32 v1, 0 1744; GFX9-NEXT: v_mov_b32_e32 v2, s2 1745; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1746; GFX9-NEXT: ds_sub_rtn_u32 v1, v1, v2 1747; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1748; GFX9-NEXT: .LBB7_2: 1749; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 1750; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1751; GFX9-NEXT: v_readfirstlane_b32 s2, v1 1752; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1753; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 1754; GFX9-NEXT: s_mov_b32 s3, 0xf000 1755; GFX9-NEXT: s_mov_b32 s2, -1 1756; GFX9-NEXT: s_nop 0 1757; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 1758; GFX9-NEXT: s_endpgm 1759; 1760; GFX1064-LABEL: sub_i32_constant: 1761; GFX1064: ; %bb.0: ; %entry 1762; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1763; GFX1064-NEXT: s_mov_b64 s[2:3], exec 1764; GFX1064-NEXT: ; implicit-def: $vgpr1 1765; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1766; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 1767; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1768; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 1769; GFX1064-NEXT: s_cbranch_execz .LBB7_2 1770; GFX1064-NEXT: ; %bb.1: 1771; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1772; GFX1064-NEXT: v_mov_b32_e32 v1, 0 1773; GFX1064-NEXT: s_mul_i32 s2, s2, 5 1774; GFX1064-NEXT: v_mov_b32_e32 v2, s2 
1775; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1776; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 1777; GFX1064-NEXT: ds_sub_rtn_u32 v1, v1, v2 1778; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1779; GFX1064-NEXT: buffer_gl0_inv 1780; GFX1064-NEXT: .LBB7_2: 1781; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 1782; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 1783; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 1784; GFX1064-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1785; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 1786; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s2, v0 1787; GFX1064-NEXT: s_mov_b32 s2, -1 1788; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1789; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 1790; GFX1064-NEXT: s_endpgm 1791; 1792; GFX1032-LABEL: sub_i32_constant: 1793; GFX1032: ; %bb.0: ; %entry 1794; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1795; GFX1032-NEXT: s_mov_b32 s3, exec_lo 1796; GFX1032-NEXT: ; implicit-def: $vgpr1 1797; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 1798; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 1799; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo 1800; GFX1032-NEXT: s_cbranch_execz .LBB7_2 1801; GFX1032-NEXT: ; %bb.1: 1802; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 1803; GFX1032-NEXT: v_mov_b32_e32 v1, 0 1804; GFX1032-NEXT: s_mul_i32 s3, s3, 5 1805; GFX1032-NEXT: v_mov_b32_e32 v2, s3 1806; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1807; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 1808; GFX1032-NEXT: ds_sub_rtn_u32 v1, v1, v2 1809; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1810; GFX1032-NEXT: buffer_gl0_inv 1811; GFX1032-NEXT: .LBB7_2: 1812; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 1813; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 1814; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 1815; GFX1032-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1816; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 1817; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s2, v0 1818; GFX1032-NEXT: s_mov_b32 s2, -1 1819; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1820; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 1821; GFX1032-NEXT: s_endpgm 1822; 
1823; GFX1164-LABEL: sub_i32_constant: 1824; GFX1164: ; %bb.0: ; %entry 1825; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 1826; GFX1164-NEXT: s_mov_b64 s[2:3], exec 1827; GFX1164-NEXT: s_mov_b64 s[4:5], exec 1828; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1829; GFX1164-NEXT: ; implicit-def: $vgpr1 1830; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1831; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 1832; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 1833; GFX1164-NEXT: s_cbranch_execz .LBB7_2 1834; GFX1164-NEXT: ; %bb.1: 1835; GFX1164-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1836; GFX1164-NEXT: v_mov_b32_e32 v1, 0 1837; GFX1164-NEXT: s_mul_i32 s2, s2, 5 1838; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 1839; GFX1164-NEXT: v_mov_b32_e32 v2, s2 1840; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1841; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 1842; GFX1164-NEXT: ds_sub_rtn_u32 v1, v1, v2 1843; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 1844; GFX1164-NEXT: buffer_gl0_inv 1845; GFX1164-NEXT: .LBB7_2: 1846; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] 1847; GFX1164-NEXT: v_readfirstlane_b32 s2, v1 1848; GFX1164-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1849; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 1850; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) 1851; GFX1164-NEXT: v_sub_nc_u32_e32 v0, s2, v0 1852; GFX1164-NEXT: s_mov_b32 s2, -1 1853; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 1854; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 1855; GFX1164-NEXT: s_endpgm 1856; 1857; GFX1132-LABEL: sub_i32_constant: 1858; GFX1132: ; %bb.0: ; %entry 1859; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 1860; GFX1132-NEXT: s_mov_b32 s3, exec_lo 1861; GFX1132-NEXT: s_mov_b32 s2, exec_lo 1862; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 1863; GFX1132-NEXT: ; implicit-def: $vgpr1 1864; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) 1865; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 1866; GFX1132-NEXT: s_cbranch_execz .LBB7_2 1867; GFX1132-NEXT: ; %bb.1: 1868; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s3 
; @sub_i32_uniform (its define appears later in the following line): acq_rel
; atomicrmw sub of the kernel argument %subitive from @local_var32, old value
; stored to %out.
; The expected code has the same shape as for @sub_i32_constant, except that
; the multiplier is the scalar loaded with s_load_dword / s_load_b32: s_bcnt1 of
; the exec copy is scaled by it with s_mul_i32 before the single
; ds_sub_rtn_u32, and the per-lane term is formed with v_mul_lo_u32 before the
; final subtract from the v_readfirstlane result.
; The assertions are autogenerated (see the NOTE at the top of this file);
; regenerate them with utils/update_llc_test_checks.py rather than editing by
; hand.
1869; GFX1132-NEXT: v_mov_b32_e32 v1, 0 1870; GFX1132-NEXT: s_mul_i32 s3, s3, 5 1871; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 1872; GFX1132-NEXT: v_mov_b32_e32 v2, s3 1873; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1874; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 1875; GFX1132-NEXT: ds_sub_rtn_u32 v1, v1, v2 1876; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 1877; GFX1132-NEXT: buffer_gl0_inv 1878; GFX1132-NEXT: .LBB7_2: 1879; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2 1880; GFX1132-NEXT: v_readfirstlane_b32 s2, v1 1881; GFX1132-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1882; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 1883; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) 1884; GFX1132-NEXT: v_sub_nc_u32_e32 v0, s2, v0 1885; GFX1132-NEXT: s_mov_b32 s2, -1 1886; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 1887; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 1888; GFX1132-NEXT: s_endpgm 1889entry: 1890 %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 5 acq_rel 1891 store i32 %old, i32 addrspace(1)* %out 1892 ret void 1893} 1894 1895define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 %subitive) { 1896; 1897; 1898; GFX7LESS-LABEL: sub_i32_uniform: 1899; GFX7LESS: ; %bb.0: ; %entry 1900; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec 1901; GFX7LESS-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 1902; GFX7LESS-NEXT: s_load_dword s6, s[0:1], 0xb 1903; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 1904; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 1905; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1906; GFX7LESS-NEXT: ; implicit-def: $vgpr1 1907; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc 1908; GFX7LESS-NEXT: s_cbranch_execz .LBB8_2 1909; GFX7LESS-NEXT: ; %bb.1: 1910; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1911; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1912; GFX7LESS-NEXT: s_mul_i32 s2, s6, s2 1913; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 1914; GFX7LESS-NEXT: v_mov_b32_e32 v2, s2 1915; GFX7LESS-NEXT: s_mov_b32 m0, -1 1916; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1917; GFX7LESS-NEXT: 
ds_sub_rtn_u32 v1, v1, v2 1918; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1919; GFX7LESS-NEXT: .LBB8_2: 1920; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1] 1921; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1922; GFX7LESS-NEXT: v_readfirstlane_b32 s0, v1 1923; GFX7LESS-NEXT: v_mul_lo_u32 v0, s6, v0 1924; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 1925; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 1926; GFX7LESS-NEXT: s_mov_b32 s6, -1 1927; GFX7LESS-NEXT: buffer_store_dword v0, off, s[4:7], 0 1928; GFX7LESS-NEXT: s_endpgm 1929; 1930; GFX8-LABEL: sub_i32_uniform: 1931; GFX8: ; %bb.0: ; %entry 1932; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 1933; GFX8-NEXT: s_load_dword s6, s[0:1], 0x2c 1934; GFX8-NEXT: s_mov_b64 s[2:3], exec 1935; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1936; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 1937; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1938; GFX8-NEXT: ; implicit-def: $vgpr1 1939; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc 1940; GFX8-NEXT: s_cbranch_execz .LBB8_2 1941; GFX8-NEXT: ; %bb.1: 1942; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1943; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1944; GFX8-NEXT: s_mul_i32 s2, s6, s2 1945; GFX8-NEXT: v_mov_b32_e32 v1, 0 1946; GFX8-NEXT: v_mov_b32_e32 v2, s2 1947; GFX8-NEXT: s_mov_b32 m0, -1 1948; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1949; GFX8-NEXT: ds_sub_rtn_u32 v1, v1, v2 1950; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1951; GFX8-NEXT: .LBB8_2: 1952; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] 1953; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1954; GFX8-NEXT: v_mul_lo_u32 v0, s6, v0 1955; GFX8-NEXT: v_readfirstlane_b32 s0, v1 1956; GFX8-NEXT: s_mov_b32 s7, 0xf000 1957; GFX8-NEXT: s_mov_b32 s6, -1 1958; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s0, v0 1959; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0 1960; GFX8-NEXT: s_endpgm 1961; 1962; GFX9-LABEL: sub_i32_uniform: 1963; GFX9: ; %bb.0: ; %entry 1964; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 1965; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c 1966; GFX9-NEXT: s_mov_b64 s[2:3], exec 1967; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, 
s2, 0 1968; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 1969; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1970; GFX9-NEXT: ; implicit-def: $vgpr1 1971; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc 1972; GFX9-NEXT: s_cbranch_execz .LBB8_2 1973; GFX9-NEXT: ; %bb.1: 1974; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1975; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1976; GFX9-NEXT: s_mul_i32 s2, s6, s2 1977; GFX9-NEXT: v_mov_b32_e32 v1, 0 1978; GFX9-NEXT: v_mov_b32_e32 v2, s2 1979; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1980; GFX9-NEXT: ds_sub_rtn_u32 v1, v1, v2 1981; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1982; GFX9-NEXT: .LBB8_2: 1983; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] 1984; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1985; GFX9-NEXT: v_mul_lo_u32 v0, s6, v0 1986; GFX9-NEXT: v_readfirstlane_b32 s0, v1 1987; GFX9-NEXT: s_mov_b32 s7, 0xf000 1988; GFX9-NEXT: s_mov_b32 s6, -1 1989; GFX9-NEXT: v_sub_u32_e32 v0, s0, v0 1990; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 1991; GFX9-NEXT: s_endpgm 1992; 1993; GFX1064-LABEL: sub_i32_uniform: 1994; GFX1064: ; %bb.0: ; %entry 1995; GFX1064-NEXT: s_clause 0x1 1996; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 1997; GFX1064-NEXT: s_load_dword s6, s[0:1], 0x2c 1998; GFX1064-NEXT: s_mov_b64 s[2:3], exec 1999; GFX1064-NEXT: ; implicit-def: $vgpr1 2000; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 2001; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 2002; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2003; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc 2004; GFX1064-NEXT: s_cbranch_execz .LBB8_2 2005; GFX1064-NEXT: ; %bb.1: 2006; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 2007; GFX1064-NEXT: v_mov_b32_e32 v1, 0 2008; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2009; GFX1064-NEXT: s_mul_i32 s2, s6, s2 2010; GFX1064-NEXT: v_mov_b32_e32 v2, s2 2011; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2012; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 2013; GFX1064-NEXT: ds_sub_rtn_u32 v1, v1, v2 2014; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2015; GFX1064-NEXT: buffer_gl0_inv 2016; GFX1064-NEXT: .LBB8_2: 2017; 
GFX1064-NEXT: s_waitcnt_depctr 0xffe3 2018; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] 2019; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2020; GFX1064-NEXT: v_mul_lo_u32 v0, s6, v0 2021; GFX1064-NEXT: v_readfirstlane_b32 s0, v1 2022; GFX1064-NEXT: s_mov_b32 s7, 0x31016000 2023; GFX1064-NEXT: s_mov_b32 s6, -1 2024; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s0, v0 2025; GFX1064-NEXT: buffer_store_dword v0, off, s[4:7], 0 2026; GFX1064-NEXT: s_endpgm 2027; 2028; GFX1032-LABEL: sub_i32_uniform: 2029; GFX1032: ; %bb.0: ; %entry 2030; GFX1032-NEXT: s_clause 0x1 2031; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 2032; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x2c 2033; GFX1032-NEXT: s_mov_b32 s3, exec_lo 2034; GFX1032-NEXT: ; implicit-def: $vgpr1 2035; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 2036; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 2037; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo 2038; GFX1032-NEXT: s_cbranch_execz .LBB8_2 2039; GFX1032-NEXT: ; %bb.1: 2040; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s3 2041; GFX1032-NEXT: v_mov_b32_e32 v1, 0 2042; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2043; GFX1032-NEXT: s_mul_i32 s1, s2, s1 2044; GFX1032-NEXT: v_mov_b32_e32 v2, s1 2045; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2046; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 2047; GFX1032-NEXT: ds_sub_rtn_u32 v1, v1, v2 2048; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2049; GFX1032-NEXT: buffer_gl0_inv 2050; GFX1032-NEXT: .LBB8_2: 2051; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 2052; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 2053; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2054; GFX1032-NEXT: v_mul_lo_u32 v0, s2, v0 2055; GFX1032-NEXT: v_readfirstlane_b32 s0, v1 2056; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 2057; GFX1032-NEXT: s_mov_b32 s6, -1 2058; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s0, v0 2059; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0 2060; GFX1032-NEXT: s_endpgm 2061; 2062; GFX1164-LABEL: sub_i32_uniform: 2063; GFX1164: ; %bb.0: ; %entry 2064; GFX1164-NEXT: s_clause 0x1 2065; GFX1164-NEXT: s_load_b64 
s[4:5], s[0:1], 0x24 2066; GFX1164-NEXT: s_load_b32 s6, s[0:1], 0x2c 2067; GFX1164-NEXT: s_mov_b64 s[2:3], exec 2068; GFX1164-NEXT: s_mov_b64 s[0:1], exec 2069; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 2070; GFX1164-NEXT: ; implicit-def: $vgpr1 2071; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2072; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 2073; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 2074; GFX1164-NEXT: s_cbranch_execz .LBB8_2 2075; GFX1164-NEXT: ; %bb.1: 2076; GFX1164-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 2077; GFX1164-NEXT: v_mov_b32_e32 v1, 0 2078; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 2079; GFX1164-NEXT: s_mul_i32 s2, s6, s2 2080; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 2081; GFX1164-NEXT: v_mov_b32_e32 v2, s2 2082; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2083; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 2084; GFX1164-NEXT: ds_sub_rtn_u32 v1, v1, v2 2085; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 2086; GFX1164-NEXT: buffer_gl0_inv 2087; GFX1164-NEXT: .LBB8_2: 2088; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] 2089; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 2090; GFX1164-NEXT: v_mul_lo_u32 v0, s6, v0 2091; GFX1164-NEXT: v_readfirstlane_b32 s0, v1 2092; GFX1164-NEXT: s_mov_b32 s7, 0x31016000 2093; GFX1164-NEXT: s_mov_b32 s6, -1 2094; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) 2095; GFX1164-NEXT: v_sub_nc_u32_e32 v0, s0, v0 2096; GFX1164-NEXT: buffer_store_b32 v0, off, s[4:7], 0 2097; GFX1164-NEXT: s_endpgm 2098; 2099; GFX1132-LABEL: sub_i32_uniform: 2100; GFX1132: ; %bb.0: ; %entry 2101; GFX1132-NEXT: s_clause 0x1 2102; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x24 2103; GFX1132-NEXT: s_load_b32 s0, s[0:1], 0x2c 2104; GFX1132-NEXT: s_mov_b32 s2, exec_lo 2105; GFX1132-NEXT: s_mov_b32 s1, exec_lo 2106; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 2107; GFX1132-NEXT: ; implicit-def: $vgpr1 2108; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) 2109; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 2110; GFX1132-NEXT: s_cbranch_execz .LBB8_2 2111; 
GFX1132-NEXT: ; %bb.1: 2112; GFX1132-NEXT: s_bcnt1_i32_b32 s2, s2 2113; GFX1132-NEXT: v_mov_b32_e32 v1, 0 2114; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 2115; GFX1132-NEXT: s_mul_i32 s2, s0, s2 2116; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 2117; GFX1132-NEXT: v_mov_b32_e32 v2, s2 2118; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2119; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 2120; GFX1132-NEXT: ds_sub_rtn_u32 v1, v1, v2 2121; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 2122; GFX1132-NEXT: buffer_gl0_inv 2123; GFX1132-NEXT: .LBB8_2: 2124; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s1 2125; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 2126; GFX1132-NEXT: v_mul_lo_u32 v0, s0, v0 2127; GFX1132-NEXT: v_readfirstlane_b32 s0, v1 2128; GFX1132-NEXT: s_mov_b32 s7, 0x31016000 2129; GFX1132-NEXT: s_mov_b32 s6, -1 2130; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) 2131; GFX1132-NEXT: v_sub_nc_u32_e32 v0, s0, v0 2132; GFX1132-NEXT: buffer_store_b32 v0, off, s[4:7], 0 2133; GFX1132-NEXT: s_endpgm 2134entry: 2135 %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 %subitive acq_rel 2136 store i32 %old, i32 addrspace(1)* %out 2137 ret void 2138} 2139 2140define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) { 2141; 2142; 2143; GFX7LESS-LABEL: sub_i32_varying: 2144; GFX7LESS: ; %bb.0: ; %entry 2145; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 2146; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 2147; GFX7LESS-NEXT: s_mov_b32 m0, -1 2148; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2149; GFX7LESS-NEXT: ds_sub_rtn_u32 v0, v1, v0 2150; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2151; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 2152; GFX7LESS-NEXT: s_mov_b32 s2, -1 2153; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 2154; GFX7LESS-NEXT: s_endpgm 2155; 2156; GFX8-LABEL: sub_i32_varying: 2157; GFX8: ; %bb.0: ; %entry 2158; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2159; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 2160; GFX8-NEXT: v_mov_b32_e32 v1, 0 2161; GFX8-NEXT: s_mov_b64 exec, s[2:3] 2162; GFX8-NEXT: 
v_mbcnt_lo_u32_b32 v3, exec_lo, 0 2163; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 2164; GFX8-NEXT: v_mov_b32_e32 v2, v0 2165; GFX8-NEXT: s_not_b64 exec, exec 2166; GFX8-NEXT: v_mov_b32_e32 v2, 0 2167; GFX8-NEXT: s_not_b64 exec, exec 2168; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 2169; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 2170; GFX8-NEXT: s_nop 1 2171; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 2172; GFX8-NEXT: s_nop 1 2173; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 2174; GFX8-NEXT: s_nop 1 2175; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 2176; GFX8-NEXT: s_nop 1 2177; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 2178; GFX8-NEXT: s_nop 1 2179; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 2180; GFX8-NEXT: v_readlane_b32 s4, v2, 63 2181; GFX8-NEXT: s_nop 0 2182; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 2183; GFX8-NEXT: s_mov_b64 exec, s[2:3] 2184; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 2185; GFX8-NEXT: ; implicit-def: $vgpr0 2186; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 2187; GFX8-NEXT: s_cbranch_execz .LBB9_2 2188; GFX8-NEXT: ; %bb.1: 2189; GFX8-NEXT: v_mov_b32_e32 v0, 0 2190; GFX8-NEXT: v_mov_b32_e32 v3, s4 2191; GFX8-NEXT: s_mov_b32 m0, -1 2192; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2193; GFX8-NEXT: ds_sub_rtn_u32 v0, v0, v3 2194; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2195; GFX8-NEXT: .LBB9_2: 2196; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 2197; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2198; GFX8-NEXT: v_readfirstlane_b32 s2, v0 2199; GFX8-NEXT: v_mov_b32_e32 v0, v1 2200; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s2, v0 2201; GFX8-NEXT: s_mov_b32 s3, 0xf000 2202; GFX8-NEXT: s_mov_b32 s2, -1 2203; GFX8-NEXT: s_nop 0 2204; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 2205; GFX8-NEXT: s_endpgm 
2206; 2207; GFX9-LABEL: sub_i32_varying: 2208; GFX9: ; %bb.0: ; %entry 2209; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2210; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 2211; GFX9-NEXT: v_mov_b32_e32 v1, 0 2212; GFX9-NEXT: s_mov_b64 exec, s[2:3] 2213; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 2214; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 2215; GFX9-NEXT: v_mov_b32_e32 v2, v0 2216; GFX9-NEXT: s_not_b64 exec, exec 2217; GFX9-NEXT: v_mov_b32_e32 v2, 0 2218; GFX9-NEXT: s_not_b64 exec, exec 2219; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 2220; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 2221; GFX9-NEXT: s_nop 1 2222; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 2223; GFX9-NEXT: s_nop 1 2224; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 2225; GFX9-NEXT: s_nop 1 2226; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 2227; GFX9-NEXT: s_nop 1 2228; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 2229; GFX9-NEXT: s_nop 1 2230; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 2231; GFX9-NEXT: v_readlane_b32 s4, v2, 63 2232; GFX9-NEXT: s_nop 0 2233; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 2234; GFX9-NEXT: s_mov_b64 exec, s[2:3] 2235; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 2236; GFX9-NEXT: ; implicit-def: $vgpr0 2237; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 2238; GFX9-NEXT: s_cbranch_execz .LBB9_2 2239; GFX9-NEXT: ; %bb.1: 2240; GFX9-NEXT: v_mov_b32_e32 v0, 0 2241; GFX9-NEXT: v_mov_b32_e32 v3, s4 2242; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2243; GFX9-NEXT: ds_sub_rtn_u32 v0, v0, v3 2244; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2245; GFX9-NEXT: .LBB9_2: 2246; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 2247; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2248; GFX9-NEXT: v_readfirstlane_b32 s2, v0 2249; GFX9-NEXT: v_mov_b32_e32 v0, v1 2250; GFX9-NEXT: 
v_sub_u32_e32 v0, s2, v0 2251; GFX9-NEXT: s_mov_b32 s3, 0xf000 2252; GFX9-NEXT: s_mov_b32 s2, -1 2253; GFX9-NEXT: s_nop 0 2254; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 2255; GFX9-NEXT: s_endpgm 2256; 2257; GFX1064-LABEL: sub_i32_varying: 2258; GFX1064: ; %bb.0: ; %entry 2259; GFX1064-NEXT: v_mov_b32_e32 v1, v0 2260; GFX1064-NEXT: s_not_b64 exec, exec 2261; GFX1064-NEXT: v_mov_b32_e32 v1, 0 2262; GFX1064-NEXT: s_not_b64 exec, exec 2263; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 2264; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 2265; GFX1064-NEXT: v_mov_b32_e32 v3, 0 2266; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 2267; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 2268; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 2269; GFX1064-NEXT: v_mov_b32_e32 v2, v1 2270; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 2271; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 2272; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 2273; GFX1064-NEXT: v_mov_b32_e32 v2, s4 2274; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 2275; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 2276; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 2277; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 2278; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2279; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 2280; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 2281; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 2282; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 2283; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 2284; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 2285; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 2286; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 2287; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 2288; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 2289; 
GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 2290; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 2291; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 2292; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 2293; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2294; GFX1064-NEXT: s_mov_b32 s2, -1 2295; GFX1064-NEXT: ; implicit-def: $vgpr0 2296; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 2297; GFX1064-NEXT: s_cbranch_execz .LBB9_2 2298; GFX1064-NEXT: ; %bb.1: 2299; GFX1064-NEXT: v_mov_b32_e32 v0, 0 2300; GFX1064-NEXT: v_mov_b32_e32 v4, s7 2301; GFX1064-NEXT: s_mov_b32 s3, s7 2302; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2303; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 2304; GFX1064-NEXT: ds_sub_rtn_u32 v0, v0, v4 2305; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2306; GFX1064-NEXT: buffer_gl0_inv 2307; GFX1064-NEXT: .LBB9_2: 2308; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 2309; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 2310; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 2311; GFX1064-NEXT: v_mov_b32_e32 v0, v3 2312; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s3, v0 2313; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 2314; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2315; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 2316; GFX1064-NEXT: s_endpgm 2317; 2318; GFX1032-LABEL: sub_i32_varying: 2319; GFX1032: ; %bb.0: ; %entry 2320; GFX1032-NEXT: v_mov_b32_e32 v1, v0 2321; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 2322; GFX1032-NEXT: v_mov_b32_e32 v1, 0 2323; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 2324; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 2325; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 2326; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 2327; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 2328; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 2329; GFX1032-NEXT: v_mov_b32_e32 v2, v1 2330; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 2331; 
GFX1032-NEXT: s_mov_b32 exec_lo, s2 2332; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2333; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 2334; GFX1032-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 2335; GFX1032-NEXT: v_mov_b32_e32 v3, 0 2336; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 2337; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 2338; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 2339; GFX1032-NEXT: s_mov_b32 exec_lo, s2 2340; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 2341; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 2342; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 2343; GFX1032-NEXT: s_mov_b32 exec_lo, s2 2344; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 2345; GFX1032-NEXT: s_mov_b32 s2, -1 2346; GFX1032-NEXT: ; implicit-def: $vgpr0 2347; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 2348; GFX1032-NEXT: s_cbranch_execz .LBB9_2 2349; GFX1032-NEXT: ; %bb.1: 2350; GFX1032-NEXT: v_mov_b32_e32 v0, 0 2351; GFX1032-NEXT: v_mov_b32_e32 v4, s4 2352; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2353; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 2354; GFX1032-NEXT: ds_sub_rtn_u32 v0, v0, v4 2355; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2356; GFX1032-NEXT: buffer_gl0_inv 2357; GFX1032-NEXT: .LBB9_2: 2358; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 2359; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 2360; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 2361; GFX1032-NEXT: v_mov_b32_e32 v0, v3 2362; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s3, v0 2363; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 2364; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2365; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 2366; GFX1032-NEXT: s_endpgm 2367; 2368; GFX1164-LABEL: sub_i32_varying: 2369; GFX1164: ; %bb.0: ; %entry 2370; GFX1164-NEXT: v_mov_b32_e32 v1, v0 2371; GFX1164-NEXT: s_not_b64 exec, exec 2372; GFX1164-NEXT: v_mov_b32_e32 v1, 0 2373; GFX1164-NEXT: s_not_b64 exec, exec 2374; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 2375; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(SKIP_1) | instid1(VALU_DEP_2) 2376; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 2377; GFX1164-NEXT: v_mov_b32_e32 v3, 0 2378; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 2379; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2380; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 2381; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 2382; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2383; GFX1164-NEXT: v_mov_b32_e32 v2, v1 2384; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1 2385; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2386; GFX1164-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 2387; GFX1164-NEXT: v_readlane_b32 s4, v1, 31 2388; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2389; GFX1164-NEXT: v_mov_b32_e32 v2, s4 2390; GFX1164-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 2391; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) 2392; GFX1164-NEXT: v_readlane_b32 s4, v1, 15 2393; GFX1164-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 2394; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 2395; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 2396; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 2397; GFX1164-NEXT: v_readlane_b32 s5, v1, 31 2398; GFX1164-NEXT: v_writelane_b32 v3, s4, 16 2399; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 2400; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 2401; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 2402; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 2403; GFX1164-NEXT: v_readlane_b32 s7, v1, 63 2404; GFX1164-NEXT: v_readlane_b32 s6, v1, 47 2405; GFX1164-NEXT: v_writelane_b32 v3, s5, 32 2406; GFX1164-NEXT: s_mov_b64 exec, 
s[2:3] 2407; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) 2408; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 2409; GFX1164-NEXT: s_or_saveexec_b64 s[4:5], -1 2410; GFX1164-NEXT: v_writelane_b32 v3, s6, 48 2411; GFX1164-NEXT: s_mov_b64 exec, s[4:5] 2412; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2413; GFX1164-NEXT: s_mov_b32 s2, -1 2414; GFX1164-NEXT: ; implicit-def: $vgpr0 2415; GFX1164-NEXT: s_and_saveexec_b64 s[4:5], vcc 2416; GFX1164-NEXT: s_cbranch_execz .LBB9_2 2417; GFX1164-NEXT: ; %bb.1: 2418; GFX1164-NEXT: v_mov_b32_e32 v0, 0 2419; GFX1164-NEXT: v_mov_b32_e32 v4, s7 2420; GFX1164-NEXT: s_mov_b32 s3, s7 2421; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2422; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 2423; GFX1164-NEXT: ds_sub_rtn_u32 v0, v0, v4 2424; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 2425; GFX1164-NEXT: buffer_gl0_inv 2426; GFX1164-NEXT: .LBB9_2: 2427; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] 2428; GFX1164-NEXT: v_readfirstlane_b32 s3, v0 2429; GFX1164-NEXT: v_mov_b32_e32 v0, v3 2430; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) 2431; GFX1164-NEXT: v_sub_nc_u32_e32 v0, s3, v0 2432; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 2433; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 2434; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 2435; GFX1164-NEXT: s_endpgm 2436; 2437; GFX1132-LABEL: sub_i32_varying: 2438; GFX1132: ; %bb.0: ; %entry 2439; GFX1132-NEXT: v_mov_b32_e32 v1, v0 2440; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo 2441; GFX1132-NEXT: v_mov_b32_e32 v1, 0 2442; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo 2443; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 2444; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2445; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 2446; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 2447; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 
2448; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 2449; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 2450; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2451; GFX1132-NEXT: v_mov_b32_e32 v2, v1 2452; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1 2453; GFX1132-NEXT: s_mov_b32 exec_lo, s2 2454; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 2455; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 2456; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) 2457; GFX1132-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 2458; GFX1132-NEXT: v_mov_b32_e32 v3, 0 2459; GFX1132-NEXT: v_readlane_b32 s3, v1, 15 2460; GFX1132-NEXT: v_readlane_b32 s4, v1, 31 2461; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) 2462; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 2463; GFX1132-NEXT: s_mov_b32 exec_lo, s2 2464; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 2465; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 2466; GFX1132-NEXT: v_writelane_b32 v3, s3, 16 2467; GFX1132-NEXT: s_mov_b32 exec_lo, s2 2468; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) 2469; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 2470; GFX1132-NEXT: s_mov_b32 s2, -1 2471; GFX1132-NEXT: ; implicit-def: $vgpr0 2472; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo 2473; GFX1132-NEXT: s_cbranch_execz .LBB9_2 2474; GFX1132-NEXT: ; %bb.1: 2475; GFX1132-NEXT: v_mov_b32_e32 v0, 0 2476; GFX1132-NEXT: v_mov_b32_e32 v4, s4 2477; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2478; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 2479; GFX1132-NEXT: ds_sub_rtn_u32 v0, v0, v4 2480; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 2481; GFX1132-NEXT: buffer_gl0_inv 2482; GFX1132-NEXT: .LBB9_2: 2483; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 2484; GFX1132-NEXT: v_readfirstlane_b32 s3, v0 2485; GFX1132-NEXT: 
v_mov_b32_e32 v0, v3 2486; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) 2487; GFX1132-NEXT: v_sub_nc_u32_e32 v0, s3, v0 2488; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 2489; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 2490; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 2491; GFX1132-NEXT: s_endpgm 2492entry: 2493 %lane = call i32 @llvm.amdgcn.workitem.id.x() 2494 %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 %lane acq_rel 2495 store i32 %old, i32 addrspace(1)* %out 2496 ret void 2497} 2498 2499define amdgpu_kernel void @sub_i32_varying_nouse() { 2500; GFX7LESS-LABEL: sub_i32_varying_nouse: 2501; GFX7LESS: ; %bb.0: ; %entry 2502; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 2503; GFX7LESS-NEXT: s_mov_b32 m0, -1 2504; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2505; GFX7LESS-NEXT: ds_sub_u32 v1, v0 2506; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2507; GFX7LESS-NEXT: s_endpgm 2508; 2509; GFX8-LABEL: sub_i32_varying_nouse: 2510; GFX8: ; %bb.0: ; %entry 2511; GFX8-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 2512; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 2513; GFX8-NEXT: v_mov_b32_e32 v1, v0 2514; GFX8-NEXT: s_not_b64 exec, exec 2515; GFX8-NEXT: v_mov_b32_e32 v1, 0 2516; GFX8-NEXT: s_not_b64 exec, exec 2517; GFX8-NEXT: s_or_saveexec_b64 s[0:1], -1 2518; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 2519; GFX8-NEXT: s_nop 1 2520; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 2521; GFX8-NEXT: s_nop 1 2522; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 2523; GFX8-NEXT: s_nop 1 2524; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 2525; GFX8-NEXT: s_nop 1 2526; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf 2527; GFX8-NEXT: s_nop 1 2528; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf 2529; GFX8-NEXT: v_readlane_b32 s2, v1, 63 2530; GFX8-NEXT: 
s_mov_b64 exec, s[0:1] 2531; GFX8-NEXT: s_mov_b32 s0, s2 2532; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 2533; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 2534; GFX8-NEXT: s_cbranch_execz .LBB10_2 2535; GFX8-NEXT: ; %bb.1: 2536; GFX8-NEXT: v_mov_b32_e32 v0, 0 2537; GFX8-NEXT: v_mov_b32_e32 v2, s0 2538; GFX8-NEXT: s_mov_b32 m0, -1 2539; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2540; GFX8-NEXT: ds_sub_u32 v0, v2 2541; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2542; GFX8-NEXT: .LBB10_2: 2543; GFX8-NEXT: s_endpgm 2544; 2545; GFX9-LABEL: sub_i32_varying_nouse: 2546; GFX9: ; %bb.0: ; %entry 2547; GFX9-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 2548; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 2549; GFX9-NEXT: v_mov_b32_e32 v1, v0 2550; GFX9-NEXT: s_not_b64 exec, exec 2551; GFX9-NEXT: v_mov_b32_e32 v1, 0 2552; GFX9-NEXT: s_not_b64 exec, exec 2553; GFX9-NEXT: s_or_saveexec_b64 s[0:1], -1 2554; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 2555; GFX9-NEXT: s_nop 1 2556; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 2557; GFX9-NEXT: s_nop 1 2558; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 2559; GFX9-NEXT: s_nop 1 2560; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 2561; GFX9-NEXT: s_nop 1 2562; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf 2563; GFX9-NEXT: s_nop 1 2564; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf 2565; GFX9-NEXT: v_readlane_b32 s2, v1, 63 2566; GFX9-NEXT: s_mov_b64 exec, s[0:1] 2567; GFX9-NEXT: s_mov_b32 s0, s2 2568; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 2569; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 2570; GFX9-NEXT: s_cbranch_execz .LBB10_2 2571; GFX9-NEXT: ; %bb.1: 2572; GFX9-NEXT: v_mov_b32_e32 v0, 0 2573; GFX9-NEXT: v_mov_b32_e32 v2, s0 2574; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2575; GFX9-NEXT: ds_sub_u32 v0, v2 2576; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2577; 
GFX9-NEXT: .LBB10_2: 2578; GFX9-NEXT: s_endpgm 2579; 2580; GFX1064-LABEL: sub_i32_varying_nouse: 2581; GFX1064: ; %bb.0: ; %entry 2582; GFX1064-NEXT: v_mov_b32_e32 v1, v0 2583; GFX1064-NEXT: s_not_b64 exec, exec 2584; GFX1064-NEXT: v_mov_b32_e32 v1, 0 2585; GFX1064-NEXT: s_not_b64 exec, exec 2586; GFX1064-NEXT: s_or_saveexec_b64 s[0:1], -1 2587; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 2588; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 2589; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 2590; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 2591; GFX1064-NEXT: v_mov_b32_e32 v2, v1 2592; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 2593; GFX1064-NEXT: v_add_nc_u32_e32 v1, v1, v2 2594; GFX1064-NEXT: s_mov_b64 exec, s[0:1] 2595; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 2596; GFX1064-NEXT: s_or_saveexec_b64 s[0:1], -1 2597; GFX1064-NEXT: v_readlane_b32 s2, v1, 0 2598; GFX1064-NEXT: v_readlane_b32 s3, v1, 32 2599; GFX1064-NEXT: s_mov_b64 exec, s[0:1] 2600; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 2601; GFX1064-NEXT: s_add_i32 s0, s2, s3 2602; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2603; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc 2604; GFX1064-NEXT: s_cbranch_execz .LBB10_2 2605; GFX1064-NEXT: ; %bb.1: 2606; GFX1064-NEXT: v_mov_b32_e32 v0, 0 2607; GFX1064-NEXT: v_mov_b32_e32 v3, s0 2608; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2609; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 2610; GFX1064-NEXT: ds_sub_u32 v0, v3 2611; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2612; GFX1064-NEXT: buffer_gl0_inv 2613; GFX1064-NEXT: .LBB10_2: 2614; GFX1064-NEXT: s_endpgm 2615; 2616; GFX1032-LABEL: sub_i32_varying_nouse: 2617; GFX1032: ; %bb.0: ; %entry 2618; GFX1032-NEXT: v_mov_b32_e32 v1, v0 2619; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 2620; GFX1032-NEXT: v_mov_b32_e32 v1, 0 2621; 
GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 2622; GFX1032-NEXT: s_or_saveexec_b32 s0, -1 2623; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 2624; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 2625; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 2626; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 2627; GFX1032-NEXT: v_mov_b32_e32 v2, v1 2628; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 2629; GFX1032-NEXT: v_add_nc_u32_e32 v1, v1, v2 2630; GFX1032-NEXT: s_mov_b32 exec_lo, s0 2631; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 2632; GFX1032-NEXT: v_mov_b32_e32 v0, v1 2633; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 2634; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo 2635; GFX1032-NEXT: s_cbranch_execz .LBB10_2 2636; GFX1032-NEXT: ; %bb.1: 2637; GFX1032-NEXT: v_mov_b32_e32 v3, 0 2638; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2639; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 2640; GFX1032-NEXT: ds_sub_u32 v3, v0 2641; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2642; GFX1032-NEXT: buffer_gl0_inv 2643; GFX1032-NEXT: .LBB10_2: 2644; GFX1032-NEXT: s_endpgm 2645; 2646; GFX1164-LABEL: sub_i32_varying_nouse: 2647; GFX1164: ; %bb.0: ; %entry 2648; GFX1164-NEXT: v_mov_b32_e32 v1, v0 2649; GFX1164-NEXT: s_not_b64 exec, exec 2650; GFX1164-NEXT: v_mov_b32_e32 v1, 0 2651; GFX1164-NEXT: s_not_b64 exec, exec 2652; GFX1164-NEXT: s_or_saveexec_b64 s[0:1], -1 2653; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2654; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 2655; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 2656; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2657; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf 
bank_mask:0xf bound_ctrl:1 2658; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 2659; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2660; GFX1164-NEXT: v_mov_b32_e32 v2, v1 2661; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1 2662; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2663; GFX1164-NEXT: v_add_nc_u32_e32 v1, v1, v2 2664; GFX1164-NEXT: v_permlane64_b32 v2, v1 2665; GFX1164-NEXT: s_mov_b64 exec, s[0:1] 2666; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) 2667; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 2668; GFX1164-NEXT: s_or_saveexec_b64 s[0:1], -1 2669; GFX1164-NEXT: v_add_nc_u32_e32 v1, v1, v2 2670; GFX1164-NEXT: s_mov_b64 exec, s[0:1] 2671; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) 2672; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v0 2673; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) 2674; GFX1164-NEXT: v_mov_b32_e32 v0, v1 2675; GFX1164-NEXT: s_mov_b64 s[0:1], exec 2676; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v3 2677; GFX1164-NEXT: s_cbranch_execz .LBB10_2 2678; GFX1164-NEXT: ; %bb.1: 2679; GFX1164-NEXT: v_mov_b32_e32 v3, 0 2680; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2681; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 2682; GFX1164-NEXT: ds_sub_u32 v3, v0 2683; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 2684; GFX1164-NEXT: buffer_gl0_inv 2685; GFX1164-NEXT: .LBB10_2: 2686; GFX1164-NEXT: s_endpgm 2687; 2688; GFX1132-LABEL: sub_i32_varying_nouse: 2689; GFX1132: ; %bb.0: ; %entry 2690; GFX1132-NEXT: v_mov_b32_e32 v1, v0 2691; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo 2692; GFX1132-NEXT: v_mov_b32_e32 v1, 0 2693; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo 2694; GFX1132-NEXT: s_or_saveexec_b32 s0, -1 2695; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2696; GFX1132-NEXT: v_add_nc_u32_dpp v1, 
v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 2697; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 2698; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2699; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 2700; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 2701; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2702; GFX1132-NEXT: v_mov_b32_e32 v2, v1 2703; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1 2704; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) 2705; GFX1132-NEXT: v_add_nc_u32_e32 v1, v1, v2 2706; GFX1132-NEXT: s_mov_b32 exec_lo, s0 2707; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 2708; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) 2709; GFX1132-NEXT: v_mov_b32_e32 v0, v1 2710; GFX1132-NEXT: s_mov_b32 s0, exec_lo 2711; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v3 2712; GFX1132-NEXT: s_cbranch_execz .LBB10_2 2713; GFX1132-NEXT: ; %bb.1: 2714; GFX1132-NEXT: v_mov_b32_e32 v3, 0 2715; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2716; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 2717; GFX1132-NEXT: ds_sub_u32 v3, v0 2718; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 2719; GFX1132-NEXT: buffer_gl0_inv 2720; GFX1132-NEXT: .LBB10_2: 2721; GFX1132-NEXT: s_endpgm 2722entry: 2723 %lane = call i32 @llvm.amdgcn.workitem.id.x() 2724 %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 %lane acq_rel 2725 ret void 2726} 2727 2728define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) { 2729; 2730; 2731; GFX7LESS-LABEL: sub_i64_constant: 2732; GFX7LESS: ; %bb.0: ; %entry 2733; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec 2734; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 2735; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 2736; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 
v2, s5, v0 2737; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 2738; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 2739; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc 2740; GFX7LESS-NEXT: s_cbranch_execz .LBB11_2 2741; GFX7LESS-NEXT: ; %bb.1: 2742; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 2743; GFX7LESS-NEXT: s_mul_i32 s4, s4, 5 2744; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 2745; GFX7LESS-NEXT: v_mov_b32_e32 v0, s4 2746; GFX7LESS-NEXT: s_mov_b32 m0, -1 2747; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2748; GFX7LESS-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] 2749; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2750; GFX7LESS-NEXT: .LBB11_2: 2751; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] 2752; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2753; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v0 2754; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1 2755; GFX7LESS-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 2756; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v2 2757; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 2758; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4 2759; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 2760; GFX7LESS-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc 2761; GFX7LESS-NEXT: s_mov_b32 s2, -1 2762; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2763; GFX7LESS-NEXT: s_endpgm 2764; 2765; GFX8-LABEL: sub_i64_constant: 2766; GFX8: ; %bb.0: ; %entry 2767; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2768; GFX8-NEXT: s_mov_b64 s[4:5], exec 2769; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 2770; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 2771; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 2772; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 2773; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 2774; GFX8-NEXT: s_cbranch_execz .LBB11_2 2775; GFX8-NEXT: ; %bb.1: 2776; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 2777; GFX8-NEXT: s_mul_i32 s4, s4, 5 2778; GFX8-NEXT: v_mov_b32_e32 v0, s4 2779; GFX8-NEXT: v_mov_b32_e32 v1, 0 2780; GFX8-NEXT: s_mov_b32 m0, -1 2781; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2782; GFX8-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] 2783; GFX8-NEXT: 
s_waitcnt lgkmcnt(0) 2784; GFX8-NEXT: .LBB11_2: 2785; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 2786; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2787; GFX8-NEXT: v_readfirstlane_b32 s2, v0 2788; GFX8-NEXT: v_readfirstlane_b32 s3, v1 2789; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v2 2790; GFX8-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 2791; GFX8-NEXT: v_mov_b32_e32 v2, s3 2792; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s2, v0 2793; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc 2794; GFX8-NEXT: s_mov_b32 s3, 0xf000 2795; GFX8-NEXT: s_mov_b32 s2, -1 2796; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2797; GFX8-NEXT: s_endpgm 2798; 2799; GFX9-LABEL: sub_i64_constant: 2800; GFX9: ; %bb.0: ; %entry 2801; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2802; GFX9-NEXT: s_mov_b64 s[4:5], exec 2803; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 2804; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 2805; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 2806; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 2807; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 2808; GFX9-NEXT: s_cbranch_execz .LBB11_2 2809; GFX9-NEXT: ; %bb.1: 2810; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 2811; GFX9-NEXT: s_mul_i32 s4, s4, 5 2812; GFX9-NEXT: v_mov_b32_e32 v0, s4 2813; GFX9-NEXT: v_mov_b32_e32 v1, 0 2814; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2815; GFX9-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] 2816; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2817; GFX9-NEXT: .LBB11_2: 2818; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 2819; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2820; GFX9-NEXT: v_readfirstlane_b32 s2, v0 2821; GFX9-NEXT: v_readfirstlane_b32 s3, v1 2822; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v2 2823; GFX9-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 2824; GFX9-NEXT: v_mov_b32_e32 v2, s3 2825; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s2, v0 2826; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc 2827; GFX9-NEXT: s_mov_b32 s3, 0xf000 2828; GFX9-NEXT: s_mov_b32 s2, -1 2829; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2830; GFX9-NEXT: s_endpgm 2831; 2832; GFX1064-LABEL: sub_i64_constant: 
2833; GFX1064: ; %bb.0: ; %entry 2834; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2835; GFX1064-NEXT: s_mov_b64 s[4:5], exec 2836; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 2837; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 2838; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 2839; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 2840; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc 2841; GFX1064-NEXT: s_cbranch_execz .LBB11_2 2842; GFX1064-NEXT: ; %bb.1: 2843; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 2844; GFX1064-NEXT: v_mov_b32_e32 v1, 0 2845; GFX1064-NEXT: s_mul_i32 s4, s4, 5 2846; GFX1064-NEXT: v_mov_b32_e32 v0, s4 2847; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2848; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 2849; GFX1064-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] 2850; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2851; GFX1064-NEXT: buffer_gl0_inv 2852; GFX1064-NEXT: .LBB11_2: 2853; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 2854; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] 2855; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 2856; GFX1064-NEXT: v_mul_u32_u24_e32 v0, 5, v2 2857; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 2858; GFX1064-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 2859; GFX1064-NEXT: v_sub_co_u32 v0, vcc, s2, v0 2860; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc 2861; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 2862; GFX1064-NEXT: s_mov_b32 s2, -1 2863; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2864; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2865; GFX1064-NEXT: s_endpgm 2866; 2867; GFX1032-LABEL: sub_i64_constant: 2868; GFX1032: ; %bb.0: ; %entry 2869; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2870; GFX1032-NEXT: s_mov_b32 s3, exec_lo 2871; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 2872; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s3, 0 2873; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 2874; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo 2875; GFX1032-NEXT: s_cbranch_execz .LBB11_2 2876; GFX1032-NEXT: ; %bb.1: 2877; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 2878; GFX1032-NEXT: 
v_mov_b32_e32 v1, 0 2879; GFX1032-NEXT: s_mul_i32 s3, s3, 5 2880; GFX1032-NEXT: v_mov_b32_e32 v0, s3 2881; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2882; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 2883; GFX1032-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] 2884; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2885; GFX1032-NEXT: buffer_gl0_inv 2886; GFX1032-NEXT: .LBB11_2: 2887; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 2888; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 2889; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 2890; GFX1032-NEXT: v_mul_u32_u24_e32 v0, 5, v2 2891; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 2892; GFX1032-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 2893; GFX1032-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v0 2894; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo 2895; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 2896; GFX1032-NEXT: s_mov_b32 s2, -1 2897; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2898; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2899; GFX1032-NEXT: s_endpgm 2900; 2901; GFX1164-LABEL: sub_i64_constant: 2902; GFX1164: ; %bb.0: ; %entry 2903; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 2904; GFX1164-NEXT: s_mov_b64 s[4:5], exec 2905; GFX1164-NEXT: s_mov_b64 s[2:3], exec 2906; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 2907; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2908; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 2909; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 2910; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 2911; GFX1164-NEXT: s_cbranch_execz .LBB11_2 2912; GFX1164-NEXT: ; %bb.1: 2913; GFX1164-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 2914; GFX1164-NEXT: v_mov_b32_e32 v1, 0 2915; GFX1164-NEXT: s_mul_i32 s4, s4, 5 2916; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 2917; GFX1164-NEXT: v_mov_b32_e32 v0, s4 2918; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2919; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 2920; GFX1164-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] 2921; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 2922; GFX1164-NEXT: buffer_gl0_inv 
2923; GFX1164-NEXT: .LBB11_2: 2924; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] 2925; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 2926; GFX1164-NEXT: v_mul_u32_u24_e32 v0, 5, v2 2927; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 2928; GFX1164-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 2929; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) 2930; GFX1164-NEXT: v_sub_co_u32 v0, vcc, s2, v0 2931; GFX1164-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc 2932; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 2933; GFX1164-NEXT: s_mov_b32 s2, -1 2934; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 2935; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 2936; GFX1164-NEXT: s_endpgm 2937; 2938; GFX1132-LABEL: sub_i64_constant: 2939; GFX1132: ; %bb.0: ; %entry 2940; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 2941; GFX1132-NEXT: s_mov_b32 s3, exec_lo 2942; GFX1132-NEXT: s_mov_b32 s2, exec_lo 2943; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s3, 0 2944; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 2945; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) 2946; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 2947; GFX1132-NEXT: s_cbranch_execz .LBB11_2 2948; GFX1132-NEXT: ; %bb.1: 2949; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s3 2950; GFX1132-NEXT: v_mov_b32_e32 v1, 0 2951; GFX1132-NEXT: s_mul_i32 s3, s3, 5 2952; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 2953; GFX1132-NEXT: v_mov_b32_e32 v0, s3 2954; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2955; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 2956; GFX1132-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] 2957; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 2958; GFX1132-NEXT: buffer_gl0_inv 2959; GFX1132-NEXT: .LBB11_2: 2960; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2 2961; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 2962; GFX1132-NEXT: v_mul_u32_u24_e32 v0, 5, v2 2963; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 2964; GFX1132-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 2965; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) 2966; GFX1132-NEXT: 
v_sub_co_u32 v0, vcc_lo, s2, v0 2967; GFX1132-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo 2968; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 2969; GFX1132-NEXT: s_mov_b32 s2, -1 2970; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 2971; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 2972; GFX1132-NEXT: s_endpgm 2973entry: 2974 %old = atomicrmw sub i64 addrspace(3)* @local_var64, i64 5 acq_rel 2975 store i64 %old, i64 addrspace(1)* %out 2976 ret void 2977} 2978 2979define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 %subitive) { 2980; 2981; 2982; GFX7LESS-LABEL: sub_i64_uniform: 2983; GFX7LESS: ; %bb.0: ; %entry 2984; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec 2985; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 2986; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 2987; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s7, v0 2988; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 2989; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 2990; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc 2991; GFX7LESS-NEXT: s_cbranch_execz .LBB12_2 2992; GFX7LESS-NEXT: ; %bb.1: 2993; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 2994; GFX7LESS-NEXT: v_mov_b32_e32 v3, 0 2995; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2996; GFX7LESS-NEXT: s_mul_i32 s7, s3, s6 2997; GFX7LESS-NEXT: v_mov_b32_e32 v0, s6 2998; GFX7LESS-NEXT: v_mul_hi_u32 v0, s2, v0 2999; GFX7LESS-NEXT: s_mul_i32 s6, s2, s6 3000; GFX7LESS-NEXT: v_add_i32_e32 v1, vcc, s7, v0 3001; GFX7LESS-NEXT: v_mov_b32_e32 v0, s6 3002; GFX7LESS-NEXT: s_mov_b32 m0, -1 3003; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3004; GFX7LESS-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1] 3005; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3006; GFX7LESS-NEXT: .LBB12_2: 3007; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] 3008; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 3009; GFX7LESS-NEXT: s_mov_b32 s6, -1 3010; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3011; GFX7LESS-NEXT: s_mov_b32 s4, s0 3012; GFX7LESS-NEXT: s_mov_b32 s5, s1 3013; GFX7LESS-NEXT: v_readfirstlane_b32 s0, v0 3014; GFX7LESS-NEXT: 
v_readfirstlane_b32 s1, v1 3015; GFX7LESS-NEXT: v_mul_lo_u32 v0, s3, v2 3016; GFX7LESS-NEXT: v_mul_hi_u32 v1, s2, v2 3017; GFX7LESS-NEXT: v_mul_lo_u32 v2, s2, v2 3018; GFX7LESS-NEXT: v_add_i32_e32 v1, vcc, v1, v0 3019; GFX7LESS-NEXT: v_mov_b32_e32 v3, s1 3020; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s0, v2 3021; GFX7LESS-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc 3022; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 3023; GFX7LESS-NEXT: s_endpgm 3024; 3025; GFX8-LABEL: sub_i64_uniform: 3026; GFX8: ; %bb.0: ; %entry 3027; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 3028; GFX8-NEXT: s_mov_b64 s[6:7], exec 3029; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 3030; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 3031; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 3032; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 3033; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 3034; GFX8-NEXT: s_cbranch_execz .LBB12_2 3035; GFX8-NEXT: ; %bb.1: 3036; GFX8-NEXT: s_bcnt1_i32_b64 s8, s[6:7] 3037; GFX8-NEXT: v_mov_b32_e32 v0, s8 3038; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3039; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[6:7], s2, v0, 0 3040; GFX8-NEXT: s_mul_i32 s6, s3, s8 3041; GFX8-NEXT: v_mov_b32_e32 v3, 0 3042; GFX8-NEXT: v_add_u32_e32 v1, vcc, s6, v1 3043; GFX8-NEXT: s_mov_b32 m0, -1 3044; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3045; GFX8-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1] 3046; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3047; GFX8-NEXT: .LBB12_2: 3048; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 3049; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3050; GFX8-NEXT: s_mov_b32 s4, s0 3051; GFX8-NEXT: s_mov_b32 s5, s1 3052; GFX8-NEXT: v_mul_lo_u32 v4, s3, v2 3053; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v2, 0 3054; GFX8-NEXT: v_readfirstlane_b32 s0, v0 3055; GFX8-NEXT: v_readfirstlane_b32 s1, v1 3056; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v4 3057; GFX8-NEXT: v_mov_b32_e32 v3, s1 3058; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s0, v2 3059; GFX8-NEXT: s_mov_b32 s7, 0xf000 3060; GFX8-NEXT: s_mov_b32 s6, -1 3061; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, 
vcc 3062; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 3063; GFX8-NEXT: s_endpgm 3064; 3065; GFX9-LABEL: sub_i64_uniform: 3066; GFX9: ; %bb.0: ; %entry 3067; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 3068; GFX9-NEXT: s_mov_b64 s[6:7], exec 3069; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 3070; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 3071; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 3072; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 3073; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 3074; GFX9-NEXT: s_cbranch_execz .LBB12_2 3075; GFX9-NEXT: ; %bb.1: 3076; GFX9-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 3077; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3078; GFX9-NEXT: s_mul_i32 s7, s3, s6 3079; GFX9-NEXT: s_mul_hi_u32 s8, s2, s6 3080; GFX9-NEXT: s_add_i32 s8, s8, s7 3081; GFX9-NEXT: s_mul_i32 s6, s2, s6 3082; GFX9-NEXT: v_mov_b32_e32 v0, s6 3083; GFX9-NEXT: v_mov_b32_e32 v1, s8 3084; GFX9-NEXT: v_mov_b32_e32 v3, 0 3085; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3086; GFX9-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1] 3087; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3088; GFX9-NEXT: .LBB12_2: 3089; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 3090; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3091; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], s2, v2, 0 3092; GFX9-NEXT: s_mov_b32 s4, s0 3093; GFX9-NEXT: s_mov_b32 s5, s1 3094; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[2:3], s3, v2, v[4:5] 3095; GFX9-NEXT: v_readfirstlane_b32 s0, v0 3096; GFX9-NEXT: v_readfirstlane_b32 s1, v1 3097; GFX9-NEXT: v_mov_b32_e32 v1, v4 3098; GFX9-NEXT: v_mov_b32_e32 v2, s1 3099; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s0, v3 3100; GFX9-NEXT: s_mov_b32 s7, 0xf000 3101; GFX9-NEXT: s_mov_b32 s6, -1 3102; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc 3103; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 3104; GFX9-NEXT: s_endpgm 3105; 3106; GFX1064-LABEL: sub_i64_uniform: 3107; GFX1064: ; %bb.0: ; %entry 3108; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 3109; GFX1064-NEXT: s_mov_b64 s[6:7], exec 3110; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 3111; 
GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 3112; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 3113; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 3114; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 3115; GFX1064-NEXT: s_cbranch_execz .LBB12_2 3116; GFX1064-NEXT: ; %bb.1: 3117; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 3118; GFX1064-NEXT: v_mov_b32_e32 v3, 0 3119; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3120; GFX1064-NEXT: s_mul_i32 s7, s3, s6 3121; GFX1064-NEXT: s_mul_hi_u32 s8, s2, s6 3122; GFX1064-NEXT: s_mul_i32 s6, s2, s6 3123; GFX1064-NEXT: s_add_i32 s8, s8, s7 3124; GFX1064-NEXT: v_mov_b32_e32 v0, s6 3125; GFX1064-NEXT: v_mov_b32_e32 v1, s8 3126; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3127; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 3128; GFX1064-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1] 3129; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3130; GFX1064-NEXT: buffer_gl0_inv 3131; GFX1064-NEXT: .LBB12_2: 3132; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 3133; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 3134; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3135; GFX1064-NEXT: v_mad_u64_u32 v[3:4], s[4:5], s2, v2, 0 3136; GFX1064-NEXT: v_readfirstlane_b32 s4, v1 3137; GFX1064-NEXT: v_mad_u64_u32 v[4:5], s[2:3], s3, v2, v[4:5] 3138; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 3139; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 3140; GFX1064-NEXT: v_sub_co_u32 v0, vcc, s2, v3 3141; GFX1064-NEXT: v_mov_b32_e32 v1, v4 3142; GFX1064-NEXT: s_mov_b32 s2, -1 3143; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s4, v1, vcc 3144; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 3145; GFX1064-NEXT: s_endpgm 3146; 3147; GFX1032-LABEL: sub_i64_uniform: 3148; GFX1032: ; %bb.0: ; %entry 3149; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 3150; GFX1032-NEXT: s_mov_b32 s5, exec_lo 3151; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 3152; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0 3153; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 3154; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo 3155; GFX1032-NEXT: s_cbranch_execz .LBB12_2 
3156; GFX1032-NEXT: ; %bb.1: 3157; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s5 3158; GFX1032-NEXT: v_mov_b32_e32 v3, 0 3159; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3160; GFX1032-NEXT: s_mul_i32 s6, s3, s5 3161; GFX1032-NEXT: s_mul_hi_u32 s7, s2, s5 3162; GFX1032-NEXT: s_mul_i32 s5, s2, s5 3163; GFX1032-NEXT: s_add_i32 s7, s7, s6 3164; GFX1032-NEXT: v_mov_b32_e32 v0, s5 3165; GFX1032-NEXT: v_mov_b32_e32 v1, s7 3166; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3167; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 3168; GFX1032-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1] 3169; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3170; GFX1032-NEXT: buffer_gl0_inv 3171; GFX1032-NEXT: .LBB12_2: 3172; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 3173; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 3174; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3175; GFX1032-NEXT: v_mad_u64_u32 v[3:4], s2, s2, v2, 0 3176; GFX1032-NEXT: v_readfirstlane_b32 s4, v1 3177; GFX1032-NEXT: v_mad_u64_u32 v[4:5], s2, s3, v2, v[4:5] 3178; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 3179; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 3180; GFX1032-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v3 3181; GFX1032-NEXT: v_mov_b32_e32 v1, v4 3182; GFX1032-NEXT: s_mov_b32 s2, -1 3183; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s4, v1, vcc_lo 3184; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 3185; GFX1032-NEXT: s_endpgm 3186; 3187; GFX1164-LABEL: sub_i64_uniform: 3188; GFX1164: ; %bb.0: ; %entry 3189; GFX1164-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 3190; GFX1164-NEXT: s_mov_b64 s[6:7], exec 3191; GFX1164-NEXT: s_mov_b64 s[4:5], exec 3192; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 3193; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 3194; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 3195; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 3196; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 3197; GFX1164-NEXT: s_cbranch_execz .LBB12_2 3198; GFX1164-NEXT: ; %bb.1: 3199; GFX1164-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 3200; GFX1164-NEXT: v_mov_b32_e32 v3, 0 
3201; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 3202; GFX1164-NEXT: s_mul_i32 s7, s3, s6 3203; GFX1164-NEXT: s_mul_hi_u32 s8, s2, s6 3204; GFX1164-NEXT: s_mul_i32 s6, s2, s6 3205; GFX1164-NEXT: s_add_i32 s8, s8, s7 3206; GFX1164-NEXT: v_mov_b32_e32 v0, s6 3207; GFX1164-NEXT: v_mov_b32_e32 v1, s8 3208; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3209; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 3210; GFX1164-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1] 3211; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 3212; GFX1164-NEXT: buffer_gl0_inv 3213; GFX1164-NEXT: .LBB12_2: 3214; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] 3215; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 3216; GFX1164-NEXT: v_mad_u64_u32 v[3:4], null, s2, v2, 0 3217; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 3218; GFX1164-NEXT: v_readfirstlane_b32 s4, v1 3219; GFX1164-NEXT: s_waitcnt_depctr 0xfff 3220; GFX1164-NEXT: v_mad_u64_u32 v[5:6], null, s3, v2, v[4:5] 3221; GFX1164-NEXT: v_sub_co_u32 v0, vcc, s2, v3 3222; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 3223; GFX1164-NEXT: s_mov_b32 s2, -1 3224; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) 3225; GFX1164-NEXT: v_mov_b32_e32 v1, v5 3226; GFX1164-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s4, v1, vcc 3227; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 3228; GFX1164-NEXT: s_endpgm 3229; 3230; GFX1132-LABEL: sub_i64_uniform: 3231; GFX1132: ; %bb.0: ; %entry 3232; GFX1132-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 3233; GFX1132-NEXT: s_mov_b32 s5, exec_lo 3234; GFX1132-NEXT: s_mov_b32 s4, exec_lo 3235; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0 3236; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 3237; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) 3238; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 3239; GFX1132-NEXT: s_cbranch_execz .LBB12_2 3240; GFX1132-NEXT: ; %bb.1: 3241; GFX1132-NEXT: s_bcnt1_i32_b32 s5, s5 3242; GFX1132-NEXT: v_mov_b32_e32 v3, 0 3243; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 3244; GFX1132-NEXT: s_mul_i32 s6, s3, s5 3245; GFX1132-NEXT: s_mul_hi_u32 s7, s2, s5 
; Test intent for @sub_i64_varying (its `define` appears within the next line
; of this file): an `atomicrmw sub` on the addrspace(3) global @local_var64
; whose operand is the workitem id zero-extended to i64. Per the CHECK lines
; for this function, every prefix (GFX7LESS, GFX8, GFX9, GFX10, GFX11) expects
; the atomic to be emitted directly as a single ds_sub_rtn_u64 with v[0:1] as
; the data operand, followed by a store of the returned value; none of the
; mbcnt / saveexec / readfirstlane sequence checked for @sub_i64_uniform
; appears here.
; The assertions are autogenerated (see the NOTE on the first line of the
; file); regenerate them with the update script rather than editing by hand.
3246; GFX1132-NEXT: s_mul_i32 s5, s2, s5 3247; GFX1132-NEXT: s_add_i32 s7, s7, s6 3248; GFX1132-NEXT: v_mov_b32_e32 v0, s5 3249; GFX1132-NEXT: v_mov_b32_e32 v1, s7 3250; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3251; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 3252; GFX1132-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1] 3253; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 3254; GFX1132-NEXT: buffer_gl0_inv 3255; GFX1132-NEXT: .LBB12_2: 3256; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s4 3257; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 3258; GFX1132-NEXT: v_mad_u64_u32 v[3:4], null, s2, v2, 0 3259; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 3260; GFX1132-NEXT: v_readfirstlane_b32 s4, v1 3261; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) 3262; GFX1132-NEXT: v_mad_u64_u32 v[5:6], null, s3, v2, v[4:5] 3263; GFX1132-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v3 3264; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 3265; GFX1132-NEXT: s_mov_b32 s2, -1 3266; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) 3267; GFX1132-NEXT: v_mov_b32_e32 v1, v5 3268; GFX1132-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s4, v1, vcc_lo 3269; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 3270; GFX1132-NEXT: s_endpgm 3271entry: 3272 %old = atomicrmw sub i64 addrspace(3)* @local_var64, i64 %subitive acq_rel 3273 store i64 %old, i64 addrspace(1)* %out 3274 ret void 3275} 3276 3277define amdgpu_kernel void @sub_i64_varying(i64 addrspace(1)* %out) { 3278; 3279; 3280; GFX7LESS-LABEL: sub_i64_varying: 3281; GFX7LESS: ; %bb.0: ; %entry 3282; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 3283; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 3284; GFX7LESS-NEXT: s_mov_b32 m0, -1 3285; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3286; GFX7LESS-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] 3287; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3288; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 3289; GFX7LESS-NEXT: s_mov_b32 s2, -1 3290; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 3291; GFX7LESS-NEXT: 
s_endpgm 3292; 3293; GFX8-LABEL: sub_i64_varying: 3294; GFX8: ; %bb.0: ; %entry 3295; GFX8-NEXT: v_mov_b32_e32 v1, 0 3296; GFX8-NEXT: s_mov_b32 m0, -1 3297; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3298; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3299; GFX8-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] 3300; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3301; GFX8-NEXT: s_mov_b32 s3, 0xf000 3302; GFX8-NEXT: s_mov_b32 s2, -1 3303; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 3304; GFX8-NEXT: s_endpgm 3305; 3306; GFX9-LABEL: sub_i64_varying: 3307; GFX9: ; %bb.0: ; %entry 3308; GFX9-NEXT: v_mov_b32_e32 v1, 0 3309; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3310; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3311; GFX9-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] 3312; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3313; GFX9-NEXT: s_mov_b32 s3, 0xf000 3314; GFX9-NEXT: s_mov_b32 s2, -1 3315; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 3316; GFX9-NEXT: s_endpgm 3317; 3318; GFX10-LABEL: sub_i64_varying: 3319; GFX10: ; %bb.0: ; %entry 3320; GFX10-NEXT: v_mov_b32_e32 v1, 0 3321; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3322; GFX10-NEXT: s_mov_b32 s3, 0x31016000 3323; GFX10-NEXT: s_mov_b32 s2, -1 3324; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3325; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 3326; GFX10-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] 3327; GFX10-NEXT: s_waitcnt lgkmcnt(0) 3328; GFX10-NEXT: buffer_gl0_inv 3329; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 3330; GFX10-NEXT: s_endpgm 3331; 3332; GFX11-LABEL: sub_i64_varying: 3333; GFX11: ; %bb.0: ; %entry 3334; GFX11-NEXT: v_mov_b32_e32 v1, 0 3335; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 3336; GFX11-NEXT: s_mov_b32 s3, 0x31016000 3337; GFX11-NEXT: s_mov_b32 s2, -1 3338; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3339; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 3340; GFX11-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] 3341; GFX11-NEXT: s_waitcnt lgkmcnt(0) 3342; GFX11-NEXT: buffer_gl0_inv 3343; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 
; Test intent for @and_i32_varying (its `define` appears within the next line
; of this file): an `atomicrmw and` on the addrspace(3) global @local_var32
; whose operand is the workitem id. Per the CHECK lines for this function:
;   - GFX7LESS: the atomic is emitted directly as one ds_and_rtn_b32.
;   - GFX8/GFX9: the lanes selected by `s_not_b64 exec, exec` are seeded with
;     -1, a v_and_b32_dpp chain (row_shr:1/2/4/8, row_bcast:15/31) combines the
;     values, lane 63 is read with v_readlane_b32, and a single ds_and_rtn_b32
;     is issued under s_and_saveexec for the lane whose mbcnt result is 0. The
;     returned value is then read with v_readfirstlane_b32 and and-ed with the
;     per-lane wave_shr:1 result before the store.
;   - GFX10/GFX11: same overall shape, but v_permlanex16_b32 together with
;     v_readlane_b32/v_writelane_b32 is used instead of row_bcast/wave_shr; the
;     value fed to ds_and_rtn_b32 is read from lane 63 under the GFX1064/GFX1164
;     prefixes and from lane 31 under the GFX1032/GFX1132 prefixes.
; The assertions are autogenerated (see the NOTE on the first line of the
; file); regenerate them with the update script rather than editing by hand.
3344; GFX11-NEXT: s_endpgm 3345entry: 3346 %lane = call i32 @llvm.amdgcn.workitem.id.x() 3347 %zext = zext i32 %lane to i64 3348 %old = atomicrmw sub i64 addrspace(3)* @local_var64, i64 %zext acq_rel 3349 store i64 %old, i64 addrspace(1)* %out 3350 ret void 3351} 3352 3353define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) { 3354; 3355; 3356; GFX7LESS-LABEL: and_i32_varying: 3357; GFX7LESS: ; %bb.0: ; %entry 3358; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 3359; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 3360; GFX7LESS-NEXT: s_mov_b32 m0, -1 3361; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3362; GFX7LESS-NEXT: ds_and_rtn_b32 v0, v1, v0 3363; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3364; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 3365; GFX7LESS-NEXT: s_mov_b32 s2, -1 3366; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 3367; GFX7LESS-NEXT: s_endpgm 3368; 3369; GFX8-LABEL: and_i32_varying: 3370; GFX8: ; %bb.0: ; %entry 3371; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3372; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 3373; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 3374; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 3375; GFX8-NEXT: v_mov_b32_e32 v1, -1 3376; GFX8-NEXT: s_mov_b64 exec, s[2:3] 3377; GFX8-NEXT: v_mov_b32_e32 v2, v0 3378; GFX8-NEXT: s_not_b64 exec, exec 3379; GFX8-NEXT: v_mov_b32_e32 v2, -1 3380; GFX8-NEXT: s_not_b64 exec, exec 3381; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 3382; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3383; GFX8-NEXT: s_nop 1 3384; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 3385; GFX8-NEXT: s_nop 1 3386; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 3387; GFX8-NEXT: s_nop 1 3388; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 3389; GFX8-NEXT: s_nop 1 3390; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 3391; GFX8-NEXT: s_nop 1 3392; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:31 
row_mask:0xc bank_mask:0xf 3393; GFX8-NEXT: v_readlane_b32 s4, v2, 63 3394; GFX8-NEXT: s_nop 0 3395; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 3396; GFX8-NEXT: s_mov_b64 exec, s[2:3] 3397; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 3398; GFX8-NEXT: ; implicit-def: $vgpr0 3399; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 3400; GFX8-NEXT: s_cbranch_execz .LBB14_2 3401; GFX8-NEXT: ; %bb.1: 3402; GFX8-NEXT: v_mov_b32_e32 v0, 0 3403; GFX8-NEXT: v_mov_b32_e32 v3, s4 3404; GFX8-NEXT: s_mov_b32 m0, -1 3405; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3406; GFX8-NEXT: ds_and_rtn_b32 v0, v0, v3 3407; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3408; GFX8-NEXT: .LBB14_2: 3409; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 3410; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3411; GFX8-NEXT: v_readfirstlane_b32 s2, v0 3412; GFX8-NEXT: v_mov_b32_e32 v0, v1 3413; GFX8-NEXT: v_and_b32_e32 v0, s2, v0 3414; GFX8-NEXT: s_mov_b32 s3, 0xf000 3415; GFX8-NEXT: s_mov_b32 s2, -1 3416; GFX8-NEXT: s_nop 0 3417; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 3418; GFX8-NEXT: s_endpgm 3419; 3420; GFX9-LABEL: and_i32_varying: 3421; GFX9: ; %bb.0: ; %entry 3422; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3423; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 3424; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 3425; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 3426; GFX9-NEXT: v_mov_b32_e32 v1, -1 3427; GFX9-NEXT: s_mov_b64 exec, s[2:3] 3428; GFX9-NEXT: v_mov_b32_e32 v2, v0 3429; GFX9-NEXT: s_not_b64 exec, exec 3430; GFX9-NEXT: v_mov_b32_e32 v2, -1 3431; GFX9-NEXT: s_not_b64 exec, exec 3432; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 3433; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3434; GFX9-NEXT: s_nop 1 3435; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 3436; GFX9-NEXT: s_nop 1 3437; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 3438; GFX9-NEXT: s_nop 1 3439; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 3440; 
GFX9-NEXT: s_nop 1 3441; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 3442; GFX9-NEXT: s_nop 1 3443; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 3444; GFX9-NEXT: v_readlane_b32 s4, v2, 63 3445; GFX9-NEXT: s_nop 0 3446; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 3447; GFX9-NEXT: s_mov_b64 exec, s[2:3] 3448; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 3449; GFX9-NEXT: ; implicit-def: $vgpr0 3450; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 3451; GFX9-NEXT: s_cbranch_execz .LBB14_2 3452; GFX9-NEXT: ; %bb.1: 3453; GFX9-NEXT: v_mov_b32_e32 v0, 0 3454; GFX9-NEXT: v_mov_b32_e32 v3, s4 3455; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3456; GFX9-NEXT: ds_and_rtn_b32 v0, v0, v3 3457; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3458; GFX9-NEXT: .LBB14_2: 3459; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 3460; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3461; GFX9-NEXT: v_readfirstlane_b32 s2, v0 3462; GFX9-NEXT: v_mov_b32_e32 v0, v1 3463; GFX9-NEXT: v_and_b32_e32 v0, s2, v0 3464; GFX9-NEXT: s_mov_b32 s3, 0xf000 3465; GFX9-NEXT: s_mov_b32 s2, -1 3466; GFX9-NEXT: s_nop 0 3467; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 3468; GFX9-NEXT: s_endpgm 3469; 3470; GFX1064-LABEL: and_i32_varying: 3471; GFX1064: ; %bb.0: ; %entry 3472; GFX1064-NEXT: v_mov_b32_e32 v1, v0 3473; GFX1064-NEXT: s_not_b64 exec, exec 3474; GFX1064-NEXT: v_mov_b32_e32 v1, -1 3475; GFX1064-NEXT: s_not_b64 exec, exec 3476; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 3477; GFX1064-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 3478; GFX1064-NEXT: v_mov_b32_e32 v3, -1 3479; GFX1064-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 3480; GFX1064-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 3481; GFX1064-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 3482; GFX1064-NEXT: v_mov_b32_e32 v2, v1 3483; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 3484; GFX1064-NEXT: v_and_b32_dpp v1, v2, v1 
quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 3485; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 3486; GFX1064-NEXT: v_mov_b32_e32 v2, s4 3487; GFX1064-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 3488; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 3489; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 3490; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 3491; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3492; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 3493; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 3494; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 3495; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 3496; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3497; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 3498; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 3499; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 3500; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 3501; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 3502; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 3503; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 3504; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 3505; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 3506; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3507; GFX1064-NEXT: s_mov_b32 s2, -1 3508; GFX1064-NEXT: ; implicit-def: $vgpr0 3509; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 3510; GFX1064-NEXT: s_cbranch_execz .LBB14_2 3511; GFX1064-NEXT: ; %bb.1: 3512; GFX1064-NEXT: v_mov_b32_e32 v0, 0 3513; GFX1064-NEXT: v_mov_b32_e32 v4, s7 3514; GFX1064-NEXT: s_mov_b32 s3, s7 3515; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3516; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 3517; GFX1064-NEXT: ds_and_rtn_b32 v0, v0, v4 3518; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3519; GFX1064-NEXT: buffer_gl0_inv 3520; GFX1064-NEXT: .LBB14_2: 3521; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 3522; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 3523; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 3524; GFX1064-NEXT: v_mov_b32_e32 v0, v3 3525; GFX1064-NEXT: v_and_b32_e32 v0, s3, v0 3526; GFX1064-NEXT: s_mov_b32 s3, 
0x31016000 3527; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3528; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 3529; GFX1064-NEXT: s_endpgm 3530; 3531; GFX1032-LABEL: and_i32_varying: 3532; GFX1032: ; %bb.0: ; %entry 3533; GFX1032-NEXT: v_mov_b32_e32 v1, v0 3534; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 3535; GFX1032-NEXT: v_mov_b32_e32 v1, -1 3536; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 3537; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3538; GFX1032-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 3539; GFX1032-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 3540; GFX1032-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 3541; GFX1032-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 3542; GFX1032-NEXT: v_mov_b32_e32 v2, v1 3543; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 3544; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3545; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3546; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3547; GFX1032-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 3548; GFX1032-NEXT: v_mov_b32_e32 v3, -1 3549; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 3550; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 3551; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 3552; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3553; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3554; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3555; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 3556; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3557; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 3558; GFX1032-NEXT: s_mov_b32 s2, -1 3559; GFX1032-NEXT: ; implicit-def: $vgpr0 3560; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 3561; GFX1032-NEXT: s_cbranch_execz .LBB14_2 3562; GFX1032-NEXT: ; %bb.1: 3563; GFX1032-NEXT: v_mov_b32_e32 v0, 0 3564; GFX1032-NEXT: v_mov_b32_e32 v4, s4 3565; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3566; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 3567; GFX1032-NEXT: 
ds_and_rtn_b32 v0, v0, v4 3568; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3569; GFX1032-NEXT: buffer_gl0_inv 3570; GFX1032-NEXT: .LBB14_2: 3571; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 3572; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 3573; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 3574; GFX1032-NEXT: v_mov_b32_e32 v0, v3 3575; GFX1032-NEXT: v_and_b32_e32 v0, s3, v0 3576; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 3577; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3578; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 3579; GFX1032-NEXT: s_endpgm 3580; 3581; GFX1164-LABEL: and_i32_varying: 3582; GFX1164: ; %bb.0: ; %entry 3583; GFX1164-NEXT: v_mov_b32_e32 v1, v0 3584; GFX1164-NEXT: s_not_b64 exec, exec 3585; GFX1164-NEXT: v_mov_b32_e32 v1, -1 3586; GFX1164-NEXT: s_not_b64 exec, exec 3587; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 3588; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) 3589; GFX1164-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 3590; GFX1164-NEXT: v_mov_b32_e32 v3, -1 3591; GFX1164-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 3592; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 3593; GFX1164-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 3594; GFX1164-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 3595; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 3596; GFX1164-NEXT: v_mov_b32_e32 v2, v1 3597; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1 3598; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 3599; GFX1164-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 3600; GFX1164-NEXT: v_readlane_b32 s4, v1, 31 3601; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 3602; GFX1164-NEXT: v_mov_b32_e32 v2, s4 3603; GFX1164-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc 
bank_mask:0xf 3604; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) 3605; GFX1164-NEXT: v_readlane_b32 s4, v1, 15 3606; GFX1164-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 3607; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 3608; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 3609; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 3610; GFX1164-NEXT: v_readlane_b32 s5, v1, 31 3611; GFX1164-NEXT: v_writelane_b32 v3, s4, 16 3612; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 3613; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 3614; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3615; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 3616; GFX1164-NEXT: v_readlane_b32 s7, v1, 63 3617; GFX1164-NEXT: v_readlane_b32 s6, v1, 47 3618; GFX1164-NEXT: v_writelane_b32 v3, s5, 32 3619; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 3620; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) 3621; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 3622; GFX1164-NEXT: s_or_saveexec_b64 s[4:5], -1 3623; GFX1164-NEXT: v_writelane_b32 v3, s6, 48 3624; GFX1164-NEXT: s_mov_b64 exec, s[4:5] 3625; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3626; GFX1164-NEXT: s_mov_b32 s2, -1 3627; GFX1164-NEXT: ; implicit-def: $vgpr0 3628; GFX1164-NEXT: s_and_saveexec_b64 s[4:5], vcc 3629; GFX1164-NEXT: s_cbranch_execz .LBB14_2 3630; GFX1164-NEXT: ; %bb.1: 3631; GFX1164-NEXT: v_mov_b32_e32 v0, 0 3632; GFX1164-NEXT: v_mov_b32_e32 v4, s7 3633; GFX1164-NEXT: s_mov_b32 s3, s7 3634; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3635; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 3636; GFX1164-NEXT: ds_and_rtn_b32 v0, v0, v4 3637; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 3638; GFX1164-NEXT: buffer_gl0_inv 3639; GFX1164-NEXT: .LBB14_2: 3640; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] 3641; GFX1164-NEXT: v_readfirstlane_b32 s3, v0 3642; GFX1164-NEXT: v_mov_b32_e32 v0, v3 3643; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) 3644; GFX1164-NEXT: v_and_b32_e32 v0, s3, v0 3645; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 
3646; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 3647; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 3648; GFX1164-NEXT: s_endpgm 3649; 3650; GFX1132-LABEL: and_i32_varying: 3651; GFX1132: ; %bb.0: ; %entry 3652; GFX1132-NEXT: v_mov_b32_e32 v1, v0 3653; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo 3654; GFX1132-NEXT: v_mov_b32_e32 v1, -1 3655; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo 3656; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 3657; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 3658; GFX1132-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 3659; GFX1132-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 3660; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 3661; GFX1132-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 3662; GFX1132-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 3663; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 3664; GFX1132-NEXT: v_mov_b32_e32 v2, v1 3665; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1 3666; GFX1132-NEXT: s_mov_b32 exec_lo, s2 3667; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 3668; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 3669; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) 3670; GFX1132-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 3671; GFX1132-NEXT: v_mov_b32_e32 v3, -1 3672; GFX1132-NEXT: v_readlane_b32 s3, v1, 15 3673; GFX1132-NEXT: v_readlane_b32 s4, v1, 31 3674; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) 3675; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 3676; GFX1132-NEXT: s_mov_b32 exec_lo, s2 3677; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3678; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 3679; GFX1132-NEXT: v_writelane_b32 v3, s3, 16 3680; GFX1132-NEXT: s_mov_b32 exec_lo, s2 3681; GFX1132-NEXT: 
s_delay_alu instid0(VALU_DEP_2) 3682; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 3683; GFX1132-NEXT: s_mov_b32 s2, -1 3684; GFX1132-NEXT: ; implicit-def: $vgpr0 3685; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo 3686; GFX1132-NEXT: s_cbranch_execz .LBB14_2 3687; GFX1132-NEXT: ; %bb.1: 3688; GFX1132-NEXT: v_mov_b32_e32 v0, 0 3689; GFX1132-NEXT: v_mov_b32_e32 v4, s4 3690; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3691; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 3692; GFX1132-NEXT: ds_and_rtn_b32 v0, v0, v4 3693; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 3694; GFX1132-NEXT: buffer_gl0_inv 3695; GFX1132-NEXT: .LBB14_2: 3696; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 3697; GFX1132-NEXT: v_readfirstlane_b32 s3, v0 3698; GFX1132-NEXT: v_mov_b32_e32 v0, v3 3699; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) 3700; GFX1132-NEXT: v_and_b32_e32 v0, s3, v0 3701; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 3702; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 3703; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 3704; GFX1132-NEXT: s_endpgm 3705entry: 3706 %lane = call i32 @llvm.amdgcn.workitem.id.x() 3707 %old = atomicrmw and i32 addrspace(3)* @local_var32, i32 %lane acq_rel 3708 store i32 %old, i32 addrspace(1)* %out 3709 ret void 3710} 3711 3712define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) { 3713; 3714; 3715; GFX7LESS-LABEL: or_i32_varying: 3716; GFX7LESS: ; %bb.0: ; %entry 3717; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 3718; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 3719; GFX7LESS-NEXT: s_mov_b32 m0, -1 3720; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3721; GFX7LESS-NEXT: ds_or_rtn_b32 v0, v1, v0 3722; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3723; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 3724; GFX7LESS-NEXT: s_mov_b32 s2, -1 3725; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 3726; GFX7LESS-NEXT: s_endpgm 3727; 3728; GFX8-LABEL: or_i32_varying: 3729; GFX8: ; %bb.0: ; %entry 3730; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3731; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 3732; 
GFX8-NEXT: v_mov_b32_e32 v1, 0 3733; GFX8-NEXT: s_mov_b64 exec, s[2:3] 3734; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 3735; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 3736; GFX8-NEXT: v_mov_b32_e32 v2, v0 3737; GFX8-NEXT: s_not_b64 exec, exec 3738; GFX8-NEXT: v_mov_b32_e32 v2, 0 3739; GFX8-NEXT: s_not_b64 exec, exec 3740; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 3741; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 3742; GFX8-NEXT: s_nop 1 3743; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 3744; GFX8-NEXT: s_nop 1 3745; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 3746; GFX8-NEXT: s_nop 1 3747; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 3748; GFX8-NEXT: s_nop 1 3749; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 3750; GFX8-NEXT: s_nop 1 3751; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 3752; GFX8-NEXT: v_readlane_b32 s4, v2, 63 3753; GFX8-NEXT: s_nop 0 3754; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 3755; GFX8-NEXT: s_mov_b64 exec, s[2:3] 3756; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 3757; GFX8-NEXT: ; implicit-def: $vgpr0 3758; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 3759; GFX8-NEXT: s_cbranch_execz .LBB15_2 3760; GFX8-NEXT: ; %bb.1: 3761; GFX8-NEXT: v_mov_b32_e32 v0, 0 3762; GFX8-NEXT: v_mov_b32_e32 v3, s4 3763; GFX8-NEXT: s_mov_b32 m0, -1 3764; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3765; GFX8-NEXT: ds_or_rtn_b32 v0, v0, v3 3766; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3767; GFX8-NEXT: .LBB15_2: 3768; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 3769; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3770; GFX8-NEXT: v_readfirstlane_b32 s2, v0 3771; GFX8-NEXT: v_mov_b32_e32 v0, v1 3772; GFX8-NEXT: v_or_b32_e32 v0, s2, v0 3773; GFX8-NEXT: s_mov_b32 s3, 0xf000 3774; GFX8-NEXT: s_mov_b32 s2, -1 3775; GFX8-NEXT: s_nop 0 3776; GFX8-NEXT: buffer_store_dword 
v0, off, s[0:3], 0 3777; GFX8-NEXT: s_endpgm 3778; 3779; GFX9-LABEL: or_i32_varying: 3780; GFX9: ; %bb.0: ; %entry 3781; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3782; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 3783; GFX9-NEXT: v_mov_b32_e32 v1, 0 3784; GFX9-NEXT: s_mov_b64 exec, s[2:3] 3785; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 3786; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 3787; GFX9-NEXT: v_mov_b32_e32 v2, v0 3788; GFX9-NEXT: s_not_b64 exec, exec 3789; GFX9-NEXT: v_mov_b32_e32 v2, 0 3790; GFX9-NEXT: s_not_b64 exec, exec 3791; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 3792; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 3793; GFX9-NEXT: s_nop 1 3794; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 3795; GFX9-NEXT: s_nop 1 3796; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 3797; GFX9-NEXT: s_nop 1 3798; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 3799; GFX9-NEXT: s_nop 1 3800; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 3801; GFX9-NEXT: s_nop 1 3802; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 3803; GFX9-NEXT: v_readlane_b32 s4, v2, 63 3804; GFX9-NEXT: s_nop 0 3805; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 3806; GFX9-NEXT: s_mov_b64 exec, s[2:3] 3807; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 3808; GFX9-NEXT: ; implicit-def: $vgpr0 3809; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 3810; GFX9-NEXT: s_cbranch_execz .LBB15_2 3811; GFX9-NEXT: ; %bb.1: 3812; GFX9-NEXT: v_mov_b32_e32 v0, 0 3813; GFX9-NEXT: v_mov_b32_e32 v3, s4 3814; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3815; GFX9-NEXT: ds_or_rtn_b32 v0, v0, v3 3816; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3817; GFX9-NEXT: .LBB15_2: 3818; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 3819; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3820; GFX9-NEXT: v_readfirstlane_b32 s2, v0 3821; GFX9-NEXT: 
v_mov_b32_e32 v0, v1 3822; GFX9-NEXT: v_or_b32_e32 v0, s2, v0 3823; GFX9-NEXT: s_mov_b32 s3, 0xf000 3824; GFX9-NEXT: s_mov_b32 s2, -1 3825; GFX9-NEXT: s_nop 0 3826; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 3827; GFX9-NEXT: s_endpgm 3828; 3829; GFX1064-LABEL: or_i32_varying: 3830; GFX1064: ; %bb.0: ; %entry 3831; GFX1064-NEXT: v_mov_b32_e32 v1, v0 3832; GFX1064-NEXT: s_not_b64 exec, exec 3833; GFX1064-NEXT: v_mov_b32_e32 v1, 0 3834; GFX1064-NEXT: s_not_b64 exec, exec 3835; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 3836; GFX1064-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 3837; GFX1064-NEXT: v_mov_b32_e32 v3, 0 3838; GFX1064-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 3839; GFX1064-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 3840; GFX1064-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 3841; GFX1064-NEXT: v_mov_b32_e32 v2, v1 3842; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 3843; GFX1064-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 3844; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 3845; GFX1064-NEXT: v_mov_b32_e32 v2, s4 3846; GFX1064-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 3847; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 3848; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 3849; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 3850; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3851; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 3852; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 3853; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 3854; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 3855; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3856; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 3857; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 3858; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 3859; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 3860; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 
3861; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 3862; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 3863; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 3864; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 3865; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3866; GFX1064-NEXT: s_mov_b32 s2, -1 3867; GFX1064-NEXT: ; implicit-def: $vgpr0 3868; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 3869; GFX1064-NEXT: s_cbranch_execz .LBB15_2 3870; GFX1064-NEXT: ; %bb.1: 3871; GFX1064-NEXT: v_mov_b32_e32 v0, 0 3872; GFX1064-NEXT: v_mov_b32_e32 v4, s7 3873; GFX1064-NEXT: s_mov_b32 s3, s7 3874; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3875; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 3876; GFX1064-NEXT: ds_or_rtn_b32 v0, v0, v4 3877; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3878; GFX1064-NEXT: buffer_gl0_inv 3879; GFX1064-NEXT: .LBB15_2: 3880; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 3881; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 3882; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 3883; GFX1064-NEXT: v_mov_b32_e32 v0, v3 3884; GFX1064-NEXT: v_or_b32_e32 v0, s3, v0 3885; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 3886; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3887; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 3888; GFX1064-NEXT: s_endpgm 3889; 3890; GFX1032-LABEL: or_i32_varying: 3891; GFX1032: ; %bb.0: ; %entry 3892; GFX1032-NEXT: v_mov_b32_e32 v1, v0 3893; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 3894; GFX1032-NEXT: v_mov_b32_e32 v1, 0 3895; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 3896; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3897; GFX1032-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 3898; GFX1032-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 3899; GFX1032-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 3900; GFX1032-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 3901; GFX1032-NEXT: v_mov_b32_e32 v2, v1 3902; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 3903; GFX1032-NEXT: 
s_mov_b32 exec_lo, s2 3904; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3905; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3906; GFX1032-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 3907; GFX1032-NEXT: v_mov_b32_e32 v3, 0 3908; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 3909; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 3910; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 3911; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3912; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3913; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3914; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 3915; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3916; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 3917; GFX1032-NEXT: s_mov_b32 s2, -1 3918; GFX1032-NEXT: ; implicit-def: $vgpr0 3919; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 3920; GFX1032-NEXT: s_cbranch_execz .LBB15_2 3921; GFX1032-NEXT: ; %bb.1: 3922; GFX1032-NEXT: v_mov_b32_e32 v0, 0 3923; GFX1032-NEXT: v_mov_b32_e32 v4, s4 3924; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3925; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 3926; GFX1032-NEXT: ds_or_rtn_b32 v0, v0, v4 3927; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3928; GFX1032-NEXT: buffer_gl0_inv 3929; GFX1032-NEXT: .LBB15_2: 3930; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 3931; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 3932; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 3933; GFX1032-NEXT: v_mov_b32_e32 v0, v3 3934; GFX1032-NEXT: v_or_b32_e32 v0, s3, v0 3935; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 3936; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3937; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 3938; GFX1032-NEXT: s_endpgm 3939; 3940; GFX1164-LABEL: or_i32_varying: 3941; GFX1164: ; %bb.0: ; %entry 3942; GFX1164-NEXT: v_mov_b32_e32 v1, v0 3943; GFX1164-NEXT: s_not_b64 exec, exec 3944; GFX1164-NEXT: v_mov_b32_e32 v1, 0 3945; GFX1164-NEXT: s_not_b64 exec, exec 3946; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 3947; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | 
instid1(VALU_DEP_2) 3948; GFX1164-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 3949; GFX1164-NEXT: v_mov_b32_e32 v3, 0 3950; GFX1164-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 3951; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 3952; GFX1164-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 3953; GFX1164-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 3954; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 3955; GFX1164-NEXT: v_mov_b32_e32 v2, v1 3956; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1 3957; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 3958; GFX1164-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 3959; GFX1164-NEXT: v_readlane_b32 s4, v1, 31 3960; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 3961; GFX1164-NEXT: v_mov_b32_e32 v2, s4 3962; GFX1164-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 3963; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) 3964; GFX1164-NEXT: v_readlane_b32 s4, v1, 15 3965; GFX1164-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 3966; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 3967; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 3968; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 3969; GFX1164-NEXT: v_readlane_b32 s5, v1, 31 3970; GFX1164-NEXT: v_writelane_b32 v3, s4, 16 3971; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 3972; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 3973; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3974; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 3975; GFX1164-NEXT: v_readlane_b32 s7, v1, 63 3976; GFX1164-NEXT: v_readlane_b32 s6, v1, 47 3977; GFX1164-NEXT: v_writelane_b32 v3, s5, 32 3978; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 3979; GFX1164-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) 3980; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 3981; GFX1164-NEXT: s_or_saveexec_b64 s[4:5], -1 3982; GFX1164-NEXT: v_writelane_b32 v3, s6, 48 3983; GFX1164-NEXT: s_mov_b64 exec, s[4:5] 3984; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3985; GFX1164-NEXT: s_mov_b32 s2, -1 3986; GFX1164-NEXT: ; implicit-def: $vgpr0 3987; GFX1164-NEXT: s_and_saveexec_b64 s[4:5], vcc 3988; GFX1164-NEXT: s_cbranch_execz .LBB15_2 3989; GFX1164-NEXT: ; %bb.1: 3990; GFX1164-NEXT: v_mov_b32_e32 v0, 0 3991; GFX1164-NEXT: v_mov_b32_e32 v4, s7 3992; GFX1164-NEXT: s_mov_b32 s3, s7 3993; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3994; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 3995; GFX1164-NEXT: ds_or_rtn_b32 v0, v0, v4 3996; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 3997; GFX1164-NEXT: buffer_gl0_inv 3998; GFX1164-NEXT: .LBB15_2: 3999; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] 4000; GFX1164-NEXT: v_readfirstlane_b32 s3, v0 4001; GFX1164-NEXT: v_mov_b32_e32 v0, v3 4002; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) 4003; GFX1164-NEXT: v_or_b32_e32 v0, s3, v0 4004; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 4005; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 4006; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 4007; GFX1164-NEXT: s_endpgm 4008; 4009; GFX1132-LABEL: or_i32_varying: 4010; GFX1132: ; %bb.0: ; %entry 4011; GFX1132-NEXT: v_mov_b32_e32 v1, v0 4012; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo 4013; GFX1132-NEXT: v_mov_b32_e32 v1, 0 4014; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo 4015; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 4016; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 4017; GFX1132-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 4018; GFX1132-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 4019; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 4020; GFX1132-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 
row_mask:0xf bank_mask:0xf bound_ctrl:1 4021; GFX1132-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 4022; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 4023; GFX1132-NEXT: v_mov_b32_e32 v2, v1 4024; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1 4025; GFX1132-NEXT: s_mov_b32 exec_lo, s2 4026; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 4027; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 4028; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) 4029; GFX1132-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 4030; GFX1132-NEXT: v_mov_b32_e32 v3, 0 4031; GFX1132-NEXT: v_readlane_b32 s3, v1, 15 4032; GFX1132-NEXT: v_readlane_b32 s4, v1, 31 4033; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) 4034; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 4035; GFX1132-NEXT: s_mov_b32 exec_lo, s2 4036; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4037; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 4038; GFX1132-NEXT: v_writelane_b32 v3, s3, 16 4039; GFX1132-NEXT: s_mov_b32 exec_lo, s2 4040; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) 4041; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 4042; GFX1132-NEXT: s_mov_b32 s2, -1 4043; GFX1132-NEXT: ; implicit-def: $vgpr0 4044; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo 4045; GFX1132-NEXT: s_cbranch_execz .LBB15_2 4046; GFX1132-NEXT: ; %bb.1: 4047; GFX1132-NEXT: v_mov_b32_e32 v0, 0 4048; GFX1132-NEXT: v_mov_b32_e32 v4, s4 4049; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4050; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 4051; GFX1132-NEXT: ds_or_rtn_b32 v0, v0, v4 4052; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 4053; GFX1132-NEXT: buffer_gl0_inv 4054; GFX1132-NEXT: .LBB15_2: 4055; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 4056; GFX1132-NEXT: v_readfirstlane_b32 s3, v0 4057; GFX1132-NEXT: v_mov_b32_e32 v0, v3 4058; GFX1132-NEXT: s_delay_alu 
instid0(VALU_DEP_1) 4059; GFX1132-NEXT: v_or_b32_e32 v0, s3, v0 4060; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 4061; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 4062; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 4063; GFX1132-NEXT: s_endpgm 4064entry: 4065 %lane = call i32 @llvm.amdgcn.workitem.id.x() 4066 %old = atomicrmw or i32 addrspace(3)* @local_var32, i32 %lane acq_rel 4067 store i32 %old, i32 addrspace(1)* %out 4068 ret void 4069} 4070 4071define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) { 4072; Each lane passes its own workitem id as the xor operand of an acq_rel atomicrmw on @local_var32, then stores the returned old value to %out. 4073; The checks below expect a direct ds_xor_rtn_b32 on the default target, and on tonga and newer a DPP xor scan feeding one ds_xor_rtn_b32 issued only by lanes whose mbcnt result is 0. 4074; GFX7LESS-LABEL: xor_i32_varying: 4075; GFX7LESS: ; %bb.0: ; %entry 4076; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 4077; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 4078; GFX7LESS-NEXT: s_mov_b32 m0, -1 4079; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 4080; GFX7LESS-NEXT: ds_xor_rtn_b32 v0, v1, v0 4081; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 4082; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 4083; GFX7LESS-NEXT: s_mov_b32 s2, -1 4084; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 4085; GFX7LESS-NEXT: s_endpgm 4086; 4087; GFX8-LABEL: xor_i32_varying: 4088; GFX8: ; %bb.0: ; %entry 4089; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4090; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 4091; GFX8-NEXT: v_mov_b32_e32 v1, 0 4092; GFX8-NEXT: s_mov_b64 exec, s[2:3] 4093; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 4094; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 4095; GFX8-NEXT: v_mov_b32_e32 v2, v0 4096; GFX8-NEXT: s_not_b64 exec, exec 4097; GFX8-NEXT: v_mov_b32_e32 v2, 0 4098; GFX8-NEXT: s_not_b64 exec, exec 4099; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 4100; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 4101; GFX8-NEXT: s_nop 1 4102; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 4103; GFX8-NEXT: s_nop 1 4104; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 4105; GFX8-NEXT: s_nop 1 4106; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf 
bank_mask:0xf bound_ctrl:1 4107; GFX8-NEXT: s_nop 1 4108; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 4109; GFX8-NEXT: s_nop 1 4110; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 4111; GFX8-NEXT: v_readlane_b32 s4, v2, 63 4112; GFX8-NEXT: s_nop 0 4113; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 4114; GFX8-NEXT: s_mov_b64 exec, s[2:3] 4115; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 4116; GFX8-NEXT: ; implicit-def: $vgpr0 4117; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 4118; GFX8-NEXT: s_cbranch_execz .LBB16_2 4119; GFX8-NEXT: ; %bb.1: 4120; GFX8-NEXT: v_mov_b32_e32 v0, 0 4121; GFX8-NEXT: v_mov_b32_e32 v3, s4 4122; GFX8-NEXT: s_mov_b32 m0, -1 4123; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4124; GFX8-NEXT: ds_xor_rtn_b32 v0, v0, v3 4125; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4126; GFX8-NEXT: .LBB16_2: 4127; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 4128; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4129; GFX8-NEXT: v_readfirstlane_b32 s2, v0 4130; GFX8-NEXT: v_mov_b32_e32 v0, v1 4131; GFX8-NEXT: v_xor_b32_e32 v0, s2, v0 4132; GFX8-NEXT: s_mov_b32 s3, 0xf000 4133; GFX8-NEXT: s_mov_b32 s2, -1 4134; GFX8-NEXT: s_nop 0 4135; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 4136; GFX8-NEXT: s_endpgm 4137; 4138; GFX9-LABEL: xor_i32_varying: 4139; GFX9: ; %bb.0: ; %entry 4140; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4141; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 4142; GFX9-NEXT: v_mov_b32_e32 v1, 0 4143; GFX9-NEXT: s_mov_b64 exec, s[2:3] 4144; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 4145; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 4146; GFX9-NEXT: v_mov_b32_e32 v2, v0 4147; GFX9-NEXT: s_not_b64 exec, exec 4148; GFX9-NEXT: v_mov_b32_e32 v2, 0 4149; GFX9-NEXT: s_not_b64 exec, exec 4150; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 4151; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 4152; GFX9-NEXT: s_nop 1 4153; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf 
bank_mask:0xf bound_ctrl:1 4154; GFX9-NEXT: s_nop 1 4155; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 4156; GFX9-NEXT: s_nop 1 4157; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 4158; GFX9-NEXT: s_nop 1 4159; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 4160; GFX9-NEXT: s_nop 1 4161; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 4162; GFX9-NEXT: v_readlane_b32 s4, v2, 63 4163; GFX9-NEXT: s_nop 0 4164; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 4165; GFX9-NEXT: s_mov_b64 exec, s[2:3] 4166; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 4167; GFX9-NEXT: ; implicit-def: $vgpr0 4168; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 4169; GFX9-NEXT: s_cbranch_execz .LBB16_2 4170; GFX9-NEXT: ; %bb.1: 4171; GFX9-NEXT: v_mov_b32_e32 v0, 0 4172; GFX9-NEXT: v_mov_b32_e32 v3, s4 4173; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4174; GFX9-NEXT: ds_xor_rtn_b32 v0, v0, v3 4175; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4176; GFX9-NEXT: .LBB16_2: 4177; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 4178; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4179; GFX9-NEXT: v_readfirstlane_b32 s2, v0 4180; GFX9-NEXT: v_mov_b32_e32 v0, v1 4181; GFX9-NEXT: v_xor_b32_e32 v0, s2, v0 4182; GFX9-NEXT: s_mov_b32 s3, 0xf000 4183; GFX9-NEXT: s_mov_b32 s2, -1 4184; GFX9-NEXT: s_nop 0 4185; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 4186; GFX9-NEXT: s_endpgm 4187; 4188; GFX1064-LABEL: xor_i32_varying: 4189; GFX1064: ; %bb.0: ; %entry 4190; GFX1064-NEXT: v_mov_b32_e32 v1, v0 4191; GFX1064-NEXT: s_not_b64 exec, exec 4192; GFX1064-NEXT: v_mov_b32_e32 v1, 0 4193; GFX1064-NEXT: s_not_b64 exec, exec 4194; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 4195; GFX1064-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 4196; GFX1064-NEXT: v_mov_b32_e32 v3, 0 4197; GFX1064-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 4198; 
GFX1064-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 4199; GFX1064-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 4200; GFX1064-NEXT: v_mov_b32_e32 v2, v1 4201; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 4202; GFX1064-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 4203; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 4204; GFX1064-NEXT: v_mov_b32_e32 v2, s4 4205; GFX1064-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 4206; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 4207; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 4208; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 4209; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4210; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 4211; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 4212; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 4213; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 4214; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4215; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 4216; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 4217; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 4218; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 4219; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 4220; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 4221; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 4222; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 4223; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 4224; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4225; GFX1064-NEXT: s_mov_b32 s2, -1 4226; GFX1064-NEXT: ; implicit-def: $vgpr0 4227; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 4228; GFX1064-NEXT: s_cbranch_execz .LBB16_2 4229; GFX1064-NEXT: ; %bb.1: 4230; GFX1064-NEXT: v_mov_b32_e32 v0, 0 4231; GFX1064-NEXT: v_mov_b32_e32 v4, s7 4232; GFX1064-NEXT: s_mov_b32 s3, s7 4233; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4234; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 4235; GFX1064-NEXT: ds_xor_rtn_b32 v0, v0, v4 4236; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 4237; 
GFX1064-NEXT: buffer_gl0_inv 4238; GFX1064-NEXT: .LBB16_2: 4239; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 4240; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 4241; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 4242; GFX1064-NEXT: v_mov_b32_e32 v0, v3 4243; GFX1064-NEXT: v_xor_b32_e32 v0, s3, v0 4244; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 4245; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 4246; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 4247; GFX1064-NEXT: s_endpgm 4248; 4249; GFX1032-LABEL: xor_i32_varying: 4250; GFX1032: ; %bb.0: ; %entry 4251; GFX1032-NEXT: v_mov_b32_e32 v1, v0 4252; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 4253; GFX1032-NEXT: v_mov_b32_e32 v1, 0 4254; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 4255; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 4256; GFX1032-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 4257; GFX1032-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 4258; GFX1032-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 4259; GFX1032-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 4260; GFX1032-NEXT: v_mov_b32_e32 v2, v1 4261; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 4262; GFX1032-NEXT: s_mov_b32 exec_lo, s2 4263; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4264; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 4265; GFX1032-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 4266; GFX1032-NEXT: v_mov_b32_e32 v3, 0 4267; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 4268; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 4269; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 4270; GFX1032-NEXT: s_mov_b32 exec_lo, s2 4271; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4272; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 4273; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 4274; GFX1032-NEXT: s_mov_b32 exec_lo, s2 4275; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 4276; GFX1032-NEXT: s_mov_b32 s2, -1 4277; 
GFX1032-NEXT: ; implicit-def: $vgpr0 4278; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 4279; GFX1032-NEXT: s_cbranch_execz .LBB16_2 4280; GFX1032-NEXT: ; %bb.1: 4281; GFX1032-NEXT: v_mov_b32_e32 v0, 0 4282; GFX1032-NEXT: v_mov_b32_e32 v4, s4 4283; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4284; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 4285; GFX1032-NEXT: ds_xor_rtn_b32 v0, v0, v4 4286; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 4287; GFX1032-NEXT: buffer_gl0_inv 4288; GFX1032-NEXT: .LBB16_2: 4289; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 4290; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 4291; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 4292; GFX1032-NEXT: v_mov_b32_e32 v0, v3 4293; GFX1032-NEXT: v_xor_b32_e32 v0, s3, v0 4294; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 4295; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 4296; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 4297; GFX1032-NEXT: s_endpgm 4298; 4299; GFX1164-LABEL: xor_i32_varying: 4300; GFX1164: ; %bb.0: ; %entry 4301; GFX1164-NEXT: v_mov_b32_e32 v1, v0 4302; GFX1164-NEXT: s_not_b64 exec, exec 4303; GFX1164-NEXT: v_mov_b32_e32 v1, 0 4304; GFX1164-NEXT: s_not_b64 exec, exec 4305; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 4306; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) 4307; GFX1164-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 4308; GFX1164-NEXT: v_mov_b32_e32 v3, 0 4309; GFX1164-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 4310; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 4311; GFX1164-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 4312; GFX1164-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 4313; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 4314; GFX1164-NEXT: v_mov_b32_e32 v2, v1 4315; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1 4316; GFX1164-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 4317; GFX1164-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 4318; GFX1164-NEXT: v_readlane_b32 s4, v1, 31 4319; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 4320; GFX1164-NEXT: v_mov_b32_e32 v2, s4 4321; GFX1164-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 4322; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) 4323; GFX1164-NEXT: v_readlane_b32 s4, v1, 15 4324; GFX1164-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 4325; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 4326; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 4327; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 4328; GFX1164-NEXT: v_readlane_b32 s5, v1, 31 4329; GFX1164-NEXT: v_writelane_b32 v3, s4, 16 4330; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 4331; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 4332; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4333; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 4334; GFX1164-NEXT: v_readlane_b32 s7, v1, 63 4335; GFX1164-NEXT: v_readlane_b32 s6, v1, 47 4336; GFX1164-NEXT: v_writelane_b32 v3, s5, 32 4337; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 4338; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) 4339; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 4340; GFX1164-NEXT: s_or_saveexec_b64 s[4:5], -1 4341; GFX1164-NEXT: v_writelane_b32 v3, s6, 48 4342; GFX1164-NEXT: s_mov_b64 exec, s[4:5] 4343; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4344; GFX1164-NEXT: s_mov_b32 s2, -1 4345; GFX1164-NEXT: ; implicit-def: $vgpr0 4346; GFX1164-NEXT: s_and_saveexec_b64 s[4:5], vcc 4347; GFX1164-NEXT: s_cbranch_execz .LBB16_2 4348; GFX1164-NEXT: ; %bb.1: 4349; GFX1164-NEXT: v_mov_b32_e32 v0, 0 4350; GFX1164-NEXT: v_mov_b32_e32 v4, s7 4351; GFX1164-NEXT: s_mov_b32 s3, s7 4352; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4353; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 4354; GFX1164-NEXT: 
ds_xor_rtn_b32 v0, v0, v4 4355; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 4356; GFX1164-NEXT: buffer_gl0_inv 4357; GFX1164-NEXT: .LBB16_2: 4358; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] 4359; GFX1164-NEXT: v_readfirstlane_b32 s3, v0 4360; GFX1164-NEXT: v_mov_b32_e32 v0, v3 4361; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) 4362; GFX1164-NEXT: v_xor_b32_e32 v0, s3, v0 4363; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 4364; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 4365; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 4366; GFX1164-NEXT: s_endpgm 4367; 4368; GFX1132-LABEL: xor_i32_varying: 4369; GFX1132: ; %bb.0: ; %entry 4370; GFX1132-NEXT: v_mov_b32_e32 v1, v0 4371; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo 4372; GFX1132-NEXT: v_mov_b32_e32 v1, 0 4373; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo 4374; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 4375; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 4376; GFX1132-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 4377; GFX1132-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 4378; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 4379; GFX1132-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 4380; GFX1132-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 4381; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 4382; GFX1132-NEXT: v_mov_b32_e32 v2, v1 4383; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1 4384; GFX1132-NEXT: s_mov_b32 exec_lo, s2 4385; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 4386; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 4387; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) 4388; GFX1132-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 4389; GFX1132-NEXT: v_mov_b32_e32 v3, 0 4390; GFX1132-NEXT: v_readlane_b32 s3, v1, 15 4391; 
GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 4439; GFX7LESS-NEXT: ds_max_rtn_i32 v0, v1, v0 4440; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 4441; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 4442; GFX7LESS-NEXT: s_mov_b32 s2, -1 4443; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 4444; GFX7LESS-NEXT: s_endpgm 4445; 4446; GFX8-LABEL: max_i32_varying: 4447; GFX8: ; %bb.0: ; %entry 4448; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4449; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 4450; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 4451; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 4452; GFX8-NEXT: v_bfrev_b32_e32 v1, 1 4453; GFX8-NEXT: s_mov_b64 exec, s[2:3] 4454; GFX8-NEXT: v_mov_b32_e32 v2, v0 4455; GFX8-NEXT: s_not_b64 exec, exec 4456; GFX8-NEXT: v_bfrev_b32_e32 v2, 1 4457; GFX8-NEXT: s_not_b64 exec, exec 4458; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 4459; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 4460; GFX8-NEXT: s_nop 1 4461; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 4462; GFX8-NEXT: s_nop 1 4463; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 4464; GFX8-NEXT: s_nop 1 4465; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 4466; GFX8-NEXT: s_nop 1 4467; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 4468; GFX8-NEXT: s_nop 1 4469; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 4470; GFX8-NEXT: v_readlane_b32 s4, v2, 63 4471; GFX8-NEXT: s_nop 0 4472; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 4473; GFX8-NEXT: s_mov_b64 exec, s[2:3] 4474; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 4475; GFX8-NEXT: ; implicit-def: $vgpr0 4476; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 4477; GFX8-NEXT: s_cbranch_execz .LBB17_2 4478; GFX8-NEXT: ; %bb.1: 4479; GFX8-NEXT: v_mov_b32_e32 v0, 0 4480; GFX8-NEXT: v_mov_b32_e32 v3, s4 4481; GFX8-NEXT: s_mov_b32 m0, -1 4482; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4483; GFX8-NEXT: 
ds_max_rtn_i32 v0, v0, v3 4484; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4485; GFX8-NEXT: .LBB17_2: 4486; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 4487; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4488; GFX8-NEXT: v_readfirstlane_b32 s2, v0 4489; GFX8-NEXT: v_mov_b32_e32 v0, v1 4490; GFX8-NEXT: v_max_i32_e32 v0, s2, v0 4491; GFX8-NEXT: s_mov_b32 s3, 0xf000 4492; GFX8-NEXT: s_mov_b32 s2, -1 4493; GFX8-NEXT: s_nop 0 4494; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 4495; GFX8-NEXT: s_endpgm 4496; 4497; GFX9-LABEL: max_i32_varying: 4498; GFX9: ; %bb.0: ; %entry 4499; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4500; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 4501; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 4502; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 4503; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 4504; GFX9-NEXT: s_mov_b64 exec, s[2:3] 4505; GFX9-NEXT: v_mov_b32_e32 v2, v0 4506; GFX9-NEXT: s_not_b64 exec, exec 4507; GFX9-NEXT: v_bfrev_b32_e32 v2, 1 4508; GFX9-NEXT: s_not_b64 exec, exec 4509; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 4510; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 4511; GFX9-NEXT: s_nop 1 4512; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 4513; GFX9-NEXT: s_nop 1 4514; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 4515; GFX9-NEXT: s_nop 1 4516; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 4517; GFX9-NEXT: s_nop 1 4518; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 4519; GFX9-NEXT: s_nop 1 4520; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 4521; GFX9-NEXT: v_readlane_b32 s4, v2, 63 4522; GFX9-NEXT: s_nop 0 4523; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 4524; GFX9-NEXT: s_mov_b64 exec, s[2:3] 4525; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 4526; GFX9-NEXT: ; implicit-def: $vgpr0 4527; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 4528; GFX9-NEXT: s_cbranch_execz .LBB17_2 4529; 
GFX9-NEXT: ; %bb.1: 4530; GFX9-NEXT: v_mov_b32_e32 v0, 0 4531; GFX9-NEXT: v_mov_b32_e32 v3, s4 4532; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4533; GFX9-NEXT: ds_max_rtn_i32 v0, v0, v3 4534; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4535; GFX9-NEXT: .LBB17_2: 4536; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 4537; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4538; GFX9-NEXT: v_readfirstlane_b32 s2, v0 4539; GFX9-NEXT: v_mov_b32_e32 v0, v1 4540; GFX9-NEXT: v_max_i32_e32 v0, s2, v0 4541; GFX9-NEXT: s_mov_b32 s3, 0xf000 4542; GFX9-NEXT: s_mov_b32 s2, -1 4543; GFX9-NEXT: s_nop 0 4544; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 4545; GFX9-NEXT: s_endpgm 4546; 4547; GFX1064-LABEL: max_i32_varying: 4548; GFX1064: ; %bb.0: ; %entry 4549; GFX1064-NEXT: v_mov_b32_e32 v1, v0 4550; GFX1064-NEXT: s_not_b64 exec, exec 4551; GFX1064-NEXT: v_bfrev_b32_e32 v1, 1 4552; GFX1064-NEXT: s_not_b64 exec, exec 4553; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 4554; GFX1064-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 4555; GFX1064-NEXT: v_bfrev_b32_e32 v3, 1 4556; GFX1064-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 4557; GFX1064-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 4558; GFX1064-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 4559; GFX1064-NEXT: v_mov_b32_e32 v2, v1 4560; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 4561; GFX1064-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 4562; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 4563; GFX1064-NEXT: v_mov_b32_e32 v2, s4 4564; GFX1064-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 4565; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 4566; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 4567; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 4568; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4569; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 4570; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 4571; GFX1064-NEXT: 
v_writelane_b32 v3, s4, 16 4572; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 4573; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4574; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 4575; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 4576; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 4577; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 4578; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 4579; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 4580; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 4581; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 4582; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 4583; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4584; GFX1064-NEXT: s_mov_b32 s2, -1 4585; GFX1064-NEXT: ; implicit-def: $vgpr0 4586; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 4587; GFX1064-NEXT: s_cbranch_execz .LBB17_2 4588; GFX1064-NEXT: ; %bb.1: 4589; GFX1064-NEXT: v_mov_b32_e32 v0, 0 4590; GFX1064-NEXT: v_mov_b32_e32 v4, s7 4591; GFX1064-NEXT: s_mov_b32 s3, s7 4592; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4593; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 4594; GFX1064-NEXT: ds_max_rtn_i32 v0, v0, v4 4595; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 4596; GFX1064-NEXT: buffer_gl0_inv 4597; GFX1064-NEXT: .LBB17_2: 4598; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 4599; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 4600; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 4601; GFX1064-NEXT: v_mov_b32_e32 v0, v3 4602; GFX1064-NEXT: v_max_i32_e32 v0, s3, v0 4603; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 4604; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 4605; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 4606; GFX1064-NEXT: s_endpgm 4607; 4608; GFX1032-LABEL: max_i32_varying: 4609; GFX1032: ; %bb.0: ; %entry 4610; GFX1032-NEXT: v_mov_b32_e32 v1, v0 4611; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 4612; GFX1032-NEXT: v_bfrev_b32_e32 v1, 1 4613; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 4614; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 4615; GFX1032-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 4616; GFX1032-NEXT: v_max_i32_dpp v1, v1, v1 
row_shr:2 row_mask:0xf bank_mask:0xf 4617; GFX1032-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 4618; GFX1032-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 4619; GFX1032-NEXT: v_mov_b32_e32 v2, v1 4620; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 4621; GFX1032-NEXT: s_mov_b32 exec_lo, s2 4622; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4623; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 4624; GFX1032-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 4625; GFX1032-NEXT: v_bfrev_b32_e32 v3, 1 4626; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 4627; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 4628; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 4629; GFX1032-NEXT: s_mov_b32 exec_lo, s2 4630; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4631; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 4632; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 4633; GFX1032-NEXT: s_mov_b32 exec_lo, s2 4634; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 4635; GFX1032-NEXT: s_mov_b32 s2, -1 4636; GFX1032-NEXT: ; implicit-def: $vgpr0 4637; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 4638; GFX1032-NEXT: s_cbranch_execz .LBB17_2 4639; GFX1032-NEXT: ; %bb.1: 4640; GFX1032-NEXT: v_mov_b32_e32 v0, 0 4641; GFX1032-NEXT: v_mov_b32_e32 v4, s4 4642; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4643; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 4644; GFX1032-NEXT: ds_max_rtn_i32 v0, v0, v4 4645; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 4646; GFX1032-NEXT: buffer_gl0_inv 4647; GFX1032-NEXT: .LBB17_2: 4648; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 4649; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 4650; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 4651; GFX1032-NEXT: v_mov_b32_e32 v0, v3 4652; GFX1032-NEXT: v_max_i32_e32 v0, s3, v0 4653; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 4654; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 4655; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 4656; GFX1032-NEXT: s_endpgm 4657; 4658; GFX1164-LABEL: max_i32_varying: 
4659; GFX1164: ; %bb.0: ; %entry 4660; GFX1164-NEXT: v_mov_b32_e32 v1, v0 4661; GFX1164-NEXT: s_not_b64 exec, exec 4662; GFX1164-NEXT: v_bfrev_b32_e32 v1, 1 4663; GFX1164-NEXT: s_not_b64 exec, exec 4664; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 4665; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) 4666; GFX1164-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 4667; GFX1164-NEXT: v_bfrev_b32_e32 v3, 1 4668; GFX1164-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 4669; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 4670; GFX1164-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 4671; GFX1164-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 4672; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 4673; GFX1164-NEXT: v_mov_b32_e32 v2, v1 4674; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1 4675; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 4676; GFX1164-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 4677; GFX1164-NEXT: v_readlane_b32 s4, v1, 31 4678; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 4679; GFX1164-NEXT: v_mov_b32_e32 v2, s4 4680; GFX1164-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 4681; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) 4682; GFX1164-NEXT: v_readlane_b32 s4, v1, 15 4683; GFX1164-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 4684; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 4685; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 4686; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 4687; GFX1164-NEXT: v_readlane_b32 s5, v1, 31 4688; GFX1164-NEXT: v_writelane_b32 v3, s4, 16 4689; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 4690; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 4691; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, 
exec_lo, 0 4692; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 4693; GFX1164-NEXT: v_readlane_b32 s7, v1, 63 4694; GFX1164-NEXT: v_readlane_b32 s6, v1, 47 4695; GFX1164-NEXT: v_writelane_b32 v3, s5, 32 4696; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 4697; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) 4698; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 4699; GFX1164-NEXT: s_or_saveexec_b64 s[4:5], -1 4700; GFX1164-NEXT: v_writelane_b32 v3, s6, 48 4701; GFX1164-NEXT: s_mov_b64 exec, s[4:5] 4702; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4703; GFX1164-NEXT: s_mov_b32 s2, -1 4704; GFX1164-NEXT: ; implicit-def: $vgpr0 4705; GFX1164-NEXT: s_and_saveexec_b64 s[4:5], vcc 4706; GFX1164-NEXT: s_cbranch_execz .LBB17_2 4707; GFX1164-NEXT: ; %bb.1: 4708; GFX1164-NEXT: v_mov_b32_e32 v0, 0 4709; GFX1164-NEXT: v_mov_b32_e32 v4, s7 4710; GFX1164-NEXT: s_mov_b32 s3, s7 4711; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4712; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 4713; GFX1164-NEXT: ds_max_rtn_i32 v0, v0, v4 4714; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 4715; GFX1164-NEXT: buffer_gl0_inv 4716; GFX1164-NEXT: .LBB17_2: 4717; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] 4718; GFX1164-NEXT: v_readfirstlane_b32 s3, v0 4719; GFX1164-NEXT: v_mov_b32_e32 v0, v3 4720; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) 4721; GFX1164-NEXT: v_max_i32_e32 v0, s3, v0 4722; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 4723; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 4724; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 4725; GFX1164-NEXT: s_endpgm 4726; 4727; GFX1132-LABEL: max_i32_varying: 4728; GFX1132: ; %bb.0: ; %entry 4729; GFX1132-NEXT: v_mov_b32_e32 v1, v0 4730; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo 4731; GFX1132-NEXT: v_bfrev_b32_e32 v1, 1 4732; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo 4733; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 4734; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 4735; GFX1132-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 
row_mask:0xf bank_mask:0xf 4736; GFX1132-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 4737; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 4738; GFX1132-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 4739; GFX1132-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 4740; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 4741; GFX1132-NEXT: v_mov_b32_e32 v2, v1 4742; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1 4743; GFX1132-NEXT: s_mov_b32 exec_lo, s2 4744; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 4745; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 4746; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) 4747; GFX1132-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 4748; GFX1132-NEXT: v_bfrev_b32_e32 v3, 1 4749; GFX1132-NEXT: v_readlane_b32 s3, v1, 15 4750; GFX1132-NEXT: v_readlane_b32 s4, v1, 31 4751; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) 4752; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 4753; GFX1132-NEXT: s_mov_b32 exec_lo, s2 4754; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4755; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 4756; GFX1132-NEXT: v_writelane_b32 v3, s3, 16 4757; GFX1132-NEXT: s_mov_b32 exec_lo, s2 4758; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) 4759; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 4760; GFX1132-NEXT: s_mov_b32 s2, -1 4761; GFX1132-NEXT: ; implicit-def: $vgpr0 4762; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo 4763; GFX1132-NEXT: s_cbranch_execz .LBB17_2 4764; GFX1132-NEXT: ; %bb.1: 4765; GFX1132-NEXT: v_mov_b32_e32 v0, 0 4766; GFX1132-NEXT: v_mov_b32_e32 v4, s4 4767; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4768; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 4769; GFX1132-NEXT: ds_max_rtn_i32 v0, v0, v4 4770; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 4771; 
GFX1132-NEXT: buffer_gl0_inv 4772; GFX1132-NEXT: .LBB17_2: 4773; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 4774; GFX1132-NEXT: v_readfirstlane_b32 s3, v0 4775; GFX1132-NEXT: v_mov_b32_e32 v0, v3 4776; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) 4777; GFX1132-NEXT: v_max_i32_e32 v0, s3, v0 4778; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 4779; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 4780; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 4781; GFX1132-NEXT: s_endpgm 4782entry: 4783 %lane = call i32 @llvm.amdgcn.workitem.id.x() 4784 %old = atomicrmw max i32 addrspace(3)* @local_var32, i32 %lane acq_rel 4785 store i32 %old, i32 addrspace(1)* %out 4786 ret void 4787} 4788 4789define amdgpu_kernel void @max_i64_constant(i64 addrspace(1)* %out) { 4790; Uniform-operand case: a signed 64-bit atomicrmw max of the constant 5 on the addrspace(3) global @local_var64 (acq_rel); the value the atomic returns is stored to %out. 4791; In the checks below only the lane whose mbcnt result is 0 issues the single ds_max_rtn_i64; its result is read back with v_readfirstlane and every lane then takes a signed 64-bit max (v_cmp_gt_i64 + v_cndmask) of that value and a per-lane operand picked by the same lane test. 4792; GFX7LESS-LABEL: max_i64_constant: 4793; GFX7LESS: ; %bb.0: ; %entry 4794; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 4795; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 4796; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 4797; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4798; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 4799; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc 4800; GFX7LESS-NEXT: s_cbranch_execz .LBB18_2 4801; GFX7LESS-NEXT: ; %bb.1: 4802; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0 4803; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5 4804; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 4805; GFX7LESS-NEXT: s_mov_b32 m0, -1 4806; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 4807; GFX7LESS-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] 4808; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 4809; GFX7LESS-NEXT: .LBB18_2: 4810; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] 4811; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 4812; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 4813; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 4814; GFX7LESS-NEXT: v_bfrev_b32_e32 v1, 1 4815; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 4816; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 4817; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc 4818; GFX7LESS-NEXT: v_mov_b32_e32 v2, s5 4819; 
GFX7LESS-NEXT: v_mov_b32_e32 v3, s4 4820; GFX7LESS-NEXT: v_cmp_gt_i64_e32 vcc, s[4:5], v[0:1] 4821; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 4822; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 4823; GFX7LESS-NEXT: s_mov_b32 s2, -1 4824; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4825; GFX7LESS-NEXT: s_endpgm 4826; 4827; GFX8-LABEL: max_i64_constant: 4828; GFX8: ; %bb.0: ; %entry 4829; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4830; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4831; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 4832; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4833; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 4834; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 4835; GFX8-NEXT: s_cbranch_execz .LBB18_2 4836; GFX8-NEXT: ; %bb.1: 4837; GFX8-NEXT: v_mov_b32_e32 v0, 5 4838; GFX8-NEXT: v_mov_b32_e32 v2, 0 4839; GFX8-NEXT: v_mov_b32_e32 v1, 0 4840; GFX8-NEXT: s_mov_b32 m0, -1 4841; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4842; GFX8-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] 4843; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4844; GFX8-NEXT: .LBB18_2: 4845; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 4846; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4847; GFX8-NEXT: v_readfirstlane_b32 s2, v0 4848; GFX8-NEXT: v_bfrev_b32_e32 v0, 1 4849; GFX8-NEXT: v_readfirstlane_b32 s3, v1 4850; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc 4851; GFX8-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 4852; GFX8-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1] 4853; GFX8-NEXT: v_mov_b32_e32 v2, s3 4854; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 4855; GFX8-NEXT: v_mov_b32_e32 v2, s2 4856; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 4857; GFX8-NEXT: s_mov_b32 s3, 0xf000 4858; GFX8-NEXT: s_mov_b32 s2, -1 4859; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4860; GFX8-NEXT: s_endpgm 4861; 4862; GFX9-LABEL: max_i64_constant: 4863; GFX9: ; %bb.0: ; %entry 4864; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4865; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4866; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 
4867; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4868; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 4869; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 4870; GFX9-NEXT: s_cbranch_execz .LBB18_2 4871; GFX9-NEXT: ; %bb.1: 4872; GFX9-NEXT: v_mov_b32_e32 v0, 5 4873; GFX9-NEXT: v_mov_b32_e32 v1, 0 4874; GFX9-NEXT: v_mov_b32_e32 v2, 0 4875; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4876; GFX9-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] 4877; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4878; GFX9-NEXT: .LBB18_2: 4879; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 4880; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4881; GFX9-NEXT: v_readfirstlane_b32 s2, v0 4882; GFX9-NEXT: v_bfrev_b32_e32 v0, 1 4883; GFX9-NEXT: v_readfirstlane_b32 s3, v1 4884; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc 4885; GFX9-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 4886; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1] 4887; GFX9-NEXT: v_mov_b32_e32 v2, s3 4888; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 4889; GFX9-NEXT: v_mov_b32_e32 v2, s2 4890; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 4891; GFX9-NEXT: s_mov_b32 s3, 0xf000 4892; GFX9-NEXT: s_mov_b32 s2, -1 4893; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4894; GFX9-NEXT: s_endpgm 4895; 4896; GFX1064-LABEL: max_i64_constant: 4897; GFX1064: ; %bb.0: ; %entry 4898; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4899; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4900; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 4901; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4902; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 4903; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc 4904; GFX1064-NEXT: s_cbranch_execz .LBB18_2 4905; GFX1064-NEXT: ; %bb.1: 4906; GFX1064-NEXT: v_mov_b32_e32 v0, 5 4907; GFX1064-NEXT: v_mov_b32_e32 v1, 0 4908; GFX1064-NEXT: v_mov_b32_e32 v2, 0 4909; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4910; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 4911; GFX1064-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] 4912; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 4913; GFX1064-NEXT: buffer_gl0_inv 4914; 
GFX1064-NEXT: .LBB18_2: 4915; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 4916; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] 4917; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 4918; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 4919; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc 4920; GFX1064-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 4921; GFX1064-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1] 4922; GFX1064-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc 4923; GFX1064-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc 4924; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 4925; GFX1064-NEXT: s_mov_b32 s2, -1 4926; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 4927; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4928; GFX1064-NEXT: s_endpgm 4929; 4930; GFX1032-LABEL: max_i64_constant: 4931; GFX1032: ; %bb.0: ; %entry 4932; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4933; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4934; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 4935; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 4936; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo 4937; GFX1032-NEXT: s_cbranch_execz .LBB18_2 4938; GFX1032-NEXT: ; %bb.1: 4939; GFX1032-NEXT: v_mov_b32_e32 v0, 5 4940; GFX1032-NEXT: v_mov_b32_e32 v1, 0 4941; GFX1032-NEXT: v_mov_b32_e32 v2, 0 4942; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4943; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 4944; GFX1032-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] 4945; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 4946; GFX1032-NEXT: buffer_gl0_inv 4947; GFX1032-NEXT: .LBB18_2: 4948; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 4949; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 4950; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 4951; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 4952; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc_lo 4953; GFX1032-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc_lo 4954; GFX1032-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[2:3], v[0:1] 4955; GFX1032-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo 4956; GFX1032-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo 4957; GFX1032-NEXT: 
s_mov_b32 s3, 0x31016000 4958; GFX1032-NEXT: s_mov_b32 s2, -1 4959; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 4960; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4961; GFX1032-NEXT: s_endpgm 4962; 4963; GFX1164-LABEL: max_i64_constant: 4964; GFX1164: ; %bb.0: ; %entry 4965; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 4966; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4967; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 4968; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 4969; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4970; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 4971; GFX1164-NEXT: s_and_saveexec_b64 s[2:3], vcc 4972; GFX1164-NEXT: s_cbranch_execz .LBB18_2 4973; GFX1164-NEXT: ; %bb.1: 4974; GFX1164-NEXT: v_mov_b32_e32 v0, 5 4975; GFX1164-NEXT: v_mov_b32_e32 v1, 0 4976; GFX1164-NEXT: v_mov_b32_e32 v2, 0 4977; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4978; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 4979; GFX1164-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] 4980; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 4981; GFX1164-NEXT: buffer_gl0_inv 4982; GFX1164-NEXT: .LBB18_2: 4983; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] 4984; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 4985; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 4986; GFX1164-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc 4987; GFX1164-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 4988; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) 4989; GFX1164-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1] 4990; GFX1164-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc 4991; GFX1164-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc 4992; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 4993; GFX1164-NEXT: s_mov_b32 s2, -1 4994; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 4995; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 4996; GFX1164-NEXT: s_endpgm 4997; 4998; GFX1132-LABEL: max_i64_constant: 4999; GFX1132: ; %bb.0: ; %entry 5000; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 5001; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 5002; 
GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) 5003; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 5004; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 5005; GFX1132-NEXT: s_and_saveexec_b32 s2, vcc_lo 5006; GFX1132-NEXT: s_cbranch_execz .LBB18_2 5007; GFX1132-NEXT: ; %bb.1: 5008; GFX1132-NEXT: v_mov_b32_e32 v0, 5 5009; GFX1132-NEXT: v_mov_b32_e32 v1, 0 5010; GFX1132-NEXT: v_mov_b32_e32 v2, 0 5011; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5012; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 5013; GFX1132-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] 5014; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 5015; GFX1132-NEXT: buffer_gl0_inv 5016; GFX1132-NEXT: .LBB18_2: 5017; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2 5018; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 5019; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 5020; GFX1132-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc_lo 5021; GFX1132-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc_lo 5022; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) 5023; GFX1132-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[2:3], v[0:1] 5024; GFX1132-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo 5025; GFX1132-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo 5026; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 5027; GFX1132-NEXT: s_mov_b32 s2, -1 5028; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 5029; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 5030; GFX1132-NEXT: s_endpgm 5031entry: 5032 %old = atomicrmw max i64 addrspace(3)* @local_var64, i64 5 acq_rel 5033 store i64 %old, i64 addrspace(1)* %out 5034 ret void 5035} 5036 5037define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) { 5038; 5039; 5040; GFX7LESS-LABEL: min_i32_varying: 5041; GFX7LESS: ; %bb.0: ; %entry 5042; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 5043; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 5044; GFX7LESS-NEXT: s_mov_b32 m0, -1 5045; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 5046; GFX7LESS-NEXT: ds_min_rtn_i32 v0, v1, v0 5047; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 5048; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 5049; GFX7LESS-NEXT: s_mov_b32 s2, 
-1 5050; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 5051; GFX7LESS-NEXT: s_endpgm 5052; 5053; GFX8-LABEL: min_i32_varying: 5054; GFX8: ; %bb.0: ; %entry 5055; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 5056; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 5057; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 5058; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 5059; GFX8-NEXT: v_bfrev_b32_e32 v1, -2 5060; GFX8-NEXT: s_mov_b64 exec, s[2:3] 5061; GFX8-NEXT: v_mov_b32_e32 v2, v0 5062; GFX8-NEXT: s_not_b64 exec, exec 5063; GFX8-NEXT: v_bfrev_b32_e32 v2, -2 5064; GFX8-NEXT: s_not_b64 exec, exec 5065; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 5066; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 5067; GFX8-NEXT: s_nop 1 5068; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 5069; GFX8-NEXT: s_nop 1 5070; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 5071; GFX8-NEXT: s_nop 1 5072; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 5073; GFX8-NEXT: s_nop 1 5074; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 5075; GFX8-NEXT: s_nop 1 5076; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 5077; GFX8-NEXT: v_readlane_b32 s4, v2, 63 5078; GFX8-NEXT: s_nop 0 5079; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 5080; GFX8-NEXT: s_mov_b64 exec, s[2:3] 5081; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 5082; GFX8-NEXT: ; implicit-def: $vgpr0 5083; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 5084; GFX8-NEXT: s_cbranch_execz .LBB19_2 5085; GFX8-NEXT: ; %bb.1: 5086; GFX8-NEXT: v_mov_b32_e32 v0, 0 5087; GFX8-NEXT: v_mov_b32_e32 v3, s4 5088; GFX8-NEXT: s_mov_b32 m0, -1 5089; GFX8-NEXT: s_waitcnt lgkmcnt(0) 5090; GFX8-NEXT: ds_min_rtn_i32 v0, v0, v3 5091; GFX8-NEXT: s_waitcnt lgkmcnt(0) 5092; GFX8-NEXT: .LBB19_2: 5093; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 5094; GFX8-NEXT: s_waitcnt lgkmcnt(0) 5095; GFX8-NEXT: 
v_readfirstlane_b32 s2, v0 5096; GFX8-NEXT: v_mov_b32_e32 v0, v1 5097; GFX8-NEXT: v_min_i32_e32 v0, s2, v0 5098; GFX8-NEXT: s_mov_b32 s3, 0xf000 5099; GFX8-NEXT: s_mov_b32 s2, -1 5100; GFX8-NEXT: s_nop 0 5101; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 5102; GFX8-NEXT: s_endpgm 5103; 5104; GFX9-LABEL: min_i32_varying: 5105; GFX9: ; %bb.0: ; %entry 5106; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 5107; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 5108; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 5109; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 5110; GFX9-NEXT: v_bfrev_b32_e32 v1, -2 5111; GFX9-NEXT: s_mov_b64 exec, s[2:3] 5112; GFX9-NEXT: v_mov_b32_e32 v2, v0 5113; GFX9-NEXT: s_not_b64 exec, exec 5114; GFX9-NEXT: v_bfrev_b32_e32 v2, -2 5115; GFX9-NEXT: s_not_b64 exec, exec 5116; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 5117; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 5118; GFX9-NEXT: s_nop 1 5119; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 5120; GFX9-NEXT: s_nop 1 5121; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 5122; GFX9-NEXT: s_nop 1 5123; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 5124; GFX9-NEXT: s_nop 1 5125; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 5126; GFX9-NEXT: s_nop 1 5127; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 5128; GFX9-NEXT: v_readlane_b32 s4, v2, 63 5129; GFX9-NEXT: s_nop 0 5130; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 5131; GFX9-NEXT: s_mov_b64 exec, s[2:3] 5132; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 5133; GFX9-NEXT: ; implicit-def: $vgpr0 5134; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 5135; GFX9-NEXT: s_cbranch_execz .LBB19_2 5136; GFX9-NEXT: ; %bb.1: 5137; GFX9-NEXT: v_mov_b32_e32 v0, 0 5138; GFX9-NEXT: v_mov_b32_e32 v3, s4 5139; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5140; GFX9-NEXT: ds_min_rtn_i32 v0, v0, v3 5141; 
GFX9-NEXT: s_waitcnt lgkmcnt(0) 5142; GFX9-NEXT: .LBB19_2: 5143; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 5144; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5145; GFX9-NEXT: v_readfirstlane_b32 s2, v0 5146; GFX9-NEXT: v_mov_b32_e32 v0, v1 5147; GFX9-NEXT: v_min_i32_e32 v0, s2, v0 5148; GFX9-NEXT: s_mov_b32 s3, 0xf000 5149; GFX9-NEXT: s_mov_b32 s2, -1 5150; GFX9-NEXT: s_nop 0 5151; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 5152; GFX9-NEXT: s_endpgm 5153; 5154; GFX1064-LABEL: min_i32_varying: 5155; GFX1064: ; %bb.0: ; %entry 5156; GFX1064-NEXT: v_mov_b32_e32 v1, v0 5157; GFX1064-NEXT: s_not_b64 exec, exec 5158; GFX1064-NEXT: v_bfrev_b32_e32 v1, -2 5159; GFX1064-NEXT: s_not_b64 exec, exec 5160; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 5161; GFX1064-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 5162; GFX1064-NEXT: v_bfrev_b32_e32 v3, -2 5163; GFX1064-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 5164; GFX1064-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 5165; GFX1064-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 5166; GFX1064-NEXT: v_mov_b32_e32 v2, v1 5167; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 5168; GFX1064-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 5169; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 5170; GFX1064-NEXT: v_mov_b32_e32 v2, s4 5171; GFX1064-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 5172; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 5173; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 5174; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 5175; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 5176; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 5177; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 5178; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 5179; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 5180; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 5181; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 5182; 
GFX1064-NEXT: v_readlane_b32 s7, v1, 63 5183; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 5184; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 5185; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 5186; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 5187; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 5188; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 5189; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 5190; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 5191; GFX1064-NEXT: s_mov_b32 s2, -1 5192; GFX1064-NEXT: ; implicit-def: $vgpr0 5193; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 5194; GFX1064-NEXT: s_cbranch_execz .LBB19_2 5195; GFX1064-NEXT: ; %bb.1: 5196; GFX1064-NEXT: v_mov_b32_e32 v0, 0 5197; GFX1064-NEXT: v_mov_b32_e32 v4, s7 5198; GFX1064-NEXT: s_mov_b32 s3, s7 5199; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5200; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 5201; GFX1064-NEXT: ds_min_rtn_i32 v0, v0, v4 5202; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 5203; GFX1064-NEXT: buffer_gl0_inv 5204; GFX1064-NEXT: .LBB19_2: 5205; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 5206; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 5207; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 5208; GFX1064-NEXT: v_mov_b32_e32 v0, v3 5209; GFX1064-NEXT: v_min_i32_e32 v0, s3, v0 5210; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 5211; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 5212; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 5213; GFX1064-NEXT: s_endpgm 5214; 5215; GFX1032-LABEL: min_i32_varying: 5216; GFX1032: ; %bb.0: ; %entry 5217; GFX1032-NEXT: v_mov_b32_e32 v1, v0 5218; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 5219; GFX1032-NEXT: v_bfrev_b32_e32 v1, -2 5220; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 5221; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 5222; GFX1032-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 5223; GFX1032-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 5224; GFX1032-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 5225; GFX1032-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:8 
row_mask:0xf bank_mask:0xf 5226; GFX1032-NEXT: v_mov_b32_e32 v2, v1 5227; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 5228; GFX1032-NEXT: s_mov_b32 exec_lo, s2 5229; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 5230; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 5231; GFX1032-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 5232; GFX1032-NEXT: v_bfrev_b32_e32 v3, -2 5233; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 5234; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 5235; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 5236; GFX1032-NEXT: s_mov_b32 exec_lo, s2 5237; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 5238; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 5239; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 5240; GFX1032-NEXT: s_mov_b32 exec_lo, s2 5241; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 5242; GFX1032-NEXT: s_mov_b32 s2, -1 5243; GFX1032-NEXT: ; implicit-def: $vgpr0 5244; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 5245; GFX1032-NEXT: s_cbranch_execz .LBB19_2 5246; GFX1032-NEXT: ; %bb.1: 5247; GFX1032-NEXT: v_mov_b32_e32 v0, 0 5248; GFX1032-NEXT: v_mov_b32_e32 v4, s4 5249; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5250; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 5251; GFX1032-NEXT: ds_min_rtn_i32 v0, v0, v4 5252; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 5253; GFX1032-NEXT: buffer_gl0_inv 5254; GFX1032-NEXT: .LBB19_2: 5255; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 5256; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 5257; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 5258; GFX1032-NEXT: v_mov_b32_e32 v0, v3 5259; GFX1032-NEXT: v_min_i32_e32 v0, s3, v0 5260; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 5261; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 5262; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 5263; GFX1032-NEXT: s_endpgm 5264; 5265; GFX1164-LABEL: min_i32_varying: 5266; GFX1164: ; %bb.0: ; %entry 5267; GFX1164-NEXT: v_mov_b32_e32 v1, v0 5268; GFX1164-NEXT: s_not_b64 exec, exec 5269; GFX1164-NEXT: v_bfrev_b32_e32 v1, -2 5270; 
GFX1164-NEXT: s_not_b64 exec, exec 5271; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 5272; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) 5273; GFX1164-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 5274; GFX1164-NEXT: v_bfrev_b32_e32 v3, -2 5275; GFX1164-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 5276; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 5277; GFX1164-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 5278; GFX1164-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 5279; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 5280; GFX1164-NEXT: v_mov_b32_e32 v2, v1 5281; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1 5282; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 5283; GFX1164-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 5284; GFX1164-NEXT: v_readlane_b32 s4, v1, 31 5285; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 5286; GFX1164-NEXT: v_mov_b32_e32 v2, s4 5287; GFX1164-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 5288; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) 5289; GFX1164-NEXT: v_readlane_b32 s4, v1, 15 5290; GFX1164-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 5291; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 5292; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 5293; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 5294; GFX1164-NEXT: v_readlane_b32 s5, v1, 31 5295; GFX1164-NEXT: v_writelane_b32 v3, s4, 16 5296; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 5297; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 5298; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 5299; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 5300; GFX1164-NEXT: v_readlane_b32 s7, v1, 63 5301; GFX1164-NEXT: v_readlane_b32 s6, v1, 47 5302; 
GFX1164-NEXT: v_writelane_b32 v3, s5, 32 5303; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 5304; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) 5305; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 5306; GFX1164-NEXT: s_or_saveexec_b64 s[4:5], -1 5307; GFX1164-NEXT: v_writelane_b32 v3, s6, 48 5308; GFX1164-NEXT: s_mov_b64 exec, s[4:5] 5309; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 5310; GFX1164-NEXT: s_mov_b32 s2, -1 5311; GFX1164-NEXT: ; implicit-def: $vgpr0 5312; GFX1164-NEXT: s_and_saveexec_b64 s[4:5], vcc 5313; GFX1164-NEXT: s_cbranch_execz .LBB19_2 5314; GFX1164-NEXT: ; %bb.1: 5315; GFX1164-NEXT: v_mov_b32_e32 v0, 0 5316; GFX1164-NEXT: v_mov_b32_e32 v4, s7 5317; GFX1164-NEXT: s_mov_b32 s3, s7 5318; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5319; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 5320; GFX1164-NEXT: ds_min_rtn_i32 v0, v0, v4 5321; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 5322; GFX1164-NEXT: buffer_gl0_inv 5323; GFX1164-NEXT: .LBB19_2: 5324; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] 5325; GFX1164-NEXT: v_readfirstlane_b32 s3, v0 5326; GFX1164-NEXT: v_mov_b32_e32 v0, v3 5327; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) 5328; GFX1164-NEXT: v_min_i32_e32 v0, s3, v0 5329; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 5330; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 5331; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 5332; GFX1164-NEXT: s_endpgm 5333; 5334; GFX1132-LABEL: min_i32_varying: 5335; GFX1132: ; %bb.0: ; %entry 5336; GFX1132-NEXT: v_mov_b32_e32 v1, v0 5337; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo 5338; GFX1132-NEXT: v_bfrev_b32_e32 v1, -2 5339; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo 5340; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 5341; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 5342; GFX1132-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 5343; GFX1132-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 5344; GFX1132-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 5345; GFX1132-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 5346; GFX1132-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 5347; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 5348; GFX1132-NEXT: v_mov_b32_e32 v2, v1 5349; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1 5350; GFX1132-NEXT: s_mov_b32 exec_lo, s2 5351; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 5352; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 5353; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) 5354; GFX1132-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 5355; GFX1132-NEXT: v_bfrev_b32_e32 v3, -2 5356; GFX1132-NEXT: v_readlane_b32 s3, v1, 15 5357; GFX1132-NEXT: v_readlane_b32 s4, v1, 31 5358; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) 5359; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 5360; GFX1132-NEXT: s_mov_b32 exec_lo, s2 5361; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 5362; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 5363; GFX1132-NEXT: v_writelane_b32 v3, s3, 16 5364; GFX1132-NEXT: s_mov_b32 exec_lo, s2 5365; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) 5366; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 5367; GFX1132-NEXT: s_mov_b32 s2, -1 5368; GFX1132-NEXT: ; implicit-def: $vgpr0 5369; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo 5370; GFX1132-NEXT: s_cbranch_execz .LBB19_2 5371; GFX1132-NEXT: ; %bb.1: 5372; GFX1132-NEXT: v_mov_b32_e32 v0, 0 5373; GFX1132-NEXT: v_mov_b32_e32 v4, s4 5374; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5375; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 5376; GFX1132-NEXT: ds_min_rtn_i32 v0, v0, v4 5377; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 5378; GFX1132-NEXT: buffer_gl0_inv 5379; GFX1132-NEXT: .LBB19_2: 5380; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 5381; GFX1132-NEXT: v_readfirstlane_b32 s3, 
v0 5382; GFX1132-NEXT: v_mov_b32_e32 v0, v3 5383; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) 5384; GFX1132-NEXT: v_min_i32_e32 v0, s3, v0 5385; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 5386; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 5387; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 5388; GFX1132-NEXT: s_endpgm 5389entry: 5390 %lane = call i32 @llvm.amdgcn.workitem.id.x() 5391 %old = atomicrmw min i32 addrspace(3)* @local_var32, i32 %lane acq_rel 5392 store i32 %old, i32 addrspace(1)* %out 5393 ret void 5394} 5395 5396define amdgpu_kernel void @min_i64_constant(i64 addrspace(1)* %out) { 5397; 5398; 5399; GFX7LESS-LABEL: min_i64_constant: 5400; GFX7LESS: ; %bb.0: ; %entry 5401; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 5402; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 5403; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 5404; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 5405; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 5406; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc 5407; GFX7LESS-NEXT: s_cbranch_execz .LBB20_2 5408; GFX7LESS-NEXT: ; %bb.1: 5409; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0 5410; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5 5411; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 5412; GFX7LESS-NEXT: s_mov_b32 m0, -1 5413; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 5414; GFX7LESS-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] 5415; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 5416; GFX7LESS-NEXT: .LBB20_2: 5417; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] 5418; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 5419; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 5420; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 5421; GFX7LESS-NEXT: v_bfrev_b32_e32 v1, -2 5422; GFX7LESS-NEXT: s_mov_b32 s2, -1 5423; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 5424; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc 5425; GFX7LESS-NEXT: v_mov_b32_e32 v2, s5 5426; GFX7LESS-NEXT: v_mov_b32_e32 v3, s4 5427; GFX7LESS-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] 5428; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 5429; 
GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 5430; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 5431; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 5432; GFX7LESS-NEXT: s_endpgm 5433; 5434; GFX8-LABEL: min_i64_constant: 5435; GFX8: ; %bb.0: ; %entry 5436; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 5437; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 5438; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 5439; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 5440; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 5441; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 5442; GFX8-NEXT: s_cbranch_execz .LBB20_2 5443; GFX8-NEXT: ; %bb.1: 5444; GFX8-NEXT: v_mov_b32_e32 v0, 5 5445; GFX8-NEXT: v_mov_b32_e32 v2, 0 5446; GFX8-NEXT: v_mov_b32_e32 v1, 0 5447; GFX8-NEXT: s_mov_b32 m0, -1 5448; GFX8-NEXT: s_waitcnt lgkmcnt(0) 5449; GFX8-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] 5450; GFX8-NEXT: s_waitcnt lgkmcnt(0) 5451; GFX8-NEXT: .LBB20_2: 5452; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 5453; GFX8-NEXT: s_waitcnt lgkmcnt(0) 5454; GFX8-NEXT: v_readfirstlane_b32 s4, v0 5455; GFX8-NEXT: v_bfrev_b32_e32 v0, -2 5456; GFX8-NEXT: v_readfirstlane_b32 s5, v1 5457; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc 5458; GFX8-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 5459; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] 5460; GFX8-NEXT: v_mov_b32_e32 v2, s5 5461; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 5462; GFX8-NEXT: v_mov_b32_e32 v2, s4 5463; GFX8-NEXT: s_mov_b32 s2, -1 5464; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 5465; GFX8-NEXT: s_mov_b32 s3, 0xf000 5466; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 5467; GFX8-NEXT: s_endpgm 5468; 5469; GFX9-LABEL: min_i64_constant: 5470; GFX9: ; %bb.0: ; %entry 5471; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 5472; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 5473; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 5474; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 5475; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 5476; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 5477; 
GFX9-NEXT: s_cbranch_execz .LBB20_2 5478; GFX9-NEXT: ; %bb.1: 5479; GFX9-NEXT: v_mov_b32_e32 v0, 5 5480; GFX9-NEXT: v_mov_b32_e32 v1, 0 5481; GFX9-NEXT: v_mov_b32_e32 v2, 0 5482; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5483; GFX9-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] 5484; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5485; GFX9-NEXT: .LBB20_2: 5486; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 5487; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5488; GFX9-NEXT: v_readfirstlane_b32 s4, v0 5489; GFX9-NEXT: v_bfrev_b32_e32 v0, -2 5490; GFX9-NEXT: v_readfirstlane_b32 s5, v1 5491; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc 5492; GFX9-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 5493; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] 5494; GFX9-NEXT: v_mov_b32_e32 v2, s5 5495; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 5496; GFX9-NEXT: v_mov_b32_e32 v2, s4 5497; GFX9-NEXT: s_mov_b32 s2, -1 5498; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 5499; GFX9-NEXT: s_mov_b32 s3, 0xf000 5500; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 5501; GFX9-NEXT: s_endpgm 5502; 5503; GFX1064-LABEL: min_i64_constant: 5504; GFX1064: ; %bb.0: ; %entry 5505; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 5506; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 5507; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 5508; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 5509; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 5510; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc 5511; GFX1064-NEXT: s_cbranch_execz .LBB20_2 5512; GFX1064-NEXT: ; %bb.1: 5513; GFX1064-NEXT: v_mov_b32_e32 v0, 5 5514; GFX1064-NEXT: v_mov_b32_e32 v1, 0 5515; GFX1064-NEXT: v_mov_b32_e32 v2, 0 5516; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5517; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 5518; GFX1064-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] 5519; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 5520; GFX1064-NEXT: buffer_gl0_inv 5521; GFX1064-NEXT: .LBB20_2: 5522; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 5523; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] 5524; GFX1064-NEXT: 
v_readfirstlane_b32 s2, v0 5525; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 5526; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc 5527; GFX1064-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 5528; GFX1064-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1] 5529; GFX1064-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc 5530; GFX1064-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc 5531; GFX1064-NEXT: s_mov_b32 s2, -1 5532; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 5533; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 5534; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 5535; GFX1064-NEXT: s_endpgm 5536; 5537; GFX1032-LABEL: min_i64_constant: 5538; GFX1032: ; %bb.0: ; %entry 5539; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 5540; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 5541; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 5542; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 5543; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo 5544; GFX1032-NEXT: s_cbranch_execz .LBB20_2 5545; GFX1032-NEXT: ; %bb.1: 5546; GFX1032-NEXT: v_mov_b32_e32 v0, 5 5547; GFX1032-NEXT: v_mov_b32_e32 v1, 0 5548; GFX1032-NEXT: v_mov_b32_e32 v2, 0 5549; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5550; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 5551; GFX1032-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] 5552; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 5553; GFX1032-NEXT: buffer_gl0_inv 5554; GFX1032-NEXT: .LBB20_2: 5555; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 5556; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 5557; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 5558; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 5559; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc_lo 5560; GFX1032-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc_lo 5561; GFX1032-NEXT: v_cmp_lt_i64_e32 vcc_lo, s[2:3], v[0:1] 5562; GFX1032-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo 5563; GFX1032-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo 5564; GFX1032-NEXT: s_mov_b32 s2, -1 5565; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 5566; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 5567; GFX1032-NEXT: 
buffer_store_dwordx2 v[0:1], off, s[0:3], 0 5568; GFX1032-NEXT: s_endpgm 5569; 5570; GFX1164-LABEL: min_i64_constant: 5571; GFX1164: ; %bb.0: ; %entry 5572; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 5573; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 5574; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 5575; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 5576; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 5577; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 5578; GFX1164-NEXT: s_and_saveexec_b64 s[2:3], vcc 5579; GFX1164-NEXT: s_cbranch_execz .LBB20_2 5580; GFX1164-NEXT: ; %bb.1: 5581; GFX1164-NEXT: v_mov_b32_e32 v0, 5 5582; GFX1164-NEXT: v_mov_b32_e32 v1, 0 5583; GFX1164-NEXT: v_mov_b32_e32 v2, 0 5584; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5585; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 5586; GFX1164-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] 5587; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 5588; GFX1164-NEXT: buffer_gl0_inv 5589; GFX1164-NEXT: .LBB20_2: 5590; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] 5591; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 5592; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 5593; GFX1164-NEXT: v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc 5594; GFX1164-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 5595; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) 5596; GFX1164-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1] 5597; GFX1164-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc 5598; GFX1164-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc 5599; GFX1164-NEXT: s_mov_b32 s2, -1 5600; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 5601; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 5602; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 5603; GFX1164-NEXT: s_endpgm 5604; 5605; GFX1132-LABEL: min_i64_constant: 5606; GFX1132: ; %bb.0: ; %entry 5607; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 5608; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 5609; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) 5610; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 5611; GFX1132-NEXT: ; 
implicit-def: $vgpr0_vgpr1 5612; GFX1132-NEXT: s_and_saveexec_b32 s2, vcc_lo 5613; GFX1132-NEXT: s_cbranch_execz .LBB20_2 5614; GFX1132-NEXT: ; %bb.1: 5615; GFX1132-NEXT: v_mov_b32_e32 v0, 5 5616; GFX1132-NEXT: v_mov_b32_e32 v1, 0 5617; GFX1132-NEXT: v_mov_b32_e32 v2, 0 5618; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5619; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 5620; GFX1132-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] 5621; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 5622; GFX1132-NEXT: buffer_gl0_inv 5623; GFX1132-NEXT: .LBB20_2: 5624; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2 5625; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 5626; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 5627; GFX1132-NEXT: v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc_lo 5628; GFX1132-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc_lo 5629; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) 5630; GFX1132-NEXT: v_cmp_lt_i64_e32 vcc_lo, s[2:3], v[0:1] 5631; GFX1132-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo 5632; GFX1132-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo 5633; GFX1132-NEXT: s_mov_b32 s2, -1 5634; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 5635; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 5636; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 5637; GFX1132-NEXT: s_endpgm 5638entry: 5639 %old = atomicrmw min i64 addrspace(3)* @local_var64, i64 5 acq_rel 5640 store i64 %old, i64 addrspace(1)* %out 5641 ret void 5642} 5643 5644define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) { 5645; 5646; 5647; GFX7LESS-LABEL: umax_i32_varying: 5648; GFX7LESS: ; %bb.0: ; %entry 5649; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 5650; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 5651; GFX7LESS-NEXT: s_mov_b32 m0, -1 5652; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 5653; GFX7LESS-NEXT: ds_max_rtn_u32 v0, v1, v0 5654; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 5655; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 5656; GFX7LESS-NEXT: s_mov_b32 s2, -1 5657; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 5658; GFX7LESS-NEXT: s_endpgm 5659; 5660; GFX8-LABEL: 
umax_i32_varying: 5661; GFX8: ; %bb.0: ; %entry 5662; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 5663; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 5664; GFX8-NEXT: v_mov_b32_e32 v1, 0 5665; GFX8-NEXT: s_mov_b64 exec, s[2:3] 5666; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 5667; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 5668; GFX8-NEXT: v_mov_b32_e32 v2, v0 5669; GFX8-NEXT: s_not_b64 exec, exec 5670; GFX8-NEXT: v_mov_b32_e32 v2, 0 5671; GFX8-NEXT: s_not_b64 exec, exec 5672; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 5673; GFX8-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 5674; GFX8-NEXT: s_nop 1 5675; GFX8-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 5676; GFX8-NEXT: s_nop 1 5677; GFX8-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 5678; GFX8-NEXT: s_nop 1 5679; GFX8-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 5680; GFX8-NEXT: s_nop 1 5681; GFX8-NEXT: v_max_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 5682; GFX8-NEXT: s_nop 1 5683; GFX8-NEXT: v_max_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 5684; GFX8-NEXT: v_readlane_b32 s4, v2, 63 5685; GFX8-NEXT: s_nop 0 5686; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 5687; GFX8-NEXT: s_mov_b64 exec, s[2:3] 5688; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 5689; GFX8-NEXT: ; implicit-def: $vgpr0 5690; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 5691; GFX8-NEXT: s_cbranch_execz .LBB21_2 5692; GFX8-NEXT: ; %bb.1: 5693; GFX8-NEXT: v_mov_b32_e32 v0, 0 5694; GFX8-NEXT: v_mov_b32_e32 v3, s4 5695; GFX8-NEXT: s_mov_b32 m0, -1 5696; GFX8-NEXT: s_waitcnt lgkmcnt(0) 5697; GFX8-NEXT: ds_max_rtn_u32 v0, v0, v3 5698; GFX8-NEXT: s_waitcnt lgkmcnt(0) 5699; GFX8-NEXT: .LBB21_2: 5700; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 5701; GFX8-NEXT: s_waitcnt lgkmcnt(0) 5702; GFX8-NEXT: v_readfirstlane_b32 s2, v0 5703; GFX8-NEXT: v_mov_b32_e32 v0, v1 5704; 
GFX8-NEXT: v_max_u32_e32 v0, s2, v0 5705; GFX8-NEXT: s_mov_b32 s3, 0xf000 5706; GFX8-NEXT: s_mov_b32 s2, -1 5707; GFX8-NEXT: s_nop 0 5708; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 5709; GFX8-NEXT: s_endpgm 5710; 5711; GFX9-LABEL: umax_i32_varying: 5712; GFX9: ; %bb.0: ; %entry 5713; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 5714; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 5715; GFX9-NEXT: v_mov_b32_e32 v1, 0 5716; GFX9-NEXT: s_mov_b64 exec, s[2:3] 5717; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 5718; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 5719; GFX9-NEXT: v_mov_b32_e32 v2, v0 5720; GFX9-NEXT: s_not_b64 exec, exec 5721; GFX9-NEXT: v_mov_b32_e32 v2, 0 5722; GFX9-NEXT: s_not_b64 exec, exec 5723; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 5724; GFX9-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 5725; GFX9-NEXT: s_nop 1 5726; GFX9-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 5727; GFX9-NEXT: s_nop 1 5728; GFX9-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 5729; GFX9-NEXT: s_nop 1 5730; GFX9-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 5731; GFX9-NEXT: s_nop 1 5732; GFX9-NEXT: v_max_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 5733; GFX9-NEXT: s_nop 1 5734; GFX9-NEXT: v_max_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 5735; GFX9-NEXT: v_readlane_b32 s4, v2, 63 5736; GFX9-NEXT: s_nop 0 5737; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 5738; GFX9-NEXT: s_mov_b64 exec, s[2:3] 5739; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 5740; GFX9-NEXT: ; implicit-def: $vgpr0 5741; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 5742; GFX9-NEXT: s_cbranch_execz .LBB21_2 5743; GFX9-NEXT: ; %bb.1: 5744; GFX9-NEXT: v_mov_b32_e32 v0, 0 5745; GFX9-NEXT: v_mov_b32_e32 v3, s4 5746; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5747; GFX9-NEXT: ds_max_rtn_u32 v0, v0, v3 5748; GFX9-NEXT: s_waitcnt lgkmcnt(0) 
5749; GFX9-NEXT: .LBB21_2: 5750; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 5751; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5752; GFX9-NEXT: v_readfirstlane_b32 s2, v0 5753; GFX9-NEXT: v_mov_b32_e32 v0, v1 5754; GFX9-NEXT: v_max_u32_e32 v0, s2, v0 5755; GFX9-NEXT: s_mov_b32 s3, 0xf000 5756; GFX9-NEXT: s_mov_b32 s2, -1 5757; GFX9-NEXT: s_nop 0 5758; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 5759; GFX9-NEXT: s_endpgm 5760; 5761; GFX1064-LABEL: umax_i32_varying: 5762; GFX1064: ; %bb.0: ; %entry 5763; GFX1064-NEXT: v_mov_b32_e32 v1, v0 5764; GFX1064-NEXT: s_not_b64 exec, exec 5765; GFX1064-NEXT: v_mov_b32_e32 v1, 0 5766; GFX1064-NEXT: s_not_b64 exec, exec 5767; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 5768; GFX1064-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 5769; GFX1064-NEXT: v_mov_b32_e32 v3, 0 5770; GFX1064-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 5771; GFX1064-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 5772; GFX1064-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 5773; GFX1064-NEXT: v_mov_b32_e32 v2, v1 5774; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 5775; GFX1064-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 5776; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 5777; GFX1064-NEXT: v_mov_b32_e32 v2, s4 5778; GFX1064-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 5779; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 5780; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 5781; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 5782; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 5783; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 5784; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 5785; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 5786; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 5787; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 5788; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 
5789; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 5790; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 5791; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 5792; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 5793; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 5794; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 5795; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 5796; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 5797; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 5798; GFX1064-NEXT: s_mov_b32 s2, -1 5799; GFX1064-NEXT: ; implicit-def: $vgpr0 5800; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 5801; GFX1064-NEXT: s_cbranch_execz .LBB21_2 5802; GFX1064-NEXT: ; %bb.1: 5803; GFX1064-NEXT: v_mov_b32_e32 v0, 0 5804; GFX1064-NEXT: v_mov_b32_e32 v4, s7 5805; GFX1064-NEXT: s_mov_b32 s3, s7 5806; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5807; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 5808; GFX1064-NEXT: ds_max_rtn_u32 v0, v0, v4 5809; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 5810; GFX1064-NEXT: buffer_gl0_inv 5811; GFX1064-NEXT: .LBB21_2: 5812; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 5813; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 5814; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 5815; GFX1064-NEXT: v_mov_b32_e32 v0, v3 5816; GFX1064-NEXT: v_max_u32_e32 v0, s3, v0 5817; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 5818; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 5819; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 5820; GFX1064-NEXT: s_endpgm 5821; 5822; GFX1032-LABEL: umax_i32_varying: 5823; GFX1032: ; %bb.0: ; %entry 5824; GFX1032-NEXT: v_mov_b32_e32 v1, v0 5825; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 5826; GFX1032-NEXT: v_mov_b32_e32 v1, 0 5827; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 5828; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 5829; GFX1032-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 5830; GFX1032-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 5831; GFX1032-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 5832; GFX1032-NEXT: 
v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 5833; GFX1032-NEXT: v_mov_b32_e32 v2, v1 5834; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 5835; GFX1032-NEXT: s_mov_b32 exec_lo, s2 5836; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 5837; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 5838; GFX1032-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 5839; GFX1032-NEXT: v_mov_b32_e32 v3, 0 5840; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 5841; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 5842; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 5843; GFX1032-NEXT: s_mov_b32 exec_lo, s2 5844; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 5845; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 5846; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 5847; GFX1032-NEXT: s_mov_b32 exec_lo, s2 5848; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 5849; GFX1032-NEXT: s_mov_b32 s2, -1 5850; GFX1032-NEXT: ; implicit-def: $vgpr0 5851; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 5852; GFX1032-NEXT: s_cbranch_execz .LBB21_2 5853; GFX1032-NEXT: ; %bb.1: 5854; GFX1032-NEXT: v_mov_b32_e32 v0, 0 5855; GFX1032-NEXT: v_mov_b32_e32 v4, s4 5856; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5857; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 5858; GFX1032-NEXT: ds_max_rtn_u32 v0, v0, v4 5859; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 5860; GFX1032-NEXT: buffer_gl0_inv 5861; GFX1032-NEXT: .LBB21_2: 5862; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 5863; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 5864; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 5865; GFX1032-NEXT: v_mov_b32_e32 v0, v3 5866; GFX1032-NEXT: v_max_u32_e32 v0, s3, v0 5867; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 5868; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 5869; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 5870; GFX1032-NEXT: s_endpgm 5871; 5872; GFX1164-LABEL: umax_i32_varying: 5873; GFX1164: ; %bb.0: ; %entry 5874; GFX1164-NEXT: v_mov_b32_e32 v1, v0 5875; GFX1164-NEXT: s_not_b64 exec, exec 5876; 
GFX1164-NEXT: v_mov_b32_e32 v1, 0 5877; GFX1164-NEXT: s_not_b64 exec, exec 5878; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 5879; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) 5880; GFX1164-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 5881; GFX1164-NEXT: v_mov_b32_e32 v3, 0 5882; GFX1164-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 5883; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 5884; GFX1164-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 5885; GFX1164-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 5886; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 5887; GFX1164-NEXT: v_mov_b32_e32 v2, v1 5888; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1 5889; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 5890; GFX1164-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 5891; GFX1164-NEXT: v_readlane_b32 s4, v1, 31 5892; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 5893; GFX1164-NEXT: v_mov_b32_e32 v2, s4 5894; GFX1164-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 5895; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) 5896; GFX1164-NEXT: v_readlane_b32 s4, v1, 15 5897; GFX1164-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 5898; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 5899; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 5900; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 5901; GFX1164-NEXT: v_readlane_b32 s5, v1, 31 5902; GFX1164-NEXT: v_writelane_b32 v3, s4, 16 5903; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 5904; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 5905; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 5906; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 5907; GFX1164-NEXT: 
v_readlane_b32 s7, v1, 63 5908; GFX1164-NEXT: v_readlane_b32 s6, v1, 47 5909; GFX1164-NEXT: v_writelane_b32 v3, s5, 32 5910; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 5911; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) 5912; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 5913; GFX1164-NEXT: s_or_saveexec_b64 s[4:5], -1 5914; GFX1164-NEXT: v_writelane_b32 v3, s6, 48 5915; GFX1164-NEXT: s_mov_b64 exec, s[4:5] 5916; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 5917; GFX1164-NEXT: s_mov_b32 s2, -1 5918; GFX1164-NEXT: ; implicit-def: $vgpr0 5919; GFX1164-NEXT: s_and_saveexec_b64 s[4:5], vcc 5920; GFX1164-NEXT: s_cbranch_execz .LBB21_2 5921; GFX1164-NEXT: ; %bb.1: 5922; GFX1164-NEXT: v_mov_b32_e32 v0, 0 5923; GFX1164-NEXT: v_mov_b32_e32 v4, s7 5924; GFX1164-NEXT: s_mov_b32 s3, s7 5925; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5926; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 5927; GFX1164-NEXT: ds_max_rtn_u32 v0, v0, v4 5928; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 5929; GFX1164-NEXT: buffer_gl0_inv 5930; GFX1164-NEXT: .LBB21_2: 5931; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] 5932; GFX1164-NEXT: v_readfirstlane_b32 s3, v0 5933; GFX1164-NEXT: v_mov_b32_e32 v0, v3 5934; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) 5935; GFX1164-NEXT: v_max_u32_e32 v0, s3, v0 5936; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 5937; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 5938; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 5939; GFX1164-NEXT: s_endpgm 5940; 5941; GFX1132-LABEL: umax_i32_varying: 5942; GFX1132: ; %bb.0: ; %entry 5943; GFX1132-NEXT: v_mov_b32_e32 v1, v0 5944; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo 5945; GFX1132-NEXT: v_mov_b32_e32 v1, 0 5946; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo 5947; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 5948; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 5949; GFX1132-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 5950; GFX1132-NEXT: v_max_u32_dpp v1, v1, 
v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 5951; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 5952; GFX1132-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 5953; GFX1132-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 5954; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 5955; GFX1132-NEXT: v_mov_b32_e32 v2, v1 5956; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1 5957; GFX1132-NEXT: s_mov_b32 exec_lo, s2 5958; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 5959; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 5960; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) 5961; GFX1132-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 5962; GFX1132-NEXT: v_mov_b32_e32 v3, 0 5963; GFX1132-NEXT: v_readlane_b32 s3, v1, 15 5964; GFX1132-NEXT: v_readlane_b32 s4, v1, 31 5965; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) 5966; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 5967; GFX1132-NEXT: s_mov_b32 exec_lo, s2 5968; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 5969; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 5970; GFX1132-NEXT: v_writelane_b32 v3, s3, 16 5971; GFX1132-NEXT: s_mov_b32 exec_lo, s2 5972; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) 5973; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 5974; GFX1132-NEXT: s_mov_b32 s2, -1 5975; GFX1132-NEXT: ; implicit-def: $vgpr0 5976; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo 5977; GFX1132-NEXT: s_cbranch_execz .LBB21_2 5978; GFX1132-NEXT: ; %bb.1: 5979; GFX1132-NEXT: v_mov_b32_e32 v0, 0 5980; GFX1132-NEXT: v_mov_b32_e32 v4, s4 5981; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5982; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 5983; GFX1132-NEXT: ds_max_rtn_u32 v0, v0, v4 5984; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 5985; GFX1132-NEXT: buffer_gl0_inv 5986; 
GFX1132-NEXT: .LBB21_2: 5987; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 5988; GFX1132-NEXT: v_readfirstlane_b32 s3, v0 5989; GFX1132-NEXT: v_mov_b32_e32 v0, v3 5990; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) 5991; GFX1132-NEXT: v_max_u32_e32 v0, s3, v0 5992; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 5993; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 5994; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 5995; GFX1132-NEXT: s_endpgm 5996entry: 5997 %lane = call i32 @llvm.amdgcn.workitem.id.x() 5998 %old = atomicrmw umax i32 addrspace(3)* @local_var32, i32 %lane acq_rel 5999 store i32 %old, i32 addrspace(1)* %out 6000 ret void 6001} 6002 6003define amdgpu_kernel void @umax_i64_constant(i64 addrspace(1)* %out) { 6004; 6005; 6006; GFX7LESS-LABEL: umax_i64_constant: 6007; GFX7LESS: ; %bb.0: ; %entry 6008; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 6009; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 6010; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 6011; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 6012; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 6013; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc 6014; GFX7LESS-NEXT: s_cbranch_execz .LBB22_2 6015; GFX7LESS-NEXT: ; %bb.1: 6016; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0 6017; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5 6018; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 6019; GFX7LESS-NEXT: s_mov_b32 m0, -1 6020; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 6021; GFX7LESS-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] 6022; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 6023; GFX7LESS-NEXT: .LBB22_2: 6024; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] 6025; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 6026; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 6027; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 6028; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 6029; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 6030; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 6031; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4 6032; GFX7LESS-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[0:1] 6033; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, 
v0, v2, vcc 6034; GFX7LESS-NEXT: v_mov_b32_e32 v1, s5 6035; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc 6036; GFX7LESS-NEXT: s_mov_b32 s2, -1 6037; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 6038; GFX7LESS-NEXT: s_endpgm 6039; 6040; GFX8-LABEL: umax_i64_constant: 6041; GFX8: ; %bb.0: ; %entry 6042; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 6043; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 6044; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 6045; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 6046; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 6047; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 6048; GFX8-NEXT: s_cbranch_execz .LBB22_2 6049; GFX8-NEXT: ; %bb.1: 6050; GFX8-NEXT: v_mov_b32_e32 v0, 5 6051; GFX8-NEXT: v_mov_b32_e32 v2, 0 6052; GFX8-NEXT: v_mov_b32_e32 v1, 0 6053; GFX8-NEXT: s_mov_b32 m0, -1 6054; GFX8-NEXT: s_waitcnt lgkmcnt(0) 6055; GFX8-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] 6056; GFX8-NEXT: s_waitcnt lgkmcnt(0) 6057; GFX8-NEXT: .LBB22_2: 6058; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 6059; GFX8-NEXT: s_waitcnt lgkmcnt(0) 6060; GFX8-NEXT: v_readfirstlane_b32 s2, v0 6061; GFX8-NEXT: v_readfirstlane_b32 s3, v1 6062; GFX8-NEXT: v_mov_b32_e32 v1, 0 6063; GFX8-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 6064; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1] 6065; GFX8-NEXT: v_mov_b32_e32 v2, s2 6066; GFX8-NEXT: v_mov_b32_e32 v1, s3 6067; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 6068; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc 6069; GFX8-NEXT: s_mov_b32 s3, 0xf000 6070; GFX8-NEXT: s_mov_b32 s2, -1 6071; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 6072; GFX8-NEXT: s_endpgm 6073; 6074; GFX9-LABEL: umax_i64_constant: 6075; GFX9: ; %bb.0: ; %entry 6076; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 6077; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 6078; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 6079; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 6080; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 6081; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 6082; 
GFX9-NEXT: s_cbranch_execz .LBB22_2 6083; GFX9-NEXT: ; %bb.1: 6084; GFX9-NEXT: v_mov_b32_e32 v0, 5 6085; GFX9-NEXT: v_mov_b32_e32 v1, 0 6086; GFX9-NEXT: v_mov_b32_e32 v2, 0 6087; GFX9-NEXT: s_waitcnt lgkmcnt(0) 6088; GFX9-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] 6089; GFX9-NEXT: s_waitcnt lgkmcnt(0) 6090; GFX9-NEXT: .LBB22_2: 6091; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 6092; GFX9-NEXT: s_waitcnt lgkmcnt(0) 6093; GFX9-NEXT: v_readfirstlane_b32 s2, v0 6094; GFX9-NEXT: v_readfirstlane_b32 s3, v1 6095; GFX9-NEXT: v_mov_b32_e32 v1, 0 6096; GFX9-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 6097; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1] 6098; GFX9-NEXT: v_mov_b32_e32 v2, s2 6099; GFX9-NEXT: v_mov_b32_e32 v1, s3 6100; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 6101; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc 6102; GFX9-NEXT: s_mov_b32 s3, 0xf000 6103; GFX9-NEXT: s_mov_b32 s2, -1 6104; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 6105; GFX9-NEXT: s_endpgm 6106; 6107; GFX1064-LABEL: umax_i64_constant: 6108; GFX1064: ; %bb.0: ; %entry 6109; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 6110; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 6111; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 6112; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 6113; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 6114; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc 6115; GFX1064-NEXT: s_cbranch_execz .LBB22_2 6116; GFX1064-NEXT: ; %bb.1: 6117; GFX1064-NEXT: v_mov_b32_e32 v0, 5 6118; GFX1064-NEXT: v_mov_b32_e32 v1, 0 6119; GFX1064-NEXT: v_mov_b32_e32 v2, 0 6120; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6121; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 6122; GFX1064-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] 6123; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 6124; GFX1064-NEXT: buffer_gl0_inv 6125; GFX1064-NEXT: .LBB22_2: 6126; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 6127; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] 6128; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 6129; GFX1064-NEXT: v_readfirstlane_b32 s3, 
v1 6130; GFX1064-NEXT: v_mov_b32_e32 v1, 0 6131; GFX1064-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 6132; GFX1064-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1] 6133; GFX1064-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc 6134; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, s3, vcc 6135; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 6136; GFX1064-NEXT: s_mov_b32 s2, -1 6137; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 6138; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 6139; GFX1064-NEXT: s_endpgm 6140; 6141; GFX1032-LABEL: umax_i64_constant: 6142; GFX1032: ; %bb.0: ; %entry 6143; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 6144; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 6145; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 6146; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 6147; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo 6148; GFX1032-NEXT: s_cbranch_execz .LBB22_2 6149; GFX1032-NEXT: ; %bb.1: 6150; GFX1032-NEXT: v_mov_b32_e32 v0, 5 6151; GFX1032-NEXT: v_mov_b32_e32 v1, 0 6152; GFX1032-NEXT: v_mov_b32_e32 v2, 0 6153; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6154; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 6155; GFX1032-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] 6156; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 6157; GFX1032-NEXT: buffer_gl0_inv 6158; GFX1032-NEXT: .LBB22_2: 6159; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 6160; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 6161; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 6162; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 6163; GFX1032-NEXT: v_mov_b32_e32 v1, 0 6164; GFX1032-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc_lo 6165; GFX1032-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[2:3], v[0:1] 6166; GFX1032-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo 6167; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, s3, vcc_lo 6168; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 6169; GFX1032-NEXT: s_mov_b32 s2, -1 6170; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 6171; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 6172; GFX1032-NEXT: s_endpgm 6173; 6174; GFX1164-LABEL: umax_i64_constant: 6175; GFX1164: ; 
%bb.0: ; %entry 6176; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 6177; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 6178; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 6179; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 6180; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 6181; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 6182; GFX1164-NEXT: s_and_saveexec_b64 s[2:3], vcc 6183; GFX1164-NEXT: s_cbranch_execz .LBB22_2 6184; GFX1164-NEXT: ; %bb.1: 6185; GFX1164-NEXT: v_mov_b32_e32 v0, 5 6186; GFX1164-NEXT: v_mov_b32_e32 v1, 0 6187; GFX1164-NEXT: v_mov_b32_e32 v2, 0 6188; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6189; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 6190; GFX1164-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] 6191; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 6192; GFX1164-NEXT: buffer_gl0_inv 6193; GFX1164-NEXT: .LBB22_2: 6194; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] 6195; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 6196; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 6197; GFX1164-NEXT: v_mov_b32_e32 v1, 0 6198; GFX1164-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 6199; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) 6200; GFX1164-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1] 6201; GFX1164-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc 6202; GFX1164-NEXT: v_cndmask_b32_e64 v1, 0, s3, vcc 6203; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 6204; GFX1164-NEXT: s_mov_b32 s2, -1 6205; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 6206; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 6207; GFX1164-NEXT: s_endpgm 6208; 6209; GFX1132-LABEL: umax_i64_constant: 6210; GFX1132: ; %bb.0: ; %entry 6211; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 6212; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 6213; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) 6214; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 6215; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 6216; GFX1132-NEXT: s_and_saveexec_b32 s2, vcc_lo 6217; GFX1132-NEXT: s_cbranch_execz .LBB22_2 6218; GFX1132-NEXT: ; %bb.1: 6219; 
GFX1132-NEXT: v_mov_b32_e32 v0, 5 6220; GFX1132-NEXT: v_mov_b32_e32 v1, 0 6221; GFX1132-NEXT: v_mov_b32_e32 v2, 0 6222; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6223; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 6224; GFX1132-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] 6225; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 6226; GFX1132-NEXT: buffer_gl0_inv 6227; GFX1132-NEXT: .LBB22_2: 6228; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2 6229; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 6230; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 6231; GFX1132-NEXT: v_mov_b32_e32 v1, 0 6232; GFX1132-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc_lo 6233; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) 6234; GFX1132-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[2:3], v[0:1] 6235; GFX1132-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo 6236; GFX1132-NEXT: v_cndmask_b32_e64 v1, 0, s3, vcc_lo 6237; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 6238; GFX1132-NEXT: s_mov_b32 s2, -1 6239; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 6240; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 6241; GFX1132-NEXT: s_endpgm 6242entry: 6243 %old = atomicrmw umax i64 addrspace(3)* @local_var64, i64 5 acq_rel 6244 store i64 %old, i64 addrspace(1)* %out 6245 ret void 6246} 6247 6248define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) { 6249; 6250; 6251; GFX7LESS-LABEL: umin_i32_varying: 6252; GFX7LESS: ; %bb.0: ; %entry 6253; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 6254; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 6255; GFX7LESS-NEXT: s_mov_b32 m0, -1 6256; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 6257; GFX7LESS-NEXT: ds_min_rtn_u32 v0, v1, v0 6258; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 6259; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 6260; GFX7LESS-NEXT: s_mov_b32 s2, -1 6261; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 6262; GFX7LESS-NEXT: s_endpgm 6263; 6264; GFX8-LABEL: umin_i32_varying: 6265; GFX8: ; %bb.0: ; %entry 6266; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 6267; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 6268; GFX8-NEXT: 
v_mbcnt_hi_u32_b32 v3, exec_hi, v3 6269; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 6270; GFX8-NEXT: v_mov_b32_e32 v1, -1 6271; GFX8-NEXT: s_mov_b64 exec, s[2:3] 6272; GFX8-NEXT: v_mov_b32_e32 v2, v0 6273; GFX8-NEXT: s_not_b64 exec, exec 6274; GFX8-NEXT: v_mov_b32_e32 v2, -1 6275; GFX8-NEXT: s_not_b64 exec, exec 6276; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 6277; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 6278; GFX8-NEXT: s_nop 1 6279; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 6280; GFX8-NEXT: s_nop 1 6281; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 6282; GFX8-NEXT: s_nop 1 6283; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 6284; GFX8-NEXT: s_nop 1 6285; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 6286; GFX8-NEXT: s_nop 1 6287; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 6288; GFX8-NEXT: v_readlane_b32 s4, v2, 63 6289; GFX8-NEXT: s_nop 0 6290; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 6291; GFX8-NEXT: s_mov_b64 exec, s[2:3] 6292; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 6293; GFX8-NEXT: ; implicit-def: $vgpr0 6294; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 6295; GFX8-NEXT: s_cbranch_execz .LBB23_2 6296; GFX8-NEXT: ; %bb.1: 6297; GFX8-NEXT: v_mov_b32_e32 v0, 0 6298; GFX8-NEXT: v_mov_b32_e32 v3, s4 6299; GFX8-NEXT: s_mov_b32 m0, -1 6300; GFX8-NEXT: s_waitcnt lgkmcnt(0) 6301; GFX8-NEXT: ds_min_rtn_u32 v0, v0, v3 6302; GFX8-NEXT: s_waitcnt lgkmcnt(0) 6303; GFX8-NEXT: .LBB23_2: 6304; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 6305; GFX8-NEXT: s_waitcnt lgkmcnt(0) 6306; GFX8-NEXT: v_readfirstlane_b32 s2, v0 6307; GFX8-NEXT: v_mov_b32_e32 v0, v1 6308; GFX8-NEXT: v_min_u32_e32 v0, s2, v0 6309; GFX8-NEXT: s_mov_b32 s3, 0xf000 6310; GFX8-NEXT: s_mov_b32 s2, -1 6311; GFX8-NEXT: s_nop 0 6312; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 6313; GFX8-NEXT: s_endpgm 6314; 6315; 
GFX9-LABEL: umin_i32_varying: 6316; GFX9: ; %bb.0: ; %entry 6317; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 6318; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 6319; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 6320; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 6321; GFX9-NEXT: v_mov_b32_e32 v1, -1 6322; GFX9-NEXT: s_mov_b64 exec, s[2:3] 6323; GFX9-NEXT: v_mov_b32_e32 v2, v0 6324; GFX9-NEXT: s_not_b64 exec, exec 6325; GFX9-NEXT: v_mov_b32_e32 v2, -1 6326; GFX9-NEXT: s_not_b64 exec, exec 6327; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 6328; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 6329; GFX9-NEXT: s_nop 1 6330; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 6331; GFX9-NEXT: s_nop 1 6332; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 6333; GFX9-NEXT: s_nop 1 6334; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 6335; GFX9-NEXT: s_nop 1 6336; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 6337; GFX9-NEXT: s_nop 1 6338; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 6339; GFX9-NEXT: v_readlane_b32 s4, v2, 63 6340; GFX9-NEXT: s_nop 0 6341; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 6342; GFX9-NEXT: s_mov_b64 exec, s[2:3] 6343; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 6344; GFX9-NEXT: ; implicit-def: $vgpr0 6345; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 6346; GFX9-NEXT: s_cbranch_execz .LBB23_2 6347; GFX9-NEXT: ; %bb.1: 6348; GFX9-NEXT: v_mov_b32_e32 v0, 0 6349; GFX9-NEXT: v_mov_b32_e32 v3, s4 6350; GFX9-NEXT: s_waitcnt lgkmcnt(0) 6351; GFX9-NEXT: ds_min_rtn_u32 v0, v0, v3 6352; GFX9-NEXT: s_waitcnt lgkmcnt(0) 6353; GFX9-NEXT: .LBB23_2: 6354; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 6355; GFX9-NEXT: s_waitcnt lgkmcnt(0) 6356; GFX9-NEXT: v_readfirstlane_b32 s2, v0 6357; GFX9-NEXT: v_mov_b32_e32 v0, v1 6358; GFX9-NEXT: v_min_u32_e32 v0, s2, v0 6359; GFX9-NEXT: s_mov_b32 s3, 0xf000 6360; 
GFX9-NEXT: s_mov_b32 s2, -1 6361; GFX9-NEXT: s_nop 0 6362; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 6363; GFX9-NEXT: s_endpgm 6364; 6365; GFX1064-LABEL: umin_i32_varying: 6366; GFX1064: ; %bb.0: ; %entry 6367; GFX1064-NEXT: v_mov_b32_e32 v1, v0 6368; GFX1064-NEXT: s_not_b64 exec, exec 6369; GFX1064-NEXT: v_mov_b32_e32 v1, -1 6370; GFX1064-NEXT: s_not_b64 exec, exec 6371; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 6372; GFX1064-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 6373; GFX1064-NEXT: v_mov_b32_e32 v3, -1 6374; GFX1064-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 6375; GFX1064-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 6376; GFX1064-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 6377; GFX1064-NEXT: v_mov_b32_e32 v2, v1 6378; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 6379; GFX1064-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 6380; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 6381; GFX1064-NEXT: v_mov_b32_e32 v2, s4 6382; GFX1064-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 6383; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 6384; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 6385; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 6386; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 6387; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 6388; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 6389; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 6390; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 6391; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 6392; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 6393; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 6394; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 6395; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 6396; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 6397; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 6398; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 6399; GFX1064-NEXT: v_writelane_b32 v3, s6, 
48 6400; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 6401; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 6402; GFX1064-NEXT: s_mov_b32 s2, -1 6403; GFX1064-NEXT: ; implicit-def: $vgpr0 6404; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 6405; GFX1064-NEXT: s_cbranch_execz .LBB23_2 6406; GFX1064-NEXT: ; %bb.1: 6407; GFX1064-NEXT: v_mov_b32_e32 v0, 0 6408; GFX1064-NEXT: v_mov_b32_e32 v4, s7 6409; GFX1064-NEXT: s_mov_b32 s3, s7 6410; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6411; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 6412; GFX1064-NEXT: ds_min_rtn_u32 v0, v0, v4 6413; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 6414; GFX1064-NEXT: buffer_gl0_inv 6415; GFX1064-NEXT: .LBB23_2: 6416; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 6417; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 6418; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 6419; GFX1064-NEXT: v_mov_b32_e32 v0, v3 6420; GFX1064-NEXT: v_min_u32_e32 v0, s3, v0 6421; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 6422; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 6423; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 6424; GFX1064-NEXT: s_endpgm 6425; 6426; GFX1032-LABEL: umin_i32_varying: 6427; GFX1032: ; %bb.0: ; %entry 6428; GFX1032-NEXT: v_mov_b32_e32 v1, v0 6429; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 6430; GFX1032-NEXT: v_mov_b32_e32 v1, -1 6431; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 6432; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 6433; GFX1032-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 6434; GFX1032-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 6435; GFX1032-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 6436; GFX1032-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 6437; GFX1032-NEXT: v_mov_b32_e32 v2, v1 6438; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 6439; GFX1032-NEXT: s_mov_b32 exec_lo, s2 6440; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 6441; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 6442; GFX1032-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] 
row_mask:0xa bank_mask:0xf 6443; GFX1032-NEXT: v_mov_b32_e32 v3, -1 6444; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 6445; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 6446; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 6447; GFX1032-NEXT: s_mov_b32 exec_lo, s2 6448; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 6449; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 6450; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 6451; GFX1032-NEXT: s_mov_b32 exec_lo, s2 6452; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 6453; GFX1032-NEXT: s_mov_b32 s2, -1 6454; GFX1032-NEXT: ; implicit-def: $vgpr0 6455; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 6456; GFX1032-NEXT: s_cbranch_execz .LBB23_2 6457; GFX1032-NEXT: ; %bb.1: 6458; GFX1032-NEXT: v_mov_b32_e32 v0, 0 6459; GFX1032-NEXT: v_mov_b32_e32 v4, s4 6460; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6461; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 6462; GFX1032-NEXT: ds_min_rtn_u32 v0, v0, v4 6463; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 6464; GFX1032-NEXT: buffer_gl0_inv 6465; GFX1032-NEXT: .LBB23_2: 6466; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 6467; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 6468; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 6469; GFX1032-NEXT: v_mov_b32_e32 v0, v3 6470; GFX1032-NEXT: v_min_u32_e32 v0, s3, v0 6471; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 6472; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 6473; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 6474; GFX1032-NEXT: s_endpgm 6475; 6476; GFX1164-LABEL: umin_i32_varying: 6477; GFX1164: ; %bb.0: ; %entry 6478; GFX1164-NEXT: v_mov_b32_e32 v1, v0 6479; GFX1164-NEXT: s_not_b64 exec, exec 6480; GFX1164-NEXT: v_mov_b32_e32 v1, -1 6481; GFX1164-NEXT: s_not_b64 exec, exec 6482; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 6483; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) 6484; GFX1164-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 6485; GFX1164-NEXT: v_mov_b32_e32 v3, -1 6486; GFX1164-NEXT: v_min_u32_dpp v1, 
v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 6487; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 6488; GFX1164-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 6489; GFX1164-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 6490; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 6491; GFX1164-NEXT: v_mov_b32_e32 v2, v1 6492; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1 6493; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 6494; GFX1164-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 6495; GFX1164-NEXT: v_readlane_b32 s4, v1, 31 6496; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 6497; GFX1164-NEXT: v_mov_b32_e32 v2, s4 6498; GFX1164-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 6499; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) 6500; GFX1164-NEXT: v_readlane_b32 s4, v1, 15 6501; GFX1164-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 6502; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 6503; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 6504; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 6505; GFX1164-NEXT: v_readlane_b32 s5, v1, 31 6506; GFX1164-NEXT: v_writelane_b32 v3, s4, 16 6507; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 6508; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 6509; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 6510; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 6511; GFX1164-NEXT: v_readlane_b32 s7, v1, 63 6512; GFX1164-NEXT: v_readlane_b32 s6, v1, 47 6513; GFX1164-NEXT: v_writelane_b32 v3, s5, 32 6514; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 6515; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) 6516; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 6517; GFX1164-NEXT: s_or_saveexec_b64 s[4:5], -1 6518; GFX1164-NEXT: v_writelane_b32 v3, s6, 48 6519; GFX1164-NEXT: 
s_mov_b64 exec, s[4:5] 6520; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 6521; GFX1164-NEXT: s_mov_b32 s2, -1 6522; GFX1164-NEXT: ; implicit-def: $vgpr0 6523; GFX1164-NEXT: s_and_saveexec_b64 s[4:5], vcc 6524; GFX1164-NEXT: s_cbranch_execz .LBB23_2 6525; GFX1164-NEXT: ; %bb.1: 6526; GFX1164-NEXT: v_mov_b32_e32 v0, 0 6527; GFX1164-NEXT: v_mov_b32_e32 v4, s7 6528; GFX1164-NEXT: s_mov_b32 s3, s7 6529; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6530; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 6531; GFX1164-NEXT: ds_min_rtn_u32 v0, v0, v4 6532; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 6533; GFX1164-NEXT: buffer_gl0_inv 6534; GFX1164-NEXT: .LBB23_2: 6535; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] 6536; GFX1164-NEXT: v_readfirstlane_b32 s3, v0 6537; GFX1164-NEXT: v_mov_b32_e32 v0, v3 6538; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) 6539; GFX1164-NEXT: v_min_u32_e32 v0, s3, v0 6540; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 6541; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 6542; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 6543; GFX1164-NEXT: s_endpgm 6544; 6545; GFX1132-LABEL: umin_i32_varying: 6546; GFX1132: ; %bb.0: ; %entry 6547; GFX1132-NEXT: v_mov_b32_e32 v1, v0 6548; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo 6549; GFX1132-NEXT: v_mov_b32_e32 v1, -1 6550; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo 6551; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 6552; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 6553; GFX1132-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 6554; GFX1132-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 6555; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 6556; GFX1132-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 6557; GFX1132-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 6558; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 6559; GFX1132-NEXT: v_mov_b32_e32 v2, v1 6560; 
GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1 6561; GFX1132-NEXT: s_mov_b32 exec_lo, s2 6562; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 6563; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 6564; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) 6565; GFX1132-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 6566; GFX1132-NEXT: v_mov_b32_e32 v3, -1 6567; GFX1132-NEXT: v_readlane_b32 s3, v1, 15 6568; GFX1132-NEXT: v_readlane_b32 s4, v1, 31 6569; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) 6570; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 6571; GFX1132-NEXT: s_mov_b32 exec_lo, s2 6572; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 6573; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 6574; GFX1132-NEXT: v_writelane_b32 v3, s3, 16 6575; GFX1132-NEXT: s_mov_b32 exec_lo, s2 6576; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) 6577; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 6578; GFX1132-NEXT: s_mov_b32 s2, -1 6579; GFX1132-NEXT: ; implicit-def: $vgpr0 6580; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo 6581; GFX1132-NEXT: s_cbranch_execz .LBB23_2 6582; GFX1132-NEXT: ; %bb.1: 6583; GFX1132-NEXT: v_mov_b32_e32 v0, 0 6584; GFX1132-NEXT: v_mov_b32_e32 v4, s4 6585; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6586; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 6587; GFX1132-NEXT: ds_min_rtn_u32 v0, v0, v4 6588; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 6589; GFX1132-NEXT: buffer_gl0_inv 6590; GFX1132-NEXT: .LBB23_2: 6591; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 6592; GFX1132-NEXT: v_readfirstlane_b32 s3, v0 6593; GFX1132-NEXT: v_mov_b32_e32 v0, v3 6594; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) 6595; GFX1132-NEXT: v_min_u32_e32 v0, s3, v0 6596; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 6597; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 6598; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 6599; GFX1132-NEXT: s_endpgm 6600entry: 6601 %lane = call i32 
@llvm.amdgcn.workitem.id.x()
  %old = atomicrmw umin i32 addrspace(3)* @local_var32, i32 %lane acq_rel
  store i32 %old, i32 addrspace(1)* %out
  ret void
}

; umin_i64_constant - every work-item issues the same `atomicrmw umin` with the
; constant operand 5 on the 64-bit addrspace(3) global @local_var64 (acq_rel
; ordering) and stores the value returned by the atomic to %out.
;
; The expected code below contains a single ds_min_rtn_u64 with the value 5,
; guarded by s_and_saveexec on the lanes for which the mbcnt result compares
; equal to 0. After the guarded region the returned pair is read with
; v_readfirstlane_b32 and merged per lane, via v_cmp_lt_u64 and v_cndmask,
; with a value that v_cndmask picks from 5 and -1 using that same compare
; result.
;
; The assertion lines are autogenerated (see the note on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) {
;
;
; GFX7LESS-LABEL: umin_i64_constant:
; GFX7LESS: ; %bb.0: ; %entry
; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX7LESS-NEXT: s_cbranch_execz .LBB24_2
; GFX7LESS-NEXT: ; %bb.1:
; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0
; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5
; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0
; GFX7LESS-NEXT: s_mov_b32 m0, -1
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1]
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: .LBB24_2:
; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0
; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1
; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc
; GFX7LESS-NEXT: v_mov_b32_e32 v2, s5
; GFX7LESS-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1]
; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4
; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX7LESS-NEXT: s_endpgm
;
; GFX8-LABEL: umin_i64_constant:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX8-NEXT: s_cbranch_execz .LBB24_2
; GFX8-NEXT: ; %bb.1:
; GFX8-NEXT: v_mov_b32_e32 v0, 5
; GFX8-NEXT: v_mov_b32_e32 v2, 0
; GFX8-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NEXT: s_mov_b32 m0, -1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1]
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: .LBB24_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s4, v0
; GFX8-NEXT: v_readfirstlane_b32 s5, v1
; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc
; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1]
; GFX8-NEXT: v_mov_b32_e32 v2, s5
; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
; GFX8-NEXT: v_mov_b32_e32 v2, s4
; GFX8-NEXT: s_mov_b32 s2, -1
; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: umin_i64_constant:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT: s_cbranch_execz .LBB24_2
; GFX9-NEXT: ; %bb.1:
; GFX9-NEXT: v_mov_b32_e32 v0, 5
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: .LBB24_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s4, v0
; GFX9-NEXT: v_readfirstlane_b32 s5, v1
; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc
; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1]
; GFX9-NEXT: v_mov_b32_e32 v2, s5
; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
; GFX9-NEXT: v_mov_b32_e32 v2, s4
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: umin_i64_constant:
; GFX1064: ; %bb.0: ; %entry
; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB24_2
; GFX1064-NEXT: ; %bb.1:
; GFX1064-NEXT: v_mov_b32_e32 v0, 5
; GFX1064-NEXT: v_mov_b32_e32 v1, 0
; GFX1064-NEXT: v_mov_b32_e32 v2, 0
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1064-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1]
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: .LBB24_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX1064-NEXT: v_readfirstlane_b32 s2, v0
; GFX1064-NEXT: v_readfirstlane_b32 s3, v1
; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
; GFX1064-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc
; GFX1064-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1]
; GFX1064-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc
; GFX1064-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc
; GFX1064-NEXT: s_mov_b32 s2, -1
; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: umin_i64_constant:
; GFX1032: ; %bb.0: ; %entry
; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB24_2
; GFX1032-NEXT: ; %bb.1:
; GFX1032-NEXT: v_mov_b32_e32 v0, 5
; GFX1032-NEXT: v_mov_b32_e32 v1, 0
; GFX1032-NEXT: v_mov_b32_e32 v2, 0
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1032-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1]
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: .LBB24_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2
; GFX1032-NEXT: v_readfirstlane_b32 s2, v0
; GFX1032-NEXT: v_readfirstlane_b32 s3, v1
; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
; GFX1032-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc_lo
; GFX1032-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[2:3], v[0:1]
; GFX1032-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo
; GFX1032-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo
; GFX1032-NEXT: s_mov_b32 s2, -1
; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: umin_i64_constant:
; GFX1164: ; %bb.0: ; %entry
; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1164-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1164-NEXT: s_cbranch_execz .LBB24_2
; GFX1164-NEXT: ; %bb.1:
; GFX1164-NEXT: v_mov_b32_e32 v0, 5
; GFX1164-NEXT: v_mov_b32_e32 v1, 0
; GFX1164-NEXT: v_mov_b32_e32 v2, 0
; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1164-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1]
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB24_2:
; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX1164-NEXT: v_readfirstlane_b32 s2, v0
; GFX1164-NEXT: v_readfirstlane_b32 s3, v1
; GFX1164-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
; GFX1164-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1]
; GFX1164-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc
; GFX1164-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc
; GFX1164-NEXT: s_mov_b32 s2, -1
; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: umin_i64_constant:
; GFX1132: ; %bb.0: ; %entry
; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1132-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1132-NEXT: s_cbranch_execz .LBB24_2
; GFX1132-NEXT: ; %bb.1:
; GFX1132-NEXT: v_mov_b32_e32 v0, 5
; GFX1132-NEXT: v_mov_b32_e32 v1, 0
; GFX1132-NEXT: v_mov_b32_e32 v2, 0
; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1132-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1]
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: .LBB24_2:
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2
; GFX1132-NEXT: v_readfirstlane_b32 s2, v0
; GFX1132-NEXT: v_readfirstlane_b32 s3, v1
; GFX1132-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
; GFX1132-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc_lo
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[2:3], v[0:1]
; GFX1132-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo
; GFX1132-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo
; GFX1132-NEXT: s_mov_b32 s2, -1
; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX1132-NEXT: s_endpgm
entry:
  ; The operand is the literal 5, so it is identical for every work-item.
  %old = atomicrmw umin i64 addrspace(3)* @local_var64, i64 5 acq_rel
  ; Write the value the atomic returned to the kernel's output pointer.
  store i64 %old, i64 addrspace(1)* %out
  ret void
}