; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --force-update
; RUN: llc -march=amdgcn -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX7LESS %s
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX1064 %s
; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX1032 %s
; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX1164 %s
; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX1132 %s

; Intrinsic whose result is used as the atomicrmw operand in @add_i32_varying.
declare i32 @llvm.amdgcn.workitem.id.x()

; addrspace(3) i32 global that the i32 atomicrmw tests below operate on.
@local_var32 = addrspace(3) global i32 undef, align 4
; addrspace(3) i64 global; presumably the target of i64 atomic tests that lie
; outside the portion of the file reviewed here -- TODO confirm.
@local_var64 = addrspace(3) global i64 undef, align 8

; Show what the atomic optimization pass will do for local pointers.
16 17define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out) { 18; 19; 20; GFX7LESS-LABEL: add_i32_constant: 21; GFX7LESS: ; %bb.0: ; %entry 22; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec 23; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 24; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 25; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 26; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 27; GFX7LESS-NEXT: ; implicit-def: $vgpr1 28; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc 29; GFX7LESS-NEXT: s_cbranch_execz .LBB0_2 30; GFX7LESS-NEXT: ; %bb.1: 31; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 32; GFX7LESS-NEXT: s_mul_i32 s2, s2, 5 33; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 34; GFX7LESS-NEXT: v_mov_b32_e32 v2, s2 35; GFX7LESS-NEXT: s_mov_b32 m0, -1 36; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 37; GFX7LESS-NEXT: ds_add_rtn_u32 v1, v1, v2 38; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 39; GFX7LESS-NEXT: .LBB0_2: 40; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] 41; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 42; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v1 43; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 44; GFX7LESS-NEXT: v_mad_u32_u24 v0, v0, 5, s2 45; GFX7LESS-NEXT: s_mov_b32 s2, -1 46; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 47; GFX7LESS-NEXT: s_endpgm 48; 49; GFX8-LABEL: add_i32_constant: 50; GFX8: ; %bb.0: ; %entry 51; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 52; GFX8-NEXT: s_mov_b64 s[2:3], exec 53; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 54; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 55; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 56; GFX8-NEXT: ; implicit-def: $vgpr1 57; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 58; GFX8-NEXT: s_cbranch_execz .LBB0_2 59; GFX8-NEXT: ; %bb.1: 60; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 61; GFX8-NEXT: s_mul_i32 s2, s2, 5 62; GFX8-NEXT: v_mov_b32_e32 v1, 0 63; GFX8-NEXT: v_mov_b32_e32 v2, s2 64; GFX8-NEXT: s_mov_b32 m0, -1 65; GFX8-NEXT: s_waitcnt lgkmcnt(0) 66; GFX8-NEXT: ds_add_rtn_u32 v1, v1, v2 67; GFX8-NEXT: s_waitcnt lgkmcnt(0) 68; GFX8-NEXT: 
.LBB0_2: 69; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 70; GFX8-NEXT: s_waitcnt lgkmcnt(0) 71; GFX8-NEXT: v_readfirstlane_b32 s2, v1 72; GFX8-NEXT: v_mad_u32_u24 v0, v0, 5, s2 73; GFX8-NEXT: s_mov_b32 s3, 0xf000 74; GFX8-NEXT: s_mov_b32 s2, -1 75; GFX8-NEXT: s_nop 1 76; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 77; GFX8-NEXT: s_endpgm 78; 79; GFX9-LABEL: add_i32_constant: 80; GFX9: ; %bb.0: ; %entry 81; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 82; GFX9-NEXT: s_mov_b64 s[2:3], exec 83; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 84; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 85; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 86; GFX9-NEXT: ; implicit-def: $vgpr1 87; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 88; GFX9-NEXT: s_cbranch_execz .LBB0_2 89; GFX9-NEXT: ; %bb.1: 90; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 91; GFX9-NEXT: s_mul_i32 s2, s2, 5 92; GFX9-NEXT: v_mov_b32_e32 v1, 0 93; GFX9-NEXT: v_mov_b32_e32 v2, s2 94; GFX9-NEXT: s_waitcnt lgkmcnt(0) 95; GFX9-NEXT: ds_add_rtn_u32 v1, v1, v2 96; GFX9-NEXT: s_waitcnt lgkmcnt(0) 97; GFX9-NEXT: .LBB0_2: 98; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 99; GFX9-NEXT: s_waitcnt lgkmcnt(0) 100; GFX9-NEXT: v_readfirstlane_b32 s2, v1 101; GFX9-NEXT: v_mad_u32_u24 v0, v0, 5, s2 102; GFX9-NEXT: s_mov_b32 s3, 0xf000 103; GFX9-NEXT: s_mov_b32 s2, -1 104; GFX9-NEXT: s_nop 1 105; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 106; GFX9-NEXT: s_endpgm 107; 108; GFX1064-LABEL: add_i32_constant: 109; GFX1064: ; %bb.0: ; %entry 110; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 111; GFX1064-NEXT: s_mov_b64 s[2:3], exec 112; GFX1064-NEXT: ; implicit-def: $vgpr1 113; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 114; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 115; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 116; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 117; GFX1064-NEXT: s_cbranch_execz .LBB0_2 118; GFX1064-NEXT: ; %bb.1: 119; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 120; GFX1064-NEXT: v_mov_b32_e32 v1, 0 121; GFX1064-NEXT: s_mul_i32 s2, s2, 
5 122; GFX1064-NEXT: v_mov_b32_e32 v2, s2 123; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 124; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 125; GFX1064-NEXT: ds_add_rtn_u32 v1, v1, v2 126; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 127; GFX1064-NEXT: buffer_gl0_inv 128; GFX1064-NEXT: .LBB0_2: 129; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 130; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 131; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 132; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 133; GFX1064-NEXT: v_mad_u32_u24 v0, v0, 5, s2 134; GFX1064-NEXT: s_mov_b32 s2, -1 135; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 136; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 137; GFX1064-NEXT: s_endpgm 138; 139; GFX1032-LABEL: add_i32_constant: 140; GFX1032: ; %bb.0: ; %entry 141; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 142; GFX1032-NEXT: s_mov_b32 s3, exec_lo 143; GFX1032-NEXT: ; implicit-def: $vgpr1 144; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 145; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 146; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo 147; GFX1032-NEXT: s_cbranch_execz .LBB0_2 148; GFX1032-NEXT: ; %bb.1: 149; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 150; GFX1032-NEXT: v_mov_b32_e32 v1, 0 151; GFX1032-NEXT: s_mul_i32 s3, s3, 5 152; GFX1032-NEXT: v_mov_b32_e32 v2, s3 153; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 154; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 155; GFX1032-NEXT: ds_add_rtn_u32 v1, v1, v2 156; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 157; GFX1032-NEXT: buffer_gl0_inv 158; GFX1032-NEXT: .LBB0_2: 159; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 160; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 161; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 162; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 163; GFX1032-NEXT: v_mad_u32_u24 v0, v0, 5, s2 164; GFX1032-NEXT: s_mov_b32 s2, -1 165; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 166; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 167; GFX1032-NEXT: s_endpgm 168; 169; GFX1164-LABEL: add_i32_constant: 170; GFX1164: ; %bb.0: ; %entry 171; GFX1164-NEXT: s_load_b64 
s[0:1], s[0:1], 0x24 172; GFX1164-NEXT: s_mov_b64 s[2:3], exec 173; GFX1164-NEXT: s_mov_b64 s[4:5], exec 174; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 175; GFX1164-NEXT: ; implicit-def: $vgpr1 176; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 177; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 178; GFX1164-NEXT: s_cbranch_execz .LBB0_2 179; GFX1164-NEXT: ; %bb.1: 180; GFX1164-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 181; GFX1164-NEXT: v_mov_b32_e32 v1, 0 182; GFX1164-NEXT: s_mul_i32 s2, s2, 5 183; GFX1164-NEXT: v_mov_b32_e32 v2, s2 184; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 185; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 186; GFX1164-NEXT: ds_add_rtn_u32 v1, v1, v2 187; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 188; GFX1164-NEXT: buffer_gl0_inv 189; GFX1164-NEXT: .LBB0_2: 190; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] 191; GFX1164-NEXT: v_readfirstlane_b32 s2, v1 192; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 193; GFX1164-NEXT: v_mad_u32_u24 v0, v0, 5, s2 194; GFX1164-NEXT: s_mov_b32 s2, -1 195; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 196; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 197; GFX1164-NEXT: s_endpgm 198; 199; GFX1132-LABEL: add_i32_constant: 200; GFX1132: ; %bb.0: ; %entry 201; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 202; GFX1132-NEXT: s_mov_b32 s3, exec_lo 203; GFX1132-NEXT: s_mov_b32 s2, exec_lo 204; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 205; GFX1132-NEXT: ; implicit-def: $vgpr1 206; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 207; GFX1132-NEXT: s_cbranch_execz .LBB0_2 208; GFX1132-NEXT: ; %bb.1: 209; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s3 210; GFX1132-NEXT: v_mov_b32_e32 v1, 0 211; GFX1132-NEXT: s_mul_i32 s3, s3, 5 212; GFX1132-NEXT: v_mov_b32_e32 v2, s3 213; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 214; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 215; GFX1132-NEXT: ds_add_rtn_u32 v1, v1, v2 216; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 217; GFX1132-NEXT: buffer_gl0_inv 218; GFX1132-NEXT: .LBB0_2: 219; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2 220; GFX1132-NEXT: 
v_readfirstlane_b32 s2, v1 221; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 222; GFX1132-NEXT: v_mad_u32_u24 v0, v0, 5, s2 223; GFX1132-NEXT: s_mov_b32 s2, -1 224; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 225; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 226; GFX1132-NEXT: s_endpgm 227entry: 228 %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 5 acq_rel 229 store i32 %old, i32 addrspace(1)* %out 230 ret void 231} 232 233define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 %additive) { 234; 235; 236; GFX7LESS-LABEL: add_i32_uniform: 237; GFX7LESS: ; %bb.0: ; %entry 238; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec 239; GFX7LESS-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 240; GFX7LESS-NEXT: s_load_dword s6, s[0:1], 0xb 241; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 242; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 243; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 244; GFX7LESS-NEXT: ; implicit-def: $vgpr1 245; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc 246; GFX7LESS-NEXT: s_cbranch_execz .LBB1_2 247; GFX7LESS-NEXT: ; %bb.1: 248; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 249; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 250; GFX7LESS-NEXT: s_mul_i32 s2, s6, s2 251; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 252; GFX7LESS-NEXT: v_mov_b32_e32 v2, s2 253; GFX7LESS-NEXT: s_mov_b32 m0, -1 254; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 255; GFX7LESS-NEXT: ds_add_rtn_u32 v1, v1, v2 256; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 257; GFX7LESS-NEXT: .LBB1_2: 258; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1] 259; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 260; GFX7LESS-NEXT: v_readfirstlane_b32 s0, v1 261; GFX7LESS-NEXT: v_mul_lo_u32 v0, s6, v0 262; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 263; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s0, v0 264; GFX7LESS-NEXT: s_mov_b32 s6, -1 265; GFX7LESS-NEXT: buffer_store_dword v0, off, s[4:7], 0 266; GFX7LESS-NEXT: s_endpgm 267; 268; GFX8-LABEL: add_i32_uniform: 269; GFX8: ; %bb.0: ; %entry 270; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 271; GFX8-NEXT: 
s_load_dword s6, s[0:1], 0x2c 272; GFX8-NEXT: s_mov_b64 s[2:3], exec 273; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 274; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 275; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 276; GFX8-NEXT: ; implicit-def: $vgpr1 277; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc 278; GFX8-NEXT: s_cbranch_execz .LBB1_2 279; GFX8-NEXT: ; %bb.1: 280; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 281; GFX8-NEXT: s_waitcnt lgkmcnt(0) 282; GFX8-NEXT: s_mul_i32 s2, s6, s2 283; GFX8-NEXT: v_mov_b32_e32 v1, 0 284; GFX8-NEXT: v_mov_b32_e32 v2, s2 285; GFX8-NEXT: s_mov_b32 m0, -1 286; GFX8-NEXT: s_waitcnt lgkmcnt(0) 287; GFX8-NEXT: ds_add_rtn_u32 v1, v1, v2 288; GFX8-NEXT: s_waitcnt lgkmcnt(0) 289; GFX8-NEXT: .LBB1_2: 290; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] 291; GFX8-NEXT: s_waitcnt lgkmcnt(0) 292; GFX8-NEXT: v_mul_lo_u32 v0, s6, v0 293; GFX8-NEXT: v_readfirstlane_b32 s0, v1 294; GFX8-NEXT: s_mov_b32 s7, 0xf000 295; GFX8-NEXT: s_mov_b32 s6, -1 296; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 297; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0 298; GFX8-NEXT: s_endpgm 299; 300; GFX9-LABEL: add_i32_uniform: 301; GFX9: ; %bb.0: ; %entry 302; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 303; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c 304; GFX9-NEXT: s_mov_b64 s[2:3], exec 305; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 306; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 307; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 308; GFX9-NEXT: ; implicit-def: $vgpr1 309; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc 310; GFX9-NEXT: s_cbranch_execz .LBB1_2 311; GFX9-NEXT: ; %bb.1: 312; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 313; GFX9-NEXT: s_waitcnt lgkmcnt(0) 314; GFX9-NEXT: s_mul_i32 s2, s6, s2 315; GFX9-NEXT: v_mov_b32_e32 v1, 0 316; GFX9-NEXT: v_mov_b32_e32 v2, s2 317; GFX9-NEXT: s_waitcnt lgkmcnt(0) 318; GFX9-NEXT: ds_add_rtn_u32 v1, v1, v2 319; GFX9-NEXT: s_waitcnt lgkmcnt(0) 320; GFX9-NEXT: .LBB1_2: 321; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] 322; GFX9-NEXT: s_waitcnt lgkmcnt(0) 323; GFX9-NEXT: 
v_mul_lo_u32 v0, s6, v0 324; GFX9-NEXT: v_readfirstlane_b32 s0, v1 325; GFX9-NEXT: s_mov_b32 s7, 0xf000 326; GFX9-NEXT: s_mov_b32 s6, -1 327; GFX9-NEXT: v_add_u32_e32 v0, s0, v0 328; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 329; GFX9-NEXT: s_endpgm 330; 331; GFX1064-LABEL: add_i32_uniform: 332; GFX1064: ; %bb.0: ; %entry 333; GFX1064-NEXT: s_clause 0x1 334; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 335; GFX1064-NEXT: s_load_dword s6, s[0:1], 0x2c 336; GFX1064-NEXT: s_mov_b64 s[2:3], exec 337; GFX1064-NEXT: ; implicit-def: $vgpr1 338; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 339; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 340; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 341; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc 342; GFX1064-NEXT: s_cbranch_execz .LBB1_2 343; GFX1064-NEXT: ; %bb.1: 344; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 345; GFX1064-NEXT: v_mov_b32_e32 v1, 0 346; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 347; GFX1064-NEXT: s_mul_i32 s2, s6, s2 348; GFX1064-NEXT: v_mov_b32_e32 v2, s2 349; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 350; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 351; GFX1064-NEXT: ds_add_rtn_u32 v1, v1, v2 352; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 353; GFX1064-NEXT: buffer_gl0_inv 354; GFX1064-NEXT: .LBB1_2: 355; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 356; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] 357; GFX1064-NEXT: v_readfirstlane_b32 s0, v1 358; GFX1064-NEXT: s_mov_b32 s7, 0x31016000 359; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 360; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s6, v0, s[0:1] 361; GFX1064-NEXT: s_mov_b32 s6, -1 362; GFX1064-NEXT: buffer_store_dword v0, off, s[4:7], 0 363; GFX1064-NEXT: s_endpgm 364; 365; GFX1032-LABEL: add_i32_uniform: 366; GFX1032: ; %bb.0: ; %entry 367; GFX1032-NEXT: s_clause 0x1 368; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 369; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x2c 370; GFX1032-NEXT: s_mov_b32 s3, exec_lo 371; GFX1032-NEXT: ; implicit-def: $vgpr1 372; GFX1032-NEXT: v_mbcnt_lo_u32_b32 
v0, s3, 0 373; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 374; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo 375; GFX1032-NEXT: s_cbranch_execz .LBB1_2 376; GFX1032-NEXT: ; %bb.1: 377; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s3 378; GFX1032-NEXT: v_mov_b32_e32 v1, 0 379; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 380; GFX1032-NEXT: s_mul_i32 s1, s2, s1 381; GFX1032-NEXT: v_mov_b32_e32 v2, s1 382; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 383; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 384; GFX1032-NEXT: ds_add_rtn_u32 v1, v1, v2 385; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 386; GFX1032-NEXT: buffer_gl0_inv 387; GFX1032-NEXT: .LBB1_2: 388; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 389; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 390; GFX1032-NEXT: v_readfirstlane_b32 s0, v1 391; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 392; GFX1032-NEXT: s_mov_b32 s6, -1 393; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 394; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s0, s2, v0, s[0:1] 395; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0 396; GFX1032-NEXT: s_endpgm 397; 398; GFX1164-LABEL: add_i32_uniform: 399; GFX1164: ; %bb.0: ; %entry 400; GFX1164-NEXT: s_clause 0x1 401; GFX1164-NEXT: s_load_b64 s[4:5], s[0:1], 0x24 402; GFX1164-NEXT: s_load_b32 s6, s[0:1], 0x2c 403; GFX1164-NEXT: s_mov_b64 s[2:3], exec 404; GFX1164-NEXT: s_mov_b64 s[0:1], exec 405; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 406; GFX1164-NEXT: ; implicit-def: $vgpr1 407; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 408; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 409; GFX1164-NEXT: s_cbranch_execz .LBB1_2 410; GFX1164-NEXT: ; %bb.1: 411; GFX1164-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 412; GFX1164-NEXT: v_mov_b32_e32 v1, 0 413; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 414; GFX1164-NEXT: s_mul_i32 s2, s6, s2 415; GFX1164-NEXT: v_mov_b32_e32 v2, s2 416; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 417; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 418; GFX1164-NEXT: ds_add_rtn_u32 v1, v1, v2 419; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 420; GFX1164-NEXT: buffer_gl0_inv 421; 
GFX1164-NEXT: .LBB1_2: 422; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] 423; GFX1164-NEXT: v_readfirstlane_b32 s0, v1 424; GFX1164-NEXT: s_mov_b32 s7, 0x31016000 425; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 426; GFX1164-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s6, v0, s[0:1] 427; GFX1164-NEXT: s_mov_b32 s6, -1 428; GFX1164-NEXT: buffer_store_b32 v1, off, s[4:7], 0 429; GFX1164-NEXT: s_endpgm 430; 431; GFX1132-LABEL: add_i32_uniform: 432; GFX1132: ; %bb.0: ; %entry 433; GFX1132-NEXT: s_clause 0x1 434; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x24 435; GFX1132-NEXT: s_load_b32 s0, s[0:1], 0x2c 436; GFX1132-NEXT: s_mov_b32 s2, exec_lo 437; GFX1132-NEXT: s_mov_b32 s1, exec_lo 438; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 439; GFX1132-NEXT: ; implicit-def: $vgpr1 440; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 441; GFX1132-NEXT: s_cbranch_execz .LBB1_2 442; GFX1132-NEXT: ; %bb.1: 443; GFX1132-NEXT: s_bcnt1_i32_b32 s2, s2 444; GFX1132-NEXT: v_mov_b32_e32 v1, 0 445; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 446; GFX1132-NEXT: s_mul_i32 s2, s0, s2 447; GFX1132-NEXT: v_mov_b32_e32 v2, s2 448; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 449; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 450; GFX1132-NEXT: ds_add_rtn_u32 v1, v1, v2 451; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 452; GFX1132-NEXT: buffer_gl0_inv 453; GFX1132-NEXT: .LBB1_2: 454; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s1 455; GFX1132-NEXT: v_readfirstlane_b32 s2, v1 456; GFX1132-NEXT: s_mov_b32 s7, 0x31016000 457; GFX1132-NEXT: s_mov_b32 s6, -1 458; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 459; GFX1132-NEXT: v_mad_u64_u32 v[1:2], s0, s0, v0, s[2:3] 460; GFX1132-NEXT: buffer_store_b32 v1, off, s[4:7], 0 461; GFX1132-NEXT: s_endpgm 462entry: 463 %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %additive acq_rel 464 store i32 %old, i32 addrspace(1)* %out 465 ret void 466} 467 468define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out) { 469; 470; 471; GFX7LESS-LABEL: add_i32_varying: 472; GFX7LESS: ; %bb.0: ; %entry 473; GFX7LESS-NEXT: 
s_load_dwordx2 s[0:1], s[0:1], 0x9 474; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 475; GFX7LESS-NEXT: s_mov_b32 m0, -1 476; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 477; GFX7LESS-NEXT: ds_add_rtn_u32 v0, v1, v0 478; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 479; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 480; GFX7LESS-NEXT: s_mov_b32 s2, -1 481; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 482; GFX7LESS-NEXT: s_endpgm 483; 484; GFX8-LABEL: add_i32_varying: 485; GFX8: ; %bb.0: ; %entry 486; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 487; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 488; GFX8-NEXT: v_mov_b32_e32 v1, 0 489; GFX8-NEXT: s_mov_b64 exec, s[2:3] 490; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 491; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 492; GFX8-NEXT: v_mov_b32_e32 v2, v0 493; GFX8-NEXT: s_not_b64 exec, exec 494; GFX8-NEXT: v_mov_b32_e32 v2, 0 495; GFX8-NEXT: s_not_b64 exec, exec 496; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 497; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 498; GFX8-NEXT: s_nop 1 499; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 500; GFX8-NEXT: s_nop 1 501; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 502; GFX8-NEXT: s_nop 1 503; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 504; GFX8-NEXT: s_nop 1 505; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 506; GFX8-NEXT: s_nop 1 507; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 508; GFX8-NEXT: v_readlane_b32 s4, v2, 63 509; GFX8-NEXT: s_nop 0 510; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 511; GFX8-NEXT: s_mov_b64 exec, s[2:3] 512; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 513; GFX8-NEXT: ; implicit-def: $vgpr0 514; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 515; GFX8-NEXT: s_cbranch_execz .LBB2_2 516; GFX8-NEXT: ; %bb.1: 517; 
GFX8-NEXT: v_mov_b32_e32 v0, 0 518; GFX8-NEXT: v_mov_b32_e32 v3, s4 519; GFX8-NEXT: s_mov_b32 m0, -1 520; GFX8-NEXT: s_waitcnt lgkmcnt(0) 521; GFX8-NEXT: ds_add_rtn_u32 v0, v0, v3 522; GFX8-NEXT: s_waitcnt lgkmcnt(0) 523; GFX8-NEXT: .LBB2_2: 524; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 525; GFX8-NEXT: s_waitcnt lgkmcnt(0) 526; GFX8-NEXT: v_readfirstlane_b32 s2, v0 527; GFX8-NEXT: v_mov_b32_e32 v0, v1 528; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0 529; GFX8-NEXT: s_mov_b32 s3, 0xf000 530; GFX8-NEXT: s_mov_b32 s2, -1 531; GFX8-NEXT: s_nop 0 532; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 533; GFX8-NEXT: s_endpgm 534; 535; GFX9-LABEL: add_i32_varying: 536; GFX9: ; %bb.0: ; %entry 537; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 538; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 539; GFX9-NEXT: v_mov_b32_e32 v1, 0 540; GFX9-NEXT: s_mov_b64 exec, s[2:3] 541; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 542; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 543; GFX9-NEXT: v_mov_b32_e32 v2, v0 544; GFX9-NEXT: s_not_b64 exec, exec 545; GFX9-NEXT: v_mov_b32_e32 v2, 0 546; GFX9-NEXT: s_not_b64 exec, exec 547; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 548; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 549; GFX9-NEXT: s_nop 1 550; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 551; GFX9-NEXT: s_nop 1 552; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 553; GFX9-NEXT: s_nop 1 554; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 555; GFX9-NEXT: s_nop 1 556; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 557; GFX9-NEXT: s_nop 1 558; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 559; GFX9-NEXT: v_readlane_b32 s4, v2, 63 560; GFX9-NEXT: s_nop 0 561; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 562; GFX9-NEXT: s_mov_b64 exec, s[2:3] 563; GFX9-NEXT: 
v_cmp_eq_u32_e32 vcc, 0, v3 564; GFX9-NEXT: ; implicit-def: $vgpr0 565; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 566; GFX9-NEXT: s_cbranch_execz .LBB2_2 567; GFX9-NEXT: ; %bb.1: 568; GFX9-NEXT: v_mov_b32_e32 v0, 0 569; GFX9-NEXT: v_mov_b32_e32 v3, s4 570; GFX9-NEXT: s_waitcnt lgkmcnt(0) 571; GFX9-NEXT: ds_add_rtn_u32 v0, v0, v3 572; GFX9-NEXT: s_waitcnt lgkmcnt(0) 573; GFX9-NEXT: .LBB2_2: 574; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 575; GFX9-NEXT: s_waitcnt lgkmcnt(0) 576; GFX9-NEXT: v_readfirstlane_b32 s2, v0 577; GFX9-NEXT: v_mov_b32_e32 v0, v1 578; GFX9-NEXT: v_add_u32_e32 v0, s2, v0 579; GFX9-NEXT: s_mov_b32 s3, 0xf000 580; GFX9-NEXT: s_mov_b32 s2, -1 581; GFX9-NEXT: s_nop 0 582; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 583; GFX9-NEXT: s_endpgm 584; 585; GFX1064-LABEL: add_i32_varying: 586; GFX1064: ; %bb.0: ; %entry 587; GFX1064-NEXT: v_mov_b32_e32 v1, v0 588; GFX1064-NEXT: s_not_b64 exec, exec 589; GFX1064-NEXT: v_mov_b32_e32 v1, 0 590; GFX1064-NEXT: s_not_b64 exec, exec 591; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 592; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 593; GFX1064-NEXT: v_mov_b32_e32 v3, 0 594; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 595; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 596; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 597; GFX1064-NEXT: v_mov_b32_e32 v2, v1 598; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 599; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 600; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 601; GFX1064-NEXT: v_mov_b32_e32 v2, s4 602; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 603; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 604; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 605; GFX1064-NEXT: s_mov_b64 exec, 
s[2:3] 606; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 607; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 608; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 609; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 610; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 611; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 612; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 613; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 614; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 615; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 616; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 617; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 618; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 619; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 620; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 621; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 622; GFX1064-NEXT: s_mov_b32 s2, -1 623; GFX1064-NEXT: ; implicit-def: $vgpr0 624; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 625; GFX1064-NEXT: s_cbranch_execz .LBB2_2 626; GFX1064-NEXT: ; %bb.1: 627; GFX1064-NEXT: v_mov_b32_e32 v0, 0 628; GFX1064-NEXT: v_mov_b32_e32 v4, s7 629; GFX1064-NEXT: s_mov_b32 s3, s7 630; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 631; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 632; GFX1064-NEXT: ds_add_rtn_u32 v0, v0, v4 633; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 634; GFX1064-NEXT: buffer_gl0_inv 635; GFX1064-NEXT: .LBB2_2: 636; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 637; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 638; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 639; GFX1064-NEXT: v_mov_b32_e32 v0, v3 640; GFX1064-NEXT: v_add_nc_u32_e32 v0, s3, v0 641; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 642; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 643; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 644; GFX1064-NEXT: s_endpgm 645; 646; GFX1032-LABEL: add_i32_varying: 647; GFX1032: ; %bb.0: ; %entry 648; GFX1032-NEXT: v_mov_b32_e32 v1, v0 649; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 650; GFX1032-NEXT: v_mov_b32_e32 v1, 0 651; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 652; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 653; 
GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 654; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 655; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 656; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 657; GFX1032-NEXT: v_mov_b32_e32 v2, v1 658; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 659; GFX1032-NEXT: s_mov_b32 exec_lo, s2 660; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 661; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 662; GFX1032-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 663; GFX1032-NEXT: v_mov_b32_e32 v3, 0 664; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 665; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 666; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 667; GFX1032-NEXT: s_mov_b32 exec_lo, s2 668; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 669; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 670; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 671; GFX1032-NEXT: s_mov_b32 exec_lo, s2 672; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 673; GFX1032-NEXT: s_mov_b32 s2, -1 674; GFX1032-NEXT: ; implicit-def: $vgpr0 675; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 676; GFX1032-NEXT: s_cbranch_execz .LBB2_2 677; GFX1032-NEXT: ; %bb.1: 678; GFX1032-NEXT: v_mov_b32_e32 v0, 0 679; GFX1032-NEXT: v_mov_b32_e32 v4, s4 680; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 681; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 682; GFX1032-NEXT: ds_add_rtn_u32 v0, v0, v4 683; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 684; GFX1032-NEXT: buffer_gl0_inv 685; GFX1032-NEXT: .LBB2_2: 686; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 687; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 688; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 689; GFX1032-NEXT: v_mov_b32_e32 v0, v3 690; GFX1032-NEXT: v_add_nc_u32_e32 v0, s3, v0 691; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 692; GFX1032-NEXT: 
s_waitcnt lgkmcnt(0) 693; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 694; GFX1032-NEXT: s_endpgm 695; 696; GFX1164-LABEL: add_i32_varying: 697; GFX1164: ; %bb.0: ; %entry 698; GFX1164-NEXT: v_mov_b32_e32 v1, v0 699; GFX1164-NEXT: s_not_b64 exec, exec 700; GFX1164-NEXT: v_mov_b32_e32 v1, 0 701; GFX1164-NEXT: s_not_b64 exec, exec 702; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 703; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 704; GFX1164-NEXT: v_mov_b32_e32 v3, 0 705; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 706; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 707; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 708; GFX1164-NEXT: v_mov_b32_e32 v2, v1 709; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1 710; GFX1164-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 711; GFX1164-NEXT: v_readlane_b32 s4, v1, 31 712; GFX1164-NEXT: v_mov_b32_e32 v2, s4 713; GFX1164-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 714; GFX1164-NEXT: v_readlane_b32 s4, v1, 15 715; GFX1164-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 716; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 717; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 718; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 719; GFX1164-NEXT: v_readlane_b32 s5, v1, 31 720; GFX1164-NEXT: v_writelane_b32 v3, s4, 16 721; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 722; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 723; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 724; GFX1164-NEXT: v_readlane_b32 s7, v1, 63 725; GFX1164-NEXT: v_readlane_b32 s6, v1, 47 726; GFX1164-NEXT: v_writelane_b32 v3, s5, 32 727; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 728; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 729; GFX1164-NEXT: s_or_saveexec_b64 s[4:5], -1 730; GFX1164-NEXT: v_writelane_b32 v3, s6, 
48 731; GFX1164-NEXT: s_mov_b64 exec, s[4:5] 732; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 733; GFX1164-NEXT: s_mov_b32 s2, -1 734; GFX1164-NEXT: ; implicit-def: $vgpr0 735; GFX1164-NEXT: s_and_saveexec_b64 s[4:5], vcc 736; GFX1164-NEXT: s_cbranch_execz .LBB2_2 737; GFX1164-NEXT: ; %bb.1: 738; GFX1164-NEXT: v_mov_b32_e32 v0, 0 739; GFX1164-NEXT: v_mov_b32_e32 v4, s7 740; GFX1164-NEXT: s_mov_b32 s3, s7 741; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 742; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 743; GFX1164-NEXT: ds_add_rtn_u32 v0, v0, v4 744; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 745; GFX1164-NEXT: buffer_gl0_inv 746; GFX1164-NEXT: .LBB2_2: 747; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] 748; GFX1164-NEXT: v_readfirstlane_b32 s3, v0 749; GFX1164-NEXT: v_mov_b32_e32 v0, v3 750; GFX1164-NEXT: v_add_nc_u32_e32 v0, s3, v0 751; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 752; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 753; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 754; GFX1164-NEXT: s_endpgm 755; 756; GFX1132-LABEL: add_i32_varying: 757; GFX1132: ; %bb.0: ; %entry 758; GFX1132-NEXT: v_mov_b32_e32 v1, v0 759; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo 760; GFX1132-NEXT: v_mov_b32_e32 v1, 0 761; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo 762; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 763; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 764; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 765; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 766; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 767; GFX1132-NEXT: v_mov_b32_e32 v2, v1 768; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1 769; GFX1132-NEXT: s_mov_b32 exec_lo, s2 770; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 771; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 772; GFX1132-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 773; 
GFX1132-NEXT: v_mov_b32_e32 v3, 0 774; GFX1132-NEXT: v_readlane_b32 s3, v1, 15 775; GFX1132-NEXT: v_readlane_b32 s4, v1, 31 776; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 777; GFX1132-NEXT: s_mov_b32 exec_lo, s2 778; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 779; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 780; GFX1132-NEXT: v_writelane_b32 v3, s3, 16 781; GFX1132-NEXT: s_mov_b32 exec_lo, s2 782; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 783; GFX1132-NEXT: s_mov_b32 s2, -1 784; GFX1132-NEXT: ; implicit-def: $vgpr0 785; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo 786; GFX1132-NEXT: s_cbranch_execz .LBB2_2 787; GFX1132-NEXT: ; %bb.1: 788; GFX1132-NEXT: v_mov_b32_e32 v0, 0 789; GFX1132-NEXT: v_mov_b32_e32 v4, s4 790; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 791; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 792; GFX1132-NEXT: ds_add_rtn_u32 v0, v0, v4 793; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 794; GFX1132-NEXT: buffer_gl0_inv 795; GFX1132-NEXT: .LBB2_2: 796; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 797; GFX1132-NEXT: v_readfirstlane_b32 s3, v0 798; GFX1132-NEXT: v_mov_b32_e32 v0, v3 799; GFX1132-NEXT: v_add_nc_u32_e32 v0, s3, v0 800; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 801; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 802; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 803; GFX1132-NEXT: s_endpgm 804entry: 805 %lane = call i32 @llvm.amdgcn.workitem.id.x() 806 %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %lane acq_rel 807 store i32 %old, i32 addrspace(1)* %out 808 ret void 809} 810 ; Per-lane addend (workitem id) whose result %old is never used, so the checks expect the non-returning ds_add_u32 form. GFX7LESS issues it directly from every active lane. GFX8 and newer first sum the addend across lanes with DPP adds and then issue one ds_add_u32 from the lane whose mbcnt is 0. 811define amdgpu_kernel void @add_i32_varying_nouse() { 812; GFX7LESS-LABEL: add_i32_varying_nouse: 813; GFX7LESS: ; %bb.0: ; %entry 814; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 815; GFX7LESS-NEXT: s_mov_b32 m0, -1 816; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 817; GFX7LESS-NEXT: ds_add_u32 v1, v0 818; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 819; GFX7LESS-NEXT: s_endpgm 820; 821; GFX8-LABEL: add_i32_varying_nouse: 822; GFX8: ; %bb.0: ; %entry 823; GFX8-NEXT: 
v_mbcnt_lo_u32_b32 v2, exec_lo, 0 824; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 825; GFX8-NEXT: v_mov_b32_e32 v1, v0 826; GFX8-NEXT: s_not_b64 exec, exec 827; GFX8-NEXT: v_mov_b32_e32 v1, 0 828; GFX8-NEXT: s_not_b64 exec, exec 829; GFX8-NEXT: s_or_saveexec_b64 s[0:1], -1 830; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 831; GFX8-NEXT: s_nop 1 832; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 833; GFX8-NEXT: s_nop 1 834; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 835; GFX8-NEXT: s_nop 1 836; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 837; GFX8-NEXT: s_nop 1 838; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf 839; GFX8-NEXT: s_nop 1 840; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf 841; GFX8-NEXT: v_readlane_b32 s2, v1, 63 842; GFX8-NEXT: s_mov_b64 exec, s[0:1] 843; GFX8-NEXT: s_mov_b32 s0, s2 844; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 845; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 846; GFX8-NEXT: s_cbranch_execz .LBB3_2 847; GFX8-NEXT: ; %bb.1: 848; GFX8-NEXT: v_mov_b32_e32 v0, 0 849; GFX8-NEXT: v_mov_b32_e32 v2, s0 850; GFX8-NEXT: s_mov_b32 m0, -1 851; GFX8-NEXT: s_waitcnt lgkmcnt(0) 852; GFX8-NEXT: ds_add_u32 v0, v2 853; GFX8-NEXT: s_waitcnt lgkmcnt(0) 854; GFX8-NEXT: .LBB3_2: 855; GFX8-NEXT: s_endpgm 856; 857; GFX9-LABEL: add_i32_varying_nouse: 858; GFX9: ; %bb.0: ; %entry 859; GFX9-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 860; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 861; GFX9-NEXT: v_mov_b32_e32 v1, v0 862; GFX9-NEXT: s_not_b64 exec, exec 863; GFX9-NEXT: v_mov_b32_e32 v1, 0 864; GFX9-NEXT: s_not_b64 exec, exec 865; GFX9-NEXT: s_or_saveexec_b64 s[0:1], -1 866; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 867; GFX9-NEXT: s_nop 1 868; GFX9-NEXT: v_add_u32_dpp 
v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 869; GFX9-NEXT: s_nop 1 870; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 871; GFX9-NEXT: s_nop 1 872; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 873; GFX9-NEXT: s_nop 1 874; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf 875; GFX9-NEXT: s_nop 1 876; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf 877; GFX9-NEXT: v_readlane_b32 s2, v1, 63 878; GFX9-NEXT: s_mov_b64 exec, s[0:1] 879; GFX9-NEXT: s_mov_b32 s0, s2 880; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 881; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 882; GFX9-NEXT: s_cbranch_execz .LBB3_2 883; GFX9-NEXT: ; %bb.1: 884; GFX9-NEXT: v_mov_b32_e32 v0, 0 885; GFX9-NEXT: v_mov_b32_e32 v2, s0 886; GFX9-NEXT: s_waitcnt lgkmcnt(0) 887; GFX9-NEXT: ds_add_u32 v0, v2 888; GFX9-NEXT: s_waitcnt lgkmcnt(0) 889; GFX9-NEXT: .LBB3_2: 890; GFX9-NEXT: s_endpgm 891; 892; GFX1064-LABEL: add_i32_varying_nouse: 893; GFX1064: ; %bb.0: ; %entry 894; GFX1064-NEXT: v_mov_b32_e32 v1, v0 895; GFX1064-NEXT: s_not_b64 exec, exec 896; GFX1064-NEXT: v_mov_b32_e32 v1, 0 897; GFX1064-NEXT: s_not_b64 exec, exec 898; GFX1064-NEXT: s_or_saveexec_b64 s[0:1], -1 899; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 900; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 901; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 902; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 903; GFX1064-NEXT: v_mov_b32_e32 v2, v1 904; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 905; GFX1064-NEXT: v_add_nc_u32_e32 v1, v1, v2 906; GFX1064-NEXT: s_mov_b64 exec, s[0:1] 907; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 908; GFX1064-NEXT: s_or_saveexec_b64 s[0:1], -1 909; GFX1064-NEXT: 
v_readlane_b32 s2, v1, 0 910; GFX1064-NEXT: v_readlane_b32 s3, v1, 32 911; GFX1064-NEXT: s_mov_b64 exec, s[0:1] 912; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 913; GFX1064-NEXT: s_add_i32 s0, s2, s3 914; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 915; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc 916; GFX1064-NEXT: s_cbranch_execz .LBB3_2 917; GFX1064-NEXT: ; %bb.1: 918; GFX1064-NEXT: v_mov_b32_e32 v0, 0 919; GFX1064-NEXT: v_mov_b32_e32 v3, s0 920; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 921; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 922; GFX1064-NEXT: ds_add_u32 v0, v3 923; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 924; GFX1064-NEXT: buffer_gl0_inv 925; GFX1064-NEXT: .LBB3_2: 926; GFX1064-NEXT: s_endpgm 927; 928; GFX1032-LABEL: add_i32_varying_nouse: 929; GFX1032: ; %bb.0: ; %entry 930; GFX1032-NEXT: v_mov_b32_e32 v1, v0 931; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 932; GFX1032-NEXT: v_mov_b32_e32 v1, 0 933; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 934; GFX1032-NEXT: s_or_saveexec_b32 s0, -1 935; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 936; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 937; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 938; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 939; GFX1032-NEXT: v_mov_b32_e32 v2, v1 940; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 941; GFX1032-NEXT: v_add_nc_u32_e32 v1, v1, v2 942; GFX1032-NEXT: s_mov_b32 exec_lo, s0 943; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 944; GFX1032-NEXT: v_mov_b32_e32 v0, v1 945; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 946; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo 947; GFX1032-NEXT: s_cbranch_execz .LBB3_2 948; GFX1032-NEXT: ; %bb.1: 949; GFX1032-NEXT: v_mov_b32_e32 v3, 0 950; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 951; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 952; GFX1032-NEXT: 
ds_add_u32 v3, v0 953; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 954; GFX1032-NEXT: buffer_gl0_inv 955; GFX1032-NEXT: .LBB3_2: 956; GFX1032-NEXT: s_endpgm 957; 958; GFX1164-LABEL: add_i32_varying_nouse: 959; GFX1164: ; %bb.0: ; %entry 960; GFX1164-NEXT: v_mov_b32_e32 v1, v0 961; GFX1164-NEXT: s_not_b64 exec, exec 962; GFX1164-NEXT: v_mov_b32_e32 v1, 0 963; GFX1164-NEXT: s_not_b64 exec, exec 964; GFX1164-NEXT: s_or_saveexec_b64 s[0:1], -1 965; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 966; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 967; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 968; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 969; GFX1164-NEXT: v_mov_b32_e32 v2, v1 970; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1 971; GFX1164-NEXT: v_add_nc_u32_e32 v1, v1, v2 972; GFX1164-NEXT: s_mov_b64 exec, s[0:1] 973; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 974; GFX1164-NEXT: s_or_saveexec_b64 s[0:1], -1 975; GFX1164-NEXT: v_readlane_b32 s2, v1, 0 976; GFX1164-NEXT: v_readlane_b32 s3, v1, 32 977; GFX1164-NEXT: s_mov_b64 exec, s[0:1] 978; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 979; GFX1164-NEXT: s_add_i32 s0, s2, s3 980; GFX1164-NEXT: s_mov_b64 s[2:3], exec 981; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 982; GFX1164-NEXT: s_cbranch_execz .LBB3_2 983; GFX1164-NEXT: ; %bb.1: 984; GFX1164-NEXT: v_mov_b32_e32 v0, 0 985; GFX1164-NEXT: v_mov_b32_e32 v3, s0 986; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 987; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 988; GFX1164-NEXT: ds_add_u32 v0, v3 989; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 990; GFX1164-NEXT: buffer_gl0_inv 991; GFX1164-NEXT: .LBB3_2: 992; GFX1164-NEXT: s_endpgm 993; 994; GFX1132-LABEL: add_i32_varying_nouse: 995; GFX1132: ; %bb.0: ; %entry 996; GFX1132-NEXT: v_mov_b32_e32 v1, v0 997; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo 
998; GFX1132-NEXT: v_mov_b32_e32 v1, 0 999; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo 1000; GFX1132-NEXT: s_or_saveexec_b32 s0, -1 1001; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 1002; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 1003; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 1004; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 1005; GFX1132-NEXT: v_mov_b32_e32 v2, v1 1006; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1 1007; GFX1132-NEXT: v_add_nc_u32_e32 v1, v1, v2 1008; GFX1132-NEXT: s_mov_b32 exec_lo, s0 1009; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 1010; GFX1132-NEXT: v_mov_b32_e32 v0, v1 1011; GFX1132-NEXT: s_mov_b32 s0, exec_lo 1012; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v3 1013; GFX1132-NEXT: s_cbranch_execz .LBB3_2 1014; GFX1132-NEXT: ; %bb.1: 1015; GFX1132-NEXT: v_mov_b32_e32 v3, 0 1016; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1017; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 1018; GFX1132-NEXT: ds_add_u32 v3, v0 1019; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 1020; GFX1132-NEXT: buffer_gl0_inv 1021; GFX1132-NEXT: .LBB3_2: 1022; GFX1132-NEXT: s_endpgm 1023entry: 1024 %lane = call i32 @llvm.amdgcn.workitem.id.x() 1025 %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %lane acq_rel 1026 ret void 1027} 1028 ; Constant i64 addend (5) whose result is stored to %out. The checks expect one elected lane (mbcnt == 0) to add 5 * popcount(exec) with a single ds_add_rtn_u64. Each lane then rebuilds its own result as the readfirstlane'd return value plus 5 * mbcnt. 1029define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) { 1030; 1031; 1032; GFX7LESS-LABEL: add_i64_constant: 1033; GFX7LESS: ; %bb.0: ; %entry 1034; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec 1035; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1036; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 1037; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s5, v0 1038; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 1039; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 1040; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc 1041; GFX7LESS-NEXT: s_cbranch_execz 
.LBB4_2 1042; GFX7LESS-NEXT: ; %bb.1: 1043; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 1044; GFX7LESS-NEXT: s_mul_i32 s4, s4, 5 1045; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 1046; GFX7LESS-NEXT: v_mov_b32_e32 v0, s4 1047; GFX7LESS-NEXT: s_mov_b32 m0, -1 1048; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1049; GFX7LESS-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] 1050; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1051; GFX7LESS-NEXT: .LBB4_2: 1052; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] 1053; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1054; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v0 1055; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1 1056; GFX7LESS-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 1057; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v2 1058; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 1059; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4 1060; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s2, v0 1061; GFX7LESS-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc 1062; GFX7LESS-NEXT: s_mov_b32 s2, -1 1063; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1064; GFX7LESS-NEXT: s_endpgm 1065; 1066; GFX8-LABEL: add_i64_constant: 1067; GFX8: ; %bb.0: ; %entry 1068; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1069; GFX8-NEXT: s_mov_b64 s[4:5], exec 1070; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 1071; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 1072; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 1073; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 1074; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 1075; GFX8-NEXT: s_cbranch_execz .LBB4_2 1076; GFX8-NEXT: ; %bb.1: 1077; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 1078; GFX8-NEXT: s_mul_i32 s4, s4, 5 1079; GFX8-NEXT: v_mov_b32_e32 v0, s4 1080; GFX8-NEXT: v_mov_b32_e32 v1, 0 1081; GFX8-NEXT: s_mov_b32 m0, -1 1082; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1083; GFX8-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] 1084; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1085; GFX8-NEXT: .LBB4_2: 1086; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 1087; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1088; GFX8-NEXT: v_readfirstlane_b32 s2, v0 1089; GFX8-NEXT: 
v_readfirstlane_b32 s3, v1 1090; GFX8-NEXT: v_mov_b32_e32 v0, s2 1091; GFX8-NEXT: v_mov_b32_e32 v1, s3 1092; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v2, 5, v[0:1] 1093; GFX8-NEXT: s_mov_b32 s3, 0xf000 1094; GFX8-NEXT: s_mov_b32 s2, -1 1095; GFX8-NEXT: s_nop 2 1096; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1097; GFX8-NEXT: s_endpgm 1098; 1099; GFX9-LABEL: add_i64_constant: 1100; GFX9: ; %bb.0: ; %entry 1101; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1102; GFX9-NEXT: s_mov_b64 s[4:5], exec 1103; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 1104; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 1105; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 1106; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 1107; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 1108; GFX9-NEXT: s_cbranch_execz .LBB4_2 1109; GFX9-NEXT: ; %bb.1: 1110; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 1111; GFX9-NEXT: s_mul_i32 s4, s4, 5 1112; GFX9-NEXT: v_mov_b32_e32 v0, s4 1113; GFX9-NEXT: v_mov_b32_e32 v1, 0 1114; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1115; GFX9-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] 1116; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1117; GFX9-NEXT: .LBB4_2: 1118; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 1119; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1120; GFX9-NEXT: v_readfirstlane_b32 s2, v0 1121; GFX9-NEXT: v_readfirstlane_b32 s3, v1 1122; GFX9-NEXT: v_mov_b32_e32 v0, s2 1123; GFX9-NEXT: v_mov_b32_e32 v1, s3 1124; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v2, 5, v[0:1] 1125; GFX9-NEXT: s_mov_b32 s3, 0xf000 1126; GFX9-NEXT: s_mov_b32 s2, -1 1127; GFX9-NEXT: s_nop 2 1128; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1129; GFX9-NEXT: s_endpgm 1130; 1131; GFX1064-LABEL: add_i64_constant: 1132; GFX1064: ; %bb.0: ; %entry 1133; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1134; GFX1064-NEXT: s_mov_b64 s[4:5], exec 1135; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 1136; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 1137; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 1138; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 1139; 
GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc 1140; GFX1064-NEXT: s_cbranch_execz .LBB4_2 1141; GFX1064-NEXT: ; %bb.1: 1142; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 1143; GFX1064-NEXT: v_mov_b32_e32 v1, 0 1144; GFX1064-NEXT: s_mul_i32 s4, s4, 5 1145; GFX1064-NEXT: v_mov_b32_e32 v0, s4 1146; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1147; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 1148; GFX1064-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] 1149; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1150; GFX1064-NEXT: buffer_gl0_inv 1151; GFX1064-NEXT: .LBB4_2: 1152; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 1153; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] 1154; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 1155; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 1156; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v2, 5, s[2:3] 1157; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 1158; GFX1064-NEXT: s_mov_b32 s2, -1 1159; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1160; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1161; GFX1064-NEXT: s_endpgm 1162; 1163; GFX1032-LABEL: add_i64_constant: 1164; GFX1032: ; %bb.0: ; %entry 1165; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1166; GFX1032-NEXT: s_mov_b32 s3, exec_lo 1167; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 1168; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s3, 0 1169; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 1170; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo 1171; GFX1032-NEXT: s_cbranch_execz .LBB4_2 1172; GFX1032-NEXT: ; %bb.1: 1173; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 1174; GFX1032-NEXT: v_mov_b32_e32 v1, 0 1175; GFX1032-NEXT: s_mul_i32 s3, s3, 5 1176; GFX1032-NEXT: v_mov_b32_e32 v0, s3 1177; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1178; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 1179; GFX1032-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] 1180; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1181; GFX1032-NEXT: buffer_gl0_inv 1182; GFX1032-NEXT: .LBB4_2: 1183; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 1184; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 1185; GFX1032-NEXT: 
v_readfirstlane_b32 s2, v0 1186; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 1187; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s2, v2, 5, s[2:3] 1188; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 1189; GFX1032-NEXT: s_mov_b32 s2, -1 1190; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1191; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1192; GFX1032-NEXT: s_endpgm 1193; 1194; GFX1164-LABEL: add_i64_constant: 1195; GFX1164: ; %bb.0: ; %entry 1196; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 1197; GFX1164-NEXT: s_mov_b64 s[4:5], exec 1198; GFX1164-NEXT: s_mov_b64 s[2:3], exec 1199; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 1200; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 1201; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 1202; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 1203; GFX1164-NEXT: s_cbranch_execz .LBB4_2 1204; GFX1164-NEXT: ; %bb.1: 1205; GFX1164-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 1206; GFX1164-NEXT: v_mov_b32_e32 v1, 0 1207; GFX1164-NEXT: s_mul_i32 s4, s4, 5 1208; GFX1164-NEXT: v_mov_b32_e32 v0, s4 1209; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1210; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 1211; GFX1164-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] 1212; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 1213; GFX1164-NEXT: buffer_gl0_inv 1214; GFX1164-NEXT: .LBB4_2: 1215; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] 1216; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 1217; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 1218; GFX1164-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v2, 5, s[2:3] 1219; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 1220; GFX1164-NEXT: s_mov_b32 s2, -1 1221; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 1222; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 1223; GFX1164-NEXT: s_endpgm 1224; 1225; GFX1132-LABEL: add_i64_constant: 1226; GFX1132: ; %bb.0: ; %entry 1227; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 1228; GFX1132-NEXT: s_mov_b32 s3, exec_lo 1229; GFX1132-NEXT: s_mov_b32 s2, exec_lo 1230; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s3, 0 1231; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 1232; 
GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 1233; GFX1132-NEXT: s_cbranch_execz .LBB4_2 1234; GFX1132-NEXT: ; %bb.1: 1235; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s3 1236; GFX1132-NEXT: v_mov_b32_e32 v1, 0 1237; GFX1132-NEXT: s_mul_i32 s3, s3, 5 1238; GFX1132-NEXT: v_mov_b32_e32 v0, s3 1239; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1240; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 1241; GFX1132-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] 1242; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 1243; GFX1132-NEXT: buffer_gl0_inv 1244; GFX1132-NEXT: .LBB4_2: 1245; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2 1246; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 1247; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 1248; GFX1132-NEXT: v_mad_u64_u32 v[0:1], s2, v2, 5, s[2:3] 1249; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 1250; GFX1132-NEXT: s_mov_b32 s2, -1 1251; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 1252; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 1253; GFX1132-NEXT: s_endpgm 1254entry: 1255 %old = atomicrmw add i64 addrspace(3)* @local_var64, i64 5 acq_rel 1256 store i64 %old, i64 addrspace(1)* %out 1257 ret void 1258} 1259 ; Uniform i64 addend taken from the kernel argument %additive. The checks expect the same single-lane scheme as @add_i64_constant, except that the elected lane adds %additive * popcount(exec), expanded into a 64-bit multiply, and each lane adds %additive * mbcnt to the readfirstlane'd return value. 1260define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 %additive) { 1261; 1262; 1263; GFX7LESS-LABEL: add_i64_uniform: 1264; GFX7LESS: ; %bb.0: ; %entry 1265; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec 1266; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 1267; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 1268; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s7, v0 1269; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 1270; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 1271; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc 1272; GFX7LESS-NEXT: s_cbranch_execz .LBB5_2 1273; GFX7LESS-NEXT: ; %bb.1: 1274; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 1275; GFX7LESS-NEXT: v_mov_b32_e32 v3, 0 1276; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1277; GFX7LESS-NEXT: s_mul_i32 s7, s3, s6 1278; GFX7LESS-NEXT: v_mov_b32_e32 v0, s6 1279; GFX7LESS-NEXT: v_mul_hi_u32 v0, s2, v0 1280; GFX7LESS-NEXT: s_mul_i32 
s6, s2, s6 1281; GFX7LESS-NEXT: v_add_i32_e32 v1, vcc, s7, v0 1282; GFX7LESS-NEXT: v_mov_b32_e32 v0, s6 1283; GFX7LESS-NEXT: s_mov_b32 m0, -1 1284; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1285; GFX7LESS-NEXT: ds_add_rtn_u64 v[0:1], v3, v[0:1] 1286; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1287; GFX7LESS-NEXT: .LBB5_2: 1288; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] 1289; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 1290; GFX7LESS-NEXT: s_mov_b32 s6, -1 1291; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1292; GFX7LESS-NEXT: s_mov_b32 s4, s0 1293; GFX7LESS-NEXT: s_mov_b32 s5, s1 1294; GFX7LESS-NEXT: v_readfirstlane_b32 s0, v0 1295; GFX7LESS-NEXT: v_readfirstlane_b32 s1, v1 1296; GFX7LESS-NEXT: v_mul_lo_u32 v0, s3, v2 1297; GFX7LESS-NEXT: v_mul_hi_u32 v1, s2, v2 1298; GFX7LESS-NEXT: v_mul_lo_u32 v2, s2, v2 1299; GFX7LESS-NEXT: v_add_i32_e32 v1, vcc, v1, v0 1300; GFX7LESS-NEXT: v_mov_b32_e32 v3, s1 1301; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s0, v2 1302; GFX7LESS-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc 1303; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1304; GFX7LESS-NEXT: s_endpgm 1305; 1306; GFX8-LABEL: add_i64_uniform: 1307; GFX8: ; %bb.0: ; %entry 1308; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1309; GFX8-NEXT: s_mov_b64 s[6:7], exec 1310; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 1311; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 1312; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 1313; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 1314; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 1315; GFX8-NEXT: s_cbranch_execz .LBB5_2 1316; GFX8-NEXT: ; %bb.1: 1317; GFX8-NEXT: s_bcnt1_i32_b64 s8, s[6:7] 1318; GFX8-NEXT: v_mov_b32_e32 v0, s8 1319; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1320; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[6:7], s2, v0, 0 1321; GFX8-NEXT: s_mul_i32 s6, s3, s8 1322; GFX8-NEXT: v_mov_b32_e32 v3, 0 1323; GFX8-NEXT: v_add_u32_e32 v1, vcc, s6, v1 1324; GFX8-NEXT: s_mov_b32 m0, -1 1325; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1326; GFX8-NEXT: ds_add_rtn_u64 v[0:1], v3, v[0:1] 1327; GFX8-NEXT: s_waitcnt 
lgkmcnt(0) 1328; GFX8-NEXT: .LBB5_2: 1329; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 1330; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1331; GFX8-NEXT: v_readfirstlane_b32 s4, v0 1332; GFX8-NEXT: v_readfirstlane_b32 s5, v1 1333; GFX8-NEXT: v_mov_b32_e32 v0, s4 1334; GFX8-NEXT: v_mov_b32_e32 v1, s5 1335; GFX8-NEXT: v_mul_lo_u32 v3, s3, v2 1336; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[2:3], s2, v2, v[0:1] 1337; GFX8-NEXT: s_mov_b32 s7, 0xf000 1338; GFX8-NEXT: s_mov_b32 s6, -1 1339; GFX8-NEXT: s_mov_b32 s4, s0 1340; GFX8-NEXT: s_mov_b32 s5, s1 1341; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 1342; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1343; GFX8-NEXT: s_endpgm 1344; 1345; GFX9-LABEL: add_i64_uniform: 1346; GFX9: ; %bb.0: ; %entry 1347; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1348; GFX9-NEXT: s_mov_b64 s[6:7], exec 1349; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 1350; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 1351; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 1352; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 1353; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 1354; GFX9-NEXT: s_cbranch_execz .LBB5_2 1355; GFX9-NEXT: ; %bb.1: 1356; GFX9-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 1357; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1358; GFX9-NEXT: s_mul_i32 s7, s3, s6 1359; GFX9-NEXT: s_mul_hi_u32 s8, s2, s6 1360; GFX9-NEXT: s_add_i32 s8, s8, s7 1361; GFX9-NEXT: s_mul_i32 s6, s2, s6 1362; GFX9-NEXT: v_mov_b32_e32 v0, s6 1363; GFX9-NEXT: v_mov_b32_e32 v1, s8 1364; GFX9-NEXT: v_mov_b32_e32 v3, 0 1365; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1366; GFX9-NEXT: ds_add_rtn_u64 v[0:1], v3, v[0:1] 1367; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1368; GFX9-NEXT: .LBB5_2: 1369; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 1370; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1371; GFX9-NEXT: v_readfirstlane_b32 s4, v0 1372; GFX9-NEXT: v_readfirstlane_b32 s5, v1 1373; GFX9-NEXT: v_mov_b32_e32 v0, s4 1374; GFX9-NEXT: v_mov_b32_e32 v1, s5 1375; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s2, v2, v[0:1] 1376; GFX9-NEXT: s_mov_b32 s7, 0xf000 1377; GFX9-NEXT: 
s_mov_b32 s6, -1 1378; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[2:3], s3, v2, v[1:2] 1379; GFX9-NEXT: s_mov_b32 s4, s0 1380; GFX9-NEXT: s_mov_b32 s5, s1 1381; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1382; GFX9-NEXT: s_endpgm 1383; 1384; GFX1064-LABEL: add_i64_uniform: 1385; GFX1064: ; %bb.0: ; %entry 1386; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1387; GFX1064-NEXT: s_mov_b64 s[6:7], exec 1388; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 1389; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 1390; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 1391; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 1392; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 1393; GFX1064-NEXT: s_cbranch_execz .LBB5_2 1394; GFX1064-NEXT: ; %bb.1: 1395; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 1396; GFX1064-NEXT: v_mov_b32_e32 v3, 0 1397; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1398; GFX1064-NEXT: s_mul_i32 s7, s3, s6 1399; GFX1064-NEXT: s_mul_hi_u32 s8, s2, s6 1400; GFX1064-NEXT: s_mul_i32 s6, s2, s6 1401; GFX1064-NEXT: s_add_i32 s8, s8, s7 1402; GFX1064-NEXT: v_mov_b32_e32 v0, s6 1403; GFX1064-NEXT: v_mov_b32_e32 v1, s8 1404; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1405; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 1406; GFX1064-NEXT: ds_add_rtn_u64 v[0:1], v3, v[0:1] 1407; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1408; GFX1064-NEXT: buffer_gl0_inv 1409; GFX1064-NEXT: .LBB5_2: 1410; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 1411; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 1412; GFX1064-NEXT: v_readfirstlane_b32 s4, v0 1413; GFX1064-NEXT: v_readfirstlane_b32 s5, v1 1414; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1415; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s2, v2, s[4:5] 1416; GFX1064-NEXT: v_mad_u64_u32 v[1:2], s[2:3], s3, v2, v[1:2] 1417; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 1418; GFX1064-NEXT: s_mov_b32 s2, -1 1419; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1420; GFX1064-NEXT: s_endpgm 1421; 1422; GFX1032-LABEL: add_i64_uniform: 1423; GFX1032: ; %bb.0: ; %entry 1424; GFX1032-NEXT: 
s_load_dwordx4 s[0:3], s[0:1], 0x24 1425; GFX1032-NEXT: s_mov_b32 s5, exec_lo 1426; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 1427; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0 1428; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 1429; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo 1430; GFX1032-NEXT: s_cbranch_execz .LBB5_2 1431; GFX1032-NEXT: ; %bb.1: 1432; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s5 1433; GFX1032-NEXT: v_mov_b32_e32 v3, 0 1434; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1435; GFX1032-NEXT: s_mul_i32 s6, s3, s5 1436; GFX1032-NEXT: s_mul_hi_u32 s7, s2, s5 1437; GFX1032-NEXT: s_mul_i32 s5, s2, s5 1438; GFX1032-NEXT: s_add_i32 s7, s7, s6 1439; GFX1032-NEXT: v_mov_b32_e32 v0, s5 1440; GFX1032-NEXT: v_mov_b32_e32 v1, s7 1441; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1442; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 1443; GFX1032-NEXT: ds_add_rtn_u64 v[0:1], v3, v[0:1] 1444; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1445; GFX1032-NEXT: buffer_gl0_inv 1446; GFX1032-NEXT: .LBB5_2: 1447; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 1448; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 1449; GFX1032-NEXT: v_readfirstlane_b32 s4, v0 1450; GFX1032-NEXT: v_readfirstlane_b32 s5, v1 1451; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1452; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s2, s2, v2, s[4:5] 1453; GFX1032-NEXT: v_mad_u64_u32 v[1:2], s2, s3, v2, v[1:2] 1454; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 1455; GFX1032-NEXT: s_mov_b32 s2, -1 1456; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1457; GFX1032-NEXT: s_endpgm 1458; 1459; GFX1164-LABEL: add_i64_uniform: 1460; GFX1164: ; %bb.0: ; %entry 1461; GFX1164-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 1462; GFX1164-NEXT: s_mov_b64 s[6:7], exec 1463; GFX1164-NEXT: s_mov_b64 s[4:5], exec 1464; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 1465; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 1466; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 1467; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 1468; GFX1164-NEXT: s_cbranch_execz .LBB5_2 1469; GFX1164-NEXT: ; %bb.1: 1470; 
GFX1164-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 1471; GFX1164-NEXT: v_mov_b32_e32 v3, 0 1472; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 1473; GFX1164-NEXT: s_mul_i32 s7, s3, s6 1474; GFX1164-NEXT: s_mul_hi_u32 s8, s2, s6 1475; GFX1164-NEXT: s_mul_i32 s6, s2, s6 1476; GFX1164-NEXT: s_add_i32 s8, s8, s7 1477; GFX1164-NEXT: v_mov_b32_e32 v0, s6 1478; GFX1164-NEXT: v_mov_b32_e32 v1, s8 1479; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1480; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 1481; GFX1164-NEXT: ds_add_rtn_u64 v[0:1], v3, v[0:1] 1482; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 1483; GFX1164-NEXT: buffer_gl0_inv 1484; GFX1164-NEXT: .LBB5_2: 1485; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] 1486; GFX1164-NEXT: v_readfirstlane_b32 s4, v0 1487; GFX1164-NEXT: v_readfirstlane_b32 s5, v1 1488; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 1489; GFX1164-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s2, v2, s[4:5] 1490; GFX1164-NEXT: v_mad_u64_u32 v[3:4], s[2:3], s3, v2, v[1:2] 1491; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 1492; GFX1164-NEXT: s_mov_b32 s2, -1 1493; GFX1164-NEXT: v_mov_b32_e32 v1, v3 1494; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 1495; GFX1164-NEXT: s_endpgm 1496; 1497; GFX1132-LABEL: add_i64_uniform: 1498; GFX1132: ; %bb.0: ; %entry 1499; GFX1132-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 1500; GFX1132-NEXT: s_mov_b32 s5, exec_lo 1501; GFX1132-NEXT: s_mov_b32 s4, exec_lo 1502; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0 1503; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 1504; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 1505; GFX1132-NEXT: s_cbranch_execz .LBB5_2 1506; GFX1132-NEXT: ; %bb.1: 1507; GFX1132-NEXT: s_bcnt1_i32_b32 s5, s5 1508; GFX1132-NEXT: v_mov_b32_e32 v3, 0 1509; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 1510; GFX1132-NEXT: s_mul_i32 s6, s3, s5 1511; GFX1132-NEXT: s_mul_hi_u32 s7, s2, s5 1512; GFX1132-NEXT: s_mul_i32 s5, s2, s5 1513; GFX1132-NEXT: s_add_i32 s7, s7, s6 1514; GFX1132-NEXT: v_mov_b32_e32 v0, s5 1515; GFX1132-NEXT: v_mov_b32_e32 v1, s7 1516; GFX1132-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) 1517; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 1518; GFX1132-NEXT: ds_add_rtn_u64 v[0:1], v3, v[0:1] 1519; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 1520; GFX1132-NEXT: buffer_gl0_inv 1521; GFX1132-NEXT: .LBB5_2: 1522; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s4 1523; GFX1132-NEXT: v_readfirstlane_b32 s4, v0 1524; GFX1132-NEXT: v_readfirstlane_b32 s5, v1 1525; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 1526; GFX1132-NEXT: v_mad_u64_u32 v[0:1], s2, s2, v2, s[4:5] 1527; GFX1132-NEXT: v_mad_u64_u32 v[3:4], s2, s3, v2, v[1:2] 1528; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 1529; GFX1132-NEXT: s_mov_b32 s2, -1 1530; GFX1132-NEXT: v_mov_b32_e32 v1, v3 1531; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 1532; GFX1132-NEXT: s_endpgm 1533entry: 1534 %old = atomicrmw add i64 addrspace(3)* @local_var64, i64 %additive acq_rel 1535 store i64 %old, i64 addrspace(1)* %out 1536 ret void 1537} 1538 1539define amdgpu_kernel void @add_i64_varying(i64 addrspace(1)* %out) { 1540; 1541; 1542; GFX7LESS-LABEL: add_i64_varying: 1543; GFX7LESS: ; %bb.0: ; %entry 1544; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1545; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 1546; GFX7LESS-NEXT: s_mov_b32 m0, -1 1547; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1548; GFX7LESS-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] 1549; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1550; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 1551; GFX7LESS-NEXT: s_mov_b32 s2, -1 1552; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1553; GFX7LESS-NEXT: s_endpgm 1554; 1555; GFX8-LABEL: add_i64_varying: 1556; GFX8: ; %bb.0: ; %entry 1557; GFX8-NEXT: v_mov_b32_e32 v1, 0 1558; GFX8-NEXT: s_mov_b32 m0, -1 1559; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1560; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1561; GFX8-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] 1562; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1563; GFX8-NEXT: s_mov_b32 s3, 0xf000 1564; GFX8-NEXT: s_mov_b32 s2, -1 1565; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1566; GFX8-NEXT: s_endpgm 1567; 1568; 
GFX9-LABEL: add_i64_varying: 1569; GFX9: ; %bb.0: ; %entry 1570; GFX9-NEXT: v_mov_b32_e32 v1, 0 1571; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1572; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1573; GFX9-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] 1574; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1575; GFX9-NEXT: s_mov_b32 s3, 0xf000 1576; GFX9-NEXT: s_mov_b32 s2, -1 1577; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1578; GFX9-NEXT: s_endpgm 1579; 1580; GFX10-LABEL: add_i64_varying: 1581; GFX10: ; %bb.0: ; %entry 1582; GFX10-NEXT: v_mov_b32_e32 v1, 0 1583; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1584; GFX10-NEXT: s_mov_b32 s3, 0x31016000 1585; GFX10-NEXT: s_mov_b32 s2, -1 1586; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1587; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1588; GFX10-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] 1589; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1590; GFX10-NEXT: buffer_gl0_inv 1591; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1592; GFX10-NEXT: s_endpgm 1593; 1594; GFX11-LABEL: add_i64_varying: 1595; GFX11: ; %bb.0: ; %entry 1596; GFX11-NEXT: v_mov_b32_e32 v1, 0 1597; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 1598; GFX11-NEXT: s_mov_b32 s3, 0x31016000 1599; GFX11-NEXT: s_mov_b32 s2, -1 1600; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1601; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 1602; GFX11-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] 1603; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1604; GFX11-NEXT: buffer_gl0_inv 1605; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 1606; GFX11-NEXT: s_endpgm 1607entry: 1608 %lane = call i32 @llvm.amdgcn.workitem.id.x() 1609 %zext = zext i32 %lane to i64 1610 %old = atomicrmw add i64 addrspace(3)* @local_var64, i64 %zext acq_rel 1611 store i64 %old, i64 addrspace(1)* %out 1612 ret void 1613} 1614 1615define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out) { 1616; Atomically subtracts the constant 5 from @local_var32 with an acq_rel atomicrmw sub and stores the returned old value to %out. 1617; The checks expect one ds_sub_rtn_u32 of 5 times the number of set bits in the saved exec mask, issued only by the lane whose mbcnt result is 0, after which every lane computes readfirstlane of the result minus 5 times its own mbcnt. The assertions are autogenerated, so regenerate them with the update script instead of editing by hand. 1618; GFX7LESS-LABEL: sub_i32_constant: 1619; GFX7LESS: ; %bb.0: ; %entry 1620; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec 1621; GFX7LESS-NEXT: 
s_load_dwordx2 s[0:1], s[0:1], 0x9 1622; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 1623; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 1624; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1625; GFX7LESS-NEXT: ; implicit-def: $vgpr1 1626; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc 1627; GFX7LESS-NEXT: s_cbranch_execz .LBB7_2 1628; GFX7LESS-NEXT: ; %bb.1: 1629; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1630; GFX7LESS-NEXT: s_mul_i32 s2, s2, 5 1631; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 1632; GFX7LESS-NEXT: v_mov_b32_e32 v2, s2 1633; GFX7LESS-NEXT: s_mov_b32 m0, -1 1634; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1635; GFX7LESS-NEXT: ds_sub_rtn_u32 v1, v1, v2 1636; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1637; GFX7LESS-NEXT: .LBB7_2: 1638; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] 1639; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1640; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v1 1641; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1642; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 1643; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 1644; GFX7LESS-NEXT: s_mov_b32 s2, -1 1645; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 1646; GFX7LESS-NEXT: s_endpgm 1647; 1648; GFX8-LABEL: sub_i32_constant: 1649; GFX8: ; %bb.0: ; %entry 1650; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1651; GFX8-NEXT: s_mov_b64 s[2:3], exec 1652; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1653; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 1654; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1655; GFX8-NEXT: ; implicit-def: $vgpr1 1656; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 1657; GFX8-NEXT: s_cbranch_execz .LBB7_2 1658; GFX8-NEXT: ; %bb.1: 1659; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1660; GFX8-NEXT: s_mul_i32 s2, s2, 5 1661; GFX8-NEXT: v_mov_b32_e32 v1, 0 1662; GFX8-NEXT: v_mov_b32_e32 v2, s2 1663; GFX8-NEXT: s_mov_b32 m0, -1 1664; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1665; GFX8-NEXT: ds_sub_rtn_u32 v1, v1, v2 1666; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1667; GFX8-NEXT: .LBB7_2: 1668; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 1669; GFX8-NEXT: 
s_waitcnt lgkmcnt(0) 1670; GFX8-NEXT: v_readfirstlane_b32 s2, v1 1671; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1672; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s2, v0 1673; GFX8-NEXT: s_mov_b32 s3, 0xf000 1674; GFX8-NEXT: s_mov_b32 s2, -1 1675; GFX8-NEXT: s_nop 0 1676; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 1677; GFX8-NEXT: s_endpgm 1678; 1679; GFX9-LABEL: sub_i32_constant: 1680; GFX9: ; %bb.0: ; %entry 1681; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1682; GFX9-NEXT: s_mov_b64 s[2:3], exec 1683; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1684; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 1685; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1686; GFX9-NEXT: ; implicit-def: $vgpr1 1687; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 1688; GFX9-NEXT: s_cbranch_execz .LBB7_2 1689; GFX9-NEXT: ; %bb.1: 1690; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1691; GFX9-NEXT: s_mul_i32 s2, s2, 5 1692; GFX9-NEXT: v_mov_b32_e32 v1, 0 1693; GFX9-NEXT: v_mov_b32_e32 v2, s2 1694; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1695; GFX9-NEXT: ds_sub_rtn_u32 v1, v1, v2 1696; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1697; GFX9-NEXT: .LBB7_2: 1698; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 1699; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1700; GFX9-NEXT: v_readfirstlane_b32 s2, v1 1701; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1702; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 1703; GFX9-NEXT: s_mov_b32 s3, 0xf000 1704; GFX9-NEXT: s_mov_b32 s2, -1 1705; GFX9-NEXT: s_nop 0 1706; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 1707; GFX9-NEXT: s_endpgm 1708; 1709; GFX1064-LABEL: sub_i32_constant: 1710; GFX1064: ; %bb.0: ; %entry 1711; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1712; GFX1064-NEXT: s_mov_b64 s[2:3], exec 1713; GFX1064-NEXT: ; implicit-def: $vgpr1 1714; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1715; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 1716; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1717; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 1718; GFX1064-NEXT: s_cbranch_execz .LBB7_2 1719; GFX1064-NEXT: ; %bb.1: 1720; GFX1064-NEXT: 
s_bcnt1_i32_b64 s2, s[2:3] 1721; GFX1064-NEXT: v_mov_b32_e32 v1, 0 1722; GFX1064-NEXT: s_mul_i32 s2, s2, 5 1723; GFX1064-NEXT: v_mov_b32_e32 v2, s2 1724; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1725; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 1726; GFX1064-NEXT: ds_sub_rtn_u32 v1, v1, v2 1727; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1728; GFX1064-NEXT: buffer_gl0_inv 1729; GFX1064-NEXT: .LBB7_2: 1730; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 1731; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 1732; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 1733; GFX1064-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1734; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 1735; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s2, v0 1736; GFX1064-NEXT: s_mov_b32 s2, -1 1737; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1738; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 1739; GFX1064-NEXT: s_endpgm 1740; 1741; GFX1032-LABEL: sub_i32_constant: 1742; GFX1032: ; %bb.0: ; %entry 1743; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1744; GFX1032-NEXT: s_mov_b32 s3, exec_lo 1745; GFX1032-NEXT: ; implicit-def: $vgpr1 1746; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 1747; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 1748; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo 1749; GFX1032-NEXT: s_cbranch_execz .LBB7_2 1750; GFX1032-NEXT: ; %bb.1: 1751; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 1752; GFX1032-NEXT: v_mov_b32_e32 v1, 0 1753; GFX1032-NEXT: s_mul_i32 s3, s3, 5 1754; GFX1032-NEXT: v_mov_b32_e32 v2, s3 1755; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1756; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 1757; GFX1032-NEXT: ds_sub_rtn_u32 v1, v1, v2 1758; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1759; GFX1032-NEXT: buffer_gl0_inv 1760; GFX1032-NEXT: .LBB7_2: 1761; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 1762; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 1763; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 1764; GFX1032-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1765; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 1766; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s2, v0 1767; GFX1032-NEXT: 
s_mov_b32 s2, -1 1768; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1769; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 1770; GFX1032-NEXT: s_endpgm 1771; 1772; GFX1164-LABEL: sub_i32_constant: 1773; GFX1164: ; %bb.0: ; %entry 1774; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 1775; GFX1164-NEXT: s_mov_b64 s[2:3], exec 1776; GFX1164-NEXT: s_mov_b64 s[4:5], exec 1777; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1778; GFX1164-NEXT: ; implicit-def: $vgpr1 1779; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 1780; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 1781; GFX1164-NEXT: s_cbranch_execz .LBB7_2 1782; GFX1164-NEXT: ; %bb.1: 1783; GFX1164-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1784; GFX1164-NEXT: v_mov_b32_e32 v1, 0 1785; GFX1164-NEXT: s_mul_i32 s2, s2, 5 1786; GFX1164-NEXT: v_mov_b32_e32 v2, s2 1787; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1788; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 1789; GFX1164-NEXT: ds_sub_rtn_u32 v1, v1, v2 1790; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 1791; GFX1164-NEXT: buffer_gl0_inv 1792; GFX1164-NEXT: .LBB7_2: 1793; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] 1794; GFX1164-NEXT: v_readfirstlane_b32 s2, v1 1795; GFX1164-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1796; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 1797; GFX1164-NEXT: v_sub_nc_u32_e32 v0, s2, v0 1798; GFX1164-NEXT: s_mov_b32 s2, -1 1799; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 1800; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 1801; GFX1164-NEXT: s_endpgm 1802; 1803; GFX1132-LABEL: sub_i32_constant: 1804; GFX1132: ; %bb.0: ; %entry 1805; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 1806; GFX1132-NEXT: s_mov_b32 s3, exec_lo 1807; GFX1132-NEXT: s_mov_b32 s2, exec_lo 1808; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 1809; GFX1132-NEXT: ; implicit-def: $vgpr1 1810; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 1811; GFX1132-NEXT: s_cbranch_execz .LBB7_2 1812; GFX1132-NEXT: ; %bb.1: 1813; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s3 1814; GFX1132-NEXT: v_mov_b32_e32 v1, 0 1815; GFX1132-NEXT: s_mul_i32 s3, s3, 5 1816; GFX1132-NEXT: 
v_mov_b32_e32 v2, s3 1817; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1818; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 1819; GFX1132-NEXT: ds_sub_rtn_u32 v1, v1, v2 1820; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 1821; GFX1132-NEXT: buffer_gl0_inv 1822; GFX1132-NEXT: .LBB7_2: 1823; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2 1824; GFX1132-NEXT: v_readfirstlane_b32 s2, v1 1825; GFX1132-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1826; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 1827; GFX1132-NEXT: v_sub_nc_u32_e32 v0, s2, v0 1828; GFX1132-NEXT: s_mov_b32 s2, -1 1829; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 1830; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 1831; GFX1132-NEXT: s_endpgm 1832entry: 1833 %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 5 acq_rel 1834 store i32 %old, i32 addrspace(1)* %out 1835 ret void 1836} 1837 1838define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 %subitive) { 1839; Atomically subtracts the kernel argument %subitive from @local_var32 with an acq_rel atomicrmw sub and stores the returned old value to %out. 1840; The checks expect one ds_sub_rtn_u32 of the loaded argument times the number of set bits in the saved exec mask, issued only by the lane whose mbcnt result is 0, after which every lane computes readfirstlane of the result minus the argument times its own mbcnt. The assertions are autogenerated, so regenerate them with the update script instead of editing by hand. 1841; GFX7LESS-LABEL: sub_i32_uniform: 1842; GFX7LESS: ; %bb.0: ; %entry 1843; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec 1844; GFX7LESS-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 1845; GFX7LESS-NEXT: s_load_dword s6, s[0:1], 0xb 1846; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 1847; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 1848; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1849; GFX7LESS-NEXT: ; implicit-def: $vgpr1 1850; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc 1851; GFX7LESS-NEXT: s_cbranch_execz .LBB8_2 1852; GFX7LESS-NEXT: ; %bb.1: 1853; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1854; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1855; GFX7LESS-NEXT: s_mul_i32 s2, s6, s2 1856; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 1857; GFX7LESS-NEXT: v_mov_b32_e32 v2, s2 1858; GFX7LESS-NEXT: s_mov_b32 m0, -1 1859; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1860; GFX7LESS-NEXT: ds_sub_rtn_u32 v1, v1, v2 1861; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1862; GFX7LESS-NEXT: .LBB8_2: 1863; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1] 1864; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1865; GFX7LESS-NEXT: 
v_readfirstlane_b32 s0, v1 1866; GFX7LESS-NEXT: v_mul_lo_u32 v0, s6, v0 1867; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 1868; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 1869; GFX7LESS-NEXT: s_mov_b32 s6, -1 1870; GFX7LESS-NEXT: buffer_store_dword v0, off, s[4:7], 0 1871; GFX7LESS-NEXT: s_endpgm 1872; 1873; GFX8-LABEL: sub_i32_uniform: 1874; GFX8: ; %bb.0: ; %entry 1875; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 1876; GFX8-NEXT: s_load_dword s6, s[0:1], 0x2c 1877; GFX8-NEXT: s_mov_b64 s[2:3], exec 1878; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1879; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 1880; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1881; GFX8-NEXT: ; implicit-def: $vgpr1 1882; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc 1883; GFX8-NEXT: s_cbranch_execz .LBB8_2 1884; GFX8-NEXT: ; %bb.1: 1885; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1886; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1887; GFX8-NEXT: s_mul_i32 s2, s6, s2 1888; GFX8-NEXT: v_mov_b32_e32 v1, 0 1889; GFX8-NEXT: v_mov_b32_e32 v2, s2 1890; GFX8-NEXT: s_mov_b32 m0, -1 1891; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1892; GFX8-NEXT: ds_sub_rtn_u32 v1, v1, v2 1893; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1894; GFX8-NEXT: .LBB8_2: 1895; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] 1896; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1897; GFX8-NEXT: v_mul_lo_u32 v0, s6, v0 1898; GFX8-NEXT: v_readfirstlane_b32 s0, v1 1899; GFX8-NEXT: s_mov_b32 s7, 0xf000 1900; GFX8-NEXT: s_mov_b32 s6, -1 1901; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s0, v0 1902; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0 1903; GFX8-NEXT: s_endpgm 1904; 1905; GFX9-LABEL: sub_i32_uniform: 1906; GFX9: ; %bb.0: ; %entry 1907; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 1908; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c 1909; GFX9-NEXT: s_mov_b64 s[2:3], exec 1910; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1911; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 1912; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1913; GFX9-NEXT: ; implicit-def: $vgpr1 1914; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc 1915; GFX9-NEXT: 
s_cbranch_execz .LBB8_2 1916; GFX9-NEXT: ; %bb.1: 1917; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1918; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1919; GFX9-NEXT: s_mul_i32 s2, s6, s2 1920; GFX9-NEXT: v_mov_b32_e32 v1, 0 1921; GFX9-NEXT: v_mov_b32_e32 v2, s2 1922; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1923; GFX9-NEXT: ds_sub_rtn_u32 v1, v1, v2 1924; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1925; GFX9-NEXT: .LBB8_2: 1926; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] 1927; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1928; GFX9-NEXT: v_mul_lo_u32 v0, s6, v0 1929; GFX9-NEXT: v_readfirstlane_b32 s0, v1 1930; GFX9-NEXT: s_mov_b32 s7, 0xf000 1931; GFX9-NEXT: s_mov_b32 s6, -1 1932; GFX9-NEXT: v_sub_u32_e32 v0, s0, v0 1933; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 1934; GFX9-NEXT: s_endpgm 1935; 1936; GFX1064-LABEL: sub_i32_uniform: 1937; GFX1064: ; %bb.0: ; %entry 1938; GFX1064-NEXT: s_clause 0x1 1939; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 1940; GFX1064-NEXT: s_load_dword s6, s[0:1], 0x2c 1941; GFX1064-NEXT: s_mov_b64 s[2:3], exec 1942; GFX1064-NEXT: ; implicit-def: $vgpr1 1943; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1944; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 1945; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1946; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc 1947; GFX1064-NEXT: s_cbranch_execz .LBB8_2 1948; GFX1064-NEXT: ; %bb.1: 1949; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1950; GFX1064-NEXT: v_mov_b32_e32 v1, 0 1951; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1952; GFX1064-NEXT: s_mul_i32 s2, s6, s2 1953; GFX1064-NEXT: v_mov_b32_e32 v2, s2 1954; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1955; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 1956; GFX1064-NEXT: ds_sub_rtn_u32 v1, v1, v2 1957; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1958; GFX1064-NEXT: buffer_gl0_inv 1959; GFX1064-NEXT: .LBB8_2: 1960; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 1961; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] 1962; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1963; GFX1064-NEXT: v_mul_lo_u32 v0, s6, v0 1964; GFX1064-NEXT: v_readfirstlane_b32 
s0, v1 1965; GFX1064-NEXT: s_mov_b32 s7, 0x31016000 1966; GFX1064-NEXT: s_mov_b32 s6, -1 1967; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s0, v0 1968; GFX1064-NEXT: buffer_store_dword v0, off, s[4:7], 0 1969; GFX1064-NEXT: s_endpgm 1970; 1971; GFX1032-LABEL: sub_i32_uniform: 1972; GFX1032: ; %bb.0: ; %entry 1973; GFX1032-NEXT: s_clause 0x1 1974; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 1975; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x2c 1976; GFX1032-NEXT: s_mov_b32 s3, exec_lo 1977; GFX1032-NEXT: ; implicit-def: $vgpr1 1978; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 1979; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 1980; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo 1981; GFX1032-NEXT: s_cbranch_execz .LBB8_2 1982; GFX1032-NEXT: ; %bb.1: 1983; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s3 1984; GFX1032-NEXT: v_mov_b32_e32 v1, 0 1985; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1986; GFX1032-NEXT: s_mul_i32 s1, s2, s1 1987; GFX1032-NEXT: v_mov_b32_e32 v2, s1 1988; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1989; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 1990; GFX1032-NEXT: ds_sub_rtn_u32 v1, v1, v2 1991; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1992; GFX1032-NEXT: buffer_gl0_inv 1993; GFX1032-NEXT: .LBB8_2: 1994; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 1995; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 1996; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1997; GFX1032-NEXT: v_mul_lo_u32 v0, s2, v0 1998; GFX1032-NEXT: v_readfirstlane_b32 s0, v1 1999; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 2000; GFX1032-NEXT: s_mov_b32 s6, -1 2001; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s0, v0 2002; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0 2003; GFX1032-NEXT: s_endpgm 2004; 2005; GFX1164-LABEL: sub_i32_uniform: 2006; GFX1164: ; %bb.0: ; %entry 2007; GFX1164-NEXT: s_clause 0x1 2008; GFX1164-NEXT: s_load_b64 s[4:5], s[0:1], 0x24 2009; GFX1164-NEXT: s_load_b32 s6, s[0:1], 0x2c 2010; GFX1164-NEXT: s_mov_b64 s[2:3], exec 2011; GFX1164-NEXT: s_mov_b64 s[0:1], exec 2012; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 2013; 
GFX1164-NEXT: ; implicit-def: $vgpr1 2014; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 2015; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 2016; GFX1164-NEXT: s_cbranch_execz .LBB8_2 2017; GFX1164-NEXT: ; %bb.1: 2018; GFX1164-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 2019; GFX1164-NEXT: v_mov_b32_e32 v1, 0 2020; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 2021; GFX1164-NEXT: s_mul_i32 s2, s6, s2 2022; GFX1164-NEXT: v_mov_b32_e32 v2, s2 2023; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2024; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 2025; GFX1164-NEXT: ds_sub_rtn_u32 v1, v1, v2 2026; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 2027; GFX1164-NEXT: buffer_gl0_inv 2028; GFX1164-NEXT: .LBB8_2: 2029; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] 2030; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 2031; GFX1164-NEXT: v_mul_lo_u32 v0, s6, v0 2032; GFX1164-NEXT: v_readfirstlane_b32 s0, v1 2033; GFX1164-NEXT: s_mov_b32 s7, 0x31016000 2034; GFX1164-NEXT: s_mov_b32 s6, -1 2035; GFX1164-NEXT: v_sub_nc_u32_e32 v0, s0, v0 2036; GFX1164-NEXT: buffer_store_b32 v0, off, s[4:7], 0 2037; GFX1164-NEXT: s_endpgm 2038; 2039; GFX1132-LABEL: sub_i32_uniform: 2040; GFX1132: ; %bb.0: ; %entry 2041; GFX1132-NEXT: s_clause 0x1 2042; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x24 2043; GFX1132-NEXT: s_load_b32 s0, s[0:1], 0x2c 2044; GFX1132-NEXT: s_mov_b32 s2, exec_lo 2045; GFX1132-NEXT: s_mov_b32 s1, exec_lo 2046; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 2047; GFX1132-NEXT: ; implicit-def: $vgpr1 2048; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 2049; GFX1132-NEXT: s_cbranch_execz .LBB8_2 2050; GFX1132-NEXT: ; %bb.1: 2051; GFX1132-NEXT: s_bcnt1_i32_b32 s2, s2 2052; GFX1132-NEXT: v_mov_b32_e32 v1, 0 2053; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 2054; GFX1132-NEXT: s_mul_i32 s2, s0, s2 2055; GFX1132-NEXT: v_mov_b32_e32 v2, s2 2056; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2057; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 2058; GFX1132-NEXT: ds_sub_rtn_u32 v1, v1, v2 2059; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 2060; GFX1132-NEXT: buffer_gl0_inv 2061; 
GFX1132-NEXT: .LBB8_2: 2062; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s1 2063; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 2064; GFX1132-NEXT: v_mul_lo_u32 v0, s0, v0 2065; GFX1132-NEXT: v_readfirstlane_b32 s0, v1 2066; GFX1132-NEXT: s_mov_b32 s7, 0x31016000 2067; GFX1132-NEXT: s_mov_b32 s6, -1 2068; GFX1132-NEXT: v_sub_nc_u32_e32 v0, s0, v0 2069; GFX1132-NEXT: buffer_store_b32 v0, off, s[4:7], 0 2070; GFX1132-NEXT: s_endpgm 2071entry: 2072 %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 %subitive acq_rel 2073 store i32 %old, i32 addrspace(1)* %out 2074 ret void 2075} 2076 2077define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) { 2078; 2079; 2080; GFX7LESS-LABEL: sub_i32_varying: 2081; GFX7LESS: ; %bb.0: ; %entry 2082; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 2083; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 2084; GFX7LESS-NEXT: s_mov_b32 m0, -1 2085; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2086; GFX7LESS-NEXT: ds_sub_rtn_u32 v0, v1, v0 2087; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2088; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 2089; GFX7LESS-NEXT: s_mov_b32 s2, -1 2090; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 2091; GFX7LESS-NEXT: s_endpgm 2092; 2093; GFX8-LABEL: sub_i32_varying: 2094; GFX8: ; %bb.0: ; %entry 2095; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2096; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 2097; GFX8-NEXT: v_mov_b32_e32 v1, 0 2098; GFX8-NEXT: s_mov_b64 exec, s[2:3] 2099; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 2100; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 2101; GFX8-NEXT: v_mov_b32_e32 v2, v0 2102; GFX8-NEXT: s_not_b64 exec, exec 2103; GFX8-NEXT: v_mov_b32_e32 v2, 0 2104; GFX8-NEXT: s_not_b64 exec, exec 2105; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 2106; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 2107; GFX8-NEXT: s_nop 1 2108; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 2109; GFX8-NEXT: s_nop 1 2110; GFX8-NEXT: v_add_u32_dpp v2, 
vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 2111; GFX8-NEXT: s_nop 1 2112; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 2113; GFX8-NEXT: s_nop 1 2114; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 2115; GFX8-NEXT: s_nop 1 2116; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 2117; GFX8-NEXT: v_readlane_b32 s4, v2, 63 2118; GFX8-NEXT: s_nop 0 2119; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 2120; GFX8-NEXT: s_mov_b64 exec, s[2:3] 2121; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 2122; GFX8-NEXT: ; implicit-def: $vgpr0 2123; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 2124; GFX8-NEXT: s_cbranch_execz .LBB9_2 2125; GFX8-NEXT: ; %bb.1: 2126; GFX8-NEXT: v_mov_b32_e32 v0, 0 2127; GFX8-NEXT: v_mov_b32_e32 v3, s4 2128; GFX8-NEXT: s_mov_b32 m0, -1 2129; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2130; GFX8-NEXT: ds_sub_rtn_u32 v0, v0, v3 2131; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2132; GFX8-NEXT: .LBB9_2: 2133; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 2134; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2135; GFX8-NEXT: v_readfirstlane_b32 s2, v0 2136; GFX8-NEXT: v_mov_b32_e32 v0, v1 2137; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s2, v0 2138; GFX8-NEXT: s_mov_b32 s3, 0xf000 2139; GFX8-NEXT: s_mov_b32 s2, -1 2140; GFX8-NEXT: s_nop 0 2141; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 2142; GFX8-NEXT: s_endpgm 2143; 2144; GFX9-LABEL: sub_i32_varying: 2145; GFX9: ; %bb.0: ; %entry 2146; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2147; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 2148; GFX9-NEXT: v_mov_b32_e32 v1, 0 2149; GFX9-NEXT: s_mov_b64 exec, s[2:3] 2150; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 2151; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 2152; GFX9-NEXT: v_mov_b32_e32 v2, v0 2153; GFX9-NEXT: s_not_b64 exec, exec 2154; GFX9-NEXT: v_mov_b32_e32 v2, 0 2155; GFX9-NEXT: s_not_b64 exec, exec 2156; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 2157; GFX9-NEXT: 
v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 2158; GFX9-NEXT: s_nop 1 2159; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 2160; GFX9-NEXT: s_nop 1 2161; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 2162; GFX9-NEXT: s_nop 1 2163; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 2164; GFX9-NEXT: s_nop 1 2165; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 2166; GFX9-NEXT: s_nop 1 2167; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 2168; GFX9-NEXT: v_readlane_b32 s4, v2, 63 2169; GFX9-NEXT: s_nop 0 2170; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 2171; GFX9-NEXT: s_mov_b64 exec, s[2:3] 2172; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 2173; GFX9-NEXT: ; implicit-def: $vgpr0 2174; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 2175; GFX9-NEXT: s_cbranch_execz .LBB9_2 2176; GFX9-NEXT: ; %bb.1: 2177; GFX9-NEXT: v_mov_b32_e32 v0, 0 2178; GFX9-NEXT: v_mov_b32_e32 v3, s4 2179; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2180; GFX9-NEXT: ds_sub_rtn_u32 v0, v0, v3 2181; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2182; GFX9-NEXT: .LBB9_2: 2183; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 2184; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2185; GFX9-NEXT: v_readfirstlane_b32 s2, v0 2186; GFX9-NEXT: v_mov_b32_e32 v0, v1 2187; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 2188; GFX9-NEXT: s_mov_b32 s3, 0xf000 2189; GFX9-NEXT: s_mov_b32 s2, -1 2190; GFX9-NEXT: s_nop 0 2191; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 2192; GFX9-NEXT: s_endpgm 2193; 2194; GFX1064-LABEL: sub_i32_varying: 2195; GFX1064: ; %bb.0: ; %entry 2196; GFX1064-NEXT: v_mov_b32_e32 v1, v0 2197; GFX1064-NEXT: s_not_b64 exec, exec 2198; GFX1064-NEXT: v_mov_b32_e32 v1, 0 2199; GFX1064-NEXT: s_not_b64 exec, exec 2200; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 2201; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf 
bank_mask:0xf bound_ctrl:1 2202; GFX1064-NEXT: v_mov_b32_e32 v3, 0 2203; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 2204; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 2205; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 2206; GFX1064-NEXT: v_mov_b32_e32 v2, v1 2207; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 2208; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 2209; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 2210; GFX1064-NEXT: v_mov_b32_e32 v2, s4 2211; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 2212; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 2213; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 2214; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 2215; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2216; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 2217; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 2218; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 2219; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 2220; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 2221; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 2222; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 2223; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 2224; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 2225; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 2226; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 2227; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 2228; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 2229; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 2230; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2231; GFX1064-NEXT: s_mov_b32 s2, -1 2232; GFX1064-NEXT: ; implicit-def: $vgpr0 2233; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 2234; GFX1064-NEXT: s_cbranch_execz .LBB9_2 2235; GFX1064-NEXT: ; %bb.1: 2236; GFX1064-NEXT: v_mov_b32_e32 v0, 0 2237; GFX1064-NEXT: v_mov_b32_e32 v4, s7 2238; GFX1064-NEXT: s_mov_b32 s3, s7 2239; 
GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2240; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 2241; GFX1064-NEXT: ds_sub_rtn_u32 v0, v0, v4 2242; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2243; GFX1064-NEXT: buffer_gl0_inv 2244; GFX1064-NEXT: .LBB9_2: 2245; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 2246; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 2247; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 2248; GFX1064-NEXT: v_mov_b32_e32 v0, v3 2249; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s3, v0 2250; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 2251; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2252; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 2253; GFX1064-NEXT: s_endpgm 2254; 2255; GFX1032-LABEL: sub_i32_varying: 2256; GFX1032: ; %bb.0: ; %entry 2257; GFX1032-NEXT: v_mov_b32_e32 v1, v0 2258; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 2259; GFX1032-NEXT: v_mov_b32_e32 v1, 0 2260; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 2261; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 2262; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 2263; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 2264; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 2265; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 2266; GFX1032-NEXT: v_mov_b32_e32 v2, v1 2267; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 2268; GFX1032-NEXT: s_mov_b32 exec_lo, s2 2269; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2270; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 2271; GFX1032-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 2272; GFX1032-NEXT: v_mov_b32_e32 v3, 0 2273; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 2274; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 2275; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 2276; GFX1032-NEXT: s_mov_b32 exec_lo, s2 2277; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 2278; GFX1032-NEXT: 
s_or_saveexec_b32 s2, -1 2279; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 2280; GFX1032-NEXT: s_mov_b32 exec_lo, s2 2281; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 2282; GFX1032-NEXT: s_mov_b32 s2, -1 2283; GFX1032-NEXT: ; implicit-def: $vgpr0 2284; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 2285; GFX1032-NEXT: s_cbranch_execz .LBB9_2 2286; GFX1032-NEXT: ; %bb.1: 2287; GFX1032-NEXT: v_mov_b32_e32 v0, 0 2288; GFX1032-NEXT: v_mov_b32_e32 v4, s4 2289; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2290; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 2291; GFX1032-NEXT: ds_sub_rtn_u32 v0, v0, v4 2292; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2293; GFX1032-NEXT: buffer_gl0_inv 2294; GFX1032-NEXT: .LBB9_2: 2295; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 2296; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 2297; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 2298; GFX1032-NEXT: v_mov_b32_e32 v0, v3 2299; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s3, v0 2300; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 2301; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2302; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 2303; GFX1032-NEXT: s_endpgm 2304; 2305; GFX1164-LABEL: sub_i32_varying: 2306; GFX1164: ; %bb.0: ; %entry 2307; GFX1164-NEXT: v_mov_b32_e32 v1, v0 2308; GFX1164-NEXT: s_not_b64 exec, exec 2309; GFX1164-NEXT: v_mov_b32_e32 v1, 0 2310; GFX1164-NEXT: s_not_b64 exec, exec 2311; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 2312; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 2313; GFX1164-NEXT: v_mov_b32_e32 v3, 0 2314; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 2315; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 2316; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 2317; GFX1164-NEXT: v_mov_b32_e32 v2, v1 2318; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1 2319; GFX1164-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa 
bank_mask:0xf 2320; GFX1164-NEXT: v_readlane_b32 s4, v1, 31 2321; GFX1164-NEXT: v_mov_b32_e32 v2, s4 2322; GFX1164-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 2323; GFX1164-NEXT: v_readlane_b32 s4, v1, 15 2324; GFX1164-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 2325; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 2326; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 2327; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 2328; GFX1164-NEXT: v_readlane_b32 s5, v1, 31 2329; GFX1164-NEXT: v_writelane_b32 v3, s4, 16 2330; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 2331; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 2332; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 2333; GFX1164-NEXT: v_readlane_b32 s7, v1, 63 2334; GFX1164-NEXT: v_readlane_b32 s6, v1, 47 2335; GFX1164-NEXT: v_writelane_b32 v3, s5, 32 2336; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 2337; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 2338; GFX1164-NEXT: s_or_saveexec_b64 s[4:5], -1 2339; GFX1164-NEXT: v_writelane_b32 v3, s6, 48 2340; GFX1164-NEXT: s_mov_b64 exec, s[4:5] 2341; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2342; GFX1164-NEXT: s_mov_b32 s2, -1 2343; GFX1164-NEXT: ; implicit-def: $vgpr0 2344; GFX1164-NEXT: s_and_saveexec_b64 s[4:5], vcc 2345; GFX1164-NEXT: s_cbranch_execz .LBB9_2 2346; GFX1164-NEXT: ; %bb.1: 2347; GFX1164-NEXT: v_mov_b32_e32 v0, 0 2348; GFX1164-NEXT: v_mov_b32_e32 v4, s7 2349; GFX1164-NEXT: s_mov_b32 s3, s7 2350; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2351; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 2352; GFX1164-NEXT: ds_sub_rtn_u32 v0, v0, v4 2353; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 2354; GFX1164-NEXT: buffer_gl0_inv 2355; GFX1164-NEXT: .LBB9_2: 2356; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] 2357; GFX1164-NEXT: v_readfirstlane_b32 s3, v0 2358; GFX1164-NEXT: v_mov_b32_e32 v0, v3 2359; GFX1164-NEXT: v_sub_nc_u32_e32 v0, s3, v0 2360; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 2361; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 2362; GFX1164-NEXT: 
buffer_store_b32 v0, off, s[0:3], 0 2363; GFX1164-NEXT: s_endpgm 2364; 2365; GFX1132-LABEL: sub_i32_varying: 2366; GFX1132: ; %bb.0: ; %entry 2367; GFX1132-NEXT: v_mov_b32_e32 v1, v0 2368; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo 2369; GFX1132-NEXT: v_mov_b32_e32 v1, 0 2370; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo 2371; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 2372; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 2373; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 2374; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 2375; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 2376; GFX1132-NEXT: v_mov_b32_e32 v2, v1 2377; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1 2378; GFX1132-NEXT: s_mov_b32 exec_lo, s2 2379; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 2380; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 2381; GFX1132-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 2382; GFX1132-NEXT: v_mov_b32_e32 v3, 0 2383; GFX1132-NEXT: v_readlane_b32 s3, v1, 15 2384; GFX1132-NEXT: v_readlane_b32 s4, v1, 31 2385; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 2386; GFX1132-NEXT: s_mov_b32 exec_lo, s2 2387; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 2388; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 2389; GFX1132-NEXT: v_writelane_b32 v3, s3, 16 2390; GFX1132-NEXT: s_mov_b32 exec_lo, s2 2391; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 2392; GFX1132-NEXT: s_mov_b32 s2, -1 2393; GFX1132-NEXT: ; implicit-def: $vgpr0 2394; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo 2395; GFX1132-NEXT: s_cbranch_execz .LBB9_2 2396; GFX1132-NEXT: ; %bb.1: 2397; GFX1132-NEXT: v_mov_b32_e32 v0, 0 2398; GFX1132-NEXT: v_mov_b32_e32 v4, s4 2399; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2400; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 2401; GFX1132-NEXT: ds_sub_rtn_u32 v0, v0, v4 
2402; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 2403; GFX1132-NEXT: buffer_gl0_inv 2404; GFX1132-NEXT: .LBB9_2: 2405; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 2406; GFX1132-NEXT: v_readfirstlane_b32 s3, v0 2407; GFX1132-NEXT: v_mov_b32_e32 v0, v3 2408; GFX1132-NEXT: v_sub_nc_u32_e32 v0, s3, v0 2409; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 2410; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 2411; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 2412; GFX1132-NEXT: s_endpgm 2413entry: 2414 %lane = call i32 @llvm.amdgcn.workitem.id.x() 2415 %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 %lane acq_rel 2416 store i32 %old, i32 addrspace(1)* %out 2417 ret void 2418} 2419 2420define amdgpu_kernel void @sub_i32_varying_nouse() { 2421; GFX7LESS-LABEL: sub_i32_varying_nouse: 2422; GFX7LESS: ; %bb.0: ; %entry 2423; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 2424; GFX7LESS-NEXT: s_mov_b32 m0, -1 2425; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2426; GFX7LESS-NEXT: ds_sub_u32 v1, v0 2427; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2428; GFX7LESS-NEXT: s_endpgm 2429; 2430; GFX8-LABEL: sub_i32_varying_nouse: 2431; GFX8: ; %bb.0: ; %entry 2432; GFX8-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 2433; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 2434; GFX8-NEXT: v_mov_b32_e32 v1, v0 2435; GFX8-NEXT: s_not_b64 exec, exec 2436; GFX8-NEXT: v_mov_b32_e32 v1, 0 2437; GFX8-NEXT: s_not_b64 exec, exec 2438; GFX8-NEXT: s_or_saveexec_b64 s[0:1], -1 2439; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 2440; GFX8-NEXT: s_nop 1 2441; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 2442; GFX8-NEXT: s_nop 1 2443; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 2444; GFX8-NEXT: s_nop 1 2445; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 2446; GFX8-NEXT: s_nop 1 2447; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf 2448; 
GFX8-NEXT: s_nop 1 2449; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf 2450; GFX8-NEXT: v_readlane_b32 s2, v1, 63 2451; GFX8-NEXT: s_mov_b64 exec, s[0:1] 2452; GFX8-NEXT: s_mov_b32 s0, s2 2453; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 2454; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 2455; GFX8-NEXT: s_cbranch_execz .LBB10_2 2456; GFX8-NEXT: ; %bb.1: 2457; GFX8-NEXT: v_mov_b32_e32 v0, 0 2458; GFX8-NEXT: v_mov_b32_e32 v2, s0 2459; GFX8-NEXT: s_mov_b32 m0, -1 2460; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2461; GFX8-NEXT: ds_sub_u32 v0, v2 2462; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2463; GFX8-NEXT: .LBB10_2: 2464; GFX8-NEXT: s_endpgm 2465; 2466; GFX9-LABEL: sub_i32_varying_nouse: 2467; GFX9: ; %bb.0: ; %entry 2468; GFX9-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 2469; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 2470; GFX9-NEXT: v_mov_b32_e32 v1, v0 2471; GFX9-NEXT: s_not_b64 exec, exec 2472; GFX9-NEXT: v_mov_b32_e32 v1, 0 2473; GFX9-NEXT: s_not_b64 exec, exec 2474; GFX9-NEXT: s_or_saveexec_b64 s[0:1], -1 2475; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 2476; GFX9-NEXT: s_nop 1 2477; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 2478; GFX9-NEXT: s_nop 1 2479; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 2480; GFX9-NEXT: s_nop 1 2481; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 2482; GFX9-NEXT: s_nop 1 2483; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf 2484; GFX9-NEXT: s_nop 1 2485; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf 2486; GFX9-NEXT: v_readlane_b32 s2, v1, 63 2487; GFX9-NEXT: s_mov_b64 exec, s[0:1] 2488; GFX9-NEXT: s_mov_b32 s0, s2 2489; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 2490; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 2491; GFX9-NEXT: s_cbranch_execz .LBB10_2 2492; GFX9-NEXT: ; %bb.1: 2493; GFX9-NEXT: v_mov_b32_e32 
v0, 0 2494; GFX9-NEXT: v_mov_b32_e32 v2, s0 2495; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2496; GFX9-NEXT: ds_sub_u32 v0, v2 2497; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2498; GFX9-NEXT: .LBB10_2: 2499; GFX9-NEXT: s_endpgm 2500; 2501; GFX1064-LABEL: sub_i32_varying_nouse: 2502; GFX1064: ; %bb.0: ; %entry 2503; GFX1064-NEXT: v_mov_b32_e32 v1, v0 2504; GFX1064-NEXT: s_not_b64 exec, exec 2505; GFX1064-NEXT: v_mov_b32_e32 v1, 0 2506; GFX1064-NEXT: s_not_b64 exec, exec 2507; GFX1064-NEXT: s_or_saveexec_b64 s[0:1], -1 2508; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 2509; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 2510; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 2511; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 2512; GFX1064-NEXT: v_mov_b32_e32 v2, v1 2513; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 2514; GFX1064-NEXT: v_add_nc_u32_e32 v1, v1, v2 2515; GFX1064-NEXT: s_mov_b64 exec, s[0:1] 2516; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 2517; GFX1064-NEXT: s_or_saveexec_b64 s[0:1], -1 2518; GFX1064-NEXT: v_readlane_b32 s2, v1, 0 2519; GFX1064-NEXT: v_readlane_b32 s3, v1, 32 2520; GFX1064-NEXT: s_mov_b64 exec, s[0:1] 2521; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 2522; GFX1064-NEXT: s_add_i32 s0, s2, s3 2523; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2524; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc 2525; GFX1064-NEXT: s_cbranch_execz .LBB10_2 2526; GFX1064-NEXT: ; %bb.1: 2527; GFX1064-NEXT: v_mov_b32_e32 v0, 0 2528; GFX1064-NEXT: v_mov_b32_e32 v3, s0 2529; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2530; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 2531; GFX1064-NEXT: ds_sub_u32 v0, v3 2532; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2533; GFX1064-NEXT: buffer_gl0_inv 2534; GFX1064-NEXT: .LBB10_2: 2535; GFX1064-NEXT: s_endpgm 2536; 2537; GFX1032-LABEL: sub_i32_varying_nouse: 2538; 
GFX1032: ; %bb.0: ; %entry 2539; GFX1032-NEXT: v_mov_b32_e32 v1, v0 2540; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 2541; GFX1032-NEXT: v_mov_b32_e32 v1, 0 2542; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 2543; GFX1032-NEXT: s_or_saveexec_b32 s0, -1 2544; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 2545; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 2546; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 2547; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 2548; GFX1032-NEXT: v_mov_b32_e32 v2, v1 2549; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 2550; GFX1032-NEXT: v_add_nc_u32_e32 v1, v1, v2 2551; GFX1032-NEXT: s_mov_b32 exec_lo, s0 2552; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 2553; GFX1032-NEXT: v_mov_b32_e32 v0, v1 2554; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 2555; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo 2556; GFX1032-NEXT: s_cbranch_execz .LBB10_2 2557; GFX1032-NEXT: ; %bb.1: 2558; GFX1032-NEXT: v_mov_b32_e32 v3, 0 2559; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2560; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 2561; GFX1032-NEXT: ds_sub_u32 v3, v0 2562; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2563; GFX1032-NEXT: buffer_gl0_inv 2564; GFX1032-NEXT: .LBB10_2: 2565; GFX1032-NEXT: s_endpgm 2566; 2567; GFX1164-LABEL: sub_i32_varying_nouse: 2568; GFX1164: ; %bb.0: ; %entry 2569; GFX1164-NEXT: v_mov_b32_e32 v1, v0 2570; GFX1164-NEXT: s_not_b64 exec, exec 2571; GFX1164-NEXT: v_mov_b32_e32 v1, 0 2572; GFX1164-NEXT: s_not_b64 exec, exec 2573; GFX1164-NEXT: s_or_saveexec_b64 s[0:1], -1 2574; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 2575; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 2576; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf 
bound_ctrl:1 2577; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 2578; GFX1164-NEXT: v_mov_b32_e32 v2, v1 2579; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1 2580; GFX1164-NEXT: v_add_nc_u32_e32 v1, v1, v2 2581; GFX1164-NEXT: s_mov_b64 exec, s[0:1] 2582; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 2583; GFX1164-NEXT: s_or_saveexec_b64 s[0:1], -1 2584; GFX1164-NEXT: v_readlane_b32 s2, v1, 0 2585; GFX1164-NEXT: v_readlane_b32 s3, v1, 32 2586; GFX1164-NEXT: s_mov_b64 exec, s[0:1] 2587; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 2588; GFX1164-NEXT: s_add_i32 s0, s2, s3 2589; GFX1164-NEXT: s_mov_b64 s[2:3], exec 2590; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 2591; GFX1164-NEXT: s_cbranch_execz .LBB10_2 2592; GFX1164-NEXT: ; %bb.1: 2593; GFX1164-NEXT: v_mov_b32_e32 v0, 0 2594; GFX1164-NEXT: v_mov_b32_e32 v3, s0 2595; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2596; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 2597; GFX1164-NEXT: ds_sub_u32 v0, v3 2598; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 2599; GFX1164-NEXT: buffer_gl0_inv 2600; GFX1164-NEXT: .LBB10_2: 2601; GFX1164-NEXT: s_endpgm 2602; 2603; GFX1132-LABEL: sub_i32_varying_nouse: 2604; GFX1132: ; %bb.0: ; %entry 2605; GFX1132-NEXT: v_mov_b32_e32 v1, v0 2606; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo 2607; GFX1132-NEXT: v_mov_b32_e32 v1, 0 2608; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo 2609; GFX1132-NEXT: s_or_saveexec_b32 s0, -1 2610; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 2611; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 2612; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 2613; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 2614; GFX1132-NEXT: v_mov_b32_e32 v2, v1 2615; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1 2616; GFX1132-NEXT: v_add_nc_u32_e32 v1, v1, v2 2617; 
GFX1132-NEXT: s_mov_b32 exec_lo, s0 2618; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 2619; GFX1132-NEXT: v_mov_b32_e32 v0, v1 2620; GFX1132-NEXT: s_mov_b32 s0, exec_lo 2621; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v3 2622; GFX1132-NEXT: s_cbranch_execz .LBB10_2 2623; GFX1132-NEXT: ; %bb.1: 2624; GFX1132-NEXT: v_mov_b32_e32 v3, 0 2625; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2626; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 2627; GFX1132-NEXT: ds_sub_u32 v3, v0 2628; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 2629; GFX1132-NEXT: buffer_gl0_inv 2630; GFX1132-NEXT: .LBB10_2: 2631; GFX1132-NEXT: s_endpgm 2632entry: 2633 %lane = call i32 @llvm.amdgcn.workitem.id.x() 2634 %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 %lane acq_rel 2635 ret void 2636} 2637 2638define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) { 2639; 2640; 2641; GFX7LESS-LABEL: sub_i64_constant: 2642; GFX7LESS: ; %bb.0: ; %entry 2643; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec 2644; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 2645; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 2646; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s5, v0 2647; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 2648; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 2649; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc 2650; GFX7LESS-NEXT: s_cbranch_execz .LBB11_2 2651; GFX7LESS-NEXT: ; %bb.1: 2652; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 2653; GFX7LESS-NEXT: s_mul_i32 s4, s4, 5 2654; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 2655; GFX7LESS-NEXT: v_mov_b32_e32 v0, s4 2656; GFX7LESS-NEXT: s_mov_b32 m0, -1 2657; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2658; GFX7LESS-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] 2659; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2660; GFX7LESS-NEXT: .LBB11_2: 2661; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] 2662; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2663; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v0 2664; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1 2665; GFX7LESS-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 2666; GFX7LESS-NEXT: 
v_mul_u32_u24_e32 v0, 5, v2 2667; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 2668; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4 2669; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 2670; GFX7LESS-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc 2671; GFX7LESS-NEXT: s_mov_b32 s2, -1 2672; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2673; GFX7LESS-NEXT: s_endpgm 2674; 2675; GFX8-LABEL: sub_i64_constant: 2676; GFX8: ; %bb.0: ; %entry 2677; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2678; GFX8-NEXT: s_mov_b64 s[4:5], exec 2679; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 2680; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 2681; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 2682; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 2683; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 2684; GFX8-NEXT: s_cbranch_execz .LBB11_2 2685; GFX8-NEXT: ; %bb.1: 2686; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 2687; GFX8-NEXT: s_mul_i32 s4, s4, 5 2688; GFX8-NEXT: v_mov_b32_e32 v0, s4 2689; GFX8-NEXT: v_mov_b32_e32 v1, 0 2690; GFX8-NEXT: s_mov_b32 m0, -1 2691; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2692; GFX8-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] 2693; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2694; GFX8-NEXT: .LBB11_2: 2695; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 2696; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2697; GFX8-NEXT: v_readfirstlane_b32 s2, v0 2698; GFX8-NEXT: v_readfirstlane_b32 s3, v1 2699; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v2 2700; GFX8-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 2701; GFX8-NEXT: v_mov_b32_e32 v2, s3 2702; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s2, v0 2703; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc 2704; GFX8-NEXT: s_mov_b32 s3, 0xf000 2705; GFX8-NEXT: s_mov_b32 s2, -1 2706; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2707; GFX8-NEXT: s_endpgm 2708; 2709; GFX9-LABEL: sub_i64_constant: 2710; GFX9: ; %bb.0: ; %entry 2711; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2712; GFX9-NEXT: s_mov_b64 s[4:5], exec 2713; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 2714; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 2715; GFX9-NEXT: 
v_cmp_eq_u32_e32 vcc, 0, v2 2716; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 2717; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 2718; GFX9-NEXT: s_cbranch_execz .LBB11_2 2719; GFX9-NEXT: ; %bb.1: 2720; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 2721; GFX9-NEXT: s_mul_i32 s4, s4, 5 2722; GFX9-NEXT: v_mov_b32_e32 v0, s4 2723; GFX9-NEXT: v_mov_b32_e32 v1, 0 2724; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2725; GFX9-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] 2726; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2727; GFX9-NEXT: .LBB11_2: 2728; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 2729; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2730; GFX9-NEXT: v_readfirstlane_b32 s2, v0 2731; GFX9-NEXT: v_readfirstlane_b32 s3, v1 2732; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v2 2733; GFX9-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 2734; GFX9-NEXT: v_mov_b32_e32 v2, s3 2735; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s2, v0 2736; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc 2737; GFX9-NEXT: s_mov_b32 s3, 0xf000 2738; GFX9-NEXT: s_mov_b32 s2, -1 2739; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2740; GFX9-NEXT: s_endpgm 2741; 2742; GFX1064-LABEL: sub_i64_constant: 2743; GFX1064: ; %bb.0: ; %entry 2744; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2745; GFX1064-NEXT: s_mov_b64 s[4:5], exec 2746; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 2747; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 2748; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 2749; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 2750; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc 2751; GFX1064-NEXT: s_cbranch_execz .LBB11_2 2752; GFX1064-NEXT: ; %bb.1: 2753; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 2754; GFX1064-NEXT: v_mov_b32_e32 v1, 0 2755; GFX1064-NEXT: s_mul_i32 s4, s4, 5 2756; GFX1064-NEXT: v_mov_b32_e32 v0, s4 2757; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2758; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 2759; GFX1064-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] 2760; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2761; GFX1064-NEXT: buffer_gl0_inv 2762; GFX1064-NEXT: .LBB11_2: 2763; 
GFX1064-NEXT: s_waitcnt_depctr 0xffe3 2764; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] 2765; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 2766; GFX1064-NEXT: v_mul_u32_u24_e32 v0, 5, v2 2767; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 2768; GFX1064-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 2769; GFX1064-NEXT: v_sub_co_u32 v0, vcc, s2, v0 2770; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc 2771; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 2772; GFX1064-NEXT: s_mov_b32 s2, -1 2773; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2774; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2775; GFX1064-NEXT: s_endpgm 2776; 2777; GFX1032-LABEL: sub_i64_constant: 2778; GFX1032: ; %bb.0: ; %entry 2779; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2780; GFX1032-NEXT: s_mov_b32 s3, exec_lo 2781; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 2782; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s3, 0 2783; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 2784; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo 2785; GFX1032-NEXT: s_cbranch_execz .LBB11_2 2786; GFX1032-NEXT: ; %bb.1: 2787; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 2788; GFX1032-NEXT: v_mov_b32_e32 v1, 0 2789; GFX1032-NEXT: s_mul_i32 s3, s3, 5 2790; GFX1032-NEXT: v_mov_b32_e32 v0, s3 2791; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2792; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 2793; GFX1032-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] 2794; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2795; GFX1032-NEXT: buffer_gl0_inv 2796; GFX1032-NEXT: .LBB11_2: 2797; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 2798; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 2799; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 2800; GFX1032-NEXT: v_mul_u32_u24_e32 v0, 5, v2 2801; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 2802; GFX1032-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 2803; GFX1032-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v0 2804; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo 2805; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 2806; GFX1032-NEXT: s_mov_b32 s2, -1 2807; GFX1032-NEXT: s_waitcnt 
lgkmcnt(0) 2808; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2809; GFX1032-NEXT: s_endpgm 2810; 2811; GFX1164-LABEL: sub_i64_constant: 2812; GFX1164: ; %bb.0: ; %entry 2813; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 2814; GFX1164-NEXT: s_mov_b64 s[4:5], exec 2815; GFX1164-NEXT: s_mov_b64 s[2:3], exec 2816; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 2817; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 2818; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 2819; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 2820; GFX1164-NEXT: s_cbranch_execz .LBB11_2 2821; GFX1164-NEXT: ; %bb.1: 2822; GFX1164-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 2823; GFX1164-NEXT: v_mov_b32_e32 v1, 0 2824; GFX1164-NEXT: s_mul_i32 s4, s4, 5 2825; GFX1164-NEXT: v_mov_b32_e32 v0, s4 2826; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2827; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 2828; GFX1164-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] 2829; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 2830; GFX1164-NEXT: buffer_gl0_inv 2831; GFX1164-NEXT: .LBB11_2: 2832; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] 2833; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 2834; GFX1164-NEXT: v_mul_u32_u24_e32 v0, 5, v2 2835; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 2836; GFX1164-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 2837; GFX1164-NEXT: v_sub_co_u32 v0, vcc, s2, v0 2838; GFX1164-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc 2839; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 2840; GFX1164-NEXT: s_mov_b32 s2, -1 2841; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 2842; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 2843; GFX1164-NEXT: s_endpgm 2844; 2845; GFX1132-LABEL: sub_i64_constant: 2846; GFX1132: ; %bb.0: ; %entry 2847; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 2848; GFX1132-NEXT: s_mov_b32 s3, exec_lo 2849; GFX1132-NEXT: s_mov_b32 s2, exec_lo 2850; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s3, 0 2851; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 2852; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 2853; GFX1132-NEXT: s_cbranch_execz .LBB11_2 2854; GFX1132-NEXT: ; 
%bb.1: 2855; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s3 2856; GFX1132-NEXT: v_mov_b32_e32 v1, 0 2857; GFX1132-NEXT: s_mul_i32 s3, s3, 5 2858; GFX1132-NEXT: v_mov_b32_e32 v0, s3 2859; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2860; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 2861; GFX1132-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] 2862; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 2863; GFX1132-NEXT: buffer_gl0_inv 2864; GFX1132-NEXT: .LBB11_2: 2865; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2 2866; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 2867; GFX1132-NEXT: v_mul_u32_u24_e32 v0, 5, v2 2868; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 2869; GFX1132-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 2870; GFX1132-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v0 2871; GFX1132-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo 2872; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 2873; GFX1132-NEXT: s_mov_b32 s2, -1 2874; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 2875; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 2876; GFX1132-NEXT: s_endpgm 2877entry: 2878 %old = atomicrmw sub i64 addrspace(3)* @local_var64, i64 5 acq_rel 2879 store i64 %old, i64 addrspace(1)* %out 2880 ret void 2881} 2882 2883define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 %subitive) { 2884; 2885; 2886; GFX7LESS-LABEL: sub_i64_uniform: 2887; GFX7LESS: ; %bb.0: ; %entry 2888; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec 2889; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 2890; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 2891; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s7, v0 2892; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 2893; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 2894; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc 2895; GFX7LESS-NEXT: s_cbranch_execz .LBB12_2 2896; GFX7LESS-NEXT: ; %bb.1: 2897; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 2898; GFX7LESS-NEXT: v_mov_b32_e32 v3, 0 2899; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2900; GFX7LESS-NEXT: s_mul_i32 s7, s3, s6 2901; GFX7LESS-NEXT: v_mov_b32_e32 v0, s6 2902; GFX7LESS-NEXT: 
v_mul_hi_u32 v0, s2, v0 2903; GFX7LESS-NEXT: s_mul_i32 s6, s2, s6 2904; GFX7LESS-NEXT: v_add_i32_e32 v1, vcc, s7, v0 2905; GFX7LESS-NEXT: v_mov_b32_e32 v0, s6 2906; GFX7LESS-NEXT: s_mov_b32 m0, -1 2907; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2908; GFX7LESS-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1] 2909; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2910; GFX7LESS-NEXT: .LBB12_2: 2911; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] 2912; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 2913; GFX7LESS-NEXT: s_mov_b32 s6, -1 2914; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2915; GFX7LESS-NEXT: s_mov_b32 s4, s0 2916; GFX7LESS-NEXT: s_mov_b32 s5, s1 2917; GFX7LESS-NEXT: v_readfirstlane_b32 s0, v0 2918; GFX7LESS-NEXT: v_readfirstlane_b32 s1, v1 2919; GFX7LESS-NEXT: v_mul_lo_u32 v0, s3, v2 2920; GFX7LESS-NEXT: v_mul_hi_u32 v1, s2, v2 2921; GFX7LESS-NEXT: v_mul_lo_u32 v2, s2, v2 2922; GFX7LESS-NEXT: v_add_i32_e32 v1, vcc, v1, v0 2923; GFX7LESS-NEXT: v_mov_b32_e32 v3, s1 2924; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s0, v2 2925; GFX7LESS-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc 2926; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 2927; GFX7LESS-NEXT: s_endpgm 2928; 2929; GFX8-LABEL: sub_i64_uniform: 2930; GFX8: ; %bb.0: ; %entry 2931; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 2932; GFX8-NEXT: s_mov_b64 s[6:7], exec 2933; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 2934; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 2935; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 2936; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 2937; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 2938; GFX8-NEXT: s_cbranch_execz .LBB12_2 2939; GFX8-NEXT: ; %bb.1: 2940; GFX8-NEXT: s_bcnt1_i32_b64 s8, s[6:7] 2941; GFX8-NEXT: v_mov_b32_e32 v0, s8 2942; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2943; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[6:7], s2, v0, 0 2944; GFX8-NEXT: s_mul_i32 s6, s3, s8 2945; GFX8-NEXT: v_mov_b32_e32 v3, 0 2946; GFX8-NEXT: v_add_u32_e32 v1, vcc, s6, v1 2947; GFX8-NEXT: s_mov_b32 m0, -1 2948; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2949; GFX8-NEXT: 
ds_sub_rtn_u64 v[0:1], v3, v[0:1] 2950; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2951; GFX8-NEXT: .LBB12_2: 2952; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 2953; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2954; GFX8-NEXT: s_mov_b32 s4, s0 2955; GFX8-NEXT: s_mov_b32 s5, s1 2956; GFX8-NEXT: v_mul_lo_u32 v4, s3, v2 2957; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v2, 0 2958; GFX8-NEXT: v_readfirstlane_b32 s0, v0 2959; GFX8-NEXT: v_readfirstlane_b32 s1, v1 2960; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v4 2961; GFX8-NEXT: v_mov_b32_e32 v3, s1 2962; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s0, v2 2963; GFX8-NEXT: s_mov_b32 s7, 0xf000 2964; GFX8-NEXT: s_mov_b32 s6, -1 2965; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc 2966; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 2967; GFX8-NEXT: s_endpgm 2968; 2969; GFX9-LABEL: sub_i64_uniform: 2970; GFX9: ; %bb.0: ; %entry 2971; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 2972; GFX9-NEXT: s_mov_b64 s[6:7], exec 2973; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 2974; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 2975; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 2976; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 2977; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 2978; GFX9-NEXT: s_cbranch_execz .LBB12_2 2979; GFX9-NEXT: ; %bb.1: 2980; GFX9-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 2981; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2982; GFX9-NEXT: s_mul_i32 s7, s3, s6 2983; GFX9-NEXT: s_mul_hi_u32 s8, s2, s6 2984; GFX9-NEXT: s_add_i32 s8, s8, s7 2985; GFX9-NEXT: s_mul_i32 s6, s2, s6 2986; GFX9-NEXT: v_mov_b32_e32 v0, s6 2987; GFX9-NEXT: v_mov_b32_e32 v1, s8 2988; GFX9-NEXT: v_mov_b32_e32 v3, 0 2989; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2990; GFX9-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1] 2991; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2992; GFX9-NEXT: .LBB12_2: 2993; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 2994; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2995; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], s2, v2, 0 2996; GFX9-NEXT: s_mov_b32 s4, s0 2997; GFX9-NEXT: s_mov_b32 s5, s1 2998; GFX9-NEXT: v_mad_u64_u32 v[4:5], 
s[2:3], s3, v2, v[4:5] 2999; GFX9-NEXT: v_readfirstlane_b32 s0, v0 3000; GFX9-NEXT: v_readfirstlane_b32 s1, v1 3001; GFX9-NEXT: v_mov_b32_e32 v1, v4 3002; GFX9-NEXT: v_mov_b32_e32 v2, s1 3003; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s0, v3 3004; GFX9-NEXT: s_mov_b32 s7, 0xf000 3005; GFX9-NEXT: s_mov_b32 s6, -1 3006; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc 3007; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 3008; GFX9-NEXT: s_endpgm 3009; 3010; GFX1064-LABEL: sub_i64_uniform: 3011; GFX1064: ; %bb.0: ; %entry 3012; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 3013; GFX1064-NEXT: s_mov_b64 s[6:7], exec 3014; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 3015; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 3016; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 3017; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 3018; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 3019; GFX1064-NEXT: s_cbranch_execz .LBB12_2 3020; GFX1064-NEXT: ; %bb.1: 3021; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 3022; GFX1064-NEXT: v_mov_b32_e32 v3, 0 3023; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3024; GFX1064-NEXT: s_mul_i32 s7, s3, s6 3025; GFX1064-NEXT: s_mul_hi_u32 s8, s2, s6 3026; GFX1064-NEXT: s_mul_i32 s6, s2, s6 3027; GFX1064-NEXT: s_add_i32 s8, s8, s7 3028; GFX1064-NEXT: v_mov_b32_e32 v0, s6 3029; GFX1064-NEXT: v_mov_b32_e32 v1, s8 3030; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3031; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 3032; GFX1064-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1] 3033; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3034; GFX1064-NEXT: buffer_gl0_inv 3035; GFX1064-NEXT: .LBB12_2: 3036; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 3037; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 3038; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3039; GFX1064-NEXT: v_mad_u64_u32 v[3:4], s[4:5], s2, v2, 0 3040; GFX1064-NEXT: v_readfirstlane_b32 s4, v1 3041; GFX1064-NEXT: v_mad_u64_u32 v[4:5], s[2:3], s3, v2, v[4:5] 3042; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 3043; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 3044; GFX1064-NEXT: 
v_sub_co_u32 v0, vcc, s2, v3 3045; GFX1064-NEXT: v_mov_b32_e32 v1, v4 3046; GFX1064-NEXT: s_mov_b32 s2, -1 3047; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s4, v1, vcc 3048; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 3049; GFX1064-NEXT: s_endpgm 3050; 3051; GFX1032-LABEL: sub_i64_uniform: 3052; GFX1032: ; %bb.0: ; %entry 3053; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 3054; GFX1032-NEXT: s_mov_b32 s5, exec_lo 3055; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 3056; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0 3057; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 3058; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo 3059; GFX1032-NEXT: s_cbranch_execz .LBB12_2 3060; GFX1032-NEXT: ; %bb.1: 3061; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s5 3062; GFX1032-NEXT: v_mov_b32_e32 v3, 0 3063; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3064; GFX1032-NEXT: s_mul_i32 s6, s3, s5 3065; GFX1032-NEXT: s_mul_hi_u32 s7, s2, s5 3066; GFX1032-NEXT: s_mul_i32 s5, s2, s5 3067; GFX1032-NEXT: s_add_i32 s7, s7, s6 3068; GFX1032-NEXT: v_mov_b32_e32 v0, s5 3069; GFX1032-NEXT: v_mov_b32_e32 v1, s7 3070; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3071; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 3072; GFX1032-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1] 3073; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3074; GFX1032-NEXT: buffer_gl0_inv 3075; GFX1032-NEXT: .LBB12_2: 3076; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 3077; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 3078; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3079; GFX1032-NEXT: v_mad_u64_u32 v[3:4], s2, s2, v2, 0 3080; GFX1032-NEXT: v_readfirstlane_b32 s4, v1 3081; GFX1032-NEXT: v_mad_u64_u32 v[4:5], s2, s3, v2, v[4:5] 3082; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 3083; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 3084; GFX1032-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v3 3085; GFX1032-NEXT: v_mov_b32_e32 v1, v4 3086; GFX1032-NEXT: s_mov_b32 s2, -1 3087; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s4, v1, vcc_lo 3088; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 3089; 
GFX1032-NEXT: s_endpgm 3090; 3091; GFX1164-LABEL: sub_i64_uniform: 3092; GFX1164: ; %bb.0: ; %entry 3093; GFX1164-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 3094; GFX1164-NEXT: s_mov_b64 s[6:7], exec 3095; GFX1164-NEXT: s_mov_b64 s[4:5], exec 3096; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 3097; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 3098; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 3099; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 3100; GFX1164-NEXT: s_cbranch_execz .LBB12_2 3101; GFX1164-NEXT: ; %bb.1: 3102; GFX1164-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 3103; GFX1164-NEXT: v_mov_b32_e32 v3, 0 3104; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 3105; GFX1164-NEXT: s_mul_i32 s7, s3, s6 3106; GFX1164-NEXT: s_mul_hi_u32 s8, s2, s6 3107; GFX1164-NEXT: s_mul_i32 s6, s2, s6 3108; GFX1164-NEXT: s_add_i32 s8, s8, s7 3109; GFX1164-NEXT: v_mov_b32_e32 v0, s6 3110; GFX1164-NEXT: v_mov_b32_e32 v1, s8 3111; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3112; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 3113; GFX1164-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1] 3114; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 3115; GFX1164-NEXT: buffer_gl0_inv 3116; GFX1164-NEXT: .LBB12_2: 3117; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] 3118; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 3119; GFX1164-NEXT: v_mad_u64_u32 v[3:4], s[4:5], s2, v2, 0 3120; GFX1164-NEXT: v_readfirstlane_b32 s4, v1 3121; GFX1164-NEXT: v_mad_u64_u32 v[5:6], s[2:3], s3, v2, v[4:5] 3122; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 3123; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 3124; GFX1164-NEXT: v_sub_co_u32 v0, vcc, s2, v3 3125; GFX1164-NEXT: v_mov_b32_e32 v1, v5 3126; GFX1164-NEXT: s_mov_b32 s2, -1 3127; GFX1164-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s4, v1, vcc 3128; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 3129; GFX1164-NEXT: s_endpgm 3130; 3131; GFX1132-LABEL: sub_i64_uniform: 3132; GFX1132: ; %bb.0: ; %entry 3133; GFX1132-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 3134; GFX1132-NEXT: s_mov_b32 s5, exec_lo 3135; GFX1132-NEXT: s_mov_b32 s4, exec_lo 3136; 
GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0 3137; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 3138; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 3139; GFX1132-NEXT: s_cbranch_execz .LBB12_2 3140; GFX1132-NEXT: ; %bb.1: 3141; GFX1132-NEXT: s_bcnt1_i32_b32 s5, s5 3142; GFX1132-NEXT: v_mov_b32_e32 v3, 0 3143; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 3144; GFX1132-NEXT: s_mul_i32 s6, s3, s5 3145; GFX1132-NEXT: s_mul_hi_u32 s7, s2, s5 3146; GFX1132-NEXT: s_mul_i32 s5, s2, s5 3147; GFX1132-NEXT: s_add_i32 s7, s7, s6 3148; GFX1132-NEXT: v_mov_b32_e32 v0, s5 3149; GFX1132-NEXT: v_mov_b32_e32 v1, s7 3150; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3151; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 3152; GFX1132-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1] 3153; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 3154; GFX1132-NEXT: buffer_gl0_inv 3155; GFX1132-NEXT: .LBB12_2: 3156; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s4 3157; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 3158; GFX1132-NEXT: v_mad_u64_u32 v[3:4], s2, s2, v2, 0 3159; GFX1132-NEXT: v_readfirstlane_b32 s4, v1 3160; GFX1132-NEXT: v_mad_u64_u32 v[5:6], s2, s3, v2, v[4:5] 3161; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 3162; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 3163; GFX1132-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v3 3164; GFX1132-NEXT: v_mov_b32_e32 v1, v5 3165; GFX1132-NEXT: s_mov_b32 s2, -1 3166; GFX1132-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s4, v1, vcc_lo 3167; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 3168; GFX1132-NEXT: s_endpgm 3169entry: 3170 %old = atomicrmw sub i64 addrspace(3)* @local_var64, i64 %subitive acq_rel 3171 store i64 %old, i64 addrspace(1)* %out 3172 ret void 3173} 3174 ; Test sub_i64_varying -- atomicrmw sub (acq_rel) on the addrspace(3) global @local_var64 with a per-lane operand (zext of workitem.id.x); the returned old value is stored to %out. ; Every check prefix in this function expects ds_sub_rtn_u64 to be issued directly, with no mbcnt or DPP sequence around it. ; The assertions below are autogenerated by utils/update_llc_test_checks.py (see the NOTE at the top of the file); regenerate them rather than editing by hand. 3175define amdgpu_kernel void @sub_i64_varying(i64 addrspace(1)* %out) { 3176; 3177; 3178; GFX7LESS-LABEL: sub_i64_varying: 3179; GFX7LESS: ; %bb.0: ; %entry 3180; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 3181; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 3182; GFX7LESS-NEXT: s_mov_b32 m0, -1 3183; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3184; GFX7LESS-NEXT: 
ds_sub_rtn_u64 v[0:1], v1, v[0:1] 3185; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3186; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 3187; GFX7LESS-NEXT: s_mov_b32 s2, -1 3188; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 3189; GFX7LESS-NEXT: s_endpgm 3190; 3191; GFX8-LABEL: sub_i64_varying: 3192; GFX8: ; %bb.0: ; %entry 3193; GFX8-NEXT: v_mov_b32_e32 v1, 0 3194; GFX8-NEXT: s_mov_b32 m0, -1 3195; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3196; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3197; GFX8-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] 3198; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3199; GFX8-NEXT: s_mov_b32 s3, 0xf000 3200; GFX8-NEXT: s_mov_b32 s2, -1 3201; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 3202; GFX8-NEXT: s_endpgm 3203; 3204; GFX9-LABEL: sub_i64_varying: 3205; GFX9: ; %bb.0: ; %entry 3206; GFX9-NEXT: v_mov_b32_e32 v1, 0 3207; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3208; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3209; GFX9-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] 3210; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3211; GFX9-NEXT: s_mov_b32 s3, 0xf000 3212; GFX9-NEXT: s_mov_b32 s2, -1 3213; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 3214; GFX9-NEXT: s_endpgm 3215; 3216; GFX10-LABEL: sub_i64_varying: 3217; GFX10: ; %bb.0: ; %entry 3218; GFX10-NEXT: v_mov_b32_e32 v1, 0 3219; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3220; GFX10-NEXT: s_mov_b32 s3, 0x31016000 3221; GFX10-NEXT: s_mov_b32 s2, -1 3222; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3223; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 3224; GFX10-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] 3225; GFX10-NEXT: s_waitcnt lgkmcnt(0) 3226; GFX10-NEXT: buffer_gl0_inv 3227; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 3228; GFX10-NEXT: s_endpgm 3229; 3230; GFX11-LABEL: sub_i64_varying: 3231; GFX11: ; %bb.0: ; %entry 3232; GFX11-NEXT: v_mov_b32_e32 v1, 0 3233; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 3234; GFX11-NEXT: s_mov_b32 s3, 0x31016000 3235; GFX11-NEXT: s_mov_b32 s2, -1 3236; GFX11-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) 3237; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 3238; GFX11-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] 3239; GFX11-NEXT: s_waitcnt lgkmcnt(0) 3240; GFX11-NEXT: buffer_gl0_inv 3241; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 3242; GFX11-NEXT: s_endpgm 3243entry: 3244 %lane = call i32 @llvm.amdgcn.workitem.id.x() 3245 %zext = zext i32 %lane to i64 3246 %old = atomicrmw sub i64 addrspace(3)* @local_var64, i64 %zext acq_rel 3247 store i64 %old, i64 addrspace(1)* %out 3248 ret void 3249} 3250 ; Test and_i32_varying -- atomicrmw and (acq_rel) on the addrspace(3) global @local_var32 with workitem.id.x as the per-lane operand; the returned old value is stored to %out. ; The GFX7LESS checks expect ds_and_rtn_b32 to be issued directly. ; The GFX8, GFX9, GFX10xx and GFX11xx checks expect a v_and_b32_dpp sequence over a copy of the operand (lanes selected by the inverted exec mask are first written with -1), one ds_and_rtn_b32 executed only where the mbcnt result is 0, and a final v_and_b32 that combines the v_readfirstlane of the returned value with a per-lane value produced by that sequence. ; The assertions below are autogenerated by utils/update_llc_test_checks.py; regenerate them rather than editing by hand. 3251define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) { 3252; 3253; 3254; GFX7LESS-LABEL: and_i32_varying: 3255; GFX7LESS: ; %bb.0: ; %entry 3256; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 3257; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 3258; GFX7LESS-NEXT: s_mov_b32 m0, -1 3259; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3260; GFX7LESS-NEXT: ds_and_rtn_b32 v0, v1, v0 3261; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3262; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 3263; GFX7LESS-NEXT: s_mov_b32 s2, -1 3264; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 3265; GFX7LESS-NEXT: s_endpgm 3266; 3267; GFX8-LABEL: and_i32_varying: 3268; GFX8: ; %bb.0: ; %entry 3269; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3270; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 3271; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 3272; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 3273; GFX8-NEXT: v_mov_b32_e32 v1, -1 3274; GFX8-NEXT: s_mov_b64 exec, s[2:3] 3275; GFX8-NEXT: v_mov_b32_e32 v2, v0 3276; GFX8-NEXT: s_not_b64 exec, exec 3277; GFX8-NEXT: v_mov_b32_e32 v2, -1 3278; GFX8-NEXT: s_not_b64 exec, exec 3279; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 3280; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3281; GFX8-NEXT: s_nop 1 3282; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 3283; GFX8-NEXT: s_nop 1 3284; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 3285; GFX8-NEXT: s_nop 1 3286; GFX8-NEXT: v_and_b32_dpp 
v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 3287; GFX8-NEXT: s_nop 1 3288; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 3289; GFX8-NEXT: s_nop 1 3290; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 3291; GFX8-NEXT: v_readlane_b32 s4, v2, 63 3292; GFX8-NEXT: s_nop 0 3293; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 3294; GFX8-NEXT: s_mov_b64 exec, s[2:3] 3295; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 3296; GFX8-NEXT: ; implicit-def: $vgpr0 3297; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 3298; GFX8-NEXT: s_cbranch_execz .LBB14_2 3299; GFX8-NEXT: ; %bb.1: 3300; GFX8-NEXT: v_mov_b32_e32 v0, 0 3301; GFX8-NEXT: v_mov_b32_e32 v3, s4 3302; GFX8-NEXT: s_mov_b32 m0, -1 3303; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3304; GFX8-NEXT: ds_and_rtn_b32 v0, v0, v3 3305; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3306; GFX8-NEXT: .LBB14_2: 3307; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 3308; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3309; GFX8-NEXT: v_readfirstlane_b32 s2, v0 3310; GFX8-NEXT: v_mov_b32_e32 v0, v1 3311; GFX8-NEXT: v_and_b32_e32 v0, s2, v0 3312; GFX8-NEXT: s_mov_b32 s3, 0xf000 3313; GFX8-NEXT: s_mov_b32 s2, -1 3314; GFX8-NEXT: s_nop 0 3315; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 3316; GFX8-NEXT: s_endpgm 3317; 3318; GFX9-LABEL: and_i32_varying: 3319; GFX9: ; %bb.0: ; %entry 3320; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3321; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 3322; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 3323; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 3324; GFX9-NEXT: v_mov_b32_e32 v1, -1 3325; GFX9-NEXT: s_mov_b64 exec, s[2:3] 3326; GFX9-NEXT: v_mov_b32_e32 v2, v0 3327; GFX9-NEXT: s_not_b64 exec, exec 3328; GFX9-NEXT: v_mov_b32_e32 v2, -1 3329; GFX9-NEXT: s_not_b64 exec, exec 3330; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 3331; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3332; GFX9-NEXT: s_nop 1 3333; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 
row_mask:0xf bank_mask:0xf 3334; GFX9-NEXT: s_nop 1 3335; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 3336; GFX9-NEXT: s_nop 1 3337; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 3338; GFX9-NEXT: s_nop 1 3339; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 3340; GFX9-NEXT: s_nop 1 3341; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 3342; GFX9-NEXT: v_readlane_b32 s4, v2, 63 3343; GFX9-NEXT: s_nop 0 3344; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 3345; GFX9-NEXT: s_mov_b64 exec, s[2:3] 3346; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 3347; GFX9-NEXT: ; implicit-def: $vgpr0 3348; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 3349; GFX9-NEXT: s_cbranch_execz .LBB14_2 3350; GFX9-NEXT: ; %bb.1: 3351; GFX9-NEXT: v_mov_b32_e32 v0, 0 3352; GFX9-NEXT: v_mov_b32_e32 v3, s4 3353; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3354; GFX9-NEXT: ds_and_rtn_b32 v0, v0, v3 3355; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3356; GFX9-NEXT: .LBB14_2: 3357; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 3358; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3359; GFX9-NEXT: v_readfirstlane_b32 s2, v0 3360; GFX9-NEXT: v_mov_b32_e32 v0, v1 3361; GFX9-NEXT: v_and_b32_e32 v0, s2, v0 3362; GFX9-NEXT: s_mov_b32 s3, 0xf000 3363; GFX9-NEXT: s_mov_b32 s2, -1 3364; GFX9-NEXT: s_nop 0 3365; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 3366; GFX9-NEXT: s_endpgm 3367; 3368; GFX1064-LABEL: and_i32_varying: 3369; GFX1064: ; %bb.0: ; %entry 3370; GFX1064-NEXT: v_mov_b32_e32 v1, v0 3371; GFX1064-NEXT: s_not_b64 exec, exec 3372; GFX1064-NEXT: v_mov_b32_e32 v1, -1 3373; GFX1064-NEXT: s_not_b64 exec, exec 3374; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 3375; GFX1064-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 3376; GFX1064-NEXT: v_mov_b32_e32 v3, -1 3377; GFX1064-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 3378; GFX1064-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 
row_mask:0xf bank_mask:0xf 3379; GFX1064-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 3380; GFX1064-NEXT: v_mov_b32_e32 v2, v1 3381; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 3382; GFX1064-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 3383; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 3384; GFX1064-NEXT: v_mov_b32_e32 v2, s4 3385; GFX1064-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 3386; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 3387; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 3388; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 3389; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3390; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 3391; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 3392; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 3393; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 3394; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3395; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 3396; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 3397; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 3398; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 3399; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 3400; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 3401; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 3402; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 3403; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 3404; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3405; GFX1064-NEXT: s_mov_b32 s2, -1 3406; GFX1064-NEXT: ; implicit-def: $vgpr0 3407; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 3408; GFX1064-NEXT: s_cbranch_execz .LBB14_2 3409; GFX1064-NEXT: ; %bb.1: 3410; GFX1064-NEXT: v_mov_b32_e32 v0, 0 3411; GFX1064-NEXT: v_mov_b32_e32 v4, s7 3412; GFX1064-NEXT: s_mov_b32 s3, s7 3413; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3414; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 3415; GFX1064-NEXT: ds_and_rtn_b32 v0, v0, v4 3416; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3417; GFX1064-NEXT: buffer_gl0_inv 3418; GFX1064-NEXT: .LBB14_2: 3419; GFX1064-NEXT: 
s_waitcnt_depctr 0xffe3 3420; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 3421; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 3422; GFX1064-NEXT: v_mov_b32_e32 v0, v3 3423; GFX1064-NEXT: v_and_b32_e32 v0, s3, v0 3424; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 3425; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3426; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 3427; GFX1064-NEXT: s_endpgm 3428; 3429; GFX1032-LABEL: and_i32_varying: 3430; GFX1032: ; %bb.0: ; %entry 3431; GFX1032-NEXT: v_mov_b32_e32 v1, v0 3432; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 3433; GFX1032-NEXT: v_mov_b32_e32 v1, -1 3434; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 3435; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3436; GFX1032-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 3437; GFX1032-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 3438; GFX1032-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 3439; GFX1032-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 3440; GFX1032-NEXT: v_mov_b32_e32 v2, v1 3441; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 3442; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3443; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3444; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3445; GFX1032-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 3446; GFX1032-NEXT: v_mov_b32_e32 v3, -1 3447; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 3448; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 3449; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 3450; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3451; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3452; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3453; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 3454; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3455; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 3456; GFX1032-NEXT: s_mov_b32 s2, -1 3457; GFX1032-NEXT: ; implicit-def: $vgpr0 3458; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 3459; GFX1032-NEXT: s_cbranch_execz .LBB14_2 
3460; GFX1032-NEXT: ; %bb.1: 3461; GFX1032-NEXT: v_mov_b32_e32 v0, 0 3462; GFX1032-NEXT: v_mov_b32_e32 v4, s4 3463; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3464; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 3465; GFX1032-NEXT: ds_and_rtn_b32 v0, v0, v4 3466; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3467; GFX1032-NEXT: buffer_gl0_inv 3468; GFX1032-NEXT: .LBB14_2: 3469; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 3470; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 3471; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 3472; GFX1032-NEXT: v_mov_b32_e32 v0, v3 3473; GFX1032-NEXT: v_and_b32_e32 v0, s3, v0 3474; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 3475; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3476; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 3477; GFX1032-NEXT: s_endpgm 3478; 3479; GFX1164-LABEL: and_i32_varying: 3480; GFX1164: ; %bb.0: ; %entry 3481; GFX1164-NEXT: v_mov_b32_e32 v1, v0 3482; GFX1164-NEXT: s_not_b64 exec, exec 3483; GFX1164-NEXT: v_mov_b32_e32 v1, -1 3484; GFX1164-NEXT: s_not_b64 exec, exec 3485; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 3486; GFX1164-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 3487; GFX1164-NEXT: v_mov_b32_e32 v3, -1 3488; GFX1164-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 3489; GFX1164-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 3490; GFX1164-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 3491; GFX1164-NEXT: v_mov_b32_e32 v2, v1 3492; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1 3493; GFX1164-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 3494; GFX1164-NEXT: v_readlane_b32 s4, v1, 31 3495; GFX1164-NEXT: v_mov_b32_e32 v2, s4 3496; GFX1164-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 3497; GFX1164-NEXT: v_readlane_b32 s4, v1, 15 3498; GFX1164-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 3499; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 3500; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 
0x24 3501; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 3502; GFX1164-NEXT: v_readlane_b32 s5, v1, 31 3503; GFX1164-NEXT: v_writelane_b32 v3, s4, 16 3504; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 3505; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3506; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 3507; GFX1164-NEXT: v_readlane_b32 s7, v1, 63 3508; GFX1164-NEXT: v_readlane_b32 s6, v1, 47 3509; GFX1164-NEXT: v_writelane_b32 v3, s5, 32 3510; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 3511; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 3512; GFX1164-NEXT: s_or_saveexec_b64 s[4:5], -1 3513; GFX1164-NEXT: v_writelane_b32 v3, s6, 48 3514; GFX1164-NEXT: s_mov_b64 exec, s[4:5] 3515; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3516; GFX1164-NEXT: s_mov_b32 s2, -1 3517; GFX1164-NEXT: ; implicit-def: $vgpr0 3518; GFX1164-NEXT: s_and_saveexec_b64 s[4:5], vcc 3519; GFX1164-NEXT: s_cbranch_execz .LBB14_2 3520; GFX1164-NEXT: ; %bb.1: 3521; GFX1164-NEXT: v_mov_b32_e32 v0, 0 3522; GFX1164-NEXT: v_mov_b32_e32 v4, s7 3523; GFX1164-NEXT: s_mov_b32 s3, s7 3524; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3525; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 3526; GFX1164-NEXT: ds_and_rtn_b32 v0, v0, v4 3527; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 3528; GFX1164-NEXT: buffer_gl0_inv 3529; GFX1164-NEXT: .LBB14_2: 3530; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] 3531; GFX1164-NEXT: v_readfirstlane_b32 s3, v0 3532; GFX1164-NEXT: v_mov_b32_e32 v0, v3 3533; GFX1164-NEXT: v_and_b32_e32 v0, s3, v0 3534; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 3535; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 3536; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 3537; GFX1164-NEXT: s_endpgm 3538; 3539; GFX1132-LABEL: and_i32_varying: 3540; GFX1132: ; %bb.0: ; %entry 3541; GFX1132-NEXT: v_mov_b32_e32 v1, v0 3542; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo 3543; GFX1132-NEXT: v_mov_b32_e32 v1, -1 3544; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo 3545; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 3546; GFX1132-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 
row_mask:0xf bank_mask:0xf 3547; GFX1132-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 3548; GFX1132-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 3549; GFX1132-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 3550; GFX1132-NEXT: v_mov_b32_e32 v2, v1 3551; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1 3552; GFX1132-NEXT: s_mov_b32 exec_lo, s2 3553; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 3554; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 3555; GFX1132-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 3556; GFX1132-NEXT: v_mov_b32_e32 v3, -1 3557; GFX1132-NEXT: v_readlane_b32 s3, v1, 15 3558; GFX1132-NEXT: v_readlane_b32 s4, v1, 31 3559; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 3560; GFX1132-NEXT: s_mov_b32 exec_lo, s2 3561; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3562; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 3563; GFX1132-NEXT: v_writelane_b32 v3, s3, 16 3564; GFX1132-NEXT: s_mov_b32 exec_lo, s2 3565; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 3566; GFX1132-NEXT: s_mov_b32 s2, -1 3567; GFX1132-NEXT: ; implicit-def: $vgpr0 3568; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo 3569; GFX1132-NEXT: s_cbranch_execz .LBB14_2 3570; GFX1132-NEXT: ; %bb.1: 3571; GFX1132-NEXT: v_mov_b32_e32 v0, 0 3572; GFX1132-NEXT: v_mov_b32_e32 v4, s4 3573; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3574; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 3575; GFX1132-NEXT: ds_and_rtn_b32 v0, v0, v4 3576; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 3577; GFX1132-NEXT: buffer_gl0_inv 3578; GFX1132-NEXT: .LBB14_2: 3579; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 3580; GFX1132-NEXT: v_readfirstlane_b32 s3, v0 3581; GFX1132-NEXT: v_mov_b32_e32 v0, v3 3582; GFX1132-NEXT: v_and_b32_e32 v0, s3, v0 3583; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 3584; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 3585; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 3586; GFX1132-NEXT: s_endpgm 3587entry: 3588 %lane 
= call i32 @llvm.amdgcn.workitem.id.x() 3589 %old = atomicrmw and i32 addrspace(3)* @local_var32, i32 %lane acq_rel 3590 store i32 %old, i32 addrspace(1)* %out 3591 ret void 3592} 3593 ; Test or_i32_varying -- atomicrmw or (acq_rel) on the addrspace(3) global @local_var32 with workitem.id.x as the per-lane operand; the returned old value is stored to %out. ; The GFX7LESS checks expect ds_or_rtn_b32 to be issued directly. ; The GFX8, GFX9, GFX10xx and GFX11xx checks expect a v_or_b32_dpp sequence over a copy of the operand (lanes selected by the inverted exec mask are first written with 0), one ds_or_rtn_b32 executed only where the mbcnt result is 0, and a final v_or_b32 that combines the v_readfirstlane of the returned value with a per-lane value produced by that sequence. ; The assertions below are autogenerated by utils/update_llc_test_checks.py; regenerate them rather than editing by hand. 3594define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) { 3595; 3596; 3597; GFX7LESS-LABEL: or_i32_varying: 3598; GFX7LESS: ; %bb.0: ; %entry 3599; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 3600; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 3601; GFX7LESS-NEXT: s_mov_b32 m0, -1 3602; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3603; GFX7LESS-NEXT: ds_or_rtn_b32 v0, v1, v0 3604; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3605; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 3606; GFX7LESS-NEXT: s_mov_b32 s2, -1 3607; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 3608; GFX7LESS-NEXT: s_endpgm 3609; 3610; GFX8-LABEL: or_i32_varying: 3611; GFX8: ; %bb.0: ; %entry 3612; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3613; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 3614; GFX8-NEXT: v_mov_b32_e32 v1, 0 3615; GFX8-NEXT: s_mov_b64 exec, s[2:3] 3616; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 3617; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 3618; GFX8-NEXT: v_mov_b32_e32 v2, v0 3619; GFX8-NEXT: s_not_b64 exec, exec 3620; GFX8-NEXT: v_mov_b32_e32 v2, 0 3621; GFX8-NEXT: s_not_b64 exec, exec 3622; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 3623; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 3624; GFX8-NEXT: s_nop 1 3625; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 3626; GFX8-NEXT: s_nop 1 3627; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 3628; GFX8-NEXT: s_nop 1 3629; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 3630; GFX8-NEXT: s_nop 1 3631; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 3632; GFX8-NEXT: s_nop 1 3633; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 3634; GFX8-NEXT: 
v_readlane_b32 s4, v2, 63 3635; GFX8-NEXT: s_nop 0 3636; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 3637; GFX8-NEXT: s_mov_b64 exec, s[2:3] 3638; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 3639; GFX8-NEXT: ; implicit-def: $vgpr0 3640; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 3641; GFX8-NEXT: s_cbranch_execz .LBB15_2 3642; GFX8-NEXT: ; %bb.1: 3643; GFX8-NEXT: v_mov_b32_e32 v0, 0 3644; GFX8-NEXT: v_mov_b32_e32 v3, s4 3645; GFX8-NEXT: s_mov_b32 m0, -1 3646; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3647; GFX8-NEXT: ds_or_rtn_b32 v0, v0, v3 3648; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3649; GFX8-NEXT: .LBB15_2: 3650; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 3651; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3652; GFX8-NEXT: v_readfirstlane_b32 s2, v0 3653; GFX8-NEXT: v_mov_b32_e32 v0, v1 3654; GFX8-NEXT: v_or_b32_e32 v0, s2, v0 3655; GFX8-NEXT: s_mov_b32 s3, 0xf000 3656; GFX8-NEXT: s_mov_b32 s2, -1 3657; GFX8-NEXT: s_nop 0 3658; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 3659; GFX8-NEXT: s_endpgm 3660; 3661; GFX9-LABEL: or_i32_varying: 3662; GFX9: ; %bb.0: ; %entry 3663; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3664; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 3665; GFX9-NEXT: v_mov_b32_e32 v1, 0 3666; GFX9-NEXT: s_mov_b64 exec, s[2:3] 3667; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 3668; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 3669; GFX9-NEXT: v_mov_b32_e32 v2, v0 3670; GFX9-NEXT: s_not_b64 exec, exec 3671; GFX9-NEXT: v_mov_b32_e32 v2, 0 3672; GFX9-NEXT: s_not_b64 exec, exec 3673; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 3674; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 3675; GFX9-NEXT: s_nop 1 3676; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 3677; GFX9-NEXT: s_nop 1 3678; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 3679; GFX9-NEXT: s_nop 1 3680; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 3681; 
GFX9-NEXT: s_nop 1 3682; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 3683; GFX9-NEXT: s_nop 1 3684; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 3685; GFX9-NEXT: v_readlane_b32 s4, v2, 63 3686; GFX9-NEXT: s_nop 0 3687; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 3688; GFX9-NEXT: s_mov_b64 exec, s[2:3] 3689; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 3690; GFX9-NEXT: ; implicit-def: $vgpr0 3691; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 3692; GFX9-NEXT: s_cbranch_execz .LBB15_2 3693; GFX9-NEXT: ; %bb.1: 3694; GFX9-NEXT: v_mov_b32_e32 v0, 0 3695; GFX9-NEXT: v_mov_b32_e32 v3, s4 3696; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3697; GFX9-NEXT: ds_or_rtn_b32 v0, v0, v3 3698; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3699; GFX9-NEXT: .LBB15_2: 3700; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 3701; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3702; GFX9-NEXT: v_readfirstlane_b32 s2, v0 3703; GFX9-NEXT: v_mov_b32_e32 v0, v1 3704; GFX9-NEXT: v_or_b32_e32 v0, s2, v0 3705; GFX9-NEXT: s_mov_b32 s3, 0xf000 3706; GFX9-NEXT: s_mov_b32 s2, -1 3707; GFX9-NEXT: s_nop 0 3708; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 3709; GFX9-NEXT: s_endpgm 3710; 3711; GFX1064-LABEL: or_i32_varying: 3712; GFX1064: ; %bb.0: ; %entry 3713; GFX1064-NEXT: v_mov_b32_e32 v1, v0 3714; GFX1064-NEXT: s_not_b64 exec, exec 3715; GFX1064-NEXT: v_mov_b32_e32 v1, 0 3716; GFX1064-NEXT: s_not_b64 exec, exec 3717; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 3718; GFX1064-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 3719; GFX1064-NEXT: v_mov_b32_e32 v3, 0 3720; GFX1064-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 3721; GFX1064-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 3722; GFX1064-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 3723; GFX1064-NEXT: v_mov_b32_e32 v2, v1 3724; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 3725; 
GFX1064-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 3726; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 3727; GFX1064-NEXT: v_mov_b32_e32 v2, s4 3728; GFX1064-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 3729; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 3730; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 3731; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 3732; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3733; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 3734; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 3735; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 3736; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 3737; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3738; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 3739; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 3740; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 3741; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 3742; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 3743; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 3744; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 3745; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 3746; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 3747; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3748; GFX1064-NEXT: s_mov_b32 s2, -1 3749; GFX1064-NEXT: ; implicit-def: $vgpr0 3750; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 3751; GFX1064-NEXT: s_cbranch_execz .LBB15_2 3752; GFX1064-NEXT: ; %bb.1: 3753; GFX1064-NEXT: v_mov_b32_e32 v0, 0 3754; GFX1064-NEXT: v_mov_b32_e32 v4, s7 3755; GFX1064-NEXT: s_mov_b32 s3, s7 3756; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3757; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 3758; GFX1064-NEXT: ds_or_rtn_b32 v0, v0, v4 3759; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3760; GFX1064-NEXT: buffer_gl0_inv 3761; GFX1064-NEXT: .LBB15_2: 3762; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 3763; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 3764; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 3765; GFX1064-NEXT: v_mov_b32_e32 v0, v3 3766; GFX1064-NEXT: v_or_b32_e32 v0, s3, v0 3767; 
GFX1064-NEXT: s_mov_b32 s3, 0x31016000 3768; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3769; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 3770; GFX1064-NEXT: s_endpgm 3771; 3772; GFX1032-LABEL: or_i32_varying: 3773; GFX1032: ; %bb.0: ; %entry 3774; GFX1032-NEXT: v_mov_b32_e32 v1, v0 3775; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 3776; GFX1032-NEXT: v_mov_b32_e32 v1, 0 3777; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 3778; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3779; GFX1032-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 3780; GFX1032-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 3781; GFX1032-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 3782; GFX1032-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 3783; GFX1032-NEXT: v_mov_b32_e32 v2, v1 3784; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 3785; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3786; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3787; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3788; GFX1032-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 3789; GFX1032-NEXT: v_mov_b32_e32 v3, 0 3790; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 3791; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 3792; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 3793; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3794; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3795; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3796; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 3797; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3798; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 3799; GFX1032-NEXT: s_mov_b32 s2, -1 3800; GFX1032-NEXT: ; implicit-def: $vgpr0 3801; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 3802; GFX1032-NEXT: s_cbranch_execz .LBB15_2 3803; GFX1032-NEXT: ; %bb.1: 3804; GFX1032-NEXT: v_mov_b32_e32 v0, 0 3805; GFX1032-NEXT: v_mov_b32_e32 v4, s4 3806; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3807; 
GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 3808; GFX1032-NEXT: ds_or_rtn_b32 v0, v0, v4 3809; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3810; GFX1032-NEXT: buffer_gl0_inv 3811; GFX1032-NEXT: .LBB15_2: 3812; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 3813; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 3814; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 3815; GFX1032-NEXT: v_mov_b32_e32 v0, v3 3816; GFX1032-NEXT: v_or_b32_e32 v0, s3, v0 3817; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 3818; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3819; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 3820; GFX1032-NEXT: s_endpgm 3821; 3822; GFX1164-LABEL: or_i32_varying: 3823; GFX1164: ; %bb.0: ; %entry 3824; GFX1164-NEXT: v_mov_b32_e32 v1, v0 3825; GFX1164-NEXT: s_not_b64 exec, exec 3826; GFX1164-NEXT: v_mov_b32_e32 v1, 0 3827; GFX1164-NEXT: s_not_b64 exec, exec 3828; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 3829; GFX1164-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 3830; GFX1164-NEXT: v_mov_b32_e32 v3, 0 3831; GFX1164-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 3832; GFX1164-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 3833; GFX1164-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 3834; GFX1164-NEXT: v_mov_b32_e32 v2, v1 3835; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1 3836; GFX1164-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 3837; GFX1164-NEXT: v_readlane_b32 s4, v1, 31 3838; GFX1164-NEXT: v_mov_b32_e32 v2, s4 3839; GFX1164-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 3840; GFX1164-NEXT: v_readlane_b32 s4, v1, 15 3841; GFX1164-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 3842; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 3843; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 3844; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 3845; GFX1164-NEXT: v_readlane_b32 s5, v1, 31 3846; GFX1164-NEXT: 
v_writelane_b32 v3, s4, 16 3847; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 3848; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3849; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 3850; GFX1164-NEXT: v_readlane_b32 s7, v1, 63 3851; GFX1164-NEXT: v_readlane_b32 s6, v1, 47 3852; GFX1164-NEXT: v_writelane_b32 v3, s5, 32 3853; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 3854; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 3855; GFX1164-NEXT: s_or_saveexec_b64 s[4:5], -1 3856; GFX1164-NEXT: v_writelane_b32 v3, s6, 48 3857; GFX1164-NEXT: s_mov_b64 exec, s[4:5] 3858; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3859; GFX1164-NEXT: s_mov_b32 s2, -1 3860; GFX1164-NEXT: ; implicit-def: $vgpr0 3861; GFX1164-NEXT: s_and_saveexec_b64 s[4:5], vcc 3862; GFX1164-NEXT: s_cbranch_execz .LBB15_2 3863; GFX1164-NEXT: ; %bb.1: 3864; GFX1164-NEXT: v_mov_b32_e32 v0, 0 3865; GFX1164-NEXT: v_mov_b32_e32 v4, s7 3866; GFX1164-NEXT: s_mov_b32 s3, s7 3867; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3868; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 3869; GFX1164-NEXT: ds_or_rtn_b32 v0, v0, v4 3870; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 3871; GFX1164-NEXT: buffer_gl0_inv 3872; GFX1164-NEXT: .LBB15_2: 3873; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] 3874; GFX1164-NEXT: v_readfirstlane_b32 s3, v0 3875; GFX1164-NEXT: v_mov_b32_e32 v0, v3 3876; GFX1164-NEXT: v_or_b32_e32 v0, s3, v0 3877; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 3878; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 3879; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 3880; GFX1164-NEXT: s_endpgm 3881; 3882; GFX1132-LABEL: or_i32_varying: 3883; GFX1132: ; %bb.0: ; %entry 3884; GFX1132-NEXT: v_mov_b32_e32 v1, v0 3885; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo 3886; GFX1132-NEXT: v_mov_b32_e32 v1, 0 3887; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo 3888; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 3889; GFX1132-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 3890; GFX1132-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 
bound_ctrl:1 3891; GFX1132-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 3892; GFX1132-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 3893; GFX1132-NEXT: v_mov_b32_e32 v2, v1 3894; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1 3895; GFX1132-NEXT: s_mov_b32 exec_lo, s2 3896; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 3897; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 3898; GFX1132-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 3899; GFX1132-NEXT: v_mov_b32_e32 v3, 0 3900; GFX1132-NEXT: v_readlane_b32 s3, v1, 15 3901; GFX1132-NEXT: v_readlane_b32 s4, v1, 31 3902; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 3903; GFX1132-NEXT: s_mov_b32 exec_lo, s2 3904; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3905; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 3906; GFX1132-NEXT: v_writelane_b32 v3, s3, 16 3907; GFX1132-NEXT: s_mov_b32 exec_lo, s2 3908; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 3909; GFX1132-NEXT: s_mov_b32 s2, -1 3910; GFX1132-NEXT: ; implicit-def: $vgpr0 3911; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo 3912; GFX1132-NEXT: s_cbranch_execz .LBB15_2 3913; GFX1132-NEXT: ; %bb.1: 3914; GFX1132-NEXT: v_mov_b32_e32 v0, 0 3915; GFX1132-NEXT: v_mov_b32_e32 v4, s4 3916; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3917; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 3918; GFX1132-NEXT: ds_or_rtn_b32 v0, v0, v4 3919; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 3920; GFX1132-NEXT: buffer_gl0_inv 3921; GFX1132-NEXT: .LBB15_2: 3922; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 3923; GFX1132-NEXT: v_readfirstlane_b32 s3, v0 3924; GFX1132-NEXT: v_mov_b32_e32 v0, v3 3925; GFX1132-NEXT: v_or_b32_e32 v0, s3, v0 3926; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 3927; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 3928; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 3929; GFX1132-NEXT: s_endpgm 3930entry: 3931 %lane = call i32 @llvm.amdgcn.workitem.id.x() 3932 %old = atomicrmw or i32 
addrspace(3)* @local_var32, i32 %lane acq_rel 3933 store i32 %old, i32 addrspace(1)* %out 3934 ret void 3935} 3936 3937define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) { 3938; 3939; 3940; GFX7LESS-LABEL: xor_i32_varying: 3941; GFX7LESS: ; %bb.0: ; %entry 3942; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 3943; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 3944; GFX7LESS-NEXT: s_mov_b32 m0, -1 3945; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3946; GFX7LESS-NEXT: ds_xor_rtn_b32 v0, v1, v0 3947; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3948; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 3949; GFX7LESS-NEXT: s_mov_b32 s2, -1 3950; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 3951; GFX7LESS-NEXT: s_endpgm 3952; 3953; GFX8-LABEL: xor_i32_varying: 3954; GFX8: ; %bb.0: ; %entry 3955; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3956; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 3957; GFX8-NEXT: v_mov_b32_e32 v1, 0 3958; GFX8-NEXT: s_mov_b64 exec, s[2:3] 3959; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 3960; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 3961; GFX8-NEXT: v_mov_b32_e32 v2, v0 3962; GFX8-NEXT: s_not_b64 exec, exec 3963; GFX8-NEXT: v_mov_b32_e32 v2, 0 3964; GFX8-NEXT: s_not_b64 exec, exec 3965; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 3966; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 3967; GFX8-NEXT: s_nop 1 3968; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 3969; GFX8-NEXT: s_nop 1 3970; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 3971; GFX8-NEXT: s_nop 1 3972; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 3973; GFX8-NEXT: s_nop 1 3974; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 3975; GFX8-NEXT: s_nop 1 3976; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 3977; GFX8-NEXT: v_readlane_b32 s4, v2, 63 3978; GFX8-NEXT: s_nop 0 3979; GFX8-NEXT: 
v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 3980; GFX8-NEXT: s_mov_b64 exec, s[2:3] 3981; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 3982; GFX8-NEXT: ; implicit-def: $vgpr0 3983; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 3984; GFX8-NEXT: s_cbranch_execz .LBB16_2 3985; GFX8-NEXT: ; %bb.1: 3986; GFX8-NEXT: v_mov_b32_e32 v0, 0 3987; GFX8-NEXT: v_mov_b32_e32 v3, s4 3988; GFX8-NEXT: s_mov_b32 m0, -1 3989; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3990; GFX8-NEXT: ds_xor_rtn_b32 v0, v0, v3 3991; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3992; GFX8-NEXT: .LBB16_2: 3993; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 3994; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3995; GFX8-NEXT: v_readfirstlane_b32 s2, v0 3996; GFX8-NEXT: v_mov_b32_e32 v0, v1 3997; GFX8-NEXT: v_xor_b32_e32 v0, s2, v0 3998; GFX8-NEXT: s_mov_b32 s3, 0xf000 3999; GFX8-NEXT: s_mov_b32 s2, -1 4000; GFX8-NEXT: s_nop 0 4001; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 4002; GFX8-NEXT: s_endpgm 4003; 4004; GFX9-LABEL: xor_i32_varying: 4005; GFX9: ; %bb.0: ; %entry 4006; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4007; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 4008; GFX9-NEXT: v_mov_b32_e32 v1, 0 4009; GFX9-NEXT: s_mov_b64 exec, s[2:3] 4010; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 4011; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 4012; GFX9-NEXT: v_mov_b32_e32 v2, v0 4013; GFX9-NEXT: s_not_b64 exec, exec 4014; GFX9-NEXT: v_mov_b32_e32 v2, 0 4015; GFX9-NEXT: s_not_b64 exec, exec 4016; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 4017; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 4018; GFX9-NEXT: s_nop 1 4019; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 4020; GFX9-NEXT: s_nop 1 4021; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 4022; GFX9-NEXT: s_nop 1 4023; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 4024; GFX9-NEXT: s_nop 1 4025; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 
row_bcast:15 row_mask:0xa bank_mask:0xf 4026; GFX9-NEXT: s_nop 1 4027; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 4028; GFX9-NEXT: v_readlane_b32 s4, v2, 63 4029; GFX9-NEXT: s_nop 0 4030; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 4031; GFX9-NEXT: s_mov_b64 exec, s[2:3] 4032; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 4033; GFX9-NEXT: ; implicit-def: $vgpr0 4034; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 4035; GFX9-NEXT: s_cbranch_execz .LBB16_2 4036; GFX9-NEXT: ; %bb.1: 4037; GFX9-NEXT: v_mov_b32_e32 v0, 0 4038; GFX9-NEXT: v_mov_b32_e32 v3, s4 4039; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4040; GFX9-NEXT: ds_xor_rtn_b32 v0, v0, v3 4041; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4042; GFX9-NEXT: .LBB16_2: 4043; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 4044; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4045; GFX9-NEXT: v_readfirstlane_b32 s2, v0 4046; GFX9-NEXT: v_mov_b32_e32 v0, v1 4047; GFX9-NEXT: v_xor_b32_e32 v0, s2, v0 4048; GFX9-NEXT: s_mov_b32 s3, 0xf000 4049; GFX9-NEXT: s_mov_b32 s2, -1 4050; GFX9-NEXT: s_nop 0 4051; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 4052; GFX9-NEXT: s_endpgm 4053; 4054; GFX1064-LABEL: xor_i32_varying: 4055; GFX1064: ; %bb.0: ; %entry 4056; GFX1064-NEXT: v_mov_b32_e32 v1, v0 4057; GFX1064-NEXT: s_not_b64 exec, exec 4058; GFX1064-NEXT: v_mov_b32_e32 v1, 0 4059; GFX1064-NEXT: s_not_b64 exec, exec 4060; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 4061; GFX1064-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 4062; GFX1064-NEXT: v_mov_b32_e32 v3, 0 4063; GFX1064-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 4064; GFX1064-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 4065; GFX1064-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 4066; GFX1064-NEXT: v_mov_b32_e32 v2, v1 4067; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 4068; GFX1064-NEXT: v_xor_b32_dpp v1, v2, v1 
quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 4069; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 4070; GFX1064-NEXT: v_mov_b32_e32 v2, s4 4071; GFX1064-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 4072; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 4073; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 4074; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 4075; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4076; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 4077; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 4078; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 4079; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 4080; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4081; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 4082; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 4083; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 4084; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 4085; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 4086; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 4087; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 4088; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 4089; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 4090; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4091; GFX1064-NEXT: s_mov_b32 s2, -1 4092; GFX1064-NEXT: ; implicit-def: $vgpr0 4093; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 4094; GFX1064-NEXT: s_cbranch_execz .LBB16_2 4095; GFX1064-NEXT: ; %bb.1: 4096; GFX1064-NEXT: v_mov_b32_e32 v0, 0 4097; GFX1064-NEXT: v_mov_b32_e32 v4, s7 4098; GFX1064-NEXT: s_mov_b32 s3, s7 4099; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4100; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 4101; GFX1064-NEXT: ds_xor_rtn_b32 v0, v0, v4 4102; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 4103; GFX1064-NEXT: buffer_gl0_inv 4104; GFX1064-NEXT: .LBB16_2: 4105; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 4106; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 4107; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 4108; GFX1064-NEXT: v_mov_b32_e32 v0, v3 4109; GFX1064-NEXT: v_xor_b32_e32 v0, s3, v0 4110; GFX1064-NEXT: s_mov_b32 s3, 
0x31016000 4111; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 4112; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 4113; GFX1064-NEXT: s_endpgm 4114; 4115; GFX1032-LABEL: xor_i32_varying: 4116; GFX1032: ; %bb.0: ; %entry 4117; GFX1032-NEXT: v_mov_b32_e32 v1, v0 4118; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 4119; GFX1032-NEXT: v_mov_b32_e32 v1, 0 4120; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 4121; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 4122; GFX1032-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 4123; GFX1032-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 4124; GFX1032-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 4125; GFX1032-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 4126; GFX1032-NEXT: v_mov_b32_e32 v2, v1 4127; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 4128; GFX1032-NEXT: s_mov_b32 exec_lo, s2 4129; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4130; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 4131; GFX1032-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 4132; GFX1032-NEXT: v_mov_b32_e32 v3, 0 4133; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 4134; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 4135; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 4136; GFX1032-NEXT: s_mov_b32 exec_lo, s2 4137; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4138; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 4139; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 4140; GFX1032-NEXT: s_mov_b32 exec_lo, s2 4141; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 4142; GFX1032-NEXT: s_mov_b32 s2, -1 4143; GFX1032-NEXT: ; implicit-def: $vgpr0 4144; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 4145; GFX1032-NEXT: s_cbranch_execz .LBB16_2 4146; GFX1032-NEXT: ; %bb.1: 4147; GFX1032-NEXT: v_mov_b32_e32 v0, 0 4148; GFX1032-NEXT: v_mov_b32_e32 v4, s4 4149; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4150; GFX1032-NEXT: 
s_waitcnt_vscnt null, 0x0 4151; GFX1032-NEXT: ds_xor_rtn_b32 v0, v0, v4 4152; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 4153; GFX1032-NEXT: buffer_gl0_inv 4154; GFX1032-NEXT: .LBB16_2: 4155; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 4156; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 4157; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 4158; GFX1032-NEXT: v_mov_b32_e32 v0, v3 4159; GFX1032-NEXT: v_xor_b32_e32 v0, s3, v0 4160; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 4161; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 4162; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 4163; GFX1032-NEXT: s_endpgm 4164; 4165; GFX1164-LABEL: xor_i32_varying: 4166; GFX1164: ; %bb.0: ; %entry 4167; GFX1164-NEXT: v_mov_b32_e32 v1, v0 4168; GFX1164-NEXT: s_not_b64 exec, exec 4169; GFX1164-NEXT: v_mov_b32_e32 v1, 0 4170; GFX1164-NEXT: s_not_b64 exec, exec 4171; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 4172; GFX1164-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 4173; GFX1164-NEXT: v_mov_b32_e32 v3, 0 4174; GFX1164-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 4175; GFX1164-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 4176; GFX1164-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 4177; GFX1164-NEXT: v_mov_b32_e32 v2, v1 4178; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1 4179; GFX1164-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 4180; GFX1164-NEXT: v_readlane_b32 s4, v1, 31 4181; GFX1164-NEXT: v_mov_b32_e32 v2, s4 4182; GFX1164-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 4183; GFX1164-NEXT: v_readlane_b32 s4, v1, 15 4184; GFX1164-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 4185; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 4186; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 4187; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 4188; GFX1164-NEXT: v_readlane_b32 s5, v1, 31 4189; GFX1164-NEXT: 
v_writelane_b32 v3, s4, 16 4190; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 4191; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4192; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 4193; GFX1164-NEXT: v_readlane_b32 s7, v1, 63 4194; GFX1164-NEXT: v_readlane_b32 s6, v1, 47 4195; GFX1164-NEXT: v_writelane_b32 v3, s5, 32 4196; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 4197; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 4198; GFX1164-NEXT: s_or_saveexec_b64 s[4:5], -1 4199; GFX1164-NEXT: v_writelane_b32 v3, s6, 48 4200; GFX1164-NEXT: s_mov_b64 exec, s[4:5] 4201; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4202; GFX1164-NEXT: s_mov_b32 s2, -1 4203; GFX1164-NEXT: ; implicit-def: $vgpr0 4204; GFX1164-NEXT: s_and_saveexec_b64 s[4:5], vcc 4205; GFX1164-NEXT: s_cbranch_execz .LBB16_2 4206; GFX1164-NEXT: ; %bb.1: 4207; GFX1164-NEXT: v_mov_b32_e32 v0, 0 4208; GFX1164-NEXT: v_mov_b32_e32 v4, s7 4209; GFX1164-NEXT: s_mov_b32 s3, s7 4210; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4211; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 4212; GFX1164-NEXT: ds_xor_rtn_b32 v0, v0, v4 4213; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 4214; GFX1164-NEXT: buffer_gl0_inv 4215; GFX1164-NEXT: .LBB16_2: 4216; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] 4217; GFX1164-NEXT: v_readfirstlane_b32 s3, v0 4218; GFX1164-NEXT: v_mov_b32_e32 v0, v3 4219; GFX1164-NEXT: v_xor_b32_e32 v0, s3, v0 4220; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 4221; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 4222; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 4223; GFX1164-NEXT: s_endpgm 4224; 4225; GFX1132-LABEL: xor_i32_varying: 4226; GFX1132: ; %bb.0: ; %entry 4227; GFX1132-NEXT: v_mov_b32_e32 v1, v0 4228; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo 4229; GFX1132-NEXT: v_mov_b32_e32 v1, 0 4230; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo 4231; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 4232; GFX1132-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 4233; GFX1132-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 
bound_ctrl:1 4234; GFX1132-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 4235; GFX1132-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 4236; GFX1132-NEXT: v_mov_b32_e32 v2, v1 4237; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1 4238; GFX1132-NEXT: s_mov_b32 exec_lo, s2 4239; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 4240; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 4241; GFX1132-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 4242; GFX1132-NEXT: v_mov_b32_e32 v3, 0 4243; GFX1132-NEXT: v_readlane_b32 s3, v1, 15 4244; GFX1132-NEXT: v_readlane_b32 s4, v1, 31 4245; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 4246; GFX1132-NEXT: s_mov_b32 exec_lo, s2 4247; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4248; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 4249; GFX1132-NEXT: v_writelane_b32 v3, s3, 16 4250; GFX1132-NEXT: s_mov_b32 exec_lo, s2 4251; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 4252; GFX1132-NEXT: s_mov_b32 s2, -1 4253; GFX1132-NEXT: ; implicit-def: $vgpr0 4254; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo 4255; GFX1132-NEXT: s_cbranch_execz .LBB16_2 4256; GFX1132-NEXT: ; %bb.1: 4257; GFX1132-NEXT: v_mov_b32_e32 v0, 0 4258; GFX1132-NEXT: v_mov_b32_e32 v4, s4 4259; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4260; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 4261; GFX1132-NEXT: ds_xor_rtn_b32 v0, v0, v4 4262; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 4263; GFX1132-NEXT: buffer_gl0_inv 4264; GFX1132-NEXT: .LBB16_2: 4265; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 4266; GFX1132-NEXT: v_readfirstlane_b32 s3, v0 4267; GFX1132-NEXT: v_mov_b32_e32 v0, v3 4268; GFX1132-NEXT: v_xor_b32_e32 v0, s3, v0 4269; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 4270; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 4271; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 4272; GFX1132-NEXT: s_endpgm 4273entry: 4274 %lane = call i32 @llvm.amdgcn.workitem.id.x() 4275 %old = atomicrmw xor i32 
addrspace(3)* @local_var32, i32 %lane acq_rel 4276 store i32 %old, i32 addrspace(1)* %out 4277 ret void 4278} 4279 4280define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) { 4281; 4282; 4283; GFX7LESS-LABEL: max_i32_varying: 4284; GFX7LESS: ; %bb.0: ; %entry 4285; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 4286; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 4287; GFX7LESS-NEXT: s_mov_b32 m0, -1 4288; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 4289; GFX7LESS-NEXT: ds_max_rtn_i32 v0, v1, v0 4290; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 4291; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 4292; GFX7LESS-NEXT: s_mov_b32 s2, -1 4293; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 4294; GFX7LESS-NEXT: s_endpgm 4295; 4296; GFX8-LABEL: max_i32_varying: 4297; GFX8: ; %bb.0: ; %entry 4298; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4299; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 4300; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 4301; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 4302; GFX8-NEXT: v_bfrev_b32_e32 v1, 1 4303; GFX8-NEXT: s_mov_b64 exec, s[2:3] 4304; GFX8-NEXT: v_mov_b32_e32 v2, v0 4305; GFX8-NEXT: s_not_b64 exec, exec 4306; GFX8-NEXT: v_bfrev_b32_e32 v2, 1 4307; GFX8-NEXT: s_not_b64 exec, exec 4308; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 4309; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 4310; GFX8-NEXT: s_nop 1 4311; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 4312; GFX8-NEXT: s_nop 1 4313; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 4314; GFX8-NEXT: s_nop 1 4315; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 4316; GFX8-NEXT: s_nop 1 4317; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 4318; GFX8-NEXT: s_nop 1 4319; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 4320; GFX8-NEXT: v_readlane_b32 s4, v2, 63 4321; GFX8-NEXT: s_nop 0 4322; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf 
bank_mask:0xf 4323; GFX8-NEXT: s_mov_b64 exec, s[2:3] 4324; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 4325; GFX8-NEXT: ; implicit-def: $vgpr0 4326; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 4327; GFX8-NEXT: s_cbranch_execz .LBB17_2 4328; GFX8-NEXT: ; %bb.1: 4329; GFX8-NEXT: v_mov_b32_e32 v0, 0 4330; GFX8-NEXT: v_mov_b32_e32 v3, s4 4331; GFX8-NEXT: s_mov_b32 m0, -1 4332; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4333; GFX8-NEXT: ds_max_rtn_i32 v0, v0, v3 4334; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4335; GFX8-NEXT: .LBB17_2: 4336; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 4337; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4338; GFX8-NEXT: v_readfirstlane_b32 s2, v0 4339; GFX8-NEXT: v_mov_b32_e32 v0, v1 4340; GFX8-NEXT: v_max_i32_e32 v0, s2, v0 4341; GFX8-NEXT: s_mov_b32 s3, 0xf000 4342; GFX8-NEXT: s_mov_b32 s2, -1 4343; GFX8-NEXT: s_nop 0 4344; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 4345; GFX8-NEXT: s_endpgm 4346; 4347; GFX9-LABEL: max_i32_varying: 4348; GFX9: ; %bb.0: ; %entry 4349; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4350; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 4351; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 4352; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 4353; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 4354; GFX9-NEXT: s_mov_b64 exec, s[2:3] 4355; GFX9-NEXT: v_mov_b32_e32 v2, v0 4356; GFX9-NEXT: s_not_b64 exec, exec 4357; GFX9-NEXT: v_bfrev_b32_e32 v2, 1 4358; GFX9-NEXT: s_not_b64 exec, exec 4359; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 4360; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 4361; GFX9-NEXT: s_nop 1 4362; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 4363; GFX9-NEXT: s_nop 1 4364; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 4365; GFX9-NEXT: s_nop 1 4366; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 4367; GFX9-NEXT: s_nop 1 4368; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 4369; GFX9-NEXT: s_nop 1 4370; GFX9-NEXT: v_max_i32_dpp 
v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 4371; GFX9-NEXT: v_readlane_b32 s4, v2, 63 4372; GFX9-NEXT: s_nop 0 4373; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 4374; GFX9-NEXT: s_mov_b64 exec, s[2:3] 4375; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 4376; GFX9-NEXT: ; implicit-def: $vgpr0 4377; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 4378; GFX9-NEXT: s_cbranch_execz .LBB17_2 4379; GFX9-NEXT: ; %bb.1: 4380; GFX9-NEXT: v_mov_b32_e32 v0, 0 4381; GFX9-NEXT: v_mov_b32_e32 v3, s4 4382; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4383; GFX9-NEXT: ds_max_rtn_i32 v0, v0, v3 4384; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4385; GFX9-NEXT: .LBB17_2: 4386; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 4387; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4388; GFX9-NEXT: v_readfirstlane_b32 s2, v0 4389; GFX9-NEXT: v_mov_b32_e32 v0, v1 4390; GFX9-NEXT: v_max_i32_e32 v0, s2, v0 4391; GFX9-NEXT: s_mov_b32 s3, 0xf000 4392; GFX9-NEXT: s_mov_b32 s2, -1 4393; GFX9-NEXT: s_nop 0 4394; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 4395; GFX9-NEXT: s_endpgm 4396; 4397; GFX1064-LABEL: max_i32_varying: 4398; GFX1064: ; %bb.0: ; %entry 4399; GFX1064-NEXT: v_mov_b32_e32 v1, v0 4400; GFX1064-NEXT: s_not_b64 exec, exec 4401; GFX1064-NEXT: v_bfrev_b32_e32 v1, 1 4402; GFX1064-NEXT: s_not_b64 exec, exec 4403; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 4404; GFX1064-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 4405; GFX1064-NEXT: v_bfrev_b32_e32 v3, 1 4406; GFX1064-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 4407; GFX1064-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 4408; GFX1064-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 4409; GFX1064-NEXT: v_mov_b32_e32 v2, v1 4410; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 4411; GFX1064-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 4412; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 4413; GFX1064-NEXT: v_mov_b32_e32 v2, s4 4414; GFX1064-NEXT: 
v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 4415; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 4416; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 4417; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 4418; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4419; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 4420; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 4421; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 4422; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 4423; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4424; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 4425; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 4426; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 4427; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 4428; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 4429; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 4430; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 4431; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 4432; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 4433; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4434; GFX1064-NEXT: s_mov_b32 s2, -1 4435; GFX1064-NEXT: ; implicit-def: $vgpr0 4436; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 4437; GFX1064-NEXT: s_cbranch_execz .LBB17_2 4438; GFX1064-NEXT: ; %bb.1: 4439; GFX1064-NEXT: v_mov_b32_e32 v0, 0 4440; GFX1064-NEXT: v_mov_b32_e32 v4, s7 4441; GFX1064-NEXT: s_mov_b32 s3, s7 4442; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4443; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 4444; GFX1064-NEXT: ds_max_rtn_i32 v0, v0, v4 4445; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 4446; GFX1064-NEXT: buffer_gl0_inv 4447; GFX1064-NEXT: .LBB17_2: 4448; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 4449; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 4450; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 4451; GFX1064-NEXT: v_mov_b32_e32 v0, v3 4452; GFX1064-NEXT: v_max_i32_e32 v0, s3, v0 4453; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 4454; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 4455; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 4456; GFX1064-NEXT: s_endpgm 4457; 4458; 
GFX1032-LABEL: max_i32_varying: 4459; GFX1032: ; %bb.0: ; %entry 4460; GFX1032-NEXT: v_mov_b32_e32 v1, v0 4461; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 4462; GFX1032-NEXT: v_bfrev_b32_e32 v1, 1 4463; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 4464; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 4465; GFX1032-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 4466; GFX1032-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 4467; GFX1032-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 4468; GFX1032-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 4469; GFX1032-NEXT: v_mov_b32_e32 v2, v1 4470; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 4471; GFX1032-NEXT: s_mov_b32 exec_lo, s2 4472; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4473; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 4474; GFX1032-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 4475; GFX1032-NEXT: v_bfrev_b32_e32 v3, 1 4476; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 4477; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 4478; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 4479; GFX1032-NEXT: s_mov_b32 exec_lo, s2 4480; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4481; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 4482; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 4483; GFX1032-NEXT: s_mov_b32 exec_lo, s2 4484; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 4485; GFX1032-NEXT: s_mov_b32 s2, -1 4486; GFX1032-NEXT: ; implicit-def: $vgpr0 4487; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 4488; GFX1032-NEXT: s_cbranch_execz .LBB17_2 4489; GFX1032-NEXT: ; %bb.1: 4490; GFX1032-NEXT: v_mov_b32_e32 v0, 0 4491; GFX1032-NEXT: v_mov_b32_e32 v4, s4 4492; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4493; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 4494; GFX1032-NEXT: ds_max_rtn_i32 v0, v0, v4 4495; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 4496; GFX1032-NEXT: buffer_gl0_inv 4497; GFX1032-NEXT: .LBB17_2: 4498; GFX1032-NEXT: 
s_waitcnt_depctr 0xffe3 4499; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 4500; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 4501; GFX1032-NEXT: v_mov_b32_e32 v0, v3 4502; GFX1032-NEXT: v_max_i32_e32 v0, s3, v0 4503; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 4504; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 4505; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 4506; GFX1032-NEXT: s_endpgm 4507; 4508; GFX1164-LABEL: max_i32_varying: 4509; GFX1164: ; %bb.0: ; %entry 4510; GFX1164-NEXT: v_mov_b32_e32 v1, v0 4511; GFX1164-NEXT: s_not_b64 exec, exec 4512; GFX1164-NEXT: v_bfrev_b32_e32 v1, 1 4513; GFX1164-NEXT: s_not_b64 exec, exec 4514; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 4515; GFX1164-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 4516; GFX1164-NEXT: v_bfrev_b32_e32 v3, 1 4517; GFX1164-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 4518; GFX1164-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 4519; GFX1164-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 4520; GFX1164-NEXT: v_mov_b32_e32 v2, v1 4521; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1 4522; GFX1164-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 4523; GFX1164-NEXT: v_readlane_b32 s4, v1, 31 4524; GFX1164-NEXT: v_mov_b32_e32 v2, s4 4525; GFX1164-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 4526; GFX1164-NEXT: v_readlane_b32 s4, v1, 15 4527; GFX1164-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 4528; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 4529; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 4530; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 4531; GFX1164-NEXT: v_readlane_b32 s5, v1, 31 4532; GFX1164-NEXT: v_writelane_b32 v3, s4, 16 4533; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 4534; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4535; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 4536; GFX1164-NEXT: v_readlane_b32 s7, v1, 63 4537; GFX1164-NEXT: v_readlane_b32 s6, 
v1, 47 4538; GFX1164-NEXT: v_writelane_b32 v3, s5, 32 4539; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 4540; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 4541; GFX1164-NEXT: s_or_saveexec_b64 s[4:5], -1 4542; GFX1164-NEXT: v_writelane_b32 v3, s6, 48 4543; GFX1164-NEXT: s_mov_b64 exec, s[4:5] 4544; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4545; GFX1164-NEXT: s_mov_b32 s2, -1 4546; GFX1164-NEXT: ; implicit-def: $vgpr0 4547; GFX1164-NEXT: s_and_saveexec_b64 s[4:5], vcc 4548; GFX1164-NEXT: s_cbranch_execz .LBB17_2 4549; GFX1164-NEXT: ; %bb.1: 4550; GFX1164-NEXT: v_mov_b32_e32 v0, 0 4551; GFX1164-NEXT: v_mov_b32_e32 v4, s7 4552; GFX1164-NEXT: s_mov_b32 s3, s7 4553; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4554; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 4555; GFX1164-NEXT: ds_max_rtn_i32 v0, v0, v4 4556; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 4557; GFX1164-NEXT: buffer_gl0_inv 4558; GFX1164-NEXT: .LBB17_2: 4559; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] 4560; GFX1164-NEXT: v_readfirstlane_b32 s3, v0 4561; GFX1164-NEXT: v_mov_b32_e32 v0, v3 4562; GFX1164-NEXT: v_max_i32_e32 v0, s3, v0 4563; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 4564; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 4565; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 4566; GFX1164-NEXT: s_endpgm 4567; 4568; GFX1132-LABEL: max_i32_varying: 4569; GFX1132: ; %bb.0: ; %entry 4570; GFX1132-NEXT: v_mov_b32_e32 v1, v0 4571; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo 4572; GFX1132-NEXT: v_bfrev_b32_e32 v1, 1 4573; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo 4574; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 4575; GFX1132-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 4576; GFX1132-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 4577; GFX1132-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 4578; GFX1132-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 4579; GFX1132-NEXT: v_mov_b32_e32 v2, v1 4580; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1 4581; 
GFX1132-NEXT: s_mov_b32 exec_lo, s2 4582; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 4583; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 4584; GFX1132-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 4585; GFX1132-NEXT: v_bfrev_b32_e32 v3, 1 4586; GFX1132-NEXT: v_readlane_b32 s3, v1, 15 4587; GFX1132-NEXT: v_readlane_b32 s4, v1, 31 4588; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 4589; GFX1132-NEXT: s_mov_b32 exec_lo, s2 4590; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4591; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 4592; GFX1132-NEXT: v_writelane_b32 v3, s3, 16 4593; GFX1132-NEXT: s_mov_b32 exec_lo, s2 4594; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 4595; GFX1132-NEXT: s_mov_b32 s2, -1 4596; GFX1132-NEXT: ; implicit-def: $vgpr0 4597; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo 4598; GFX1132-NEXT: s_cbranch_execz .LBB17_2 4599; GFX1132-NEXT: ; %bb.1: 4600; GFX1132-NEXT: v_mov_b32_e32 v0, 0 4601; GFX1132-NEXT: v_mov_b32_e32 v4, s4 4602; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4603; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 4604; GFX1132-NEXT: ds_max_rtn_i32 v0, v0, v4 4605; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 4606; GFX1132-NEXT: buffer_gl0_inv 4607; GFX1132-NEXT: .LBB17_2: 4608; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 4609; GFX1132-NEXT: v_readfirstlane_b32 s3, v0 4610; GFX1132-NEXT: v_mov_b32_e32 v0, v3 4611; GFX1132-NEXT: v_max_i32_e32 v0, s3, v0 4612; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 4613; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 4614; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 4615; GFX1132-NEXT: s_endpgm 4616entry: 4617 %lane = call i32 @llvm.amdgcn.workitem.id.x() 4618 %old = atomicrmw max i32 addrspace(3)* @local_var32, i32 %lane acq_rel 4619 store i32 %old, i32 addrspace(1)* %out 4620 ret void 4621} 4622 4623define amdgpu_kernel void @max_i64_constant(i64 addrspace(1)* %out) { 4624; 4625; 4626; GFX7LESS-LABEL: max_i64_constant: 4627; GFX7LESS: ; %bb.0: ; %entry 4628; GFX7LESS-NEXT: 
s_load_dwordx2 s[0:1], s[0:1], 0x9 4629; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 4630; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 4631; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4632; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 4633; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc 4634; GFX7LESS-NEXT: s_cbranch_execz .LBB18_2 4635; GFX7LESS-NEXT: ; %bb.1: 4636; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0 4637; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5 4638; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 4639; GFX7LESS-NEXT: s_mov_b32 m0, -1 4640; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 4641; GFX7LESS-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] 4642; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 4643; GFX7LESS-NEXT: .LBB18_2: 4644; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] 4645; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 4646; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 4647; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 4648; GFX7LESS-NEXT: v_bfrev_b32_e32 v1, 1 4649; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 4650; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 4651; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc 4652; GFX7LESS-NEXT: v_mov_b32_e32 v2, s5 4653; GFX7LESS-NEXT: v_mov_b32_e32 v3, s4 4654; GFX7LESS-NEXT: v_cmp_gt_i64_e32 vcc, s[4:5], v[0:1] 4655; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 4656; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 4657; GFX7LESS-NEXT: s_mov_b32 s2, -1 4658; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4659; GFX7LESS-NEXT: s_endpgm 4660; 4661; GFX8-LABEL: max_i64_constant: 4662; GFX8: ; %bb.0: ; %entry 4663; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4664; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4665; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 4666; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4667; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 4668; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 4669; GFX8-NEXT: s_cbranch_execz .LBB18_2 4670; GFX8-NEXT: ; %bb.1: 4671; GFX8-NEXT: v_mov_b32_e32 v0, 5 4672; GFX8-NEXT: v_mov_b32_e32 v2, 0 4673; 
GFX8-NEXT: v_mov_b32_e32 v1, 0 4674; GFX8-NEXT: s_mov_b32 m0, -1 4675; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4676; GFX8-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] 4677; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4678; GFX8-NEXT: .LBB18_2: 4679; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 4680; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4681; GFX8-NEXT: v_readfirstlane_b32 s2, v0 4682; GFX8-NEXT: v_bfrev_b32_e32 v0, 1 4683; GFX8-NEXT: v_readfirstlane_b32 s3, v1 4684; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc 4685; GFX8-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 4686; GFX8-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1] 4687; GFX8-NEXT: v_mov_b32_e32 v2, s3 4688; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 4689; GFX8-NEXT: v_mov_b32_e32 v2, s2 4690; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 4691; GFX8-NEXT: s_mov_b32 s3, 0xf000 4692; GFX8-NEXT: s_mov_b32 s2, -1 4693; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4694; GFX8-NEXT: s_endpgm 4695; 4696; GFX9-LABEL: max_i64_constant: 4697; GFX9: ; %bb.0: ; %entry 4698; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4699; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4700; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 4701; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4702; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 4703; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 4704; GFX9-NEXT: s_cbranch_execz .LBB18_2 4705; GFX9-NEXT: ; %bb.1: 4706; GFX9-NEXT: v_mov_b32_e32 v0, 5 4707; GFX9-NEXT: v_mov_b32_e32 v1, 0 4708; GFX9-NEXT: v_mov_b32_e32 v2, 0 4709; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4710; GFX9-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] 4711; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4712; GFX9-NEXT: .LBB18_2: 4713; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 4714; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4715; GFX9-NEXT: v_readfirstlane_b32 s2, v0 4716; GFX9-NEXT: v_bfrev_b32_e32 v0, 1 4717; GFX9-NEXT: v_readfirstlane_b32 s3, v1 4718; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc 4719; GFX9-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 4720; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1] 4721; GFX9-NEXT: 
v_mov_b32_e32 v2, s3 4722; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 4723; GFX9-NEXT: v_mov_b32_e32 v2, s2 4724; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 4725; GFX9-NEXT: s_mov_b32 s3, 0xf000 4726; GFX9-NEXT: s_mov_b32 s2, -1 4727; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4728; GFX9-NEXT: s_endpgm 4729; 4730; GFX1064-LABEL: max_i64_constant: 4731; GFX1064: ; %bb.0: ; %entry 4732; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4733; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4734; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 4735; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4736; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 4737; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc 4738; GFX1064-NEXT: s_cbranch_execz .LBB18_2 4739; GFX1064-NEXT: ; %bb.1: 4740; GFX1064-NEXT: v_mov_b32_e32 v0, 5 4741; GFX1064-NEXT: v_mov_b32_e32 v1, 0 4742; GFX1064-NEXT: v_mov_b32_e32 v2, 0 4743; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4744; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 4745; GFX1064-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] 4746; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 4747; GFX1064-NEXT: buffer_gl0_inv 4748; GFX1064-NEXT: .LBB18_2: 4749; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 4750; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] 4751; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 4752; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 4753; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc 4754; GFX1064-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 4755; GFX1064-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1] 4756; GFX1064-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc 4757; GFX1064-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc 4758; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 4759; GFX1064-NEXT: s_mov_b32 s2, -1 4760; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 4761; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4762; GFX1064-NEXT: s_endpgm 4763; 4764; GFX1032-LABEL: max_i64_constant: 4765; GFX1032: ; %bb.0: ; %entry 4766; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4767; GFX1032-NEXT: 
v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4768; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 4769; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 4770; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo 4771; GFX1032-NEXT: s_cbranch_execz .LBB18_2 4772; GFX1032-NEXT: ; %bb.1: 4773; GFX1032-NEXT: v_mov_b32_e32 v0, 5 4774; GFX1032-NEXT: v_mov_b32_e32 v1, 0 4775; GFX1032-NEXT: v_mov_b32_e32 v2, 0 4776; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4777; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 4778; GFX1032-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] 4779; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 4780; GFX1032-NEXT: buffer_gl0_inv 4781; GFX1032-NEXT: .LBB18_2: 4782; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 4783; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 4784; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 4785; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 4786; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc_lo 4787; GFX1032-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc_lo 4788; GFX1032-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[2:3], v[0:1] 4789; GFX1032-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo 4790; GFX1032-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo 4791; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 4792; GFX1032-NEXT: s_mov_b32 s2, -1 4793; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 4794; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4795; GFX1032-NEXT: s_endpgm 4796; 4797; GFX1164-LABEL: max_i64_constant: 4798; GFX1164: ; %bb.0: ; %entry 4799; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 4800; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4801; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 4802; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4803; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 4804; GFX1164-NEXT: s_and_saveexec_b64 s[2:3], vcc 4805; GFX1164-NEXT: s_cbranch_execz .LBB18_2 4806; GFX1164-NEXT: ; %bb.1: 4807; GFX1164-NEXT: v_mov_b32_e32 v0, 5 4808; GFX1164-NEXT: v_mov_b32_e32 v1, 0 4809; GFX1164-NEXT: v_mov_b32_e32 v2, 0 4810; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4811; GFX1164-NEXT: s_waitcnt_vscnt 
null, 0x0 4812; GFX1164-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] 4813; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 4814; GFX1164-NEXT: buffer_gl0_inv 4815; GFX1164-NEXT: .LBB18_2: 4816; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] 4817; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 4818; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 4819; GFX1164-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc 4820; GFX1164-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 4821; GFX1164-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1] 4822; GFX1164-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc 4823; GFX1164-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc 4824; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 4825; GFX1164-NEXT: s_mov_b32 s2, -1 4826; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 4827; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 4828; GFX1164-NEXT: s_endpgm 4829; 4830; GFX1132-LABEL: max_i64_constant: 4831; GFX1132: ; %bb.0: ; %entry 4832; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 4833; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4834; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 4835; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 4836; GFX1132-NEXT: s_and_saveexec_b32 s2, vcc_lo 4837; GFX1132-NEXT: s_cbranch_execz .LBB18_2 4838; GFX1132-NEXT: ; %bb.1: 4839; GFX1132-NEXT: v_mov_b32_e32 v0, 5 4840; GFX1132-NEXT: v_mov_b32_e32 v1, 0 4841; GFX1132-NEXT: v_mov_b32_e32 v2, 0 4842; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4843; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 4844; GFX1132-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] 4845; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 4846; GFX1132-NEXT: buffer_gl0_inv 4847; GFX1132-NEXT: .LBB18_2: 4848; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2 4849; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 4850; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 4851; GFX1132-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc_lo 4852; GFX1132-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc_lo 4853; GFX1132-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[2:3], v[0:1] 4854; GFX1132-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo 4855; GFX1132-NEXT: 
v_cndmask_b32_e64 v0, v0, s2, vcc_lo 4856; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 4857; GFX1132-NEXT: s_mov_b32 s2, -1 4858; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 4859; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 4860; GFX1132-NEXT: s_endpgm 4861entry: 4862 %old = atomicrmw max i64 addrspace(3)* @local_var64, i64 5 acq_rel 4863 store i64 %old, i64 addrspace(1)* %out 4864 ret void 4865} 4866 4867define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) { 4868; 4869; 4870; GFX7LESS-LABEL: min_i32_varying: 4871; GFX7LESS: ; %bb.0: ; %entry 4872; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 4873; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 4874; GFX7LESS-NEXT: s_mov_b32 m0, -1 4875; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 4876; GFX7LESS-NEXT: ds_min_rtn_i32 v0, v1, v0 4877; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 4878; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 4879; GFX7LESS-NEXT: s_mov_b32 s2, -1 4880; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 4881; GFX7LESS-NEXT: s_endpgm 4882; 4883; GFX8-LABEL: min_i32_varying: 4884; GFX8: ; %bb.0: ; %entry 4885; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4886; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 4887; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 4888; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 4889; GFX8-NEXT: v_bfrev_b32_e32 v1, -2 4890; GFX8-NEXT: s_mov_b64 exec, s[2:3] 4891; GFX8-NEXT: v_mov_b32_e32 v2, v0 4892; GFX8-NEXT: s_not_b64 exec, exec 4893; GFX8-NEXT: v_bfrev_b32_e32 v2, -2 4894; GFX8-NEXT: s_not_b64 exec, exec 4895; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 4896; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 4897; GFX8-NEXT: s_nop 1 4898; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 4899; GFX8-NEXT: s_nop 1 4900; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 4901; GFX8-NEXT: s_nop 1 4902; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 4903; GFX8-NEXT: s_nop 1 4904; GFX8-NEXT: v_min_i32_dpp v2, 
v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 4905; GFX8-NEXT: s_nop 1 4906; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 4907; GFX8-NEXT: v_readlane_b32 s4, v2, 63 4908; GFX8-NEXT: s_nop 0 4909; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 4910; GFX8-NEXT: s_mov_b64 exec, s[2:3] 4911; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 4912; GFX8-NEXT: ; implicit-def: $vgpr0 4913; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 4914; GFX8-NEXT: s_cbranch_execz .LBB19_2 4915; GFX8-NEXT: ; %bb.1: 4916; GFX8-NEXT: v_mov_b32_e32 v0, 0 4917; GFX8-NEXT: v_mov_b32_e32 v3, s4 4918; GFX8-NEXT: s_mov_b32 m0, -1 4919; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4920; GFX8-NEXT: ds_min_rtn_i32 v0, v0, v3 4921; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4922; GFX8-NEXT: .LBB19_2: 4923; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 4924; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4925; GFX8-NEXT: v_readfirstlane_b32 s2, v0 4926; GFX8-NEXT: v_mov_b32_e32 v0, v1 4927; GFX8-NEXT: v_min_i32_e32 v0, s2, v0 4928; GFX8-NEXT: s_mov_b32 s3, 0xf000 4929; GFX8-NEXT: s_mov_b32 s2, -1 4930; GFX8-NEXT: s_nop 0 4931; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 4932; GFX8-NEXT: s_endpgm 4933; 4934; GFX9-LABEL: min_i32_varying: 4935; GFX9: ; %bb.0: ; %entry 4936; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4937; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 4938; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 4939; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 4940; GFX9-NEXT: v_bfrev_b32_e32 v1, -2 4941; GFX9-NEXT: s_mov_b64 exec, s[2:3] 4942; GFX9-NEXT: v_mov_b32_e32 v2, v0 4943; GFX9-NEXT: s_not_b64 exec, exec 4944; GFX9-NEXT: v_bfrev_b32_e32 v2, -2 4945; GFX9-NEXT: s_not_b64 exec, exec 4946; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 4947; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 4948; GFX9-NEXT: s_nop 1 4949; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 4950; GFX9-NEXT: s_nop 1 4951; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:4 
row_mask:0xf bank_mask:0xf 4952; GFX9-NEXT: s_nop 1 4953; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 4954; GFX9-NEXT: s_nop 1 4955; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 4956; GFX9-NEXT: s_nop 1 4957; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 4958; GFX9-NEXT: v_readlane_b32 s4, v2, 63 4959; GFX9-NEXT: s_nop 0 4960; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 4961; GFX9-NEXT: s_mov_b64 exec, s[2:3] 4962; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 4963; GFX9-NEXT: ; implicit-def: $vgpr0 4964; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 4965; GFX9-NEXT: s_cbranch_execz .LBB19_2 4966; GFX9-NEXT: ; %bb.1: 4967; GFX9-NEXT: v_mov_b32_e32 v0, 0 4968; GFX9-NEXT: v_mov_b32_e32 v3, s4 4969; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4970; GFX9-NEXT: ds_min_rtn_i32 v0, v0, v3 4971; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4972; GFX9-NEXT: .LBB19_2: 4973; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 4974; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4975; GFX9-NEXT: v_readfirstlane_b32 s2, v0 4976; GFX9-NEXT: v_mov_b32_e32 v0, v1 4977; GFX9-NEXT: v_min_i32_e32 v0, s2, v0 4978; GFX9-NEXT: s_mov_b32 s3, 0xf000 4979; GFX9-NEXT: s_mov_b32 s2, -1 4980; GFX9-NEXT: s_nop 0 4981; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 4982; GFX9-NEXT: s_endpgm 4983; 4984; GFX1064-LABEL: min_i32_varying: 4985; GFX1064: ; %bb.0: ; %entry 4986; GFX1064-NEXT: v_mov_b32_e32 v1, v0 4987; GFX1064-NEXT: s_not_b64 exec, exec 4988; GFX1064-NEXT: v_bfrev_b32_e32 v1, -2 4989; GFX1064-NEXT: s_not_b64 exec, exec 4990; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 4991; GFX1064-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 4992; GFX1064-NEXT: v_bfrev_b32_e32 v3, -2 4993; GFX1064-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 4994; GFX1064-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 4995; GFX1064-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 
4996; GFX1064-NEXT: v_mov_b32_e32 v2, v1 4997; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 4998; GFX1064-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 4999; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 5000; GFX1064-NEXT: v_mov_b32_e32 v2, s4 5001; GFX1064-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 5002; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 5003; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 5004; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 5005; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 5006; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 5007; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 5008; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 5009; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 5010; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 5011; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 5012; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 5013; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 5014; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 5015; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 5016; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 5017; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 5018; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 5019; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 5020; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 5021; GFX1064-NEXT: s_mov_b32 s2, -1 5022; GFX1064-NEXT: ; implicit-def: $vgpr0 5023; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 5024; GFX1064-NEXT: s_cbranch_execz .LBB19_2 5025; GFX1064-NEXT: ; %bb.1: 5026; GFX1064-NEXT: v_mov_b32_e32 v0, 0 5027; GFX1064-NEXT: v_mov_b32_e32 v4, s7 5028; GFX1064-NEXT: s_mov_b32 s3, s7 5029; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5030; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 5031; GFX1064-NEXT: ds_min_rtn_i32 v0, v0, v4 5032; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 5033; GFX1064-NEXT: buffer_gl0_inv 5034; GFX1064-NEXT: .LBB19_2: 5035; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 5036; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 5037; GFX1064-NEXT: 
v_readfirstlane_b32 s3, v0 5038; GFX1064-NEXT: v_mov_b32_e32 v0, v3 5039; GFX1064-NEXT: v_min_i32_e32 v0, s3, v0 5040; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 5041; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 5042; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 5043; GFX1064-NEXT: s_endpgm 5044; 5045; GFX1032-LABEL: min_i32_varying: 5046; GFX1032: ; %bb.0: ; %entry 5047; GFX1032-NEXT: v_mov_b32_e32 v1, v0 5048; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 5049; GFX1032-NEXT: v_bfrev_b32_e32 v1, -2 5050; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 5051; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 5052; GFX1032-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 5053; GFX1032-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 5054; GFX1032-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 5055; GFX1032-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 5056; GFX1032-NEXT: v_mov_b32_e32 v2, v1 5057; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 5058; GFX1032-NEXT: s_mov_b32 exec_lo, s2 5059; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 5060; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 5061; GFX1032-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 5062; GFX1032-NEXT: v_bfrev_b32_e32 v3, -2 5063; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 5064; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 5065; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 5066; GFX1032-NEXT: s_mov_b32 exec_lo, s2 5067; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 5068; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 5069; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 5070; GFX1032-NEXT: s_mov_b32 exec_lo, s2 5071; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 5072; GFX1032-NEXT: s_mov_b32 s2, -1 5073; GFX1032-NEXT: ; implicit-def: $vgpr0 5074; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 5075; GFX1032-NEXT: s_cbranch_execz .LBB19_2 5076; GFX1032-NEXT: ; %bb.1: 5077; GFX1032-NEXT: v_mov_b32_e32 v0, 0 5078; 
GFX1032-NEXT: v_mov_b32_e32 v4, s4 5079; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5080; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 5081; GFX1032-NEXT: ds_min_rtn_i32 v0, v0, v4 5082; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 5083; GFX1032-NEXT: buffer_gl0_inv 5084; GFX1032-NEXT: .LBB19_2: 5085; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 5086; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 5087; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 5088; GFX1032-NEXT: v_mov_b32_e32 v0, v3 5089; GFX1032-NEXT: v_min_i32_e32 v0, s3, v0 5090; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 5091; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 5092; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 5093; GFX1032-NEXT: s_endpgm 5094; 5095; GFX1164-LABEL: min_i32_varying: 5096; GFX1164: ; %bb.0: ; %entry 5097; GFX1164-NEXT: v_mov_b32_e32 v1, v0 5098; GFX1164-NEXT: s_not_b64 exec, exec 5099; GFX1164-NEXT: v_bfrev_b32_e32 v1, -2 5100; GFX1164-NEXT: s_not_b64 exec, exec 5101; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 5102; GFX1164-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 5103; GFX1164-NEXT: v_bfrev_b32_e32 v3, -2 5104; GFX1164-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 5105; GFX1164-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 5106; GFX1164-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 5107; GFX1164-NEXT: v_mov_b32_e32 v2, v1 5108; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1 5109; GFX1164-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 5110; GFX1164-NEXT: v_readlane_b32 s4, v1, 31 5111; GFX1164-NEXT: v_mov_b32_e32 v2, s4 5112; GFX1164-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 5113; GFX1164-NEXT: v_readlane_b32 s4, v1, 15 5114; GFX1164-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 5115; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 5116; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 5117; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 5118; GFX1164-NEXT: 
v_readlane_b32 s5, v1, 31 5119; GFX1164-NEXT: v_writelane_b32 v3, s4, 16 5120; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 5121; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 5122; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 5123; GFX1164-NEXT: v_readlane_b32 s7, v1, 63 5124; GFX1164-NEXT: v_readlane_b32 s6, v1, 47 5125; GFX1164-NEXT: v_writelane_b32 v3, s5, 32 5126; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 5127; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 5128; GFX1164-NEXT: s_or_saveexec_b64 s[4:5], -1 5129; GFX1164-NEXT: v_writelane_b32 v3, s6, 48 5130; GFX1164-NEXT: s_mov_b64 exec, s[4:5] 5131; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 5132; GFX1164-NEXT: s_mov_b32 s2, -1 5133; GFX1164-NEXT: ; implicit-def: $vgpr0 5134; GFX1164-NEXT: s_and_saveexec_b64 s[4:5], vcc 5135; GFX1164-NEXT: s_cbranch_execz .LBB19_2 5136; GFX1164-NEXT: ; %bb.1: 5137; GFX1164-NEXT: v_mov_b32_e32 v0, 0 5138; GFX1164-NEXT: v_mov_b32_e32 v4, s7 5139; GFX1164-NEXT: s_mov_b32 s3, s7 5140; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5141; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 5142; GFX1164-NEXT: ds_min_rtn_i32 v0, v0, v4 5143; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 5144; GFX1164-NEXT: buffer_gl0_inv 5145; GFX1164-NEXT: .LBB19_2: 5146; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] 5147; GFX1164-NEXT: v_readfirstlane_b32 s3, v0 5148; GFX1164-NEXT: v_mov_b32_e32 v0, v3 5149; GFX1164-NEXT: v_min_i32_e32 v0, s3, v0 5150; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 5151; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 5152; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 5153; GFX1164-NEXT: s_endpgm 5154; 5155; GFX1132-LABEL: min_i32_varying: 5156; GFX1132: ; %bb.0: ; %entry 5157; GFX1132-NEXT: v_mov_b32_e32 v1, v0 5158; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo 5159; GFX1132-NEXT: v_bfrev_b32_e32 v1, -2 5160; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo 5161; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 5162; GFX1132-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 5163; GFX1132-NEXT: v_min_i32_dpp v1, v1, v1 
row_shr:2 row_mask:0xf bank_mask:0xf 5164; GFX1132-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 5165; GFX1132-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 5166; GFX1132-NEXT: v_mov_b32_e32 v2, v1 5167; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1 5168; GFX1132-NEXT: s_mov_b32 exec_lo, s2 5169; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 5170; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 5171; GFX1132-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 5172; GFX1132-NEXT: v_bfrev_b32_e32 v3, -2 5173; GFX1132-NEXT: v_readlane_b32 s3, v1, 15 5174; GFX1132-NEXT: v_readlane_b32 s4, v1, 31 5175; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 5176; GFX1132-NEXT: s_mov_b32 exec_lo, s2 5177; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 5178; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 5179; GFX1132-NEXT: v_writelane_b32 v3, s3, 16 5180; GFX1132-NEXT: s_mov_b32 exec_lo, s2 5181; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 5182; GFX1132-NEXT: s_mov_b32 s2, -1 5183; GFX1132-NEXT: ; implicit-def: $vgpr0 5184; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo 5185; GFX1132-NEXT: s_cbranch_execz .LBB19_2 5186; GFX1132-NEXT: ; %bb.1: 5187; GFX1132-NEXT: v_mov_b32_e32 v0, 0 5188; GFX1132-NEXT: v_mov_b32_e32 v4, s4 5189; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5190; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 5191; GFX1132-NEXT: ds_min_rtn_i32 v0, v0, v4 5192; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 5193; GFX1132-NEXT: buffer_gl0_inv 5194; GFX1132-NEXT: .LBB19_2: 5195; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 5196; GFX1132-NEXT: v_readfirstlane_b32 s3, v0 5197; GFX1132-NEXT: v_mov_b32_e32 v0, v3 5198; GFX1132-NEXT: v_min_i32_e32 v0, s3, v0 5199; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 5200; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 5201; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 5202; GFX1132-NEXT: s_endpgm 5203entry: 5204 %lane = call i32 @llvm.amdgcn.workitem.id.x() 5205 %old = atomicrmw min i32 
addrspace(3)* @local_var32, i32 %lane acq_rel 5206 store i32 %old, i32 addrspace(1)* %out 5207 ret void 5208} 5209 5210define amdgpu_kernel void @min_i64_constant(i64 addrspace(1)* %out) { 5211; 5212; 5213; GFX7LESS-LABEL: min_i64_constant: 5214; GFX7LESS: ; %bb.0: ; %entry 5215; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 5216; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 5217; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 5218; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 5219; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 5220; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc 5221; GFX7LESS-NEXT: s_cbranch_execz .LBB20_2 5222; GFX7LESS-NEXT: ; %bb.1: 5223; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0 5224; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5 5225; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 5226; GFX7LESS-NEXT: s_mov_b32 m0, -1 5227; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 5228; GFX7LESS-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] 5229; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 5230; GFX7LESS-NEXT: .LBB20_2: 5231; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] 5232; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 5233; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 5234; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 5235; GFX7LESS-NEXT: v_bfrev_b32_e32 v1, -2 5236; GFX7LESS-NEXT: s_mov_b32 s2, -1 5237; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 5238; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc 5239; GFX7LESS-NEXT: v_mov_b32_e32 v2, s5 5240; GFX7LESS-NEXT: v_mov_b32_e32 v3, s4 5241; GFX7LESS-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] 5242; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 5243; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 5244; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 5245; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 5246; GFX7LESS-NEXT: s_endpgm 5247; 5248; GFX8-LABEL: min_i64_constant: 5249; GFX8: ; %bb.0: ; %entry 5250; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 5251; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 5252; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, 
v0 5253; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 5254; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 5255; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 5256; GFX8-NEXT: s_cbranch_execz .LBB20_2 5257; GFX8-NEXT: ; %bb.1: 5258; GFX8-NEXT: v_mov_b32_e32 v0, 5 5259; GFX8-NEXT: v_mov_b32_e32 v2, 0 5260; GFX8-NEXT: v_mov_b32_e32 v1, 0 5261; GFX8-NEXT: s_mov_b32 m0, -1 5262; GFX8-NEXT: s_waitcnt lgkmcnt(0) 5263; GFX8-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] 5264; GFX8-NEXT: s_waitcnt lgkmcnt(0) 5265; GFX8-NEXT: .LBB20_2: 5266; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 5267; GFX8-NEXT: s_waitcnt lgkmcnt(0) 5268; GFX8-NEXT: v_readfirstlane_b32 s4, v0 5269; GFX8-NEXT: v_bfrev_b32_e32 v0, -2 5270; GFX8-NEXT: v_readfirstlane_b32 s5, v1 5271; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc 5272; GFX8-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 5273; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] 5274; GFX8-NEXT: v_mov_b32_e32 v2, s5 5275; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 5276; GFX8-NEXT: v_mov_b32_e32 v2, s4 5277; GFX8-NEXT: s_mov_b32 s2, -1 5278; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 5279; GFX8-NEXT: s_mov_b32 s3, 0xf000 5280; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 5281; GFX8-NEXT: s_endpgm 5282; 5283; GFX9-LABEL: min_i64_constant: 5284; GFX9: ; %bb.0: ; %entry 5285; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 5286; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 5287; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 5288; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 5289; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 5290; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 5291; GFX9-NEXT: s_cbranch_execz .LBB20_2 5292; GFX9-NEXT: ; %bb.1: 5293; GFX9-NEXT: v_mov_b32_e32 v0, 5 5294; GFX9-NEXT: v_mov_b32_e32 v1, 0 5295; GFX9-NEXT: v_mov_b32_e32 v2, 0 5296; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5297; GFX9-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] 5298; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5299; GFX9-NEXT: .LBB20_2: 5300; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 5301; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5302; 
GFX9-NEXT: v_readfirstlane_b32 s4, v0 5303; GFX9-NEXT: v_bfrev_b32_e32 v0, -2 5304; GFX9-NEXT: v_readfirstlane_b32 s5, v1 5305; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc 5306; GFX9-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 5307; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] 5308; GFX9-NEXT: v_mov_b32_e32 v2, s5 5309; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 5310; GFX9-NEXT: v_mov_b32_e32 v2, s4 5311; GFX9-NEXT: s_mov_b32 s2, -1 5312; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 5313; GFX9-NEXT: s_mov_b32 s3, 0xf000 5314; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 5315; GFX9-NEXT: s_endpgm 5316; 5317; GFX1064-LABEL: min_i64_constant: 5318; GFX1064: ; %bb.0: ; %entry 5319; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 5320; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 5321; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 5322; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 5323; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 5324; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc 5325; GFX1064-NEXT: s_cbranch_execz .LBB20_2 5326; GFX1064-NEXT: ; %bb.1: 5327; GFX1064-NEXT: v_mov_b32_e32 v0, 5 5328; GFX1064-NEXT: v_mov_b32_e32 v1, 0 5329; GFX1064-NEXT: v_mov_b32_e32 v2, 0 5330; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5331; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 5332; GFX1064-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] 5333; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 5334; GFX1064-NEXT: buffer_gl0_inv 5335; GFX1064-NEXT: .LBB20_2: 5336; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 5337; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] 5338; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 5339; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 5340; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc 5341; GFX1064-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 5342; GFX1064-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1] 5343; GFX1064-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc 5344; GFX1064-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc 5345; GFX1064-NEXT: s_mov_b32 s2, -1 5346; GFX1064-NEXT: s_mov_b32 s3, 
0x31016000 5347; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 5348; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 5349; GFX1064-NEXT: s_endpgm 5350; 5351; GFX1032-LABEL: min_i64_constant: 5352; GFX1032: ; %bb.0: ; %entry 5353; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 5354; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 5355; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 5356; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 5357; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo 5358; GFX1032-NEXT: s_cbranch_execz .LBB20_2 5359; GFX1032-NEXT: ; %bb.1: 5360; GFX1032-NEXT: v_mov_b32_e32 v0, 5 5361; GFX1032-NEXT: v_mov_b32_e32 v1, 0 5362; GFX1032-NEXT: v_mov_b32_e32 v2, 0 5363; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5364; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 5365; GFX1032-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] 5366; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 5367; GFX1032-NEXT: buffer_gl0_inv 5368; GFX1032-NEXT: .LBB20_2: 5369; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 5370; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 5371; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 5372; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 5373; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc_lo 5374; GFX1032-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc_lo 5375; GFX1032-NEXT: v_cmp_lt_i64_e32 vcc_lo, s[2:3], v[0:1] 5376; GFX1032-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo 5377; GFX1032-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo 5378; GFX1032-NEXT: s_mov_b32 s2, -1 5379; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 5380; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 5381; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 5382; GFX1032-NEXT: s_endpgm 5383; 5384; GFX1164-LABEL: min_i64_constant: 5385; GFX1164: ; %bb.0: ; %entry 5386; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 5387; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 5388; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 5389; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 5390; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 5391; GFX1164-NEXT: 
s_and_saveexec_b64 s[2:3], vcc 5392; GFX1164-NEXT: s_cbranch_execz .LBB20_2 5393; GFX1164-NEXT: ; %bb.1: 5394; GFX1164-NEXT: v_mov_b32_e32 v0, 5 5395; GFX1164-NEXT: v_mov_b32_e32 v1, 0 5396; GFX1164-NEXT: v_mov_b32_e32 v2, 0 5397; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5398; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 5399; GFX1164-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] 5400; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 5401; GFX1164-NEXT: buffer_gl0_inv 5402; GFX1164-NEXT: .LBB20_2: 5403; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] 5404; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 5405; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 5406; GFX1164-NEXT: v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc 5407; GFX1164-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 5408; GFX1164-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1] 5409; GFX1164-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc 5410; GFX1164-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc 5411; GFX1164-NEXT: s_mov_b32 s2, -1 5412; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 5413; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 5414; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 5415; GFX1164-NEXT: s_endpgm 5416; 5417; GFX1132-LABEL: min_i64_constant: 5418; GFX1132: ; %bb.0: ; %entry 5419; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 5420; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 5421; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 5422; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 5423; GFX1132-NEXT: s_and_saveexec_b32 s2, vcc_lo 5424; GFX1132-NEXT: s_cbranch_execz .LBB20_2 5425; GFX1132-NEXT: ; %bb.1: 5426; GFX1132-NEXT: v_mov_b32_e32 v0, 5 5427; GFX1132-NEXT: v_mov_b32_e32 v1, 0 5428; GFX1132-NEXT: v_mov_b32_e32 v2, 0 5429; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5430; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 5431; GFX1132-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] 5432; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 5433; GFX1132-NEXT: buffer_gl0_inv 5434; GFX1132-NEXT: .LBB20_2: 5435; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2 5436; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 
5437; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 5438; GFX1132-NEXT: v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc_lo 5439; GFX1132-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc_lo 5440; GFX1132-NEXT: v_cmp_lt_i64_e32 vcc_lo, s[2:3], v[0:1] 5441; GFX1132-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo 5442; GFX1132-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo 5443; GFX1132-NEXT: s_mov_b32 s2, -1 5444; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 5445; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 5446; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 5447; GFX1132-NEXT: s_endpgm 5448entry: 5449 %old = atomicrmw min i64 addrspace(3)* @local_var64, i64 5 acq_rel 5450 store i64 %old, i64 addrspace(1)* %out 5451 ret void 5452} 5453 5454define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) { 5455; 5456; 5457; GFX7LESS-LABEL: umax_i32_varying: 5458; GFX7LESS: ; %bb.0: ; %entry 5459; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 5460; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 5461; GFX7LESS-NEXT: s_mov_b32 m0, -1 5462; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 5463; GFX7LESS-NEXT: ds_max_rtn_u32 v0, v1, v0 5464; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 5465; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 5466; GFX7LESS-NEXT: s_mov_b32 s2, -1 5467; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 5468; GFX7LESS-NEXT: s_endpgm 5469; 5470; GFX8-LABEL: umax_i32_varying: 5471; GFX8: ; %bb.0: ; %entry 5472; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 5473; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 5474; GFX8-NEXT: v_mov_b32_e32 v1, 0 5475; GFX8-NEXT: s_mov_b64 exec, s[2:3] 5476; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 5477; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 5478; GFX8-NEXT: v_mov_b32_e32 v2, v0 5479; GFX8-NEXT: s_not_b64 exec, exec 5480; GFX8-NEXT: v_mov_b32_e32 v2, 0 5481; GFX8-NEXT: s_not_b64 exec, exec 5482; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 5483; GFX8-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 5484; GFX8-NEXT: s_nop 1 5485; GFX8-NEXT: v_max_u32_dpp v2, 
v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 5486; GFX8-NEXT: s_nop 1 5487; GFX8-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 5488; GFX8-NEXT: s_nop 1 5489; GFX8-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 5490; GFX8-NEXT: s_nop 1 5491; GFX8-NEXT: v_max_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 5492; GFX8-NEXT: s_nop 1 5493; GFX8-NEXT: v_max_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 5494; GFX8-NEXT: v_readlane_b32 s4, v2, 63 5495; GFX8-NEXT: s_nop 0 5496; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 5497; GFX8-NEXT: s_mov_b64 exec, s[2:3] 5498; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 5499; GFX8-NEXT: ; implicit-def: $vgpr0 5500; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 5501; GFX8-NEXT: s_cbranch_execz .LBB21_2 5502; GFX8-NEXT: ; %bb.1: 5503; GFX8-NEXT: v_mov_b32_e32 v0, 0 5504; GFX8-NEXT: v_mov_b32_e32 v3, s4 5505; GFX8-NEXT: s_mov_b32 m0, -1 5506; GFX8-NEXT: s_waitcnt lgkmcnt(0) 5507; GFX8-NEXT: ds_max_rtn_u32 v0, v0, v3 5508; GFX8-NEXT: s_waitcnt lgkmcnt(0) 5509; GFX8-NEXT: .LBB21_2: 5510; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 5511; GFX8-NEXT: s_waitcnt lgkmcnt(0) 5512; GFX8-NEXT: v_readfirstlane_b32 s2, v0 5513; GFX8-NEXT: v_mov_b32_e32 v0, v1 5514; GFX8-NEXT: v_max_u32_e32 v0, s2, v0 5515; GFX8-NEXT: s_mov_b32 s3, 0xf000 5516; GFX8-NEXT: s_mov_b32 s2, -1 5517; GFX8-NEXT: s_nop 0 5518; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 5519; GFX8-NEXT: s_endpgm 5520; 5521; GFX9-LABEL: umax_i32_varying: 5522; GFX9: ; %bb.0: ; %entry 5523; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 5524; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 5525; GFX9-NEXT: v_mov_b32_e32 v1, 0 5526; GFX9-NEXT: s_mov_b64 exec, s[2:3] 5527; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 5528; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 5529; GFX9-NEXT: v_mov_b32_e32 v2, v0 5530; GFX9-NEXT: s_not_b64 exec, exec 5531; GFX9-NEXT: v_mov_b32_e32 v2, 0 5532; 
GFX9-NEXT: s_not_b64 exec, exec 5533; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 5534; GFX9-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 5535; GFX9-NEXT: s_nop 1 5536; GFX9-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 5537; GFX9-NEXT: s_nop 1 5538; GFX9-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 5539; GFX9-NEXT: s_nop 1 5540; GFX9-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 5541; GFX9-NEXT: s_nop 1 5542; GFX9-NEXT: v_max_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 5543; GFX9-NEXT: s_nop 1 5544; GFX9-NEXT: v_max_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 5545; GFX9-NEXT: v_readlane_b32 s4, v2, 63 5546; GFX9-NEXT: s_nop 0 5547; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 5548; GFX9-NEXT: s_mov_b64 exec, s[2:3] 5549; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 5550; GFX9-NEXT: ; implicit-def: $vgpr0 5551; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 5552; GFX9-NEXT: s_cbranch_execz .LBB21_2 5553; GFX9-NEXT: ; %bb.1: 5554; GFX9-NEXT: v_mov_b32_e32 v0, 0 5555; GFX9-NEXT: v_mov_b32_e32 v3, s4 5556; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5557; GFX9-NEXT: ds_max_rtn_u32 v0, v0, v3 5558; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5559; GFX9-NEXT: .LBB21_2: 5560; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 5561; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5562; GFX9-NEXT: v_readfirstlane_b32 s2, v0 5563; GFX9-NEXT: v_mov_b32_e32 v0, v1 5564; GFX9-NEXT: v_max_u32_e32 v0, s2, v0 5565; GFX9-NEXT: s_mov_b32 s3, 0xf000 5566; GFX9-NEXT: s_mov_b32 s2, -1 5567; GFX9-NEXT: s_nop 0 5568; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 5569; GFX9-NEXT: s_endpgm 5570; 5571; GFX1064-LABEL: umax_i32_varying: 5572; GFX1064: ; %bb.0: ; %entry 5573; GFX1064-NEXT: v_mov_b32_e32 v1, v0 5574; GFX1064-NEXT: s_not_b64 exec, exec 5575; GFX1064-NEXT: v_mov_b32_e32 v1, 0 5576; GFX1064-NEXT: s_not_b64 exec, exec 5577; GFX1064-NEXT: 
s_or_saveexec_b64 s[2:3], -1 5578; GFX1064-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 5579; GFX1064-NEXT: v_mov_b32_e32 v3, 0 5580; GFX1064-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 5581; GFX1064-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 5582; GFX1064-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 5583; GFX1064-NEXT: v_mov_b32_e32 v2, v1 5584; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 5585; GFX1064-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 5586; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 5587; GFX1064-NEXT: v_mov_b32_e32 v2, s4 5588; GFX1064-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 5589; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 5590; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 5591; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 5592; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 5593; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 5594; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 5595; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 5596; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 5597; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 5598; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 5599; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 5600; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 5601; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 5602; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 5603; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 5604; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 5605; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 5606; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 5607; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 5608; GFX1064-NEXT: s_mov_b32 s2, -1 5609; GFX1064-NEXT: ; implicit-def: $vgpr0 5610; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 5611; GFX1064-NEXT: s_cbranch_execz .LBB21_2 5612; GFX1064-NEXT: ; %bb.1: 5613; GFX1064-NEXT: v_mov_b32_e32 v0, 0 5614; 
GFX1064-NEXT: v_mov_b32_e32 v4, s7 5615; GFX1064-NEXT: s_mov_b32 s3, s7 5616; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5617; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 5618; GFX1064-NEXT: ds_max_rtn_u32 v0, v0, v4 5619; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 5620; GFX1064-NEXT: buffer_gl0_inv 5621; GFX1064-NEXT: .LBB21_2: 5622; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 5623; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 5624; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 5625; GFX1064-NEXT: v_mov_b32_e32 v0, v3 5626; GFX1064-NEXT: v_max_u32_e32 v0, s3, v0 5627; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 5628; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 5629; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 5630; GFX1064-NEXT: s_endpgm 5631; 5632; GFX1032-LABEL: umax_i32_varying: 5633; GFX1032: ; %bb.0: ; %entry 5634; GFX1032-NEXT: v_mov_b32_e32 v1, v0 5635; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 5636; GFX1032-NEXT: v_mov_b32_e32 v1, 0 5637; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 5638; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 5639; GFX1032-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 5640; GFX1032-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 5641; GFX1032-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 5642; GFX1032-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 5643; GFX1032-NEXT: v_mov_b32_e32 v2, v1 5644; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 5645; GFX1032-NEXT: s_mov_b32 exec_lo, s2 5646; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 5647; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 5648; GFX1032-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 5649; GFX1032-NEXT: v_mov_b32_e32 v3, 0 5650; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 5651; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 5652; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 5653; GFX1032-NEXT: s_mov_b32 exec_lo, s2 5654; GFX1032-NEXT: 
v_mbcnt_lo_u32_b32 v0, exec_lo, 0 5655; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 5656; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 5657; GFX1032-NEXT: s_mov_b32 exec_lo, s2 5658; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 5659; GFX1032-NEXT: s_mov_b32 s2, -1 5660; GFX1032-NEXT: ; implicit-def: $vgpr0 5661; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 5662; GFX1032-NEXT: s_cbranch_execz .LBB21_2 5663; GFX1032-NEXT: ; %bb.1: 5664; GFX1032-NEXT: v_mov_b32_e32 v0, 0 5665; GFX1032-NEXT: v_mov_b32_e32 v4, s4 5666; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5667; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 5668; GFX1032-NEXT: ds_max_rtn_u32 v0, v0, v4 5669; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 5670; GFX1032-NEXT: buffer_gl0_inv 5671; GFX1032-NEXT: .LBB21_2: 5672; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 5673; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 5674; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 5675; GFX1032-NEXT: v_mov_b32_e32 v0, v3 5676; GFX1032-NEXT: v_max_u32_e32 v0, s3, v0 5677; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 5678; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 5679; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 5680; GFX1032-NEXT: s_endpgm 5681; 5682; GFX1164-LABEL: umax_i32_varying: 5683; GFX1164: ; %bb.0: ; %entry 5684; GFX1164-NEXT: v_mov_b32_e32 v1, v0 5685; GFX1164-NEXT: s_not_b64 exec, exec 5686; GFX1164-NEXT: v_mov_b32_e32 v1, 0 5687; GFX1164-NEXT: s_not_b64 exec, exec 5688; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 5689; GFX1164-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 5690; GFX1164-NEXT: v_mov_b32_e32 v3, 0 5691; GFX1164-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 5692; GFX1164-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 5693; GFX1164-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 5694; GFX1164-NEXT: v_mov_b32_e32 v2, v1 5695; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1 5696; GFX1164-NEXT: v_max_u32_dpp v1, v2, 
v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 5697; GFX1164-NEXT: v_readlane_b32 s4, v1, 31 5698; GFX1164-NEXT: v_mov_b32_e32 v2, s4 5699; GFX1164-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 5700; GFX1164-NEXT: v_readlane_b32 s4, v1, 15 5701; GFX1164-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 5702; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 5703; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 5704; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 5705; GFX1164-NEXT: v_readlane_b32 s5, v1, 31 5706; GFX1164-NEXT: v_writelane_b32 v3, s4, 16 5707; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 5708; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 5709; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 5710; GFX1164-NEXT: v_readlane_b32 s7, v1, 63 5711; GFX1164-NEXT: v_readlane_b32 s6, v1, 47 5712; GFX1164-NEXT: v_writelane_b32 v3, s5, 32 5713; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 5714; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 5715; GFX1164-NEXT: s_or_saveexec_b64 s[4:5], -1 5716; GFX1164-NEXT: v_writelane_b32 v3, s6, 48 5717; GFX1164-NEXT: s_mov_b64 exec, s[4:5] 5718; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 5719; GFX1164-NEXT: s_mov_b32 s2, -1 5720; GFX1164-NEXT: ; implicit-def: $vgpr0 5721; GFX1164-NEXT: s_and_saveexec_b64 s[4:5], vcc 5722; GFX1164-NEXT: s_cbranch_execz .LBB21_2 5723; GFX1164-NEXT: ; %bb.1: 5724; GFX1164-NEXT: v_mov_b32_e32 v0, 0 5725; GFX1164-NEXT: v_mov_b32_e32 v4, s7 5726; GFX1164-NEXT: s_mov_b32 s3, s7 5727; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5728; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 5729; GFX1164-NEXT: ds_max_rtn_u32 v0, v0, v4 5730; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 5731; GFX1164-NEXT: buffer_gl0_inv 5732; GFX1164-NEXT: .LBB21_2: 5733; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] 5734; GFX1164-NEXT: v_readfirstlane_b32 s3, v0 5735; GFX1164-NEXT: v_mov_b32_e32 v0, v3 5736; GFX1164-NEXT: v_max_u32_e32 v0, s3, v0 5737; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 5738; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 
5739; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 5740; GFX1164-NEXT: s_endpgm 5741; 5742; GFX1132-LABEL: umax_i32_varying: 5743; GFX1132: ; %bb.0: ; %entry 5744; GFX1132-NEXT: v_mov_b32_e32 v1, v0 5745; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo 5746; GFX1132-NEXT: v_mov_b32_e32 v1, 0 5747; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo 5748; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 5749; GFX1132-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 5750; GFX1132-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 5751; GFX1132-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 5752; GFX1132-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 5753; GFX1132-NEXT: v_mov_b32_e32 v2, v1 5754; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1 5755; GFX1132-NEXT: s_mov_b32 exec_lo, s2 5756; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 5757; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 5758; GFX1132-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 5759; GFX1132-NEXT: v_mov_b32_e32 v3, 0 5760; GFX1132-NEXT: v_readlane_b32 s3, v1, 15 5761; GFX1132-NEXT: v_readlane_b32 s4, v1, 31 5762; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 5763; GFX1132-NEXT: s_mov_b32 exec_lo, s2 5764; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 5765; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 5766; GFX1132-NEXT: v_writelane_b32 v3, s3, 16 5767; GFX1132-NEXT: s_mov_b32 exec_lo, s2 5768; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 5769; GFX1132-NEXT: s_mov_b32 s2, -1 5770; GFX1132-NEXT: ; implicit-def: $vgpr0 5771; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo 5772; GFX1132-NEXT: s_cbranch_execz .LBB21_2 5773; GFX1132-NEXT: ; %bb.1: 5774; GFX1132-NEXT: v_mov_b32_e32 v0, 0 5775; GFX1132-NEXT: v_mov_b32_e32 v4, s4 5776; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5777; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 5778; GFX1132-NEXT: ds_max_rtn_u32 v0, 
v0, v4 5779; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 5780; GFX1132-NEXT: buffer_gl0_inv 5781; GFX1132-NEXT: .LBB21_2: 5782; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 5783; GFX1132-NEXT: v_readfirstlane_b32 s3, v0 5784; GFX1132-NEXT: v_mov_b32_e32 v0, v3 5785; GFX1132-NEXT: v_max_u32_e32 v0, s3, v0 5786; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 5787; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 5788; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 5789; GFX1132-NEXT: s_endpgm 5790entry: 5791 %lane = call i32 @llvm.amdgcn.workitem.id.x() 5792 %old = atomicrmw umax i32 addrspace(3)* @local_var32, i32 %lane acq_rel 5793 store i32 %old, i32 addrspace(1)* %out 5794 ret void 5795} 5796 5797define amdgpu_kernel void @umax_i64_constant(i64 addrspace(1)* %out) { 5798; 5799; 5800; GFX7LESS-LABEL: umax_i64_constant: 5801; GFX7LESS: ; %bb.0: ; %entry 5802; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 5803; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 5804; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 5805; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 5806; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 5807; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc 5808; GFX7LESS-NEXT: s_cbranch_execz .LBB22_2 5809; GFX7LESS-NEXT: ; %bb.1: 5810; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0 5811; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5 5812; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 5813; GFX7LESS-NEXT: s_mov_b32 m0, -1 5814; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 5815; GFX7LESS-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] 5816; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 5817; GFX7LESS-NEXT: .LBB22_2: 5818; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] 5819; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 5820; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 5821; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 5822; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 5823; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 5824; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 5825; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4 5826; GFX7LESS-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[0:1] 5827; 
GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 5828; GFX7LESS-NEXT: v_mov_b32_e32 v1, s5 5829; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc 5830; GFX7LESS-NEXT: s_mov_b32 s2, -1 5831; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 5832; GFX7LESS-NEXT: s_endpgm 5833; 5834; GFX8-LABEL: umax_i64_constant: 5835; GFX8: ; %bb.0: ; %entry 5836; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 5837; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 5838; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 5839; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 5840; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 5841; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 5842; GFX8-NEXT: s_cbranch_execz .LBB22_2 5843; GFX8-NEXT: ; %bb.1: 5844; GFX8-NEXT: v_mov_b32_e32 v0, 5 5845; GFX8-NEXT: v_mov_b32_e32 v2, 0 5846; GFX8-NEXT: v_mov_b32_e32 v1, 0 5847; GFX8-NEXT: s_mov_b32 m0, -1 5848; GFX8-NEXT: s_waitcnt lgkmcnt(0) 5849; GFX8-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] 5850; GFX8-NEXT: s_waitcnt lgkmcnt(0) 5851; GFX8-NEXT: .LBB22_2: 5852; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 5853; GFX8-NEXT: s_waitcnt lgkmcnt(0) 5854; GFX8-NEXT: v_readfirstlane_b32 s2, v0 5855; GFX8-NEXT: v_readfirstlane_b32 s3, v1 5856; GFX8-NEXT: v_mov_b32_e32 v1, 0 5857; GFX8-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 5858; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1] 5859; GFX8-NEXT: v_mov_b32_e32 v2, s2 5860; GFX8-NEXT: v_mov_b32_e32 v1, s3 5861; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 5862; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc 5863; GFX8-NEXT: s_mov_b32 s3, 0xf000 5864; GFX8-NEXT: s_mov_b32 s2, -1 5865; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 5866; GFX8-NEXT: s_endpgm 5867; 5868; GFX9-LABEL: umax_i64_constant: 5869; GFX9: ; %bb.0: ; %entry 5870; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 5871; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 5872; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 5873; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 5874; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 5875; GFX9-NEXT: 
s_and_saveexec_b64 s[2:3], vcc 5876; GFX9-NEXT: s_cbranch_execz .LBB22_2 5877; GFX9-NEXT: ; %bb.1: 5878; GFX9-NEXT: v_mov_b32_e32 v0, 5 5879; GFX9-NEXT: v_mov_b32_e32 v1, 0 5880; GFX9-NEXT: v_mov_b32_e32 v2, 0 5881; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5882; GFX9-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] 5883; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5884; GFX9-NEXT: .LBB22_2: 5885; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 5886; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5887; GFX9-NEXT: v_readfirstlane_b32 s2, v0 5888; GFX9-NEXT: v_readfirstlane_b32 s3, v1 5889; GFX9-NEXT: v_mov_b32_e32 v1, 0 5890; GFX9-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 5891; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1] 5892; GFX9-NEXT: v_mov_b32_e32 v2, s2 5893; GFX9-NEXT: v_mov_b32_e32 v1, s3 5894; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 5895; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc 5896; GFX9-NEXT: s_mov_b32 s3, 0xf000 5897; GFX9-NEXT: s_mov_b32 s2, -1 5898; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 5899; GFX9-NEXT: s_endpgm 5900; 5901; GFX1064-LABEL: umax_i64_constant: 5902; GFX1064: ; %bb.0: ; %entry 5903; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 5904; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 5905; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 5906; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 5907; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 5908; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc 5909; GFX1064-NEXT: s_cbranch_execz .LBB22_2 5910; GFX1064-NEXT: ; %bb.1: 5911; GFX1064-NEXT: v_mov_b32_e32 v0, 5 5912; GFX1064-NEXT: v_mov_b32_e32 v1, 0 5913; GFX1064-NEXT: v_mov_b32_e32 v2, 0 5914; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5915; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 5916; GFX1064-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] 5917; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 5918; GFX1064-NEXT: buffer_gl0_inv 5919; GFX1064-NEXT: .LBB22_2: 5920; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 5921; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] 5922; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 5923; 
GFX1064-NEXT: v_readfirstlane_b32 s3, v1 5924; GFX1064-NEXT: v_mov_b32_e32 v1, 0 5925; GFX1064-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 5926; GFX1064-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1] 5927; GFX1064-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc 5928; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, s3, vcc 5929; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 5930; GFX1064-NEXT: s_mov_b32 s2, -1 5931; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 5932; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 5933; GFX1064-NEXT: s_endpgm 5934; 5935; GFX1032-LABEL: umax_i64_constant: 5936; GFX1032: ; %bb.0: ; %entry 5937; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 5938; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 5939; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 5940; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 5941; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo 5942; GFX1032-NEXT: s_cbranch_execz .LBB22_2 5943; GFX1032-NEXT: ; %bb.1: 5944; GFX1032-NEXT: v_mov_b32_e32 v0, 5 5945; GFX1032-NEXT: v_mov_b32_e32 v1, 0 5946; GFX1032-NEXT: v_mov_b32_e32 v2, 0 5947; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5948; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 5949; GFX1032-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] 5950; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 5951; GFX1032-NEXT: buffer_gl0_inv 5952; GFX1032-NEXT: .LBB22_2: 5953; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 5954; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 5955; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 5956; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 5957; GFX1032-NEXT: v_mov_b32_e32 v1, 0 5958; GFX1032-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc_lo 5959; GFX1032-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[2:3], v[0:1] 5960; GFX1032-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo 5961; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, s3, vcc_lo 5962; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 5963; GFX1032-NEXT: s_mov_b32 s2, -1 5964; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 5965; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 5966; GFX1032-NEXT: s_endpgm 5967; 5968; 
GFX1164-LABEL: umax_i64_constant: 5969; GFX1164: ; %bb.0: ; %entry 5970; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 5971; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 5972; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 5973; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 5974; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 5975; GFX1164-NEXT: s_and_saveexec_b64 s[2:3], vcc 5976; GFX1164-NEXT: s_cbranch_execz .LBB22_2 5977; GFX1164-NEXT: ; %bb.1: 5978; GFX1164-NEXT: v_mov_b32_e32 v0, 5 5979; GFX1164-NEXT: v_mov_b32_e32 v1, 0 5980; GFX1164-NEXT: v_mov_b32_e32 v2, 0 5981; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5982; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 5983; GFX1164-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] 5984; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 5985; GFX1164-NEXT: buffer_gl0_inv 5986; GFX1164-NEXT: .LBB22_2: 5987; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] 5988; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 5989; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 5990; GFX1164-NEXT: v_mov_b32_e32 v1, 0 5991; GFX1164-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 5992; GFX1164-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1] 5993; GFX1164-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc 5994; GFX1164-NEXT: v_cndmask_b32_e64 v1, 0, s3, vcc 5995; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 5996; GFX1164-NEXT: s_mov_b32 s2, -1 5997; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 5998; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 5999; GFX1164-NEXT: s_endpgm 6000; 6001; GFX1132-LABEL: umax_i64_constant: 6002; GFX1132: ; %bb.0: ; %entry 6003; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 6004; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 6005; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 6006; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 6007; GFX1132-NEXT: s_and_saveexec_b32 s2, vcc_lo 6008; GFX1132-NEXT: s_cbranch_execz .LBB22_2 6009; GFX1132-NEXT: ; %bb.1: 6010; GFX1132-NEXT: v_mov_b32_e32 v0, 5 6011; GFX1132-NEXT: v_mov_b32_e32 v1, 0 6012; GFX1132-NEXT: v_mov_b32_e32 v2, 0 6013; GFX1132-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) 6014; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 6015; GFX1132-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] 6016; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 6017; GFX1132-NEXT: buffer_gl0_inv 6018; GFX1132-NEXT: .LBB22_2: 6019; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2 6020; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 6021; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 6022; GFX1132-NEXT: v_mov_b32_e32 v1, 0 6023; GFX1132-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc_lo 6024; GFX1132-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[2:3], v[0:1] 6025; GFX1132-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo 6026; GFX1132-NEXT: v_cndmask_b32_e64 v1, 0, s3, vcc_lo 6027; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 6028; GFX1132-NEXT: s_mov_b32 s2, -1 6029; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 6030; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 6031; GFX1132-NEXT: s_endpgm 6032entry: 6033 %old = atomicrmw umax i64 addrspace(3)* @local_var64, i64 5 acq_rel 6034 store i64 %old, i64 addrspace(1)* %out 6035 ret void 6036} 6037 6038define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) { 6039; 6040; 6041; GFX7LESS-LABEL: umin_i32_varying: 6042; GFX7LESS: ; %bb.0: ; %entry 6043; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 6044; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 6045; GFX7LESS-NEXT: s_mov_b32 m0, -1 6046; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 6047; GFX7LESS-NEXT: ds_min_rtn_u32 v0, v1, v0 6048; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 6049; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 6050; GFX7LESS-NEXT: s_mov_b32 s2, -1 6051; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 6052; GFX7LESS-NEXT: s_endpgm 6053; 6054; GFX8-LABEL: umin_i32_varying: 6055; GFX8: ; %bb.0: ; %entry 6056; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 6057; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 6058; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 6059; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 6060; GFX8-NEXT: v_mov_b32_e32 v1, -1 6061; GFX8-NEXT: s_mov_b64 exec, s[2:3] 6062; GFX8-NEXT: v_mov_b32_e32 v2, v0 6063; 
GFX8-NEXT: s_not_b64 exec, exec 6064; GFX8-NEXT: v_mov_b32_e32 v2, -1 6065; GFX8-NEXT: s_not_b64 exec, exec 6066; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 6067; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 6068; GFX8-NEXT: s_nop 1 6069; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 6070; GFX8-NEXT: s_nop 1 6071; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 6072; GFX8-NEXT: s_nop 1 6073; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 6074; GFX8-NEXT: s_nop 1 6075; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 6076; GFX8-NEXT: s_nop 1 6077; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 6078; GFX8-NEXT: v_readlane_b32 s4, v2, 63 6079; GFX8-NEXT: s_nop 0 6080; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 6081; GFX8-NEXT: s_mov_b64 exec, s[2:3] 6082; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 6083; GFX8-NEXT: ; implicit-def: $vgpr0 6084; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 6085; GFX8-NEXT: s_cbranch_execz .LBB23_2 6086; GFX8-NEXT: ; %bb.1: 6087; GFX8-NEXT: v_mov_b32_e32 v0, 0 6088; GFX8-NEXT: v_mov_b32_e32 v3, s4 6089; GFX8-NEXT: s_mov_b32 m0, -1 6090; GFX8-NEXT: s_waitcnt lgkmcnt(0) 6091; GFX8-NEXT: ds_min_rtn_u32 v0, v0, v3 6092; GFX8-NEXT: s_waitcnt lgkmcnt(0) 6093; GFX8-NEXT: .LBB23_2: 6094; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 6095; GFX8-NEXT: s_waitcnt lgkmcnt(0) 6096; GFX8-NEXT: v_readfirstlane_b32 s2, v0 6097; GFX8-NEXT: v_mov_b32_e32 v0, v1 6098; GFX8-NEXT: v_min_u32_e32 v0, s2, v0 6099; GFX8-NEXT: s_mov_b32 s3, 0xf000 6100; GFX8-NEXT: s_mov_b32 s2, -1 6101; GFX8-NEXT: s_nop 0 6102; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 6103; GFX8-NEXT: s_endpgm 6104; 6105; GFX9-LABEL: umin_i32_varying: 6106; GFX9: ; %bb.0: ; %entry 6107; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 6108; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 6109; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, 
exec_hi, v3 6110; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 6111; GFX9-NEXT: v_mov_b32_e32 v1, -1 6112; GFX9-NEXT: s_mov_b64 exec, s[2:3] 6113; GFX9-NEXT: v_mov_b32_e32 v2, v0 6114; GFX9-NEXT: s_not_b64 exec, exec 6115; GFX9-NEXT: v_mov_b32_e32 v2, -1 6116; GFX9-NEXT: s_not_b64 exec, exec 6117; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 6118; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 6119; GFX9-NEXT: s_nop 1 6120; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 6121; GFX9-NEXT: s_nop 1 6122; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 6123; GFX9-NEXT: s_nop 1 6124; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 6125; GFX9-NEXT: s_nop 1 6126; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 6127; GFX9-NEXT: s_nop 1 6128; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 6129; GFX9-NEXT: v_readlane_b32 s4, v2, 63 6130; GFX9-NEXT: s_nop 0 6131; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 6132; GFX9-NEXT: s_mov_b64 exec, s[2:3] 6133; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 6134; GFX9-NEXT: ; implicit-def: $vgpr0 6135; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 6136; GFX9-NEXT: s_cbranch_execz .LBB23_2 6137; GFX9-NEXT: ; %bb.1: 6138; GFX9-NEXT: v_mov_b32_e32 v0, 0 6139; GFX9-NEXT: v_mov_b32_e32 v3, s4 6140; GFX9-NEXT: s_waitcnt lgkmcnt(0) 6141; GFX9-NEXT: ds_min_rtn_u32 v0, v0, v3 6142; GFX9-NEXT: s_waitcnt lgkmcnt(0) 6143; GFX9-NEXT: .LBB23_2: 6144; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 6145; GFX9-NEXT: s_waitcnt lgkmcnt(0) 6146; GFX9-NEXT: v_readfirstlane_b32 s2, v0 6147; GFX9-NEXT: v_mov_b32_e32 v0, v1 6148; GFX9-NEXT: v_min_u32_e32 v0, s2, v0 6149; GFX9-NEXT: s_mov_b32 s3, 0xf000 6150; GFX9-NEXT: s_mov_b32 s2, -1 6151; GFX9-NEXT: s_nop 0 6152; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 6153; GFX9-NEXT: s_endpgm 6154; 6155; GFX1064-LABEL: umin_i32_varying: 6156; GFX1064: ; %bb.0: 
; %entry 6157; GFX1064-NEXT: v_mov_b32_e32 v1, v0 6158; GFX1064-NEXT: s_not_b64 exec, exec 6159; GFX1064-NEXT: v_mov_b32_e32 v1, -1 6160; GFX1064-NEXT: s_not_b64 exec, exec 6161; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 6162; GFX1064-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 6163; GFX1064-NEXT: v_mov_b32_e32 v3, -1 6164; GFX1064-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 6165; GFX1064-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 6166; GFX1064-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 6167; GFX1064-NEXT: v_mov_b32_e32 v2, v1 6168; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 6169; GFX1064-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 6170; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 6171; GFX1064-NEXT: v_mov_b32_e32 v2, s4 6172; GFX1064-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 6173; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 6174; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 6175; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 6176; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 6177; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 6178; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 6179; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 6180; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 6181; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 6182; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 6183; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 6184; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 6185; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 6186; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 6187; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 6188; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 6189; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 6190; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 6191; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 6192; GFX1064-NEXT: s_mov_b32 s2, -1 6193; GFX1064-NEXT: ; implicit-def: $vgpr0 6194; GFX1064-NEXT: 
s_and_saveexec_b64 s[4:5], vcc 6195; GFX1064-NEXT: s_cbranch_execz .LBB23_2 6196; GFX1064-NEXT: ; %bb.1: 6197; GFX1064-NEXT: v_mov_b32_e32 v0, 0 6198; GFX1064-NEXT: v_mov_b32_e32 v4, s7 6199; GFX1064-NEXT: s_mov_b32 s3, s7 6200; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6201; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 6202; GFX1064-NEXT: ds_min_rtn_u32 v0, v0, v4 6203; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 6204; GFX1064-NEXT: buffer_gl0_inv 6205; GFX1064-NEXT: .LBB23_2: 6206; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 6207; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 6208; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 6209; GFX1064-NEXT: v_mov_b32_e32 v0, v3 6210; GFX1064-NEXT: v_min_u32_e32 v0, s3, v0 6211; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 6212; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 6213; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 6214; GFX1064-NEXT: s_endpgm 6215; 6216; GFX1032-LABEL: umin_i32_varying: 6217; GFX1032: ; %bb.0: ; %entry 6218; GFX1032-NEXT: v_mov_b32_e32 v1, v0 6219; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 6220; GFX1032-NEXT: v_mov_b32_e32 v1, -1 6221; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 6222; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 6223; GFX1032-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 6224; GFX1032-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 6225; GFX1032-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 6226; GFX1032-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 6227; GFX1032-NEXT: v_mov_b32_e32 v2, v1 6228; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 6229; GFX1032-NEXT: s_mov_b32 exec_lo, s2 6230; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 6231; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 6232; GFX1032-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 6233; GFX1032-NEXT: v_mov_b32_e32 v3, -1 6234; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 6235; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 6236; GFX1032-NEXT: v_mov_b32_dpp v3, v1 
row_shr:1 row_mask:0xf bank_mask:0xf 6237; GFX1032-NEXT: s_mov_b32 exec_lo, s2 6238; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 6239; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 6240; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 6241; GFX1032-NEXT: s_mov_b32 exec_lo, s2 6242; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 6243; GFX1032-NEXT: s_mov_b32 s2, -1 6244; GFX1032-NEXT: ; implicit-def: $vgpr0 6245; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 6246; GFX1032-NEXT: s_cbranch_execz .LBB23_2 6247; GFX1032-NEXT: ; %bb.1: 6248; GFX1032-NEXT: v_mov_b32_e32 v0, 0 6249; GFX1032-NEXT: v_mov_b32_e32 v4, s4 6250; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6251; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 6252; GFX1032-NEXT: ds_min_rtn_u32 v0, v0, v4 6253; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 6254; GFX1032-NEXT: buffer_gl0_inv 6255; GFX1032-NEXT: .LBB23_2: 6256; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 6257; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 6258; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 6259; GFX1032-NEXT: v_mov_b32_e32 v0, v3 6260; GFX1032-NEXT: v_min_u32_e32 v0, s3, v0 6261; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 6262; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 6263; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 6264; GFX1032-NEXT: s_endpgm 6265; 6266; GFX1164-LABEL: umin_i32_varying: 6267; GFX1164: ; %bb.0: ; %entry 6268; GFX1164-NEXT: v_mov_b32_e32 v1, v0 6269; GFX1164-NEXT: s_not_b64 exec, exec 6270; GFX1164-NEXT: v_mov_b32_e32 v1, -1 6271; GFX1164-NEXT: s_not_b64 exec, exec 6272; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 6273; GFX1164-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 6274; GFX1164-NEXT: v_mov_b32_e32 v3, -1 6275; GFX1164-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 6276; GFX1164-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 6277; GFX1164-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 6278; GFX1164-NEXT: v_mov_b32_e32 v2, v1 6279; GFX1164-NEXT: v_permlanex16_b32 v2, v2, 
-1, -1 6280; GFX1164-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 6281; GFX1164-NEXT: v_readlane_b32 s4, v1, 31 6282; GFX1164-NEXT: v_mov_b32_e32 v2, s4 6283; GFX1164-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 6284; GFX1164-NEXT: v_readlane_b32 s4, v1, 15 6285; GFX1164-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 6286; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 6287; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 6288; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 6289; GFX1164-NEXT: v_readlane_b32 s5, v1, 31 6290; GFX1164-NEXT: v_writelane_b32 v3, s4, 16 6291; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 6292; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 6293; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 6294; GFX1164-NEXT: v_readlane_b32 s7, v1, 63 6295; GFX1164-NEXT: v_readlane_b32 s6, v1, 47 6296; GFX1164-NEXT: v_writelane_b32 v3, s5, 32 6297; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 6298; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 6299; GFX1164-NEXT: s_or_saveexec_b64 s[4:5], -1 6300; GFX1164-NEXT: v_writelane_b32 v3, s6, 48 6301; GFX1164-NEXT: s_mov_b64 exec, s[4:5] 6302; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 6303; GFX1164-NEXT: s_mov_b32 s2, -1 6304; GFX1164-NEXT: ; implicit-def: $vgpr0 6305; GFX1164-NEXT: s_and_saveexec_b64 s[4:5], vcc 6306; GFX1164-NEXT: s_cbranch_execz .LBB23_2 6307; GFX1164-NEXT: ; %bb.1: 6308; GFX1164-NEXT: v_mov_b32_e32 v0, 0 6309; GFX1164-NEXT: v_mov_b32_e32 v4, s7 6310; GFX1164-NEXT: s_mov_b32 s3, s7 6311; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6312; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 6313; GFX1164-NEXT: ds_min_rtn_u32 v0, v0, v4 6314; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 6315; GFX1164-NEXT: buffer_gl0_inv 6316; GFX1164-NEXT: .LBB23_2: 6317; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] 6318; GFX1164-NEXT: v_readfirstlane_b32 s3, v0 6319; GFX1164-NEXT: v_mov_b32_e32 v0, v3 6320; GFX1164-NEXT: v_min_u32_e32 v0, s3, v0 6321; GFX1164-NEXT: s_mov_b32 s3, 
0x31016000 6322; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 6323; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 6324; GFX1164-NEXT: s_endpgm 6325; 6326; GFX1132-LABEL: umin_i32_varying: 6327; GFX1132: ; %bb.0: ; %entry 6328; GFX1132-NEXT: v_mov_b32_e32 v1, v0 6329; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo 6330; GFX1132-NEXT: v_mov_b32_e32 v1, -1 6331; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo 6332; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 6333; GFX1132-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 6334; GFX1132-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 6335; GFX1132-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 6336; GFX1132-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 6337; GFX1132-NEXT: v_mov_b32_e32 v2, v1 6338; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1 6339; GFX1132-NEXT: s_mov_b32 exec_lo, s2 6340; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 6341; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 6342; GFX1132-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 6343; GFX1132-NEXT: v_mov_b32_e32 v3, -1 6344; GFX1132-NEXT: v_readlane_b32 s3, v1, 15 6345; GFX1132-NEXT: v_readlane_b32 s4, v1, 31 6346; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 6347; GFX1132-NEXT: s_mov_b32 exec_lo, s2 6348; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 6349; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 6350; GFX1132-NEXT: v_writelane_b32 v3, s3, 16 6351; GFX1132-NEXT: s_mov_b32 exec_lo, s2 6352; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 6353; GFX1132-NEXT: s_mov_b32 s2, -1 6354; GFX1132-NEXT: ; implicit-def: $vgpr0 6355; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo 6356; GFX1132-NEXT: s_cbranch_execz .LBB23_2 6357; GFX1132-NEXT: ; %bb.1: 6358; GFX1132-NEXT: v_mov_b32_e32 v0, 0 6359; GFX1132-NEXT: v_mov_b32_e32 v4, s4 6360; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6361; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 6362; GFX1132-NEXT: ds_min_rtn_u32 
v0, v0, v4 6363; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 6364; GFX1132-NEXT: buffer_gl0_inv 6365; GFX1132-NEXT: .LBB23_2: 6366; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 6367; GFX1132-NEXT: v_readfirstlane_b32 s3, v0 6368; GFX1132-NEXT: v_mov_b32_e32 v0, v3 6369; GFX1132-NEXT: v_min_u32_e32 v0, s3, v0 6370; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 6371; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 6372; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 6373; GFX1132-NEXT: s_endpgm 6374entry: 6375 %lane = call i32 @llvm.amdgcn.workitem.id.x() 6376 %old = atomicrmw umin i32 addrspace(3)* @local_var32, i32 %lane acq_rel 6377 store i32 %old, i32 addrspace(1)* %out 6378 ret void 6379} 6380 6381define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) { 6382; Constant-operand i64 case. The kernel performs atomicrmw umin of the constant 5 on @local_var64 (addrspace(3)) with acq_rel ordering and stores the returned old value to %out. 6383; The checks below expect only the lane whose mbcnt result is 0 to issue a single ds_min_rtn_u64, after which the result is read with v_readfirstlane and the per-lane value is rebuilt with a 64-bit unsigned compare and two selects. 6384; GFX7LESS-LABEL: umin_i64_constant: 6385; GFX7LESS: ; %bb.0: ; %entry 6386; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 6387; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 6388; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 6389; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 6390; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 6391; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc 6392; GFX7LESS-NEXT: s_cbranch_execz .LBB24_2 6393; GFX7LESS-NEXT: ; %bb.1: 6394; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0 6395; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5 6396; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 6397; GFX7LESS-NEXT: s_mov_b32 m0, -1 6398; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 6399; GFX7LESS-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1] 6400; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 6401; GFX7LESS-NEXT: .LBB24_2: 6402; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] 6403; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 6404; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 6405; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 6406; GFX7LESS-NEXT: s_mov_b32 s2, -1 6407; GFX7LESS-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc 6408; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 6409; GFX7LESS-NEXT: v_mov_b32_e32 v2, s5 6410; GFX7LESS-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5],
v[0:1] 6411; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 6412; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4 6413; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 6414; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 6415; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 6416; GFX7LESS-NEXT: s_endpgm 6417; 6418; GFX8-LABEL: umin_i64_constant: 6419; GFX8: ; %bb.0: ; %entry 6420; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 6421; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 6422; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 6423; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 6424; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 6425; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 6426; GFX8-NEXT: s_cbranch_execz .LBB24_2 6427; GFX8-NEXT: ; %bb.1: 6428; GFX8-NEXT: v_mov_b32_e32 v0, 5 6429; GFX8-NEXT: v_mov_b32_e32 v2, 0 6430; GFX8-NEXT: v_mov_b32_e32 v1, 0 6431; GFX8-NEXT: s_mov_b32 m0, -1 6432; GFX8-NEXT: s_waitcnt lgkmcnt(0) 6433; GFX8-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1] 6434; GFX8-NEXT: s_waitcnt lgkmcnt(0) 6435; GFX8-NEXT: .LBB24_2: 6436; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 6437; GFX8-NEXT: s_waitcnt lgkmcnt(0) 6438; GFX8-NEXT: v_readfirstlane_b32 s4, v0 6439; GFX8-NEXT: v_readfirstlane_b32 s5, v1 6440; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc 6441; GFX8-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 6442; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1] 6443; GFX8-NEXT: v_mov_b32_e32 v2, s5 6444; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 6445; GFX8-NEXT: v_mov_b32_e32 v2, s4 6446; GFX8-NEXT: s_mov_b32 s2, -1 6447; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 6448; GFX8-NEXT: s_mov_b32 s3, 0xf000 6449; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 6450; GFX8-NEXT: s_endpgm 6451; 6452; GFX9-LABEL: umin_i64_constant: 6453; GFX9: ; %bb.0: ; %entry 6454; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 6455; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 6456; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 6457; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 6458; GFX9-NEXT: ; implicit-def:
$vgpr0_vgpr1 6459; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 6460; GFX9-NEXT: s_cbranch_execz .LBB24_2 6461; GFX9-NEXT: ; %bb.1: 6462; GFX9-NEXT: v_mov_b32_e32 v0, 5 6463; GFX9-NEXT: v_mov_b32_e32 v1, 0 6464; GFX9-NEXT: v_mov_b32_e32 v2, 0 6465; GFX9-NEXT: s_waitcnt lgkmcnt(0) 6466; GFX9-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1] 6467; GFX9-NEXT: s_waitcnt lgkmcnt(0) 6468; GFX9-NEXT: .LBB24_2: 6469; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 6470; GFX9-NEXT: s_waitcnt lgkmcnt(0) 6471; GFX9-NEXT: v_readfirstlane_b32 s4, v0 6472; GFX9-NEXT: v_readfirstlane_b32 s5, v1 6473; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc 6474; GFX9-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 6475; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1] 6476; GFX9-NEXT: v_mov_b32_e32 v2, s5 6477; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 6478; GFX9-NEXT: v_mov_b32_e32 v2, s4 6479; GFX9-NEXT: s_mov_b32 s2, -1 6480; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 6481; GFX9-NEXT: s_mov_b32 s3, 0xf000 6482; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 6483; GFX9-NEXT: s_endpgm 6484; 6485; GFX1064-LABEL: umin_i64_constant: 6486; GFX1064: ; %bb.0: ; %entry 6487; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 6488; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 6489; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 6490; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 6491; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 6492; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc 6493; GFX1064-NEXT: s_cbranch_execz .LBB24_2 6494; GFX1064-NEXT: ; %bb.1: 6495; GFX1064-NEXT: v_mov_b32_e32 v0, 5 6496; GFX1064-NEXT: v_mov_b32_e32 v1, 0 6497; GFX1064-NEXT: v_mov_b32_e32 v2, 0 6498; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6499; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 6500; GFX1064-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1] 6501; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 6502; GFX1064-NEXT: buffer_gl0_inv 6503; GFX1064-NEXT: .LBB24_2: 6504; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 6505; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] 6506;
GFX1064-NEXT: v_readfirstlane_b32 s2, v0 6507; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 6508; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc 6509; GFX1064-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 6510; GFX1064-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1] 6511; GFX1064-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc 6512; GFX1064-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc 6513; GFX1064-NEXT: s_mov_b32 s2, -1 6514; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 6515; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 6516; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 6517; GFX1064-NEXT: s_endpgm 6518; 6519; GFX1032-LABEL: umin_i64_constant: 6520; GFX1032: ; %bb.0: ; %entry 6521; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 6522; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 6523; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 6524; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 6525; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo 6526; GFX1032-NEXT: s_cbranch_execz .LBB24_2 6527; GFX1032-NEXT: ; %bb.1: 6528; GFX1032-NEXT: v_mov_b32_e32 v0, 5 6529; GFX1032-NEXT: v_mov_b32_e32 v1, 0 6530; GFX1032-NEXT: v_mov_b32_e32 v2, 0 6531; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6532; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 6533; GFX1032-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1] 6534; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 6535; GFX1032-NEXT: buffer_gl0_inv 6536; GFX1032-NEXT: .LBB24_2: 6537; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 6538; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 6539; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 6540; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 6541; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo 6542; GFX1032-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc_lo 6543; GFX1032-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[2:3], v[0:1] 6544; GFX1032-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo 6545; GFX1032-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo 6546; GFX1032-NEXT: s_mov_b32 s2, -1 6547; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 6548; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 6549; GFX1032-NEXT:
buffer_store_dwordx2 v[0:1], off, s[0:3], 0 6550; GFX1032-NEXT: s_endpgm 6551; 6552; GFX1164-LABEL: umin_i64_constant: 6553; GFX1164: ; %bb.0: ; %entry 6554; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 6555; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 6556; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 6557; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 6558; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 6559; GFX1164-NEXT: s_and_saveexec_b64 s[2:3], vcc 6560; GFX1164-NEXT: s_cbranch_execz .LBB24_2 6561; GFX1164-NEXT: ; %bb.1: 6562; GFX1164-NEXT: v_mov_b32_e32 v0, 5 6563; GFX1164-NEXT: v_mov_b32_e32 v1, 0 6564; GFX1164-NEXT: v_mov_b32_e32 v2, 0 6565; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6566; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 6567; GFX1164-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1] 6568; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 6569; GFX1164-NEXT: buffer_gl0_inv 6570; GFX1164-NEXT: .LBB24_2: 6571; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] 6572; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 6573; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 6574; GFX1164-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc 6575; GFX1164-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 6576; GFX1164-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1] 6577; GFX1164-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc 6578; GFX1164-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc 6579; GFX1164-NEXT: s_mov_b32 s2, -1 6580; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 6581; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 6582; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 6583; GFX1164-NEXT: s_endpgm 6584; 6585; GFX1132-LABEL: umin_i64_constant: 6586; GFX1132: ; %bb.0: ; %entry 6587; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 6588; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 6589; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 6590; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 6591; GFX1132-NEXT: s_and_saveexec_b32 s2, vcc_lo 6592; GFX1132-NEXT: s_cbranch_execz .LBB24_2 6593; GFX1132-NEXT: ; %bb.1: 6594; GFX1132-NEXT: v_mov_b32_e32 v0, 5 6595;
GFX1132-NEXT: v_mov_b32_e32 v1, 0 6596; GFX1132-NEXT: v_mov_b32_e32 v2, 0 6597; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6598; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 6599; GFX1132-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1] 6600; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 6601; GFX1132-NEXT: buffer_gl0_inv 6602; GFX1132-NEXT: .LBB24_2: 6603; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2 6604; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 6605; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 6606; GFX1132-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo 6607; GFX1132-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc_lo 6608; GFX1132-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[2:3], v[0:1] 6609; GFX1132-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo 6610; GFX1132-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo 6611; GFX1132-NEXT: s_mov_b32 s2, -1 6612; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 6613; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 6614; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 6615; GFX1132-NEXT: s_endpgm 6616entry: 6617 %old = atomicrmw umin i64 addrspace(3)* @local_var64, i64 5 acq_rel 6618 store i64 %old, i64 addrspace(1)* %out 6619 ret void 6620} 6621