; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --force-update
; RUN: llc -march=amdgcn -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX7LESS %s
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX1064 %s
; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX1032 %s
; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX1164 %s
; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX1132 %s

; Work-item id intrinsic; the "varying" kernels in this file call it to obtain
; a per-lane operand for the atomic.
declare i32 @llvm.amdgcn.workitem.id.x()

; 32-bit and 64-bit globals in addrspace(3). @local_var32 is the atomicrmw
; target of the i32 kernels that follow.
@local_var32 = addrspace(3) global i32 undef, align 4
@local_var64 = addrspace(3) global i64 undef, align 8

; Show what the atomic optimization pass will do for local pointers.
16 17define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out) { 18; 19; 20; GFX7LESS-LABEL: add_i32_constant: 21; GFX7LESS: ; %bb.0: ; %entry 22; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec 23; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 24; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 25; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 26; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 27; GFX7LESS-NEXT: ; implicit-def: $vgpr1 28; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc 29; GFX7LESS-NEXT: s_cbranch_execz .LBB0_2 30; GFX7LESS-NEXT: ; %bb.1: 31; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 32; GFX7LESS-NEXT: s_mul_i32 s2, s2, 5 33; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 34; GFX7LESS-NEXT: v_mov_b32_e32 v2, s2 35; GFX7LESS-NEXT: s_mov_b32 m0, -1 36; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 37; GFX7LESS-NEXT: ds_add_rtn_u32 v1, v1, v2 38; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 39; GFX7LESS-NEXT: .LBB0_2: 40; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] 41; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 42; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v1 43; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 44; GFX7LESS-NEXT: v_mad_u32_u24 v0, v0, 5, s2 45; GFX7LESS-NEXT: s_mov_b32 s2, -1 46; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 47; GFX7LESS-NEXT: s_endpgm 48; 49; GFX8-LABEL: add_i32_constant: 50; GFX8: ; %bb.0: ; %entry 51; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 52; GFX8-NEXT: s_mov_b64 s[2:3], exec 53; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 54; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 55; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 56; GFX8-NEXT: ; implicit-def: $vgpr1 57; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 58; GFX8-NEXT: s_cbranch_execz .LBB0_2 59; GFX8-NEXT: ; %bb.1: 60; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 61; GFX8-NEXT: s_mul_i32 s2, s2, 5 62; GFX8-NEXT: v_mov_b32_e32 v1, 0 63; GFX8-NEXT: v_mov_b32_e32 v2, s2 64; GFX8-NEXT: s_mov_b32 m0, -1 65; GFX8-NEXT: s_waitcnt lgkmcnt(0) 66; GFX8-NEXT: ds_add_rtn_u32 v1, v1, v2 67; GFX8-NEXT: s_waitcnt lgkmcnt(0) 68; GFX8-NEXT: 
.LBB0_2: 69; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 70; GFX8-NEXT: s_waitcnt lgkmcnt(0) 71; GFX8-NEXT: v_readfirstlane_b32 s2, v1 72; GFX8-NEXT: v_mad_u32_u24 v0, v0, 5, s2 73; GFX8-NEXT: s_mov_b32 s3, 0xf000 74; GFX8-NEXT: s_mov_b32 s2, -1 75; GFX8-NEXT: s_nop 1 76; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 77; GFX8-NEXT: s_endpgm 78; 79; GFX9-LABEL: add_i32_constant: 80; GFX9: ; %bb.0: ; %entry 81; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 82; GFX9-NEXT: s_mov_b64 s[2:3], exec 83; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 84; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 85; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 86; GFX9-NEXT: ; implicit-def: $vgpr1 87; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 88; GFX9-NEXT: s_cbranch_execz .LBB0_2 89; GFX9-NEXT: ; %bb.1: 90; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 91; GFX9-NEXT: s_mul_i32 s2, s2, 5 92; GFX9-NEXT: v_mov_b32_e32 v1, 0 93; GFX9-NEXT: v_mov_b32_e32 v2, s2 94; GFX9-NEXT: s_waitcnt lgkmcnt(0) 95; GFX9-NEXT: ds_add_rtn_u32 v1, v1, v2 96; GFX9-NEXT: s_waitcnt lgkmcnt(0) 97; GFX9-NEXT: .LBB0_2: 98; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 99; GFX9-NEXT: s_waitcnt lgkmcnt(0) 100; GFX9-NEXT: v_readfirstlane_b32 s2, v1 101; GFX9-NEXT: v_mad_u32_u24 v0, v0, 5, s2 102; GFX9-NEXT: s_mov_b32 s3, 0xf000 103; GFX9-NEXT: s_mov_b32 s2, -1 104; GFX9-NEXT: s_nop 1 105; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 106; GFX9-NEXT: s_endpgm 107; 108; GFX1064-LABEL: add_i32_constant: 109; GFX1064: ; %bb.0: ; %entry 110; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 111; GFX1064-NEXT: s_mov_b64 s[2:3], exec 112; GFX1064-NEXT: ; implicit-def: $vgpr1 113; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 114; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 115; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 116; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 117; GFX1064-NEXT: s_cbranch_execz .LBB0_2 118; GFX1064-NEXT: ; %bb.1: 119; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 120; GFX1064-NEXT: v_mov_b32_e32 v1, 0 121; GFX1064-NEXT: s_mul_i32 s2, s2, 
5 122; GFX1064-NEXT: v_mov_b32_e32 v2, s2 123; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 124; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 125; GFX1064-NEXT: ds_add_rtn_u32 v1, v1, v2 126; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 127; GFX1064-NEXT: buffer_gl0_inv 128; GFX1064-NEXT: .LBB0_2: 129; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 130; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 131; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 132; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 133; GFX1064-NEXT: v_mad_u32_u24 v0, v0, 5, s2 134; GFX1064-NEXT: s_mov_b32 s2, -1 135; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 136; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 137; GFX1064-NEXT: s_endpgm 138; 139; GFX1032-LABEL: add_i32_constant: 140; GFX1032: ; %bb.0: ; %entry 141; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 142; GFX1032-NEXT: s_mov_b32 s3, exec_lo 143; GFX1032-NEXT: ; implicit-def: $vgpr1 144; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 145; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 146; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo 147; GFX1032-NEXT: s_cbranch_execz .LBB0_2 148; GFX1032-NEXT: ; %bb.1: 149; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 150; GFX1032-NEXT: v_mov_b32_e32 v1, 0 151; GFX1032-NEXT: s_mul_i32 s3, s3, 5 152; GFX1032-NEXT: v_mov_b32_e32 v2, s3 153; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 154; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 155; GFX1032-NEXT: ds_add_rtn_u32 v1, v1, v2 156; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 157; GFX1032-NEXT: buffer_gl0_inv 158; GFX1032-NEXT: .LBB0_2: 159; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 160; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 161; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 162; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 163; GFX1032-NEXT: v_mad_u32_u24 v0, v0, 5, s2 164; GFX1032-NEXT: s_mov_b32 s2, -1 165; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 166; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 167; GFX1032-NEXT: s_endpgm 168; 169; GFX1164-LABEL: add_i32_constant: 170; GFX1164: ; %bb.0: ; %entry 171; GFX1164-NEXT: s_load_b64 
s[0:1], s[0:1], 0x24 172; GFX1164-NEXT: s_mov_b64 s[2:3], exec 173; GFX1164-NEXT: s_mov_b64 s[4:5], exec 174; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 175; GFX1164-NEXT: ; implicit-def: $vgpr1 176; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 177; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 178; GFX1164-NEXT: s_cbranch_execz .LBB0_2 179; GFX1164-NEXT: ; %bb.1: 180; GFX1164-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 181; GFX1164-NEXT: v_mov_b32_e32 v1, 0 182; GFX1164-NEXT: s_mul_i32 s2, s2, 5 183; GFX1164-NEXT: v_mov_b32_e32 v2, s2 184; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 185; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 186; GFX1164-NEXT: ds_add_rtn_u32 v1, v1, v2 187; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 188; GFX1164-NEXT: buffer_gl0_inv 189; GFX1164-NEXT: .LBB0_2: 190; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] 191; GFX1164-NEXT: v_readfirstlane_b32 s2, v1 192; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 193; GFX1164-NEXT: v_mad_u32_u24 v0, v0, 5, s2 194; GFX1164-NEXT: s_mov_b32 s2, -1 195; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 196; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 197; GFX1164-NEXT: s_endpgm 198; 199; GFX1132-LABEL: add_i32_constant: 200; GFX1132: ; %bb.0: ; %entry 201; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 202; GFX1132-NEXT: s_mov_b32 s3, exec_lo 203; GFX1132-NEXT: s_mov_b32 s2, exec_lo 204; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 205; GFX1132-NEXT: ; implicit-def: $vgpr1 206; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 207; GFX1132-NEXT: s_cbranch_execz .LBB0_2 208; GFX1132-NEXT: ; %bb.1: 209; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s3 210; GFX1132-NEXT: v_mov_b32_e32 v1, 0 211; GFX1132-NEXT: s_mul_i32 s3, s3, 5 212; GFX1132-NEXT: v_mov_b32_e32 v2, s3 213; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 214; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 215; GFX1132-NEXT: ds_add_rtn_u32 v1, v1, v2 216; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 217; GFX1132-NEXT: buffer_gl0_inv 218; GFX1132-NEXT: .LBB0_2: 219; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2 220; GFX1132-NEXT: 
v_readfirstlane_b32 s2, v1 221; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 222; GFX1132-NEXT: v_mad_u32_u24 v0, v0, 5, s2 223; GFX1132-NEXT: s_mov_b32 s2, -1 224; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 225; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 226; GFX1132-NEXT: s_endpgm 227entry: 228 %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 5 acq_rel 229 store i32 %old, i32 addrspace(1)* %out 230 ret void 231} 232 233define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 %additive) { 234; 235; 236; GFX7LESS-LABEL: add_i32_uniform: 237; GFX7LESS: ; %bb.0: ; %entry 238; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec 239; GFX7LESS-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 240; GFX7LESS-NEXT: s_load_dword s6, s[0:1], 0xb 241; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 242; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 243; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 244; GFX7LESS-NEXT: ; implicit-def: $vgpr1 245; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc 246; GFX7LESS-NEXT: s_cbranch_execz .LBB1_2 247; GFX7LESS-NEXT: ; %bb.1: 248; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 249; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 250; GFX7LESS-NEXT: s_mul_i32 s2, s6, s2 251; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 252; GFX7LESS-NEXT: v_mov_b32_e32 v2, s2 253; GFX7LESS-NEXT: s_mov_b32 m0, -1 254; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 255; GFX7LESS-NEXT: ds_add_rtn_u32 v1, v1, v2 256; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 257; GFX7LESS-NEXT: .LBB1_2: 258; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1] 259; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 260; GFX7LESS-NEXT: v_readfirstlane_b32 s0, v1 261; GFX7LESS-NEXT: v_mul_lo_u32 v0, s6, v0 262; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 263; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s0, v0 264; GFX7LESS-NEXT: s_mov_b32 s6, -1 265; GFX7LESS-NEXT: buffer_store_dword v0, off, s[4:7], 0 266; GFX7LESS-NEXT: s_endpgm 267; 268; GFX8-LABEL: add_i32_uniform: 269; GFX8: ; %bb.0: ; %entry 270; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 271; GFX8-NEXT: 
s_load_dword s6, s[0:1], 0x2c 272; GFX8-NEXT: s_mov_b64 s[2:3], exec 273; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 274; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 275; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 276; GFX8-NEXT: ; implicit-def: $vgpr1 277; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc 278; GFX8-NEXT: s_cbranch_execz .LBB1_2 279; GFX8-NEXT: ; %bb.1: 280; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 281; GFX8-NEXT: s_waitcnt lgkmcnt(0) 282; GFX8-NEXT: s_mul_i32 s2, s6, s2 283; GFX8-NEXT: v_mov_b32_e32 v1, 0 284; GFX8-NEXT: v_mov_b32_e32 v2, s2 285; GFX8-NEXT: s_mov_b32 m0, -1 286; GFX8-NEXT: s_waitcnt lgkmcnt(0) 287; GFX8-NEXT: ds_add_rtn_u32 v1, v1, v2 288; GFX8-NEXT: s_waitcnt lgkmcnt(0) 289; GFX8-NEXT: .LBB1_2: 290; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] 291; GFX8-NEXT: s_waitcnt lgkmcnt(0) 292; GFX8-NEXT: v_mul_lo_u32 v0, s6, v0 293; GFX8-NEXT: v_readfirstlane_b32 s0, v1 294; GFX8-NEXT: s_mov_b32 s7, 0xf000 295; GFX8-NEXT: s_mov_b32 s6, -1 296; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 297; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0 298; GFX8-NEXT: s_endpgm 299; 300; GFX9-LABEL: add_i32_uniform: 301; GFX9: ; %bb.0: ; %entry 302; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 303; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c 304; GFX9-NEXT: s_mov_b64 s[2:3], exec 305; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 306; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 307; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 308; GFX9-NEXT: ; implicit-def: $vgpr1 309; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc 310; GFX9-NEXT: s_cbranch_execz .LBB1_2 311; GFX9-NEXT: ; %bb.1: 312; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 313; GFX9-NEXT: s_waitcnt lgkmcnt(0) 314; GFX9-NEXT: s_mul_i32 s2, s6, s2 315; GFX9-NEXT: v_mov_b32_e32 v1, 0 316; GFX9-NEXT: v_mov_b32_e32 v2, s2 317; GFX9-NEXT: s_waitcnt lgkmcnt(0) 318; GFX9-NEXT: ds_add_rtn_u32 v1, v1, v2 319; GFX9-NEXT: s_waitcnt lgkmcnt(0) 320; GFX9-NEXT: .LBB1_2: 321; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] 322; GFX9-NEXT: s_waitcnt lgkmcnt(0) 323; GFX9-NEXT: 
v_mul_lo_u32 v0, s6, v0 324; GFX9-NEXT: v_readfirstlane_b32 s0, v1 325; GFX9-NEXT: s_mov_b32 s7, 0xf000 326; GFX9-NEXT: s_mov_b32 s6, -1 327; GFX9-NEXT: v_add_u32_e32 v0, s0, v0 328; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 329; GFX9-NEXT: s_endpgm 330; 331; GFX1064-LABEL: add_i32_uniform: 332; GFX1064: ; %bb.0: ; %entry 333; GFX1064-NEXT: s_clause 0x1 334; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 335; GFX1064-NEXT: s_load_dword s6, s[0:1], 0x2c 336; GFX1064-NEXT: s_mov_b64 s[2:3], exec 337; GFX1064-NEXT: ; implicit-def: $vgpr1 338; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 339; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 340; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 341; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc 342; GFX1064-NEXT: s_cbranch_execz .LBB1_2 343; GFX1064-NEXT: ; %bb.1: 344; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 345; GFX1064-NEXT: v_mov_b32_e32 v1, 0 346; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 347; GFX1064-NEXT: s_mul_i32 s2, s6, s2 348; GFX1064-NEXT: v_mov_b32_e32 v2, s2 349; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 350; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 351; GFX1064-NEXT: ds_add_rtn_u32 v1, v1, v2 352; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 353; GFX1064-NEXT: buffer_gl0_inv 354; GFX1064-NEXT: .LBB1_2: 355; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 356; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] 357; GFX1064-NEXT: v_readfirstlane_b32 s0, v1 358; GFX1064-NEXT: s_mov_b32 s7, 0x31016000 359; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 360; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s6, v0, s[0:1] 361; GFX1064-NEXT: s_mov_b32 s6, -1 362; GFX1064-NEXT: buffer_store_dword v0, off, s[4:7], 0 363; GFX1064-NEXT: s_endpgm 364; 365; GFX1032-LABEL: add_i32_uniform: 366; GFX1032: ; %bb.0: ; %entry 367; GFX1032-NEXT: s_clause 0x1 368; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 369; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x2c 370; GFX1032-NEXT: s_mov_b32 s3, exec_lo 371; GFX1032-NEXT: ; implicit-def: $vgpr1 372; GFX1032-NEXT: v_mbcnt_lo_u32_b32 
v0, s3, 0 373; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 374; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo 375; GFX1032-NEXT: s_cbranch_execz .LBB1_2 376; GFX1032-NEXT: ; %bb.1: 377; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s3 378; GFX1032-NEXT: v_mov_b32_e32 v1, 0 379; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 380; GFX1032-NEXT: s_mul_i32 s1, s2, s1 381; GFX1032-NEXT: v_mov_b32_e32 v2, s1 382; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 383; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 384; GFX1032-NEXT: ds_add_rtn_u32 v1, v1, v2 385; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 386; GFX1032-NEXT: buffer_gl0_inv 387; GFX1032-NEXT: .LBB1_2: 388; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 389; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 390; GFX1032-NEXT: v_readfirstlane_b32 s0, v1 391; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 392; GFX1032-NEXT: s_mov_b32 s6, -1 393; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 394; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s0, s2, v0, s[0:1] 395; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0 396; GFX1032-NEXT: s_endpgm 397; 398; GFX1164-LABEL: add_i32_uniform: 399; GFX1164: ; %bb.0: ; %entry 400; GFX1164-NEXT: s_clause 0x1 401; GFX1164-NEXT: s_load_b64 s[4:5], s[0:1], 0x24 402; GFX1164-NEXT: s_load_b32 s6, s[0:1], 0x2c 403; GFX1164-NEXT: s_mov_b64 s[2:3], exec 404; GFX1164-NEXT: s_mov_b64 s[0:1], exec 405; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 406; GFX1164-NEXT: ; implicit-def: $vgpr1 407; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 408; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 409; GFX1164-NEXT: s_cbranch_execz .LBB1_2 410; GFX1164-NEXT: ; %bb.1: 411; GFX1164-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 412; GFX1164-NEXT: v_mov_b32_e32 v1, 0 413; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 414; GFX1164-NEXT: s_mul_i32 s2, s6, s2 415; GFX1164-NEXT: v_mov_b32_e32 v2, s2 416; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 417; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 418; GFX1164-NEXT: ds_add_rtn_u32 v1, v1, v2 419; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 420; GFX1164-NEXT: buffer_gl0_inv 421; 
GFX1164-NEXT: .LBB1_2: 422; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] 423; GFX1164-NEXT: v_readfirstlane_b32 s0, v1 424; GFX1164-NEXT: s_mov_b32 s7, 0x31016000 425; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 426; GFX1164-NEXT: v_mad_u64_u32 v[1:2], null, s6, v0, s[0:1] 427; GFX1164-NEXT: s_mov_b32 s6, -1 428; GFX1164-NEXT: buffer_store_b32 v1, off, s[4:7], 0 429; GFX1164-NEXT: s_endpgm 430; 431; GFX1132-LABEL: add_i32_uniform: 432; GFX1132: ; %bb.0: ; %entry 433; GFX1132-NEXT: s_clause 0x1 434; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x24 435; GFX1132-NEXT: s_load_b32 s0, s[0:1], 0x2c 436; GFX1132-NEXT: s_mov_b32 s2, exec_lo 437; GFX1132-NEXT: s_mov_b32 s1, exec_lo 438; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 439; GFX1132-NEXT: ; implicit-def: $vgpr1 440; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 441; GFX1132-NEXT: s_cbranch_execz .LBB1_2 442; GFX1132-NEXT: ; %bb.1: 443; GFX1132-NEXT: s_bcnt1_i32_b32 s2, s2 444; GFX1132-NEXT: v_mov_b32_e32 v1, 0 445; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 446; GFX1132-NEXT: s_mul_i32 s2, s0, s2 447; GFX1132-NEXT: v_mov_b32_e32 v2, s2 448; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 449; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 450; GFX1132-NEXT: ds_add_rtn_u32 v1, v1, v2 451; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 452; GFX1132-NEXT: buffer_gl0_inv 453; GFX1132-NEXT: .LBB1_2: 454; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s1 455; GFX1132-NEXT: v_readfirstlane_b32 s2, v1 456; GFX1132-NEXT: s_mov_b32 s7, 0x31016000 457; GFX1132-NEXT: s_mov_b32 s6, -1 458; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 459; GFX1132-NEXT: v_mad_u64_u32 v[1:2], null, s0, v0, s[2:3] 460; GFX1132-NEXT: buffer_store_b32 v1, off, s[4:7], 0 461; GFX1132-NEXT: s_endpgm 462entry: 463 %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %additive acq_rel 464 store i32 %old, i32 addrspace(1)* %out 465 ret void 466} 467 468define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out) { 469; 470; 471; GFX7LESS-LABEL: add_i32_varying: 472; GFX7LESS: ; %bb.0: ; %entry 473; GFX7LESS-NEXT: 
s_load_dwordx2 s[0:1], s[0:1], 0x9 474; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 475; GFX7LESS-NEXT: s_mov_b32 m0, -1 476; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 477; GFX7LESS-NEXT: ds_add_rtn_u32 v0, v1, v0 478; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 479; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 480; GFX7LESS-NEXT: s_mov_b32 s2, -1 481; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 482; GFX7LESS-NEXT: s_endpgm 483; 484; GFX8-LABEL: add_i32_varying: 485; GFX8: ; %bb.0: ; %entry 486; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 487; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 488; GFX8-NEXT: v_mov_b32_e32 v1, 0 489; GFX8-NEXT: s_mov_b64 exec, s[2:3] 490; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 491; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 492; GFX8-NEXT: v_mov_b32_e32 v2, v0 493; GFX8-NEXT: s_not_b64 exec, exec 494; GFX8-NEXT: v_mov_b32_e32 v2, 0 495; GFX8-NEXT: s_not_b64 exec, exec 496; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 497; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 498; GFX8-NEXT: s_nop 1 499; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 500; GFX8-NEXT: s_nop 1 501; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 502; GFX8-NEXT: s_nop 1 503; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 504; GFX8-NEXT: s_nop 1 505; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 506; GFX8-NEXT: s_nop 1 507; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 508; GFX8-NEXT: v_readlane_b32 s4, v2, 63 509; GFX8-NEXT: s_nop 0 510; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 511; GFX8-NEXT: s_mov_b64 exec, s[2:3] 512; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 513; GFX8-NEXT: ; implicit-def: $vgpr0 514; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 515; GFX8-NEXT: s_cbranch_execz .LBB2_2 516; GFX8-NEXT: ; %bb.1: 517; 
GFX8-NEXT: v_mov_b32_e32 v0, 0 518; GFX8-NEXT: v_mov_b32_e32 v3, s4 519; GFX8-NEXT: s_mov_b32 m0, -1 520; GFX8-NEXT: s_waitcnt lgkmcnt(0) 521; GFX8-NEXT: ds_add_rtn_u32 v0, v0, v3 522; GFX8-NEXT: s_waitcnt lgkmcnt(0) 523; GFX8-NEXT: .LBB2_2: 524; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 525; GFX8-NEXT: s_waitcnt lgkmcnt(0) 526; GFX8-NEXT: v_readfirstlane_b32 s2, v0 527; GFX8-NEXT: v_mov_b32_e32 v0, v1 528; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0 529; GFX8-NEXT: s_mov_b32 s3, 0xf000 530; GFX8-NEXT: s_mov_b32 s2, -1 531; GFX8-NEXT: s_nop 0 532; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 533; GFX8-NEXT: s_endpgm 534; 535; GFX9-LABEL: add_i32_varying: 536; GFX9: ; %bb.0: ; %entry 537; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 538; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 539; GFX9-NEXT: v_mov_b32_e32 v1, 0 540; GFX9-NEXT: s_mov_b64 exec, s[2:3] 541; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 542; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 543; GFX9-NEXT: v_mov_b32_e32 v2, v0 544; GFX9-NEXT: s_not_b64 exec, exec 545; GFX9-NEXT: v_mov_b32_e32 v2, 0 546; GFX9-NEXT: s_not_b64 exec, exec 547; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 548; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 549; GFX9-NEXT: s_nop 1 550; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 551; GFX9-NEXT: s_nop 1 552; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 553; GFX9-NEXT: s_nop 1 554; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 555; GFX9-NEXT: s_nop 1 556; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 557; GFX9-NEXT: s_nop 1 558; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 559; GFX9-NEXT: v_readlane_b32 s4, v2, 63 560; GFX9-NEXT: s_nop 0 561; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 562; GFX9-NEXT: s_mov_b64 exec, s[2:3] 563; GFX9-NEXT: 
v_cmp_eq_u32_e32 vcc, 0, v3 564; GFX9-NEXT: ; implicit-def: $vgpr0 565; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 566; GFX9-NEXT: s_cbranch_execz .LBB2_2 567; GFX9-NEXT: ; %bb.1: 568; GFX9-NEXT: v_mov_b32_e32 v0, 0 569; GFX9-NEXT: v_mov_b32_e32 v3, s4 570; GFX9-NEXT: s_waitcnt lgkmcnt(0) 571; GFX9-NEXT: ds_add_rtn_u32 v0, v0, v3 572; GFX9-NEXT: s_waitcnt lgkmcnt(0) 573; GFX9-NEXT: .LBB2_2: 574; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 575; GFX9-NEXT: s_waitcnt lgkmcnt(0) 576; GFX9-NEXT: v_readfirstlane_b32 s2, v0 577; GFX9-NEXT: v_mov_b32_e32 v0, v1 578; GFX9-NEXT: v_add_u32_e32 v0, s2, v0 579; GFX9-NEXT: s_mov_b32 s3, 0xf000 580; GFX9-NEXT: s_mov_b32 s2, -1 581; GFX9-NEXT: s_nop 0 582; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 583; GFX9-NEXT: s_endpgm 584; 585; GFX1064-LABEL: add_i32_varying: 586; GFX1064: ; %bb.0: ; %entry 587; GFX1064-NEXT: v_mov_b32_e32 v1, v0 588; GFX1064-NEXT: s_not_b64 exec, exec 589; GFX1064-NEXT: v_mov_b32_e32 v1, 0 590; GFX1064-NEXT: s_not_b64 exec, exec 591; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 592; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 593; GFX1064-NEXT: v_mov_b32_e32 v3, 0 594; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 595; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 596; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 597; GFX1064-NEXT: v_mov_b32_e32 v2, v1 598; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 599; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 600; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 601; GFX1064-NEXT: v_mov_b32_e32 v2, s4 602; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 603; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 604; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 605; GFX1064-NEXT: s_mov_b64 exec, 
s[2:3] 606; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 607; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 608; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 609; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 610; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 611; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 612; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 613; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 614; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 615; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 616; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 617; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 618; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 619; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 620; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 621; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 622; GFX1064-NEXT: s_mov_b32 s2, -1 623; GFX1064-NEXT: ; implicit-def: $vgpr0 624; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 625; GFX1064-NEXT: s_cbranch_execz .LBB2_2 626; GFX1064-NEXT: ; %bb.1: 627; GFX1064-NEXT: v_mov_b32_e32 v0, 0 628; GFX1064-NEXT: v_mov_b32_e32 v4, s7 629; GFX1064-NEXT: s_mov_b32 s3, s7 630; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 631; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 632; GFX1064-NEXT: ds_add_rtn_u32 v0, v0, v4 633; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 634; GFX1064-NEXT: buffer_gl0_inv 635; GFX1064-NEXT: .LBB2_2: 636; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 637; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 638; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 639; GFX1064-NEXT: v_mov_b32_e32 v0, v3 640; GFX1064-NEXT: v_add_nc_u32_e32 v0, s3, v0 641; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 642; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 643; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 644; GFX1064-NEXT: s_endpgm 645; 646; GFX1032-LABEL: add_i32_varying: 647; GFX1032: ; %bb.0: ; %entry 648; GFX1032-NEXT: v_mov_b32_e32 v1, v0 649; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 650; GFX1032-NEXT: v_mov_b32_e32 v1, 0 651; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 652; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 653; 
GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 654; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 655; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 656; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 657; GFX1032-NEXT: v_mov_b32_e32 v2, v1 658; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 659; GFX1032-NEXT: s_mov_b32 exec_lo, s2 660; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 661; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 662; GFX1032-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 663; GFX1032-NEXT: v_mov_b32_e32 v3, 0 664; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 665; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 666; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 667; GFX1032-NEXT: s_mov_b32 exec_lo, s2 668; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 669; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 670; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 671; GFX1032-NEXT: s_mov_b32 exec_lo, s2 672; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 673; GFX1032-NEXT: s_mov_b32 s2, -1 674; GFX1032-NEXT: ; implicit-def: $vgpr0 675; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 676; GFX1032-NEXT: s_cbranch_execz .LBB2_2 677; GFX1032-NEXT: ; %bb.1: 678; GFX1032-NEXT: v_mov_b32_e32 v0, 0 679; GFX1032-NEXT: v_mov_b32_e32 v4, s4 680; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 681; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 682; GFX1032-NEXT: ds_add_rtn_u32 v0, v0, v4 683; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 684; GFX1032-NEXT: buffer_gl0_inv 685; GFX1032-NEXT: .LBB2_2: 686; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 687; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 688; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 689; GFX1032-NEXT: v_mov_b32_e32 v0, v3 690; GFX1032-NEXT: v_add_nc_u32_e32 v0, s3, v0 691; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 692; GFX1032-NEXT: 
s_waitcnt lgkmcnt(0) 693; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 694; GFX1032-NEXT: s_endpgm 695; 696; GFX1164-LABEL: add_i32_varying: 697; GFX1164: ; %bb.0: ; %entry 698; GFX1164-NEXT: v_mov_b32_e32 v1, v0 699; GFX1164-NEXT: s_not_b64 exec, exec 700; GFX1164-NEXT: v_mov_b32_e32 v1, 0 701; GFX1164-NEXT: s_not_b64 exec, exec 702; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 703; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 704; GFX1164-NEXT: v_mov_b32_e32 v3, 0 705; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 706; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 707; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 708; GFX1164-NEXT: v_mov_b32_e32 v2, v1 709; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1 710; GFX1164-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 711; GFX1164-NEXT: v_readlane_b32 s4, v1, 31 712; GFX1164-NEXT: v_mov_b32_e32 v2, s4 713; GFX1164-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 714; GFX1164-NEXT: v_readlane_b32 s4, v1, 15 715; GFX1164-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 716; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 717; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 718; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 719; GFX1164-NEXT: v_readlane_b32 s5, v1, 31 720; GFX1164-NEXT: v_writelane_b32 v3, s4, 16 721; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 722; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 723; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 724; GFX1164-NEXT: v_readlane_b32 s7, v1, 63 725; GFX1164-NEXT: v_readlane_b32 s6, v1, 47 726; GFX1164-NEXT: v_writelane_b32 v3, s5, 32 727; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 728; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 729; GFX1164-NEXT: s_or_saveexec_b64 s[4:5], -1 730; GFX1164-NEXT: v_writelane_b32 v3, s6, 
48 731; GFX1164-NEXT: s_mov_b64 exec, s[4:5] 732; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 733; GFX1164-NEXT: s_mov_b32 s2, -1 734; GFX1164-NEXT: ; implicit-def: $vgpr0 735; GFX1164-NEXT: s_and_saveexec_b64 s[4:5], vcc 736; GFX1164-NEXT: s_cbranch_execz .LBB2_2 737; GFX1164-NEXT: ; %bb.1: 738; GFX1164-NEXT: v_mov_b32_e32 v0, 0 739; GFX1164-NEXT: v_mov_b32_e32 v4, s7 740; GFX1164-NEXT: s_mov_b32 s3, s7 741; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 742; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 743; GFX1164-NEXT: ds_add_rtn_u32 v0, v0, v4 744; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 745; GFX1164-NEXT: buffer_gl0_inv 746; GFX1164-NEXT: .LBB2_2: 747; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] 748; GFX1164-NEXT: v_readfirstlane_b32 s3, v0 749; GFX1164-NEXT: v_mov_b32_e32 v0, v3 750; GFX1164-NEXT: v_add_nc_u32_e32 v0, s3, v0 751; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 752; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 753; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 754; GFX1164-NEXT: s_endpgm 755; 756; GFX1132-LABEL: add_i32_varying: 757; GFX1132: ; %bb.0: ; %entry 758; GFX1132-NEXT: v_mov_b32_e32 v1, v0 759; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo 760; GFX1132-NEXT: v_mov_b32_e32 v1, 0 761; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo 762; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 763; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 764; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 765; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 766; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 767; GFX1132-NEXT: v_mov_b32_e32 v2, v1 768; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1 769; GFX1132-NEXT: s_mov_b32 exec_lo, s2 770; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 771; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 772; GFX1132-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 773; 
GFX1132-NEXT: v_mov_b32_e32 v3, 0 774; GFX1132-NEXT: v_readlane_b32 s3, v1, 15 775; GFX1132-NEXT: v_readlane_b32 s4, v1, 31 776; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 777; GFX1132-NEXT: s_mov_b32 exec_lo, s2 778; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 779; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 780; GFX1132-NEXT: v_writelane_b32 v3, s3, 16 781; GFX1132-NEXT: s_mov_b32 exec_lo, s2 782; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 783; GFX1132-NEXT: s_mov_b32 s2, -1 784; GFX1132-NEXT: ; implicit-def: $vgpr0 785; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo 786; GFX1132-NEXT: s_cbranch_execz .LBB2_2 787; GFX1132-NEXT: ; %bb.1: 788; GFX1132-NEXT: v_mov_b32_e32 v0, 0 789; GFX1132-NEXT: v_mov_b32_e32 v4, s4 790; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 791; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 792; GFX1132-NEXT: ds_add_rtn_u32 v0, v0, v4 793; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 794; GFX1132-NEXT: buffer_gl0_inv 795; GFX1132-NEXT: .LBB2_2: 796; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 797; GFX1132-NEXT: v_readfirstlane_b32 s3, v0 798; GFX1132-NEXT: v_mov_b32_e32 v0, v3 799; GFX1132-NEXT: v_add_nc_u32_e32 v0, s3, v0 800; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 801; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 802; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 803; GFX1132-NEXT: s_endpgm 804entry: 805 %lane = call i32 @llvm.amdgcn.workitem.id.x() 806 %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %lane acq_rel 807 store i32 %old, i32 addrspace(1)* %out 808 ret void 809} 810 811define amdgpu_kernel void @add_i32_varying_nouse() { 812; GFX7LESS-LABEL: add_i32_varying_nouse: 813; GFX7LESS: ; %bb.0: ; %entry 814; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 815; GFX7LESS-NEXT: s_mov_b32 m0, -1 816; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 817; GFX7LESS-NEXT: ds_add_u32 v1, v0 818; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 819; GFX7LESS-NEXT: s_endpgm 820; 821; GFX8-LABEL: add_i32_varying_nouse: 822; GFX8: ; %bb.0: ; %entry 823; GFX8-NEXT: 
v_mbcnt_lo_u32_b32 v2, exec_lo, 0 824; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 825; GFX8-NEXT: v_mov_b32_e32 v1, v0 826; GFX8-NEXT: s_not_b64 exec, exec 827; GFX8-NEXT: v_mov_b32_e32 v1, 0 828; GFX8-NEXT: s_not_b64 exec, exec 829; GFX8-NEXT: s_or_saveexec_b64 s[0:1], -1 830; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 831; GFX8-NEXT: s_nop 1 832; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 833; GFX8-NEXT: s_nop 1 834; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 835; GFX8-NEXT: s_nop 1 836; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 837; GFX8-NEXT: s_nop 1 838; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf 839; GFX8-NEXT: s_nop 1 840; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf 841; GFX8-NEXT: v_readlane_b32 s2, v1, 63 842; GFX8-NEXT: s_mov_b64 exec, s[0:1] 843; GFX8-NEXT: s_mov_b32 s0, s2 844; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 845; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 846; GFX8-NEXT: s_cbranch_execz .LBB3_2 847; GFX8-NEXT: ; %bb.1: 848; GFX8-NEXT: v_mov_b32_e32 v0, 0 849; GFX8-NEXT: v_mov_b32_e32 v2, s0 850; GFX8-NEXT: s_mov_b32 m0, -1 851; GFX8-NEXT: s_waitcnt lgkmcnt(0) 852; GFX8-NEXT: ds_add_u32 v0, v2 853; GFX8-NEXT: s_waitcnt lgkmcnt(0) 854; GFX8-NEXT: .LBB3_2: 855; GFX8-NEXT: s_endpgm 856; 857; GFX9-LABEL: add_i32_varying_nouse: 858; GFX9: ; %bb.0: ; %entry 859; GFX9-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 860; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 861; GFX9-NEXT: v_mov_b32_e32 v1, v0 862; GFX9-NEXT: s_not_b64 exec, exec 863; GFX9-NEXT: v_mov_b32_e32 v1, 0 864; GFX9-NEXT: s_not_b64 exec, exec 865; GFX9-NEXT: s_or_saveexec_b64 s[0:1], -1 866; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 867; GFX9-NEXT: s_nop 1 868; GFX9-NEXT: v_add_u32_dpp 
v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 869; GFX9-NEXT: s_nop 1 870; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 871; GFX9-NEXT: s_nop 1 872; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 873; GFX9-NEXT: s_nop 1 874; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf 875; GFX9-NEXT: s_nop 1 876; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf 877; GFX9-NEXT: v_readlane_b32 s2, v1, 63 878; GFX9-NEXT: s_mov_b64 exec, s[0:1] 879; GFX9-NEXT: s_mov_b32 s0, s2 880; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 881; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 882; GFX9-NEXT: s_cbranch_execz .LBB3_2 883; GFX9-NEXT: ; %bb.1: 884; GFX9-NEXT: v_mov_b32_e32 v0, 0 885; GFX9-NEXT: v_mov_b32_e32 v2, s0 886; GFX9-NEXT: s_waitcnt lgkmcnt(0) 887; GFX9-NEXT: ds_add_u32 v0, v2 888; GFX9-NEXT: s_waitcnt lgkmcnt(0) 889; GFX9-NEXT: .LBB3_2: 890; GFX9-NEXT: s_endpgm 891; 892; GFX1064-LABEL: add_i32_varying_nouse: 893; GFX1064: ; %bb.0: ; %entry 894; GFX1064-NEXT: v_mov_b32_e32 v1, v0 895; GFX1064-NEXT: s_not_b64 exec, exec 896; GFX1064-NEXT: v_mov_b32_e32 v1, 0 897; GFX1064-NEXT: s_not_b64 exec, exec 898; GFX1064-NEXT: s_or_saveexec_b64 s[0:1], -1 899; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 900; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 901; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 902; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 903; GFX1064-NEXT: v_mov_b32_e32 v2, v1 904; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 905; GFX1064-NEXT: v_add_nc_u32_e32 v1, v1, v2 906; GFX1064-NEXT: s_mov_b64 exec, s[0:1] 907; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 908; GFX1064-NEXT: s_or_saveexec_b64 s[0:1], -1 909; GFX1064-NEXT: 
v_readlane_b32 s2, v1, 0 910; GFX1064-NEXT: v_readlane_b32 s3, v1, 32 911; GFX1064-NEXT: s_mov_b64 exec, s[0:1] 912; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 913; GFX1064-NEXT: s_add_i32 s0, s2, s3 914; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 915; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc 916; GFX1064-NEXT: s_cbranch_execz .LBB3_2 917; GFX1064-NEXT: ; %bb.1: 918; GFX1064-NEXT: v_mov_b32_e32 v0, 0 919; GFX1064-NEXT: v_mov_b32_e32 v3, s0 920; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 921; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 922; GFX1064-NEXT: ds_add_u32 v0, v3 923; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 924; GFX1064-NEXT: buffer_gl0_inv 925; GFX1064-NEXT: .LBB3_2: 926; GFX1064-NEXT: s_endpgm 927; 928; GFX1032-LABEL: add_i32_varying_nouse: 929; GFX1032: ; %bb.0: ; %entry 930; GFX1032-NEXT: v_mov_b32_e32 v1, v0 931; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 932; GFX1032-NEXT: v_mov_b32_e32 v1, 0 933; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 934; GFX1032-NEXT: s_or_saveexec_b32 s0, -1 935; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 936; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 937; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 938; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 939; GFX1032-NEXT: v_mov_b32_e32 v2, v1 940; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 941; GFX1032-NEXT: v_add_nc_u32_e32 v1, v1, v2 942; GFX1032-NEXT: s_mov_b32 exec_lo, s0 943; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 944; GFX1032-NEXT: v_mov_b32_e32 v0, v1 945; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 946; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo 947; GFX1032-NEXT: s_cbranch_execz .LBB3_2 948; GFX1032-NEXT: ; %bb.1: 949; GFX1032-NEXT: v_mov_b32_e32 v3, 0 950; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 951; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 952; GFX1032-NEXT: 
ds_add_u32 v3, v0 953; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 954; GFX1032-NEXT: buffer_gl0_inv 955; GFX1032-NEXT: .LBB3_2: 956; GFX1032-NEXT: s_endpgm 957; 958; GFX1164-LABEL: add_i32_varying_nouse: 959; GFX1164: ; %bb.0: ; %entry 960; GFX1164-NEXT: v_mov_b32_e32 v1, v0 961; GFX1164-NEXT: s_not_b64 exec, exec 962; GFX1164-NEXT: v_mov_b32_e32 v1, 0 963; GFX1164-NEXT: s_not_b64 exec, exec 964; GFX1164-NEXT: s_or_saveexec_b64 s[0:1], -1 965; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 966; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 967; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 968; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 969; GFX1164-NEXT: v_mov_b32_e32 v2, v1 970; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1 971; GFX1164-NEXT: v_add_nc_u32_e32 v1, v1, v2 972; GFX1164-NEXT: v_permlane64_b32 v2, v1 973; GFX1164-NEXT: s_mov_b64 exec, s[0:1] 974; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 975; GFX1164-NEXT: s_or_saveexec_b64 s[0:1], -1 976; GFX1164-NEXT: v_add_nc_u32_e32 v1, v1, v2 977; GFX1164-NEXT: s_mov_b64 exec, s[0:1] 978; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v0 979; GFX1164-NEXT: v_mov_b32_e32 v0, v1 980; GFX1164-NEXT: s_mov_b64 s[0:1], exec 981; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v3 982; GFX1164-NEXT: s_cbranch_execz .LBB3_2 983; GFX1164-NEXT: ; %bb.1: 984; GFX1164-NEXT: v_mov_b32_e32 v3, 0 985; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 986; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 987; GFX1164-NEXT: ds_add_u32 v3, v0 988; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 989; GFX1164-NEXT: buffer_gl0_inv 990; GFX1164-NEXT: .LBB3_2: 991; GFX1164-NEXT: s_endpgm 992; 993; GFX1132-LABEL: add_i32_varying_nouse: 994; GFX1132: ; %bb.0: ; %entry 995; GFX1132-NEXT: v_mov_b32_e32 v1, v0 996; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo 997; GFX1132-NEXT: v_mov_b32_e32 v1, 0 
998; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo 999; GFX1132-NEXT: s_or_saveexec_b32 s0, -1 1000; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 1001; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 1002; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 1003; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 1004; GFX1132-NEXT: v_mov_b32_e32 v2, v1 1005; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1 1006; GFX1132-NEXT: v_add_nc_u32_e32 v1, v1, v2 1007; GFX1132-NEXT: s_mov_b32 exec_lo, s0 1008; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 1009; GFX1132-NEXT: v_mov_b32_e32 v0, v1 1010; GFX1132-NEXT: s_mov_b32 s0, exec_lo 1011; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v3 1012; GFX1132-NEXT: s_cbranch_execz .LBB3_2 1013; GFX1132-NEXT: ; %bb.1: 1014; GFX1132-NEXT: v_mov_b32_e32 v3, 0 1015; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1016; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 1017; GFX1132-NEXT: ds_add_u32 v3, v0 1018; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 1019; GFX1132-NEXT: buffer_gl0_inv 1020; GFX1132-NEXT: .LBB3_2: 1021; GFX1132-NEXT: s_endpgm 1022entry: 1023 %lane = call i32 @llvm.amdgcn.workitem.id.x() 1024 %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %lane acq_rel 1025 ret void 1026} 1027 1028define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) { 1029; 1030; 1031; GFX7LESS-LABEL: add_i64_constant: 1032; GFX7LESS: ; %bb.0: ; %entry 1033; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec 1034; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1035; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 1036; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s5, v0 1037; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 1038; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 1039; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc 1040; GFX7LESS-NEXT: s_cbranch_execz .LBB4_2 1041; GFX7LESS-NEXT: ; %bb.1: 1042; 
GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 1043; GFX7LESS-NEXT: s_mul_i32 s4, s4, 5 1044; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 1045; GFX7LESS-NEXT: v_mov_b32_e32 v0, s4 1046; GFX7LESS-NEXT: s_mov_b32 m0, -1 1047; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1048; GFX7LESS-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] 1049; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1050; GFX7LESS-NEXT: .LBB4_2: 1051; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] 1052; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1053; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v0 1054; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1 1055; GFX7LESS-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 1056; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v2 1057; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 1058; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4 1059; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s2, v0 1060; GFX7LESS-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc 1061; GFX7LESS-NEXT: s_mov_b32 s2, -1 1062; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1063; GFX7LESS-NEXT: s_endpgm 1064; 1065; GFX8-LABEL: add_i64_constant: 1066; GFX8: ; %bb.0: ; %entry 1067; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1068; GFX8-NEXT: s_mov_b64 s[4:5], exec 1069; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 1070; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 1071; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 1072; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 1073; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 1074; GFX8-NEXT: s_cbranch_execz .LBB4_2 1075; GFX8-NEXT: ; %bb.1: 1076; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 1077; GFX8-NEXT: s_mul_i32 s4, s4, 5 1078; GFX8-NEXT: v_mov_b32_e32 v0, s4 1079; GFX8-NEXT: v_mov_b32_e32 v1, 0 1080; GFX8-NEXT: s_mov_b32 m0, -1 1081; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1082; GFX8-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] 1083; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1084; GFX8-NEXT: .LBB4_2: 1085; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 1086; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1087; GFX8-NEXT: v_readfirstlane_b32 s2, v0 1088; GFX8-NEXT: v_readfirstlane_b32 s3, v1 1089; GFX8-NEXT: v_mov_b32_e32 v0, 
s2 1090; GFX8-NEXT: v_mov_b32_e32 v1, s3 1091; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v2, 5, v[0:1] 1092; GFX8-NEXT: s_mov_b32 s3, 0xf000 1093; GFX8-NEXT: s_mov_b32 s2, -1 1094; GFX8-NEXT: s_nop 2 1095; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1096; GFX8-NEXT: s_endpgm 1097; 1098; GFX9-LABEL: add_i64_constant: 1099; GFX9: ; %bb.0: ; %entry 1100; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1101; GFX9-NEXT: s_mov_b64 s[4:5], exec 1102; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 1103; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 1104; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 1105; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 1106; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 1107; GFX9-NEXT: s_cbranch_execz .LBB4_2 1108; GFX9-NEXT: ; %bb.1: 1109; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 1110; GFX9-NEXT: s_mul_i32 s4, s4, 5 1111; GFX9-NEXT: v_mov_b32_e32 v0, s4 1112; GFX9-NEXT: v_mov_b32_e32 v1, 0 1113; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1114; GFX9-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] 1115; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1116; GFX9-NEXT: .LBB4_2: 1117; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 1118; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1119; GFX9-NEXT: v_readfirstlane_b32 s2, v0 1120; GFX9-NEXT: v_readfirstlane_b32 s3, v1 1121; GFX9-NEXT: v_mov_b32_e32 v0, s2 1122; GFX9-NEXT: v_mov_b32_e32 v1, s3 1123; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v2, 5, v[0:1] 1124; GFX9-NEXT: s_mov_b32 s3, 0xf000 1125; GFX9-NEXT: s_mov_b32 s2, -1 1126; GFX9-NEXT: s_nop 2 1127; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1128; GFX9-NEXT: s_endpgm 1129; 1130; GFX1064-LABEL: add_i64_constant: 1131; GFX1064: ; %bb.0: ; %entry 1132; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1133; GFX1064-NEXT: s_mov_b64 s[4:5], exec 1134; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 1135; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 1136; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 1137; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 1138; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc 1139; GFX1064-NEXT: 
s_cbranch_execz .LBB4_2 1140; GFX1064-NEXT: ; %bb.1: 1141; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 1142; GFX1064-NEXT: v_mov_b32_e32 v1, 0 1143; GFX1064-NEXT: s_mul_i32 s4, s4, 5 1144; GFX1064-NEXT: v_mov_b32_e32 v0, s4 1145; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1146; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 1147; GFX1064-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] 1148; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1149; GFX1064-NEXT: buffer_gl0_inv 1150; GFX1064-NEXT: .LBB4_2: 1151; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 1152; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] 1153; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 1154; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 1155; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v2, 5, s[2:3] 1156; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 1157; GFX1064-NEXT: s_mov_b32 s2, -1 1158; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1159; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1160; GFX1064-NEXT: s_endpgm 1161; 1162; GFX1032-LABEL: add_i64_constant: 1163; GFX1032: ; %bb.0: ; %entry 1164; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1165; GFX1032-NEXT: s_mov_b32 s3, exec_lo 1166; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 1167; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s3, 0 1168; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 1169; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo 1170; GFX1032-NEXT: s_cbranch_execz .LBB4_2 1171; GFX1032-NEXT: ; %bb.1: 1172; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 1173; GFX1032-NEXT: v_mov_b32_e32 v1, 0 1174; GFX1032-NEXT: s_mul_i32 s3, s3, 5 1175; GFX1032-NEXT: v_mov_b32_e32 v0, s3 1176; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1177; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 1178; GFX1032-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] 1179; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1180; GFX1032-NEXT: buffer_gl0_inv 1181; GFX1032-NEXT: .LBB4_2: 1182; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 1183; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 1184; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 1185; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 
1186; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s2, v2, 5, s[2:3] 1187; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 1188; GFX1032-NEXT: s_mov_b32 s2, -1 1189; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1190; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1191; GFX1032-NEXT: s_endpgm 1192; 1193; GFX1164-LABEL: add_i64_constant: 1194; GFX1164: ; %bb.0: ; %entry 1195; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 1196; GFX1164-NEXT: s_mov_b64 s[4:5], exec 1197; GFX1164-NEXT: s_mov_b64 s[2:3], exec 1198; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 1199; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 1200; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 1201; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 1202; GFX1164-NEXT: s_cbranch_execz .LBB4_2 1203; GFX1164-NEXT: ; %bb.1: 1204; GFX1164-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 1205; GFX1164-NEXT: v_mov_b32_e32 v1, 0 1206; GFX1164-NEXT: s_mul_i32 s4, s4, 5 1207; GFX1164-NEXT: v_mov_b32_e32 v0, s4 1208; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1209; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 1210; GFX1164-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] 1211; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 1212; GFX1164-NEXT: buffer_gl0_inv 1213; GFX1164-NEXT: .LBB4_2: 1214; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] 1215; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 1216; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 1217; GFX1164-NEXT: v_mad_u64_u32 v[0:1], null, v2, 5, s[2:3] 1218; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 1219; GFX1164-NEXT: s_mov_b32 s2, -1 1220; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 1221; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 1222; GFX1164-NEXT: s_endpgm 1223; 1224; GFX1132-LABEL: add_i64_constant: 1225; GFX1132: ; %bb.0: ; %entry 1226; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 1227; GFX1132-NEXT: s_mov_b32 s3, exec_lo 1228; GFX1132-NEXT: s_mov_b32 s2, exec_lo 1229; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s3, 0 1230; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 1231; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 1232; GFX1132-NEXT: s_cbranch_execz .LBB4_2 
1233; GFX1132-NEXT: ; %bb.1: 1234; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s3 1235; GFX1132-NEXT: v_mov_b32_e32 v1, 0 1236; GFX1132-NEXT: s_mul_i32 s3, s3, 5 1237; GFX1132-NEXT: v_mov_b32_e32 v0, s3 1238; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1239; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 1240; GFX1132-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] 1241; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 1242; GFX1132-NEXT: buffer_gl0_inv 1243; GFX1132-NEXT: .LBB4_2: 1244; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2 1245; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 1246; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 1247; GFX1132-NEXT: v_mad_u64_u32 v[0:1], null, v2, 5, s[2:3] 1248; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 1249; GFX1132-NEXT: s_mov_b32 s2, -1 1250; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 1251; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 1252; GFX1132-NEXT: s_endpgm 1253entry: 1254 %old = atomicrmw add i64 addrspace(3)* @local_var64, i64 5 acq_rel 1255 store i64 %old, i64 addrspace(1)* %out 1256 ret void 1257} 1258 1259define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 %additive) { 1260; 1261; 1262; GFX7LESS-LABEL: add_i64_uniform: 1263; GFX7LESS: ; %bb.0: ; %entry 1264; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec 1265; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 1266; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 1267; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s7, v0 1268; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 1269; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 1270; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc 1271; GFX7LESS-NEXT: s_cbranch_execz .LBB5_2 1272; GFX7LESS-NEXT: ; %bb.1: 1273; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 1274; GFX7LESS-NEXT: v_mov_b32_e32 v3, 0 1275; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1276; GFX7LESS-NEXT: s_mul_i32 s7, s3, s6 1277; GFX7LESS-NEXT: v_mov_b32_e32 v0, s6 1278; GFX7LESS-NEXT: v_mul_hi_u32 v0, s2, v0 1279; GFX7LESS-NEXT: s_mul_i32 s6, s2, s6 1280; GFX7LESS-NEXT: v_add_i32_e32 v1, vcc, s7, v0 1281; GFX7LESS-NEXT: 
v_mov_b32_e32 v0, s6 1282; GFX7LESS-NEXT: s_mov_b32 m0, -1 1283; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1284; GFX7LESS-NEXT: ds_add_rtn_u64 v[0:1], v3, v[0:1] 1285; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1286; GFX7LESS-NEXT: .LBB5_2: 1287; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] 1288; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 1289; GFX7LESS-NEXT: s_mov_b32 s6, -1 1290; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1291; GFX7LESS-NEXT: s_mov_b32 s4, s0 1292; GFX7LESS-NEXT: s_mov_b32 s5, s1 1293; GFX7LESS-NEXT: v_readfirstlane_b32 s0, v0 1294; GFX7LESS-NEXT: v_readfirstlane_b32 s1, v1 1295; GFX7LESS-NEXT: v_mul_lo_u32 v0, s3, v2 1296; GFX7LESS-NEXT: v_mul_hi_u32 v1, s2, v2 1297; GFX7LESS-NEXT: v_mul_lo_u32 v2, s2, v2 1298; GFX7LESS-NEXT: v_add_i32_e32 v1, vcc, v1, v0 1299; GFX7LESS-NEXT: v_mov_b32_e32 v3, s1 1300; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s0, v2 1301; GFX7LESS-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc 1302; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1303; GFX7LESS-NEXT: s_endpgm 1304; 1305; GFX8-LABEL: add_i64_uniform: 1306; GFX8: ; %bb.0: ; %entry 1307; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1308; GFX8-NEXT: s_mov_b64 s[6:7], exec 1309; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 1310; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 1311; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 1312; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 1313; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 1314; GFX8-NEXT: s_cbranch_execz .LBB5_2 1315; GFX8-NEXT: ; %bb.1: 1316; GFX8-NEXT: s_bcnt1_i32_b64 s8, s[6:7] 1317; GFX8-NEXT: v_mov_b32_e32 v0, s8 1318; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1319; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[6:7], s2, v0, 0 1320; GFX8-NEXT: s_mul_i32 s6, s3, s8 1321; GFX8-NEXT: v_mov_b32_e32 v3, 0 1322; GFX8-NEXT: v_add_u32_e32 v1, vcc, s6, v1 1323; GFX8-NEXT: s_mov_b32 m0, -1 1324; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1325; GFX8-NEXT: ds_add_rtn_u64 v[0:1], v3, v[0:1] 1326; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1327; GFX8-NEXT: .LBB5_2: 1328; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 
1329; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1330; GFX8-NEXT: v_readfirstlane_b32 s4, v0 1331; GFX8-NEXT: v_readfirstlane_b32 s5, v1 1332; GFX8-NEXT: v_mov_b32_e32 v0, s4 1333; GFX8-NEXT: v_mov_b32_e32 v1, s5 1334; GFX8-NEXT: v_mul_lo_u32 v3, s3, v2 1335; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[2:3], s2, v2, v[0:1] 1336; GFX8-NEXT: s_mov_b32 s7, 0xf000 1337; GFX8-NEXT: s_mov_b32 s6, -1 1338; GFX8-NEXT: s_mov_b32 s4, s0 1339; GFX8-NEXT: s_mov_b32 s5, s1 1340; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 1341; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1342; GFX8-NEXT: s_endpgm 1343; 1344; GFX9-LABEL: add_i64_uniform: 1345; GFX9: ; %bb.0: ; %entry 1346; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1347; GFX9-NEXT: s_mov_b64 s[6:7], exec 1348; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 1349; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 1350; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 1351; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 1352; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 1353; GFX9-NEXT: s_cbranch_execz .LBB5_2 1354; GFX9-NEXT: ; %bb.1: 1355; GFX9-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 1356; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1357; GFX9-NEXT: s_mul_i32 s7, s3, s6 1358; GFX9-NEXT: s_mul_hi_u32 s8, s2, s6 1359; GFX9-NEXT: s_add_i32 s8, s8, s7 1360; GFX9-NEXT: s_mul_i32 s6, s2, s6 1361; GFX9-NEXT: v_mov_b32_e32 v0, s6 1362; GFX9-NEXT: v_mov_b32_e32 v1, s8 1363; GFX9-NEXT: v_mov_b32_e32 v3, 0 1364; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1365; GFX9-NEXT: ds_add_rtn_u64 v[0:1], v3, v[0:1] 1366; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1367; GFX9-NEXT: .LBB5_2: 1368; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 1369; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1370; GFX9-NEXT: v_readfirstlane_b32 s4, v0 1371; GFX9-NEXT: v_readfirstlane_b32 s5, v1 1372; GFX9-NEXT: v_mov_b32_e32 v0, s4 1373; GFX9-NEXT: v_mov_b32_e32 v1, s5 1374; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s2, v2, v[0:1] 1375; GFX9-NEXT: s_mov_b32 s7, 0xf000 1376; GFX9-NEXT: s_mov_b32 s6, -1 1377; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[2:3], s3, v2, v[1:2] 1378; 
GFX9-NEXT: s_mov_b32 s4, s0 1379; GFX9-NEXT: s_mov_b32 s5, s1 1380; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1381; GFX9-NEXT: s_endpgm 1382; 1383; GFX1064-LABEL: add_i64_uniform: 1384; GFX1064: ; %bb.0: ; %entry 1385; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1386; GFX1064-NEXT: s_mov_b64 s[6:7], exec 1387; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 1388; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 1389; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 1390; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 1391; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 1392; GFX1064-NEXT: s_cbranch_execz .LBB5_2 1393; GFX1064-NEXT: ; %bb.1: 1394; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 1395; GFX1064-NEXT: v_mov_b32_e32 v3, 0 1396; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1397; GFX1064-NEXT: s_mul_i32 s7, s3, s6 1398; GFX1064-NEXT: s_mul_hi_u32 s8, s2, s6 1399; GFX1064-NEXT: s_mul_i32 s6, s2, s6 1400; GFX1064-NEXT: s_add_i32 s8, s8, s7 1401; GFX1064-NEXT: v_mov_b32_e32 v0, s6 1402; GFX1064-NEXT: v_mov_b32_e32 v1, s8 1403; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1404; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 1405; GFX1064-NEXT: ds_add_rtn_u64 v[0:1], v3, v[0:1] 1406; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1407; GFX1064-NEXT: buffer_gl0_inv 1408; GFX1064-NEXT: .LBB5_2: 1409; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 1410; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 1411; GFX1064-NEXT: v_readfirstlane_b32 s4, v0 1412; GFX1064-NEXT: v_readfirstlane_b32 s5, v1 1413; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1414; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s2, v2, s[4:5] 1415; GFX1064-NEXT: v_mad_u64_u32 v[1:2], s[2:3], s3, v2, v[1:2] 1416; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 1417; GFX1064-NEXT: s_mov_b32 s2, -1 1418; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1419; GFX1064-NEXT: s_endpgm 1420; 1421; GFX1032-LABEL: add_i64_uniform: 1422; GFX1032: ; %bb.0: ; %entry 1423; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1424; GFX1032-NEXT: s_mov_b32 s5, exec_lo 1425; 
GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 1426; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0 1427; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 1428; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo 1429; GFX1032-NEXT: s_cbranch_execz .LBB5_2 1430; GFX1032-NEXT: ; %bb.1: 1431; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s5 1432; GFX1032-NEXT: v_mov_b32_e32 v3, 0 1433; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1434; GFX1032-NEXT: s_mul_i32 s6, s3, s5 1435; GFX1032-NEXT: s_mul_hi_u32 s7, s2, s5 1436; GFX1032-NEXT: s_mul_i32 s5, s2, s5 1437; GFX1032-NEXT: s_add_i32 s7, s7, s6 1438; GFX1032-NEXT: v_mov_b32_e32 v0, s5 1439; GFX1032-NEXT: v_mov_b32_e32 v1, s7 1440; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1441; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 1442; GFX1032-NEXT: ds_add_rtn_u64 v[0:1], v3, v[0:1] 1443; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1444; GFX1032-NEXT: buffer_gl0_inv 1445; GFX1032-NEXT: .LBB5_2: 1446; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 1447; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 1448; GFX1032-NEXT: v_readfirstlane_b32 s4, v0 1449; GFX1032-NEXT: v_readfirstlane_b32 s5, v1 1450; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1451; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s2, s2, v2, s[4:5] 1452; GFX1032-NEXT: v_mad_u64_u32 v[1:2], s2, s3, v2, v[1:2] 1453; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 1454; GFX1032-NEXT: s_mov_b32 s2, -1 1455; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1456; GFX1032-NEXT: s_endpgm 1457; 1458; GFX1164-LABEL: add_i64_uniform: 1459; GFX1164: ; %bb.0: ; %entry 1460; GFX1164-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 1461; GFX1164-NEXT: s_mov_b64 s[6:7], exec 1462; GFX1164-NEXT: s_mov_b64 s[4:5], exec 1463; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 1464; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 1465; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 1466; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 1467; GFX1164-NEXT: s_cbranch_execz .LBB5_2 1468; GFX1164-NEXT: ; %bb.1: 1469; GFX1164-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 1470; GFX1164-NEXT: v_mov_b32_e32 v3, 0 1471; 
GFX1164-NEXT: s_waitcnt lgkmcnt(0) 1472; GFX1164-NEXT: s_mul_i32 s7, s3, s6 1473; GFX1164-NEXT: s_mul_hi_u32 s8, s2, s6 1474; GFX1164-NEXT: s_mul_i32 s6, s2, s6 1475; GFX1164-NEXT: s_add_i32 s8, s8, s7 1476; GFX1164-NEXT: v_mov_b32_e32 v0, s6 1477; GFX1164-NEXT: v_mov_b32_e32 v1, s8 1478; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1479; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 1480; GFX1164-NEXT: ds_add_rtn_u64 v[0:1], v3, v[0:1] 1481; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 1482; GFX1164-NEXT: buffer_gl0_inv 1483; GFX1164-NEXT: .LBB5_2: 1484; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] 1485; GFX1164-NEXT: v_readfirstlane_b32 s4, v0 1486; GFX1164-NEXT: v_readfirstlane_b32 s5, v1 1487; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 1488; GFX1164-NEXT: v_mad_u64_u32 v[0:1], null, s2, v2, s[4:5] 1489; GFX1164-NEXT: s_mov_b32 s2, -1 1490; GFX1164-NEXT: v_mad_u64_u32 v[3:4], null, s3, v2, v[1:2] 1491; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 1492; GFX1164-NEXT: v_mov_b32_e32 v1, v3 1493; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 1494; GFX1164-NEXT: s_endpgm 1495; 1496; GFX1132-LABEL: add_i64_uniform: 1497; GFX1132: ; %bb.0: ; %entry 1498; GFX1132-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 1499; GFX1132-NEXT: s_mov_b32 s5, exec_lo 1500; GFX1132-NEXT: s_mov_b32 s4, exec_lo 1501; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0 1502; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 1503; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 1504; GFX1132-NEXT: s_cbranch_execz .LBB5_2 1505; GFX1132-NEXT: ; %bb.1: 1506; GFX1132-NEXT: s_bcnt1_i32_b32 s5, s5 1507; GFX1132-NEXT: v_mov_b32_e32 v3, 0 1508; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 1509; GFX1132-NEXT: s_mul_i32 s6, s3, s5 1510; GFX1132-NEXT: s_mul_hi_u32 s7, s2, s5 1511; GFX1132-NEXT: s_mul_i32 s5, s2, s5 1512; GFX1132-NEXT: s_add_i32 s7, s7, s6 1513; GFX1132-NEXT: v_mov_b32_e32 v0, s5 1514; GFX1132-NEXT: v_mov_b32_e32 v1, s7 1515; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1516; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 1517; GFX1132-NEXT: ds_add_rtn_u64 
v[0:1], v3, v[0:1] 1518; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 1519; GFX1132-NEXT: buffer_gl0_inv 1520; GFX1132-NEXT: .LBB5_2: 1521; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s4 1522; GFX1132-NEXT: v_readfirstlane_b32 s4, v0 1523; GFX1132-NEXT: v_readfirstlane_b32 s5, v1 1524; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 1525; GFX1132-NEXT: v_mad_u64_u32 v[0:1], null, s2, v2, s[4:5] 1526; GFX1132-NEXT: s_mov_b32 s2, -1 1527; GFX1132-NEXT: v_mad_u64_u32 v[3:4], null, s3, v2, v[1:2] 1528; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 1529; GFX1132-NEXT: v_mov_b32_e32 v1, v3 1530; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 1531; GFX1132-NEXT: s_endpgm 1532entry: 1533 %old = atomicrmw add i64 addrspace(3)* @local_var64, i64 %additive acq_rel 1534 store i64 %old, i64 addrspace(1)* %out 1535 ret void 1536} 1537 1538define amdgpu_kernel void @add_i64_varying(i64 addrspace(1)* %out) { 1539; 1540; 1541; GFX7LESS-LABEL: add_i64_varying: 1542; GFX7LESS: ; %bb.0: ; %entry 1543; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1544; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 1545; GFX7LESS-NEXT: s_mov_b32 m0, -1 1546; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1547; GFX7LESS-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] 1548; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1549; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 1550; GFX7LESS-NEXT: s_mov_b32 s2, -1 1551; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1552; GFX7LESS-NEXT: s_endpgm 1553; 1554; GFX8-LABEL: add_i64_varying: 1555; GFX8: ; %bb.0: ; %entry 1556; GFX8-NEXT: v_mov_b32_e32 v1, 0 1557; GFX8-NEXT: s_mov_b32 m0, -1 1558; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1559; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1560; GFX8-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] 1561; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1562; GFX8-NEXT: s_mov_b32 s3, 0xf000 1563; GFX8-NEXT: s_mov_b32 s2, -1 1564; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1565; GFX8-NEXT: s_endpgm 1566; 1567; GFX9-LABEL: add_i64_varying: 1568; GFX9: ; %bb.0: ; %entry 1569; GFX9-NEXT: v_mov_b32_e32 
v1, 0 1570; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1571; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1572; GFX9-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] 1573; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1574; GFX9-NEXT: s_mov_b32 s3, 0xf000 1575; GFX9-NEXT: s_mov_b32 s2, -1 1576; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1577; GFX9-NEXT: s_endpgm 1578; 1579; GFX10-LABEL: add_i64_varying: 1580; GFX10: ; %bb.0: ; %entry 1581; GFX10-NEXT: v_mov_b32_e32 v1, 0 1582; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1583; GFX10-NEXT: s_mov_b32 s3, 0x31016000 1584; GFX10-NEXT: s_mov_b32 s2, -1 1585; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1586; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1587; GFX10-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] 1588; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1589; GFX10-NEXT: buffer_gl0_inv 1590; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1591; GFX10-NEXT: s_endpgm 1592; 1593; GFX11-LABEL: add_i64_varying: 1594; GFX11: ; %bb.0: ; %entry 1595; GFX11-NEXT: v_mov_b32_e32 v1, 0 1596; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 1597; GFX11-NEXT: s_mov_b32 s3, 0x31016000 1598; GFX11-NEXT: s_mov_b32 s2, -1 1599; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1600; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 1601; GFX11-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] 1602; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1603; GFX11-NEXT: buffer_gl0_inv 1604; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 1605; GFX11-NEXT: s_endpgm 1606entry: 1607 %lane = call i32 @llvm.amdgcn.workitem.id.x() 1608 %zext = zext i32 %lane to i64 1609 %old = atomicrmw add i64 addrspace(3)* @local_var64, i64 %zext acq_rel 1610 store i64 %old, i64 addrspace(1)* %out 1611 ret void 1612} 1613 ; sub_i32_constant - atomicrmw sub (acq_rel) of the constant 5 from @local_var32; the returned old value is stored to %out. The expected code has the lane whose mbcnt is 0 issue a single ds_sub_rtn_u32 of popcount(exec) * 5; every lane then subtracts 5 * mbcnt from the readfirstlane of the returned value. 1614define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out) { 1615; 1616; 1617; GFX7LESS-LABEL: sub_i32_constant: 1618; GFX7LESS: ; %bb.0: ; %entry 1619; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec 1620; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1621; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 1622; 
GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 1623; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1624; GFX7LESS-NEXT: ; implicit-def: $vgpr1 1625; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc 1626; GFX7LESS-NEXT: s_cbranch_execz .LBB7_2 1627; GFX7LESS-NEXT: ; %bb.1: 1628; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1629; GFX7LESS-NEXT: s_mul_i32 s2, s2, 5 1630; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 1631; GFX7LESS-NEXT: v_mov_b32_e32 v2, s2 1632; GFX7LESS-NEXT: s_mov_b32 m0, -1 1633; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1634; GFX7LESS-NEXT: ds_sub_rtn_u32 v1, v1, v2 1635; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1636; GFX7LESS-NEXT: .LBB7_2: 1637; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] 1638; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1639; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v1 1640; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1641; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 1642; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 1643; GFX7LESS-NEXT: s_mov_b32 s2, -1 1644; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 1645; GFX7LESS-NEXT: s_endpgm 1646; 1647; GFX8-LABEL: sub_i32_constant: 1648; GFX8: ; %bb.0: ; %entry 1649; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1650; GFX8-NEXT: s_mov_b64 s[2:3], exec 1651; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1652; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 1653; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1654; GFX8-NEXT: ; implicit-def: $vgpr1 1655; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 1656; GFX8-NEXT: s_cbranch_execz .LBB7_2 1657; GFX8-NEXT: ; %bb.1: 1658; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1659; GFX8-NEXT: s_mul_i32 s2, s2, 5 1660; GFX8-NEXT: v_mov_b32_e32 v1, 0 1661; GFX8-NEXT: v_mov_b32_e32 v2, s2 1662; GFX8-NEXT: s_mov_b32 m0, -1 1663; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1664; GFX8-NEXT: ds_sub_rtn_u32 v1, v1, v2 1665; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1666; GFX8-NEXT: .LBB7_2: 1667; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 1668; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1669; GFX8-NEXT: v_readfirstlane_b32 s2, v1 1670; GFX8-NEXT: 
v_mul_u32_u24_e32 v0, 5, v0 1671; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s2, v0 1672; GFX8-NEXT: s_mov_b32 s3, 0xf000 1673; GFX8-NEXT: s_mov_b32 s2, -1 1674; GFX8-NEXT: s_nop 0 1675; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 1676; GFX8-NEXT: s_endpgm 1677; 1678; GFX9-LABEL: sub_i32_constant: 1679; GFX9: ; %bb.0: ; %entry 1680; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1681; GFX9-NEXT: s_mov_b64 s[2:3], exec 1682; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1683; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 1684; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1685; GFX9-NEXT: ; implicit-def: $vgpr1 1686; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 1687; GFX9-NEXT: s_cbranch_execz .LBB7_2 1688; GFX9-NEXT: ; %bb.1: 1689; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1690; GFX9-NEXT: s_mul_i32 s2, s2, 5 1691; GFX9-NEXT: v_mov_b32_e32 v1, 0 1692; GFX9-NEXT: v_mov_b32_e32 v2, s2 1693; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1694; GFX9-NEXT: ds_sub_rtn_u32 v1, v1, v2 1695; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1696; GFX9-NEXT: .LBB7_2: 1697; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 1698; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1699; GFX9-NEXT: v_readfirstlane_b32 s2, v1 1700; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1701; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 1702; GFX9-NEXT: s_mov_b32 s3, 0xf000 1703; GFX9-NEXT: s_mov_b32 s2, -1 1704; GFX9-NEXT: s_nop 0 1705; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 1706; GFX9-NEXT: s_endpgm 1707; 1708; GFX1064-LABEL: sub_i32_constant: 1709; GFX1064: ; %bb.0: ; %entry 1710; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1711; GFX1064-NEXT: s_mov_b64 s[2:3], exec 1712; GFX1064-NEXT: ; implicit-def: $vgpr1 1713; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1714; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 1715; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1716; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 1717; GFX1064-NEXT: s_cbranch_execz .LBB7_2 1718; GFX1064-NEXT: ; %bb.1: 1719; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1720; GFX1064-NEXT: v_mov_b32_e32 v1, 0 1721; 
GFX1064-NEXT: s_mul_i32 s2, s2, 5 1722; GFX1064-NEXT: v_mov_b32_e32 v2, s2 1723; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1724; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 1725; GFX1064-NEXT: ds_sub_rtn_u32 v1, v1, v2 1726; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1727; GFX1064-NEXT: buffer_gl0_inv 1728; GFX1064-NEXT: .LBB7_2: 1729; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 1730; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 1731; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 1732; GFX1064-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1733; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 1734; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s2, v0 1735; GFX1064-NEXT: s_mov_b32 s2, -1 1736; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1737; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 1738; GFX1064-NEXT: s_endpgm 1739; 1740; GFX1032-LABEL: sub_i32_constant: 1741; GFX1032: ; %bb.0: ; %entry 1742; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1743; GFX1032-NEXT: s_mov_b32 s3, exec_lo 1744; GFX1032-NEXT: ; implicit-def: $vgpr1 1745; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 1746; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 1747; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo 1748; GFX1032-NEXT: s_cbranch_execz .LBB7_2 1749; GFX1032-NEXT: ; %bb.1: 1750; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 1751; GFX1032-NEXT: v_mov_b32_e32 v1, 0 1752; GFX1032-NEXT: s_mul_i32 s3, s3, 5 1753; GFX1032-NEXT: v_mov_b32_e32 v2, s3 1754; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1755; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 1756; GFX1032-NEXT: ds_sub_rtn_u32 v1, v1, v2 1757; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1758; GFX1032-NEXT: buffer_gl0_inv 1759; GFX1032-NEXT: .LBB7_2: 1760; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 1761; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 1762; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 1763; GFX1032-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1764; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 1765; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s2, v0 1766; GFX1032-NEXT: s_mov_b32 s2, -1 1767; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1768; GFX1032-NEXT: 
buffer_store_dword v0, off, s[0:3], 0 1769; GFX1032-NEXT: s_endpgm 1770; 1771; GFX1164-LABEL: sub_i32_constant: 1772; GFX1164: ; %bb.0: ; %entry 1773; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 1774; GFX1164-NEXT: s_mov_b64 s[2:3], exec 1775; GFX1164-NEXT: s_mov_b64 s[4:5], exec 1776; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1777; GFX1164-NEXT: ; implicit-def: $vgpr1 1778; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 1779; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 1780; GFX1164-NEXT: s_cbranch_execz .LBB7_2 1781; GFX1164-NEXT: ; %bb.1: 1782; GFX1164-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1783; GFX1164-NEXT: v_mov_b32_e32 v1, 0 1784; GFX1164-NEXT: s_mul_i32 s2, s2, 5 1785; GFX1164-NEXT: v_mov_b32_e32 v2, s2 1786; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1787; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 1788; GFX1164-NEXT: ds_sub_rtn_u32 v1, v1, v2 1789; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 1790; GFX1164-NEXT: buffer_gl0_inv 1791; GFX1164-NEXT: .LBB7_2: 1792; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] 1793; GFX1164-NEXT: v_readfirstlane_b32 s2, v1 1794; GFX1164-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1795; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 1796; GFX1164-NEXT: v_sub_nc_u32_e32 v0, s2, v0 1797; GFX1164-NEXT: s_mov_b32 s2, -1 1798; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 1799; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 1800; GFX1164-NEXT: s_endpgm 1801; 1802; GFX1132-LABEL: sub_i32_constant: 1803; GFX1132: ; %bb.0: ; %entry 1804; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 1805; GFX1132-NEXT: s_mov_b32 s3, exec_lo 1806; GFX1132-NEXT: s_mov_b32 s2, exec_lo 1807; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 1808; GFX1132-NEXT: ; implicit-def: $vgpr1 1809; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 1810; GFX1132-NEXT: s_cbranch_execz .LBB7_2 1811; GFX1132-NEXT: ; %bb.1: 1812; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s3 1813; GFX1132-NEXT: v_mov_b32_e32 v1, 0 1814; GFX1132-NEXT: s_mul_i32 s3, s3, 5 1815; GFX1132-NEXT: v_mov_b32_e32 v2, s3 1816; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1817; 
GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 1818; GFX1132-NEXT: ds_sub_rtn_u32 v1, v1, v2 1819; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 1820; GFX1132-NEXT: buffer_gl0_inv 1821; GFX1132-NEXT: .LBB7_2: 1822; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2 1823; GFX1132-NEXT: v_readfirstlane_b32 s2, v1 1824; GFX1132-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1825; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 1826; GFX1132-NEXT: v_sub_nc_u32_e32 v0, s2, v0 1827; GFX1132-NEXT: s_mov_b32 s2, -1 1828; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 1829; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 1830; GFX1132-NEXT: s_endpgm 1831entry: 1832 %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 5 acq_rel 1833 store i32 %old, i32 addrspace(1)* %out 1834 ret void 1835} 1836 ; sub_i32_uniform - atomicrmw sub (acq_rel) of the kernel argument %subitive from @local_var32; the returned old value is stored to %out. The expected code has the lane whose mbcnt is 0 issue a single ds_sub_rtn_u32 of %subitive * popcount(exec); every lane then subtracts %subitive * mbcnt from the readfirstlane of the returned value. 1837define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 %subitive) { 1838; 1839; 1840; GFX7LESS-LABEL: sub_i32_uniform: 1841; GFX7LESS: ; %bb.0: ; %entry 1842; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec 1843; GFX7LESS-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 1844; GFX7LESS-NEXT: s_load_dword s6, s[0:1], 0xb 1845; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 1846; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 1847; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1848; GFX7LESS-NEXT: ; implicit-def: $vgpr1 1849; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc 1850; GFX7LESS-NEXT: s_cbranch_execz .LBB8_2 1851; GFX7LESS-NEXT: ; %bb.1: 1852; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1853; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1854; GFX7LESS-NEXT: s_mul_i32 s2, s6, s2 1855; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 1856; GFX7LESS-NEXT: v_mov_b32_e32 v2, s2 1857; GFX7LESS-NEXT: s_mov_b32 m0, -1 1858; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1859; GFX7LESS-NEXT: ds_sub_rtn_u32 v1, v1, v2 1860; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1861; GFX7LESS-NEXT: .LBB8_2: 1862; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1] 1863; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1864; GFX7LESS-NEXT: v_readfirstlane_b32 s0, v1 1865; GFX7LESS-NEXT: v_mul_lo_u32 v0, s6, v0 1866; 
GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 1867; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 1868; GFX7LESS-NEXT: s_mov_b32 s6, -1 1869; GFX7LESS-NEXT: buffer_store_dword v0, off, s[4:7], 0 1870; GFX7LESS-NEXT: s_endpgm 1871; 1872; GFX8-LABEL: sub_i32_uniform: 1873; GFX8: ; %bb.0: ; %entry 1874; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 1875; GFX8-NEXT: s_load_dword s6, s[0:1], 0x2c 1876; GFX8-NEXT: s_mov_b64 s[2:3], exec 1877; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1878; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 1879; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1880; GFX8-NEXT: ; implicit-def: $vgpr1 1881; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc 1882; GFX8-NEXT: s_cbranch_execz .LBB8_2 1883; GFX8-NEXT: ; %bb.1: 1884; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1885; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1886; GFX8-NEXT: s_mul_i32 s2, s6, s2 1887; GFX8-NEXT: v_mov_b32_e32 v1, 0 1888; GFX8-NEXT: v_mov_b32_e32 v2, s2 1889; GFX8-NEXT: s_mov_b32 m0, -1 1890; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1891; GFX8-NEXT: ds_sub_rtn_u32 v1, v1, v2 1892; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1893; GFX8-NEXT: .LBB8_2: 1894; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] 1895; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1896; GFX8-NEXT: v_mul_lo_u32 v0, s6, v0 1897; GFX8-NEXT: v_readfirstlane_b32 s0, v1 1898; GFX8-NEXT: s_mov_b32 s7, 0xf000 1899; GFX8-NEXT: s_mov_b32 s6, -1 1900; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s0, v0 1901; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0 1902; GFX8-NEXT: s_endpgm 1903; 1904; GFX9-LABEL: sub_i32_uniform: 1905; GFX9: ; %bb.0: ; %entry 1906; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 1907; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c 1908; GFX9-NEXT: s_mov_b64 s[2:3], exec 1909; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1910; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 1911; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1912; GFX9-NEXT: ; implicit-def: $vgpr1 1913; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc 1914; GFX9-NEXT: s_cbranch_execz .LBB8_2 1915; GFX9-NEXT: ; %bb.1: 1916; GFX9-NEXT: s_bcnt1_i32_b64 
s2, s[2:3] 1917; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1918; GFX9-NEXT: s_mul_i32 s2, s6, s2 1919; GFX9-NEXT: v_mov_b32_e32 v1, 0 1920; GFX9-NEXT: v_mov_b32_e32 v2, s2 1921; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1922; GFX9-NEXT: ds_sub_rtn_u32 v1, v1, v2 1923; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1924; GFX9-NEXT: .LBB8_2: 1925; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] 1926; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1927; GFX9-NEXT: v_mul_lo_u32 v0, s6, v0 1928; GFX9-NEXT: v_readfirstlane_b32 s0, v1 1929; GFX9-NEXT: s_mov_b32 s7, 0xf000 1930; GFX9-NEXT: s_mov_b32 s6, -1 1931; GFX9-NEXT: v_sub_u32_e32 v0, s0, v0 1932; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 1933; GFX9-NEXT: s_endpgm 1934; 1935; GFX1064-LABEL: sub_i32_uniform: 1936; GFX1064: ; %bb.0: ; %entry 1937; GFX1064-NEXT: s_clause 0x1 1938; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 1939; GFX1064-NEXT: s_load_dword s6, s[0:1], 0x2c 1940; GFX1064-NEXT: s_mov_b64 s[2:3], exec 1941; GFX1064-NEXT: ; implicit-def: $vgpr1 1942; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1943; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 1944; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1945; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc 1946; GFX1064-NEXT: s_cbranch_execz .LBB8_2 1947; GFX1064-NEXT: ; %bb.1: 1948; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1949; GFX1064-NEXT: v_mov_b32_e32 v1, 0 1950; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1951; GFX1064-NEXT: s_mul_i32 s2, s6, s2 1952; GFX1064-NEXT: v_mov_b32_e32 v2, s2 1953; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1954; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 1955; GFX1064-NEXT: ds_sub_rtn_u32 v1, v1, v2 1956; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1957; GFX1064-NEXT: buffer_gl0_inv 1958; GFX1064-NEXT: .LBB8_2: 1959; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 1960; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] 1961; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1962; GFX1064-NEXT: v_mul_lo_u32 v0, s6, v0 1963; GFX1064-NEXT: v_readfirstlane_b32 s0, v1 1964; GFX1064-NEXT: s_mov_b32 s7, 0x31016000 1965; GFX1064-NEXT: s_mov_b32 
s6, -1 1966; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s0, v0 1967; GFX1064-NEXT: buffer_store_dword v0, off, s[4:7], 0 1968; GFX1064-NEXT: s_endpgm 1969; 1970; GFX1032-LABEL: sub_i32_uniform: 1971; GFX1032: ; %bb.0: ; %entry 1972; GFX1032-NEXT: s_clause 0x1 1973; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 1974; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x2c 1975; GFX1032-NEXT: s_mov_b32 s3, exec_lo 1976; GFX1032-NEXT: ; implicit-def: $vgpr1 1977; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 1978; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 1979; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo 1980; GFX1032-NEXT: s_cbranch_execz .LBB8_2 1981; GFX1032-NEXT: ; %bb.1: 1982; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s3 1983; GFX1032-NEXT: v_mov_b32_e32 v1, 0 1984; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1985; GFX1032-NEXT: s_mul_i32 s1, s2, s1 1986; GFX1032-NEXT: v_mov_b32_e32 v2, s1 1987; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1988; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 1989; GFX1032-NEXT: ds_sub_rtn_u32 v1, v1, v2 1990; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1991; GFX1032-NEXT: buffer_gl0_inv 1992; GFX1032-NEXT: .LBB8_2: 1993; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 1994; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 1995; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1996; GFX1032-NEXT: v_mul_lo_u32 v0, s2, v0 1997; GFX1032-NEXT: v_readfirstlane_b32 s0, v1 1998; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 1999; GFX1032-NEXT: s_mov_b32 s6, -1 2000; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s0, v0 2001; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0 2002; GFX1032-NEXT: s_endpgm 2003; 2004; GFX1164-LABEL: sub_i32_uniform: 2005; GFX1164: ; %bb.0: ; %entry 2006; GFX1164-NEXT: s_clause 0x1 2007; GFX1164-NEXT: s_load_b64 s[4:5], s[0:1], 0x24 2008; GFX1164-NEXT: s_load_b32 s6, s[0:1], 0x2c 2009; GFX1164-NEXT: s_mov_b64 s[2:3], exec 2010; GFX1164-NEXT: s_mov_b64 s[0:1], exec 2011; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 2012; GFX1164-NEXT: ; implicit-def: $vgpr1 2013; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 
2014; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 2015; GFX1164-NEXT: s_cbranch_execz .LBB8_2 2016; GFX1164-NEXT: ; %bb.1: 2017; GFX1164-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 2018; GFX1164-NEXT: v_mov_b32_e32 v1, 0 2019; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 2020; GFX1164-NEXT: s_mul_i32 s2, s6, s2 2021; GFX1164-NEXT: v_mov_b32_e32 v2, s2 2022; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2023; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 2024; GFX1164-NEXT: ds_sub_rtn_u32 v1, v1, v2 2025; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 2026; GFX1164-NEXT: buffer_gl0_inv 2027; GFX1164-NEXT: .LBB8_2: 2028; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] 2029; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 2030; GFX1164-NEXT: v_mul_lo_u32 v0, s6, v0 2031; GFX1164-NEXT: v_readfirstlane_b32 s0, v1 2032; GFX1164-NEXT: s_mov_b32 s7, 0x31016000 2033; GFX1164-NEXT: s_mov_b32 s6, -1 2034; GFX1164-NEXT: v_sub_nc_u32_e32 v0, s0, v0 2035; GFX1164-NEXT: buffer_store_b32 v0, off, s[4:7], 0 2036; GFX1164-NEXT: s_endpgm 2037; 2038; GFX1132-LABEL: sub_i32_uniform: 2039; GFX1132: ; %bb.0: ; %entry 2040; GFX1132-NEXT: s_clause 0x1 2041; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x24 2042; GFX1132-NEXT: s_load_b32 s0, s[0:1], 0x2c 2043; GFX1132-NEXT: s_mov_b32 s2, exec_lo 2044; GFX1132-NEXT: s_mov_b32 s1, exec_lo 2045; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 2046; GFX1132-NEXT: ; implicit-def: $vgpr1 2047; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 2048; GFX1132-NEXT: s_cbranch_execz .LBB8_2 2049; GFX1132-NEXT: ; %bb.1: 2050; GFX1132-NEXT: s_bcnt1_i32_b32 s2, s2 2051; GFX1132-NEXT: v_mov_b32_e32 v1, 0 2052; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 2053; GFX1132-NEXT: s_mul_i32 s2, s0, s2 2054; GFX1132-NEXT: v_mov_b32_e32 v2, s2 2055; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2056; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 2057; GFX1132-NEXT: ds_sub_rtn_u32 v1, v1, v2 2058; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 2059; GFX1132-NEXT: buffer_gl0_inv 2060; GFX1132-NEXT: .LBB8_2: 2061; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s1 2062; GFX1132-NEXT: 
s_waitcnt lgkmcnt(0) 2063; GFX1132-NEXT: v_mul_lo_u32 v0, s0, v0 2064; GFX1132-NEXT: v_readfirstlane_b32 s0, v1 2065; GFX1132-NEXT: s_mov_b32 s7, 0x31016000 2066; GFX1132-NEXT: s_mov_b32 s6, -1 2067; GFX1132-NEXT: v_sub_nc_u32_e32 v0, s0, v0 2068; GFX1132-NEXT: buffer_store_b32 v0, off, s[4:7], 0 2069; GFX1132-NEXT: s_endpgm 2070entry: 2071 %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 %subitive acq_rel 2072 store i32 %old, i32 addrspace(1)* %out 2073 ret void 2074} 2075 ; sub_i32_varying - atomicrmw sub (acq_rel) from @local_var32 whose operand differs per lane (workitem.id.x); the returned old value is stored to %out. GFX7LESS expects a plain per-lane ds_sub_rtn_u32. GFX8 and newer expect a DPP add scan of the lane values, a single ds_sub_rtn_u32 of the scan value read from the last lane, issued by the lane whose mbcnt is 0, and a final per-lane subtract of the one-lane-shifted scan value from the readfirstlane of the returned value. 2076define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) { 2077; 2078; 2079; GFX7LESS-LABEL: sub_i32_varying: 2080; GFX7LESS: ; %bb.0: ; %entry 2081; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 2082; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 2083; GFX7LESS-NEXT: s_mov_b32 m0, -1 2084; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2085; GFX7LESS-NEXT: ds_sub_rtn_u32 v0, v1, v0 2086; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2087; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 2088; GFX7LESS-NEXT: s_mov_b32 s2, -1 2089; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 2090; GFX7LESS-NEXT: s_endpgm 2091; 2092; GFX8-LABEL: sub_i32_varying: 2093; GFX8: ; %bb.0: ; %entry 2094; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2095; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 2096; GFX8-NEXT: v_mov_b32_e32 v1, 0 2097; GFX8-NEXT: s_mov_b64 exec, s[2:3] 2098; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 2099; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 2100; GFX8-NEXT: v_mov_b32_e32 v2, v0 2101; GFX8-NEXT: s_not_b64 exec, exec 2102; GFX8-NEXT: v_mov_b32_e32 v2, 0 2103; GFX8-NEXT: s_not_b64 exec, exec 2104; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 2105; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 2106; GFX8-NEXT: s_nop 1 2107; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 2108; GFX8-NEXT: s_nop 1 2109; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 2110; GFX8-NEXT: s_nop 1 2111; 
GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 2112; GFX8-NEXT: s_nop 1 2113; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 2114; GFX8-NEXT: s_nop 1 2115; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 2116; GFX8-NEXT: v_readlane_b32 s4, v2, 63 2117; GFX8-NEXT: s_nop 0 2118; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 2119; GFX8-NEXT: s_mov_b64 exec, s[2:3] 2120; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 2121; GFX8-NEXT: ; implicit-def: $vgpr0 2122; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 2123; GFX8-NEXT: s_cbranch_execz .LBB9_2 2124; GFX8-NEXT: ; %bb.1: 2125; GFX8-NEXT: v_mov_b32_e32 v0, 0 2126; GFX8-NEXT: v_mov_b32_e32 v3, s4 2127; GFX8-NEXT: s_mov_b32 m0, -1 2128; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2129; GFX8-NEXT: ds_sub_rtn_u32 v0, v0, v3 2130; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2131; GFX8-NEXT: .LBB9_2: 2132; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 2133; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2134; GFX8-NEXT: v_readfirstlane_b32 s2, v0 2135; GFX8-NEXT: v_mov_b32_e32 v0, v1 2136; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s2, v0 2137; GFX8-NEXT: s_mov_b32 s3, 0xf000 2138; GFX8-NEXT: s_mov_b32 s2, -1 2139; GFX8-NEXT: s_nop 0 2140; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 2141; GFX8-NEXT: s_endpgm 2142; 2143; GFX9-LABEL: sub_i32_varying: 2144; GFX9: ; %bb.0: ; %entry 2145; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2146; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 2147; GFX9-NEXT: v_mov_b32_e32 v1, 0 2148; GFX9-NEXT: s_mov_b64 exec, s[2:3] 2149; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 2150; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 2151; GFX9-NEXT: v_mov_b32_e32 v2, v0 2152; GFX9-NEXT: s_not_b64 exec, exec 2153; GFX9-NEXT: v_mov_b32_e32 v2, 0 2154; GFX9-NEXT: s_not_b64 exec, exec 2155; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 2156; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 2157; GFX9-NEXT: 
s_nop 1 2158; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 2159; GFX9-NEXT: s_nop 1 2160; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 2161; GFX9-NEXT: s_nop 1 2162; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 2163; GFX9-NEXT: s_nop 1 2164; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 2165; GFX9-NEXT: s_nop 1 2166; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 2167; GFX9-NEXT: v_readlane_b32 s4, v2, 63 2168; GFX9-NEXT: s_nop 0 2169; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 2170; GFX9-NEXT: s_mov_b64 exec, s[2:3] 2171; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 2172; GFX9-NEXT: ; implicit-def: $vgpr0 2173; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 2174; GFX9-NEXT: s_cbranch_execz .LBB9_2 2175; GFX9-NEXT: ; %bb.1: 2176; GFX9-NEXT: v_mov_b32_e32 v0, 0 2177; GFX9-NEXT: v_mov_b32_e32 v3, s4 2178; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2179; GFX9-NEXT: ds_sub_rtn_u32 v0, v0, v3 2180; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2181; GFX9-NEXT: .LBB9_2: 2182; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 2183; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2184; GFX9-NEXT: v_readfirstlane_b32 s2, v0 2185; GFX9-NEXT: v_mov_b32_e32 v0, v1 2186; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 2187; GFX9-NEXT: s_mov_b32 s3, 0xf000 2188; GFX9-NEXT: s_mov_b32 s2, -1 2189; GFX9-NEXT: s_nop 0 2190; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 2191; GFX9-NEXT: s_endpgm 2192; 2193; GFX1064-LABEL: sub_i32_varying: 2194; GFX1064: ; %bb.0: ; %entry 2195; GFX1064-NEXT: v_mov_b32_e32 v1, v0 2196; GFX1064-NEXT: s_not_b64 exec, exec 2197; GFX1064-NEXT: v_mov_b32_e32 v1, 0 2198; GFX1064-NEXT: s_not_b64 exec, exec 2199; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 2200; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 2201; GFX1064-NEXT: v_mov_b32_e32 v3, 0 2202; GFX1064-NEXT: 
v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 2203; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 2204; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 2205; GFX1064-NEXT: v_mov_b32_e32 v2, v1 2206; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 2207; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 2208; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 2209; GFX1064-NEXT: v_mov_b32_e32 v2, s4 2210; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 2211; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 2212; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 2213; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 2214; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2215; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 2216; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 2217; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 2218; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 2219; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 2220; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 2221; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 2222; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 2223; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 2224; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 2225; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 2226; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 2227; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 2228; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 2229; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2230; GFX1064-NEXT: s_mov_b32 s2, -1 2231; GFX1064-NEXT: ; implicit-def: $vgpr0 2232; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 2233; GFX1064-NEXT: s_cbranch_execz .LBB9_2 2234; GFX1064-NEXT: ; %bb.1: 2235; GFX1064-NEXT: v_mov_b32_e32 v0, 0 2236; GFX1064-NEXT: v_mov_b32_e32 v4, s7 2237; GFX1064-NEXT: s_mov_b32 s3, s7 2238; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2239; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 
2240; GFX1064-NEXT: ds_sub_rtn_u32 v0, v0, v4 2241; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2242; GFX1064-NEXT: buffer_gl0_inv 2243; GFX1064-NEXT: .LBB9_2: 2244; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 2245; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 2246; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 2247; GFX1064-NEXT: v_mov_b32_e32 v0, v3 2248; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s3, v0 2249; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 2250; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2251; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 2252; GFX1064-NEXT: s_endpgm 2253; 2254; GFX1032-LABEL: sub_i32_varying: 2255; GFX1032: ; %bb.0: ; %entry 2256; GFX1032-NEXT: v_mov_b32_e32 v1, v0 2257; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 2258; GFX1032-NEXT: v_mov_b32_e32 v1, 0 2259; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 2260; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 2261; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 2262; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 2263; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 2264; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 2265; GFX1032-NEXT: v_mov_b32_e32 v2, v1 2266; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 2267; GFX1032-NEXT: s_mov_b32 exec_lo, s2 2268; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2269; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 2270; GFX1032-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 2271; GFX1032-NEXT: v_mov_b32_e32 v3, 0 2272; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 2273; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 2274; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 2275; GFX1032-NEXT: s_mov_b32 exec_lo, s2 2276; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 2277; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 2278; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 2279; GFX1032-NEXT: 
s_mov_b32 exec_lo, s2 2280; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 2281; GFX1032-NEXT: s_mov_b32 s2, -1 2282; GFX1032-NEXT: ; implicit-def: $vgpr0 2283; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 2284; GFX1032-NEXT: s_cbranch_execz .LBB9_2 2285; GFX1032-NEXT: ; %bb.1: 2286; GFX1032-NEXT: v_mov_b32_e32 v0, 0 2287; GFX1032-NEXT: v_mov_b32_e32 v4, s4 2288; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2289; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 2290; GFX1032-NEXT: ds_sub_rtn_u32 v0, v0, v4 2291; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2292; GFX1032-NEXT: buffer_gl0_inv 2293; GFX1032-NEXT: .LBB9_2: 2294; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 2295; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 2296; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 2297; GFX1032-NEXT: v_mov_b32_e32 v0, v3 2298; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s3, v0 2299; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 2300; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2301; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 2302; GFX1032-NEXT: s_endpgm 2303; 2304; GFX1164-LABEL: sub_i32_varying: 2305; GFX1164: ; %bb.0: ; %entry 2306; GFX1164-NEXT: v_mov_b32_e32 v1, v0 2307; GFX1164-NEXT: s_not_b64 exec, exec 2308; GFX1164-NEXT: v_mov_b32_e32 v1, 0 2309; GFX1164-NEXT: s_not_b64 exec, exec 2310; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 2311; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 2312; GFX1164-NEXT: v_mov_b32_e32 v3, 0 2313; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 2314; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 2315; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 2316; GFX1164-NEXT: v_mov_b32_e32 v2, v1 2317; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1 2318; GFX1164-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 2319; GFX1164-NEXT: v_readlane_b32 s4, v1, 31 2320; GFX1164-NEXT: v_mov_b32_e32 
v2, s4 2321; GFX1164-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 2322; GFX1164-NEXT: v_readlane_b32 s4, v1, 15 2323; GFX1164-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 2324; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 2325; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 2326; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 2327; GFX1164-NEXT: v_readlane_b32 s5, v1, 31 2328; GFX1164-NEXT: v_writelane_b32 v3, s4, 16 2329; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 2330; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 2331; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 2332; GFX1164-NEXT: v_readlane_b32 s7, v1, 63 2333; GFX1164-NEXT: v_readlane_b32 s6, v1, 47 2334; GFX1164-NEXT: v_writelane_b32 v3, s5, 32 2335; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 2336; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 2337; GFX1164-NEXT: s_or_saveexec_b64 s[4:5], -1 2338; GFX1164-NEXT: v_writelane_b32 v3, s6, 48 2339; GFX1164-NEXT: s_mov_b64 exec, s[4:5] 2340; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2341; GFX1164-NEXT: s_mov_b32 s2, -1 2342; GFX1164-NEXT: ; implicit-def: $vgpr0 2343; GFX1164-NEXT: s_and_saveexec_b64 s[4:5], vcc 2344; GFX1164-NEXT: s_cbranch_execz .LBB9_2 2345; GFX1164-NEXT: ; %bb.1: 2346; GFX1164-NEXT: v_mov_b32_e32 v0, 0 2347; GFX1164-NEXT: v_mov_b32_e32 v4, s7 2348; GFX1164-NEXT: s_mov_b32 s3, s7 2349; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2350; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 2351; GFX1164-NEXT: ds_sub_rtn_u32 v0, v0, v4 2352; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 2353; GFX1164-NEXT: buffer_gl0_inv 2354; GFX1164-NEXT: .LBB9_2: 2355; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] 2356; GFX1164-NEXT: v_readfirstlane_b32 s3, v0 2357; GFX1164-NEXT: v_mov_b32_e32 v0, v3 2358; GFX1164-NEXT: v_sub_nc_u32_e32 v0, s3, v0 2359; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 2360; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 2361; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 2362; GFX1164-NEXT: s_endpgm 2363; 2364; GFX1132-LABEL: 
sub_i32_varying: 2365; GFX1132: ; %bb.0: ; %entry 2366; GFX1132-NEXT: v_mov_b32_e32 v1, v0 2367; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo 2368; GFX1132-NEXT: v_mov_b32_e32 v1, 0 2369; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo 2370; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 2371; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 2372; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 2373; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 2374; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 2375; GFX1132-NEXT: v_mov_b32_e32 v2, v1 2376; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1 2377; GFX1132-NEXT: s_mov_b32 exec_lo, s2 2378; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 2379; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 2380; GFX1132-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 2381; GFX1132-NEXT: v_mov_b32_e32 v3, 0 2382; GFX1132-NEXT: v_readlane_b32 s3, v1, 15 2383; GFX1132-NEXT: v_readlane_b32 s4, v1, 31 2384; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 2385; GFX1132-NEXT: s_mov_b32 exec_lo, s2 2386; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 2387; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 2388; GFX1132-NEXT: v_writelane_b32 v3, s3, 16 2389; GFX1132-NEXT: s_mov_b32 exec_lo, s2 2390; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 2391; GFX1132-NEXT: s_mov_b32 s2, -1 2392; GFX1132-NEXT: ; implicit-def: $vgpr0 2393; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo 2394; GFX1132-NEXT: s_cbranch_execz .LBB9_2 2395; GFX1132-NEXT: ; %bb.1: 2396; GFX1132-NEXT: v_mov_b32_e32 v0, 0 2397; GFX1132-NEXT: v_mov_b32_e32 v4, s4 2398; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2399; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 2400; GFX1132-NEXT: ds_sub_rtn_u32 v0, v0, v4 2401; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 2402; GFX1132-NEXT: buffer_gl0_inv 2403; 
GFX1132-NEXT: .LBB9_2: 2404; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 2405; GFX1132-NEXT: v_readfirstlane_b32 s3, v0 2406; GFX1132-NEXT: v_mov_b32_e32 v0, v3 2407; GFX1132-NEXT: v_sub_nc_u32_e32 v0, s3, v0 2408; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 2409; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 2410; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 2411; GFX1132-NEXT: s_endpgm 2412entry: 2413 %lane = call i32 @llvm.amdgcn.workitem.id.x() 2414 %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 %lane acq_rel 2415 store i32 %old, i32 addrspace(1)* %out 2416 ret void 2417} 2418 2419define amdgpu_kernel void @sub_i32_varying_nouse() { 2420; GFX7LESS-LABEL: sub_i32_varying_nouse: 2421; GFX7LESS: ; %bb.0: ; %entry 2422; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 2423; GFX7LESS-NEXT: s_mov_b32 m0, -1 2424; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2425; GFX7LESS-NEXT: ds_sub_u32 v1, v0 2426; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2427; GFX7LESS-NEXT: s_endpgm 2428; 2429; GFX8-LABEL: sub_i32_varying_nouse: 2430; GFX8: ; %bb.0: ; %entry 2431; GFX8-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 2432; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 2433; GFX8-NEXT: v_mov_b32_e32 v1, v0 2434; GFX8-NEXT: s_not_b64 exec, exec 2435; GFX8-NEXT: v_mov_b32_e32 v1, 0 2436; GFX8-NEXT: s_not_b64 exec, exec 2437; GFX8-NEXT: s_or_saveexec_b64 s[0:1], -1 2438; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 2439; GFX8-NEXT: s_nop 1 2440; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 2441; GFX8-NEXT: s_nop 1 2442; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 2443; GFX8-NEXT: s_nop 1 2444; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 2445; GFX8-NEXT: s_nop 1 2446; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf 2447; GFX8-NEXT: s_nop 1 2448; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:31 
row_mask:0xc bank_mask:0xf 2449; GFX8-NEXT: v_readlane_b32 s2, v1, 63 2450; GFX8-NEXT: s_mov_b64 exec, s[0:1] 2451; GFX8-NEXT: s_mov_b32 s0, s2 2452; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 2453; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 2454; GFX8-NEXT: s_cbranch_execz .LBB10_2 2455; GFX8-NEXT: ; %bb.1: 2456; GFX8-NEXT: v_mov_b32_e32 v0, 0 2457; GFX8-NEXT: v_mov_b32_e32 v2, s0 2458; GFX8-NEXT: s_mov_b32 m0, -1 2459; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2460; GFX8-NEXT: ds_sub_u32 v0, v2 2461; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2462; GFX8-NEXT: .LBB10_2: 2463; GFX8-NEXT: s_endpgm 2464; 2465; GFX9-LABEL: sub_i32_varying_nouse: 2466; GFX9: ; %bb.0: ; %entry 2467; GFX9-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 2468; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 2469; GFX9-NEXT: v_mov_b32_e32 v1, v0 2470; GFX9-NEXT: s_not_b64 exec, exec 2471; GFX9-NEXT: v_mov_b32_e32 v1, 0 2472; GFX9-NEXT: s_not_b64 exec, exec 2473; GFX9-NEXT: s_or_saveexec_b64 s[0:1], -1 2474; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 2475; GFX9-NEXT: s_nop 1 2476; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 2477; GFX9-NEXT: s_nop 1 2478; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 2479; GFX9-NEXT: s_nop 1 2480; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 2481; GFX9-NEXT: s_nop 1 2482; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf 2483; GFX9-NEXT: s_nop 1 2484; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf 2485; GFX9-NEXT: v_readlane_b32 s2, v1, 63 2486; GFX9-NEXT: s_mov_b64 exec, s[0:1] 2487; GFX9-NEXT: s_mov_b32 s0, s2 2488; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 2489; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 2490; GFX9-NEXT: s_cbranch_execz .LBB10_2 2491; GFX9-NEXT: ; %bb.1: 2492; GFX9-NEXT: v_mov_b32_e32 v0, 0 2493; GFX9-NEXT: v_mov_b32_e32 v2, s0 2494; GFX9-NEXT: s_waitcnt 
lgkmcnt(0) 2495; GFX9-NEXT: ds_sub_u32 v0, v2 2496; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2497; GFX9-NEXT: .LBB10_2: 2498; GFX9-NEXT: s_endpgm 2499; 2500; GFX1064-LABEL: sub_i32_varying_nouse: 2501; GFX1064: ; %bb.0: ; %entry 2502; GFX1064-NEXT: v_mov_b32_e32 v1, v0 2503; GFX1064-NEXT: s_not_b64 exec, exec 2504; GFX1064-NEXT: v_mov_b32_e32 v1, 0 2505; GFX1064-NEXT: s_not_b64 exec, exec 2506; GFX1064-NEXT: s_or_saveexec_b64 s[0:1], -1 2507; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 2508; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 2509; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 2510; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 2511; GFX1064-NEXT: v_mov_b32_e32 v2, v1 2512; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 2513; GFX1064-NEXT: v_add_nc_u32_e32 v1, v1, v2 2514; GFX1064-NEXT: s_mov_b64 exec, s[0:1] 2515; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 2516; GFX1064-NEXT: s_or_saveexec_b64 s[0:1], -1 2517; GFX1064-NEXT: v_readlane_b32 s2, v1, 0 2518; GFX1064-NEXT: v_readlane_b32 s3, v1, 32 2519; GFX1064-NEXT: s_mov_b64 exec, s[0:1] 2520; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 2521; GFX1064-NEXT: s_add_i32 s0, s2, s3 2522; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2523; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc 2524; GFX1064-NEXT: s_cbranch_execz .LBB10_2 2525; GFX1064-NEXT: ; %bb.1: 2526; GFX1064-NEXT: v_mov_b32_e32 v0, 0 2527; GFX1064-NEXT: v_mov_b32_e32 v3, s0 2528; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2529; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 2530; GFX1064-NEXT: ds_sub_u32 v0, v3 2531; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2532; GFX1064-NEXT: buffer_gl0_inv 2533; GFX1064-NEXT: .LBB10_2: 2534; GFX1064-NEXT: s_endpgm 2535; 2536; GFX1032-LABEL: sub_i32_varying_nouse: 2537; GFX1032: ; %bb.0: ; %entry 2538; GFX1032-NEXT: v_mov_b32_e32 v1, v0 
2539; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 2540; GFX1032-NEXT: v_mov_b32_e32 v1, 0 2541; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 2542; GFX1032-NEXT: s_or_saveexec_b32 s0, -1 2543; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 2544; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 2545; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 2546; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 2547; GFX1032-NEXT: v_mov_b32_e32 v2, v1 2548; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 2549; GFX1032-NEXT: v_add_nc_u32_e32 v1, v1, v2 2550; GFX1032-NEXT: s_mov_b32 exec_lo, s0 2551; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 2552; GFX1032-NEXT: v_mov_b32_e32 v0, v1 2553; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 2554; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo 2555; GFX1032-NEXT: s_cbranch_execz .LBB10_2 2556; GFX1032-NEXT: ; %bb.1: 2557; GFX1032-NEXT: v_mov_b32_e32 v3, 0 2558; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2559; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 2560; GFX1032-NEXT: ds_sub_u32 v3, v0 2561; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2562; GFX1032-NEXT: buffer_gl0_inv 2563; GFX1032-NEXT: .LBB10_2: 2564; GFX1032-NEXT: s_endpgm 2565; 2566; GFX1164-LABEL: sub_i32_varying_nouse: 2567; GFX1164: ; %bb.0: ; %entry 2568; GFX1164-NEXT: v_mov_b32_e32 v1, v0 2569; GFX1164-NEXT: s_not_b64 exec, exec 2570; GFX1164-NEXT: v_mov_b32_e32 v1, 0 2571; GFX1164-NEXT: s_not_b64 exec, exec 2572; GFX1164-NEXT: s_or_saveexec_b64 s[0:1], -1 2573; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 2574; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 2575; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 2576; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 
row_mask:0xf bank_mask:0xf bound_ctrl:1 2577; GFX1164-NEXT: v_mov_b32_e32 v2, v1 2578; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1 2579; GFX1164-NEXT: v_add_nc_u32_e32 v1, v1, v2 2580; GFX1164-NEXT: v_permlane64_b32 v2, v1 2581; GFX1164-NEXT: s_mov_b64 exec, s[0:1] 2582; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 2583; GFX1164-NEXT: s_or_saveexec_b64 s[0:1], -1 2584; GFX1164-NEXT: v_add_nc_u32_e32 v1, v1, v2 2585; GFX1164-NEXT: s_mov_b64 exec, s[0:1] 2586; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v0 2587; GFX1164-NEXT: v_mov_b32_e32 v0, v1 2588; GFX1164-NEXT: s_mov_b64 s[0:1], exec 2589; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v3 2590; GFX1164-NEXT: s_cbranch_execz .LBB10_2 2591; GFX1164-NEXT: ; %bb.1: 2592; GFX1164-NEXT: v_mov_b32_e32 v3, 0 2593; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2594; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 2595; GFX1164-NEXT: ds_sub_u32 v3, v0 2596; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 2597; GFX1164-NEXT: buffer_gl0_inv 2598; GFX1164-NEXT: .LBB10_2: 2599; GFX1164-NEXT: s_endpgm 2600; 2601; GFX1132-LABEL: sub_i32_varying_nouse: 2602; GFX1132: ; %bb.0: ; %entry 2603; GFX1132-NEXT: v_mov_b32_e32 v1, v0 2604; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo 2605; GFX1132-NEXT: v_mov_b32_e32 v1, 0 2606; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo 2607; GFX1132-NEXT: s_or_saveexec_b32 s0, -1 2608; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 2609; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 2610; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 2611; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 2612; GFX1132-NEXT: v_mov_b32_e32 v2, v1 2613; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1 2614; GFX1132-NEXT: v_add_nc_u32_e32 v1, v1, v2 2615; GFX1132-NEXT: s_mov_b32 exec_lo, s0 2616; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 2617; GFX1132-NEXT: 
v_mov_b32_e32 v0, v1 2618; GFX1132-NEXT: s_mov_b32 s0, exec_lo 2619; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v3 2620; GFX1132-NEXT: s_cbranch_execz .LBB10_2 2621; GFX1132-NEXT: ; %bb.1: 2622; GFX1132-NEXT: v_mov_b32_e32 v3, 0 2623; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2624; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 2625; GFX1132-NEXT: ds_sub_u32 v3, v0 2626; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 2627; GFX1132-NEXT: buffer_gl0_inv 2628; GFX1132-NEXT: .LBB10_2: 2629; GFX1132-NEXT: s_endpgm 2630entry: 2631 %lane = call i32 @llvm.amdgcn.workitem.id.x() 2632 %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 %lane acq_rel 2633 ret void 2634} 2635 2636define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) { 2637; 2638; 2639; GFX7LESS-LABEL: sub_i64_constant: 2640; GFX7LESS: ; %bb.0: ; %entry 2641; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec 2642; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 2643; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 2644; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s5, v0 2645; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 2646; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 2647; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc 2648; GFX7LESS-NEXT: s_cbranch_execz .LBB11_2 2649; GFX7LESS-NEXT: ; %bb.1: 2650; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 2651; GFX7LESS-NEXT: s_mul_i32 s4, s4, 5 2652; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 2653; GFX7LESS-NEXT: v_mov_b32_e32 v0, s4 2654; GFX7LESS-NEXT: s_mov_b32 m0, -1 2655; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2656; GFX7LESS-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] 2657; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2658; GFX7LESS-NEXT: .LBB11_2: 2659; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] 2660; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2661; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v0 2662; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1 2663; GFX7LESS-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 2664; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v2 2665; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 2666; GFX7LESS-NEXT: v_mov_b32_e32 v2, 
s4 2667; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 2668; GFX7LESS-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc 2669; GFX7LESS-NEXT: s_mov_b32 s2, -1 2670; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2671; GFX7LESS-NEXT: s_endpgm 2672; 2673; GFX8-LABEL: sub_i64_constant: 2674; GFX8: ; %bb.0: ; %entry 2675; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2676; GFX8-NEXT: s_mov_b64 s[4:5], exec 2677; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 2678; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 2679; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 2680; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 2681; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 2682; GFX8-NEXT: s_cbranch_execz .LBB11_2 2683; GFX8-NEXT: ; %bb.1: 2684; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 2685; GFX8-NEXT: s_mul_i32 s4, s4, 5 2686; GFX8-NEXT: v_mov_b32_e32 v0, s4 2687; GFX8-NEXT: v_mov_b32_e32 v1, 0 2688; GFX8-NEXT: s_mov_b32 m0, -1 2689; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2690; GFX8-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] 2691; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2692; GFX8-NEXT: .LBB11_2: 2693; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 2694; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2695; GFX8-NEXT: v_readfirstlane_b32 s2, v0 2696; GFX8-NEXT: v_readfirstlane_b32 s3, v1 2697; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v2 2698; GFX8-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 2699; GFX8-NEXT: v_mov_b32_e32 v2, s3 2700; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s2, v0 2701; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc 2702; GFX8-NEXT: s_mov_b32 s3, 0xf000 2703; GFX8-NEXT: s_mov_b32 s2, -1 2704; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2705; GFX8-NEXT: s_endpgm 2706; 2707; GFX9-LABEL: sub_i64_constant: 2708; GFX9: ; %bb.0: ; %entry 2709; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2710; GFX9-NEXT: s_mov_b64 s[4:5], exec 2711; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 2712; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 2713; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 2714; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 2715; GFX9-NEXT: s_and_saveexec_b64 
s[2:3], vcc 2716; GFX9-NEXT: s_cbranch_execz .LBB11_2 2717; GFX9-NEXT: ; %bb.1: 2718; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 2719; GFX9-NEXT: s_mul_i32 s4, s4, 5 2720; GFX9-NEXT: v_mov_b32_e32 v0, s4 2721; GFX9-NEXT: v_mov_b32_e32 v1, 0 2722; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2723; GFX9-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] 2724; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2725; GFX9-NEXT: .LBB11_2: 2726; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 2727; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2728; GFX9-NEXT: v_readfirstlane_b32 s2, v0 2729; GFX9-NEXT: v_readfirstlane_b32 s3, v1 2730; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v2 2731; GFX9-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 2732; GFX9-NEXT: v_mov_b32_e32 v2, s3 2733; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s2, v0 2734; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc 2735; GFX9-NEXT: s_mov_b32 s3, 0xf000 2736; GFX9-NEXT: s_mov_b32 s2, -1 2737; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2738; GFX9-NEXT: s_endpgm 2739; 2740; GFX1064-LABEL: sub_i64_constant: 2741; GFX1064: ; %bb.0: ; %entry 2742; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2743; GFX1064-NEXT: s_mov_b64 s[4:5], exec 2744; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 2745; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 2746; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 2747; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 2748; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc 2749; GFX1064-NEXT: s_cbranch_execz .LBB11_2 2750; GFX1064-NEXT: ; %bb.1: 2751; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 2752; GFX1064-NEXT: v_mov_b32_e32 v1, 0 2753; GFX1064-NEXT: s_mul_i32 s4, s4, 5 2754; GFX1064-NEXT: v_mov_b32_e32 v0, s4 2755; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2756; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 2757; GFX1064-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] 2758; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2759; GFX1064-NEXT: buffer_gl0_inv 2760; GFX1064-NEXT: .LBB11_2: 2761; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 2762; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] 2763; GFX1064-NEXT: 
v_readfirstlane_b32 s2, v0 2764; GFX1064-NEXT: v_mul_u32_u24_e32 v0, 5, v2 2765; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 2766; GFX1064-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 2767; GFX1064-NEXT: v_sub_co_u32 v0, vcc, s2, v0 2768; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc 2769; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 2770; GFX1064-NEXT: s_mov_b32 s2, -1 2771; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2772; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2773; GFX1064-NEXT: s_endpgm 2774; 2775; GFX1032-LABEL: sub_i64_constant: 2776; GFX1032: ; %bb.0: ; %entry 2777; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2778; GFX1032-NEXT: s_mov_b32 s3, exec_lo 2779; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 2780; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s3, 0 2781; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 2782; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo 2783; GFX1032-NEXT: s_cbranch_execz .LBB11_2 2784; GFX1032-NEXT: ; %bb.1: 2785; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 2786; GFX1032-NEXT: v_mov_b32_e32 v1, 0 2787; GFX1032-NEXT: s_mul_i32 s3, s3, 5 2788; GFX1032-NEXT: v_mov_b32_e32 v0, s3 2789; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2790; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 2791; GFX1032-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] 2792; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2793; GFX1032-NEXT: buffer_gl0_inv 2794; GFX1032-NEXT: .LBB11_2: 2795; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 2796; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 2797; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 2798; GFX1032-NEXT: v_mul_u32_u24_e32 v0, 5, v2 2799; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 2800; GFX1032-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 2801; GFX1032-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v0 2802; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo 2803; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 2804; GFX1032-NEXT: s_mov_b32 s2, -1 2805; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2806; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2807; GFX1032-NEXT: s_endpgm 2808; 
2809; GFX1164-LABEL: sub_i64_constant: 2810; GFX1164: ; %bb.0: ; %entry 2811; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 2812; GFX1164-NEXT: s_mov_b64 s[4:5], exec 2813; GFX1164-NEXT: s_mov_b64 s[2:3], exec 2814; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 2815; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 2816; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 2817; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 2818; GFX1164-NEXT: s_cbranch_execz .LBB11_2 2819; GFX1164-NEXT: ; %bb.1: 2820; GFX1164-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 2821; GFX1164-NEXT: v_mov_b32_e32 v1, 0 2822; GFX1164-NEXT: s_mul_i32 s4, s4, 5 2823; GFX1164-NEXT: v_mov_b32_e32 v0, s4 2824; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2825; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 2826; GFX1164-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] 2827; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 2828; GFX1164-NEXT: buffer_gl0_inv 2829; GFX1164-NEXT: .LBB11_2: 2830; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] 2831; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 2832; GFX1164-NEXT: v_mul_u32_u24_e32 v0, 5, v2 2833; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 2834; GFX1164-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 2835; GFX1164-NEXT: v_sub_co_u32 v0, vcc, s2, v0 2836; GFX1164-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc 2837; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 2838; GFX1164-NEXT: s_mov_b32 s2, -1 2839; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 2840; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 2841; GFX1164-NEXT: s_endpgm 2842; 2843; GFX1132-LABEL: sub_i64_constant: 2844; GFX1132: ; %bb.0: ; %entry 2845; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 2846; GFX1132-NEXT: s_mov_b32 s3, exec_lo 2847; GFX1132-NEXT: s_mov_b32 s2, exec_lo 2848; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s3, 0 2849; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 2850; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 2851; GFX1132-NEXT: s_cbranch_execz .LBB11_2 2852; GFX1132-NEXT: ; %bb.1: 2853; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s3 2854; GFX1132-NEXT: v_mov_b32_e32 v1, 0 2855; GFX1132-NEXT: 
s_mul_i32 s3, s3, 5 2856; GFX1132-NEXT: v_mov_b32_e32 v0, s3 2857; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2858; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 2859; GFX1132-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] 2860; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 2861; GFX1132-NEXT: buffer_gl0_inv 2862; GFX1132-NEXT: .LBB11_2: 2863; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2 2864; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 2865; GFX1132-NEXT: v_mul_u32_u24_e32 v0, 5, v2 2866; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 2867; GFX1132-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 2868; GFX1132-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v0 2869; GFX1132-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo 2870; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 2871; GFX1132-NEXT: s_mov_b32 s2, -1 2872; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 2873; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 2874; GFX1132-NEXT: s_endpgm 2875entry: 2876 %old = atomicrmw sub i64 addrspace(3)* @local_var64, i64 5 acq_rel 2877 store i64 %old, i64 addrspace(1)* %out 2878 ret void 2879} 2880 2881define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 %subitive) { 2882; 2883; 2884; GFX7LESS-LABEL: sub_i64_uniform: 2885; GFX7LESS: ; %bb.0: ; %entry 2886; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec 2887; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 2888; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 2889; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s7, v0 2890; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 2891; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 2892; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc 2893; GFX7LESS-NEXT: s_cbranch_execz .LBB12_2 2894; GFX7LESS-NEXT: ; %bb.1: 2895; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 2896; GFX7LESS-NEXT: v_mov_b32_e32 v3, 0 2897; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2898; GFX7LESS-NEXT: s_mul_i32 s7, s3, s6 2899; GFX7LESS-NEXT: v_mov_b32_e32 v0, s6 2900; GFX7LESS-NEXT: v_mul_hi_u32 v0, s2, v0 2901; GFX7LESS-NEXT: s_mul_i32 s6, s2, s6 2902; GFX7LESS-NEXT: v_add_i32_e32 v1, vcc, 
s7, v0 2903; GFX7LESS-NEXT: v_mov_b32_e32 v0, s6 2904; GFX7LESS-NEXT: s_mov_b32 m0, -1 2905; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2906; GFX7LESS-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1] 2907; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2908; GFX7LESS-NEXT: .LBB12_2: 2909; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] 2910; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 2911; GFX7LESS-NEXT: s_mov_b32 s6, -1 2912; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2913; GFX7LESS-NEXT: s_mov_b32 s4, s0 2914; GFX7LESS-NEXT: s_mov_b32 s5, s1 2915; GFX7LESS-NEXT: v_readfirstlane_b32 s0, v0 2916; GFX7LESS-NEXT: v_readfirstlane_b32 s1, v1 2917; GFX7LESS-NEXT: v_mul_lo_u32 v0, s3, v2 2918; GFX7LESS-NEXT: v_mul_hi_u32 v1, s2, v2 2919; GFX7LESS-NEXT: v_mul_lo_u32 v2, s2, v2 2920; GFX7LESS-NEXT: v_add_i32_e32 v1, vcc, v1, v0 2921; GFX7LESS-NEXT: v_mov_b32_e32 v3, s1 2922; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s0, v2 2923; GFX7LESS-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc 2924; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 2925; GFX7LESS-NEXT: s_endpgm 2926; 2927; GFX8-LABEL: sub_i64_uniform: 2928; GFX8: ; %bb.0: ; %entry 2929; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 2930; GFX8-NEXT: s_mov_b64 s[6:7], exec 2931; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 2932; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 2933; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 2934; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 2935; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 2936; GFX8-NEXT: s_cbranch_execz .LBB12_2 2937; GFX8-NEXT: ; %bb.1: 2938; GFX8-NEXT: s_bcnt1_i32_b64 s8, s[6:7] 2939; GFX8-NEXT: v_mov_b32_e32 v0, s8 2940; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2941; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[6:7], s2, v0, 0 2942; GFX8-NEXT: s_mul_i32 s6, s3, s8 2943; GFX8-NEXT: v_mov_b32_e32 v3, 0 2944; GFX8-NEXT: v_add_u32_e32 v1, vcc, s6, v1 2945; GFX8-NEXT: s_mov_b32 m0, -1 2946; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2947; GFX8-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1] 2948; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2949; GFX8-NEXT: .LBB12_2: 2950; GFX8-NEXT: 
s_or_b64 exec, exec, s[4:5] 2951; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2952; GFX8-NEXT: s_mov_b32 s4, s0 2953; GFX8-NEXT: s_mov_b32 s5, s1 2954; GFX8-NEXT: v_mul_lo_u32 v4, s3, v2 2955; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v2, 0 2956; GFX8-NEXT: v_readfirstlane_b32 s0, v0 2957; GFX8-NEXT: v_readfirstlane_b32 s1, v1 2958; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v4 2959; GFX8-NEXT: v_mov_b32_e32 v3, s1 2960; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s0, v2 2961; GFX8-NEXT: s_mov_b32 s7, 0xf000 2962; GFX8-NEXT: s_mov_b32 s6, -1 2963; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc 2964; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 2965; GFX8-NEXT: s_endpgm 2966; 2967; GFX9-LABEL: sub_i64_uniform: 2968; GFX9: ; %bb.0: ; %entry 2969; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 2970; GFX9-NEXT: s_mov_b64 s[6:7], exec 2971; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 2972; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 2973; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 2974; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 2975; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 2976; GFX9-NEXT: s_cbranch_execz .LBB12_2 2977; GFX9-NEXT: ; %bb.1: 2978; GFX9-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 2979; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2980; GFX9-NEXT: s_mul_i32 s7, s3, s6 2981; GFX9-NEXT: s_mul_hi_u32 s8, s2, s6 2982; GFX9-NEXT: s_add_i32 s8, s8, s7 2983; GFX9-NEXT: s_mul_i32 s6, s2, s6 2984; GFX9-NEXT: v_mov_b32_e32 v0, s6 2985; GFX9-NEXT: v_mov_b32_e32 v1, s8 2986; GFX9-NEXT: v_mov_b32_e32 v3, 0 2987; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2988; GFX9-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1] 2989; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2990; GFX9-NEXT: .LBB12_2: 2991; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 2992; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2993; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], s2, v2, 0 2994; GFX9-NEXT: s_mov_b32 s4, s0 2995; GFX9-NEXT: s_mov_b32 s5, s1 2996; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[2:3], s3, v2, v[4:5] 2997; GFX9-NEXT: v_readfirstlane_b32 s0, v0 2998; GFX9-NEXT: v_readfirstlane_b32 s1, v1 2999; 
GFX9-NEXT: v_mov_b32_e32 v1, v4 3000; GFX9-NEXT: v_mov_b32_e32 v2, s1 3001; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s0, v3 3002; GFX9-NEXT: s_mov_b32 s7, 0xf000 3003; GFX9-NEXT: s_mov_b32 s6, -1 3004; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc 3005; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 3006; GFX9-NEXT: s_endpgm 3007; 3008; GFX1064-LABEL: sub_i64_uniform: 3009; GFX1064: ; %bb.0: ; %entry 3010; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 3011; GFX1064-NEXT: s_mov_b64 s[6:7], exec 3012; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 3013; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 3014; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 3015; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 3016; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 3017; GFX1064-NEXT: s_cbranch_execz .LBB12_2 3018; GFX1064-NEXT: ; %bb.1: 3019; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 3020; GFX1064-NEXT: v_mov_b32_e32 v3, 0 3021; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3022; GFX1064-NEXT: s_mul_i32 s7, s3, s6 3023; GFX1064-NEXT: s_mul_hi_u32 s8, s2, s6 3024; GFX1064-NEXT: s_mul_i32 s6, s2, s6 3025; GFX1064-NEXT: s_add_i32 s8, s8, s7 3026; GFX1064-NEXT: v_mov_b32_e32 v0, s6 3027; GFX1064-NEXT: v_mov_b32_e32 v1, s8 3028; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3029; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 3030; GFX1064-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1] 3031; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3032; GFX1064-NEXT: buffer_gl0_inv 3033; GFX1064-NEXT: .LBB12_2: 3034; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 3035; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 3036; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3037; GFX1064-NEXT: v_mad_u64_u32 v[3:4], s[4:5], s2, v2, 0 3038; GFX1064-NEXT: v_readfirstlane_b32 s4, v1 3039; GFX1064-NEXT: v_mad_u64_u32 v[4:5], s[2:3], s3, v2, v[4:5] 3040; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 3041; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 3042; GFX1064-NEXT: v_sub_co_u32 v0, vcc, s2, v3 3043; GFX1064-NEXT: v_mov_b32_e32 v1, v4 3044; GFX1064-NEXT: s_mov_b32 s2, -1 3045; 
GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s4, v1, vcc 3046; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 3047; GFX1064-NEXT: s_endpgm 3048; 3049; GFX1032-LABEL: sub_i64_uniform: 3050; GFX1032: ; %bb.0: ; %entry 3051; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 3052; GFX1032-NEXT: s_mov_b32 s5, exec_lo 3053; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 3054; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0 3055; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 3056; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo 3057; GFX1032-NEXT: s_cbranch_execz .LBB12_2 3058; GFX1032-NEXT: ; %bb.1: 3059; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s5 3060; GFX1032-NEXT: v_mov_b32_e32 v3, 0 3061; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3062; GFX1032-NEXT: s_mul_i32 s6, s3, s5 3063; GFX1032-NEXT: s_mul_hi_u32 s7, s2, s5 3064; GFX1032-NEXT: s_mul_i32 s5, s2, s5 3065; GFX1032-NEXT: s_add_i32 s7, s7, s6 3066; GFX1032-NEXT: v_mov_b32_e32 v0, s5 3067; GFX1032-NEXT: v_mov_b32_e32 v1, s7 3068; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3069; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 3070; GFX1032-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1] 3071; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3072; GFX1032-NEXT: buffer_gl0_inv 3073; GFX1032-NEXT: .LBB12_2: 3074; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 3075; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 3076; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3077; GFX1032-NEXT: v_mad_u64_u32 v[3:4], s2, s2, v2, 0 3078; GFX1032-NEXT: v_readfirstlane_b32 s4, v1 3079; GFX1032-NEXT: v_mad_u64_u32 v[4:5], s2, s3, v2, v[4:5] 3080; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 3081; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 3082; GFX1032-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v3 3083; GFX1032-NEXT: v_mov_b32_e32 v1, v4 3084; GFX1032-NEXT: s_mov_b32 s2, -1 3085; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s4, v1, vcc_lo 3086; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 3087; GFX1032-NEXT: s_endpgm 3088; 3089; GFX1164-LABEL: sub_i64_uniform: 3090; GFX1164: ; %bb.0: ; %entry 3091; 
GFX1164-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 3092; GFX1164-NEXT: s_mov_b64 s[6:7], exec 3093; GFX1164-NEXT: s_mov_b64 s[4:5], exec 3094; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 3095; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 3096; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 3097; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 3098; GFX1164-NEXT: s_cbranch_execz .LBB12_2 3099; GFX1164-NEXT: ; %bb.1: 3100; GFX1164-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 3101; GFX1164-NEXT: v_mov_b32_e32 v3, 0 3102; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 3103; GFX1164-NEXT: s_mul_i32 s7, s3, s6 3104; GFX1164-NEXT: s_mul_hi_u32 s8, s2, s6 3105; GFX1164-NEXT: s_mul_i32 s6, s2, s6 3106; GFX1164-NEXT: s_add_i32 s8, s8, s7 3107; GFX1164-NEXT: v_mov_b32_e32 v0, s6 3108; GFX1164-NEXT: v_mov_b32_e32 v1, s8 3109; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3110; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 3111; GFX1164-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1] 3112; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 3113; GFX1164-NEXT: buffer_gl0_inv 3114; GFX1164-NEXT: .LBB12_2: 3115; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] 3116; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 3117; GFX1164-NEXT: v_mad_u64_u32 v[3:4], null, s2, v2, 0 3118; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 3119; GFX1164-NEXT: v_readfirstlane_b32 s4, v1 3120; GFX1164-NEXT: s_waitcnt_depctr 0xfff 3121; GFX1164-NEXT: v_mad_u64_u32 v[5:6], null, s3, v2, v[4:5] 3122; GFX1164-NEXT: v_sub_co_u32 v0, vcc, s2, v3 3123; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 3124; GFX1164-NEXT: s_mov_b32 s2, -1 3125; GFX1164-NEXT: v_mov_b32_e32 v1, v5 3126; GFX1164-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s4, v1, vcc 3127; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 3128; GFX1164-NEXT: s_endpgm 3129; 3130; GFX1132-LABEL: sub_i64_uniform: 3131; GFX1132: ; %bb.0: ; %entry 3132; GFX1132-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 3133; GFX1132-NEXT: s_mov_b32 s5, exec_lo 3134; GFX1132-NEXT: s_mov_b32 s4, exec_lo 3135; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0 3136; GFX1132-NEXT: ; implicit-def: 
$vgpr0_vgpr1 3137; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 3138; GFX1132-NEXT: s_cbranch_execz .LBB12_2 3139; GFX1132-NEXT: ; %bb.1: 3140; GFX1132-NEXT: s_bcnt1_i32_b32 s5, s5 3141; GFX1132-NEXT: v_mov_b32_e32 v3, 0 3142; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 3143; GFX1132-NEXT: s_mul_i32 s6, s3, s5 3144; GFX1132-NEXT: s_mul_hi_u32 s7, s2, s5 3145; GFX1132-NEXT: s_mul_i32 s5, s2, s5 3146; GFX1132-NEXT: s_add_i32 s7, s7, s6 3147; GFX1132-NEXT: v_mov_b32_e32 v0, s5 3148; GFX1132-NEXT: v_mov_b32_e32 v1, s7 3149; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3150; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 3151; GFX1132-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1] 3152; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 3153; GFX1132-NEXT: buffer_gl0_inv 3154; GFX1132-NEXT: .LBB12_2: 3155; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s4 3156; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 3157; GFX1132-NEXT: v_mad_u64_u32 v[3:4], null, s2, v2, 0 3158; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 3159; GFX1132-NEXT: v_readfirstlane_b32 s4, v1 3160; GFX1132-NEXT: v_mad_u64_u32 v[5:6], null, s3, v2, v[4:5] 3161; GFX1132-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v3 3162; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 3163; GFX1132-NEXT: s_mov_b32 s2, -1 3164; GFX1132-NEXT: v_mov_b32_e32 v1, v5 3165; GFX1132-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s4, v1, vcc_lo 3166; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 3167; GFX1132-NEXT: s_endpgm 3168entry: 3169 %old = atomicrmw sub i64 addrspace(3)* @local_var64, i64 %subitive acq_rel 3170 store i64 %old, i64 addrspace(1)* %out 3171 ret void 3172} 3173 3174define amdgpu_kernel void @sub_i64_varying(i64 addrspace(1)* %out) { 3175; 3176; 3177; GFX7LESS-LABEL: sub_i64_varying: 3178; GFX7LESS: ; %bb.0: ; %entry 3179; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 3180; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 3181; GFX7LESS-NEXT: s_mov_b32 m0, -1 3182; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3183; GFX7LESS-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] 3184; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3185; 
GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 3186; GFX7LESS-NEXT: s_mov_b32 s2, -1 3187; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 3188; GFX7LESS-NEXT: s_endpgm 3189; 3190; GFX8-LABEL: sub_i64_varying: 3191; GFX8: ; %bb.0: ; %entry 3192; GFX8-NEXT: v_mov_b32_e32 v1, 0 3193; GFX8-NEXT: s_mov_b32 m0, -1 3194; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3195; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3196; GFX8-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] 3197; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3198; GFX8-NEXT: s_mov_b32 s3, 0xf000 3199; GFX8-NEXT: s_mov_b32 s2, -1 3200; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 3201; GFX8-NEXT: s_endpgm 3202; 3203; GFX9-LABEL: sub_i64_varying: 3204; GFX9: ; %bb.0: ; %entry 3205; GFX9-NEXT: v_mov_b32_e32 v1, 0 3206; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3207; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3208; GFX9-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] 3209; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3210; GFX9-NEXT: s_mov_b32 s3, 0xf000 3211; GFX9-NEXT: s_mov_b32 s2, -1 3212; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 3213; GFX9-NEXT: s_endpgm 3214; 3215; GFX10-LABEL: sub_i64_varying: 3216; GFX10: ; %bb.0: ; %entry 3217; GFX10-NEXT: v_mov_b32_e32 v1, 0 3218; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3219; GFX10-NEXT: s_mov_b32 s3, 0x31016000 3220; GFX10-NEXT: s_mov_b32 s2, -1 3221; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3222; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 3223; GFX10-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] 3224; GFX10-NEXT: s_waitcnt lgkmcnt(0) 3225; GFX10-NEXT: buffer_gl0_inv 3226; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 3227; GFX10-NEXT: s_endpgm 3228; 3229; GFX11-LABEL: sub_i64_varying: 3230; GFX11: ; %bb.0: ; %entry 3231; GFX11-NEXT: v_mov_b32_e32 v1, 0 3232; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 3233; GFX11-NEXT: s_mov_b32 s3, 0x31016000 3234; GFX11-NEXT: s_mov_b32 s2, -1 3235; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3236; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 3237; GFX11-NEXT: 
ds_sub_rtn_u64 v[0:1], v1, v[0:1] 3238; GFX11-NEXT: s_waitcnt lgkmcnt(0) 3239; GFX11-NEXT: buffer_gl0_inv 3240; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 3241; GFX11-NEXT: s_endpgm 3242entry: 3243 %lane = call i32 @llvm.amdgcn.workitem.id.x() 3244 %zext = zext i32 %lane to i64 3245 %old = atomicrmw sub i64 addrspace(3)* @local_var64, i64 %zext acq_rel 3246 store i64 %old, i64 addrspace(1)* %out 3247 ret void 3248} 3249 3250define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) { 3251; 3252; 3253; GFX7LESS-LABEL: and_i32_varying: 3254; GFX7LESS: ; %bb.0: ; %entry 3255; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 3256; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 3257; GFX7LESS-NEXT: s_mov_b32 m0, -1 3258; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3259; GFX7LESS-NEXT: ds_and_rtn_b32 v0, v1, v0 3260; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3261; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 3262; GFX7LESS-NEXT: s_mov_b32 s2, -1 3263; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 3264; GFX7LESS-NEXT: s_endpgm 3265; 3266; GFX8-LABEL: and_i32_varying: 3267; GFX8: ; %bb.0: ; %entry 3268; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3269; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 3270; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 3271; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 3272; GFX8-NEXT: v_mov_b32_e32 v1, -1 3273; GFX8-NEXT: s_mov_b64 exec, s[2:3] 3274; GFX8-NEXT: v_mov_b32_e32 v2, v0 3275; GFX8-NEXT: s_not_b64 exec, exec 3276; GFX8-NEXT: v_mov_b32_e32 v2, -1 3277; GFX8-NEXT: s_not_b64 exec, exec 3278; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 3279; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3280; GFX8-NEXT: s_nop 1 3281; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 3282; GFX8-NEXT: s_nop 1 3283; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 3284; GFX8-NEXT: s_nop 1 3285; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 3286; GFX8-NEXT: s_nop 1 3287; 
GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 3288; GFX8-NEXT: s_nop 1 3289; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 3290; GFX8-NEXT: v_readlane_b32 s4, v2, 63 3291; GFX8-NEXT: s_nop 0 3292; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 3293; GFX8-NEXT: s_mov_b64 exec, s[2:3] 3294; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 3295; GFX8-NEXT: ; implicit-def: $vgpr0 3296; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 3297; GFX8-NEXT: s_cbranch_execz .LBB14_2 3298; GFX8-NEXT: ; %bb.1: 3299; GFX8-NEXT: v_mov_b32_e32 v0, 0 3300; GFX8-NEXT: v_mov_b32_e32 v3, s4 3301; GFX8-NEXT: s_mov_b32 m0, -1 3302; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3303; GFX8-NEXT: ds_and_rtn_b32 v0, v0, v3 3304; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3305; GFX8-NEXT: .LBB14_2: 3306; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 3307; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3308; GFX8-NEXT: v_readfirstlane_b32 s2, v0 3309; GFX8-NEXT: v_mov_b32_e32 v0, v1 3310; GFX8-NEXT: v_and_b32_e32 v0, s2, v0 3311; GFX8-NEXT: s_mov_b32 s3, 0xf000 3312; GFX8-NEXT: s_mov_b32 s2, -1 3313; GFX8-NEXT: s_nop 0 3314; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 3315; GFX8-NEXT: s_endpgm 3316; 3317; GFX9-LABEL: and_i32_varying: 3318; GFX9: ; %bb.0: ; %entry 3319; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3320; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 3321; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 3322; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 3323; GFX9-NEXT: v_mov_b32_e32 v1, -1 3324; GFX9-NEXT: s_mov_b64 exec, s[2:3] 3325; GFX9-NEXT: v_mov_b32_e32 v2, v0 3326; GFX9-NEXT: s_not_b64 exec, exec 3327; GFX9-NEXT: v_mov_b32_e32 v2, -1 3328; GFX9-NEXT: s_not_b64 exec, exec 3329; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 3330; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3331; GFX9-NEXT: s_nop 1 3332; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 3333; GFX9-NEXT: s_nop 1 3334; GFX9-NEXT: v_and_b32_dpp v2, 
v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 3335; GFX9-NEXT: s_nop 1 3336; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 3337; GFX9-NEXT: s_nop 1 3338; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 3339; GFX9-NEXT: s_nop 1 3340; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 3341; GFX9-NEXT: v_readlane_b32 s4, v2, 63 3342; GFX9-NEXT: s_nop 0 3343; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 3344; GFX9-NEXT: s_mov_b64 exec, s[2:3] 3345; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 3346; GFX9-NEXT: ; implicit-def: $vgpr0 3347; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 3348; GFX9-NEXT: s_cbranch_execz .LBB14_2 3349; GFX9-NEXT: ; %bb.1: 3350; GFX9-NEXT: v_mov_b32_e32 v0, 0 3351; GFX9-NEXT: v_mov_b32_e32 v3, s4 3352; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3353; GFX9-NEXT: ds_and_rtn_b32 v0, v0, v3 3354; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3355; GFX9-NEXT: .LBB14_2: 3356; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 3357; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3358; GFX9-NEXT: v_readfirstlane_b32 s2, v0 3359; GFX9-NEXT: v_mov_b32_e32 v0, v1 3360; GFX9-NEXT: v_and_b32_e32 v0, s2, v0 3361; GFX9-NEXT: s_mov_b32 s3, 0xf000 3362; GFX9-NEXT: s_mov_b32 s2, -1 3363; GFX9-NEXT: s_nop 0 3364; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 3365; GFX9-NEXT: s_endpgm 3366; 3367; GFX1064-LABEL: and_i32_varying: 3368; GFX1064: ; %bb.0: ; %entry 3369; GFX1064-NEXT: v_mov_b32_e32 v1, v0 3370; GFX1064-NEXT: s_not_b64 exec, exec 3371; GFX1064-NEXT: v_mov_b32_e32 v1, -1 3372; GFX1064-NEXT: s_not_b64 exec, exec 3373; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 3374; GFX1064-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 3375; GFX1064-NEXT: v_mov_b32_e32 v3, -1 3376; GFX1064-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 3377; GFX1064-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 3378; GFX1064-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf 
bank_mask:0xf 3379; GFX1064-NEXT: v_mov_b32_e32 v2, v1 3380; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 3381; GFX1064-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 3382; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 3383; GFX1064-NEXT: v_mov_b32_e32 v2, s4 3384; GFX1064-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 3385; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 3386; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 3387; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 3388; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3389; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 3390; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 3391; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 3392; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 3393; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3394; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 3395; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 3396; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 3397; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 3398; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 3399; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 3400; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 3401; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 3402; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 3403; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3404; GFX1064-NEXT: s_mov_b32 s2, -1 3405; GFX1064-NEXT: ; implicit-def: $vgpr0 3406; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 3407; GFX1064-NEXT: s_cbranch_execz .LBB14_2 3408; GFX1064-NEXT: ; %bb.1: 3409; GFX1064-NEXT: v_mov_b32_e32 v0, 0 3410; GFX1064-NEXT: v_mov_b32_e32 v4, s7 3411; GFX1064-NEXT: s_mov_b32 s3, s7 3412; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3413; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 3414; GFX1064-NEXT: ds_and_rtn_b32 v0, v0, v4 3415; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3416; GFX1064-NEXT: buffer_gl0_inv 3417; GFX1064-NEXT: .LBB14_2: 3418; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 3419; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 3420; GFX1064-NEXT: 
v_readfirstlane_b32 s3, v0 3421; GFX1064-NEXT: v_mov_b32_e32 v0, v3 3422; GFX1064-NEXT: v_and_b32_e32 v0, s3, v0 3423; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 3424; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3425; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 3426; GFX1064-NEXT: s_endpgm 3427; 3428; GFX1032-LABEL: and_i32_varying: 3429; GFX1032: ; %bb.0: ; %entry 3430; GFX1032-NEXT: v_mov_b32_e32 v1, v0 3431; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 3432; GFX1032-NEXT: v_mov_b32_e32 v1, -1 3433; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 3434; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3435; GFX1032-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 3436; GFX1032-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 3437; GFX1032-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 3438; GFX1032-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 3439; GFX1032-NEXT: v_mov_b32_e32 v2, v1 3440; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 3441; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3442; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3443; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3444; GFX1032-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 3445; GFX1032-NEXT: v_mov_b32_e32 v3, -1 3446; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 3447; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 3448; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 3449; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3450; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3451; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3452; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 3453; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3454; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 3455; GFX1032-NEXT: s_mov_b32 s2, -1 3456; GFX1032-NEXT: ; implicit-def: $vgpr0 3457; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 3458; GFX1032-NEXT: s_cbranch_execz .LBB14_2 3459; GFX1032-NEXT: ; %bb.1: 3460; GFX1032-NEXT: v_mov_b32_e32 v0, 0 3461; GFX1032-NEXT: 
v_mov_b32_e32 v4, s4 3462; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3463; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 3464; GFX1032-NEXT: ds_and_rtn_b32 v0, v0, v4 3465; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3466; GFX1032-NEXT: buffer_gl0_inv 3467; GFX1032-NEXT: .LBB14_2: 3468; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 3469; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 3470; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 3471; GFX1032-NEXT: v_mov_b32_e32 v0, v3 3472; GFX1032-NEXT: v_and_b32_e32 v0, s3, v0 3473; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 3474; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3475; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 3476; GFX1032-NEXT: s_endpgm 3477; 3478; GFX1164-LABEL: and_i32_varying: 3479; GFX1164: ; %bb.0: ; %entry 3480; GFX1164-NEXT: v_mov_b32_e32 v1, v0 3481; GFX1164-NEXT: s_not_b64 exec, exec 3482; GFX1164-NEXT: v_mov_b32_e32 v1, -1 3483; GFX1164-NEXT: s_not_b64 exec, exec 3484; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 3485; GFX1164-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 3486; GFX1164-NEXT: v_mov_b32_e32 v3, -1 3487; GFX1164-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 3488; GFX1164-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 3489; GFX1164-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 3490; GFX1164-NEXT: v_mov_b32_e32 v2, v1 3491; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1 3492; GFX1164-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 3493; GFX1164-NEXT: v_readlane_b32 s4, v1, 31 3494; GFX1164-NEXT: v_mov_b32_e32 v2, s4 3495; GFX1164-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 3496; GFX1164-NEXT: v_readlane_b32 s4, v1, 15 3497; GFX1164-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 3498; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 3499; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 3500; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 3501; GFX1164-NEXT: v_readlane_b32 s5, 
v1, 31 3502; GFX1164-NEXT: v_writelane_b32 v3, s4, 16 3503; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 3504; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3505; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 3506; GFX1164-NEXT: v_readlane_b32 s7, v1, 63 3507; GFX1164-NEXT: v_readlane_b32 s6, v1, 47 3508; GFX1164-NEXT: v_writelane_b32 v3, s5, 32 3509; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 3510; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 3511; GFX1164-NEXT: s_or_saveexec_b64 s[4:5], -1 3512; GFX1164-NEXT: v_writelane_b32 v3, s6, 48 3513; GFX1164-NEXT: s_mov_b64 exec, s[4:5] 3514; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3515; GFX1164-NEXT: s_mov_b32 s2, -1 3516; GFX1164-NEXT: ; implicit-def: $vgpr0 3517; GFX1164-NEXT: s_and_saveexec_b64 s[4:5], vcc 3518; GFX1164-NEXT: s_cbranch_execz .LBB14_2 3519; GFX1164-NEXT: ; %bb.1: 3520; GFX1164-NEXT: v_mov_b32_e32 v0, 0 3521; GFX1164-NEXT: v_mov_b32_e32 v4, s7 3522; GFX1164-NEXT: s_mov_b32 s3, s7 3523; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3524; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 3525; GFX1164-NEXT: ds_and_rtn_b32 v0, v0, v4 3526; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 3527; GFX1164-NEXT: buffer_gl0_inv 3528; GFX1164-NEXT: .LBB14_2: 3529; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] 3530; GFX1164-NEXT: v_readfirstlane_b32 s3, v0 3531; GFX1164-NEXT: v_mov_b32_e32 v0, v3 3532; GFX1164-NEXT: v_and_b32_e32 v0, s3, v0 3533; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 3534; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 3535; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 3536; GFX1164-NEXT: s_endpgm 3537; 3538; GFX1132-LABEL: and_i32_varying: 3539; GFX1132: ; %bb.0: ; %entry 3540; GFX1132-NEXT: v_mov_b32_e32 v1, v0 3541; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo 3542; GFX1132-NEXT: v_mov_b32_e32 v1, -1 3543; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo 3544; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 3545; GFX1132-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 3546; GFX1132-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf 
bank_mask:0xf 3547; GFX1132-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 3548; GFX1132-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 3549; GFX1132-NEXT: v_mov_b32_e32 v2, v1 3550; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1 3551; GFX1132-NEXT: s_mov_b32 exec_lo, s2 3552; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 3553; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 3554; GFX1132-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 3555; GFX1132-NEXT: v_mov_b32_e32 v3, -1 3556; GFX1132-NEXT: v_readlane_b32 s3, v1, 15 3557; GFX1132-NEXT: v_readlane_b32 s4, v1, 31 3558; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 3559; GFX1132-NEXT: s_mov_b32 exec_lo, s2 3560; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3561; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 3562; GFX1132-NEXT: v_writelane_b32 v3, s3, 16 3563; GFX1132-NEXT: s_mov_b32 exec_lo, s2 3564; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 3565; GFX1132-NEXT: s_mov_b32 s2, -1 3566; GFX1132-NEXT: ; implicit-def: $vgpr0 3567; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo 3568; GFX1132-NEXT: s_cbranch_execz .LBB14_2 3569; GFX1132-NEXT: ; %bb.1: 3570; GFX1132-NEXT: v_mov_b32_e32 v0, 0 3571; GFX1132-NEXT: v_mov_b32_e32 v4, s4 3572; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3573; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 3574; GFX1132-NEXT: ds_and_rtn_b32 v0, v0, v4 3575; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 3576; GFX1132-NEXT: buffer_gl0_inv 3577; GFX1132-NEXT: .LBB14_2: 3578; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 3579; GFX1132-NEXT: v_readfirstlane_b32 s3, v0 3580; GFX1132-NEXT: v_mov_b32_e32 v0, v3 3581; GFX1132-NEXT: v_and_b32_e32 v0, s3, v0 3582; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 3583; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 3584; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 3585; GFX1132-NEXT: s_endpgm 3586entry: 3587 %lane = call i32 @llvm.amdgcn.workitem.id.x() 3588 %old = atomicrmw and i32 addrspace(3)* 
@local_var32, i32 %lane acq_rel 3589 store i32 %old, i32 addrspace(1)* %out 3590 ret void 3591} 3592 3593define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) { 3594; 3595; 3596; GFX7LESS-LABEL: or_i32_varying: 3597; GFX7LESS: ; %bb.0: ; %entry 3598; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 3599; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 3600; GFX7LESS-NEXT: s_mov_b32 m0, -1 3601; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3602; GFX7LESS-NEXT: ds_or_rtn_b32 v0, v1, v0 3603; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3604; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 3605; GFX7LESS-NEXT: s_mov_b32 s2, -1 3606; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 3607; GFX7LESS-NEXT: s_endpgm 3608; 3609; GFX8-LABEL: or_i32_varying: 3610; GFX8: ; %bb.0: ; %entry 3611; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3612; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 3613; GFX8-NEXT: v_mov_b32_e32 v1, 0 3614; GFX8-NEXT: s_mov_b64 exec, s[2:3] 3615; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 3616; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 3617; GFX8-NEXT: v_mov_b32_e32 v2, v0 3618; GFX8-NEXT: s_not_b64 exec, exec 3619; GFX8-NEXT: v_mov_b32_e32 v2, 0 3620; GFX8-NEXT: s_not_b64 exec, exec 3621; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 3622; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 3623; GFX8-NEXT: s_nop 1 3624; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 3625; GFX8-NEXT: s_nop 1 3626; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 3627; GFX8-NEXT: s_nop 1 3628; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 3629; GFX8-NEXT: s_nop 1 3630; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 3631; GFX8-NEXT: s_nop 1 3632; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 3633; GFX8-NEXT: v_readlane_b32 s4, v2, 63 3634; GFX8-NEXT: s_nop 0 3635; GFX8-NEXT: v_mov_b32_dpp v1, v2 
wave_shr:1 row_mask:0xf bank_mask:0xf 3636; GFX8-NEXT: s_mov_b64 exec, s[2:3] 3637; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 3638; GFX8-NEXT: ; implicit-def: $vgpr0 3639; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 3640; GFX8-NEXT: s_cbranch_execz .LBB15_2 3641; GFX8-NEXT: ; %bb.1: 3642; GFX8-NEXT: v_mov_b32_e32 v0, 0 3643; GFX8-NEXT: v_mov_b32_e32 v3, s4 3644; GFX8-NEXT: s_mov_b32 m0, -1 3645; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3646; GFX8-NEXT: ds_or_rtn_b32 v0, v0, v3 3647; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3648; GFX8-NEXT: .LBB15_2: 3649; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 3650; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3651; GFX8-NEXT: v_readfirstlane_b32 s2, v0 3652; GFX8-NEXT: v_mov_b32_e32 v0, v1 3653; GFX8-NEXT: v_or_b32_e32 v0, s2, v0 3654; GFX8-NEXT: s_mov_b32 s3, 0xf000 3655; GFX8-NEXT: s_mov_b32 s2, -1 3656; GFX8-NEXT: s_nop 0 3657; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 3658; GFX8-NEXT: s_endpgm 3659; 3660; GFX9-LABEL: or_i32_varying: 3661; GFX9: ; %bb.0: ; %entry 3662; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3663; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 3664; GFX9-NEXT: v_mov_b32_e32 v1, 0 3665; GFX9-NEXT: s_mov_b64 exec, s[2:3] 3666; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 3667; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 3668; GFX9-NEXT: v_mov_b32_e32 v2, v0 3669; GFX9-NEXT: s_not_b64 exec, exec 3670; GFX9-NEXT: v_mov_b32_e32 v2, 0 3671; GFX9-NEXT: s_not_b64 exec, exec 3672; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 3673; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 3674; GFX9-NEXT: s_nop 1 3675; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 3676; GFX9-NEXT: s_nop 1 3677; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 3678; GFX9-NEXT: s_nop 1 3679; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 3680; GFX9-NEXT: s_nop 1 3681; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa 
bank_mask:0xf 3682; GFX9-NEXT: s_nop 1 3683; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 3684; GFX9-NEXT: v_readlane_b32 s4, v2, 63 3685; GFX9-NEXT: s_nop 0 3686; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 3687; GFX9-NEXT: s_mov_b64 exec, s[2:3] 3688; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 3689; GFX9-NEXT: ; implicit-def: $vgpr0 3690; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 3691; GFX9-NEXT: s_cbranch_execz .LBB15_2 3692; GFX9-NEXT: ; %bb.1: 3693; GFX9-NEXT: v_mov_b32_e32 v0, 0 3694; GFX9-NEXT: v_mov_b32_e32 v3, s4 3695; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3696; GFX9-NEXT: ds_or_rtn_b32 v0, v0, v3 3697; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3698; GFX9-NEXT: .LBB15_2: 3699; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 3700; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3701; GFX9-NEXT: v_readfirstlane_b32 s2, v0 3702; GFX9-NEXT: v_mov_b32_e32 v0, v1 3703; GFX9-NEXT: v_or_b32_e32 v0, s2, v0 3704; GFX9-NEXT: s_mov_b32 s3, 0xf000 3705; GFX9-NEXT: s_mov_b32 s2, -1 3706; GFX9-NEXT: s_nop 0 3707; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 3708; GFX9-NEXT: s_endpgm 3709; 3710; GFX1064-LABEL: or_i32_varying: 3711; GFX1064: ; %bb.0: ; %entry 3712; GFX1064-NEXT: v_mov_b32_e32 v1, v0 3713; GFX1064-NEXT: s_not_b64 exec, exec 3714; GFX1064-NEXT: v_mov_b32_e32 v1, 0 3715; GFX1064-NEXT: s_not_b64 exec, exec 3716; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 3717; GFX1064-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 3718; GFX1064-NEXT: v_mov_b32_e32 v3, 0 3719; GFX1064-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 3720; GFX1064-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 3721; GFX1064-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 3722; GFX1064-NEXT: v_mov_b32_e32 v2, v1 3723; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 3724; GFX1064-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 
3725; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 3726; GFX1064-NEXT: v_mov_b32_e32 v2, s4 3727; GFX1064-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 3728; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 3729; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 3730; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 3731; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3732; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 3733; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 3734; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 3735; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 3736; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3737; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 3738; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 3739; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 3740; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 3741; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 3742; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 3743; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 3744; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 3745; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 3746; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3747; GFX1064-NEXT: s_mov_b32 s2, -1 3748; GFX1064-NEXT: ; implicit-def: $vgpr0 3749; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 3750; GFX1064-NEXT: s_cbranch_execz .LBB15_2 3751; GFX1064-NEXT: ; %bb.1: 3752; GFX1064-NEXT: v_mov_b32_e32 v0, 0 3753; GFX1064-NEXT: v_mov_b32_e32 v4, s7 3754; GFX1064-NEXT: s_mov_b32 s3, s7 3755; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3756; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 3757; GFX1064-NEXT: ds_or_rtn_b32 v0, v0, v4 3758; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3759; GFX1064-NEXT: buffer_gl0_inv 3760; GFX1064-NEXT: .LBB15_2: 3761; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 3762; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 3763; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 3764; GFX1064-NEXT: v_mov_b32_e32 v0, v3 3765; GFX1064-NEXT: v_or_b32_e32 v0, s3, v0 3766; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 3767; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3768; 
GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 3769; GFX1064-NEXT: s_endpgm 3770; 3771; GFX1032-LABEL: or_i32_varying: 3772; GFX1032: ; %bb.0: ; %entry 3773; GFX1032-NEXT: v_mov_b32_e32 v1, v0 3774; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 3775; GFX1032-NEXT: v_mov_b32_e32 v1, 0 3776; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 3777; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3778; GFX1032-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 3779; GFX1032-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 3780; GFX1032-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 3781; GFX1032-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 3782; GFX1032-NEXT: v_mov_b32_e32 v2, v1 3783; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 3784; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3785; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3786; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3787; GFX1032-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 3788; GFX1032-NEXT: v_mov_b32_e32 v3, 0 3789; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 3790; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 3791; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 3792; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3793; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3794; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3795; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 3796; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3797; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 3798; GFX1032-NEXT: s_mov_b32 s2, -1 3799; GFX1032-NEXT: ; implicit-def: $vgpr0 3800; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 3801; GFX1032-NEXT: s_cbranch_execz .LBB15_2 3802; GFX1032-NEXT: ; %bb.1: 3803; GFX1032-NEXT: v_mov_b32_e32 v0, 0 3804; GFX1032-NEXT: v_mov_b32_e32 v4, s4 3805; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3806; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 3807; GFX1032-NEXT: ds_or_rtn_b32 v0, v0, v4 
3808; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3809; GFX1032-NEXT: buffer_gl0_inv 3810; GFX1032-NEXT: .LBB15_2: 3811; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 3812; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 3813; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 3814; GFX1032-NEXT: v_mov_b32_e32 v0, v3 3815; GFX1032-NEXT: v_or_b32_e32 v0, s3, v0 3816; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 3817; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3818; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 3819; GFX1032-NEXT: s_endpgm 3820; 3821; GFX1164-LABEL: or_i32_varying: 3822; GFX1164: ; %bb.0: ; %entry 3823; GFX1164-NEXT: v_mov_b32_e32 v1, v0 3824; GFX1164-NEXT: s_not_b64 exec, exec 3825; GFX1164-NEXT: v_mov_b32_e32 v1, 0 3826; GFX1164-NEXT: s_not_b64 exec, exec 3827; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 3828; GFX1164-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 3829; GFX1164-NEXT: v_mov_b32_e32 v3, 0 3830; GFX1164-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 3831; GFX1164-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 3832; GFX1164-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 3833; GFX1164-NEXT: v_mov_b32_e32 v2, v1 3834; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1 3835; GFX1164-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 3836; GFX1164-NEXT: v_readlane_b32 s4, v1, 31 3837; GFX1164-NEXT: v_mov_b32_e32 v2, s4 3838; GFX1164-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 3839; GFX1164-NEXT: v_readlane_b32 s4, v1, 15 3840; GFX1164-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 3841; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 3842; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 3843; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 3844; GFX1164-NEXT: v_readlane_b32 s5, v1, 31 3845; GFX1164-NEXT: v_writelane_b32 v3, s4, 16 3846; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 3847; GFX1164-NEXT: 
v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3848; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 3849; GFX1164-NEXT: v_readlane_b32 s7, v1, 63 3850; GFX1164-NEXT: v_readlane_b32 s6, v1, 47 3851; GFX1164-NEXT: v_writelane_b32 v3, s5, 32 3852; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 3853; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 3854; GFX1164-NEXT: s_or_saveexec_b64 s[4:5], -1 3855; GFX1164-NEXT: v_writelane_b32 v3, s6, 48 3856; GFX1164-NEXT: s_mov_b64 exec, s[4:5] 3857; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3858; GFX1164-NEXT: s_mov_b32 s2, -1 3859; GFX1164-NEXT: ; implicit-def: $vgpr0 3860; GFX1164-NEXT: s_and_saveexec_b64 s[4:5], vcc 3861; GFX1164-NEXT: s_cbranch_execz .LBB15_2 3862; GFX1164-NEXT: ; %bb.1: 3863; GFX1164-NEXT: v_mov_b32_e32 v0, 0 3864; GFX1164-NEXT: v_mov_b32_e32 v4, s7 3865; GFX1164-NEXT: s_mov_b32 s3, s7 3866; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3867; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 3868; GFX1164-NEXT: ds_or_rtn_b32 v0, v0, v4 3869; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 3870; GFX1164-NEXT: buffer_gl0_inv 3871; GFX1164-NEXT: .LBB15_2: 3872; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] 3873; GFX1164-NEXT: v_readfirstlane_b32 s3, v0 3874; GFX1164-NEXT: v_mov_b32_e32 v0, v3 3875; GFX1164-NEXT: v_or_b32_e32 v0, s3, v0 3876; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 3877; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 3878; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 3879; GFX1164-NEXT: s_endpgm 3880; 3881; GFX1132-LABEL: or_i32_varying: 3882; GFX1132: ; %bb.0: ; %entry 3883; GFX1132-NEXT: v_mov_b32_e32 v1, v0 3884; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo 3885; GFX1132-NEXT: v_mov_b32_e32 v1, 0 3886; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo 3887; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 3888; GFX1132-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 3889; GFX1132-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 3890; GFX1132-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 
bound_ctrl:1 3891; GFX1132-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 3892; GFX1132-NEXT: v_mov_b32_e32 v2, v1 3893; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1 3894; GFX1132-NEXT: s_mov_b32 exec_lo, s2 3895; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 3896; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 3897; GFX1132-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 3898; GFX1132-NEXT: v_mov_b32_e32 v3, 0 3899; GFX1132-NEXT: v_readlane_b32 s3, v1, 15 3900; GFX1132-NEXT: v_readlane_b32 s4, v1, 31 3901; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 3902; GFX1132-NEXT: s_mov_b32 exec_lo, s2 3903; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3904; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 3905; GFX1132-NEXT: v_writelane_b32 v3, s3, 16 3906; GFX1132-NEXT: s_mov_b32 exec_lo, s2 3907; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 3908; GFX1132-NEXT: s_mov_b32 s2, -1 3909; GFX1132-NEXT: ; implicit-def: $vgpr0 3910; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo 3911; GFX1132-NEXT: s_cbranch_execz .LBB15_2 3912; GFX1132-NEXT: ; %bb.1: 3913; GFX1132-NEXT: v_mov_b32_e32 v0, 0 3914; GFX1132-NEXT: v_mov_b32_e32 v4, s4 3915; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3916; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 3917; GFX1132-NEXT: ds_or_rtn_b32 v0, v0, v4 3918; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 3919; GFX1132-NEXT: buffer_gl0_inv 3920; GFX1132-NEXT: .LBB15_2: 3921; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 3922; GFX1132-NEXT: v_readfirstlane_b32 s3, v0 3923; GFX1132-NEXT: v_mov_b32_e32 v0, v3 3924; GFX1132-NEXT: v_or_b32_e32 v0, s3, v0 3925; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 3926; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 3927; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 3928; GFX1132-NEXT: s_endpgm 3929entry: 3930 %lane = call i32 @llvm.amdgcn.workitem.id.x() 3931 %old = atomicrmw or i32 addrspace(3)* @local_var32, i32 %lane acq_rel 3932 store i32 %old, i32 addrspace(1)* %out 3933 ret 
void
}

; xor_i32_varying: every work-item XORs its own @llvm.amdgcn.workitem.id.x
; value (a per-work-item operand, hence "varying") into the LDS variable
; @local_var32 with acq_rel ordering and stores the returned old value to %out.
;
; What the expectations below show:
;  - the GFX7LESS output keeps a plain ds_xor_rtn_b32 on the incoming lane
;    value (no mbcnt, DPP or exec-mask branch appears in it);
;  - every other configuration combines the lane values with v_xor_b32_dpp
;    steps, issues a single ds_xor_rtn_b32 inside an s_and_saveexec /
;    s_cbranch_execz region, and then forms the per-lane result with
;    v_readfirstlane_b32 followed by v_xor_b32.
;
; The assertion lines are autogenerated by utils/update_llc_test_checks.py
; (see the NOTE at the top of the file); regenerate them rather than editing
; them by hand.
define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) {
;
;
; GFX7LESS-LABEL: xor_i32_varying:
; GFX7LESS:       ; %bb.0: ; %entry
; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
; GFX7LESS-NEXT:    s_mov_b32 m0, -1
; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT:    ds_xor_rtn_b32 v0, v1, v0
; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT:    s_mov_b32 s2, -1
; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX7LESS-NEXT:    s_endpgm
;
; GFX8-LABEL: xor_i32_varying:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
; GFX8-NEXT:    v_mov_b32_e32 v1, 0
; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
; GFX8-NEXT:    v_mov_b32_e32 v2, v0
; GFX8-NEXT:    s_not_b64 exec, exec
; GFX8-NEXT:    v_mov_b32_e32 v2, 0
; GFX8-NEXT:    s_not_b64 exec, exec
; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
; GFX8-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX8-NEXT:    s_nop 1
; GFX8-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX8-NEXT:    s_nop 1
; GFX8-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX8-NEXT:    s_nop 1
; GFX8-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX8-NEXT:    s_nop 1
; GFX8-NEXT:    v_xor_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
; GFX8-NEXT:    s_nop 1
; GFX8-NEXT:    v_xor_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
; GFX8-NEXT:    v_readlane_b32 s4, v2, 63
; GFX8-NEXT:    s_nop 0
; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
; GFX8-NEXT:    ; implicit-def: $vgpr0
; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
; GFX8-NEXT:    s_cbranch_execz .LBB16_2
; GFX8-NEXT:  ; %bb.1:
; GFX8-NEXT:    v_mov_b32_e32 v0, 0
; GFX8-NEXT:    v_mov_b32_e32 v3, s4
; GFX8-NEXT:    s_mov_b32 m0, -1
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    ds_xor_rtn_b32 v0, v0, v3
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:  .LBB16_2:
; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
; GFX8-NEXT:    v_mov_b32_e32 v0, v1
; GFX8-NEXT:    v_xor_b32_e32 v0, s2, v0
; GFX8-NEXT:    s_mov_b32 s3, 0xf000
; GFX8-NEXT:    s_mov_b32 s2, -1
; GFX8-NEXT:    s_nop 0
; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: xor_i32_varying:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
; GFX9-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
; GFX9-NEXT:    v_mov_b32_e32 v2, v0
; GFX9-NEXT:    s_not_b64 exec, exec
; GFX9-NEXT:    v_mov_b32_e32 v2, 0
; GFX9-NEXT:    s_not_b64 exec, exec
; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
; GFX9-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX9-NEXT:    s_nop 1
; GFX9-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX9-NEXT:    s_nop 1
; GFX9-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX9-NEXT:    s_nop 1
; GFX9-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX9-NEXT:    s_nop 1
; GFX9-NEXT:    v_xor_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
; GFX9-NEXT:    s_nop 1
; GFX9-NEXT:    v_xor_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
; GFX9-NEXT:    v_readlane_b32 s4, v2, 63
; GFX9-NEXT:    s_nop 0
; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
; GFX9-NEXT:    ; implicit-def: $vgpr0
; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT:    s_cbranch_execz .LBB16_2
; GFX9-NEXT:  ; %bb.1:
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    v_mov_b32_e32 v3, s4
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    ds_xor_rtn_b32 v0, v0, v3
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:  .LBB16_2:
; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
; GFX9-NEXT:    v_mov_b32_e32 v0, v1
; GFX9-NEXT:    v_xor_b32_e32 v0, s2, v0
; GFX9-NEXT:    s_mov_b32 s3, 0xf000
; GFX9-NEXT:    s_mov_b32 s2, -1
; GFX9-NEXT:    s_nop 0
; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT:    s_endpgm
;
; GFX1064-LABEL: xor_i32_varying:
; GFX1064:       ; %bb.0: ; %entry
; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
; GFX1064-NEXT:    s_not_b64 exec, exec
; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
; GFX1064-NEXT:    s_not_b64 exec, exec
; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
; GFX1064-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
; GFX1064-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
; GFX1064-NEXT:    v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1064-NEXT:    v_readlane_b32 s4, v1, 31
; GFX1064-NEXT:    v_mov_b32_e32 v2, s4
; GFX1064-NEXT:    v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
; GFX1064-NEXT:    v_readlane_b32 s4, v1, 15
; GFX1064-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
; GFX1064-NEXT:    v_readlane_b32 s5, v1, 31
; GFX1064-NEXT:    v_writelane_b32 v3, s4, 16
; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
; GFX1064-NEXT:    v_readlane_b32 s7, v1, 63
; GFX1064-NEXT:    v_readlane_b32 s6, v1, 47
; GFX1064-NEXT:    v_writelane_b32 v3, s5, 32
; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
; GFX1064-NEXT:    v_writelane_b32 v3, s6, 48
; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT:    s_mov_b32 s2, -1
; GFX1064-NEXT:    ; implicit-def: $vgpr0
; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
; GFX1064-NEXT:    s_cbranch_execz .LBB16_2
; GFX1064-NEXT:  ; %bb.1:
; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
; GFX1064-NEXT:    v_mov_b32_e32 v4, s7
; GFX1064-NEXT:    s_mov_b32 s3, s7
; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1064-NEXT:    ds_xor_rtn_b32 v0, v0, v4
; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1064-NEXT:    buffer_gl0_inv
; GFX1064-NEXT:  .LBB16_2:
; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
; GFX1064-NEXT:    v_mov_b32_e32 v0, v3
; GFX1064-NEXT:    v_xor_b32_e32 v0, s3, v0
; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX1064-NEXT:    s_endpgm
;
; GFX1032-LABEL: xor_i32_varying:
; GFX1032:       ; %bb.0: ; %entry
; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
; GFX1032-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
; GFX1032-NEXT:    v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
; GFX1032-NEXT:    v_readlane_b32 s3, v1, 15
; GFX1032-NEXT:    v_readlane_b32 s4, v1, 31
; GFX1032-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
; GFX1032-NEXT:    v_writelane_b32 v3, s3, 16
; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT:    s_mov_b32 s2, -1
; GFX1032-NEXT:    ; implicit-def: $vgpr0
; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
; GFX1032-NEXT:    s_cbranch_execz .LBB16_2
; GFX1032-NEXT:  ; %bb.1:
; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
; GFX1032-NEXT:    v_mov_b32_e32 v4, s4
; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1032-NEXT:    ds_xor_rtn_b32 v0, v0, v4
; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1032-NEXT:    buffer_gl0_inv
; GFX1032-NEXT:  .LBB16_2:
; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
; GFX1032-NEXT:    v_mov_b32_e32 v0, v3
; GFX1032-NEXT:    v_xor_b32_e32 v0, s3, v0
; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX1032-NEXT:    s_endpgm
;
; GFX1164-LABEL: xor_i32_varying:
; GFX1164:       ; %bb.0: ; %entry
; GFX1164-NEXT:    v_mov_b32_e32 v1, v0
; GFX1164-NEXT:    s_not_b64 exec, exec
; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
; GFX1164-NEXT:    s_not_b64 exec, exec
; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
; GFX1164-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164-NEXT:    v_mov_b32_e32 v3, 0
; GFX1164-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164-NEXT:    v_mov_b32_e32 v2, v1
; GFX1164-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
; GFX1164-NEXT:    v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1164-NEXT:    v_readlane_b32 s4, v1, 31
; GFX1164-NEXT:    v_mov_b32_e32 v2, s4
; GFX1164-NEXT:    v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
; GFX1164-NEXT:    v_readlane_b32 s4, v1, 15
; GFX1164-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
; GFX1164-NEXT:    v_readlane_b32 s5, v1, 31
; GFX1164-NEXT:    v_writelane_b32 v3, s4, 16
; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
; GFX1164-NEXT:    v_readlane_b32 s7, v1, 63
; GFX1164-NEXT:    v_readlane_b32 s6, v1, 47
; GFX1164-NEXT:    v_writelane_b32 v3, s5, 32
; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1164-NEXT:    s_or_saveexec_b64 s[4:5], -1
; GFX1164-NEXT:    v_writelane_b32 v3, s6, 48
; GFX1164-NEXT:    s_mov_b64 exec, s[4:5]
; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1164-NEXT:    s_mov_b32 s2, -1
; GFX1164-NEXT:    ; implicit-def: $vgpr0
; GFX1164-NEXT:    s_and_saveexec_b64 s[4:5], vcc
; GFX1164-NEXT:    s_cbranch_execz .LBB16_2
; GFX1164-NEXT:  ; %bb.1:
; GFX1164-NEXT:    v_mov_b32_e32 v0, 0
; GFX1164-NEXT:    v_mov_b32_e32 v4, s7
; GFX1164-NEXT:    s_mov_b32 s3, s7
; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1164-NEXT:    ds_xor_rtn_b32 v0, v0, v4
; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1164-NEXT:    buffer_gl0_inv
; GFX1164-NEXT:  .LBB16_2:
; GFX1164-NEXT:    s_or_b64 exec, exec, s[4:5]
; GFX1164-NEXT:    v_readfirstlane_b32 s3, v0
; GFX1164-NEXT:    v_mov_b32_e32 v0, v3
; GFX1164-NEXT:    v_xor_b32_e32 v0, s3, v0
; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1164-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
; GFX1164-NEXT:    s_endpgm
;
; GFX1132-LABEL: xor_i32_varying:
; GFX1132:       ; %bb.0: ; %entry
; GFX1132-NEXT:    v_mov_b32_e32 v1, v0
; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
; GFX1132-NEXT:    v_mov_b32_e32 v1, 0
; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
; GFX1132-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132-NEXT:    v_mov_b32_e32 v2, v1
; GFX1132-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
; GFX1132-NEXT:    v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1132-NEXT:    v_mov_b32_e32 v3, 0
; GFX1132-NEXT:    v_readlane_b32 s3, v1, 15
; GFX1132-NEXT:    v_readlane_b32 s4, v1, 31
; GFX1132-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
; GFX1132-NEXT:    v_writelane_b32 v3, s3, 16
; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1132-NEXT:    s_mov_b32 s2, -1
; GFX1132-NEXT:    ; implicit-def: $vgpr0
; GFX1132-NEXT:    s_and_saveexec_b32 s3, vcc_lo
; GFX1132-NEXT:    s_cbranch_execz .LBB16_2
; GFX1132-NEXT:  ; %bb.1:
; GFX1132-NEXT:    v_mov_b32_e32 v0, 0
; GFX1132-NEXT:    v_mov_b32_e32 v4, s4
; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1132-NEXT:    ds_xor_rtn_b32 v0, v0, v4
; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1132-NEXT:    buffer_gl0_inv
; GFX1132-NEXT:  .LBB16_2:
; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s3
; GFX1132-NEXT:    v_readfirstlane_b32 s3, v0
; GFX1132-NEXT:    v_mov_b32_e32 v0, v3
; GFX1132-NEXT:    v_xor_b32_e32 v0, s3, v0
; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1132-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
; GFX1132-NEXT:    s_endpgm
entry:
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %old = atomicrmw xor i32 addrspace(3)* @local_var32, i32 %lane acq_rel
  store i32 %old, i32 addrspace(1)* %out
  ret void
}

define
amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) {
; max_i32_varying: every work-item performs a signed atomicrmw max of its own
; @llvm.amdgcn.workitem.id.x value (a per-work-item operand, hence "varying")
; on the LDS variable @local_var32 with acq_rel ordering and stores the
; returned old value to %out.
;
; What the expectations below show:
;  - the GFX7LESS output keeps a plain ds_max_rtn_i32 on the incoming lane
;    value (no mbcnt, DPP or exec-mask branch appears in it);
;  - every other configuration first writes v_bfrev_b32 of 1 into the working
;    register while exec is inverted, combines the lane values with
;    v_max_i32_dpp steps, issues a single ds_max_rtn_i32 inside an
;    s_and_saveexec / s_cbranch_execz region, and then forms the per-lane
;    result with v_readfirstlane_b32 followed by v_max_i32.
;
; The assertion lines are autogenerated by utils/update_llc_test_checks.py
; (see the NOTE at the top of the file); regenerate them rather than editing
; them by hand.
;
;
; GFX7LESS-LABEL: max_i32_varying:
; GFX7LESS:       ; %bb.0: ; %entry
; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
; GFX7LESS-NEXT:    s_mov_b32 m0, -1
; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT:    ds_max_rtn_i32 v0, v1, v0
; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT:    s_mov_b32 s2, -1
; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX7LESS-NEXT:    s_endpgm
;
; GFX8-LABEL: max_i32_varying:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
; GFX8-NEXT:    v_bfrev_b32_e32 v1, 1
; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
; GFX8-NEXT:    v_mov_b32_e32 v2, v0
; GFX8-NEXT:    s_not_b64 exec, exec
; GFX8-NEXT:    v_bfrev_b32_e32 v2, 1
; GFX8-NEXT:    s_not_b64 exec, exec
; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
; GFX8-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX8-NEXT:    s_nop 1
; GFX8-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX8-NEXT:    s_nop 1
; GFX8-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX8-NEXT:    s_nop 1
; GFX8-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX8-NEXT:    s_nop 1
; GFX8-NEXT:    v_max_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
; GFX8-NEXT:    s_nop 1
; GFX8-NEXT:    v_max_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
; GFX8-NEXT:    v_readlane_b32 s4, v2, 63
; GFX8-NEXT:    s_nop 0
; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
; GFX8-NEXT:    ; implicit-def: $vgpr0
; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
; GFX8-NEXT:    s_cbranch_execz .LBB17_2
; GFX8-NEXT:  ; %bb.1:
; GFX8-NEXT:    v_mov_b32_e32 v0, 0
; GFX8-NEXT:    v_mov_b32_e32 v3, s4
; GFX8-NEXT:    s_mov_b32 m0, -1
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    ds_max_rtn_i32 v0, v0, v3
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:  .LBB17_2:
; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
; GFX8-NEXT:    v_mov_b32_e32 v0, v1
; GFX8-NEXT:    v_max_i32_e32 v0, s2, v0
; GFX8-NEXT:    s_mov_b32 s3, 0xf000
; GFX8-NEXT:    s_mov_b32 s2, -1
; GFX8-NEXT:    s_nop 0
; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: max_i32_varying:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
; GFX9-NEXT:    v_bfrev_b32_e32 v1, 1
; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
; GFX9-NEXT:    v_mov_b32_e32 v2, v0
; GFX9-NEXT:    s_not_b64 exec, exec
; GFX9-NEXT:    v_bfrev_b32_e32 v2, 1
; GFX9-NEXT:    s_not_b64 exec, exec
; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
; GFX9-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX9-NEXT:    s_nop 1
; GFX9-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX9-NEXT:    s_nop 1
; GFX9-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX9-NEXT:    s_nop 1
; GFX9-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX9-NEXT:    s_nop 1
; GFX9-NEXT:    v_max_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
; GFX9-NEXT:    s_nop 1
; GFX9-NEXT:    v_max_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
; GFX9-NEXT:    v_readlane_b32 s4, v2, 63
; GFX9-NEXT:    s_nop 0
; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
; GFX9-NEXT:    ; implicit-def: $vgpr0
; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT:    s_cbranch_execz .LBB17_2
; GFX9-NEXT:  ; %bb.1:
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    v_mov_b32_e32 v3, s4
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    ds_max_rtn_i32 v0, v0, v3
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:  .LBB17_2:
; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
; GFX9-NEXT:    v_mov_b32_e32 v0, v1
; GFX9-NEXT:    v_max_i32_e32 v0, s2, v0
; GFX9-NEXT:    s_mov_b32 s3, 0xf000
; GFX9-NEXT:    s_mov_b32 s2, -1
; GFX9-NEXT:    s_nop 0
; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT:    s_endpgm
;
; GFX1064-LABEL: max_i32_varying:
; GFX1064:       ; %bb.0: ; %entry
; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
; GFX1064-NEXT:    s_not_b64 exec, exec
; GFX1064-NEXT:    v_bfrev_b32_e32 v1, 1
; GFX1064-NEXT:    s_not_b64 exec, exec
; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
; GFX1064-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1064-NEXT:    v_bfrev_b32_e32 v3, 1
; GFX1064-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1064-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1064-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
; GFX1064-NEXT:    v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1064-NEXT:    v_readlane_b32 s4, v1, 31
; GFX1064-NEXT:    v_mov_b32_e32 v2, s4
; GFX1064-NEXT:    v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
; GFX1064-NEXT:    v_readlane_b32 s4, v1, 15
; GFX1064-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
; GFX1064-NEXT:    v_readlane_b32 s5, v1, 31
; GFX1064-NEXT:    v_writelane_b32 v3, s4, 16
; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
; GFX1064-NEXT:    v_readlane_b32 s7, v1, 63
; GFX1064-NEXT:    v_readlane_b32 s6, v1, 47
; GFX1064-NEXT:    v_writelane_b32 v3, s5, 32
; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
; GFX1064-NEXT:    v_writelane_b32 v3, s6, 48
; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT:    s_mov_b32 s2, -1
; GFX1064-NEXT:    ; implicit-def: $vgpr0
; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
; GFX1064-NEXT:    s_cbranch_execz .LBB17_2
; GFX1064-NEXT:  ; %bb.1:
; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
; GFX1064-NEXT:    v_mov_b32_e32 v4, s7
; GFX1064-NEXT:    s_mov_b32 s3, s7
; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1064-NEXT:    ds_max_rtn_i32 v0, v0, v4
; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1064-NEXT:    buffer_gl0_inv
; GFX1064-NEXT:  .LBB17_2:
; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
; GFX1064-NEXT:    v_mov_b32_e32 v0, v3
; GFX1064-NEXT:    v_max_i32_e32 v0, s3, v0
; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX1064-NEXT:    s_endpgm
;
; GFX1032-LABEL: max_i32_varying:
; GFX1032:       ; %bb.0: ; %entry
; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
; GFX1032-NEXT:    v_bfrev_b32_e32 v1, 1
; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
; GFX1032-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
; GFX1032-NEXT:    v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1032-NEXT:    v_bfrev_b32_e32 v3, 1
; GFX1032-NEXT:    v_readlane_b32 s3, v1, 15
; GFX1032-NEXT:    v_readlane_b32 s4, v1, 31
; GFX1032-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
; GFX1032-NEXT:    v_writelane_b32 v3, s3, 16
; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT:    s_mov_b32 s2, -1
; GFX1032-NEXT:    ; implicit-def: $vgpr0
; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
; GFX1032-NEXT:    s_cbranch_execz .LBB17_2
; GFX1032-NEXT:  ; %bb.1:
; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
; GFX1032-NEXT:    v_mov_b32_e32 v4, s4
; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1032-NEXT:    ds_max_rtn_i32 v0, v0, v4
; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1032-NEXT:    buffer_gl0_inv
; GFX1032-NEXT:  .LBB17_2:
; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
; GFX1032-NEXT:    v_mov_b32_e32 v0, v3
; GFX1032-NEXT:    v_max_i32_e32 v0, s3, v0
; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX1032-NEXT:    s_endpgm
;
; GFX1164-LABEL: max_i32_varying:
; GFX1164:       ; %bb.0: ; %entry
; GFX1164-NEXT:    v_mov_b32_e32 v1, v0
; GFX1164-NEXT:    s_not_b64 exec, exec
; GFX1164-NEXT:    v_bfrev_b32_e32 v1, 1
; GFX1164-NEXT:    s_not_b64 exec, exec
; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
; GFX1164-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1164-NEXT:    v_bfrev_b32_e32 v3, 1
; GFX1164-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1164-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1164-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX1164-NEXT:    v_mov_b32_e32 v2, v1
; GFX1164-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
; GFX1164-NEXT:    v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1164-NEXT:    v_readlane_b32 s4, v1, 31
; GFX1164-NEXT:    v_mov_b32_e32 v2, s4
; GFX1164-NEXT:    v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
; GFX1164-NEXT:    v_readlane_b32 s4, v1, 15
; GFX1164-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
; GFX1164-NEXT:    v_readlane_b32 s5, v1, 31
; GFX1164-NEXT:    v_writelane_b32 v3, s4, 16
; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
; GFX1164-NEXT:    v_readlane_b32 s7, v1, 63
; GFX1164-NEXT:    v_readlane_b32 s6, v1, 47
; GFX1164-NEXT:    v_writelane_b32 v3, s5, 32
; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1164-NEXT:    s_or_saveexec_b64 s[4:5], -1
; GFX1164-NEXT:    v_writelane_b32 v3, s6, 48
; GFX1164-NEXT:    s_mov_b64 exec, s[4:5]
; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1164-NEXT:    s_mov_b32 s2, -1
; GFX1164-NEXT:    ; implicit-def: $vgpr0
; GFX1164-NEXT:    s_and_saveexec_b64 s[4:5], vcc
; GFX1164-NEXT:    s_cbranch_execz .LBB17_2
; GFX1164-NEXT:  ; %bb.1:
; GFX1164-NEXT:    v_mov_b32_e32 v0, 0
; GFX1164-NEXT:    v_mov_b32_e32 v4, s7
; GFX1164-NEXT:    s_mov_b32 s3, s7
; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1164-NEXT:    ds_max_rtn_i32 v0, v0, v4
; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1164-NEXT:    buffer_gl0_inv
; GFX1164-NEXT:  .LBB17_2:
; GFX1164-NEXT:    s_or_b64 exec, exec, s[4:5]
; GFX1164-NEXT:    v_readfirstlane_b32 s3, v0
; GFX1164-NEXT:    v_mov_b32_e32 v0, v3
; GFX1164-NEXT:    v_max_i32_e32 v0, s3, v0
; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1164-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
; GFX1164-NEXT:    s_endpgm
;
; GFX1132-LABEL: max_i32_varying:
; GFX1132:       ; %bb.0: ; %entry
; GFX1132-NEXT:    v_mov_b32_e32 v1, v0
; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
; GFX1132-NEXT:    v_bfrev_b32_e32 v1, 1
; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
; GFX1132-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1132-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1132-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX1132-NEXT:    v_mov_b32_e32 v2, v1
; GFX1132-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
; GFX1132-NEXT:    v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1132-NEXT:    v_bfrev_b32_e32 v3, 1
; GFX1132-NEXT:    v_readlane_b32 s3, v1, 15
; GFX1132-NEXT:    v_readlane_b32 s4, v1, 31
; GFX1132-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
; GFX1132-NEXT:    v_writelane_b32 v3, s3, 16
; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1132-NEXT:    s_mov_b32 s2, -1
; GFX1132-NEXT:    ; implicit-def: $vgpr0
; GFX1132-NEXT:    s_and_saveexec_b32 s3, vcc_lo
; GFX1132-NEXT:    s_cbranch_execz .LBB17_2
; GFX1132-NEXT:  ; %bb.1:
; GFX1132-NEXT:    v_mov_b32_e32 v0, 0
; GFX1132-NEXT:    v_mov_b32_e32 v4, s4
; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1132-NEXT:    ds_max_rtn_i32 v0, v0, v4
; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1132-NEXT:    buffer_gl0_inv
; GFX1132-NEXT:  .LBB17_2:
; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s3
; GFX1132-NEXT:    v_readfirstlane_b32 s3, v0
; GFX1132-NEXT:    v_mov_b32_e32 v0, v3
; GFX1132-NEXT:    v_max_i32_e32 v0, s3, v0
; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1132-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
; GFX1132-NEXT:    s_endpgm
entry:
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %old = atomicrmw max i32 addrspace(3)* @local_var32, i32 %lane acq_rel
  store i32 %old, i32 addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @max_i64_constant(i64 addrspace(1)* %out) {
;
;
; GFX7LESS-LABEL: max_i64_constant:
; GFX7LESS:       ; %bb.0: ; %entry
; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0,
exec_hi, v0 4630; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4631; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 4632; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc 4633; GFX7LESS-NEXT: s_cbranch_execz .LBB18_2 4634; GFX7LESS-NEXT: ; %bb.1: 4635; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0 4636; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5 4637; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 4638; GFX7LESS-NEXT: s_mov_b32 m0, -1 4639; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 4640; GFX7LESS-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] 4641; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 4642; GFX7LESS-NEXT: .LBB18_2: 4643; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] 4644; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 4645; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 4646; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 4647; GFX7LESS-NEXT: v_bfrev_b32_e32 v1, 1 4648; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 4649; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 4650; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc 4651; GFX7LESS-NEXT: v_mov_b32_e32 v2, s5 4652; GFX7LESS-NEXT: v_mov_b32_e32 v3, s4 4653; GFX7LESS-NEXT: v_cmp_gt_i64_e32 vcc, s[4:5], v[0:1] 4654; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 4655; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 4656; GFX7LESS-NEXT: s_mov_b32 s2, -1 4657; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4658; GFX7LESS-NEXT: s_endpgm 4659; 4660; GFX8-LABEL: max_i64_constant: 4661; GFX8: ; %bb.0: ; %entry 4662; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4663; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4664; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 4665; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4666; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 4667; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 4668; GFX8-NEXT: s_cbranch_execz .LBB18_2 4669; GFX8-NEXT: ; %bb.1: 4670; GFX8-NEXT: v_mov_b32_e32 v0, 5 4671; GFX8-NEXT: v_mov_b32_e32 v2, 0 4672; GFX8-NEXT: v_mov_b32_e32 v1, 0 4673; GFX8-NEXT: s_mov_b32 m0, -1 4674; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4675; GFX8-NEXT: ds_max_rtn_i64 v[0:1], v2, 
v[0:1] 4676; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4677; GFX8-NEXT: .LBB18_2: 4678; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 4679; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4680; GFX8-NEXT: v_readfirstlane_b32 s2, v0 4681; GFX8-NEXT: v_bfrev_b32_e32 v0, 1 4682; GFX8-NEXT: v_readfirstlane_b32 s3, v1 4683; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc 4684; GFX8-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 4685; GFX8-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1] 4686; GFX8-NEXT: v_mov_b32_e32 v2, s3 4687; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 4688; GFX8-NEXT: v_mov_b32_e32 v2, s2 4689; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 4690; GFX8-NEXT: s_mov_b32 s3, 0xf000 4691; GFX8-NEXT: s_mov_b32 s2, -1 4692; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4693; GFX8-NEXT: s_endpgm 4694; 4695; GFX9-LABEL: max_i64_constant: 4696; GFX9: ; %bb.0: ; %entry 4697; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4698; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4699; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 4700; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4701; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 4702; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 4703; GFX9-NEXT: s_cbranch_execz .LBB18_2 4704; GFX9-NEXT: ; %bb.1: 4705; GFX9-NEXT: v_mov_b32_e32 v0, 5 4706; GFX9-NEXT: v_mov_b32_e32 v1, 0 4707; GFX9-NEXT: v_mov_b32_e32 v2, 0 4708; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4709; GFX9-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] 4710; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4711; GFX9-NEXT: .LBB18_2: 4712; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 4713; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4714; GFX9-NEXT: v_readfirstlane_b32 s2, v0 4715; GFX9-NEXT: v_bfrev_b32_e32 v0, 1 4716; GFX9-NEXT: v_readfirstlane_b32 s3, v1 4717; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc 4718; GFX9-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 4719; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1] 4720; GFX9-NEXT: v_mov_b32_e32 v2, s3 4721; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 4722; GFX9-NEXT: v_mov_b32_e32 v2, s2 4723; GFX9-NEXT: v_cndmask_b32_e32 
v0, v0, v2, vcc 4724; GFX9-NEXT: s_mov_b32 s3, 0xf000 4725; GFX9-NEXT: s_mov_b32 s2, -1 4726; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4727; GFX9-NEXT: s_endpgm 4728; 4729; GFX1064-LABEL: max_i64_constant: 4730; GFX1064: ; %bb.0: ; %entry 4731; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4732; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4733; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 4734; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4735; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 4736; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc 4737; GFX1064-NEXT: s_cbranch_execz .LBB18_2 4738; GFX1064-NEXT: ; %bb.1: 4739; GFX1064-NEXT: v_mov_b32_e32 v0, 5 4740; GFX1064-NEXT: v_mov_b32_e32 v1, 0 4741; GFX1064-NEXT: v_mov_b32_e32 v2, 0 4742; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4743; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 4744; GFX1064-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] 4745; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 4746; GFX1064-NEXT: buffer_gl0_inv 4747; GFX1064-NEXT: .LBB18_2: 4748; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 4749; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] 4750; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 4751; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 4752; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc 4753; GFX1064-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 4754; GFX1064-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1] 4755; GFX1064-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc 4756; GFX1064-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc 4757; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 4758; GFX1064-NEXT: s_mov_b32 s2, -1 4759; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 4760; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4761; GFX1064-NEXT: s_endpgm 4762; 4763; GFX1032-LABEL: max_i64_constant: 4764; GFX1032: ; %bb.0: ; %entry 4765; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4766; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4767; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 4768; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 4769; 
GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo 4770; GFX1032-NEXT: s_cbranch_execz .LBB18_2 4771; GFX1032-NEXT: ; %bb.1: 4772; GFX1032-NEXT: v_mov_b32_e32 v0, 5 4773; GFX1032-NEXT: v_mov_b32_e32 v1, 0 4774; GFX1032-NEXT: v_mov_b32_e32 v2, 0 4775; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4776; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 4777; GFX1032-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] 4778; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 4779; GFX1032-NEXT: buffer_gl0_inv 4780; GFX1032-NEXT: .LBB18_2: 4781; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 4782; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 4783; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 4784; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 4785; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc_lo 4786; GFX1032-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc_lo 4787; GFX1032-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[2:3], v[0:1] 4788; GFX1032-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo 4789; GFX1032-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo 4790; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 4791; GFX1032-NEXT: s_mov_b32 s2, -1 4792; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 4793; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4794; GFX1032-NEXT: s_endpgm 4795; 4796; GFX1164-LABEL: max_i64_constant: 4797; GFX1164: ; %bb.0: ; %entry 4798; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 4799; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4800; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 4801; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4802; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 4803; GFX1164-NEXT: s_and_saveexec_b64 s[2:3], vcc 4804; GFX1164-NEXT: s_cbranch_execz .LBB18_2 4805; GFX1164-NEXT: ; %bb.1: 4806; GFX1164-NEXT: v_mov_b32_e32 v0, 5 4807; GFX1164-NEXT: v_mov_b32_e32 v1, 0 4808; GFX1164-NEXT: v_mov_b32_e32 v2, 0 4809; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4810; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 4811; GFX1164-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] 4812; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 4813; GFX1164-NEXT: buffer_gl0_inv 
4814; GFX1164-NEXT: .LBB18_2: 4815; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] 4816; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 4817; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 4818; GFX1164-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc 4819; GFX1164-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 4820; GFX1164-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1] 4821; GFX1164-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc 4822; GFX1164-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc 4823; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 4824; GFX1164-NEXT: s_mov_b32 s2, -1 4825; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 4826; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 4827; GFX1164-NEXT: s_endpgm 4828; 4829; GFX1132-LABEL: max_i64_constant: 4830; GFX1132: ; %bb.0: ; %entry 4831; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 4832; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4833; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 4834; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 4835; GFX1132-NEXT: s_and_saveexec_b32 s2, vcc_lo 4836; GFX1132-NEXT: s_cbranch_execz .LBB18_2 4837; GFX1132-NEXT: ; %bb.1: 4838; GFX1132-NEXT: v_mov_b32_e32 v0, 5 4839; GFX1132-NEXT: v_mov_b32_e32 v1, 0 4840; GFX1132-NEXT: v_mov_b32_e32 v2, 0 4841; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4842; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 4843; GFX1132-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] 4844; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 4845; GFX1132-NEXT: buffer_gl0_inv 4846; GFX1132-NEXT: .LBB18_2: 4847; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2 4848; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 4849; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 4850; GFX1132-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc_lo 4851; GFX1132-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc_lo 4852; GFX1132-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[2:3], v[0:1] 4853; GFX1132-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo 4854; GFX1132-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo 4855; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 4856; GFX1132-NEXT: s_mov_b32 s2, -1 4857; GFX1132-NEXT: s_waitcnt 
lgkmcnt(0) 4858; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 4859; GFX1132-NEXT: s_endpgm 4860entry: 4861 %old = atomicrmw max i64 addrspace(3)* @local_var64, i64 5 acq_rel 4862 store i64 %old, i64 addrspace(1)* %out 4863 ret void 4864} 4865 4866define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) { 4867; 4868; 4869; GFX7LESS-LABEL: min_i32_varying: 4870; GFX7LESS: ; %bb.0: ; %entry 4871; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 4872; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 4873; GFX7LESS-NEXT: s_mov_b32 m0, -1 4874; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 4875; GFX7LESS-NEXT: ds_min_rtn_i32 v0, v1, v0 4876; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 4877; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 4878; GFX7LESS-NEXT: s_mov_b32 s2, -1 4879; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 4880; GFX7LESS-NEXT: s_endpgm 4881; 4882; GFX8-LABEL: min_i32_varying: 4883; GFX8: ; %bb.0: ; %entry 4884; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4885; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 4886; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 4887; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 4888; GFX8-NEXT: v_bfrev_b32_e32 v1, -2 4889; GFX8-NEXT: s_mov_b64 exec, s[2:3] 4890; GFX8-NEXT: v_mov_b32_e32 v2, v0 4891; GFX8-NEXT: s_not_b64 exec, exec 4892; GFX8-NEXT: v_bfrev_b32_e32 v2, -2 4893; GFX8-NEXT: s_not_b64 exec, exec 4894; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 4895; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 4896; GFX8-NEXT: s_nop 1 4897; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 4898; GFX8-NEXT: s_nop 1 4899; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 4900; GFX8-NEXT: s_nop 1 4901; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 4902; GFX8-NEXT: s_nop 1 4903; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 4904; GFX8-NEXT: s_nop 1 4905; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc 
bank_mask:0xf 4906; GFX8-NEXT: v_readlane_b32 s4, v2, 63 4907; GFX8-NEXT: s_nop 0 4908; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 4909; GFX8-NEXT: s_mov_b64 exec, s[2:3] 4910; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 4911; GFX8-NEXT: ; implicit-def: $vgpr0 4912; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 4913; GFX8-NEXT: s_cbranch_execz .LBB19_2 4914; GFX8-NEXT: ; %bb.1: 4915; GFX8-NEXT: v_mov_b32_e32 v0, 0 4916; GFX8-NEXT: v_mov_b32_e32 v3, s4 4917; GFX8-NEXT: s_mov_b32 m0, -1 4918; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4919; GFX8-NEXT: ds_min_rtn_i32 v0, v0, v3 4920; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4921; GFX8-NEXT: .LBB19_2: 4922; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 4923; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4924; GFX8-NEXT: v_readfirstlane_b32 s2, v0 4925; GFX8-NEXT: v_mov_b32_e32 v0, v1 4926; GFX8-NEXT: v_min_i32_e32 v0, s2, v0 4927; GFX8-NEXT: s_mov_b32 s3, 0xf000 4928; GFX8-NEXT: s_mov_b32 s2, -1 4929; GFX8-NEXT: s_nop 0 4930; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 4931; GFX8-NEXT: s_endpgm 4932; 4933; GFX9-LABEL: min_i32_varying: 4934; GFX9: ; %bb.0: ; %entry 4935; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4936; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 4937; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 4938; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 4939; GFX9-NEXT: v_bfrev_b32_e32 v1, -2 4940; GFX9-NEXT: s_mov_b64 exec, s[2:3] 4941; GFX9-NEXT: v_mov_b32_e32 v2, v0 4942; GFX9-NEXT: s_not_b64 exec, exec 4943; GFX9-NEXT: v_bfrev_b32_e32 v2, -2 4944; GFX9-NEXT: s_not_b64 exec, exec 4945; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 4946; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 4947; GFX9-NEXT: s_nop 1 4948; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 4949; GFX9-NEXT: s_nop 1 4950; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 4951; GFX9-NEXT: s_nop 1 4952; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 4953; GFX9-NEXT: 
s_nop 1 4954; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 4955; GFX9-NEXT: s_nop 1 4956; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 4957; GFX9-NEXT: v_readlane_b32 s4, v2, 63 4958; GFX9-NEXT: s_nop 0 4959; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 4960; GFX9-NEXT: s_mov_b64 exec, s[2:3] 4961; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 4962; GFX9-NEXT: ; implicit-def: $vgpr0 4963; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 4964; GFX9-NEXT: s_cbranch_execz .LBB19_2 4965; GFX9-NEXT: ; %bb.1: 4966; GFX9-NEXT: v_mov_b32_e32 v0, 0 4967; GFX9-NEXT: v_mov_b32_e32 v3, s4 4968; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4969; GFX9-NEXT: ds_min_rtn_i32 v0, v0, v3 4970; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4971; GFX9-NEXT: .LBB19_2: 4972; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 4973; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4974; GFX9-NEXT: v_readfirstlane_b32 s2, v0 4975; GFX9-NEXT: v_mov_b32_e32 v0, v1 4976; GFX9-NEXT: v_min_i32_e32 v0, s2, v0 4977; GFX9-NEXT: s_mov_b32 s3, 0xf000 4978; GFX9-NEXT: s_mov_b32 s2, -1 4979; GFX9-NEXT: s_nop 0 4980; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 4981; GFX9-NEXT: s_endpgm 4982; 4983; GFX1064-LABEL: min_i32_varying: 4984; GFX1064: ; %bb.0: ; %entry 4985; GFX1064-NEXT: v_mov_b32_e32 v1, v0 4986; GFX1064-NEXT: s_not_b64 exec, exec 4987; GFX1064-NEXT: v_bfrev_b32_e32 v1, -2 4988; GFX1064-NEXT: s_not_b64 exec, exec 4989; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 4990; GFX1064-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 4991; GFX1064-NEXT: v_bfrev_b32_e32 v3, -2 4992; GFX1064-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 4993; GFX1064-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 4994; GFX1064-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 4995; GFX1064-NEXT: v_mov_b32_e32 v2, v1 4996; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 4997; GFX1064-NEXT: v_min_i32_dpp v1, v2, v1 
quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 4998; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 4999; GFX1064-NEXT: v_mov_b32_e32 v2, s4 5000; GFX1064-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 5001; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 5002; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 5003; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 5004; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 5005; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 5006; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 5007; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 5008; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 5009; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 5010; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 5011; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 5012; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 5013; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 5014; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 5015; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 5016; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 5017; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 5018; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 5019; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 5020; GFX1064-NEXT: s_mov_b32 s2, -1 5021; GFX1064-NEXT: ; implicit-def: $vgpr0 5022; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 5023; GFX1064-NEXT: s_cbranch_execz .LBB19_2 5024; GFX1064-NEXT: ; %bb.1: 5025; GFX1064-NEXT: v_mov_b32_e32 v0, 0 5026; GFX1064-NEXT: v_mov_b32_e32 v4, s7 5027; GFX1064-NEXT: s_mov_b32 s3, s7 5028; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5029; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 5030; GFX1064-NEXT: ds_min_rtn_i32 v0, v0, v4 5031; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 5032; GFX1064-NEXT: buffer_gl0_inv 5033; GFX1064-NEXT: .LBB19_2: 5034; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 5035; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 5036; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 5037; GFX1064-NEXT: v_mov_b32_e32 v0, v3 5038; GFX1064-NEXT: v_min_i32_e32 v0, s3, v0 5039; GFX1064-NEXT: s_mov_b32 s3, 
0x31016000 5040; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 5041; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 5042; GFX1064-NEXT: s_endpgm 5043; 5044; GFX1032-LABEL: min_i32_varying: 5045; GFX1032: ; %bb.0: ; %entry 5046; GFX1032-NEXT: v_mov_b32_e32 v1, v0 5047; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 5048; GFX1032-NEXT: v_bfrev_b32_e32 v1, -2 5049; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 5050; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 5051; GFX1032-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 5052; GFX1032-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 5053; GFX1032-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 5054; GFX1032-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 5055; GFX1032-NEXT: v_mov_b32_e32 v2, v1 5056; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 5057; GFX1032-NEXT: s_mov_b32 exec_lo, s2 5058; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 5059; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 5060; GFX1032-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 5061; GFX1032-NEXT: v_bfrev_b32_e32 v3, -2 5062; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 5063; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 5064; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 5065; GFX1032-NEXT: s_mov_b32 exec_lo, s2 5066; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 5067; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 5068; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 5069; GFX1032-NEXT: s_mov_b32 exec_lo, s2 5070; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 5071; GFX1032-NEXT: s_mov_b32 s2, -1 5072; GFX1032-NEXT: ; implicit-def: $vgpr0 5073; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 5074; GFX1032-NEXT: s_cbranch_execz .LBB19_2 5075; GFX1032-NEXT: ; %bb.1: 5076; GFX1032-NEXT: v_mov_b32_e32 v0, 0 5077; GFX1032-NEXT: v_mov_b32_e32 v4, s4 5078; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5079; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 5080; GFX1032-NEXT: 
ds_min_rtn_i32 v0, v0, v4 5081; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 5082; GFX1032-NEXT: buffer_gl0_inv 5083; GFX1032-NEXT: .LBB19_2: 5084; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 5085; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 5086; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 5087; GFX1032-NEXT: v_mov_b32_e32 v0, v3 5088; GFX1032-NEXT: v_min_i32_e32 v0, s3, v0 5089; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 5090; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 5091; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 5092; GFX1032-NEXT: s_endpgm 5093; 5094; GFX1164-LABEL: min_i32_varying: 5095; GFX1164: ; %bb.0: ; %entry 5096; GFX1164-NEXT: v_mov_b32_e32 v1, v0 5097; GFX1164-NEXT: s_not_b64 exec, exec 5098; GFX1164-NEXT: v_bfrev_b32_e32 v1, -2 5099; GFX1164-NEXT: s_not_b64 exec, exec 5100; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 5101; GFX1164-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 5102; GFX1164-NEXT: v_bfrev_b32_e32 v3, -2 5103; GFX1164-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 5104; GFX1164-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 5105; GFX1164-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 5106; GFX1164-NEXT: v_mov_b32_e32 v2, v1 5107; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1 5108; GFX1164-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 5109; GFX1164-NEXT: v_readlane_b32 s4, v1, 31 5110; GFX1164-NEXT: v_mov_b32_e32 v2, s4 5111; GFX1164-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 5112; GFX1164-NEXT: v_readlane_b32 s4, v1, 15 5113; GFX1164-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 5114; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 5115; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 5116; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 5117; GFX1164-NEXT: v_readlane_b32 s5, v1, 31 5118; GFX1164-NEXT: v_writelane_b32 v3, s4, 16 5119; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 5120; GFX1164-NEXT: 
v_mbcnt_lo_u32_b32 v0, exec_lo, 0 5121; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 5122; GFX1164-NEXT: v_readlane_b32 s7, v1, 63 5123; GFX1164-NEXT: v_readlane_b32 s6, v1, 47 5124; GFX1164-NEXT: v_writelane_b32 v3, s5, 32 5125; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 5126; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 5127; GFX1164-NEXT: s_or_saveexec_b64 s[4:5], -1 5128; GFX1164-NEXT: v_writelane_b32 v3, s6, 48 5129; GFX1164-NEXT: s_mov_b64 exec, s[4:5] 5130; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 5131; GFX1164-NEXT: s_mov_b32 s2, -1 5132; GFX1164-NEXT: ; implicit-def: $vgpr0 5133; GFX1164-NEXT: s_and_saveexec_b64 s[4:5], vcc 5134; GFX1164-NEXT: s_cbranch_execz .LBB19_2 5135; GFX1164-NEXT: ; %bb.1: 5136; GFX1164-NEXT: v_mov_b32_e32 v0, 0 5137; GFX1164-NEXT: v_mov_b32_e32 v4, s7 5138; GFX1164-NEXT: s_mov_b32 s3, s7 5139; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5140; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 5141; GFX1164-NEXT: ds_min_rtn_i32 v0, v0, v4 5142; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 5143; GFX1164-NEXT: buffer_gl0_inv 5144; GFX1164-NEXT: .LBB19_2: 5145; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] 5146; GFX1164-NEXT: v_readfirstlane_b32 s3, v0 5147; GFX1164-NEXT: v_mov_b32_e32 v0, v3 5148; GFX1164-NEXT: v_min_i32_e32 v0, s3, v0 5149; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 5150; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 5151; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 5152; GFX1164-NEXT: s_endpgm 5153; 5154; GFX1132-LABEL: min_i32_varying: 5155; GFX1132: ; %bb.0: ; %entry 5156; GFX1132-NEXT: v_mov_b32_e32 v1, v0 5157; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo 5158; GFX1132-NEXT: v_bfrev_b32_e32 v1, -2 5159; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo 5160; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 5161; GFX1132-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 5162; GFX1132-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 5163; GFX1132-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 5164; GFX1132-NEXT: 
v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 5165; GFX1132-NEXT: v_mov_b32_e32 v2, v1 5166; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1 5167; GFX1132-NEXT: s_mov_b32 exec_lo, s2 5168; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 5169; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 5170; GFX1132-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 5171; GFX1132-NEXT: v_bfrev_b32_e32 v3, -2 5172; GFX1132-NEXT: v_readlane_b32 s3, v1, 15 5173; GFX1132-NEXT: v_readlane_b32 s4, v1, 31 5174; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 5175; GFX1132-NEXT: s_mov_b32 exec_lo, s2 5176; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 5177; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 5178; GFX1132-NEXT: v_writelane_b32 v3, s3, 16 5179; GFX1132-NEXT: s_mov_b32 exec_lo, s2 5180; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 5181; GFX1132-NEXT: s_mov_b32 s2, -1 5182; GFX1132-NEXT: ; implicit-def: $vgpr0 5183; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo 5184; GFX1132-NEXT: s_cbranch_execz .LBB19_2 5185; GFX1132-NEXT: ; %bb.1: 5186; GFX1132-NEXT: v_mov_b32_e32 v0, 0 5187; GFX1132-NEXT: v_mov_b32_e32 v4, s4 5188; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5189; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 5190; GFX1132-NEXT: ds_min_rtn_i32 v0, v0, v4 5191; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 5192; GFX1132-NEXT: buffer_gl0_inv 5193; GFX1132-NEXT: .LBB19_2: 5194; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 5195; GFX1132-NEXT: v_readfirstlane_b32 s3, v0 5196; GFX1132-NEXT: v_mov_b32_e32 v0, v3 5197; GFX1132-NEXT: v_min_i32_e32 v0, s3, v0 5198; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 5199; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 5200; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 5201; GFX1132-NEXT: s_endpgm 5202entry: 5203 %lane = call i32 @llvm.amdgcn.workitem.id.x() 5204 %old = atomicrmw min i32 addrspace(3)* @local_var32, i32 %lane acq_rel 5205 store i32 %old, i32 addrspace(1)* %out 5206 ret void 5207} 5208 5209define amdgpu_kernel 
void @min_i64_constant(i64 addrspace(1)* %out) { 5210; 5211; 5212; GFX7LESS-LABEL: min_i64_constant: 5213; GFX7LESS: ; %bb.0: ; %entry 5214; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 5215; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 5216; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 5217; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 5218; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 5219; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc 5220; GFX7LESS-NEXT: s_cbranch_execz .LBB20_2 5221; GFX7LESS-NEXT: ; %bb.1: 5222; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0 5223; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5 5224; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 5225; GFX7LESS-NEXT: s_mov_b32 m0, -1 5226; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 5227; GFX7LESS-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] 5228; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 5229; GFX7LESS-NEXT: .LBB20_2: 5230; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] 5231; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 5232; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 5233; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 5234; GFX7LESS-NEXT: v_bfrev_b32_e32 v1, -2 5235; GFX7LESS-NEXT: s_mov_b32 s2, -1 5236; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 5237; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc 5238; GFX7LESS-NEXT: v_mov_b32_e32 v2, s5 5239; GFX7LESS-NEXT: v_mov_b32_e32 v3, s4 5240; GFX7LESS-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] 5241; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 5242; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 5243; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 5244; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 5245; GFX7LESS-NEXT: s_endpgm 5246; 5247; GFX8-LABEL: min_i64_constant: 5248; GFX8: ; %bb.0: ; %entry 5249; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 5250; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 5251; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 5252; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 5253; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 5254; GFX8-NEXT: s_and_saveexec_b64 s[2:3], 
vcc 5255; GFX8-NEXT: s_cbranch_execz .LBB20_2 5256; GFX8-NEXT: ; %bb.1: 5257; GFX8-NEXT: v_mov_b32_e32 v0, 5 5258; GFX8-NEXT: v_mov_b32_e32 v2, 0 5259; GFX8-NEXT: v_mov_b32_e32 v1, 0 5260; GFX8-NEXT: s_mov_b32 m0, -1 5261; GFX8-NEXT: s_waitcnt lgkmcnt(0) 5262; GFX8-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] 5263; GFX8-NEXT: s_waitcnt lgkmcnt(0) 5264; GFX8-NEXT: .LBB20_2: 5265; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 5266; GFX8-NEXT: s_waitcnt lgkmcnt(0) 5267; GFX8-NEXT: v_readfirstlane_b32 s4, v0 5268; GFX8-NEXT: v_bfrev_b32_e32 v0, -2 5269; GFX8-NEXT: v_readfirstlane_b32 s5, v1 5270; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc 5271; GFX8-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 5272; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] 5273; GFX8-NEXT: v_mov_b32_e32 v2, s5 5274; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 5275; GFX8-NEXT: v_mov_b32_e32 v2, s4 5276; GFX8-NEXT: s_mov_b32 s2, -1 5277; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 5278; GFX8-NEXT: s_mov_b32 s3, 0xf000 5279; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 5280; GFX8-NEXT: s_endpgm 5281; 5282; GFX9-LABEL: min_i64_constant: 5283; GFX9: ; %bb.0: ; %entry 5284; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 5285; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 5286; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 5287; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 5288; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 5289; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 5290; GFX9-NEXT: s_cbranch_execz .LBB20_2 5291; GFX9-NEXT: ; %bb.1: 5292; GFX9-NEXT: v_mov_b32_e32 v0, 5 5293; GFX9-NEXT: v_mov_b32_e32 v1, 0 5294; GFX9-NEXT: v_mov_b32_e32 v2, 0 5295; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5296; GFX9-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] 5297; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5298; GFX9-NEXT: .LBB20_2: 5299; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 5300; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5301; GFX9-NEXT: v_readfirstlane_b32 s4, v0 5302; GFX9-NEXT: v_bfrev_b32_e32 v0, -2 5303; GFX9-NEXT: v_readfirstlane_b32 s5, v1 5304; 
GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc 5305; GFX9-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 5306; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] 5307; GFX9-NEXT: v_mov_b32_e32 v2, s5 5308; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 5309; GFX9-NEXT: v_mov_b32_e32 v2, s4 5310; GFX9-NEXT: s_mov_b32 s2, -1 5311; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 5312; GFX9-NEXT: s_mov_b32 s3, 0xf000 5313; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 5314; GFX9-NEXT: s_endpgm 5315; 5316; GFX1064-LABEL: min_i64_constant: 5317; GFX1064: ; %bb.0: ; %entry 5318; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 5319; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 5320; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 5321; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 5322; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 5323; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc 5324; GFX1064-NEXT: s_cbranch_execz .LBB20_2 5325; GFX1064-NEXT: ; %bb.1: 5326; GFX1064-NEXT: v_mov_b32_e32 v0, 5 5327; GFX1064-NEXT: v_mov_b32_e32 v1, 0 5328; GFX1064-NEXT: v_mov_b32_e32 v2, 0 5329; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5330; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 5331; GFX1064-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] 5332; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 5333; GFX1064-NEXT: buffer_gl0_inv 5334; GFX1064-NEXT: .LBB20_2: 5335; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 5336; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] 5337; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 5338; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 5339; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc 5340; GFX1064-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 5341; GFX1064-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1] 5342; GFX1064-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc 5343; GFX1064-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc 5344; GFX1064-NEXT: s_mov_b32 s2, -1 5345; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 5346; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 5347; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 5348; GFX1064-NEXT: 
s_endpgm 5349; 5350; GFX1032-LABEL: min_i64_constant: 5351; GFX1032: ; %bb.0: ; %entry 5352; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 5353; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 5354; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 5355; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 5356; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo 5357; GFX1032-NEXT: s_cbranch_execz .LBB20_2 5358; GFX1032-NEXT: ; %bb.1: 5359; GFX1032-NEXT: v_mov_b32_e32 v0, 5 5360; GFX1032-NEXT: v_mov_b32_e32 v1, 0 5361; GFX1032-NEXT: v_mov_b32_e32 v2, 0 5362; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5363; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 5364; GFX1032-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] 5365; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 5366; GFX1032-NEXT: buffer_gl0_inv 5367; GFX1032-NEXT: .LBB20_2: 5368; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 5369; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 5370; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 5371; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 5372; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc_lo 5373; GFX1032-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc_lo 5374; GFX1032-NEXT: v_cmp_lt_i64_e32 vcc_lo, s[2:3], v[0:1] 5375; GFX1032-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo 5376; GFX1032-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo 5377; GFX1032-NEXT: s_mov_b32 s2, -1 5378; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 5379; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 5380; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 5381; GFX1032-NEXT: s_endpgm 5382; 5383; GFX1164-LABEL: min_i64_constant: 5384; GFX1164: ; %bb.0: ; %entry 5385; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 5386; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 5387; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 5388; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 5389; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 5390; GFX1164-NEXT: s_and_saveexec_b64 s[2:3], vcc 5391; GFX1164-NEXT: s_cbranch_execz .LBB20_2 5392; GFX1164-NEXT: ; %bb.1: 5393; GFX1164-NEXT: v_mov_b32_e32 v0, 5 
5394; GFX1164-NEXT: v_mov_b32_e32 v1, 0 5395; GFX1164-NEXT: v_mov_b32_e32 v2, 0 5396; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5397; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 5398; GFX1164-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] 5399; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 5400; GFX1164-NEXT: buffer_gl0_inv 5401; GFX1164-NEXT: .LBB20_2: 5402; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] 5403; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 5404; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 5405; GFX1164-NEXT: v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc 5406; GFX1164-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 5407; GFX1164-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1] 5408; GFX1164-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc 5409; GFX1164-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc 5410; GFX1164-NEXT: s_mov_b32 s2, -1 5411; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 5412; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 5413; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 5414; GFX1164-NEXT: s_endpgm 5415; 5416; GFX1132-LABEL: min_i64_constant: 5417; GFX1132: ; %bb.0: ; %entry 5418; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 5419; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 5420; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 5421; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 5422; GFX1132-NEXT: s_and_saveexec_b32 s2, vcc_lo 5423; GFX1132-NEXT: s_cbranch_execz .LBB20_2 5424; GFX1132-NEXT: ; %bb.1: 5425; GFX1132-NEXT: v_mov_b32_e32 v0, 5 5426; GFX1132-NEXT: v_mov_b32_e32 v1, 0 5427; GFX1132-NEXT: v_mov_b32_e32 v2, 0 5428; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5429; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 5430; GFX1132-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] 5431; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 5432; GFX1132-NEXT: buffer_gl0_inv 5433; GFX1132-NEXT: .LBB20_2: 5434; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2 5435; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 5436; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 5437; GFX1132-NEXT: v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc_lo 5438; GFX1132-NEXT: v_cndmask_b32_e64 
v0, 5, -1, vcc_lo 5439; GFX1132-NEXT: v_cmp_lt_i64_e32 vcc_lo, s[2:3], v[0:1] 5440; GFX1132-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo 5441; GFX1132-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo 5442; GFX1132-NEXT: s_mov_b32 s2, -1 5443; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 5444; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 5445; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 5446; GFX1132-NEXT: s_endpgm 5447entry: 5448 %old = atomicrmw min i64 addrspace(3)* @local_var64, i64 5 acq_rel 5449 store i64 %old, i64 addrspace(1)* %out 5450 ret void 5451} 5452 5453define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) { 5454; 5455; 5456; GFX7LESS-LABEL: umax_i32_varying: 5457; GFX7LESS: ; %bb.0: ; %entry 5458; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 5459; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 5460; GFX7LESS-NEXT: s_mov_b32 m0, -1 5461; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 5462; GFX7LESS-NEXT: ds_max_rtn_u32 v0, v1, v0 5463; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 5464; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 5465; GFX7LESS-NEXT: s_mov_b32 s2, -1 5466; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 5467; GFX7LESS-NEXT: s_endpgm 5468; 5469; GFX8-LABEL: umax_i32_varying: 5470; GFX8: ; %bb.0: ; %entry 5471; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 5472; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 5473; GFX8-NEXT: v_mov_b32_e32 v1, 0 5474; GFX8-NEXT: s_mov_b64 exec, s[2:3] 5475; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 5476; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 5477; GFX8-NEXT: v_mov_b32_e32 v2, v0 5478; GFX8-NEXT: s_not_b64 exec, exec 5479; GFX8-NEXT: v_mov_b32_e32 v2, 0 5480; GFX8-NEXT: s_not_b64 exec, exec 5481; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 5482; GFX8-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 5483; GFX8-NEXT: s_nop 1 5484; GFX8-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 5485; GFX8-NEXT: s_nop 1 5486; GFX8-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf 
bank_mask:0xf bound_ctrl:1 5487; GFX8-NEXT: s_nop 1 5488; GFX8-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 5489; GFX8-NEXT: s_nop 1 5490; GFX8-NEXT: v_max_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 5491; GFX8-NEXT: s_nop 1 5492; GFX8-NEXT: v_max_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 5493; GFX8-NEXT: v_readlane_b32 s4, v2, 63 5494; GFX8-NEXT: s_nop 0 5495; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 5496; GFX8-NEXT: s_mov_b64 exec, s[2:3] 5497; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 5498; GFX8-NEXT: ; implicit-def: $vgpr0 5499; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 5500; GFX8-NEXT: s_cbranch_execz .LBB21_2 5501; GFX8-NEXT: ; %bb.1: 5502; GFX8-NEXT: v_mov_b32_e32 v0, 0 5503; GFX8-NEXT: v_mov_b32_e32 v3, s4 5504; GFX8-NEXT: s_mov_b32 m0, -1 5505; GFX8-NEXT: s_waitcnt lgkmcnt(0) 5506; GFX8-NEXT: ds_max_rtn_u32 v0, v0, v3 5507; GFX8-NEXT: s_waitcnt lgkmcnt(0) 5508; GFX8-NEXT: .LBB21_2: 5509; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 5510; GFX8-NEXT: s_waitcnt lgkmcnt(0) 5511; GFX8-NEXT: v_readfirstlane_b32 s2, v0 5512; GFX8-NEXT: v_mov_b32_e32 v0, v1 5513; GFX8-NEXT: v_max_u32_e32 v0, s2, v0 5514; GFX8-NEXT: s_mov_b32 s3, 0xf000 5515; GFX8-NEXT: s_mov_b32 s2, -1 5516; GFX8-NEXT: s_nop 0 5517; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 5518; GFX8-NEXT: s_endpgm 5519; 5520; GFX9-LABEL: umax_i32_varying: 5521; GFX9: ; %bb.0: ; %entry 5522; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 5523; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 5524; GFX9-NEXT: v_mov_b32_e32 v1, 0 5525; GFX9-NEXT: s_mov_b64 exec, s[2:3] 5526; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 5527; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 5528; GFX9-NEXT: v_mov_b32_e32 v2, v0 5529; GFX9-NEXT: s_not_b64 exec, exec 5530; GFX9-NEXT: v_mov_b32_e32 v2, 0 5531; GFX9-NEXT: s_not_b64 exec, exec 5532; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 5533; GFX9-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf 
bank_mask:0xf bound_ctrl:1 5534; GFX9-NEXT: s_nop 1 5535; GFX9-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 5536; GFX9-NEXT: s_nop 1 5537; GFX9-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 5538; GFX9-NEXT: s_nop 1 5539; GFX9-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 5540; GFX9-NEXT: s_nop 1 5541; GFX9-NEXT: v_max_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 5542; GFX9-NEXT: s_nop 1 5543; GFX9-NEXT: v_max_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 5544; GFX9-NEXT: v_readlane_b32 s4, v2, 63 5545; GFX9-NEXT: s_nop 0 5546; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 5547; GFX9-NEXT: s_mov_b64 exec, s[2:3] 5548; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 5549; GFX9-NEXT: ; implicit-def: $vgpr0 5550; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 5551; GFX9-NEXT: s_cbranch_execz .LBB21_2 5552; GFX9-NEXT: ; %bb.1: 5553; GFX9-NEXT: v_mov_b32_e32 v0, 0 5554; GFX9-NEXT: v_mov_b32_e32 v3, s4 5555; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5556; GFX9-NEXT: ds_max_rtn_u32 v0, v0, v3 5557; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5558; GFX9-NEXT: .LBB21_2: 5559; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 5560; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5561; GFX9-NEXT: v_readfirstlane_b32 s2, v0 5562; GFX9-NEXT: v_mov_b32_e32 v0, v1 5563; GFX9-NEXT: v_max_u32_e32 v0, s2, v0 5564; GFX9-NEXT: s_mov_b32 s3, 0xf000 5565; GFX9-NEXT: s_mov_b32 s2, -1 5566; GFX9-NEXT: s_nop 0 5567; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 5568; GFX9-NEXT: s_endpgm 5569; 5570; GFX1064-LABEL: umax_i32_varying: 5571; GFX1064: ; %bb.0: ; %entry 5572; GFX1064-NEXT: v_mov_b32_e32 v1, v0 5573; GFX1064-NEXT: s_not_b64 exec, exec 5574; GFX1064-NEXT: v_mov_b32_e32 v1, 0 5575; GFX1064-NEXT: s_not_b64 exec, exec 5576; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 5577; GFX1064-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 5578; GFX1064-NEXT: 
v_mov_b32_e32 v3, 0 5579; GFX1064-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 5580; GFX1064-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 5581; GFX1064-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 5582; GFX1064-NEXT: v_mov_b32_e32 v2, v1 5583; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 5584; GFX1064-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 5585; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 5586; GFX1064-NEXT: v_mov_b32_e32 v2, s4 5587; GFX1064-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 5588; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 5589; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 5590; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 5591; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 5592; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 5593; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 5594; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 5595; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 5596; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 5597; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 5598; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 5599; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 5600; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 5601; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 5602; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 5603; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 5604; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 5605; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 5606; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 5607; GFX1064-NEXT: s_mov_b32 s2, -1 5608; GFX1064-NEXT: ; implicit-def: $vgpr0 5609; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 5610; GFX1064-NEXT: s_cbranch_execz .LBB21_2 5611; GFX1064-NEXT: ; %bb.1: 5612; GFX1064-NEXT: v_mov_b32_e32 v0, 0 5613; GFX1064-NEXT: v_mov_b32_e32 v4, s7 5614; GFX1064-NEXT: s_mov_b32 s3, s7 5615; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5616; GFX1064-NEXT: 
s_waitcnt_vscnt null, 0x0 5617; GFX1064-NEXT: ds_max_rtn_u32 v0, v0, v4 5618; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 5619; GFX1064-NEXT: buffer_gl0_inv 5620; GFX1064-NEXT: .LBB21_2: 5621; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 5622; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 5623; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 5624; GFX1064-NEXT: v_mov_b32_e32 v0, v3 5625; GFX1064-NEXT: v_max_u32_e32 v0, s3, v0 5626; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 5627; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 5628; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 5629; GFX1064-NEXT: s_endpgm 5630; 5631; GFX1032-LABEL: umax_i32_varying: 5632; GFX1032: ; %bb.0: ; %entry 5633; GFX1032-NEXT: v_mov_b32_e32 v1, v0 5634; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 5635; GFX1032-NEXT: v_mov_b32_e32 v1, 0 5636; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 5637; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 5638; GFX1032-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 5639; GFX1032-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 5640; GFX1032-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 5641; GFX1032-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 5642; GFX1032-NEXT: v_mov_b32_e32 v2, v1 5643; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 5644; GFX1032-NEXT: s_mov_b32 exec_lo, s2 5645; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 5646; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 5647; GFX1032-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 5648; GFX1032-NEXT: v_mov_b32_e32 v3, 0 5649; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 5650; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 5651; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 5652; GFX1032-NEXT: s_mov_b32 exec_lo, s2 5653; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 5654; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 5655; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 5656; 
GFX1032-NEXT: s_mov_b32 exec_lo, s2 5657; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 5658; GFX1032-NEXT: s_mov_b32 s2, -1 5659; GFX1032-NEXT: ; implicit-def: $vgpr0 5660; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 5661; GFX1032-NEXT: s_cbranch_execz .LBB21_2 5662; GFX1032-NEXT: ; %bb.1: 5663; GFX1032-NEXT: v_mov_b32_e32 v0, 0 5664; GFX1032-NEXT: v_mov_b32_e32 v4, s4 5665; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5666; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 5667; GFX1032-NEXT: ds_max_rtn_u32 v0, v0, v4 5668; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 5669; GFX1032-NEXT: buffer_gl0_inv 5670; GFX1032-NEXT: .LBB21_2: 5671; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 5672; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 5673; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 5674; GFX1032-NEXT: v_mov_b32_e32 v0, v3 5675; GFX1032-NEXT: v_max_u32_e32 v0, s3, v0 5676; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 5677; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 5678; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 5679; GFX1032-NEXT: s_endpgm 5680; 5681; GFX1164-LABEL: umax_i32_varying: 5682; GFX1164: ; %bb.0: ; %entry 5683; GFX1164-NEXT: v_mov_b32_e32 v1, v0 5684; GFX1164-NEXT: s_not_b64 exec, exec 5685; GFX1164-NEXT: v_mov_b32_e32 v1, 0 5686; GFX1164-NEXT: s_not_b64 exec, exec 5687; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 5688; GFX1164-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 5689; GFX1164-NEXT: v_mov_b32_e32 v3, 0 5690; GFX1164-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 5691; GFX1164-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 5692; GFX1164-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 5693; GFX1164-NEXT: v_mov_b32_e32 v2, v1 5694; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1 5695; GFX1164-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 5696; GFX1164-NEXT: v_readlane_b32 s4, v1, 31 5697; GFX1164-NEXT: v_mov_b32_e32 v2, 
s4 5698; GFX1164-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 5699; GFX1164-NEXT: v_readlane_b32 s4, v1, 15 5700; GFX1164-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 5701; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 5702; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 5703; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 5704; GFX1164-NEXT: v_readlane_b32 s5, v1, 31 5705; GFX1164-NEXT: v_writelane_b32 v3, s4, 16 5706; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 5707; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 5708; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 5709; GFX1164-NEXT: v_readlane_b32 s7, v1, 63 5710; GFX1164-NEXT: v_readlane_b32 s6, v1, 47 5711; GFX1164-NEXT: v_writelane_b32 v3, s5, 32 5712; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 5713; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 5714; GFX1164-NEXT: s_or_saveexec_b64 s[4:5], -1 5715; GFX1164-NEXT: v_writelane_b32 v3, s6, 48 5716; GFX1164-NEXT: s_mov_b64 exec, s[4:5] 5717; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 5718; GFX1164-NEXT: s_mov_b32 s2, -1 5719; GFX1164-NEXT: ; implicit-def: $vgpr0 5720; GFX1164-NEXT: s_and_saveexec_b64 s[4:5], vcc 5721; GFX1164-NEXT: s_cbranch_execz .LBB21_2 5722; GFX1164-NEXT: ; %bb.1: 5723; GFX1164-NEXT: v_mov_b32_e32 v0, 0 5724; GFX1164-NEXT: v_mov_b32_e32 v4, s7 5725; GFX1164-NEXT: s_mov_b32 s3, s7 5726; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5727; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 5728; GFX1164-NEXT: ds_max_rtn_u32 v0, v0, v4 5729; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 5730; GFX1164-NEXT: buffer_gl0_inv 5731; GFX1164-NEXT: .LBB21_2: 5732; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] 5733; GFX1164-NEXT: v_readfirstlane_b32 s3, v0 5734; GFX1164-NEXT: v_mov_b32_e32 v0, v3 5735; GFX1164-NEXT: v_max_u32_e32 v0, s3, v0 5736; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 5737; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 5738; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 5739; GFX1164-NEXT: s_endpgm 5740; 5741; GFX1132-LABEL: umax_i32_varying: 5742; 
GFX1132: ; %bb.0: ; %entry 5743; GFX1132-NEXT: v_mov_b32_e32 v1, v0 5744; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo 5745; GFX1132-NEXT: v_mov_b32_e32 v1, 0 5746; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo 5747; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 5748; GFX1132-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 5749; GFX1132-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 5750; GFX1132-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 5751; GFX1132-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 5752; GFX1132-NEXT: v_mov_b32_e32 v2, v1 5753; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1 5754; GFX1132-NEXT: s_mov_b32 exec_lo, s2 5755; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 5756; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 5757; GFX1132-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 5758; GFX1132-NEXT: v_mov_b32_e32 v3, 0 5759; GFX1132-NEXT: v_readlane_b32 s3, v1, 15 5760; GFX1132-NEXT: v_readlane_b32 s4, v1, 31 5761; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 5762; GFX1132-NEXT: s_mov_b32 exec_lo, s2 5763; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 5764; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 5765; GFX1132-NEXT: v_writelane_b32 v3, s3, 16 5766; GFX1132-NEXT: s_mov_b32 exec_lo, s2 5767; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 5768; GFX1132-NEXT: s_mov_b32 s2, -1 5769; GFX1132-NEXT: ; implicit-def: $vgpr0 5770; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo 5771; GFX1132-NEXT: s_cbranch_execz .LBB21_2 5772; GFX1132-NEXT: ; %bb.1: 5773; GFX1132-NEXT: v_mov_b32_e32 v0, 0 5774; GFX1132-NEXT: v_mov_b32_e32 v4, s4 5775; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5776; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 5777; GFX1132-NEXT: ds_max_rtn_u32 v0, v0, v4 5778; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 5779; GFX1132-NEXT: buffer_gl0_inv 5780; GFX1132-NEXT: .LBB21_2: 5781; GFX1132-NEXT: 
s_or_b32 exec_lo, exec_lo, s3 5782; GFX1132-NEXT: v_readfirstlane_b32 s3, v0 5783; GFX1132-NEXT: v_mov_b32_e32 v0, v3 5784; GFX1132-NEXT: v_max_u32_e32 v0, s3, v0 5785; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 5786; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 5787; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 5788; GFX1132-NEXT: s_endpgm 5789entry: 5790 %lane = call i32 @llvm.amdgcn.workitem.id.x() 5791 %old = atomicrmw umax i32 addrspace(3)* @local_var32, i32 %lane acq_rel 5792 store i32 %old, i32 addrspace(1)* %out 5793 ret void 5794} 5795 5796define amdgpu_kernel void @umax_i64_constant(i64 addrspace(1)* %out) { 5797; 5798; 5799; GFX7LESS-LABEL: umax_i64_constant: 5800; GFX7LESS: ; %bb.0: ; %entry 5801; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 5802; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 5803; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 5804; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 5805; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 5806; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc 5807; GFX7LESS-NEXT: s_cbranch_execz .LBB22_2 5808; GFX7LESS-NEXT: ; %bb.1: 5809; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0 5810; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5 5811; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 5812; GFX7LESS-NEXT: s_mov_b32 m0, -1 5813; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 5814; GFX7LESS-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] 5815; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 5816; GFX7LESS-NEXT: .LBB22_2: 5817; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] 5818; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 5819; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 5820; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 5821; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 5822; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 5823; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 5824; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4 5825; GFX7LESS-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[0:1] 5826; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 5827; GFX7LESS-NEXT: v_mov_b32_e32 v1, s5 5828; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, 
0, v1, vcc 5829; GFX7LESS-NEXT: s_mov_b32 s2, -1 5830; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 5831; GFX7LESS-NEXT: s_endpgm 5832; 5833; GFX8-LABEL: umax_i64_constant: 5834; GFX8: ; %bb.0: ; %entry 5835; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 5836; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 5837; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 5838; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 5839; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 5840; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 5841; GFX8-NEXT: s_cbranch_execz .LBB22_2 5842; GFX8-NEXT: ; %bb.1: 5843; GFX8-NEXT: v_mov_b32_e32 v0, 5 5844; GFX8-NEXT: v_mov_b32_e32 v2, 0 5845; GFX8-NEXT: v_mov_b32_e32 v1, 0 5846; GFX8-NEXT: s_mov_b32 m0, -1 5847; GFX8-NEXT: s_waitcnt lgkmcnt(0) 5848; GFX8-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] 5849; GFX8-NEXT: s_waitcnt lgkmcnt(0) 5850; GFX8-NEXT: .LBB22_2: 5851; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 5852; GFX8-NEXT: s_waitcnt lgkmcnt(0) 5853; GFX8-NEXT: v_readfirstlane_b32 s2, v0 5854; GFX8-NEXT: v_readfirstlane_b32 s3, v1 5855; GFX8-NEXT: v_mov_b32_e32 v1, 0 5856; GFX8-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 5857; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1] 5858; GFX8-NEXT: v_mov_b32_e32 v2, s2 5859; GFX8-NEXT: v_mov_b32_e32 v1, s3 5860; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 5861; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc 5862; GFX8-NEXT: s_mov_b32 s3, 0xf000 5863; GFX8-NEXT: s_mov_b32 s2, -1 5864; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 5865; GFX8-NEXT: s_endpgm 5866; 5867; GFX9-LABEL: umax_i64_constant: 5868; GFX9: ; %bb.0: ; %entry 5869; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 5870; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 5871; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 5872; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 5873; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 5874; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 5875; GFX9-NEXT: s_cbranch_execz .LBB22_2 5876; GFX9-NEXT: ; %bb.1: 5877; GFX9-NEXT: v_mov_b32_e32 v0, 5 
5878; GFX9-NEXT: v_mov_b32_e32 v1, 0 5879; GFX9-NEXT: v_mov_b32_e32 v2, 0 5880; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5881; GFX9-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] 5882; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5883; GFX9-NEXT: .LBB22_2: 5884; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 5885; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5886; GFX9-NEXT: v_readfirstlane_b32 s2, v0 5887; GFX9-NEXT: v_readfirstlane_b32 s3, v1 5888; GFX9-NEXT: v_mov_b32_e32 v1, 0 5889; GFX9-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 5890; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1] 5891; GFX9-NEXT: v_mov_b32_e32 v2, s2 5892; GFX9-NEXT: v_mov_b32_e32 v1, s3 5893; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 5894; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc 5895; GFX9-NEXT: s_mov_b32 s3, 0xf000 5896; GFX9-NEXT: s_mov_b32 s2, -1 5897; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 5898; GFX9-NEXT: s_endpgm 5899; 5900; GFX1064-LABEL: umax_i64_constant: 5901; GFX1064: ; %bb.0: ; %entry 5902; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 5903; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 5904; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 5905; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 5906; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 5907; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc 5908; GFX1064-NEXT: s_cbranch_execz .LBB22_2 5909; GFX1064-NEXT: ; %bb.1: 5910; GFX1064-NEXT: v_mov_b32_e32 v0, 5 5911; GFX1064-NEXT: v_mov_b32_e32 v1, 0 5912; GFX1064-NEXT: v_mov_b32_e32 v2, 0 5913; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5914; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 5915; GFX1064-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] 5916; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 5917; GFX1064-NEXT: buffer_gl0_inv 5918; GFX1064-NEXT: .LBB22_2: 5919; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 5920; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] 5921; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 5922; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 5923; GFX1064-NEXT: v_mov_b32_e32 v1, 0 5924; GFX1064-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 
5925; GFX1064-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1] 5926; GFX1064-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc 5927; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, s3, vcc 5928; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 5929; GFX1064-NEXT: s_mov_b32 s2, -1 5930; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 5931; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 5932; GFX1064-NEXT: s_endpgm 5933; 5934; GFX1032-LABEL: umax_i64_constant: 5935; GFX1032: ; %bb.0: ; %entry 5936; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 5937; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 5938; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 5939; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 5940; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo 5941; GFX1032-NEXT: s_cbranch_execz .LBB22_2 5942; GFX1032-NEXT: ; %bb.1: 5943; GFX1032-NEXT: v_mov_b32_e32 v0, 5 5944; GFX1032-NEXT: v_mov_b32_e32 v1, 0 5945; GFX1032-NEXT: v_mov_b32_e32 v2, 0 5946; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5947; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 5948; GFX1032-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] 5949; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 5950; GFX1032-NEXT: buffer_gl0_inv 5951; GFX1032-NEXT: .LBB22_2: 5952; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 5953; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 5954; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 5955; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 5956; GFX1032-NEXT: v_mov_b32_e32 v1, 0 5957; GFX1032-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc_lo 5958; GFX1032-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[2:3], v[0:1] 5959; GFX1032-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo 5960; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, s3, vcc_lo 5961; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 5962; GFX1032-NEXT: s_mov_b32 s2, -1 5963; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 5964; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 5965; GFX1032-NEXT: s_endpgm 5966; 5967; GFX1164-LABEL: umax_i64_constant: 5968; GFX1164: ; %bb.0: ; %entry 5969; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 5970; GFX1164-NEXT: 
v_mbcnt_lo_u32_b32 v0, exec_lo, 0 5971; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 5972; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 5973; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 5974; GFX1164-NEXT: s_and_saveexec_b64 s[2:3], vcc 5975; GFX1164-NEXT: s_cbranch_execz .LBB22_2 5976; GFX1164-NEXT: ; %bb.1: 5977; GFX1164-NEXT: v_mov_b32_e32 v0, 5 5978; GFX1164-NEXT: v_mov_b32_e32 v1, 0 5979; GFX1164-NEXT: v_mov_b32_e32 v2, 0 5980; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5981; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 5982; GFX1164-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] 5983; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 5984; GFX1164-NEXT: buffer_gl0_inv 5985; GFX1164-NEXT: .LBB22_2: 5986; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] 5987; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 5988; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 5989; GFX1164-NEXT: v_mov_b32_e32 v1, 0 5990; GFX1164-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 5991; GFX1164-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1] 5992; GFX1164-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc 5993; GFX1164-NEXT: v_cndmask_b32_e64 v1, 0, s3, vcc 5994; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 5995; GFX1164-NEXT: s_mov_b32 s2, -1 5996; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 5997; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 5998; GFX1164-NEXT: s_endpgm 5999; 6000; GFX1132-LABEL: umax_i64_constant: 6001; GFX1132: ; %bb.0: ; %entry 6002; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 6003; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 6004; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 6005; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 6006; GFX1132-NEXT: s_and_saveexec_b32 s2, vcc_lo 6007; GFX1132-NEXT: s_cbranch_execz .LBB22_2 6008; GFX1132-NEXT: ; %bb.1: 6009; GFX1132-NEXT: v_mov_b32_e32 v0, 5 6010; GFX1132-NEXT: v_mov_b32_e32 v1, 0 6011; GFX1132-NEXT: v_mov_b32_e32 v2, 0 6012; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6013; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 6014; GFX1132-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] 6015; GFX1132-NEXT: 
s_waitcnt lgkmcnt(0) 6016; GFX1132-NEXT: buffer_gl0_inv 6017; GFX1132-NEXT: .LBB22_2: 6018; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2 6019; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 6020; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 6021; GFX1132-NEXT: v_mov_b32_e32 v1, 0 6022; GFX1132-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc_lo 6023; GFX1132-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[2:3], v[0:1] 6024; GFX1132-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo 6025; GFX1132-NEXT: v_cndmask_b32_e64 v1, 0, s3, vcc_lo 6026; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 6027; GFX1132-NEXT: s_mov_b32 s2, -1 6028; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 6029; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 6030; GFX1132-NEXT: s_endpgm 6031entry: 6032 %old = atomicrmw umax i64 addrspace(3)* @local_var64, i64 5 acq_rel 6033 store i64 %old, i64 addrspace(1)* %out 6034 ret void 6035} 6036 6037define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) { 6038; 6039; 6040; GFX7LESS-LABEL: umin_i32_varying: 6041; GFX7LESS: ; %bb.0: ; %entry 6042; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 6043; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 6044; GFX7LESS-NEXT: s_mov_b32 m0, -1 6045; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 6046; GFX7LESS-NEXT: ds_min_rtn_u32 v0, v1, v0 6047; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 6048; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 6049; GFX7LESS-NEXT: s_mov_b32 s2, -1 6050; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 6051; GFX7LESS-NEXT: s_endpgm 6052; 6053; GFX8-LABEL: umin_i32_varying: 6054; GFX8: ; %bb.0: ; %entry 6055; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 6056; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 6057; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 6058; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 6059; GFX8-NEXT: v_mov_b32_e32 v1, -1 6060; GFX8-NEXT: s_mov_b64 exec, s[2:3] 6061; GFX8-NEXT: v_mov_b32_e32 v2, v0 6062; GFX8-NEXT: s_not_b64 exec, exec 6063; GFX8-NEXT: v_mov_b32_e32 v2, -1 6064; GFX8-NEXT: s_not_b64 exec, exec 6065; GFX8-NEXT: s_or_saveexec_b64 
s[2:3], -1 6066; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 6067; GFX8-NEXT: s_nop 1 6068; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 6069; GFX8-NEXT: s_nop 1 6070; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 6071; GFX8-NEXT: s_nop 1 6072; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 6073; GFX8-NEXT: s_nop 1 6074; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 6075; GFX8-NEXT: s_nop 1 6076; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 6077; GFX8-NEXT: v_readlane_b32 s4, v2, 63 6078; GFX8-NEXT: s_nop 0 6079; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 6080; GFX8-NEXT: s_mov_b64 exec, s[2:3] 6081; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 6082; GFX8-NEXT: ; implicit-def: $vgpr0 6083; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 6084; GFX8-NEXT: s_cbranch_execz .LBB23_2 6085; GFX8-NEXT: ; %bb.1: 6086; GFX8-NEXT: v_mov_b32_e32 v0, 0 6087; GFX8-NEXT: v_mov_b32_e32 v3, s4 6088; GFX8-NEXT: s_mov_b32 m0, -1 6089; GFX8-NEXT: s_waitcnt lgkmcnt(0) 6090; GFX8-NEXT: ds_min_rtn_u32 v0, v0, v3 6091; GFX8-NEXT: s_waitcnt lgkmcnt(0) 6092; GFX8-NEXT: .LBB23_2: 6093; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 6094; GFX8-NEXT: s_waitcnt lgkmcnt(0) 6095; GFX8-NEXT: v_readfirstlane_b32 s2, v0 6096; GFX8-NEXT: v_mov_b32_e32 v0, v1 6097; GFX8-NEXT: v_min_u32_e32 v0, s2, v0 6098; GFX8-NEXT: s_mov_b32 s3, 0xf000 6099; GFX8-NEXT: s_mov_b32 s2, -1 6100; GFX8-NEXT: s_nop 0 6101; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 6102; GFX8-NEXT: s_endpgm 6103; 6104; GFX9-LABEL: umin_i32_varying: 6105; GFX9: ; %bb.0: ; %entry 6106; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 6107; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 6108; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 6109; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 6110; GFX9-NEXT: v_mov_b32_e32 v1, -1 6111; GFX9-NEXT: s_mov_b64 exec, s[2:3] 6112; 
GFX9-NEXT: v_mov_b32_e32 v2, v0 6113; GFX9-NEXT: s_not_b64 exec, exec 6114; GFX9-NEXT: v_mov_b32_e32 v2, -1 6115; GFX9-NEXT: s_not_b64 exec, exec 6116; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 6117; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 6118; GFX9-NEXT: s_nop 1 6119; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 6120; GFX9-NEXT: s_nop 1 6121; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 6122; GFX9-NEXT: s_nop 1 6123; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 6124; GFX9-NEXT: s_nop 1 6125; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 6126; GFX9-NEXT: s_nop 1 6127; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 6128; GFX9-NEXT: v_readlane_b32 s4, v2, 63 6129; GFX9-NEXT: s_nop 0 6130; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 6131; GFX9-NEXT: s_mov_b64 exec, s[2:3] 6132; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 6133; GFX9-NEXT: ; implicit-def: $vgpr0 6134; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 6135; GFX9-NEXT: s_cbranch_execz .LBB23_2 6136; GFX9-NEXT: ; %bb.1: 6137; GFX9-NEXT: v_mov_b32_e32 v0, 0 6138; GFX9-NEXT: v_mov_b32_e32 v3, s4 6139; GFX9-NEXT: s_waitcnt lgkmcnt(0) 6140; GFX9-NEXT: ds_min_rtn_u32 v0, v0, v3 6141; GFX9-NEXT: s_waitcnt lgkmcnt(0) 6142; GFX9-NEXT: .LBB23_2: 6143; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 6144; GFX9-NEXT: s_waitcnt lgkmcnt(0) 6145; GFX9-NEXT: v_readfirstlane_b32 s2, v0 6146; GFX9-NEXT: v_mov_b32_e32 v0, v1 6147; GFX9-NEXT: v_min_u32_e32 v0, s2, v0 6148; GFX9-NEXT: s_mov_b32 s3, 0xf000 6149; GFX9-NEXT: s_mov_b32 s2, -1 6150; GFX9-NEXT: s_nop 0 6151; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 6152; GFX9-NEXT: s_endpgm 6153; 6154; GFX1064-LABEL: umin_i32_varying: 6155; GFX1064: ; %bb.0: ; %entry 6156; GFX1064-NEXT: v_mov_b32_e32 v1, v0 6157; GFX1064-NEXT: s_not_b64 exec, exec 6158; GFX1064-NEXT: v_mov_b32_e32 v1, -1 6159; 
GFX1064-NEXT: s_not_b64 exec, exec 6160; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 6161; GFX1064-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 6162; GFX1064-NEXT: v_mov_b32_e32 v3, -1 6163; GFX1064-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 6164; GFX1064-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 6165; GFX1064-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 6166; GFX1064-NEXT: v_mov_b32_e32 v2, v1 6167; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 6168; GFX1064-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 6169; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 6170; GFX1064-NEXT: v_mov_b32_e32 v2, s4 6171; GFX1064-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 6172; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 6173; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 6174; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 6175; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 6176; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 6177; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 6178; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 6179; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 6180; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 6181; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 6182; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 6183; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 6184; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 6185; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 6186; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 6187; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 6188; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 6189; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 6190; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 6191; GFX1064-NEXT: s_mov_b32 s2, -1 6192; GFX1064-NEXT: ; implicit-def: $vgpr0 6193; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 6194; GFX1064-NEXT: s_cbranch_execz .LBB23_2 6195; GFX1064-NEXT: ; %bb.1: 6196; GFX1064-NEXT: v_mov_b32_e32 v0, 0 
6197; GFX1064-NEXT: v_mov_b32_e32 v4, s7 6198; GFX1064-NEXT: s_mov_b32 s3, s7 6199; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6200; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 6201; GFX1064-NEXT: ds_min_rtn_u32 v0, v0, v4 6202; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 6203; GFX1064-NEXT: buffer_gl0_inv 6204; GFX1064-NEXT: .LBB23_2: 6205; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 6206; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 6207; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 6208; GFX1064-NEXT: v_mov_b32_e32 v0, v3 6209; GFX1064-NEXT: v_min_u32_e32 v0, s3, v0 6210; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 6211; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 6212; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 6213; GFX1064-NEXT: s_endpgm 6214; 6215; GFX1032-LABEL: umin_i32_varying: 6216; GFX1032: ; %bb.0: ; %entry 6217; GFX1032-NEXT: v_mov_b32_e32 v1, v0 6218; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 6219; GFX1032-NEXT: v_mov_b32_e32 v1, -1 6220; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 6221; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 6222; GFX1032-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 6223; GFX1032-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 6224; GFX1032-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 6225; GFX1032-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 6226; GFX1032-NEXT: v_mov_b32_e32 v2, v1 6227; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 6228; GFX1032-NEXT: s_mov_b32 exec_lo, s2 6229; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 6230; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 6231; GFX1032-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 6232; GFX1032-NEXT: v_mov_b32_e32 v3, -1 6233; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 6234; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 6235; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 6236; GFX1032-NEXT: s_mov_b32 exec_lo, s2 6237; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 6238; 
GFX1032-NEXT: s_or_saveexec_b32 s2, -1 6239; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 6240; GFX1032-NEXT: s_mov_b32 exec_lo, s2 6241; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 6242; GFX1032-NEXT: s_mov_b32 s2, -1 6243; GFX1032-NEXT: ; implicit-def: $vgpr0 6244; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 6245; GFX1032-NEXT: s_cbranch_execz .LBB23_2 6246; GFX1032-NEXT: ; %bb.1: 6247; GFX1032-NEXT: v_mov_b32_e32 v0, 0 6248; GFX1032-NEXT: v_mov_b32_e32 v4, s4 6249; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6250; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 6251; GFX1032-NEXT: ds_min_rtn_u32 v0, v0, v4 6252; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 6253; GFX1032-NEXT: buffer_gl0_inv 6254; GFX1032-NEXT: .LBB23_2: 6255; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 6256; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 6257; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 6258; GFX1032-NEXT: v_mov_b32_e32 v0, v3 6259; GFX1032-NEXT: v_min_u32_e32 v0, s3, v0 6260; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 6261; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 6262; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 6263; GFX1032-NEXT: s_endpgm 6264; 6265; GFX1164-LABEL: umin_i32_varying: 6266; GFX1164: ; %bb.0: ; %entry 6267; GFX1164-NEXT: v_mov_b32_e32 v1, v0 6268; GFX1164-NEXT: s_not_b64 exec, exec 6269; GFX1164-NEXT: v_mov_b32_e32 v1, -1 6270; GFX1164-NEXT: s_not_b64 exec, exec 6271; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 6272; GFX1164-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 6273; GFX1164-NEXT: v_mov_b32_e32 v3, -1 6274; GFX1164-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 6275; GFX1164-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 6276; GFX1164-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 6277; GFX1164-NEXT: v_mov_b32_e32 v2, v1 6278; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1 6279; GFX1164-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 6280; GFX1164-NEXT: v_readlane_b32 s4, 
v1, 31 6281; GFX1164-NEXT: v_mov_b32_e32 v2, s4 6282; GFX1164-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 6283; GFX1164-NEXT: v_readlane_b32 s4, v1, 15 6284; GFX1164-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 6285; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 6286; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 6287; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 6288; GFX1164-NEXT: v_readlane_b32 s5, v1, 31 6289; GFX1164-NEXT: v_writelane_b32 v3, s4, 16 6290; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 6291; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 6292; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 6293; GFX1164-NEXT: v_readlane_b32 s7, v1, 63 6294; GFX1164-NEXT: v_readlane_b32 s6, v1, 47 6295; GFX1164-NEXT: v_writelane_b32 v3, s5, 32 6296; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 6297; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 6298; GFX1164-NEXT: s_or_saveexec_b64 s[4:5], -1 6299; GFX1164-NEXT: v_writelane_b32 v3, s6, 48 6300; GFX1164-NEXT: s_mov_b64 exec, s[4:5] 6301; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 6302; GFX1164-NEXT: s_mov_b32 s2, -1 6303; GFX1164-NEXT: ; implicit-def: $vgpr0 6304; GFX1164-NEXT: s_and_saveexec_b64 s[4:5], vcc 6305; GFX1164-NEXT: s_cbranch_execz .LBB23_2 6306; GFX1164-NEXT: ; %bb.1: 6307; GFX1164-NEXT: v_mov_b32_e32 v0, 0 6308; GFX1164-NEXT: v_mov_b32_e32 v4, s7 6309; GFX1164-NEXT: s_mov_b32 s3, s7 6310; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6311; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 6312; GFX1164-NEXT: ds_min_rtn_u32 v0, v0, v4 6313; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 6314; GFX1164-NEXT: buffer_gl0_inv 6315; GFX1164-NEXT: .LBB23_2: 6316; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] 6317; GFX1164-NEXT: v_readfirstlane_b32 s3, v0 6318; GFX1164-NEXT: v_mov_b32_e32 v0, v3 6319; GFX1164-NEXT: v_min_u32_e32 v0, s3, v0 6320; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 6321; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 6322; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 6323; GFX1164-NEXT: s_endpgm 6324; 
6325; GFX1132-LABEL: umin_i32_varying: 6326; GFX1132: ; %bb.0: ; %entry 6327; GFX1132-NEXT: v_mov_b32_e32 v1, v0 6328; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo 6329; GFX1132-NEXT: v_mov_b32_e32 v1, -1 6330; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo 6331; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 6332; GFX1132-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 6333; GFX1132-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 6334; GFX1132-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 6335; GFX1132-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 6336; GFX1132-NEXT: v_mov_b32_e32 v2, v1 6337; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1 6338; GFX1132-NEXT: s_mov_b32 exec_lo, s2 6339; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 6340; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 6341; GFX1132-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 6342; GFX1132-NEXT: v_mov_b32_e32 v3, -1 6343; GFX1132-NEXT: v_readlane_b32 s3, v1, 15 6344; GFX1132-NEXT: v_readlane_b32 s4, v1, 31 6345; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 6346; GFX1132-NEXT: s_mov_b32 exec_lo, s2 6347; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 6348; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 6349; GFX1132-NEXT: v_writelane_b32 v3, s3, 16 6350; GFX1132-NEXT: s_mov_b32 exec_lo, s2 6351; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 6352; GFX1132-NEXT: s_mov_b32 s2, -1 6353; GFX1132-NEXT: ; implicit-def: $vgpr0 6354; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo 6355; GFX1132-NEXT: s_cbranch_execz .LBB23_2 6356; GFX1132-NEXT: ; %bb.1: 6357; GFX1132-NEXT: v_mov_b32_e32 v0, 0 6358; GFX1132-NEXT: v_mov_b32_e32 v4, s4 6359; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6360; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 6361; GFX1132-NEXT: ds_min_rtn_u32 v0, v0, v4 6362; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 6363; GFX1132-NEXT: buffer_gl0_inv 6364; GFX1132-NEXT: .LBB23_2: 6365; GFX1132-NEXT: s_or_b32 
exec_lo, exec_lo, s3 6366; GFX1132-NEXT: v_readfirstlane_b32 s3, v0 6367; GFX1132-NEXT: v_mov_b32_e32 v0, v3 6368; GFX1132-NEXT: v_min_u32_e32 v0, s3, v0 6369; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 6370; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 6371; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 6372; GFX1132-NEXT: s_endpgm 6373entry: 6374 %lane = call i32 @llvm.amdgcn.workitem.id.x() 6375 %old = atomicrmw umin i32 addrspace(3)* @local_var32, i32 %lane acq_rel 6376 store i32 %old, i32 addrspace(1)* %out 6377 ret void 6378} 6379 6380define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) { 6381; Constant-operand case for local (addrspace(3)) atomics. The kernel issues atomicrmw umin with the i64 constant 5 on @local_var64 (acq_rel) and stores the returned old value to %out. 6382; The per-target check lines that follow are autogenerated by utils/update_llc_test_checks.py (see the NOTE on the first line of this file), so regenerate them with that script instead of editing them by hand. 6383; GFX7LESS-LABEL: umin_i64_constant: 6384; GFX7LESS: ; %bb.0: ; %entry 6385; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 6386; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 6387; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 6388; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 6389; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 6390; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc 6391; GFX7LESS-NEXT: s_cbranch_execz .LBB24_2 6392; GFX7LESS-NEXT: ; %bb.1: 6393; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0 6394; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5 6395; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 6396; GFX7LESS-NEXT: s_mov_b32 m0, -1 6397; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 6398; GFX7LESS-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1] 6399; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 6400; GFX7LESS-NEXT: .LBB24_2: 6401; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] 6402; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 6403; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 6404; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 6405; GFX7LESS-NEXT: s_mov_b32 s2, -1 6406; GFX7LESS-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc 6407; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 6408; GFX7LESS-NEXT: v_mov_b32_e32 v2, s5 6409; GFX7LESS-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1] 6410; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 6411; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4 6412; GFX7LESS-NEXT: v_cndmask_b32_e32 
v0, v0, v2, vcc 6413; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 6414; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 6415; GFX7LESS-NEXT: s_endpgm 6416; 6417; GFX8-LABEL: umin_i64_constant: 6418; GFX8: ; %bb.0: ; %entry 6419; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 6420; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 6421; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 6422; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 6423; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 6424; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 6425; GFX8-NEXT: s_cbranch_execz .LBB24_2 6426; GFX8-NEXT: ; %bb.1: 6427; GFX8-NEXT: v_mov_b32_e32 v0, 5 6428; GFX8-NEXT: v_mov_b32_e32 v2, 0 6429; GFX8-NEXT: v_mov_b32_e32 v1, 0 6430; GFX8-NEXT: s_mov_b32 m0, -1 6431; GFX8-NEXT: s_waitcnt lgkmcnt(0) 6432; GFX8-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1] 6433; GFX8-NEXT: s_waitcnt lgkmcnt(0) 6434; GFX8-NEXT: .LBB24_2: 6435; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 6436; GFX8-NEXT: s_waitcnt lgkmcnt(0) 6437; GFX8-NEXT: v_readfirstlane_b32 s4, v0 6438; GFX8-NEXT: v_readfirstlane_b32 s5, v1 6439; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc 6440; GFX8-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 6441; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1] 6442; GFX8-NEXT: v_mov_b32_e32 v2, s5 6443; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 6444; GFX8-NEXT: v_mov_b32_e32 v2, s4 6445; GFX8-NEXT: s_mov_b32 s2, -1 6446; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 6447; GFX8-NEXT: s_mov_b32 s3, 0xf000 6448; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 6449; GFX8-NEXT: s_endpgm 6450; 6451; GFX9-LABEL: umin_i64_constant: 6452; GFX9: ; %bb.0: ; %entry 6453; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 6454; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 6455; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 6456; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 6457; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 6458; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 6459; GFX9-NEXT: s_cbranch_execz .LBB24_2 6460; GFX9-NEXT: ; %bb.1: 6461; 
GFX9-NEXT: v_mov_b32_e32 v0, 5 6462; GFX9-NEXT: v_mov_b32_e32 v1, 0 6463; GFX9-NEXT: v_mov_b32_e32 v2, 0 6464; GFX9-NEXT: s_waitcnt lgkmcnt(0) 6465; GFX9-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1] 6466; GFX9-NEXT: s_waitcnt lgkmcnt(0) 6467; GFX9-NEXT: .LBB24_2: 6468; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 6469; GFX9-NEXT: s_waitcnt lgkmcnt(0) 6470; GFX9-NEXT: v_readfirstlane_b32 s4, v0 6471; GFX9-NEXT: v_readfirstlane_b32 s5, v1 6472; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc 6473; GFX9-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 6474; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1] 6475; GFX9-NEXT: v_mov_b32_e32 v2, s5 6476; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 6477; GFX9-NEXT: v_mov_b32_e32 v2, s4 6478; GFX9-NEXT: s_mov_b32 s2, -1 6479; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 6480; GFX9-NEXT: s_mov_b32 s3, 0xf000 6481; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 6482; GFX9-NEXT: s_endpgm 6483; 6484; GFX1064-LABEL: umin_i64_constant: 6485; GFX1064: ; %bb.0: ; %entry 6486; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 6487; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 6488; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 6489; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 6490; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 6491; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc 6492; GFX1064-NEXT: s_cbranch_execz .LBB24_2 6493; GFX1064-NEXT: ; %bb.1: 6494; GFX1064-NEXT: v_mov_b32_e32 v0, 5 6495; GFX1064-NEXT: v_mov_b32_e32 v1, 0 6496; GFX1064-NEXT: v_mov_b32_e32 v2, 0 6497; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6498; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 6499; GFX1064-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1] 6500; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 6501; GFX1064-NEXT: buffer_gl0_inv 6502; GFX1064-NEXT: .LBB24_2: 6503; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 6504; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] 6505; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 6506; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 6507; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, -1, 
vcc 6508; GFX1064-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 6509; GFX1064-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1] 6510; GFX1064-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc 6511; GFX1064-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc 6512; GFX1064-NEXT: s_mov_b32 s2, -1 6513; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 6514; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 6515; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 6516; GFX1064-NEXT: s_endpgm 6517; 6518; GFX1032-LABEL: umin_i64_constant: 6519; GFX1032: ; %bb.0: ; %entry 6520; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 6521; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 6522; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 6523; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 6524; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo 6525; GFX1032-NEXT: s_cbranch_execz .LBB24_2 6526; GFX1032-NEXT: ; %bb.1: 6527; GFX1032-NEXT: v_mov_b32_e32 v0, 5 6528; GFX1032-NEXT: v_mov_b32_e32 v1, 0 6529; GFX1032-NEXT: v_mov_b32_e32 v2, 0 6530; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6531; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 6532; GFX1032-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1] 6533; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 6534; GFX1032-NEXT: buffer_gl0_inv 6535; GFX1032-NEXT: .LBB24_2: 6536; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 6537; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 6538; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 6539; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 6540; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo 6541; GFX1032-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc_lo 6542; GFX1032-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[2:3], v[0:1] 6543; GFX1032-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo 6544; GFX1032-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo 6545; GFX1032-NEXT: s_mov_b32 s2, -1 6546; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 6547; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 6548; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 6549; GFX1032-NEXT: s_endpgm 6550; 6551; GFX1164-LABEL: umin_i64_constant: 6552; GFX1164: ; %bb.0: ; %entry 
6553; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 6554; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 6555; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 6556; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 6557; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 6558; GFX1164-NEXT: s_and_saveexec_b64 s[2:3], vcc 6559; GFX1164-NEXT: s_cbranch_execz .LBB24_2 6560; GFX1164-NEXT: ; %bb.1: 6561; GFX1164-NEXT: v_mov_b32_e32 v0, 5 6562; GFX1164-NEXT: v_mov_b32_e32 v1, 0 6563; GFX1164-NEXT: v_mov_b32_e32 v2, 0 6564; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6565; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 6566; GFX1164-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1] 6567; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 6568; GFX1164-NEXT: buffer_gl0_inv 6569; GFX1164-NEXT: .LBB24_2: 6570; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] 6571; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 6572; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 6573; GFX1164-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc 6574; GFX1164-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 6575; GFX1164-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1] 6576; GFX1164-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc 6577; GFX1164-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc 6578; GFX1164-NEXT: s_mov_b32 s2, -1 6579; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 6580; GFX1164-NEXT: s_waitcnt lgkmcnt(0) 6581; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 6582; GFX1164-NEXT: s_endpgm 6583; 6584; GFX1132-LABEL: umin_i64_constant: 6585; GFX1132: ; %bb.0: ; %entry 6586; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 6587; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 6588; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 6589; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 6590; GFX1132-NEXT: s_and_saveexec_b32 s2, vcc_lo 6591; GFX1132-NEXT: s_cbranch_execz .LBB24_2 6592; GFX1132-NEXT: ; %bb.1: 6593; GFX1132-NEXT: v_mov_b32_e32 v0, 5 6594; GFX1132-NEXT: v_mov_b32_e32 v1, 0 6595; GFX1132-NEXT: v_mov_b32_e32 v2, 0 6596; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6597; GFX1132-NEXT: s_waitcnt_vscnt 
null, 0x0 6598; GFX1132-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1] 6599; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 6600; GFX1132-NEXT: buffer_gl0_inv 6601; GFX1132-NEXT: .LBB24_2: 6602; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2 6603; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 6604; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 6605; GFX1132-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo 6606; GFX1132-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc_lo 6607; GFX1132-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[2:3], v[0:1] 6608; GFX1132-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo 6609; GFX1132-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo 6610; GFX1132-NEXT: s_mov_b32 s2, -1 6611; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 6612; GFX1132-NEXT: s_waitcnt lgkmcnt(0) 6613; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 6614; GFX1132-NEXT: s_endpgm 6615entry: 6616 %old = atomicrmw umin i64 addrspace(3)* @local_var64, i64 5 acq_rel 6617 store i64 %old, i64 addrspace(1)* %out 6618 ret void 6619} 6620