1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX7LESS %s 3; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s 4; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s 5; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064 %s 6; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032 %s 7 8declare i32 @llvm.amdgcn.workitem.id.x() 9 10@local_var32 = addrspace(3) global i32 undef, align 4 11@local_var64 = addrspace(3) global i64 undef, align 8 12 13; Show what the atomic optimization pass will do for local pointers. 
14 15define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out) { 16; 17; 18; GFX7LESS-LABEL: add_i32_constant: 19; GFX7LESS: ; %bb.0: ; %entry 20; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 21; GFX7LESS-NEXT: v_cmp_ne_u32_e64 s[4:5], 1, 0 22; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 23; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 24; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 25; GFX7LESS-NEXT: ; implicit-def: $vgpr1 26; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc 27; GFX7LESS-NEXT: s_cbranch_execz BB0_2 28; GFX7LESS-NEXT: ; %bb.1: 29; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 30; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 31; GFX7LESS-NEXT: v_mul_u32_u24_e64 v2, s4, 5 32; GFX7LESS-NEXT: s_mov_b32 m0, -1 33; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 34; GFX7LESS-NEXT: ds_add_rtn_u32 v1, v1, v2 35; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 36; GFX7LESS-NEXT: buffer_wbinvl1 37; GFX7LESS-NEXT: BB0_2: 38; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] 39; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v1 40; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 41; GFX7LESS-NEXT: v_mad_u32_u24 v0, v0, 5, s2 42; GFX7LESS-NEXT: s_mov_b32 s2, -1 43; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 44; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 45; GFX7LESS-NEXT: s_endpgm 46; 47; GFX8-LABEL: add_i32_constant: 48; GFX8: ; %bb.0: ; %entry 49; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 50; GFX8-NEXT: v_cmp_ne_u32_e64 s[4:5], 1, 0 51; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 52; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 53; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 54; GFX8-NEXT: ; implicit-def: $vgpr1 55; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 56; GFX8-NEXT: s_cbranch_execz BB0_2 57; GFX8-NEXT: ; %bb.1: 58; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 59; GFX8-NEXT: v_mul_u32_u24_e64 v1, s4, 5 60; GFX8-NEXT: v_mov_b32_e32 v2, local_var32@abs32@lo 61; GFX8-NEXT: s_mov_b32 m0, -1 62; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 63; GFX8-NEXT: ds_add_rtn_u32 v1, v2, v1 64; 
GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 65; GFX8-NEXT: buffer_wbinvl1_vol 66; GFX8-NEXT: BB0_2: 67; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 68; GFX8-NEXT: v_readfirstlane_b32 s2, v1 69; GFX8-NEXT: v_mad_u32_u24 v0, v0, 5, s2 70; GFX8-NEXT: s_mov_b32 s3, 0xf000 71; GFX8-NEXT: s_mov_b32 s2, -1 72; GFX8-NEXT: s_nop 1 73; GFX8-NEXT: s_waitcnt lgkmcnt(0) 74; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 75; GFX8-NEXT: s_endpgm 76; 77; GFX9-LABEL: add_i32_constant: 78; GFX9: ; %bb.0: ; %entry 79; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 80; GFX9-NEXT: v_cmp_ne_u32_e64 s[4:5], 1, 0 81; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 82; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 83; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 84; GFX9-NEXT: ; implicit-def: $vgpr1 85; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 86; GFX9-NEXT: s_cbranch_execz BB0_2 87; GFX9-NEXT: ; %bb.1: 88; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 89; GFX9-NEXT: v_mul_u32_u24_e64 v1, s4, 5 90; GFX9-NEXT: v_mov_b32_e32 v2, local_var32@abs32@lo 91; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 92; GFX9-NEXT: ds_add_rtn_u32 v1, v2, v1 93; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 94; GFX9-NEXT: buffer_wbinvl1_vol 95; GFX9-NEXT: BB0_2: 96; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 97; GFX9-NEXT: v_readfirstlane_b32 s2, v1 98; GFX9-NEXT: v_mad_u32_u24 v0, v0, 5, s2 99; GFX9-NEXT: s_mov_b32 s3, 0xf000 100; GFX9-NEXT: s_mov_b32 s2, -1 101; GFX9-NEXT: s_nop 1 102; GFX9-NEXT: s_waitcnt lgkmcnt(0) 103; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 104; GFX9-NEXT: s_endpgm 105; 106; GFX1064-LABEL: add_i32_constant: 107; GFX1064: ; %bb.0: ; %entry 108; GFX1064-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 109; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 110; GFX1064-NEXT: ; implicit-def: $vgpr1 111; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 112; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s3, v0 113; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 114; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 115; GFX1064-NEXT: s_cbranch_execz BB0_2 116; 
GFX1064-NEXT: ; %bb.1: 117; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 118; GFX1064-NEXT: v_mov_b32_e32 v2, local_var32@abs32@lo 119; GFX1064-NEXT: v_mul_u32_u24_e64 v1, s2, 5 120; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 121; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 122; GFX1064-NEXT: ds_add_rtn_u32 v1, v2, v1 123; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 124; GFX1064-NEXT: buffer_gl0_inv 125; GFX1064-NEXT: buffer_gl1_inv 126; GFX1064-NEXT: BB0_2: 127; GFX1064-NEXT: v_nop 128; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 129; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 130; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 131; GFX1064-NEXT: v_mad_u32_u24 v0, v0, 5, s2 132; GFX1064-NEXT: s_mov_b32 s2, -1 133; GFX1064-NEXT: s_nop 1 134; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 135; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 136; GFX1064-NEXT: s_endpgm 137; 138; GFX1032-LABEL: add_i32_constant: 139; GFX1032: ; %bb.0: ; %entry 140; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 141; GFX1032-NEXT: v_cmp_ne_u32_e64 s3, 1, 0 142; GFX1032-NEXT: ; implicit-def: $vcc_hi 143; GFX1032-NEXT: ; implicit-def: $vgpr1 144; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s3, 0 145; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 146; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo 147; GFX1032-NEXT: s_cbranch_execz BB0_2 148; GFX1032-NEXT: ; %bb.1: 149; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 150; GFX1032-NEXT: v_mov_b32_e32 v2, local_var32@abs32@lo 151; GFX1032-NEXT: v_mul_u32_u24_e64 v1, s3, 5 152; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 153; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 154; GFX1032-NEXT: ds_add_rtn_u32 v1, v2, v1 155; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 156; GFX1032-NEXT: buffer_gl0_inv 157; GFX1032-NEXT: buffer_gl1_inv 158; GFX1032-NEXT: BB0_2: 159; GFX1032-NEXT: v_nop 160; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 161; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 162; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 163; GFX1032-NEXT: v_mad_u32_u24 v0, v0, 5, s2 164; GFX1032-NEXT: 
s_mov_b32 s2, -1 165; GFX1032-NEXT: s_nop 1 166; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 167; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 168; GFX1032-NEXT: s_endpgm 169entry: 170 %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 5 acq_rel 171 store i32 %old, i32 addrspace(1)* %out 172 ret void 173} 174 175define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 %additive) { 176; 177; 178; GFX7LESS-LABEL: add_i32_uniform: 179; GFX7LESS: ; %bb.0: ; %entry 180; GFX7LESS-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 181; GFX7LESS-NEXT: s_load_dword s2, s[0:1], 0xb 182; GFX7LESS-NEXT: v_cmp_ne_u32_e64 s[6:7], 1, 0 183; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 184; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s7, v0 185; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 186; GFX7LESS-NEXT: ; implicit-def: $vgpr1 187; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc 188; GFX7LESS-NEXT: s_cbranch_execz BB1_2 189; GFX7LESS-NEXT: ; %bb.1: 190; GFX7LESS-NEXT: s_bcnt1_i32_b64 s3, s[6:7] 191; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 192; GFX7LESS-NEXT: s_mul_i32 s3, s2, s3 193; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 194; GFX7LESS-NEXT: v_mov_b32_e32 v2, s3 195; GFX7LESS-NEXT: s_mov_b32 m0, -1 196; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 197; GFX7LESS-NEXT: ds_add_rtn_u32 v1, v1, v2 198; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 199; GFX7LESS-NEXT: buffer_wbinvl1 200; GFX7LESS-NEXT: BB1_2: 201; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1] 202; GFX7LESS-NEXT: v_readfirstlane_b32 s0, v1 203; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 204; GFX7LESS-NEXT: v_mul_lo_u32 v0, s2, v0 205; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 206; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s0, v0 207; GFX7LESS-NEXT: s_mov_b32 s6, -1 208; GFX7LESS-NEXT: buffer_store_dword v0, off, s[4:7], 0 209; GFX7LESS-NEXT: s_endpgm 210; 211; GFX8-LABEL: add_i32_uniform: 212; GFX8: ; %bb.0: ; %entry 213; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 214; GFX8-NEXT: s_load_dword s0, s[0:1], 0x2c 215; 
GFX8-NEXT: v_cmp_ne_u32_e64 s[6:7], 1, 0 216; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 217; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 218; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 219; GFX8-NEXT: ; implicit-def: $vgpr1 220; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 221; GFX8-NEXT: s_cbranch_execz BB1_2 222; GFX8-NEXT: ; %bb.1: 223; GFX8-NEXT: s_bcnt1_i32_b64 s1, s[6:7] 224; GFX8-NEXT: s_waitcnt lgkmcnt(0) 225; GFX8-NEXT: s_mul_i32 s1, s0, s1 226; GFX8-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 227; GFX8-NEXT: v_mov_b32_e32 v2, s1 228; GFX8-NEXT: s_mov_b32 m0, -1 229; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 230; GFX8-NEXT: ds_add_rtn_u32 v1, v1, v2 231; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 232; GFX8-NEXT: buffer_wbinvl1_vol 233; GFX8-NEXT: BB1_2: 234; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 235; GFX8-NEXT: s_waitcnt lgkmcnt(0) 236; GFX8-NEXT: v_mul_lo_u32 v0, s0, v0 237; GFX8-NEXT: v_readfirstlane_b32 s0, v1 238; GFX8-NEXT: s_mov_b32 s7, 0xf000 239; GFX8-NEXT: s_mov_b32 s6, -1 240; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 241; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0 242; GFX8-NEXT: s_endpgm 243; 244; GFX9-LABEL: add_i32_uniform: 245; GFX9: ; %bb.0: ; %entry 246; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 247; GFX9-NEXT: s_load_dword s0, s[0:1], 0x2c 248; GFX9-NEXT: v_cmp_ne_u32_e64 s[6:7], 1, 0 249; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 250; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 251; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 252; GFX9-NEXT: ; implicit-def: $vgpr1 253; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 254; GFX9-NEXT: s_cbranch_execz BB1_2 255; GFX9-NEXT: ; %bb.1: 256; GFX9-NEXT: s_bcnt1_i32_b64 s1, s[6:7] 257; GFX9-NEXT: s_waitcnt lgkmcnt(0) 258; GFX9-NEXT: s_mul_i32 s1, s0, s1 259; GFX9-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 260; GFX9-NEXT: v_mov_b32_e32 v2, s1 261; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 262; GFX9-NEXT: ds_add_rtn_u32 v1, v1, v2 263; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 264; GFX9-NEXT: buffer_wbinvl1_vol 265; 
GFX9-NEXT: BB1_2: 266; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 267; GFX9-NEXT: s_waitcnt lgkmcnt(0) 268; GFX9-NEXT: v_mul_lo_u32 v0, s0, v0 269; GFX9-NEXT: v_readfirstlane_b32 s0, v1 270; GFX9-NEXT: s_mov_b32 s7, 0xf000 271; GFX9-NEXT: s_mov_b32 s6, -1 272; GFX9-NEXT: v_add_u32_e32 v0, s0, v0 273; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 274; GFX9-NEXT: s_endpgm 275; 276; GFX1064-LABEL: add_i32_uniform: 277; GFX1064: ; %bb.0: ; %entry 278; GFX1064-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 279; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 280; GFX1064-NEXT: s_load_dword s0, s[0:1], 0x2c 281; GFX1064-NEXT: ; implicit-def: $vgpr1 282; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 283; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s3, v0 284; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 285; GFX1064-NEXT: s_and_saveexec_b64 s[6:7], vcc 286; GFX1064-NEXT: s_cbranch_execz BB1_2 287; GFX1064-NEXT: ; %bb.1: 288; GFX1064-NEXT: s_bcnt1_i32_b64 s1, s[2:3] 289; GFX1064-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 290; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 291; GFX1064-NEXT: s_mul_i32 s1, s0, s1 292; GFX1064-NEXT: v_mov_b32_e32 v2, s1 293; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 294; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 295; GFX1064-NEXT: ds_add_rtn_u32 v1, v1, v2 296; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 297; GFX1064-NEXT: buffer_gl0_inv 298; GFX1064-NEXT: buffer_gl1_inv 299; GFX1064-NEXT: BB1_2: 300; GFX1064-NEXT: v_nop 301; GFX1064-NEXT: s_or_b64 exec, exec, s[6:7] 302; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 303; GFX1064-NEXT: v_mul_lo_u32 v0, s0, v0 304; GFX1064-NEXT: v_readfirstlane_b32 s0, v1 305; GFX1064-NEXT: s_mov_b32 s7, 0x31016000 306; GFX1064-NEXT: s_mov_b32 s6, -1 307; GFX1064-NEXT: v_add_nc_u32_e32 v0, s0, v0 308; GFX1064-NEXT: buffer_store_dword v0, off, s[4:7], 0 309; GFX1064-NEXT: s_endpgm 310; 311; GFX1032-LABEL: add_i32_uniform: 312; GFX1032: ; %bb.0: ; %entry 313; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 314; GFX1032-NEXT: s_load_dword s0, 
s[0:1], 0x2c 315; GFX1032-NEXT: v_cmp_ne_u32_e64 s2, 1, 0 316; GFX1032-NEXT: ; implicit-def: $vcc_hi 317; GFX1032-NEXT: ; implicit-def: $vgpr1 318; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 319; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 320; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo 321; GFX1032-NEXT: s_cbranch_execz BB1_2 322; GFX1032-NEXT: ; %bb.1: 323; GFX1032-NEXT: s_bcnt1_i32_b32 s2, s2 324; GFX1032-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 325; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 326; GFX1032-NEXT: s_mul_i32 s2, s0, s2 327; GFX1032-NEXT: v_mov_b32_e32 v2, s2 328; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 329; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 330; GFX1032-NEXT: ds_add_rtn_u32 v1, v1, v2 331; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 332; GFX1032-NEXT: buffer_gl0_inv 333; GFX1032-NEXT: buffer_gl1_inv 334; GFX1032-NEXT: BB1_2: 335; GFX1032-NEXT: v_nop 336; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s1 337; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 338; GFX1032-NEXT: v_mul_lo_u32 v0, s0, v0 339; GFX1032-NEXT: v_readfirstlane_b32 s0, v1 340; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 341; GFX1032-NEXT: s_mov_b32 s6, -1 342; GFX1032-NEXT: v_add_nc_u32_e32 v0, s0, v0 343; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0 344; GFX1032-NEXT: s_endpgm 345entry: 346 %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %additive acq_rel 347 store i32 %old, i32 addrspace(1)* %out 348 ret void 349} 350 351; GFX7LESS-NOT: v_mbcnt_lo_u32_b32 352; GFX7LESS-NOT: v_mbcnt_hi_u32_b32 353; GFX7LESS-NOT: s_bcnt1_i32_b64 354; DPPCOMB: v_add_u32_dpp 355; DPPCOMB: v_add_u32_dpp 356; GFX8MORE32: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 31 357; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]] 358; GFX8MORE: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]] 359define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out) { 360; 361; 362; GFX7LESS-LABEL: add_i32_varying: 363; GFX7LESS: ; %bb.0: ; %entry 364; GFX7LESS-NEXT: 
s_load_dwordx2 s[0:1], s[0:1], 0x9 365; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 366; GFX7LESS-NEXT: s_mov_b32 m0, -1 367; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 368; GFX7LESS-NEXT: ds_add_rtn_u32 v0, v1, v0 369; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 370; GFX7LESS-NEXT: buffer_wbinvl1 371; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 372; GFX7LESS-NEXT: s_mov_b32 s2, -1 373; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 374; GFX7LESS-NEXT: s_endpgm 375; 376; GFX8-LABEL: add_i32_varying: 377; GFX8: ; %bb.0: ; %entry 378; GFX8-NEXT: v_mov_b32_e32 v2, v0 379; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 380; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 381; GFX8-NEXT: v_mov_b32_e32 v1, 0 382; GFX8-NEXT: s_mov_b64 exec, s[2:3] 383; GFX8-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 384; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 385; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 386; GFX8-NEXT: s_not_b64 exec, exec 387; GFX8-NEXT: v_mov_b32_e32 v2, 0 388; GFX8-NEXT: s_not_b64 exec, exec 389; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 390; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 391; GFX8-NEXT: s_nop 1 392; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 393; GFX8-NEXT: s_nop 1 394; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 395; GFX8-NEXT: s_nop 1 396; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 397; GFX8-NEXT: s_nop 1 398; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 399; GFX8-NEXT: s_nop 1 400; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 401; GFX8-NEXT: v_readlane_b32 s2, v2, 63 402; GFX8-NEXT: s_nop 0 403; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 404; GFX8-NEXT: s_mov_b64 exec, s[4:5] 405; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 406; GFX8-NEXT: ; implicit-def: $vgpr0 407; 
GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 408; GFX8-NEXT: s_cbranch_execz BB2_2 409; GFX8-NEXT: ; %bb.1: 410; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 411; GFX8-NEXT: v_mov_b32_e32 v3, s2 412; GFX8-NEXT: s_mov_b32 m0, -1 413; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 414; GFX8-NEXT: ds_add_rtn_u32 v0, v0, v3 415; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 416; GFX8-NEXT: buffer_wbinvl1_vol 417; GFX8-NEXT: BB2_2: 418; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 419; GFX8-NEXT: v_readfirstlane_b32 s2, v0 420; GFX8-NEXT: v_mov_b32_e32 v0, v1 421; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0 422; GFX8-NEXT: s_mov_b32 s3, 0xf000 423; GFX8-NEXT: s_mov_b32 s2, -1 424; GFX8-NEXT: s_nop 0 425; GFX8-NEXT: s_waitcnt lgkmcnt(0) 426; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 427; GFX8-NEXT: s_endpgm 428; 429; GFX9-LABEL: add_i32_varying: 430; GFX9: ; %bb.0: ; %entry 431; GFX9-NEXT: v_mov_b32_e32 v2, v0 432; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 433; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 434; GFX9-NEXT: v_mov_b32_e32 v1, 0 435; GFX9-NEXT: s_mov_b64 exec, s[2:3] 436; GFX9-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 437; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 438; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 439; GFX9-NEXT: s_not_b64 exec, exec 440; GFX9-NEXT: v_mov_b32_e32 v2, 0 441; GFX9-NEXT: s_not_b64 exec, exec 442; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 443; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 444; GFX9-NEXT: s_nop 1 445; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 446; GFX9-NEXT: s_nop 1 447; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 448; GFX9-NEXT: s_nop 1 449; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 450; GFX9-NEXT: s_nop 1 451; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 452; GFX9-NEXT: s_nop 1 453; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 
row_mask:0xc bank_mask:0xf 454; GFX9-NEXT: v_readlane_b32 s2, v2, 63 455; GFX9-NEXT: s_nop 0 456; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 457; GFX9-NEXT: s_mov_b64 exec, s[4:5] 458; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 459; GFX9-NEXT: ; implicit-def: $vgpr0 460; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 461; GFX9-NEXT: s_cbranch_execz BB2_2 462; GFX9-NEXT: ; %bb.1: 463; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 464; GFX9-NEXT: v_mov_b32_e32 v3, s2 465; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 466; GFX9-NEXT: ds_add_rtn_u32 v0, v0, v3 467; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 468; GFX9-NEXT: buffer_wbinvl1_vol 469; GFX9-NEXT: BB2_2: 470; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 471; GFX9-NEXT: v_readfirstlane_b32 s2, v0 472; GFX9-NEXT: v_mov_b32_e32 v0, v1 473; GFX9-NEXT: v_add_u32_e32 v0, s2, v0 474; GFX9-NEXT: s_mov_b32 s3, 0xf000 475; GFX9-NEXT: s_mov_b32 s2, -1 476; GFX9-NEXT: s_nop 0 477; GFX9-NEXT: s_waitcnt lgkmcnt(0) 478; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 479; GFX9-NEXT: s_endpgm 480; 481; GFX1064-LABEL: add_i32_varying: 482; GFX1064: ; %bb.0: ; %entry 483; GFX1064-NEXT: v_mov_b32_e32 v2, v0 484; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 485; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 486; GFX1064-NEXT: v_mov_b32_e32 v1, 0 487; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 488; GFX1064-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 489; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 490; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s3, v0 491; GFX1064-NEXT: s_not_b64 exec, exec 492; GFX1064-NEXT: v_mov_b32_e32 v2, 0 493; GFX1064-NEXT: s_not_b64 exec, exec 494; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 495; GFX1064-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 496; GFX1064-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 497; GFX1064-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 498; GFX1064-NEXT: 
v_add_nc_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 499; GFX1064-NEXT: v_mov_b32_e32 v3, v2 500; GFX1064-NEXT: v_permlanex16_b32 v3, v3, -1, -1 501; GFX1064-NEXT: v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 502; GFX1064-NEXT: v_readlane_b32 s2, v2, 31 503; GFX1064-NEXT: v_mov_b32_e32 v3, s2 504; GFX1064-NEXT: v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 505; GFX1064-NEXT: v_readlane_b32 s2, v2, 15 506; GFX1064-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf 507; GFX1064-NEXT: v_readlane_b32 s3, v2, 31 508; GFX1064-NEXT: v_readlane_b32 s6, v2, 47 509; GFX1064-NEXT: v_writelane_b32 v1, s2, 16 510; GFX1064-NEXT: s_mov_b32 s2, -1 511; GFX1064-NEXT: v_writelane_b32 v1, s3, 32 512; GFX1064-NEXT: v_readlane_b32 s3, v2, 63 513; GFX1064-NEXT: v_writelane_b32 v1, s6, 48 514; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 515; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 516; GFX1064-NEXT: ; implicit-def: $vgpr0 517; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 518; GFX1064-NEXT: s_cbranch_execz BB2_2 519; GFX1064-NEXT: ; %bb.1: 520; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 521; GFX1064-NEXT: v_mov_b32_e32 v7, s3 522; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 523; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 524; GFX1064-NEXT: ds_add_rtn_u32 v0, v0, v7 525; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 526; GFX1064-NEXT: buffer_gl0_inv 527; GFX1064-NEXT: buffer_gl1_inv 528; GFX1064-NEXT: BB2_2: 529; GFX1064-NEXT: v_nop 530; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 531; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 532; GFX1064-NEXT: v_mov_b32_e32 v0, v1 533; GFX1064-NEXT: v_add_nc_u32_e32 v0, s3, v0 534; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 535; GFX1064-NEXT: s_nop 1 536; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 537; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 538; GFX1064-NEXT: s_endpgm 539; 540; GFX1032-LABEL: add_i32_varying: 541; GFX1032: ; %bb.0: ; %entry 542; GFX1032-NEXT: ; 
implicit-def: $vcc_hi 543; GFX1032-NEXT: v_mov_b32_e32 v2, v0 544; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 545; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 546; GFX1032-NEXT: v_mov_b32_e32 v1, 0 547; GFX1032-NEXT: s_mov_b32 exec_lo, s2 548; GFX1032-NEXT: v_cmp_ne_u32_e64 s2, 1, 0 549; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 550; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 551; GFX1032-NEXT: v_mov_b32_e32 v2, 0 552; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 553; GFX1032-NEXT: s_or_saveexec_b32 s4, -1 554; GFX1032-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 555; GFX1032-NEXT: s_mov_b32 s2, -1 556; GFX1032-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 557; GFX1032-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 558; GFX1032-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 559; GFX1032-NEXT: v_mov_b32_e32 v3, v2 560; GFX1032-NEXT: v_permlanex16_b32 v3, v3, -1, -1 561; GFX1032-NEXT: v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 562; GFX1032-NEXT: v_readlane_b32 s3, v2, 31 563; GFX1032-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf 564; GFX1032-NEXT: v_readlane_b32 s5, v2, 15 565; GFX1032-NEXT: v_writelane_b32 v1, s5, 16 566; GFX1032-NEXT: s_mov_b32 exec_lo, s4 567; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 568; GFX1032-NEXT: ; implicit-def: $vgpr0 569; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo 570; GFX1032-NEXT: s_cbranch_execz BB2_2 571; GFX1032-NEXT: ; %bb.1: 572; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 573; GFX1032-NEXT: v_mov_b32_e32 v7, s3 574; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 575; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 576; GFX1032-NEXT: ds_add_rtn_u32 v0, v0, v7 577; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 578; GFX1032-NEXT: buffer_gl0_inv 579; GFX1032-NEXT: buffer_gl1_inv 580; GFX1032-NEXT: BB2_2: 581; GFX1032-NEXT: v_nop 582; 
GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 583; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 584; GFX1032-NEXT: v_mov_b32_e32 v0, v1 585; GFX1032-NEXT: v_add_nc_u32_e32 v0, s3, v0 586; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 587; GFX1032-NEXT: s_nop 1 588; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 589; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 590; GFX1032-NEXT: s_endpgm 591entry: 592 %lane = call i32 @llvm.amdgcn.workitem.id.x() 593 %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %lane acq_rel 594 store i32 %old, i32 addrspace(1)* %out 595 ret void 596} 597 598define amdgpu_kernel void @add_i32_varying_gfx1032(i32 addrspace(1)* %out) { 599; 600; 601; GFX7LESS-LABEL: add_i32_varying_gfx1032: 602; GFX7LESS: ; %bb.0: ; %entry 603; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 604; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 605; GFX7LESS-NEXT: s_mov_b32 m0, -1 606; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 607; GFX7LESS-NEXT: ds_add_rtn_u32 v0, v1, v0 608; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 609; GFX7LESS-NEXT: buffer_wbinvl1 610; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 611; GFX7LESS-NEXT: s_mov_b32 s2, -1 612; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 613; GFX7LESS-NEXT: s_endpgm 614; 615; GFX8-LABEL: add_i32_varying_gfx1032: 616; GFX8: ; %bb.0: ; %entry 617; GFX8-NEXT: v_mov_b32_e32 v2, v0 618; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 619; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 620; GFX8-NEXT: v_mov_b32_e32 v1, 0 621; GFX8-NEXT: s_mov_b64 exec, s[2:3] 622; GFX8-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 623; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 624; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 625; GFX8-NEXT: s_not_b64 exec, exec 626; GFX8-NEXT: v_mov_b32_e32 v2, 0 627; GFX8-NEXT: s_not_b64 exec, exec 628; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 629; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 630; GFX8-NEXT: s_nop 1 631; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 
row_mask:0xf bank_mask:0xf bound_ctrl:0 632; GFX8-NEXT: s_nop 1 633; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 634; GFX8-NEXT: s_nop 1 635; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 636; GFX8-NEXT: s_nop 1 637; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 638; GFX8-NEXT: s_nop 1 639; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 640; GFX8-NEXT: v_readlane_b32 s2, v2, 63 641; GFX8-NEXT: s_nop 0 642; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 643; GFX8-NEXT: s_mov_b64 exec, s[4:5] 644; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 645; GFX8-NEXT: ; implicit-def: $vgpr0 646; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 647; GFX8-NEXT: s_cbranch_execz BB3_2 648; GFX8-NEXT: ; %bb.1: 649; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 650; GFX8-NEXT: v_mov_b32_e32 v3, s2 651; GFX8-NEXT: s_mov_b32 m0, -1 652; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 653; GFX8-NEXT: ds_add_rtn_u32 v0, v0, v3 654; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 655; GFX8-NEXT: buffer_wbinvl1_vol 656; GFX8-NEXT: BB3_2: 657; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 658; GFX8-NEXT: v_readfirstlane_b32 s2, v0 659; GFX8-NEXT: v_mov_b32_e32 v0, v1 660; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0 661; GFX8-NEXT: s_mov_b32 s3, 0xf000 662; GFX8-NEXT: s_mov_b32 s2, -1 663; GFX8-NEXT: s_nop 0 664; GFX8-NEXT: s_waitcnt lgkmcnt(0) 665; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 666; GFX8-NEXT: s_endpgm 667; 668; GFX9-LABEL: add_i32_varying_gfx1032: 669; GFX9: ; %bb.0: ; %entry 670; GFX9-NEXT: v_mov_b32_e32 v2, v0 671; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 672; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 673; GFX9-NEXT: v_mov_b32_e32 v1, 0 674; GFX9-NEXT: s_mov_b64 exec, s[2:3] 675; GFX9-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 676; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 677; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 678; 
GFX9-NEXT: s_not_b64 exec, exec 679; GFX9-NEXT: v_mov_b32_e32 v2, 0 680; GFX9-NEXT: s_not_b64 exec, exec 681; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 682; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 683; GFX9-NEXT: s_nop 1 684; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 685; GFX9-NEXT: s_nop 1 686; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 687; GFX9-NEXT: s_nop 1 688; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 689; GFX9-NEXT: s_nop 1 690; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 691; GFX9-NEXT: s_nop 1 692; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 693; GFX9-NEXT: v_readlane_b32 s2, v2, 63 694; GFX9-NEXT: s_nop 0 695; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 696; GFX9-NEXT: s_mov_b64 exec, s[4:5] 697; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 698; GFX9-NEXT: ; implicit-def: $vgpr0 699; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 700; GFX9-NEXT: s_cbranch_execz BB3_2 701; GFX9-NEXT: ; %bb.1: 702; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 703; GFX9-NEXT: v_mov_b32_e32 v3, s2 704; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 705; GFX9-NEXT: ds_add_rtn_u32 v0, v0, v3 706; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 707; GFX9-NEXT: buffer_wbinvl1_vol 708; GFX9-NEXT: BB3_2: 709; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 710; GFX9-NEXT: v_readfirstlane_b32 s2, v0 711; GFX9-NEXT: v_mov_b32_e32 v0, v1 712; GFX9-NEXT: v_add_u32_e32 v0, s2, v0 713; GFX9-NEXT: s_mov_b32 s3, 0xf000 714; GFX9-NEXT: s_mov_b32 s2, -1 715; GFX9-NEXT: s_nop 0 716; GFX9-NEXT: s_waitcnt lgkmcnt(0) 717; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 718; GFX9-NEXT: s_endpgm 719; 720; GFX1064-LABEL: add_i32_varying_gfx1032: 721; GFX1064: ; %bb.0: ; %entry 722; GFX1064-NEXT: v_mov_b32_e32 v2, v0 723; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 
724; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 725; GFX1064-NEXT: v_mov_b32_e32 v1, 0 726; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 727; GFX1064-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 728; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 729; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s3, v0 730; GFX1064-NEXT: s_not_b64 exec, exec 731; GFX1064-NEXT: v_mov_b32_e32 v2, 0 732; GFX1064-NEXT: s_not_b64 exec, exec 733; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 734; GFX1064-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 735; GFX1064-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 736; GFX1064-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 737; GFX1064-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 738; GFX1064-NEXT: v_mov_b32_e32 v3, v2 739; GFX1064-NEXT: v_permlanex16_b32 v3, v3, -1, -1 740; GFX1064-NEXT: v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 741; GFX1064-NEXT: v_readlane_b32 s2, v2, 31 742; GFX1064-NEXT: v_mov_b32_e32 v3, s2 743; GFX1064-NEXT: v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 744; GFX1064-NEXT: v_readlane_b32 s2, v2, 15 745; GFX1064-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf 746; GFX1064-NEXT: v_readlane_b32 s3, v2, 31 747; GFX1064-NEXT: v_readlane_b32 s6, v2, 47 748; GFX1064-NEXT: v_writelane_b32 v1, s2, 16 749; GFX1064-NEXT: s_mov_b32 s2, -1 750; GFX1064-NEXT: v_writelane_b32 v1, s3, 32 751; GFX1064-NEXT: v_readlane_b32 s3, v2, 63 752; GFX1064-NEXT: v_writelane_b32 v1, s6, 48 753; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 754; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 755; GFX1064-NEXT: ; implicit-def: $vgpr0 756; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 757; GFX1064-NEXT: s_cbranch_execz BB3_2 758; GFX1064-NEXT: ; %bb.1: 759; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 760; GFX1064-NEXT: v_mov_b32_e32 v7, s3 761; 
GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 762; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 763; GFX1064-NEXT: ds_add_rtn_u32 v0, v0, v7 764; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 765; GFX1064-NEXT: buffer_gl0_inv 766; GFX1064-NEXT: buffer_gl1_inv 767; GFX1064-NEXT: BB3_2: 768; GFX1064-NEXT: v_nop 769; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 770; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 771; GFX1064-NEXT: v_mov_b32_e32 v0, v1 772; GFX1064-NEXT: v_add_nc_u32_e32 v0, s3, v0 773; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 774; GFX1064-NEXT: s_nop 1 775; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 776; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 777; GFX1064-NEXT: s_endpgm 778; 779; GFX1032-LABEL: add_i32_varying_gfx1032: 780; GFX1032: ; %bb.0: ; %entry 781; GFX1032-NEXT: ; implicit-def: $vcc_hi 782; GFX1032-NEXT: v_mov_b32_e32 v2, v0 783; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 784; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 785; GFX1032-NEXT: v_mov_b32_e32 v1, 0 786; GFX1032-NEXT: s_mov_b32 exec_lo, s2 787; GFX1032-NEXT: v_cmp_ne_u32_e64 s2, 1, 0 788; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 789; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 790; GFX1032-NEXT: v_mov_b32_e32 v2, 0 791; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 792; GFX1032-NEXT: s_or_saveexec_b32 s4, -1 793; GFX1032-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 794; GFX1032-NEXT: s_mov_b32 s2, -1 795; GFX1032-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 796; GFX1032-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 797; GFX1032-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 798; GFX1032-NEXT: v_mov_b32_e32 v3, v2 799; GFX1032-NEXT: v_permlanex16_b32 v3, v3, -1, -1 800; GFX1032-NEXT: v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 801; GFX1032-NEXT: v_readlane_b32 s3, v2, 31 802; GFX1032-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 
row_mask:0xf bank_mask:0xf 803; GFX1032-NEXT: v_readlane_b32 s5, v2, 15 804; GFX1032-NEXT: v_writelane_b32 v1, s5, 16 805; GFX1032-NEXT: s_mov_b32 exec_lo, s4 806; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 807; GFX1032-NEXT: ; implicit-def: $vgpr0 808; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo 809; GFX1032-NEXT: s_cbranch_execz BB3_2 810; GFX1032-NEXT: ; %bb.1: 811; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 812; GFX1032-NEXT: v_mov_b32_e32 v7, s3 813; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 814; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 815; GFX1032-NEXT: ds_add_rtn_u32 v0, v0, v7 816; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 817; GFX1032-NEXT: buffer_gl0_inv 818; GFX1032-NEXT: buffer_gl1_inv 819; GFX1032-NEXT: BB3_2: 820; GFX1032-NEXT: v_nop 821; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 822; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 823; GFX1032-NEXT: v_mov_b32_e32 v0, v1 824; GFX1032-NEXT: v_add_nc_u32_e32 v0, s3, v0 825; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 826; GFX1032-NEXT: s_nop 1 827; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 828; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 829; GFX1032-NEXT: s_endpgm 830entry: 831 %lane = call i32 @llvm.amdgcn.workitem.id.x() 832 %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %lane acq_rel 833 store i32 %old, i32 addrspace(1)* %out 834 ret void 835} 836 837define amdgpu_kernel void @add_i32_varying_gfx1064(i32 addrspace(1)* %out) { 838; 839; 840; GFX7LESS-LABEL: add_i32_varying_gfx1064: 841; GFX7LESS: ; %bb.0: ; %entry 842; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 843; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 844; GFX7LESS-NEXT: s_mov_b32 m0, -1 845; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 846; GFX7LESS-NEXT: ds_add_rtn_u32 v0, v1, v0 847; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 848; GFX7LESS-NEXT: buffer_wbinvl1 849; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 850; GFX7LESS-NEXT: s_mov_b32 s2, -1 851; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 852; 
GFX7LESS-NEXT: s_endpgm 853; 854; GFX8-LABEL: add_i32_varying_gfx1064: 855; GFX8: ; %bb.0: ; %entry 856; GFX8-NEXT: v_mov_b32_e32 v2, v0 857; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 858; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 859; GFX8-NEXT: v_mov_b32_e32 v1, 0 860; GFX8-NEXT: s_mov_b64 exec, s[2:3] 861; GFX8-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 862; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 863; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 864; GFX8-NEXT: s_not_b64 exec, exec 865; GFX8-NEXT: v_mov_b32_e32 v2, 0 866; GFX8-NEXT: s_not_b64 exec, exec 867; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 868; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 869; GFX8-NEXT: s_nop 1 870; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 871; GFX8-NEXT: s_nop 1 872; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 873; GFX8-NEXT: s_nop 1 874; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 875; GFX8-NEXT: s_nop 1 876; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 877; GFX8-NEXT: s_nop 1 878; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 879; GFX8-NEXT: v_readlane_b32 s2, v2, 63 880; GFX8-NEXT: s_nop 0 881; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 882; GFX8-NEXT: s_mov_b64 exec, s[4:5] 883; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 884; GFX8-NEXT: ; implicit-def: $vgpr0 885; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 886; GFX8-NEXT: s_cbranch_execz BB4_2 887; GFX8-NEXT: ; %bb.1: 888; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 889; GFX8-NEXT: v_mov_b32_e32 v3, s2 890; GFX8-NEXT: s_mov_b32 m0, -1 891; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 892; GFX8-NEXT: ds_add_rtn_u32 v0, v0, v3 893; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 894; GFX8-NEXT: buffer_wbinvl1_vol 895; GFX8-NEXT: BB4_2: 896; GFX8-NEXT: s_or_b64 exec, 
exec, s[4:5] 897; GFX8-NEXT: v_readfirstlane_b32 s2, v0 898; GFX8-NEXT: v_mov_b32_e32 v0, v1 899; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0 900; GFX8-NEXT: s_mov_b32 s3, 0xf000 901; GFX8-NEXT: s_mov_b32 s2, -1 902; GFX8-NEXT: s_nop 0 903; GFX8-NEXT: s_waitcnt lgkmcnt(0) 904; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 905; GFX8-NEXT: s_endpgm 906; 907; GFX9-LABEL: add_i32_varying_gfx1064: 908; GFX9: ; %bb.0: ; %entry 909; GFX9-NEXT: v_mov_b32_e32 v2, v0 910; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 911; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 912; GFX9-NEXT: v_mov_b32_e32 v1, 0 913; GFX9-NEXT: s_mov_b64 exec, s[2:3] 914; GFX9-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 915; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 916; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 917; GFX9-NEXT: s_not_b64 exec, exec 918; GFX9-NEXT: v_mov_b32_e32 v2, 0 919; GFX9-NEXT: s_not_b64 exec, exec 920; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 921; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 922; GFX9-NEXT: s_nop 1 923; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 924; GFX9-NEXT: s_nop 1 925; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 926; GFX9-NEXT: s_nop 1 927; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 928; GFX9-NEXT: s_nop 1 929; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 930; GFX9-NEXT: s_nop 1 931; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 932; GFX9-NEXT: v_readlane_b32 s2, v2, 63 933; GFX9-NEXT: s_nop 0 934; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 935; GFX9-NEXT: s_mov_b64 exec, s[4:5] 936; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 937; GFX9-NEXT: ; implicit-def: $vgpr0 938; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 939; GFX9-NEXT: s_cbranch_execz BB4_2 940; GFX9-NEXT: ; %bb.1: 941; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 
942; GFX9-NEXT: v_mov_b32_e32 v3, s2 943; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 944; GFX9-NEXT: ds_add_rtn_u32 v0, v0, v3 945; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 946; GFX9-NEXT: buffer_wbinvl1_vol 947; GFX9-NEXT: BB4_2: 948; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 949; GFX9-NEXT: v_readfirstlane_b32 s2, v0 950; GFX9-NEXT: v_mov_b32_e32 v0, v1 951; GFX9-NEXT: v_add_u32_e32 v0, s2, v0 952; GFX9-NEXT: s_mov_b32 s3, 0xf000 953; GFX9-NEXT: s_mov_b32 s2, -1 954; GFX9-NEXT: s_nop 0 955; GFX9-NEXT: s_waitcnt lgkmcnt(0) 956; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 957; GFX9-NEXT: s_endpgm 958; 959; GFX1064-LABEL: add_i32_varying_gfx1064: 960; GFX1064: ; %bb.0: ; %entry 961; GFX1064-NEXT: v_mov_b32_e32 v2, v0 962; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 963; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 964; GFX1064-NEXT: v_mov_b32_e32 v1, 0 965; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 966; GFX1064-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 967; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 968; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s3, v0 969; GFX1064-NEXT: s_not_b64 exec, exec 970; GFX1064-NEXT: v_mov_b32_e32 v2, 0 971; GFX1064-NEXT: s_not_b64 exec, exec 972; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 973; GFX1064-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 974; GFX1064-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 975; GFX1064-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 976; GFX1064-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 977; GFX1064-NEXT: v_mov_b32_e32 v3, v2 978; GFX1064-NEXT: v_permlanex16_b32 v3, v3, -1, -1 979; GFX1064-NEXT: v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 980; GFX1064-NEXT: v_readlane_b32 s2, v2, 31 981; GFX1064-NEXT: v_mov_b32_e32 v3, s2 982; GFX1064-NEXT: v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 983; 
GFX1064-NEXT: v_readlane_b32 s2, v2, 15 984; GFX1064-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf 985; GFX1064-NEXT: v_readlane_b32 s3, v2, 31 986; GFX1064-NEXT: v_readlane_b32 s6, v2, 47 987; GFX1064-NEXT: v_writelane_b32 v1, s2, 16 988; GFX1064-NEXT: s_mov_b32 s2, -1 989; GFX1064-NEXT: v_writelane_b32 v1, s3, 32 990; GFX1064-NEXT: v_readlane_b32 s3, v2, 63 991; GFX1064-NEXT: v_writelane_b32 v1, s6, 48 992; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 993; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 994; GFX1064-NEXT: ; implicit-def: $vgpr0 995; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 996; GFX1064-NEXT: s_cbranch_execz BB4_2 997; GFX1064-NEXT: ; %bb.1: 998; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 999; GFX1064-NEXT: v_mov_b32_e32 v7, s3 1000; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1001; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 1002; GFX1064-NEXT: ds_add_rtn_u32 v0, v0, v7 1003; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1004; GFX1064-NEXT: buffer_gl0_inv 1005; GFX1064-NEXT: buffer_gl1_inv 1006; GFX1064-NEXT: BB4_2: 1007; GFX1064-NEXT: v_nop 1008; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 1009; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 1010; GFX1064-NEXT: v_mov_b32_e32 v0, v1 1011; GFX1064-NEXT: v_add_nc_u32_e32 v0, s3, v0 1012; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 1013; GFX1064-NEXT: s_nop 1 1014; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1015; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 1016; GFX1064-NEXT: s_endpgm 1017; 1018; GFX1032-LABEL: add_i32_varying_gfx1064: 1019; GFX1032: ; %bb.0: ; %entry 1020; GFX1032-NEXT: ; implicit-def: $vcc_hi 1021; GFX1032-NEXT: v_mov_b32_e32 v2, v0 1022; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 1023; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1024; GFX1032-NEXT: v_mov_b32_e32 v1, 0 1025; GFX1032-NEXT: s_mov_b32 exec_lo, s2 1026; GFX1032-NEXT: v_cmp_ne_u32_e64 s2, 1, 0 1027; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 1028; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 1029; GFX1032-NEXT: 
v_mov_b32_e32 v2, 0 1030; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 1031; GFX1032-NEXT: s_or_saveexec_b32 s4, -1 1032; GFX1032-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 1033; GFX1032-NEXT: s_mov_b32 s2, -1 1034; GFX1032-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 1035; GFX1032-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 1036; GFX1032-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 1037; GFX1032-NEXT: v_mov_b32_e32 v3, v2 1038; GFX1032-NEXT: v_permlanex16_b32 v3, v3, -1, -1 1039; GFX1032-NEXT: v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 1040; GFX1032-NEXT: v_readlane_b32 s3, v2, 31 1041; GFX1032-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf 1042; GFX1032-NEXT: v_readlane_b32 s5, v2, 15 1043; GFX1032-NEXT: v_writelane_b32 v1, s5, 16 1044; GFX1032-NEXT: s_mov_b32 exec_lo, s4 1045; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 1046; GFX1032-NEXT: ; implicit-def: $vgpr0 1047; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo 1048; GFX1032-NEXT: s_cbranch_execz BB4_2 1049; GFX1032-NEXT: ; %bb.1: 1050; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 1051; GFX1032-NEXT: v_mov_b32_e32 v7, s3 1052; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1053; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 1054; GFX1032-NEXT: ds_add_rtn_u32 v0, v0, v7 1055; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1056; GFX1032-NEXT: buffer_gl0_inv 1057; GFX1032-NEXT: buffer_gl1_inv 1058; GFX1032-NEXT: BB4_2: 1059; GFX1032-NEXT: v_nop 1060; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 1061; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 1062; GFX1032-NEXT: v_mov_b32_e32 v0, v1 1063; GFX1032-NEXT: v_add_nc_u32_e32 v0, s3, v0 1064; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 1065; GFX1032-NEXT: s_nop 1 1066; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1067; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 1068; GFX1032-NEXT: 
s_endpgm 1069entry: 1070 %lane = call i32 @llvm.amdgcn.workitem.id.x() 1071 %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %lane acq_rel 1072 store i32 %old, i32 addrspace(1)* %out 1073 ret void 1074} 1075 1076define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) { 1077; 1078; 1079; GFX7LESS-LABEL: add_i64_constant: 1080; GFX7LESS: ; %bb.0: ; %entry 1081; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1082; GFX7LESS-NEXT: v_cmp_ne_u32_e64 s[4:5], 1, 0 1083; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 1084; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 1085; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1086; GFX7LESS-NEXT: ; implicit-def: $vgpr1_vgpr2 1087; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc 1088; GFX7LESS-NEXT: s_cbranch_execz BB5_2 1089; GFX7LESS-NEXT: ; %bb.1: 1090; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 1091; GFX7LESS-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 1092; GFX7LESS-NEXT: v_mul_hi_u32_u24_e64 v2, s4, 5 1093; GFX7LESS-NEXT: v_mul_u32_u24_e64 v1, s4, 5 1094; GFX7LESS-NEXT: s_mov_b32 m0, -1 1095; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1096; GFX7LESS-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] 1097; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1098; GFX7LESS-NEXT: buffer_wbinvl1 1099; GFX7LESS-NEXT: BB5_2: 1100; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] 1101; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v1 1102; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v2 1103; GFX7LESS-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v0 1104; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1105; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 1106; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4 1107; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s2, v0 1108; GFX7LESS-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc 1109; GFX7LESS-NEXT: s_mov_b32 s2, -1 1110; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1111; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1112; GFX7LESS-NEXT: s_endpgm 1113; 1114; GFX8-LABEL: add_i64_constant: 1115; GFX8: ; %bb.0: ; %entry 1116; GFX8-NEXT: 
s_load_dwordx2 s[0:1], s[0:1], 0x24 1117; GFX8-NEXT: v_cmp_ne_u32_e64 s[4:5], 1, 0 1118; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 1119; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 1120; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1121; GFX8-NEXT: ; implicit-def: $vgpr1_vgpr2 1122; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 1123; GFX8-NEXT: s_cbranch_execz BB5_2 1124; GFX8-NEXT: ; %bb.1: 1125; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 1126; GFX8-NEXT: v_mul_hi_u32_u24_e64 v2, s4, 5 1127; GFX8-NEXT: v_mul_u32_u24_e64 v1, s4, 5 1128; GFX8-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 1129; GFX8-NEXT: s_mov_b32 m0, -1 1130; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1131; GFX8-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] 1132; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1133; GFX8-NEXT: buffer_wbinvl1_vol 1134; GFX8-NEXT: BB5_2: 1135; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 1136; GFX8-NEXT: v_readfirstlane_b32 s2, v1 1137; GFX8-NEXT: v_readfirstlane_b32 s3, v2 1138; GFX8-NEXT: v_mov_b32_e32 v1, s2 1139; GFX8-NEXT: v_mov_b32_e32 v2, s3 1140; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v0, 5, v[1:2] 1141; GFX8-NEXT: s_mov_b32 s3, 0xf000 1142; GFX8-NEXT: s_mov_b32 s2, -1 1143; GFX8-NEXT: s_nop 2 1144; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1145; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1146; GFX8-NEXT: s_endpgm 1147; 1148; GFX9-LABEL: add_i64_constant: 1149; GFX9: ; %bb.0: ; %entry 1150; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1151; GFX9-NEXT: v_cmp_ne_u32_e64 s[4:5], 1, 0 1152; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 1153; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 1154; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1155; GFX9-NEXT: ; implicit-def: $vgpr1_vgpr2 1156; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 1157; GFX9-NEXT: s_cbranch_execz BB5_2 1158; GFX9-NEXT: ; %bb.1: 1159; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 1160; GFX9-NEXT: v_mul_hi_u32_u24_e64 v2, s4, 5 1161; GFX9-NEXT: v_mul_u32_u24_e64 v1, s4, 5 1162; GFX9-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 1163; GFX9-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) 1164; GFX9-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] 1165; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1166; GFX9-NEXT: buffer_wbinvl1_vol 1167; GFX9-NEXT: BB5_2: 1168; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 1169; GFX9-NEXT: v_readfirstlane_b32 s2, v1 1170; GFX9-NEXT: v_readfirstlane_b32 s3, v2 1171; GFX9-NEXT: v_mov_b32_e32 v1, s2 1172; GFX9-NEXT: v_mov_b32_e32 v2, s3 1173; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v0, 5, v[1:2] 1174; GFX9-NEXT: s_mov_b32 s3, 0xf000 1175; GFX9-NEXT: s_mov_b32 s2, -1 1176; GFX9-NEXT: s_nop 2 1177; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1178; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1179; GFX9-NEXT: s_endpgm 1180; 1181; GFX1064-LABEL: add_i64_constant: 1182; GFX1064: ; %bb.0: ; %entry 1183; GFX1064-NEXT: v_cmp_ne_u32_e64 s[4:5], 1, 0 1184; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1185; GFX1064-NEXT: ; implicit-def: $vgpr1_vgpr2 1186; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 1187; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s5, v0 1188; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1189; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc 1190; GFX1064-NEXT: s_cbranch_execz BB5_2 1191; GFX1064-NEXT: ; %bb.1: 1192; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 1193; GFX1064-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 1194; GFX1064-NEXT: v_mul_hi_u32_u24_e64 v2, s4, 5 1195; GFX1064-NEXT: v_mul_u32_u24_e64 v1, s4, 5 1196; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1197; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 1198; GFX1064-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] 1199; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1200; GFX1064-NEXT: buffer_gl0_inv 1201; GFX1064-NEXT: buffer_gl1_inv 1202; GFX1064-NEXT: BB5_2: 1203; GFX1064-NEXT: v_nop 1204; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] 1205; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 1206; GFX1064-NEXT: v_readfirstlane_b32 s3, v2 1207; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v0, 5, s[2:3] 1208; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 1209; GFX1064-NEXT: s_mov_b32 s2, -1 
1210; GFX1064-NEXT: s_nop 2 1211; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1212; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1213; GFX1064-NEXT: s_endpgm 1214; 1215; GFX1032-LABEL: add_i64_constant: 1216; GFX1032: ; %bb.0: ; %entry 1217; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1218; GFX1032-NEXT: v_cmp_ne_u32_e64 s3, 1, 0 1219; GFX1032-NEXT: ; implicit-def: $vcc_hi 1220; GFX1032-NEXT: ; implicit-def: $vgpr1_vgpr2 1221; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s3, 0 1222; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 1223; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo 1224; GFX1032-NEXT: s_cbranch_execz BB5_2 1225; GFX1032-NEXT: ; %bb.1: 1226; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 1227; GFX1032-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 1228; GFX1032-NEXT: v_mul_hi_u32_u24_e64 v2, s3, 5 1229; GFX1032-NEXT: v_mul_u32_u24_e64 v1, s3, 5 1230; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1231; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 1232; GFX1032-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] 1233; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1234; GFX1032-NEXT: buffer_gl0_inv 1235; GFX1032-NEXT: buffer_gl1_inv 1236; GFX1032-NEXT: BB5_2: 1237; GFX1032-NEXT: v_nop 1238; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 1239; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 1240; GFX1032-NEXT: v_readfirstlane_b32 s3, v2 1241; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s2, v0, 5, s[2:3] 1242; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 1243; GFX1032-NEXT: s_mov_b32 s2, -1 1244; GFX1032-NEXT: s_nop 2 1245; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1246; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1247; GFX1032-NEXT: s_endpgm 1248entry: 1249 %old = atomicrmw add i64 addrspace(3)* @local_var64, i64 5 acq_rel 1250 store i64 %old, i64 addrspace(1)* %out 1251 ret void 1252} 1253 1254define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 %additive) { 1255; 1256; 1257; GFX7LESS-LABEL: add_i64_uniform: 1258; GFX7LESS: ; %bb.0: ; %entry 1259; GFX7LESS-NEXT: s_load_dwordx4 
s[0:3], s[0:1], 0x9 1260; GFX7LESS-NEXT: v_cmp_ne_u32_e64 s[6:7], 1, 0 1261; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 1262; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s7, v0 1263; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1264; GFX7LESS-NEXT: ; implicit-def: $vgpr1_vgpr2 1265; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc 1266; GFX7LESS-NEXT: s_cbranch_execz BB6_2 1267; GFX7LESS-NEXT: ; %bb.1: 1268; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 1269; GFX7LESS-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 1270; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1271; GFX7LESS-NEXT: s_mul_i32 s7, s3, s6 1272; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6 1273; GFX7LESS-NEXT: v_mul_hi_u32 v1, s2, v1 1274; GFX7LESS-NEXT: s_mul_i32 s6, s2, s6 1275; GFX7LESS-NEXT: v_add_i32_e32 v2, vcc, s7, v1 1276; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6 1277; GFX7LESS-NEXT: s_mov_b32 m0, -1 1278; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1279; GFX7LESS-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] 1280; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1281; GFX7LESS-NEXT: buffer_wbinvl1 1282; GFX7LESS-NEXT: BB6_2: 1283; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] 1284; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 1285; GFX7LESS-NEXT: s_mov_b32 s6, -1 1286; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1287; GFX7LESS-NEXT: s_mov_b32 s4, s0 1288; GFX7LESS-NEXT: s_mov_b32 s5, s1 1289; GFX7LESS-NEXT: v_readfirstlane_b32 s0, v1 1290; GFX7LESS-NEXT: v_readfirstlane_b32 s1, v2 1291; GFX7LESS-NEXT: v_mul_lo_u32 v1, s3, v0 1292; GFX7LESS-NEXT: v_mul_hi_u32 v2, s2, v0 1293; GFX7LESS-NEXT: v_mul_lo_u32 v0, s2, v0 1294; GFX7LESS-NEXT: v_add_i32_e32 v1, vcc, v2, v1 1295; GFX7LESS-NEXT: v_mov_b32_e32 v2, s1 1296; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s0, v0 1297; GFX7LESS-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc 1298; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1299; GFX7LESS-NEXT: s_endpgm 1300; 1301; GFX8-LABEL: add_i64_uniform: 1302; GFX8: ; %bb.0: ; %entry 1303; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1304; GFX8-NEXT: 
v_cmp_ne_u32_e64 s[6:7], 1, 0 1305; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 1306; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 1307; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1308; GFX8-NEXT: ; implicit-def: $vgpr1_vgpr2 1309; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 1310; GFX8-NEXT: s_cbranch_execz BB6_2 1311; GFX8-NEXT: ; %bb.1: 1312; GFX8-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 1313; GFX8-NEXT: v_mov_b32_e32 v1, s6 1314; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1315; GFX8-NEXT: v_mul_hi_u32 v1, s2, v1 1316; GFX8-NEXT: s_mul_i32 s7, s3, s6 1317; GFX8-NEXT: s_mul_i32 s6, s2, s6 1318; GFX8-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 1319; GFX8-NEXT: v_add_u32_e32 v2, vcc, s7, v1 1320; GFX8-NEXT: v_mov_b32_e32 v1, s6 1321; GFX8-NEXT: s_mov_b32 m0, -1 1322; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1323; GFX8-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] 1324; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1325; GFX8-NEXT: buffer_wbinvl1_vol 1326; GFX8-NEXT: BB6_2: 1327; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 1328; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1329; GFX8-NEXT: s_mov_b32 s4, s0 1330; GFX8-NEXT: v_readfirstlane_b32 s0, v1 1331; GFX8-NEXT: v_mul_lo_u32 v1, s3, v0 1332; GFX8-NEXT: v_mul_hi_u32 v3, s2, v0 1333; GFX8-NEXT: v_mul_lo_u32 v0, s2, v0 1334; GFX8-NEXT: s_mov_b32 s5, s1 1335; GFX8-NEXT: v_readfirstlane_b32 s1, v2 1336; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 1337; GFX8-NEXT: v_mov_b32_e32 v2, s1 1338; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 1339; GFX8-NEXT: s_mov_b32 s7, 0xf000 1340; GFX8-NEXT: s_mov_b32 s6, -1 1341; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc 1342; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1343; GFX8-NEXT: s_endpgm 1344; 1345; GFX9-LABEL: add_i64_uniform: 1346; GFX9: ; %bb.0: ; %entry 1347; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1348; GFX9-NEXT: v_cmp_ne_u32_e64 s[6:7], 1, 0 1349; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 1350; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 1351; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1352; GFX9-NEXT: ; implicit-def: 
$vgpr1_vgpr2 1353; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 1354; GFX9-NEXT: s_cbranch_execz BB6_2 1355; GFX9-NEXT: ; %bb.1: 1356; GFX9-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 1357; GFX9-NEXT: v_mov_b32_e32 v1, s6 1358; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1359; GFX9-NEXT: v_mul_hi_u32 v2, s2, v1 1360; GFX9-NEXT: s_mul_i32 s7, s3, s6 1361; GFX9-NEXT: s_mul_i32 s6, s2, s6 1362; GFX9-NEXT: v_mov_b32_e32 v1, s6 1363; GFX9-NEXT: v_add_u32_e32 v2, s7, v2 1364; GFX9-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 1365; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1366; GFX9-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] 1367; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1368; GFX9-NEXT: buffer_wbinvl1_vol 1369; GFX9-NEXT: BB6_2: 1370; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 1371; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1372; GFX9-NEXT: v_mul_lo_u32 v3, s3, v0 1373; GFX9-NEXT: v_mul_hi_u32 v4, s2, v0 1374; GFX9-NEXT: v_mul_lo_u32 v0, s2, v0 1375; GFX9-NEXT: s_mov_b32 s4, s0 1376; GFX9-NEXT: v_readfirstlane_b32 s0, v1 1377; GFX9-NEXT: s_mov_b32 s5, s1 1378; GFX9-NEXT: v_readfirstlane_b32 s1, v2 1379; GFX9-NEXT: v_add_u32_e32 v1, v4, v3 1380; GFX9-NEXT: v_mov_b32_e32 v2, s1 1381; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 1382; GFX9-NEXT: s_mov_b32 s7, 0xf000 1383; GFX9-NEXT: s_mov_b32 s6, -1 1384; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc 1385; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1386; GFX9-NEXT: s_endpgm 1387; 1388; GFX1064-LABEL: add_i64_uniform: 1389; GFX1064: ; %bb.0: ; %entry 1390; GFX1064-NEXT: v_cmp_ne_u32_e64 s[6:7], 1, 0 1391; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1392; GFX1064-NEXT: ; implicit-def: $vgpr1_vgpr2 1393; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 1394; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s7, v0 1395; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1396; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 1397; GFX1064-NEXT: s_cbranch_execz BB6_2 1398; GFX1064-NEXT: ; %bb.1: 1399; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 1400; GFX1064-NEXT: 
v_mov_b32_e32 v3, local_var64@abs32@lo 1401; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1402; GFX1064-NEXT: v_mul_hi_u32 v2, s2, s6 1403; GFX1064-NEXT: s_mul_i32 s7, s2, s6 1404; GFX1064-NEXT: s_mul_i32 s6, s3, s6 1405; GFX1064-NEXT: v_mov_b32_e32 v1, s7 1406; GFX1064-NEXT: v_add_nc_u32_e32 v2, s6, v2 1407; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1408; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 1409; GFX1064-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] 1410; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1411; GFX1064-NEXT: buffer_gl0_inv 1412; GFX1064-NEXT: buffer_gl1_inv 1413; GFX1064-NEXT: BB6_2: 1414; GFX1064-NEXT: v_nop 1415; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 1416; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1417; GFX1064-NEXT: v_mul_lo_u32 v3, s3, v0 1418; GFX1064-NEXT: v_mul_hi_u32 v4, s2, v0 1419; GFX1064-NEXT: v_mul_lo_u32 v0, s2, v0 1420; GFX1064-NEXT: v_readfirstlane_b32 s4, v1 1421; GFX1064-NEXT: v_readfirstlane_b32 s5, v2 1422; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 1423; GFX1064-NEXT: s_mov_b32 s2, -1 1424; GFX1064-NEXT: v_add_nc_u32_e32 v1, v4, v3 1425; GFX1064-NEXT: v_add_co_u32_e64 v0, vcc, s4, v0 1426; GFX1064-NEXT: v_add_co_ci_u32_e32 v1, vcc, s5, v1, vcc 1427; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1428; GFX1064-NEXT: s_endpgm 1429; 1430; GFX1032-LABEL: add_i64_uniform: 1431; GFX1032: ; %bb.0: ; %entry 1432; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1433; GFX1032-NEXT: v_cmp_ne_u32_e64 s5, 1, 0 1434; GFX1032-NEXT: ; implicit-def: $vcc_hi 1435; GFX1032-NEXT: ; implicit-def: $vgpr1_vgpr2 1436; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s5, 0 1437; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 1438; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo 1439; GFX1032-NEXT: s_cbranch_execz BB6_2 1440; GFX1032-NEXT: ; %bb.1: 1441; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s5 1442; GFX1032-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 1443; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1444; GFX1032-NEXT: v_mul_hi_u32 v2, s2, s5 1445; GFX1032-NEXT: s_mul_i32 s6, s2, s5 
1446; GFX1032-NEXT: s_mul_i32 s5, s3, s5 1447; GFX1032-NEXT: v_mov_b32_e32 v1, s6 1448; GFX1032-NEXT: v_add_nc_u32_e32 v2, s5, v2 1449; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1450; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 1451; GFX1032-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] 1452; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1453; GFX1032-NEXT: buffer_gl0_inv 1454; GFX1032-NEXT: buffer_gl1_inv 1455; GFX1032-NEXT: BB6_2: 1456; GFX1032-NEXT: v_nop 1457; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 1458; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1459; GFX1032-NEXT: v_mul_lo_u32 v3, s3, v0 1460; GFX1032-NEXT: v_mul_hi_u32 v4, s2, v0 1461; GFX1032-NEXT: v_mul_lo_u32 v0, s2, v0 1462; GFX1032-NEXT: v_readfirstlane_b32 s4, v1 1463; GFX1032-NEXT: v_readfirstlane_b32 s5, v2 1464; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 1465; GFX1032-NEXT: s_mov_b32 s2, -1 1466; GFX1032-NEXT: v_add_nc_u32_e32 v1, v4, v3 1467; GFX1032-NEXT: v_add_co_u32_e64 v0, vcc_lo, s4, v0 1468; GFX1032-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s5, v1, vcc_lo 1469; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1470; GFX1032-NEXT: s_endpgm 1471entry: 1472 %old = atomicrmw add i64 addrspace(3)* @local_var64, i64 %additive acq_rel 1473 store i64 %old, i64 addrspace(1)* %out 1474 ret void 1475} 1476; FIXME(review): no RUN line enables the GCN check prefix, so the three GCN-NOT directives below are never evaluated by FileCheck; the per-target -NEXT chains in add_i64_varying are what actually pin the absence of v_mbcnt and s_bcnt1 for the unoptimized 64-bit varying case. 1477; GCN-NOT: v_mbcnt_lo_u32_b32 1478; GCN-NOT: v_mbcnt_hi_u32_b32 1479; GCN-NOT: s_bcnt1_i32_b64 1480define amdgpu_kernel void @add_i64_varying(i64 addrspace(1)* %out) { 1481; 1482; 1483; GFX7LESS-LABEL: add_i64_varying: 1484; GFX7LESS: ; %bb.0: ; %entry 1485; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1486; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 1487; GFX7LESS-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 1488; GFX7LESS-NEXT: s_mov_b32 m0, -1 1489; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1490; GFX7LESS-NEXT: ds_add_rtn_u64 v[0:1], v2, v[0:1] 1491; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1492; GFX7LESS-NEXT: buffer_wbinvl1 1493; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 1494; GFX7LESS-NEXT: s_mov_b32 s2, 
-1 1495; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1496; GFX7LESS-NEXT: s_endpgm 1497; 1498; GFX8-LABEL: add_i64_varying: 1499; GFX8: ; %bb.0: ; %entry 1500; GFX8-NEXT: v_mov_b32_e32 v1, 0 1501; GFX8-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 1502; GFX8-NEXT: s_mov_b32 m0, -1 1503; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1504; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1505; GFX8-NEXT: ds_add_rtn_u64 v[0:1], v2, v[0:1] 1506; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1507; GFX8-NEXT: buffer_wbinvl1_vol 1508; GFX8-NEXT: s_mov_b32 s3, 0xf000 1509; GFX8-NEXT: s_mov_b32 s2, -1 1510; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1511; GFX8-NEXT: s_endpgm 1512; 1513; GFX9-LABEL: add_i64_varying: 1514; GFX9: ; %bb.0: ; %entry 1515; GFX9-NEXT: v_mov_b32_e32 v1, 0 1516; GFX9-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 1517; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1518; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1519; GFX9-NEXT: ds_add_rtn_u64 v[0:1], v2, v[0:1] 1520; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1521; GFX9-NEXT: buffer_wbinvl1_vol 1522; GFX9-NEXT: s_mov_b32 s3, 0xf000 1523; GFX9-NEXT: s_mov_b32 s2, -1 1524; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1525; GFX9-NEXT: s_endpgm 1526; 1527; GFX1064-LABEL: add_i64_varying: 1528; GFX1064: ; %bb.0: ; %entry 1529; GFX1064-NEXT: v_mov_b32_e32 v1, 0 1530; GFX1064-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 1531; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1532; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 1533; GFX1064-NEXT: s_mov_b32 s2, -1 1534; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1535; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 1536; GFX1064-NEXT: ds_add_rtn_u64 v[0:1], v2, v[0:1] 1537; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1538; GFX1064-NEXT: buffer_gl0_inv 1539; GFX1064-NEXT: buffer_gl1_inv 1540; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1541; GFX1064-NEXT: s_endpgm 1542; 1543; GFX1032-LABEL: add_i64_varying: 1544; GFX1032: ; %bb.0: ; %entry 
1545; GFX1032-NEXT: v_mov_b32_e32 v1, 0 1546; GFX1032-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 1547; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1548; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 1549; GFX1032-NEXT: s_mov_b32 s2, -1 1550; GFX1032-NEXT: ; implicit-def: $vcc_hi 1551; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1552; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 1553; GFX1032-NEXT: ds_add_rtn_u64 v[0:1], v2, v[0:1] 1554; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1555; GFX1032-NEXT: buffer_gl0_inv 1556; GFX1032-NEXT: buffer_gl1_inv 1557; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1558; GFX1032-NEXT: s_endpgm 1559entry: 1560 %lane = call i32 @llvm.amdgcn.workitem.id.x() 1561 %zext = zext i32 %lane to i64 1562 %old = atomicrmw add i64 addrspace(3)* @local_var64, i64 %zext acq_rel 1563 store i64 %old, i64 addrspace(1)* %out 1564 ret void 1565} 1566 1567define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out) { 1568; 1569; 1570; GFX7LESS-LABEL: sub_i32_constant: 1571; GFX7LESS: ; %bb.0: ; %entry 1572; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1573; GFX7LESS-NEXT: v_cmp_ne_u32_e64 s[4:5], 1, 0 1574; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 1575; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 1576; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1577; GFX7LESS-NEXT: ; implicit-def: $vgpr1 1578; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc 1579; GFX7LESS-NEXT: s_cbranch_execz BB8_2 1580; GFX7LESS-NEXT: ; %bb.1: 1581; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 1582; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 1583; GFX7LESS-NEXT: v_mul_u32_u24_e64 v2, s4, 5 1584; GFX7LESS-NEXT: s_mov_b32 m0, -1 1585; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1586; GFX7LESS-NEXT: ds_sub_rtn_u32 v1, v1, v2 1587; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1588; GFX7LESS-NEXT: buffer_wbinvl1 1589; GFX7LESS-NEXT: BB8_2: 1590; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] 1591; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v1 1592; 
GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1593; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 1594; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 1595; GFX7LESS-NEXT: s_mov_b32 s2, -1 1596; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1597; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 1598; GFX7LESS-NEXT: s_endpgm 1599; 1600; GFX8-LABEL: sub_i32_constant: 1601; GFX8: ; %bb.0: ; %entry 1602; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1603; GFX8-NEXT: v_cmp_ne_u32_e64 s[4:5], 1, 0 1604; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 1605; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 1606; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1607; GFX8-NEXT: ; implicit-def: $vgpr1 1608; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 1609; GFX8-NEXT: s_cbranch_execz BB8_2 1610; GFX8-NEXT: ; %bb.1: 1611; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 1612; GFX8-NEXT: v_mul_u32_u24_e64 v1, s4, 5 1613; GFX8-NEXT: v_mov_b32_e32 v2, local_var32@abs32@lo 1614; GFX8-NEXT: s_mov_b32 m0, -1 1615; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1616; GFX8-NEXT: ds_sub_rtn_u32 v1, v2, v1 1617; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1618; GFX8-NEXT: buffer_wbinvl1_vol 1619; GFX8-NEXT: BB8_2: 1620; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 1621; GFX8-NEXT: v_readfirstlane_b32 s2, v1 1622; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1623; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s2, v0 1624; GFX8-NEXT: s_mov_b32 s3, 0xf000 1625; GFX8-NEXT: s_mov_b32 s2, -1 1626; GFX8-NEXT: s_nop 0 1627; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1628; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 1629; GFX8-NEXT: s_endpgm 1630; 1631; GFX9-LABEL: sub_i32_constant: 1632; GFX9: ; %bb.0: ; %entry 1633; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1634; GFX9-NEXT: v_cmp_ne_u32_e64 s[4:5], 1, 0 1635; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 1636; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 1637; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1638; GFX9-NEXT: ; implicit-def: $vgpr1 1639; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 1640; GFX9-NEXT: s_cbranch_execz BB8_2 1641; GFX9-NEXT: ; 
%bb.1: 1642; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 1643; GFX9-NEXT: v_mul_u32_u24_e64 v1, s4, 5 1644; GFX9-NEXT: v_mov_b32_e32 v2, local_var32@abs32@lo 1645; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1646; GFX9-NEXT: ds_sub_rtn_u32 v1, v2, v1 1647; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1648; GFX9-NEXT: buffer_wbinvl1_vol 1649; GFX9-NEXT: BB8_2: 1650; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 1651; GFX9-NEXT: v_readfirstlane_b32 s2, v1 1652; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1653; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 1654; GFX9-NEXT: s_mov_b32 s3, 0xf000 1655; GFX9-NEXT: s_mov_b32 s2, -1 1656; GFX9-NEXT: s_nop 0 1657; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1658; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 1659; GFX9-NEXT: s_endpgm 1660; 1661; GFX1064-LABEL: sub_i32_constant: 1662; GFX1064: ; %bb.0: ; %entry 1663; GFX1064-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 1664; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1665; GFX1064-NEXT: ; implicit-def: $vgpr1 1666; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 1667; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s3, v0 1668; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1669; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 1670; GFX1064-NEXT: s_cbranch_execz BB8_2 1671; GFX1064-NEXT: ; %bb.1: 1672; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1673; GFX1064-NEXT: v_mov_b32_e32 v2, local_var32@abs32@lo 1674; GFX1064-NEXT: v_mul_u32_u24_e64 v1, s2, 5 1675; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1676; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 1677; GFX1064-NEXT: ds_sub_rtn_u32 v1, v2, v1 1678; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1679; GFX1064-NEXT: buffer_gl0_inv 1680; GFX1064-NEXT: buffer_gl1_inv 1681; GFX1064-NEXT: BB8_2: 1682; GFX1064-NEXT: v_nop 1683; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 1684; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 1685; GFX1064-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1686; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 1687; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s2, v0 1688; GFX1064-NEXT: s_mov_b32 s2, -1 1689; 
GFX1064-NEXT: s_nop 0 1690; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1691; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 1692; GFX1064-NEXT: s_endpgm 1693; 1694; GFX1032-LABEL: sub_i32_constant: 1695; GFX1032: ; %bb.0: ; %entry 1696; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1697; GFX1032-NEXT: v_cmp_ne_u32_e64 s3, 1, 0 1698; GFX1032-NEXT: ; implicit-def: $vcc_hi 1699; GFX1032-NEXT: ; implicit-def: $vgpr1 1700; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s3, 0 1701; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 1702; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo 1703; GFX1032-NEXT: s_cbranch_execz BB8_2 1704; GFX1032-NEXT: ; %bb.1: 1705; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 1706; GFX1032-NEXT: v_mov_b32_e32 v2, local_var32@abs32@lo 1707; GFX1032-NEXT: v_mul_u32_u24_e64 v1, s3, 5 1708; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1709; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 1710; GFX1032-NEXT: ds_sub_rtn_u32 v1, v2, v1 1711; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1712; GFX1032-NEXT: buffer_gl0_inv 1713; GFX1032-NEXT: buffer_gl1_inv 1714; GFX1032-NEXT: BB8_2: 1715; GFX1032-NEXT: v_nop 1716; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 1717; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 1718; GFX1032-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1719; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 1720; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s2, v0 1721; GFX1032-NEXT: s_mov_b32 s2, -1 1722; GFX1032-NEXT: s_nop 0 1723; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1724; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 1725; GFX1032-NEXT: s_endpgm 1726entry: 1727 %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 5 acq_rel 1728 store i32 %old, i32 addrspace(1)* %out 1729 ret void 1730} 1731 1732define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 %subitive) { 1733; 1734; 1735; GFX7LESS-LABEL: sub_i32_uniform: 1736; GFX7LESS: ; %bb.0: ; %entry 1737; GFX7LESS-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 1738; GFX7LESS-NEXT: s_load_dword s2, s[0:1], 0xb 1739; GFX7LESS-NEXT: 
v_cmp_ne_u32_e64 s[6:7], 1, 0 1740; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 1741; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s7, v0 1742; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1743; GFX7LESS-NEXT: ; implicit-def: $vgpr1 1744; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc 1745; GFX7LESS-NEXT: s_cbranch_execz BB9_2 1746; GFX7LESS-NEXT: ; %bb.1: 1747; GFX7LESS-NEXT: s_bcnt1_i32_b64 s3, s[6:7] 1748; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1749; GFX7LESS-NEXT: s_mul_i32 s3, s2, s3 1750; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 1751; GFX7LESS-NEXT: v_mov_b32_e32 v2, s3 1752; GFX7LESS-NEXT: s_mov_b32 m0, -1 1753; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1754; GFX7LESS-NEXT: ds_sub_rtn_u32 v1, v1, v2 1755; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1756; GFX7LESS-NEXT: buffer_wbinvl1 1757; GFX7LESS-NEXT: BB9_2: 1758; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1] 1759; GFX7LESS-NEXT: v_readfirstlane_b32 s0, v1 1760; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1761; GFX7LESS-NEXT: v_mul_lo_u32 v0, s2, v0 1762; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 1763; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 1764; GFX7LESS-NEXT: s_mov_b32 s6, -1 1765; GFX7LESS-NEXT: buffer_store_dword v0, off, s[4:7], 0 1766; GFX7LESS-NEXT: s_endpgm 1767; 1768; GFX8-LABEL: sub_i32_uniform: 1769; GFX8: ; %bb.0: ; %entry 1770; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 1771; GFX8-NEXT: s_load_dword s0, s[0:1], 0x2c 1772; GFX8-NEXT: v_cmp_ne_u32_e64 s[6:7], 1, 0 1773; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 1774; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 1775; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1776; GFX8-NEXT: ; implicit-def: $vgpr1 1777; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 1778; GFX8-NEXT: s_cbranch_execz BB9_2 1779; GFX8-NEXT: ; %bb.1: 1780; GFX8-NEXT: s_bcnt1_i32_b64 s1, s[6:7] 1781; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1782; GFX8-NEXT: s_mul_i32 s1, s0, s1 1783; GFX8-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 1784; GFX8-NEXT: v_mov_b32_e32 v2, s1 1785; GFX8-NEXT: s_mov_b32 m0, 
-1 1786; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1787; GFX8-NEXT: ds_sub_rtn_u32 v1, v1, v2 1788; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1789; GFX8-NEXT: buffer_wbinvl1_vol 1790; GFX8-NEXT: BB9_2: 1791; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 1792; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1793; GFX8-NEXT: v_mul_lo_u32 v0, s0, v0 1794; GFX8-NEXT: v_readfirstlane_b32 s0, v1 1795; GFX8-NEXT: s_mov_b32 s7, 0xf000 1796; GFX8-NEXT: s_mov_b32 s6, -1 1797; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s0, v0 1798; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0 1799; GFX8-NEXT: s_endpgm 1800; 1801; GFX9-LABEL: sub_i32_uniform: 1802; GFX9: ; %bb.0: ; %entry 1803; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 1804; GFX9-NEXT: s_load_dword s0, s[0:1], 0x2c 1805; GFX9-NEXT: v_cmp_ne_u32_e64 s[6:7], 1, 0 1806; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 1807; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 1808; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1809; GFX9-NEXT: ; implicit-def: $vgpr1 1810; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 1811; GFX9-NEXT: s_cbranch_execz BB9_2 1812; GFX9-NEXT: ; %bb.1: 1813; GFX9-NEXT: s_bcnt1_i32_b64 s1, s[6:7] 1814; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1815; GFX9-NEXT: s_mul_i32 s1, s0, s1 1816; GFX9-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 1817; GFX9-NEXT: v_mov_b32_e32 v2, s1 1818; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1819; GFX9-NEXT: ds_sub_rtn_u32 v1, v1, v2 1820; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1821; GFX9-NEXT: buffer_wbinvl1_vol 1822; GFX9-NEXT: BB9_2: 1823; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 1824; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1825; GFX9-NEXT: v_mul_lo_u32 v0, s0, v0 1826; GFX9-NEXT: v_readfirstlane_b32 s0, v1 1827; GFX9-NEXT: s_mov_b32 s7, 0xf000 1828; GFX9-NEXT: s_mov_b32 s6, -1 1829; GFX9-NEXT: v_sub_u32_e32 v0, s0, v0 1830; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 1831; GFX9-NEXT: s_endpgm 1832; 1833; GFX1064-LABEL: sub_i32_uniform: 1834; GFX1064: ; %bb.0: ; %entry 1835; GFX1064-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 1836; GFX1064-NEXT: 
s_load_dwordx2 s[4:5], s[0:1], 0x24 1837; GFX1064-NEXT: s_load_dword s0, s[0:1], 0x2c 1838; GFX1064-NEXT: ; implicit-def: $vgpr1 1839; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 1840; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s3, v0 1841; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1842; GFX1064-NEXT: s_and_saveexec_b64 s[6:7], vcc 1843; GFX1064-NEXT: s_cbranch_execz BB9_2 1844; GFX1064-NEXT: ; %bb.1: 1845; GFX1064-NEXT: s_bcnt1_i32_b64 s1, s[2:3] 1846; GFX1064-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 1847; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1848; GFX1064-NEXT: s_mul_i32 s1, s0, s1 1849; GFX1064-NEXT: v_mov_b32_e32 v2, s1 1850; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1851; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 1852; GFX1064-NEXT: ds_sub_rtn_u32 v1, v1, v2 1853; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1854; GFX1064-NEXT: buffer_gl0_inv 1855; GFX1064-NEXT: buffer_gl1_inv 1856; GFX1064-NEXT: BB9_2: 1857; GFX1064-NEXT: v_nop 1858; GFX1064-NEXT: s_or_b64 exec, exec, s[6:7] 1859; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1860; GFX1064-NEXT: v_mul_lo_u32 v0, s0, v0 1861; GFX1064-NEXT: v_readfirstlane_b32 s0, v1 1862; GFX1064-NEXT: s_mov_b32 s7, 0x31016000 1863; GFX1064-NEXT: s_mov_b32 s6, -1 1864; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s0, v0 1865; GFX1064-NEXT: buffer_store_dword v0, off, s[4:7], 0 1866; GFX1064-NEXT: s_endpgm 1867; 1868; GFX1032-LABEL: sub_i32_uniform: 1869; GFX1032: ; %bb.0: ; %entry 1870; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 1871; GFX1032-NEXT: s_load_dword s0, s[0:1], 0x2c 1872; GFX1032-NEXT: v_cmp_ne_u32_e64 s2, 1, 0 1873; GFX1032-NEXT: ; implicit-def: $vcc_hi 1874; GFX1032-NEXT: ; implicit-def: $vgpr1 1875; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 1876; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 1877; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo 1878; GFX1032-NEXT: s_cbranch_execz BB9_2 1879; GFX1032-NEXT: ; %bb.1: 1880; GFX1032-NEXT: s_bcnt1_i32_b32 s2, s2 1881; GFX1032-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 1882; 
GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1883; GFX1032-NEXT: s_mul_i32 s2, s0, s2 1884; GFX1032-NEXT: v_mov_b32_e32 v2, s2 1885; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1886; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 1887; GFX1032-NEXT: ds_sub_rtn_u32 v1, v1, v2 1888; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1889; GFX1032-NEXT: buffer_gl0_inv 1890; GFX1032-NEXT: buffer_gl1_inv 1891; GFX1032-NEXT: BB9_2: 1892; GFX1032-NEXT: v_nop 1893; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s1 1894; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1895; GFX1032-NEXT: v_mul_lo_u32 v0, s0, v0 1896; GFX1032-NEXT: v_readfirstlane_b32 s0, v1 1897; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 1898; GFX1032-NEXT: s_mov_b32 s6, -1 1899; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s0, v0 1900; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0 1901; GFX1032-NEXT: s_endpgm 1902entry: 1903 %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 %subitive acq_rel 1904 store i32 %old, i32 addrspace(1)* %out 1905 ret void 1906} 1907 1908; In sub_i32_varying the subtrahend is the workitem id, so it differs per lane. 1909; The generated directives below encode two shapes of output. 1910; The first RUN line (no -mcpu) gets no mbcnt/bcnt sequence, and every lane 1911; issues its own ds_sub_rtn_u32. 1912; The RUN lines from tonga onwards get a DPP add scan over the lane values, 1913; a single ds_sub_rtn_u32 of the wave total issued by one lane, and a final 1914; subtract that rebuilds each lane's result from its scan prefix. 1915; This note deliberately names no FileCheck prefix, so none of it is matched. 1916define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) { 1917; 1918; 1919; GFX7LESS-LABEL: sub_i32_varying: 1920; GFX7LESS: ; %bb.0: ; %entry 1921; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1922; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 1923; GFX7LESS-NEXT: s_mov_b32 m0, -1 1924; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1925; GFX7LESS-NEXT: ds_sub_rtn_u32 v0, v1, v0 1926; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1927; GFX7LESS-NEXT: buffer_wbinvl1 1928; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 1929; GFX7LESS-NEXT: s_mov_b32 s2, -1 1930; GFX7LESS-NEXT: 
buffer_store_dword v0, off, s[0:3], 0 1931; GFX7LESS-NEXT: s_endpgm 1932; 1933; GFX8-LABEL: sub_i32_varying: 1934; GFX8: ; %bb.0: ; %entry 1935; GFX8-NEXT: v_mov_b32_e32 v2, v0 1936; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 1937; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1938; GFX8-NEXT: v_mov_b32_e32 v1, 0 1939; GFX8-NEXT: s_mov_b64 exec, s[2:3] 1940; GFX8-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 1941; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1942; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 1943; GFX8-NEXT: s_not_b64 exec, exec 1944; GFX8-NEXT: v_mov_b32_e32 v2, 0 1945; GFX8-NEXT: s_not_b64 exec, exec 1946; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 1947; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 1948; GFX8-NEXT: s_nop 1 1949; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 1950; GFX8-NEXT: s_nop 1 1951; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 1952; GFX8-NEXT: s_nop 1 1953; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 1954; GFX8-NEXT: s_nop 1 1955; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 1956; GFX8-NEXT: s_nop 1 1957; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 1958; GFX8-NEXT: v_readlane_b32 s2, v2, 63 1959; GFX8-NEXT: s_nop 0 1960; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 1961; GFX8-NEXT: s_mov_b64 exec, s[4:5] 1962; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1963; GFX8-NEXT: ; implicit-def: $vgpr0 1964; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 1965; GFX8-NEXT: s_cbranch_execz BB10_2 1966; GFX8-NEXT: ; %bb.1: 1967; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 1968; GFX8-NEXT: v_mov_b32_e32 v3, s2 1969; GFX8-NEXT: s_mov_b32 m0, -1 1970; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1971; GFX8-NEXT: ds_sub_rtn_u32 v0, v0, v3 1972; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1973; 
GFX8-NEXT: buffer_wbinvl1_vol 1974; GFX8-NEXT: BB10_2: 1975; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 1976; GFX8-NEXT: v_readfirstlane_b32 s2, v0 1977; GFX8-NEXT: v_mov_b32_e32 v0, v1 1978; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s2, v0 1979; GFX8-NEXT: s_mov_b32 s3, 0xf000 1980; GFX8-NEXT: s_mov_b32 s2, -1 1981; GFX8-NEXT: s_nop 0 1982; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1983; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 1984; GFX8-NEXT: s_endpgm 1985; 1986; GFX9-LABEL: sub_i32_varying: 1987; GFX9: ; %bb.0: ; %entry 1988; GFX9-NEXT: v_mov_b32_e32 v2, v0 1989; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 1990; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1991; GFX9-NEXT: v_mov_b32_e32 v1, 0 1992; GFX9-NEXT: s_mov_b64 exec, s[2:3] 1993; GFX9-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 1994; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1995; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 1996; GFX9-NEXT: s_not_b64 exec, exec 1997; GFX9-NEXT: v_mov_b32_e32 v2, 0 1998; GFX9-NEXT: s_not_b64 exec, exec 1999; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 2000; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 2001; GFX9-NEXT: s_nop 1 2002; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 2003; GFX9-NEXT: s_nop 1 2004; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 2005; GFX9-NEXT: s_nop 1 2006; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 2007; GFX9-NEXT: s_nop 1 2008; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 2009; GFX9-NEXT: s_nop 1 2010; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 2011; GFX9-NEXT: v_readlane_b32 s2, v2, 63 2012; GFX9-NEXT: s_nop 0 2013; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 2014; GFX9-NEXT: s_mov_b64 exec, s[4:5] 2015; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2016; GFX9-NEXT: ; implicit-def: $vgpr0 2017; GFX9-NEXT: s_and_saveexec_b64 s[4:5], 
vcc 2018; GFX9-NEXT: s_cbranch_execz BB10_2 2019; GFX9-NEXT: ; %bb.1: 2020; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 2021; GFX9-NEXT: v_mov_b32_e32 v3, s2 2022; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2023; GFX9-NEXT: ds_sub_rtn_u32 v0, v0, v3 2024; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2025; GFX9-NEXT: buffer_wbinvl1_vol 2026; GFX9-NEXT: BB10_2: 2027; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 2028; GFX9-NEXT: v_readfirstlane_b32 s2, v0 2029; GFX9-NEXT: v_mov_b32_e32 v0, v1 2030; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 2031; GFX9-NEXT: s_mov_b32 s3, 0xf000 2032; GFX9-NEXT: s_mov_b32 s2, -1 2033; GFX9-NEXT: s_nop 0 2034; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2035; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 2036; GFX9-NEXT: s_endpgm 2037; 2038; GFX1064-LABEL: sub_i32_varying: 2039; GFX1064: ; %bb.0: ; %entry 2040; GFX1064-NEXT: v_mov_b32_e32 v2, v0 2041; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 2042; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2043; GFX1064-NEXT: v_mov_b32_e32 v1, 0 2044; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 2045; GFX1064-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 2046; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 2047; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s3, v0 2048; GFX1064-NEXT: s_not_b64 exec, exec 2049; GFX1064-NEXT: v_mov_b32_e32 v2, 0 2050; GFX1064-NEXT: s_not_b64 exec, exec 2051; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 2052; GFX1064-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 2053; GFX1064-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 2054; GFX1064-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 2055; GFX1064-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 2056; GFX1064-NEXT: v_mov_b32_e32 v3, v2 2057; GFX1064-NEXT: v_permlanex16_b32 v3, v3, -1, -1 2058; GFX1064-NEXT: v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 2059; GFX1064-NEXT: v_readlane_b32 
s2, v2, 31 2060; GFX1064-NEXT: v_mov_b32_e32 v3, s2 2061; GFX1064-NEXT: v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 2062; GFX1064-NEXT: v_readlane_b32 s2, v2, 15 2063; GFX1064-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf 2064; GFX1064-NEXT: v_readlane_b32 s3, v2, 31 2065; GFX1064-NEXT: v_readlane_b32 s6, v2, 47 2066; GFX1064-NEXT: v_writelane_b32 v1, s2, 16 2067; GFX1064-NEXT: s_mov_b32 s2, -1 2068; GFX1064-NEXT: v_writelane_b32 v1, s3, 32 2069; GFX1064-NEXT: v_readlane_b32 s3, v2, 63 2070; GFX1064-NEXT: v_writelane_b32 v1, s6, 48 2071; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 2072; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2073; GFX1064-NEXT: ; implicit-def: $vgpr0 2074; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 2075; GFX1064-NEXT: s_cbranch_execz BB10_2 2076; GFX1064-NEXT: ; %bb.1: 2077; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 2078; GFX1064-NEXT: v_mov_b32_e32 v7, s3 2079; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2080; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 2081; GFX1064-NEXT: ds_sub_rtn_u32 v0, v0, v7 2082; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2083; GFX1064-NEXT: buffer_gl0_inv 2084; GFX1064-NEXT: buffer_gl1_inv 2085; GFX1064-NEXT: BB10_2: 2086; GFX1064-NEXT: v_nop 2087; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 2088; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 2089; GFX1064-NEXT: v_mov_b32_e32 v0, v1 2090; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s3, v0 2091; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 2092; GFX1064-NEXT: s_nop 1 2093; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2094; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 2095; GFX1064-NEXT: s_endpgm 2096; 2097; GFX1032-LABEL: sub_i32_varying: 2098; GFX1032: ; %bb.0: ; %entry 2099; GFX1032-NEXT: ; implicit-def: $vcc_hi 2100; GFX1032-NEXT: v_mov_b32_e32 v2, v0 2101; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 2102; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2103; GFX1032-NEXT: v_mov_b32_e32 v1, 0 2104; GFX1032-NEXT: s_mov_b32 exec_lo, s2 2105; 
GFX1032-NEXT: v_cmp_ne_u32_e64 s2, 1, 0 2106; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 2107; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 2108; GFX1032-NEXT: v_mov_b32_e32 v2, 0 2109; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 2110; GFX1032-NEXT: s_or_saveexec_b32 s4, -1 2111; GFX1032-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 2112; GFX1032-NEXT: s_mov_b32 s2, -1 2113; GFX1032-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 2114; GFX1032-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 2115; GFX1032-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 2116; GFX1032-NEXT: v_mov_b32_e32 v3, v2 2117; GFX1032-NEXT: v_permlanex16_b32 v3, v3, -1, -1 2118; GFX1032-NEXT: v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 2119; GFX1032-NEXT: v_readlane_b32 s3, v2, 31 2120; GFX1032-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf 2121; GFX1032-NEXT: v_readlane_b32 s5, v2, 15 2122; GFX1032-NEXT: v_writelane_b32 v1, s5, 16 2123; GFX1032-NEXT: s_mov_b32 exec_lo, s4 2124; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 2125; GFX1032-NEXT: ; implicit-def: $vgpr0 2126; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo 2127; GFX1032-NEXT: s_cbranch_execz BB10_2 2128; GFX1032-NEXT: ; %bb.1: 2129; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 2130; GFX1032-NEXT: v_mov_b32_e32 v7, s3 2131; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2132; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 2133; GFX1032-NEXT: ds_sub_rtn_u32 v0, v0, v7 2134; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2135; GFX1032-NEXT: buffer_gl0_inv 2136; GFX1032-NEXT: buffer_gl1_inv 2137; GFX1032-NEXT: BB10_2: 2138; GFX1032-NEXT: v_nop 2139; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 2140; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 2141; GFX1032-NEXT: v_mov_b32_e32 v0, v1 2142; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s3, v0 2143; GFX1032-NEXT: s_mov_b32 
s3, 0x31016000 2144; GFX1032-NEXT: s_nop 1 2145; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2146; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 2147; GFX1032-NEXT: s_endpgm 2148entry: 2149 %lane = call i32 @llvm.amdgcn.workitem.id.x() 2150 %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 %lane acq_rel 2151 store i32 %old, i32 addrspace(1)* %out 2152 ret void 2153} 2154 2155define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) { 2156; 2157; 2158; GFX7LESS-LABEL: sub_i64_constant: 2159; GFX7LESS: ; %bb.0: ; %entry 2160; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 2161; GFX7LESS-NEXT: v_cmp_ne_u32_e64 s[4:5], 1, 0 2162; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 2163; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 2164; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2165; GFX7LESS-NEXT: ; implicit-def: $vgpr1_vgpr2 2166; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc 2167; GFX7LESS-NEXT: s_cbranch_execz BB11_2 2168; GFX7LESS-NEXT: ; %bb.1: 2169; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 2170; GFX7LESS-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 2171; GFX7LESS-NEXT: v_mul_hi_u32_u24_e64 v2, s4, 5 2172; GFX7LESS-NEXT: v_mul_u32_u24_e64 v1, s4, 5 2173; GFX7LESS-NEXT: s_mov_b32 m0, -1 2174; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2175; GFX7LESS-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] 2176; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2177; GFX7LESS-NEXT: buffer_wbinvl1 2178; GFX7LESS-NEXT: BB11_2: 2179; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] 2180; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v1 2181; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v2 2182; GFX7LESS-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v0 2183; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v0 2184; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 2185; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4 2186; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 2187; GFX7LESS-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc 2188; GFX7LESS-NEXT: s_mov_b32 s2, -1 2189; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2190; GFX7LESS-NEXT: 
buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2191; GFX7LESS-NEXT: s_endpgm 2192; 2193; GFX8-LABEL: sub_i64_constant: 2194; GFX8: ; %bb.0: ; %entry 2195; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2196; GFX8-NEXT: v_cmp_ne_u32_e64 s[4:5], 1, 0 2197; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 2198; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 2199; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2200; GFX8-NEXT: ; implicit-def: $vgpr1_vgpr2 2201; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 2202; GFX8-NEXT: s_cbranch_execz BB11_2 2203; GFX8-NEXT: ; %bb.1: 2204; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 2205; GFX8-NEXT: v_mul_hi_u32_u24_e64 v2, s4, 5 2206; GFX8-NEXT: v_mul_u32_u24_e64 v1, s4, 5 2207; GFX8-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 2208; GFX8-NEXT: s_mov_b32 m0, -1 2209; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2210; GFX8-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] 2211; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2212; GFX8-NEXT: buffer_wbinvl1_vol 2213; GFX8-NEXT: BB11_2: 2214; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 2215; GFX8-NEXT: v_readfirstlane_b32 s3, v2 2216; GFX8-NEXT: v_readfirstlane_b32 s2, v1 2217; GFX8-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v0 2218; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v0 2219; GFX8-NEXT: v_mov_b32_e32 v2, s3 2220; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s2, v0 2221; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc 2222; GFX8-NEXT: s_mov_b32 s3, 0xf000 2223; GFX8-NEXT: s_mov_b32 s2, -1 2224; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2225; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2226; GFX8-NEXT: s_endpgm 2227; 2228; GFX9-LABEL: sub_i64_constant: 2229; GFX9: ; %bb.0: ; %entry 2230; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2231; GFX9-NEXT: v_cmp_ne_u32_e64 s[4:5], 1, 0 2232; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 2233; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 2234; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2235; GFX9-NEXT: ; implicit-def: $vgpr1_vgpr2 2236; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 2237; GFX9-NEXT: s_cbranch_execz BB11_2 2238; GFX9-NEXT: 
; %bb.1: 2239; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 2240; GFX9-NEXT: v_mul_hi_u32_u24_e64 v2, s4, 5 2241; GFX9-NEXT: v_mul_u32_u24_e64 v1, s4, 5 2242; GFX9-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 2243; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2244; GFX9-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] 2245; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2246; GFX9-NEXT: buffer_wbinvl1_vol 2247; GFX9-NEXT: BB11_2: 2248; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 2249; GFX9-NEXT: v_readfirstlane_b32 s3, v2 2250; GFX9-NEXT: v_readfirstlane_b32 s2, v1 2251; GFX9-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v0 2252; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v0 2253; GFX9-NEXT: v_mov_b32_e32 v2, s3 2254; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s2, v0 2255; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc 2256; GFX9-NEXT: s_mov_b32 s3, 0xf000 2257; GFX9-NEXT: s_mov_b32 s2, -1 2258; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2259; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2260; GFX9-NEXT: s_endpgm 2261; 2262; GFX1064-LABEL: sub_i64_constant: 2263; GFX1064: ; %bb.0: ; %entry 2264; GFX1064-NEXT: v_cmp_ne_u32_e64 s[4:5], 1, 0 2265; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2266; GFX1064-NEXT: ; implicit-def: $vgpr1_vgpr2 2267; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 2268; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s5, v0 2269; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2270; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc 2271; GFX1064-NEXT: s_cbranch_execz BB11_2 2272; GFX1064-NEXT: ; %bb.1: 2273; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 2274; GFX1064-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 2275; GFX1064-NEXT: v_mul_hi_u32_u24_e64 v2, s4, 5 2276; GFX1064-NEXT: v_mul_u32_u24_e64 v1, s4, 5 2277; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2278; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 2279; GFX1064-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] 2280; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2281; GFX1064-NEXT: buffer_gl0_inv 2282; GFX1064-NEXT: buffer_gl1_inv 2283; GFX1064-NEXT: BB11_2: 2284; 
GFX1064-NEXT: v_nop 2285; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] 2286; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 2287; GFX1064-NEXT: v_mul_u32_u24_e32 v1, 5, v0 2288; GFX1064-NEXT: v_readfirstlane_b32 s3, v2 2289; GFX1064-NEXT: v_mul_hi_u32_u24_e32 v2, 5, v0 2290; GFX1064-NEXT: v_sub_co_u32_e64 v0, vcc, s2, v1 2291; GFX1064-NEXT: s_mov_b32 s2, -1 2292; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v2, vcc 2293; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 2294; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2295; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2296; GFX1064-NEXT: s_endpgm 2297; 2298; GFX1032-LABEL: sub_i64_constant: 2299; GFX1032: ; %bb.0: ; %entry 2300; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2301; GFX1032-NEXT: v_cmp_ne_u32_e64 s3, 1, 0 2302; GFX1032-NEXT: ; implicit-def: $vcc_hi 2303; GFX1032-NEXT: ; implicit-def: $vgpr1_vgpr2 2304; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s3, 0 2305; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 2306; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo 2307; GFX1032-NEXT: s_cbranch_execz BB11_2 2308; GFX1032-NEXT: ; %bb.1: 2309; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 2310; GFX1032-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 2311; GFX1032-NEXT: v_mul_hi_u32_u24_e64 v2, s3, 5 2312; GFX1032-NEXT: v_mul_u32_u24_e64 v1, s3, 5 2313; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2314; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 2315; GFX1032-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] 2316; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2317; GFX1032-NEXT: buffer_gl0_inv 2318; GFX1032-NEXT: buffer_gl1_inv 2319; GFX1032-NEXT: BB11_2: 2320; GFX1032-NEXT: v_nop 2321; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 2322; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 2323; GFX1032-NEXT: v_mul_u32_u24_e32 v1, 5, v0 2324; GFX1032-NEXT: v_readfirstlane_b32 s3, v2 2325; GFX1032-NEXT: v_mul_hi_u32_u24_e32 v2, 5, v0 2326; GFX1032-NEXT: v_sub_co_u32_e64 v0, vcc_lo, s2, v1 2327; GFX1032-NEXT: s_mov_b32 s2, -1 2328; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, 
vcc_lo, s3, v2, vcc_lo 2329; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 2330; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2331; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2332; GFX1032-NEXT: s_endpgm 2333entry: 2334 %old = atomicrmw sub i64 addrspace(3)* @local_var64, i64 5 acq_rel 2335 store i64 %old, i64 addrspace(1)* %out 2336 ret void 2337} 2338; Uniform i64 operand (kernel argument %subitive): the lane whose mbcnt is 0 issues one ds_sub_rtn_u64 of operand * active-lane count; every lane then subtracts operand * mbcnt from the value broadcast with v_readfirstlane_b32. 2339define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 %subitive) { 2340; 2341; 2342; GFX7LESS-LABEL: sub_i64_uniform: 2343; GFX7LESS: ; %bb.0: ; %entry 2344; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 2345; GFX7LESS-NEXT: v_cmp_ne_u32_e64 s[6:7], 1, 0 2346; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 2347; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s7, v0 2348; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2349; GFX7LESS-NEXT: ; implicit-def: $vgpr1_vgpr2 2350; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc 2351; GFX7LESS-NEXT: s_cbranch_execz BB12_2 2352; GFX7LESS-NEXT: ; %bb.1: 2353; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 2354; GFX7LESS-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 2355; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2356; GFX7LESS-NEXT: s_mul_i32 s7, s3, s6 2357; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6 2358; GFX7LESS-NEXT: v_mul_hi_u32 v1, s2, v1 2359; GFX7LESS-NEXT: s_mul_i32 s6, s2, s6 2360; GFX7LESS-NEXT: v_add_i32_e32 v2, vcc, s7, v1 2361; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6 2362; GFX7LESS-NEXT: s_mov_b32 m0, -1 2363; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2364; GFX7LESS-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] 2365; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2366; GFX7LESS-NEXT: buffer_wbinvl1 2367; GFX7LESS-NEXT: BB12_2: 2368; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] 2369; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 2370; GFX7LESS-NEXT: s_mov_b32 s6, -1 2371; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2372; GFX7LESS-NEXT: s_mov_b32 s4, s0 2373; GFX7LESS-NEXT: s_mov_b32 s5, s1 2374; GFX7LESS-NEXT: v_readfirstlane_b32 s0, v1 2375; GFX7LESS-NEXT: v_readfirstlane_b32 s1, v2 2376;
GFX7LESS-NEXT: v_mul_lo_u32 v1, s3, v0 2377; GFX7LESS-NEXT: v_mul_hi_u32 v2, s2, v0 2378; GFX7LESS-NEXT: v_mul_lo_u32 v0, s2, v0 2379; GFX7LESS-NEXT: v_add_i32_e32 v1, vcc, v2, v1 2380; GFX7LESS-NEXT: v_mov_b32_e32 v2, s1 2381; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 2382; GFX7LESS-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc 2383; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 2384; GFX7LESS-NEXT: s_endpgm 2385; 2386; GFX8-LABEL: sub_i64_uniform: 2387; GFX8: ; %bb.0: ; %entry 2388; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 2389; GFX8-NEXT: v_cmp_ne_u32_e64 s[6:7], 1, 0 2390; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 2391; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 2392; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2393; GFX8-NEXT: ; implicit-def: $vgpr1_vgpr2 2394; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 2395; GFX8-NEXT: s_cbranch_execz BB12_2 2396; GFX8-NEXT: ; %bb.1: 2397; GFX8-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 2398; GFX8-NEXT: v_mov_b32_e32 v1, s6 2399; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2400; GFX8-NEXT: v_mul_hi_u32 v1, s2, v1 2401; GFX8-NEXT: s_mul_i32 s7, s3, s6 2402; GFX8-NEXT: s_mul_i32 s6, s2, s6 2403; GFX8-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 2404; GFX8-NEXT: v_add_u32_e32 v2, vcc, s7, v1 2405; GFX8-NEXT: v_mov_b32_e32 v1, s6 2406; GFX8-NEXT: s_mov_b32 m0, -1 2407; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2408; GFX8-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] 2409; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2410; GFX8-NEXT: buffer_wbinvl1_vol 2411; GFX8-NEXT: BB12_2: 2412; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 2413; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2414; GFX8-NEXT: s_mov_b32 s4, s0 2415; GFX8-NEXT: v_readfirstlane_b32 s0, v1 2416; GFX8-NEXT: v_mul_lo_u32 v1, s3, v0 2417; GFX8-NEXT: v_mul_hi_u32 v3, s2, v0 2418; GFX8-NEXT: v_mul_lo_u32 v0, s2, v0 2419; GFX8-NEXT: s_mov_b32 s5, s1 2420; GFX8-NEXT: v_readfirstlane_b32 s1, v2 2421; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 2422; GFX8-NEXT: v_mov_b32_e32 v2, s1 2423; GFX8-NEXT: v_sub_u32_e32 v0, vcc, 
s0, v0 2424; GFX8-NEXT: s_mov_b32 s7, 0xf000 2425; GFX8-NEXT: s_mov_b32 s6, -1 2426; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc 2427; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 2428; GFX8-NEXT: s_endpgm 2429; 2430; GFX9-LABEL: sub_i64_uniform: 2431; GFX9: ; %bb.0: ; %entry 2432; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 2433; GFX9-NEXT: v_cmp_ne_u32_e64 s[6:7], 1, 0 2434; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 2435; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 2436; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2437; GFX9-NEXT: ; implicit-def: $vgpr1_vgpr2 2438; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 2439; GFX9-NEXT: s_cbranch_execz BB12_2 2440; GFX9-NEXT: ; %bb.1: 2441; GFX9-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 2442; GFX9-NEXT: v_mov_b32_e32 v1, s6 2443; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2444; GFX9-NEXT: v_mul_hi_u32 v2, s2, v1 2445; GFX9-NEXT: s_mul_i32 s7, s3, s6 2446; GFX9-NEXT: s_mul_i32 s6, s2, s6 2447; GFX9-NEXT: v_mov_b32_e32 v1, s6 2448; GFX9-NEXT: v_add_u32_e32 v2, s7, v2 2449; GFX9-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 2450; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2451; GFX9-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] 2452; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2453; GFX9-NEXT: buffer_wbinvl1_vol 2454; GFX9-NEXT: BB12_2: 2455; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 2456; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2457; GFX9-NEXT: v_mul_lo_u32 v3, s3, v0 2458; GFX9-NEXT: v_mul_hi_u32 v4, s2, v0 2459; GFX9-NEXT: v_mul_lo_u32 v0, s2, v0 2460; GFX9-NEXT: s_mov_b32 s4, s0 2461; GFX9-NEXT: v_readfirstlane_b32 s0, v1 2462; GFX9-NEXT: s_mov_b32 s5, s1 2463; GFX9-NEXT: v_readfirstlane_b32 s1, v2 2464; GFX9-NEXT: v_add_u32_e32 v1, v4, v3 2465; GFX9-NEXT: v_mov_b32_e32 v2, s1 2466; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s0, v0 2467; GFX9-NEXT: s_mov_b32 s7, 0xf000 2468; GFX9-NEXT: s_mov_b32 s6, -1 2469; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc 2470; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 2471; GFX9-NEXT: s_endpgm 2472; 2473; 
GFX1064-LABEL: sub_i64_uniform: 2474; GFX1064: ; %bb.0: ; %entry 2475; GFX1064-NEXT: v_cmp_ne_u32_e64 s[6:7], 1, 0 2476; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 2477; GFX1064-NEXT: ; implicit-def: $vgpr1_vgpr2 2478; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 2479; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s7, v0 2480; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2481; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 2482; GFX1064-NEXT: s_cbranch_execz BB12_2 2483; GFX1064-NEXT: ; %bb.1: 2484; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 2485; GFX1064-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 2486; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2487; GFX1064-NEXT: v_mul_hi_u32 v2, s2, s6 2488; GFX1064-NEXT: s_mul_i32 s7, s2, s6 2489; GFX1064-NEXT: s_mul_i32 s6, s3, s6 2490; GFX1064-NEXT: v_mov_b32_e32 v1, s7 2491; GFX1064-NEXT: v_add_nc_u32_e32 v2, s6, v2 2492; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2493; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 2494; GFX1064-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] 2495; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2496; GFX1064-NEXT: buffer_gl0_inv 2497; GFX1064-NEXT: buffer_gl1_inv 2498; GFX1064-NEXT: BB12_2: 2499; GFX1064-NEXT: v_nop 2500; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 2501; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2502; GFX1064-NEXT: v_mul_lo_u32 v3, s3, v0 2503; GFX1064-NEXT: v_mul_hi_u32 v4, s2, v0 2504; GFX1064-NEXT: v_mul_lo_u32 v0, s2, v0 2505; GFX1064-NEXT: v_readfirstlane_b32 s4, v1 2506; GFX1064-NEXT: v_readfirstlane_b32 s5, v2 2507; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 2508; GFX1064-NEXT: s_mov_b32 s2, -1 2509; GFX1064-NEXT: v_add_nc_u32_e32 v1, v4, v3 2510; GFX1064-NEXT: v_sub_co_u32_e64 v0, vcc, s4, v0 2511; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s5, v1, vcc 2512; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2513; GFX1064-NEXT: s_endpgm 2514; 2515; GFX1032-LABEL: sub_i64_uniform: 2516; GFX1032: ; %bb.0: ; %entry 2517; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 2518; GFX1032-NEXT: 
v_cmp_ne_u32_e64 s5, 1, 0 2519; GFX1032-NEXT: ; implicit-def: $vcc_hi 2520; GFX1032-NEXT: ; implicit-def: $vgpr1_vgpr2 2521; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s5, 0 2522; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 2523; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo 2524; GFX1032-NEXT: s_cbranch_execz BB12_2 2525; GFX1032-NEXT: ; %bb.1: 2526; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s5 2527; GFX1032-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 2528; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2529; GFX1032-NEXT: v_mul_hi_u32 v2, s2, s5 2530; GFX1032-NEXT: s_mul_i32 s6, s2, s5 2531; GFX1032-NEXT: s_mul_i32 s5, s3, s5 2532; GFX1032-NEXT: v_mov_b32_e32 v1, s6 2533; GFX1032-NEXT: v_add_nc_u32_e32 v2, s5, v2 2534; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2535; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 2536; GFX1032-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] 2537; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2538; GFX1032-NEXT: buffer_gl0_inv 2539; GFX1032-NEXT: buffer_gl1_inv 2540; GFX1032-NEXT: BB12_2: 2541; GFX1032-NEXT: v_nop 2542; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 2543; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2544; GFX1032-NEXT: v_mul_lo_u32 v3, s3, v0 2545; GFX1032-NEXT: v_mul_hi_u32 v4, s2, v0 2546; GFX1032-NEXT: v_mul_lo_u32 v0, s2, v0 2547; GFX1032-NEXT: v_readfirstlane_b32 s4, v1 2548; GFX1032-NEXT: v_readfirstlane_b32 s5, v2 2549; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 2550; GFX1032-NEXT: s_mov_b32 s2, -1 2551; GFX1032-NEXT: v_add_nc_u32_e32 v1, v4, v3 2552; GFX1032-NEXT: v_sub_co_u32_e64 v0, vcc_lo, s4, v0 2553; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s5, v1, vcc_lo 2554; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2555; GFX1032-NEXT: s_endpgm 2556entry: 2557 %old = atomicrmw sub i64 addrspace(3)* @local_var64, i64 %subitive acq_rel 2558 store i64 %old, i64 addrspace(1)* %out 2559 ret void 2560} 2561 2562; For the varying (per-lane) i64 operand the expected output below is straight-line: 2563; it contains no v_mbcnt_lo_u32_b32, v_mbcnt_hi_u32_b32 or s_bcnt1_i32_b64, 2564; just one ds_sub_rtn_u64 per target. 2565define amdgpu_kernel void @sub_i64_varying(i64
addrspace(1)* %out) { 2566; 2567; 2568; GFX7LESS-LABEL: sub_i64_varying: 2569; GFX7LESS: ; %bb.0: ; %entry 2570; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 2571; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 2572; GFX7LESS-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 2573; GFX7LESS-NEXT: s_mov_b32 m0, -1 2574; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2575; GFX7LESS-NEXT: ds_sub_rtn_u64 v[0:1], v2, v[0:1] 2576; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2577; GFX7LESS-NEXT: buffer_wbinvl1 2578; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 2579; GFX7LESS-NEXT: s_mov_b32 s2, -1 2580; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2581; GFX7LESS-NEXT: s_endpgm 2582; 2583; GFX8-LABEL: sub_i64_varying: 2584; GFX8: ; %bb.0: ; %entry 2585; GFX8-NEXT: v_mov_b32_e32 v1, 0 2586; GFX8-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 2587; GFX8-NEXT: s_mov_b32 m0, -1 2588; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2589; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2590; GFX8-NEXT: ds_sub_rtn_u64 v[0:1], v2, v[0:1] 2591; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2592; GFX8-NEXT: buffer_wbinvl1_vol 2593; GFX8-NEXT: s_mov_b32 s3, 0xf000 2594; GFX8-NEXT: s_mov_b32 s2, -1 2595; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2596; GFX8-NEXT: s_endpgm 2597; 2598; GFX9-LABEL: sub_i64_varying: 2599; GFX9: ; %bb.0: ; %entry 2600; GFX9-NEXT: v_mov_b32_e32 v1, 0 2601; GFX9-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 2602; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2603; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2604; GFX9-NEXT: ds_sub_rtn_u64 v[0:1], v2, v[0:1] 2605; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2606; GFX9-NEXT: buffer_wbinvl1_vol 2607; GFX9-NEXT: s_mov_b32 s3, 0xf000 2608; GFX9-NEXT: s_mov_b32 s2, -1 2609; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2610; GFX9-NEXT: s_endpgm 2611; 2612; GFX1064-LABEL: sub_i64_varying: 2613; GFX1064: ; %bb.0: ; %entry 2614; GFX1064-NEXT: v_mov_b32_e32 v1, 0 2615; GFX1064-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 2616; 
GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2617; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 2618; GFX1064-NEXT: s_mov_b32 s2, -1 2619; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2620; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 2621; GFX1064-NEXT: ds_sub_rtn_u64 v[0:1], v2, v[0:1] 2622; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2623; GFX1064-NEXT: buffer_gl0_inv 2624; GFX1064-NEXT: buffer_gl1_inv 2625; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2626; GFX1064-NEXT: s_endpgm 2627; 2628; GFX1032-LABEL: sub_i64_varying: 2629; GFX1032: ; %bb.0: ; %entry 2630; GFX1032-NEXT: v_mov_b32_e32 v1, 0 2631; GFX1032-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 2632; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2633; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 2634; GFX1032-NEXT: s_mov_b32 s2, -1 2635; GFX1032-NEXT: ; implicit-def: $vcc_hi 2636; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2637; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 2638; GFX1032-NEXT: ds_sub_rtn_u64 v[0:1], v2, v[0:1] 2639; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2640; GFX1032-NEXT: buffer_gl0_inv 2641; GFX1032-NEXT: buffer_gl1_inv 2642; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2643; GFX1032-NEXT: s_endpgm 2644entry: 2645 %lane = call i32 @llvm.amdgcn.workitem.id.x() 2646 %zext = zext i32 %lane to i64 2647 %old = atomicrmw sub i64 addrspace(3)* @local_var64, i64 %zext acq_rel 2648 store i64 %old, i64 addrspace(1)* %out 2649 ret void 2650} 2651 2652; On gfx8 and newer the DPP-reduced AND is read with v_readlane_b32 from the last 2653; lane (63 in wave64, 31 in wave32), copied to a VGPR with v_mov_b32, and used as 2654; the data operand of the one ds_and_rtn_b32 guarded by the mbcnt == 0 test. 2655define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) { 2656; 2657; 2658; GFX7LESS-LABEL: and_i32_varying: 2659; GFX7LESS: ; %bb.0: ; %entry 2660; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 2661; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 2662; GFX7LESS-NEXT: s_mov_b32 m0, -1 2663;
GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2664; GFX7LESS-NEXT: ds_and_rtn_b32 v0, v1, v0 2665; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2666; GFX7LESS-NEXT: buffer_wbinvl1 2667; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 2668; GFX7LESS-NEXT: s_mov_b32 s2, -1 2669; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 2670; GFX7LESS-NEXT: s_endpgm 2671; 2672; GFX8-LABEL: and_i32_varying: 2673; GFX8: ; %bb.0: ; %entry 2674; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2675; GFX8-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 2676; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, s2, 0 2677; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, s3, v3 2678; GFX8-NEXT: v_mov_b32_e32 v2, v0 2679; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 2680; GFX8-NEXT: v_mov_b32_e32 v1, -1 2681; GFX8-NEXT: s_mov_b64 exec, s[2:3] 2682; GFX8-NEXT: s_not_b64 exec, exec 2683; GFX8-NEXT: v_mov_b32_e32 v2, -1 2684; GFX8-NEXT: s_not_b64 exec, exec 2685; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 2686; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 2687; GFX8-NEXT: s_nop 1 2688; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 2689; GFX8-NEXT: s_nop 1 2690; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 2691; GFX8-NEXT: s_nop 1 2692; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 2693; GFX8-NEXT: s_nop 1 2694; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 2695; GFX8-NEXT: s_nop 1 2696; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 2697; GFX8-NEXT: v_readlane_b32 s2, v2, 63 2698; GFX8-NEXT: s_nop 0 2699; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 2700; GFX8-NEXT: s_mov_b64 exec, s[4:5] 2701; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 2702; GFX8-NEXT: ; implicit-def: $vgpr0 2703; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 2704; GFX8-NEXT: s_cbranch_execz BB14_2 2705; GFX8-NEXT: ; %bb.1: 2706; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 2707; GFX8-NEXT: 
v_mov_b32_e32 v3, s2 2708; GFX8-NEXT: s_mov_b32 m0, -1 2709; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2710; GFX8-NEXT: ds_and_rtn_b32 v0, v0, v3 2711; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2712; GFX8-NEXT: buffer_wbinvl1_vol 2713; GFX8-NEXT: BB14_2: 2714; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 2715; GFX8-NEXT: v_readfirstlane_b32 s2, v0 2716; GFX8-NEXT: v_mov_b32_e32 v0, v1 2717; GFX8-NEXT: v_and_b32_e32 v0, s2, v0 2718; GFX8-NEXT: s_mov_b32 s3, 0xf000 2719; GFX8-NEXT: s_mov_b32 s2, -1 2720; GFX8-NEXT: s_nop 0 2721; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2722; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 2723; GFX8-NEXT: s_endpgm 2724; 2725; GFX9-LABEL: and_i32_varying: 2726; GFX9: ; %bb.0: ; %entry 2727; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2728; GFX9-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 2729; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, s2, 0 2730; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, s3, v3 2731; GFX9-NEXT: v_mov_b32_e32 v2, v0 2732; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 2733; GFX9-NEXT: v_mov_b32_e32 v1, -1 2734; GFX9-NEXT: s_mov_b64 exec, s[2:3] 2735; GFX9-NEXT: s_not_b64 exec, exec 2736; GFX9-NEXT: v_mov_b32_e32 v2, -1 2737; GFX9-NEXT: s_not_b64 exec, exec 2738; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 2739; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 2740; GFX9-NEXT: s_nop 1 2741; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 2742; GFX9-NEXT: s_nop 1 2743; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 2744; GFX9-NEXT: s_nop 1 2745; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 2746; GFX9-NEXT: s_nop 1 2747; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 2748; GFX9-NEXT: s_nop 1 2749; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 2750; GFX9-NEXT: v_readlane_b32 s2, v2, 63 2751; GFX9-NEXT: s_nop 0 2752; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 2753; GFX9-NEXT: s_mov_b64 
exec, s[4:5] 2754; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 2755; GFX9-NEXT: ; implicit-def: $vgpr0 2756; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 2757; GFX9-NEXT: s_cbranch_execz BB14_2 2758; GFX9-NEXT: ; %bb.1: 2759; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 2760; GFX9-NEXT: v_mov_b32_e32 v3, s2 2761; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2762; GFX9-NEXT: ds_and_rtn_b32 v0, v0, v3 2763; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2764; GFX9-NEXT: buffer_wbinvl1_vol 2765; GFX9-NEXT: BB14_2: 2766; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 2767; GFX9-NEXT: v_readfirstlane_b32 s2, v0 2768; GFX9-NEXT: v_mov_b32_e32 v0, v1 2769; GFX9-NEXT: v_and_b32_e32 v0, s2, v0 2770; GFX9-NEXT: s_mov_b32 s3, 0xf000 2771; GFX9-NEXT: s_mov_b32 s2, -1 2772; GFX9-NEXT: s_nop 0 2773; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2774; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 2775; GFX9-NEXT: s_endpgm 2776; 2777; GFX1064-LABEL: and_i32_varying: 2778; GFX1064: ; %bb.0: ; %entry 2779; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2780; GFX1064-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 2781; GFX1064-NEXT: v_mov_b32_e32 v2, v0 2782; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v4, s2, 0 2783; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v4, s3, v4 2784; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 2785; GFX1064-NEXT: v_mov_b32_e32 v1, -1 2786; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 2787; GFX1064-NEXT: s_not_b64 exec, exec 2788; GFX1064-NEXT: v_mov_b32_e32 v2, -1 2789; GFX1064-NEXT: s_not_b64 exec, exec 2790; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 2791; GFX1064-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 2792; GFX1064-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 2793; GFX1064-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 2794; GFX1064-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 2795; GFX1064-NEXT: v_mov_b32_e32 v3, v2 2796; GFX1064-NEXT: v_permlanex16_b32 v3, v3, -1, -1 2797; GFX1064-NEXT: v_and_b32_dpp v2, v3, v2 
quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 2798; GFX1064-NEXT: v_readlane_b32 s2, v2, 31 2799; GFX1064-NEXT: v_mov_b32_e32 v3, s2 2800; GFX1064-NEXT: v_and_b32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 2801; GFX1064-NEXT: v_readlane_b32 s2, v2, 15 2802; GFX1064-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf 2803; GFX1064-NEXT: v_readlane_b32 s3, v2, 31 2804; GFX1064-NEXT: v_readlane_b32 s6, v2, 47 2805; GFX1064-NEXT: v_writelane_b32 v1, s2, 16 2806; GFX1064-NEXT: s_mov_b32 s2, -1 2807; GFX1064-NEXT: v_writelane_b32 v1, s3, 32 2808; GFX1064-NEXT: v_readlane_b32 s3, v2, 63 2809; GFX1064-NEXT: v_writelane_b32 v1, s6, 48 2810; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 2811; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 2812; GFX1064-NEXT: ; implicit-def: $vgpr0 2813; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 2814; GFX1064-NEXT: s_cbranch_execz BB14_2 2815; GFX1064-NEXT: ; %bb.1: 2816; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 2817; GFX1064-NEXT: v_mov_b32_e32 v7, s3 2818; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2819; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 2820; GFX1064-NEXT: ds_and_rtn_b32 v0, v0, v7 2821; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2822; GFX1064-NEXT: buffer_gl0_inv 2823; GFX1064-NEXT: buffer_gl1_inv 2824; GFX1064-NEXT: BB14_2: 2825; GFX1064-NEXT: v_nop 2826; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 2827; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 2828; GFX1064-NEXT: v_mov_b32_e32 v0, v1 2829; GFX1064-NEXT: v_and_b32_e32 v0, s3, v0 2830; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 2831; GFX1064-NEXT: s_nop 1 2832; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2833; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 2834; GFX1064-NEXT: s_endpgm 2835; 2836; GFX1032-LABEL: and_i32_varying: 2837; GFX1032: ; %bb.0: ; %entry 2838; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2839; GFX1032-NEXT: v_cmp_ne_u32_e64 s2, 1, 0 2840; GFX1032-NEXT: ; implicit-def: $vcc_hi 2841; GFX1032-NEXT: v_mov_b32_e32 v2, v0 2842; 
GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v4, s2, 0 2843; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 2844; GFX1032-NEXT: v_mov_b32_e32 v1, -1 2845; GFX1032-NEXT: s_mov_b32 exec_lo, s2 2846; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 2847; GFX1032-NEXT: v_mov_b32_e32 v2, -1 2848; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 2849; GFX1032-NEXT: s_or_saveexec_b32 s4, -1 2850; GFX1032-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 2851; GFX1032-NEXT: s_mov_b32 s2, -1 2852; GFX1032-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 2853; GFX1032-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 2854; GFX1032-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 2855; GFX1032-NEXT: v_mov_b32_e32 v3, v2 2856; GFX1032-NEXT: v_permlanex16_b32 v3, v3, -1, -1 2857; GFX1032-NEXT: v_and_b32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 2858; GFX1032-NEXT: v_readlane_b32 s3, v2, 31 2859; GFX1032-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf 2860; GFX1032-NEXT: v_readlane_b32 s5, v2, 15 2861; GFX1032-NEXT: v_writelane_b32 v1, s5, 16 2862; GFX1032-NEXT: s_mov_b32 exec_lo, s4 2863; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 2864; GFX1032-NEXT: ; implicit-def: $vgpr0 2865; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo 2866; GFX1032-NEXT: s_cbranch_execz BB14_2 2867; GFX1032-NEXT: ; %bb.1: 2868; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 2869; GFX1032-NEXT: v_mov_b32_e32 v7, s3 2870; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2871; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 2872; GFX1032-NEXT: ds_and_rtn_b32 v0, v0, v7 2873; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2874; GFX1032-NEXT: buffer_gl0_inv 2875; GFX1032-NEXT: buffer_gl1_inv 2876; GFX1032-NEXT: BB14_2: 2877; GFX1032-NEXT: v_nop 2878; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 2879; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 2880; GFX1032-NEXT: v_mov_b32_e32 v0, v1 2881; GFX1032-NEXT: v_and_b32_e32 v0, s3, v0 2882; GFX1032-NEXT: 
s_mov_b32 s3, 0x31016000 2883; GFX1032-NEXT: s_nop 1 2884; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2885; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 2886; GFX1032-NEXT: s_endpgm 2887entry: 2888 %lane = call i32 @llvm.amdgcn.workitem.id.x() 2889 %old = atomicrmw and i32 addrspace(3)* @local_var32, i32 %lane acq_rel 2890 store i32 %old, i32 addrspace(1)* %out 2891 ret void 2892} 2893 2894; On gfx8 and newer the DPP-reduced OR is read with v_readlane_b32 from the last 2895; lane (63 in wave64, 31 in wave32), copied to a VGPR with v_mov_b32, and used as 2896; the data operand of the one ds_or_rtn_b32 guarded by the mbcnt == 0 test. 2897define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) { 2898; 2899; 2900; GFX7LESS-LABEL: or_i32_varying: 2901; GFX7LESS: ; %bb.0: ; %entry 2902; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 2903; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 2904; GFX7LESS-NEXT: s_mov_b32 m0, -1 2905; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2906; GFX7LESS-NEXT: ds_or_rtn_b32 v0, v1, v0 2907; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2908; GFX7LESS-NEXT: buffer_wbinvl1 2909; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 2910; GFX7LESS-NEXT: s_mov_b32 s2, -1 2911; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 2912; GFX7LESS-NEXT: s_endpgm 2913; 2914; GFX8-LABEL: or_i32_varying: 2915; GFX8: ; %bb.0: ; %entry 2916; GFX8-NEXT: v_mov_b32_e32 v2, v0 2917; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 2918; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2919; GFX8-NEXT: v_mov_b32_e32 v1, 0 2920; GFX8-NEXT: s_mov_b64 exec, s[2:3] 2921; GFX8-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 2922; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 2923; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 2924; GFX8-NEXT: s_not_b64 exec, exec 2925; GFX8-NEXT: v_mov_b32_e32 v2, 0 2926; GFX8-NEXT: s_not_b64 exec, exec 2927; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 2928; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 2929; GFX8-NEXT: s_nop 1 2930; GFX8-NEXT: v_or_b32_dpp
v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 2931; GFX8-NEXT: s_nop 1 2932; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 2933; GFX8-NEXT: s_nop 1 2934; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 2935; GFX8-NEXT: s_nop 1 2936; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 2937; GFX8-NEXT: s_nop 1 2938; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 2939; GFX8-NEXT: v_readlane_b32 s2, v2, 63 2940; GFX8-NEXT: s_nop 0 2941; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 2942; GFX8-NEXT: s_mov_b64 exec, s[4:5] 2943; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2944; GFX8-NEXT: ; implicit-def: $vgpr0 2945; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 2946; GFX8-NEXT: s_cbranch_execz BB15_2 2947; GFX8-NEXT: ; %bb.1: 2948; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 2949; GFX8-NEXT: v_mov_b32_e32 v3, s2 2950; GFX8-NEXT: s_mov_b32 m0, -1 2951; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2952; GFX8-NEXT: ds_or_rtn_b32 v0, v0, v3 2953; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2954; GFX8-NEXT: buffer_wbinvl1_vol 2955; GFX8-NEXT: BB15_2: 2956; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 2957; GFX8-NEXT: v_readfirstlane_b32 s2, v0 2958; GFX8-NEXT: v_mov_b32_e32 v0, v1 2959; GFX8-NEXT: v_or_b32_e32 v0, s2, v0 2960; GFX8-NEXT: s_mov_b32 s3, 0xf000 2961; GFX8-NEXT: s_mov_b32 s2, -1 2962; GFX8-NEXT: s_nop 0 2963; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2964; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 2965; GFX8-NEXT: s_endpgm 2966; 2967; GFX9-LABEL: or_i32_varying: 2968; GFX9: ; %bb.0: ; %entry 2969; GFX9-NEXT: v_mov_b32_e32 v2, v0 2970; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 2971; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2972; GFX9-NEXT: v_mov_b32_e32 v1, 0 2973; GFX9-NEXT: s_mov_b64 exec, s[2:3] 2974; GFX9-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 2975; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 2976; GFX9-NEXT: 
v_mbcnt_hi_u32_b32 v0, s3, v0 2977; GFX9-NEXT: s_not_b64 exec, exec 2978; GFX9-NEXT: v_mov_b32_e32 v2, 0 2979; GFX9-NEXT: s_not_b64 exec, exec 2980; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 2981; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 2982; GFX9-NEXT: s_nop 1 2983; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 2984; GFX9-NEXT: s_nop 1 2985; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 2986; GFX9-NEXT: s_nop 1 2987; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 2988; GFX9-NEXT: s_nop 1 2989; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 2990; GFX9-NEXT: s_nop 1 2991; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 2992; GFX9-NEXT: v_readlane_b32 s2, v2, 63 2993; GFX9-NEXT: s_nop 0 2994; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 2995; GFX9-NEXT: s_mov_b64 exec, s[4:5] 2996; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2997; GFX9-NEXT: ; implicit-def: $vgpr0 2998; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 2999; GFX9-NEXT: s_cbranch_execz BB15_2 3000; GFX9-NEXT: ; %bb.1: 3001; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 3002; GFX9-NEXT: v_mov_b32_e32 v3, s2 3003; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3004; GFX9-NEXT: ds_or_rtn_b32 v0, v0, v3 3005; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3006; GFX9-NEXT: buffer_wbinvl1_vol 3007; GFX9-NEXT: BB15_2: 3008; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 3009; GFX9-NEXT: v_readfirstlane_b32 s2, v0 3010; GFX9-NEXT: v_mov_b32_e32 v0, v1 3011; GFX9-NEXT: v_or_b32_e32 v0, s2, v0 3012; GFX9-NEXT: s_mov_b32 s3, 0xf000 3013; GFX9-NEXT: s_mov_b32 s2, -1 3014; GFX9-NEXT: s_nop 0 3015; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3016; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 3017; GFX9-NEXT: s_endpgm 3018; 3019; GFX1064-LABEL: or_i32_varying: 3020; GFX1064: ; %bb.0: ; %entry 3021; GFX1064-NEXT: 
v_mov_b32_e32 v2, v0 3022; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 3023; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3024; GFX1064-NEXT: v_mov_b32_e32 v1, 0 3025; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 3026; GFX1064-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 3027; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 3028; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s3, v0 3029; GFX1064-NEXT: s_not_b64 exec, exec 3030; GFX1064-NEXT: v_mov_b32_e32 v2, 0 3031; GFX1064-NEXT: s_not_b64 exec, exec 3032; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 3033; GFX1064-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 3034; GFX1064-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 3035; GFX1064-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 3036; GFX1064-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 3037; GFX1064-NEXT: v_mov_b32_e32 v3, v2 3038; GFX1064-NEXT: v_permlanex16_b32 v3, v3, -1, -1 3039; GFX1064-NEXT: v_or_b32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 3040; GFX1064-NEXT: v_readlane_b32 s2, v2, 31 3041; GFX1064-NEXT: v_mov_b32_e32 v3, s2 3042; GFX1064-NEXT: v_or_b32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 3043; GFX1064-NEXT: v_readlane_b32 s2, v2, 15 3044; GFX1064-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3045; GFX1064-NEXT: v_readlane_b32 s3, v2, 31 3046; GFX1064-NEXT: v_readlane_b32 s6, v2, 47 3047; GFX1064-NEXT: v_writelane_b32 v1, s2, 16 3048; GFX1064-NEXT: s_mov_b32 s2, -1 3049; GFX1064-NEXT: v_writelane_b32 v1, s3, 32 3050; GFX1064-NEXT: v_readlane_b32 s3, v2, 63 3051; GFX1064-NEXT: v_writelane_b32 v1, s6, 48 3052; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 3053; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3054; GFX1064-NEXT: ; implicit-def: $vgpr0 3055; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 3056; GFX1064-NEXT: s_cbranch_execz BB15_2 3057; GFX1064-NEXT: ; %bb.1: 3058; GFX1064-NEXT: 
v_mov_b32_e32 v0, local_var32@abs32@lo 3059; GFX1064-NEXT: v_mov_b32_e32 v7, s3 3060; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3061; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 3062; GFX1064-NEXT: ds_or_rtn_b32 v0, v0, v7 3063; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3064; GFX1064-NEXT: buffer_gl0_inv 3065; GFX1064-NEXT: buffer_gl1_inv 3066; GFX1064-NEXT: BB15_2: 3067; GFX1064-NEXT: v_nop 3068; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 3069; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 3070; GFX1064-NEXT: v_mov_b32_e32 v0, v1 3071; GFX1064-NEXT: v_or_b32_e32 v0, s3, v0 3072; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 3073; GFX1064-NEXT: s_nop 1 3074; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3075; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 3076; GFX1064-NEXT: s_endpgm 3077; 3078; GFX1032-LABEL: or_i32_varying: 3079; GFX1032: ; %bb.0: ; %entry 3080; GFX1032-NEXT: ; implicit-def: $vcc_hi 3081; GFX1032-NEXT: v_mov_b32_e32 v2, v0 3082; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3083; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3084; GFX1032-NEXT: v_mov_b32_e32 v1, 0 3085; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3086; GFX1032-NEXT: v_cmp_ne_u32_e64 s2, 1, 0 3087; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 3088; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 3089; GFX1032-NEXT: v_mov_b32_e32 v2, 0 3090; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 3091; GFX1032-NEXT: s_or_saveexec_b32 s4, -1 3092; GFX1032-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 3093; GFX1032-NEXT: s_mov_b32 s2, -1 3094; GFX1032-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 3095; GFX1032-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 3096; GFX1032-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 3097; GFX1032-NEXT: v_mov_b32_e32 v3, v2 3098; GFX1032-NEXT: v_permlanex16_b32 v3, v3, -1, -1 3099; GFX1032-NEXT: v_or_b32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 
3100; GFX1032-NEXT: v_readlane_b32 s3, v2, 31 3101; GFX1032-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3102; GFX1032-NEXT: v_readlane_b32 s5, v2, 15 3103; GFX1032-NEXT: v_writelane_b32 v1, s5, 16 3104; GFX1032-NEXT: s_mov_b32 exec_lo, s4 3105; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 3106; GFX1032-NEXT: ; implicit-def: $vgpr0 3107; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo 3108; GFX1032-NEXT: s_cbranch_execz BB15_2 3109; GFX1032-NEXT: ; %bb.1: 3110; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 3111; GFX1032-NEXT: v_mov_b32_e32 v7, s3 3112; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3113; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 3114; GFX1032-NEXT: ds_or_rtn_b32 v0, v0, v7 3115; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3116; GFX1032-NEXT: buffer_gl0_inv 3117; GFX1032-NEXT: buffer_gl1_inv 3118; GFX1032-NEXT: BB15_2: 3119; GFX1032-NEXT: v_nop 3120; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 3121; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 3122; GFX1032-NEXT: v_mov_b32_e32 v0, v1 3123; GFX1032-NEXT: v_or_b32_e32 v0, s3, v0 3124; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 3125; GFX1032-NEXT: s_nop 1 3126; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3127; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 3128; GFX1032-NEXT: s_endpgm 3129entry: 3130 %lane = call i32 @llvm.amdgcn.workitem.id.x() 3131 %old = atomicrmw or i32 addrspace(3)* @local_var32, i32 %lane acq_rel 3132 store i32 %old, i32 addrspace(1)* %out 3133 ret void 3134} 3135 ; NOTE(review): no RUN line passes GFX8MORE32 or GFX8MORE to FileCheck (only GFX7LESS, GFX8, GFX9, GFX1064 and GFX1032 are enabled), so the three checks below are never evaluated; the generated GFX1032 checks for xor_i32_varying already pin the same readlane/mov/ds_xor sequence. 3136; GFX8MORE32: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 31 3137; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]] 3138; GFX8MORE: ds_xor_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]] 3139define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) { 3140; 3141; 3142; GFX7LESS-LABEL: xor_i32_varying: 3143; GFX7LESS: ; %bb.0: ; %entry 3144; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 3145; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 3146; GFX7LESS-NEXT:
s_mov_b32 m0, -1 3147; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3148; GFX7LESS-NEXT: ds_xor_rtn_b32 v0, v1, v0 3149; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3150; GFX7LESS-NEXT: buffer_wbinvl1 3151; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 3152; GFX7LESS-NEXT: s_mov_b32 s2, -1 3153; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 3154; GFX7LESS-NEXT: s_endpgm 3155; 3156; GFX8-LABEL: xor_i32_varying: 3157; GFX8: ; %bb.0: ; %entry 3158; GFX8-NEXT: v_mov_b32_e32 v2, v0 3159; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 3160; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3161; GFX8-NEXT: v_mov_b32_e32 v1, 0 3162; GFX8-NEXT: s_mov_b64 exec, s[2:3] 3163; GFX8-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 3164; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 3165; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 3166; GFX8-NEXT: s_not_b64 exec, exec 3167; GFX8-NEXT: v_mov_b32_e32 v2, 0 3168; GFX8-NEXT: s_not_b64 exec, exec 3169; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 3170; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 3171; GFX8-NEXT: s_nop 1 3172; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 3173; GFX8-NEXT: s_nop 1 3174; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 3175; GFX8-NEXT: s_nop 1 3176; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 3177; GFX8-NEXT: s_nop 1 3178; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 3179; GFX8-NEXT: s_nop 1 3180; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 3181; GFX8-NEXT: v_readlane_b32 s2, v2, 63 3182; GFX8-NEXT: s_nop 0 3183; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 3184; GFX8-NEXT: s_mov_b64 exec, s[4:5] 3185; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3186; GFX8-NEXT: ; implicit-def: $vgpr0 3187; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 3188; GFX8-NEXT: s_cbranch_execz BB16_2 3189; GFX8-NEXT: ; %bb.1: 
3190; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 3191; GFX8-NEXT: v_mov_b32_e32 v3, s2 3192; GFX8-NEXT: s_mov_b32 m0, -1 3193; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3194; GFX8-NEXT: ds_xor_rtn_b32 v0, v0, v3 3195; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3196; GFX8-NEXT: buffer_wbinvl1_vol 3197; GFX8-NEXT: BB16_2: 3198; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 3199; GFX8-NEXT: v_readfirstlane_b32 s2, v0 3200; GFX8-NEXT: v_mov_b32_e32 v0, v1 3201; GFX8-NEXT: v_xor_b32_e32 v0, s2, v0 3202; GFX8-NEXT: s_mov_b32 s3, 0xf000 3203; GFX8-NEXT: s_mov_b32 s2, -1 3204; GFX8-NEXT: s_nop 0 3205; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3206; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 3207; GFX8-NEXT: s_endpgm 3208; 3209; GFX9-LABEL: xor_i32_varying: 3210; GFX9: ; %bb.0: ; %entry 3211; GFX9-NEXT: v_mov_b32_e32 v2, v0 3212; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 3213; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3214; GFX9-NEXT: v_mov_b32_e32 v1, 0 3215; GFX9-NEXT: s_mov_b64 exec, s[2:3] 3216; GFX9-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 3217; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 3218; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 3219; GFX9-NEXT: s_not_b64 exec, exec 3220; GFX9-NEXT: v_mov_b32_e32 v2, 0 3221; GFX9-NEXT: s_not_b64 exec, exec 3222; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 3223; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 3224; GFX9-NEXT: s_nop 1 3225; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 3226; GFX9-NEXT: s_nop 1 3227; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 3228; GFX9-NEXT: s_nop 1 3229; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 3230; GFX9-NEXT: s_nop 1 3231; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 3232; GFX9-NEXT: s_nop 1 3233; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 3234; GFX9-NEXT: v_readlane_b32 s2, v2, 63 
3235; GFX9-NEXT: s_nop 0 3236; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 3237; GFX9-NEXT: s_mov_b64 exec, s[4:5] 3238; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3239; GFX9-NEXT: ; implicit-def: $vgpr0 3240; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 3241; GFX9-NEXT: s_cbranch_execz BB16_2 3242; GFX9-NEXT: ; %bb.1: 3243; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 3244; GFX9-NEXT: v_mov_b32_e32 v3, s2 3245; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3246; GFX9-NEXT: ds_xor_rtn_b32 v0, v0, v3 3247; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3248; GFX9-NEXT: buffer_wbinvl1_vol 3249; GFX9-NEXT: BB16_2: 3250; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 3251; GFX9-NEXT: v_readfirstlane_b32 s2, v0 3252; GFX9-NEXT: v_mov_b32_e32 v0, v1 3253; GFX9-NEXT: v_xor_b32_e32 v0, s2, v0 3254; GFX9-NEXT: s_mov_b32 s3, 0xf000 3255; GFX9-NEXT: s_mov_b32 s2, -1 3256; GFX9-NEXT: s_nop 0 3257; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3258; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 3259; GFX9-NEXT: s_endpgm 3260; 3261; GFX1064-LABEL: xor_i32_varying: 3262; GFX1064: ; %bb.0: ; %entry 3263; GFX1064-NEXT: v_mov_b32_e32 v2, v0 3264; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 3265; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3266; GFX1064-NEXT: v_mov_b32_e32 v1, 0 3267; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 3268; GFX1064-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 3269; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 3270; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s3, v0 3271; GFX1064-NEXT: s_not_b64 exec, exec 3272; GFX1064-NEXT: v_mov_b32_e32 v2, 0 3273; GFX1064-NEXT: s_not_b64 exec, exec 3274; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 3275; GFX1064-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 3276; GFX1064-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 3277; GFX1064-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 3278; GFX1064-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:8 
row_mask:0xf bank_mask:0xf bound_ctrl:0 3279; GFX1064-NEXT: v_mov_b32_e32 v3, v2 3280; GFX1064-NEXT: v_permlanex16_b32 v3, v3, -1, -1 3281; GFX1064-NEXT: v_xor_b32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 3282; GFX1064-NEXT: v_readlane_b32 s2, v2, 31 3283; GFX1064-NEXT: v_mov_b32_e32 v3, s2 3284; GFX1064-NEXT: v_xor_b32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 3285; GFX1064-NEXT: v_readlane_b32 s2, v2, 15 3286; GFX1064-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3287; GFX1064-NEXT: v_readlane_b32 s3, v2, 31 3288; GFX1064-NEXT: v_readlane_b32 s6, v2, 47 3289; GFX1064-NEXT: v_writelane_b32 v1, s2, 16 3290; GFX1064-NEXT: s_mov_b32 s2, -1 3291; GFX1064-NEXT: v_writelane_b32 v1, s3, 32 3292; GFX1064-NEXT: v_readlane_b32 s3, v2, 63 3293; GFX1064-NEXT: v_writelane_b32 v1, s6, 48 3294; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 3295; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3296; GFX1064-NEXT: ; implicit-def: $vgpr0 3297; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 3298; GFX1064-NEXT: s_cbranch_execz BB16_2 3299; GFX1064-NEXT: ; %bb.1: 3300; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 3301; GFX1064-NEXT: v_mov_b32_e32 v7, s3 3302; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3303; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 3304; GFX1064-NEXT: ds_xor_rtn_b32 v0, v0, v7 3305; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3306; GFX1064-NEXT: buffer_gl0_inv 3307; GFX1064-NEXT: buffer_gl1_inv 3308; GFX1064-NEXT: BB16_2: 3309; GFX1064-NEXT: v_nop 3310; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 3311; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 3312; GFX1064-NEXT: v_mov_b32_e32 v0, v1 3313; GFX1064-NEXT: v_xor_b32_e32 v0, s3, v0 3314; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 3315; GFX1064-NEXT: s_nop 1 3316; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3317; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 3318; GFX1064-NEXT: s_endpgm 3319; 3320; GFX1032-LABEL: xor_i32_varying: 3321; GFX1032: ; %bb.0: ; %entry 3322; GFX1032-NEXT: ; 
implicit-def: $vcc_hi 3323; GFX1032-NEXT: v_mov_b32_e32 v2, v0 3324; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3325; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3326; GFX1032-NEXT: v_mov_b32_e32 v1, 0 3327; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3328; GFX1032-NEXT: v_cmp_ne_u32_e64 s2, 1, 0 3329; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 3330; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 3331; GFX1032-NEXT: v_mov_b32_e32 v2, 0 3332; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 3333; GFX1032-NEXT: s_or_saveexec_b32 s4, -1 3334; GFX1032-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 3335; GFX1032-NEXT: s_mov_b32 s2, -1 3336; GFX1032-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 3337; GFX1032-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 3338; GFX1032-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 3339; GFX1032-NEXT: v_mov_b32_e32 v3, v2 3340; GFX1032-NEXT: v_permlanex16_b32 v3, v3, -1, -1 3341; GFX1032-NEXT: v_xor_b32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 3342; GFX1032-NEXT: v_readlane_b32 s3, v2, 31 3343; GFX1032-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3344; GFX1032-NEXT: v_readlane_b32 s5, v2, 15 3345; GFX1032-NEXT: v_writelane_b32 v1, s5, 16 3346; GFX1032-NEXT: s_mov_b32 exec_lo, s4 3347; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 3348; GFX1032-NEXT: ; implicit-def: $vgpr0 3349; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo 3350; GFX1032-NEXT: s_cbranch_execz BB16_2 3351; GFX1032-NEXT: ; %bb.1: 3352; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 3353; GFX1032-NEXT: v_mov_b32_e32 v7, s3 3354; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3355; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 3356; GFX1032-NEXT: ds_xor_rtn_b32 v0, v0, v7 3357; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3358; GFX1032-NEXT: buffer_gl0_inv 3359; GFX1032-NEXT: buffer_gl1_inv 3360; GFX1032-NEXT: BB16_2: 3361; 
GFX1032-NEXT: v_nop 3362; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 3363; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 3364; GFX1032-NEXT: v_mov_b32_e32 v0, v1 3365; GFX1032-NEXT: v_xor_b32_e32 v0, s3, v0 3366; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 3367; GFX1032-NEXT: s_nop 1 3368; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3369; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 3370; GFX1032-NEXT: s_endpgm 3371entry: 3372 %lane = call i32 @llvm.amdgcn.workitem.id.x() 3373 %old = atomicrmw xor i32 addrspace(3)* @local_var32, i32 %lane acq_rel 3374 store i32 %old, i32 addrspace(1)* %out 3375 ret void 3376} 3377 ; NOTE(review): no RUN line passes GFX8MORE32 or GFX8MORE to FileCheck (only GFX7LESS, GFX8, GFX9, GFX1064 and GFX1032 are enabled), so the three checks below are never evaluated; the generated GFX1032 checks for max_i32_varying already pin the same readlane/mov/ds_max sequence. 3378; GFX8MORE32: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 31 3379; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]] 3380; GFX8MORE: ds_max_rtn_i32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]] 3381define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) { 3382; 3383; 3384; GFX7LESS-LABEL: max_i32_varying: 3385; GFX7LESS: ; %bb.0: ; %entry 3386; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 3387; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 3388; GFX7LESS-NEXT: s_mov_b32 m0, -1 3389; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3390; GFX7LESS-NEXT: ds_max_rtn_i32 v0, v1, v0 3391; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3392; GFX7LESS-NEXT: buffer_wbinvl1 3393; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 3394; GFX7LESS-NEXT: s_mov_b32 s2, -1 3395; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 3396; GFX7LESS-NEXT: s_endpgm 3397; 3398; GFX8-LABEL: max_i32_varying: 3399; GFX8: ; %bb.0: ; %entry 3400; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3401; GFX8-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 3402; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, s2, 0 3403; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, s3, v3 3404; GFX8-NEXT: v_mov_b32_e32 v2, v0 3405; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 3406; GFX8-NEXT: v_bfrev_b32_e32 v1, 1 3407; GFX8-NEXT: s_mov_b64 exec, s[2:3] 3408; GFX8-NEXT: s_not_b64 exec, exec 3409; GFX8-NEXT: v_mov_b32_e32 v2,
v1 3410; GFX8-NEXT: s_not_b64 exec, exec 3411; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 3412; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3413; GFX8-NEXT: s_nop 1 3414; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 3415; GFX8-NEXT: s_nop 1 3416; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 3417; GFX8-NEXT: s_nop 1 3418; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 3419; GFX8-NEXT: s_nop 1 3420; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 3421; GFX8-NEXT: s_nop 1 3422; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 3423; GFX8-NEXT: v_readlane_b32 s2, v2, 63 3424; GFX8-NEXT: s_nop 0 3425; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 3426; GFX8-NEXT: s_mov_b64 exec, s[4:5] 3427; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 3428; GFX8-NEXT: ; implicit-def: $vgpr0 3429; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 3430; GFX8-NEXT: s_cbranch_execz BB17_2 3431; GFX8-NEXT: ; %bb.1: 3432; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 3433; GFX8-NEXT: v_mov_b32_e32 v3, s2 3434; GFX8-NEXT: s_mov_b32 m0, -1 3435; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3436; GFX8-NEXT: ds_max_rtn_i32 v0, v0, v3 3437; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3438; GFX8-NEXT: buffer_wbinvl1_vol 3439; GFX8-NEXT: BB17_2: 3440; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 3441; GFX8-NEXT: v_readfirstlane_b32 s2, v0 3442; GFX8-NEXT: v_mov_b32_e32 v0, v1 3443; GFX8-NEXT: v_max_i32_e32 v0, s2, v0 3444; GFX8-NEXT: s_mov_b32 s3, 0xf000 3445; GFX8-NEXT: s_mov_b32 s2, -1 3446; GFX8-NEXT: s_nop 0 3447; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3448; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 3449; GFX8-NEXT: s_endpgm 3450; 3451; GFX9-LABEL: max_i32_varying: 3452; GFX9: ; %bb.0: ; %entry 3453; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3454; GFX9-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 3455; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, 
s2, 0 3456; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, s3, v3 3457; GFX9-NEXT: v_mov_b32_e32 v2, v0 3458; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 3459; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 3460; GFX9-NEXT: s_mov_b64 exec, s[2:3] 3461; GFX9-NEXT: s_not_b64 exec, exec 3462; GFX9-NEXT: v_mov_b32_e32 v2, v1 3463; GFX9-NEXT: s_not_b64 exec, exec 3464; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 3465; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3466; GFX9-NEXT: s_nop 1 3467; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 3468; GFX9-NEXT: s_nop 1 3469; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 3470; GFX9-NEXT: s_nop 1 3471; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 3472; GFX9-NEXT: s_nop 1 3473; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 3474; GFX9-NEXT: s_nop 1 3475; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 3476; GFX9-NEXT: v_readlane_b32 s2, v2, 63 3477; GFX9-NEXT: s_nop 0 3478; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 3479; GFX9-NEXT: s_mov_b64 exec, s[4:5] 3480; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 3481; GFX9-NEXT: ; implicit-def: $vgpr0 3482; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 3483; GFX9-NEXT: s_cbranch_execz BB17_2 3484; GFX9-NEXT: ; %bb.1: 3485; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 3486; GFX9-NEXT: v_mov_b32_e32 v3, s2 3487; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3488; GFX9-NEXT: ds_max_rtn_i32 v0, v0, v3 3489; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3490; GFX9-NEXT: buffer_wbinvl1_vol 3491; GFX9-NEXT: BB17_2: 3492; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 3493; GFX9-NEXT: v_readfirstlane_b32 s2, v0 3494; GFX9-NEXT: v_mov_b32_e32 v0, v1 3495; GFX9-NEXT: v_max_i32_e32 v0, s2, v0 3496; GFX9-NEXT: s_mov_b32 s3, 0xf000 3497; GFX9-NEXT: s_mov_b32 s2, -1 3498; GFX9-NEXT: s_nop 0 3499; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3500; GFX9-NEXT: buffer_store_dword v0, 
off, s[0:3], 0 3501; GFX9-NEXT: s_endpgm 3502; 3503; GFX1064-LABEL: max_i32_varying: 3504; GFX1064: ; %bb.0: ; %entry 3505; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3506; GFX1064-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 3507; GFX1064-NEXT: v_mov_b32_e32 v2, v0 3508; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v4, s2, 0 3509; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v4, s3, v4 3510; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 3511; GFX1064-NEXT: v_bfrev_b32_e32 v1, 1 3512; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 3513; GFX1064-NEXT: s_not_b64 exec, exec 3514; GFX1064-NEXT: v_mov_b32_e32 v2, v1 3515; GFX1064-NEXT: s_not_b64 exec, exec 3516; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 3517; GFX1064-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3518; GFX1064-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 3519; GFX1064-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 3520; GFX1064-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 3521; GFX1064-NEXT: v_mov_b32_e32 v3, v2 3522; GFX1064-NEXT: v_permlanex16_b32 v3, v3, -1, -1 3523; GFX1064-NEXT: v_max_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 3524; GFX1064-NEXT: v_readlane_b32 s2, v2, 31 3525; GFX1064-NEXT: v_mov_b32_e32 v3, s2 3526; GFX1064-NEXT: v_max_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 3527; GFX1064-NEXT: v_readlane_b32 s2, v2, 15 3528; GFX1064-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3529; GFX1064-NEXT: v_readlane_b32 s3, v2, 31 3530; GFX1064-NEXT: v_readlane_b32 s6, v2, 47 3531; GFX1064-NEXT: v_writelane_b32 v1, s2, 16 3532; GFX1064-NEXT: s_mov_b32 s2, -1 3533; GFX1064-NEXT: v_writelane_b32 v1, s3, 32 3534; GFX1064-NEXT: v_readlane_b32 s3, v2, 63 3535; GFX1064-NEXT: v_writelane_b32 v1, s6, 48 3536; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 3537; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 3538; GFX1064-NEXT: ; implicit-def: $vgpr0 3539; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 
3540; GFX1064-NEXT: s_cbranch_execz BB17_2 3541; GFX1064-NEXT: ; %bb.1: 3542; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 3543; GFX1064-NEXT: v_mov_b32_e32 v7, s3 3544; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3545; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 3546; GFX1064-NEXT: ds_max_rtn_i32 v0, v0, v7 3547; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3548; GFX1064-NEXT: buffer_gl0_inv 3549; GFX1064-NEXT: buffer_gl1_inv 3550; GFX1064-NEXT: BB17_2: 3551; GFX1064-NEXT: v_nop 3552; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 3553; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 3554; GFX1064-NEXT: v_mov_b32_e32 v0, v1 3555; GFX1064-NEXT: v_max_i32_e32 v0, s3, v0 3556; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 3557; GFX1064-NEXT: s_nop 1 3558; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3559; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 3560; GFX1064-NEXT: s_endpgm 3561; 3562; GFX1032-LABEL: max_i32_varying: 3563; GFX1032: ; %bb.0: ; %entry 3564; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3565; GFX1032-NEXT: v_cmp_ne_u32_e64 s2, 1, 0 3566; GFX1032-NEXT: ; implicit-def: $vcc_hi 3567; GFX1032-NEXT: v_mov_b32_e32 v2, v0 3568; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v4, s2, 0 3569; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3570; GFX1032-NEXT: v_bfrev_b32_e32 v1, 1 3571; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3572; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 3573; GFX1032-NEXT: v_mov_b32_e32 v2, v1 3574; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 3575; GFX1032-NEXT: s_or_saveexec_b32 s4, -1 3576; GFX1032-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3577; GFX1032-NEXT: s_mov_b32 s2, -1 3578; GFX1032-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 3579; GFX1032-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 3580; GFX1032-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 3581; GFX1032-NEXT: v_mov_b32_e32 v3, v2 3582; GFX1032-NEXT: v_permlanex16_b32 v3, v3, -1, -1 3583; GFX1032-NEXT: v_max_i32_dpp v2, v3, v2 
quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 3584; GFX1032-NEXT: v_readlane_b32 s3, v2, 31 3585; GFX1032-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3586; GFX1032-NEXT: v_readlane_b32 s5, v2, 15 3587; GFX1032-NEXT: v_writelane_b32 v1, s5, 16 3588; GFX1032-NEXT: s_mov_b32 exec_lo, s4 3589; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 3590; GFX1032-NEXT: ; implicit-def: $vgpr0 3591; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo 3592; GFX1032-NEXT: s_cbranch_execz BB17_2 3593; GFX1032-NEXT: ; %bb.1: 3594; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 3595; GFX1032-NEXT: v_mov_b32_e32 v7, s3 3596; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3597; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 3598; GFX1032-NEXT: ds_max_rtn_i32 v0, v0, v7 3599; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3600; GFX1032-NEXT: buffer_gl0_inv 3601; GFX1032-NEXT: buffer_gl1_inv 3602; GFX1032-NEXT: BB17_2: 3603; GFX1032-NEXT: v_nop 3604; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 3605; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 3606; GFX1032-NEXT: v_mov_b32_e32 v0, v1 3607; GFX1032-NEXT: v_max_i32_e32 v0, s3, v0 3608; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 3609; GFX1032-NEXT: s_nop 1 3610; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3611; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 3612; GFX1032-NEXT: s_endpgm 3613entry: 3614 %lane = call i32 @llvm.amdgcn.workitem.id.x() 3615 %old = atomicrmw max i32 addrspace(3)* @local_var32, i32 %lane acq_rel 3616 store i32 %old, i32 addrspace(1)* %out 3617 ret void 3618} 3619 3620define amdgpu_kernel void @max_i64_constant(i64 addrspace(1)* %out) { 3621; 3622; 3623; GFX7LESS-LABEL: max_i64_constant: 3624; GFX7LESS: ; %bb.0: ; %entry 3625; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 3626; GFX7LESS-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 3627; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 3628; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 3629; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3630; GFX7LESS-NEXT: ; implicit-def: 
$vgpr0_vgpr1 3631; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc 3632; GFX7LESS-NEXT: s_cbranch_execz BB18_2 3633; GFX7LESS-NEXT: ; %bb.1: 3634; GFX7LESS-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 3635; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5 3636; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 3637; GFX7LESS-NEXT: s_mov_b32 m0, -1 3638; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3639; GFX7LESS-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] 3640; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3641; GFX7LESS-NEXT: buffer_wbinvl1 3642; GFX7LESS-NEXT: BB18_2: 3643; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] 3644; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 3645; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 3646; GFX7LESS-NEXT: v_bfrev_b32_e32 v1, 1 3647; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 3648; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 3649; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc 3650; GFX7LESS-NEXT: v_mov_b32_e32 v2, s5 3651; GFX7LESS-NEXT: v_mov_b32_e32 v3, s4 3652; GFX7LESS-NEXT: v_cmp_gt_i64_e32 vcc, s[4:5], v[0:1] 3653; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 3654; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 3655; GFX7LESS-NEXT: s_mov_b32 s2, -1 3656; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3657; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 3658; GFX7LESS-NEXT: s_endpgm 3659; 3660; GFX8-LABEL: max_i64_constant: 3661; GFX8: ; %bb.0: ; %entry 3662; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3663; GFX8-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 3664; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 3665; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 3666; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3667; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 3668; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 3669; GFX8-NEXT: s_cbranch_execz BB18_2 3670; GFX8-NEXT: ; %bb.1: 3671; GFX8-NEXT: v_mov_b32_e32 v0, 5 3672; GFX8-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 3673; GFX8-NEXT: v_mov_b32_e32 v1, 0 3674; GFX8-NEXT: s_mov_b32 m0, -1 3675; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3676; 
GFX8-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] 3677; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3678; GFX8-NEXT: buffer_wbinvl1_vol 3679; GFX8-NEXT: BB18_2: 3680; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 3681; GFX8-NEXT: v_readfirstlane_b32 s2, v0 3682; GFX8-NEXT: v_bfrev_b32_e32 v0, 1 3683; GFX8-NEXT: v_readfirstlane_b32 s3, v1 3684; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc 3685; GFX8-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 3686; GFX8-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1] 3687; GFX8-NEXT: v_mov_b32_e32 v2, s3 3688; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 3689; GFX8-NEXT: v_mov_b32_e32 v2, s2 3690; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 3691; GFX8-NEXT: s_mov_b32 s3, 0xf000 3692; GFX8-NEXT: s_mov_b32 s2, -1 3693; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3694; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 3695; GFX8-NEXT: s_endpgm 3696; 3697; GFX9-LABEL: max_i64_constant: 3698; GFX9: ; %bb.0: ; %entry 3699; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3700; GFX9-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 3701; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 3702; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 3703; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3704; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 3705; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 3706; GFX9-NEXT: s_cbranch_execz BB18_2 3707; GFX9-NEXT: ; %bb.1: 3708; GFX9-NEXT: v_mov_b32_e32 v0, 5 3709; GFX9-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 3710; GFX9-NEXT: v_mov_b32_e32 v1, 0 3711; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3712; GFX9-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] 3713; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3714; GFX9-NEXT: buffer_wbinvl1_vol 3715; GFX9-NEXT: BB18_2: 3716; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 3717; GFX9-NEXT: v_readfirstlane_b32 s2, v0 3718; GFX9-NEXT: v_bfrev_b32_e32 v0, 1 3719; GFX9-NEXT: v_readfirstlane_b32 s3, v1 3720; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc 3721; GFX9-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 3722; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1] 3723; 
GFX9-NEXT: v_mov_b32_e32 v2, s3 3724; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 3725; GFX9-NEXT: v_mov_b32_e32 v2, s2 3726; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 3727; GFX9-NEXT: s_mov_b32 s3, 0xf000 3728; GFX9-NEXT: s_mov_b32 s2, -1 3729; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3730; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 3731; GFX9-NEXT: s_endpgm 3732; 3733; GFX1064-LABEL: max_i64_constant: 3734; GFX1064: ; %bb.0: ; %entry 3735; GFX1064-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 3736; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3737; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 3738; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s3, v0 3739; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3740; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 3741; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc 3742; GFX1064-NEXT: s_cbranch_execz BB18_2 3743; GFX1064-NEXT: ; %bb.1: 3744; GFX1064-NEXT: v_mov_b32_e32 v0, 5 3745; GFX1064-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 3746; GFX1064-NEXT: v_mov_b32_e32 v1, 0 3747; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3748; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 3749; GFX1064-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] 3750; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3751; GFX1064-NEXT: buffer_gl0_inv 3752; GFX1064-NEXT: buffer_gl1_inv 3753; GFX1064-NEXT: BB18_2: 3754; GFX1064-NEXT: v_nop 3755; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] 3756; GFX1064-NEXT: v_readfirstlane_b32 s4, v0 3757; GFX1064-NEXT: v_readfirstlane_b32 s5, v1 3758; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc 3759; GFX1064-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 3760; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 3761; GFX1064-NEXT: s_mov_b32 s2, -1 3762; GFX1064-NEXT: v_cmp_gt_i64_e32 vcc, s[4:5], v[0:1] 3763; GFX1064-NEXT: v_cndmask_b32_e64 v1, v1, s5, vcc 3764; GFX1064-NEXT: v_cndmask_b32_e64 v0, v0, s4, vcc 3765; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3766; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 3767; GFX1064-NEXT: s_endpgm 3768; 3769; 
GFX1032-LABEL: max_i64_constant: 3770; GFX1032: ; %bb.0: ; %entry 3771; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3772; GFX1032-NEXT: v_cmp_ne_u32_e64 s2, 1, 0 3773; GFX1032-NEXT: ; implicit-def: $vcc_hi 3774; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 3775; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 3776; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 3777; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo 3778; GFX1032-NEXT: s_cbranch_execz BB18_2 3779; GFX1032-NEXT: ; %bb.1: 3780; GFX1032-NEXT: v_mov_b32_e32 v0, 5 3781; GFX1032-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 3782; GFX1032-NEXT: v_mov_b32_e32 v1, 0 3783; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3784; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 3785; GFX1032-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] 3786; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3787; GFX1032-NEXT: buffer_gl0_inv 3788; GFX1032-NEXT: buffer_gl1_inv 3789; GFX1032-NEXT: BB18_2: 3790; GFX1032-NEXT: v_nop 3791; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 3792; GFX1032-NEXT: v_readfirstlane_b32 s4, v0 3793; GFX1032-NEXT: v_readfirstlane_b32 s5, v1 3794; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc_lo 3795; GFX1032-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc_lo 3796; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 3797; GFX1032-NEXT: s_mov_b32 s2, -1 3798; GFX1032-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[4:5], v[0:1] 3799; GFX1032-NEXT: v_cndmask_b32_e64 v1, v1, s5, vcc_lo 3800; GFX1032-NEXT: v_cndmask_b32_e64 v0, v0, s4, vcc_lo 3801; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3802; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 3803; GFX1032-NEXT: s_endpgm 3804entry: 3805 %old = atomicrmw max i64 addrspace(3)* @local_var64, i64 5 acq_rel 3806 store i64 %old, i64 addrspace(1)* %out 3807 ret void 3808} 3809 ; NOTE(review): no RUN line passes GFX8MORE32 or GFX8MORE to FileCheck (only GFX7LESS, GFX8, GFX9, GFX1064 and GFX1032 are enabled), so the three checks below are never evaluated. 3810; GFX8MORE32: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 31 3811; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]] 3812; GFX8MORE: ds_min_rtn_i32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]] 3813define
amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) { 3814; 3815; 3816; GFX7LESS-LABEL: min_i32_varying: 3817; GFX7LESS: ; %bb.0: ; %entry 3818; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 3819; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 3820; GFX7LESS-NEXT: s_mov_b32 m0, -1 3821; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3822; GFX7LESS-NEXT: ds_min_rtn_i32 v0, v1, v0 3823; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3824; GFX7LESS-NEXT: buffer_wbinvl1 3825; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 3826; GFX7LESS-NEXT: s_mov_b32 s2, -1 3827; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 3828; GFX7LESS-NEXT: s_endpgm 3829; 3830; GFX8-LABEL: min_i32_varying: 3831; GFX8: ; %bb.0: ; %entry 3832; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3833; GFX8-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 3834; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, s2, 0 3835; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, s3, v3 3836; GFX8-NEXT: v_mov_b32_e32 v2, v0 3837; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 3838; GFX8-NEXT: v_bfrev_b32_e32 v1, -2 3839; GFX8-NEXT: s_mov_b64 exec, s[2:3] 3840; GFX8-NEXT: s_not_b64 exec, exec 3841; GFX8-NEXT: v_mov_b32_e32 v2, v1 3842; GFX8-NEXT: s_not_b64 exec, exec 3843; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 3844; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3845; GFX8-NEXT: s_nop 1 3846; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 3847; GFX8-NEXT: s_nop 1 3848; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 3849; GFX8-NEXT: s_nop 1 3850; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 3851; GFX8-NEXT: s_nop 1 3852; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 3853; GFX8-NEXT: s_nop 1 3854; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 3855; GFX8-NEXT: v_readlane_b32 s2, v2, 63 3856; GFX8-NEXT: s_nop 0 3857; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 3858; 
GFX8-NEXT: s_mov_b64 exec, s[4:5] 3859; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 3860; GFX8-NEXT: ; implicit-def: $vgpr0 3861; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 3862; GFX8-NEXT: s_cbranch_execz BB19_2 3863; GFX8-NEXT: ; %bb.1: 3864; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 3865; GFX8-NEXT: v_mov_b32_e32 v3, s2 3866; GFX8-NEXT: s_mov_b32 m0, -1 3867; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3868; GFX8-NEXT: ds_min_rtn_i32 v0, v0, v3 3869; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3870; GFX8-NEXT: buffer_wbinvl1_vol 3871; GFX8-NEXT: BB19_2: 3872; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 3873; GFX8-NEXT: v_readfirstlane_b32 s2, v0 3874; GFX8-NEXT: v_mov_b32_e32 v0, v1 3875; GFX8-NEXT: v_min_i32_e32 v0, s2, v0 3876; GFX8-NEXT: s_mov_b32 s3, 0xf000 3877; GFX8-NEXT: s_mov_b32 s2, -1 3878; GFX8-NEXT: s_nop 0 3879; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3880; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 3881; GFX8-NEXT: s_endpgm 3882; 3883; GFX9-LABEL: min_i32_varying: 3884; GFX9: ; %bb.0: ; %entry 3885; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3886; GFX9-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 3887; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, s2, 0 3888; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, s3, v3 3889; GFX9-NEXT: v_mov_b32_e32 v2, v0 3890; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 3891; GFX9-NEXT: v_bfrev_b32_e32 v1, -2 3892; GFX9-NEXT: s_mov_b64 exec, s[2:3] 3893; GFX9-NEXT: s_not_b64 exec, exec 3894; GFX9-NEXT: v_mov_b32_e32 v2, v1 3895; GFX9-NEXT: s_not_b64 exec, exec 3896; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 3897; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3898; GFX9-NEXT: s_nop 1 3899; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 3900; GFX9-NEXT: s_nop 1 3901; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 3902; GFX9-NEXT: s_nop 1 3903; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 3904; GFX9-NEXT: s_nop 1 3905; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 
row_bcast:15 row_mask:0xa bank_mask:0xf 3906; GFX9-NEXT: s_nop 1 3907; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 3908; GFX9-NEXT: v_readlane_b32 s2, v2, 63 3909; GFX9-NEXT: s_nop 0 3910; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 3911; GFX9-NEXT: s_mov_b64 exec, s[4:5] 3912; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 3913; GFX9-NEXT: ; implicit-def: $vgpr0 3914; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 3915; GFX9-NEXT: s_cbranch_execz BB19_2 3916; GFX9-NEXT: ; %bb.1: 3917; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 3918; GFX9-NEXT: v_mov_b32_e32 v3, s2 3919; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3920; GFX9-NEXT: ds_min_rtn_i32 v0, v0, v3 3921; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3922; GFX9-NEXT: buffer_wbinvl1_vol 3923; GFX9-NEXT: BB19_2: 3924; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 3925; GFX9-NEXT: v_readfirstlane_b32 s2, v0 3926; GFX9-NEXT: v_mov_b32_e32 v0, v1 3927; GFX9-NEXT: v_min_i32_e32 v0, s2, v0 3928; GFX9-NEXT: s_mov_b32 s3, 0xf000 3929; GFX9-NEXT: s_mov_b32 s2, -1 3930; GFX9-NEXT: s_nop 0 3931; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3932; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 3933; GFX9-NEXT: s_endpgm 3934; 3935; GFX1064-LABEL: min_i32_varying: 3936; GFX1064: ; %bb.0: ; %entry 3937; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3938; GFX1064-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 3939; GFX1064-NEXT: v_mov_b32_e32 v2, v0 3940; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v4, s2, 0 3941; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v4, s3, v4 3942; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 3943; GFX1064-NEXT: v_bfrev_b32_e32 v1, -2 3944; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 3945; GFX1064-NEXT: s_not_b64 exec, exec 3946; GFX1064-NEXT: v_mov_b32_e32 v2, v1 3947; GFX1064-NEXT: s_not_b64 exec, exec 3948; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 3949; GFX1064-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3950; GFX1064-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf 
bank_mask:0xf 3951; GFX1064-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 3952; GFX1064-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 3953; GFX1064-NEXT: v_mov_b32_e32 v3, v2 3954; GFX1064-NEXT: v_permlanex16_b32 v3, v3, -1, -1 3955; GFX1064-NEXT: v_min_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 3956; GFX1064-NEXT: v_readlane_b32 s2, v2, 31 3957; GFX1064-NEXT: v_mov_b32_e32 v3, s2 3958; GFX1064-NEXT: v_min_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 3959; GFX1064-NEXT: v_readlane_b32 s2, v2, 15 3960; GFX1064-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3961; GFX1064-NEXT: v_readlane_b32 s3, v2, 31 3962; GFX1064-NEXT: v_readlane_b32 s6, v2, 47 3963; GFX1064-NEXT: v_writelane_b32 v1, s2, 16 3964; GFX1064-NEXT: s_mov_b32 s2, -1 3965; GFX1064-NEXT: v_writelane_b32 v1, s3, 32 3966; GFX1064-NEXT: v_readlane_b32 s3, v2, 63 3967; GFX1064-NEXT: v_writelane_b32 v1, s6, 48 3968; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 3969; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 3970; GFX1064-NEXT: ; implicit-def: $vgpr0 3971; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 3972; GFX1064-NEXT: s_cbranch_execz BB19_2 3973; GFX1064-NEXT: ; %bb.1: 3974; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 3975; GFX1064-NEXT: v_mov_b32_e32 v7, s3 3976; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3977; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 3978; GFX1064-NEXT: ds_min_rtn_i32 v0, v0, v7 3979; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3980; GFX1064-NEXT: buffer_gl0_inv 3981; GFX1064-NEXT: buffer_gl1_inv 3982; GFX1064-NEXT: BB19_2: 3983; GFX1064-NEXT: v_nop 3984; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 3985; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 3986; GFX1064-NEXT: v_mov_b32_e32 v0, v1 3987; GFX1064-NEXT: v_min_i32_e32 v0, s3, v0 3988; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 3989; GFX1064-NEXT: s_nop 1 3990; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3991; GFX1064-NEXT: buffer_store_dword v0, off, 
s[0:3], 0 3992; GFX1064-NEXT: s_endpgm 3993; 3994; GFX1032-LABEL: min_i32_varying: 3995; GFX1032: ; %bb.0: ; %entry 3996; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3997; GFX1032-NEXT: v_cmp_ne_u32_e64 s2, 1, 0 3998; GFX1032-NEXT: ; implicit-def: $vcc_hi 3999; GFX1032-NEXT: v_mov_b32_e32 v2, v0 4000; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v4, s2, 0 4001; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 4002; GFX1032-NEXT: v_bfrev_b32_e32 v1, -2 4003; GFX1032-NEXT: s_mov_b32 exec_lo, s2 4004; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 4005; GFX1032-NEXT: v_mov_b32_e32 v2, v1 4006; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 4007; GFX1032-NEXT: s_or_saveexec_b32 s4, -1 4008; GFX1032-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 4009; GFX1032-NEXT: s_mov_b32 s2, -1 4010; GFX1032-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 4011; GFX1032-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 4012; GFX1032-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 4013; GFX1032-NEXT: v_mov_b32_e32 v3, v2 4014; GFX1032-NEXT: v_permlanex16_b32 v3, v3, -1, -1 4015; GFX1032-NEXT: v_min_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 4016; GFX1032-NEXT: v_readlane_b32 s3, v2, 31 4017; GFX1032-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf 4018; GFX1032-NEXT: v_readlane_b32 s5, v2, 15 4019; GFX1032-NEXT: v_writelane_b32 v1, s5, 16 4020; GFX1032-NEXT: s_mov_b32 exec_lo, s4 4021; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 4022; GFX1032-NEXT: ; implicit-def: $vgpr0 4023; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo 4024; GFX1032-NEXT: s_cbranch_execz BB19_2 4025; GFX1032-NEXT: ; %bb.1: 4026; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 4027; GFX1032-NEXT: v_mov_b32_e32 v7, s3 4028; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4029; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 4030; GFX1032-NEXT: ds_min_rtn_i32 v0, v0, v7 4031; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4032; 
GFX1032-NEXT: buffer_gl0_inv 4033; GFX1032-NEXT: buffer_gl1_inv 4034; GFX1032-NEXT: BB19_2: 4035; GFX1032-NEXT: v_nop 4036; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 4037; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 4038; GFX1032-NEXT: v_mov_b32_e32 v0, v1 4039; GFX1032-NEXT: v_min_i32_e32 v0, s3, v0 4040; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 4041; GFX1032-NEXT: s_nop 1 4042; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 4043; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 4044; GFX1032-NEXT: s_endpgm 4045entry: 4046 %lane = call i32 @llvm.amdgcn.workitem.id.x() 4047 %old = atomicrmw min i32 addrspace(3)* @local_var32, i32 %lane acq_rel 4048 store i32 %old, i32 addrspace(1)* %out 4049 ret void 4050} 4051 4052define amdgpu_kernel void @min_i64_constant(i64 addrspace(1)* %out) { 4053; Signed 64-bit atomicrmw min (acq_rel) on @local_var64 in addrspace(3) with the constant operand 5; the old value returned by the atomic is stored to %out. 4054; 4055; GFX7LESS-LABEL: min_i64_constant: 4056; GFX7LESS: ; %bb.0: ; %entry 4057; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 4058; GFX7LESS-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 4059; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 4060; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 4061; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4062; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 4063; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc 4064; GFX7LESS-NEXT: s_cbranch_execz BB20_2 4065; GFX7LESS-NEXT: ; %bb.1: 4066; GFX7LESS-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 4067; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5 4068; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 4069; GFX7LESS-NEXT: s_mov_b32 m0, -1 4070; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4071; GFX7LESS-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] 4072; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4073; GFX7LESS-NEXT: buffer_wbinvl1 4074; GFX7LESS-NEXT: BB20_2: 4075; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] 4076; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 4077; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 4078; GFX7LESS-NEXT: v_bfrev_b32_e32 v1, -2 4079; GFX7LESS-NEXT: s_mov_b32 s2, -1 4080; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 4081;
GFX7LESS-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc 4082; GFX7LESS-NEXT: v_mov_b32_e32 v2, s5 4083; GFX7LESS-NEXT: v_mov_b32_e32 v3, s4 4084; GFX7LESS-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] 4085; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 4086; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 4087; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 4088; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 4089; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4090; GFX7LESS-NEXT: s_endpgm 4091; 4092; GFX8-LABEL: min_i64_constant: 4093; GFX8: ; %bb.0: ; %entry 4094; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4095; GFX8-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 4096; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 4097; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 4098; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4099; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 4100; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 4101; GFX8-NEXT: s_cbranch_execz BB20_2 4102; GFX8-NEXT: ; %bb.1: 4103; GFX8-NEXT: v_mov_b32_e32 v0, 5 4104; GFX8-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 4105; GFX8-NEXT: v_mov_b32_e32 v1, 0 4106; GFX8-NEXT: s_mov_b32 m0, -1 4107; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4108; GFX8-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] 4109; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4110; GFX8-NEXT: buffer_wbinvl1_vol 4111; GFX8-NEXT: BB20_2: 4112; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 4113; GFX8-NEXT: v_readfirstlane_b32 s4, v0 4114; GFX8-NEXT: v_bfrev_b32_e32 v0, -2 4115; GFX8-NEXT: v_readfirstlane_b32 s5, v1 4116; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc 4117; GFX8-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 4118; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] 4119; GFX8-NEXT: v_mov_b32_e32 v2, s5 4120; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 4121; GFX8-NEXT: v_mov_b32_e32 v2, s4 4122; GFX8-NEXT: s_mov_b32 s2, -1 4123; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 4124; GFX8-NEXT: s_mov_b32 s3, 0xf000 4125; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4126; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 
4127; GFX8-NEXT: s_endpgm 4128; 4129; GFX9-LABEL: min_i64_constant: 4130; GFX9: ; %bb.0: ; %entry 4131; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4132; GFX9-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 4133; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 4134; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 4135; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4136; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 4137; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 4138; GFX9-NEXT: s_cbranch_execz BB20_2 4139; GFX9-NEXT: ; %bb.1: 4140; GFX9-NEXT: v_mov_b32_e32 v0, 5 4141; GFX9-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 4142; GFX9-NEXT: v_mov_b32_e32 v1, 0 4143; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4144; GFX9-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] 4145; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4146; GFX9-NEXT: buffer_wbinvl1_vol 4147; GFX9-NEXT: BB20_2: 4148; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 4149; GFX9-NEXT: v_readfirstlane_b32 s4, v0 4150; GFX9-NEXT: v_bfrev_b32_e32 v0, -2 4151; GFX9-NEXT: v_readfirstlane_b32 s5, v1 4152; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc 4153; GFX9-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 4154; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] 4155; GFX9-NEXT: v_mov_b32_e32 v2, s5 4156; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 4157; GFX9-NEXT: v_mov_b32_e32 v2, s4 4158; GFX9-NEXT: s_mov_b32 s2, -1 4159; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 4160; GFX9-NEXT: s_mov_b32 s3, 0xf000 4161; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4162; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4163; GFX9-NEXT: s_endpgm 4164; 4165; GFX1064-LABEL: min_i64_constant: 4166; GFX1064: ; %bb.0: ; %entry 4167; GFX1064-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 4168; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4169; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 4170; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s3, v0 4171; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4172; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 4173; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc 4174; GFX1064-NEXT: 
s_cbranch_execz BB20_2 4175; GFX1064-NEXT: ; %bb.1: 4176; GFX1064-NEXT: v_mov_b32_e32 v0, 5 4177; GFX1064-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 4178; GFX1064-NEXT: v_mov_b32_e32 v1, 0 4179; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4180; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 4181; GFX1064-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] 4182; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4183; GFX1064-NEXT: buffer_gl0_inv 4184; GFX1064-NEXT: buffer_gl1_inv 4185; GFX1064-NEXT: BB20_2: 4186; GFX1064-NEXT: v_nop 4187; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] 4188; GFX1064-NEXT: v_readfirstlane_b32 s4, v0 4189; GFX1064-NEXT: v_readfirstlane_b32 s5, v1 4190; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc 4191; GFX1064-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 4192; GFX1064-NEXT: s_mov_b32 s2, -1 4193; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 4194; GFX1064-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] 4195; GFX1064-NEXT: v_cndmask_b32_e64 v1, v1, s5, vcc 4196; GFX1064-NEXT: v_cndmask_b32_e64 v0, v0, s4, vcc 4197; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 4198; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4199; GFX1064-NEXT: s_endpgm 4200; 4201; GFX1032-LABEL: min_i64_constant: 4202; GFX1032: ; %bb.0: ; %entry 4203; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4204; GFX1032-NEXT: v_cmp_ne_u32_e64 s2, 1, 0 4205; GFX1032-NEXT: ; implicit-def: $vcc_hi 4206; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 4207; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 4208; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 4209; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo 4210; GFX1032-NEXT: s_cbranch_execz BB20_2 4211; GFX1032-NEXT: ; %bb.1: 4212; GFX1032-NEXT: v_mov_b32_e32 v0, 5 4213; GFX1032-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 4214; GFX1032-NEXT: v_mov_b32_e32 v1, 0 4215; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4216; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 4217; GFX1032-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] 4218; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4219; 
GFX1032-NEXT: buffer_gl0_inv 4220; GFX1032-NEXT: buffer_gl1_inv 4221; GFX1032-NEXT: BB20_2: 4222; GFX1032-NEXT: v_nop 4223; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 4224; GFX1032-NEXT: v_readfirstlane_b32 s4, v0 4225; GFX1032-NEXT: v_readfirstlane_b32 s5, v1 4226; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc_lo 4227; GFX1032-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc_lo 4228; GFX1032-NEXT: s_mov_b32 s2, -1 4229; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 4230; GFX1032-NEXT: v_cmp_lt_i64_e32 vcc_lo, s[4:5], v[0:1] 4231; GFX1032-NEXT: v_cndmask_b32_e64 v1, v1, s5, vcc_lo 4232; GFX1032-NEXT: v_cndmask_b32_e64 v0, v0, s4, vcc_lo 4233; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 4234; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4235; GFX1032-NEXT: s_endpgm 4236entry: 4237 %old = atomicrmw min i64 addrspace(3)* @local_var64, i64 5 acq_rel 4238 store i64 %old, i64 addrspace(1)* %out 4239 ret void 4240} 4241; NOTE(review): no RUN line of this test enables the GFX8MORE32 or GFX8MORE check prefixes (only GFX7LESS, GFX8, GFX9, GFX1064 and GFX1032 are passed to FileCheck), so the three checks that follow are never evaluated. 4242; GFX8MORE32: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 31 4243; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]] 4244; GFX8MORE: ds_max_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]] 4245define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) { 4246; Unsigned 32-bit atomicrmw umax (acq_rel) on @local_var32 in addrspace(3). The operand is the value returned by llvm.amdgcn.workitem.id.x, and the old value returned by the atomic is stored to %out. 4247; 4248; GFX7LESS-LABEL: umax_i32_varying: 4249; GFX7LESS: ; %bb.0: ; %entry 4250; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 4251; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 4252; GFX7LESS-NEXT: s_mov_b32 m0, -1 4253; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4254; GFX7LESS-NEXT: ds_max_rtn_u32 v0, v1, v0 4255; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4256; GFX7LESS-NEXT: buffer_wbinvl1 4257; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 4258; GFX7LESS-NEXT: s_mov_b32 s2, -1 4259; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 4260; GFX7LESS-NEXT: s_endpgm 4261; 4262; GFX8-LABEL: umax_i32_varying: 4263; GFX8: ; %bb.0: ; %entry 4264; GFX8-NEXT: v_mov_b32_e32 v2, v0 4265; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 4266; GFX8-NEXT:
s_load_dwordx2 s[0:1], s[0:1], 0x24 4267; GFX8-NEXT: v_mov_b32_e32 v1, 0 4268; GFX8-NEXT: s_mov_b64 exec, s[2:3] 4269; GFX8-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 4270; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 4271; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 4272; GFX8-NEXT: s_not_b64 exec, exec 4273; GFX8-NEXT: v_mov_b32_e32 v2, 0 4274; GFX8-NEXT: s_not_b64 exec, exec 4275; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 4276; GFX8-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 4277; GFX8-NEXT: s_nop 1 4278; GFX8-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 4279; GFX8-NEXT: s_nop 1 4280; GFX8-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 4281; GFX8-NEXT: s_nop 1 4282; GFX8-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 4283; GFX8-NEXT: s_nop 1 4284; GFX8-NEXT: v_max_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 4285; GFX8-NEXT: s_nop 1 4286; GFX8-NEXT: v_max_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 4287; GFX8-NEXT: v_readlane_b32 s2, v2, 63 4288; GFX8-NEXT: s_nop 0 4289; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 4290; GFX8-NEXT: s_mov_b64 exec, s[4:5] 4291; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4292; GFX8-NEXT: ; implicit-def: $vgpr0 4293; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 4294; GFX8-NEXT: s_cbranch_execz BB21_2 4295; GFX8-NEXT: ; %bb.1: 4296; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 4297; GFX8-NEXT: v_mov_b32_e32 v3, s2 4298; GFX8-NEXT: s_mov_b32 m0, -1 4299; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4300; GFX8-NEXT: ds_max_rtn_u32 v0, v0, v3 4301; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4302; GFX8-NEXT: buffer_wbinvl1_vol 4303; GFX8-NEXT: BB21_2: 4304; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 4305; GFX8-NEXT: v_readfirstlane_b32 s2, v0 4306; GFX8-NEXT: v_mov_b32_e32 v0, v1 4307; GFX8-NEXT: v_max_u32_e32 v0, s2, v0 4308; GFX8-NEXT: s_mov_b32 s3, 0xf000 4309; 
GFX8-NEXT: s_mov_b32 s2, -1 4310; GFX8-NEXT: s_nop 0 4311; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4312; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 4313; GFX8-NEXT: s_endpgm 4314; 4315; GFX9-LABEL: umax_i32_varying: 4316; GFX9: ; %bb.0: ; %entry 4317; GFX9-NEXT: v_mov_b32_e32 v2, v0 4318; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 4319; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4320; GFX9-NEXT: v_mov_b32_e32 v1, 0 4321; GFX9-NEXT: s_mov_b64 exec, s[2:3] 4322; GFX9-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 4323; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 4324; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 4325; GFX9-NEXT: s_not_b64 exec, exec 4326; GFX9-NEXT: v_mov_b32_e32 v2, 0 4327; GFX9-NEXT: s_not_b64 exec, exec 4328; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 4329; GFX9-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 4330; GFX9-NEXT: s_nop 1 4331; GFX9-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 4332; GFX9-NEXT: s_nop 1 4333; GFX9-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 4334; GFX9-NEXT: s_nop 1 4335; GFX9-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 4336; GFX9-NEXT: s_nop 1 4337; GFX9-NEXT: v_max_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 4338; GFX9-NEXT: s_nop 1 4339; GFX9-NEXT: v_max_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 4340; GFX9-NEXT: v_readlane_b32 s2, v2, 63 4341; GFX9-NEXT: s_nop 0 4342; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 4343; GFX9-NEXT: s_mov_b64 exec, s[4:5] 4344; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4345; GFX9-NEXT: ; implicit-def: $vgpr0 4346; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 4347; GFX9-NEXT: s_cbranch_execz BB21_2 4348; GFX9-NEXT: ; %bb.1: 4349; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 4350; GFX9-NEXT: v_mov_b32_e32 v3, s2 4351; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4352; GFX9-NEXT: ds_max_rtn_u32 v0, v0, v3 4353; GFX9-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) 4354; GFX9-NEXT: buffer_wbinvl1_vol 4355; GFX9-NEXT: BB21_2: 4356; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 4357; GFX9-NEXT: v_readfirstlane_b32 s2, v0 4358; GFX9-NEXT: v_mov_b32_e32 v0, v1 4359; GFX9-NEXT: v_max_u32_e32 v0, s2, v0 4360; GFX9-NEXT: s_mov_b32 s3, 0xf000 4361; GFX9-NEXT: s_mov_b32 s2, -1 4362; GFX9-NEXT: s_nop 0 4363; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4364; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 4365; GFX9-NEXT: s_endpgm 4366; 4367; GFX1064-LABEL: umax_i32_varying: 4368; GFX1064: ; %bb.0: ; %entry 4369; GFX1064-NEXT: v_mov_b32_e32 v2, v0 4370; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 4371; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4372; GFX1064-NEXT: v_mov_b32_e32 v1, 0 4373; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 4374; GFX1064-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 4375; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 4376; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s3, v0 4377; GFX1064-NEXT: s_not_b64 exec, exec 4378; GFX1064-NEXT: v_mov_b32_e32 v2, 0 4379; GFX1064-NEXT: s_not_b64 exec, exec 4380; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 4381; GFX1064-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 4382; GFX1064-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 4383; GFX1064-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 4384; GFX1064-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 4385; GFX1064-NEXT: v_mov_b32_e32 v3, v2 4386; GFX1064-NEXT: v_permlanex16_b32 v3, v3, -1, -1 4387; GFX1064-NEXT: v_max_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 4388; GFX1064-NEXT: v_readlane_b32 s2, v2, 31 4389; GFX1064-NEXT: v_mov_b32_e32 v3, s2 4390; GFX1064-NEXT: v_max_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 4391; GFX1064-NEXT: v_readlane_b32 s2, v2, 15 4392; GFX1064-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf 4393; GFX1064-NEXT: 
v_readlane_b32 s3, v2, 31 4394; GFX1064-NEXT: v_readlane_b32 s6, v2, 47 4395; GFX1064-NEXT: v_writelane_b32 v1, s2, 16 4396; GFX1064-NEXT: s_mov_b32 s2, -1 4397; GFX1064-NEXT: v_writelane_b32 v1, s3, 32 4398; GFX1064-NEXT: v_readlane_b32 s3, v2, 63 4399; GFX1064-NEXT: v_writelane_b32 v1, s6, 48 4400; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 4401; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4402; GFX1064-NEXT: ; implicit-def: $vgpr0 4403; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 4404; GFX1064-NEXT: s_cbranch_execz BB21_2 4405; GFX1064-NEXT: ; %bb.1: 4406; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 4407; GFX1064-NEXT: v_mov_b32_e32 v7, s3 4408; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4409; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 4410; GFX1064-NEXT: ds_max_rtn_u32 v0, v0, v7 4411; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4412; GFX1064-NEXT: buffer_gl0_inv 4413; GFX1064-NEXT: buffer_gl1_inv 4414; GFX1064-NEXT: BB21_2: 4415; GFX1064-NEXT: v_nop 4416; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 4417; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 4418; GFX1064-NEXT: v_mov_b32_e32 v0, v1 4419; GFX1064-NEXT: v_max_u32_e32 v0, s3, v0 4420; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 4421; GFX1064-NEXT: s_nop 1 4422; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 4423; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 4424; GFX1064-NEXT: s_endpgm 4425; 4426; GFX1032-LABEL: umax_i32_varying: 4427; GFX1032: ; %bb.0: ; %entry 4428; GFX1032-NEXT: ; implicit-def: $vcc_hi 4429; GFX1032-NEXT: v_mov_b32_e32 v2, v0 4430; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 4431; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4432; GFX1032-NEXT: v_mov_b32_e32 v1, 0 4433; GFX1032-NEXT: s_mov_b32 exec_lo, s2 4434; GFX1032-NEXT: v_cmp_ne_u32_e64 s2, 1, 0 4435; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 4436; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 4437; GFX1032-NEXT: v_mov_b32_e32 v2, 0 4438; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 4439; GFX1032-NEXT: s_or_saveexec_b32 s4, -1 4440; GFX1032-NEXT: 
v_max_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 4441; GFX1032-NEXT: s_mov_b32 s2, -1 4442; GFX1032-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 4443; GFX1032-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 4444; GFX1032-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 4445; GFX1032-NEXT: v_mov_b32_e32 v3, v2 4446; GFX1032-NEXT: v_permlanex16_b32 v3, v3, -1, -1 4447; GFX1032-NEXT: v_max_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 4448; GFX1032-NEXT: v_readlane_b32 s3, v2, 31 4449; GFX1032-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf 4450; GFX1032-NEXT: v_readlane_b32 s5, v2, 15 4451; GFX1032-NEXT: v_writelane_b32 v1, s5, 16 4452; GFX1032-NEXT: s_mov_b32 exec_lo, s4 4453; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 4454; GFX1032-NEXT: ; implicit-def: $vgpr0 4455; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo 4456; GFX1032-NEXT: s_cbranch_execz BB21_2 4457; GFX1032-NEXT: ; %bb.1: 4458; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 4459; GFX1032-NEXT: v_mov_b32_e32 v7, s3 4460; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4461; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 4462; GFX1032-NEXT: ds_max_rtn_u32 v0, v0, v7 4463; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4464; GFX1032-NEXT: buffer_gl0_inv 4465; GFX1032-NEXT: buffer_gl1_inv 4466; GFX1032-NEXT: BB21_2: 4467; GFX1032-NEXT: v_nop 4468; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 4469; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 4470; GFX1032-NEXT: v_mov_b32_e32 v0, v1 4471; GFX1032-NEXT: v_max_u32_e32 v0, s3, v0 4472; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 4473; GFX1032-NEXT: s_nop 1 4474; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 4475; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 4476; GFX1032-NEXT: s_endpgm 4477entry: 4478 %lane = call i32 @llvm.amdgcn.workitem.id.x() 4479 %old = atomicrmw umax i32 addrspace(3)* @local_var32, i32 %lane acq_rel 
4480 store i32 %old, i32 addrspace(1)* %out 4481 ret void 4482} 4483 4484define amdgpu_kernel void @umax_i64_constant(i64 addrspace(1)* %out) { 4485; Unsigned 64-bit atomicrmw umax (acq_rel) on @local_var64 in addrspace(3) with the constant operand 5; the old value returned by the atomic is stored to %out. 4486; 4487; GFX7LESS-LABEL: umax_i64_constant: 4488; GFX7LESS: ; %bb.0: ; %entry 4489; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 4490; GFX7LESS-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 4491; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 4492; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 4493; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4494; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 4495; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc 4496; GFX7LESS-NEXT: s_cbranch_execz BB22_2 4497; GFX7LESS-NEXT: ; %bb.1: 4498; GFX7LESS-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 4499; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5 4500; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 4501; GFX7LESS-NEXT: s_mov_b32 m0, -1 4502; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4503; GFX7LESS-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] 4504; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4505; GFX7LESS-NEXT: buffer_wbinvl1 4506; GFX7LESS-NEXT: BB22_2: 4507; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] 4508; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 4509; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 4510; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 4511; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 4512; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 4513; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4 4514; GFX7LESS-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[0:1] 4515; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 4516; GFX7LESS-NEXT: v_mov_b32_e32 v1, s5 4517; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc 4518; GFX7LESS-NEXT: s_mov_b32 s2, -1 4519; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 4520; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4521; GFX7LESS-NEXT: s_endpgm 4522; 4523; GFX8-LABEL: umax_i64_constant: 4524; GFX8: ; %bb.0: ; %entry 4525; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4526; GFX8-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 4527; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
4528; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 4529; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4530; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 4531; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 4532; GFX8-NEXT: s_cbranch_execz BB22_2 4533; GFX8-NEXT: ; %bb.1: 4534; GFX8-NEXT: v_mov_b32_e32 v0, 5 4535; GFX8-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 4536; GFX8-NEXT: v_mov_b32_e32 v1, 0 4537; GFX8-NEXT: s_mov_b32 m0, -1 4538; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4539; GFX8-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] 4540; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4541; GFX8-NEXT: buffer_wbinvl1_vol 4542; GFX8-NEXT: BB22_2: 4543; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 4544; GFX8-NEXT: v_readfirstlane_b32 s2, v0 4545; GFX8-NEXT: v_readfirstlane_b32 s3, v1 4546; GFX8-NEXT: v_mov_b32_e32 v1, 0 4547; GFX8-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 4548; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1] 4549; GFX8-NEXT: v_mov_b32_e32 v1, s3 4550; GFX8-NEXT: v_mov_b32_e32 v2, s2 4551; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 4552; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc 4553; GFX8-NEXT: s_mov_b32 s3, 0xf000 4554; GFX8-NEXT: s_mov_b32 s2, -1 4555; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4556; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4557; GFX8-NEXT: s_endpgm 4558; 4559; GFX9-LABEL: umax_i64_constant: 4560; GFX9: ; %bb.0: ; %entry 4561; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4562; GFX9-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 4563; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 4564; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 4565; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4566; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 4567; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 4568; GFX9-NEXT: s_cbranch_execz BB22_2 4569; GFX9-NEXT: ; %bb.1: 4570; GFX9-NEXT: v_mov_b32_e32 v0, 5 4571; GFX9-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 4572; GFX9-NEXT: v_mov_b32_e32 v1, 0 4573; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4574; GFX9-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] 4575; GFX9-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) 4576; GFX9-NEXT: buffer_wbinvl1_vol 4577; GFX9-NEXT: BB22_2: 4578; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 4579; GFX9-NEXT: v_readfirstlane_b32 s2, v0 4580; GFX9-NEXT: v_readfirstlane_b32 s3, v1 4581; GFX9-NEXT: v_mov_b32_e32 v1, 0 4582; GFX9-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 4583; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1] 4584; GFX9-NEXT: v_mov_b32_e32 v1, s3 4585; GFX9-NEXT: v_mov_b32_e32 v2, s2 4586; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 4587; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc 4588; GFX9-NEXT: s_mov_b32 s3, 0xf000 4589; GFX9-NEXT: s_mov_b32 s2, -1 4590; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4591; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4592; GFX9-NEXT: s_endpgm 4593; 4594; GFX1064-LABEL: umax_i64_constant: 4595; GFX1064: ; %bb.0: ; %entry 4596; GFX1064-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 4597; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4598; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 4599; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s3, v0 4600; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4601; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 4602; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc 4603; GFX1064-NEXT: s_cbranch_execz BB22_2 4604; GFX1064-NEXT: ; %bb.1: 4605; GFX1064-NEXT: v_mov_b32_e32 v0, 5 4606; GFX1064-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 4607; GFX1064-NEXT: v_mov_b32_e32 v1, 0 4608; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4609; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 4610; GFX1064-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] 4611; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4612; GFX1064-NEXT: buffer_gl0_inv 4613; GFX1064-NEXT: buffer_gl1_inv 4614; GFX1064-NEXT: BB22_2: 4615; GFX1064-NEXT: v_nop 4616; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] 4617; GFX1064-NEXT: v_readfirstlane_b32 s4, v0 4618; GFX1064-NEXT: v_readfirstlane_b32 s5, v1 4619; GFX1064-NEXT: v_mov_b32_e32 v1, 0 4620; GFX1064-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 4621; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 4622; 
GFX1064-NEXT: s_mov_b32 s2, -1 4623; GFX1064-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[0:1] 4624; GFX1064-NEXT: v_cndmask_b32_e64 v0, v0, s4, vcc 4625; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, s5, vcc 4626; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 4627; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4628; GFX1064-NEXT: s_endpgm 4629; 4630; GFX1032-LABEL: umax_i64_constant: 4631; GFX1032: ; %bb.0: ; %entry 4632; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4633; GFX1032-NEXT: v_cmp_ne_u32_e64 s2, 1, 0 4634; GFX1032-NEXT: ; implicit-def: $vcc_hi 4635; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 4636; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 4637; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 4638; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo 4639; GFX1032-NEXT: s_cbranch_execz BB22_2 4640; GFX1032-NEXT: ; %bb.1: 4641; GFX1032-NEXT: v_mov_b32_e32 v0, 5 4642; GFX1032-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 4643; GFX1032-NEXT: v_mov_b32_e32 v1, 0 4644; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4645; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 4646; GFX1032-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] 4647; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4648; GFX1032-NEXT: buffer_gl0_inv 4649; GFX1032-NEXT: buffer_gl1_inv 4650; GFX1032-NEXT: BB22_2: 4651; GFX1032-NEXT: v_nop 4652; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 4653; GFX1032-NEXT: v_readfirstlane_b32 s4, v0 4654; GFX1032-NEXT: v_readfirstlane_b32 s5, v1 4655; GFX1032-NEXT: v_mov_b32_e32 v1, 0 4656; GFX1032-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc_lo 4657; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 4658; GFX1032-NEXT: s_mov_b32 s2, -1 4659; GFX1032-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[4:5], v[0:1] 4660; GFX1032-NEXT: v_cndmask_b32_e64 v0, v0, s4, vcc_lo 4661; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, s5, vcc_lo 4662; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 4663; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4664; GFX1032-NEXT: s_endpgm 4665entry: 4666 %old = atomicrmw umax i64 addrspace(3)* @local_var64, i64 5 
; @umin_i32_varying: issues `atomicrmw umin` (acq_rel) on the addrspace(3)
; global @local_var32 with the result of llvm.amdgcn.workitem.id.x() as the
; operand, then stores the returned old value to %out.
; Per the autogenerated checks: GFX7LESS keeps one ds_min_rtn_u32 with no
; cross-lane combining, while the GFX8, GFX9 and GFX10 runs combine lanes with
; v_min_u32_dpp and issue a single ds_min_rtn_u32 inside an s_and_saveexec
; region.
; NOTE(review): the three hand-written checks with prefixes GFX8MORE32 and
; GFX8MORE that precede the define are not enabled by any -check-prefixes
; option in the RUN lines at the top of this file, so FileCheck presumably
; skips them. Consider deleting them; the autogenerated checks already cover
; ds_min_rtn_u32 for every enabled prefix.
acq_rel 4667 store i64 %old, i64 addrspace(1)* %out 4668 ret void 4669} 4670 4671; GFX8MORE32: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 31 4672; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]] 4673; GFX8MORE: ds_min_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]] 4674define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) { 4675; 4676; 4677; GFX7LESS-LABEL: umin_i32_varying: 4678; GFX7LESS: ; %bb.0: ; %entry 4679; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 4680; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 4681; GFX7LESS-NEXT: s_mov_b32 m0, -1 4682; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4683; GFX7LESS-NEXT: ds_min_rtn_u32 v0, v1, v0 4684; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4685; GFX7LESS-NEXT: buffer_wbinvl1 4686; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 4687; GFX7LESS-NEXT: s_mov_b32 s2, -1 4688; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 4689; GFX7LESS-NEXT: s_endpgm 4690; 4691; GFX8-LABEL: umin_i32_varying: 4692; GFX8: ; %bb.0: ; %entry 4693; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4694; GFX8-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 4695; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, s2, 0 4696; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, s3, v3 4697; GFX8-NEXT: v_mov_b32_e32 v2, v0 4698; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 4699; GFX8-NEXT: v_mov_b32_e32 v1, -1 4700; GFX8-NEXT: s_mov_b64 exec, s[2:3] 4701; GFX8-NEXT: s_not_b64 exec, exec 4702; GFX8-NEXT: v_mov_b32_e32 v2, -1 4703; GFX8-NEXT: s_not_b64 exec, exec 4704; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 4705; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 4706; GFX8-NEXT: s_nop 1 4707; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 4708; GFX8-NEXT: s_nop 1 4709; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 4710; GFX8-NEXT: s_nop 1 4711; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 4712; GFX8-NEXT: s_nop 1 4713; GFX8-NEXT: v_min_u32_dpp 
v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 4714; GFX8-NEXT: s_nop 1 4715; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 4716; GFX8-NEXT: v_readlane_b32 s2, v2, 63 4717; GFX8-NEXT: s_nop 0 4718; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 4719; GFX8-NEXT: s_mov_b64 exec, s[4:5] 4720; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 4721; GFX8-NEXT: ; implicit-def: $vgpr0 4722; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 4723; GFX8-NEXT: s_cbranch_execz BB23_2 4724; GFX8-NEXT: ; %bb.1: 4725; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 4726; GFX8-NEXT: v_mov_b32_e32 v3, s2 4727; GFX8-NEXT: s_mov_b32 m0, -1 4728; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4729; GFX8-NEXT: ds_min_rtn_u32 v0, v0, v3 4730; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4731; GFX8-NEXT: buffer_wbinvl1_vol 4732; GFX8-NEXT: BB23_2: 4733; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 4734; GFX8-NEXT: v_readfirstlane_b32 s2, v0 4735; GFX8-NEXT: v_mov_b32_e32 v0, v1 4736; GFX8-NEXT: v_min_u32_e32 v0, s2, v0 4737; GFX8-NEXT: s_mov_b32 s3, 0xf000 4738; GFX8-NEXT: s_mov_b32 s2, -1 4739; GFX8-NEXT: s_nop 0 4740; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4741; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 4742; GFX8-NEXT: s_endpgm 4743; 4744; GFX9-LABEL: umin_i32_varying: 4745; GFX9: ; %bb.0: ; %entry 4746; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4747; GFX9-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 4748; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, s2, 0 4749; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, s3, v3 4750; GFX9-NEXT: v_mov_b32_e32 v2, v0 4751; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 4752; GFX9-NEXT: v_mov_b32_e32 v1, -1 4753; GFX9-NEXT: s_mov_b64 exec, s[2:3] 4754; GFX9-NEXT: s_not_b64 exec, exec 4755; GFX9-NEXT: v_mov_b32_e32 v2, -1 4756; GFX9-NEXT: s_not_b64 exec, exec 4757; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 4758; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 4759; GFX9-NEXT: s_nop 1 4760; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:2 
row_mask:0xf bank_mask:0xf 4761; GFX9-NEXT: s_nop 1 4762; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 4763; GFX9-NEXT: s_nop 1 4764; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 4765; GFX9-NEXT: s_nop 1 4766; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 4767; GFX9-NEXT: s_nop 1 4768; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 4769; GFX9-NEXT: v_readlane_b32 s2, v2, 63 4770; GFX9-NEXT: s_nop 0 4771; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 4772; GFX9-NEXT: s_mov_b64 exec, s[4:5] 4773; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 4774; GFX9-NEXT: ; implicit-def: $vgpr0 4775; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 4776; GFX9-NEXT: s_cbranch_execz BB23_2 4777; GFX9-NEXT: ; %bb.1: 4778; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 4779; GFX9-NEXT: v_mov_b32_e32 v3, s2 4780; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4781; GFX9-NEXT: ds_min_rtn_u32 v0, v0, v3 4782; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4783; GFX9-NEXT: buffer_wbinvl1_vol 4784; GFX9-NEXT: BB23_2: 4785; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 4786; GFX9-NEXT: v_readfirstlane_b32 s2, v0 4787; GFX9-NEXT: v_mov_b32_e32 v0, v1 4788; GFX9-NEXT: v_min_u32_e32 v0, s2, v0 4789; GFX9-NEXT: s_mov_b32 s3, 0xf000 4790; GFX9-NEXT: s_mov_b32 s2, -1 4791; GFX9-NEXT: s_nop 0 4792; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4793; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 4794; GFX9-NEXT: s_endpgm 4795; 4796; GFX1064-LABEL: umin_i32_varying: 4797; GFX1064: ; %bb.0: ; %entry 4798; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4799; GFX1064-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 4800; GFX1064-NEXT: v_mov_b32_e32 v2, v0 4801; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v4, s2, 0 4802; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v4, s3, v4 4803; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 4804; GFX1064-NEXT: v_mov_b32_e32 v1, -1 4805; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 4806; GFX1064-NEXT: 
s_not_b64 exec, exec 4807; GFX1064-NEXT: v_mov_b32_e32 v2, -1 4808; GFX1064-NEXT: s_not_b64 exec, exec 4809; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 4810; GFX1064-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 4811; GFX1064-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 4812; GFX1064-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 4813; GFX1064-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 4814; GFX1064-NEXT: v_mov_b32_e32 v3, v2 4815; GFX1064-NEXT: v_permlanex16_b32 v3, v3, -1, -1 4816; GFX1064-NEXT: v_min_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 4817; GFX1064-NEXT: v_readlane_b32 s2, v2, 31 4818; GFX1064-NEXT: v_mov_b32_e32 v3, s2 4819; GFX1064-NEXT: v_min_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 4820; GFX1064-NEXT: v_readlane_b32 s2, v2, 15 4821; GFX1064-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf 4822; GFX1064-NEXT: v_readlane_b32 s3, v2, 31 4823; GFX1064-NEXT: v_readlane_b32 s6, v2, 47 4824; GFX1064-NEXT: v_writelane_b32 v1, s2, 16 4825; GFX1064-NEXT: s_mov_b32 s2, -1 4826; GFX1064-NEXT: v_writelane_b32 v1, s3, 32 4827; GFX1064-NEXT: v_readlane_b32 s3, v2, 63 4828; GFX1064-NEXT: v_writelane_b32 v1, s6, 48 4829; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 4830; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 4831; GFX1064-NEXT: ; implicit-def: $vgpr0 4832; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 4833; GFX1064-NEXT: s_cbranch_execz BB23_2 4834; GFX1064-NEXT: ; %bb.1: 4835; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 4836; GFX1064-NEXT: v_mov_b32_e32 v7, s3 4837; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4838; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 4839; GFX1064-NEXT: ds_min_rtn_u32 v0, v0, v7 4840; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4841; GFX1064-NEXT: buffer_gl0_inv 4842; GFX1064-NEXT: buffer_gl1_inv 4843; GFX1064-NEXT: BB23_2: 4844; GFX1064-NEXT: v_nop 4845; GFX1064-NEXT: s_or_b64 exec, exec, 
s[4:5] 4846; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 4847; GFX1064-NEXT: v_mov_b32_e32 v0, v1 4848; GFX1064-NEXT: v_min_u32_e32 v0, s3, v0 4849; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 4850; GFX1064-NEXT: s_nop 1 4851; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 4852; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 4853; GFX1064-NEXT: s_endpgm 4854; 4855; GFX1032-LABEL: umin_i32_varying: 4856; GFX1032: ; %bb.0: ; %entry 4857; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4858; GFX1032-NEXT: v_cmp_ne_u32_e64 s2, 1, 0 4859; GFX1032-NEXT: ; implicit-def: $vcc_hi 4860; GFX1032-NEXT: v_mov_b32_e32 v2, v0 4861; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v4, s2, 0 4862; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 4863; GFX1032-NEXT: v_mov_b32_e32 v1, -1 4864; GFX1032-NEXT: s_mov_b32 exec_lo, s2 4865; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 4866; GFX1032-NEXT: v_mov_b32_e32 v2, -1 4867; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 4868; GFX1032-NEXT: s_or_saveexec_b32 s4, -1 4869; GFX1032-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 4870; GFX1032-NEXT: s_mov_b32 s2, -1 4871; GFX1032-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 4872; GFX1032-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 4873; GFX1032-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 4874; GFX1032-NEXT: v_mov_b32_e32 v3, v2 4875; GFX1032-NEXT: v_permlanex16_b32 v3, v3, -1, -1 4876; GFX1032-NEXT: v_min_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 4877; GFX1032-NEXT: v_readlane_b32 s3, v2, 31 4878; GFX1032-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf 4879; GFX1032-NEXT: v_readlane_b32 s5, v2, 15 4880; GFX1032-NEXT: v_writelane_b32 v1, s5, 16 4881; GFX1032-NEXT: s_mov_b32 exec_lo, s4 4882; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 4883; GFX1032-NEXT: ; implicit-def: $vgpr0 4884; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo 4885; GFX1032-NEXT: s_cbranch_execz BB23_2 4886; GFX1032-NEXT: ; %bb.1: 4887; 
GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 4888; GFX1032-NEXT: v_mov_b32_e32 v7, s3 4889; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4890; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 4891; GFX1032-NEXT: ds_min_rtn_u32 v0, v0, v7 4892; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4893; GFX1032-NEXT: buffer_gl0_inv 4894; GFX1032-NEXT: buffer_gl1_inv 4895; GFX1032-NEXT: BB23_2: 4896; GFX1032-NEXT: v_nop 4897; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 4898; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 4899; GFX1032-NEXT: v_mov_b32_e32 v0, v1 4900; GFX1032-NEXT: v_min_u32_e32 v0, s3, v0 4901; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 4902; GFX1032-NEXT: s_nop 1 4903; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 4904; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 4905; GFX1032-NEXT: s_endpgm 4906entry: 4907 %lane = call i32 @llvm.amdgcn.workitem.id.x() 4908 %old = atomicrmw umin i32 addrspace(3)* @local_var32, i32 %lane acq_rel 4909 store i32 %old, i32 addrspace(1)* %out 4910 ret void 4911} 4912 4913define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) { 4914; 4915; 4916; GFX7LESS-LABEL: umin_i64_constant: 4917; GFX7LESS: ; %bb.0: ; %entry 4918; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 4919; GFX7LESS-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 4920; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 4921; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 4922; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4923; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 4924; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc 4925; GFX7LESS-NEXT: s_cbranch_execz BB24_2 4926; GFX7LESS-NEXT: ; %bb.1: 4927; GFX7LESS-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 4928; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5 4929; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 4930; GFX7LESS-NEXT: s_mov_b32 m0, -1 4931; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4932; GFX7LESS-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1] 4933; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4934; GFX7LESS-NEXT: buffer_wbinvl1 4935; GFX7LESS-NEXT: 
; @umin_i64_constant (its define precedes this point; the IR body is at the end
; of this section): issues `atomicrmw umin` (acq_rel) of the constant 5 on the
; addrspace(3) global @local_var64 and stores the returned old value to %out.
; Every enabled check prefix expects v_mbcnt-based lane selection, a single
; ds_min_rtn_u64 under s_and_saveexec, then v_readfirstlane plus
; v_cmp_lt_u64 and v_cndmask to form the value that is stored.
BB24_2: 4936; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] 4937; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 4938; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 4939; GFX7LESS-NEXT: s_mov_b32 s2, -1 4940; GFX7LESS-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc 4941; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 4942; GFX7LESS-NEXT: v_mov_b32_e32 v2, s5 4943; GFX7LESS-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1] 4944; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 4945; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4 4946; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 4947; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 4948; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 4949; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4950; GFX7LESS-NEXT: s_endpgm 4951; 4952; GFX8-LABEL: umin_i64_constant: 4953; GFX8: ; %bb.0: ; %entry 4954; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4955; GFX8-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 4956; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 4957; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 4958; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4959; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 4960; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 4961; GFX8-NEXT: s_cbranch_execz BB24_2 4962; GFX8-NEXT: ; %bb.1: 4963; GFX8-NEXT: v_mov_b32_e32 v0, 5 4964; GFX8-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 4965; GFX8-NEXT: v_mov_b32_e32 v1, 0 4966; GFX8-NEXT: s_mov_b32 m0, -1 4967; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4968; GFX8-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1] 4969; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4970; GFX8-NEXT: buffer_wbinvl1_vol 4971; GFX8-NEXT: BB24_2: 4972; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 4973; GFX8-NEXT: v_readfirstlane_b32 s5, v1 4974; GFX8-NEXT: v_readfirstlane_b32 s4, v0 4975; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc 4976; GFX8-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 4977; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1] 4978; GFX8-NEXT: v_mov_b32_e32 v2, s5 4979; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 4980; GFX8-NEXT: v_mov_b32_e32 v2, s4 4981; 
GFX8-NEXT: s_mov_b32 s2, -1 4982; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 4983; GFX8-NEXT: s_mov_b32 s3, 0xf000 4984; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4985; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4986; GFX8-NEXT: s_endpgm 4987; 4988; GFX9-LABEL: umin_i64_constant: 4989; GFX9: ; %bb.0: ; %entry 4990; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4991; GFX9-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 4992; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 4993; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 4994; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4995; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 4996; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 4997; GFX9-NEXT: s_cbranch_execz BB24_2 4998; GFX9-NEXT: ; %bb.1: 4999; GFX9-NEXT: v_mov_b32_e32 v0, 5 5000; GFX9-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 5001; GFX9-NEXT: v_mov_b32_e32 v1, 0 5002; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5003; GFX9-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1] 5004; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5005; GFX9-NEXT: buffer_wbinvl1_vol 5006; GFX9-NEXT: BB24_2: 5007; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 5008; GFX9-NEXT: v_readfirstlane_b32 s5, v1 5009; GFX9-NEXT: v_readfirstlane_b32 s4, v0 5010; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc 5011; GFX9-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 5012; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1] 5013; GFX9-NEXT: v_mov_b32_e32 v2, s5 5014; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 5015; GFX9-NEXT: v_mov_b32_e32 v2, s4 5016; GFX9-NEXT: s_mov_b32 s2, -1 5017; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 5018; GFX9-NEXT: s_mov_b32 s3, 0xf000 5019; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5020; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 5021; GFX9-NEXT: s_endpgm 5022; 5023; GFX1064-LABEL: umin_i64_constant: 5024; GFX1064: ; %bb.0: ; %entry 5025; GFX1064-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 5026; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 5027; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 5028; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s3, 
v0 5029; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 5030; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 5031; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc 5032; GFX1064-NEXT: s_cbranch_execz BB24_2 5033; GFX1064-NEXT: ; %bb.1: 5034; GFX1064-NEXT: v_mov_b32_e32 v0, 5 5035; GFX1064-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 5036; GFX1064-NEXT: v_mov_b32_e32 v1, 0 5037; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5038; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 5039; GFX1064-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1] 5040; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5041; GFX1064-NEXT: buffer_gl0_inv 5042; GFX1064-NEXT: buffer_gl1_inv 5043; GFX1064-NEXT: BB24_2: 5044; GFX1064-NEXT: v_nop 5045; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] 5046; GFX1064-NEXT: v_readfirstlane_b32 s4, v0 5047; GFX1064-NEXT: v_readfirstlane_b32 s5, v1 5048; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc 5049; GFX1064-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 5050; GFX1064-NEXT: s_mov_b32 s2, -1 5051; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 5052; GFX1064-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1] 5053; GFX1064-NEXT: v_cndmask_b32_e64 v1, v1, s5, vcc 5054; GFX1064-NEXT: v_cndmask_b32_e64 v0, v0, s4, vcc 5055; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 5056; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 5057; GFX1064-NEXT: s_endpgm 5058; 5059; GFX1032-LABEL: umin_i64_constant: 5060; GFX1032: ; %bb.0: ; %entry 5061; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 5062; GFX1032-NEXT: v_cmp_ne_u32_e64 s2, 1, 0 5063; GFX1032-NEXT: ; implicit-def: $vcc_hi 5064; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 5065; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 5066; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 5067; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo 5068; GFX1032-NEXT: s_cbranch_execz BB24_2 5069; GFX1032-NEXT: ; %bb.1: 5070; GFX1032-NEXT: v_mov_b32_e32 v0, 5 5071; GFX1032-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 5072; GFX1032-NEXT: v_mov_b32_e32 v1, 0 5073; GFX1032-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) 5074; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 5075; GFX1032-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1] 5076; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5077; GFX1032-NEXT: buffer_gl0_inv 5078; GFX1032-NEXT: buffer_gl1_inv 5079; GFX1032-NEXT: BB24_2: 5080; GFX1032-NEXT: v_nop 5081; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 5082; GFX1032-NEXT: v_readfirstlane_b32 s4, v0 5083; GFX1032-NEXT: v_readfirstlane_b32 s5, v1 5084; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo 5085; GFX1032-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc_lo 5086; GFX1032-NEXT: s_mov_b32 s2, -1 5087; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 5088; GFX1032-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[4:5], v[0:1] 5089; GFX1032-NEXT: v_cndmask_b32_e64 v1, v1, s5, vcc_lo 5090; GFX1032-NEXT: v_cndmask_b32_e64 v0, v0, s4, vcc_lo 5091; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 5092; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 5093; GFX1032-NEXT: s_endpgm 5094entry: 5095 %old = atomicrmw umin i64 addrspace(3)* @local_var64, i64 5 acq_rel 5096 store i64 %old, i64 addrspace(1)* %out 5097 ret void 5098} 5099