1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX7LESS %s 3; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s 4; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s 5; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064 %s 6; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032 %s 7 8declare i32 @llvm.amdgcn.workitem.id.x() 9 10@local_var32 = addrspace(3) global i32 undef, align 4 11@local_var64 = addrspace(3) global i64 undef, align 8 12 13; Show what the atomic optimization pass will do for local pointers. 
14 15define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out) { 16; 17; 18; GFX7LESS-LABEL: add_i32_constant: 19; GFX7LESS: ; %bb.0: ; %entry 20; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 21; GFX7LESS-NEXT: v_cmp_ne_u32_e64 s[4:5], 1, 0 22; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 23; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 24; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 25; GFX7LESS-NEXT: ; implicit-def: $vgpr1 26; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc 27; GFX7LESS-NEXT: s_cbranch_execz BB0_2 28; GFX7LESS-NEXT: ; %bb.1: 29; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 30; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 31; GFX7LESS-NEXT: v_mul_u32_u24_e64 v2, s4, 5 32; GFX7LESS-NEXT: s_mov_b32 m0, -1 33; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 34; GFX7LESS-NEXT: ds_add_rtn_u32 v1, v1, v2 35; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 36; GFX7LESS-NEXT: buffer_wbinvl1 37; GFX7LESS-NEXT: BB0_2: 38; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] 39; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v1 40; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 41; GFX7LESS-NEXT: v_mad_u32_u24 v0, v0, 5, s2 42; GFX7LESS-NEXT: s_mov_b32 s2, -1 43; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 44; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 45; GFX7LESS-NEXT: s_endpgm 46; 47; GFX8-LABEL: add_i32_constant: 48; GFX8: ; %bb.0: ; %entry 49; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 50; GFX8-NEXT: v_cmp_ne_u32_e64 s[4:5], 1, 0 51; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 52; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 53; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 54; GFX8-NEXT: ; implicit-def: $vgpr1 55; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 56; GFX8-NEXT: s_cbranch_execz BB0_2 57; GFX8-NEXT: ; %bb.1: 58; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 59; GFX8-NEXT: v_mul_u32_u24_e64 v1, s4, 5 60; GFX8-NEXT: v_mov_b32_e32 v2, local_var32@abs32@lo 61; GFX8-NEXT: s_mov_b32 m0, -1 62; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 63; GFX8-NEXT: ds_add_rtn_u32 v1, v2, v1 64; 
GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 65; GFX8-NEXT: buffer_wbinvl1_vol 66; GFX8-NEXT: BB0_2: 67; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 68; GFX8-NEXT: v_readfirstlane_b32 s2, v1 69; GFX8-NEXT: v_mad_u32_u24 v0, v0, 5, s2 70; GFX8-NEXT: s_mov_b32 s3, 0xf000 71; GFX8-NEXT: s_mov_b32 s2, -1 72; GFX8-NEXT: s_nop 1 73; GFX8-NEXT: s_waitcnt lgkmcnt(0) 74; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 75; GFX8-NEXT: s_endpgm 76; 77; GFX9-LABEL: add_i32_constant: 78; GFX9: ; %bb.0: ; %entry 79; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 80; GFX9-NEXT: v_cmp_ne_u32_e64 s[4:5], 1, 0 81; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 82; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 83; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 84; GFX9-NEXT: ; implicit-def: $vgpr1 85; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 86; GFX9-NEXT: s_cbranch_execz BB0_2 87; GFX9-NEXT: ; %bb.1: 88; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 89; GFX9-NEXT: v_mul_u32_u24_e64 v1, s4, 5 90; GFX9-NEXT: v_mov_b32_e32 v2, local_var32@abs32@lo 91; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 92; GFX9-NEXT: ds_add_rtn_u32 v1, v2, v1 93; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 94; GFX9-NEXT: buffer_wbinvl1_vol 95; GFX9-NEXT: BB0_2: 96; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 97; GFX9-NEXT: v_readfirstlane_b32 s2, v1 98; GFX9-NEXT: v_mad_u32_u24 v0, v0, 5, s2 99; GFX9-NEXT: s_mov_b32 s3, 0xf000 100; GFX9-NEXT: s_mov_b32 s2, -1 101; GFX9-NEXT: s_nop 1 102; GFX9-NEXT: s_waitcnt lgkmcnt(0) 103; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 104; GFX9-NEXT: s_endpgm 105; 106; GFX1064-LABEL: add_i32_constant: 107; GFX1064: ; %bb.0: ; %entry 108; GFX1064-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 109; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 110; GFX1064-NEXT: ; implicit-def: $vgpr1 111; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 112; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s3, v0 113; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 114; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 115; GFX1064-NEXT: s_cbranch_execz BB0_2 116; 
GFX1064-NEXT: ; %bb.1: 117; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 118; GFX1064-NEXT: v_mov_b32_e32 v2, local_var32@abs32@lo 119; GFX1064-NEXT: v_mul_u32_u24_e64 v1, s2, 5 120; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 121; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 122; GFX1064-NEXT: ds_add_rtn_u32 v1, v2, v1 123; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 124; GFX1064-NEXT: buffer_gl0_inv 125; GFX1064-NEXT: buffer_gl1_inv 126; GFX1064-NEXT: BB0_2: 127; GFX1064-NEXT: v_nop 128; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 129; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 130; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 131; GFX1064-NEXT: v_mad_u32_u24 v0, v0, 5, s2 132; GFX1064-NEXT: s_mov_b32 s2, -1 133; GFX1064-NEXT: s_nop 1 134; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 135; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 136; GFX1064-NEXT: s_endpgm 137; 138; GFX1032-LABEL: add_i32_constant: 139; GFX1032: ; %bb.0: ; %entry 140; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 141; GFX1032-NEXT: v_cmp_ne_u32_e64 s3, 1, 0 142; GFX1032-NEXT: ; implicit-def: $vcc_hi 143; GFX1032-NEXT: ; implicit-def: $vgpr1 144; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s3, 0 145; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 146; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo 147; GFX1032-NEXT: s_cbranch_execz BB0_2 148; GFX1032-NEXT: ; %bb.1: 149; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 150; GFX1032-NEXT: v_mov_b32_e32 v2, local_var32@abs32@lo 151; GFX1032-NEXT: v_mul_u32_u24_e64 v1, s3, 5 152; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 153; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 154; GFX1032-NEXT: ds_add_rtn_u32 v1, v2, v1 155; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 156; GFX1032-NEXT: buffer_gl0_inv 157; GFX1032-NEXT: buffer_gl1_inv 158; GFX1032-NEXT: BB0_2: 159; GFX1032-NEXT: v_nop 160; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 161; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 162; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 163; GFX1032-NEXT: v_mad_u32_u24 v0, v0, 5, s2 164; GFX1032-NEXT: 
s_mov_b32 s2, -1 165; GFX1032-NEXT: s_nop 1 166; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 167; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 168; GFX1032-NEXT: s_endpgm 169entry: 170 %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 5 acq_rel 171 store i32 %old, i32 addrspace(1)* %out 172 ret void 173} 174 175define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 %additive) { 176; 177; 178; GFX7LESS-LABEL: add_i32_uniform: 179; GFX7LESS: ; %bb.0: ; %entry 180; GFX7LESS-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 181; GFX7LESS-NEXT: s_load_dword s2, s[0:1], 0xb 182; GFX7LESS-NEXT: v_cmp_ne_u32_e64 s[6:7], 1, 0 183; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 184; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s7, v0 185; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 186; GFX7LESS-NEXT: ; implicit-def: $vgpr1 187; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc 188; GFX7LESS-NEXT: s_cbranch_execz BB1_2 189; GFX7LESS-NEXT: ; %bb.1: 190; GFX7LESS-NEXT: s_bcnt1_i32_b64 s3, s[6:7] 191; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 192; GFX7LESS-NEXT: s_mul_i32 s3, s2, s3 193; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 194; GFX7LESS-NEXT: v_mov_b32_e32 v2, s3 195; GFX7LESS-NEXT: s_mov_b32 m0, -1 196; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 197; GFX7LESS-NEXT: ds_add_rtn_u32 v1, v1, v2 198; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 199; GFX7LESS-NEXT: buffer_wbinvl1 200; GFX7LESS-NEXT: BB1_2: 201; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1] 202; GFX7LESS-NEXT: v_readfirstlane_b32 s0, v1 203; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 204; GFX7LESS-NEXT: v_mul_lo_u32 v0, s2, v0 205; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 206; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s0, v0 207; GFX7LESS-NEXT: s_mov_b32 s6, -1 208; GFX7LESS-NEXT: buffer_store_dword v0, off, s[4:7], 0 209; GFX7LESS-NEXT: s_endpgm 210; 211; GFX8-LABEL: add_i32_uniform: 212; GFX8: ; %bb.0: ; %entry 213; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 214; GFX8-NEXT: s_load_dword s0, s[0:1], 0x2c 215; 
GFX8-NEXT: v_cmp_ne_u32_e64 s[6:7], 1, 0 216; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 217; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 218; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 219; GFX8-NEXT: ; implicit-def: $vgpr1 220; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 221; GFX8-NEXT: s_cbranch_execz BB1_2 222; GFX8-NEXT: ; %bb.1: 223; GFX8-NEXT: s_bcnt1_i32_b64 s1, s[6:7] 224; GFX8-NEXT: s_waitcnt lgkmcnt(0) 225; GFX8-NEXT: s_mul_i32 s1, s0, s1 226; GFX8-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 227; GFX8-NEXT: v_mov_b32_e32 v2, s1 228; GFX8-NEXT: s_mov_b32 m0, -1 229; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 230; GFX8-NEXT: ds_add_rtn_u32 v1, v1, v2 231; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 232; GFX8-NEXT: buffer_wbinvl1_vol 233; GFX8-NEXT: BB1_2: 234; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 235; GFX8-NEXT: s_waitcnt lgkmcnt(0) 236; GFX8-NEXT: v_mul_lo_u32 v0, s0, v0 237; GFX8-NEXT: v_readfirstlane_b32 s0, v1 238; GFX8-NEXT: s_mov_b32 s7, 0xf000 239; GFX8-NEXT: s_mov_b32 s6, -1 240; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 241; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0 242; GFX8-NEXT: s_endpgm 243; 244; GFX9-LABEL: add_i32_uniform: 245; GFX9: ; %bb.0: ; %entry 246; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 247; GFX9-NEXT: s_load_dword s0, s[0:1], 0x2c 248; GFX9-NEXT: v_cmp_ne_u32_e64 s[6:7], 1, 0 249; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 250; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 251; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 252; GFX9-NEXT: ; implicit-def: $vgpr1 253; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 254; GFX9-NEXT: s_cbranch_execz BB1_2 255; GFX9-NEXT: ; %bb.1: 256; GFX9-NEXT: s_bcnt1_i32_b64 s1, s[6:7] 257; GFX9-NEXT: s_waitcnt lgkmcnt(0) 258; GFX9-NEXT: s_mul_i32 s1, s0, s1 259; GFX9-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 260; GFX9-NEXT: v_mov_b32_e32 v2, s1 261; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 262; GFX9-NEXT: ds_add_rtn_u32 v1, v1, v2 263; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 264; GFX9-NEXT: buffer_wbinvl1_vol 265; 
GFX9-NEXT: BB1_2: 266; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 267; GFX9-NEXT: s_waitcnt lgkmcnt(0) 268; GFX9-NEXT: v_mul_lo_u32 v0, s0, v0 269; GFX9-NEXT: v_readfirstlane_b32 s0, v1 270; GFX9-NEXT: s_mov_b32 s7, 0xf000 271; GFX9-NEXT: s_mov_b32 s6, -1 272; GFX9-NEXT: v_add_u32_e32 v0, s0, v0 273; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 274; GFX9-NEXT: s_endpgm 275; 276; GFX1064-LABEL: add_i32_uniform: 277; GFX1064: ; %bb.0: ; %entry 278; GFX1064-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 279; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 280; GFX1064-NEXT: s_load_dword s0, s[0:1], 0x2c 281; GFX1064-NEXT: ; implicit-def: $vgpr1 282; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 283; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s3, v0 284; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 285; GFX1064-NEXT: s_and_saveexec_b64 s[6:7], vcc 286; GFX1064-NEXT: s_cbranch_execz BB1_2 287; GFX1064-NEXT: ; %bb.1: 288; GFX1064-NEXT: s_bcnt1_i32_b64 s1, s[2:3] 289; GFX1064-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 290; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 291; GFX1064-NEXT: s_mul_i32 s1, s0, s1 292; GFX1064-NEXT: v_mov_b32_e32 v2, s1 293; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 294; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 295; GFX1064-NEXT: ds_add_rtn_u32 v1, v1, v2 296; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 297; GFX1064-NEXT: buffer_gl0_inv 298; GFX1064-NEXT: buffer_gl1_inv 299; GFX1064-NEXT: BB1_2: 300; GFX1064-NEXT: v_nop 301; GFX1064-NEXT: s_or_b64 exec, exec, s[6:7] 302; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 303; GFX1064-NEXT: v_mul_lo_u32 v0, s0, v0 304; GFX1064-NEXT: v_readfirstlane_b32 s0, v1 305; GFX1064-NEXT: s_mov_b32 s7, 0x31016000 306; GFX1064-NEXT: s_mov_b32 s6, -1 307; GFX1064-NEXT: v_add_nc_u32_e32 v0, s0, v0 308; GFX1064-NEXT: buffer_store_dword v0, off, s[4:7], 0 309; GFX1064-NEXT: s_endpgm 310; 311; GFX1032-LABEL: add_i32_uniform: 312; GFX1032: ; %bb.0: ; %entry 313; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 314; GFX1032-NEXT: s_load_dword s0, 
s[0:1], 0x2c 315; GFX1032-NEXT: v_cmp_ne_u32_e64 s2, 1, 0 316; GFX1032-NEXT: ; implicit-def: $vcc_hi 317; GFX1032-NEXT: ; implicit-def: $vgpr1 318; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 319; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 320; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo 321; GFX1032-NEXT: s_cbranch_execz BB1_2 322; GFX1032-NEXT: ; %bb.1: 323; GFX1032-NEXT: s_bcnt1_i32_b32 s2, s2 324; GFX1032-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 325; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 326; GFX1032-NEXT: s_mul_i32 s2, s0, s2 327; GFX1032-NEXT: v_mov_b32_e32 v2, s2 328; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 329; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 330; GFX1032-NEXT: ds_add_rtn_u32 v1, v1, v2 331; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 332; GFX1032-NEXT: buffer_gl0_inv 333; GFX1032-NEXT: buffer_gl1_inv 334; GFX1032-NEXT: BB1_2: 335; GFX1032-NEXT: v_nop 336; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s1 337; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 338; GFX1032-NEXT: v_mul_lo_u32 v0, s0, v0 339; GFX1032-NEXT: v_readfirstlane_b32 s0, v1 340; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 341; GFX1032-NEXT: s_mov_b32 s6, -1 342; GFX1032-NEXT: v_add_nc_u32_e32 v0, s0, v0 343; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0 344; GFX1032-NEXT: s_endpgm 345entry: 346 %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %additive acq_rel 347 store i32 %old, i32 addrspace(1)* %out 348 ret void 349} 350 ; Hand-written negative checks kept alongside the autogenerated ones; presumably meant to show that the first run configuration emits no lane-counting code (mbcnt/bcnt) for the varying add below - TODO confirm intent. 351; GFX7LESS-NOT: v_mbcnt_lo_u32_b32 352; GFX7LESS-NOT: v_mbcnt_hi_u32_b32 353; GFX7LESS-NOT: s_bcnt1_i32_b64 359define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out) { 360; 361; 362; GFX7LESS-LABEL: add_i32_varying: 363; GFX7LESS: ; %bb.0: ; %entry 364; GFX7LESS-NEXT: 
s_load_dwordx2 s[0:1], s[0:1], 0x9 365; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 366; GFX7LESS-NEXT: s_mov_b32 m0, -1 367; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 368; GFX7LESS-NEXT: ds_add_rtn_u32 v0, v1, v0 369; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 370; GFX7LESS-NEXT: buffer_wbinvl1 371; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 372; GFX7LESS-NEXT: s_mov_b32 s2, -1 373; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 374; GFX7LESS-NEXT: s_endpgm 375; 376; GFX8-LABEL: add_i32_varying: 377; GFX8: ; %bb.0: ; %entry 378; GFX8-NEXT: v_mov_b32_e32 v2, v0 379; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 380; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 381; GFX8-NEXT: v_mov_b32_e32 v1, 0 382; GFX8-NEXT: s_mov_b64 exec, s[2:3] 383; GFX8-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 384; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 385; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 386; GFX8-NEXT: s_not_b64 exec, exec 387; GFX8-NEXT: v_mov_b32_e32 v2, 0 388; GFX8-NEXT: s_not_b64 exec, exec 389; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 390; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 391; GFX8-NEXT: s_nop 1 392; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 393; GFX8-NEXT: s_nop 1 394; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 395; GFX8-NEXT: s_nop 1 396; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 397; GFX8-NEXT: s_nop 1 398; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 399; GFX8-NEXT: s_nop 1 400; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 401; GFX8-NEXT: v_readlane_b32 s2, v2, 63 402; GFX8-NEXT: s_nop 0 403; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 404; GFX8-NEXT: s_mov_b64 exec, s[4:5] 405; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 406; GFX8-NEXT: ; implicit-def: $vgpr0 407; 
GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 408; GFX8-NEXT: s_cbranch_execz BB2_2 409; GFX8-NEXT: ; %bb.1: 410; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 411; GFX8-NEXT: v_mov_b32_e32 v3, s2 412; GFX8-NEXT: s_mov_b32 m0, -1 413; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 414; GFX8-NEXT: ds_add_rtn_u32 v0, v0, v3 415; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 416; GFX8-NEXT: buffer_wbinvl1_vol 417; GFX8-NEXT: BB2_2: 418; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 419; GFX8-NEXT: v_readfirstlane_b32 s2, v0 420; GFX8-NEXT: v_mov_b32_e32 v0, v1 421; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0 422; GFX8-NEXT: s_mov_b32 s3, 0xf000 423; GFX8-NEXT: s_mov_b32 s2, -1 424; GFX8-NEXT: s_nop 0 425; GFX8-NEXT: s_waitcnt lgkmcnt(0) 426; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 427; GFX8-NEXT: s_endpgm 428; 429; GFX9-LABEL: add_i32_varying: 430; GFX9: ; %bb.0: ; %entry 431; GFX9-NEXT: v_mov_b32_e32 v2, v0 432; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 433; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 434; GFX9-NEXT: v_mov_b32_e32 v1, 0 435; GFX9-NEXT: s_mov_b64 exec, s[2:3] 436; GFX9-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 437; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 438; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 439; GFX9-NEXT: s_not_b64 exec, exec 440; GFX9-NEXT: v_mov_b32_e32 v2, 0 441; GFX9-NEXT: s_not_b64 exec, exec 442; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 443; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 444; GFX9-NEXT: s_nop 1 445; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 446; GFX9-NEXT: s_nop 1 447; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 448; GFX9-NEXT: s_nop 1 449; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 450; GFX9-NEXT: s_nop 1 451; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 452; GFX9-NEXT: s_nop 1 453; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 
row_mask:0xc bank_mask:0xf 454; GFX9-NEXT: v_readlane_b32 s2, v2, 63 455; GFX9-NEXT: s_nop 0 456; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 457; GFX9-NEXT: s_mov_b64 exec, s[4:5] 458; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 459; GFX9-NEXT: ; implicit-def: $vgpr0 460; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 461; GFX9-NEXT: s_cbranch_execz BB2_2 462; GFX9-NEXT: ; %bb.1: 463; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 464; GFX9-NEXT: v_mov_b32_e32 v3, s2 465; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 466; GFX9-NEXT: ds_add_rtn_u32 v0, v0, v3 467; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 468; GFX9-NEXT: buffer_wbinvl1_vol 469; GFX9-NEXT: BB2_2: 470; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 471; GFX9-NEXT: v_readfirstlane_b32 s2, v0 472; GFX9-NEXT: v_mov_b32_e32 v0, v1 473; GFX9-NEXT: v_add_u32_e32 v0, s2, v0 474; GFX9-NEXT: s_mov_b32 s3, 0xf000 475; GFX9-NEXT: s_mov_b32 s2, -1 476; GFX9-NEXT: s_nop 0 477; GFX9-NEXT: s_waitcnt lgkmcnt(0) 478; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 479; GFX9-NEXT: s_endpgm 480; 481; GFX1064-LABEL: add_i32_varying: 482; GFX1064: ; %bb.0: ; %entry 483; GFX1064-NEXT: v_mov_b32_e32 v2, v0 484; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 485; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 486; GFX1064-NEXT: v_mov_b32_e32 v1, 0 487; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 488; GFX1064-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 489; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 490; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s3, v0 491; GFX1064-NEXT: s_not_b64 exec, exec 492; GFX1064-NEXT: v_mov_b32_e32 v2, 0 493; GFX1064-NEXT: s_not_b64 exec, exec 494; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 495; GFX1064-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 496; GFX1064-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 497; GFX1064-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 498; GFX1064-NEXT: 
v_add_nc_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 499; GFX1064-NEXT: v_mov_b32_e32 v3, v2 500; GFX1064-NEXT: v_permlanex16_b32 v3, v3, -1, -1 501; GFX1064-NEXT: v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 502; GFX1064-NEXT: v_readlane_b32 s2, v2, 31 503; GFX1064-NEXT: v_mov_b32_e32 v3, s2 504; GFX1064-NEXT: v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 505; GFX1064-NEXT: v_readlane_b32 s2, v2, 15 506; GFX1064-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf 507; GFX1064-NEXT: v_readlane_b32 s3, v2, 31 508; GFX1064-NEXT: v_readlane_b32 s6, v2, 47 509; GFX1064-NEXT: v_writelane_b32 v1, s2, 16 510; GFX1064-NEXT: s_mov_b32 s2, -1 511; GFX1064-NEXT: v_writelane_b32 v1, s3, 32 512; GFX1064-NEXT: v_readlane_b32 s3, v2, 63 513; GFX1064-NEXT: v_writelane_b32 v1, s6, 48 514; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 515; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 516; GFX1064-NEXT: ; implicit-def: $vgpr0 517; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 518; GFX1064-NEXT: s_cbranch_execz BB2_2 519; GFX1064-NEXT: ; %bb.1: 520; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 521; GFX1064-NEXT: v_mov_b32_e32 v7, s3 522; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 523; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 524; GFX1064-NEXT: ds_add_rtn_u32 v0, v0, v7 525; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 526; GFX1064-NEXT: buffer_gl0_inv 527; GFX1064-NEXT: buffer_gl1_inv 528; GFX1064-NEXT: BB2_2: 529; GFX1064-NEXT: v_nop 530; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 531; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 532; GFX1064-NEXT: v_mov_b32_e32 v0, v1 533; GFX1064-NEXT: v_add_nc_u32_e32 v0, s3, v0 534; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 535; GFX1064-NEXT: s_nop 1 536; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 537; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 538; GFX1064-NEXT: s_endpgm 539; 540; GFX1032-LABEL: add_i32_varying: 541; GFX1032: ; %bb.0: ; %entry 542; GFX1032-NEXT: ; 
implicit-def: $vcc_hi 543; GFX1032-NEXT: v_mov_b32_e32 v2, v0 544; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 545; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 546; GFX1032-NEXT: v_mov_b32_e32 v1, 0 547; GFX1032-NEXT: s_mov_b32 exec_lo, s2 548; GFX1032-NEXT: v_cmp_ne_u32_e64 s2, 1, 0 549; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 550; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 551; GFX1032-NEXT: v_mov_b32_e32 v2, 0 552; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 553; GFX1032-NEXT: s_or_saveexec_b32 s4, -1 554; GFX1032-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 555; GFX1032-NEXT: s_mov_b32 s2, -1 556; GFX1032-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 557; GFX1032-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 558; GFX1032-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 559; GFX1032-NEXT: v_mov_b32_e32 v3, v2 560; GFX1032-NEXT: v_permlanex16_b32 v3, v3, -1, -1 561; GFX1032-NEXT: v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 562; GFX1032-NEXT: v_readlane_b32 s3, v2, 31 563; GFX1032-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf 564; GFX1032-NEXT: v_readlane_b32 s5, v2, 15 565; GFX1032-NEXT: v_writelane_b32 v1, s5, 16 566; GFX1032-NEXT: s_mov_b32 exec_lo, s4 567; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 568; GFX1032-NEXT: ; implicit-def: $vgpr0 569; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo 570; GFX1032-NEXT: s_cbranch_execz BB2_2 571; GFX1032-NEXT: ; %bb.1: 572; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 573; GFX1032-NEXT: v_mov_b32_e32 v7, s3 574; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 575; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 576; GFX1032-NEXT: ds_add_rtn_u32 v0, v0, v7 577; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 578; GFX1032-NEXT: buffer_gl0_inv 579; GFX1032-NEXT: buffer_gl1_inv 580; GFX1032-NEXT: BB2_2: 581; GFX1032-NEXT: v_nop 582; 
GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 583; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 584; GFX1032-NEXT: v_mov_b32_e32 v0, v1 585; GFX1032-NEXT: v_add_nc_u32_e32 v0, s3, v0 586; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 587; GFX1032-NEXT: s_nop 1 588; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 589; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 590; GFX1032-NEXT: s_endpgm 591entry: 592 %lane = call i32 @llvm.amdgcn.workitem.id.x() 593 %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %lane acq_rel 594 store i32 %old, i32 addrspace(1)* %out 595 ret void 596} 597 598define amdgpu_kernel void @add_i32_varying_gfx1032(i32 addrspace(1)* %out) { 599; 600; 601; GFX7LESS-LABEL: add_i32_varying_gfx1032: 602; GFX7LESS: ; %bb.0: ; %entry 603; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 604; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 605; GFX7LESS-NEXT: s_mov_b32 m0, -1 606; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 607; GFX7LESS-NEXT: ds_add_rtn_u32 v0, v1, v0 608; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 609; GFX7LESS-NEXT: buffer_wbinvl1 610; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 611; GFX7LESS-NEXT: s_mov_b32 s2, -1 612; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 613; GFX7LESS-NEXT: s_endpgm 614; 615; GFX8-LABEL: add_i32_varying_gfx1032: 616; GFX8: ; %bb.0: ; %entry 617; GFX8-NEXT: v_mov_b32_e32 v2, v0 618; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 619; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 620; GFX8-NEXT: v_mov_b32_e32 v1, 0 621; GFX8-NEXT: s_mov_b64 exec, s[2:3] 622; GFX8-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 623; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 624; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 625; GFX8-NEXT: s_not_b64 exec, exec 626; GFX8-NEXT: v_mov_b32_e32 v2, 0 627; GFX8-NEXT: s_not_b64 exec, exec 628; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 629; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 630; GFX8-NEXT: s_nop 1 631; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 
row_mask:0xf bank_mask:0xf bound_ctrl:0 632; GFX8-NEXT: s_nop 1 633; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 634; GFX8-NEXT: s_nop 1 635; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 636; GFX8-NEXT: s_nop 1 637; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 638; GFX8-NEXT: s_nop 1 639; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 640; GFX8-NEXT: v_readlane_b32 s2, v2, 63 641; GFX8-NEXT: s_nop 0 642; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 643; GFX8-NEXT: s_mov_b64 exec, s[4:5] 644; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 645; GFX8-NEXT: ; implicit-def: $vgpr0 646; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 647; GFX8-NEXT: s_cbranch_execz BB3_2 648; GFX8-NEXT: ; %bb.1: 649; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 650; GFX8-NEXT: v_mov_b32_e32 v3, s2 651; GFX8-NEXT: s_mov_b32 m0, -1 652; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 653; GFX8-NEXT: ds_add_rtn_u32 v0, v0, v3 654; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 655; GFX8-NEXT: buffer_wbinvl1_vol 656; GFX8-NEXT: BB3_2: 657; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 658; GFX8-NEXT: v_readfirstlane_b32 s2, v0 659; GFX8-NEXT: v_mov_b32_e32 v0, v1 660; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0 661; GFX8-NEXT: s_mov_b32 s3, 0xf000 662; GFX8-NEXT: s_mov_b32 s2, -1 663; GFX8-NEXT: s_nop 0 664; GFX8-NEXT: s_waitcnt lgkmcnt(0) 665; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 666; GFX8-NEXT: s_endpgm 667; 668; GFX9-LABEL: add_i32_varying_gfx1032: 669; GFX9: ; %bb.0: ; %entry 670; GFX9-NEXT: v_mov_b32_e32 v2, v0 671; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 672; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 673; GFX9-NEXT: v_mov_b32_e32 v1, 0 674; GFX9-NEXT: s_mov_b64 exec, s[2:3] 675; GFX9-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 676; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 677; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 678; 
GFX9-NEXT: s_not_b64 exec, exec 679; GFX9-NEXT: v_mov_b32_e32 v2, 0 680; GFX9-NEXT: s_not_b64 exec, exec 681; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 682; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 683; GFX9-NEXT: s_nop 1 684; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 685; GFX9-NEXT: s_nop 1 686; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 687; GFX9-NEXT: s_nop 1 688; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 689; GFX9-NEXT: s_nop 1 690; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 691; GFX9-NEXT: s_nop 1 692; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 693; GFX9-NEXT: v_readlane_b32 s2, v2, 63 694; GFX9-NEXT: s_nop 0 695; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 696; GFX9-NEXT: s_mov_b64 exec, s[4:5] 697; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 698; GFX9-NEXT: ; implicit-def: $vgpr0 699; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 700; GFX9-NEXT: s_cbranch_execz BB3_2 701; GFX9-NEXT: ; %bb.1: 702; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 703; GFX9-NEXT: v_mov_b32_e32 v3, s2 704; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 705; GFX9-NEXT: ds_add_rtn_u32 v0, v0, v3 706; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 707; GFX9-NEXT: buffer_wbinvl1_vol 708; GFX9-NEXT: BB3_2: 709; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 710; GFX9-NEXT: v_readfirstlane_b32 s2, v0 711; GFX9-NEXT: v_mov_b32_e32 v0, v1 712; GFX9-NEXT: v_add_u32_e32 v0, s2, v0 713; GFX9-NEXT: s_mov_b32 s3, 0xf000 714; GFX9-NEXT: s_mov_b32 s2, -1 715; GFX9-NEXT: s_nop 0 716; GFX9-NEXT: s_waitcnt lgkmcnt(0) 717; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 718; GFX9-NEXT: s_endpgm 719; 720; GFX1064-LABEL: add_i32_varying_gfx1032: 721; GFX1064: ; %bb.0: ; %entry 722; GFX1064-NEXT: v_mov_b32_e32 v2, v0 723; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 
724; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 725; GFX1064-NEXT: v_mov_b32_e32 v1, 0 726; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 727; GFX1064-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 728; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 729; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s3, v0 730; GFX1064-NEXT: s_not_b64 exec, exec 731; GFX1064-NEXT: v_mov_b32_e32 v2, 0 732; GFX1064-NEXT: s_not_b64 exec, exec 733; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 734; GFX1064-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 735; GFX1064-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 736; GFX1064-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 737; GFX1064-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 738; GFX1064-NEXT: v_mov_b32_e32 v3, v2 739; GFX1064-NEXT: v_permlanex16_b32 v3, v3, -1, -1 740; GFX1064-NEXT: v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 741; GFX1064-NEXT: v_readlane_b32 s2, v2, 31 742; GFX1064-NEXT: v_mov_b32_e32 v3, s2 743; GFX1064-NEXT: v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 744; GFX1064-NEXT: v_readlane_b32 s2, v2, 15 745; GFX1064-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf 746; GFX1064-NEXT: v_readlane_b32 s3, v2, 31 747; GFX1064-NEXT: v_readlane_b32 s6, v2, 47 748; GFX1064-NEXT: v_writelane_b32 v1, s2, 16 749; GFX1064-NEXT: s_mov_b32 s2, -1 750; GFX1064-NEXT: v_writelane_b32 v1, s3, 32 751; GFX1064-NEXT: v_readlane_b32 s3, v2, 63 752; GFX1064-NEXT: v_writelane_b32 v1, s6, 48 753; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 754; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 755; GFX1064-NEXT: ; implicit-def: $vgpr0 756; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 757; GFX1064-NEXT: s_cbranch_execz BB3_2 758; GFX1064-NEXT: ; %bb.1: 759; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 760; GFX1064-NEXT: v_mov_b32_e32 v7, s3 761; 
GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 762; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 763; GFX1064-NEXT: ds_add_rtn_u32 v0, v0, v7 764; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 765; GFX1064-NEXT: buffer_gl0_inv 766; GFX1064-NEXT: buffer_gl1_inv 767; GFX1064-NEXT: BB3_2: 768; GFX1064-NEXT: v_nop 769; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 770; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 771; GFX1064-NEXT: v_mov_b32_e32 v0, v1 772; GFX1064-NEXT: v_add_nc_u32_e32 v0, s3, v0 773; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 774; GFX1064-NEXT: s_nop 1 775; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 776; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 777; GFX1064-NEXT: s_endpgm 778; 779; GFX1032-LABEL: add_i32_varying_gfx1032: 780; GFX1032: ; %bb.0: ; %entry 781; GFX1032-NEXT: ; implicit-def: $vcc_hi 782; GFX1032-NEXT: v_mov_b32_e32 v2, v0 783; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 784; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 785; GFX1032-NEXT: v_mov_b32_e32 v1, 0 786; GFX1032-NEXT: s_mov_b32 exec_lo, s2 787; GFX1032-NEXT: v_cmp_ne_u32_e64 s2, 1, 0 788; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 789; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 790; GFX1032-NEXT: v_mov_b32_e32 v2, 0 791; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 792; GFX1032-NEXT: s_or_saveexec_b32 s4, -1 793; GFX1032-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 794; GFX1032-NEXT: s_mov_b32 s2, -1 795; GFX1032-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 796; GFX1032-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 797; GFX1032-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 798; GFX1032-NEXT: v_mov_b32_e32 v3, v2 799; GFX1032-NEXT: v_permlanex16_b32 v3, v3, -1, -1 800; GFX1032-NEXT: v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 801; GFX1032-NEXT: v_readlane_b32 s3, v2, 31 802; GFX1032-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 
row_mask:0xf bank_mask:0xf 803; GFX1032-NEXT: v_readlane_b32 s5, v2, 15 804; GFX1032-NEXT: v_writelane_b32 v1, s5, 16 805; GFX1032-NEXT: s_mov_b32 exec_lo, s4 806; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 807; GFX1032-NEXT: ; implicit-def: $vgpr0 808; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo 809; GFX1032-NEXT: s_cbranch_execz BB3_2 810; GFX1032-NEXT: ; %bb.1: 811; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 812; GFX1032-NEXT: v_mov_b32_e32 v7, s3 813; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 814; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 815; GFX1032-NEXT: ds_add_rtn_u32 v0, v0, v7 816; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 817; GFX1032-NEXT: buffer_gl0_inv 818; GFX1032-NEXT: buffer_gl1_inv 819; GFX1032-NEXT: BB3_2: 820; GFX1032-NEXT: v_nop 821; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 822; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 823; GFX1032-NEXT: v_mov_b32_e32 v0, v1 824; GFX1032-NEXT: v_add_nc_u32_e32 v0, s3, v0 825; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 826; GFX1032-NEXT: s_nop 1 827; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 828; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 829; GFX1032-NEXT: s_endpgm 830entry: 831 %lane = call i32 @llvm.amdgcn.workitem.id.x() 832 %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %lane acq_rel 833 store i32 %old, i32 addrspace(1)* %out 834 ret void 835} 836 837define amdgpu_kernel void @add_i32_varying_gfx1064(i32 addrspace(1)* %out) { 838; 839; 840; GFX7LESS-LABEL: add_i32_varying_gfx1064: 841; GFX7LESS: ; %bb.0: ; %entry 842; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 843; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 844; GFX7LESS-NEXT: s_mov_b32 m0, -1 845; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 846; GFX7LESS-NEXT: ds_add_rtn_u32 v0, v1, v0 847; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 848; GFX7LESS-NEXT: buffer_wbinvl1 849; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 850; GFX7LESS-NEXT: s_mov_b32 s2, -1 851; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 852; 
GFX7LESS-NEXT: s_endpgm 853; 854; GFX8-LABEL: add_i32_varying_gfx1064: 855; GFX8: ; %bb.0: ; %entry 856; GFX8-NEXT: v_mov_b32_e32 v2, v0 857; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 858; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 859; GFX8-NEXT: v_mov_b32_e32 v1, 0 860; GFX8-NEXT: s_mov_b64 exec, s[2:3] 861; GFX8-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 862; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 863; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 864; GFX8-NEXT: s_not_b64 exec, exec 865; GFX8-NEXT: v_mov_b32_e32 v2, 0 866; GFX8-NEXT: s_not_b64 exec, exec 867; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 868; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 869; GFX8-NEXT: s_nop 1 870; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 871; GFX8-NEXT: s_nop 1 872; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 873; GFX8-NEXT: s_nop 1 874; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 875; GFX8-NEXT: s_nop 1 876; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 877; GFX8-NEXT: s_nop 1 878; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 879; GFX8-NEXT: v_readlane_b32 s2, v2, 63 880; GFX8-NEXT: s_nop 0 881; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 882; GFX8-NEXT: s_mov_b64 exec, s[4:5] 883; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 884; GFX8-NEXT: ; implicit-def: $vgpr0 885; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 886; GFX8-NEXT: s_cbranch_execz BB4_2 887; GFX8-NEXT: ; %bb.1: 888; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 889; GFX8-NEXT: v_mov_b32_e32 v3, s2 890; GFX8-NEXT: s_mov_b32 m0, -1 891; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 892; GFX8-NEXT: ds_add_rtn_u32 v0, v0, v3 893; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 894; GFX8-NEXT: buffer_wbinvl1_vol 895; GFX8-NEXT: BB4_2: 896; GFX8-NEXT: s_or_b64 exec, 
exec, s[4:5] 897; GFX8-NEXT: v_readfirstlane_b32 s2, v0 898; GFX8-NEXT: v_mov_b32_e32 v0, v1 899; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0 900; GFX8-NEXT: s_mov_b32 s3, 0xf000 901; GFX8-NEXT: s_mov_b32 s2, -1 902; GFX8-NEXT: s_nop 0 903; GFX8-NEXT: s_waitcnt lgkmcnt(0) 904; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 905; GFX8-NEXT: s_endpgm 906; 907; GFX9-LABEL: add_i32_varying_gfx1064: 908; GFX9: ; %bb.0: ; %entry 909; GFX9-NEXT: v_mov_b32_e32 v2, v0 910; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 911; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 912; GFX9-NEXT: v_mov_b32_e32 v1, 0 913; GFX9-NEXT: s_mov_b64 exec, s[2:3] 914; GFX9-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 915; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 916; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 917; GFX9-NEXT: s_not_b64 exec, exec 918; GFX9-NEXT: v_mov_b32_e32 v2, 0 919; GFX9-NEXT: s_not_b64 exec, exec 920; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 921; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 922; GFX9-NEXT: s_nop 1 923; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 924; GFX9-NEXT: s_nop 1 925; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 926; GFX9-NEXT: s_nop 1 927; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 928; GFX9-NEXT: s_nop 1 929; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 930; GFX9-NEXT: s_nop 1 931; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 932; GFX9-NEXT: v_readlane_b32 s2, v2, 63 933; GFX9-NEXT: s_nop 0 934; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 935; GFX9-NEXT: s_mov_b64 exec, s[4:5] 936; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 937; GFX9-NEXT: ; implicit-def: $vgpr0 938; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 939; GFX9-NEXT: s_cbranch_execz BB4_2 940; GFX9-NEXT: ; %bb.1: 941; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 
942; GFX9-NEXT: v_mov_b32_e32 v3, s2 943; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 944; GFX9-NEXT: ds_add_rtn_u32 v0, v0, v3 945; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 946; GFX9-NEXT: buffer_wbinvl1_vol 947; GFX9-NEXT: BB4_2: 948; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 949; GFX9-NEXT: v_readfirstlane_b32 s2, v0 950; GFX9-NEXT: v_mov_b32_e32 v0, v1 951; GFX9-NEXT: v_add_u32_e32 v0, s2, v0 952; GFX9-NEXT: s_mov_b32 s3, 0xf000 953; GFX9-NEXT: s_mov_b32 s2, -1 954; GFX9-NEXT: s_nop 0 955; GFX9-NEXT: s_waitcnt lgkmcnt(0) 956; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 957; GFX9-NEXT: s_endpgm 958; 959; GFX1064-LABEL: add_i32_varying_gfx1064: 960; GFX1064: ; %bb.0: ; %entry 961; GFX1064-NEXT: v_mov_b32_e32 v2, v0 962; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 963; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 964; GFX1064-NEXT: v_mov_b32_e32 v1, 0 965; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 966; GFX1064-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 967; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 968; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s3, v0 969; GFX1064-NEXT: s_not_b64 exec, exec 970; GFX1064-NEXT: v_mov_b32_e32 v2, 0 971; GFX1064-NEXT: s_not_b64 exec, exec 972; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 973; GFX1064-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 974; GFX1064-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 975; GFX1064-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 976; GFX1064-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 977; GFX1064-NEXT: v_mov_b32_e32 v3, v2 978; GFX1064-NEXT: v_permlanex16_b32 v3, v3, -1, -1 979; GFX1064-NEXT: v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 980; GFX1064-NEXT: v_readlane_b32 s2, v2, 31 981; GFX1064-NEXT: v_mov_b32_e32 v3, s2 982; GFX1064-NEXT: v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 983; 
GFX1064-NEXT: v_readlane_b32 s2, v2, 15 984; GFX1064-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf 985; GFX1064-NEXT: v_readlane_b32 s3, v2, 31 986; GFX1064-NEXT: v_readlane_b32 s6, v2, 47 987; GFX1064-NEXT: v_writelane_b32 v1, s2, 16 988; GFX1064-NEXT: s_mov_b32 s2, -1 989; GFX1064-NEXT: v_writelane_b32 v1, s3, 32 990; GFX1064-NEXT: v_readlane_b32 s3, v2, 63 991; GFX1064-NEXT: v_writelane_b32 v1, s6, 48 992; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 993; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 994; GFX1064-NEXT: ; implicit-def: $vgpr0 995; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 996; GFX1064-NEXT: s_cbranch_execz BB4_2 997; GFX1064-NEXT: ; %bb.1: 998; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 999; GFX1064-NEXT: v_mov_b32_e32 v7, s3 1000; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1001; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 1002; GFX1064-NEXT: ds_add_rtn_u32 v0, v0, v7 1003; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1004; GFX1064-NEXT: buffer_gl0_inv 1005; GFX1064-NEXT: buffer_gl1_inv 1006; GFX1064-NEXT: BB4_2: 1007; GFX1064-NEXT: v_nop 1008; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 1009; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 1010; GFX1064-NEXT: v_mov_b32_e32 v0, v1 1011; GFX1064-NEXT: v_add_nc_u32_e32 v0, s3, v0 1012; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 1013; GFX1064-NEXT: s_nop 1 1014; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1015; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 1016; GFX1064-NEXT: s_endpgm 1017; 1018; GFX1032-LABEL: add_i32_varying_gfx1064: 1019; GFX1032: ; %bb.0: ; %entry 1020; GFX1032-NEXT: ; implicit-def: $vcc_hi 1021; GFX1032-NEXT: v_mov_b32_e32 v2, v0 1022; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 1023; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1024; GFX1032-NEXT: v_mov_b32_e32 v1, 0 1025; GFX1032-NEXT: s_mov_b32 exec_lo, s2 1026; GFX1032-NEXT: v_cmp_ne_u32_e64 s2, 1, 0 1027; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 1028; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 1029; GFX1032-NEXT: 
v_mov_b32_e32 v2, 0 1030; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 1031; GFX1032-NEXT: s_or_saveexec_b32 s4, -1 1032; GFX1032-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 1033; GFX1032-NEXT: s_mov_b32 s2, -1 1034; GFX1032-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 1035; GFX1032-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 1036; GFX1032-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 1037; GFX1032-NEXT: v_mov_b32_e32 v3, v2 1038; GFX1032-NEXT: v_permlanex16_b32 v3, v3, -1, -1 1039; GFX1032-NEXT: v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 1040; GFX1032-NEXT: v_readlane_b32 s3, v2, 31 1041; GFX1032-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf 1042; GFX1032-NEXT: v_readlane_b32 s5, v2, 15 1043; GFX1032-NEXT: v_writelane_b32 v1, s5, 16 1044; GFX1032-NEXT: s_mov_b32 exec_lo, s4 1045; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 1046; GFX1032-NEXT: ; implicit-def: $vgpr0 1047; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo 1048; GFX1032-NEXT: s_cbranch_execz BB4_2 1049; GFX1032-NEXT: ; %bb.1: 1050; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 1051; GFX1032-NEXT: v_mov_b32_e32 v7, s3 1052; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1053; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 1054; GFX1032-NEXT: ds_add_rtn_u32 v0, v0, v7 1055; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1056; GFX1032-NEXT: buffer_gl0_inv 1057; GFX1032-NEXT: buffer_gl1_inv 1058; GFX1032-NEXT: BB4_2: 1059; GFX1032-NEXT: v_nop 1060; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 1061; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 1062; GFX1032-NEXT: v_mov_b32_e32 v0, v1 1063; GFX1032-NEXT: v_add_nc_u32_e32 v0, s3, v0 1064; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 1065; GFX1032-NEXT: s_nop 1 1066; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1067; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 1068; GFX1032-NEXT: 
s_endpgm 1069entry: 1070 %lane = call i32 @llvm.amdgcn.workitem.id.x() 1071 %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %lane acq_rel 1072 store i32 %old, i32 addrspace(1)* %out 1073 ret void 1074} 1075 1076define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) { 1077; 1078; 1079; GFX7LESS-LABEL: add_i64_constant: 1080; GFX7LESS: ; %bb.0: ; %entry 1081; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1082; GFX7LESS-NEXT: v_cmp_ne_u32_e64 s[4:5], 1, 0 1083; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 1084; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 1085; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1086; GFX7LESS-NEXT: ; implicit-def: $vgpr1_vgpr2 1087; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc 1088; GFX7LESS-NEXT: s_cbranch_execz BB5_2 1089; GFX7LESS-NEXT: ; %bb.1: 1090; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 1091; GFX7LESS-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 1092; GFX7LESS-NEXT: v_mul_hi_u32_u24_e64 v2, s4, 5 1093; GFX7LESS-NEXT: v_mul_u32_u24_e64 v1, s4, 5 1094; GFX7LESS-NEXT: s_mov_b32 m0, -1 1095; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1096; GFX7LESS-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] 1097; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1098; GFX7LESS-NEXT: buffer_wbinvl1 1099; GFX7LESS-NEXT: BB5_2: 1100; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] 1101; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v1 1102; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v2 1103; GFX7LESS-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v0 1104; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1105; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 1106; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4 1107; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s2, v0 1108; GFX7LESS-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc 1109; GFX7LESS-NEXT: s_mov_b32 s2, -1 1110; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1111; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1112; GFX7LESS-NEXT: s_endpgm 1113; 1114; GFX8-LABEL: add_i64_constant: 1115; GFX8: ; %bb.0: ; %entry 1116; GFX8-NEXT: 
s_load_dwordx2 s[0:1], s[0:1], 0x24 1117; GFX8-NEXT: v_cmp_ne_u32_e64 s[4:5], 1, 0 1118; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 1119; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 1120; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1121; GFX8-NEXT: ; implicit-def: $vgpr1_vgpr2 1122; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 1123; GFX8-NEXT: s_cbranch_execz BB5_2 1124; GFX8-NEXT: ; %bb.1: 1125; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 1126; GFX8-NEXT: v_mul_hi_u32_u24_e64 v2, s4, 5 1127; GFX8-NEXT: v_mul_u32_u24_e64 v1, s4, 5 1128; GFX8-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 1129; GFX8-NEXT: s_mov_b32 m0, -1 1130; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1131; GFX8-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] 1132; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1133; GFX8-NEXT: buffer_wbinvl1_vol 1134; GFX8-NEXT: BB5_2: 1135; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 1136; GFX8-NEXT: v_readfirstlane_b32 s2, v1 1137; GFX8-NEXT: v_readfirstlane_b32 s3, v2 1138; GFX8-NEXT: v_mov_b32_e32 v1, s2 1139; GFX8-NEXT: v_mov_b32_e32 v2, s3 1140; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v0, 5, v[1:2] 1141; GFX8-NEXT: s_mov_b32 s3, 0xf000 1142; GFX8-NEXT: s_mov_b32 s2, -1 1143; GFX8-NEXT: s_nop 2 1144; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1145; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1146; GFX8-NEXT: s_endpgm 1147; 1148; GFX9-LABEL: add_i64_constant: 1149; GFX9: ; %bb.0: ; %entry 1150; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1151; GFX9-NEXT: v_cmp_ne_u32_e64 s[4:5], 1, 0 1152; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 1153; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 1154; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1155; GFX9-NEXT: ; implicit-def: $vgpr1_vgpr2 1156; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 1157; GFX9-NEXT: s_cbranch_execz BB5_2 1158; GFX9-NEXT: ; %bb.1: 1159; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 1160; GFX9-NEXT: v_mul_hi_u32_u24_e64 v2, s4, 5 1161; GFX9-NEXT: v_mul_u32_u24_e64 v1, s4, 5 1162; GFX9-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 1163; GFX9-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) 1164; GFX9-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] 1165; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1166; GFX9-NEXT: buffer_wbinvl1_vol 1167; GFX9-NEXT: BB5_2: 1168; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 1169; GFX9-NEXT: v_readfirstlane_b32 s2, v1 1170; GFX9-NEXT: v_readfirstlane_b32 s3, v2 1171; GFX9-NEXT: v_mov_b32_e32 v1, s2 1172; GFX9-NEXT: v_mov_b32_e32 v2, s3 1173; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v0, 5, v[1:2] 1174; GFX9-NEXT: s_mov_b32 s3, 0xf000 1175; GFX9-NEXT: s_mov_b32 s2, -1 1176; GFX9-NEXT: s_nop 2 1177; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1178; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1179; GFX9-NEXT: s_endpgm 1180; 1181; GFX1064-LABEL: add_i64_constant: 1182; GFX1064: ; %bb.0: ; %entry 1183; GFX1064-NEXT: v_cmp_ne_u32_e64 s[4:5], 1, 0 1184; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1185; GFX1064-NEXT: ; implicit-def: $vgpr1_vgpr2 1186; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 1187; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s5, v0 1188; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1189; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc 1190; GFX1064-NEXT: s_cbranch_execz BB5_2 1191; GFX1064-NEXT: ; %bb.1: 1192; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 1193; GFX1064-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 1194; GFX1064-NEXT: v_mul_hi_u32_u24_e64 v2, s4, 5 1195; GFX1064-NEXT: v_mul_u32_u24_e64 v1, s4, 5 1196; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1197; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 1198; GFX1064-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] 1199; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1200; GFX1064-NEXT: buffer_gl0_inv 1201; GFX1064-NEXT: buffer_gl1_inv 1202; GFX1064-NEXT: BB5_2: 1203; GFX1064-NEXT: v_nop 1204; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] 1205; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 1206; GFX1064-NEXT: v_readfirstlane_b32 s3, v2 1207; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v0, 5, s[2:3] 1208; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 1209; GFX1064-NEXT: s_mov_b32 s2, -1 
1210; GFX1064-NEXT: s_nop 2 1211; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1212; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1213; GFX1064-NEXT: s_endpgm 1214; 1215; GFX1032-LABEL: add_i64_constant: 1216; GFX1032: ; %bb.0: ; %entry 1217; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1218; GFX1032-NEXT: v_cmp_ne_u32_e64 s3, 1, 0 1219; GFX1032-NEXT: ; implicit-def: $vcc_hi 1220; GFX1032-NEXT: ; implicit-def: $vgpr1_vgpr2 1221; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s3, 0 1222; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 1223; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo 1224; GFX1032-NEXT: s_cbranch_execz BB5_2 1225; GFX1032-NEXT: ; %bb.1: 1226; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 1227; GFX1032-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 1228; GFX1032-NEXT: v_mul_hi_u32_u24_e64 v2, s3, 5 1229; GFX1032-NEXT: v_mul_u32_u24_e64 v1, s3, 5 1230; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1231; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 1232; GFX1032-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] 1233; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1234; GFX1032-NEXT: buffer_gl0_inv 1235; GFX1032-NEXT: buffer_gl1_inv 1236; GFX1032-NEXT: BB5_2: 1237; GFX1032-NEXT: v_nop 1238; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 1239; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 1240; GFX1032-NEXT: v_readfirstlane_b32 s3, v2 1241; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s2, v0, 5, s[2:3] 1242; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 1243; GFX1032-NEXT: s_mov_b32 s2, -1 1244; GFX1032-NEXT: s_nop 2 1245; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1246; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1247; GFX1032-NEXT: s_endpgm 1248entry: 1249 %old = atomicrmw add i64 addrspace(3)* @local_var64, i64 5 acq_rel 1250 store i64 %old, i64 addrspace(1)* %out 1251 ret void 1252} 1253 1254define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 %additive) { 1255; 1256; 1257; GFX7LESS-LABEL: add_i64_uniform: 1258; GFX7LESS: ; %bb.0: ; %entry 1259; GFX7LESS-NEXT: s_load_dwordx4 
s[0:3], s[0:1], 0x9 1260; GFX7LESS-NEXT: v_cmp_ne_u32_e64 s[6:7], 1, 0 1261; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 1262; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s7, v0 1263; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1264; GFX7LESS-NEXT: ; implicit-def: $vgpr1_vgpr2 1265; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc 1266; GFX7LESS-NEXT: s_cbranch_execz BB6_2 1267; GFX7LESS-NEXT: ; %bb.1: 1268; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 1269; GFX7LESS-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 1270; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1271; GFX7LESS-NEXT: s_mul_i32 s7, s3, s6 1272; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6 1273; GFX7LESS-NEXT: v_mul_hi_u32 v1, s2, v1 1274; GFX7LESS-NEXT: s_mul_i32 s6, s2, s6 1275; GFX7LESS-NEXT: v_add_i32_e32 v2, vcc, s7, v1 1276; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6 1277; GFX7LESS-NEXT: s_mov_b32 m0, -1 1278; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1279; GFX7LESS-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] 1280; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1281; GFX7LESS-NEXT: buffer_wbinvl1 1282; GFX7LESS-NEXT: BB6_2: 1283; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] 1284; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 1285; GFX7LESS-NEXT: s_mov_b32 s6, -1 1286; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1287; GFX7LESS-NEXT: s_mov_b32 s4, s0 1288; GFX7LESS-NEXT: s_mov_b32 s5, s1 1289; GFX7LESS-NEXT: v_readfirstlane_b32 s0, v1 1290; GFX7LESS-NEXT: v_readfirstlane_b32 s1, v2 1291; GFX7LESS-NEXT: v_mul_lo_u32 v1, s3, v0 1292; GFX7LESS-NEXT: v_mul_hi_u32 v2, s2, v0 1293; GFX7LESS-NEXT: v_mul_lo_u32 v0, s2, v0 1294; GFX7LESS-NEXT: v_add_i32_e32 v1, vcc, v2, v1 1295; GFX7LESS-NEXT: v_mov_b32_e32 v2, s1 1296; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s0, v0 1297; GFX7LESS-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc 1298; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1299; GFX7LESS-NEXT: s_endpgm 1300; 1301; GFX8-LABEL: add_i64_uniform: 1302; GFX8: ; %bb.0: ; %entry 1303; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1304; GFX8-NEXT: 
v_cmp_ne_u32_e64 s[6:7], 1, 0 1305; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 1306; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 1307; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1308; GFX8-NEXT: ; implicit-def: $vgpr1_vgpr2 1309; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 1310; GFX8-NEXT: s_cbranch_execz BB6_2 1311; GFX8-NEXT: ; %bb.1: 1312; GFX8-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 1313; GFX8-NEXT: v_mov_b32_e32 v1, s6 1314; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1315; GFX8-NEXT: v_mul_hi_u32 v1, s2, v1 1316; GFX8-NEXT: s_mul_i32 s7, s3, s6 1317; GFX8-NEXT: s_mul_i32 s6, s2, s6 1318; GFX8-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 1319; GFX8-NEXT: v_add_u32_e32 v2, vcc, s7, v1 1320; GFX8-NEXT: v_mov_b32_e32 v1, s6 1321; GFX8-NEXT: s_mov_b32 m0, -1 1322; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1323; GFX8-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] 1324; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1325; GFX8-NEXT: buffer_wbinvl1_vol 1326; GFX8-NEXT: BB6_2: 1327; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 1328; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1329; GFX8-NEXT: s_mov_b32 s4, s0 1330; GFX8-NEXT: v_readfirstlane_b32 s0, v1 1331; GFX8-NEXT: v_mul_lo_u32 v1, s3, v0 1332; GFX8-NEXT: v_mul_hi_u32 v3, s2, v0 1333; GFX8-NEXT: v_mul_lo_u32 v0, s2, v0 1334; GFX8-NEXT: s_mov_b32 s5, s1 1335; GFX8-NEXT: v_readfirstlane_b32 s1, v2 1336; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 1337; GFX8-NEXT: v_mov_b32_e32 v2, s1 1338; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 1339; GFX8-NEXT: s_mov_b32 s7, 0xf000 1340; GFX8-NEXT: s_mov_b32 s6, -1 1341; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc 1342; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1343; GFX8-NEXT: s_endpgm 1344; 1345; GFX9-LABEL: add_i64_uniform: 1346; GFX9: ; %bb.0: ; %entry 1347; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1348; GFX9-NEXT: v_cmp_ne_u32_e64 s[6:7], 1, 0 1349; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 1350; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 1351; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1352; GFX9-NEXT: ; implicit-def: 
$vgpr1_vgpr2 1353; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 1354; GFX9-NEXT: s_cbranch_execz BB6_2 1355; GFX9-NEXT: ; %bb.1: 1356; GFX9-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 1357; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1358; GFX9-NEXT: s_mul_i32 s7, s3, s6 1359; GFX9-NEXT: s_mul_hi_u32 s8, s2, s6 1360; GFX9-NEXT: s_add_i32 s8, s8, s7 1361; GFX9-NEXT: s_mul_i32 s6, s2, s6 1362; GFX9-NEXT: v_mov_b32_e32 v1, s6 1363; GFX9-NEXT: v_mov_b32_e32 v2, s8 1364; GFX9-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 1365; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1366; GFX9-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] 1367; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1368; GFX9-NEXT: buffer_wbinvl1_vol 1369; GFX9-NEXT: BB6_2: 1370; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 1371; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1372; GFX9-NEXT: v_mul_lo_u32 v3, s3, v0 1373; GFX9-NEXT: v_mul_hi_u32 v4, s2, v0 1374; GFX9-NEXT: v_mul_lo_u32 v0, s2, v0 1375; GFX9-NEXT: s_mov_b32 s4, s0 1376; GFX9-NEXT: v_readfirstlane_b32 s0, v1 1377; GFX9-NEXT: s_mov_b32 s5, s1 1378; GFX9-NEXT: v_readfirstlane_b32 s1, v2 1379; GFX9-NEXT: v_add_u32_e32 v1, v4, v3 1380; GFX9-NEXT: v_mov_b32_e32 v2, s1 1381; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 1382; GFX9-NEXT: s_mov_b32 s7, 0xf000 1383; GFX9-NEXT: s_mov_b32 s6, -1 1384; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc 1385; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1386; GFX9-NEXT: s_endpgm 1387; 1388; GFX1064-LABEL: add_i64_uniform: 1389; GFX1064: ; %bb.0: ; %entry 1390; GFX1064-NEXT: v_cmp_ne_u32_e64 s[6:7], 1, 0 1391; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1392; GFX1064-NEXT: ; implicit-def: $vgpr1_vgpr2 1393; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 1394; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s7, v0 1395; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1396; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 1397; GFX1064-NEXT: s_cbranch_execz BB6_2 1398; GFX1064-NEXT: ; %bb.1: 1399; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 1400; GFX1064-NEXT: v_mov_b32_e32 
v3, local_var64@abs32@lo 1401; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1402; GFX1064-NEXT: s_mul_i32 s7, s3, s6 1403; GFX1064-NEXT: s_mul_hi_u32 s8, s2, s6 1404; GFX1064-NEXT: s_mul_i32 s6, s2, s6 1405; GFX1064-NEXT: s_add_i32 s8, s8, s7 1406; GFX1064-NEXT: v_mov_b32_e32 v1, s6 1407; GFX1064-NEXT: v_mov_b32_e32 v2, s8 1408; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1409; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 1410; GFX1064-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] 1411; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1412; GFX1064-NEXT: buffer_gl0_inv 1413; GFX1064-NEXT: buffer_gl1_inv 1414; GFX1064-NEXT: BB6_2: 1415; GFX1064-NEXT: v_nop 1416; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 1417; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1418; GFX1064-NEXT: v_mul_lo_u32 v3, s3, v0 1419; GFX1064-NEXT: v_mul_hi_u32 v4, s2, v0 1420; GFX1064-NEXT: v_mul_lo_u32 v0, s2, v0 1421; GFX1064-NEXT: v_readfirstlane_b32 s4, v1 1422; GFX1064-NEXT: v_readfirstlane_b32 s5, v2 1423; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 1424; GFX1064-NEXT: s_mov_b32 s2, -1 1425; GFX1064-NEXT: v_add_nc_u32_e32 v1, v4, v3 1426; GFX1064-NEXT: v_add_co_u32_e64 v0, vcc, s4, v0 1427; GFX1064-NEXT: v_add_co_ci_u32_e32 v1, vcc, s5, v1, vcc 1428; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1429; GFX1064-NEXT: s_endpgm 1430; 1431; GFX1032-LABEL: add_i64_uniform: 1432; GFX1032: ; %bb.0: ; %entry 1433; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1434; GFX1032-NEXT: v_cmp_ne_u32_e64 s5, 1, 0 1435; GFX1032-NEXT: ; implicit-def: $vcc_hi 1436; GFX1032-NEXT: ; implicit-def: $vgpr1_vgpr2 1437; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s5, 0 1438; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 1439; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo 1440; GFX1032-NEXT: s_cbranch_execz BB6_2 1441; GFX1032-NEXT: ; %bb.1: 1442; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s5 1443; GFX1032-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 1444; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1445; GFX1032-NEXT: s_mul_i32 s6, s3, s5 1446; GFX1032-NEXT: 
s_mul_hi_u32 s7, s2, s5 1447; GFX1032-NEXT: s_mul_i32 s5, s2, s5 1448; GFX1032-NEXT: s_add_i32 s7, s7, s6 1449; GFX1032-NEXT: v_mov_b32_e32 v1, s5 1450; GFX1032-NEXT: v_mov_b32_e32 v2, s7 1451; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1452; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 1453; GFX1032-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] 1454; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1455; GFX1032-NEXT: buffer_gl0_inv 1456; GFX1032-NEXT: buffer_gl1_inv 1457; GFX1032-NEXT: BB6_2: 1458; GFX1032-NEXT: v_nop 1459; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 1460; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1461; GFX1032-NEXT: v_mul_lo_u32 v3, s3, v0 1462; GFX1032-NEXT: v_mul_hi_u32 v4, s2, v0 1463; GFX1032-NEXT: v_mul_lo_u32 v0, s2, v0 1464; GFX1032-NEXT: v_readfirstlane_b32 s4, v1 1465; GFX1032-NEXT: v_readfirstlane_b32 s5, v2 1466; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 1467; GFX1032-NEXT: s_mov_b32 s2, -1 1468; GFX1032-NEXT: v_add_nc_u32_e32 v1, v4, v3 1469; GFX1032-NEXT: v_add_co_u32_e64 v0, vcc_lo, s4, v0 1470; GFX1032-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s5, v1, vcc_lo 1471; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1472; GFX1032-NEXT: s_endpgm 1473entry: 1474 %old = atomicrmw add i64 addrspace(3)* @local_var64, i64 %additive acq_rel 1475 store i64 %old, i64 addrspace(1)* %out 1476 ret void 1477} 1478 1479; The 64-bit varying add below is expected to stay unoptimized: its output must not contain 1480; v_mbcnt_lo_u32_b32, v_mbcnt_hi_u32_b32 or s_bcnt1_i32_b64. The exhaustive -NEXT checks below already 1481; enforce this; the former GCN-NOT directives here were dead because no RUN line enables a GCN check prefix. 1482define amdgpu_kernel void @add_i64_varying(i64 addrspace(1)* %out) { 1483; 1484; 1485; GFX7LESS-LABEL: add_i64_varying: 1486; GFX7LESS: ; %bb.0: ; %entry 1487; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1488; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 1489; GFX7LESS-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 1490; GFX7LESS-NEXT: s_mov_b32 m0, -1 1491; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1492; GFX7LESS-NEXT: ds_add_rtn_u64 v[0:1], v2, v[0:1] 1493; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1494; GFX7LESS-NEXT: buffer_wbinvl1 1495; GFX7LESS-NEXT: 
s_mov_b32 s3, 0xf000 1496; GFX7LESS-NEXT: s_mov_b32 s2, -1 1497; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1498; GFX7LESS-NEXT: s_endpgm 1499; 1500; GFX8-LABEL: add_i64_varying: 1501; GFX8: ; %bb.0: ; %entry 1502; GFX8-NEXT: v_mov_b32_e32 v1, 0 1503; GFX8-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 1504; GFX8-NEXT: s_mov_b32 m0, -1 1505; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1506; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1507; GFX8-NEXT: ds_add_rtn_u64 v[0:1], v2, v[0:1] 1508; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1509; GFX8-NEXT: buffer_wbinvl1_vol 1510; GFX8-NEXT: s_mov_b32 s3, 0xf000 1511; GFX8-NEXT: s_mov_b32 s2, -1 1512; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1513; GFX8-NEXT: s_endpgm 1514; 1515; GFX9-LABEL: add_i64_varying: 1516; GFX9: ; %bb.0: ; %entry 1517; GFX9-NEXT: v_mov_b32_e32 v1, 0 1518; GFX9-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 1519; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1520; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1521; GFX9-NEXT: ds_add_rtn_u64 v[0:1], v2, v[0:1] 1522; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1523; GFX9-NEXT: buffer_wbinvl1_vol 1524; GFX9-NEXT: s_mov_b32 s3, 0xf000 1525; GFX9-NEXT: s_mov_b32 s2, -1 1526; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1527; GFX9-NEXT: s_endpgm 1528; 1529; GFX1064-LABEL: add_i64_varying: 1530; GFX1064: ; %bb.0: ; %entry 1531; GFX1064-NEXT: v_mov_b32_e32 v1, 0 1532; GFX1064-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 1533; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1534; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 1535; GFX1064-NEXT: s_mov_b32 s2, -1 1536; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1537; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 1538; GFX1064-NEXT: ds_add_rtn_u64 v[0:1], v2, v[0:1] 1539; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1540; GFX1064-NEXT: buffer_gl0_inv 1541; GFX1064-NEXT: buffer_gl1_inv 1542; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1543; GFX1064-NEXT: s_endpgm 1544; 1545; 
GFX1032-LABEL: add_i64_varying: 1546; GFX1032: ; %bb.0: ; %entry 1547; GFX1032-NEXT: v_mov_b32_e32 v1, 0 1548; GFX1032-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 1549; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1550; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 1551; GFX1032-NEXT: s_mov_b32 s2, -1 1552; GFX1032-NEXT: ; implicit-def: $vcc_hi 1553; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1554; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 1555; GFX1032-NEXT: ds_add_rtn_u64 v[0:1], v2, v[0:1] 1556; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1557; GFX1032-NEXT: buffer_gl0_inv 1558; GFX1032-NEXT: buffer_gl1_inv 1559; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1560; GFX1032-NEXT: s_endpgm 1561entry: 1562 %lane = call i32 @llvm.amdgcn.workitem.id.x() 1563 %zext = zext i32 %lane to i64 1564 %old = atomicrmw add i64 addrspace(3)* @local_var64, i64 %zext acq_rel 1565 store i64 %old, i64 addrspace(1)* %out 1566 ret void 1567} 1568 1569define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out) { 1570; 1571; 1572; GFX7LESS-LABEL: sub_i32_constant: 1573; GFX7LESS: ; %bb.0: ; %entry 1574; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1575; GFX7LESS-NEXT: v_cmp_ne_u32_e64 s[4:5], 1, 0 1576; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 1577; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 1578; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1579; GFX7LESS-NEXT: ; implicit-def: $vgpr1 1580; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc 1581; GFX7LESS-NEXT: s_cbranch_execz BB8_2 1582; GFX7LESS-NEXT: ; %bb.1: 1583; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 1584; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 1585; GFX7LESS-NEXT: v_mul_u32_u24_e64 v2, s4, 5 1586; GFX7LESS-NEXT: s_mov_b32 m0, -1 1587; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1588; GFX7LESS-NEXT: ds_sub_rtn_u32 v1, v1, v2 1589; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1590; GFX7LESS-NEXT: buffer_wbinvl1 1591; GFX7LESS-NEXT: BB8_2: 1592; GFX7LESS-NEXT: s_or_b64 exec, exec, 
s[2:3] 1593; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v1 1594; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1595; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 1596; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 1597; GFX7LESS-NEXT: s_mov_b32 s2, -1 1598; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1599; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 1600; GFX7LESS-NEXT: s_endpgm 1601; 1602; GFX8-LABEL: sub_i32_constant: 1603; GFX8: ; %bb.0: ; %entry 1604; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1605; GFX8-NEXT: v_cmp_ne_u32_e64 s[4:5], 1, 0 1606; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 1607; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 1608; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1609; GFX8-NEXT: ; implicit-def: $vgpr1 1610; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 1611; GFX8-NEXT: s_cbranch_execz BB8_2 1612; GFX8-NEXT: ; %bb.1: 1613; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 1614; GFX8-NEXT: v_mul_u32_u24_e64 v1, s4, 5 1615; GFX8-NEXT: v_mov_b32_e32 v2, local_var32@abs32@lo 1616; GFX8-NEXT: s_mov_b32 m0, -1 1617; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1618; GFX8-NEXT: ds_sub_rtn_u32 v1, v2, v1 1619; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1620; GFX8-NEXT: buffer_wbinvl1_vol 1621; GFX8-NEXT: BB8_2: 1622; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 1623; GFX8-NEXT: v_readfirstlane_b32 s2, v1 1624; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1625; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s2, v0 1626; GFX8-NEXT: s_mov_b32 s3, 0xf000 1627; GFX8-NEXT: s_mov_b32 s2, -1 1628; GFX8-NEXT: s_nop 0 1629; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1630; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 1631; GFX8-NEXT: s_endpgm 1632; 1633; GFX9-LABEL: sub_i32_constant: 1634; GFX9: ; %bb.0: ; %entry 1635; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1636; GFX9-NEXT: v_cmp_ne_u32_e64 s[4:5], 1, 0 1637; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 1638; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 1639; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1640; GFX9-NEXT: ; implicit-def: $vgpr1 1641; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 
1642; GFX9-NEXT: s_cbranch_execz BB8_2 1643; GFX9-NEXT: ; %bb.1: 1644; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 1645; GFX9-NEXT: v_mul_u32_u24_e64 v1, s4, 5 1646; GFX9-NEXT: v_mov_b32_e32 v2, local_var32@abs32@lo 1647; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1648; GFX9-NEXT: ds_sub_rtn_u32 v1, v2, v1 1649; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1650; GFX9-NEXT: buffer_wbinvl1_vol 1651; GFX9-NEXT: BB8_2: 1652; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 1653; GFX9-NEXT: v_readfirstlane_b32 s2, v1 1654; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1655; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 1656; GFX9-NEXT: s_mov_b32 s3, 0xf000 1657; GFX9-NEXT: s_mov_b32 s2, -1 1658; GFX9-NEXT: s_nop 0 1659; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1660; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 1661; GFX9-NEXT: s_endpgm 1662; 1663; GFX1064-LABEL: sub_i32_constant: 1664; GFX1064: ; %bb.0: ; %entry 1665; GFX1064-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 1666; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1667; GFX1064-NEXT: ; implicit-def: $vgpr1 1668; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 1669; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s3, v0 1670; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1671; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 1672; GFX1064-NEXT: s_cbranch_execz BB8_2 1673; GFX1064-NEXT: ; %bb.1: 1674; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1675; GFX1064-NEXT: v_mov_b32_e32 v2, local_var32@abs32@lo 1676; GFX1064-NEXT: v_mul_u32_u24_e64 v1, s2, 5 1677; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1678; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 1679; GFX1064-NEXT: ds_sub_rtn_u32 v1, v2, v1 1680; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1681; GFX1064-NEXT: buffer_gl0_inv 1682; GFX1064-NEXT: buffer_gl1_inv 1683; GFX1064-NEXT: BB8_2: 1684; GFX1064-NEXT: v_nop 1685; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 1686; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 1687; GFX1064-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1688; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 1689; GFX1064-NEXT: v_sub_nc_u32_e32 
v0, s2, v0 1690; GFX1064-NEXT: s_mov_b32 s2, -1 1691; GFX1064-NEXT: s_nop 0 1692; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1693; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 1694; GFX1064-NEXT: s_endpgm 1695; 1696; GFX1032-LABEL: sub_i32_constant: 1697; GFX1032: ; %bb.0: ; %entry 1698; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1699; GFX1032-NEXT: v_cmp_ne_u32_e64 s3, 1, 0 1700; GFX1032-NEXT: ; implicit-def: $vcc_hi 1701; GFX1032-NEXT: ; implicit-def: $vgpr1 1702; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s3, 0 1703; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 1704; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo 1705; GFX1032-NEXT: s_cbranch_execz BB8_2 1706; GFX1032-NEXT: ; %bb.1: 1707; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 1708; GFX1032-NEXT: v_mov_b32_e32 v2, local_var32@abs32@lo 1709; GFX1032-NEXT: v_mul_u32_u24_e64 v1, s3, 5 1710; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1711; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 1712; GFX1032-NEXT: ds_sub_rtn_u32 v1, v2, v1 1713; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1714; GFX1032-NEXT: buffer_gl0_inv 1715; GFX1032-NEXT: buffer_gl1_inv 1716; GFX1032-NEXT: BB8_2: 1717; GFX1032-NEXT: v_nop 1718; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 1719; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 1720; GFX1032-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1721; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 1722; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s2, v0 1723; GFX1032-NEXT: s_mov_b32 s2, -1 1724; GFX1032-NEXT: s_nop 0 1725; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1726; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 1727; GFX1032-NEXT: s_endpgm 1728entry: 1729 %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 5 acq_rel 1730 store i32 %old, i32 addrspace(1)* %out 1731 ret void 1732} 1733 1734define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 %subitive) { 1735; 1736; 1737; GFX7LESS-LABEL: sub_i32_uniform: 1738; GFX7LESS: ; %bb.0: ; %entry 1739; GFX7LESS-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 1740; GFX7LESS-NEXT: 
s_load_dword s2, s[0:1], 0xb 1741; GFX7LESS-NEXT: v_cmp_ne_u32_e64 s[6:7], 1, 0 1742; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 1743; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s7, v0 1744; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1745; GFX7LESS-NEXT: ; implicit-def: $vgpr1 1746; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc 1747; GFX7LESS-NEXT: s_cbranch_execz BB9_2 1748; GFX7LESS-NEXT: ; %bb.1: 1749; GFX7LESS-NEXT: s_bcnt1_i32_b64 s3, s[6:7] 1750; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1751; GFX7LESS-NEXT: s_mul_i32 s3, s2, s3 1752; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 1753; GFX7LESS-NEXT: v_mov_b32_e32 v2, s3 1754; GFX7LESS-NEXT: s_mov_b32 m0, -1 1755; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1756; GFX7LESS-NEXT: ds_sub_rtn_u32 v1, v1, v2 1757; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1758; GFX7LESS-NEXT: buffer_wbinvl1 1759; GFX7LESS-NEXT: BB9_2: 1760; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1] 1761; GFX7LESS-NEXT: v_readfirstlane_b32 s0, v1 1762; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1763; GFX7LESS-NEXT: v_mul_lo_u32 v0, s2, v0 1764; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 1765; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 1766; GFX7LESS-NEXT: s_mov_b32 s6, -1 1767; GFX7LESS-NEXT: buffer_store_dword v0, off, s[4:7], 0 1768; GFX7LESS-NEXT: s_endpgm 1769; 1770; GFX8-LABEL: sub_i32_uniform: 1771; GFX8: ; %bb.0: ; %entry 1772; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 1773; GFX8-NEXT: s_load_dword s0, s[0:1], 0x2c 1774; GFX8-NEXT: v_cmp_ne_u32_e64 s[6:7], 1, 0 1775; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 1776; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 1777; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1778; GFX8-NEXT: ; implicit-def: $vgpr1 1779; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 1780; GFX8-NEXT: s_cbranch_execz BB9_2 1781; GFX8-NEXT: ; %bb.1: 1782; GFX8-NEXT: s_bcnt1_i32_b64 s1, s[6:7] 1783; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1784; GFX8-NEXT: s_mul_i32 s1, s0, s1 1785; GFX8-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 1786; GFX8-NEXT: 
v_mov_b32_e32 v2, s1 1787; GFX8-NEXT: s_mov_b32 m0, -1 1788; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1789; GFX8-NEXT: ds_sub_rtn_u32 v1, v1, v2 1790; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1791; GFX8-NEXT: buffer_wbinvl1_vol 1792; GFX8-NEXT: BB9_2: 1793; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 1794; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1795; GFX8-NEXT: v_mul_lo_u32 v0, s0, v0 1796; GFX8-NEXT: v_readfirstlane_b32 s0, v1 1797; GFX8-NEXT: s_mov_b32 s7, 0xf000 1798; GFX8-NEXT: s_mov_b32 s6, -1 1799; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s0, v0 1800; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0 1801; GFX8-NEXT: s_endpgm 1802; 1803; GFX9-LABEL: sub_i32_uniform: 1804; GFX9: ; %bb.0: ; %entry 1805; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 1806; GFX9-NEXT: s_load_dword s0, s[0:1], 0x2c 1807; GFX9-NEXT: v_cmp_ne_u32_e64 s[6:7], 1, 0 1808; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 1809; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 1810; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1811; GFX9-NEXT: ; implicit-def: $vgpr1 1812; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 1813; GFX9-NEXT: s_cbranch_execz BB9_2 1814; GFX9-NEXT: ; %bb.1: 1815; GFX9-NEXT: s_bcnt1_i32_b64 s1, s[6:7] 1816; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1817; GFX9-NEXT: s_mul_i32 s1, s0, s1 1818; GFX9-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 1819; GFX9-NEXT: v_mov_b32_e32 v2, s1 1820; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1821; GFX9-NEXT: ds_sub_rtn_u32 v1, v1, v2 1822; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1823; GFX9-NEXT: buffer_wbinvl1_vol 1824; GFX9-NEXT: BB9_2: 1825; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 1826; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1827; GFX9-NEXT: v_mul_lo_u32 v0, s0, v0 1828; GFX9-NEXT: v_readfirstlane_b32 s0, v1 1829; GFX9-NEXT: s_mov_b32 s7, 0xf000 1830; GFX9-NEXT: s_mov_b32 s6, -1 1831; GFX9-NEXT: v_sub_u32_e32 v0, s0, v0 1832; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 1833; GFX9-NEXT: s_endpgm 1834; 1835; GFX1064-LABEL: sub_i32_uniform: 1836; GFX1064: ; %bb.0: ; %entry 1837; GFX1064-NEXT: 
v_cmp_ne_u32_e64 s[2:3], 1, 0 1838; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 1839; GFX1064-NEXT: s_load_dword s0, s[0:1], 0x2c 1840; GFX1064-NEXT: ; implicit-def: $vgpr1 1841; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 1842; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s3, v0 1843; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1844; GFX1064-NEXT: s_and_saveexec_b64 s[6:7], vcc 1845; GFX1064-NEXT: s_cbranch_execz BB9_2 1846; GFX1064-NEXT: ; %bb.1: 1847; GFX1064-NEXT: s_bcnt1_i32_b64 s1, s[2:3] 1848; GFX1064-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 1849; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1850; GFX1064-NEXT: s_mul_i32 s1, s0, s1 1851; GFX1064-NEXT: v_mov_b32_e32 v2, s1 1852; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1853; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 1854; GFX1064-NEXT: ds_sub_rtn_u32 v1, v1, v2 1855; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1856; GFX1064-NEXT: buffer_gl0_inv 1857; GFX1064-NEXT: buffer_gl1_inv 1858; GFX1064-NEXT: BB9_2: 1859; GFX1064-NEXT: v_nop 1860; GFX1064-NEXT: s_or_b64 exec, exec, s[6:7] 1861; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1862; GFX1064-NEXT: v_mul_lo_u32 v0, s0, v0 1863; GFX1064-NEXT: v_readfirstlane_b32 s0, v1 1864; GFX1064-NEXT: s_mov_b32 s7, 0x31016000 1865; GFX1064-NEXT: s_mov_b32 s6, -1 1866; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s0, v0 1867; GFX1064-NEXT: buffer_store_dword v0, off, s[4:7], 0 1868; GFX1064-NEXT: s_endpgm 1869; 1870; GFX1032-LABEL: sub_i32_uniform: 1871; GFX1032: ; %bb.0: ; %entry 1872; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 1873; GFX1032-NEXT: s_load_dword s0, s[0:1], 0x2c 1874; GFX1032-NEXT: v_cmp_ne_u32_e64 s2, 1, 0 1875; GFX1032-NEXT: ; implicit-def: $vcc_hi 1876; GFX1032-NEXT: ; implicit-def: $vgpr1 1877; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 1878; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 1879; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo 1880; GFX1032-NEXT: s_cbranch_execz BB9_2 1881; GFX1032-NEXT: ; %bb.1: 1882; GFX1032-NEXT: s_bcnt1_i32_b32 s2, s2 1883; 
GFX1032-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 1884; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1885; GFX1032-NEXT: s_mul_i32 s2, s0, s2 1886; GFX1032-NEXT: v_mov_b32_e32 v2, s2 1887; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1888; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 1889; GFX1032-NEXT: ds_sub_rtn_u32 v1, v1, v2 1890; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1891; GFX1032-NEXT: buffer_gl0_inv 1892; GFX1032-NEXT: buffer_gl1_inv 1893; GFX1032-NEXT: BB9_2: 1894; GFX1032-NEXT: v_nop 1895; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s1 1896; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1897; GFX1032-NEXT: v_mul_lo_u32 v0, s0, v0 1898; GFX1032-NEXT: v_readfirstlane_b32 s0, v1 1899; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 1900; GFX1032-NEXT: s_mov_b32 s6, -1 1901; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s0, v0 1902; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0 1903; GFX1032-NEXT: s_endpgm 1904entry: 1905 %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 %subitive acq_rel 1906 store i32 %old, i32 addrspace(1)* %out 1907 ret void 1908} 1909; NOTE: of the hand-written directives below only the first three use a prefix that a RUN line enables, and they sit ahead of the sub_i32_varying label rather than inside its check block (TODO confirm they still test what was intended). DPPCOMB, GFX8MORE32 and GFX8MORE are passed to FileCheck by no RUN line, so the last five directives are never evaluated; drop or re-enable them when the checks are next regenerated. 1910; GFX7LESS-NOT: v_mbcnt_lo_u32_b32 1911; GFX7LESS-NOT: v_mbcnt_hi_u32_b32 1912; GFX7LESS-NOT: s_bcnt1_i32_b64 1913; DPPCOMB: v_add_u32_dpp 1914; DPPCOMB: v_add_u32_dpp 1915; GFX8MORE32: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 31 1916; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]] 1917; GFX8MORE: ds_sub_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]] 1918define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) { 1919; 1920; 1921; GFX7LESS-LABEL: sub_i32_varying: 1922; GFX7LESS: ; %bb.0: ; %entry 1923; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1924; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 1925; GFX7LESS-NEXT: s_mov_b32 m0, -1 1926; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1927; GFX7LESS-NEXT: ds_sub_rtn_u32 v0, v1, v0 1928; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1929; GFX7LESS-NEXT: buffer_wbinvl1 1930; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 1931; 
GFX7LESS-NEXT: s_mov_b32 s2, -1 1932; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 1933; GFX7LESS-NEXT: s_endpgm 1934; 1935; GFX8-LABEL: sub_i32_varying: 1936; GFX8: ; %bb.0: ; %entry 1937; GFX8-NEXT: v_mov_b32_e32 v2, v0 1938; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 1939; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1940; GFX8-NEXT: v_mov_b32_e32 v1, 0 1941; GFX8-NEXT: s_mov_b64 exec, s[2:3] 1942; GFX8-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 1943; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1944; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 1945; GFX8-NEXT: s_not_b64 exec, exec 1946; GFX8-NEXT: v_mov_b32_e32 v2, 0 1947; GFX8-NEXT: s_not_b64 exec, exec 1948; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 1949; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 1950; GFX8-NEXT: s_nop 1 1951; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 1952; GFX8-NEXT: s_nop 1 1953; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 1954; GFX8-NEXT: s_nop 1 1955; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 1956; GFX8-NEXT: s_nop 1 1957; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 1958; GFX8-NEXT: s_nop 1 1959; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 1960; GFX8-NEXT: v_readlane_b32 s2, v2, 63 1961; GFX8-NEXT: s_nop 0 1962; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 1963; GFX8-NEXT: s_mov_b64 exec, s[4:5] 1964; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1965; GFX8-NEXT: ; implicit-def: $vgpr0 1966; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 1967; GFX8-NEXT: s_cbranch_execz BB10_2 1968; GFX8-NEXT: ; %bb.1: 1969; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 1970; GFX8-NEXT: v_mov_b32_e32 v3, s2 1971; GFX8-NEXT: s_mov_b32 m0, -1 1972; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1973; GFX8-NEXT: ds_sub_rtn_u32 v0, v0, v3 1974; 
GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1975; GFX8-NEXT: buffer_wbinvl1_vol 1976; GFX8-NEXT: BB10_2: 1977; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 1978; GFX8-NEXT: v_readfirstlane_b32 s2, v0 1979; GFX8-NEXT: v_mov_b32_e32 v0, v1 1980; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s2, v0 1981; GFX8-NEXT: s_mov_b32 s3, 0xf000 1982; GFX8-NEXT: s_mov_b32 s2, -1 1983; GFX8-NEXT: s_nop 0 1984; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1985; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 1986; GFX8-NEXT: s_endpgm 1987; 1988; GFX9-LABEL: sub_i32_varying: 1989; GFX9: ; %bb.0: ; %entry 1990; GFX9-NEXT: v_mov_b32_e32 v2, v0 1991; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 1992; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1993; GFX9-NEXT: v_mov_b32_e32 v1, 0 1994; GFX9-NEXT: s_mov_b64 exec, s[2:3] 1995; GFX9-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 1996; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1997; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 1998; GFX9-NEXT: s_not_b64 exec, exec 1999; GFX9-NEXT: v_mov_b32_e32 v2, 0 2000; GFX9-NEXT: s_not_b64 exec, exec 2001; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 2002; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 2003; GFX9-NEXT: s_nop 1 2004; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 2005; GFX9-NEXT: s_nop 1 2006; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 2007; GFX9-NEXT: s_nop 1 2008; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 2009; GFX9-NEXT: s_nop 1 2010; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 2011; GFX9-NEXT: s_nop 1 2012; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 2013; GFX9-NEXT: v_readlane_b32 s2, v2, 63 2014; GFX9-NEXT: s_nop 0 2015; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 2016; GFX9-NEXT: s_mov_b64 exec, s[4:5] 2017; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2018; GFX9-NEXT: ; implicit-def: 
$vgpr0 2019; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 2020; GFX9-NEXT: s_cbranch_execz BB10_2 2021; GFX9-NEXT: ; %bb.1: 2022; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 2023; GFX9-NEXT: v_mov_b32_e32 v3, s2 2024; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2025; GFX9-NEXT: ds_sub_rtn_u32 v0, v0, v3 2026; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2027; GFX9-NEXT: buffer_wbinvl1_vol 2028; GFX9-NEXT: BB10_2: 2029; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 2030; GFX9-NEXT: v_readfirstlane_b32 s2, v0 2031; GFX9-NEXT: v_mov_b32_e32 v0, v1 2032; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 2033; GFX9-NEXT: s_mov_b32 s3, 0xf000 2034; GFX9-NEXT: s_mov_b32 s2, -1 2035; GFX9-NEXT: s_nop 0 2036; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2037; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 2038; GFX9-NEXT: s_endpgm 2039; 2040; GFX1064-LABEL: sub_i32_varying: 2041; GFX1064: ; %bb.0: ; %entry 2042; GFX1064-NEXT: v_mov_b32_e32 v2, v0 2043; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 2044; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2045; GFX1064-NEXT: v_mov_b32_e32 v1, 0 2046; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 2047; GFX1064-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 2048; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 2049; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s3, v0 2050; GFX1064-NEXT: s_not_b64 exec, exec 2051; GFX1064-NEXT: v_mov_b32_e32 v2, 0 2052; GFX1064-NEXT: s_not_b64 exec, exec 2053; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 2054; GFX1064-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 2055; GFX1064-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 2056; GFX1064-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 2057; GFX1064-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 2058; GFX1064-NEXT: v_mov_b32_e32 v3, v2 2059; GFX1064-NEXT: v_permlanex16_b32 v3, v3, -1, -1 2060; GFX1064-NEXT: v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa 
bank_mask:0xf 2061; GFX1064-NEXT: v_readlane_b32 s2, v2, 31 2062; GFX1064-NEXT: v_mov_b32_e32 v3, s2 2063; GFX1064-NEXT: v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 2064; GFX1064-NEXT: v_readlane_b32 s2, v2, 15 2065; GFX1064-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf 2066; GFX1064-NEXT: v_readlane_b32 s3, v2, 31 2067; GFX1064-NEXT: v_readlane_b32 s6, v2, 47 2068; GFX1064-NEXT: v_writelane_b32 v1, s2, 16 2069; GFX1064-NEXT: s_mov_b32 s2, -1 2070; GFX1064-NEXT: v_writelane_b32 v1, s3, 32 2071; GFX1064-NEXT: v_readlane_b32 s3, v2, 63 2072; GFX1064-NEXT: v_writelane_b32 v1, s6, 48 2073; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 2074; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2075; GFX1064-NEXT: ; implicit-def: $vgpr0 2076; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 2077; GFX1064-NEXT: s_cbranch_execz BB10_2 2078; GFX1064-NEXT: ; %bb.1: 2079; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 2080; GFX1064-NEXT: v_mov_b32_e32 v7, s3 2081; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2082; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 2083; GFX1064-NEXT: ds_sub_rtn_u32 v0, v0, v7 2084; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2085; GFX1064-NEXT: buffer_gl0_inv 2086; GFX1064-NEXT: buffer_gl1_inv 2087; GFX1064-NEXT: BB10_2: 2088; GFX1064-NEXT: v_nop 2089; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 2090; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 2091; GFX1064-NEXT: v_mov_b32_e32 v0, v1 2092; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s3, v0 2093; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 2094; GFX1064-NEXT: s_nop 1 2095; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2096; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 2097; GFX1064-NEXT: s_endpgm 2098; 2099; GFX1032-LABEL: sub_i32_varying: 2100; GFX1032: ; %bb.0: ; %entry 2101; GFX1032-NEXT: ; implicit-def: $vcc_hi 2102; GFX1032-NEXT: v_mov_b32_e32 v2, v0 2103; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 2104; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2105; GFX1032-NEXT: v_mov_b32_e32 v1, 0 
2106; GFX1032-NEXT: s_mov_b32 exec_lo, s2 2107; GFX1032-NEXT: v_cmp_ne_u32_e64 s2, 1, 0 2108; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 2109; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 2110; GFX1032-NEXT: v_mov_b32_e32 v2, 0 2111; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 2112; GFX1032-NEXT: s_or_saveexec_b32 s4, -1 2113; GFX1032-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 2114; GFX1032-NEXT: s_mov_b32 s2, -1 2115; GFX1032-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 2116; GFX1032-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 2117; GFX1032-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 2118; GFX1032-NEXT: v_mov_b32_e32 v3, v2 2119; GFX1032-NEXT: v_permlanex16_b32 v3, v3, -1, -1 2120; GFX1032-NEXT: v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 2121; GFX1032-NEXT: v_readlane_b32 s3, v2, 31 2122; GFX1032-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf 2123; GFX1032-NEXT: v_readlane_b32 s5, v2, 15 2124; GFX1032-NEXT: v_writelane_b32 v1, s5, 16 2125; GFX1032-NEXT: s_mov_b32 exec_lo, s4 2126; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 2127; GFX1032-NEXT: ; implicit-def: $vgpr0 2128; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo 2129; GFX1032-NEXT: s_cbranch_execz BB10_2 2130; GFX1032-NEXT: ; %bb.1: 2131; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 2132; GFX1032-NEXT: v_mov_b32_e32 v7, s3 2133; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2134; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 2135; GFX1032-NEXT: ds_sub_rtn_u32 v0, v0, v7 2136; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2137; GFX1032-NEXT: buffer_gl0_inv 2138; GFX1032-NEXT: buffer_gl1_inv 2139; GFX1032-NEXT: BB10_2: 2140; GFX1032-NEXT: v_nop 2141; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 2142; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 2143; GFX1032-NEXT: v_mov_b32_e32 v0, v1 2144; GFX1032-NEXT: 
v_sub_nc_u32_e32 v0, s3, v0 2145; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 2146; GFX1032-NEXT: s_nop 1 2147; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2148; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 2149; GFX1032-NEXT: s_endpgm 2150entry: 2151 %lane = call i32 @llvm.amdgcn.workitem.id.x() 2152 %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 %lane acq_rel 2153 store i32 %old, i32 addrspace(1)* %out 2154 ret void 2155} 2156 2157define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) { 2158; 2159; 2160; GFX7LESS-LABEL: sub_i64_constant: 2161; GFX7LESS: ; %bb.0: ; %entry 2162; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 2163; GFX7LESS-NEXT: v_cmp_ne_u32_e64 s[4:5], 1, 0 2164; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 2165; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 2166; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2167; GFX7LESS-NEXT: ; implicit-def: $vgpr1_vgpr2 2168; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc 2169; GFX7LESS-NEXT: s_cbranch_execz BB11_2 2170; GFX7LESS-NEXT: ; %bb.1: 2171; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 2172; GFX7LESS-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 2173; GFX7LESS-NEXT: v_mul_hi_u32_u24_e64 v2, s4, 5 2174; GFX7LESS-NEXT: v_mul_u32_u24_e64 v1, s4, 5 2175; GFX7LESS-NEXT: s_mov_b32 m0, -1 2176; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2177; GFX7LESS-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] 2178; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2179; GFX7LESS-NEXT: buffer_wbinvl1 2180; GFX7LESS-NEXT: BB11_2: 2181; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] 2182; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v1 2183; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v2 2184; GFX7LESS-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v0 2185; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v0 2186; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 2187; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4 2188; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 2189; GFX7LESS-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc 2190; GFX7LESS-NEXT: s_mov_b32 s2, -1 2191; 
GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2192; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2193; GFX7LESS-NEXT: s_endpgm 2194; 2195; GFX8-LABEL: sub_i64_constant: 2196; GFX8: ; %bb.0: ; %entry 2197; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2198; GFX8-NEXT: v_cmp_ne_u32_e64 s[4:5], 1, 0 2199; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 2200; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 2201; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2202; GFX8-NEXT: ; implicit-def: $vgpr1_vgpr2 2203; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 2204; GFX8-NEXT: s_cbranch_execz BB11_2 2205; GFX8-NEXT: ; %bb.1: 2206; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 2207; GFX8-NEXT: v_mul_hi_u32_u24_e64 v2, s4, 5 2208; GFX8-NEXT: v_mul_u32_u24_e64 v1, s4, 5 2209; GFX8-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 2210; GFX8-NEXT: s_mov_b32 m0, -1 2211; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2212; GFX8-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] 2213; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2214; GFX8-NEXT: buffer_wbinvl1_vol 2215; GFX8-NEXT: BB11_2: 2216; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 2217; GFX8-NEXT: v_readfirstlane_b32 s3, v2 2218; GFX8-NEXT: v_readfirstlane_b32 s2, v1 2219; GFX8-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v0 2220; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v0 2221; GFX8-NEXT: v_mov_b32_e32 v2, s3 2222; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s2, v0 2223; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc 2224; GFX8-NEXT: s_mov_b32 s3, 0xf000 2225; GFX8-NEXT: s_mov_b32 s2, -1 2226; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2227; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2228; GFX8-NEXT: s_endpgm 2229; 2230; GFX9-LABEL: sub_i64_constant: 2231; GFX9: ; %bb.0: ; %entry 2232; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2233; GFX9-NEXT: v_cmp_ne_u32_e64 s[4:5], 1, 0 2234; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 2235; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 2236; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2237; GFX9-NEXT: ; implicit-def: $vgpr1_vgpr2 2238; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 
2239; GFX9-NEXT: s_cbranch_execz BB11_2 2240; GFX9-NEXT: ; %bb.1: 2241; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 2242; GFX9-NEXT: v_mul_hi_u32_u24_e64 v2, s4, 5 2243; GFX9-NEXT: v_mul_u32_u24_e64 v1, s4, 5 2244; GFX9-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 2245; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2246; GFX9-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] 2247; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2248; GFX9-NEXT: buffer_wbinvl1_vol 2249; GFX9-NEXT: BB11_2: 2250; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 2251; GFX9-NEXT: v_readfirstlane_b32 s3, v2 2252; GFX9-NEXT: v_readfirstlane_b32 s2, v1 2253; GFX9-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v0 2254; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v0 2255; GFX9-NEXT: v_mov_b32_e32 v2, s3 2256; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s2, v0 2257; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc 2258; GFX9-NEXT: s_mov_b32 s3, 0xf000 2259; GFX9-NEXT: s_mov_b32 s2, -1 2260; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2261; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2262; GFX9-NEXT: s_endpgm 2263; 2264; GFX1064-LABEL: sub_i64_constant: 2265; GFX1064: ; %bb.0: ; %entry 2266; GFX1064-NEXT: v_cmp_ne_u32_e64 s[4:5], 1, 0 2267; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2268; GFX1064-NEXT: ; implicit-def: $vgpr1_vgpr2 2269; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 2270; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s5, v0 2271; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2272; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc 2273; GFX1064-NEXT: s_cbranch_execz BB11_2 2274; GFX1064-NEXT: ; %bb.1: 2275; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 2276; GFX1064-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 2277; GFX1064-NEXT: v_mul_hi_u32_u24_e64 v2, s4, 5 2278; GFX1064-NEXT: v_mul_u32_u24_e64 v1, s4, 5 2279; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2280; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 2281; GFX1064-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] 2282; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2283; GFX1064-NEXT: buffer_gl0_inv 2284; 
GFX1064-NEXT: buffer_gl1_inv 2285; GFX1064-NEXT: BB11_2: 2286; GFX1064-NEXT: v_nop 2287; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] 2288; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 2289; GFX1064-NEXT: v_mul_u32_u24_e32 v1, 5, v0 2290; GFX1064-NEXT: v_readfirstlane_b32 s3, v2 2291; GFX1064-NEXT: v_mul_hi_u32_u24_e32 v2, 5, v0 2292; GFX1064-NEXT: v_sub_co_u32_e64 v0, vcc, s2, v1 2293; GFX1064-NEXT: s_mov_b32 s2, -1 2294; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v2, vcc 2295; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 2296; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2297; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2298; GFX1064-NEXT: s_endpgm 2299; 2300; GFX1032-LABEL: sub_i64_constant: 2301; GFX1032: ; %bb.0: ; %entry 2302; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2303; GFX1032-NEXT: v_cmp_ne_u32_e64 s3, 1, 0 2304; GFX1032-NEXT: ; implicit-def: $vcc_hi 2305; GFX1032-NEXT: ; implicit-def: $vgpr1_vgpr2 2306; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s3, 0 2307; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 2308; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo 2309; GFX1032-NEXT: s_cbranch_execz BB11_2 2310; GFX1032-NEXT: ; %bb.1: 2311; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 2312; GFX1032-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 2313; GFX1032-NEXT: v_mul_hi_u32_u24_e64 v2, s3, 5 2314; GFX1032-NEXT: v_mul_u32_u24_e64 v1, s3, 5 2315; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2316; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 2317; GFX1032-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] 2318; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2319; GFX1032-NEXT: buffer_gl0_inv 2320; GFX1032-NEXT: buffer_gl1_inv 2321; GFX1032-NEXT: BB11_2: 2322; GFX1032-NEXT: v_nop 2323; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 2324; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 2325; GFX1032-NEXT: v_mul_u32_u24_e32 v1, 5, v0 2326; GFX1032-NEXT: v_readfirstlane_b32 s3, v2 2327; GFX1032-NEXT: v_mul_hi_u32_u24_e32 v2, 5, v0 2328; GFX1032-NEXT: v_sub_co_u32_e64 v0, vcc_lo, s2, v1 2329; GFX1032-NEXT: 
s_mov_b32 s2, -1 2330; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v2, vcc_lo 2331; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 2332; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2333; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2334; GFX1032-NEXT: s_endpgm 2335entry: 2336 %old = atomicrmw sub i64 addrspace(3)* @local_var64, i64 5 acq_rel 2337 store i64 %old, i64 addrspace(1)* %out 2338 ret void 2339} 2340 2341define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 %subitive) { 2342; 2343; 2344; GFX7LESS-LABEL: sub_i64_uniform: 2345; GFX7LESS: ; %bb.0: ; %entry 2346; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 2347; GFX7LESS-NEXT: v_cmp_ne_u32_e64 s[6:7], 1, 0 2348; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 2349; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s7, v0 2350; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2351; GFX7LESS-NEXT: ; implicit-def: $vgpr1_vgpr2 2352; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc 2353; GFX7LESS-NEXT: s_cbranch_execz BB12_2 2354; GFX7LESS-NEXT: ; %bb.1: 2355; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 2356; GFX7LESS-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 2357; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2358; GFX7LESS-NEXT: s_mul_i32 s7, s3, s6 2359; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6 2360; GFX7LESS-NEXT: v_mul_hi_u32 v1, s2, v1 2361; GFX7LESS-NEXT: s_mul_i32 s6, s2, s6 2362; GFX7LESS-NEXT: v_add_i32_e32 v2, vcc, s7, v1 2363; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6 2364; GFX7LESS-NEXT: s_mov_b32 m0, -1 2365; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2366; GFX7LESS-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] 2367; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2368; GFX7LESS-NEXT: buffer_wbinvl1 2369; GFX7LESS-NEXT: BB12_2: 2370; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] 2371; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 2372; GFX7LESS-NEXT: s_mov_b32 s6, -1 2373; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2374; GFX7LESS-NEXT: s_mov_b32 s4, s0 2375; GFX7LESS-NEXT: s_mov_b32 s5, s1 2376; GFX7LESS-NEXT: v_readfirstlane_b32 s0, v1 
2377; GFX7LESS-NEXT: v_readfirstlane_b32 s1, v2 2378; GFX7LESS-NEXT: v_mul_lo_u32 v1, s3, v0 2379; GFX7LESS-NEXT: v_mul_hi_u32 v2, s2, v0 2380; GFX7LESS-NEXT: v_mul_lo_u32 v0, s2, v0 2381; GFX7LESS-NEXT: v_add_i32_e32 v1, vcc, v2, v1 2382; GFX7LESS-NEXT: v_mov_b32_e32 v2, s1 2383; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 2384; GFX7LESS-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc 2385; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 2386; GFX7LESS-NEXT: s_endpgm 2387; 2388; GFX8-LABEL: sub_i64_uniform: 2389; GFX8: ; %bb.0: ; %entry 2390; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 2391; GFX8-NEXT: v_cmp_ne_u32_e64 s[6:7], 1, 0 2392; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 2393; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 2394; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2395; GFX8-NEXT: ; implicit-def: $vgpr1_vgpr2 2396; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 2397; GFX8-NEXT: s_cbranch_execz BB12_2 2398; GFX8-NEXT: ; %bb.1: 2399; GFX8-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 2400; GFX8-NEXT: v_mov_b32_e32 v1, s6 2401; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2402; GFX8-NEXT: v_mul_hi_u32 v1, s2, v1 2403; GFX8-NEXT: s_mul_i32 s7, s3, s6 2404; GFX8-NEXT: s_mul_i32 s6, s2, s6 2405; GFX8-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 2406; GFX8-NEXT: v_add_u32_e32 v2, vcc, s7, v1 2407; GFX8-NEXT: v_mov_b32_e32 v1, s6 2408; GFX8-NEXT: s_mov_b32 m0, -1 2409; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2410; GFX8-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] 2411; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2412; GFX8-NEXT: buffer_wbinvl1_vol 2413; GFX8-NEXT: BB12_2: 2414; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 2415; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2416; GFX8-NEXT: s_mov_b32 s4, s0 2417; GFX8-NEXT: v_readfirstlane_b32 s0, v1 2418; GFX8-NEXT: v_mul_lo_u32 v1, s3, v0 2419; GFX8-NEXT: v_mul_hi_u32 v3, s2, v0 2420; GFX8-NEXT: v_mul_lo_u32 v0, s2, v0 2421; GFX8-NEXT: s_mov_b32 s5, s1 2422; GFX8-NEXT: v_readfirstlane_b32 s1, v2 2423; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 2424; GFX8-NEXT: 
v_mov_b32_e32 v2, s1 2425; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s0, v0 2426; GFX8-NEXT: s_mov_b32 s7, 0xf000 2427; GFX8-NEXT: s_mov_b32 s6, -1 2428; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc 2429; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 2430; GFX8-NEXT: s_endpgm 2431; 2432; GFX9-LABEL: sub_i64_uniform: 2433; GFX9: ; %bb.0: ; %entry 2434; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 2435; GFX9-NEXT: v_cmp_ne_u32_e64 s[6:7], 1, 0 2436; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 2437; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 2438; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2439; GFX9-NEXT: ; implicit-def: $vgpr1_vgpr2 2440; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 2441; GFX9-NEXT: s_cbranch_execz BB12_2 2442; GFX9-NEXT: ; %bb.1: 2443; GFX9-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 2444; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2445; GFX9-NEXT: s_mul_i32 s7, s3, s6 2446; GFX9-NEXT: s_mul_hi_u32 s8, s2, s6 2447; GFX9-NEXT: s_add_i32 s8, s8, s7 2448; GFX9-NEXT: s_mul_i32 s6, s2, s6 2449; GFX9-NEXT: v_mov_b32_e32 v1, s6 2450; GFX9-NEXT: v_mov_b32_e32 v2, s8 2451; GFX9-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 2452; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2453; GFX9-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] 2454; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2455; GFX9-NEXT: buffer_wbinvl1_vol 2456; GFX9-NEXT: BB12_2: 2457; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 2458; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2459; GFX9-NEXT: v_mul_lo_u32 v3, s3, v0 2460; GFX9-NEXT: v_mul_hi_u32 v4, s2, v0 2461; GFX9-NEXT: v_mul_lo_u32 v0, s2, v0 2462; GFX9-NEXT: s_mov_b32 s4, s0 2463; GFX9-NEXT: v_readfirstlane_b32 s0, v1 2464; GFX9-NEXT: s_mov_b32 s5, s1 2465; GFX9-NEXT: v_readfirstlane_b32 s1, v2 2466; GFX9-NEXT: v_add_u32_e32 v1, v4, v3 2467; GFX9-NEXT: v_mov_b32_e32 v2, s1 2468; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s0, v0 2469; GFX9-NEXT: s_mov_b32 s7, 0xf000 2470; GFX9-NEXT: s_mov_b32 s6, -1 2471; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc 2472; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, 
s[4:7], 0 2473; GFX9-NEXT: s_endpgm 2474; 2475; GFX1064-LABEL: sub_i64_uniform: 2476; GFX1064: ; %bb.0: ; %entry 2477; GFX1064-NEXT: v_cmp_ne_u32_e64 s[6:7], 1, 0 2478; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 2479; GFX1064-NEXT: ; implicit-def: $vgpr1_vgpr2 2480; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 2481; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s7, v0 2482; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2483; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 2484; GFX1064-NEXT: s_cbranch_execz BB12_2 2485; GFX1064-NEXT: ; %bb.1: 2486; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 2487; GFX1064-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 2488; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2489; GFX1064-NEXT: s_mul_i32 s7, s3, s6 2490; GFX1064-NEXT: s_mul_hi_u32 s8, s2, s6 2491; GFX1064-NEXT: s_mul_i32 s6, s2, s6 2492; GFX1064-NEXT: s_add_i32 s8, s8, s7 2493; GFX1064-NEXT: v_mov_b32_e32 v1, s6 2494; GFX1064-NEXT: v_mov_b32_e32 v2, s8 2495; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2496; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 2497; GFX1064-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] 2498; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2499; GFX1064-NEXT: buffer_gl0_inv 2500; GFX1064-NEXT: buffer_gl1_inv 2501; GFX1064-NEXT: BB12_2: 2502; GFX1064-NEXT: v_nop 2503; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 2504; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2505; GFX1064-NEXT: v_mul_lo_u32 v3, s3, v0 2506; GFX1064-NEXT: v_mul_hi_u32 v4, s2, v0 2507; GFX1064-NEXT: v_mul_lo_u32 v0, s2, v0 2508; GFX1064-NEXT: v_readfirstlane_b32 s4, v1 2509; GFX1064-NEXT: v_readfirstlane_b32 s5, v2 2510; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 2511; GFX1064-NEXT: s_mov_b32 s2, -1 2512; GFX1064-NEXT: v_add_nc_u32_e32 v1, v4, v3 2513; GFX1064-NEXT: v_sub_co_u32_e64 v0, vcc, s4, v0 2514; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s5, v1, vcc 2515; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2516; GFX1064-NEXT: s_endpgm 2517; 2518; GFX1032-LABEL: sub_i64_uniform: 2519; GFX1032: ; %bb.0: ; %entry 
2520; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 2521; GFX1032-NEXT: v_cmp_ne_u32_e64 s5, 1, 0 2522; GFX1032-NEXT: ; implicit-def: $vcc_hi 2523; GFX1032-NEXT: ; implicit-def: $vgpr1_vgpr2 2524; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s5, 0 2525; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 2526; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo 2527; GFX1032-NEXT: s_cbranch_execz BB12_2 2528; GFX1032-NEXT: ; %bb.1: 2529; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s5 2530; GFX1032-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 2531; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2532; GFX1032-NEXT: s_mul_i32 s6, s3, s5 2533; GFX1032-NEXT: s_mul_hi_u32 s7, s2, s5 2534; GFX1032-NEXT: s_mul_i32 s5, s2, s5 2535; GFX1032-NEXT: s_add_i32 s7, s7, s6 2536; GFX1032-NEXT: v_mov_b32_e32 v1, s5 2537; GFX1032-NEXT: v_mov_b32_e32 v2, s7 2538; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2539; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 2540; GFX1032-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] 2541; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2542; GFX1032-NEXT: buffer_gl0_inv 2543; GFX1032-NEXT: buffer_gl1_inv 2544; GFX1032-NEXT: BB12_2: 2545; GFX1032-NEXT: v_nop 2546; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 2547; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2548; GFX1032-NEXT: v_mul_lo_u32 v3, s3, v0 2549; GFX1032-NEXT: v_mul_hi_u32 v4, s2, v0 2550; GFX1032-NEXT: v_mul_lo_u32 v0, s2, v0 2551; GFX1032-NEXT: v_readfirstlane_b32 s4, v1 2552; GFX1032-NEXT: v_readfirstlane_b32 s5, v2 2553; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 2554; GFX1032-NEXT: s_mov_b32 s2, -1 2555; GFX1032-NEXT: v_add_nc_u32_e32 v1, v4, v3 2556; GFX1032-NEXT: v_sub_co_u32_e64 v0, vcc_lo, s4, v0 2557; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s5, v1, vcc_lo 2558; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2559; GFX1032-NEXT: s_endpgm 2560entry: 2561 %old = atomicrmw sub i64 addrspace(3)* @local_var64, i64 %subitive acq_rel 2562 store i64 %old, i64 addrspace(1)* %out 2563 ret void 2564} 2565 2566; GCN-NOT: v_mbcnt_lo_u32_b32 2567; 
GCN-NOT: v_mbcnt_hi_u32_b32 2568; GCN-NOT: s_bcnt1_i32_b64 2569define amdgpu_kernel void @sub_i64_varying(i64 addrspace(1)* %out) { 2570; 2571; 2572; GFX7LESS-LABEL: sub_i64_varying: 2573; GFX7LESS: ; %bb.0: ; %entry 2574; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 2575; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 2576; GFX7LESS-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 2577; GFX7LESS-NEXT: s_mov_b32 m0, -1 2578; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2579; GFX7LESS-NEXT: ds_sub_rtn_u64 v[0:1], v2, v[0:1] 2580; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2581; GFX7LESS-NEXT: buffer_wbinvl1 2582; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 2583; GFX7LESS-NEXT: s_mov_b32 s2, -1 2584; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2585; GFX7LESS-NEXT: s_endpgm 2586; 2587; GFX8-LABEL: sub_i64_varying: 2588; GFX8: ; %bb.0: ; %entry 2589; GFX8-NEXT: v_mov_b32_e32 v1, 0 2590; GFX8-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 2591; GFX8-NEXT: s_mov_b32 m0, -1 2592; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2593; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2594; GFX8-NEXT: ds_sub_rtn_u64 v[0:1], v2, v[0:1] 2595; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2596; GFX8-NEXT: buffer_wbinvl1_vol 2597; GFX8-NEXT: s_mov_b32 s3, 0xf000 2598; GFX8-NEXT: s_mov_b32 s2, -1 2599; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2600; GFX8-NEXT: s_endpgm 2601; 2602; GFX9-LABEL: sub_i64_varying: 2603; GFX9: ; %bb.0: ; %entry 2604; GFX9-NEXT: v_mov_b32_e32 v1, 0 2605; GFX9-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 2606; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2607; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2608; GFX9-NEXT: ds_sub_rtn_u64 v[0:1], v2, v[0:1] 2609; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2610; GFX9-NEXT: buffer_wbinvl1_vol 2611; GFX9-NEXT: s_mov_b32 s3, 0xf000 2612; GFX9-NEXT: s_mov_b32 s2, -1 2613; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2614; GFX9-NEXT: s_endpgm 2615; 2616; GFX1064-LABEL: sub_i64_varying: 2617; GFX1064: ; %bb.0: ; 
%entry 2618; GFX1064-NEXT: v_mov_b32_e32 v1, 0 2619; GFX1064-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 2620; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2621; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 2622; GFX1064-NEXT: s_mov_b32 s2, -1 2623; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2624; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 2625; GFX1064-NEXT: ds_sub_rtn_u64 v[0:1], v2, v[0:1] 2626; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2627; GFX1064-NEXT: buffer_gl0_inv 2628; GFX1064-NEXT: buffer_gl1_inv 2629; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2630; GFX1064-NEXT: s_endpgm 2631; 2632; GFX1032-LABEL: sub_i64_varying: 2633; GFX1032: ; %bb.0: ; %entry 2634; GFX1032-NEXT: v_mov_b32_e32 v1, 0 2635; GFX1032-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 2636; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2637; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 2638; GFX1032-NEXT: s_mov_b32 s2, -1 2639; GFX1032-NEXT: ; implicit-def: $vcc_hi 2640; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2641; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 2642; GFX1032-NEXT: ds_sub_rtn_u64 v[0:1], v2, v[0:1] 2643; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2644; GFX1032-NEXT: buffer_gl0_inv 2645; GFX1032-NEXT: buffer_gl1_inv 2646; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2647; GFX1032-NEXT: s_endpgm 2648entry: 2649 %lane = call i32 @llvm.amdgcn.workitem.id.x() 2650 %zext = zext i32 %lane to i64 2651 %old = atomicrmw sub i64 addrspace(3)* @local_var64, i64 %zext acq_rel 2652 store i64 %old, i64 addrspace(1)* %out 2653 ret void 2654} 2655 2656; GFX8MORE32: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 31 2657; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]] 2658; GFX8MORE: ds_and_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]] 2659define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) { 2660; 2661; 2662; GFX7LESS-LABEL: and_i32_varying: 2663; GFX7LESS: ; %bb.0: ; %entry 2664; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], 
s[0:1], 0x9 2665; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 2666; GFX7LESS-NEXT: s_mov_b32 m0, -1 2667; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2668; GFX7LESS-NEXT: ds_and_rtn_b32 v0, v1, v0 2669; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2670; GFX7LESS-NEXT: buffer_wbinvl1 2671; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 2672; GFX7LESS-NEXT: s_mov_b32 s2, -1 2673; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 2674; GFX7LESS-NEXT: s_endpgm 2675; 2676; GFX8-LABEL: and_i32_varying: 2677; GFX8: ; %bb.0: ; %entry 2678; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2679; GFX8-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 2680; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, s2, 0 2681; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, s3, v3 2682; GFX8-NEXT: v_mov_b32_e32 v2, v0 2683; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 2684; GFX8-NEXT: v_mov_b32_e32 v1, -1 2685; GFX8-NEXT: s_mov_b64 exec, s[2:3] 2686; GFX8-NEXT: s_not_b64 exec, exec 2687; GFX8-NEXT: v_mov_b32_e32 v2, -1 2688; GFX8-NEXT: s_not_b64 exec, exec 2689; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 2690; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 2691; GFX8-NEXT: s_nop 1 2692; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 2693; GFX8-NEXT: s_nop 1 2694; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 2695; GFX8-NEXT: s_nop 1 2696; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 2697; GFX8-NEXT: s_nop 1 2698; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 2699; GFX8-NEXT: s_nop 1 2700; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 2701; GFX8-NEXT: v_readlane_b32 s2, v2, 63 2702; GFX8-NEXT: s_nop 0 2703; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 2704; GFX8-NEXT: s_mov_b64 exec, s[4:5] 2705; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 2706; GFX8-NEXT: ; implicit-def: $vgpr0 2707; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 2708; GFX8-NEXT: 
s_cbranch_execz BB14_2 2709; GFX8-NEXT: ; %bb.1: 2710; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 2711; GFX8-NEXT: v_mov_b32_e32 v3, s2 2712; GFX8-NEXT: s_mov_b32 m0, -1 2713; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2714; GFX8-NEXT: ds_and_rtn_b32 v0, v0, v3 2715; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2716; GFX8-NEXT: buffer_wbinvl1_vol 2717; GFX8-NEXT: BB14_2: 2718; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 2719; GFX8-NEXT: v_readfirstlane_b32 s2, v0 2720; GFX8-NEXT: v_mov_b32_e32 v0, v1 2721; GFX8-NEXT: v_and_b32_e32 v0, s2, v0 2722; GFX8-NEXT: s_mov_b32 s3, 0xf000 2723; GFX8-NEXT: s_mov_b32 s2, -1 2724; GFX8-NEXT: s_nop 0 2725; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2726; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 2727; GFX8-NEXT: s_endpgm 2728; 2729; GFX9-LABEL: and_i32_varying: 2730; GFX9: ; %bb.0: ; %entry 2731; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2732; GFX9-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 2733; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, s2, 0 2734; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, s3, v3 2735; GFX9-NEXT: v_mov_b32_e32 v2, v0 2736; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 2737; GFX9-NEXT: v_mov_b32_e32 v1, -1 2738; GFX9-NEXT: s_mov_b64 exec, s[2:3] 2739; GFX9-NEXT: s_not_b64 exec, exec 2740; GFX9-NEXT: v_mov_b32_e32 v2, -1 2741; GFX9-NEXT: s_not_b64 exec, exec 2742; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 2743; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 2744; GFX9-NEXT: s_nop 1 2745; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 2746; GFX9-NEXT: s_nop 1 2747; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 2748; GFX9-NEXT: s_nop 1 2749; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 2750; GFX9-NEXT: s_nop 1 2751; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 2752; GFX9-NEXT: s_nop 1 2753; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 2754; GFX9-NEXT: v_readlane_b32 s2, v2, 63 2755; 
GFX9-NEXT: s_nop 0 2756; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 2757; GFX9-NEXT: s_mov_b64 exec, s[4:5] 2758; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 2759; GFX9-NEXT: ; implicit-def: $vgpr0 2760; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 2761; GFX9-NEXT: s_cbranch_execz BB14_2 2762; GFX9-NEXT: ; %bb.1: 2763; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 2764; GFX9-NEXT: v_mov_b32_e32 v3, s2 2765; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2766; GFX9-NEXT: ds_and_rtn_b32 v0, v0, v3 2767; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2768; GFX9-NEXT: buffer_wbinvl1_vol 2769; GFX9-NEXT: BB14_2: 2770; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 2771; GFX9-NEXT: v_readfirstlane_b32 s2, v0 2772; GFX9-NEXT: v_mov_b32_e32 v0, v1 2773; GFX9-NEXT: v_and_b32_e32 v0, s2, v0 2774; GFX9-NEXT: s_mov_b32 s3, 0xf000 2775; GFX9-NEXT: s_mov_b32 s2, -1 2776; GFX9-NEXT: s_nop 0 2777; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2778; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 2779; GFX9-NEXT: s_endpgm 2780; 2781; GFX1064-LABEL: and_i32_varying: 2782; GFX1064: ; %bb.0: ; %entry 2783; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2784; GFX1064-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 2785; GFX1064-NEXT: v_mov_b32_e32 v2, v0 2786; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v4, s2, 0 2787; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v4, s3, v4 2788; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 2789; GFX1064-NEXT: v_mov_b32_e32 v1, -1 2790; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 2791; GFX1064-NEXT: s_not_b64 exec, exec 2792; GFX1064-NEXT: v_mov_b32_e32 v2, -1 2793; GFX1064-NEXT: s_not_b64 exec, exec 2794; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 2795; GFX1064-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 2796; GFX1064-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 2797; GFX1064-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 2798; GFX1064-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 2799; GFX1064-NEXT: 
v_mov_b32_e32 v3, v2 2800; GFX1064-NEXT: v_permlanex16_b32 v3, v3, -1, -1 2801; GFX1064-NEXT: v_and_b32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 2802; GFX1064-NEXT: v_readlane_b32 s2, v2, 31 2803; GFX1064-NEXT: v_mov_b32_e32 v3, s2 2804; GFX1064-NEXT: v_and_b32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 2805; GFX1064-NEXT: v_readlane_b32 s2, v2, 15 2806; GFX1064-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf 2807; GFX1064-NEXT: v_readlane_b32 s3, v2, 31 2808; GFX1064-NEXT: v_readlane_b32 s6, v2, 47 2809; GFX1064-NEXT: v_writelane_b32 v1, s2, 16 2810; GFX1064-NEXT: s_mov_b32 s2, -1 2811; GFX1064-NEXT: v_writelane_b32 v1, s3, 32 2812; GFX1064-NEXT: v_readlane_b32 s3, v2, 63 2813; GFX1064-NEXT: v_writelane_b32 v1, s6, 48 2814; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 2815; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 2816; GFX1064-NEXT: ; implicit-def: $vgpr0 2817; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 2818; GFX1064-NEXT: s_cbranch_execz BB14_2 2819; GFX1064-NEXT: ; %bb.1: 2820; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 2821; GFX1064-NEXT: v_mov_b32_e32 v7, s3 2822; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2823; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 2824; GFX1064-NEXT: ds_and_rtn_b32 v0, v0, v7 2825; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2826; GFX1064-NEXT: buffer_gl0_inv 2827; GFX1064-NEXT: buffer_gl1_inv 2828; GFX1064-NEXT: BB14_2: 2829; GFX1064-NEXT: v_nop 2830; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 2831; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 2832; GFX1064-NEXT: v_mov_b32_e32 v0, v1 2833; GFX1064-NEXT: v_and_b32_e32 v0, s3, v0 2834; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 2835; GFX1064-NEXT: s_nop 1 2836; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2837; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 2838; GFX1064-NEXT: s_endpgm 2839; 2840; GFX1032-LABEL: and_i32_varying: 2841; GFX1032: ; %bb.0: ; %entry 2842; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2843; GFX1032-NEXT: 
v_cmp_ne_u32_e64 s2, 1, 0 2844; GFX1032-NEXT: ; implicit-def: $vcc_hi 2845; GFX1032-NEXT: v_mov_b32_e32 v2, v0 2846; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v4, s2, 0 2847; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 2848; GFX1032-NEXT: v_mov_b32_e32 v1, -1 2849; GFX1032-NEXT: s_mov_b32 exec_lo, s2 2850; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 2851; GFX1032-NEXT: v_mov_b32_e32 v2, -1 2852; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 2853; GFX1032-NEXT: s_or_saveexec_b32 s4, -1 2854; GFX1032-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 2855; GFX1032-NEXT: s_mov_b32 s2, -1 2856; GFX1032-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 2857; GFX1032-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 2858; GFX1032-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 2859; GFX1032-NEXT: v_mov_b32_e32 v3, v2 2860; GFX1032-NEXT: v_permlanex16_b32 v3, v3, -1, -1 2861; GFX1032-NEXT: v_and_b32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 2862; GFX1032-NEXT: v_readlane_b32 s3, v2, 31 2863; GFX1032-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf 2864; GFX1032-NEXT: v_readlane_b32 s5, v2, 15 2865; GFX1032-NEXT: v_writelane_b32 v1, s5, 16 2866; GFX1032-NEXT: s_mov_b32 exec_lo, s4 2867; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 2868; GFX1032-NEXT: ; implicit-def: $vgpr0 2869; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo 2870; GFX1032-NEXT: s_cbranch_execz BB14_2 2871; GFX1032-NEXT: ; %bb.1: 2872; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 2873; GFX1032-NEXT: v_mov_b32_e32 v7, s3 2874; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2875; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 2876; GFX1032-NEXT: ds_and_rtn_b32 v0, v0, v7 2877; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2878; GFX1032-NEXT: buffer_gl0_inv 2879; GFX1032-NEXT: buffer_gl1_inv 2880; GFX1032-NEXT: BB14_2: 2881; GFX1032-NEXT: v_nop 2882; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 2883; GFX1032-NEXT: 
v_readfirstlane_b32 s3, v0 2884; GFX1032-NEXT: v_mov_b32_e32 v0, v1 2885; GFX1032-NEXT: v_and_b32_e32 v0, s3, v0 2886; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 2887; GFX1032-NEXT: s_nop 1 2888; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2889; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 2890; GFX1032-NEXT: s_endpgm 2891entry: 2892 %lane = call i32 @llvm.amdgcn.workitem.id.x() 2893 %old = atomicrmw and i32 addrspace(3)* @local_var32, i32 %lane acq_rel 2894 store i32 %old, i32 addrspace(1)* %out 2895 ret void 2896} 2897 2898; GFX8MORE32: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 31 2899; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]] 2900; GFX8MORE: ds_or_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]] 2901define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) { 2902; 2903; 2904; GFX7LESS-LABEL: or_i32_varying: 2905; GFX7LESS: ; %bb.0: ; %entry 2906; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 2907; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 2908; GFX7LESS-NEXT: s_mov_b32 m0, -1 2909; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2910; GFX7LESS-NEXT: ds_or_rtn_b32 v0, v1, v0 2911; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2912; GFX7LESS-NEXT: buffer_wbinvl1 2913; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 2914; GFX7LESS-NEXT: s_mov_b32 s2, -1 2915; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 2916; GFX7LESS-NEXT: s_endpgm 2917; 2918; GFX8-LABEL: or_i32_varying: 2919; GFX8: ; %bb.0: ; %entry 2920; GFX8-NEXT: v_mov_b32_e32 v2, v0 2921; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 2922; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2923; GFX8-NEXT: v_mov_b32_e32 v1, 0 2924; GFX8-NEXT: s_mov_b64 exec, s[2:3] 2925; GFX8-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 2926; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 2927; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 2928; GFX8-NEXT: s_not_b64 exec, exec 2929; GFX8-NEXT: v_mov_b32_e32 v2, 0 2930; GFX8-NEXT: s_not_b64 exec, exec 2931; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 2932; 
GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 2933; GFX8-NEXT: s_nop 1 2934; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 2935; GFX8-NEXT: s_nop 1 2936; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 2937; GFX8-NEXT: s_nop 1 2938; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 2939; GFX8-NEXT: s_nop 1 2940; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 2941; GFX8-NEXT: s_nop 1 2942; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 2943; GFX8-NEXT: v_readlane_b32 s2, v2, 63 2944; GFX8-NEXT: s_nop 0 2945; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 2946; GFX8-NEXT: s_mov_b64 exec, s[4:5] 2947; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2948; GFX8-NEXT: ; implicit-def: $vgpr0 2949; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 2950; GFX8-NEXT: s_cbranch_execz BB15_2 2951; GFX8-NEXT: ; %bb.1: 2952; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 2953; GFX8-NEXT: v_mov_b32_e32 v3, s2 2954; GFX8-NEXT: s_mov_b32 m0, -1 2955; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2956; GFX8-NEXT: ds_or_rtn_b32 v0, v0, v3 2957; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2958; GFX8-NEXT: buffer_wbinvl1_vol 2959; GFX8-NEXT: BB15_2: 2960; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 2961; GFX8-NEXT: v_readfirstlane_b32 s2, v0 2962; GFX8-NEXT: v_mov_b32_e32 v0, v1 2963; GFX8-NEXT: v_or_b32_e32 v0, s2, v0 2964; GFX8-NEXT: s_mov_b32 s3, 0xf000 2965; GFX8-NEXT: s_mov_b32 s2, -1 2966; GFX8-NEXT: s_nop 0 2967; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2968; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 2969; GFX8-NEXT: s_endpgm 2970; 2971; GFX9-LABEL: or_i32_varying: 2972; GFX9: ; %bb.0: ; %entry 2973; GFX9-NEXT: v_mov_b32_e32 v2, v0 2974; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 2975; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2976; GFX9-NEXT: v_mov_b32_e32 v1, 0 2977; GFX9-NEXT: 
s_mov_b64 exec, s[2:3] 2978; GFX9-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 2979; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 2980; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 2981; GFX9-NEXT: s_not_b64 exec, exec 2982; GFX9-NEXT: v_mov_b32_e32 v2, 0 2983; GFX9-NEXT: s_not_b64 exec, exec 2984; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 2985; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 2986; GFX9-NEXT: s_nop 1 2987; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 2988; GFX9-NEXT: s_nop 1 2989; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 2990; GFX9-NEXT: s_nop 1 2991; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 2992; GFX9-NEXT: s_nop 1 2993; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 2994; GFX9-NEXT: s_nop 1 2995; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 2996; GFX9-NEXT: v_readlane_b32 s2, v2, 63 2997; GFX9-NEXT: s_nop 0 2998; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 2999; GFX9-NEXT: s_mov_b64 exec, s[4:5] 3000; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3001; GFX9-NEXT: ; implicit-def: $vgpr0 3002; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 3003; GFX9-NEXT: s_cbranch_execz BB15_2 3004; GFX9-NEXT: ; %bb.1: 3005; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 3006; GFX9-NEXT: v_mov_b32_e32 v3, s2 3007; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3008; GFX9-NEXT: ds_or_rtn_b32 v0, v0, v3 3009; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3010; GFX9-NEXT: buffer_wbinvl1_vol 3011; GFX9-NEXT: BB15_2: 3012; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 3013; GFX9-NEXT: v_readfirstlane_b32 s2, v0 3014; GFX9-NEXT: v_mov_b32_e32 v0, v1 3015; GFX9-NEXT: v_or_b32_e32 v0, s2, v0 3016; GFX9-NEXT: s_mov_b32 s3, 0xf000 3017; GFX9-NEXT: s_mov_b32 s2, -1 3018; GFX9-NEXT: s_nop 0 3019; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3020; GFX9-NEXT: buffer_store_dword v0, off, 
s[0:3], 0 3021; GFX9-NEXT: s_endpgm 3022; 3023; GFX1064-LABEL: or_i32_varying: 3024; GFX1064: ; %bb.0: ; %entry 3025; GFX1064-NEXT: v_mov_b32_e32 v2, v0 3026; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 3027; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3028; GFX1064-NEXT: v_mov_b32_e32 v1, 0 3029; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 3030; GFX1064-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 3031; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 3032; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s3, v0 3033; GFX1064-NEXT: s_not_b64 exec, exec 3034; GFX1064-NEXT: v_mov_b32_e32 v2, 0 3035; GFX1064-NEXT: s_not_b64 exec, exec 3036; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 3037; GFX1064-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 3038; GFX1064-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 3039; GFX1064-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 3040; GFX1064-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 3041; GFX1064-NEXT: v_mov_b32_e32 v3, v2 3042; GFX1064-NEXT: v_permlanex16_b32 v3, v3, -1, -1 3043; GFX1064-NEXT: v_or_b32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 3044; GFX1064-NEXT: v_readlane_b32 s2, v2, 31 3045; GFX1064-NEXT: v_mov_b32_e32 v3, s2 3046; GFX1064-NEXT: v_or_b32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 3047; GFX1064-NEXT: v_readlane_b32 s2, v2, 15 3048; GFX1064-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3049; GFX1064-NEXT: v_readlane_b32 s3, v2, 31 3050; GFX1064-NEXT: v_readlane_b32 s6, v2, 47 3051; GFX1064-NEXT: v_writelane_b32 v1, s2, 16 3052; GFX1064-NEXT: s_mov_b32 s2, -1 3053; GFX1064-NEXT: v_writelane_b32 v1, s3, 32 3054; GFX1064-NEXT: v_readlane_b32 s3, v2, 63 3055; GFX1064-NEXT: v_writelane_b32 v1, s6, 48 3056; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 3057; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3058; GFX1064-NEXT: ; implicit-def: $vgpr0 3059; 
GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 3060; GFX1064-NEXT: s_cbranch_execz BB15_2 3061; GFX1064-NEXT: ; %bb.1: 3062; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 3063; GFX1064-NEXT: v_mov_b32_e32 v7, s3 3064; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3065; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 3066; GFX1064-NEXT: ds_or_rtn_b32 v0, v0, v7 3067; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3068; GFX1064-NEXT: buffer_gl0_inv 3069; GFX1064-NEXT: buffer_gl1_inv 3070; GFX1064-NEXT: BB15_2: 3071; GFX1064-NEXT: v_nop 3072; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 3073; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 3074; GFX1064-NEXT: v_mov_b32_e32 v0, v1 3075; GFX1064-NEXT: v_or_b32_e32 v0, s3, v0 3076; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 3077; GFX1064-NEXT: s_nop 1 3078; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3079; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 3080; GFX1064-NEXT: s_endpgm 3081; 3082; GFX1032-LABEL: or_i32_varying: 3083; GFX1032: ; %bb.0: ; %entry 3084; GFX1032-NEXT: ; implicit-def: $vcc_hi 3085; GFX1032-NEXT: v_mov_b32_e32 v2, v0 3086; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3087; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3088; GFX1032-NEXT: v_mov_b32_e32 v1, 0 3089; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3090; GFX1032-NEXT: v_cmp_ne_u32_e64 s2, 1, 0 3091; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 3092; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 3093; GFX1032-NEXT: v_mov_b32_e32 v2, 0 3094; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 3095; GFX1032-NEXT: s_or_saveexec_b32 s4, -1 3096; GFX1032-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 3097; GFX1032-NEXT: s_mov_b32 s2, -1 3098; GFX1032-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 3099; GFX1032-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 3100; GFX1032-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 3101; GFX1032-NEXT: v_mov_b32_e32 v3, v2 3102; 
GFX1032-NEXT: v_permlanex16_b32 v3, v3, -1, -1
; GFX1032-NEXT: v_or_b32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1032-NEXT: v_readlane_b32 s3, v2, 31
; GFX1032-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT: v_readlane_b32 s5, v2, 15
; GFX1032-NEXT: v_writelane_b32 v1, s5, 16
; GFX1032-NEXT: s_mov_b32 exec_lo, s4
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: ; implicit-def: $vgpr0
; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
; GFX1032-NEXT: s_cbranch_execz BB15_2
; GFX1032-NEXT: ; %bb.1:
; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
; GFX1032-NEXT: v_mov_b32_e32 v7, s3
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1032-NEXT: ds_or_rtn_b32 v0, v0, v7
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: buffer_gl1_inv
; GFX1032-NEXT: BB15_2:
; GFX1032-NEXT: v_nop
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX1032-NEXT: v_readfirstlane_b32 s3, v0
; GFX1032-NEXT: v_mov_b32_e32 v0, v1
; GFX1032-NEXT: v_or_b32_e32 v0, s3, v0
; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
; GFX1032-NEXT: s_nop 1
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX1032-NEXT: s_endpgm
entry:
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %old = atomicrmw or i32 addrspace(3)* @local_var32, i32 %lane acq_rel
  store i32 %old, i32 addrspace(1)* %out
  ret void
}

; xor_i32_varying: every work-item XORs its own workitem.id.x into @local_var32
; (acq_rel) and stores the returned old value to %out.
;
; Hand-written checks with the prefixes GFX8MORE32 and GFX8MORE used to sit
; here. No RUN line enables those prefixes, so FileCheck never evaluated them.
; What they described is pinned by the autogenerated checks below: on the
; targets whose output contains the v_xor_b32_dpp sequence (GFX8, GFX9, GFX1064,
; GFX1032) the combined value is read with v_readlane_b32 (lane 63 for the
; wave64 outputs, lane 31 for the wave32 output), copied into a VGPR with
; v_mov_b32 and used as the data operand of a single ds_xor_rtn_b32.
define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) {
;
;
; GFX7LESS-LABEL: xor_i32_varying:
; GFX7LESS: ; %bb.0: ; %entry
; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo
; GFX7LESS-NEXT: s_mov_b32 m0, -1
; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7LESS-NEXT: ds_xor_rtn_b32 v0, v1, v0
; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7LESS-NEXT: buffer_wbinvl1
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX7LESS-NEXT: s_endpgm
;
; GFX8-LABEL: xor_i32_varying:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NEXT: s_mov_b64 exec, s[2:3]
; GFX8-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX8-NEXT: s_not_b64 exec, exec
; GFX8-NEXT: v_mov_b32_e32 v2, 0
; GFX8-NEXT: s_not_b64 exec, exec
; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
; GFX8-NEXT: s_nop 1
; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
; GFX8-NEXT: s_nop 1
; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
; GFX8-NEXT: s_nop 1
; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
; GFX8-NEXT: s_nop 1
; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
; GFX8-NEXT: s_nop 1
; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
; GFX8-NEXT: v_readlane_b32 s2, v2, 63
; GFX8-NEXT: s_nop 0
; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX8-NEXT: s_mov_b64 exec, s[4:5]
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX8-NEXT: ; implicit-def: $vgpr0
; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX8-NEXT: s_cbranch_execz BB16_2
; GFX8-NEXT: ; %bb.1:
; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
; GFX8-NEXT: v_mov_b32_e32 v3, s2
; GFX8-NEXT: s_mov_b32 m0, -1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: ds_xor_rtn_b32 v0, v0, v3
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: BB16_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: v_readfirstlane_b32 s2, v0
; GFX8-NEXT: v_mov_b32_e32 v0, v1
; GFX8-NEXT: v_xor_b32_e32 v0, s2, v0
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
; GFX8-NEXT: s_nop 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: xor_i32_varying:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: v_mov_b32_e32 v2, v0
; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_mov_b64 exec, s[2:3]
; GFX9-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX9-NEXT: s_not_b64 exec, exec
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_not_b64 exec, exec
; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
; GFX9-NEXT: s_nop 1
; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
; GFX9-NEXT: s_nop 1
; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
; GFX9-NEXT: s_nop 1
; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
; GFX9-NEXT: s_nop 1
; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
; GFX9-NEXT: s_nop 1
; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
; GFX9-NEXT: v_readlane_b32 s2, v2, 63
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: ; implicit-def: $vgpr0
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: s_cbranch_execz BB16_2
; GFX9-NEXT: ; %bb.1:
; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
; GFX9-NEXT: v_mov_b32_e32 v3, s2
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: ds_xor_rtn_b32 v0, v0, v3
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: BB16_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: v_readfirstlane_b32 s2, v0
; GFX9-NEXT: v_mov_b32_e32 v0, v1
; GFX9-NEXT: v_xor_b32_e32 v0, s2, v0
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: xor_i32_varying:
; GFX1064: ; %bb.0: ; %entry
; GFX1064-NEXT: v_mov_b32_e32 v2, v0
; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1064-NEXT: v_mov_b32_e32 v1, 0
; GFX1064-NEXT: s_mov_b64 exec, s[2:3]
; GFX1064-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0
; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0
; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s3, v0
; GFX1064-NEXT: s_not_b64 exec, exec
; GFX1064-NEXT: v_mov_b32_e32 v2, 0
; GFX1064-NEXT: s_not_b64 exec, exec
; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX1064-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
; GFX1064-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
; GFX1064-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
; GFX1064-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
; GFX1064-NEXT: v_mov_b32_e32 v3, v2
; GFX1064-NEXT: v_permlanex16_b32 v3, v3, -1, -1
; GFX1064-NEXT: v_xor_b32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1064-NEXT: v_readlane_b32 s2, v2, 31
; GFX1064-NEXT: v_mov_b32_e32 v3, s2
; GFX1064-NEXT: v_xor_b32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
; GFX1064-NEXT: v_readlane_b32 s2, v2, 15
; GFX1064-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1064-NEXT: v_readlane_b32 s3, v2, 31
; GFX1064-NEXT: v_readlane_b32 s6, v2, 47
; GFX1064-NEXT: v_writelane_b32 v1, s2, 16
; GFX1064-NEXT: s_mov_b32 s2, -1
; GFX1064-NEXT: v_writelane_b32 v1, s3, 32
; GFX1064-NEXT: v_readlane_b32 s3, v2, 63
; GFX1064-NEXT: v_writelane_b32 v1, s6, 48
; GFX1064-NEXT: s_mov_b64 exec, s[4:5]
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT: ; implicit-def: $vgpr0
; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1064-NEXT: s_cbranch_execz BB16_2
; GFX1064-NEXT: ; %bb.1:
; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
; GFX1064-NEXT: v_mov_b32_e32 v7, s3
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1064-NEXT: ds_xor_rtn_b32 v0, v0, v7
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: buffer_gl1_inv
; GFX1064-NEXT: BB16_2:
; GFX1064-NEXT: v_nop
; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX1064-NEXT: v_readfirstlane_b32 s3, v0
; GFX1064-NEXT: v_mov_b32_e32 v0, v1
; GFX1064-NEXT: v_xor_b32_e32 v0, s3, v0
; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
; GFX1064-NEXT: s_nop 1
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: xor_i32_varying:
; GFX1032: ; %bb.0: ; %entry
; GFX1032-NEXT: ; implicit-def: $vcc_hi
; GFX1032-NEXT: v_mov_b32_e32 v2, v0
; GFX1032-NEXT: s_or_saveexec_b32 s2, -1
; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1032-NEXT: v_mov_b32_e32 v1, 0
; GFX1032-NEXT: s_mov_b32 exec_lo, s2
; GFX1032-NEXT: v_cmp_ne_u32_e64 s2, 1, 0
; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0
; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1032-NEXT: v_mov_b32_e32 v2, 0
; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1032-NEXT: s_or_saveexec_b32 s4, -1
; GFX1032-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
; GFX1032-NEXT: s_mov_b32 s2, -1
; GFX1032-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
; GFX1032-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
; GFX1032-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
; GFX1032-NEXT: v_mov_b32_e32 v3, v2
; GFX1032-NEXT: v_permlanex16_b32 v3, v3, -1, -1
; GFX1032-NEXT: v_xor_b32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1032-NEXT: v_readlane_b32 s3, v2, 31
; GFX1032-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT: v_readlane_b32 s5, v2, 15
; GFX1032-NEXT: v_writelane_b32 v1, s5, 16
; GFX1032-NEXT: s_mov_b32 exec_lo, s4
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: ; implicit-def: $vgpr0
; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
; GFX1032-NEXT: s_cbranch_execz BB16_2
; GFX1032-NEXT: ; %bb.1:
; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
; GFX1032-NEXT: v_mov_b32_e32 v7, s3
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1032-NEXT: ds_xor_rtn_b32 v0, v0, v7
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: buffer_gl1_inv
; GFX1032-NEXT: BB16_2:
; GFX1032-NEXT: v_nop
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX1032-NEXT: v_readfirstlane_b32 s3, v0
; GFX1032-NEXT: v_mov_b32_e32 v0, v1
; GFX1032-NEXT: v_xor_b32_e32 v0, s3, v0
; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
; GFX1032-NEXT: s_nop 1
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX1032-NEXT: s_endpgm
entry:
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %old = atomicrmw xor i32 addrspace(3)* @local_var32, i32 %lane acq_rel
  store i32 %old, i32 addrspace(1)* %out
  ret void
}

; max_i32_varying: every work-item applies a signed max of its own
; workitem.id.x to @local_var32 (acq_rel) and stores the returned old value to
; %out.
;
; Hand-written checks with the prefixes GFX8MORE32 and GFX8MORE used to sit
; here. No RUN line enables those prefixes, so FileCheck never evaluated them.
; What they described is pinned by the autogenerated checks below: on the
; targets whose output contains the v_max_i32_dpp sequence (GFX8, GFX9, GFX1064,
; GFX1032) the combined value is read with v_readlane_b32 (lane 63 for the
; wave64 outputs, lane 31 for the wave32 output), copied into a VGPR with
; v_mov_b32 and used as the data operand of a single ds_max_rtn_i32.
define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) {
;
;
; GFX7LESS-LABEL: max_i32_varying:
; GFX7LESS: ; %bb.0: ; %entry
; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo
; GFX7LESS-NEXT: s_mov_b32 m0, -1
; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7LESS-NEXT: ds_max_rtn_i32 v0, v1, v0
; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7LESS-NEXT: buffer_wbinvl1
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX7LESS-NEXT: s_endpgm
;
; GFX8-LABEL: max_i32_varying:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, s2, 0
; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, s3, v3
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX8-NEXT: v_bfrev_b32_e32 v1, 1
; GFX8-NEXT: s_mov_b64 exec, s[2:3]
; GFX8-NEXT: s_not_b64 exec, exec
; GFX8-NEXT: v_mov_b32_e32 v2, v1
; GFX8-NEXT: s_not_b64 exec, exec
; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX8-NEXT: s_nop 1
; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX8-NEXT: s_nop 1
; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX8-NEXT: s_nop 1
; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX8-NEXT: s_nop 1
; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
; GFX8-NEXT: s_nop 1
; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
; GFX8-NEXT: v_readlane_b32 s2, v2, 63
; GFX8-NEXT: s_nop 0
; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX8-NEXT: s_mov_b64 exec, s[4:5]
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
; GFX8-NEXT: ; implicit-def: $vgpr0
; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX8-NEXT: s_cbranch_execz BB17_2
; GFX8-NEXT: ; %bb.1:
; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
; GFX8-NEXT: v_mov_b32_e32 v3, s2
; GFX8-NEXT: s_mov_b32 m0, -1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: ds_max_rtn_i32 v0, v0, v3
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: BB17_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: v_readfirstlane_b32 s2, v0
; GFX8-NEXT: v_mov_b32_e32 v0, v1
; GFX8-NEXT: v_max_i32_e32 v0, s2, v0
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
; GFX8-NEXT: s_nop 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: max_i32_varying:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, s2, 0
; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, s3, v3
; GFX9-NEXT: v_mov_b32_e32 v2, v0
; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX9-NEXT: v_bfrev_b32_e32 v1, 1
; GFX9-NEXT: s_mov_b64 exec, s[2:3]
; GFX9-NEXT: s_not_b64 exec, exec
; GFX9-NEXT: v_mov_b32_e32 v2, v1
; GFX9-NEXT: s_not_b64 exec, exec
; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX9-NEXT: s_nop 1
; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX9-NEXT: s_nop 1
; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX9-NEXT: s_nop 1
; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX9-NEXT: s_nop 1
; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
; GFX9-NEXT: s_nop 1
; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
; GFX9-NEXT: v_readlane_b32 s2, v2, 63
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
; GFX9-NEXT: ; implicit-def: $vgpr0
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: s_cbranch_execz BB17_2
; GFX9-NEXT: ; %bb.1:
; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
; GFX9-NEXT: v_mov_b32_e32 v3, s2
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: ds_max_rtn_i32 v0, v0, v3
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: BB17_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: v_readfirstlane_b32 s2, v0
; GFX9-NEXT: v_mov_b32_e32 v0, v1
; GFX9-NEXT: v_max_i32_e32 v0, s2, v0
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: max_i32_varying:
; GFX1064: ; %bb.0: ; %entry
; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1064-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0
; GFX1064-NEXT: v_mov_b32_e32 v2, v0
; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v4, s2, 0
; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v4, s3, v4
; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX1064-NEXT: v_bfrev_b32_e32 v1, 1
; GFX1064-NEXT: s_mov_b64 exec, s[2:3]
; GFX1064-NEXT: s_not_b64 exec, exec
; GFX1064-NEXT: v_mov_b32_e32 v2, v1
; GFX1064-NEXT: s_not_b64 exec, exec
; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX1064-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1064-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1064-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1064-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX1064-NEXT: v_mov_b32_e32 v3, v2
; GFX1064-NEXT: v_permlanex16_b32 v3, v3, -1, -1
; GFX1064-NEXT: v_max_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1064-NEXT: v_readlane_b32 s2, v2, 31
; GFX1064-NEXT: v_mov_b32_e32 v3, s2
; GFX1064-NEXT: v_max_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
; GFX1064-NEXT: v_readlane_b32 s2, v2, 15
; GFX1064-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1064-NEXT: v_readlane_b32 s3, v2, 31
; GFX1064-NEXT: v_readlane_b32 s6, v2, 47
; GFX1064-NEXT: v_writelane_b32 v1, s2, 16
; GFX1064-NEXT: s_mov_b32 s2, -1
; GFX1064-NEXT: v_writelane_b32 v1, s3, 32
; GFX1064-NEXT: v_readlane_b32 s3, v2, 63
; GFX1064-NEXT: v_writelane_b32 v1, s6, 48
; GFX1064-NEXT: s_mov_b64 exec, s[4:5]
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
; GFX1064-NEXT: ; implicit-def: $vgpr0
; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1064-NEXT: s_cbranch_execz BB17_2
; GFX1064-NEXT: ; %bb.1:
; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
; GFX1064-NEXT: v_mov_b32_e32 v7, s3
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1064-NEXT: ds_max_rtn_i32 v0, v0, v7
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: buffer_gl1_inv
; GFX1064-NEXT: BB17_2:
; GFX1064-NEXT: v_nop
; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX1064-NEXT: v_readfirstlane_b32 s3, v0
; GFX1064-NEXT: v_mov_b32_e32 v0, v1
; GFX1064-NEXT: v_max_i32_e32 v0, s3, v0
; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
; GFX1064-NEXT: s_nop 1
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: max_i32_varying:
; GFX1032: ; %bb.0: ; %entry
; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1032-NEXT: v_cmp_ne_u32_e64 s2, 1, 0
; GFX1032-NEXT: ; implicit-def: $vcc_hi
; GFX1032-NEXT: v_mov_b32_e32 v2, v0
; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v4, s2, 0
; GFX1032-NEXT: s_or_saveexec_b32 s2, -1
; GFX1032-NEXT: v_bfrev_b32_e32 v1, 1
; GFX1032-NEXT: s_mov_b32 exec_lo, s2
; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1032-NEXT: v_mov_b32_e32 v2, v1
; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1032-NEXT: s_or_saveexec_b32 s4, -1
; GFX1032-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT: s_mov_b32 s2, -1
; GFX1032-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT: v_mov_b32_e32 v3, v2
; GFX1032-NEXT: v_permlanex16_b32 v3, v3, -1, -1
; GFX1032-NEXT: v_max_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1032-NEXT: v_readlane_b32 s3, v2, 31
; GFX1032-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT: v_readlane_b32 s5, v2, 15
; GFX1032-NEXT: v_writelane_b32 v1, s5, 16
; GFX1032-NEXT: s_mov_b32 exec_lo, s4
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4
; GFX1032-NEXT: ; implicit-def: $vgpr0
; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
; GFX1032-NEXT: s_cbranch_execz BB17_2
; GFX1032-NEXT: ; %bb.1:
; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
; GFX1032-NEXT: v_mov_b32_e32 v7, s3
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1032-NEXT: ds_max_rtn_i32 v0, v0, v7
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: buffer_gl1_inv
; GFX1032-NEXT: BB17_2:
; GFX1032-NEXT: v_nop
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX1032-NEXT: v_readfirstlane_b32 s3, v0
; GFX1032-NEXT: v_mov_b32_e32 v0, v1
; GFX1032-NEXT: v_max_i32_e32 v0, s3, v0
; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
; GFX1032-NEXT: s_nop 1
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX1032-NEXT: s_endpgm
entry:
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %old = atomicrmw max i32 addrspace(3)* @local_var32, i32 %lane acq_rel
  store i32 %old, i32 addrspace(1)* %out
  ret void
}

; max_i64_constant: signed 64-bit atomicrmw max of the constant 5 into
; @local_var64 (acq_rel); the returned old value is stored to %out. The checks
; below contain no DPP sequence for this constant operand: the ds_max_rtn_i64
; sits in the block entered only where the v_mbcnt result compares equal to 0,
; and the old value is then read back with v_readfirstlane_b32.
define amdgpu_kernel void @max_i64_constant(i64 addrspace(1)* %out) {
;
;
; GFX7LESS-LABEL: max_i64_constant:
; GFX7LESS: ; %bb.0: ; %entry
; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0
; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0
; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0
; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX7LESS-NEXT: s_cbranch_execz BB18_2
; GFX7LESS-NEXT: ; %bb.1:
; GFX7LESS-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo
; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5
; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0
; GFX7LESS-NEXT: s_mov_b32 m0, -1
; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7LESS-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1]
; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7LESS-NEXT: buffer_wbinvl1
; GFX7LESS-NEXT: BB18_2:
; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0
; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1
; GFX7LESS-NEXT: v_bfrev_b32_e32 v1, 1
; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; GFX7LESS-NEXT: v_mov_b32_e32 v2, s5
; GFX7LESS-NEXT: v_mov_b32_e32 v3, s4
; GFX7LESS-NEXT: v_cmp_gt_i64_e32 vcc, s[4:5], v[0:1]
; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX7LESS-NEXT: s_endpgm
;
; GFX8-LABEL: max_i64_constant:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX8-NEXT: s_cbranch_execz BB18_2
; GFX8-NEXT: ; %bb.1:
; GFX8-NEXT: v_mov_b32_e32 v0, 5
; GFX8-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo
; GFX8-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NEXT: s_mov_b32 m0, -1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: BB18_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX8-NEXT: v_readfirstlane_b32 s2, v0
; GFX8-NEXT: v_bfrev_b32_e32 v0, 1
; GFX8-NEXT: v_readfirstlane_b32 s3, v1
; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc
; GFX8-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1]
; GFX8-NEXT: v_mov_b32_e32 v2, s3
; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
; GFX8-NEXT: v_mov_b32_e32 v2, s2
; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: max_i64_constant:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT: s_cbranch_execz BB18_2
; GFX9-NEXT: ; %bb.1:
; GFX9-NEXT: v_mov_b32_e32 v0, 5
; GFX9-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1]
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: BB18_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT: v_readfirstlane_b32 s2, v0
; GFX9-NEXT: v_bfrev_b32_e32 v0, 1
; GFX9-NEXT: v_readfirstlane_b32 s3, v1
; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc
; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1]
; GFX9-NEXT: v_mov_b32_e32 v2, s3
; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
; GFX9-NEXT: v_mov_b32_e32 v2, s2
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: max_i64_constant:
; GFX1064: ; %bb.0: ; %entry
; GFX1064-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0
; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0
; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s3, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-NEXT: s_cbranch_execz BB18_2
; GFX1064-NEXT: ; %bb.1:
; GFX1064-NEXT: v_mov_b32_e32 v0, 5
; GFX1064-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo
; GFX1064-NEXT: v_mov_b32_e32 v1, 0
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1064-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1]
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: buffer_gl1_inv
; GFX1064-NEXT: BB18_2:
; GFX1064-NEXT: v_nop
; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX1064-NEXT: v_readfirstlane_b32 s4, v0
; GFX1064-NEXT: v_readfirstlane_b32 s5, v1
; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc
; GFX1064-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc
; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
; GFX1064-NEXT: s_mov_b32 s2, -1
; GFX1064-NEXT: v_cmp_gt_i64_e32 vcc, s[4:5], v[0:1]
; GFX1064-NEXT: v_cndmask_b32_e64 v1, v1, s5, vcc
; GFX1064-NEXT: v_cndmask_b32_e64 v0, v0, s4, vcc
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: max_i64_constant:
; GFX1032: ; %bb.0: ; %entry
; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1032-NEXT: v_cmp_ne_u32_e64 s2, 1, 0
; GFX1032-NEXT: ; implicit-def: $vcc_hi
; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-NEXT: s_cbranch_execz BB18_2
; GFX1032-NEXT: ; %bb.1:
; GFX1032-NEXT: v_mov_b32_e32 v0, 5
; GFX1032-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo
; GFX1032-NEXT: v_mov_b32_e32 v1, 0
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1032-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1]
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: buffer_gl1_inv
; GFX1032-NEXT: BB18_2:
; GFX1032-NEXT: v_nop
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2
; GFX1032-NEXT: v_readfirstlane_b32 s4, v0
; GFX1032-NEXT: v_readfirstlane_b32 s5, v1
; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc_lo
; GFX1032-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc_lo
; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
; GFX1032-NEXT: s_mov_b32 s2, -1
; GFX1032-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[4:5], v[0:1]
; GFX1032-NEXT: v_cndmask_b32_e64 v1, v1, s5, vcc_lo
; GFX1032-NEXT: v_cndmask_b32_e64 v0, v0, s4, vcc_lo
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX1032-NEXT: s_endpgm
entry:
  %old = atomicrmw max i64 addrspace(3)* @local_var64, i64 5 acq_rel
  store i64 %old, i64 addrspace(1)* %out
  ret void
}

; GFX8MORE32: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 31
; GFX8MORE:
v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]] 3816; GFX8MORE: ds_min_rtn_i32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]] 3817define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) { 3818; 3819; 3820; GFX7LESS-LABEL: min_i32_varying: 3821; GFX7LESS: ; %bb.0: ; %entry 3822; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 3823; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 3824; GFX7LESS-NEXT: s_mov_b32 m0, -1 3825; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3826; GFX7LESS-NEXT: ds_min_rtn_i32 v0, v1, v0 3827; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3828; GFX7LESS-NEXT: buffer_wbinvl1 3829; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 3830; GFX7LESS-NEXT: s_mov_b32 s2, -1 3831; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 3832; GFX7LESS-NEXT: s_endpgm 3833; 3834; GFX8-LABEL: min_i32_varying: 3835; GFX8: ; %bb.0: ; %entry 3836; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3837; GFX8-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 3838; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, s2, 0 3839; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, s3, v3 3840; GFX8-NEXT: v_mov_b32_e32 v2, v0 3841; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 3842; GFX8-NEXT: v_bfrev_b32_e32 v1, -2 3843; GFX8-NEXT: s_mov_b64 exec, s[2:3] 3844; GFX8-NEXT: s_not_b64 exec, exec 3845; GFX8-NEXT: v_mov_b32_e32 v2, v1 3846; GFX8-NEXT: s_not_b64 exec, exec 3847; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 3848; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3849; GFX8-NEXT: s_nop 1 3850; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 3851; GFX8-NEXT: s_nop 1 3852; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 3853; GFX8-NEXT: s_nop 1 3854; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 3855; GFX8-NEXT: s_nop 1 3856; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 3857; GFX8-NEXT: s_nop 1 3858; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 3859; 
GFX8-NEXT: v_readlane_b32 s2, v2, 63 3860; GFX8-NEXT: s_nop 0 3861; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 3862; GFX8-NEXT: s_mov_b64 exec, s[4:5] 3863; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 3864; GFX8-NEXT: ; implicit-def: $vgpr0 3865; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 3866; GFX8-NEXT: s_cbranch_execz BB19_2 3867; GFX8-NEXT: ; %bb.1: 3868; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 3869; GFX8-NEXT: v_mov_b32_e32 v3, s2 3870; GFX8-NEXT: s_mov_b32 m0, -1 3871; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3872; GFX8-NEXT: ds_min_rtn_i32 v0, v0, v3 3873; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3874; GFX8-NEXT: buffer_wbinvl1_vol 3875; GFX8-NEXT: BB19_2: 3876; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 3877; GFX8-NEXT: v_readfirstlane_b32 s2, v0 3878; GFX8-NEXT: v_mov_b32_e32 v0, v1 3879; GFX8-NEXT: v_min_i32_e32 v0, s2, v0 3880; GFX8-NEXT: s_mov_b32 s3, 0xf000 3881; GFX8-NEXT: s_mov_b32 s2, -1 3882; GFX8-NEXT: s_nop 0 3883; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3884; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 3885; GFX8-NEXT: s_endpgm 3886; 3887; GFX9-LABEL: min_i32_varying: 3888; GFX9: ; %bb.0: ; %entry 3889; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3890; GFX9-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 3891; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, s2, 0 3892; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, s3, v3 3893; GFX9-NEXT: v_mov_b32_e32 v2, v0 3894; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 3895; GFX9-NEXT: v_bfrev_b32_e32 v1, -2 3896; GFX9-NEXT: s_mov_b64 exec, s[2:3] 3897; GFX9-NEXT: s_not_b64 exec, exec 3898; GFX9-NEXT: v_mov_b32_e32 v2, v1 3899; GFX9-NEXT: s_not_b64 exec, exec 3900; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 3901; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3902; GFX9-NEXT: s_nop 1 3903; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 3904; GFX9-NEXT: s_nop 1 3905; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 3906; GFX9-NEXT: s_nop 1 3907; 
GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 3908; GFX9-NEXT: s_nop 1 3909; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 3910; GFX9-NEXT: s_nop 1 3911; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 3912; GFX9-NEXT: v_readlane_b32 s2, v2, 63 3913; GFX9-NEXT: s_nop 0 3914; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 3915; GFX9-NEXT: s_mov_b64 exec, s[4:5] 3916; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 3917; GFX9-NEXT: ; implicit-def: $vgpr0 3918; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 3919; GFX9-NEXT: s_cbranch_execz BB19_2 3920; GFX9-NEXT: ; %bb.1: 3921; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 3922; GFX9-NEXT: v_mov_b32_e32 v3, s2 3923; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3924; GFX9-NEXT: ds_min_rtn_i32 v0, v0, v3 3925; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3926; GFX9-NEXT: buffer_wbinvl1_vol 3927; GFX9-NEXT: BB19_2: 3928; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 3929; GFX9-NEXT: v_readfirstlane_b32 s2, v0 3930; GFX9-NEXT: v_mov_b32_e32 v0, v1 3931; GFX9-NEXT: v_min_i32_e32 v0, s2, v0 3932; GFX9-NEXT: s_mov_b32 s3, 0xf000 3933; GFX9-NEXT: s_mov_b32 s2, -1 3934; GFX9-NEXT: s_nop 0 3935; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3936; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 3937; GFX9-NEXT: s_endpgm 3938; 3939; GFX1064-LABEL: min_i32_varying: 3940; GFX1064: ; %bb.0: ; %entry 3941; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3942; GFX1064-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 3943; GFX1064-NEXT: v_mov_b32_e32 v2, v0 3944; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v4, s2, 0 3945; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v4, s3, v4 3946; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 3947; GFX1064-NEXT: v_bfrev_b32_e32 v1, -2 3948; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 3949; GFX1064-NEXT: s_not_b64 exec, exec 3950; GFX1064-NEXT: v_mov_b32_e32 v2, v1 3951; GFX1064-NEXT: s_not_b64 exec, exec 3952; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 3953; 
GFX1064-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3954; GFX1064-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 3955; GFX1064-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 3956; GFX1064-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 3957; GFX1064-NEXT: v_mov_b32_e32 v3, v2 3958; GFX1064-NEXT: v_permlanex16_b32 v3, v3, -1, -1 3959; GFX1064-NEXT: v_min_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 3960; GFX1064-NEXT: v_readlane_b32 s2, v2, 31 3961; GFX1064-NEXT: v_mov_b32_e32 v3, s2 3962; GFX1064-NEXT: v_min_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 3963; GFX1064-NEXT: v_readlane_b32 s2, v2, 15 3964; GFX1064-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3965; GFX1064-NEXT: v_readlane_b32 s3, v2, 31 3966; GFX1064-NEXT: v_readlane_b32 s6, v2, 47 3967; GFX1064-NEXT: v_writelane_b32 v1, s2, 16 3968; GFX1064-NEXT: s_mov_b32 s2, -1 3969; GFX1064-NEXT: v_writelane_b32 v1, s3, 32 3970; GFX1064-NEXT: v_readlane_b32 s3, v2, 63 3971; GFX1064-NEXT: v_writelane_b32 v1, s6, 48 3972; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 3973; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 3974; GFX1064-NEXT: ; implicit-def: $vgpr0 3975; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 3976; GFX1064-NEXT: s_cbranch_execz BB19_2 3977; GFX1064-NEXT: ; %bb.1: 3978; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 3979; GFX1064-NEXT: v_mov_b32_e32 v7, s3 3980; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3981; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 3982; GFX1064-NEXT: ds_min_rtn_i32 v0, v0, v7 3983; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3984; GFX1064-NEXT: buffer_gl0_inv 3985; GFX1064-NEXT: buffer_gl1_inv 3986; GFX1064-NEXT: BB19_2: 3987; GFX1064-NEXT: v_nop 3988; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 3989; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 3990; GFX1064-NEXT: v_mov_b32_e32 v0, v1 3991; GFX1064-NEXT: v_min_i32_e32 v0, s3, v0 3992; 
GFX1064-NEXT: s_mov_b32 s3, 0x31016000 3993; GFX1064-NEXT: s_nop 1 3994; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3995; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 3996; GFX1064-NEXT: s_endpgm 3997; 3998; GFX1032-LABEL: min_i32_varying: 3999; GFX1032: ; %bb.0: ; %entry 4000; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4001; GFX1032-NEXT: v_cmp_ne_u32_e64 s2, 1, 0 4002; GFX1032-NEXT: ; implicit-def: $vcc_hi 4003; GFX1032-NEXT: v_mov_b32_e32 v2, v0 4004; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v4, s2, 0 4005; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 4006; GFX1032-NEXT: v_bfrev_b32_e32 v1, -2 4007; GFX1032-NEXT: s_mov_b32 exec_lo, s2 4008; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 4009; GFX1032-NEXT: v_mov_b32_e32 v2, v1 4010; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 4011; GFX1032-NEXT: s_or_saveexec_b32 s4, -1 4012; GFX1032-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 4013; GFX1032-NEXT: s_mov_b32 s2, -1 4014; GFX1032-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 4015; GFX1032-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 4016; GFX1032-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 4017; GFX1032-NEXT: v_mov_b32_e32 v3, v2 4018; GFX1032-NEXT: v_permlanex16_b32 v3, v3, -1, -1 4019; GFX1032-NEXT: v_min_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 4020; GFX1032-NEXT: v_readlane_b32 s3, v2, 31 4021; GFX1032-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf 4022; GFX1032-NEXT: v_readlane_b32 s5, v2, 15 4023; GFX1032-NEXT: v_writelane_b32 v1, s5, 16 4024; GFX1032-NEXT: s_mov_b32 exec_lo, s4 4025; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 4026; GFX1032-NEXT: ; implicit-def: $vgpr0 4027; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo 4028; GFX1032-NEXT: s_cbranch_execz BB19_2 4029; GFX1032-NEXT: ; %bb.1: 4030; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 4031; GFX1032-NEXT: v_mov_b32_e32 v7, s3 4032; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 
4033; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 4034; GFX1032-NEXT: ds_min_rtn_i32 v0, v0, v7 4035; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4036; GFX1032-NEXT: buffer_gl0_inv 4037; GFX1032-NEXT: buffer_gl1_inv 4038; GFX1032-NEXT: BB19_2: 4039; GFX1032-NEXT: v_nop 4040; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 4041; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 4042; GFX1032-NEXT: v_mov_b32_e32 v0, v1 4043; GFX1032-NEXT: v_min_i32_e32 v0, s3, v0 4044; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 4045; GFX1032-NEXT: s_nop 1 4046; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 4047; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 4048; GFX1032-NEXT: s_endpgm 4049entry: 4050 %lane = call i32 @llvm.amdgcn.workitem.id.x() 4051 %old = atomicrmw min i32 addrspace(3)* @local_var32, i32 %lane acq_rel 4052 store i32 %old, i32 addrspace(1)* %out 4053 ret void 4054} 4055 4056define amdgpu_kernel void @min_i64_constant(i64 addrspace(1)* %out) { 4057; 4058; 4059; GFX7LESS-LABEL: min_i64_constant: 4060; GFX7LESS: ; %bb.0: ; %entry 4061; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 4062; GFX7LESS-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 4063; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 4064; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 4065; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4066; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 4067; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc 4068; GFX7LESS-NEXT: s_cbranch_execz BB20_2 4069; GFX7LESS-NEXT: ; %bb.1: 4070; GFX7LESS-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 4071; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5 4072; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 4073; GFX7LESS-NEXT: s_mov_b32 m0, -1 4074; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4075; GFX7LESS-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] 4076; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4077; GFX7LESS-NEXT: buffer_wbinvl1 4078; GFX7LESS-NEXT: BB20_2: 4079; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] 4080; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 4081; GFX7LESS-NEXT: v_readfirstlane_b32 
s5, v1 4082; GFX7LESS-NEXT: v_bfrev_b32_e32 v1, -2 4083; GFX7LESS-NEXT: s_mov_b32 s2, -1 4084; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 4085; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc 4086; GFX7LESS-NEXT: v_mov_b32_e32 v2, s5 4087; GFX7LESS-NEXT: v_mov_b32_e32 v3, s4 4088; GFX7LESS-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] 4089; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 4090; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 4091; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 4092; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 4093; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4094; GFX7LESS-NEXT: s_endpgm 4095; 4096; GFX8-LABEL: min_i64_constant: 4097; GFX8: ; %bb.0: ; %entry 4098; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4099; GFX8-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 4100; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 4101; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 4102; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4103; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 4104; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 4105; GFX8-NEXT: s_cbranch_execz BB20_2 4106; GFX8-NEXT: ; %bb.1: 4107; GFX8-NEXT: v_mov_b32_e32 v0, 5 4108; GFX8-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 4109; GFX8-NEXT: v_mov_b32_e32 v1, 0 4110; GFX8-NEXT: s_mov_b32 m0, -1 4111; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4112; GFX8-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] 4113; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4114; GFX8-NEXT: buffer_wbinvl1_vol 4115; GFX8-NEXT: BB20_2: 4116; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 4117; GFX8-NEXT: v_readfirstlane_b32 s4, v0 4118; GFX8-NEXT: v_bfrev_b32_e32 v0, -2 4119; GFX8-NEXT: v_readfirstlane_b32 s5, v1 4120; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc 4121; GFX8-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 4122; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] 4123; GFX8-NEXT: v_mov_b32_e32 v2, s5 4124; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 4125; GFX8-NEXT: v_mov_b32_e32 v2, s4 4126; GFX8-NEXT: s_mov_b32 s2, -1 4127; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, 
v2, vcc 4128; GFX8-NEXT: s_mov_b32 s3, 0xf000 4129; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4130; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4131; GFX8-NEXT: s_endpgm 4132; 4133; GFX9-LABEL: min_i64_constant: 4134; GFX9: ; %bb.0: ; %entry 4135; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4136; GFX9-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 4137; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 4138; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 4139; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4140; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 4141; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 4142; GFX9-NEXT: s_cbranch_execz BB20_2 4143; GFX9-NEXT: ; %bb.1: 4144; GFX9-NEXT: v_mov_b32_e32 v0, 5 4145; GFX9-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 4146; GFX9-NEXT: v_mov_b32_e32 v1, 0 4147; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4148; GFX9-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] 4149; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4150; GFX9-NEXT: buffer_wbinvl1_vol 4151; GFX9-NEXT: BB20_2: 4152; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 4153; GFX9-NEXT: v_readfirstlane_b32 s4, v0 4154; GFX9-NEXT: v_bfrev_b32_e32 v0, -2 4155; GFX9-NEXT: v_readfirstlane_b32 s5, v1 4156; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc 4157; GFX9-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 4158; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] 4159; GFX9-NEXT: v_mov_b32_e32 v2, s5 4160; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 4161; GFX9-NEXT: v_mov_b32_e32 v2, s4 4162; GFX9-NEXT: s_mov_b32 s2, -1 4163; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 4164; GFX9-NEXT: s_mov_b32 s3, 0xf000 4165; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4166; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4167; GFX9-NEXT: s_endpgm 4168; 4169; GFX1064-LABEL: min_i64_constant: 4170; GFX1064: ; %bb.0: ; %entry 4171; GFX1064-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 4172; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4173; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 4174; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s3, v0 4175; GFX1064-NEXT: 
v_cmp_eq_u32_e32 vcc, 0, v0 4176; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 4177; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc 4178; GFX1064-NEXT: s_cbranch_execz BB20_2 4179; GFX1064-NEXT: ; %bb.1: 4180; GFX1064-NEXT: v_mov_b32_e32 v0, 5 4181; GFX1064-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 4182; GFX1064-NEXT: v_mov_b32_e32 v1, 0 4183; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4184; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 4185; GFX1064-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] 4186; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4187; GFX1064-NEXT: buffer_gl0_inv 4188; GFX1064-NEXT: buffer_gl1_inv 4189; GFX1064-NEXT: BB20_2: 4190; GFX1064-NEXT: v_nop 4191; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] 4192; GFX1064-NEXT: v_readfirstlane_b32 s4, v0 4193; GFX1064-NEXT: v_readfirstlane_b32 s5, v1 4194; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc 4195; GFX1064-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 4196; GFX1064-NEXT: s_mov_b32 s2, -1 4197; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 4198; GFX1064-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] 4199; GFX1064-NEXT: v_cndmask_b32_e64 v1, v1, s5, vcc 4200; GFX1064-NEXT: v_cndmask_b32_e64 v0, v0, s4, vcc 4201; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 4202; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4203; GFX1064-NEXT: s_endpgm 4204; 4205; GFX1032-LABEL: min_i64_constant: 4206; GFX1032: ; %bb.0: ; %entry 4207; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4208; GFX1032-NEXT: v_cmp_ne_u32_e64 s2, 1, 0 4209; GFX1032-NEXT: ; implicit-def: $vcc_hi 4210; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 4211; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 4212; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 4213; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo 4214; GFX1032-NEXT: s_cbranch_execz BB20_2 4215; GFX1032-NEXT: ; %bb.1: 4216; GFX1032-NEXT: v_mov_b32_e32 v0, 5 4217; GFX1032-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 4218; GFX1032-NEXT: v_mov_b32_e32 v1, 0 4219; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4220; 
GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 4221; GFX1032-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] 4222; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4223; GFX1032-NEXT: buffer_gl0_inv 4224; GFX1032-NEXT: buffer_gl1_inv 4225; GFX1032-NEXT: BB20_2: 4226; GFX1032-NEXT: v_nop 4227; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 4228; GFX1032-NEXT: v_readfirstlane_b32 s4, v0 4229; GFX1032-NEXT: v_readfirstlane_b32 s5, v1 4230; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc_lo 4231; GFX1032-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc_lo 4232; GFX1032-NEXT: s_mov_b32 s2, -1 4233; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 4234; GFX1032-NEXT: v_cmp_lt_i64_e32 vcc_lo, s[4:5], v[0:1] 4235; GFX1032-NEXT: v_cndmask_b32_e64 v1, v1, s5, vcc_lo 4236; GFX1032-NEXT: v_cndmask_b32_e64 v0, v0, s4, vcc_lo 4237; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 4238; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4239; GFX1032-NEXT: s_endpgm 4240entry: 4241 %old = atomicrmw min i64 addrspace(3)* @local_var64, i64 5 acq_rel 4242 store i64 %old, i64 addrspace(1)* %out 4243 ret void 4244} 4245 ; NOTE: The GFX8MORE32 and GFX8MORE prefixes used by the next three hand-written checks are not enabled by any RUN line of this test, so FileCheck never evaluates them; only the autogenerated per-target checks inside the function are in effect. 4246; GFX8MORE32: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 31 4247; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]] 4248; GFX8MORE: ds_max_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]] 4249define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) { 4250; 4251; 4252; GFX7LESS-LABEL: umax_i32_varying: 4253; GFX7LESS: ; %bb.0: ; %entry 4254; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 4255; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 4256; GFX7LESS-NEXT: s_mov_b32 m0, -1 4257; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4258; GFX7LESS-NEXT: ds_max_rtn_u32 v0, v1, v0 4259; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4260; GFX7LESS-NEXT: buffer_wbinvl1 4261; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 4262; GFX7LESS-NEXT: s_mov_b32 s2, -1 4263; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 4264; GFX7LESS-NEXT: s_endpgm 4265; 4266; GFX8-LABEL:
umax_i32_varying: 4267; GFX8: ; %bb.0: ; %entry 4268; GFX8-NEXT: v_mov_b32_e32 v2, v0 4269; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 4270; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4271; GFX8-NEXT: v_mov_b32_e32 v1, 0 4272; GFX8-NEXT: s_mov_b64 exec, s[2:3] 4273; GFX8-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 4274; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 4275; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 4276; GFX8-NEXT: s_not_b64 exec, exec 4277; GFX8-NEXT: v_mov_b32_e32 v2, 0 4278; GFX8-NEXT: s_not_b64 exec, exec 4279; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 4280; GFX8-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 4281; GFX8-NEXT: s_nop 1 4282; GFX8-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 4283; GFX8-NEXT: s_nop 1 4284; GFX8-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 4285; GFX8-NEXT: s_nop 1 4286; GFX8-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 4287; GFX8-NEXT: s_nop 1 4288; GFX8-NEXT: v_max_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 4289; GFX8-NEXT: s_nop 1 4290; GFX8-NEXT: v_max_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 4291; GFX8-NEXT: v_readlane_b32 s2, v2, 63 4292; GFX8-NEXT: s_nop 0 4293; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 4294; GFX8-NEXT: s_mov_b64 exec, s[4:5] 4295; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4296; GFX8-NEXT: ; implicit-def: $vgpr0 4297; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 4298; GFX8-NEXT: s_cbranch_execz BB21_2 4299; GFX8-NEXT: ; %bb.1: 4300; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 4301; GFX8-NEXT: v_mov_b32_e32 v3, s2 4302; GFX8-NEXT: s_mov_b32 m0, -1 4303; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4304; GFX8-NEXT: ds_max_rtn_u32 v0, v0, v3 4305; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4306; GFX8-NEXT: buffer_wbinvl1_vol 4307; GFX8-NEXT: BB21_2: 4308; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 4309; GFX8-NEXT: 
v_readfirstlane_b32 s2, v0 4310; GFX8-NEXT: v_mov_b32_e32 v0, v1 4311; GFX8-NEXT: v_max_u32_e32 v0, s2, v0 4312; GFX8-NEXT: s_mov_b32 s3, 0xf000 4313; GFX8-NEXT: s_mov_b32 s2, -1 4314; GFX8-NEXT: s_nop 0 4315; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4316; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 4317; GFX8-NEXT: s_endpgm 4318; 4319; GFX9-LABEL: umax_i32_varying: 4320; GFX9: ; %bb.0: ; %entry 4321; GFX9-NEXT: v_mov_b32_e32 v2, v0 4322; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 4323; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4324; GFX9-NEXT: v_mov_b32_e32 v1, 0 4325; GFX9-NEXT: s_mov_b64 exec, s[2:3] 4326; GFX9-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 4327; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 4328; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 4329; GFX9-NEXT: s_not_b64 exec, exec 4330; GFX9-NEXT: v_mov_b32_e32 v2, 0 4331; GFX9-NEXT: s_not_b64 exec, exec 4332; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 4333; GFX9-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 4334; GFX9-NEXT: s_nop 1 4335; GFX9-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 4336; GFX9-NEXT: s_nop 1 4337; GFX9-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 4338; GFX9-NEXT: s_nop 1 4339; GFX9-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 4340; GFX9-NEXT: s_nop 1 4341; GFX9-NEXT: v_max_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 4342; GFX9-NEXT: s_nop 1 4343; GFX9-NEXT: v_max_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 4344; GFX9-NEXT: v_readlane_b32 s2, v2, 63 4345; GFX9-NEXT: s_nop 0 4346; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 4347; GFX9-NEXT: s_mov_b64 exec, s[4:5] 4348; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4349; GFX9-NEXT: ; implicit-def: $vgpr0 4350; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 4351; GFX9-NEXT: s_cbranch_execz BB21_2 4352; GFX9-NEXT: ; %bb.1: 4353; GFX9-NEXT: v_mov_b32_e32 v0, 
local_var32@abs32@lo 4354; GFX9-NEXT: v_mov_b32_e32 v3, s2 4355; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4356; GFX9-NEXT: ds_max_rtn_u32 v0, v0, v3 4357; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4358; GFX9-NEXT: buffer_wbinvl1_vol 4359; GFX9-NEXT: BB21_2: 4360; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 4361; GFX9-NEXT: v_readfirstlane_b32 s2, v0 4362; GFX9-NEXT: v_mov_b32_e32 v0, v1 4363; GFX9-NEXT: v_max_u32_e32 v0, s2, v0 4364; GFX9-NEXT: s_mov_b32 s3, 0xf000 4365; GFX9-NEXT: s_mov_b32 s2, -1 4366; GFX9-NEXT: s_nop 0 4367; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4368; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 4369; GFX9-NEXT: s_endpgm 4370; 4371; GFX1064-LABEL: umax_i32_varying: 4372; GFX1064: ; %bb.0: ; %entry 4373; GFX1064-NEXT: v_mov_b32_e32 v2, v0 4374; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 4375; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4376; GFX1064-NEXT: v_mov_b32_e32 v1, 0 4377; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 4378; GFX1064-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 4379; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 4380; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s3, v0 4381; GFX1064-NEXT: s_not_b64 exec, exec 4382; GFX1064-NEXT: v_mov_b32_e32 v2, 0 4383; GFX1064-NEXT: s_not_b64 exec, exec 4384; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 4385; GFX1064-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 4386; GFX1064-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 4387; GFX1064-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 4388; GFX1064-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 4389; GFX1064-NEXT: v_mov_b32_e32 v3, v2 4390; GFX1064-NEXT: v_permlanex16_b32 v3, v3, -1, -1 4391; GFX1064-NEXT: v_max_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 4392; GFX1064-NEXT: v_readlane_b32 s2, v2, 31 4393; GFX1064-NEXT: v_mov_b32_e32 v3, s2 4394; GFX1064-NEXT: v_max_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] 
row_mask:0xc bank_mask:0xf 4395; GFX1064-NEXT: v_readlane_b32 s2, v2, 15 4396; GFX1064-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf 4397; GFX1064-NEXT: v_readlane_b32 s3, v2, 31 4398; GFX1064-NEXT: v_readlane_b32 s6, v2, 47 4399; GFX1064-NEXT: v_writelane_b32 v1, s2, 16 4400; GFX1064-NEXT: s_mov_b32 s2, -1 4401; GFX1064-NEXT: v_writelane_b32 v1, s3, 32 4402; GFX1064-NEXT: v_readlane_b32 s3, v2, 63 4403; GFX1064-NEXT: v_writelane_b32 v1, s6, 48 4404; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 4405; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4406; GFX1064-NEXT: ; implicit-def: $vgpr0 4407; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 4408; GFX1064-NEXT: s_cbranch_execz BB21_2 4409; GFX1064-NEXT: ; %bb.1: 4410; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 4411; GFX1064-NEXT: v_mov_b32_e32 v7, s3 4412; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4413; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 4414; GFX1064-NEXT: ds_max_rtn_u32 v0, v0, v7 4415; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4416; GFX1064-NEXT: buffer_gl0_inv 4417; GFX1064-NEXT: buffer_gl1_inv 4418; GFX1064-NEXT: BB21_2: 4419; GFX1064-NEXT: v_nop 4420; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 4421; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 4422; GFX1064-NEXT: v_mov_b32_e32 v0, v1 4423; GFX1064-NEXT: v_max_u32_e32 v0, s3, v0 4424; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 4425; GFX1064-NEXT: s_nop 1 4426; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 4427; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 4428; GFX1064-NEXT: s_endpgm 4429; 4430; GFX1032-LABEL: umax_i32_varying: 4431; GFX1032: ; %bb.0: ; %entry 4432; GFX1032-NEXT: ; implicit-def: $vcc_hi 4433; GFX1032-NEXT: v_mov_b32_e32 v2, v0 4434; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 4435; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4436; GFX1032-NEXT: v_mov_b32_e32 v1, 0 4437; GFX1032-NEXT: s_mov_b32 exec_lo, s2 4438; GFX1032-NEXT: v_cmp_ne_u32_e64 s2, 1, 0 4439; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 4440; GFX1032-NEXT: s_not_b32 
exec_lo, exec_lo 4441; GFX1032-NEXT: v_mov_b32_e32 v2, 0 4442; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 4443; GFX1032-NEXT: s_or_saveexec_b32 s4, -1 4444; GFX1032-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 4445; GFX1032-NEXT: s_mov_b32 s2, -1 4446; GFX1032-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 4447; GFX1032-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 4448; GFX1032-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 4449; GFX1032-NEXT: v_mov_b32_e32 v3, v2 4450; GFX1032-NEXT: v_permlanex16_b32 v3, v3, -1, -1 4451; GFX1032-NEXT: v_max_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 4452; GFX1032-NEXT: v_readlane_b32 s3, v2, 31 4453; GFX1032-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf 4454; GFX1032-NEXT: v_readlane_b32 s5, v2, 15 4455; GFX1032-NEXT: v_writelane_b32 v1, s5, 16 4456; GFX1032-NEXT: s_mov_b32 exec_lo, s4 4457; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 4458; GFX1032-NEXT: ; implicit-def: $vgpr0 4459; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo 4460; GFX1032-NEXT: s_cbranch_execz BB21_2 4461; GFX1032-NEXT: ; %bb.1: 4462; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 4463; GFX1032-NEXT: v_mov_b32_e32 v7, s3 4464; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4465; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 4466; GFX1032-NEXT: ds_max_rtn_u32 v0, v0, v7 4467; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4468; GFX1032-NEXT: buffer_gl0_inv 4469; GFX1032-NEXT: buffer_gl1_inv 4470; GFX1032-NEXT: BB21_2: 4471; GFX1032-NEXT: v_nop 4472; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 4473; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 4474; GFX1032-NEXT: v_mov_b32_e32 v0, v1 4475; GFX1032-NEXT: v_max_u32_e32 v0, s3, v0 4476; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 4477; GFX1032-NEXT: s_nop 1 4478; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 4479; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 
4480; GFX1032-NEXT: s_endpgm 4481entry: 4482 %lane = call i32 @llvm.amdgcn.workitem.id.x() 4483 %old = atomicrmw umax i32 addrspace(3)* @local_var32, i32 %lane acq_rel 4484 store i32 %old, i32 addrspace(1)* %out 4485 ret void 4486} 4487 4488define amdgpu_kernel void @umax_i64_constant(i64 addrspace(1)* %out) { 4489; 4490; 4491; GFX7LESS-LABEL: umax_i64_constant: 4492; GFX7LESS: ; %bb.0: ; %entry 4493; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 4494; GFX7LESS-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 4495; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 4496; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 4497; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4498; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 4499; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc 4500; GFX7LESS-NEXT: s_cbranch_execz BB22_2 4501; GFX7LESS-NEXT: ; %bb.1: 4502; GFX7LESS-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 4503; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5 4504; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 4505; GFX7LESS-NEXT: s_mov_b32 m0, -1 4506; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4507; GFX7LESS-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] 4508; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4509; GFX7LESS-NEXT: buffer_wbinvl1 4510; GFX7LESS-NEXT: BB22_2: 4511; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] 4512; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 4513; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 4514; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 4515; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 4516; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 4517; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4 4518; GFX7LESS-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[0:1] 4519; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 4520; GFX7LESS-NEXT: v_mov_b32_e32 v1, s5 4521; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc 4522; GFX7LESS-NEXT: s_mov_b32 s2, -1 4523; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 4524; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4525; GFX7LESS-NEXT: s_endpgm 4526; 4527; GFX8-LABEL: umax_i64_constant: 4528; 
GFX8: ; %bb.0: ; %entry 4529; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4530; GFX8-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 4531; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 4532; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 4533; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4534; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 4535; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 4536; GFX8-NEXT: s_cbranch_execz BB22_2 4537; GFX8-NEXT: ; %bb.1: 4538; GFX8-NEXT: v_mov_b32_e32 v0, 5 4539; GFX8-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 4540; GFX8-NEXT: v_mov_b32_e32 v1, 0 4541; GFX8-NEXT: s_mov_b32 m0, -1 4542; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4543; GFX8-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] 4544; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4545; GFX8-NEXT: buffer_wbinvl1_vol 4546; GFX8-NEXT: BB22_2: 4547; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 4548; GFX8-NEXT: v_readfirstlane_b32 s2, v0 4549; GFX8-NEXT: v_readfirstlane_b32 s3, v1 4550; GFX8-NEXT: v_mov_b32_e32 v1, 0 4551; GFX8-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 4552; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1] 4553; GFX8-NEXT: v_mov_b32_e32 v1, s3 4554; GFX8-NEXT: v_mov_b32_e32 v2, s2 4555; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 4556; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc 4557; GFX8-NEXT: s_mov_b32 s3, 0xf000 4558; GFX8-NEXT: s_mov_b32 s2, -1 4559; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4560; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4561; GFX8-NEXT: s_endpgm 4562; 4563; GFX9-LABEL: umax_i64_constant: 4564; GFX9: ; %bb.0: ; %entry 4565; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4566; GFX9-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 4567; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 4568; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 4569; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4570; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 4571; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 4572; GFX9-NEXT: s_cbranch_execz BB22_2 4573; GFX9-NEXT: ; %bb.1: 4574; GFX9-NEXT: v_mov_b32_e32 v0, 5 4575; GFX9-NEXT: v_mov_b32_e32 v2, 
local_var64@abs32@lo 4576; GFX9-NEXT: v_mov_b32_e32 v1, 0 4577; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4578; GFX9-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] 4579; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4580; GFX9-NEXT: buffer_wbinvl1_vol 4581; GFX9-NEXT: BB22_2: 4582; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 4583; GFX9-NEXT: v_readfirstlane_b32 s2, v0 4584; GFX9-NEXT: v_readfirstlane_b32 s3, v1 4585; GFX9-NEXT: v_mov_b32_e32 v1, 0 4586; GFX9-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 4587; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1] 4588; GFX9-NEXT: v_mov_b32_e32 v1, s3 4589; GFX9-NEXT: v_mov_b32_e32 v2, s2 4590; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 4591; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc 4592; GFX9-NEXT: s_mov_b32 s3, 0xf000 4593; GFX9-NEXT: s_mov_b32 s2, -1 4594; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4595; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4596; GFX9-NEXT: s_endpgm 4597; 4598; GFX1064-LABEL: umax_i64_constant: 4599; GFX1064: ; %bb.0: ; %entry 4600; GFX1064-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 4601; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4602; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 4603; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s3, v0 4604; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4605; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 4606; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc 4607; GFX1064-NEXT: s_cbranch_execz BB22_2 4608; GFX1064-NEXT: ; %bb.1: 4609; GFX1064-NEXT: v_mov_b32_e32 v0, 5 4610; GFX1064-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 4611; GFX1064-NEXT: v_mov_b32_e32 v1, 0 4612; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4613; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 4614; GFX1064-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] 4615; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4616; GFX1064-NEXT: buffer_gl0_inv 4617; GFX1064-NEXT: buffer_gl1_inv 4618; GFX1064-NEXT: BB22_2: 4619; GFX1064-NEXT: v_nop 4620; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] 4621; GFX1064-NEXT: v_readfirstlane_b32 s4, v0 4622; 
GFX1064-NEXT: v_readfirstlane_b32 s5, v1 4623; GFX1064-NEXT: v_mov_b32_e32 v1, 0 4624; GFX1064-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 4625; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 4626; GFX1064-NEXT: s_mov_b32 s2, -1 4627; GFX1064-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[0:1] 4628; GFX1064-NEXT: v_cndmask_b32_e64 v0, v0, s4, vcc 4629; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, s5, vcc 4630; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 4631; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4632; GFX1064-NEXT: s_endpgm 4633; 4634; GFX1032-LABEL: umax_i64_constant: 4635; GFX1032: ; %bb.0: ; %entry 4636; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4637; GFX1032-NEXT: v_cmp_ne_u32_e64 s2, 1, 0 4638; GFX1032-NEXT: ; implicit-def: $vcc_hi 4639; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 4640; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 4641; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 4642; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo 4643; GFX1032-NEXT: s_cbranch_execz BB22_2 4644; GFX1032-NEXT: ; %bb.1: 4645; GFX1032-NEXT: v_mov_b32_e32 v0, 5 4646; GFX1032-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 4647; GFX1032-NEXT: v_mov_b32_e32 v1, 0 4648; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4649; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 4650; GFX1032-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] 4651; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4652; GFX1032-NEXT: buffer_gl0_inv 4653; GFX1032-NEXT: buffer_gl1_inv 4654; GFX1032-NEXT: BB22_2: 4655; GFX1032-NEXT: v_nop 4656; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 4657; GFX1032-NEXT: v_readfirstlane_b32 s4, v0 4658; GFX1032-NEXT: v_readfirstlane_b32 s5, v1 4659; GFX1032-NEXT: v_mov_b32_e32 v1, 0 4660; GFX1032-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc_lo 4661; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 4662; GFX1032-NEXT: s_mov_b32 s2, -1 4663; GFX1032-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[4:5], v[0:1] 4664; GFX1032-NEXT: v_cndmask_b32_e64 v0, v0, s4, vcc_lo 4665; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, s5, vcc_lo 4666; GFX1032-NEXT: s_waitcnt 
lgkmcnt(0) 4667; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4668; GFX1032-NEXT: s_endpgm 4669entry: 4670 %old = atomicrmw umax i64 addrspace(3)* @local_var64, i64 5 acq_rel 4671 store i64 %old, i64 addrspace(1)* %out 4672 ret void 4673} 4674
; umin_i32_varying: atomicrmw umin (acq_rel) on @local_var32 whose operand is the
; per-lane value returned by llvm.amdgcn.workitem.id.x(); the old value returned by
; the atomic is stored to %out.
; The GFX7LESS assertions expect the ds_min_rtn_u32 to be issued directly, with no
; cross-lane reduction. The GFX8, GFX9, GFX1064 and GFX1032 assertions expect a
; v_min_u32_dpp reduction followed by a single ds_min_rtn_u32 under s_and_saveexec.
; NOTE(review): the three GFX8MORE32/GFX8MORE directives below use prefixes that none
; of the RUN lines at the top of this file pass to FileCheck (those use GFX7LESS,
; GFX8, GFX9, GFX1064 and GFX1032), so they are never matched. They look like
; leftovers from before the assertions were autogenerated - confirm, then either
; delete them or add the prefixes back to the RUN lines.
4675; GFX8MORE32: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 31 4676; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]] 4677; GFX8MORE: ds_min_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]] 4678define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) { 4679; 4680; 4681; GFX7LESS-LABEL: umin_i32_varying: 4682; GFX7LESS: ; %bb.0: ; %entry 4683; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 4684; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 4685; GFX7LESS-NEXT: s_mov_b32 m0, -1 4686; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4687; GFX7LESS-NEXT: ds_min_rtn_u32 v0, v1, v0 4688; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4689; GFX7LESS-NEXT: buffer_wbinvl1 4690; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 4691; GFX7LESS-NEXT: s_mov_b32 s2, -1 4692; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 4693; GFX7LESS-NEXT: s_endpgm 4694; 4695; GFX8-LABEL: umin_i32_varying: 4696; GFX8: ; %bb.0: ; %entry 4697; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4698; GFX8-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 4699; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, s2, 0 4700; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, s3, v3 4701; GFX8-NEXT: v_mov_b32_e32 v2, v0 4702; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 4703; GFX8-NEXT: v_mov_b32_e32 v1, -1 4704; GFX8-NEXT: s_mov_b64 exec, s[2:3] 4705; GFX8-NEXT: s_not_b64 exec, exec 4706; GFX8-NEXT: v_mov_b32_e32 v2, -1 4707; GFX8-NEXT: s_not_b64 exec, exec 4708; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 4709; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 4710; GFX8-NEXT: s_nop 1 4711; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 4712; GFX8-NEXT: s_nop 1 4713; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:4 
row_mask:0xf bank_mask:0xf 4714; GFX8-NEXT: s_nop 1 4715; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 4716; GFX8-NEXT: s_nop 1 4717; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 4718; GFX8-NEXT: s_nop 1 4719; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 4720; GFX8-NEXT: v_readlane_b32 s2, v2, 63 4721; GFX8-NEXT: s_nop 0 4722; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 4723; GFX8-NEXT: s_mov_b64 exec, s[4:5] 4724; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 4725; GFX8-NEXT: ; implicit-def: $vgpr0 4726; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 4727; GFX8-NEXT: s_cbranch_execz BB23_2 4728; GFX8-NEXT: ; %bb.1: 4729; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 4730; GFX8-NEXT: v_mov_b32_e32 v3, s2 4731; GFX8-NEXT: s_mov_b32 m0, -1 4732; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4733; GFX8-NEXT: ds_min_rtn_u32 v0, v0, v3 4734; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4735; GFX8-NEXT: buffer_wbinvl1_vol 4736; GFX8-NEXT: BB23_2: 4737; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 4738; GFX8-NEXT: v_readfirstlane_b32 s2, v0 4739; GFX8-NEXT: v_mov_b32_e32 v0, v1 4740; GFX8-NEXT: v_min_u32_e32 v0, s2, v0 4741; GFX8-NEXT: s_mov_b32 s3, 0xf000 4742; GFX8-NEXT: s_mov_b32 s2, -1 4743; GFX8-NEXT: s_nop 0 4744; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4745; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 4746; GFX8-NEXT: s_endpgm 4747; 4748; GFX9-LABEL: umin_i32_varying: 4749; GFX9: ; %bb.0: ; %entry 4750; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4751; GFX9-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 4752; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, s2, 0 4753; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, s3, v3 4754; GFX9-NEXT: v_mov_b32_e32 v2, v0 4755; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 4756; GFX9-NEXT: v_mov_b32_e32 v1, -1 4757; GFX9-NEXT: s_mov_b64 exec, s[2:3] 4758; GFX9-NEXT: s_not_b64 exec, exec 4759; GFX9-NEXT: v_mov_b32_e32 v2, -1 4760; GFX9-NEXT: s_not_b64 exec, exec 4761; GFX9-NEXT: 
s_or_saveexec_b64 s[4:5], -1 4762; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 4763; GFX9-NEXT: s_nop 1 4764; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 4765; GFX9-NEXT: s_nop 1 4766; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 4767; GFX9-NEXT: s_nop 1 4768; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 4769; GFX9-NEXT: s_nop 1 4770; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 4771; GFX9-NEXT: s_nop 1 4772; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 4773; GFX9-NEXT: v_readlane_b32 s2, v2, 63 4774; GFX9-NEXT: s_nop 0 4775; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 4776; GFX9-NEXT: s_mov_b64 exec, s[4:5] 4777; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 4778; GFX9-NEXT: ; implicit-def: $vgpr0 4779; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 4780; GFX9-NEXT: s_cbranch_execz BB23_2 4781; GFX9-NEXT: ; %bb.1: 4782; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 4783; GFX9-NEXT: v_mov_b32_e32 v3, s2 4784; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4785; GFX9-NEXT: ds_min_rtn_u32 v0, v0, v3 4786; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4787; GFX9-NEXT: buffer_wbinvl1_vol 4788; GFX9-NEXT: BB23_2: 4789; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 4790; GFX9-NEXT: v_readfirstlane_b32 s2, v0 4791; GFX9-NEXT: v_mov_b32_e32 v0, v1 4792; GFX9-NEXT: v_min_u32_e32 v0, s2, v0 4793; GFX9-NEXT: s_mov_b32 s3, 0xf000 4794; GFX9-NEXT: s_mov_b32 s2, -1 4795; GFX9-NEXT: s_nop 0 4796; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4797; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 4798; GFX9-NEXT: s_endpgm 4799; 4800; GFX1064-LABEL: umin_i32_varying: 4801; GFX1064: ; %bb.0: ; %entry 4802; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4803; GFX1064-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 4804; GFX1064-NEXT: v_mov_b32_e32 v2, v0 4805; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v4, s2, 0 4806; GFX1064-NEXT: 
v_mbcnt_hi_u32_b32_e64 v4, s3, v4 4807; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 4808; GFX1064-NEXT: v_mov_b32_e32 v1, -1 4809; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 4810; GFX1064-NEXT: s_not_b64 exec, exec 4811; GFX1064-NEXT: v_mov_b32_e32 v2, -1 4812; GFX1064-NEXT: s_not_b64 exec, exec 4813; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 4814; GFX1064-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 4815; GFX1064-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 4816; GFX1064-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 4817; GFX1064-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 4818; GFX1064-NEXT: v_mov_b32_e32 v3, v2 4819; GFX1064-NEXT: v_permlanex16_b32 v3, v3, -1, -1 4820; GFX1064-NEXT: v_min_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 4821; GFX1064-NEXT: v_readlane_b32 s2, v2, 31 4822; GFX1064-NEXT: v_mov_b32_e32 v3, s2 4823; GFX1064-NEXT: v_min_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 4824; GFX1064-NEXT: v_readlane_b32 s2, v2, 15 4825; GFX1064-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf 4826; GFX1064-NEXT: v_readlane_b32 s3, v2, 31 4827; GFX1064-NEXT: v_readlane_b32 s6, v2, 47 4828; GFX1064-NEXT: v_writelane_b32 v1, s2, 16 4829; GFX1064-NEXT: s_mov_b32 s2, -1 4830; GFX1064-NEXT: v_writelane_b32 v1, s3, 32 4831; GFX1064-NEXT: v_readlane_b32 s3, v2, 63 4832; GFX1064-NEXT: v_writelane_b32 v1, s6, 48 4833; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 4834; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 4835; GFX1064-NEXT: ; implicit-def: $vgpr0 4836; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 4837; GFX1064-NEXT: s_cbranch_execz BB23_2 4838; GFX1064-NEXT: ; %bb.1: 4839; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 4840; GFX1064-NEXT: v_mov_b32_e32 v7, s3 4841; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4842; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 4843; GFX1064-NEXT: ds_min_rtn_u32 v0, v0, v7 4844; GFX1064-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) 4845; GFX1064-NEXT: buffer_gl0_inv 4846; GFX1064-NEXT: buffer_gl1_inv 4847; GFX1064-NEXT: BB23_2: 4848; GFX1064-NEXT: v_nop 4849; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 4850; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 4851; GFX1064-NEXT: v_mov_b32_e32 v0, v1 4852; GFX1064-NEXT: v_min_u32_e32 v0, s3, v0 4853; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 4854; GFX1064-NEXT: s_nop 1 4855; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 4856; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 4857; GFX1064-NEXT: s_endpgm 4858; 4859; GFX1032-LABEL: umin_i32_varying: 4860; GFX1032: ; %bb.0: ; %entry 4861; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4862; GFX1032-NEXT: v_cmp_ne_u32_e64 s2, 1, 0 4863; GFX1032-NEXT: ; implicit-def: $vcc_hi 4864; GFX1032-NEXT: v_mov_b32_e32 v2, v0 4865; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v4, s2, 0 4866; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 4867; GFX1032-NEXT: v_mov_b32_e32 v1, -1 4868; GFX1032-NEXT: s_mov_b32 exec_lo, s2 4869; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 4870; GFX1032-NEXT: v_mov_b32_e32 v2, -1 4871; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 4872; GFX1032-NEXT: s_or_saveexec_b32 s4, -1 4873; GFX1032-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 4874; GFX1032-NEXT: s_mov_b32 s2, -1 4875; GFX1032-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 4876; GFX1032-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 4877; GFX1032-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 4878; GFX1032-NEXT: v_mov_b32_e32 v3, v2 4879; GFX1032-NEXT: v_permlanex16_b32 v3, v3, -1, -1 4880; GFX1032-NEXT: v_min_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 4881; GFX1032-NEXT: v_readlane_b32 s3, v2, 31 4882; GFX1032-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf 4883; GFX1032-NEXT: v_readlane_b32 s5, v2, 15 4884; GFX1032-NEXT: v_writelane_b32 v1, s5, 16 4885; GFX1032-NEXT: s_mov_b32 exec_lo, s4 4886; GFX1032-NEXT: 
v_cmp_eq_u32_e32 vcc_lo, 0, v4 4887; GFX1032-NEXT: ; implicit-def: $vgpr0 4888; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo 4889; GFX1032-NEXT: s_cbranch_execz BB23_2 4890; GFX1032-NEXT: ; %bb.1: 4891; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 4892; GFX1032-NEXT: v_mov_b32_e32 v7, s3 4893; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4894; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 4895; GFX1032-NEXT: ds_min_rtn_u32 v0, v0, v7 4896; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4897; GFX1032-NEXT: buffer_gl0_inv 4898; GFX1032-NEXT: buffer_gl1_inv 4899; GFX1032-NEXT: BB23_2: 4900; GFX1032-NEXT: v_nop 4901; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 4902; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 4903; GFX1032-NEXT: v_mov_b32_e32 v0, v1 4904; GFX1032-NEXT: v_min_u32_e32 v0, s3, v0 4905; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 4906; GFX1032-NEXT: s_nop 1 4907; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 4908; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 4909; GFX1032-NEXT: s_endpgm 4910entry: 4911 %lane = call i32 @llvm.amdgcn.workitem.id.x() 4912 %old = atomicrmw umin i32 addrspace(3)* @local_var32, i32 %lane acq_rel 4913 store i32 %old, i32 addrspace(1)* %out 4914 ret void 4915} 4916 4917define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) { 4918; 4919; 4920; GFX7LESS-LABEL: umin_i64_constant: 4921; GFX7LESS: ; %bb.0: ; %entry 4922; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 4923; GFX7LESS-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 4924; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 4925; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 4926; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4927; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 4928; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc 4929; GFX7LESS-NEXT: s_cbranch_execz BB24_2 4930; GFX7LESS-NEXT: ; %bb.1: 4931; GFX7LESS-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 4932; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5 4933; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 4934; GFX7LESS-NEXT: s_mov_b32 m0, -1 4935; 
GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4936; GFX7LESS-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1] 4937; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4938; GFX7LESS-NEXT: buffer_wbinvl1 4939; GFX7LESS-NEXT: BB24_2: 4940; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] 4941; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 4942; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 4943; GFX7LESS-NEXT: s_mov_b32 s2, -1 4944; GFX7LESS-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc 4945; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 4946; GFX7LESS-NEXT: v_mov_b32_e32 v2, s5 4947; GFX7LESS-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1] 4948; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 4949; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4 4950; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 4951; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 4952; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 4953; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4954; GFX7LESS-NEXT: s_endpgm 4955; 4956; GFX8-LABEL: umin_i64_constant: 4957; GFX8: ; %bb.0: ; %entry 4958; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4959; GFX8-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 4960; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 4961; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 4962; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4963; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 4964; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 4965; GFX8-NEXT: s_cbranch_execz BB24_2 4966; GFX8-NEXT: ; %bb.1: 4967; GFX8-NEXT: v_mov_b32_e32 v0, 5 4968; GFX8-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 4969; GFX8-NEXT: v_mov_b32_e32 v1, 0 4970; GFX8-NEXT: s_mov_b32 m0, -1 4971; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4972; GFX8-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1] 4973; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4974; GFX8-NEXT: buffer_wbinvl1_vol 4975; GFX8-NEXT: BB24_2: 4976; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 4977; GFX8-NEXT: v_readfirstlane_b32 s5, v1 4978; GFX8-NEXT: v_readfirstlane_b32 s4, v0 4979; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc 4980; GFX8-NEXT: v_cndmask_b32_e64 
v0, 5, -1, vcc 4981; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1] 4982; GFX8-NEXT: v_mov_b32_e32 v2, s5 4983; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 4984; GFX8-NEXT: v_mov_b32_e32 v2, s4 4985; GFX8-NEXT: s_mov_b32 s2, -1 4986; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 4987; GFX8-NEXT: s_mov_b32 s3, 0xf000 4988; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4989; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4990; GFX8-NEXT: s_endpgm 4991; 4992; GFX9-LABEL: umin_i64_constant: 4993; GFX9: ; %bb.0: ; %entry 4994; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4995; GFX9-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 4996; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 4997; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 4998; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4999; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 5000; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 5001; GFX9-NEXT: s_cbranch_execz BB24_2 5002; GFX9-NEXT: ; %bb.1: 5003; GFX9-NEXT: v_mov_b32_e32 v0, 5 5004; GFX9-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 5005; GFX9-NEXT: v_mov_b32_e32 v1, 0 5006; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5007; GFX9-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1] 5008; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5009; GFX9-NEXT: buffer_wbinvl1_vol 5010; GFX9-NEXT: BB24_2: 5011; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 5012; GFX9-NEXT: v_readfirstlane_b32 s5, v1 5013; GFX9-NEXT: v_readfirstlane_b32 s4, v0 5014; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc 5015; GFX9-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 5016; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1] 5017; GFX9-NEXT: v_mov_b32_e32 v2, s5 5018; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 5019; GFX9-NEXT: v_mov_b32_e32 v2, s4 5020; GFX9-NEXT: s_mov_b32 s2, -1 5021; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 5022; GFX9-NEXT: s_mov_b32 s3, 0xf000 5023; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5024; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 5025; GFX9-NEXT: s_endpgm 5026; 5027; GFX1064-LABEL: umin_i64_constant: 5028; GFX1064: ; %bb.0: ; %entry 5029; 
GFX1064-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 5030; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 5031; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 5032; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s3, v0 5033; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 5034; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 5035; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc 5036; GFX1064-NEXT: s_cbranch_execz BB24_2 5037; GFX1064-NEXT: ; %bb.1: 5038; GFX1064-NEXT: v_mov_b32_e32 v0, 5 5039; GFX1064-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 5040; GFX1064-NEXT: v_mov_b32_e32 v1, 0 5041; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5042; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 5043; GFX1064-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1] 5044; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5045; GFX1064-NEXT: buffer_gl0_inv 5046; GFX1064-NEXT: buffer_gl1_inv 5047; GFX1064-NEXT: BB24_2: 5048; GFX1064-NEXT: v_nop 5049; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] 5050; GFX1064-NEXT: v_readfirstlane_b32 s4, v0 5051; GFX1064-NEXT: v_readfirstlane_b32 s5, v1 5052; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc 5053; GFX1064-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 5054; GFX1064-NEXT: s_mov_b32 s2, -1 5055; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 5056; GFX1064-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1] 5057; GFX1064-NEXT: v_cndmask_b32_e64 v1, v1, s5, vcc 5058; GFX1064-NEXT: v_cndmask_b32_e64 v0, v0, s4, vcc 5059; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 5060; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 5061; GFX1064-NEXT: s_endpgm 5062; 5063; GFX1032-LABEL: umin_i64_constant: 5064; GFX1032: ; %bb.0: ; %entry 5065; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 5066; GFX1032-NEXT: v_cmp_ne_u32_e64 s2, 1, 0 5067; GFX1032-NEXT: ; implicit-def: $vcc_hi 5068; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 5069; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 5070; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 5071; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo 5072; GFX1032-NEXT: s_cbranch_execz BB24_2 5073; 
GFX1032-NEXT: ; %bb.1: 5074; GFX1032-NEXT: v_mov_b32_e32 v0, 5 5075; GFX1032-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 5076; GFX1032-NEXT: v_mov_b32_e32 v1, 0 5077; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5078; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 5079; GFX1032-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1] 5080; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5081; GFX1032-NEXT: buffer_gl0_inv 5082; GFX1032-NEXT: buffer_gl1_inv 5083; GFX1032-NEXT: BB24_2: 5084; GFX1032-NEXT: v_nop 5085; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 5086; GFX1032-NEXT: v_readfirstlane_b32 s4, v0 5087; GFX1032-NEXT: v_readfirstlane_b32 s5, v1 5088; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo 5089; GFX1032-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc_lo 5090; GFX1032-NEXT: s_mov_b32 s2, -1 5091; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 5092; GFX1032-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[4:5], v[0:1] 5093; GFX1032-NEXT: v_cndmask_b32_e64 v1, v1, s5, vcc_lo 5094; GFX1032-NEXT: v_cndmask_b32_e64 v0, v0, s4, vcc_lo 5095; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 5096; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 5097; GFX1032-NEXT: s_endpgm
; IR under test for @umin_i64_constant: atomicrmw umin (acq_rel) of the constant 5 on
; @local_var64; the old value returned by the atomic is stored to %out.
; The operand does not depend on the lane, and the expected output for every prefix
; contains a single ds_min_rtn_u64 guarded by s_and_saveexec.
5098entry: 5099 %old = atomicrmw umin i64 addrspace(3)* @local_var64, i64 5 acq_rel 5100 store i64 %old, i64 addrspace(1)* %out 5101 ret void 5102} 5103