1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX7LESS %s 3; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s 4; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s 5; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064 %s 6; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032 %s 7 8declare i32 @llvm.amdgcn.workitem.id.x() 9 10@local_var32 = addrspace(3) global i32 undef, align 4 11@local_var64 = addrspace(3) global i64 undef, align 8 12 13; Show what the atomic optimization pass will do for local pointers. 
14 15define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out) { 16; 17; 18; GFX7LESS-LABEL: add_i32_constant: 19; GFX7LESS: ; %bb.0: ; %entry 20; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 21; GFX7LESS-NEXT: v_cmp_ne_u32_e64 s[4:5], 1, 0 22; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 23; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 24; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 25; GFX7LESS-NEXT: ; implicit-def: $vgpr1 26; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc 27; GFX7LESS-NEXT: ; mask branch BB0_2 28; GFX7LESS-NEXT: s_cbranch_execz BB0_2 29; GFX7LESS-NEXT: BB0_1: 30; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 31; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 32; GFX7LESS-NEXT: v_mul_u32_u24_e64 v2, s4, 5 33; GFX7LESS-NEXT: s_mov_b32 m0, -1 34; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 35; GFX7LESS-NEXT: ds_add_rtn_u32 v1, v1, v2 36; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 37; GFX7LESS-NEXT: buffer_wbinvl1 38; GFX7LESS-NEXT: BB0_2: 39; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] 40; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v1 41; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 42; GFX7LESS-NEXT: v_mad_u32_u24 v0, v0, 5, s2 43; GFX7LESS-NEXT: s_mov_b32 s2, -1 44; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 45; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 46; GFX7LESS-NEXT: s_endpgm 47; 48; GFX8-LABEL: add_i32_constant: 49; GFX8: ; %bb.0: ; %entry 50; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 51; GFX8-NEXT: v_cmp_ne_u32_e64 s[4:5], 1, 0 52; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 53; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 54; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 55; GFX8-NEXT: ; implicit-def: $vgpr1 56; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 57; GFX8-NEXT: ; mask branch BB0_2 58; GFX8-NEXT: s_cbranch_execz BB0_2 59; GFX8-NEXT: BB0_1: 60; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 61; GFX8-NEXT: v_mul_u32_u24_e64 v1, s4, 5 62; GFX8-NEXT: v_mov_b32_e32 v2, local_var32@abs32@lo 63; GFX8-NEXT: s_mov_b32 m0, -1 64; GFX8-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) 65; GFX8-NEXT: ds_add_rtn_u32 v1, v2, v1 66; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 67; GFX8-NEXT: buffer_wbinvl1_vol 68; GFX8-NEXT: BB0_2: 69; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 70; GFX8-NEXT: v_readfirstlane_b32 s2, v1 71; GFX8-NEXT: v_mad_u32_u24 v0, v0, 5, s2 72; GFX8-NEXT: s_mov_b32 s3, 0xf000 73; GFX8-NEXT: s_mov_b32 s2, -1 74; GFX8-NEXT: s_nop 1 75; GFX8-NEXT: s_waitcnt lgkmcnt(0) 76; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 77; GFX8-NEXT: s_endpgm 78; 79; GFX9-LABEL: add_i32_constant: 80; GFX9: ; %bb.0: ; %entry 81; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 82; GFX9-NEXT: v_cmp_ne_u32_e64 s[4:5], 1, 0 83; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 84; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 85; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 86; GFX9-NEXT: ; implicit-def: $vgpr1 87; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 88; GFX9-NEXT: ; mask branch BB0_2 89; GFX9-NEXT: s_cbranch_execz BB0_2 90; GFX9-NEXT: BB0_1: 91; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 92; GFX9-NEXT: v_mul_u32_u24_e64 v1, s4, 5 93; GFX9-NEXT: v_mov_b32_e32 v2, local_var32@abs32@lo 94; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 95; GFX9-NEXT: ds_add_rtn_u32 v1, v2, v1 96; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 97; GFX9-NEXT: buffer_wbinvl1_vol 98; GFX9-NEXT: BB0_2: 99; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 100; GFX9-NEXT: v_readfirstlane_b32 s2, v1 101; GFX9-NEXT: v_mad_u32_u24 v0, v0, 5, s2 102; GFX9-NEXT: s_mov_b32 s3, 0xf000 103; GFX9-NEXT: s_mov_b32 s2, -1 104; GFX9-NEXT: s_nop 1 105; GFX9-NEXT: s_waitcnt lgkmcnt(0) 106; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 107; GFX9-NEXT: s_endpgm 108; 109; GFX1064-LABEL: add_i32_constant: 110; GFX1064: ; %bb.0: ; %entry 111; GFX1064-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 112; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 113; GFX1064-NEXT: ; implicit-def: $vgpr1 114; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 115; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s3, v0 116; GFX1064-NEXT: 
v_cmp_eq_u32_e32 vcc, 0, v0 117; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 118; GFX1064-NEXT: ; mask branch BB0_2 119; GFX1064-NEXT: s_cbranch_execz BB0_2 120; GFX1064-NEXT: BB0_1: 121; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 122; GFX1064-NEXT: v_mov_b32_e32 v2, local_var32@abs32@lo 123; GFX1064-NEXT: v_mul_u32_u24_e64 v1, s2, 5 124; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 125; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 126; GFX1064-NEXT: ds_add_rtn_u32 v1, v2, v1 127; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 128; GFX1064-NEXT: buffer_gl0_inv 129; GFX1064-NEXT: buffer_gl1_inv 130; GFX1064-NEXT: BB0_2: 131; GFX1064-NEXT: v_nop 132; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 133; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 134; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 135; GFX1064-NEXT: v_mad_u32_u24 v0, v0, 5, s2 136; GFX1064-NEXT: s_mov_b32 s2, -1 137; GFX1064-NEXT: s_nop 1 138; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 139; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 140; GFX1064-NEXT: s_endpgm 141; 142; GFX1032-LABEL: add_i32_constant: 143; GFX1032: ; %bb.0: ; %entry 144; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 145; GFX1032-NEXT: v_cmp_ne_u32_e64 s3, 1, 0 146; GFX1032-NEXT: ; implicit-def: $vcc_hi 147; GFX1032-NEXT: ; implicit-def: $vgpr1 148; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s3, 0 149; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 150; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo 151; GFX1032-NEXT: ; mask branch BB0_2 152; GFX1032-NEXT: s_cbranch_execz BB0_2 153; GFX1032-NEXT: BB0_1: 154; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 155; GFX1032-NEXT: v_mov_b32_e32 v2, local_var32@abs32@lo 156; GFX1032-NEXT: v_mul_u32_u24_e64 v1, s3, 5 157; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 158; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 159; GFX1032-NEXT: ds_add_rtn_u32 v1, v2, v1 160; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 161; GFX1032-NEXT: buffer_gl0_inv 162; GFX1032-NEXT: buffer_gl1_inv 163; GFX1032-NEXT: BB0_2: 164; GFX1032-NEXT: v_nop 165; 
GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 166; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 167; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 168; GFX1032-NEXT: v_mad_u32_u24 v0, v0, 5, s2 169; GFX1032-NEXT: s_mov_b32 s2, -1 170; GFX1032-NEXT: s_nop 1 171; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 172; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 173; GFX1032-NEXT: s_endpgm 174entry: 175 %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 5 acq_rel 176 store i32 %old, i32 addrspace(1)* %out 177 ret void 178} 179 180define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 %additive) { 181; 182; 183; GFX7LESS-LABEL: add_i32_uniform: 184; GFX7LESS: ; %bb.0: ; %entry 185; GFX7LESS-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 186; GFX7LESS-NEXT: s_load_dword s2, s[0:1], 0xb 187; GFX7LESS-NEXT: v_cmp_ne_u32_e64 s[6:7], 1, 0 188; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 189; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s7, v0 190; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 191; GFX7LESS-NEXT: ; implicit-def: $vgpr1 192; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc 193; GFX7LESS-NEXT: ; mask branch BB1_2 194; GFX7LESS-NEXT: s_cbranch_execz BB1_2 195; GFX7LESS-NEXT: BB1_1: 196; GFX7LESS-NEXT: s_bcnt1_i32_b64 s3, s[6:7] 197; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 198; GFX7LESS-NEXT: s_mul_i32 s3, s2, s3 199; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 200; GFX7LESS-NEXT: v_mov_b32_e32 v2, s3 201; GFX7LESS-NEXT: s_mov_b32 m0, -1 202; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 203; GFX7LESS-NEXT: ds_add_rtn_u32 v1, v1, v2 204; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 205; GFX7LESS-NEXT: buffer_wbinvl1 206; GFX7LESS-NEXT: BB1_2: 207; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1] 208; GFX7LESS-NEXT: v_readfirstlane_b32 s0, v1 209; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 210; GFX7LESS-NEXT: v_mul_lo_u32 v0, s2, v0 211; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 212; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s0, v0 213; GFX7LESS-NEXT: s_mov_b32 s6, -1 214; GFX7LESS-NEXT: 
buffer_store_dword v0, off, s[4:7], 0 215; GFX7LESS-NEXT: s_endpgm 216; 217; GFX8-LABEL: add_i32_uniform: 218; GFX8: ; %bb.0: ; %entry 219; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 220; GFX8-NEXT: s_load_dword s0, s[0:1], 0x2c 221; GFX8-NEXT: v_cmp_ne_u32_e64 s[6:7], 1, 0 222; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 223; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 224; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 225; GFX8-NEXT: ; implicit-def: $vgpr1 226; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 227; GFX8-NEXT: ; mask branch BB1_2 228; GFX8-NEXT: s_cbranch_execz BB1_2 229; GFX8-NEXT: BB1_1: 230; GFX8-NEXT: s_bcnt1_i32_b64 s1, s[6:7] 231; GFX8-NEXT: s_waitcnt lgkmcnt(0) 232; GFX8-NEXT: s_mul_i32 s1, s0, s1 233; GFX8-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 234; GFX8-NEXT: v_mov_b32_e32 v2, s1 235; GFX8-NEXT: s_mov_b32 m0, -1 236; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 237; GFX8-NEXT: ds_add_rtn_u32 v1, v1, v2 238; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 239; GFX8-NEXT: buffer_wbinvl1_vol 240; GFX8-NEXT: BB1_2: 241; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 242; GFX8-NEXT: s_waitcnt lgkmcnt(0) 243; GFX8-NEXT: v_mul_lo_u32 v0, s0, v0 244; GFX8-NEXT: v_readfirstlane_b32 s0, v1 245; GFX8-NEXT: s_mov_b32 s7, 0xf000 246; GFX8-NEXT: s_mov_b32 s6, -1 247; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 248; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0 249; GFX8-NEXT: s_endpgm 250; 251; GFX9-LABEL: add_i32_uniform: 252; GFX9: ; %bb.0: ; %entry 253; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 254; GFX9-NEXT: s_load_dword s0, s[0:1], 0x2c 255; GFX9-NEXT: v_cmp_ne_u32_e64 s[6:7], 1, 0 256; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 257; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 258; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 259; GFX9-NEXT: ; implicit-def: $vgpr1 260; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 261; GFX9-NEXT: ; mask branch BB1_2 262; GFX9-NEXT: s_cbranch_execz BB1_2 263; GFX9-NEXT: BB1_1: 264; GFX9-NEXT: s_bcnt1_i32_b64 s1, s[6:7] 265; GFX9-NEXT: s_waitcnt lgkmcnt(0) 
266; GFX9-NEXT: s_mul_i32 s1, s0, s1 267; GFX9-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 268; GFX9-NEXT: v_mov_b32_e32 v2, s1 269; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 270; GFX9-NEXT: ds_add_rtn_u32 v1, v1, v2 271; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 272; GFX9-NEXT: buffer_wbinvl1_vol 273; GFX9-NEXT: BB1_2: 274; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 275; GFX9-NEXT: s_waitcnt lgkmcnt(0) 276; GFX9-NEXT: v_mul_lo_u32 v0, s0, v0 277; GFX9-NEXT: v_readfirstlane_b32 s0, v1 278; GFX9-NEXT: s_mov_b32 s7, 0xf000 279; GFX9-NEXT: s_mov_b32 s6, -1 280; GFX9-NEXT: v_add_u32_e32 v0, s0, v0 281; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 282; GFX9-NEXT: s_endpgm 283; 284; GFX1064-LABEL: add_i32_uniform: 285; GFX1064: ; %bb.0: ; %entry 286; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 287; GFX1064-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 288; GFX1064-NEXT: s_load_dword s0, s[0:1], 0x2c 289; GFX1064-NEXT: ; implicit-def: $vgpr1 290; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 291; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s3, v0 292; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 293; GFX1064-NEXT: s_and_saveexec_b64 s[6:7], vcc 294; GFX1064-NEXT: ; mask branch BB1_2 295; GFX1064-NEXT: s_cbranch_execz BB1_2 296; GFX1064-NEXT: BB1_1: 297; GFX1064-NEXT: s_bcnt1_i32_b64 s1, s[2:3] 298; GFX1064-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 299; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 300; GFX1064-NEXT: s_mul_i32 s1, s0, s1 301; GFX1064-NEXT: v_mov_b32_e32 v2, s1 302; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 303; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 304; GFX1064-NEXT: ds_add_rtn_u32 v1, v1, v2 305; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 306; GFX1064-NEXT: buffer_gl0_inv 307; GFX1064-NEXT: buffer_gl1_inv 308; GFX1064-NEXT: BB1_2: 309; GFX1064-NEXT: v_nop 310; GFX1064-NEXT: s_or_b64 exec, exec, s[6:7] 311; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 312; GFX1064-NEXT: v_mul_lo_u32 v0, s0, v0 313; GFX1064-NEXT: v_readfirstlane_b32 s0, v1 314; GFX1064-NEXT: s_mov_b32 s7, 
0x31016000 315; GFX1064-NEXT: s_mov_b32 s6, -1 316; GFX1064-NEXT: v_add_nc_u32_e32 v0, s0, v0 317; GFX1064-NEXT: buffer_store_dword v0, off, s[4:7], 0 318; GFX1064-NEXT: s_endpgm 319; 320; GFX1032-LABEL: add_i32_uniform: 321; GFX1032: ; %bb.0: ; %entry 322; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 323; GFX1032-NEXT: s_load_dword s0, s[0:1], 0x2c 324; GFX1032-NEXT: v_cmp_ne_u32_e64 s2, 1, 0 325; GFX1032-NEXT: ; implicit-def: $vcc_hi 326; GFX1032-NEXT: ; implicit-def: $vgpr1 327; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 328; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 329; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo 330; GFX1032-NEXT: ; mask branch BB1_2 331; GFX1032-NEXT: s_cbranch_execz BB1_2 332; GFX1032-NEXT: BB1_1: 333; GFX1032-NEXT: s_bcnt1_i32_b32 s2, s2 334; GFX1032-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 335; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 336; GFX1032-NEXT: s_mul_i32 s2, s0, s2 337; GFX1032-NEXT: v_mov_b32_e32 v2, s2 338; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 339; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 340; GFX1032-NEXT: ds_add_rtn_u32 v1, v1, v2 341; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 342; GFX1032-NEXT: buffer_gl0_inv 343; GFX1032-NEXT: buffer_gl1_inv 344; GFX1032-NEXT: BB1_2: 345; GFX1032-NEXT: v_nop 346; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s1 347; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 348; GFX1032-NEXT: v_mul_lo_u32 v0, s0, v0 349; GFX1032-NEXT: v_readfirstlane_b32 s0, v1 350; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 351; GFX1032-NEXT: s_mov_b32 s6, -1 352; GFX1032-NEXT: v_add_nc_u32_e32 v0, s0, v0 353; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0 354; GFX1032-NEXT: s_endpgm 355entry: 356 %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %additive acq_rel 357 store i32 %old, i32 addrspace(1)* %out 358 ret void 359} 360 361; GFX7LESS-NOT: v_mbcnt_lo_u32_b32 362; GFX7LESS-NOT: v_mbcnt_hi_u32_b32 363; GFX7LESS-NOT: s_bcnt1_i32_b64 364; DPPCOMB: v_add_u32_dpp 365; DPPCOMB: v_add_u32_dpp 366; GFX8MORE32: 
v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 31 367; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]] 368; GFX8MORE: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]] 369define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out) { 370; 371; 372; GFX7LESS-LABEL: add_i32_varying: 373; GFX7LESS: ; %bb.0: ; %entry 374; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 375; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 376; GFX7LESS-NEXT: s_mov_b32 m0, -1 377; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 378; GFX7LESS-NEXT: ds_add_rtn_u32 v0, v1, v0 379; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 380; GFX7LESS-NEXT: buffer_wbinvl1 381; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 382; GFX7LESS-NEXT: s_mov_b32 s2, -1 383; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 384; GFX7LESS-NEXT: s_endpgm 385; 386; GFX8-LABEL: add_i32_varying: 387; GFX8: ; %bb.0: ; %entry 388; GFX8-NEXT: v_mov_b32_e32 v2, v0 389; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 390; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 391; GFX8-NEXT: v_mov_b32_e32 v1, 0 392; GFX8-NEXT: s_mov_b64 exec, s[2:3] 393; GFX8-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 394; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 395; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 396; GFX8-NEXT: s_not_b64 exec, exec 397; GFX8-NEXT: v_mov_b32_e32 v2, 0 398; GFX8-NEXT: s_not_b64 exec, exec 399; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 400; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 401; GFX8-NEXT: s_nop 1 402; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 403; GFX8-NEXT: s_nop 1 404; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 405; GFX8-NEXT: s_nop 1 406; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 407; GFX8-NEXT: s_nop 1 408; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 409; 
GFX8-NEXT: s_nop 1 410; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 411; GFX8-NEXT: v_readlane_b32 s2, v2, 63 412; GFX8-NEXT: s_nop 0 413; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 414; GFX8-NEXT: s_mov_b64 exec, s[4:5] 415; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 416; GFX8-NEXT: ; implicit-def: $vgpr0 417; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 418; GFX8-NEXT: ; mask branch BB2_2 419; GFX8-NEXT: s_cbranch_execz BB2_2 420; GFX8-NEXT: BB2_1: 421; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 422; GFX8-NEXT: v_mov_b32_e32 v3, s2 423; GFX8-NEXT: s_mov_b32 m0, -1 424; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 425; GFX8-NEXT: ds_add_rtn_u32 v0, v0, v3 426; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 427; GFX8-NEXT: buffer_wbinvl1_vol 428; GFX8-NEXT: BB2_2: 429; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 430; GFX8-NEXT: v_readfirstlane_b32 s2, v0 431; GFX8-NEXT: v_mov_b32_e32 v0, v1 432; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0 433; GFX8-NEXT: s_mov_b32 s3, 0xf000 434; GFX8-NEXT: s_mov_b32 s2, -1 435; GFX8-NEXT: s_nop 0 436; GFX8-NEXT: s_waitcnt lgkmcnt(0) 437; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 438; GFX8-NEXT: s_endpgm 439; 440; GFX9-LABEL: add_i32_varying: 441; GFX9: ; %bb.0: ; %entry 442; GFX9-NEXT: v_mov_b32_e32 v2, v0 443; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 444; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 445; GFX9-NEXT: v_mov_b32_e32 v1, 0 446; GFX9-NEXT: s_mov_b64 exec, s[2:3] 447; GFX9-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 448; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 449; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 450; GFX9-NEXT: s_not_b64 exec, exec 451; GFX9-NEXT: v_mov_b32_e32 v2, 0 452; GFX9-NEXT: s_not_b64 exec, exec 453; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 454; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 455; GFX9-NEXT: s_nop 1 456; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 457; GFX9-NEXT: 
s_nop 1 458; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 459; GFX9-NEXT: s_nop 1 460; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 461; GFX9-NEXT: s_nop 1 462; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 463; GFX9-NEXT: s_nop 1 464; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 465; GFX9-NEXT: v_readlane_b32 s2, v2, 63 466; GFX9-NEXT: s_nop 0 467; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 468; GFX9-NEXT: s_mov_b64 exec, s[4:5] 469; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 470; GFX9-NEXT: ; implicit-def: $vgpr0 471; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 472; GFX9-NEXT: ; mask branch BB2_2 473; GFX9-NEXT: s_cbranch_execz BB2_2 474; GFX9-NEXT: BB2_1: 475; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 476; GFX9-NEXT: v_mov_b32_e32 v3, s2 477; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 478; GFX9-NEXT: ds_add_rtn_u32 v0, v0, v3 479; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 480; GFX9-NEXT: buffer_wbinvl1_vol 481; GFX9-NEXT: BB2_2: 482; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 483; GFX9-NEXT: v_readfirstlane_b32 s2, v0 484; GFX9-NEXT: v_mov_b32_e32 v0, v1 485; GFX9-NEXT: v_add_u32_e32 v0, s2, v0 486; GFX9-NEXT: s_mov_b32 s3, 0xf000 487; GFX9-NEXT: s_mov_b32 s2, -1 488; GFX9-NEXT: s_nop 0 489; GFX9-NEXT: s_waitcnt lgkmcnt(0) 490; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 491; GFX9-NEXT: s_endpgm 492; 493; GFX1064-LABEL: add_i32_varying: 494; GFX1064: ; %bb.0: ; %entry 495; GFX1064-NEXT: v_mov_b32_e32 v2, v0 496; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 497; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 498; GFX1064-NEXT: v_mov_b32_e32 v1, 0 499; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 500; GFX1064-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 501; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 502; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s3, v0 503; GFX1064-NEXT: s_not_b64 exec, exec 504; 
GFX1064-NEXT: v_mov_b32_e32 v2, 0 505; GFX1064-NEXT: s_not_b64 exec, exec 506; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 507; GFX1064-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 508; GFX1064-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 509; GFX1064-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 510; GFX1064-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 511; GFX1064-NEXT: v_mov_b32_e32 v3, v2 512; GFX1064-NEXT: v_permlanex16_b32 v3, v3, -1, -1 513; GFX1064-NEXT: v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 514; GFX1064-NEXT: v_readlane_b32 s2, v2, 31 515; GFX1064-NEXT: v_mov_b32_e32 v3, s2 516; GFX1064-NEXT: v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 517; GFX1064-NEXT: v_readlane_b32 s2, v2, 15 518; GFX1064-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf 519; GFX1064-NEXT: v_readlane_b32 s3, v2, 31 520; GFX1064-NEXT: v_readlane_b32 s6, v2, 47 521; GFX1064-NEXT: v_writelane_b32 v1, s2, 16 522; GFX1064-NEXT: s_mov_b32 s2, -1 523; GFX1064-NEXT: v_writelane_b32 v1, s3, 32 524; GFX1064-NEXT: v_readlane_b32 s3, v2, 63 525; GFX1064-NEXT: v_writelane_b32 v1, s6, 48 526; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 527; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 528; GFX1064-NEXT: ; implicit-def: $vgpr0 529; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 530; GFX1064-NEXT: ; mask branch BB2_2 531; GFX1064-NEXT: s_cbranch_execz BB2_2 532; GFX1064-NEXT: BB2_1: 533; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 534; GFX1064-NEXT: v_mov_b32_e32 v7, s3 535; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 536; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 537; GFX1064-NEXT: ds_add_rtn_u32 v0, v0, v7 538; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 539; GFX1064-NEXT: buffer_gl0_inv 540; GFX1064-NEXT: buffer_gl1_inv 541; GFX1064-NEXT: BB2_2: 542; GFX1064-NEXT: v_nop 
543; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 544; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 545; GFX1064-NEXT: v_mov_b32_e32 v0, v1 546; GFX1064-NEXT: v_add_nc_u32_e32 v0, s3, v0 547; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 548; GFX1064-NEXT: s_nop 1 549; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 550; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 551; GFX1064-NEXT: s_endpgm 552; 553; GFX1032-LABEL: add_i32_varying: 554; GFX1032: ; %bb.0: ; %entry 555; GFX1032-NEXT: ; implicit-def: $vcc_hi 556; GFX1032-NEXT: v_mov_b32_e32 v2, v0 557; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 558; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 559; GFX1032-NEXT: v_mov_b32_e32 v1, 0 560; GFX1032-NEXT: s_mov_b32 exec_lo, s2 561; GFX1032-NEXT: v_cmp_ne_u32_e64 s2, 1, 0 562; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 563; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 564; GFX1032-NEXT: v_mov_b32_e32 v2, 0 565; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 566; GFX1032-NEXT: s_or_saveexec_b32 s4, -1 567; GFX1032-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 568; GFX1032-NEXT: s_mov_b32 s2, -1 569; GFX1032-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 570; GFX1032-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 571; GFX1032-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 572; GFX1032-NEXT: v_mov_b32_e32 v3, v2 573; GFX1032-NEXT: v_permlanex16_b32 v3, v3, -1, -1 574; GFX1032-NEXT: v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 575; GFX1032-NEXT: v_readlane_b32 s3, v2, 31 576; GFX1032-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf 577; GFX1032-NEXT: v_readlane_b32 s5, v2, 15 578; GFX1032-NEXT: v_writelane_b32 v1, s5, 16 579; GFX1032-NEXT: s_mov_b32 exec_lo, s4 580; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 581; GFX1032-NEXT: ; implicit-def: $vgpr0 582; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo 583; 
GFX1032-NEXT: ; mask branch BB2_2 584; GFX1032-NEXT: s_cbranch_execz BB2_2 585; GFX1032-NEXT: BB2_1: 586; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 587; GFX1032-NEXT: v_mov_b32_e32 v7, s3 588; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 589; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 590; GFX1032-NEXT: ds_add_rtn_u32 v0, v0, v7 591; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 592; GFX1032-NEXT: buffer_gl0_inv 593; GFX1032-NEXT: buffer_gl1_inv 594; GFX1032-NEXT: BB2_2: 595; GFX1032-NEXT: v_nop 596; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 597; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 598; GFX1032-NEXT: v_mov_b32_e32 v0, v1 599; GFX1032-NEXT: v_add_nc_u32_e32 v0, s3, v0 600; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 601; GFX1032-NEXT: s_nop 1 602; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 603; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 604; GFX1032-NEXT: s_endpgm 605entry: 606 %lane = call i32 @llvm.amdgcn.workitem.id.x() 607 %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %lane acq_rel 608 store i32 %old, i32 addrspace(1)* %out 609 ret void 610} 611 612define amdgpu_kernel void @add_i32_varying_gfx1032(i32 addrspace(1)* %out) { 613; 614; 615; GFX7LESS-LABEL: add_i32_varying_gfx1032: 616; GFX7LESS: ; %bb.0: ; %entry 617; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 618; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 619; GFX7LESS-NEXT: s_mov_b32 m0, -1 620; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 621; GFX7LESS-NEXT: ds_add_rtn_u32 v0, v1, v0 622; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 623; GFX7LESS-NEXT: buffer_wbinvl1 624; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 625; GFX7LESS-NEXT: s_mov_b32 s2, -1 626; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 627; GFX7LESS-NEXT: s_endpgm 628; 629; GFX8-LABEL: add_i32_varying_gfx1032: 630; GFX8: ; %bb.0: ; %entry 631; GFX8-NEXT: v_mov_b32_e32 v2, v0 632; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 633; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 634; GFX8-NEXT: v_mov_b32_e32 v1, 0 
635; GFX8-NEXT: s_mov_b64 exec, s[2:3] 636; GFX8-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 637; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 638; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 639; GFX8-NEXT: s_not_b64 exec, exec 640; GFX8-NEXT: v_mov_b32_e32 v2, 0 641; GFX8-NEXT: s_not_b64 exec, exec 642; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 643; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 644; GFX8-NEXT: s_nop 1 645; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 646; GFX8-NEXT: s_nop 1 647; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 648; GFX8-NEXT: s_nop 1 649; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 650; GFX8-NEXT: s_nop 1 651; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 652; GFX8-NEXT: s_nop 1 653; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 654; GFX8-NEXT: v_readlane_b32 s2, v2, 63 655; GFX8-NEXT: s_nop 0 656; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 657; GFX8-NEXT: s_mov_b64 exec, s[4:5] 658; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 659; GFX8-NEXT: ; implicit-def: $vgpr0 660; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 661; GFX8-NEXT: ; mask branch BB3_2 662; GFX8-NEXT: s_cbranch_execz BB3_2 663; GFX8-NEXT: BB3_1: 664; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 665; GFX8-NEXT: v_mov_b32_e32 v3, s2 666; GFX8-NEXT: s_mov_b32 m0, -1 667; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 668; GFX8-NEXT: ds_add_rtn_u32 v0, v0, v3 669; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 670; GFX8-NEXT: buffer_wbinvl1_vol 671; GFX8-NEXT: BB3_2: 672; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 673; GFX8-NEXT: v_readfirstlane_b32 s2, v0 674; GFX8-NEXT: v_mov_b32_e32 v0, v1 675; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0 676; GFX8-NEXT: s_mov_b32 s3, 0xf000 677; GFX8-NEXT: s_mov_b32 s2, -1 678; GFX8-NEXT: s_nop 0 679; 
GFX8-NEXT: s_waitcnt lgkmcnt(0) 680; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 681; GFX8-NEXT: s_endpgm 682; 683; GFX9-LABEL: add_i32_varying_gfx1032: 684; GFX9: ; %bb.0: ; %entry 685; GFX9-NEXT: v_mov_b32_e32 v2, v0 686; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 687; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 688; GFX9-NEXT: v_mov_b32_e32 v1, 0 689; GFX9-NEXT: s_mov_b64 exec, s[2:3] 690; GFX9-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 691; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 692; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 693; GFX9-NEXT: s_not_b64 exec, exec 694; GFX9-NEXT: v_mov_b32_e32 v2, 0 695; GFX9-NEXT: s_not_b64 exec, exec 696; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 697; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 698; GFX9-NEXT: s_nop 1 699; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 700; GFX9-NEXT: s_nop 1 701; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 702; GFX9-NEXT: s_nop 1 703; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 704; GFX9-NEXT: s_nop 1 705; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 706; GFX9-NEXT: s_nop 1 707; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 708; GFX9-NEXT: v_readlane_b32 s2, v2, 63 709; GFX9-NEXT: s_nop 0 710; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 711; GFX9-NEXT: s_mov_b64 exec, s[4:5] 712; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 713; GFX9-NEXT: ; implicit-def: $vgpr0 714; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 715; GFX9-NEXT: ; mask branch BB3_2 716; GFX9-NEXT: s_cbranch_execz BB3_2 717; GFX9-NEXT: BB3_1: 718; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 719; GFX9-NEXT: v_mov_b32_e32 v3, s2 720; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 721; GFX9-NEXT: ds_add_rtn_u32 v0, v0, v3 722; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 723; GFX9-NEXT: 
buffer_wbinvl1_vol 724; GFX9-NEXT: BB3_2: 725; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 726; GFX9-NEXT: v_readfirstlane_b32 s2, v0 727; GFX9-NEXT: v_mov_b32_e32 v0, v1 728; GFX9-NEXT: v_add_u32_e32 v0, s2, v0 729; GFX9-NEXT: s_mov_b32 s3, 0xf000 730; GFX9-NEXT: s_mov_b32 s2, -1 731; GFX9-NEXT: s_nop 0 732; GFX9-NEXT: s_waitcnt lgkmcnt(0) 733; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 734; GFX9-NEXT: s_endpgm 735; 736; GFX1064-LABEL: add_i32_varying_gfx1032: 737; GFX1064: ; %bb.0: ; %entry 738; GFX1064-NEXT: v_mov_b32_e32 v2, v0 739; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 740; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 741; GFX1064-NEXT: v_mov_b32_e32 v1, 0 742; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 743; GFX1064-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 744; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 745; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s3, v0 746; GFX1064-NEXT: s_not_b64 exec, exec 747; GFX1064-NEXT: v_mov_b32_e32 v2, 0 748; GFX1064-NEXT: s_not_b64 exec, exec 749; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 750; GFX1064-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 751; GFX1064-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 752; GFX1064-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 753; GFX1064-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 754; GFX1064-NEXT: v_mov_b32_e32 v3, v2 755; GFX1064-NEXT: v_permlanex16_b32 v3, v3, -1, -1 756; GFX1064-NEXT: v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 757; GFX1064-NEXT: v_readlane_b32 s2, v2, 31 758; GFX1064-NEXT: v_mov_b32_e32 v3, s2 759; GFX1064-NEXT: v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 760; GFX1064-NEXT: v_readlane_b32 s2, v2, 15 761; GFX1064-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf 762; GFX1064-NEXT: v_readlane_b32 s3, v2, 31 763; GFX1064-NEXT: v_readlane_b32 
s6, v2, 47 764; GFX1064-NEXT: v_writelane_b32 v1, s2, 16 765; GFX1064-NEXT: s_mov_b32 s2, -1 766; GFX1064-NEXT: v_writelane_b32 v1, s3, 32 767; GFX1064-NEXT: v_readlane_b32 s3, v2, 63 768; GFX1064-NEXT: v_writelane_b32 v1, s6, 48 769; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 770; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 771; GFX1064-NEXT: ; implicit-def: $vgpr0 772; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 773; GFX1064-NEXT: ; mask branch BB3_2 774; GFX1064-NEXT: s_cbranch_execz BB3_2 775; GFX1064-NEXT: BB3_1: 776; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 777; GFX1064-NEXT: v_mov_b32_e32 v7, s3 778; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 779; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 780; GFX1064-NEXT: ds_add_rtn_u32 v0, v0, v7 781; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 782; GFX1064-NEXT: buffer_gl0_inv 783; GFX1064-NEXT: buffer_gl1_inv 784; GFX1064-NEXT: BB3_2: 785; GFX1064-NEXT: v_nop 786; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 787; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 788; GFX1064-NEXT: v_mov_b32_e32 v0, v1 789; GFX1064-NEXT: v_add_nc_u32_e32 v0, s3, v0 790; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 791; GFX1064-NEXT: s_nop 1 792; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 793; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 794; GFX1064-NEXT: s_endpgm 795; 796; GFX1032-LABEL: add_i32_varying_gfx1032: 797; GFX1032: ; %bb.0: ; %entry 798; GFX1032-NEXT: ; implicit-def: $vcc_hi 799; GFX1032-NEXT: v_mov_b32_e32 v2, v0 800; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 801; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 802; GFX1032-NEXT: v_mov_b32_e32 v1, 0 803; GFX1032-NEXT: s_mov_b32 exec_lo, s2 804; GFX1032-NEXT: v_cmp_ne_u32_e64 s2, 1, 0 805; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 806; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 807; GFX1032-NEXT: v_mov_b32_e32 v2, 0 808; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 809; GFX1032-NEXT: s_or_saveexec_b32 s4, -1 810; GFX1032-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 
bound_ctrl:0 811; GFX1032-NEXT: s_mov_b32 s2, -1 812; GFX1032-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 813; GFX1032-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 814; GFX1032-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 815; GFX1032-NEXT: v_mov_b32_e32 v3, v2 816; GFX1032-NEXT: v_permlanex16_b32 v3, v3, -1, -1 817; GFX1032-NEXT: v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 818; GFX1032-NEXT: v_readlane_b32 s3, v2, 31 819; GFX1032-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf 820; GFX1032-NEXT: v_readlane_b32 s5, v2, 15 821; GFX1032-NEXT: v_writelane_b32 v1, s5, 16 822; GFX1032-NEXT: s_mov_b32 exec_lo, s4 823; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 824; GFX1032-NEXT: ; implicit-def: $vgpr0 825; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo 826; GFX1032-NEXT: ; mask branch BB3_2 827; GFX1032-NEXT: s_cbranch_execz BB3_2 828; GFX1032-NEXT: BB3_1: 829; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 830; GFX1032-NEXT: v_mov_b32_e32 v7, s3 831; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 832; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 833; GFX1032-NEXT: ds_add_rtn_u32 v0, v0, v7 834; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 835; GFX1032-NEXT: buffer_gl0_inv 836; GFX1032-NEXT: buffer_gl1_inv 837; GFX1032-NEXT: BB3_2: 838; GFX1032-NEXT: v_nop 839; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 840; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 841; GFX1032-NEXT: v_mov_b32_e32 v0, v1 842; GFX1032-NEXT: v_add_nc_u32_e32 v0, s3, v0 843; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 844; GFX1032-NEXT: s_nop 1 845; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 846; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 847; GFX1032-NEXT: s_endpgm 848entry: 849 %lane = call i32 @llvm.amdgcn.workitem.id.x() 850 %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %lane acq_rel 851 store i32 %old, i32 addrspace(1)* %out 852 ret 
void 853} 854 855define amdgpu_kernel void @add_i32_varying_gfx1064(i32 addrspace(1)* %out) { 856; 857; 858; GFX7LESS-LABEL: add_i32_varying_gfx1064: 859; GFX7LESS: ; %bb.0: ; %entry 860; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 861; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 862; GFX7LESS-NEXT: s_mov_b32 m0, -1 863; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 864; GFX7LESS-NEXT: ds_add_rtn_u32 v0, v1, v0 865; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 866; GFX7LESS-NEXT: buffer_wbinvl1 867; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 868; GFX7LESS-NEXT: s_mov_b32 s2, -1 869; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 870; GFX7LESS-NEXT: s_endpgm 871; 872; GFX8-LABEL: add_i32_varying_gfx1064: 873; GFX8: ; %bb.0: ; %entry 874; GFX8-NEXT: v_mov_b32_e32 v2, v0 875; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 876; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 877; GFX8-NEXT: v_mov_b32_e32 v1, 0 878; GFX8-NEXT: s_mov_b64 exec, s[2:3] 879; GFX8-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 880; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 881; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 882; GFX8-NEXT: s_not_b64 exec, exec 883; GFX8-NEXT: v_mov_b32_e32 v2, 0 884; GFX8-NEXT: s_not_b64 exec, exec 885; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 886; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 887; GFX8-NEXT: s_nop 1 888; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 889; GFX8-NEXT: s_nop 1 890; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 891; GFX8-NEXT: s_nop 1 892; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 893; GFX8-NEXT: s_nop 1 894; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 895; GFX8-NEXT: s_nop 1 896; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 897; GFX8-NEXT: v_readlane_b32 s2, v2, 63 898; GFX8-NEXT: s_nop 0 
899; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 900; GFX8-NEXT: s_mov_b64 exec, s[4:5] 901; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 902; GFX8-NEXT: ; implicit-def: $vgpr0 903; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 904; GFX8-NEXT: ; mask branch BB4_2 905; GFX8-NEXT: s_cbranch_execz BB4_2 906; GFX8-NEXT: BB4_1: 907; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 908; GFX8-NEXT: v_mov_b32_e32 v3, s2 909; GFX8-NEXT: s_mov_b32 m0, -1 910; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 911; GFX8-NEXT: ds_add_rtn_u32 v0, v0, v3 912; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 913; GFX8-NEXT: buffer_wbinvl1_vol 914; GFX8-NEXT: BB4_2: 915; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 916; GFX8-NEXT: v_readfirstlane_b32 s2, v0 917; GFX8-NEXT: v_mov_b32_e32 v0, v1 918; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0 919; GFX8-NEXT: s_mov_b32 s3, 0xf000 920; GFX8-NEXT: s_mov_b32 s2, -1 921; GFX8-NEXT: s_nop 0 922; GFX8-NEXT: s_waitcnt lgkmcnt(0) 923; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 924; GFX8-NEXT: s_endpgm 925; 926; GFX9-LABEL: add_i32_varying_gfx1064: 927; GFX9: ; %bb.0: ; %entry 928; GFX9-NEXT: v_mov_b32_e32 v2, v0 929; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 930; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 931; GFX9-NEXT: v_mov_b32_e32 v1, 0 932; GFX9-NEXT: s_mov_b64 exec, s[2:3] 933; GFX9-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 934; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 935; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 936; GFX9-NEXT: s_not_b64 exec, exec 937; GFX9-NEXT: v_mov_b32_e32 v2, 0 938; GFX9-NEXT: s_not_b64 exec, exec 939; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 940; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 941; GFX9-NEXT: s_nop 1 942; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 943; GFX9-NEXT: s_nop 1 944; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 945; GFX9-NEXT: s_nop 1 946; GFX9-NEXT: v_add_u32_dpp v2, v2, 
v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 947; GFX9-NEXT: s_nop 1 948; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 949; GFX9-NEXT: s_nop 1 950; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 951; GFX9-NEXT: v_readlane_b32 s2, v2, 63 952; GFX9-NEXT: s_nop 0 953; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 954; GFX9-NEXT: s_mov_b64 exec, s[4:5] 955; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 956; GFX9-NEXT: ; implicit-def: $vgpr0 957; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 958; GFX9-NEXT: ; mask branch BB4_2 959; GFX9-NEXT: s_cbranch_execz BB4_2 960; GFX9-NEXT: BB4_1: 961; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 962; GFX9-NEXT: v_mov_b32_e32 v3, s2 963; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 964; GFX9-NEXT: ds_add_rtn_u32 v0, v0, v3 965; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 966; GFX9-NEXT: buffer_wbinvl1_vol 967; GFX9-NEXT: BB4_2: 968; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 969; GFX9-NEXT: v_readfirstlane_b32 s2, v0 970; GFX9-NEXT: v_mov_b32_e32 v0, v1 971; GFX9-NEXT: v_add_u32_e32 v0, s2, v0 972; GFX9-NEXT: s_mov_b32 s3, 0xf000 973; GFX9-NEXT: s_mov_b32 s2, -1 974; GFX9-NEXT: s_nop 0 975; GFX9-NEXT: s_waitcnt lgkmcnt(0) 976; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 977; GFX9-NEXT: s_endpgm 978; 979; GFX1064-LABEL: add_i32_varying_gfx1064: 980; GFX1064: ; %bb.0: ; %entry 981; GFX1064-NEXT: v_mov_b32_e32 v2, v0 982; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 983; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 984; GFX1064-NEXT: v_mov_b32_e32 v1, 0 985; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 986; GFX1064-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 987; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 988; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s3, v0 989; GFX1064-NEXT: s_not_b64 exec, exec 990; GFX1064-NEXT: v_mov_b32_e32 v2, 0 991; GFX1064-NEXT: s_not_b64 exec, exec 992; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 993; GFX1064-NEXT: v_add_nc_u32_dpp v2, v2, 
v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 994; GFX1064-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 995; GFX1064-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 996; GFX1064-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 997; GFX1064-NEXT: v_mov_b32_e32 v3, v2 998; GFX1064-NEXT: v_permlanex16_b32 v3, v3, -1, -1 999; GFX1064-NEXT: v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 1000; GFX1064-NEXT: v_readlane_b32 s2, v2, 31 1001; GFX1064-NEXT: v_mov_b32_e32 v3, s2 1002; GFX1064-NEXT: v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 1003; GFX1064-NEXT: v_readlane_b32 s2, v2, 15 1004; GFX1064-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf 1005; GFX1064-NEXT: v_readlane_b32 s3, v2, 31 1006; GFX1064-NEXT: v_readlane_b32 s6, v2, 47 1007; GFX1064-NEXT: v_writelane_b32 v1, s2, 16 1008; GFX1064-NEXT: s_mov_b32 s2, -1 1009; GFX1064-NEXT: v_writelane_b32 v1, s3, 32 1010; GFX1064-NEXT: v_readlane_b32 s3, v2, 63 1011; GFX1064-NEXT: v_writelane_b32 v1, s6, 48 1012; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 1013; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1014; GFX1064-NEXT: ; implicit-def: $vgpr0 1015; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 1016; GFX1064-NEXT: ; mask branch BB4_2 1017; GFX1064-NEXT: s_cbranch_execz BB4_2 1018; GFX1064-NEXT: BB4_1: 1019; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 1020; GFX1064-NEXT: v_mov_b32_e32 v7, s3 1021; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1022; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 1023; GFX1064-NEXT: ds_add_rtn_u32 v0, v0, v7 1024; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1025; GFX1064-NEXT: buffer_gl0_inv 1026; GFX1064-NEXT: buffer_gl1_inv 1027; GFX1064-NEXT: BB4_2: 1028; GFX1064-NEXT: v_nop 1029; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 1030; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 1031; GFX1064-NEXT: v_mov_b32_e32 v0, v1 
1032; GFX1064-NEXT: v_add_nc_u32_e32 v0, s3, v0 1033; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 1034; GFX1064-NEXT: s_nop 1 1035; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1036; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 1037; GFX1064-NEXT: s_endpgm 1038; 1039; GFX1032-LABEL: add_i32_varying_gfx1064: 1040; GFX1032: ; %bb.0: ; %entry 1041; GFX1032-NEXT: ; implicit-def: $vcc_hi 1042; GFX1032-NEXT: v_mov_b32_e32 v2, v0 1043; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 1044; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1045; GFX1032-NEXT: v_mov_b32_e32 v1, 0 1046; GFX1032-NEXT: s_mov_b32 exec_lo, s2 1047; GFX1032-NEXT: v_cmp_ne_u32_e64 s2, 1, 0 1048; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 1049; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 1050; GFX1032-NEXT: v_mov_b32_e32 v2, 0 1051; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 1052; GFX1032-NEXT: s_or_saveexec_b32 s4, -1 1053; GFX1032-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 1054; GFX1032-NEXT: s_mov_b32 s2, -1 1055; GFX1032-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 1056; GFX1032-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 1057; GFX1032-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 1058; GFX1032-NEXT: v_mov_b32_e32 v3, v2 1059; GFX1032-NEXT: v_permlanex16_b32 v3, v3, -1, -1 1060; GFX1032-NEXT: v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 1061; GFX1032-NEXT: v_readlane_b32 s3, v2, 31 1062; GFX1032-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf 1063; GFX1032-NEXT: v_readlane_b32 s5, v2, 15 1064; GFX1032-NEXT: v_writelane_b32 v1, s5, 16 1065; GFX1032-NEXT: s_mov_b32 exec_lo, s4 1066; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 1067; GFX1032-NEXT: ; implicit-def: $vgpr0 1068; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo 1069; GFX1032-NEXT: ; mask branch BB4_2 1070; GFX1032-NEXT: s_cbranch_execz BB4_2 1071; 
GFX1032-NEXT: BB4_1: 1072; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 1073; GFX1032-NEXT: v_mov_b32_e32 v7, s3 1074; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1075; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 1076; GFX1032-NEXT: ds_add_rtn_u32 v0, v0, v7 1077; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1078; GFX1032-NEXT: buffer_gl0_inv 1079; GFX1032-NEXT: buffer_gl1_inv 1080; GFX1032-NEXT: BB4_2: 1081; GFX1032-NEXT: v_nop 1082; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 1083; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 1084; GFX1032-NEXT: v_mov_b32_e32 v0, v1 1085; GFX1032-NEXT: v_add_nc_u32_e32 v0, s3, v0 1086; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 1087; GFX1032-NEXT: s_nop 1 1088; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1089; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 1090; GFX1032-NEXT: s_endpgm 1091entry: 1092 %lane = call i32 @llvm.amdgcn.workitem.id.x() 1093 %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %lane acq_rel 1094 store i32 %old, i32 addrspace(1)* %out 1095 ret void 1096} 1097 1098define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) { 1099; 1100; 1101; GFX7LESS-LABEL: add_i64_constant: 1102; GFX7LESS: ; %bb.0: ; %entry 1103; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1104; GFX7LESS-NEXT: v_cmp_ne_u32_e64 s[4:5], 1, 0 1105; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 1106; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 1107; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1108; GFX7LESS-NEXT: ; implicit-def: $vgpr1_vgpr2 1109; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc 1110; GFX7LESS-NEXT: ; mask branch BB5_2 1111; GFX7LESS-NEXT: s_cbranch_execz BB5_2 1112; GFX7LESS-NEXT: BB5_1: 1113; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 1114; GFX7LESS-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 1115; GFX7LESS-NEXT: v_mul_hi_u32_u24_e64 v2, s4, 5 1116; GFX7LESS-NEXT: v_mul_u32_u24_e64 v1, s4, 5 1117; GFX7LESS-NEXT: s_mov_b32 m0, -1 1118; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1119; GFX7LESS-NEXT: 
ds_add_rtn_u64 v[1:2], v3, v[1:2] 1120; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1121; GFX7LESS-NEXT: buffer_wbinvl1 1122; GFX7LESS-NEXT: BB5_2: 1123; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] 1124; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v1 1125; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v2 1126; GFX7LESS-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v0 1127; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1128; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 1129; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4 1130; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s2, v0 1131; GFX7LESS-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc 1132; GFX7LESS-NEXT: s_mov_b32 s2, -1 1133; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1134; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1135; GFX7LESS-NEXT: s_endpgm 1136; 1137; GFX8-LABEL: add_i64_constant: 1138; GFX8: ; %bb.0: ; %entry 1139; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1140; GFX8-NEXT: v_cmp_ne_u32_e64 s[4:5], 1, 0 1141; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 1142; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 1143; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1144; GFX8-NEXT: ; implicit-def: $vgpr1_vgpr2 1145; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 1146; GFX8-NEXT: ; mask branch BB5_2 1147; GFX8-NEXT: s_cbranch_execz BB5_2 1148; GFX8-NEXT: BB5_1: 1149; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 1150; GFX8-NEXT: v_mul_hi_u32_u24_e64 v2, s4, 5 1151; GFX8-NEXT: v_mul_u32_u24_e64 v1, s4, 5 1152; GFX8-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 1153; GFX8-NEXT: s_mov_b32 m0, -1 1154; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1155; GFX8-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] 1156; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1157; GFX8-NEXT: buffer_wbinvl1_vol 1158; GFX8-NEXT: BB5_2: 1159; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 1160; GFX8-NEXT: v_readfirstlane_b32 s2, v1 1161; GFX8-NEXT: v_readfirstlane_b32 s3, v2 1162; GFX8-NEXT: v_mov_b32_e32 v1, s2 1163; GFX8-NEXT: v_mov_b32_e32 v2, s3 1164; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v0, 5, v[1:2] 1165; GFX8-NEXT: s_mov_b32 s3, 
0xf000 1166; GFX8-NEXT: s_mov_b32 s2, -1 1167; GFX8-NEXT: s_nop 2 1168; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1169; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1170; GFX8-NEXT: s_endpgm 1171; 1172; GFX9-LABEL: add_i64_constant: 1173; GFX9: ; %bb.0: ; %entry 1174; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1175; GFX9-NEXT: v_cmp_ne_u32_e64 s[4:5], 1, 0 1176; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 1177; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 1178; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1179; GFX9-NEXT: ; implicit-def: $vgpr1_vgpr2 1180; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 1181; GFX9-NEXT: ; mask branch BB5_2 1182; GFX9-NEXT: s_cbranch_execz BB5_2 1183; GFX9-NEXT: BB5_1: 1184; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 1185; GFX9-NEXT: v_mul_hi_u32_u24_e64 v2, s4, 5 1186; GFX9-NEXT: v_mul_u32_u24_e64 v1, s4, 5 1187; GFX9-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 1188; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1189; GFX9-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] 1190; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1191; GFX9-NEXT: buffer_wbinvl1_vol 1192; GFX9-NEXT: BB5_2: 1193; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 1194; GFX9-NEXT: v_readfirstlane_b32 s2, v1 1195; GFX9-NEXT: v_readfirstlane_b32 s3, v2 1196; GFX9-NEXT: v_mov_b32_e32 v1, s2 1197; GFX9-NEXT: v_mov_b32_e32 v2, s3 1198; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v0, 5, v[1:2] 1199; GFX9-NEXT: s_mov_b32 s3, 0xf000 1200; GFX9-NEXT: s_mov_b32 s2, -1 1201; GFX9-NEXT: s_nop 2 1202; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1203; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1204; GFX9-NEXT: s_endpgm 1205; 1206; GFX1064-LABEL: add_i64_constant: 1207; GFX1064: ; %bb.0: ; %entry 1208; GFX1064-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 1209; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1210; GFX1064-NEXT: ; implicit-def: $vgpr1_vgpr2 1211; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 1212; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s3, v0 1213; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1214; GFX1064-NEXT: 
s_and_saveexec_b64 s[4:5], vcc 1215; GFX1064-NEXT: ; mask branch BB5_2 1216; GFX1064-NEXT: s_cbranch_execz BB5_2 1217; GFX1064-NEXT: BB5_1: 1218; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1219; GFX1064-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 1220; GFX1064-NEXT: v_mul_hi_u32_u24_e64 v2, s2, 5 1221; GFX1064-NEXT: v_mul_u32_u24_e64 v1, s2, 5 1222; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1223; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 1224; GFX1064-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] 1225; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1226; GFX1064-NEXT: buffer_gl0_inv 1227; GFX1064-NEXT: buffer_gl1_inv 1228; GFX1064-NEXT: BB5_2: 1229; GFX1064-NEXT: v_nop 1230; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 1231; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 1232; GFX1064-NEXT: v_readfirstlane_b32 s3, v2 1233; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v0, 5, s[2:3] 1234; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 1235; GFX1064-NEXT: s_mov_b32 s2, -1 1236; GFX1064-NEXT: s_nop 2 1237; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1238; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1239; GFX1064-NEXT: s_endpgm 1240; 1241; GFX1032-LABEL: add_i64_constant: 1242; GFX1032: ; %bb.0: ; %entry 1243; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1244; GFX1032-NEXT: v_cmp_ne_u32_e64 s3, 1, 0 1245; GFX1032-NEXT: ; implicit-def: $vcc_hi 1246; GFX1032-NEXT: ; implicit-def: $vgpr1_vgpr2 1247; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s3, 0 1248; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 1249; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo 1250; GFX1032-NEXT: ; mask branch BB5_2 1251; GFX1032-NEXT: s_cbranch_execz BB5_2 1252; GFX1032-NEXT: BB5_1: 1253; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 1254; GFX1032-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 1255; GFX1032-NEXT: v_mul_hi_u32_u24_e64 v2, s3, 5 1256; GFX1032-NEXT: v_mul_u32_u24_e64 v1, s3, 5 1257; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1258; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 1259; GFX1032-NEXT: ds_add_rtn_u64 v[1:2], v3, 
v[1:2] 1260; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1261; GFX1032-NEXT: buffer_gl0_inv 1262; GFX1032-NEXT: buffer_gl1_inv 1263; GFX1032-NEXT: BB5_2: 1264; GFX1032-NEXT: v_nop 1265; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 1266; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 1267; GFX1032-NEXT: v_readfirstlane_b32 s3, v2 1268; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s2, v0, 5, s[2:3] 1269; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 1270; GFX1032-NEXT: s_mov_b32 s2, -1 1271; GFX1032-NEXT: s_nop 2 1272; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1273; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1274; GFX1032-NEXT: s_endpgm 1275entry: 1276 %old = atomicrmw add i64 addrspace(3)* @local_var64, i64 5 acq_rel 1277 store i64 %old, i64 addrspace(1)* %out 1278 ret void 1279} 1280 1281define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 %additive) { 1282; 1283; 1284; GFX7LESS-LABEL: add_i64_uniform: 1285; GFX7LESS: ; %bb.0: ; %entry 1286; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 1287; GFX7LESS-NEXT: v_cmp_ne_u32_e64 s[6:7], 1, 0 1288; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 1289; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s7, v0 1290; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1291; GFX7LESS-NEXT: ; implicit-def: $vgpr1_vgpr2 1292; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc 1293; GFX7LESS-NEXT: ; mask branch BB6_2 1294; GFX7LESS-NEXT: s_cbranch_execz BB6_2 1295; GFX7LESS-NEXT: BB6_1: 1296; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 1297; GFX7LESS-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 1298; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1299; GFX7LESS-NEXT: s_mul_i32 s7, s3, s6 1300; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6 1301; GFX7LESS-NEXT: v_mul_hi_u32 v1, s2, v1 1302; GFX7LESS-NEXT: s_mul_i32 s6, s2, s6 1303; GFX7LESS-NEXT: v_add_i32_e32 v2, vcc, s7, v1 1304; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6 1305; GFX7LESS-NEXT: s_mov_b32 m0, -1 1306; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1307; GFX7LESS-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] 1308; 
GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1309; GFX7LESS-NEXT: buffer_wbinvl1 1310; GFX7LESS-NEXT: BB6_2: 1311; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] 1312; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 1313; GFX7LESS-NEXT: s_mov_b32 s6, -1 1314; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1315; GFX7LESS-NEXT: s_mov_b32 s4, s0 1316; GFX7LESS-NEXT: s_mov_b32 s5, s1 1317; GFX7LESS-NEXT: v_readfirstlane_b32 s0, v1 1318; GFX7LESS-NEXT: v_readfirstlane_b32 s1, v2 1319; GFX7LESS-NEXT: v_mul_lo_u32 v1, s3, v0 1320; GFX7LESS-NEXT: v_mul_hi_u32 v2, s2, v0 1321; GFX7LESS-NEXT: v_mul_lo_u32 v0, s2, v0 1322; GFX7LESS-NEXT: v_add_i32_e32 v1, vcc, v2, v1 1323; GFX7LESS-NEXT: v_mov_b32_e32 v2, s1 1324; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s0, v0 1325; GFX7LESS-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc 1326; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1327; GFX7LESS-NEXT: s_endpgm 1328; 1329; GFX8-LABEL: add_i64_uniform: 1330; GFX8: ; %bb.0: ; %entry 1331; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1332; GFX8-NEXT: v_cmp_ne_u32_e64 s[6:7], 1, 0 1333; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 1334; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 1335; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1336; GFX8-NEXT: ; implicit-def: $vgpr1_vgpr2 1337; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 1338; GFX8-NEXT: ; mask branch BB6_2 1339; GFX8-NEXT: s_cbranch_execz BB6_2 1340; GFX8-NEXT: BB6_1: 1341; GFX8-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 1342; GFX8-NEXT: v_mov_b32_e32 v1, s6 1343; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1344; GFX8-NEXT: v_mul_hi_u32 v1, s2, v1 1345; GFX8-NEXT: s_mul_i32 s7, s3, s6 1346; GFX8-NEXT: s_mul_i32 s6, s2, s6 1347; GFX8-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 1348; GFX8-NEXT: v_add_u32_e32 v2, vcc, s7, v1 1349; GFX8-NEXT: v_mov_b32_e32 v1, s6 1350; GFX8-NEXT: s_mov_b32 m0, -1 1351; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1352; GFX8-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] 1353; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1354; GFX8-NEXT: buffer_wbinvl1_vol 1355; GFX8-NEXT: 
BB6_2: 1356; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 1357; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1358; GFX8-NEXT: s_mov_b32 s4, s0 1359; GFX8-NEXT: v_readfirstlane_b32 s0, v1 1360; GFX8-NEXT: v_mul_lo_u32 v1, s3, v0 1361; GFX8-NEXT: v_mul_hi_u32 v3, s2, v0 1362; GFX8-NEXT: v_mul_lo_u32 v0, s2, v0 1363; GFX8-NEXT: s_mov_b32 s5, s1 1364; GFX8-NEXT: v_readfirstlane_b32 s1, v2 1365; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 1366; GFX8-NEXT: v_mov_b32_e32 v2, s1 1367; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 1368; GFX8-NEXT: s_mov_b32 s7, 0xf000 1369; GFX8-NEXT: s_mov_b32 s6, -1 1370; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc 1371; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1372; GFX8-NEXT: s_endpgm 1373; 1374; GFX9-LABEL: add_i64_uniform: 1375; GFX9: ; %bb.0: ; %entry 1376; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1377; GFX9-NEXT: v_cmp_ne_u32_e64 s[6:7], 1, 0 1378; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 1379; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 1380; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1381; GFX9-NEXT: ; implicit-def: $vgpr1_vgpr2 1382; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 1383; GFX9-NEXT: ; mask branch BB6_2 1384; GFX9-NEXT: s_cbranch_execz BB6_2 1385; GFX9-NEXT: BB6_1: 1386; GFX9-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 1387; GFX9-NEXT: v_mov_b32_e32 v1, s6 1388; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1389; GFX9-NEXT: v_mul_hi_u32 v2, s2, v1 1390; GFX9-NEXT: s_mul_i32 s7, s3, s6 1391; GFX9-NEXT: s_mul_i32 s6, s2, s6 1392; GFX9-NEXT: v_mov_b32_e32 v1, s6 1393; GFX9-NEXT: v_add_u32_e32 v2, s7, v2 1394; GFX9-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 1395; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1396; GFX9-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] 1397; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1398; GFX9-NEXT: buffer_wbinvl1_vol 1399; GFX9-NEXT: BB6_2: 1400; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 1401; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1402; GFX9-NEXT: v_mul_lo_u32 v3, s3, v0 1403; GFX9-NEXT: v_mul_hi_u32 v4, s2, v0 1404; GFX9-NEXT: v_mul_lo_u32 v0, s2, v0 
1405; GFX9-NEXT: s_mov_b32 s4, s0 1406; GFX9-NEXT: v_readfirstlane_b32 s0, v1 1407; GFX9-NEXT: s_mov_b32 s5, s1 1408; GFX9-NEXT: v_readfirstlane_b32 s1, v2 1409; GFX9-NEXT: v_add_u32_e32 v1, v4, v3 1410; GFX9-NEXT: v_mov_b32_e32 v2, s1 1411; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 1412; GFX9-NEXT: s_mov_b32 s7, 0xf000 1413; GFX9-NEXT: s_mov_b32 s6, -1 1414; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc 1415; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1416; GFX9-NEXT: s_endpgm 1417; 1418; GFX1064-LABEL: add_i64_uniform: 1419; GFX1064: ; %bb.0: ; %entry 1420; GFX1064-NEXT: v_cmp_ne_u32_e64 s[6:7], 1, 0 1421; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1422; GFX1064-NEXT: ; implicit-def: $vgpr1_vgpr2 1423; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 1424; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s7, v0 1425; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1426; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 1427; GFX1064-NEXT: ; mask branch BB6_2 1428; GFX1064-NEXT: s_cbranch_execz BB6_2 1429; GFX1064-NEXT: BB6_1: 1430; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 1431; GFX1064-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 1432; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1433; GFX1064-NEXT: v_mul_hi_u32 v2, s2, s6 1434; GFX1064-NEXT: s_mul_i32 s7, s2, s6 1435; GFX1064-NEXT: s_mul_i32 s6, s3, s6 1436; GFX1064-NEXT: v_mov_b32_e32 v1, s7 1437; GFX1064-NEXT: v_add_nc_u32_e32 v2, s6, v2 1438; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1439; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 1440; GFX1064-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] 1441; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1442; GFX1064-NEXT: buffer_gl0_inv 1443; GFX1064-NEXT: buffer_gl1_inv 1444; GFX1064-NEXT: BB6_2: 1445; GFX1064-NEXT: v_nop 1446; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 1447; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1448; GFX1064-NEXT: v_mul_lo_u32 v3, s3, v0 1449; GFX1064-NEXT: v_mul_hi_u32 v4, s2, v0 1450; GFX1064-NEXT: v_mul_lo_u32 v0, s2, v0 1451; GFX1064-NEXT: v_readfirstlane_b32 s4, 
v1 1452; GFX1064-NEXT: v_readfirstlane_b32 s5, v2 1453; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 1454; GFX1064-NEXT: s_mov_b32 s2, -1 1455; GFX1064-NEXT: v_add_nc_u32_e32 v1, v4, v3 1456; GFX1064-NEXT: v_add_co_u32_e64 v0, vcc, s4, v0 1457; GFX1064-NEXT: v_add_co_ci_u32_e32 v1, vcc, s5, v1, vcc 1458; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1459; GFX1064-NEXT: s_endpgm 1460; 1461; GFX1032-LABEL: add_i64_uniform: 1462; GFX1032: ; %bb.0: ; %entry 1463; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1464; GFX1032-NEXT: v_cmp_ne_u32_e64 s5, 1, 0 1465; GFX1032-NEXT: ; implicit-def: $vcc_hi 1466; GFX1032-NEXT: ; implicit-def: $vgpr1_vgpr2 1467; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s5, 0 1468; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 1469; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo 1470; GFX1032-NEXT: ; mask branch BB6_2 1471; GFX1032-NEXT: s_cbranch_execz BB6_2 1472; GFX1032-NEXT: BB6_1: 1473; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s5 1474; GFX1032-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 1475; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1476; GFX1032-NEXT: v_mul_hi_u32 v2, s2, s5 1477; GFX1032-NEXT: s_mul_i32 s6, s2, s5 1478; GFX1032-NEXT: s_mul_i32 s5, s3, s5 1479; GFX1032-NEXT: v_mov_b32_e32 v1, s6 1480; GFX1032-NEXT: v_add_nc_u32_e32 v2, s5, v2 1481; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1482; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 1483; GFX1032-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] 1484; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1485; GFX1032-NEXT: buffer_gl0_inv 1486; GFX1032-NEXT: buffer_gl1_inv 1487; GFX1032-NEXT: BB6_2: 1488; GFX1032-NEXT: v_nop 1489; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 1490; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1491; GFX1032-NEXT: v_mul_lo_u32 v3, s3, v0 1492; GFX1032-NEXT: v_mul_hi_u32 v4, s2, v0 1493; GFX1032-NEXT: v_mul_lo_u32 v0, s2, v0 1494; GFX1032-NEXT: v_readfirstlane_b32 s4, v1 1495; GFX1032-NEXT: v_readfirstlane_b32 s5, v2 1496; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 1497; GFX1032-NEXT: s_mov_b32 s2, 
-1 1498; GFX1032-NEXT: v_add_nc_u32_e32 v1, v4, v3 1499; GFX1032-NEXT: v_add_co_u32_e64 v0, vcc_lo, s4, v0 1500; GFX1032-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s5, v1, vcc_lo 1501; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1502; GFX1032-NEXT: s_endpgm 1503entry: 1504 %old = atomicrmw add i64 addrspace(3)* @local_var64, i64 %additive acq_rel 1505 store i64 %old, i64 addrspace(1)* %out 1506 ret void 1507} 1508 1509; GCN-NOT: v_mbcnt_lo_u32_b32 1510; GCN-NOT: v_mbcnt_hi_u32_b32 1511; GCN-NOT: s_bcnt1_i32_b64 1512define amdgpu_kernel void @add_i64_varying(i64 addrspace(1)* %out) { 1513; 1514; 1515; GFX7LESS-LABEL: add_i64_varying: 1516; GFX7LESS: ; %bb.0: ; %entry 1517; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1518; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 1519; GFX7LESS-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 1520; GFX7LESS-NEXT: s_mov_b32 m0, -1 1521; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1522; GFX7LESS-NEXT: ds_add_rtn_u64 v[0:1], v2, v[0:1] 1523; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1524; GFX7LESS-NEXT: buffer_wbinvl1 1525; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 1526; GFX7LESS-NEXT: s_mov_b32 s2, -1 1527; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1528; GFX7LESS-NEXT: s_endpgm 1529; 1530; GFX8-LABEL: add_i64_varying: 1531; GFX8: ; %bb.0: ; %entry 1532; GFX8-NEXT: v_mov_b32_e32 v1, 0 1533; GFX8-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 1534; GFX8-NEXT: s_mov_b32 m0, -1 1535; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1536; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1537; GFX8-NEXT: ds_add_rtn_u64 v[0:1], v2, v[0:1] 1538; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1539; GFX8-NEXT: buffer_wbinvl1_vol 1540; GFX8-NEXT: s_mov_b32 s3, 0xf000 1541; GFX8-NEXT: s_mov_b32 s2, -1 1542; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1543; GFX8-NEXT: s_endpgm 1544; 1545; GFX9-LABEL: add_i64_varying: 1546; GFX9: ; %bb.0: ; %entry 1547; GFX9-NEXT: v_mov_b32_e32 v1, 0 1548; GFX9-NEXT: v_mov_b32_e32 v2, 
local_var64@abs32@lo 1549; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1550; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1551; GFX9-NEXT: ds_add_rtn_u64 v[0:1], v2, v[0:1] 1552; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1553; GFX9-NEXT: buffer_wbinvl1_vol 1554; GFX9-NEXT: s_mov_b32 s3, 0xf000 1555; GFX9-NEXT: s_mov_b32 s2, -1 1556; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1557; GFX9-NEXT: s_endpgm 1558; 1559; GFX1064-LABEL: add_i64_varying: 1560; GFX1064: ; %bb.0: ; %entry 1561; GFX1064-NEXT: v_mov_b32_e32 v1, 0 1562; GFX1064-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 1563; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1564; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 1565; GFX1064-NEXT: s_mov_b32 s2, -1 1566; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1567; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 1568; GFX1064-NEXT: ds_add_rtn_u64 v[0:1], v2, v[0:1] 1569; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1570; GFX1064-NEXT: buffer_gl0_inv 1571; GFX1064-NEXT: buffer_gl1_inv 1572; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1573; GFX1064-NEXT: s_endpgm 1574; 1575; GFX1032-LABEL: add_i64_varying: 1576; GFX1032: ; %bb.0: ; %entry 1577; GFX1032-NEXT: v_mov_b32_e32 v1, 0 1578; GFX1032-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 1579; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1580; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 1581; GFX1032-NEXT: s_mov_b32 s2, -1 1582; GFX1032-NEXT: ; implicit-def: $vcc_hi 1583; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1584; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 1585; GFX1032-NEXT: ds_add_rtn_u64 v[0:1], v2, v[0:1] 1586; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1587; GFX1032-NEXT: buffer_gl0_inv 1588; GFX1032-NEXT: buffer_gl1_inv 1589; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1590; GFX1032-NEXT: s_endpgm 1591entry: 1592 %lane = call i32 @llvm.amdgcn.workitem.id.x() 1593 %zext = zext i32 %lane to i64 1594 %old = atomicrmw add i64 addrspace(3)* @local_var64, i64 %zext acq_rel 1595 store i64 
%old, i64 addrspace(1)* %out 1596 ret void 1597} 1598 1599define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out) { 1600; 1601; 1602; GFX7LESS-LABEL: sub_i32_constant: 1603; GFX7LESS: ; %bb.0: ; %entry 1604; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1605; GFX7LESS-NEXT: v_cmp_ne_u32_e64 s[4:5], 1, 0 1606; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 1607; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 1608; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1609; GFX7LESS-NEXT: ; implicit-def: $vgpr1 1610; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc 1611; GFX7LESS-NEXT: ; mask branch BB8_2 1612; GFX7LESS-NEXT: s_cbranch_execz BB8_2 1613; GFX7LESS-NEXT: BB8_1: 1614; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 1615; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 1616; GFX7LESS-NEXT: v_mul_u32_u24_e64 v2, s4, 5 1617; GFX7LESS-NEXT: s_mov_b32 m0, -1 1618; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1619; GFX7LESS-NEXT: ds_sub_rtn_u32 v1, v1, v2 1620; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1621; GFX7LESS-NEXT: buffer_wbinvl1 1622; GFX7LESS-NEXT: BB8_2: 1623; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] 1624; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v1 1625; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1626; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 1627; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 1628; GFX7LESS-NEXT: s_mov_b32 s2, -1 1629; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1630; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 1631; GFX7LESS-NEXT: s_endpgm 1632; 1633; GFX8-LABEL: sub_i32_constant: 1634; GFX8: ; %bb.0: ; %entry 1635; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1636; GFX8-NEXT: v_cmp_ne_u32_e64 s[4:5], 1, 0 1637; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 1638; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 1639; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1640; GFX8-NEXT: ; implicit-def: $vgpr1 1641; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 1642; GFX8-NEXT: ; mask branch BB8_2 1643; GFX8-NEXT: s_cbranch_execz BB8_2 1644; GFX8-NEXT: 
BB8_1: 1645; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 1646; GFX8-NEXT: v_mul_u32_u24_e64 v1, s4, 5 1647; GFX8-NEXT: v_mov_b32_e32 v2, local_var32@abs32@lo 1648; GFX8-NEXT: s_mov_b32 m0, -1 1649; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1650; GFX8-NEXT: ds_sub_rtn_u32 v1, v2, v1 1651; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1652; GFX8-NEXT: buffer_wbinvl1_vol 1653; GFX8-NEXT: BB8_2: 1654; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 1655; GFX8-NEXT: v_readfirstlane_b32 s2, v1 1656; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1657; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s2, v0 1658; GFX8-NEXT: s_mov_b32 s3, 0xf000 1659; GFX8-NEXT: s_mov_b32 s2, -1 1660; GFX8-NEXT: s_nop 0 1661; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1662; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 1663; GFX8-NEXT: s_endpgm 1664; 1665; GFX9-LABEL: sub_i32_constant: 1666; GFX9: ; %bb.0: ; %entry 1667; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1668; GFX9-NEXT: v_cmp_ne_u32_e64 s[4:5], 1, 0 1669; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 1670; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 1671; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1672; GFX9-NEXT: ; implicit-def: $vgpr1 1673; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 1674; GFX9-NEXT: ; mask branch BB8_2 1675; GFX9-NEXT: s_cbranch_execz BB8_2 1676; GFX9-NEXT: BB8_1: 1677; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 1678; GFX9-NEXT: v_mul_u32_u24_e64 v1, s4, 5 1679; GFX9-NEXT: v_mov_b32_e32 v2, local_var32@abs32@lo 1680; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1681; GFX9-NEXT: ds_sub_rtn_u32 v1, v2, v1 1682; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1683; GFX9-NEXT: buffer_wbinvl1_vol 1684; GFX9-NEXT: BB8_2: 1685; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 1686; GFX9-NEXT: v_readfirstlane_b32 s2, v1 1687; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1688; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 1689; GFX9-NEXT: s_mov_b32 s3, 0xf000 1690; GFX9-NEXT: s_mov_b32 s2, -1 1691; GFX9-NEXT: s_nop 0 1692; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1693; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 1694; 
GFX9-NEXT: s_endpgm 1695; 1696; GFX1064-LABEL: sub_i32_constant: 1697; GFX1064: ; %bb.0: ; %entry 1698; GFX1064-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 1699; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1700; GFX1064-NEXT: ; implicit-def: $vgpr1 1701; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 1702; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s3, v0 1703; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1704; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 1705; GFX1064-NEXT: ; mask branch BB8_2 1706; GFX1064-NEXT: s_cbranch_execz BB8_2 1707; GFX1064-NEXT: BB8_1: 1708; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1709; GFX1064-NEXT: v_mov_b32_e32 v2, local_var32@abs32@lo 1710; GFX1064-NEXT: v_mul_u32_u24_e64 v1, s2, 5 1711; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1712; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 1713; GFX1064-NEXT: ds_sub_rtn_u32 v1, v2, v1 1714; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1715; GFX1064-NEXT: buffer_gl0_inv 1716; GFX1064-NEXT: buffer_gl1_inv 1717; GFX1064-NEXT: BB8_2: 1718; GFX1064-NEXT: v_nop 1719; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 1720; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 1721; GFX1064-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1722; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 1723; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s2, v0 1724; GFX1064-NEXT: s_mov_b32 s2, -1 1725; GFX1064-NEXT: s_nop 0 1726; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1727; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 1728; GFX1064-NEXT: s_endpgm 1729; 1730; GFX1032-LABEL: sub_i32_constant: 1731; GFX1032: ; %bb.0: ; %entry 1732; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1733; GFX1032-NEXT: v_cmp_ne_u32_e64 s3, 1, 0 1734; GFX1032-NEXT: ; implicit-def: $vcc_hi 1735; GFX1032-NEXT: ; implicit-def: $vgpr1 1736; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s3, 0 1737; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 1738; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo 1739; GFX1032-NEXT: ; mask branch BB8_2 1740; GFX1032-NEXT: s_cbranch_execz BB8_2 1741; GFX1032-NEXT: BB8_1: 1742; 
GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 1743; GFX1032-NEXT: v_mov_b32_e32 v2, local_var32@abs32@lo 1744; GFX1032-NEXT: v_mul_u32_u24_e64 v1, s3, 5 1745; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1746; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 1747; GFX1032-NEXT: ds_sub_rtn_u32 v1, v2, v1 1748; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1749; GFX1032-NEXT: buffer_gl0_inv 1750; GFX1032-NEXT: buffer_gl1_inv 1751; GFX1032-NEXT: BB8_2: 1752; GFX1032-NEXT: v_nop 1753; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 1754; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 1755; GFX1032-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1756; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 1757; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s2, v0 1758; GFX1032-NEXT: s_mov_b32 s2, -1 1759; GFX1032-NEXT: s_nop 0 1760; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1761; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 1762; GFX1032-NEXT: s_endpgm 1763entry: 1764 %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 5 acq_rel 1765 store i32 %old, i32 addrspace(1)* %out 1766 ret void 1767} 1768 1769define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 %subitive) { 1770; 1771; 1772; GFX7LESS-LABEL: sub_i32_uniform: 1773; GFX7LESS: ; %bb.0: ; %entry 1774; GFX7LESS-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 1775; GFX7LESS-NEXT: s_load_dword s2, s[0:1], 0xb 1776; GFX7LESS-NEXT: v_cmp_ne_u32_e64 s[6:7], 1, 0 1777; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 1778; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s7, v0 1779; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1780; GFX7LESS-NEXT: ; implicit-def: $vgpr1 1781; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc 1782; GFX7LESS-NEXT: ; mask branch BB9_2 1783; GFX7LESS-NEXT: s_cbranch_execz BB9_2 1784; GFX7LESS-NEXT: BB9_1: 1785; GFX7LESS-NEXT: s_bcnt1_i32_b64 s3, s[6:7] 1786; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1787; GFX7LESS-NEXT: s_mul_i32 s3, s2, s3 1788; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 1789; GFX7LESS-NEXT: v_mov_b32_e32 v2, s3 1790; GFX7LESS-NEXT: 
s_mov_b32 m0, -1 1791; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1792; GFX7LESS-NEXT: ds_sub_rtn_u32 v1, v1, v2 1793; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1794; GFX7LESS-NEXT: buffer_wbinvl1 1795; GFX7LESS-NEXT: BB9_2: 1796; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1] 1797; GFX7LESS-NEXT: v_readfirstlane_b32 s0, v1 1798; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1799; GFX7LESS-NEXT: v_mul_lo_u32 v0, s2, v0 1800; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 1801; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 1802; GFX7LESS-NEXT: s_mov_b32 s6, -1 1803; GFX7LESS-NEXT: buffer_store_dword v0, off, s[4:7], 0 1804; GFX7LESS-NEXT: s_endpgm 1805; 1806; GFX8-LABEL: sub_i32_uniform: 1807; GFX8: ; %bb.0: ; %entry 1808; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 1809; GFX8-NEXT: s_load_dword s0, s[0:1], 0x2c 1810; GFX8-NEXT: v_cmp_ne_u32_e64 s[6:7], 1, 0 1811; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 1812; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 1813; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1814; GFX8-NEXT: ; implicit-def: $vgpr1 1815; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 1816; GFX8-NEXT: ; mask branch BB9_2 1817; GFX8-NEXT: s_cbranch_execz BB9_2 1818; GFX8-NEXT: BB9_1: 1819; GFX8-NEXT: s_bcnt1_i32_b64 s1, s[6:7] 1820; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1821; GFX8-NEXT: s_mul_i32 s1, s0, s1 1822; GFX8-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 1823; GFX8-NEXT: v_mov_b32_e32 v2, s1 1824; GFX8-NEXT: s_mov_b32 m0, -1 1825; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1826; GFX8-NEXT: ds_sub_rtn_u32 v1, v1, v2 1827; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1828; GFX8-NEXT: buffer_wbinvl1_vol 1829; GFX8-NEXT: BB9_2: 1830; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 1831; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1832; GFX8-NEXT: v_mul_lo_u32 v0, s0, v0 1833; GFX8-NEXT: v_readfirstlane_b32 s0, v1 1834; GFX8-NEXT: s_mov_b32 s7, 0xf000 1835; GFX8-NEXT: s_mov_b32 s6, -1 1836; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s0, v0 1837; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0 1838; GFX8-NEXT: s_endpgm 1839; 
1840; GFX9-LABEL: sub_i32_uniform: 1841; GFX9: ; %bb.0: ; %entry 1842; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 1843; GFX9-NEXT: s_load_dword s0, s[0:1], 0x2c 1844; GFX9-NEXT: v_cmp_ne_u32_e64 s[6:7], 1, 0 1845; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 1846; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 1847; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1848; GFX9-NEXT: ; implicit-def: $vgpr1 1849; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 1850; GFX9-NEXT: ; mask branch BB9_2 1851; GFX9-NEXT: s_cbranch_execz BB9_2 1852; GFX9-NEXT: BB9_1: 1853; GFX9-NEXT: s_bcnt1_i32_b64 s1, s[6:7] 1854; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1855; GFX9-NEXT: s_mul_i32 s1, s0, s1 1856; GFX9-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 1857; GFX9-NEXT: v_mov_b32_e32 v2, s1 1858; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1859; GFX9-NEXT: ds_sub_rtn_u32 v1, v1, v2 1860; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1861; GFX9-NEXT: buffer_wbinvl1_vol 1862; GFX9-NEXT: BB9_2: 1863; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 1864; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1865; GFX9-NEXT: v_mul_lo_u32 v0, s0, v0 1866; GFX9-NEXT: v_readfirstlane_b32 s0, v1 1867; GFX9-NEXT: s_mov_b32 s7, 0xf000 1868; GFX9-NEXT: s_mov_b32 s6, -1 1869; GFX9-NEXT: v_sub_u32_e32 v0, s0, v0 1870; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 1871; GFX9-NEXT: s_endpgm 1872; 1873; GFX1064-LABEL: sub_i32_uniform: 1874; GFX1064: ; %bb.0: ; %entry 1875; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 1876; GFX1064-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 1877; GFX1064-NEXT: s_load_dword s0, s[0:1], 0x2c 1878; GFX1064-NEXT: ; implicit-def: $vgpr1 1879; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 1880; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s3, v0 1881; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1882; GFX1064-NEXT: s_and_saveexec_b64 s[6:7], vcc 1883; GFX1064-NEXT: ; mask branch BB9_2 1884; GFX1064-NEXT: s_cbranch_execz BB9_2 1885; GFX1064-NEXT: BB9_1: 1886; GFX1064-NEXT: s_bcnt1_i32_b64 s1, s[2:3] 1887; GFX1064-NEXT: v_mov_b32_e32 v1, 
local_var32@abs32@lo 1888; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1889; GFX1064-NEXT: s_mul_i32 s1, s0, s1 1890; GFX1064-NEXT: v_mov_b32_e32 v2, s1 1891; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1892; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 1893; GFX1064-NEXT: ds_sub_rtn_u32 v1, v1, v2 1894; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1895; GFX1064-NEXT: buffer_gl0_inv 1896; GFX1064-NEXT: buffer_gl1_inv 1897; GFX1064-NEXT: BB9_2: 1898; GFX1064-NEXT: v_nop 1899; GFX1064-NEXT: s_or_b64 exec, exec, s[6:7] 1900; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1901; GFX1064-NEXT: v_mul_lo_u32 v0, s0, v0 1902; GFX1064-NEXT: v_readfirstlane_b32 s0, v1 1903; GFX1064-NEXT: s_mov_b32 s7, 0x31016000 1904; GFX1064-NEXT: s_mov_b32 s6, -1 1905; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s0, v0 1906; GFX1064-NEXT: buffer_store_dword v0, off, s[4:7], 0 1907; GFX1064-NEXT: s_endpgm 1908; 1909; GFX1032-LABEL: sub_i32_uniform: 1910; GFX1032: ; %bb.0: ; %entry 1911; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 1912; GFX1032-NEXT: s_load_dword s0, s[0:1], 0x2c 1913; GFX1032-NEXT: v_cmp_ne_u32_e64 s2, 1, 0 1914; GFX1032-NEXT: ; implicit-def: $vcc_hi 1915; GFX1032-NEXT: ; implicit-def: $vgpr1 1916; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 1917; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 1918; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo 1919; GFX1032-NEXT: ; mask branch BB9_2 1920; GFX1032-NEXT: s_cbranch_execz BB9_2 1921; GFX1032-NEXT: BB9_1: 1922; GFX1032-NEXT: s_bcnt1_i32_b32 s2, s2 1923; GFX1032-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 1924; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1925; GFX1032-NEXT: s_mul_i32 s2, s0, s2 1926; GFX1032-NEXT: v_mov_b32_e32 v2, s2 1927; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1928; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 1929; GFX1032-NEXT: ds_sub_rtn_u32 v1, v1, v2 1930; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1931; GFX1032-NEXT: buffer_gl0_inv 1932; GFX1032-NEXT: buffer_gl1_inv 1933; GFX1032-NEXT: BB9_2: 1934; GFX1032-NEXT: v_nop 1935; GFX1032-NEXT: 
s_or_b32 exec_lo, exec_lo, s1 1936; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1937; GFX1032-NEXT: v_mul_lo_u32 v0, s0, v0 1938; GFX1032-NEXT: v_readfirstlane_b32 s0, v1 1939; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 1940; GFX1032-NEXT: s_mov_b32 s6, -1 1941; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s0, v0 1942; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0 1943; GFX1032-NEXT: s_endpgm 1944entry: 1945 %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 %subitive acq_rel 1946 store i32 %old, i32 addrspace(1)* %out 1947 ret void 1948} 1949 1950; GFX7LESS-NOT: v_mbcnt_lo_u32_b32 1951; GFX7LESS-NOT: v_mbcnt_hi_u32_b32 1952; GFX7LESS-NOT: s_bcnt1_i32_b64 1953; DPPCOMB: v_add_u32_dpp 1954; DPPCOMB: v_add_u32_dpp 1955; GFX8MORE32: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 31 1956; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]] 1957; GFX8MORE: ds_sub_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]] 1958define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) { 1959; 1960; 1961; GFX7LESS-LABEL: sub_i32_varying: 1962; GFX7LESS: ; %bb.0: ; %entry 1963; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1964; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 1965; GFX7LESS-NEXT: s_mov_b32 m0, -1 1966; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1967; GFX7LESS-NEXT: ds_sub_rtn_u32 v0, v1, v0 1968; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1969; GFX7LESS-NEXT: buffer_wbinvl1 1970; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 1971; GFX7LESS-NEXT: s_mov_b32 s2, -1 1972; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 1973; GFX7LESS-NEXT: s_endpgm 1974; 1975; GFX8-LABEL: sub_i32_varying: 1976; GFX8: ; %bb.0: ; %entry 1977; GFX8-NEXT: v_mov_b32_e32 v2, v0 1978; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 1979; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1980; GFX8-NEXT: v_mov_b32_e32 v1, 0 1981; GFX8-NEXT: s_mov_b64 exec, s[2:3] 1982; GFX8-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 1983; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1984; GFX8-NEXT: 
v_mbcnt_hi_u32_b32 v0, s3, v0 1985; GFX8-NEXT: s_not_b64 exec, exec 1986; GFX8-NEXT: v_mov_b32_e32 v2, 0 1987; GFX8-NEXT: s_not_b64 exec, exec 1988; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 1989; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 1990; GFX8-NEXT: s_nop 1 1991; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 1992; GFX8-NEXT: s_nop 1 1993; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 1994; GFX8-NEXT: s_nop 1 1995; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 1996; GFX8-NEXT: s_nop 1 1997; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 1998; GFX8-NEXT: s_nop 1 1999; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 2000; GFX8-NEXT: v_readlane_b32 s2, v2, 63 2001; GFX8-NEXT: s_nop 0 2002; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 2003; GFX8-NEXT: s_mov_b64 exec, s[4:5] 2004; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2005; GFX8-NEXT: ; implicit-def: $vgpr0 2006; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 2007; GFX8-NEXT: ; mask branch BB10_2 2008; GFX8-NEXT: s_cbranch_execz BB10_2 2009; GFX8-NEXT: BB10_1: 2010; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 2011; GFX8-NEXT: v_mov_b32_e32 v3, s2 2012; GFX8-NEXT: s_mov_b32 m0, -1 2013; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2014; GFX8-NEXT: ds_sub_rtn_u32 v0, v0, v3 2015; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2016; GFX8-NEXT: buffer_wbinvl1_vol 2017; GFX8-NEXT: BB10_2: 2018; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 2019; GFX8-NEXT: v_readfirstlane_b32 s2, v0 2020; GFX8-NEXT: v_mov_b32_e32 v0, v1 2021; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s2, v0 2022; GFX8-NEXT: s_mov_b32 s3, 0xf000 2023; GFX8-NEXT: s_mov_b32 s2, -1 2024; GFX8-NEXT: s_nop 0 2025; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2026; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 2027; 
GFX8-NEXT: s_endpgm 2028; 2029; GFX9-LABEL: sub_i32_varying: 2030; GFX9: ; %bb.0: ; %entry 2031; GFX9-NEXT: v_mov_b32_e32 v2, v0 2032; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 2033; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2034; GFX9-NEXT: v_mov_b32_e32 v1, 0 2035; GFX9-NEXT: s_mov_b64 exec, s[2:3] 2036; GFX9-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 2037; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 2038; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 2039; GFX9-NEXT: s_not_b64 exec, exec 2040; GFX9-NEXT: v_mov_b32_e32 v2, 0 2041; GFX9-NEXT: s_not_b64 exec, exec 2042; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 2043; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 2044; GFX9-NEXT: s_nop 1 2045; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 2046; GFX9-NEXT: s_nop 1 2047; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 2048; GFX9-NEXT: s_nop 1 2049; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 2050; GFX9-NEXT: s_nop 1 2051; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 2052; GFX9-NEXT: s_nop 1 2053; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 2054; GFX9-NEXT: v_readlane_b32 s2, v2, 63 2055; GFX9-NEXT: s_nop 0 2056; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 2057; GFX9-NEXT: s_mov_b64 exec, s[4:5] 2058; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2059; GFX9-NEXT: ; implicit-def: $vgpr0 2060; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 2061; GFX9-NEXT: ; mask branch BB10_2 2062; GFX9-NEXT: s_cbranch_execz BB10_2 2063; GFX9-NEXT: BB10_1: 2064; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 2065; GFX9-NEXT: v_mov_b32_e32 v3, s2 2066; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2067; GFX9-NEXT: ds_sub_rtn_u32 v0, v0, v3 2068; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2069; GFX9-NEXT: buffer_wbinvl1_vol 2070; GFX9-NEXT: BB10_2: 2071; GFX9-NEXT: s_or_b64 
exec, exec, s[4:5] 2072; GFX9-NEXT: v_readfirstlane_b32 s2, v0 2073; GFX9-NEXT: v_mov_b32_e32 v0, v1 2074; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 2075; GFX9-NEXT: s_mov_b32 s3, 0xf000 2076; GFX9-NEXT: s_mov_b32 s2, -1 2077; GFX9-NEXT: s_nop 0 2078; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2079; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 2080; GFX9-NEXT: s_endpgm 2081; 2082; GFX1064-LABEL: sub_i32_varying: 2083; GFX1064: ; %bb.0: ; %entry 2084; GFX1064-NEXT: v_mov_b32_e32 v2, v0 2085; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 2086; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2087; GFX1064-NEXT: v_mov_b32_e32 v1, 0 2088; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 2089; GFX1064-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 2090; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 2091; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s3, v0 2092; GFX1064-NEXT: s_not_b64 exec, exec 2093; GFX1064-NEXT: v_mov_b32_e32 v2, 0 2094; GFX1064-NEXT: s_not_b64 exec, exec 2095; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 2096; GFX1064-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 2097; GFX1064-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 2098; GFX1064-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 2099; GFX1064-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 2100; GFX1064-NEXT: v_mov_b32_e32 v3, v2 2101; GFX1064-NEXT: v_permlanex16_b32 v3, v3, -1, -1 2102; GFX1064-NEXT: v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 2103; GFX1064-NEXT: v_readlane_b32 s2, v2, 31 2104; GFX1064-NEXT: v_mov_b32_e32 v3, s2 2105; GFX1064-NEXT: v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 2106; GFX1064-NEXT: v_readlane_b32 s2, v2, 15 2107; GFX1064-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf 2108; GFX1064-NEXT: v_readlane_b32 s3, v2, 31 2109; GFX1064-NEXT: v_readlane_b32 s6, v2, 47 2110; GFX1064-NEXT: 
v_writelane_b32 v1, s2, 16 2111; GFX1064-NEXT: s_mov_b32 s2, -1 2112; GFX1064-NEXT: v_writelane_b32 v1, s3, 32 2113; GFX1064-NEXT: v_readlane_b32 s3, v2, 63 2114; GFX1064-NEXT: v_writelane_b32 v1, s6, 48 2115; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 2116; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2117; GFX1064-NEXT: ; implicit-def: $vgpr0 2118; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 2119; GFX1064-NEXT: ; mask branch BB10_2 2120; GFX1064-NEXT: s_cbranch_execz BB10_2 2121; GFX1064-NEXT: BB10_1: 2122; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 2123; GFX1064-NEXT: v_mov_b32_e32 v7, s3 2124; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2125; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 2126; GFX1064-NEXT: ds_sub_rtn_u32 v0, v0, v7 2127; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2128; GFX1064-NEXT: buffer_gl0_inv 2129; GFX1064-NEXT: buffer_gl1_inv 2130; GFX1064-NEXT: BB10_2: 2131; GFX1064-NEXT: v_nop 2132; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 2133; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 2134; GFX1064-NEXT: v_mov_b32_e32 v0, v1 2135; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s3, v0 2136; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 2137; GFX1064-NEXT: s_nop 1 2138; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2139; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 2140; GFX1064-NEXT: s_endpgm 2141; 2142; GFX1032-LABEL: sub_i32_varying: 2143; GFX1032: ; %bb.0: ; %entry 2144; GFX1032-NEXT: ; implicit-def: $vcc_hi 2145; GFX1032-NEXT: v_mov_b32_e32 v2, v0 2146; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 2147; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2148; GFX1032-NEXT: v_mov_b32_e32 v1, 0 2149; GFX1032-NEXT: s_mov_b32 exec_lo, s2 2150; GFX1032-NEXT: v_cmp_ne_u32_e64 s2, 1, 0 2151; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 2152; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 2153; GFX1032-NEXT: v_mov_b32_e32 v2, 0 2154; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 2155; GFX1032-NEXT: s_or_saveexec_b32 s4, -1 2156; GFX1032-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf 
bank_mask:0xf bound_ctrl:0 2157; GFX1032-NEXT: s_mov_b32 s2, -1 2158; GFX1032-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 2159; GFX1032-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 2160; GFX1032-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 2161; GFX1032-NEXT: v_mov_b32_e32 v3, v2 2162; GFX1032-NEXT: v_permlanex16_b32 v3, v3, -1, -1 2163; GFX1032-NEXT: v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 2164; GFX1032-NEXT: v_readlane_b32 s3, v2, 31 2165; GFX1032-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf 2166; GFX1032-NEXT: v_readlane_b32 s5, v2, 15 2167; GFX1032-NEXT: v_writelane_b32 v1, s5, 16 2168; GFX1032-NEXT: s_mov_b32 exec_lo, s4 2169; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 2170; GFX1032-NEXT: ; implicit-def: $vgpr0 2171; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo 2172; GFX1032-NEXT: ; mask branch BB10_2 2173; GFX1032-NEXT: s_cbranch_execz BB10_2 2174; GFX1032-NEXT: BB10_1: 2175; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 2176; GFX1032-NEXT: v_mov_b32_e32 v7, s3 2177; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2178; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 2179; GFX1032-NEXT: ds_sub_rtn_u32 v0, v0, v7 2180; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2181; GFX1032-NEXT: buffer_gl0_inv 2182; GFX1032-NEXT: buffer_gl1_inv 2183; GFX1032-NEXT: BB10_2: 2184; GFX1032-NEXT: v_nop 2185; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 2186; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 2187; GFX1032-NEXT: v_mov_b32_e32 v0, v1 2188; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s3, v0 2189; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 2190; GFX1032-NEXT: s_nop 1 2191; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2192; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 2193; GFX1032-NEXT: s_endpgm 2194entry: 2195 %lane = call i32 @llvm.amdgcn.workitem.id.x() 2196 %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 %lane 
acq_rel 2197 store i32 %old, i32 addrspace(1)* %out 2198 ret void 2199} 2200 2201define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) { 2202; 2203; 2204; GFX7LESS-LABEL: sub_i64_constant: 2205; GFX7LESS: ; %bb.0: ; %entry 2206; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 2207; GFX7LESS-NEXT: v_cmp_ne_u32_e64 s[4:5], 1, 0 2208; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 2209; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 2210; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2211; GFX7LESS-NEXT: ; implicit-def: $vgpr1_vgpr2 2212; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc 2213; GFX7LESS-NEXT: ; mask branch BB11_2 2214; GFX7LESS-NEXT: s_cbranch_execz BB11_2 2215; GFX7LESS-NEXT: BB11_1: 2216; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 2217; GFX7LESS-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 2218; GFX7LESS-NEXT: v_mul_hi_u32_u24_e64 v2, s4, 5 2219; GFX7LESS-NEXT: v_mul_u32_u24_e64 v1, s4, 5 2220; GFX7LESS-NEXT: s_mov_b32 m0, -1 2221; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2222; GFX7LESS-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] 2223; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2224; GFX7LESS-NEXT: buffer_wbinvl1 2225; GFX7LESS-NEXT: BB11_2: 2226; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] 2227; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v1 2228; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v2 2229; GFX7LESS-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v0 2230; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v0 2231; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 2232; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4 2233; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 2234; GFX7LESS-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc 2235; GFX7LESS-NEXT: s_mov_b32 s2, -1 2236; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2237; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2238; GFX7LESS-NEXT: s_endpgm 2239; 2240; GFX8-LABEL: sub_i64_constant: 2241; GFX8: ; %bb.0: ; %entry 2242; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2243; GFX8-NEXT: v_cmp_ne_u32_e64 s[4:5], 1, 0 2244; GFX8-NEXT: 
v_mbcnt_lo_u32_b32 v0, s4, 0 2245; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 2246; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2247; GFX8-NEXT: ; implicit-def: $vgpr1_vgpr2 2248; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 2249; GFX8-NEXT: ; mask branch BB11_2 2250; GFX8-NEXT: s_cbranch_execz BB11_2 2251; GFX8-NEXT: BB11_1: 2252; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 2253; GFX8-NEXT: v_mul_hi_u32_u24_e64 v2, s4, 5 2254; GFX8-NEXT: v_mul_u32_u24_e64 v1, s4, 5 2255; GFX8-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 2256; GFX8-NEXT: s_mov_b32 m0, -1 2257; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2258; GFX8-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] 2259; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2260; GFX8-NEXT: buffer_wbinvl1_vol 2261; GFX8-NEXT: BB11_2: 2262; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 2263; GFX8-NEXT: v_readfirstlane_b32 s3, v2 2264; GFX8-NEXT: v_readfirstlane_b32 s2, v1 2265; GFX8-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v0 2266; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v0 2267; GFX8-NEXT: v_mov_b32_e32 v2, s3 2268; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s2, v0 2269; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc 2270; GFX8-NEXT: s_mov_b32 s3, 0xf000 2271; GFX8-NEXT: s_mov_b32 s2, -1 2272; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2273; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2274; GFX8-NEXT: s_endpgm 2275; 2276; GFX9-LABEL: sub_i64_constant: 2277; GFX9: ; %bb.0: ; %entry 2278; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2279; GFX9-NEXT: v_cmp_ne_u32_e64 s[4:5], 1, 0 2280; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 2281; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 2282; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2283; GFX9-NEXT: ; implicit-def: $vgpr1_vgpr2 2284; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 2285; GFX9-NEXT: ; mask branch BB11_2 2286; GFX9-NEXT: s_cbranch_execz BB11_2 2287; GFX9-NEXT: BB11_1: 2288; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 2289; GFX9-NEXT: v_mul_hi_u32_u24_e64 v2, s4, 5 2290; GFX9-NEXT: v_mul_u32_u24_e64 v1, s4, 5 2291; GFX9-NEXT: v_mov_b32_e32 v3, 
local_var64@abs32@lo 2292; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2293; GFX9-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] 2294; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2295; GFX9-NEXT: buffer_wbinvl1_vol 2296; GFX9-NEXT: BB11_2: 2297; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 2298; GFX9-NEXT: v_readfirstlane_b32 s3, v2 2299; GFX9-NEXT: v_readfirstlane_b32 s2, v1 2300; GFX9-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v0 2301; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v0 2302; GFX9-NEXT: v_mov_b32_e32 v2, s3 2303; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s2, v0 2304; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc 2305; GFX9-NEXT: s_mov_b32 s3, 0xf000 2306; GFX9-NEXT: s_mov_b32 s2, -1 2307; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2308; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2309; GFX9-NEXT: s_endpgm 2310; 2311; GFX1064-LABEL: sub_i64_constant: 2312; GFX1064: ; %bb.0: ; %entry 2313; GFX1064-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 2314; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2315; GFX1064-NEXT: ; implicit-def: $vgpr1_vgpr2 2316; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 2317; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s3, v0 2318; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2319; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 2320; GFX1064-NEXT: ; mask branch BB11_2 2321; GFX1064-NEXT: s_cbranch_execz BB11_2 2322; GFX1064-NEXT: BB11_1: 2323; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 2324; GFX1064-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 2325; GFX1064-NEXT: v_mul_hi_u32_u24_e64 v2, s2, 5 2326; GFX1064-NEXT: v_mul_u32_u24_e64 v1, s2, 5 2327; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2328; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 2329; GFX1064-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] 2330; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2331; GFX1064-NEXT: buffer_gl0_inv 2332; GFX1064-NEXT: buffer_gl1_inv 2333; GFX1064-NEXT: BB11_2: 2334; GFX1064-NEXT: v_nop 2335; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 2336; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 2337; GFX1064-NEXT: 
v_mul_u32_u24_e32 v1, 5, v0 2338; GFX1064-NEXT: v_readfirstlane_b32 s3, v2 2339; GFX1064-NEXT: v_mul_hi_u32_u24_e32 v2, 5, v0 2340; GFX1064-NEXT: v_sub_co_u32_e64 v0, vcc, s2, v1 2341; GFX1064-NEXT: s_mov_b32 s2, -1 2342; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v2, vcc 2343; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 2344; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2345; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2346; GFX1064-NEXT: s_endpgm 2347; 2348; GFX1032-LABEL: sub_i64_constant: 2349; GFX1032: ; %bb.0: ; %entry 2350; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2351; GFX1032-NEXT: v_cmp_ne_u32_e64 s3, 1, 0 2352; GFX1032-NEXT: ; implicit-def: $vcc_hi 2353; GFX1032-NEXT: ; implicit-def: $vgpr1_vgpr2 2354; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s3, 0 2355; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 2356; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo 2357; GFX1032-NEXT: ; mask branch BB11_2 2358; GFX1032-NEXT: s_cbranch_execz BB11_2 2359; GFX1032-NEXT: BB11_1: 2360; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 2361; GFX1032-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 2362; GFX1032-NEXT: v_mul_hi_u32_u24_e64 v2, s3, 5 2363; GFX1032-NEXT: v_mul_u32_u24_e64 v1, s3, 5 2364; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2365; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 2366; GFX1032-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] 2367; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2368; GFX1032-NEXT: buffer_gl0_inv 2369; GFX1032-NEXT: buffer_gl1_inv 2370; GFX1032-NEXT: BB11_2: 2371; GFX1032-NEXT: v_nop 2372; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 2373; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 2374; GFX1032-NEXT: v_mul_u32_u24_e32 v1, 5, v0 2375; GFX1032-NEXT: v_readfirstlane_b32 s3, v2 2376; GFX1032-NEXT: v_mul_hi_u32_u24_e32 v2, 5, v0 2377; GFX1032-NEXT: v_sub_co_u32_e64 v0, vcc_lo, s2, v1 2378; GFX1032-NEXT: s_mov_b32 s2, -1 2379; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v2, vcc_lo 2380; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 2381; GFX1032-NEXT: s_waitcnt 
lgkmcnt(0) 2382; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2383; GFX1032-NEXT: s_endpgm 2384entry: 2385 %old = atomicrmw sub i64 addrspace(3)* @local_var64, i64 5 acq_rel 2386 store i64 %old, i64 addrspace(1)* %out 2387 ret void 2388} 2389 ; Uniform i64 operand (kernel argument %subitive). In every target's checks below, one lane is selected via v_mbcnt + v_cmp_eq, the operand is multiplied by the s_bcnt1 lane count and fed to a single ds_sub_rtn_u64, and each lane then rebuilds its own result from v_readfirstlane minus the operand times its v_mbcnt index. 2390define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 %subitive) { 2391; 2392; 2393; GFX7LESS-LABEL: sub_i64_uniform: 2394; GFX7LESS: ; %bb.0: ; %entry 2395; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 2396; GFX7LESS-NEXT: v_cmp_ne_u32_e64 s[6:7], 1, 0 2397; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 2398; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s7, v0 2399; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2400; GFX7LESS-NEXT: ; implicit-def: $vgpr1_vgpr2 2401; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc 2402; GFX7LESS-NEXT: ; mask branch BB12_2 2403; GFX7LESS-NEXT: s_cbranch_execz BB12_2 2404; GFX7LESS-NEXT: BB12_1: 2405; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 2406; GFX7LESS-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 2407; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2408; GFX7LESS-NEXT: s_mul_i32 s7, s3, s6 2409; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6 2410; GFX7LESS-NEXT: v_mul_hi_u32 v1, s2, v1 2411; GFX7LESS-NEXT: s_mul_i32 s6, s2, s6 2412; GFX7LESS-NEXT: v_add_i32_e32 v2, vcc, s7, v1 2413; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6 2414; GFX7LESS-NEXT: s_mov_b32 m0, -1 2415; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2416; GFX7LESS-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] 2417; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2418; GFX7LESS-NEXT: buffer_wbinvl1 2419; GFX7LESS-NEXT: BB12_2: 2420; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] 2421; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 2422; GFX7LESS-NEXT: s_mov_b32 s6, -1 2423; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2424; GFX7LESS-NEXT: s_mov_b32 s4, s0 2425; GFX7LESS-NEXT: s_mov_b32 s5, s1 2426; GFX7LESS-NEXT: v_readfirstlane_b32 s0, v1 2427; GFX7LESS-NEXT: v_readfirstlane_b32 s1, v2 2428; GFX7LESS-NEXT: v_mul_lo_u32 v1, s3, v0 2429; GFX7LESS-NEXT: 
v_mul_hi_u32 v2, s2, v0 2430; GFX7LESS-NEXT: v_mul_lo_u32 v0, s2, v0 2431; GFX7LESS-NEXT: v_add_i32_e32 v1, vcc, v2, v1 2432; GFX7LESS-NEXT: v_mov_b32_e32 v2, s1 2433; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 2434; GFX7LESS-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc 2435; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 2436; GFX7LESS-NEXT: s_endpgm 2437; 2438; GFX8-LABEL: sub_i64_uniform: 2439; GFX8: ; %bb.0: ; %entry 2440; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 2441; GFX8-NEXT: v_cmp_ne_u32_e64 s[6:7], 1, 0 2442; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 2443; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 2444; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2445; GFX8-NEXT: ; implicit-def: $vgpr1_vgpr2 2446; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 2447; GFX8-NEXT: ; mask branch BB12_2 2448; GFX8-NEXT: s_cbranch_execz BB12_2 2449; GFX8-NEXT: BB12_1: 2450; GFX8-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 2451; GFX8-NEXT: v_mov_b32_e32 v1, s6 2452; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2453; GFX8-NEXT: v_mul_hi_u32 v1, s2, v1 2454; GFX8-NEXT: s_mul_i32 s7, s3, s6 2455; GFX8-NEXT: s_mul_i32 s6, s2, s6 2456; GFX8-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 2457; GFX8-NEXT: v_add_u32_e32 v2, vcc, s7, v1 2458; GFX8-NEXT: v_mov_b32_e32 v1, s6 2459; GFX8-NEXT: s_mov_b32 m0, -1 2460; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2461; GFX8-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] 2462; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2463; GFX8-NEXT: buffer_wbinvl1_vol 2464; GFX8-NEXT: BB12_2: 2465; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 2466; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2467; GFX8-NEXT: s_mov_b32 s4, s0 2468; GFX8-NEXT: v_readfirstlane_b32 s0, v1 2469; GFX8-NEXT: v_mul_lo_u32 v1, s3, v0 2470; GFX8-NEXT: v_mul_hi_u32 v3, s2, v0 2471; GFX8-NEXT: v_mul_lo_u32 v0, s2, v0 2472; GFX8-NEXT: s_mov_b32 s5, s1 2473; GFX8-NEXT: v_readfirstlane_b32 s1, v2 2474; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 2475; GFX8-NEXT: v_mov_b32_e32 v2, s1 2476; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s0, v0 2477; GFX8-NEXT: 
s_mov_b32 s7, 0xf000 2478; GFX8-NEXT: s_mov_b32 s6, -1 2479; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc 2480; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 2481; GFX8-NEXT: s_endpgm 2482; 2483; GFX9-LABEL: sub_i64_uniform: 2484; GFX9: ; %bb.0: ; %entry 2485; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 2486; GFX9-NEXT: v_cmp_ne_u32_e64 s[6:7], 1, 0 2487; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 2488; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 2489; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2490; GFX9-NEXT: ; implicit-def: $vgpr1_vgpr2 2491; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 2492; GFX9-NEXT: ; mask branch BB12_2 2493; GFX9-NEXT: s_cbranch_execz BB12_2 2494; GFX9-NEXT: BB12_1: 2495; GFX9-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 2496; GFX9-NEXT: v_mov_b32_e32 v1, s6 2497; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2498; GFX9-NEXT: v_mul_hi_u32 v2, s2, v1 2499; GFX9-NEXT: s_mul_i32 s7, s3, s6 2500; GFX9-NEXT: s_mul_i32 s6, s2, s6 2501; GFX9-NEXT: v_mov_b32_e32 v1, s6 2502; GFX9-NEXT: v_add_u32_e32 v2, s7, v2 2503; GFX9-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 2504; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2505; GFX9-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] 2506; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2507; GFX9-NEXT: buffer_wbinvl1_vol 2508; GFX9-NEXT: BB12_2: 2509; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 2510; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2511; GFX9-NEXT: v_mul_lo_u32 v3, s3, v0 2512; GFX9-NEXT: v_mul_hi_u32 v4, s2, v0 2513; GFX9-NEXT: v_mul_lo_u32 v0, s2, v0 2514; GFX9-NEXT: s_mov_b32 s4, s0 2515; GFX9-NEXT: v_readfirstlane_b32 s0, v1 2516; GFX9-NEXT: s_mov_b32 s5, s1 2517; GFX9-NEXT: v_readfirstlane_b32 s1, v2 2518; GFX9-NEXT: v_add_u32_e32 v1, v4, v3 2519; GFX9-NEXT: v_mov_b32_e32 v2, s1 2520; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s0, v0 2521; GFX9-NEXT: s_mov_b32 s7, 0xf000 2522; GFX9-NEXT: s_mov_b32 s6, -1 2523; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc 2524; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 2525; GFX9-NEXT: s_endpgm 2526; 2527; 
GFX1064-LABEL: sub_i64_uniform: 2528; GFX1064: ; %bb.0: ; %entry 2529; GFX1064-NEXT: v_cmp_ne_u32_e64 s[6:7], 1, 0 2530; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 2531; GFX1064-NEXT: ; implicit-def: $vgpr1_vgpr2 2532; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 2533; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s7, v0 2534; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2535; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 2536; GFX1064-NEXT: ; mask branch BB12_2 2537; GFX1064-NEXT: s_cbranch_execz BB12_2 2538; GFX1064-NEXT: BB12_1: 2539; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 2540; GFX1064-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 2541; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2542; GFX1064-NEXT: v_mul_hi_u32 v2, s2, s6 2543; GFX1064-NEXT: s_mul_i32 s7, s2, s6 2544; GFX1064-NEXT: s_mul_i32 s6, s3, s6 2545; GFX1064-NEXT: v_mov_b32_e32 v1, s7 2546; GFX1064-NEXT: v_add_nc_u32_e32 v2, s6, v2 2547; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2548; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 2549; GFX1064-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] 2550; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2551; GFX1064-NEXT: buffer_gl0_inv 2552; GFX1064-NEXT: buffer_gl1_inv 2553; GFX1064-NEXT: BB12_2: 2554; GFX1064-NEXT: v_nop 2555; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 2556; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2557; GFX1064-NEXT: v_mul_lo_u32 v3, s3, v0 2558; GFX1064-NEXT: v_mul_hi_u32 v4, s2, v0 2559; GFX1064-NEXT: v_mul_lo_u32 v0, s2, v0 2560; GFX1064-NEXT: v_readfirstlane_b32 s4, v1 2561; GFX1064-NEXT: v_readfirstlane_b32 s5, v2 2562; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 2563; GFX1064-NEXT: s_mov_b32 s2, -1 2564; GFX1064-NEXT: v_add_nc_u32_e32 v1, v4, v3 2565; GFX1064-NEXT: v_sub_co_u32_e64 v0, vcc, s4, v0 2566; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s5, v1, vcc 2567; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2568; GFX1064-NEXT: s_endpgm 2569; 2570; GFX1032-LABEL: sub_i64_uniform: 2571; GFX1032: ; %bb.0: ; %entry 2572; GFX1032-NEXT: s_load_dwordx4 s[0:3], 
s[0:1], 0x24 2573; GFX1032-NEXT: v_cmp_ne_u32_e64 s5, 1, 0 2574; GFX1032-NEXT: ; implicit-def: $vcc_hi 2575; GFX1032-NEXT: ; implicit-def: $vgpr1_vgpr2 2576; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s5, 0 2577; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 2578; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo 2579; GFX1032-NEXT: ; mask branch BB12_2 2580; GFX1032-NEXT: s_cbranch_execz BB12_2 2581; GFX1032-NEXT: BB12_1: 2582; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s5 2583; GFX1032-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 2584; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2585; GFX1032-NEXT: v_mul_hi_u32 v2, s2, s5 2586; GFX1032-NEXT: s_mul_i32 s6, s2, s5 2587; GFX1032-NEXT: s_mul_i32 s5, s3, s5 2588; GFX1032-NEXT: v_mov_b32_e32 v1, s6 2589; GFX1032-NEXT: v_add_nc_u32_e32 v2, s5, v2 2590; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2591; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 2592; GFX1032-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] 2593; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2594; GFX1032-NEXT: buffer_gl0_inv 2595; GFX1032-NEXT: buffer_gl1_inv 2596; GFX1032-NEXT: BB12_2: 2597; GFX1032-NEXT: v_nop 2598; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 2599; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2600; GFX1032-NEXT: v_mul_lo_u32 v3, s3, v0 2601; GFX1032-NEXT: v_mul_hi_u32 v4, s2, v0 2602; GFX1032-NEXT: v_mul_lo_u32 v0, s2, v0 2603; GFX1032-NEXT: v_readfirstlane_b32 s4, v1 2604; GFX1032-NEXT: v_readfirstlane_b32 s5, v2 2605; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 2606; GFX1032-NEXT: s_mov_b32 s2, -1 2607; GFX1032-NEXT: v_add_nc_u32_e32 v1, v4, v3 2608; GFX1032-NEXT: v_sub_co_u32_e64 v0, vcc_lo, s4, v0 2609; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s5, v1, vcc_lo 2610; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2611; GFX1032-NEXT: s_endpgm 2612entry: 2613 %old = atomicrmw sub i64 addrspace(3)* @local_var64, i64 %subitive acq_rel 2614 store i64 %old, i64 addrspace(1)* %out 2615 ret void 2616} 2617 2618; The operand is the zero-extended workitem id, which differs per lane; the hand-written expectation here was that 2619; no v_mbcnt_lo_u32_b32, v_mbcnt_hi_u32_b32 or s_bcnt1_i32_b64 is emitted, and the generated checks below contain none. 2620; 
(Previously GCN-NOT directives, which never ran because no RUN line enables a GCN check prefix.) 2621define amdgpu_kernel void @sub_i64_varying(i64 addrspace(1)* %out) { 2622; 2623; 2624; GFX7LESS-LABEL: sub_i64_varying: 2625; GFX7LESS: ; %bb.0: ; %entry 2626; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 2627; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 2628; GFX7LESS-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 2629; GFX7LESS-NEXT: s_mov_b32 m0, -1 2630; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2631; GFX7LESS-NEXT: ds_sub_rtn_u64 v[0:1], v2, v[0:1] 2632; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2633; GFX7LESS-NEXT: buffer_wbinvl1 2634; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 2635; GFX7LESS-NEXT: s_mov_b32 s2, -1 2636; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2637; GFX7LESS-NEXT: s_endpgm 2638; 2639; GFX8-LABEL: sub_i64_varying: 2640; GFX8: ; %bb.0: ; %entry 2641; GFX8-NEXT: v_mov_b32_e32 v1, 0 2642; GFX8-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 2643; GFX8-NEXT: s_mov_b32 m0, -1 2644; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2645; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2646; GFX8-NEXT: ds_sub_rtn_u64 v[0:1], v2, v[0:1] 2647; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2648; GFX8-NEXT: buffer_wbinvl1_vol 2649; GFX8-NEXT: s_mov_b32 s3, 0xf000 2650; GFX8-NEXT: s_mov_b32 s2, -1 2651; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2652; GFX8-NEXT: s_endpgm 2653; 2654; GFX9-LABEL: sub_i64_varying: 2655; GFX9: ; %bb.0: ; %entry 2656; GFX9-NEXT: v_mov_b32_e32 v1, 0 2657; GFX9-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 2658; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2659; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2660; GFX9-NEXT: ds_sub_rtn_u64 v[0:1], v2, v[0:1] 2661; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2662; GFX9-NEXT: buffer_wbinvl1_vol 2663; GFX9-NEXT: s_mov_b32 s3, 0xf000 2664; GFX9-NEXT: s_mov_b32 s2, -1 2665; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2666; GFX9-NEXT: s_endpgm 2667; 2668; GFX1064-LABEL: sub_i64_varying: 2669; GFX1064: ; %bb.0: ; %entry 2670; GFX1064-NEXT: 
v_mov_b32_e32 v1, 0 2671; GFX1064-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 2672; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2673; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 2674; GFX1064-NEXT: s_mov_b32 s2, -1 2675; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2676; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 2677; GFX1064-NEXT: ds_sub_rtn_u64 v[0:1], v2, v[0:1] 2678; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2679; GFX1064-NEXT: buffer_gl0_inv 2680; GFX1064-NEXT: buffer_gl1_inv 2681; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2682; GFX1064-NEXT: s_endpgm 2683; 2684; GFX1032-LABEL: sub_i64_varying: 2685; GFX1032: ; %bb.0: ; %entry 2686; GFX1032-NEXT: v_mov_b32_e32 v1, 0 2687; GFX1032-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 2688; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2689; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 2690; GFX1032-NEXT: s_mov_b32 s2, -1 2691; GFX1032-NEXT: ; implicit-def: $vcc_hi 2692; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2693; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 2694; GFX1032-NEXT: ds_sub_rtn_u64 v[0:1], v2, v[0:1] 2695; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2696; GFX1032-NEXT: buffer_gl0_inv 2697; GFX1032-NEXT: buffer_gl1_inv 2698; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2699; GFX1032-NEXT: s_endpgm 2700entry: 2701 %lane = call i32 @llvm.amdgcn.workitem.id.x() 2702 %zext = zext i32 %lane to i64 2703 %old = atomicrmw sub i64 addrspace(3)* @local_var64, i64 %zext acq_rel 2704 store i64 %old, i64 addrspace(1)* %out 2705 ret void 2706} 2707 2708; Varying i32 operand. In the GFX8, GFX9, GFX1064 and GFX1032 checks below, the result of the v_and_b32_dpp sequence is read with 2709; v_readlane_b32 (lane 63, or lane 31 under GFX1032), copied into a VGPR by v_mov_b32 and used as the data operand of one ds_and_rtn_b32. 2710; (Previously GFX8MORE32/GFX8MORE directives, which never ran because no RUN line enables those check prefixes.) 2711define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) { 2712; 2713; 2714; GFX7LESS-LABEL: and_i32_varying: 2715; GFX7LESS: ; %bb.0: ; %entry 2716; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 2717; GFX7LESS-NEXT: 
v_mov_b32_e32 v1, local_var32@abs32@lo 2718; GFX7LESS-NEXT: s_mov_b32 m0, -1 2719; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2720; GFX7LESS-NEXT: ds_and_rtn_b32 v0, v1, v0 2721; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2722; GFX7LESS-NEXT: buffer_wbinvl1 2723; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 2724; GFX7LESS-NEXT: s_mov_b32 s2, -1 2725; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 2726; GFX7LESS-NEXT: s_endpgm 2727; 2728; GFX8-LABEL: and_i32_varying: 2729; GFX8: ; %bb.0: ; %entry 2730; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2731; GFX8-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 2732; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, s2, 0 2733; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, s3, v3 2734; GFX8-NEXT: v_mov_b32_e32 v2, v0 2735; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 2736; GFX8-NEXT: v_mov_b32_e32 v1, -1 2737; GFX8-NEXT: s_mov_b64 exec, s[2:3] 2738; GFX8-NEXT: s_not_b64 exec, exec 2739; GFX8-NEXT: v_mov_b32_e32 v2, -1 2740; GFX8-NEXT: s_not_b64 exec, exec 2741; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 2742; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 2743; GFX8-NEXT: s_nop 1 2744; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 2745; GFX8-NEXT: s_nop 1 2746; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 2747; GFX8-NEXT: s_nop 1 2748; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 2749; GFX8-NEXT: s_nop 1 2750; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 2751; GFX8-NEXT: s_nop 1 2752; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 2753; GFX8-NEXT: v_readlane_b32 s2, v2, 63 2754; GFX8-NEXT: s_nop 0 2755; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 2756; GFX8-NEXT: s_mov_b64 exec, s[4:5] 2757; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 2758; GFX8-NEXT: ; implicit-def: $vgpr0 2759; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 2760; GFX8-NEXT: ; mask branch BB14_2 2761; GFX8-NEXT: 
s_cbranch_execz BB14_2 2762; GFX8-NEXT: BB14_1: 2763; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 2764; GFX8-NEXT: v_mov_b32_e32 v3, s2 2765; GFX8-NEXT: s_mov_b32 m0, -1 2766; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2767; GFX8-NEXT: ds_and_rtn_b32 v0, v0, v3 2768; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2769; GFX8-NEXT: buffer_wbinvl1_vol 2770; GFX8-NEXT: BB14_2: 2771; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 2772; GFX8-NEXT: v_readfirstlane_b32 s2, v0 2773; GFX8-NEXT: v_mov_b32_e32 v0, v1 2774; GFX8-NEXT: v_and_b32_e32 v0, s2, v0 2775; GFX8-NEXT: s_mov_b32 s3, 0xf000 2776; GFX8-NEXT: s_mov_b32 s2, -1 2777; GFX8-NEXT: s_nop 0 2778; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2779; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 2780; GFX8-NEXT: s_endpgm 2781; 2782; GFX9-LABEL: and_i32_varying: 2783; GFX9: ; %bb.0: ; %entry 2784; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2785; GFX9-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 2786; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, s2, 0 2787; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, s3, v3 2788; GFX9-NEXT: v_mov_b32_e32 v2, v0 2789; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 2790; GFX9-NEXT: v_mov_b32_e32 v1, -1 2791; GFX9-NEXT: s_mov_b64 exec, s[2:3] 2792; GFX9-NEXT: s_not_b64 exec, exec 2793; GFX9-NEXT: v_mov_b32_e32 v2, -1 2794; GFX9-NEXT: s_not_b64 exec, exec 2795; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 2796; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 2797; GFX9-NEXT: s_nop 1 2798; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 2799; GFX9-NEXT: s_nop 1 2800; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 2801; GFX9-NEXT: s_nop 1 2802; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 2803; GFX9-NEXT: s_nop 1 2804; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 2805; GFX9-NEXT: s_nop 1 2806; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 2807; GFX9-NEXT: v_readlane_b32 s2, v2, 63 2808; 
GFX9-NEXT: s_nop 0 2809; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 2810; GFX9-NEXT: s_mov_b64 exec, s[4:5] 2811; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 2812; GFX9-NEXT: ; implicit-def: $vgpr0 2813; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 2814; GFX9-NEXT: ; mask branch BB14_2 2815; GFX9-NEXT: s_cbranch_execz BB14_2 2816; GFX9-NEXT: BB14_1: 2817; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 2818; GFX9-NEXT: v_mov_b32_e32 v3, s2 2819; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2820; GFX9-NEXT: ds_and_rtn_b32 v0, v0, v3 2821; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2822; GFX9-NEXT: buffer_wbinvl1_vol 2823; GFX9-NEXT: BB14_2: 2824; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 2825; GFX9-NEXT: v_readfirstlane_b32 s2, v0 2826; GFX9-NEXT: v_mov_b32_e32 v0, v1 2827; GFX9-NEXT: v_and_b32_e32 v0, s2, v0 2828; GFX9-NEXT: s_mov_b32 s3, 0xf000 2829; GFX9-NEXT: s_mov_b32 s2, -1 2830; GFX9-NEXT: s_nop 0 2831; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2832; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 2833; GFX9-NEXT: s_endpgm 2834; 2835; GFX1064-LABEL: and_i32_varying: 2836; GFX1064: ; %bb.0: ; %entry 2837; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2838; GFX1064-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 2839; GFX1064-NEXT: v_mov_b32_e32 v2, v0 2840; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v4, s2, 0 2841; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v4, s3, v4 2842; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 2843; GFX1064-NEXT: v_mov_b32_e32 v1, -1 2844; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 2845; GFX1064-NEXT: s_not_b64 exec, exec 2846; GFX1064-NEXT: v_mov_b32_e32 v2, -1 2847; GFX1064-NEXT: s_not_b64 exec, exec 2848; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 2849; GFX1064-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 2850; GFX1064-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 2851; GFX1064-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 2852; GFX1064-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf 
bank_mask:0xf 2853; GFX1064-NEXT: v_mov_b32_e32 v3, v2 2854; GFX1064-NEXT: v_permlanex16_b32 v3, v3, -1, -1 2855; GFX1064-NEXT: v_and_b32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 2856; GFX1064-NEXT: v_readlane_b32 s2, v2, 31 2857; GFX1064-NEXT: v_mov_b32_e32 v3, s2 2858; GFX1064-NEXT: v_and_b32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 2859; GFX1064-NEXT: v_readlane_b32 s2, v2, 15 2860; GFX1064-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf 2861; GFX1064-NEXT: v_readlane_b32 s3, v2, 31 2862; GFX1064-NEXT: v_readlane_b32 s6, v2, 47 2863; GFX1064-NEXT: v_writelane_b32 v1, s2, 16 2864; GFX1064-NEXT: s_mov_b32 s2, -1 2865; GFX1064-NEXT: v_writelane_b32 v1, s3, 32 2866; GFX1064-NEXT: v_readlane_b32 s3, v2, 63 2867; GFX1064-NEXT: v_writelane_b32 v1, s6, 48 2868; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 2869; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 2870; GFX1064-NEXT: ; implicit-def: $vgpr0 2871; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 2872; GFX1064-NEXT: ; mask branch BB14_2 2873; GFX1064-NEXT: s_cbranch_execz BB14_2 2874; GFX1064-NEXT: BB14_1: 2875; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 2876; GFX1064-NEXT: v_mov_b32_e32 v7, s3 2877; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2878; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 2879; GFX1064-NEXT: ds_and_rtn_b32 v0, v0, v7 2880; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2881; GFX1064-NEXT: buffer_gl0_inv 2882; GFX1064-NEXT: buffer_gl1_inv 2883; GFX1064-NEXT: BB14_2: 2884; GFX1064-NEXT: v_nop 2885; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 2886; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 2887; GFX1064-NEXT: v_mov_b32_e32 v0, v1 2888; GFX1064-NEXT: v_and_b32_e32 v0, s3, v0 2889; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 2890; GFX1064-NEXT: s_nop 1 2891; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2892; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 2893; GFX1064-NEXT: s_endpgm 2894; 2895; GFX1032-LABEL: and_i32_varying: 2896; GFX1032: ; %bb.0: ; %entry 2897; 
GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2898; GFX1032-NEXT: v_cmp_ne_u32_e64 s2, 1, 0 2899; GFX1032-NEXT: ; implicit-def: $vcc_hi 2900; GFX1032-NEXT: v_mov_b32_e32 v2, v0 2901; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v4, s2, 0 2902; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 2903; GFX1032-NEXT: v_mov_b32_e32 v1, -1 2904; GFX1032-NEXT: s_mov_b32 exec_lo, s2 2905; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 2906; GFX1032-NEXT: v_mov_b32_e32 v2, -1 2907; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 2908; GFX1032-NEXT: s_or_saveexec_b32 s4, -1 2909; GFX1032-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 2910; GFX1032-NEXT: s_mov_b32 s2, -1 2911; GFX1032-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 2912; GFX1032-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 2913; GFX1032-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 2914; GFX1032-NEXT: v_mov_b32_e32 v3, v2 2915; GFX1032-NEXT: v_permlanex16_b32 v3, v3, -1, -1 2916; GFX1032-NEXT: v_and_b32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 2917; GFX1032-NEXT: v_readlane_b32 s3, v2, 31 2918; GFX1032-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf 2919; GFX1032-NEXT: v_readlane_b32 s5, v2, 15 2920; GFX1032-NEXT: v_writelane_b32 v1, s5, 16 2921; GFX1032-NEXT: s_mov_b32 exec_lo, s4 2922; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 2923; GFX1032-NEXT: ; implicit-def: $vgpr0 2924; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo 2925; GFX1032-NEXT: ; mask branch BB14_2 2926; GFX1032-NEXT: s_cbranch_execz BB14_2 2927; GFX1032-NEXT: BB14_1: 2928; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 2929; GFX1032-NEXT: v_mov_b32_e32 v7, s3 2930; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2931; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 2932; GFX1032-NEXT: ds_and_rtn_b32 v0, v0, v7 2933; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2934; GFX1032-NEXT: buffer_gl0_inv 2935; GFX1032-NEXT: buffer_gl1_inv 2936; GFX1032-NEXT: BB14_2: 
2937; GFX1032-NEXT: v_nop 2938; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 2939; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 2940; GFX1032-NEXT: v_mov_b32_e32 v0, v1 2941; GFX1032-NEXT: v_and_b32_e32 v0, s3, v0 2942; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 2943; GFX1032-NEXT: s_nop 1 2944; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2945; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 2946; GFX1032-NEXT: s_endpgm 2947entry: 2948 %lane = call i32 @llvm.amdgcn.workitem.id.x() 2949 %old = atomicrmw and i32 addrspace(3)* @local_var32, i32 %lane acq_rel 2950 store i32 %old, i32 addrspace(1)* %out 2951 ret void 2952} 2953 2954; Varying i32 operand. In the GFX8, GFX9 and GFX1064 checks below, the result of the v_or_b32_dpp sequence is read from lane 63 with 2955; v_readlane_b32, copied into a VGPR by v_mov_b32 and used as the data operand of one ds_or_rtn_b32; the old wave32 directive expected lane 31. 2956; (Previously GFX8MORE32/GFX8MORE directives, which never ran because no RUN line enables those check prefixes.) 2957define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) { 2958; 2959; 2960; GFX7LESS-LABEL: or_i32_varying: 2961; GFX7LESS: ; %bb.0: ; %entry 2962; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 2963; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 2964; GFX7LESS-NEXT: s_mov_b32 m0, -1 2965; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2966; GFX7LESS-NEXT: ds_or_rtn_b32 v0, v1, v0 2967; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2968; GFX7LESS-NEXT: buffer_wbinvl1 2969; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 2970; GFX7LESS-NEXT: s_mov_b32 s2, -1 2971; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 2972; GFX7LESS-NEXT: s_endpgm 2973; 2974; GFX8-LABEL: or_i32_varying: 2975; GFX8: ; %bb.0: ; %entry 2976; GFX8-NEXT: v_mov_b32_e32 v2, v0 2977; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 2978; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2979; GFX8-NEXT: v_mov_b32_e32 v1, 0 2980; GFX8-NEXT: s_mov_b64 exec, s[2:3] 2981; GFX8-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 2982; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 2983; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 2984; GFX8-NEXT: s_not_b64 exec, exec 2985; GFX8-NEXT: v_mov_b32_e32 v2, 0 
2986; GFX8-NEXT: s_not_b64 exec, exec 2987; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 2988; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 2989; GFX8-NEXT: s_nop 1 2990; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 2991; GFX8-NEXT: s_nop 1 2992; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 2993; GFX8-NEXT: s_nop 1 2994; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 2995; GFX8-NEXT: s_nop 1 2996; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 2997; GFX8-NEXT: s_nop 1 2998; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 2999; GFX8-NEXT: v_readlane_b32 s2, v2, 63 3000; GFX8-NEXT: s_nop 0 3001; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 3002; GFX8-NEXT: s_mov_b64 exec, s[4:5] 3003; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3004; GFX8-NEXT: ; implicit-def: $vgpr0 3005; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 3006; GFX8-NEXT: ; mask branch BB15_2 3007; GFX8-NEXT: s_cbranch_execz BB15_2 3008; GFX8-NEXT: BB15_1: 3009; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 3010; GFX8-NEXT: v_mov_b32_e32 v3, s2 3011; GFX8-NEXT: s_mov_b32 m0, -1 3012; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3013; GFX8-NEXT: ds_or_rtn_b32 v0, v0, v3 3014; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3015; GFX8-NEXT: buffer_wbinvl1_vol 3016; GFX8-NEXT: BB15_2: 3017; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 3018; GFX8-NEXT: v_readfirstlane_b32 s2, v0 3019; GFX8-NEXT: v_mov_b32_e32 v0, v1 3020; GFX8-NEXT: v_or_b32_e32 v0, s2, v0 3021; GFX8-NEXT: s_mov_b32 s3, 0xf000 3022; GFX8-NEXT: s_mov_b32 s2, -1 3023; GFX8-NEXT: s_nop 0 3024; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3025; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 3026; GFX8-NEXT: s_endpgm 3027; 3028; GFX9-LABEL: or_i32_varying: 3029; GFX9: ; %bb.0: ; %entry 3030; GFX9-NEXT: v_mov_b32_e32 v2, v0 3031; GFX9-NEXT: 
s_or_saveexec_b64 s[2:3], -1 3032; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3033; GFX9-NEXT: v_mov_b32_e32 v1, 0 3034; GFX9-NEXT: s_mov_b64 exec, s[2:3] 3035; GFX9-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 3036; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 3037; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 3038; GFX9-NEXT: s_not_b64 exec, exec 3039; GFX9-NEXT: v_mov_b32_e32 v2, 0 3040; GFX9-NEXT: s_not_b64 exec, exec 3041; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 3042; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 3043; GFX9-NEXT: s_nop 1 3044; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 3045; GFX9-NEXT: s_nop 1 3046; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 3047; GFX9-NEXT: s_nop 1 3048; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 3049; GFX9-NEXT: s_nop 1 3050; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 3051; GFX9-NEXT: s_nop 1 3052; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 3053; GFX9-NEXT: v_readlane_b32 s2, v2, 63 3054; GFX9-NEXT: s_nop 0 3055; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 3056; GFX9-NEXT: s_mov_b64 exec, s[4:5] 3057; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3058; GFX9-NEXT: ; implicit-def: $vgpr0 3059; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 3060; GFX9-NEXT: ; mask branch BB15_2 3061; GFX9-NEXT: s_cbranch_execz BB15_2 3062; GFX9-NEXT: BB15_1: 3063; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 3064; GFX9-NEXT: v_mov_b32_e32 v3, s2 3065; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3066; GFX9-NEXT: ds_or_rtn_b32 v0, v0, v3 3067; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3068; GFX9-NEXT: buffer_wbinvl1_vol 3069; GFX9-NEXT: BB15_2: 3070; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 3071; GFX9-NEXT: v_readfirstlane_b32 s2, v0 3072; GFX9-NEXT: v_mov_b32_e32 v0, v1 3073; GFX9-NEXT: v_or_b32_e32 v0, s2, v0 3074; 
GFX9-NEXT: s_mov_b32 s3, 0xf000 3075; GFX9-NEXT: s_mov_b32 s2, -1 3076; GFX9-NEXT: s_nop 0 3077; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3078; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 3079; GFX9-NEXT: s_endpgm 3080; 3081; GFX1064-LABEL: or_i32_varying: 3082; GFX1064: ; %bb.0: ; %entry 3083; GFX1064-NEXT: v_mov_b32_e32 v2, v0 3084; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 3085; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3086; GFX1064-NEXT: v_mov_b32_e32 v1, 0 3087; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 3088; GFX1064-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 3089; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 3090; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s3, v0 3091; GFX1064-NEXT: s_not_b64 exec, exec 3092; GFX1064-NEXT: v_mov_b32_e32 v2, 0 3093; GFX1064-NEXT: s_not_b64 exec, exec 3094; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 3095; GFX1064-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 3096; GFX1064-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 3097; GFX1064-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 3098; GFX1064-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 3099; GFX1064-NEXT: v_mov_b32_e32 v3, v2 3100; GFX1064-NEXT: v_permlanex16_b32 v3, v3, -1, -1 3101; GFX1064-NEXT: v_or_b32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 3102; GFX1064-NEXT: v_readlane_b32 s2, v2, 31 3103; GFX1064-NEXT: v_mov_b32_e32 v3, s2 3104; GFX1064-NEXT: v_or_b32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 3105; GFX1064-NEXT: v_readlane_b32 s2, v2, 15 3106; GFX1064-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3107; GFX1064-NEXT: v_readlane_b32 s3, v2, 31 3108; GFX1064-NEXT: v_readlane_b32 s6, v2, 47 3109; GFX1064-NEXT: v_writelane_b32 v1, s2, 16 3110; GFX1064-NEXT: s_mov_b32 s2, -1 3111; GFX1064-NEXT: v_writelane_b32 v1, s3, 32 3112; GFX1064-NEXT: v_readlane_b32 s3, v2, 63 3113; GFX1064-NEXT: 
v_writelane_b32 v1, s6, 48 3114; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 3115; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3116; GFX1064-NEXT: ; implicit-def: $vgpr0 3117; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 3118; GFX1064-NEXT: ; mask branch BB15_2 3119; GFX1064-NEXT: s_cbranch_execz BB15_2 3120; GFX1064-NEXT: BB15_1: 3121; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 3122; GFX1064-NEXT: v_mov_b32_e32 v7, s3 3123; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3124; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 3125; GFX1064-NEXT: ds_or_rtn_b32 v0, v0, v7 3126; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3127; GFX1064-NEXT: buffer_gl0_inv 3128; GFX1064-NEXT: buffer_gl1_inv 3129; GFX1064-NEXT: BB15_2: 3130; GFX1064-NEXT: v_nop 3131; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 3132; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 3133; GFX1064-NEXT: v_mov_b32_e32 v0, v1 3134; GFX1064-NEXT: v_or_b32_e32 v0, s3, v0 3135; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 3136; GFX1064-NEXT: s_nop 1 3137; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3138; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 3139; GFX1064-NEXT: s_endpgm 3140; 3141; GFX1032-LABEL: or_i32_varying: 3142; GFX1032: ; %bb.0: ; %entry 3143; GFX1032-NEXT: ; implicit-def: $vcc_hi 3144; GFX1032-NEXT: v_mov_b32_e32 v2, v0 3145; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3146; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3147; GFX1032-NEXT: v_mov_b32_e32 v1, 0 3148; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3149; GFX1032-NEXT: v_cmp_ne_u32_e64 s2, 1, 0 3150; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 3151; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 3152; GFX1032-NEXT: v_mov_b32_e32 v2, 0 3153; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 3154; GFX1032-NEXT: s_or_saveexec_b32 s4, -1 3155; GFX1032-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 3156; GFX1032-NEXT: s_mov_b32 s2, -1 3157; GFX1032-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 3158; GFX1032-NEXT: v_or_b32_dpp 
v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 3159; GFX1032-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 3160; GFX1032-NEXT: v_mov_b32_e32 v3, v2 3161; GFX1032-NEXT: v_permlanex16_b32 v3, v3, -1, -1 3162; GFX1032-NEXT: v_or_b32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 3163; GFX1032-NEXT: v_readlane_b32 s3, v2, 31 3164; GFX1032-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3165; GFX1032-NEXT: v_readlane_b32 s5, v2, 15 3166; GFX1032-NEXT: v_writelane_b32 v1, s5, 16 3167; GFX1032-NEXT: s_mov_b32 exec_lo, s4 3168; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 3169; GFX1032-NEXT: ; implicit-def: $vgpr0 3170; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo 3171; GFX1032-NEXT: ; mask branch BB15_2 3172; GFX1032-NEXT: s_cbranch_execz BB15_2 3173; GFX1032-NEXT: BB15_1: 3174; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 3175; GFX1032-NEXT: v_mov_b32_e32 v7, s3 3176; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3177; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 3178; GFX1032-NEXT: ds_or_rtn_b32 v0, v0, v7 3179; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3180; GFX1032-NEXT: buffer_gl0_inv 3181; GFX1032-NEXT: buffer_gl1_inv 3182; GFX1032-NEXT: BB15_2: 3183; GFX1032-NEXT: v_nop 3184; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 3185; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 3186; GFX1032-NEXT: v_mov_b32_e32 v0, v1 3187; GFX1032-NEXT: v_or_b32_e32 v0, s3, v0 3188; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 3189; GFX1032-NEXT: s_nop 1 3190; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3191; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 3192; GFX1032-NEXT: s_endpgm 3193entry: 3194 %lane = call i32 @llvm.amdgcn.workitem.id.x() 3195 %old = atomicrmw or i32 addrspace(3)* @local_var32, i32 %lane acq_rel 3196 store i32 %old, i32 addrspace(1)* %out 3197 ret void 3198} 3199 3200; The hand-written GFX8MORE32/GFX8MORE patterns formerly here were never run, because no RUN line enables those check prefixes. 3201; Their intent, now covered by the autogenerated checks below: v_readlane_b32 reads the reduced value into an SGPR (lane 31 for GFX1032, lane 63 for GFX8, GFX9 and GFX1064), 
v_mov_b32 copies that SGPR into a VGPR, 3202; and that VGPR is the data operand of ds_xor_rtn_b32. 3203define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) { 3204; 3205; 3206; GFX7LESS-LABEL: xor_i32_varying: 3207; GFX7LESS: ; %bb.0: ; %entry 3208; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 3209; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 3210; GFX7LESS-NEXT: s_mov_b32 m0, -1 3211; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3212; GFX7LESS-NEXT: ds_xor_rtn_b32 v0, v1, v0 3213; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3214; GFX7LESS-NEXT: buffer_wbinvl1 3215; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 3216; GFX7LESS-NEXT: s_mov_b32 s2, -1 3217; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 3218; GFX7LESS-NEXT: s_endpgm 3219; 3220; GFX8-LABEL: xor_i32_varying: 3221; GFX8: ; %bb.0: ; %entry 3222; GFX8-NEXT: v_mov_b32_e32 v2, v0 3223; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 3224; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3225; GFX8-NEXT: v_mov_b32_e32 v1, 0 3226; GFX8-NEXT: s_mov_b64 exec, s[2:3] 3227; GFX8-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 3228; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 3229; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 3230; GFX8-NEXT: s_not_b64 exec, exec 3231; GFX8-NEXT: v_mov_b32_e32 v2, 0 3232; GFX8-NEXT: s_not_b64 exec, exec 3233; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 3234; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 3235; GFX8-NEXT: s_nop 1 3236; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 3237; GFX8-NEXT: s_nop 1 3238; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 3239; GFX8-NEXT: s_nop 1 3240; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 3241; GFX8-NEXT: s_nop 1 3242; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 3243; GFX8-NEXT: s_nop 1 3244; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 3245; 
GFX8-NEXT: v_readlane_b32 s2, v2, 63 3246; GFX8-NEXT: s_nop 0 3247; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 3248; GFX8-NEXT: s_mov_b64 exec, s[4:5] 3249; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3250; GFX8-NEXT: ; implicit-def: $vgpr0 3251; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 3252; GFX8-NEXT: ; mask branch BB16_2 3253; GFX8-NEXT: s_cbranch_execz BB16_2 3254; GFX8-NEXT: BB16_1: 3255; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 3256; GFX8-NEXT: v_mov_b32_e32 v3, s2 3257; GFX8-NEXT: s_mov_b32 m0, -1 3258; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3259; GFX8-NEXT: ds_xor_rtn_b32 v0, v0, v3 3260; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3261; GFX8-NEXT: buffer_wbinvl1_vol 3262; GFX8-NEXT: BB16_2: 3263; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 3264; GFX8-NEXT: v_readfirstlane_b32 s2, v0 3265; GFX8-NEXT: v_mov_b32_e32 v0, v1 3266; GFX8-NEXT: v_xor_b32_e32 v0, s2, v0 3267; GFX8-NEXT: s_mov_b32 s3, 0xf000 3268; GFX8-NEXT: s_mov_b32 s2, -1 3269; GFX8-NEXT: s_nop 0 3270; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3271; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 3272; GFX8-NEXT: s_endpgm 3273; 3274; GFX9-LABEL: xor_i32_varying: 3275; GFX9: ; %bb.0: ; %entry 3276; GFX9-NEXT: v_mov_b32_e32 v2, v0 3277; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 3278; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3279; GFX9-NEXT: v_mov_b32_e32 v1, 0 3280; GFX9-NEXT: s_mov_b64 exec, s[2:3] 3281; GFX9-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 3282; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 3283; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 3284; GFX9-NEXT: s_not_b64 exec, exec 3285; GFX9-NEXT: v_mov_b32_e32 v2, 0 3286; GFX9-NEXT: s_not_b64 exec, exec 3287; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 3288; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 3289; GFX9-NEXT: s_nop 1 3290; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 3291; GFX9-NEXT: s_nop 1 3292; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:4 
row_mask:0xf bank_mask:0xf bound_ctrl:0 3293; GFX9-NEXT: s_nop 1 3294; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 3295; GFX9-NEXT: s_nop 1 3296; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 3297; GFX9-NEXT: s_nop 1 3298; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 3299; GFX9-NEXT: v_readlane_b32 s2, v2, 63 3300; GFX9-NEXT: s_nop 0 3301; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 3302; GFX9-NEXT: s_mov_b64 exec, s[4:5] 3303; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3304; GFX9-NEXT: ; implicit-def: $vgpr0 3305; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 3306; GFX9-NEXT: ; mask branch BB16_2 3307; GFX9-NEXT: s_cbranch_execz BB16_2 3308; GFX9-NEXT: BB16_1: 3309; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 3310; GFX9-NEXT: v_mov_b32_e32 v3, s2 3311; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3312; GFX9-NEXT: ds_xor_rtn_b32 v0, v0, v3 3313; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3314; GFX9-NEXT: buffer_wbinvl1_vol 3315; GFX9-NEXT: BB16_2: 3316; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 3317; GFX9-NEXT: v_readfirstlane_b32 s2, v0 3318; GFX9-NEXT: v_mov_b32_e32 v0, v1 3319; GFX9-NEXT: v_xor_b32_e32 v0, s2, v0 3320; GFX9-NEXT: s_mov_b32 s3, 0xf000 3321; GFX9-NEXT: s_mov_b32 s2, -1 3322; GFX9-NEXT: s_nop 0 3323; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3324; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 3325; GFX9-NEXT: s_endpgm 3326; 3327; GFX1064-LABEL: xor_i32_varying: 3328; GFX1064: ; %bb.0: ; %entry 3329; GFX1064-NEXT: v_mov_b32_e32 v2, v0 3330; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 3331; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3332; GFX1064-NEXT: v_mov_b32_e32 v1, 0 3333; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 3334; GFX1064-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 3335; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 3336; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s3, v0 3337; GFX1064-NEXT: s_not_b64 exec, exec 3338; GFX1064-NEXT: 
v_mov_b32_e32 v2, 0 3339; GFX1064-NEXT: s_not_b64 exec, exec 3340; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 3341; GFX1064-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 3342; GFX1064-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 3343; GFX1064-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 3344; GFX1064-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 3345; GFX1064-NEXT: v_mov_b32_e32 v3, v2 3346; GFX1064-NEXT: v_permlanex16_b32 v3, v3, -1, -1 3347; GFX1064-NEXT: v_xor_b32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 3348; GFX1064-NEXT: v_readlane_b32 s2, v2, 31 3349; GFX1064-NEXT: v_mov_b32_e32 v3, s2 3350; GFX1064-NEXT: v_xor_b32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 3351; GFX1064-NEXT: v_readlane_b32 s2, v2, 15 3352; GFX1064-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3353; GFX1064-NEXT: v_readlane_b32 s3, v2, 31 3354; GFX1064-NEXT: v_readlane_b32 s6, v2, 47 3355; GFX1064-NEXT: v_writelane_b32 v1, s2, 16 3356; GFX1064-NEXT: s_mov_b32 s2, -1 3357; GFX1064-NEXT: v_writelane_b32 v1, s3, 32 3358; GFX1064-NEXT: v_readlane_b32 s3, v2, 63 3359; GFX1064-NEXT: v_writelane_b32 v1, s6, 48 3360; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 3361; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3362; GFX1064-NEXT: ; implicit-def: $vgpr0 3363; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 3364; GFX1064-NEXT: ; mask branch BB16_2 3365; GFX1064-NEXT: s_cbranch_execz BB16_2 3366; GFX1064-NEXT: BB16_1: 3367; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 3368; GFX1064-NEXT: v_mov_b32_e32 v7, s3 3369; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3370; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 3371; GFX1064-NEXT: ds_xor_rtn_b32 v0, v0, v7 3372; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3373; GFX1064-NEXT: buffer_gl0_inv 3374; GFX1064-NEXT: buffer_gl1_inv 3375; GFX1064-NEXT: BB16_2: 3376; 
GFX1064-NEXT: v_nop 3377; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 3378; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 3379; GFX1064-NEXT: v_mov_b32_e32 v0, v1 3380; GFX1064-NEXT: v_xor_b32_e32 v0, s3, v0 3381; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 3382; GFX1064-NEXT: s_nop 1 3383; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3384; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 3385; GFX1064-NEXT: s_endpgm 3386; 3387; GFX1032-LABEL: xor_i32_varying: 3388; GFX1032: ; %bb.0: ; %entry 3389; GFX1032-NEXT: ; implicit-def: $vcc_hi 3390; GFX1032-NEXT: v_mov_b32_e32 v2, v0 3391; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3392; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3393; GFX1032-NEXT: v_mov_b32_e32 v1, 0 3394; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3395; GFX1032-NEXT: v_cmp_ne_u32_e64 s2, 1, 0 3396; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 3397; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 3398; GFX1032-NEXT: v_mov_b32_e32 v2, 0 3399; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 3400; GFX1032-NEXT: s_or_saveexec_b32 s4, -1 3401; GFX1032-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 3402; GFX1032-NEXT: s_mov_b32 s2, -1 3403; GFX1032-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 3404; GFX1032-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 3405; GFX1032-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 3406; GFX1032-NEXT: v_mov_b32_e32 v3, v2 3407; GFX1032-NEXT: v_permlanex16_b32 v3, v3, -1, -1 3408; GFX1032-NEXT: v_xor_b32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 3409; GFX1032-NEXT: v_readlane_b32 s3, v2, 31 3410; GFX1032-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3411; GFX1032-NEXT: v_readlane_b32 s5, v2, 15 3412; GFX1032-NEXT: v_writelane_b32 v1, s5, 16 3413; GFX1032-NEXT: s_mov_b32 exec_lo, s4 3414; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 3415; GFX1032-NEXT: ; implicit-def: $vgpr0 3416; 
GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo 3417; GFX1032-NEXT: ; mask branch BB16_2 3418; GFX1032-NEXT: s_cbranch_execz BB16_2 3419; GFX1032-NEXT: BB16_1: 3420; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 3421; GFX1032-NEXT: v_mov_b32_e32 v7, s3 3422; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3423; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 3424; GFX1032-NEXT: ds_xor_rtn_b32 v0, v0, v7 3425; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3426; GFX1032-NEXT: buffer_gl0_inv 3427; GFX1032-NEXT: buffer_gl1_inv 3428; GFX1032-NEXT: BB16_2: 3429; GFX1032-NEXT: v_nop 3430; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 3431; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 3432; GFX1032-NEXT: v_mov_b32_e32 v0, v1 3433; GFX1032-NEXT: v_xor_b32_e32 v0, s3, v0 3434; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 3435; GFX1032-NEXT: s_nop 1 3436; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3437; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 3438; GFX1032-NEXT: s_endpgm 3439entry: 3440 %lane = call i32 @llvm.amdgcn.workitem.id.x() 3441 %old = atomicrmw xor i32 addrspace(3)* @local_var32, i32 %lane acq_rel 3442 store i32 %old, i32 addrspace(1)* %out 3443 ret void 3444} 3445 3446; The hand-written GFX8MORE32/GFX8MORE patterns formerly here were never run, because no RUN line enables those check prefixes. 3447; Their intent, now covered by the autogenerated checks below: v_readlane_b32 reads the reduced value into an SGPR (lane 31 for GFX1032, lane 63 for GFX8, GFX9 and GFX1064), v_mov_b32 copies that SGPR into a VGPR, 3448; and that VGPR is the data operand of ds_max_rtn_i32. 3449define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) { 3450; 3451; 3452; GFX7LESS-LABEL: max_i32_varying: 3453; GFX7LESS: ; %bb.0: ; %entry 3454; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 3455; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 3456; GFX7LESS-NEXT: s_mov_b32 m0, -1 3457; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3458; GFX7LESS-NEXT: ds_max_rtn_i32 v0, v1, v0 3459; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3460; GFX7LESS-NEXT: buffer_wbinvl1 3461; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 3462; GFX7LESS-NEXT: s_mov_b32 s2, -1 3463; GFX7LESS-NEXT: 
buffer_store_dword v0, off, s[0:3], 0 3464; GFX7LESS-NEXT: s_endpgm 3465; 3466; GFX8-LABEL: max_i32_varying: 3467; GFX8: ; %bb.0: ; %entry 3468; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3469; GFX8-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 3470; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, s2, 0 3471; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, s3, v3 3472; GFX8-NEXT: v_mov_b32_e32 v2, v0 3473; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 3474; GFX8-NEXT: v_bfrev_b32_e32 v1, 1 3475; GFX8-NEXT: s_mov_b64 exec, s[2:3] 3476; GFX8-NEXT: s_not_b64 exec, exec 3477; GFX8-NEXT: v_mov_b32_e32 v2, v1 3478; GFX8-NEXT: s_not_b64 exec, exec 3479; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 3480; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3481; GFX8-NEXT: s_nop 1 3482; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 3483; GFX8-NEXT: s_nop 1 3484; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 3485; GFX8-NEXT: s_nop 1 3486; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 3487; GFX8-NEXT: s_nop 1 3488; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 3489; GFX8-NEXT: s_nop 1 3490; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 3491; GFX8-NEXT: v_readlane_b32 s2, v2, 63 3492; GFX8-NEXT: s_nop 0 3493; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 3494; GFX8-NEXT: s_mov_b64 exec, s[4:5] 3495; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 3496; GFX8-NEXT: ; implicit-def: $vgpr0 3497; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 3498; GFX8-NEXT: ; mask branch BB17_2 3499; GFX8-NEXT: s_cbranch_execz BB17_2 3500; GFX8-NEXT: BB17_1: 3501; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 3502; GFX8-NEXT: v_mov_b32_e32 v3, s2 3503; GFX8-NEXT: s_mov_b32 m0, -1 3504; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3505; GFX8-NEXT: ds_max_rtn_i32 v0, v0, v3 3506; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3507; GFX8-NEXT: buffer_wbinvl1_vol 3508; GFX8-NEXT: 
BB17_2: 3509; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 3510; GFX8-NEXT: v_readfirstlane_b32 s2, v0 3511; GFX8-NEXT: v_mov_b32_e32 v0, v1 3512; GFX8-NEXT: v_max_i32_e32 v0, s2, v0 3513; GFX8-NEXT: s_mov_b32 s3, 0xf000 3514; GFX8-NEXT: s_mov_b32 s2, -1 3515; GFX8-NEXT: s_nop 0 3516; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3517; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 3518; GFX8-NEXT: s_endpgm 3519; 3520; GFX9-LABEL: max_i32_varying: 3521; GFX9: ; %bb.0: ; %entry 3522; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3523; GFX9-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 3524; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, s2, 0 3525; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, s3, v3 3526; GFX9-NEXT: v_mov_b32_e32 v2, v0 3527; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 3528; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 3529; GFX9-NEXT: s_mov_b64 exec, s[2:3] 3530; GFX9-NEXT: s_not_b64 exec, exec 3531; GFX9-NEXT: v_mov_b32_e32 v2, v1 3532; GFX9-NEXT: s_not_b64 exec, exec 3533; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 3534; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3535; GFX9-NEXT: s_nop 1 3536; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 3537; GFX9-NEXT: s_nop 1 3538; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 3539; GFX9-NEXT: s_nop 1 3540; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 3541; GFX9-NEXT: s_nop 1 3542; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 3543; GFX9-NEXT: s_nop 1 3544; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 3545; GFX9-NEXT: v_readlane_b32 s2, v2, 63 3546; GFX9-NEXT: s_nop 0 3547; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 3548; GFX9-NEXT: s_mov_b64 exec, s[4:5] 3549; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 3550; GFX9-NEXT: ; implicit-def: $vgpr0 3551; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 3552; GFX9-NEXT: ; mask branch BB17_2 3553; GFX9-NEXT: s_cbranch_execz BB17_2 3554; GFX9-NEXT: 
BB17_1: 3555; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 3556; GFX9-NEXT: v_mov_b32_e32 v3, s2 3557; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3558; GFX9-NEXT: ds_max_rtn_i32 v0, v0, v3 3559; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3560; GFX9-NEXT: buffer_wbinvl1_vol 3561; GFX9-NEXT: BB17_2: 3562; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 3563; GFX9-NEXT: v_readfirstlane_b32 s2, v0 3564; GFX9-NEXT: v_mov_b32_e32 v0, v1 3565; GFX9-NEXT: v_max_i32_e32 v0, s2, v0 3566; GFX9-NEXT: s_mov_b32 s3, 0xf000 3567; GFX9-NEXT: s_mov_b32 s2, -1 3568; GFX9-NEXT: s_nop 0 3569; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3570; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 3571; GFX9-NEXT: s_endpgm 3572; 3573; GFX1064-LABEL: max_i32_varying: 3574; GFX1064: ; %bb.0: ; %entry 3575; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3576; GFX1064-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 3577; GFX1064-NEXT: v_mov_b32_e32 v2, v0 3578; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v4, s2, 0 3579; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v4, s3, v4 3580; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 3581; GFX1064-NEXT: v_bfrev_b32_e32 v1, 1 3582; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 3583; GFX1064-NEXT: s_not_b64 exec, exec 3584; GFX1064-NEXT: v_mov_b32_e32 v2, v1 3585; GFX1064-NEXT: s_not_b64 exec, exec 3586; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 3587; GFX1064-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3588; GFX1064-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 3589; GFX1064-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 3590; GFX1064-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 3591; GFX1064-NEXT: v_mov_b32_e32 v3, v2 3592; GFX1064-NEXT: v_permlanex16_b32 v3, v3, -1, -1 3593; GFX1064-NEXT: v_max_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 3594; GFX1064-NEXT: v_readlane_b32 s2, v2, 31 3595; GFX1064-NEXT: v_mov_b32_e32 v3, s2 3596; GFX1064-NEXT: v_max_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] 
row_mask:0xc bank_mask:0xf 3597; GFX1064-NEXT: v_readlane_b32 s2, v2, 15 3598; GFX1064-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3599; GFX1064-NEXT: v_readlane_b32 s3, v2, 31 3600; GFX1064-NEXT: v_readlane_b32 s6, v2, 47 3601; GFX1064-NEXT: v_writelane_b32 v1, s2, 16 3602; GFX1064-NEXT: s_mov_b32 s2, -1 3603; GFX1064-NEXT: v_writelane_b32 v1, s3, 32 3604; GFX1064-NEXT: v_readlane_b32 s3, v2, 63 3605; GFX1064-NEXT: v_writelane_b32 v1, s6, 48 3606; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 3607; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 3608; GFX1064-NEXT: ; implicit-def: $vgpr0 3609; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 3610; GFX1064-NEXT: ; mask branch BB17_2 3611; GFX1064-NEXT: s_cbranch_execz BB17_2 3612; GFX1064-NEXT: BB17_1: 3613; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 3614; GFX1064-NEXT: v_mov_b32_e32 v7, s3 3615; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3616; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 3617; GFX1064-NEXT: ds_max_rtn_i32 v0, v0, v7 3618; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3619; GFX1064-NEXT: buffer_gl0_inv 3620; GFX1064-NEXT: buffer_gl1_inv 3621; GFX1064-NEXT: BB17_2: 3622; GFX1064-NEXT: v_nop 3623; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 3624; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 3625; GFX1064-NEXT: v_mov_b32_e32 v0, v1 3626; GFX1064-NEXT: v_max_i32_e32 v0, s3, v0 3627; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 3628; GFX1064-NEXT: s_nop 1 3629; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3630; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 3631; GFX1064-NEXT: s_endpgm 3632; 3633; GFX1032-LABEL: max_i32_varying: 3634; GFX1032: ; %bb.0: ; %entry 3635; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3636; GFX1032-NEXT: v_cmp_ne_u32_e64 s2, 1, 0 3637; GFX1032-NEXT: ; implicit-def: $vcc_hi 3638; GFX1032-NEXT: v_mov_b32_e32 v2, v0 3639; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v4, s2, 0 3640; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3641; GFX1032-NEXT: v_bfrev_b32_e32 v1, 1 3642; GFX1032-NEXT: s_mov_b32 
exec_lo, s2 3643; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 3644; GFX1032-NEXT: v_mov_b32_e32 v2, v1 3645; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 3646; GFX1032-NEXT: s_or_saveexec_b32 s4, -1 3647; GFX1032-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3648; GFX1032-NEXT: s_mov_b32 s2, -1 3649; GFX1032-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 3650; GFX1032-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 3651; GFX1032-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 3652; GFX1032-NEXT: v_mov_b32_e32 v3, v2 3653; GFX1032-NEXT: v_permlanex16_b32 v3, v3, -1, -1 3654; GFX1032-NEXT: v_max_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 3655; GFX1032-NEXT: v_readlane_b32 s3, v2, 31 3656; GFX1032-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3657; GFX1032-NEXT: v_readlane_b32 s5, v2, 15 3658; GFX1032-NEXT: v_writelane_b32 v1, s5, 16 3659; GFX1032-NEXT: s_mov_b32 exec_lo, s4 3660; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 3661; GFX1032-NEXT: ; implicit-def: $vgpr0 3662; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo 3663; GFX1032-NEXT: ; mask branch BB17_2 3664; GFX1032-NEXT: s_cbranch_execz BB17_2 3665; GFX1032-NEXT: BB17_1: 3666; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 3667; GFX1032-NEXT: v_mov_b32_e32 v7, s3 3668; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3669; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 3670; GFX1032-NEXT: ds_max_rtn_i32 v0, v0, v7 3671; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3672; GFX1032-NEXT: buffer_gl0_inv 3673; GFX1032-NEXT: buffer_gl1_inv 3674; GFX1032-NEXT: BB17_2: 3675; GFX1032-NEXT: v_nop 3676; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 3677; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 3678; GFX1032-NEXT: v_mov_b32_e32 v0, v1 3679; GFX1032-NEXT: v_max_i32_e32 v0, s3, v0 3680; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 3681; GFX1032-NEXT: s_nop 1 3682; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3683; GFX1032-NEXT: 
buffer_store_dword v0, off, s[0:3], 0 3684; GFX1032-NEXT: s_endpgm 3685entry: 3686 %lane = call i32 @llvm.amdgcn.workitem.id.x() 3687 %old = atomicrmw max i32 addrspace(3)* @local_var32, i32 %lane acq_rel 3688 store i32 %old, i32 addrspace(1)* %out 3689 ret void 3690} 3691 3692define amdgpu_kernel void @max_i64_constant(i64 addrspace(1)* %out) { 3693; 3694; 3695; GFX7LESS-LABEL: max_i64_constant: 3696; GFX7LESS: ; %bb.0: ; %entry 3697; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 3698; GFX7LESS-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 3699; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 3700; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 3701; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3702; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 3703; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc 3704; GFX7LESS-NEXT: ; mask branch BB18_2 3705; GFX7LESS-NEXT: s_cbranch_execz BB18_2 3706; GFX7LESS-NEXT: BB18_1: 3707; GFX7LESS-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 3708; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5 3709; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 3710; GFX7LESS-NEXT: s_mov_b32 m0, -1 3711; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3712; GFX7LESS-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] 3713; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3714; GFX7LESS-NEXT: buffer_wbinvl1 3715; GFX7LESS-NEXT: BB18_2: 3716; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] 3717; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 3718; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 3719; GFX7LESS-NEXT: v_bfrev_b32_e32 v1, 1 3720; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 3721; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 3722; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc 3723; GFX7LESS-NEXT: v_mov_b32_e32 v2, s5 3724; GFX7LESS-NEXT: v_mov_b32_e32 v3, s4 3725; GFX7LESS-NEXT: v_cmp_gt_i64_e32 vcc, s[4:5], v[0:1] 3726; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 3727; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 3728; GFX7LESS-NEXT: s_mov_b32 s2, -1 3729; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3730; 
GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 3731; GFX7LESS-NEXT: s_endpgm 3732; 3733; GFX8-LABEL: max_i64_constant: 3734; GFX8: ; %bb.0: ; %entry 3735; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3736; GFX8-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 3737; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 3738; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 3739; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3740; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 3741; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 3742; GFX8-NEXT: ; mask branch BB18_2 3743; GFX8-NEXT: s_cbranch_execz BB18_2 3744; GFX8-NEXT: BB18_1: 3745; GFX8-NEXT: v_mov_b32_e32 v0, 5 3746; GFX8-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 3747; GFX8-NEXT: v_mov_b32_e32 v1, 0 3748; GFX8-NEXT: s_mov_b32 m0, -1 3749; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3750; GFX8-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] 3751; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3752; GFX8-NEXT: buffer_wbinvl1_vol 3753; GFX8-NEXT: BB18_2: 3754; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 3755; GFX8-NEXT: v_readfirstlane_b32 s2, v0 3756; GFX8-NEXT: v_bfrev_b32_e32 v0, 1 3757; GFX8-NEXT: v_readfirstlane_b32 s3, v1 3758; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc 3759; GFX8-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 3760; GFX8-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1] 3761; GFX8-NEXT: v_mov_b32_e32 v2, s3 3762; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 3763; GFX8-NEXT: v_mov_b32_e32 v2, s2 3764; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 3765; GFX8-NEXT: s_mov_b32 s3, 0xf000 3766; GFX8-NEXT: s_mov_b32 s2, -1 3767; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3768; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 3769; GFX8-NEXT: s_endpgm 3770; 3771; GFX9-LABEL: max_i64_constant: 3772; GFX9: ; %bb.0: ; %entry 3773; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3774; GFX9-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 3775; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 3776; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 3777; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3778; GFX9-NEXT: ; 
implicit-def: $vgpr0_vgpr1 3779; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 3780; GFX9-NEXT: ; mask branch BB18_2 3781; GFX9-NEXT: s_cbranch_execz BB18_2 3782; GFX9-NEXT: BB18_1: 3783; GFX9-NEXT: v_mov_b32_e32 v0, 5 3784; GFX9-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 3785; GFX9-NEXT: v_mov_b32_e32 v1, 0 3786; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3787; GFX9-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] 3788; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3789; GFX9-NEXT: buffer_wbinvl1_vol 3790; GFX9-NEXT: BB18_2: 3791; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 3792; GFX9-NEXT: v_readfirstlane_b32 s2, v0 3793; GFX9-NEXT: v_bfrev_b32_e32 v0, 1 3794; GFX9-NEXT: v_readfirstlane_b32 s3, v1 3795; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc 3796; GFX9-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 3797; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1] 3798; GFX9-NEXT: v_mov_b32_e32 v2, s3 3799; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 3800; GFX9-NEXT: v_mov_b32_e32 v2, s2 3801; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 3802; GFX9-NEXT: s_mov_b32 s3, 0xf000 3803; GFX9-NEXT: s_mov_b32 s2, -1 3804; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3805; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 3806; GFX9-NEXT: s_endpgm 3807; 3808; GFX1064-LABEL: max_i64_constant: 3809; GFX1064: ; %bb.0: ; %entry 3810; GFX1064-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 3811; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3812; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 3813; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s3, v0 3814; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3815; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 3816; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc 3817; GFX1064-NEXT: ; mask branch BB18_2 3818; GFX1064-NEXT: s_cbranch_execz BB18_2 3819; GFX1064-NEXT: BB18_1: 3820; GFX1064-NEXT: v_mov_b32_e32 v0, 5 3821; GFX1064-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 3822; GFX1064-NEXT: v_mov_b32_e32 v1, 0 3823; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3824; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 
3825; GFX1064-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] 3826; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3827; GFX1064-NEXT: buffer_gl0_inv 3828; GFX1064-NEXT: buffer_gl1_inv 3829; GFX1064-NEXT: BB18_2: 3830; GFX1064-NEXT: v_nop 3831; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] 3832; GFX1064-NEXT: v_readfirstlane_b32 s4, v0 3833; GFX1064-NEXT: v_readfirstlane_b32 s5, v1 3834; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc 3835; GFX1064-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 3836; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 3837; GFX1064-NEXT: s_mov_b32 s2, -1 3838; GFX1064-NEXT: v_cmp_gt_i64_e32 vcc, s[4:5], v[0:1] 3839; GFX1064-NEXT: v_cndmask_b32_e64 v1, v1, s5, vcc 3840; GFX1064-NEXT: v_cndmask_b32_e64 v0, v0, s4, vcc 3841; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3842; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 3843; GFX1064-NEXT: s_endpgm 3844; 3845; GFX1032-LABEL: max_i64_constant: 3846; GFX1032: ; %bb.0: ; %entry 3847; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3848; GFX1032-NEXT: v_cmp_ne_u32_e64 s2, 1, 0 3849; GFX1032-NEXT: ; implicit-def: $vcc_hi 3850; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 3851; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 3852; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 3853; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo 3854; GFX1032-NEXT: ; mask branch BB18_2 3855; GFX1032-NEXT: s_cbranch_execz BB18_2 3856; GFX1032-NEXT: BB18_1: 3857; GFX1032-NEXT: v_mov_b32_e32 v0, 5 3858; GFX1032-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 3859; GFX1032-NEXT: v_mov_b32_e32 v1, 0 3860; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3861; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 3862; GFX1032-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] 3863; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3864; GFX1032-NEXT: buffer_gl0_inv 3865; GFX1032-NEXT: buffer_gl1_inv 3866; GFX1032-NEXT: BB18_2: 3867; GFX1032-NEXT: v_nop 3868; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 3869; GFX1032-NEXT: v_readfirstlane_b32 s4, v0 3870; GFX1032-NEXT: 
v_readfirstlane_b32 s5, v1 3871; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc_lo 3872; GFX1032-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc_lo 3873; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 3874; GFX1032-NEXT: s_mov_b32 s2, -1 3875; GFX1032-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[4:5], v[0:1] 3876; GFX1032-NEXT: v_cndmask_b32_e64 v1, v1, s5, vcc_lo 3877; GFX1032-NEXT: v_cndmask_b32_e64 v0, v0, s4, vcc_lo 3878; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3879; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 3880; GFX1032-NEXT: s_endpgm 3881entry: 3882 %old = atomicrmw max i64 addrspace(3)* @local_var64, i64 5 acq_rel 3883 store i64 %old, i64 addrspace(1)* %out 3884 ret void 3885} 3886 ; NOTE: no RUN line in this file passes GFX8MORE or GFX8MORE32 to FileCheck, so the three hand-written directives that follow are never evaluated; the autogenerated per-target blocks inside the function are the active checks. 3887; GFX8MORE32: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 31 3888; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]] 3889; GFX8MORE: ds_min_rtn_i32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]] 3890define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) { 3891; 3892; 3893; GFX7LESS-LABEL: min_i32_varying: 3894; GFX7LESS: ; %bb.0: ; %entry 3895; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 3896; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 3897; GFX7LESS-NEXT: s_mov_b32 m0, -1 3898; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3899; GFX7LESS-NEXT: ds_min_rtn_i32 v0, v1, v0 3900; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3901; GFX7LESS-NEXT: buffer_wbinvl1 3902; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 3903; GFX7LESS-NEXT: s_mov_b32 s2, -1 3904; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 3905; GFX7LESS-NEXT: s_endpgm 3906; 3907; GFX8-LABEL: min_i32_varying: 3908; GFX8: ; %bb.0: ; %entry 3909; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3910; GFX8-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 3911; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, s2, 0 3912; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, s3, v3 3913; GFX8-NEXT: v_mov_b32_e32 v2, v0 3914; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 3915; GFX8-NEXT: v_bfrev_b32_e32 v1, -2 3916; GFX8-NEXT: s_mov_b64 exec, 
s[2:3] 3917; GFX8-NEXT: s_not_b64 exec, exec 3918; GFX8-NEXT: v_mov_b32_e32 v2, v1 3919; GFX8-NEXT: s_not_b64 exec, exec 3920; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 3921; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3922; GFX8-NEXT: s_nop 1 3923; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 3924; GFX8-NEXT: s_nop 1 3925; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 3926; GFX8-NEXT: s_nop 1 3927; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 3928; GFX8-NEXT: s_nop 1 3929; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 3930; GFX8-NEXT: s_nop 1 3931; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 3932; GFX8-NEXT: v_readlane_b32 s2, v2, 63 3933; GFX8-NEXT: s_nop 0 3934; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 3935; GFX8-NEXT: s_mov_b64 exec, s[4:5] 3936; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 3937; GFX8-NEXT: ; implicit-def: $vgpr0 3938; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 3939; GFX8-NEXT: ; mask branch BB19_2 3940; GFX8-NEXT: s_cbranch_execz BB19_2 3941; GFX8-NEXT: BB19_1: 3942; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 3943; GFX8-NEXT: v_mov_b32_e32 v3, s2 3944; GFX8-NEXT: s_mov_b32 m0, -1 3945; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3946; GFX8-NEXT: ds_min_rtn_i32 v0, v0, v3 3947; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3948; GFX8-NEXT: buffer_wbinvl1_vol 3949; GFX8-NEXT: BB19_2: 3950; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 3951; GFX8-NEXT: v_readfirstlane_b32 s2, v0 3952; GFX8-NEXT: v_mov_b32_e32 v0, v1 3953; GFX8-NEXT: v_min_i32_e32 v0, s2, v0 3954; GFX8-NEXT: s_mov_b32 s3, 0xf000 3955; GFX8-NEXT: s_mov_b32 s2, -1 3956; GFX8-NEXT: s_nop 0 3957; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3958; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 3959; GFX8-NEXT: s_endpgm 3960; 3961; GFX9-LABEL: min_i32_varying: 3962; GFX9: ; %bb.0: ; %entry 3963; GFX9-NEXT: 
s_load_dwordx2 s[0:1], s[0:1], 0x24 3964; GFX9-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 3965; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, s2, 0 3966; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, s3, v3 3967; GFX9-NEXT: v_mov_b32_e32 v2, v0 3968; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 3969; GFX9-NEXT: v_bfrev_b32_e32 v1, -2 3970; GFX9-NEXT: s_mov_b64 exec, s[2:3] 3971; GFX9-NEXT: s_not_b64 exec, exec 3972; GFX9-NEXT: v_mov_b32_e32 v2, v1 3973; GFX9-NEXT: s_not_b64 exec, exec 3974; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 3975; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3976; GFX9-NEXT: s_nop 1 3977; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 3978; GFX9-NEXT: s_nop 1 3979; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 3980; GFX9-NEXT: s_nop 1 3981; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 3982; GFX9-NEXT: s_nop 1 3983; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 3984; GFX9-NEXT: s_nop 1 3985; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 3986; GFX9-NEXT: v_readlane_b32 s2, v2, 63 3987; GFX9-NEXT: s_nop 0 3988; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 3989; GFX9-NEXT: s_mov_b64 exec, s[4:5] 3990; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 3991; GFX9-NEXT: ; implicit-def: $vgpr0 3992; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 3993; GFX9-NEXT: ; mask branch BB19_2 3994; GFX9-NEXT: s_cbranch_execz BB19_2 3995; GFX9-NEXT: BB19_1: 3996; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 3997; GFX9-NEXT: v_mov_b32_e32 v3, s2 3998; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3999; GFX9-NEXT: ds_min_rtn_i32 v0, v0, v3 4000; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4001; GFX9-NEXT: buffer_wbinvl1_vol 4002; GFX9-NEXT: BB19_2: 4003; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 4004; GFX9-NEXT: v_readfirstlane_b32 s2, v0 4005; GFX9-NEXT: v_mov_b32_e32 v0, v1 4006; GFX9-NEXT: v_min_i32_e32 v0, s2, v0 4007; 
GFX9-NEXT: s_mov_b32 s3, 0xf000 4008; GFX9-NEXT: s_mov_b32 s2, -1 4009; GFX9-NEXT: s_nop 0 4010; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4011; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 4012; GFX9-NEXT: s_endpgm 4013; 4014; GFX1064-LABEL: min_i32_varying: 4015; GFX1064: ; %bb.0: ; %entry 4016; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4017; GFX1064-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 4018; GFX1064-NEXT: v_mov_b32_e32 v2, v0 4019; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v4, s2, 0 4020; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v4, s3, v4 4021; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 4022; GFX1064-NEXT: v_bfrev_b32_e32 v1, -2 4023; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 4024; GFX1064-NEXT: s_not_b64 exec, exec 4025; GFX1064-NEXT: v_mov_b32_e32 v2, v1 4026; GFX1064-NEXT: s_not_b64 exec, exec 4027; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 4028; GFX1064-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 4029; GFX1064-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 4030; GFX1064-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 4031; GFX1064-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 4032; GFX1064-NEXT: v_mov_b32_e32 v3, v2 4033; GFX1064-NEXT: v_permlanex16_b32 v3, v3, -1, -1 4034; GFX1064-NEXT: v_min_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 4035; GFX1064-NEXT: v_readlane_b32 s2, v2, 31 4036; GFX1064-NEXT: v_mov_b32_e32 v3, s2 4037; GFX1064-NEXT: v_min_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 4038; GFX1064-NEXT: v_readlane_b32 s2, v2, 15 4039; GFX1064-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf 4040; GFX1064-NEXT: v_readlane_b32 s3, v2, 31 4041; GFX1064-NEXT: v_readlane_b32 s6, v2, 47 4042; GFX1064-NEXT: v_writelane_b32 v1, s2, 16 4043; GFX1064-NEXT: s_mov_b32 s2, -1 4044; GFX1064-NEXT: v_writelane_b32 v1, s3, 32 4045; GFX1064-NEXT: v_readlane_b32 s3, v2, 63 4046; GFX1064-NEXT: v_writelane_b32 v1, s6, 48 4047; 
GFX1064-NEXT: s_mov_b64 exec, s[4:5] 4048; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 4049; GFX1064-NEXT: ; implicit-def: $vgpr0 4050; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 4051; GFX1064-NEXT: ; mask branch BB19_2 4052; GFX1064-NEXT: s_cbranch_execz BB19_2 4053; GFX1064-NEXT: BB19_1: 4054; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 4055; GFX1064-NEXT: v_mov_b32_e32 v7, s3 4056; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4057; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 4058; GFX1064-NEXT: ds_min_rtn_i32 v0, v0, v7 4059; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4060; GFX1064-NEXT: buffer_gl0_inv 4061; GFX1064-NEXT: buffer_gl1_inv 4062; GFX1064-NEXT: BB19_2: 4063; GFX1064-NEXT: v_nop 4064; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 4065; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 4066; GFX1064-NEXT: v_mov_b32_e32 v0, v1 4067; GFX1064-NEXT: v_min_i32_e32 v0, s3, v0 4068; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 4069; GFX1064-NEXT: s_nop 1 4070; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 4071; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 4072; GFX1064-NEXT: s_endpgm 4073; 4074; GFX1032-LABEL: min_i32_varying: 4075; GFX1032: ; %bb.0: ; %entry 4076; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4077; GFX1032-NEXT: v_cmp_ne_u32_e64 s2, 1, 0 4078; GFX1032-NEXT: ; implicit-def: $vcc_hi 4079; GFX1032-NEXT: v_mov_b32_e32 v2, v0 4080; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v4, s2, 0 4081; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 4082; GFX1032-NEXT: v_bfrev_b32_e32 v1, -2 4083; GFX1032-NEXT: s_mov_b32 exec_lo, s2 4084; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 4085; GFX1032-NEXT: v_mov_b32_e32 v2, v1 4086; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 4087; GFX1032-NEXT: s_or_saveexec_b32 s4, -1 4088; GFX1032-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 4089; GFX1032-NEXT: s_mov_b32 s2, -1 4090; GFX1032-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 4091; GFX1032-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 
4092; GFX1032-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 4093; GFX1032-NEXT: v_mov_b32_e32 v3, v2 4094; GFX1032-NEXT: v_permlanex16_b32 v3, v3, -1, -1 4095; GFX1032-NEXT: v_min_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 4096; GFX1032-NEXT: v_readlane_b32 s3, v2, 31 4097; GFX1032-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf 4098; GFX1032-NEXT: v_readlane_b32 s5, v2, 15 4099; GFX1032-NEXT: v_writelane_b32 v1, s5, 16 4100; GFX1032-NEXT: s_mov_b32 exec_lo, s4 4101; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 4102; GFX1032-NEXT: ; implicit-def: $vgpr0 4103; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo 4104; GFX1032-NEXT: ; mask branch BB19_2 4105; GFX1032-NEXT: s_cbranch_execz BB19_2 4106; GFX1032-NEXT: BB19_1: 4107; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 4108; GFX1032-NEXT: v_mov_b32_e32 v7, s3 4109; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4110; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 4111; GFX1032-NEXT: ds_min_rtn_i32 v0, v0, v7 4112; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4113; GFX1032-NEXT: buffer_gl0_inv 4114; GFX1032-NEXT: buffer_gl1_inv 4115; GFX1032-NEXT: BB19_2: 4116; GFX1032-NEXT: v_nop 4117; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 4118; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 4119; GFX1032-NEXT: v_mov_b32_e32 v0, v1 4120; GFX1032-NEXT: v_min_i32_e32 v0, s3, v0 4121; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 4122; GFX1032-NEXT: s_nop 1 4123; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 4124; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 4125; GFX1032-NEXT: s_endpgm 4126entry: 4127 %lane = call i32 @llvm.amdgcn.workitem.id.x() 4128 %old = atomicrmw min i32 addrspace(3)* @local_var32, i32 %lane acq_rel 4129 store i32 %old, i32 addrspace(1)* %out 4130 ret void 4131} 4132 4133define amdgpu_kernel void @min_i64_constant(i64 addrspace(1)* %out) { 4134; 4135; 4136; GFX7LESS-LABEL: min_i64_constant: 4137; GFX7LESS: ; %bb.0: ; %entry 4138; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], 
s[0:1], 0x9 4139; GFX7LESS-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 4140; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 4141; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 4142; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4143; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 4144; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc 4145; GFX7LESS-NEXT: ; mask branch BB20_2 4146; GFX7LESS-NEXT: s_cbranch_execz BB20_2 4147; GFX7LESS-NEXT: BB20_1: 4148; GFX7LESS-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 4149; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5 4150; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 4151; GFX7LESS-NEXT: s_mov_b32 m0, -1 4152; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4153; GFX7LESS-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] 4154; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4155; GFX7LESS-NEXT: buffer_wbinvl1 4156; GFX7LESS-NEXT: BB20_2: 4157; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] 4158; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 4159; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 4160; GFX7LESS-NEXT: v_bfrev_b32_e32 v1, -2 4161; GFX7LESS-NEXT: s_mov_b32 s2, -1 4162; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 4163; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc 4164; GFX7LESS-NEXT: v_mov_b32_e32 v2, s5 4165; GFX7LESS-NEXT: v_mov_b32_e32 v3, s4 4166; GFX7LESS-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] 4167; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 4168; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 4169; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 4170; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 4171; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4172; GFX7LESS-NEXT: s_endpgm 4173; 4174; GFX8-LABEL: min_i64_constant: 4175; GFX8: ; %bb.0: ; %entry 4176; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4177; GFX8-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 4178; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 4179; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 4180; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4181; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 4182; GFX8-NEXT: s_and_saveexec_b64 
s[2:3], vcc 4183; GFX8-NEXT: ; mask branch BB20_2 4184; GFX8-NEXT: s_cbranch_execz BB20_2 4185; GFX8-NEXT: BB20_1: 4186; GFX8-NEXT: v_mov_b32_e32 v0, 5 4187; GFX8-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 4188; GFX8-NEXT: v_mov_b32_e32 v1, 0 4189; GFX8-NEXT: s_mov_b32 m0, -1 4190; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4191; GFX8-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] 4192; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4193; GFX8-NEXT: buffer_wbinvl1_vol 4194; GFX8-NEXT: BB20_2: 4195; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 4196; GFX8-NEXT: v_readfirstlane_b32 s4, v0 4197; GFX8-NEXT: v_bfrev_b32_e32 v0, -2 4198; GFX8-NEXT: v_readfirstlane_b32 s5, v1 4199; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc 4200; GFX8-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 4201; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] 4202; GFX8-NEXT: v_mov_b32_e32 v2, s5 4203; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 4204; GFX8-NEXT: v_mov_b32_e32 v2, s4 4205; GFX8-NEXT: s_mov_b32 s2, -1 4206; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 4207; GFX8-NEXT: s_mov_b32 s3, 0xf000 4208; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4209; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4210; GFX8-NEXT: s_endpgm 4211; 4212; GFX9-LABEL: min_i64_constant: 4213; GFX9: ; %bb.0: ; %entry 4214; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4215; GFX9-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 4216; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 4217; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 4218; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4219; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 4220; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 4221; GFX9-NEXT: ; mask branch BB20_2 4222; GFX9-NEXT: s_cbranch_execz BB20_2 4223; GFX9-NEXT: BB20_1: 4224; GFX9-NEXT: v_mov_b32_e32 v0, 5 4225; GFX9-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 4226; GFX9-NEXT: v_mov_b32_e32 v1, 0 4227; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4228; GFX9-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] 4229; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4230; GFX9-NEXT: 
buffer_wbinvl1_vol 4231; GFX9-NEXT: BB20_2: 4232; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 4233; GFX9-NEXT: v_readfirstlane_b32 s4, v0 4234; GFX9-NEXT: v_bfrev_b32_e32 v0, -2 4235; GFX9-NEXT: v_readfirstlane_b32 s5, v1 4236; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc 4237; GFX9-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 4238; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] 4239; GFX9-NEXT: v_mov_b32_e32 v2, s5 4240; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 4241; GFX9-NEXT: v_mov_b32_e32 v2, s4 4242; GFX9-NEXT: s_mov_b32 s2, -1 4243; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 4244; GFX9-NEXT: s_mov_b32 s3, 0xf000 4245; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4246; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4247; GFX9-NEXT: s_endpgm 4248; 4249; GFX1064-LABEL: min_i64_constant: 4250; GFX1064: ; %bb.0: ; %entry 4251; GFX1064-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 4252; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4253; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 4254; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s3, v0 4255; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4256; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 4257; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc 4258; GFX1064-NEXT: ; mask branch BB20_2 4259; GFX1064-NEXT: s_cbranch_execz BB20_2 4260; GFX1064-NEXT: BB20_1: 4261; GFX1064-NEXT: v_mov_b32_e32 v0, 5 4262; GFX1064-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 4263; GFX1064-NEXT: v_mov_b32_e32 v1, 0 4264; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4265; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 4266; GFX1064-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] 4267; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4268; GFX1064-NEXT: buffer_gl0_inv 4269; GFX1064-NEXT: buffer_gl1_inv 4270; GFX1064-NEXT: BB20_2: 4271; GFX1064-NEXT: v_nop 4272; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] 4273; GFX1064-NEXT: v_readfirstlane_b32 s4, v0 4274; GFX1064-NEXT: v_readfirstlane_b32 s5, v1 4275; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc 4276; GFX1064-NEXT: 
v_cndmask_b32_e64 v0, 5, -1, vcc 4277; GFX1064-NEXT: s_mov_b32 s2, -1 4278; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 4279; GFX1064-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] 4280; GFX1064-NEXT: v_cndmask_b32_e64 v1, v1, s5, vcc 4281; GFX1064-NEXT: v_cndmask_b32_e64 v0, v0, s4, vcc 4282; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 4283; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4284; GFX1064-NEXT: s_endpgm 4285; 4286; GFX1032-LABEL: min_i64_constant: 4287; GFX1032: ; %bb.0: ; %entry 4288; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4289; GFX1032-NEXT: v_cmp_ne_u32_e64 s2, 1, 0 4290; GFX1032-NEXT: ; implicit-def: $vcc_hi 4291; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 4292; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 4293; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 4294; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo 4295; GFX1032-NEXT: ; mask branch BB20_2 4296; GFX1032-NEXT: s_cbranch_execz BB20_2 4297; GFX1032-NEXT: BB20_1: 4298; GFX1032-NEXT: v_mov_b32_e32 v0, 5 4299; GFX1032-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 4300; GFX1032-NEXT: v_mov_b32_e32 v1, 0 4301; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4302; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 4303; GFX1032-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] 4304; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4305; GFX1032-NEXT: buffer_gl0_inv 4306; GFX1032-NEXT: buffer_gl1_inv 4307; GFX1032-NEXT: BB20_2: 4308; GFX1032-NEXT: v_nop 4309; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 4310; GFX1032-NEXT: v_readfirstlane_b32 s4, v0 4311; GFX1032-NEXT: v_readfirstlane_b32 s5, v1 4312; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc_lo 4313; GFX1032-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc_lo 4314; GFX1032-NEXT: s_mov_b32 s2, -1 4315; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 4316; GFX1032-NEXT: v_cmp_lt_i64_e32 vcc_lo, s[4:5], v[0:1] 4317; GFX1032-NEXT: v_cndmask_b32_e64 v1, v1, s5, vcc_lo 4318; GFX1032-NEXT: v_cndmask_b32_e64 v0, v0, s4, vcc_lo 4319; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 4320; GFX1032-NEXT: 
buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4321; GFX1032-NEXT: s_endpgm 4322entry: 4323 %old = atomicrmw min i64 addrspace(3)* @local_var64, i64 5 acq_rel 4324 store i64 %old, i64 addrspace(1)* %out 4325 ret void 4326} 4327 ; NOTE: no RUN line in this file passes GFX8MORE or GFX8MORE32 to FileCheck, so the three hand-written directives that follow are never evaluated; the autogenerated per-target blocks inside the function are the active checks. 4328; GFX8MORE32: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 31 4329; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]] 4330; GFX8MORE: ds_max_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]] 4331define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) { 4332; 4333; 4334; GFX7LESS-LABEL: umax_i32_varying: 4335; GFX7LESS: ; %bb.0: ; %entry 4336; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 4337; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 4338; GFX7LESS-NEXT: s_mov_b32 m0, -1 4339; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4340; GFX7LESS-NEXT: ds_max_rtn_u32 v0, v1, v0 4341; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4342; GFX7LESS-NEXT: buffer_wbinvl1 4343; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 4344; GFX7LESS-NEXT: s_mov_b32 s2, -1 4345; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 4346; GFX7LESS-NEXT: s_endpgm 4347; 4348; GFX8-LABEL: umax_i32_varying: 4349; GFX8: ; %bb.0: ; %entry 4350; GFX8-NEXT: v_mov_b32_e32 v2, v0 4351; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 4352; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4353; GFX8-NEXT: v_mov_b32_e32 v1, 0 4354; GFX8-NEXT: s_mov_b64 exec, s[2:3] 4355; GFX8-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 4356; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 4357; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 4358; GFX8-NEXT: s_not_b64 exec, exec 4359; GFX8-NEXT: v_mov_b32_e32 v2, 0 4360; GFX8-NEXT: s_not_b64 exec, exec 4361; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 4362; GFX8-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 4363; GFX8-NEXT: s_nop 1 4364; GFX8-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 4365; GFX8-NEXT: s_nop 1 4366; GFX8-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf 
bank_mask:0xf bound_ctrl:0 4367; GFX8-NEXT: s_nop 1 4368; GFX8-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 4369; GFX8-NEXT: s_nop 1 4370; GFX8-NEXT: v_max_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 4371; GFX8-NEXT: s_nop 1 4372; GFX8-NEXT: v_max_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 4373; GFX8-NEXT: v_readlane_b32 s2, v2, 63 4374; GFX8-NEXT: s_nop 0 4375; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 4376; GFX8-NEXT: s_mov_b64 exec, s[4:5] 4377; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4378; GFX8-NEXT: ; implicit-def: $vgpr0 4379; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 4380; GFX8-NEXT: ; mask branch BB21_2 4381; GFX8-NEXT: s_cbranch_execz BB21_2 4382; GFX8-NEXT: BB21_1: 4383; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 4384; GFX8-NEXT: v_mov_b32_e32 v3, s2 4385; GFX8-NEXT: s_mov_b32 m0, -1 4386; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4387; GFX8-NEXT: ds_max_rtn_u32 v0, v0, v3 4388; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4389; GFX8-NEXT: buffer_wbinvl1_vol 4390; GFX8-NEXT: BB21_2: 4391; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 4392; GFX8-NEXT: v_readfirstlane_b32 s2, v0 4393; GFX8-NEXT: v_mov_b32_e32 v0, v1 4394; GFX8-NEXT: v_max_u32_e32 v0, s2, v0 4395; GFX8-NEXT: s_mov_b32 s3, 0xf000 4396; GFX8-NEXT: s_mov_b32 s2, -1 4397; GFX8-NEXT: s_nop 0 4398; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4399; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 4400; GFX8-NEXT: s_endpgm 4401; 4402; GFX9-LABEL: umax_i32_varying: 4403; GFX9: ; %bb.0: ; %entry 4404; GFX9-NEXT: v_mov_b32_e32 v2, v0 4405; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 4406; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4407; GFX9-NEXT: v_mov_b32_e32 v1, 0 4408; GFX9-NEXT: s_mov_b64 exec, s[2:3] 4409; GFX9-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 4410; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 4411; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 4412; GFX9-NEXT: s_not_b64 exec, exec 4413; GFX9-NEXT: v_mov_b32_e32 v2, 0 4414; 
GFX9-NEXT: s_not_b64 exec, exec 4415; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 4416; GFX9-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 4417; GFX9-NEXT: s_nop 1 4418; GFX9-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 4419; GFX9-NEXT: s_nop 1 4420; GFX9-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 4421; GFX9-NEXT: s_nop 1 4422; GFX9-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 4423; GFX9-NEXT: s_nop 1 4424; GFX9-NEXT: v_max_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 4425; GFX9-NEXT: s_nop 1 4426; GFX9-NEXT: v_max_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 4427; GFX9-NEXT: v_readlane_b32 s2, v2, 63 4428; GFX9-NEXT: s_nop 0 4429; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 4430; GFX9-NEXT: s_mov_b64 exec, s[4:5] 4431; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4432; GFX9-NEXT: ; implicit-def: $vgpr0 4433; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 4434; GFX9-NEXT: ; mask branch BB21_2 4435; GFX9-NEXT: s_cbranch_execz BB21_2 4436; GFX9-NEXT: BB21_1: 4437; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 4438; GFX9-NEXT: v_mov_b32_e32 v3, s2 4439; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4440; GFX9-NEXT: ds_max_rtn_u32 v0, v0, v3 4441; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4442; GFX9-NEXT: buffer_wbinvl1_vol 4443; GFX9-NEXT: BB21_2: 4444; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 4445; GFX9-NEXT: v_readfirstlane_b32 s2, v0 4446; GFX9-NEXT: v_mov_b32_e32 v0, v1 4447; GFX9-NEXT: v_max_u32_e32 v0, s2, v0 4448; GFX9-NEXT: s_mov_b32 s3, 0xf000 4449; GFX9-NEXT: s_mov_b32 s2, -1 4450; GFX9-NEXT: s_nop 0 4451; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4452; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 4453; GFX9-NEXT: s_endpgm 4454; 4455; GFX1064-LABEL: umax_i32_varying: 4456; GFX1064: ; %bb.0: ; %entry 4457; GFX1064-NEXT: v_mov_b32_e32 v2, v0 4458; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 
4459; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4460; GFX1064-NEXT: v_mov_b32_e32 v1, 0 4461; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 4462; GFX1064-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 4463; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 4464; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s3, v0 4465; GFX1064-NEXT: s_not_b64 exec, exec 4466; GFX1064-NEXT: v_mov_b32_e32 v2, 0 4467; GFX1064-NEXT: s_not_b64 exec, exec 4468; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 4469; GFX1064-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 4470; GFX1064-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 4471; GFX1064-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 4472; GFX1064-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 4473; GFX1064-NEXT: v_mov_b32_e32 v3, v2 4474; GFX1064-NEXT: v_permlanex16_b32 v3, v3, -1, -1 4475; GFX1064-NEXT: v_max_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 4476; GFX1064-NEXT: v_readlane_b32 s2, v2, 31 4477; GFX1064-NEXT: v_mov_b32_e32 v3, s2 4478; GFX1064-NEXT: v_max_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 4479; GFX1064-NEXT: v_readlane_b32 s2, v2, 15 4480; GFX1064-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf 4481; GFX1064-NEXT: v_readlane_b32 s3, v2, 31 4482; GFX1064-NEXT: v_readlane_b32 s6, v2, 47 4483; GFX1064-NEXT: v_writelane_b32 v1, s2, 16 4484; GFX1064-NEXT: s_mov_b32 s2, -1 4485; GFX1064-NEXT: v_writelane_b32 v1, s3, 32 4486; GFX1064-NEXT: v_readlane_b32 s3, v2, 63 4487; GFX1064-NEXT: v_writelane_b32 v1, s6, 48 4488; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 4489; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4490; GFX1064-NEXT: ; implicit-def: $vgpr0 4491; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 4492; GFX1064-NEXT: ; mask branch BB21_2 4493; GFX1064-NEXT: s_cbranch_execz BB21_2 4494; GFX1064-NEXT: BB21_1: 4495; GFX1064-NEXT: v_mov_b32_e32 v0, 
local_var32@abs32@lo 4496; GFX1064-NEXT: v_mov_b32_e32 v7, s3 4497; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4498; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 4499; GFX1064-NEXT: ds_max_rtn_u32 v0, v0, v7 4500; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4501; GFX1064-NEXT: buffer_gl0_inv 4502; GFX1064-NEXT: buffer_gl1_inv 4503; GFX1064-NEXT: BB21_2: 4504; GFX1064-NEXT: v_nop 4505; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 4506; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 4507; GFX1064-NEXT: v_mov_b32_e32 v0, v1 4508; GFX1064-NEXT: v_max_u32_e32 v0, s3, v0 4509; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 4510; GFX1064-NEXT: s_nop 1 4511; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 4512; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 4513; GFX1064-NEXT: s_endpgm 4514; 4515; GFX1032-LABEL: umax_i32_varying: 4516; GFX1032: ; %bb.0: ; %entry 4517; GFX1032-NEXT: ; implicit-def: $vcc_hi 4518; GFX1032-NEXT: v_mov_b32_e32 v2, v0 4519; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 4520; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4521; GFX1032-NEXT: v_mov_b32_e32 v1, 0 4522; GFX1032-NEXT: s_mov_b32 exec_lo, s2 4523; GFX1032-NEXT: v_cmp_ne_u32_e64 s2, 1, 0 4524; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 4525; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 4526; GFX1032-NEXT: v_mov_b32_e32 v2, 0 4527; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 4528; GFX1032-NEXT: s_or_saveexec_b32 s4, -1 4529; GFX1032-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 4530; GFX1032-NEXT: s_mov_b32 s2, -1 4531; GFX1032-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 4532; GFX1032-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 4533; GFX1032-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 4534; GFX1032-NEXT: v_mov_b32_e32 v3, v2 4535; GFX1032-NEXT: v_permlanex16_b32 v3, v3, -1, -1 4536; GFX1032-NEXT: v_max_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 4537; 
GFX1032-NEXT: v_readlane_b32 s3, v2, 31
; GFX1032-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT: v_readlane_b32 s5, v2, 15
; GFX1032-NEXT: v_writelane_b32 v1, s5, 16
; GFX1032-NEXT: s_mov_b32 exec_lo, s4
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: ; implicit-def: $vgpr0
; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
; GFX1032-NEXT: ; mask branch BB21_2
; GFX1032-NEXT: s_cbranch_execz BB21_2
; GFX1032-NEXT: BB21_1:
; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
; GFX1032-NEXT: v_mov_b32_e32 v7, s3
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1032-NEXT: ds_max_rtn_u32 v0, v0, v7
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: buffer_gl1_inv
; GFX1032-NEXT: BB21_2:
; GFX1032-NEXT: v_nop
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX1032-NEXT: v_readfirstlane_b32 s3, v0
; GFX1032-NEXT: v_mov_b32_e32 v0, v1
; GFX1032-NEXT: v_max_u32_e32 v0, s3, v0
; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
; GFX1032-NEXT: s_nop 1
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX1032-NEXT: s_endpgm
entry:
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %old = atomicrmw umax i32 addrspace(3)* @local_var32, i32 %lane acq_rel
  store i32 %old, i32 addrspace(1)* %out
  ret void
}

; Uniform i64 umax on LDS: every lane applies the same constant (5) to
; @local_var64 and stores the returned old value to %out.
; Expected code shape, identical across all five run configurations apart from
; register and wave-size details:
;   * v_mbcnt gives each lane its index among the active lanes; only the lane
;     with index 0 passes s_and_saveexec and issues the single ds_max_rtn_u64.
;   * v_readfirstlane broadcasts the value returned by that atomic.
;   * each lane then rebuilds its own result as umax(broadcast, x), where x is
;     0 for the lane that issued the atomic and 5 for every other lane
;     (v_cndmask on the saved compare result).
; The assertions are autogenerated (see the NOTE at the top of the file);
; regenerate them with the update script instead of editing them by hand.
define amdgpu_kernel void @umax_i64_constant(i64 addrspace(1)* %out) {
;
;
; GFX7LESS-LABEL: umax_i64_constant:
; GFX7LESS: ; %bb.0: ; %entry
; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0
; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0
; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0
; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX7LESS-NEXT: ; mask branch BB22_2
; GFX7LESS-NEXT: s_cbranch_execz BB22_2
; GFX7LESS-NEXT: BB22_1:
; GFX7LESS-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo
; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5
; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0
; GFX7LESS-NEXT: s_mov_b32 m0, -1
; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7LESS-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1]
; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7LESS-NEXT: buffer_wbinvl1
; GFX7LESS-NEXT: BB22_2:
; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0
; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1
; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0
; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4
; GFX7LESS-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[0:1]
; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX7LESS-NEXT: v_mov_b32_e32 v1, s5
; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX7LESS-NEXT: s_endpgm
;
; GFX8-LABEL: umax_i64_constant:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX8-NEXT: ; mask branch BB22_2
; GFX8-NEXT: s_cbranch_execz BB22_2
; GFX8-NEXT: BB22_1:
; GFX8-NEXT: v_mov_b32_e32 v0, 5
; GFX8-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo
; GFX8-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NEXT: s_mov_b32 m0, -1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: BB22_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX8-NEXT: v_readfirstlane_b32 s2, v0
; GFX8-NEXT: v_readfirstlane_b32 s3, v1
; GFX8-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc
; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1]
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: v_mov_b32_e32 v2, s2
; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: umax_i64_constant:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT: ; mask branch BB22_2
; GFX9-NEXT: s_cbranch_execz BB22_2
; GFX9-NEXT: BB22_1:
; GFX9-NEXT: v_mov_b32_e32 v0, 5
; GFX9-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1]
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: BB22_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT: v_readfirstlane_b32 s2, v0
; GFX9-NEXT: v_readfirstlane_b32 s3, v1
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc
; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1]
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_mov_b32_e32 v2, s2
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: umax_i64_constant:
; GFX1064: ; %bb.0: ; %entry
; GFX1064-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0
; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0
; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s3, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-NEXT: ; mask branch BB22_2
; GFX1064-NEXT: s_cbranch_execz BB22_2
; GFX1064-NEXT: BB22_1:
; GFX1064-NEXT: v_mov_b32_e32 v0, 5
; GFX1064-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo
; GFX1064-NEXT: v_mov_b32_e32 v1, 0
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1064-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1]
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: buffer_gl1_inv
; GFX1064-NEXT: BB22_2:
; GFX1064-NEXT: v_nop
; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX1064-NEXT: v_readfirstlane_b32 s4, v0
; GFX1064-NEXT: v_readfirstlane_b32 s5, v1
; GFX1064-NEXT: v_mov_b32_e32 v1, 0
; GFX1064-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc
; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
; GFX1064-NEXT: s_mov_b32 s2, -1
; GFX1064-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[0:1]
; GFX1064-NEXT: v_cndmask_b32_e64 v0, v0, s4, vcc
; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, s5, vcc
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: umax_i64_constant:
; GFX1032: ; %bb.0: ; %entry
; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1032-NEXT: v_cmp_ne_u32_e64 s2, 1, 0
; GFX1032-NEXT: ; implicit-def: $vcc_hi
; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-NEXT: ; mask branch BB22_2
; GFX1032-NEXT: s_cbranch_execz BB22_2
; GFX1032-NEXT: BB22_1:
; GFX1032-NEXT: v_mov_b32_e32 v0, 5
; GFX1032-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo
; GFX1032-NEXT: v_mov_b32_e32 v1, 0
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1032-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1]
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: buffer_gl1_inv
; GFX1032-NEXT: BB22_2:
; GFX1032-NEXT: v_nop
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2
; GFX1032-NEXT: v_readfirstlane_b32 s4, v0
; GFX1032-NEXT: v_readfirstlane_b32 s5, v1
; GFX1032-NEXT: v_mov_b32_e32 v1, 0
; GFX1032-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc_lo
; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
; GFX1032-NEXT: s_mov_b32 s2, -1
; GFX1032-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX1032-NEXT: v_cndmask_b32_e64 v0, v0, s4, vcc_lo
; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, s5, vcc_lo
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX1032-NEXT: s_endpgm
entry:
  %old = atomicrmw umax i64 addrspace(3)* @local_var64, i64 5 acq_rel
  store i64 %old, i64 addrspace(1)* %out
  ret void
}

; Divergent i32 umin on LDS: each lane contributes its workitem id.
; For the gfx8 and newer run lines the generated assertions in this function
; expect the wave-wide minimum to be read from the last lane with v_readlane
; (lane 63 in wave64, lane 31 in wave32), copied into a VGPR with v_mov_b32,
; and used as the data operand of a single ds_min_rtn_u32.
define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) {
; Expected code for the per-lane umin (autogenerated assertions; regenerate
; with the update script rather than editing by hand):
;   * the default-cpu run issues ds_min_rtn_u32 directly with the incoming v0
;     as the data operand, with no cross-lane reduction;
;   * the gfx8 and newer runs fill inactive lanes with -1, combine lanes with
;     v_min_u32_dpp steps, let the lane whose v_mbcnt index is 0 issue one
;     ds_min_rtn_u32, broadcast the returned value with v_readfirstlane, and
;     finish with v_min_u32 against v1 (the scan result shifted by one lane,
;     seeded with -1).
;
;
; GFX7LESS-LABEL: umin_i32_varying:
; GFX7LESS: ; %bb.0: ; %entry
; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo
; GFX7LESS-NEXT: s_mov_b32 m0, -1
; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7LESS-NEXT: ds_min_rtn_u32 v0, v1, v0
; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7LESS-NEXT: buffer_wbinvl1
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX7LESS-NEXT: s_endpgm
;
; GFX8-LABEL: umin_i32_varying:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, s2, 0
; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, s3, v3
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX8-NEXT: v_mov_b32_e32 v1, -1
; GFX8-NEXT: s_mov_b64 exec, s[2:3]
; GFX8-NEXT: s_not_b64 exec, exec
; GFX8-NEXT: v_mov_b32_e32 v2, -1
; GFX8-NEXT: s_not_b64 exec, exec
; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX8-NEXT: s_nop 1
; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX8-NEXT: s_nop 1
; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX8-NEXT: s_nop 1
; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX8-NEXT: s_nop 1
; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
; GFX8-NEXT: s_nop 1
; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
; GFX8-NEXT: v_readlane_b32 s2, v2, 63
; GFX8-NEXT: s_nop 0
; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX8-NEXT: s_mov_b64 exec, s[4:5]
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
; GFX8-NEXT: ; implicit-def: $vgpr0
; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX8-NEXT: ; mask branch BB23_2
; GFX8-NEXT: s_cbranch_execz BB23_2
; GFX8-NEXT: BB23_1:
; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
; GFX8-NEXT: v_mov_b32_e32 v3, s2
; GFX8-NEXT: s_mov_b32 m0, -1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: ds_min_rtn_u32 v0, v0, v3
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: BB23_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: v_readfirstlane_b32 s2, v0
; GFX8-NEXT: v_mov_b32_e32 v0, v1
; GFX8-NEXT: v_min_u32_e32 v0, s2, v0
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
; GFX8-NEXT: s_nop 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: umin_i32_varying:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, s2, 0
; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, s3, v3
; GFX9-NEXT: v_mov_b32_e32 v2, v0
; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX9-NEXT: v_mov_b32_e32 v1, -1
; GFX9-NEXT: s_mov_b64 exec, s[2:3]
; GFX9-NEXT: s_not_b64 exec, exec
; GFX9-NEXT: v_mov_b32_e32 v2, -1
; GFX9-NEXT: s_not_b64 exec, exec
; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX9-NEXT: s_nop 1
; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX9-NEXT: s_nop 1
; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX9-NEXT: s_nop 1
; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX9-NEXT: s_nop 1
; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
; GFX9-NEXT: s_nop 1
; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
; GFX9-NEXT: v_readlane_b32 s2, v2, 63
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
; GFX9-NEXT: ; implicit-def: $vgpr0
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: ; mask branch BB23_2
; GFX9-NEXT: s_cbranch_execz BB23_2
; GFX9-NEXT: BB23_1:
; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
; GFX9-NEXT: v_mov_b32_e32 v3, s2
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: ds_min_rtn_u32 v0, v0, v3
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: BB23_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: v_readfirstlane_b32 s2, v0
; GFX9-NEXT: v_mov_b32_e32 v0, v1
; GFX9-NEXT: v_min_u32_e32 v0, s2, v0
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: umin_i32_varying:
; GFX1064: ; %bb.0: ; %entry
; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1064-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0
; GFX1064-NEXT: v_mov_b32_e32 v2, v0
; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v4, s2, 0
; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v4, s3, v4
; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX1064-NEXT: v_mov_b32_e32 v1, -1
; GFX1064-NEXT: s_mov_b64 exec, s[2:3]
; GFX1064-NEXT: s_not_b64 exec, exec
; GFX1064-NEXT: v_mov_b32_e32 v2, -1
; GFX1064-NEXT: s_not_b64 exec, exec
; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX1064-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1064-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1064-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1064-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX1064-NEXT: v_mov_b32_e32 v3, v2
; GFX1064-NEXT: v_permlanex16_b32 v3, v3, -1, -1
; GFX1064-NEXT: v_min_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1064-NEXT: v_readlane_b32 s2, v2, 31
; GFX1064-NEXT: v_mov_b32_e32 v3, s2
; GFX1064-NEXT: v_min_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
; GFX1064-NEXT: v_readlane_b32 s2, v2, 15
; GFX1064-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1064-NEXT: v_readlane_b32 s3, v2, 31
; GFX1064-NEXT: v_readlane_b32 s6, v2, 47
; GFX1064-NEXT: v_writelane_b32 v1, s2, 16
; GFX1064-NEXT: s_mov_b32 s2, -1
; GFX1064-NEXT: v_writelane_b32 v1, s3, 32
; GFX1064-NEXT: v_readlane_b32 s3, v2, 63
; GFX1064-NEXT: v_writelane_b32 v1, s6, 48
; GFX1064-NEXT: s_mov_b64 exec, s[4:5]
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
; GFX1064-NEXT: ; implicit-def: $vgpr0
; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1064-NEXT: ; mask branch BB23_2
; GFX1064-NEXT: s_cbranch_execz BB23_2
; GFX1064-NEXT: BB23_1:
; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
; GFX1064-NEXT: v_mov_b32_e32 v7, s3
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1064-NEXT: ds_min_rtn_u32 v0, v0, v7
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: buffer_gl1_inv
; GFX1064-NEXT: BB23_2:
; GFX1064-NEXT: v_nop
; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX1064-NEXT: v_readfirstlane_b32 s3, v0
; GFX1064-NEXT: v_mov_b32_e32 v0, v1
; GFX1064-NEXT: v_min_u32_e32 v0, s3, v0
; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
; GFX1064-NEXT: s_nop 1
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: umin_i32_varying:
; GFX1032: ; %bb.0: ; %entry
; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1032-NEXT: v_cmp_ne_u32_e64 s2, 1, 0
; GFX1032-NEXT: ; implicit-def: $vcc_hi
; GFX1032-NEXT: v_mov_b32_e32 v2, v0
; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v4, s2, 0
; GFX1032-NEXT: s_or_saveexec_b32 s2, -1
; GFX1032-NEXT: v_mov_b32_e32 v1, -1
; GFX1032-NEXT: s_mov_b32 exec_lo, s2
; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1032-NEXT: v_mov_b32_e32 v2, -1
; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1032-NEXT: s_or_saveexec_b32 s4, -1
; GFX1032-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT: s_mov_b32 s2, -1
; GFX1032-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT: v_mov_b32_e32 v3, v2
; GFX1032-NEXT: v_permlanex16_b32 v3, v3, -1, -1
; GFX1032-NEXT: v_min_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1032-NEXT: v_readlane_b32 s3, v2, 31
; GFX1032-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT: v_readlane_b32 s5, v2, 15
; GFX1032-NEXT: v_writelane_b32 v1, s5, 16
; GFX1032-NEXT: s_mov_b32 exec_lo, s4
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4
; GFX1032-NEXT: ; implicit-def: $vgpr0
; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
; GFX1032-NEXT: ; mask branch BB23_2
; GFX1032-NEXT: s_cbranch_execz BB23_2
; GFX1032-NEXT: BB23_1:
; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
; GFX1032-NEXT: v_mov_b32_e32 v7, s3
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1032-NEXT: ds_min_rtn_u32 v0, v0, v7
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: buffer_gl1_inv
; GFX1032-NEXT: BB23_2:
; GFX1032-NEXT: v_nop
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX1032-NEXT: v_readfirstlane_b32 s3, v0
; GFX1032-NEXT: v_mov_b32_e32 v0, v1
; GFX1032-NEXT: v_min_u32_e32 v0, s3, v0
; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
; GFX1032-NEXT: s_nop 1
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX1032-NEXT: s_endpgm
entry:
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %old = atomicrmw umin i32 addrspace(3)* @local_var32, i32 %lane acq_rel
  store i32 %old, i32 addrspace(1)* %out
  ret void
}

; Uniform i64 umin on LDS: every lane applies the same constant (5) to
; @local_var64 and stores the returned old value to %out.
; Expected code shape in all five run configurations: only the lane whose
; v_mbcnt index is 0 issues the single ds_min_rtn_u64; the returned value is
; broadcast with v_readfirstlane; each lane then computes umin(broadcast, x),
; where x is all-ones (-1 in both halves) for the lane that issued the atomic
; and 5 for every other lane (v_cndmask on the saved compare result).
define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) {
;
;
; GFX7LESS-LABEL: umin_i64_constant:
; GFX7LESS: ; %bb.0: ; %entry
; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0
; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0
; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0
; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX7LESS-NEXT: ; mask branch BB24_2
; GFX7LESS-NEXT: s_cbranch_execz BB24_2
; GFX7LESS-NEXT: BB24_1:
; GFX7LESS-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo
; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5
; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0
; GFX7LESS-NEXT: s_mov_b32 m0, -1
; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7LESS-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1]
; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7LESS-NEXT: buffer_wbinvl1
; GFX7LESS-NEXT: BB24_2:
; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0
; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1
; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc
; GFX7LESS-NEXT: v_mov_b32_e32 v2, s5
; GFX7LESS-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1]
; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4
; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX7LESS-NEXT: s_endpgm
;
; GFX8-LABEL: umin_i64_constant:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX8-NEXT: ; mask branch BB24_2
; GFX8-NEXT: s_cbranch_execz BB24_2
; GFX8-NEXT: BB24_1:
; GFX8-NEXT: v_mov_b32_e32 v0, 5
; GFX8-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo
; GFX8-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NEXT: s_mov_b32 m0, -1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: BB24_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX8-NEXT: v_readfirstlane_b32 s5, v1
; GFX8-NEXT: v_readfirstlane_b32 s4, v0
; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc
; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1]
; GFX8-NEXT: v_mov_b32_e32 v2, s5
; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
; GFX8-NEXT: v_mov_b32_e32 v2, s4
; GFX8-NEXT: s_mov_b32 s2, -1
; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: umin_i64_constant:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT: ; mask branch BB24_2
; GFX9-NEXT: s_cbranch_execz BB24_2
; GFX9-NEXT: BB24_1:
; GFX9-NEXT: v_mov_b32_e32 v0, 5
; GFX9-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1]
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: BB24_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT: v_readfirstlane_b32 s5, v1
; GFX9-NEXT: v_readfirstlane_b32 s4, v0
; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc
; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1]
; GFX9-NEXT: v_mov_b32_e32 v2, s5
; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
; GFX9-NEXT: v_mov_b32_e32 v2, s4
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: umin_i64_constant:
; GFX1064: ; %bb.0: ; %entry
; GFX1064-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0
; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0
; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s3, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-NEXT: ; mask branch BB24_2
; GFX1064-NEXT: s_cbranch_execz BB24_2
; GFX1064-NEXT: BB24_1:
; GFX1064-NEXT: v_mov_b32_e32 v0, 5
; GFX1064-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo
; GFX1064-NEXT: v_mov_b32_e32 v1, 0
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1064-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1]
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: buffer_gl1_inv
; GFX1064-NEXT: BB24_2:
; GFX1064-NEXT: v_nop
; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX1064-NEXT: v_readfirstlane_b32 s4, v0
; GFX1064-NEXT: v_readfirstlane_b32 s5, v1
; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
; GFX1064-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc
; GFX1064-NEXT: s_mov_b32 s2, -1
; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
; GFX1064-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1]
; GFX1064-NEXT: v_cndmask_b32_e64 v1, v1, s5, vcc
; GFX1064-NEXT: v_cndmask_b32_e64 v0, v0, s4, vcc
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: umin_i64_constant:
; GFX1032: ; %bb.0: ; %entry
; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1032-NEXT: v_cmp_ne_u32_e64 s2, 1, 0
; GFX1032-NEXT: ; implicit-def: $vcc_hi
; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-NEXT: ; mask branch BB24_2
; GFX1032-NEXT: s_cbranch_execz BB24_2
; GFX1032-NEXT: BB24_1:
; GFX1032-NEXT: v_mov_b32_e32 v0, 5
; GFX1032-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo
; GFX1032-NEXT: v_mov_b32_e32 v1, 0
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1032-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1]
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: buffer_gl1_inv
; GFX1032-NEXT: BB24_2:
; GFX1032-NEXT: v_nop
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2
; GFX1032-NEXT: v_readfirstlane_b32 s4, v0
; GFX1032-NEXT: v_readfirstlane_b32 s5, v1
; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
; GFX1032-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc_lo
; GFX1032-NEXT: s_mov_b32 s2, -1
; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
; GFX1032-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX1032-NEXT: v_cndmask_b32_e64 v1, v1, s5, vcc_lo
; GFX1032-NEXT: v_cndmask_b32_e64 v0, v0, s4, vcc_lo
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX1032-NEXT: s_endpgm
entry:
  %old = atomicrmw umin i64 addrspace(3)* @local_var64, i64 5 acq_rel
  store i64 %old, i64 addrspace(1)* %out
  ret void
}