; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
;
; Each llc invocation below compiles this file with
; -amdgpu-atomic-optimizations=true and -verify-machineinstrs, then pipes the
; output to FileCheck under its own check prefix
;   GFX7LESS - no -mcpu given
;   GFX8     - -mcpu=tonga
;   GFX9     - -mcpu=gfx900
;   GFX1064  - -mcpu=gfx1010 with +wavefrontsize64 / -wavefrontsize32
;   GFX1032  - -mcpu=gfx1010 with +wavefrontsize32 / -wavefrontsize64
; All invocations except the first also pass -mattr=-flat-for-global.
; The assertion lines are generated output, so regenerate them with the script
; named above instead of editing them by hand.
; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX7LESS %s
; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s
; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064 %s
; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032 %s

; Intrinsic called by the *_varying kernels below; its result is passed
; directly as the operand of their atomicrmw.
declare i32 @llvm.amdgcn.workitem.id.x()

; addrspace(3) globals with undef initializers. @local_var32 is the target of
; every atomicrmw in the i32 kernels that follow. @local_var64 is not referenced
; in the portion of the file reviewed here - presumably it is used by i64
; variants further down, TODO confirm.
@local_var32 = addrspace(3) global i32 undef, align 4
@local_var64 = addrspace(3) global i64 undef, align 8

; Show what the atomic optimization pass will do for local pointers.
14 15define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out) { 16; 17; 18; GFX7LESS-LABEL: add_i32_constant: 19; GFX7LESS: ; %bb.0: ; %entry 20; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec 21; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 22; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 23; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 24; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 25; GFX7LESS-NEXT: ; implicit-def: $vgpr1 26; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc 27; GFX7LESS-NEXT: s_cbranch_execz BB0_2 28; GFX7LESS-NEXT: ; %bb.1: 29; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 30; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 31; GFX7LESS-NEXT: v_mul_u32_u24_e64 v2, s2, 5 32; GFX7LESS-NEXT: s_mov_b32 m0, -1 33; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 34; GFX7LESS-NEXT: ds_add_rtn_u32 v1, v1, v2 35; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 36; GFX7LESS-NEXT: buffer_wbinvl1 37; GFX7LESS-NEXT: BB0_2: 38; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] 39; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v1 40; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 41; GFX7LESS-NEXT: v_mad_u32_u24 v0, v0, 5, s2 42; GFX7LESS-NEXT: s_mov_b32 s2, -1 43; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 44; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 45; GFX7LESS-NEXT: s_endpgm 46; 47; GFX8-LABEL: add_i32_constant: 48; GFX8: ; %bb.0: ; %entry 49; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 50; GFX8-NEXT: s_mov_b64 s[2:3], exec 51; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 52; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 53; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 54; GFX8-NEXT: ; implicit-def: $vgpr1 55; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 56; GFX8-NEXT: s_cbranch_execz BB0_2 57; GFX8-NEXT: ; %bb.1: 58; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 59; GFX8-NEXT: v_mul_u32_u24_e64 v1, s2, 5 60; GFX8-NEXT: v_mov_b32_e32 v2, local_var32@abs32@lo 61; GFX8-NEXT: s_mov_b32 m0, -1 62; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 63; GFX8-NEXT: ds_add_rtn_u32 v1, v2, v1 64; GFX8-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) 65; GFX8-NEXT: buffer_wbinvl1_vol 66; GFX8-NEXT: BB0_2: 67; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 68; GFX8-NEXT: v_readfirstlane_b32 s2, v1 69; GFX8-NEXT: v_mad_u32_u24 v0, v0, 5, s2 70; GFX8-NEXT: s_mov_b32 s3, 0xf000 71; GFX8-NEXT: s_mov_b32 s2, -1 72; GFX8-NEXT: s_waitcnt lgkmcnt(0) 73; GFX8-NEXT: s_nop 0 74; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 75; GFX8-NEXT: s_endpgm 76; 77; GFX9-LABEL: add_i32_constant: 78; GFX9: ; %bb.0: ; %entry 79; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 80; GFX9-NEXT: s_mov_b64 s[2:3], exec 81; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 82; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 83; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 84; GFX9-NEXT: ; implicit-def: $vgpr1 85; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 86; GFX9-NEXT: s_cbranch_execz BB0_2 87; GFX9-NEXT: ; %bb.1: 88; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 89; GFX9-NEXT: v_mul_u32_u24_e64 v1, s2, 5 90; GFX9-NEXT: v_mov_b32_e32 v2, local_var32@abs32@lo 91; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 92; GFX9-NEXT: ds_add_rtn_u32 v1, v2, v1 93; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 94; GFX9-NEXT: buffer_wbinvl1_vol 95; GFX9-NEXT: BB0_2: 96; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 97; GFX9-NEXT: v_readfirstlane_b32 s2, v1 98; GFX9-NEXT: v_mad_u32_u24 v0, v0, 5, s2 99; GFX9-NEXT: s_mov_b32 s3, 0xf000 100; GFX9-NEXT: s_mov_b32 s2, -1 101; GFX9-NEXT: s_waitcnt lgkmcnt(0) 102; GFX9-NEXT: s_nop 0 103; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 104; GFX9-NEXT: s_endpgm 105; 106; GFX1064-LABEL: add_i32_constant: 107; GFX1064: ; %bb.0: ; %entry 108; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 109; GFX1064-NEXT: s_mov_b64 s[2:3], exec 110; GFX1064-NEXT: ; implicit-def: $vgpr1 111; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 112; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s3, v0 113; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 114; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 115; GFX1064-NEXT: s_cbranch_execz BB0_2 116; GFX1064-NEXT: ; %bb.1: 
117; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 118; GFX1064-NEXT: v_mov_b32_e32 v2, local_var32@abs32@lo 119; GFX1064-NEXT: v_mul_u32_u24_e64 v1, s2, 5 120; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 121; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 122; GFX1064-NEXT: ds_add_rtn_u32 v1, v2, v1 123; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 124; GFX1064-NEXT: buffer_gl0_inv 125; GFX1064-NEXT: buffer_gl1_inv 126; GFX1064-NEXT: BB0_2: 127; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 128; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 129; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 130; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 131; GFX1064-NEXT: v_mad_u32_u24 v0, v0, 5, s2 132; GFX1064-NEXT: s_mov_b32 s2, -1 133; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 134; GFX1064-NEXT: s_nop 0 135; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 136; GFX1064-NEXT: s_endpgm 137; 138; GFX1032-LABEL: add_i32_constant: 139; GFX1032: ; %bb.0: ; %entry 140; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 141; GFX1032-NEXT: s_mov_b32 s2, exec_lo 142; GFX1032-NEXT: ; implicit-def: $vcc_hi 143; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 144; GFX1032-NEXT: ; implicit-def: $vgpr1 145; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 146; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 147; GFX1032-NEXT: s_cbranch_execz BB0_2 148; GFX1032-NEXT: ; %bb.1: 149; GFX1032-NEXT: s_bcnt1_i32_b32 s2, s2 150; GFX1032-NEXT: v_mov_b32_e32 v2, local_var32@abs32@lo 151; GFX1032-NEXT: v_mul_u32_u24_e64 v1, s2, 5 152; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 153; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 154; GFX1032-NEXT: ds_add_rtn_u32 v1, v2, v1 155; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 156; GFX1032-NEXT: buffer_gl0_inv 157; GFX1032-NEXT: buffer_gl1_inv 158; GFX1032-NEXT: BB0_2: 159; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 160; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 161; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 162; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 163; GFX1032-NEXT: v_mad_u32_u24 v0, v0, 5, s2 164; 
GFX1032-NEXT: s_mov_b32 s2, -1 165; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 166; GFX1032-NEXT: s_nop 0 167; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 168; GFX1032-NEXT: s_endpgm 169entry: 170 %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 5 acq_rel 171 store i32 %old, i32 addrspace(1)* %out 172 ret void 173} 174 175define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 %additive) { 176; 177; 178; GFX7LESS-LABEL: add_i32_uniform: 179; GFX7LESS: ; %bb.0: ; %entry 180; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec 181; GFX7LESS-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 182; GFX7LESS-NEXT: s_load_dword s2, s[0:1], 0xb 183; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 184; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s7, v0 185; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 186; GFX7LESS-NEXT: ; implicit-def: $vgpr1 187; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc 188; GFX7LESS-NEXT: s_cbranch_execz BB1_2 189; GFX7LESS-NEXT: ; %bb.1: 190; GFX7LESS-NEXT: s_bcnt1_i32_b64 s3, s[6:7] 191; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 192; GFX7LESS-NEXT: s_mul_i32 s3, s2, s3 193; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 194; GFX7LESS-NEXT: v_mov_b32_e32 v2, s3 195; GFX7LESS-NEXT: s_mov_b32 m0, -1 196; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 197; GFX7LESS-NEXT: ds_add_rtn_u32 v1, v1, v2 198; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 199; GFX7LESS-NEXT: buffer_wbinvl1 200; GFX7LESS-NEXT: BB1_2: 201; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1] 202; GFX7LESS-NEXT: v_readfirstlane_b32 s0, v1 203; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 204; GFX7LESS-NEXT: v_mul_lo_u32 v0, s2, v0 205; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 206; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s0, v0 207; GFX7LESS-NEXT: s_mov_b32 s6, -1 208; GFX7LESS-NEXT: buffer_store_dword v0, off, s[4:7], 0 209; GFX7LESS-NEXT: s_endpgm 210; 211; GFX8-LABEL: add_i32_uniform: 212; GFX8: ; %bb.0: ; %entry 213; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 214; GFX8-NEXT: s_load_dword s0, s[0:1], 0x2c 
215; GFX8-NEXT: s_mov_b64 s[2:3], exec 216; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 217; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 218; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 219; GFX8-NEXT: ; implicit-def: $vgpr1 220; GFX8-NEXT: s_and_saveexec_b64 s[6:7], vcc 221; GFX8-NEXT: s_cbranch_execz BB1_2 222; GFX8-NEXT: ; %bb.1: 223; GFX8-NEXT: s_bcnt1_i32_b64 s1, s[2:3] 224; GFX8-NEXT: s_waitcnt lgkmcnt(0) 225; GFX8-NEXT: s_mul_i32 s1, s0, s1 226; GFX8-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 227; GFX8-NEXT: v_mov_b32_e32 v2, s1 228; GFX8-NEXT: s_mov_b32 m0, -1 229; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 230; GFX8-NEXT: ds_add_rtn_u32 v1, v1, v2 231; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 232; GFX8-NEXT: buffer_wbinvl1_vol 233; GFX8-NEXT: BB1_2: 234; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] 235; GFX8-NEXT: s_waitcnt lgkmcnt(0) 236; GFX8-NEXT: v_mul_lo_u32 v0, s0, v0 237; GFX8-NEXT: v_readfirstlane_b32 s0, v1 238; GFX8-NEXT: s_mov_b32 s7, 0xf000 239; GFX8-NEXT: s_mov_b32 s6, -1 240; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 241; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0 242; GFX8-NEXT: s_endpgm 243; 244; GFX9-LABEL: add_i32_uniform: 245; GFX9: ; %bb.0: ; %entry 246; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 247; GFX9-NEXT: s_load_dword s0, s[0:1], 0x2c 248; GFX9-NEXT: s_mov_b64 s[2:3], exec 249; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 250; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 251; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 252; GFX9-NEXT: ; implicit-def: $vgpr1 253; GFX9-NEXT: s_and_saveexec_b64 s[6:7], vcc 254; GFX9-NEXT: s_cbranch_execz BB1_2 255; GFX9-NEXT: ; %bb.1: 256; GFX9-NEXT: s_bcnt1_i32_b64 s1, s[2:3] 257; GFX9-NEXT: s_waitcnt lgkmcnt(0) 258; GFX9-NEXT: s_mul_i32 s1, s0, s1 259; GFX9-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 260; GFX9-NEXT: v_mov_b32_e32 v2, s1 261; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 262; GFX9-NEXT: ds_add_rtn_u32 v1, v1, v2 263; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 264; GFX9-NEXT: buffer_wbinvl1_vol 265; GFX9-NEXT: 
BB1_2: 266; GFX9-NEXT: s_or_b64 exec, exec, s[6:7] 267; GFX9-NEXT: s_waitcnt lgkmcnt(0) 268; GFX9-NEXT: v_mul_lo_u32 v0, s0, v0 269; GFX9-NEXT: v_readfirstlane_b32 s0, v1 270; GFX9-NEXT: s_mov_b32 s7, 0xf000 271; GFX9-NEXT: s_mov_b32 s6, -1 272; GFX9-NEXT: v_add_u32_e32 v0, s0, v0 273; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 274; GFX9-NEXT: s_endpgm 275; 276; GFX1064-LABEL: add_i32_uniform: 277; GFX1064: ; %bb.0: ; %entry 278; GFX1064-NEXT: s_clause 0x1 279; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 280; GFX1064-NEXT: s_load_dword s0, s[0:1], 0x2c 281; GFX1064-NEXT: s_mov_b64 s[2:3], exec 282; GFX1064-NEXT: ; implicit-def: $vgpr1 283; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 284; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s3, v0 285; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 286; GFX1064-NEXT: s_and_saveexec_b64 s[6:7], vcc 287; GFX1064-NEXT: s_cbranch_execz BB1_2 288; GFX1064-NEXT: ; %bb.1: 289; GFX1064-NEXT: s_bcnt1_i32_b64 s1, s[2:3] 290; GFX1064-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 291; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 292; GFX1064-NEXT: s_mul_i32 s1, s0, s1 293; GFX1064-NEXT: v_mov_b32_e32 v2, s1 294; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 295; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 296; GFX1064-NEXT: ds_add_rtn_u32 v1, v1, v2 297; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 298; GFX1064-NEXT: buffer_gl0_inv 299; GFX1064-NEXT: buffer_gl1_inv 300; GFX1064-NEXT: BB1_2: 301; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 302; GFX1064-NEXT: s_or_b64 exec, exec, s[6:7] 303; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 304; GFX1064-NEXT: v_mul_lo_u32 v0, s0, v0 305; GFX1064-NEXT: v_readfirstlane_b32 s0, v1 306; GFX1064-NEXT: s_mov_b32 s7, 0x31016000 307; GFX1064-NEXT: s_mov_b32 s6, -1 308; GFX1064-NEXT: v_add_nc_u32_e32 v0, s0, v0 309; GFX1064-NEXT: buffer_store_dword v0, off, s[4:7], 0 310; GFX1064-NEXT: s_endpgm 311; 312; GFX1032-LABEL: add_i32_uniform: 313; GFX1032: ; %bb.0: ; %entry 314; GFX1032-NEXT: s_clause 0x1 315; GFX1032-NEXT: 
s_load_dwordx2 s[4:5], s[0:1], 0x24 316; GFX1032-NEXT: s_load_dword s0, s[0:1], 0x2c 317; GFX1032-NEXT: s_mov_b32 s2, exec_lo 318; GFX1032-NEXT: ; implicit-def: $vcc_hi 319; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 320; GFX1032-NEXT: ; implicit-def: $vgpr1 321; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 322; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo 323; GFX1032-NEXT: s_cbranch_execz BB1_2 324; GFX1032-NEXT: ; %bb.1: 325; GFX1032-NEXT: s_bcnt1_i32_b32 s2, s2 326; GFX1032-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 327; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 328; GFX1032-NEXT: s_mul_i32 s2, s0, s2 329; GFX1032-NEXT: v_mov_b32_e32 v2, s2 330; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 331; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 332; GFX1032-NEXT: ds_add_rtn_u32 v1, v1, v2 333; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 334; GFX1032-NEXT: buffer_gl0_inv 335; GFX1032-NEXT: buffer_gl1_inv 336; GFX1032-NEXT: BB1_2: 337; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 338; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s1 339; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 340; GFX1032-NEXT: v_mul_lo_u32 v0, s0, v0 341; GFX1032-NEXT: v_readfirstlane_b32 s0, v1 342; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 343; GFX1032-NEXT: s_mov_b32 s6, -1 344; GFX1032-NEXT: v_add_nc_u32_e32 v0, s0, v0 345; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0 346; GFX1032-NEXT: s_endpgm 347entry: 348 %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %additive acq_rel 349 store i32 %old, i32 addrspace(1)* %out 350 ret void 351} 352 353define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out) { 354; 355; 356; GFX7LESS-LABEL: add_i32_varying: 357; GFX7LESS: ; %bb.0: ; %entry 358; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 359; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 360; GFX7LESS-NEXT: s_mov_b32 m0, -1 361; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 362; GFX7LESS-NEXT: ds_add_rtn_u32 v0, v1, v0 363; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 364; GFX7LESS-NEXT: 
buffer_wbinvl1 365; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 366; GFX7LESS-NEXT: s_mov_b32 s2, -1 367; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 368; GFX7LESS-NEXT: s_endpgm 369; 370; GFX8-LABEL: add_i32_varying: 371; GFX8: ; %bb.0: ; %entry 372; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 373; GFX8-NEXT: s_mov_b64 s[2:3], exec 374; GFX8-NEXT: v_mov_b32_e32 v2, v0 375; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 376; GFX8-NEXT: v_mov_b32_e32 v1, 0 377; GFX8-NEXT: s_mov_b64 exec, s[4:5] 378; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 379; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 380; GFX8-NEXT: s_not_b64 exec, exec 381; GFX8-NEXT: v_mov_b32_e32 v2, 0 382; GFX8-NEXT: s_not_b64 exec, exec 383; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 384; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 385; GFX8-NEXT: s_nop 1 386; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 387; GFX8-NEXT: s_nop 1 388; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 389; GFX8-NEXT: s_nop 1 390; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 391; GFX8-NEXT: s_nop 1 392; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 393; GFX8-NEXT: s_nop 1 394; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 395; GFX8-NEXT: v_readlane_b32 s2, v2, 63 396; GFX8-NEXT: s_nop 0 397; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 398; GFX8-NEXT: s_mov_b64 exec, s[4:5] 399; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 400; GFX8-NEXT: ; implicit-def: $vgpr0 401; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 402; GFX8-NEXT: s_cbranch_execz BB2_2 403; GFX8-NEXT: ; %bb.1: 404; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 405; GFX8-NEXT: v_mov_b32_e32 v3, s2 406; GFX8-NEXT: s_mov_b32 m0, -1 407; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 408; GFX8-NEXT: ds_add_rtn_u32 v0, 
v0, v3 409; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 410; GFX8-NEXT: buffer_wbinvl1_vol 411; GFX8-NEXT: BB2_2: 412; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 413; GFX8-NEXT: v_readfirstlane_b32 s2, v0 414; GFX8-NEXT: v_mov_b32_e32 v0, v1 415; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0 416; GFX8-NEXT: s_mov_b32 s3, 0xf000 417; GFX8-NEXT: s_mov_b32 s2, -1 418; GFX8-NEXT: s_waitcnt lgkmcnt(0) 419; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 420; GFX8-NEXT: s_endpgm 421; 422; GFX9-LABEL: add_i32_varying: 423; GFX9: ; %bb.0: ; %entry 424; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 425; GFX9-NEXT: s_mov_b64 s[2:3], exec 426; GFX9-NEXT: v_mov_b32_e32 v2, v0 427; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 428; GFX9-NEXT: v_mov_b32_e32 v1, 0 429; GFX9-NEXT: s_mov_b64 exec, s[4:5] 430; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 431; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 432; GFX9-NEXT: s_not_b64 exec, exec 433; GFX9-NEXT: v_mov_b32_e32 v2, 0 434; GFX9-NEXT: s_not_b64 exec, exec 435; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 436; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 437; GFX9-NEXT: s_nop 1 438; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 439; GFX9-NEXT: s_nop 1 440; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 441; GFX9-NEXT: s_nop 1 442; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 443; GFX9-NEXT: s_nop 1 444; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 445; GFX9-NEXT: s_nop 1 446; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 447; GFX9-NEXT: v_readlane_b32 s2, v2, 63 448; GFX9-NEXT: s_nop 0 449; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 450; GFX9-NEXT: s_mov_b64 exec, s[4:5] 451; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 452; GFX9-NEXT: ; implicit-def: $vgpr0 453; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 454; 
GFX9-NEXT: s_cbranch_execz BB2_2 455; GFX9-NEXT: ; %bb.1: 456; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 457; GFX9-NEXT: v_mov_b32_e32 v3, s2 458; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 459; GFX9-NEXT: ds_add_rtn_u32 v0, v0, v3 460; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 461; GFX9-NEXT: buffer_wbinvl1_vol 462; GFX9-NEXT: BB2_2: 463; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 464; GFX9-NEXT: v_readfirstlane_b32 s2, v0 465; GFX9-NEXT: v_mov_b32_e32 v0, v1 466; GFX9-NEXT: v_add_u32_e32 v0, s2, v0 467; GFX9-NEXT: s_mov_b32 s3, 0xf000 468; GFX9-NEXT: s_mov_b32 s2, -1 469; GFX9-NEXT: s_waitcnt lgkmcnt(0) 470; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 471; GFX9-NEXT: s_endpgm 472; 473; GFX1064-LABEL: add_i32_varying: 474; GFX1064: ; %bb.0: ; %entry 475; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 476; GFX1064-NEXT: s_mov_b64 s[2:3], exec 477; GFX1064-NEXT: v_mov_b32_e32 v2, v0 478; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 479; GFX1064-NEXT: v_mov_b32_e32 v1, 0 480; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 481; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 482; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s3, v0 483; GFX1064-NEXT: s_not_b64 exec, exec 484; GFX1064-NEXT: v_mov_b32_e32 v2, 0 485; GFX1064-NEXT: s_not_b64 exec, exec 486; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 487; GFX1064-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 488; GFX1064-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 489; GFX1064-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 490; GFX1064-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 491; GFX1064-NEXT: v_mov_b32_e32 v3, v2 492; GFX1064-NEXT: v_permlanex16_b32 v3, v3, -1, -1 493; GFX1064-NEXT: v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 494; GFX1064-NEXT: v_readlane_b32 s2, v2, 31 495; GFX1064-NEXT: v_mov_b32_e32 v3, s2 496; GFX1064-NEXT: v_add_nc_u32_dpp 
v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 497; GFX1064-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf 498; GFX1064-NEXT: v_readlane_b32 s2, v2, 15 499; GFX1064-NEXT: v_readlane_b32 s3, v2, 31 500; GFX1064-NEXT: v_readlane_b32 s6, v2, 47 501; GFX1064-NEXT: v_writelane_b32 v1, s2, 16 502; GFX1064-NEXT: s_mov_b32 s2, -1 503; GFX1064-NEXT: v_writelane_b32 v1, s3, 32 504; GFX1064-NEXT: v_readlane_b32 s3, v2, 63 505; GFX1064-NEXT: v_writelane_b32 v1, s6, 48 506; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 507; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 508; GFX1064-NEXT: ; implicit-def: $vgpr0 509; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 510; GFX1064-NEXT: s_cbranch_execz BB2_2 511; GFX1064-NEXT: ; %bb.1: 512; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo 513; GFX1064-NEXT: v_mov_b32_e32 v4, s3 514; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 515; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 516; GFX1064-NEXT: ds_add_rtn_u32 v0, v7, v4 517; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 518; GFX1064-NEXT: buffer_gl0_inv 519; GFX1064-NEXT: buffer_gl1_inv 520; GFX1064-NEXT: BB2_2: 521; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 522; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 523; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 524; GFX1064-NEXT: v_mov_b32_e32 v0, v1 525; GFX1064-NEXT: v_add_nc_u32_e32 v0, s3, v0 526; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 527; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 528; GFX1064-NEXT: s_nop 0 529; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 530; GFX1064-NEXT: s_endpgm 531; 532; GFX1032-LABEL: add_i32_varying: 533; GFX1032: ; %bb.0: ; %entry 534; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 535; GFX1032-NEXT: s_mov_b32 s2, exec_lo 536; GFX1032-NEXT: ; implicit-def: $vcc_hi 537; GFX1032-NEXT: v_mov_b32_e32 v2, v0 538; GFX1032-NEXT: s_or_saveexec_b32 s3, -1 539; GFX1032-NEXT: v_mov_b32_e32 v1, 0 540; GFX1032-NEXT: s_mov_b32 exec_lo, s3 541; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 542; GFX1032-NEXT: s_not_b32 
exec_lo, exec_lo 543; GFX1032-NEXT: v_mov_b32_e32 v2, 0 544; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 545; GFX1032-NEXT: s_or_saveexec_b32 s4, -1 546; GFX1032-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 547; GFX1032-NEXT: s_mov_b32 s2, -1 548; GFX1032-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 549; GFX1032-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 550; GFX1032-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 551; GFX1032-NEXT: v_mov_b32_e32 v3, v2 552; GFX1032-NEXT: v_permlanex16_b32 v3, v3, -1, -1 553; GFX1032-NEXT: v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 554; GFX1032-NEXT: v_readlane_b32 s3, v2, 31 555; GFX1032-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf 556; GFX1032-NEXT: v_readlane_b32 s5, v2, 15 557; GFX1032-NEXT: v_writelane_b32 v1, s5, 16 558; GFX1032-NEXT: s_mov_b32 exec_lo, s4 559; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 560; GFX1032-NEXT: ; implicit-def: $vgpr0 561; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo 562; GFX1032-NEXT: s_cbranch_execz BB2_2 563; GFX1032-NEXT: ; %bb.1: 564; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo 565; GFX1032-NEXT: v_mov_b32_e32 v4, s3 566; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 567; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 568; GFX1032-NEXT: ds_add_rtn_u32 v0, v7, v4 569; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 570; GFX1032-NEXT: buffer_gl0_inv 571; GFX1032-NEXT: buffer_gl1_inv 572; GFX1032-NEXT: BB2_2: 573; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 574; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 575; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 576; GFX1032-NEXT: v_mov_b32_e32 v0, v1 577; GFX1032-NEXT: v_add_nc_u32_e32 v0, s3, v0 578; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 579; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 580; GFX1032-NEXT: s_nop 0 581; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 582; 
GFX1032-NEXT: s_endpgm 583entry: 584 %lane = call i32 @llvm.amdgcn.workitem.id.x() 585 %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %lane acq_rel 586 store i32 %old, i32 addrspace(1)* %out 587 ret void 588} 589 590define amdgpu_kernel void @add_i32_varying_gfx1032(i32 addrspace(1)* %out) { 591; 592; 593; GFX7LESS-LABEL: add_i32_varying_gfx1032: 594; GFX7LESS: ; %bb.0: ; %entry 595; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 596; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 597; GFX7LESS-NEXT: s_mov_b32 m0, -1 598; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 599; GFX7LESS-NEXT: ds_add_rtn_u32 v0, v1, v0 600; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 601; GFX7LESS-NEXT: buffer_wbinvl1 602; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 603; GFX7LESS-NEXT: s_mov_b32 s2, -1 604; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 605; GFX7LESS-NEXT: s_endpgm 606; 607; GFX8-LABEL: add_i32_varying_gfx1032: 608; GFX8: ; %bb.0: ; %entry 609; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 610; GFX8-NEXT: s_mov_b64 s[2:3], exec 611; GFX8-NEXT: v_mov_b32_e32 v2, v0 612; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 613; GFX8-NEXT: v_mov_b32_e32 v1, 0 614; GFX8-NEXT: s_mov_b64 exec, s[4:5] 615; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 616; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 617; GFX8-NEXT: s_not_b64 exec, exec 618; GFX8-NEXT: v_mov_b32_e32 v2, 0 619; GFX8-NEXT: s_not_b64 exec, exec 620; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 621; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 622; GFX8-NEXT: s_nop 1 623; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 624; GFX8-NEXT: s_nop 1 625; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 626; GFX8-NEXT: s_nop 1 627; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 628; GFX8-NEXT: s_nop 1 629; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 
row_mask:0xa bank_mask:0xf 630; GFX8-NEXT: s_nop 1 631; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 632; GFX8-NEXT: v_readlane_b32 s2, v2, 63 633; GFX8-NEXT: s_nop 0 634; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 635; GFX8-NEXT: s_mov_b64 exec, s[4:5] 636; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 637; GFX8-NEXT: ; implicit-def: $vgpr0 638; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 639; GFX8-NEXT: s_cbranch_execz BB3_2 640; GFX8-NEXT: ; %bb.1: 641; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 642; GFX8-NEXT: v_mov_b32_e32 v3, s2 643; GFX8-NEXT: s_mov_b32 m0, -1 644; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 645; GFX8-NEXT: ds_add_rtn_u32 v0, v0, v3 646; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 647; GFX8-NEXT: buffer_wbinvl1_vol 648; GFX8-NEXT: BB3_2: 649; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 650; GFX8-NEXT: v_readfirstlane_b32 s2, v0 651; GFX8-NEXT: v_mov_b32_e32 v0, v1 652; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0 653; GFX8-NEXT: s_mov_b32 s3, 0xf000 654; GFX8-NEXT: s_mov_b32 s2, -1 655; GFX8-NEXT: s_waitcnt lgkmcnt(0) 656; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 657; GFX8-NEXT: s_endpgm 658; 659; GFX9-LABEL: add_i32_varying_gfx1032: 660; GFX9: ; %bb.0: ; %entry 661; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 662; GFX9-NEXT: s_mov_b64 s[2:3], exec 663; GFX9-NEXT: v_mov_b32_e32 v2, v0 664; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 665; GFX9-NEXT: v_mov_b32_e32 v1, 0 666; GFX9-NEXT: s_mov_b64 exec, s[4:5] 667; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 668; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 669; GFX9-NEXT: s_not_b64 exec, exec 670; GFX9-NEXT: v_mov_b32_e32 v2, 0 671; GFX9-NEXT: s_not_b64 exec, exec 672; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 673; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 674; GFX9-NEXT: s_nop 1 675; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 676; GFX9-NEXT: s_nop 1 677; GFX9-NEXT: 
v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 678; GFX9-NEXT: s_nop 1 679; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 680; GFX9-NEXT: s_nop 1 681; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 682; GFX9-NEXT: s_nop 1 683; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 684; GFX9-NEXT: v_readlane_b32 s2, v2, 63 685; GFX9-NEXT: s_nop 0 686; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 687; GFX9-NEXT: s_mov_b64 exec, s[4:5] 688; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 689; GFX9-NEXT: ; implicit-def: $vgpr0 690; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 691; GFX9-NEXT: s_cbranch_execz BB3_2 692; GFX9-NEXT: ; %bb.1: 693; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 694; GFX9-NEXT: v_mov_b32_e32 v3, s2 695; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 696; GFX9-NEXT: ds_add_rtn_u32 v0, v0, v3 697; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 698; GFX9-NEXT: buffer_wbinvl1_vol 699; GFX9-NEXT: BB3_2: 700; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 701; GFX9-NEXT: v_readfirstlane_b32 s2, v0 702; GFX9-NEXT: v_mov_b32_e32 v0, v1 703; GFX9-NEXT: v_add_u32_e32 v0, s2, v0 704; GFX9-NEXT: s_mov_b32 s3, 0xf000 705; GFX9-NEXT: s_mov_b32 s2, -1 706; GFX9-NEXT: s_waitcnt lgkmcnt(0) 707; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 708; GFX9-NEXT: s_endpgm 709; 710; GFX1064-LABEL: add_i32_varying_gfx1032: 711; GFX1064: ; %bb.0: ; %entry 712; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 713; GFX1064-NEXT: s_mov_b64 s[2:3], exec 714; GFX1064-NEXT: v_mov_b32_e32 v2, v0 715; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 716; GFX1064-NEXT: v_mov_b32_e32 v1, 0 717; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 718; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 719; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s3, v0 720; GFX1064-NEXT: s_not_b64 exec, exec 721; GFX1064-NEXT: v_mov_b32_e32 v2, 0 722; GFX1064-NEXT: s_not_b64 exec, exec 723; GFX1064-NEXT: 
s_or_saveexec_b64 s[4:5], -1 724; GFX1064-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 725; GFX1064-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 726; GFX1064-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 727; GFX1064-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 728; GFX1064-NEXT: v_mov_b32_e32 v3, v2 729; GFX1064-NEXT: v_permlanex16_b32 v3, v3, -1, -1 730; GFX1064-NEXT: v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 731; GFX1064-NEXT: v_readlane_b32 s2, v2, 31 732; GFX1064-NEXT: v_mov_b32_e32 v3, s2 733; GFX1064-NEXT: v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 734; GFX1064-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf 735; GFX1064-NEXT: v_readlane_b32 s2, v2, 15 736; GFX1064-NEXT: v_readlane_b32 s3, v2, 31 737; GFX1064-NEXT: v_readlane_b32 s6, v2, 47 738; GFX1064-NEXT: v_writelane_b32 v1, s2, 16 739; GFX1064-NEXT: s_mov_b32 s2, -1 740; GFX1064-NEXT: v_writelane_b32 v1, s3, 32 741; GFX1064-NEXT: v_readlane_b32 s3, v2, 63 742; GFX1064-NEXT: v_writelane_b32 v1, s6, 48 743; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 744; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 745; GFX1064-NEXT: ; implicit-def: $vgpr0 746; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 747; GFX1064-NEXT: s_cbranch_execz BB3_2 748; GFX1064-NEXT: ; %bb.1: 749; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo 750; GFX1064-NEXT: v_mov_b32_e32 v4, s3 751; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 752; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 753; GFX1064-NEXT: ds_add_rtn_u32 v0, v7, v4 754; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 755; GFX1064-NEXT: buffer_gl0_inv 756; GFX1064-NEXT: buffer_gl1_inv 757; GFX1064-NEXT: BB3_2: 758; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 759; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 760; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 761; GFX1064-NEXT: 
v_mov_b32_e32 v0, v1 762; GFX1064-NEXT: v_add_nc_u32_e32 v0, s3, v0 763; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 764; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 765; GFX1064-NEXT: s_nop 0 766; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 767; GFX1064-NEXT: s_endpgm 768; 769; GFX1032-LABEL: add_i32_varying_gfx1032: 770; GFX1032: ; %bb.0: ; %entry 771; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 772; GFX1032-NEXT: s_mov_b32 s2, exec_lo 773; GFX1032-NEXT: ; implicit-def: $vcc_hi 774; GFX1032-NEXT: v_mov_b32_e32 v2, v0 775; GFX1032-NEXT: s_or_saveexec_b32 s3, -1 776; GFX1032-NEXT: v_mov_b32_e32 v1, 0 777; GFX1032-NEXT: s_mov_b32 exec_lo, s3 778; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 779; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 780; GFX1032-NEXT: v_mov_b32_e32 v2, 0 781; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 782; GFX1032-NEXT: s_or_saveexec_b32 s4, -1 783; GFX1032-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 784; GFX1032-NEXT: s_mov_b32 s2, -1 785; GFX1032-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 786; GFX1032-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 787; GFX1032-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 788; GFX1032-NEXT: v_mov_b32_e32 v3, v2 789; GFX1032-NEXT: v_permlanex16_b32 v3, v3, -1, -1 790; GFX1032-NEXT: v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 791; GFX1032-NEXT: v_readlane_b32 s3, v2, 31 792; GFX1032-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf 793; GFX1032-NEXT: v_readlane_b32 s5, v2, 15 794; GFX1032-NEXT: v_writelane_b32 v1, s5, 16 795; GFX1032-NEXT: s_mov_b32 exec_lo, s4 796; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 797; GFX1032-NEXT: ; implicit-def: $vgpr0 798; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo 799; GFX1032-NEXT: s_cbranch_execz BB3_2 800; GFX1032-NEXT: ; %bb.1: 801; GFX1032-NEXT: v_mov_b32_e32 v7, 
local_var32@abs32@lo 802; GFX1032-NEXT: v_mov_b32_e32 v4, s3 803; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 804; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 805; GFX1032-NEXT: ds_add_rtn_u32 v0, v7, v4 806; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 807; GFX1032-NEXT: buffer_gl0_inv 808; GFX1032-NEXT: buffer_gl1_inv 809; GFX1032-NEXT: BB3_2: 810; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 811; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 812; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 813; GFX1032-NEXT: v_mov_b32_e32 v0, v1 814; GFX1032-NEXT: v_add_nc_u32_e32 v0, s3, v0 815; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 816; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 817; GFX1032-NEXT: s_nop 0 818; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 819; GFX1032-NEXT: s_endpgm 820entry: 821 %lane = call i32 @llvm.amdgcn.workitem.id.x() 822 %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %lane acq_rel 823 store i32 %old, i32 addrspace(1)* %out 824 ret void 825} 826 827define amdgpu_kernel void @add_i32_varying_gfx1064(i32 addrspace(1)* %out) { 828; 829; 830; GFX7LESS-LABEL: add_i32_varying_gfx1064: 831; GFX7LESS: ; %bb.0: ; %entry 832; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 833; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 834; GFX7LESS-NEXT: s_mov_b32 m0, -1 835; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 836; GFX7LESS-NEXT: ds_add_rtn_u32 v0, v1, v0 837; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 838; GFX7LESS-NEXT: buffer_wbinvl1 839; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 840; GFX7LESS-NEXT: s_mov_b32 s2, -1 841; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 842; GFX7LESS-NEXT: s_endpgm 843; 844; GFX8-LABEL: add_i32_varying_gfx1064: 845; GFX8: ; %bb.0: ; %entry 846; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 847; GFX8-NEXT: s_mov_b64 s[2:3], exec 848; GFX8-NEXT: v_mov_b32_e32 v2, v0 849; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 850; GFX8-NEXT: v_mov_b32_e32 v1, 0 851; GFX8-NEXT: s_mov_b64 exec, s[4:5] 852; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 
853; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 854; GFX8-NEXT: s_not_b64 exec, exec 855; GFX8-NEXT: v_mov_b32_e32 v2, 0 856; GFX8-NEXT: s_not_b64 exec, exec 857; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 858; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 859; GFX8-NEXT: s_nop 1 860; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 861; GFX8-NEXT: s_nop 1 862; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 863; GFX8-NEXT: s_nop 1 864; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 865; GFX8-NEXT: s_nop 1 866; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 867; GFX8-NEXT: s_nop 1 868; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 869; GFX8-NEXT: v_readlane_b32 s2, v2, 63 870; GFX8-NEXT: s_nop 0 871; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 872; GFX8-NEXT: s_mov_b64 exec, s[4:5] 873; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 874; GFX8-NEXT: ; implicit-def: $vgpr0 875; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 876; GFX8-NEXT: s_cbranch_execz BB4_2 877; GFX8-NEXT: ; %bb.1: 878; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 879; GFX8-NEXT: v_mov_b32_e32 v3, s2 880; GFX8-NEXT: s_mov_b32 m0, -1 881; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 882; GFX8-NEXT: ds_add_rtn_u32 v0, v0, v3 883; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 884; GFX8-NEXT: buffer_wbinvl1_vol 885; GFX8-NEXT: BB4_2: 886; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 887; GFX8-NEXT: v_readfirstlane_b32 s2, v0 888; GFX8-NEXT: v_mov_b32_e32 v0, v1 889; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0 890; GFX8-NEXT: s_mov_b32 s3, 0xf000 891; GFX8-NEXT: s_mov_b32 s2, -1 892; GFX8-NEXT: s_waitcnt lgkmcnt(0) 893; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 894; GFX8-NEXT: s_endpgm 895; 896; GFX9-LABEL: add_i32_varying_gfx1064: 897; GFX9: ; %bb.0: ; %entry 
898; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 899; GFX9-NEXT: s_mov_b64 s[2:3], exec 900; GFX9-NEXT: v_mov_b32_e32 v2, v0 901; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 902; GFX9-NEXT: v_mov_b32_e32 v1, 0 903; GFX9-NEXT: s_mov_b64 exec, s[4:5] 904; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 905; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 906; GFX9-NEXT: s_not_b64 exec, exec 907; GFX9-NEXT: v_mov_b32_e32 v2, 0 908; GFX9-NEXT: s_not_b64 exec, exec 909; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 910; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 911; GFX9-NEXT: s_nop 1 912; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 913; GFX9-NEXT: s_nop 1 914; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 915; GFX9-NEXT: s_nop 1 916; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 917; GFX9-NEXT: s_nop 1 918; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 919; GFX9-NEXT: s_nop 1 920; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 921; GFX9-NEXT: v_readlane_b32 s2, v2, 63 922; GFX9-NEXT: s_nop 0 923; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 924; GFX9-NEXT: s_mov_b64 exec, s[4:5] 925; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 926; GFX9-NEXT: ; implicit-def: $vgpr0 927; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 928; GFX9-NEXT: s_cbranch_execz BB4_2 929; GFX9-NEXT: ; %bb.1: 930; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 931; GFX9-NEXT: v_mov_b32_e32 v3, s2 932; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 933; GFX9-NEXT: ds_add_rtn_u32 v0, v0, v3 934; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 935; GFX9-NEXT: buffer_wbinvl1_vol 936; GFX9-NEXT: BB4_2: 937; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 938; GFX9-NEXT: v_readfirstlane_b32 s2, v0 939; GFX9-NEXT: v_mov_b32_e32 v0, v1 940; GFX9-NEXT: v_add_u32_e32 v0, s2, v0 941; GFX9-NEXT: s_mov_b32 s3, 0xf000 
942; GFX9-NEXT: s_mov_b32 s2, -1 943; GFX9-NEXT: s_waitcnt lgkmcnt(0) 944; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 945; GFX9-NEXT: s_endpgm 946; 947; GFX1064-LABEL: add_i32_varying_gfx1064: 948; GFX1064: ; %bb.0: ; %entry 949; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 950; GFX1064-NEXT: s_mov_b64 s[2:3], exec 951; GFX1064-NEXT: v_mov_b32_e32 v2, v0 952; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 953; GFX1064-NEXT: v_mov_b32_e32 v1, 0 954; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 955; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 956; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s3, v0 957; GFX1064-NEXT: s_not_b64 exec, exec 958; GFX1064-NEXT: v_mov_b32_e32 v2, 0 959; GFX1064-NEXT: s_not_b64 exec, exec 960; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 961; GFX1064-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 962; GFX1064-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 963; GFX1064-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 964; GFX1064-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 965; GFX1064-NEXT: v_mov_b32_e32 v3, v2 966; GFX1064-NEXT: v_permlanex16_b32 v3, v3, -1, -1 967; GFX1064-NEXT: v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 968; GFX1064-NEXT: v_readlane_b32 s2, v2, 31 969; GFX1064-NEXT: v_mov_b32_e32 v3, s2 970; GFX1064-NEXT: v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 971; GFX1064-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf 972; GFX1064-NEXT: v_readlane_b32 s2, v2, 15 973; GFX1064-NEXT: v_readlane_b32 s3, v2, 31 974; GFX1064-NEXT: v_readlane_b32 s6, v2, 47 975; GFX1064-NEXT: v_writelane_b32 v1, s2, 16 976; GFX1064-NEXT: s_mov_b32 s2, -1 977; GFX1064-NEXT: v_writelane_b32 v1, s3, 32 978; GFX1064-NEXT: v_readlane_b32 s3, v2, 63 979; GFX1064-NEXT: v_writelane_b32 v1, s6, 48 980; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 981; 
GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 982; GFX1064-NEXT: ; implicit-def: $vgpr0 983; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 984; GFX1064-NEXT: s_cbranch_execz BB4_2 985; GFX1064-NEXT: ; %bb.1: 986; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo 987; GFX1064-NEXT: v_mov_b32_e32 v4, s3 988; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 989; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 990; GFX1064-NEXT: ds_add_rtn_u32 v0, v7, v4 991; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 992; GFX1064-NEXT: buffer_gl0_inv 993; GFX1064-NEXT: buffer_gl1_inv 994; GFX1064-NEXT: BB4_2: 995; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 996; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 997; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 998; GFX1064-NEXT: v_mov_b32_e32 v0, v1 999; GFX1064-NEXT: v_add_nc_u32_e32 v0, s3, v0 1000; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 1001; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1002; GFX1064-NEXT: s_nop 0 1003; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 1004; GFX1064-NEXT: s_endpgm 1005; 1006; GFX1032-LABEL: add_i32_varying_gfx1064: 1007; GFX1032: ; %bb.0: ; %entry 1008; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1009; GFX1032-NEXT: s_mov_b32 s2, exec_lo 1010; GFX1032-NEXT: ; implicit-def: $vcc_hi 1011; GFX1032-NEXT: v_mov_b32_e32 v2, v0 1012; GFX1032-NEXT: s_or_saveexec_b32 s3, -1 1013; GFX1032-NEXT: v_mov_b32_e32 v1, 0 1014; GFX1032-NEXT: s_mov_b32 exec_lo, s3 1015; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 1016; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 1017; GFX1032-NEXT: v_mov_b32_e32 v2, 0 1018; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 1019; GFX1032-NEXT: s_or_saveexec_b32 s4, -1 1020; GFX1032-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 1021; GFX1032-NEXT: s_mov_b32 s2, -1 1022; GFX1032-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 1023; GFX1032-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 1024; GFX1032-NEXT: 
v_add_nc_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 1025; GFX1032-NEXT: v_mov_b32_e32 v3, v2 1026; GFX1032-NEXT: v_permlanex16_b32 v3, v3, -1, -1 1027; GFX1032-NEXT: v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 1028; GFX1032-NEXT: v_readlane_b32 s3, v2, 31 1029; GFX1032-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf 1030; GFX1032-NEXT: v_readlane_b32 s5, v2, 15 1031; GFX1032-NEXT: v_writelane_b32 v1, s5, 16 1032; GFX1032-NEXT: s_mov_b32 exec_lo, s4 1033; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 1034; GFX1032-NEXT: ; implicit-def: $vgpr0 1035; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo 1036; GFX1032-NEXT: s_cbranch_execz BB4_2 1037; GFX1032-NEXT: ; %bb.1: 1038; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo 1039; GFX1032-NEXT: v_mov_b32_e32 v4, s3 1040; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1041; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 1042; GFX1032-NEXT: ds_add_rtn_u32 v0, v7, v4 1043; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1044; GFX1032-NEXT: buffer_gl0_inv 1045; GFX1032-NEXT: buffer_gl1_inv 1046; GFX1032-NEXT: BB4_2: 1047; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 1048; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 1049; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 1050; GFX1032-NEXT: v_mov_b32_e32 v0, v1 1051; GFX1032-NEXT: v_add_nc_u32_e32 v0, s3, v0 1052; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 1053; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1054; GFX1032-NEXT: s_nop 0 1055; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 1056; GFX1032-NEXT: s_endpgm 1057entry: 1058 %lane = call i32 @llvm.amdgcn.workitem.id.x() 1059 %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %lane acq_rel 1060 store i32 %old, i32 addrspace(1)* %out 1061 ret void 1062} 1063 1064define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) { 1065; 1066; 1067; GFX7LESS-LABEL: add_i64_constant: 1068; GFX7LESS: ; %bb.0: ; %entry 1069; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec 1070; GFX7LESS-NEXT: 
s_load_dwordx2 s[0:1], s[0:1], 0x9 1071; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 1072; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 1073; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1074; GFX7LESS-NEXT: ; implicit-def: $vgpr1_vgpr2 1075; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc 1076; GFX7LESS-NEXT: s_cbranch_execz BB5_2 1077; GFX7LESS-NEXT: ; %bb.1: 1078; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 1079; GFX7LESS-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 1080; GFX7LESS-NEXT: v_mul_hi_u32_u24_e64 v2, s4, 5 1081; GFX7LESS-NEXT: v_mul_u32_u24_e64 v1, s4, 5 1082; GFX7LESS-NEXT: s_mov_b32 m0, -1 1083; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1084; GFX7LESS-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] 1085; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1086; GFX7LESS-NEXT: buffer_wbinvl1 1087; GFX7LESS-NEXT: BB5_2: 1088; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] 1089; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v1 1090; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v2 1091; GFX7LESS-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v0 1092; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1093; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 1094; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4 1095; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s2, v0 1096; GFX7LESS-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc 1097; GFX7LESS-NEXT: s_mov_b32 s2, -1 1098; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1099; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1100; GFX7LESS-NEXT: s_endpgm 1101; 1102; GFX8-LABEL: add_i64_constant: 1103; GFX8: ; %bb.0: ; %entry 1104; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1105; GFX8-NEXT: s_mov_b64 s[4:5], exec 1106; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 1107; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 1108; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1109; GFX8-NEXT: ; implicit-def: $vgpr1_vgpr2 1110; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 1111; GFX8-NEXT: s_cbranch_execz BB5_2 1112; GFX8-NEXT: ; %bb.1: 1113; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 1114; GFX8-NEXT: v_mul_hi_u32_u24_e64 v2, 
s4, 5 1115; GFX8-NEXT: v_mul_u32_u24_e64 v1, s4, 5 1116; GFX8-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 1117; GFX8-NEXT: s_mov_b32 m0, -1 1118; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1119; GFX8-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] 1120; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1121; GFX8-NEXT: buffer_wbinvl1_vol 1122; GFX8-NEXT: BB5_2: 1123; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 1124; GFX8-NEXT: v_readfirstlane_b32 s2, v1 1125; GFX8-NEXT: v_readfirstlane_b32 s3, v2 1126; GFX8-NEXT: v_mov_b32_e32 v1, s2 1127; GFX8-NEXT: v_mov_b32_e32 v2, s3 1128; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v0, 5, v[1:2] 1129; GFX8-NEXT: s_mov_b32 s3, 0xf000 1130; GFX8-NEXT: s_mov_b32 s2, -1 1131; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1132; GFX8-NEXT: s_nop 1 1133; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1134; GFX8-NEXT: s_endpgm 1135; 1136; GFX9-LABEL: add_i64_constant: 1137; GFX9: ; %bb.0: ; %entry 1138; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1139; GFX9-NEXT: s_mov_b64 s[4:5], exec 1140; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 1141; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 1142; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1143; GFX9-NEXT: ; implicit-def: $vgpr1_vgpr2 1144; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 1145; GFX9-NEXT: s_cbranch_execz BB5_2 1146; GFX9-NEXT: ; %bb.1: 1147; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 1148; GFX9-NEXT: v_mul_hi_u32_u24_e64 v2, s4, 5 1149; GFX9-NEXT: v_mul_u32_u24_e64 v1, s4, 5 1150; GFX9-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 1151; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1152; GFX9-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] 1153; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1154; GFX9-NEXT: buffer_wbinvl1_vol 1155; GFX9-NEXT: BB5_2: 1156; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 1157; GFX9-NEXT: v_readfirstlane_b32 s2, v1 1158; GFX9-NEXT: v_readfirstlane_b32 s3, v2 1159; GFX9-NEXT: v_mov_b32_e32 v1, s2 1160; GFX9-NEXT: v_mov_b32_e32 v2, s3 1161; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v0, 5, v[1:2] 1162; GFX9-NEXT: s_mov_b32 s3, 
0xf000 1163; GFX9-NEXT: s_mov_b32 s2, -1 1164; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1165; GFX9-NEXT: s_nop 1 1166; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1167; GFX9-NEXT: s_endpgm 1168; 1169; GFX1064-LABEL: add_i64_constant: 1170; GFX1064: ; %bb.0: ; %entry 1171; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1172; GFX1064-NEXT: s_mov_b64 s[4:5], exec 1173; GFX1064-NEXT: ; implicit-def: $vgpr1_vgpr2 1174; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 1175; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s5, v0 1176; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1177; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc 1178; GFX1064-NEXT: s_cbranch_execz BB5_2 1179; GFX1064-NEXT: ; %bb.1: 1180; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 1181; GFX1064-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 1182; GFX1064-NEXT: v_mul_hi_u32_u24_e64 v2, s4, 5 1183; GFX1064-NEXT: v_mul_u32_u24_e64 v1, s4, 5 1184; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1185; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 1186; GFX1064-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] 1187; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1188; GFX1064-NEXT: buffer_gl0_inv 1189; GFX1064-NEXT: buffer_gl1_inv 1190; GFX1064-NEXT: BB5_2: 1191; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 1192; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] 1193; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 1194; GFX1064-NEXT: v_readfirstlane_b32 s3, v2 1195; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v0, 5, s[2:3] 1196; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 1197; GFX1064-NEXT: s_mov_b32 s2, -1 1198; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1199; GFX1064-NEXT: s_nop 1 1200; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1201; GFX1064-NEXT: s_endpgm 1202; 1203; GFX1032-LABEL: add_i64_constant: 1204; GFX1032: ; %bb.0: ; %entry 1205; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1206; GFX1032-NEXT: s_mov_b32 s3, exec_lo 1207; GFX1032-NEXT: ; implicit-def: $vcc_hi 1208; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s3, 0 1209; GFX1032-NEXT: ; implicit-def: 
$vgpr1_vgpr2 1210; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 1211; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo 1212; GFX1032-NEXT: s_cbranch_execz BB5_2 1213; GFX1032-NEXT: ; %bb.1: 1214; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 1215; GFX1032-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 1216; GFX1032-NEXT: v_mul_hi_u32_u24_e64 v2, s3, 5 1217; GFX1032-NEXT: v_mul_u32_u24_e64 v1, s3, 5 1218; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1219; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 1220; GFX1032-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] 1221; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1222; GFX1032-NEXT: buffer_gl0_inv 1223; GFX1032-NEXT: buffer_gl1_inv 1224; GFX1032-NEXT: BB5_2: 1225; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 1226; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 1227; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 1228; GFX1032-NEXT: v_readfirstlane_b32 s3, v2 1229; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s2, v0, 5, s[2:3] 1230; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 1231; GFX1032-NEXT: s_mov_b32 s2, -1 1232; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1233; GFX1032-NEXT: s_nop 1 1234; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1235; GFX1032-NEXT: s_endpgm 1236entry: 1237 %old = atomicrmw add i64 addrspace(3)* @local_var64, i64 5 acq_rel 1238 store i64 %old, i64 addrspace(1)* %out 1239 ret void 1240} 1241 1242define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 %additive) { 1243; 1244; 1245; GFX7LESS-LABEL: add_i64_uniform: 1246; GFX7LESS: ; %bb.0: ; %entry 1247; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec 1248; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 1249; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 1250; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s7, v0 1251; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1252; GFX7LESS-NEXT: ; implicit-def: $vgpr1_vgpr2 1253; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc 1254; GFX7LESS-NEXT: s_cbranch_execz BB6_2 1255; GFX7LESS-NEXT: ; %bb.1: 1256; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 1257; 
GFX7LESS-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 1258; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1259; GFX7LESS-NEXT: s_mul_i32 s7, s3, s6 1260; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6 1261; GFX7LESS-NEXT: v_mul_hi_u32 v1, s2, v1 1262; GFX7LESS-NEXT: s_mul_i32 s6, s2, s6 1263; GFX7LESS-NEXT: v_add_i32_e32 v2, vcc, s7, v1 1264; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6 1265; GFX7LESS-NEXT: s_mov_b32 m0, -1 1266; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1267; GFX7LESS-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] 1268; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1269; GFX7LESS-NEXT: buffer_wbinvl1 1270; GFX7LESS-NEXT: BB6_2: 1271; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] 1272; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 1273; GFX7LESS-NEXT: s_mov_b32 s6, -1 1274; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1275; GFX7LESS-NEXT: s_mov_b32 s4, s0 1276; GFX7LESS-NEXT: s_mov_b32 s5, s1 1277; GFX7LESS-NEXT: v_readfirstlane_b32 s0, v1 1278; GFX7LESS-NEXT: v_readfirstlane_b32 s1, v2 1279; GFX7LESS-NEXT: v_mul_lo_u32 v1, s3, v0 1280; GFX7LESS-NEXT: v_mul_hi_u32 v2, s2, v0 1281; GFX7LESS-NEXT: v_mul_lo_u32 v0, s2, v0 1282; GFX7LESS-NEXT: v_add_i32_e32 v1, vcc, v2, v1 1283; GFX7LESS-NEXT: v_mov_b32_e32 v2, s1 1284; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s0, v0 1285; GFX7LESS-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc 1286; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1287; GFX7LESS-NEXT: s_endpgm 1288; 1289; GFX8-LABEL: add_i64_uniform: 1290; GFX8: ; %bb.0: ; %entry 1291; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1292; GFX8-NEXT: s_mov_b64 s[6:7], exec 1293; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 1294; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 1295; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1296; GFX8-NEXT: ; implicit-def: $vgpr1_vgpr2 1297; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 1298; GFX8-NEXT: s_cbranch_execz BB6_2 1299; GFX8-NEXT: ; %bb.1: 1300; GFX8-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 1301; GFX8-NEXT: v_mov_b32_e32 v1, s6 1302; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1303; GFX8-NEXT: 
v_mul_hi_u32 v1, s2, v1 1304; GFX8-NEXT: s_mul_i32 s7, s3, s6 1305; GFX8-NEXT: s_mul_i32 s6, s2, s6 1306; GFX8-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 1307; GFX8-NEXT: v_add_u32_e32 v2, vcc, s7, v1 1308; GFX8-NEXT: v_mov_b32_e32 v1, s6 1309; GFX8-NEXT: s_mov_b32 m0, -1 1310; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1311; GFX8-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] 1312; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1313; GFX8-NEXT: buffer_wbinvl1_vol 1314; GFX8-NEXT: BB6_2: 1315; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 1316; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1317; GFX8-NEXT: s_mov_b32 s4, s0 1318; GFX8-NEXT: v_readfirstlane_b32 s0, v1 1319; GFX8-NEXT: v_mul_lo_u32 v1, s3, v0 1320; GFX8-NEXT: v_mul_hi_u32 v3, s2, v0 1321; GFX8-NEXT: v_mul_lo_u32 v0, s2, v0 1322; GFX8-NEXT: s_mov_b32 s5, s1 1323; GFX8-NEXT: v_readfirstlane_b32 s1, v2 1324; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 1325; GFX8-NEXT: v_mov_b32_e32 v2, s1 1326; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 1327; GFX8-NEXT: s_mov_b32 s7, 0xf000 1328; GFX8-NEXT: s_mov_b32 s6, -1 1329; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc 1330; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1331; GFX8-NEXT: s_endpgm 1332; 1333; GFX9-LABEL: add_i64_uniform: 1334; GFX9: ; %bb.0: ; %entry 1335; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1336; GFX9-NEXT: s_mov_b64 s[6:7], exec 1337; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 1338; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 1339; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1340; GFX9-NEXT: ; implicit-def: $vgpr1_vgpr2 1341; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 1342; GFX9-NEXT: s_cbranch_execz BB6_2 1343; GFX9-NEXT: ; %bb.1: 1344; GFX9-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 1345; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1346; GFX9-NEXT: s_mul_i32 s7, s3, s6 1347; GFX9-NEXT: s_mul_hi_u32 s8, s2, s6 1348; GFX9-NEXT: s_add_i32 s8, s8, s7 1349; GFX9-NEXT: s_mul_i32 s6, s2, s6 1350; GFX9-NEXT: v_mov_b32_e32 v1, s6 1351; GFX9-NEXT: v_mov_b32_e32 v2, s8 1352; GFX9-NEXT: v_mov_b32_e32 v3, 
local_var64@abs32@lo 1353; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1354; GFX9-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] 1355; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1356; GFX9-NEXT: buffer_wbinvl1_vol 1357; GFX9-NEXT: BB6_2: 1358; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 1359; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1360; GFX9-NEXT: v_mul_lo_u32 v3, s3, v0 1361; GFX9-NEXT: v_mul_hi_u32 v4, s2, v0 1362; GFX9-NEXT: v_mul_lo_u32 v0, s2, v0 1363; GFX9-NEXT: s_mov_b32 s4, s0 1364; GFX9-NEXT: v_readfirstlane_b32 s0, v1 1365; GFX9-NEXT: s_mov_b32 s5, s1 1366; GFX9-NEXT: v_readfirstlane_b32 s1, v2 1367; GFX9-NEXT: v_add_u32_e32 v1, v4, v3 1368; GFX9-NEXT: v_mov_b32_e32 v2, s1 1369; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 1370; GFX9-NEXT: s_mov_b32 s7, 0xf000 1371; GFX9-NEXT: s_mov_b32 s6, -1 1372; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc 1373; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1374; GFX9-NEXT: s_endpgm 1375; 1376; GFX1064-LABEL: add_i64_uniform: 1377; GFX1064: ; %bb.0: ; %entry 1378; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1379; GFX1064-NEXT: s_mov_b64 s[6:7], exec 1380; GFX1064-NEXT: ; implicit-def: $vgpr1_vgpr2 1381; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 1382; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s7, v0 1383; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1384; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 1385; GFX1064-NEXT: s_cbranch_execz BB6_2 1386; GFX1064-NEXT: ; %bb.1: 1387; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 1388; GFX1064-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 1389; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1390; GFX1064-NEXT: s_mul_i32 s7, s3, s6 1391; GFX1064-NEXT: s_mul_hi_u32 s8, s2, s6 1392; GFX1064-NEXT: s_mul_i32 s6, s2, s6 1393; GFX1064-NEXT: s_add_i32 s8, s8, s7 1394; GFX1064-NEXT: v_mov_b32_e32 v1, s6 1395; GFX1064-NEXT: v_mov_b32_e32 v2, s8 1396; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1397; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 1398; GFX1064-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] 1399; 
GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1400; GFX1064-NEXT: buffer_gl0_inv 1401; GFX1064-NEXT: buffer_gl1_inv 1402; GFX1064-NEXT: BB6_2: 1403; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 1404; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 1405; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1406; GFX1064-NEXT: v_mul_lo_u32 v3, s3, v0 1407; GFX1064-NEXT: v_mul_hi_u32 v4, s2, v0 1408; GFX1064-NEXT: v_mul_lo_u32 v0, s2, v0 1409; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 1410; GFX1064-NEXT: v_readfirstlane_b32 s4, v2 1411; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 1412; GFX1064-NEXT: v_add_nc_u32_e32 v1, v4, v3 1413; GFX1064-NEXT: v_add_co_u32_e64 v0, vcc, s2, v0 1414; GFX1064-NEXT: s_mov_b32 s2, -1 1415; GFX1064-NEXT: v_add_co_ci_u32_e32 v1, vcc, s4, v1, vcc 1416; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1417; GFX1064-NEXT: s_endpgm 1418; 1419; GFX1032-LABEL: add_i64_uniform: 1420; GFX1032: ; %bb.0: ; %entry 1421; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1422; GFX1032-NEXT: s_mov_b32 s5, exec_lo 1423; GFX1032-NEXT: ; implicit-def: $vcc_hi 1424; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s5, 0 1425; GFX1032-NEXT: ; implicit-def: $vgpr1_vgpr2 1426; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 1427; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo 1428; GFX1032-NEXT: s_cbranch_execz BB6_2 1429; GFX1032-NEXT: ; %bb.1: 1430; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s5 1431; GFX1032-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 1432; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1433; GFX1032-NEXT: s_mul_i32 s6, s3, s5 1434; GFX1032-NEXT: s_mul_hi_u32 s7, s2, s5 1435; GFX1032-NEXT: s_mul_i32 s5, s2, s5 1436; GFX1032-NEXT: s_add_i32 s7, s7, s6 1437; GFX1032-NEXT: v_mov_b32_e32 v1, s5 1438; GFX1032-NEXT: v_mov_b32_e32 v2, s7 1439; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1440; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 1441; GFX1032-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] 1442; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1443; GFX1032-NEXT: buffer_gl0_inv 1444; GFX1032-NEXT: buffer_gl1_inv 1445; 
GFX1032-NEXT: BB6_2: 1446; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 1447; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 1448; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1449; GFX1032-NEXT: v_mul_lo_u32 v3, s3, v0 1450; GFX1032-NEXT: v_mul_hi_u32 v4, s2, v0 1451; GFX1032-NEXT: v_mul_lo_u32 v0, s2, v0 1452; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 1453; GFX1032-NEXT: v_readfirstlane_b32 s4, v2 1454; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 1455; GFX1032-NEXT: v_add_nc_u32_e32 v1, v4, v3 1456; GFX1032-NEXT: v_add_co_u32_e64 v0, vcc_lo, s2, v0 1457; GFX1032-NEXT: s_mov_b32 s2, -1 1458; GFX1032-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s4, v1, vcc_lo 1459; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1460; GFX1032-NEXT: s_endpgm 1461entry: 1462 %old = atomicrmw add i64 addrspace(3)* @local_var64, i64 %additive acq_rel 1463 store i64 %old, i64 addrspace(1)* %out 1464 ret void 1465} 1466; NOTE(review): no RUN line passes GCN in -check-prefixes, so the three GCN-NOT directives below are never evaluated by FileCheck. The absence of these instructions is still covered, because the per-target -NEXT checks for add_i64_varying match a plain ds_add_rtn_u64 sequence with no mbcnt/bcnt instruction in it. 1467; GCN-NOT: v_mbcnt_lo_u32_b32 1468; GCN-NOT: v_mbcnt_hi_u32_b32 1469; GCN-NOT: s_bcnt1_i32_b64 1470define amdgpu_kernel void @add_i64_varying(i64 addrspace(1)* %out) { 1471; 1472; 1473; GFX7LESS-LABEL: add_i64_varying: 1474; GFX7LESS: ; %bb.0: ; %entry 1475; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1476; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 1477; GFX7LESS-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 1478; GFX7LESS-NEXT: s_mov_b32 m0, -1 1479; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1480; GFX7LESS-NEXT: ds_add_rtn_u64 v[0:1], v2, v[0:1] 1481; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1482; GFX7LESS-NEXT: buffer_wbinvl1 1483; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 1484; GFX7LESS-NEXT: s_mov_b32 s2, -1 1485; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1486; GFX7LESS-NEXT: s_endpgm 1487; 1488; GFX8-LABEL: add_i64_varying: 1489; GFX8: ; %bb.0: ; %entry 1490; GFX8-NEXT: v_mov_b32_e32 v1, 0 1491; GFX8-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 1492; GFX8-NEXT: s_mov_b32 m0, -1 1493; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1494; GFX8-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) 1495; GFX8-NEXT: ds_add_rtn_u64 v[0:1], v2, v[0:1] 1496; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1497; GFX8-NEXT: buffer_wbinvl1_vol 1498; GFX8-NEXT: s_mov_b32 s3, 0xf000 1499; GFX8-NEXT: s_mov_b32 s2, -1 1500; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1501; GFX8-NEXT: s_endpgm 1502; 1503; GFX9-LABEL: add_i64_varying: 1504; GFX9: ; %bb.0: ; %entry 1505; GFX9-NEXT: v_mov_b32_e32 v1, 0 1506; GFX9-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 1507; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1508; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1509; GFX9-NEXT: ds_add_rtn_u64 v[0:1], v2, v[0:1] 1510; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1511; GFX9-NEXT: buffer_wbinvl1_vol 1512; GFX9-NEXT: s_mov_b32 s3, 0xf000 1513; GFX9-NEXT: s_mov_b32 s2, -1 1514; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1515; GFX9-NEXT: s_endpgm 1516; 1517; GFX1064-LABEL: add_i64_varying: 1518; GFX1064: ; %bb.0: ; %entry 1519; GFX1064-NEXT: v_mov_b32_e32 v1, 0 1520; GFX1064-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 1521; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1522; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 1523; GFX1064-NEXT: s_mov_b32 s2, -1 1524; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1525; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 1526; GFX1064-NEXT: ds_add_rtn_u64 v[0:1], v2, v[0:1] 1527; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1528; GFX1064-NEXT: buffer_gl0_inv 1529; GFX1064-NEXT: buffer_gl1_inv 1530; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1531; GFX1064-NEXT: s_endpgm 1532; 1533; GFX1032-LABEL: add_i64_varying: 1534; GFX1032: ; %bb.0: ; %entry 1535; GFX1032-NEXT: v_mov_b32_e32 v1, 0 1536; GFX1032-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 1537; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1538; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 1539; GFX1032-NEXT: s_mov_b32 s2, -1 1540; GFX1032-NEXT: ; implicit-def: $vcc_hi 1541; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1542; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 1543; 
GFX1032-NEXT: ds_add_rtn_u64 v[0:1], v2, v[0:1] 1544; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1545; GFX1032-NEXT: buffer_gl0_inv 1546; GFX1032-NEXT: buffer_gl1_inv 1547; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1548; GFX1032-NEXT: s_endpgm 1549entry: 1550 %lane = call i32 @llvm.amdgcn.workitem.id.x() 1551 %zext = zext i32 %lane to i64 1552 %old = atomicrmw add i64 addrspace(3)* @local_var64, i64 %zext acq_rel 1553 store i64 %old, i64 addrspace(1)* %out 1554 ret void 1555} 1556 1557define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out) { 1558; Kernel under test - every lane atomically subtracts the constant 5 from @local_var32 (acq_rel) and stores the returned old value to %out. 1559; Expected code for every run - only the lane whose v_mbcnt result is 0 issues a single ds_sub_rtn_u32 whose operand is 5 times s_bcnt1 of the saved exec mask; after exec is restored each lane computes v_readfirstlane of that result minus 5 times its own v_mbcnt value. 1560; GFX7LESS-LABEL: sub_i32_constant: 1561; GFX7LESS: ; %bb.0: ; %entry 1562; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec 1563; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1564; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 1565; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 1566; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1567; GFX7LESS-NEXT: ; implicit-def: $vgpr1 1568; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc 1569; GFX7LESS-NEXT: s_cbranch_execz BB8_2 1570; GFX7LESS-NEXT: ; %bb.1: 1571; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1572; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 1573; GFX7LESS-NEXT: v_mul_u32_u24_e64 v2, s2, 5 1574; GFX7LESS-NEXT: s_mov_b32 m0, -1 1575; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1576; GFX7LESS-NEXT: ds_sub_rtn_u32 v1, v1, v2 1577; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1578; GFX7LESS-NEXT: buffer_wbinvl1 1579; GFX7LESS-NEXT: BB8_2: 1580; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] 1581; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v1 1582; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1583; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 1584; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 1585; GFX7LESS-NEXT: s_mov_b32 s2, -1 1586; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1587; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 1588; GFX7LESS-NEXT: s_endpgm 1589; 1590; GFX8-LABEL: sub_i32_constant: 1591; GFX8: ; %bb.0: ; %entry 1592; GFX8-NEXT: 
s_load_dwordx2 s[0:1], s[0:1], 0x24 1593; GFX8-NEXT: s_mov_b64 s[2:3], exec 1594; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1595; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 1596; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1597; GFX8-NEXT: ; implicit-def: $vgpr1 1598; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 1599; GFX8-NEXT: s_cbranch_execz BB8_2 1600; GFX8-NEXT: ; %bb.1: 1601; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1602; GFX8-NEXT: v_mul_u32_u24_e64 v1, s2, 5 1603; GFX8-NEXT: v_mov_b32_e32 v2, local_var32@abs32@lo 1604; GFX8-NEXT: s_mov_b32 m0, -1 1605; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1606; GFX8-NEXT: ds_sub_rtn_u32 v1, v2, v1 1607; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1608; GFX8-NEXT: buffer_wbinvl1_vol 1609; GFX8-NEXT: BB8_2: 1610; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 1611; GFX8-NEXT: v_readfirstlane_b32 s2, v1 1612; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1613; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s2, v0 1614; GFX8-NEXT: s_mov_b32 s3, 0xf000 1615; GFX8-NEXT: s_mov_b32 s2, -1 1616; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1617; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 1618; GFX8-NEXT: s_endpgm 1619; 1620; GFX9-LABEL: sub_i32_constant: 1621; GFX9: ; %bb.0: ; %entry 1622; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1623; GFX9-NEXT: s_mov_b64 s[2:3], exec 1624; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1625; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 1626; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1627; GFX9-NEXT: ; implicit-def: $vgpr1 1628; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 1629; GFX9-NEXT: s_cbranch_execz BB8_2 1630; GFX9-NEXT: ; %bb.1: 1631; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1632; GFX9-NEXT: v_mul_u32_u24_e64 v1, s2, 5 1633; GFX9-NEXT: v_mov_b32_e32 v2, local_var32@abs32@lo 1634; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1635; GFX9-NEXT: ds_sub_rtn_u32 v1, v2, v1 1636; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1637; GFX9-NEXT: buffer_wbinvl1_vol 1638; GFX9-NEXT: BB8_2: 1639; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 1640; GFX9-NEXT: v_readfirstlane_b32 s2, 
v1 1641; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1642; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 1643; GFX9-NEXT: s_mov_b32 s3, 0xf000 1644; GFX9-NEXT: s_mov_b32 s2, -1 1645; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1646; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 1647; GFX9-NEXT: s_endpgm 1648; 1649; GFX1064-LABEL: sub_i32_constant: 1650; GFX1064: ; %bb.0: ; %entry 1651; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1652; GFX1064-NEXT: s_mov_b64 s[2:3], exec 1653; GFX1064-NEXT: ; implicit-def: $vgpr1 1654; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 1655; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s3, v0 1656; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1657; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 1658; GFX1064-NEXT: s_cbranch_execz BB8_2 1659; GFX1064-NEXT: ; %bb.1: 1660; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1661; GFX1064-NEXT: v_mov_b32_e32 v2, local_var32@abs32@lo 1662; GFX1064-NEXT: v_mul_u32_u24_e64 v1, s2, 5 1663; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1664; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 1665; GFX1064-NEXT: ds_sub_rtn_u32 v1, v2, v1 1666; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1667; GFX1064-NEXT: buffer_gl0_inv 1668; GFX1064-NEXT: buffer_gl1_inv 1669; GFX1064-NEXT: BB8_2: 1670; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 1671; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 1672; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 1673; GFX1064-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1674; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 1675; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s2, v0 1676; GFX1064-NEXT: s_mov_b32 s2, -1 1677; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1678; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 1679; GFX1064-NEXT: s_endpgm 1680; 1681; GFX1032-LABEL: sub_i32_constant: 1682; GFX1032: ; %bb.0: ; %entry 1683; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1684; GFX1032-NEXT: s_mov_b32 s2, exec_lo 1685; GFX1032-NEXT: ; implicit-def: $vcc_hi 1686; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 1687; GFX1032-NEXT: ; implicit-def: $vgpr1 1688; 
GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 1689; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 1690; GFX1032-NEXT: s_cbranch_execz BB8_2 1691; GFX1032-NEXT: ; %bb.1: 1692; GFX1032-NEXT: s_bcnt1_i32_b32 s2, s2 1693; GFX1032-NEXT: v_mov_b32_e32 v2, local_var32@abs32@lo 1694; GFX1032-NEXT: v_mul_u32_u24_e64 v1, s2, 5 1695; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1696; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 1697; GFX1032-NEXT: ds_sub_rtn_u32 v1, v2, v1 1698; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1699; GFX1032-NEXT: buffer_gl0_inv 1700; GFX1032-NEXT: buffer_gl1_inv 1701; GFX1032-NEXT: BB8_2: 1702; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 1703; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 1704; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 1705; GFX1032-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1706; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 1707; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s2, v0 1708; GFX1032-NEXT: s_mov_b32 s2, -1 1709; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1710; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 1711; GFX1032-NEXT: s_endpgm 1712entry: 1713 %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 5 acq_rel 1714 store i32 %old, i32 addrspace(1)* %out 1715 ret void 1716} 1717 1718define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 %subitive) { 1719; Kernel under test - every lane atomically subtracts the kernel argument %subitive from @local_var32 (acq_rel) and stores the returned old value to %out. 1720; Expected code for every run - the lane whose v_mbcnt result is 0 issues a single ds_sub_rtn_u32 whose operand is s_mul_i32 of the loaded argument and s_bcnt1 of the saved exec mask; each lane then subtracts v_mul_lo_u32 of the argument and its own v_mbcnt value from v_readfirstlane of the returned value. 1721; GFX7LESS-LABEL: sub_i32_uniform: 1722; GFX7LESS: ; %bb.0: ; %entry 1723; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec 1724; GFX7LESS-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 1725; GFX7LESS-NEXT: s_load_dword s2, s[0:1], 0xb 1726; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 1727; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s7, v0 1728; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1729; GFX7LESS-NEXT: ; implicit-def: $vgpr1 1730; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc 1731; GFX7LESS-NEXT: s_cbranch_execz BB9_2 1732; GFX7LESS-NEXT: ; %bb.1: 1733; GFX7LESS-NEXT: s_bcnt1_i32_b64 s3, s[6:7] 1734; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1735; GFX7LESS-NEXT: s_mul_i32 s3, s2, s3 1736; 
GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 1737; GFX7LESS-NEXT: v_mov_b32_e32 v2, s3 1738; GFX7LESS-NEXT: s_mov_b32 m0, -1 1739; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1740; GFX7LESS-NEXT: ds_sub_rtn_u32 v1, v1, v2 1741; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1742; GFX7LESS-NEXT: buffer_wbinvl1 1743; GFX7LESS-NEXT: BB9_2: 1744; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1] 1745; GFX7LESS-NEXT: v_readfirstlane_b32 s0, v1 1746; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1747; GFX7LESS-NEXT: v_mul_lo_u32 v0, s2, v0 1748; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 1749; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 1750; GFX7LESS-NEXT: s_mov_b32 s6, -1 1751; GFX7LESS-NEXT: buffer_store_dword v0, off, s[4:7], 0 1752; GFX7LESS-NEXT: s_endpgm 1753; 1754; GFX8-LABEL: sub_i32_uniform: 1755; GFX8: ; %bb.0: ; %entry 1756; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 1757; GFX8-NEXT: s_load_dword s0, s[0:1], 0x2c 1758; GFX8-NEXT: s_mov_b64 s[2:3], exec 1759; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1760; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 1761; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1762; GFX8-NEXT: ; implicit-def: $vgpr1 1763; GFX8-NEXT: s_and_saveexec_b64 s[6:7], vcc 1764; GFX8-NEXT: s_cbranch_execz BB9_2 1765; GFX8-NEXT: ; %bb.1: 1766; GFX8-NEXT: s_bcnt1_i32_b64 s1, s[2:3] 1767; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1768; GFX8-NEXT: s_mul_i32 s1, s0, s1 1769; GFX8-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 1770; GFX8-NEXT: v_mov_b32_e32 v2, s1 1771; GFX8-NEXT: s_mov_b32 m0, -1 1772; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1773; GFX8-NEXT: ds_sub_rtn_u32 v1, v1, v2 1774; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1775; GFX8-NEXT: buffer_wbinvl1_vol 1776; GFX8-NEXT: BB9_2: 1777; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] 1778; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1779; GFX8-NEXT: v_mul_lo_u32 v0, s0, v0 1780; GFX8-NEXT: v_readfirstlane_b32 s0, v1 1781; GFX8-NEXT: s_mov_b32 s7, 0xf000 1782; GFX8-NEXT: s_mov_b32 s6, -1 1783; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s0, v0 1784; 
GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0 1785; GFX8-NEXT: s_endpgm 1786; 1787; GFX9-LABEL: sub_i32_uniform: 1788; GFX9: ; %bb.0: ; %entry 1789; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 1790; GFX9-NEXT: s_load_dword s0, s[0:1], 0x2c 1791; GFX9-NEXT: s_mov_b64 s[2:3], exec 1792; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1793; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 1794; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1795; GFX9-NEXT: ; implicit-def: $vgpr1 1796; GFX9-NEXT: s_and_saveexec_b64 s[6:7], vcc 1797; GFX9-NEXT: s_cbranch_execz BB9_2 1798; GFX9-NEXT: ; %bb.1: 1799; GFX9-NEXT: s_bcnt1_i32_b64 s1, s[2:3] 1800; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1801; GFX9-NEXT: s_mul_i32 s1, s0, s1 1802; GFX9-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 1803; GFX9-NEXT: v_mov_b32_e32 v2, s1 1804; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1805; GFX9-NEXT: ds_sub_rtn_u32 v1, v1, v2 1806; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1807; GFX9-NEXT: buffer_wbinvl1_vol 1808; GFX9-NEXT: BB9_2: 1809; GFX9-NEXT: s_or_b64 exec, exec, s[6:7] 1810; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1811; GFX9-NEXT: v_mul_lo_u32 v0, s0, v0 1812; GFX9-NEXT: v_readfirstlane_b32 s0, v1 1813; GFX9-NEXT: s_mov_b32 s7, 0xf000 1814; GFX9-NEXT: s_mov_b32 s6, -1 1815; GFX9-NEXT: v_sub_u32_e32 v0, s0, v0 1816; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 1817; GFX9-NEXT: s_endpgm 1818; 1819; GFX1064-LABEL: sub_i32_uniform: 1820; GFX1064: ; %bb.0: ; %entry 1821; GFX1064-NEXT: s_clause 0x1 1822; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 1823; GFX1064-NEXT: s_load_dword s0, s[0:1], 0x2c 1824; GFX1064-NEXT: s_mov_b64 s[2:3], exec 1825; GFX1064-NEXT: ; implicit-def: $vgpr1 1826; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 1827; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s3, v0 1828; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1829; GFX1064-NEXT: s_and_saveexec_b64 s[6:7], vcc 1830; GFX1064-NEXT: s_cbranch_execz BB9_2 1831; GFX1064-NEXT: ; %bb.1: 1832; GFX1064-NEXT: s_bcnt1_i32_b64 s1, s[2:3] 1833; GFX1064-NEXT: 
v_mov_b32_e32 v1, local_var32@abs32@lo 1834; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1835; GFX1064-NEXT: s_mul_i32 s1, s0, s1 1836; GFX1064-NEXT: v_mov_b32_e32 v2, s1 1837; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1838; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 1839; GFX1064-NEXT: ds_sub_rtn_u32 v1, v1, v2 1840; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1841; GFX1064-NEXT: buffer_gl0_inv 1842; GFX1064-NEXT: buffer_gl1_inv 1843; GFX1064-NEXT: BB9_2: 1844; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 1845; GFX1064-NEXT: s_or_b64 exec, exec, s[6:7] 1846; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1847; GFX1064-NEXT: v_mul_lo_u32 v0, s0, v0 1848; GFX1064-NEXT: v_readfirstlane_b32 s0, v1 1849; GFX1064-NEXT: s_mov_b32 s7, 0x31016000 1850; GFX1064-NEXT: s_mov_b32 s6, -1 1851; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s0, v0 1852; GFX1064-NEXT: buffer_store_dword v0, off, s[4:7], 0 1853; GFX1064-NEXT: s_endpgm 1854; 1855; GFX1032-LABEL: sub_i32_uniform: 1856; GFX1032: ; %bb.0: ; %entry 1857; GFX1032-NEXT: s_clause 0x1 1858; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 1859; GFX1032-NEXT: s_load_dword s0, s[0:1], 0x2c 1860; GFX1032-NEXT: s_mov_b32 s2, exec_lo 1861; GFX1032-NEXT: ; implicit-def: $vcc_hi 1862; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 1863; GFX1032-NEXT: ; implicit-def: $vgpr1 1864; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 1865; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo 1866; GFX1032-NEXT: s_cbranch_execz BB9_2 1867; GFX1032-NEXT: ; %bb.1: 1868; GFX1032-NEXT: s_bcnt1_i32_b32 s2, s2 1869; GFX1032-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 1870; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1871; GFX1032-NEXT: s_mul_i32 s2, s0, s2 1872; GFX1032-NEXT: v_mov_b32_e32 v2, s2 1873; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1874; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 1875; GFX1032-NEXT: ds_sub_rtn_u32 v1, v1, v2 1876; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1877; GFX1032-NEXT: buffer_gl0_inv 1878; GFX1032-NEXT: buffer_gl1_inv 1879; GFX1032-NEXT: BB9_2: 1880; GFX1032-NEXT: 
s_waitcnt_depctr 0xffe3 1881; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s1 1882; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1883; GFX1032-NEXT: v_mul_lo_u32 v0, s0, v0 1884; GFX1032-NEXT: v_readfirstlane_b32 s0, v1 1885; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 1886; GFX1032-NEXT: s_mov_b32 s6, -1 1887; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s0, v0 1888; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0 1889; GFX1032-NEXT: s_endpgm 1890entry: 1891 %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 %subitive acq_rel 1892 store i32 %old, i32 addrspace(1)* %out 1893 ret void 1894} 1895 1896define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) { 1897; Kernel under test - every lane atomically subtracts its own workitem id from @local_var32 (acq_rel) and stores the returned old value to %out. 1898; Expected code - the GFX7LESS run issues ds_sub_rtn_u32 directly with no lane election. The GFX8 and GFX9 runs build a scan with v_add_u32_dpp (row_shr 1/2/4/8, then row_bcast 15 and 31), read the wave total with v_readlane from lane 63, and let the lane whose v_mbcnt result is 0 issue one ds_sub_rtn_u32. The two gfx1010 runs use v_add_nc_u32_dpp with v_permlanex16_b32 and v_readlane / v_writelane instead. In the optimized runs each lane finally subtracts its shifted scan value from v_readfirstlane of the returned value. 1899; GFX7LESS-LABEL: sub_i32_varying: 1900; GFX7LESS: ; %bb.0: ; %entry 1901; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1902; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 1903; GFX7LESS-NEXT: s_mov_b32 m0, -1 1904; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1905; GFX7LESS-NEXT: ds_sub_rtn_u32 v0, v1, v0 1906; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1907; GFX7LESS-NEXT: buffer_wbinvl1 1908; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 1909; GFX7LESS-NEXT: s_mov_b32 s2, -1 1910; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 1911; GFX7LESS-NEXT: s_endpgm 1912; 1913; GFX8-LABEL: sub_i32_varying: 1914; GFX8: ; %bb.0: ; %entry 1915; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1916; GFX8-NEXT: s_mov_b64 s[2:3], exec 1917; GFX8-NEXT: v_mov_b32_e32 v2, v0 1918; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 1919; GFX8-NEXT: v_mov_b32_e32 v1, 0 1920; GFX8-NEXT: s_mov_b64 exec, s[4:5] 1921; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1922; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 1923; GFX8-NEXT: s_not_b64 exec, exec 1924; GFX8-NEXT: v_mov_b32_e32 v2, 0 1925; GFX8-NEXT: s_not_b64 exec, exec 1926; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 1927; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 1928; GFX8-NEXT: s_nop 1 1929; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 
row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 1930; GFX8-NEXT: s_nop 1 1931; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 1932; GFX8-NEXT: s_nop 1 1933; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 1934; GFX8-NEXT: s_nop 1 1935; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 1936; GFX8-NEXT: s_nop 1 1937; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 1938; GFX8-NEXT: v_readlane_b32 s2, v2, 63 1939; GFX8-NEXT: s_nop 0 1940; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 1941; GFX8-NEXT: s_mov_b64 exec, s[4:5] 1942; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1943; GFX8-NEXT: ; implicit-def: $vgpr0 1944; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 1945; GFX8-NEXT: s_cbranch_execz BB10_2 1946; GFX8-NEXT: ; %bb.1: 1947; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 1948; GFX8-NEXT: v_mov_b32_e32 v3, s2 1949; GFX8-NEXT: s_mov_b32 m0, -1 1950; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1951; GFX8-NEXT: ds_sub_rtn_u32 v0, v0, v3 1952; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1953; GFX8-NEXT: buffer_wbinvl1_vol 1954; GFX8-NEXT: BB10_2: 1955; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 1956; GFX8-NEXT: v_readfirstlane_b32 s2, v0 1957; GFX8-NEXT: v_mov_b32_e32 v0, v1 1958; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s2, v0 1959; GFX8-NEXT: s_mov_b32 s3, 0xf000 1960; GFX8-NEXT: s_mov_b32 s2, -1 1961; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1962; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 1963; GFX8-NEXT: s_endpgm 1964; 1965; GFX9-LABEL: sub_i32_varying: 1966; GFX9: ; %bb.0: ; %entry 1967; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1968; GFX9-NEXT: s_mov_b64 s[2:3], exec 1969; GFX9-NEXT: v_mov_b32_e32 v2, v0 1970; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 1971; GFX9-NEXT: v_mov_b32_e32 v1, 0 1972; GFX9-NEXT: s_mov_b64 exec, s[4:5] 1973; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1974; GFX9-NEXT: v_mbcnt_hi_u32_b32 
v0, s3, v0 1975; GFX9-NEXT: s_not_b64 exec, exec 1976; GFX9-NEXT: v_mov_b32_e32 v2, 0 1977; GFX9-NEXT: s_not_b64 exec, exec 1978; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 1979; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 1980; GFX9-NEXT: s_nop 1 1981; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 1982; GFX9-NEXT: s_nop 1 1983; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 1984; GFX9-NEXT: s_nop 1 1985; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 1986; GFX9-NEXT: s_nop 1 1987; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 1988; GFX9-NEXT: s_nop 1 1989; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 1990; GFX9-NEXT: v_readlane_b32 s2, v2, 63 1991; GFX9-NEXT: s_nop 0 1992; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 1993; GFX9-NEXT: s_mov_b64 exec, s[4:5] 1994; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1995; GFX9-NEXT: ; implicit-def: $vgpr0 1996; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 1997; GFX9-NEXT: s_cbranch_execz BB10_2 1998; GFX9-NEXT: ; %bb.1: 1999; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 2000; GFX9-NEXT: v_mov_b32_e32 v3, s2 2001; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2002; GFX9-NEXT: ds_sub_rtn_u32 v0, v0, v3 2003; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2004; GFX9-NEXT: buffer_wbinvl1_vol 2005; GFX9-NEXT: BB10_2: 2006; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 2007; GFX9-NEXT: v_readfirstlane_b32 s2, v0 2008; GFX9-NEXT: v_mov_b32_e32 v0, v1 2009; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 2010; GFX9-NEXT: s_mov_b32 s3, 0xf000 2011; GFX9-NEXT: s_mov_b32 s2, -1 2012; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2013; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 2014; GFX9-NEXT: s_endpgm 2015; 2016; GFX1064-LABEL: sub_i32_varying: 2017; GFX1064: ; %bb.0: ; %entry 2018; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 
2019; GFX1064-NEXT: s_mov_b64 s[2:3], exec 2020; GFX1064-NEXT: v_mov_b32_e32 v2, v0 2021; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 2022; GFX1064-NEXT: v_mov_b32_e32 v1, 0 2023; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 2024; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 2025; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s3, v0 2026; GFX1064-NEXT: s_not_b64 exec, exec 2027; GFX1064-NEXT: v_mov_b32_e32 v2, 0 2028; GFX1064-NEXT: s_not_b64 exec, exec 2029; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 2030; GFX1064-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 2031; GFX1064-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 2032; GFX1064-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 2033; GFX1064-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 2034; GFX1064-NEXT: v_mov_b32_e32 v3, v2 2035; GFX1064-NEXT: v_permlanex16_b32 v3, v3, -1, -1 2036; GFX1064-NEXT: v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 2037; GFX1064-NEXT: v_readlane_b32 s2, v2, 31 2038; GFX1064-NEXT: v_mov_b32_e32 v3, s2 2039; GFX1064-NEXT: v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 2040; GFX1064-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf 2041; GFX1064-NEXT: v_readlane_b32 s2, v2, 15 2042; GFX1064-NEXT: v_readlane_b32 s3, v2, 31 2043; GFX1064-NEXT: v_readlane_b32 s6, v2, 47 2044; GFX1064-NEXT: v_writelane_b32 v1, s2, 16 2045; GFX1064-NEXT: s_mov_b32 s2, -1 2046; GFX1064-NEXT: v_writelane_b32 v1, s3, 32 2047; GFX1064-NEXT: v_readlane_b32 s3, v2, 63 2048; GFX1064-NEXT: v_writelane_b32 v1, s6, 48 2049; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 2050; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2051; GFX1064-NEXT: ; implicit-def: $vgpr0 2052; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 2053; GFX1064-NEXT: s_cbranch_execz BB10_2 2054; GFX1064-NEXT: ; %bb.1: 2055; GFX1064-NEXT: v_mov_b32_e32 v7, 
local_var32@abs32@lo 2056; GFX1064-NEXT: v_mov_b32_e32 v4, s3 2057; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2058; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 2059; GFX1064-NEXT: ds_sub_rtn_u32 v0, v7, v4 2060; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2061; GFX1064-NEXT: buffer_gl0_inv 2062; GFX1064-NEXT: buffer_gl1_inv 2063; GFX1064-NEXT: BB10_2: 2064; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 2065; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 2066; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 2067; GFX1064-NEXT: v_mov_b32_e32 v0, v1 2068; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s3, v0 2069; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 2070; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2071; GFX1064-NEXT: s_nop 0 2072; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 2073; GFX1064-NEXT: s_endpgm 2074; 2075; GFX1032-LABEL: sub_i32_varying: 2076; GFX1032: ; %bb.0: ; %entry 2077; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2078; GFX1032-NEXT: s_mov_b32 s2, exec_lo 2079; GFX1032-NEXT: ; implicit-def: $vcc_hi 2080; GFX1032-NEXT: v_mov_b32_e32 v2, v0 2081; GFX1032-NEXT: s_or_saveexec_b32 s3, -1 2082; GFX1032-NEXT: v_mov_b32_e32 v1, 0 2083; GFX1032-NEXT: s_mov_b32 exec_lo, s3 2084; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 2085; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 2086; GFX1032-NEXT: v_mov_b32_e32 v2, 0 2087; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 2088; GFX1032-NEXT: s_or_saveexec_b32 s4, -1 2089; GFX1032-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 2090; GFX1032-NEXT: s_mov_b32 s2, -1 2091; GFX1032-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 2092; GFX1032-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 2093; GFX1032-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 2094; GFX1032-NEXT: v_mov_b32_e32 v3, v2 2095; GFX1032-NEXT: v_permlanex16_b32 v3, v3, -1, -1 2096; GFX1032-NEXT: v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] 
row_mask:0xa bank_mask:0xf 2097; GFX1032-NEXT: v_readlane_b32 s3, v2, 31 2098; GFX1032-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf 2099; GFX1032-NEXT: v_readlane_b32 s5, v2, 15 2100; GFX1032-NEXT: v_writelane_b32 v1, s5, 16 2101; GFX1032-NEXT: s_mov_b32 exec_lo, s4 2102; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 2103; GFX1032-NEXT: ; implicit-def: $vgpr0 2104; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo 2105; GFX1032-NEXT: s_cbranch_execz BB10_2 2106; GFX1032-NEXT: ; %bb.1: 2107; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo 2108; GFX1032-NEXT: v_mov_b32_e32 v4, s3 2109; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2110; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 2111; GFX1032-NEXT: ds_sub_rtn_u32 v0, v7, v4 2112; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2113; GFX1032-NEXT: buffer_gl0_inv 2114; GFX1032-NEXT: buffer_gl1_inv 2115; GFX1032-NEXT: BB10_2: 2116; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 2117; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 2118; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 2119; GFX1032-NEXT: v_mov_b32_e32 v0, v1 2120; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s3, v0 2121; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 2122; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2123; GFX1032-NEXT: s_nop 0 2124; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 2125; GFX1032-NEXT: s_endpgm 2126entry: 2127 %lane = call i32 @llvm.amdgcn.workitem.id.x() 2128 %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 %lane acq_rel 2129 store i32 %old, i32 addrspace(1)* %out 2130 ret void 2131} 2132 2133define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) { 2134; Kernel under test - every lane atomically subtracts the constant 5 from the 64-bit @local_var64 (acq_rel) and stores the returned old value to %out. 2135; Expected code for every run - the lane whose v_mbcnt result is 0 issues a single ds_sub_rtn_u64 whose 64-bit operand is formed from v_mul_u32_u24 and v_mul_hi_u32_u24 of s_bcnt1 of the saved exec mask by 5; each lane then reads both result halves with v_readfirstlane and subtracts 5 times its own v_mbcnt value using a subtract plus subtract-with-borrow pair. 2136; GFX7LESS-LABEL: sub_i64_constant: 2137; GFX7LESS: ; %bb.0: ; %entry 2138; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec 2139; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 2140; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 2141; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 2142; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2143; GFX7LESS-NEXT: ; implicit-def: $vgpr1_vgpr2 
2144; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc 2145; GFX7LESS-NEXT: s_cbranch_execz BB11_2 2146; GFX7LESS-NEXT: ; %bb.1: 2147; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 2148; GFX7LESS-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 2149; GFX7LESS-NEXT: v_mul_hi_u32_u24_e64 v2, s4, 5 2150; GFX7LESS-NEXT: v_mul_u32_u24_e64 v1, s4, 5 2151; GFX7LESS-NEXT: s_mov_b32 m0, -1 2152; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2153; GFX7LESS-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] 2154; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2155; GFX7LESS-NEXT: buffer_wbinvl1 2156; GFX7LESS-NEXT: BB11_2: 2157; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] 2158; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v1 2159; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v2 2160; GFX7LESS-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v0 2161; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v0 2162; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 2163; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4 2164; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 2165; GFX7LESS-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc 2166; GFX7LESS-NEXT: s_mov_b32 s2, -1 2167; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2168; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2169; GFX7LESS-NEXT: s_endpgm 2170; 2171; GFX8-LABEL: sub_i64_constant: 2172; GFX8: ; %bb.0: ; %entry 2173; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2174; GFX8-NEXT: s_mov_b64 s[4:5], exec 2175; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 2176; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 2177; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2178; GFX8-NEXT: ; implicit-def: $vgpr1_vgpr2 2179; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 2180; GFX8-NEXT: s_cbranch_execz BB11_2 2181; GFX8-NEXT: ; %bb.1: 2182; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 2183; GFX8-NEXT: v_mul_hi_u32_u24_e64 v2, s4, 5 2184; GFX8-NEXT: v_mul_u32_u24_e64 v1, s4, 5 2185; GFX8-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 2186; GFX8-NEXT: s_mov_b32 m0, -1 2187; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2188; GFX8-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] 
2189; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2190; GFX8-NEXT: buffer_wbinvl1_vol 2191; GFX8-NEXT: BB11_2: 2192; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 2193; GFX8-NEXT: v_readfirstlane_b32 s3, v2 2194; GFX8-NEXT: v_readfirstlane_b32 s2, v1 2195; GFX8-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v0 2196; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v0 2197; GFX8-NEXT: v_mov_b32_e32 v2, s3 2198; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s2, v0 2199; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc 2200; GFX8-NEXT: s_mov_b32 s3, 0xf000 2201; GFX8-NEXT: s_mov_b32 s2, -1 2202; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2203; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2204; GFX8-NEXT: s_endpgm 2205; 2206; GFX9-LABEL: sub_i64_constant: 2207; GFX9: ; %bb.0: ; %entry 2208; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2209; GFX9-NEXT: s_mov_b64 s[4:5], exec 2210; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 2211; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 2212; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2213; GFX9-NEXT: ; implicit-def: $vgpr1_vgpr2 2214; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 2215; GFX9-NEXT: s_cbranch_execz BB11_2 2216; GFX9-NEXT: ; %bb.1: 2217; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 2218; GFX9-NEXT: v_mul_hi_u32_u24_e64 v2, s4, 5 2219; GFX9-NEXT: v_mul_u32_u24_e64 v1, s4, 5 2220; GFX9-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 2221; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2222; GFX9-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] 2223; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2224; GFX9-NEXT: buffer_wbinvl1_vol 2225; GFX9-NEXT: BB11_2: 2226; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 2227; GFX9-NEXT: v_readfirstlane_b32 s3, v2 2228; GFX9-NEXT: v_readfirstlane_b32 s2, v1 2229; GFX9-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v0 2230; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v0 2231; GFX9-NEXT: v_mov_b32_e32 v2, s3 2232; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s2, v0 2233; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc 2234; GFX9-NEXT: s_mov_b32 s3, 0xf000 2235; GFX9-NEXT: s_mov_b32 s2, -1 2236; GFX9-NEXT: s_waitcnt 
lgkmcnt(0) 2237; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2238; GFX9-NEXT: s_endpgm 2239; 2240; GFX1064-LABEL: sub_i64_constant: 2241; GFX1064: ; %bb.0: ; %entry 2242; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2243; GFX1064-NEXT: s_mov_b64 s[4:5], exec 2244; GFX1064-NEXT: ; implicit-def: $vgpr1_vgpr2 2245; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 2246; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s5, v0 2247; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2248; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc 2249; GFX1064-NEXT: s_cbranch_execz BB11_2 2250; GFX1064-NEXT: ; %bb.1: 2251; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 2252; GFX1064-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 2253; GFX1064-NEXT: v_mul_hi_u32_u24_e64 v2, s4, 5 2254; GFX1064-NEXT: v_mul_u32_u24_e64 v1, s4, 5 2255; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2256; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 2257; GFX1064-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] 2258; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2259; GFX1064-NEXT: buffer_gl0_inv 2260; GFX1064-NEXT: buffer_gl1_inv 2261; GFX1064-NEXT: BB11_2: 2262; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 2263; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] 2264; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 2265; GFX1064-NEXT: v_mul_u32_u24_e32 v1, 5, v0 2266; GFX1064-NEXT: v_readfirstlane_b32 s3, v2 2267; GFX1064-NEXT: v_mul_hi_u32_u24_e32 v2, 5, v0 2268; GFX1064-NEXT: v_sub_co_u32_e64 v0, vcc, s2, v1 2269; GFX1064-NEXT: s_mov_b32 s2, -1 2270; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v2, vcc 2271; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 2272; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2273; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2274; GFX1064-NEXT: s_endpgm 2275; 2276; GFX1032-LABEL: sub_i64_constant: 2277; GFX1032: ; %bb.0: ; %entry 2278; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2279; GFX1032-NEXT: s_mov_b32 s3, exec_lo 2280; GFX1032-NEXT: ; implicit-def: $vcc_hi 2281; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s3, 0 2282; 
GFX1032-NEXT: ; implicit-def: $vgpr1_vgpr2 2283; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 2284; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo 2285; GFX1032-NEXT: s_cbranch_execz BB11_2 2286; GFX1032-NEXT: ; %bb.1: 2287; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 2288; GFX1032-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 2289; GFX1032-NEXT: v_mul_hi_u32_u24_e64 v2, s3, 5 2290; GFX1032-NEXT: v_mul_u32_u24_e64 v1, s3, 5 2291; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2292; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 2293; GFX1032-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] 2294; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2295; GFX1032-NEXT: buffer_gl0_inv 2296; GFX1032-NEXT: buffer_gl1_inv 2297; GFX1032-NEXT: BB11_2: 2298; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 2299; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 2300; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 2301; GFX1032-NEXT: v_mul_u32_u24_e32 v1, 5, v0 2302; GFX1032-NEXT: v_readfirstlane_b32 s3, v2 2303; GFX1032-NEXT: v_mul_hi_u32_u24_e32 v2, 5, v0 2304; GFX1032-NEXT: v_sub_co_u32_e64 v0, vcc_lo, s2, v1 2305; GFX1032-NEXT: s_mov_b32 s2, -1 2306; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v2, vcc_lo 2307; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 2308; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2309; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2310; GFX1032-NEXT: s_endpgm 2311entry: 2312 %old = atomicrmw sub i64 addrspace(3)* @local_var64, i64 5 acq_rel 2313 store i64 %old, i64 addrspace(1)* %out 2314 ret void 2315} 2316 2317define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 %subitive) { 2318; 2319; 2320; GFX7LESS-LABEL: sub_i64_uniform: 2321; GFX7LESS: ; %bb.0: ; %entry 2322; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec 2323; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 2324; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 2325; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s7, v0 2326; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2327; GFX7LESS-NEXT: ; implicit-def: $vgpr1_vgpr2 2328; GFX7LESS-NEXT: 
s_and_saveexec_b64 s[4:5], vcc 2329; GFX7LESS-NEXT: s_cbranch_execz BB12_2 2330; GFX7LESS-NEXT: ; %bb.1: 2331; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 2332; GFX7LESS-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 2333; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2334; GFX7LESS-NEXT: s_mul_i32 s7, s3, s6 2335; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6 2336; GFX7LESS-NEXT: v_mul_hi_u32 v1, s2, v1 2337; GFX7LESS-NEXT: s_mul_i32 s6, s2, s6 2338; GFX7LESS-NEXT: v_add_i32_e32 v2, vcc, s7, v1 2339; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6 2340; GFX7LESS-NEXT: s_mov_b32 m0, -1 2341; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2342; GFX7LESS-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] 2343; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2344; GFX7LESS-NEXT: buffer_wbinvl1 2345; GFX7LESS-NEXT: BB12_2: 2346; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] 2347; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 2348; GFX7LESS-NEXT: s_mov_b32 s6, -1 2349; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2350; GFX7LESS-NEXT: s_mov_b32 s4, s0 2351; GFX7LESS-NEXT: s_mov_b32 s5, s1 2352; GFX7LESS-NEXT: v_readfirstlane_b32 s0, v1 2353; GFX7LESS-NEXT: v_readfirstlane_b32 s1, v2 2354; GFX7LESS-NEXT: v_mul_lo_u32 v1, s3, v0 2355; GFX7LESS-NEXT: v_mul_hi_u32 v2, s2, v0 2356; GFX7LESS-NEXT: v_mul_lo_u32 v0, s2, v0 2357; GFX7LESS-NEXT: v_add_i32_e32 v1, vcc, v2, v1 2358; GFX7LESS-NEXT: v_mov_b32_e32 v2, s1 2359; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 2360; GFX7LESS-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc 2361; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 2362; GFX7LESS-NEXT: s_endpgm 2363; 2364; GFX8-LABEL: sub_i64_uniform: 2365; GFX8: ; %bb.0: ; %entry 2366; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 2367; GFX8-NEXT: s_mov_b64 s[6:7], exec 2368; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 2369; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 2370; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2371; GFX8-NEXT: ; implicit-def: $vgpr1_vgpr2 2372; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 2373; GFX8-NEXT: s_cbranch_execz BB12_2 2374; 
GFX8-NEXT: ; %bb.1: 2375; GFX8-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 2376; GFX8-NEXT: v_mov_b32_e32 v1, s6 2377; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2378; GFX8-NEXT: v_mul_hi_u32 v1, s2, v1 2379; GFX8-NEXT: s_mul_i32 s7, s3, s6 2380; GFX8-NEXT: s_mul_i32 s6, s2, s6 2381; GFX8-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 2382; GFX8-NEXT: v_add_u32_e32 v2, vcc, s7, v1 2383; GFX8-NEXT: v_mov_b32_e32 v1, s6 2384; GFX8-NEXT: s_mov_b32 m0, -1 2385; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2386; GFX8-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] 2387; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2388; GFX8-NEXT: buffer_wbinvl1_vol 2389; GFX8-NEXT: BB12_2: 2390; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 2391; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2392; GFX8-NEXT: s_mov_b32 s4, s0 2393; GFX8-NEXT: v_readfirstlane_b32 s0, v1 2394; GFX8-NEXT: v_mul_lo_u32 v1, s3, v0 2395; GFX8-NEXT: v_mul_hi_u32 v3, s2, v0 2396; GFX8-NEXT: v_mul_lo_u32 v0, s2, v0 2397; GFX8-NEXT: s_mov_b32 s5, s1 2398; GFX8-NEXT: v_readfirstlane_b32 s1, v2 2399; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 2400; GFX8-NEXT: v_mov_b32_e32 v2, s1 2401; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s0, v0 2402; GFX8-NEXT: s_mov_b32 s7, 0xf000 2403; GFX8-NEXT: s_mov_b32 s6, -1 2404; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc 2405; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 2406; GFX8-NEXT: s_endpgm 2407; 2408; GFX9-LABEL: sub_i64_uniform: 2409; GFX9: ; %bb.0: ; %entry 2410; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 2411; GFX9-NEXT: s_mov_b64 s[6:7], exec 2412; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 2413; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 2414; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2415; GFX9-NEXT: ; implicit-def: $vgpr1_vgpr2 2416; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 2417; GFX9-NEXT: s_cbranch_execz BB12_2 2418; GFX9-NEXT: ; %bb.1: 2419; GFX9-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 2420; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2421; GFX9-NEXT: s_mul_i32 s7, s3, s6 2422; GFX9-NEXT: s_mul_hi_u32 s8, s2, s6 2423; GFX9-NEXT: s_add_i32 s8, 
s8, s7 2424; GFX9-NEXT: s_mul_i32 s6, s2, s6 2425; GFX9-NEXT: v_mov_b32_e32 v1, s6 2426; GFX9-NEXT: v_mov_b32_e32 v2, s8 2427; GFX9-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 2428; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2429; GFX9-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] 2430; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2431; GFX9-NEXT: buffer_wbinvl1_vol 2432; GFX9-NEXT: BB12_2: 2433; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 2434; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2435; GFX9-NEXT: v_mul_lo_u32 v3, s3, v0 2436; GFX9-NEXT: v_mul_hi_u32 v4, s2, v0 2437; GFX9-NEXT: v_mul_lo_u32 v0, s2, v0 2438; GFX9-NEXT: s_mov_b32 s4, s0 2439; GFX9-NEXT: v_readfirstlane_b32 s0, v1 2440; GFX9-NEXT: s_mov_b32 s5, s1 2441; GFX9-NEXT: v_readfirstlane_b32 s1, v2 2442; GFX9-NEXT: v_add_u32_e32 v1, v4, v3 2443; GFX9-NEXT: v_mov_b32_e32 v2, s1 2444; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s0, v0 2445; GFX9-NEXT: s_mov_b32 s7, 0xf000 2446; GFX9-NEXT: s_mov_b32 s6, -1 2447; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc 2448; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 2449; GFX9-NEXT: s_endpgm 2450; 2451; GFX1064-LABEL: sub_i64_uniform: 2452; GFX1064: ; %bb.0: ; %entry 2453; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 2454; GFX1064-NEXT: s_mov_b64 s[6:7], exec 2455; GFX1064-NEXT: ; implicit-def: $vgpr1_vgpr2 2456; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 2457; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s7, v0 2458; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2459; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 2460; GFX1064-NEXT: s_cbranch_execz BB12_2 2461; GFX1064-NEXT: ; %bb.1: 2462; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 2463; GFX1064-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 2464; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2465; GFX1064-NEXT: s_mul_i32 s7, s3, s6 2466; GFX1064-NEXT: s_mul_hi_u32 s8, s2, s6 2467; GFX1064-NEXT: s_mul_i32 s6, s2, s6 2468; GFX1064-NEXT: s_add_i32 s8, s8, s7 2469; GFX1064-NEXT: v_mov_b32_e32 v1, s6 2470; GFX1064-NEXT: v_mov_b32_e32 v2, s8 2471; 
GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2472; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 2473; GFX1064-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] 2474; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2475; GFX1064-NEXT: buffer_gl0_inv 2476; GFX1064-NEXT: buffer_gl1_inv 2477; GFX1064-NEXT: BB12_2: 2478; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 2479; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 2480; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2481; GFX1064-NEXT: v_mul_lo_u32 v3, s3, v0 2482; GFX1064-NEXT: v_mul_hi_u32 v4, s2, v0 2483; GFX1064-NEXT: v_mul_lo_u32 v0, s2, v0 2484; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 2485; GFX1064-NEXT: v_readfirstlane_b32 s4, v2 2486; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 2487; GFX1064-NEXT: v_add_nc_u32_e32 v1, v4, v3 2488; GFX1064-NEXT: v_sub_co_u32_e64 v0, vcc, s2, v0 2489; GFX1064-NEXT: s_mov_b32 s2, -1 2490; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s4, v1, vcc 2491; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2492; GFX1064-NEXT: s_endpgm 2493; 2494; GFX1032-LABEL: sub_i64_uniform: 2495; GFX1032: ; %bb.0: ; %entry 2496; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 2497; GFX1032-NEXT: s_mov_b32 s5, exec_lo 2498; GFX1032-NEXT: ; implicit-def: $vcc_hi 2499; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s5, 0 2500; GFX1032-NEXT: ; implicit-def: $vgpr1_vgpr2 2501; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 2502; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo 2503; GFX1032-NEXT: s_cbranch_execz BB12_2 2504; GFX1032-NEXT: ; %bb.1: 2505; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s5 2506; GFX1032-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 2507; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2508; GFX1032-NEXT: s_mul_i32 s6, s3, s5 2509; GFX1032-NEXT: s_mul_hi_u32 s7, s2, s5 2510; GFX1032-NEXT: s_mul_i32 s5, s2, s5 2511; GFX1032-NEXT: s_add_i32 s7, s7, s6 2512; GFX1032-NEXT: v_mov_b32_e32 v1, s5 2513; GFX1032-NEXT: v_mov_b32_e32 v2, s7 2514; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2515; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 2516; GFX1032-NEXT: 
ds_sub_rtn_u64 v[1:2], v3, v[1:2] 2517; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2518; GFX1032-NEXT: buffer_gl0_inv 2519; GFX1032-NEXT: buffer_gl1_inv 2520; GFX1032-NEXT: BB12_2: 2521; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 2522; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 2523; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2524; GFX1032-NEXT: v_mul_lo_u32 v3, s3, v0 2525; GFX1032-NEXT: v_mul_hi_u32 v4, s2, v0 2526; GFX1032-NEXT: v_mul_lo_u32 v0, s2, v0 2527; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 2528; GFX1032-NEXT: v_readfirstlane_b32 s4, v2 2529; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 2530; GFX1032-NEXT: v_add_nc_u32_e32 v1, v4, v3 2531; GFX1032-NEXT: v_sub_co_u32_e64 v0, vcc_lo, s2, v0 2532; GFX1032-NEXT: s_mov_b32 s2, -1 2533; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s4, v1, vcc_lo 2534; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2535; GFX1032-NEXT: s_endpgm 2536entry: 2537 %old = atomicrmw sub i64 addrspace(3)* @local_var64, i64 %subitive acq_rel 2538 store i64 %old, i64 addrspace(1)* %out 2539 ret void 2540} 2541; sub_i64_varying subtracts a per-lane value (zext of workitem.id.x) from @local_var64. The checks for every target expect a plain ds_sub_rtn_u64 with no mbcnt or s_bcnt1 sequence. NOTE(review): no RUN line at the top of this file passes GCN to -check-prefixes, so the three GCN-NOT directives below are never evaluated by FileCheck. The per-target -NEXT checks already pin the absence of those instructions, so either drop the directives or rewrite them with an enabled prefix. 2542; GCN-NOT: v_mbcnt_lo_u32_b32 2543; GCN-NOT: v_mbcnt_hi_u32_b32 2544; GCN-NOT: s_bcnt1_i32_b64 2545define amdgpu_kernel void @sub_i64_varying(i64 addrspace(1)* %out) { 2546; 2547; 2548; GFX7LESS-LABEL: sub_i64_varying: 2549; GFX7LESS: ; %bb.0: ; %entry 2550; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 2551; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 2552; GFX7LESS-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 2553; GFX7LESS-NEXT: s_mov_b32 m0, -1 2554; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2555; GFX7LESS-NEXT: ds_sub_rtn_u64 v[0:1], v2, v[0:1] 2556; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2557; GFX7LESS-NEXT: buffer_wbinvl1 2558; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 2559; GFX7LESS-NEXT: s_mov_b32 s2, -1 2560; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2561; GFX7LESS-NEXT: s_endpgm 2562; 2563; GFX8-LABEL: sub_i64_varying: 2564; GFX8: ; %bb.0: ; %entry 2565; GFX8-NEXT: v_mov_b32_e32 v1, 0 2566; GFX8-NEXT: 
v_mov_b32_e32 v2, local_var64@abs32@lo 2567; GFX8-NEXT: s_mov_b32 m0, -1 2568; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2569; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2570; GFX8-NEXT: ds_sub_rtn_u64 v[0:1], v2, v[0:1] 2571; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2572; GFX8-NEXT: buffer_wbinvl1_vol 2573; GFX8-NEXT: s_mov_b32 s3, 0xf000 2574; GFX8-NEXT: s_mov_b32 s2, -1 2575; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2576; GFX8-NEXT: s_endpgm 2577; 2578; GFX9-LABEL: sub_i64_varying: 2579; GFX9: ; %bb.0: ; %entry 2580; GFX9-NEXT: v_mov_b32_e32 v1, 0 2581; GFX9-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 2582; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2583; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2584; GFX9-NEXT: ds_sub_rtn_u64 v[0:1], v2, v[0:1] 2585; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2586; GFX9-NEXT: buffer_wbinvl1_vol 2587; GFX9-NEXT: s_mov_b32 s3, 0xf000 2588; GFX9-NEXT: s_mov_b32 s2, -1 2589; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2590; GFX9-NEXT: s_endpgm 2591; 2592; GFX1064-LABEL: sub_i64_varying: 2593; GFX1064: ; %bb.0: ; %entry 2594; GFX1064-NEXT: v_mov_b32_e32 v1, 0 2595; GFX1064-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 2596; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2597; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 2598; GFX1064-NEXT: s_mov_b32 s2, -1 2599; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2600; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 2601; GFX1064-NEXT: ds_sub_rtn_u64 v[0:1], v2, v[0:1] 2602; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2603; GFX1064-NEXT: buffer_gl0_inv 2604; GFX1064-NEXT: buffer_gl1_inv 2605; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2606; GFX1064-NEXT: s_endpgm 2607; 2608; GFX1032-LABEL: sub_i64_varying: 2609; GFX1032: ; %bb.0: ; %entry 2610; GFX1032-NEXT: v_mov_b32_e32 v1, 0 2611; GFX1032-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 2612; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2613; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 2614; GFX1032-NEXT: 
s_mov_b32 s2, -1 2615; GFX1032-NEXT: ; implicit-def: $vcc_hi 2616; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2617; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 2618; GFX1032-NEXT: ds_sub_rtn_u64 v[0:1], v2, v[0:1] 2619; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2620; GFX1032-NEXT: buffer_gl0_inv 2621; GFX1032-NEXT: buffer_gl1_inv 2622; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2623; GFX1032-NEXT: s_endpgm 2624entry: 2625 %lane = call i32 @llvm.amdgcn.workitem.id.x() 2626 %zext = zext i32 %lane to i64 2627 %old = atomicrmw sub i64 addrspace(3)* @local_var64, i64 %zext acq_rel 2628 store i64 %old, i64 addrspace(1)* %out 2629 ret void 2630} 2631; and_i32_varying applies atomicrmw and to @local_var32 with workitem.id.x as the operand. The GFX7LESS checks expect a bare ds_and_rtn_b32. The checks for GFX8 and later expect registers seeded with -1, a v_and_b32_dpp sequence, one ds_and_rtn_b32 inside an s_and_saveexec-guarded block, and a final v_and_b32 with the v_readfirstlane result before the store. 2632define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) { 2633; 2634; 2635; GFX7LESS-LABEL: and_i32_varying: 2636; GFX7LESS: ; %bb.0: ; %entry 2637; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 2638; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 2639; GFX7LESS-NEXT: s_mov_b32 m0, -1 2640; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2641; GFX7LESS-NEXT: ds_and_rtn_b32 v0, v1, v0 2642; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2643; GFX7LESS-NEXT: buffer_wbinvl1 2644; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 2645; GFX7LESS-NEXT: s_mov_b32 s2, -1 2646; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 2647; GFX7LESS-NEXT: s_endpgm 2648; 2649; GFX8-LABEL: and_i32_varying: 2650; GFX8: ; %bb.0: ; %entry 2651; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2652; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 2653; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 2654; GFX8-NEXT: v_mov_b32_e32 v2, v0 2655; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 2656; GFX8-NEXT: v_mov_b32_e32 v1, -1 2657; GFX8-NEXT: s_mov_b64 exec, s[2:3] 2658; GFX8-NEXT: s_not_b64 exec, exec 2659; GFX8-NEXT: v_mov_b32_e32 v2, -1 2660; GFX8-NEXT: s_not_b64 exec, exec 2661; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 2662; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 2663; GFX8-NEXT: s_nop 1 2664; 
GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 2665; GFX8-NEXT: s_nop 1 2666; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 2667; GFX8-NEXT: s_nop 1 2668; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 2669; GFX8-NEXT: s_nop 1 2670; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 2671; GFX8-NEXT: s_nop 1 2672; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 2673; GFX8-NEXT: v_readlane_b32 s2, v2, 63 2674; GFX8-NEXT: s_nop 0 2675; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 2676; GFX8-NEXT: s_mov_b64 exec, s[4:5] 2677; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 2678; GFX8-NEXT: ; implicit-def: $vgpr0 2679; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 2680; GFX8-NEXT: s_cbranch_execz BB14_2 2681; GFX8-NEXT: ; %bb.1: 2682; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 2683; GFX8-NEXT: v_mov_b32_e32 v3, s2 2684; GFX8-NEXT: s_mov_b32 m0, -1 2685; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2686; GFX8-NEXT: ds_and_rtn_b32 v0, v0, v3 2687; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2688; GFX8-NEXT: buffer_wbinvl1_vol 2689; GFX8-NEXT: BB14_2: 2690; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 2691; GFX8-NEXT: v_readfirstlane_b32 s2, v0 2692; GFX8-NEXT: v_mov_b32_e32 v0, v1 2693; GFX8-NEXT: v_and_b32_e32 v0, s2, v0 2694; GFX8-NEXT: s_mov_b32 s3, 0xf000 2695; GFX8-NEXT: s_mov_b32 s2, -1 2696; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2697; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 2698; GFX8-NEXT: s_endpgm 2699; 2700; GFX9-LABEL: and_i32_varying: 2701; GFX9: ; %bb.0: ; %entry 2702; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2703; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 2704; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 2705; GFX9-NEXT: v_mov_b32_e32 v2, v0 2706; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 2707; GFX9-NEXT: v_mov_b32_e32 v1, -1 2708; GFX9-NEXT: s_mov_b64 exec, s[2:3] 2709; GFX9-NEXT: s_not_b64 exec, exec 2710; 
GFX9-NEXT: v_mov_b32_e32 v2, -1 2711; GFX9-NEXT: s_not_b64 exec, exec 2712; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 2713; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 2714; GFX9-NEXT: s_nop 1 2715; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 2716; GFX9-NEXT: s_nop 1 2717; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 2718; GFX9-NEXT: s_nop 1 2719; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 2720; GFX9-NEXT: s_nop 1 2721; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 2722; GFX9-NEXT: s_nop 1 2723; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 2724; GFX9-NEXT: v_readlane_b32 s2, v2, 63 2725; GFX9-NEXT: s_nop 0 2726; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 2727; GFX9-NEXT: s_mov_b64 exec, s[4:5] 2728; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 2729; GFX9-NEXT: ; implicit-def: $vgpr0 2730; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 2731; GFX9-NEXT: s_cbranch_execz BB14_2 2732; GFX9-NEXT: ; %bb.1: 2733; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 2734; GFX9-NEXT: v_mov_b32_e32 v3, s2 2735; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2736; GFX9-NEXT: ds_and_rtn_b32 v0, v0, v3 2737; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2738; GFX9-NEXT: buffer_wbinvl1_vol 2739; GFX9-NEXT: BB14_2: 2740; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 2741; GFX9-NEXT: v_readfirstlane_b32 s2, v0 2742; GFX9-NEXT: v_mov_b32_e32 v0, v1 2743; GFX9-NEXT: v_and_b32_e32 v0, s2, v0 2744; GFX9-NEXT: s_mov_b32 s3, 0xf000 2745; GFX9-NEXT: s_mov_b32 s2, -1 2746; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2747; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 2748; GFX9-NEXT: s_endpgm 2749; 2750; GFX1064-LABEL: and_i32_varying: 2751; GFX1064: ; %bb.0: ; %entry 2752; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2753; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v4, exec_lo, 0 2754; GFX1064-NEXT: v_mov_b32_e32 v2, v0 2755; 
GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v4, exec_hi, v4 2756; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 2757; GFX1064-NEXT: v_mov_b32_e32 v1, -1 2758; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 2759; GFX1064-NEXT: s_not_b64 exec, exec 2760; GFX1064-NEXT: v_mov_b32_e32 v2, -1 2761; GFX1064-NEXT: s_not_b64 exec, exec 2762; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 2763; GFX1064-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 2764; GFX1064-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 2765; GFX1064-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 2766; GFX1064-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 2767; GFX1064-NEXT: v_mov_b32_e32 v3, v2 2768; GFX1064-NEXT: v_permlanex16_b32 v3, v3, -1, -1 2769; GFX1064-NEXT: v_and_b32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 2770; GFX1064-NEXT: v_readlane_b32 s2, v2, 31 2771; GFX1064-NEXT: v_mov_b32_e32 v3, s2 2772; GFX1064-NEXT: v_and_b32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 2773; GFX1064-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf 2774; GFX1064-NEXT: v_readlane_b32 s2, v2, 15 2775; GFX1064-NEXT: v_readlane_b32 s3, v2, 31 2776; GFX1064-NEXT: v_readlane_b32 s6, v2, 47 2777; GFX1064-NEXT: v_writelane_b32 v1, s2, 16 2778; GFX1064-NEXT: s_mov_b32 s2, -1 2779; GFX1064-NEXT: v_writelane_b32 v1, s3, 32 2780; GFX1064-NEXT: v_readlane_b32 s3, v2, 63 2781; GFX1064-NEXT: v_writelane_b32 v1, s6, 48 2782; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 2783; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 2784; GFX1064-NEXT: ; implicit-def: $vgpr0 2785; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 2786; GFX1064-NEXT: s_cbranch_execz BB14_2 2787; GFX1064-NEXT: ; %bb.1: 2788; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo 2789; GFX1064-NEXT: v_mov_b32_e32 v4, s3 2790; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2791; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 2792; GFX1064-NEXT: ds_and_rtn_b32 v0, v7, v4 2793; 
GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2794; GFX1064-NEXT: buffer_gl0_inv 2795; GFX1064-NEXT: buffer_gl1_inv 2796; GFX1064-NEXT: BB14_2: 2797; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 2798; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 2799; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 2800; GFX1064-NEXT: v_mov_b32_e32 v0, v1 2801; GFX1064-NEXT: v_and_b32_e32 v0, s3, v0 2802; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 2803; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2804; GFX1064-NEXT: s_nop 0 2805; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 2806; GFX1064-NEXT: s_endpgm 2807; 2808; GFX1032-LABEL: and_i32_varying: 2809; GFX1032: ; %bb.0: ; %entry 2810; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2811; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v4, exec_lo, 0 2812; GFX1032-NEXT: ; implicit-def: $vcc_hi 2813; GFX1032-NEXT: v_mov_b32_e32 v2, v0 2814; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 2815; GFX1032-NEXT: v_mov_b32_e32 v1, -1 2816; GFX1032-NEXT: s_mov_b32 exec_lo, s2 2817; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 2818; GFX1032-NEXT: v_mov_b32_e32 v2, -1 2819; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 2820; GFX1032-NEXT: s_or_saveexec_b32 s4, -1 2821; GFX1032-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 2822; GFX1032-NEXT: s_mov_b32 s2, -1 2823; GFX1032-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 2824; GFX1032-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 2825; GFX1032-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 2826; GFX1032-NEXT: v_mov_b32_e32 v3, v2 2827; GFX1032-NEXT: v_permlanex16_b32 v3, v3, -1, -1 2828; GFX1032-NEXT: v_and_b32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 2829; GFX1032-NEXT: v_readlane_b32 s3, v2, 31 2830; GFX1032-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf 2831; GFX1032-NEXT: v_readlane_b32 s5, v2, 15 2832; GFX1032-NEXT: v_writelane_b32 v1, s5, 16 2833; GFX1032-NEXT: s_mov_b32 exec_lo, s4 2834; GFX1032-NEXT: v_cmp_eq_u32_e32 
vcc_lo, 0, v4 2835; GFX1032-NEXT: ; implicit-def: $vgpr0 2836; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo 2837; GFX1032-NEXT: s_cbranch_execz BB14_2 2838; GFX1032-NEXT: ; %bb.1: 2839; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo 2840; GFX1032-NEXT: v_mov_b32_e32 v4, s3 2841; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2842; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 2843; GFX1032-NEXT: ds_and_rtn_b32 v0, v7, v4 2844; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2845; GFX1032-NEXT: buffer_gl0_inv 2846; GFX1032-NEXT: buffer_gl1_inv 2847; GFX1032-NEXT: BB14_2: 2848; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 2849; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 2850; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 2851; GFX1032-NEXT: v_mov_b32_e32 v0, v1 2852; GFX1032-NEXT: v_and_b32_e32 v0, s3, v0 2853; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 2854; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2855; GFX1032-NEXT: s_nop 0 2856; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 2857; GFX1032-NEXT: s_endpgm 2858entry: 2859 %lane = call i32 @llvm.amdgcn.workitem.id.x() 2860 %old = atomicrmw and i32 addrspace(3)* @local_var32, i32 %lane acq_rel 2861 store i32 %old, i32 addrspace(1)* %out 2862 ret void 2863} 2864; or_i32_varying applies atomicrmw or to @local_var32 with workitem.id.x as the operand. The GFX7LESS checks expect a bare ds_or_rtn_b32. The checks for GFX8 and later expect registers seeded with 0, a v_or_b32_dpp sequence, one ds_or_rtn_b32 inside an s_and_saveexec-guarded block, and a final v_or_b32 with the v_readfirstlane result before the store. 2865define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) { 2866; 2867; 2868; GFX7LESS-LABEL: or_i32_varying: 2869; GFX7LESS: ; %bb.0: ; %entry 2870; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 2871; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 2872; GFX7LESS-NEXT: s_mov_b32 m0, -1 2873; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2874; GFX7LESS-NEXT: ds_or_rtn_b32 v0, v1, v0 2875; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2876; GFX7LESS-NEXT: buffer_wbinvl1 2877; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 2878; GFX7LESS-NEXT: s_mov_b32 s2, -1 2879; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 2880; GFX7LESS-NEXT: s_endpgm 2881; 2882; GFX8-LABEL: or_i32_varying: 2883; GFX8: ; %bb.0: ; %entry 2884; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2885; 
GFX8-NEXT: s_mov_b64 s[2:3], exec 2886; GFX8-NEXT: v_mov_b32_e32 v2, v0 2887; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 2888; GFX8-NEXT: v_mov_b32_e32 v1, 0 2889; GFX8-NEXT: s_mov_b64 exec, s[4:5] 2890; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 2891; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 2892; GFX8-NEXT: s_not_b64 exec, exec 2893; GFX8-NEXT: v_mov_b32_e32 v2, 0 2894; GFX8-NEXT: s_not_b64 exec, exec 2895; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 2896; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 2897; GFX8-NEXT: s_nop 1 2898; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 2899; GFX8-NEXT: s_nop 1 2900; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 2901; GFX8-NEXT: s_nop 1 2902; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 2903; GFX8-NEXT: s_nop 1 2904; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 2905; GFX8-NEXT: s_nop 1 2906; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 2907; GFX8-NEXT: v_readlane_b32 s2, v2, 63 2908; GFX8-NEXT: s_nop 0 2909; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 2910; GFX8-NEXT: s_mov_b64 exec, s[4:5] 2911; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2912; GFX8-NEXT: ; implicit-def: $vgpr0 2913; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 2914; GFX8-NEXT: s_cbranch_execz BB15_2 2915; GFX8-NEXT: ; %bb.1: 2916; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 2917; GFX8-NEXT: v_mov_b32_e32 v3, s2 2918; GFX8-NEXT: s_mov_b32 m0, -1 2919; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2920; GFX8-NEXT: ds_or_rtn_b32 v0, v0, v3 2921; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2922; GFX8-NEXT: buffer_wbinvl1_vol 2923; GFX8-NEXT: BB15_2: 2924; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 2925; GFX8-NEXT: v_readfirstlane_b32 s2, v0 2926; GFX8-NEXT: v_mov_b32_e32 v0, v1 2927; GFX8-NEXT: v_or_b32_e32 v0, s2, v0 2928; GFX8-NEXT: 
s_mov_b32 s3, 0xf000 2929; GFX8-NEXT: s_mov_b32 s2, -1 2930; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2931; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 2932; GFX8-NEXT: s_endpgm 2933; 2934; GFX9-LABEL: or_i32_varying: 2935; GFX9: ; %bb.0: ; %entry 2936; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2937; GFX9-NEXT: s_mov_b64 s[2:3], exec 2938; GFX9-NEXT: v_mov_b32_e32 v2, v0 2939; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 2940; GFX9-NEXT: v_mov_b32_e32 v1, 0 2941; GFX9-NEXT: s_mov_b64 exec, s[4:5] 2942; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 2943; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 2944; GFX9-NEXT: s_not_b64 exec, exec 2945; GFX9-NEXT: v_mov_b32_e32 v2, 0 2946; GFX9-NEXT: s_not_b64 exec, exec 2947; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 2948; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 2949; GFX9-NEXT: s_nop 1 2950; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 2951; GFX9-NEXT: s_nop 1 2952; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 2953; GFX9-NEXT: s_nop 1 2954; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 2955; GFX9-NEXT: s_nop 1 2956; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 2957; GFX9-NEXT: s_nop 1 2958; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 2959; GFX9-NEXT: v_readlane_b32 s2, v2, 63 2960; GFX9-NEXT: s_nop 0 2961; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 2962; GFX9-NEXT: s_mov_b64 exec, s[4:5] 2963; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2964; GFX9-NEXT: ; implicit-def: $vgpr0 2965; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 2966; GFX9-NEXT: s_cbranch_execz BB15_2 2967; GFX9-NEXT: ; %bb.1: 2968; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 2969; GFX9-NEXT: v_mov_b32_e32 v3, s2 2970; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2971; GFX9-NEXT: ds_or_rtn_b32 v0, v0, v3 2972; GFX9-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) 2973; GFX9-NEXT: buffer_wbinvl1_vol 2974; GFX9-NEXT: BB15_2: 2975; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 2976; GFX9-NEXT: v_readfirstlane_b32 s2, v0 2977; GFX9-NEXT: v_mov_b32_e32 v0, v1 2978; GFX9-NEXT: v_or_b32_e32 v0, s2, v0 2979; GFX9-NEXT: s_mov_b32 s3, 0xf000 2980; GFX9-NEXT: s_mov_b32 s2, -1 2981; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2982; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 2983; GFX9-NEXT: s_endpgm 2984; 2985; GFX1064-LABEL: or_i32_varying: 2986; GFX1064: ; %bb.0: ; %entry 2987; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2988; GFX1064-NEXT: s_mov_b64 s[2:3], exec 2989; GFX1064-NEXT: v_mov_b32_e32 v2, v0 2990; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 2991; GFX1064-NEXT: v_mov_b32_e32 v1, 0 2992; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 2993; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 2994; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s3, v0 2995; GFX1064-NEXT: s_not_b64 exec, exec 2996; GFX1064-NEXT: v_mov_b32_e32 v2, 0 2997; GFX1064-NEXT: s_not_b64 exec, exec 2998; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 2999; GFX1064-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 3000; GFX1064-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 3001; GFX1064-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 3002; GFX1064-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 3003; GFX1064-NEXT: v_mov_b32_e32 v3, v2 3004; GFX1064-NEXT: v_permlanex16_b32 v3, v3, -1, -1 3005; GFX1064-NEXT: v_or_b32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 3006; GFX1064-NEXT: v_readlane_b32 s2, v2, 31 3007; GFX1064-NEXT: v_mov_b32_e32 v3, s2 3008; GFX1064-NEXT: v_or_b32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 3009; GFX1064-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3010; GFX1064-NEXT: v_readlane_b32 s2, v2, 15 3011; GFX1064-NEXT: v_readlane_b32 s3, v2, 31 3012; GFX1064-NEXT: 
v_readlane_b32 s6, v2, 47 3013; GFX1064-NEXT: v_writelane_b32 v1, s2, 16 3014; GFX1064-NEXT: s_mov_b32 s2, -1 3015; GFX1064-NEXT: v_writelane_b32 v1, s3, 32 3016; GFX1064-NEXT: v_readlane_b32 s3, v2, 63 3017; GFX1064-NEXT: v_writelane_b32 v1, s6, 48 3018; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 3019; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3020; GFX1064-NEXT: ; implicit-def: $vgpr0 3021; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 3022; GFX1064-NEXT: s_cbranch_execz BB15_2 3023; GFX1064-NEXT: ; %bb.1: 3024; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo 3025; GFX1064-NEXT: v_mov_b32_e32 v4, s3 3026; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3027; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 3028; GFX1064-NEXT: ds_or_rtn_b32 v0, v7, v4 3029; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3030; GFX1064-NEXT: buffer_gl0_inv 3031; GFX1064-NEXT: buffer_gl1_inv 3032; GFX1064-NEXT: BB15_2: 3033; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 3034; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 3035; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 3036; GFX1064-NEXT: v_mov_b32_e32 v0, v1 3037; GFX1064-NEXT: v_or_b32_e32 v0, s3, v0 3038; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 3039; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3040; GFX1064-NEXT: s_nop 0 3041; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 3042; GFX1064-NEXT: s_endpgm 3043; 3044; GFX1032-LABEL: or_i32_varying: 3045; GFX1032: ; %bb.0: ; %entry 3046; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3047; GFX1032-NEXT: s_mov_b32 s2, exec_lo 3048; GFX1032-NEXT: ; implicit-def: $vcc_hi 3049; GFX1032-NEXT: v_mov_b32_e32 v2, v0 3050; GFX1032-NEXT: s_or_saveexec_b32 s3, -1 3051; GFX1032-NEXT: v_mov_b32_e32 v1, 0 3052; GFX1032-NEXT: s_mov_b32 exec_lo, s3 3053; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 3054; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 3055; GFX1032-NEXT: v_mov_b32_e32 v2, 0 3056; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 3057; GFX1032-NEXT: s_or_saveexec_b32 s4, -1 3058; GFX1032-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 
row_mask:0xf bank_mask:0xf bound_ctrl:0 3059; GFX1032-NEXT: s_mov_b32 s2, -1 3060; GFX1032-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 3061; GFX1032-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 3062; GFX1032-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 3063; GFX1032-NEXT: v_mov_b32_e32 v3, v2 3064; GFX1032-NEXT: v_permlanex16_b32 v3, v3, -1, -1 3065; GFX1032-NEXT: v_or_b32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 3066; GFX1032-NEXT: v_readlane_b32 s3, v2, 31 3067; GFX1032-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3068; GFX1032-NEXT: v_readlane_b32 s5, v2, 15 3069; GFX1032-NEXT: v_writelane_b32 v1, s5, 16 3070; GFX1032-NEXT: s_mov_b32 exec_lo, s4 3071; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 3072; GFX1032-NEXT: ; implicit-def: $vgpr0 3073; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo 3074; GFX1032-NEXT: s_cbranch_execz BB15_2 3075; GFX1032-NEXT: ; %bb.1: 3076; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo 3077; GFX1032-NEXT: v_mov_b32_e32 v4, s3 3078; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3079; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 3080; GFX1032-NEXT: ds_or_rtn_b32 v0, v7, v4 3081; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3082; GFX1032-NEXT: buffer_gl0_inv 3083; GFX1032-NEXT: buffer_gl1_inv 3084; GFX1032-NEXT: BB15_2: 3085; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 3086; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 3087; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 3088; GFX1032-NEXT: v_mov_b32_e32 v0, v1 3089; GFX1032-NEXT: v_or_b32_e32 v0, s3, v0 3090; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 3091; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3092; GFX1032-NEXT: s_nop 0 3093; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 3094; GFX1032-NEXT: s_endpgm 3095entry: 3096 %lane = call i32 @llvm.amdgcn.workitem.id.x() 3097 %old = atomicrmw or i32 addrspace(3)* @local_var32, i32 %lane acq_rel 3098 store i32 %old, i32 
addrspace(1)* %out 3099 ret void 3100} 3101 3102define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) { 3103; 3104; 3105; GFX7LESS-LABEL: xor_i32_varying: 3106; GFX7LESS: ; %bb.0: ; %entry 3107; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 3108; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 3109; GFX7LESS-NEXT: s_mov_b32 m0, -1 3110; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3111; GFX7LESS-NEXT: ds_xor_rtn_b32 v0, v1, v0 3112; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3113; GFX7LESS-NEXT: buffer_wbinvl1 3114; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 3115; GFX7LESS-NEXT: s_mov_b32 s2, -1 3116; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 3117; GFX7LESS-NEXT: s_endpgm 3118; 3119; GFX8-LABEL: xor_i32_varying: 3120; GFX8: ; %bb.0: ; %entry 3121; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3122; GFX8-NEXT: s_mov_b64 s[2:3], exec 3123; GFX8-NEXT: v_mov_b32_e32 v2, v0 3124; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 3125; GFX8-NEXT: v_mov_b32_e32 v1, 0 3126; GFX8-NEXT: s_mov_b64 exec, s[4:5] 3127; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 3128; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 3129; GFX8-NEXT: s_not_b64 exec, exec 3130; GFX8-NEXT: v_mov_b32_e32 v2, 0 3131; GFX8-NEXT: s_not_b64 exec, exec 3132; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 3133; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 3134; GFX8-NEXT: s_nop 1 3135; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 3136; GFX8-NEXT: s_nop 1 3137; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 3138; GFX8-NEXT: s_nop 1 3139; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 3140; GFX8-NEXT: s_nop 1 3141; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 3142; GFX8-NEXT: s_nop 1 3143; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 3144; GFX8-NEXT: v_readlane_b32 s2, v2, 63 3145; 
GFX8-NEXT: s_nop 0 3146; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 3147; GFX8-NEXT: s_mov_b64 exec, s[4:5] 3148; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3149; GFX8-NEXT: ; implicit-def: $vgpr0 3150; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 3151; GFX8-NEXT: s_cbranch_execz BB16_2 3152; GFX8-NEXT: ; %bb.1: 3153; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 3154; GFX8-NEXT: v_mov_b32_e32 v3, s2 3155; GFX8-NEXT: s_mov_b32 m0, -1 3156; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3157; GFX8-NEXT: ds_xor_rtn_b32 v0, v0, v3 3158; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3159; GFX8-NEXT: buffer_wbinvl1_vol 3160; GFX8-NEXT: BB16_2: 3161; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 3162; GFX8-NEXT: v_readfirstlane_b32 s2, v0 3163; GFX8-NEXT: v_mov_b32_e32 v0, v1 3164; GFX8-NEXT: v_xor_b32_e32 v0, s2, v0 3165; GFX8-NEXT: s_mov_b32 s3, 0xf000 3166; GFX8-NEXT: s_mov_b32 s2, -1 3167; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3168; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 3169; GFX8-NEXT: s_endpgm 3170; 3171; GFX9-LABEL: xor_i32_varying: 3172; GFX9: ; %bb.0: ; %entry 3173; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3174; GFX9-NEXT: s_mov_b64 s[2:3], exec 3175; GFX9-NEXT: v_mov_b32_e32 v2, v0 3176; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 3177; GFX9-NEXT: v_mov_b32_e32 v1, 0 3178; GFX9-NEXT: s_mov_b64 exec, s[4:5] 3179; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 3180; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 3181; GFX9-NEXT: s_not_b64 exec, exec 3182; GFX9-NEXT: v_mov_b32_e32 v2, 0 3183; GFX9-NEXT: s_not_b64 exec, exec 3184; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 3185; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 3186; GFX9-NEXT: s_nop 1 3187; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 3188; GFX9-NEXT: s_nop 1 3189; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 3190; GFX9-NEXT: s_nop 1 3191; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:8 
row_mask:0xf bank_mask:0xf bound_ctrl:0 3192; GFX9-NEXT: s_nop 1 3193; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 3194; GFX9-NEXT: s_nop 1 3195; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 3196; GFX9-NEXT: v_readlane_b32 s2, v2, 63 3197; GFX9-NEXT: s_nop 0 3198; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 3199; GFX9-NEXT: s_mov_b64 exec, s[4:5] 3200; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3201; GFX9-NEXT: ; implicit-def: $vgpr0 3202; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 3203; GFX9-NEXT: s_cbranch_execz BB16_2 3204; GFX9-NEXT: ; %bb.1: 3205; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 3206; GFX9-NEXT: v_mov_b32_e32 v3, s2 3207; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3208; GFX9-NEXT: ds_xor_rtn_b32 v0, v0, v3 3209; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3210; GFX9-NEXT: buffer_wbinvl1_vol 3211; GFX9-NEXT: BB16_2: 3212; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 3213; GFX9-NEXT: v_readfirstlane_b32 s2, v0 3214; GFX9-NEXT: v_mov_b32_e32 v0, v1 3215; GFX9-NEXT: v_xor_b32_e32 v0, s2, v0 3216; GFX9-NEXT: s_mov_b32 s3, 0xf000 3217; GFX9-NEXT: s_mov_b32 s2, -1 3218; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3219; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 3220; GFX9-NEXT: s_endpgm 3221; 3222; GFX1064-LABEL: xor_i32_varying: 3223; GFX1064: ; %bb.0: ; %entry 3224; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3225; GFX1064-NEXT: s_mov_b64 s[2:3], exec 3226; GFX1064-NEXT: v_mov_b32_e32 v2, v0 3227; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 3228; GFX1064-NEXT: v_mov_b32_e32 v1, 0 3229; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 3230; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 3231; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s3, v0 3232; GFX1064-NEXT: s_not_b64 exec, exec 3233; GFX1064-NEXT: v_mov_b32_e32 v2, 0 3234; GFX1064-NEXT: s_not_b64 exec, exec 3235; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 3236; GFX1064-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 
bound_ctrl:0 3237; GFX1064-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 3238; GFX1064-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 3239; GFX1064-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 3240; GFX1064-NEXT: v_mov_b32_e32 v3, v2 3241; GFX1064-NEXT: v_permlanex16_b32 v3, v3, -1, -1 3242; GFX1064-NEXT: v_xor_b32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 3243; GFX1064-NEXT: v_readlane_b32 s2, v2, 31 3244; GFX1064-NEXT: v_mov_b32_e32 v3, s2 3245; GFX1064-NEXT: v_xor_b32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 3246; GFX1064-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3247; GFX1064-NEXT: v_readlane_b32 s2, v2, 15 3248; GFX1064-NEXT: v_readlane_b32 s3, v2, 31 3249; GFX1064-NEXT: v_readlane_b32 s6, v2, 47 3250; GFX1064-NEXT: v_writelane_b32 v1, s2, 16 3251; GFX1064-NEXT: s_mov_b32 s2, -1 3252; GFX1064-NEXT: v_writelane_b32 v1, s3, 32 3253; GFX1064-NEXT: v_readlane_b32 s3, v2, 63 3254; GFX1064-NEXT: v_writelane_b32 v1, s6, 48 3255; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 3256; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3257; GFX1064-NEXT: ; implicit-def: $vgpr0 3258; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 3259; GFX1064-NEXT: s_cbranch_execz BB16_2 3260; GFX1064-NEXT: ; %bb.1: 3261; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo 3262; GFX1064-NEXT: v_mov_b32_e32 v4, s3 3263; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3264; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 3265; GFX1064-NEXT: ds_xor_rtn_b32 v0, v7, v4 3266; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3267; GFX1064-NEXT: buffer_gl0_inv 3268; GFX1064-NEXT: buffer_gl1_inv 3269; GFX1064-NEXT: BB16_2: 3270; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 3271; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 3272; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 3273; GFX1064-NEXT: v_mov_b32_e32 v0, v1 3274; GFX1064-NEXT: v_xor_b32_e32 v0, s3, v0 3275; GFX1064-NEXT: 
s_mov_b32 s3, 0x31016000 3276; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3277; GFX1064-NEXT: s_nop 0 3278; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 3279; GFX1064-NEXT: s_endpgm 3280; 3281; GFX1032-LABEL: xor_i32_varying: 3282; GFX1032: ; %bb.0: ; %entry 3283; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3284; GFX1032-NEXT: s_mov_b32 s2, exec_lo 3285; GFX1032-NEXT: ; implicit-def: $vcc_hi 3286; GFX1032-NEXT: v_mov_b32_e32 v2, v0 3287; GFX1032-NEXT: s_or_saveexec_b32 s3, -1 3288; GFX1032-NEXT: v_mov_b32_e32 v1, 0 3289; GFX1032-NEXT: s_mov_b32 exec_lo, s3 3290; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 3291; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 3292; GFX1032-NEXT: v_mov_b32_e32 v2, 0 3293; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 3294; GFX1032-NEXT: s_or_saveexec_b32 s4, -1 3295; GFX1032-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 3296; GFX1032-NEXT: s_mov_b32 s2, -1 3297; GFX1032-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 3298; GFX1032-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 3299; GFX1032-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 3300; GFX1032-NEXT: v_mov_b32_e32 v3, v2 3301; GFX1032-NEXT: v_permlanex16_b32 v3, v3, -1, -1 3302; GFX1032-NEXT: v_xor_b32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 3303; GFX1032-NEXT: v_readlane_b32 s3, v2, 31 3304; GFX1032-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3305; GFX1032-NEXT: v_readlane_b32 s5, v2, 15 3306; GFX1032-NEXT: v_writelane_b32 v1, s5, 16 3307; GFX1032-NEXT: s_mov_b32 exec_lo, s4 3308; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 3309; GFX1032-NEXT: ; implicit-def: $vgpr0 3310; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo 3311; GFX1032-NEXT: s_cbranch_execz BB16_2 3312; GFX1032-NEXT: ; %bb.1: 3313; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo 3314; GFX1032-NEXT: v_mov_b32_e32 v4, s3 3315; GFX1032-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) 3316; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 3317; GFX1032-NEXT: ds_xor_rtn_b32 v0, v7, v4 3318; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3319; GFX1032-NEXT: buffer_gl0_inv 3320; GFX1032-NEXT: buffer_gl1_inv 3321; GFX1032-NEXT: BB16_2: 3322; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 3323; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 3324; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 3325; GFX1032-NEXT: v_mov_b32_e32 v0, v1 3326; GFX1032-NEXT: v_xor_b32_e32 v0, s3, v0 3327; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 3328; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3329; GFX1032-NEXT: s_nop 0 3330; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 3331; GFX1032-NEXT: s_endpgm 3332entry: 3333 %lane = call i32 @llvm.amdgcn.workitem.id.x() 3334 %old = atomicrmw xor i32 addrspace(3)* @local_var32, i32 %lane acq_rel 3335 store i32 %old, i32 addrspace(1)* %out 3336 ret void 3337} 3338 3339define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) { 3340; 3341; 3342; GFX7LESS-LABEL: max_i32_varying: 3343; GFX7LESS: ; %bb.0: ; %entry 3344; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 3345; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 3346; GFX7LESS-NEXT: s_mov_b32 m0, -1 3347; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3348; GFX7LESS-NEXT: ds_max_rtn_i32 v0, v1, v0 3349; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3350; GFX7LESS-NEXT: buffer_wbinvl1 3351; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 3352; GFX7LESS-NEXT: s_mov_b32 s2, -1 3353; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 3354; GFX7LESS-NEXT: s_endpgm 3355; 3356; GFX8-LABEL: max_i32_varying: 3357; GFX8: ; %bb.0: ; %entry 3358; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3359; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 3360; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 3361; GFX8-NEXT: v_mov_b32_e32 v2, v0 3362; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 3363; GFX8-NEXT: v_bfrev_b32_e32 v1, 1 3364; GFX8-NEXT: s_mov_b64 exec, s[2:3] 3365; GFX8-NEXT: s_not_b64 exec, exec 
3366; GFX8-NEXT: v_mov_b32_e32 v2, v1 3367; GFX8-NEXT: s_not_b64 exec, exec 3368; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 3369; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3370; GFX8-NEXT: s_nop 1 3371; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 3372; GFX8-NEXT: s_nop 1 3373; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 3374; GFX8-NEXT: s_nop 1 3375; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 3376; GFX8-NEXT: s_nop 1 3377; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 3378; GFX8-NEXT: s_nop 1 3379; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 3380; GFX8-NEXT: v_readlane_b32 s2, v2, 63 3381; GFX8-NEXT: s_nop 0 3382; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 3383; GFX8-NEXT: s_mov_b64 exec, s[4:5] 3384; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 3385; GFX8-NEXT: ; implicit-def: $vgpr0 3386; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 3387; GFX8-NEXT: s_cbranch_execz BB17_2 3388; GFX8-NEXT: ; %bb.1: 3389; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 3390; GFX8-NEXT: v_mov_b32_e32 v3, s2 3391; GFX8-NEXT: s_mov_b32 m0, -1 3392; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3393; GFX8-NEXT: ds_max_rtn_i32 v0, v0, v3 3394; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3395; GFX8-NEXT: buffer_wbinvl1_vol 3396; GFX8-NEXT: BB17_2: 3397; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 3398; GFX8-NEXT: v_readfirstlane_b32 s2, v0 3399; GFX8-NEXT: v_mov_b32_e32 v0, v1 3400; GFX8-NEXT: v_max_i32_e32 v0, s2, v0 3401; GFX8-NEXT: s_mov_b32 s3, 0xf000 3402; GFX8-NEXT: s_mov_b32 s2, -1 3403; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3404; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 3405; GFX8-NEXT: s_endpgm 3406; 3407; GFX9-LABEL: max_i32_varying: 3408; GFX9: ; %bb.0: ; %entry 3409; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3410; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 3411; GFX9-NEXT: 
v_mbcnt_hi_u32_b32 v3, exec_hi, v3 3412; GFX9-NEXT: v_mov_b32_e32 v2, v0 3413; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 3414; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 3415; GFX9-NEXT: s_mov_b64 exec, s[2:3] 3416; GFX9-NEXT: s_not_b64 exec, exec 3417; GFX9-NEXT: v_mov_b32_e32 v2, v1 3418; GFX9-NEXT: s_not_b64 exec, exec 3419; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 3420; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3421; GFX9-NEXT: s_nop 1 3422; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 3423; GFX9-NEXT: s_nop 1 3424; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 3425; GFX9-NEXT: s_nop 1 3426; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 3427; GFX9-NEXT: s_nop 1 3428; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 3429; GFX9-NEXT: s_nop 1 3430; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 3431; GFX9-NEXT: v_readlane_b32 s2, v2, 63 3432; GFX9-NEXT: s_nop 0 3433; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 3434; GFX9-NEXT: s_mov_b64 exec, s[4:5] 3435; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 3436; GFX9-NEXT: ; implicit-def: $vgpr0 3437; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 3438; GFX9-NEXT: s_cbranch_execz BB17_2 3439; GFX9-NEXT: ; %bb.1: 3440; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 3441; GFX9-NEXT: v_mov_b32_e32 v3, s2 3442; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3443; GFX9-NEXT: ds_max_rtn_i32 v0, v0, v3 3444; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3445; GFX9-NEXT: buffer_wbinvl1_vol 3446; GFX9-NEXT: BB17_2: 3447; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 3448; GFX9-NEXT: v_readfirstlane_b32 s2, v0 3449; GFX9-NEXT: v_mov_b32_e32 v0, v1 3450; GFX9-NEXT: v_max_i32_e32 v0, s2, v0 3451; GFX9-NEXT: s_mov_b32 s3, 0xf000 3452; GFX9-NEXT: s_mov_b32 s2, -1 3453; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3454; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 3455; GFX9-NEXT: s_endpgm 
3456; 3457; GFX1064-LABEL: max_i32_varying: 3458; GFX1064: ; %bb.0: ; %entry 3459; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3460; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v4, exec_lo, 0 3461; GFX1064-NEXT: v_mov_b32_e32 v2, v0 3462; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v4, exec_hi, v4 3463; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 3464; GFX1064-NEXT: v_bfrev_b32_e32 v1, 1 3465; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 3466; GFX1064-NEXT: s_not_b64 exec, exec 3467; GFX1064-NEXT: v_mov_b32_e32 v2, v1 3468; GFX1064-NEXT: s_not_b64 exec, exec 3469; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 3470; GFX1064-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3471; GFX1064-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 3472; GFX1064-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 3473; GFX1064-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 3474; GFX1064-NEXT: v_mov_b32_e32 v3, v2 3475; GFX1064-NEXT: v_permlanex16_b32 v3, v3, -1, -1 3476; GFX1064-NEXT: v_max_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 3477; GFX1064-NEXT: v_readlane_b32 s2, v2, 31 3478; GFX1064-NEXT: v_mov_b32_e32 v3, s2 3479; GFX1064-NEXT: v_max_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 3480; GFX1064-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3481; GFX1064-NEXT: v_readlane_b32 s2, v2, 15 3482; GFX1064-NEXT: v_readlane_b32 s3, v2, 31 3483; GFX1064-NEXT: v_readlane_b32 s6, v2, 47 3484; GFX1064-NEXT: v_writelane_b32 v1, s2, 16 3485; GFX1064-NEXT: s_mov_b32 s2, -1 3486; GFX1064-NEXT: v_writelane_b32 v1, s3, 32 3487; GFX1064-NEXT: v_readlane_b32 s3, v2, 63 3488; GFX1064-NEXT: v_writelane_b32 v1, s6, 48 3489; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 3490; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 3491; GFX1064-NEXT: ; implicit-def: $vgpr0 3492; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 3493; GFX1064-NEXT: s_cbranch_execz BB17_2 3494; GFX1064-NEXT: ; %bb.1: 3495; 
GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo 3496; GFX1064-NEXT: v_mov_b32_e32 v4, s3 3497; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3498; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 3499; GFX1064-NEXT: ds_max_rtn_i32 v0, v7, v4 3500; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3501; GFX1064-NEXT: buffer_gl0_inv 3502; GFX1064-NEXT: buffer_gl1_inv 3503; GFX1064-NEXT: BB17_2: 3504; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 3505; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 3506; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 3507; GFX1064-NEXT: v_mov_b32_e32 v0, v1 3508; GFX1064-NEXT: v_max_i32_e32 v0, s3, v0 3509; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 3510; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3511; GFX1064-NEXT: s_nop 0 3512; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 3513; GFX1064-NEXT: s_endpgm 3514; 3515; GFX1032-LABEL: max_i32_varying: 3516; GFX1032: ; %bb.0: ; %entry 3517; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3518; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v4, exec_lo, 0 3519; GFX1032-NEXT: ; implicit-def: $vcc_hi 3520; GFX1032-NEXT: v_mov_b32_e32 v2, v0 3521; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3522; GFX1032-NEXT: v_bfrev_b32_e32 v1, 1 3523; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3524; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 3525; GFX1032-NEXT: v_mov_b32_e32 v2, v1 3526; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 3527; GFX1032-NEXT: s_or_saveexec_b32 s4, -1 3528; GFX1032-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3529; GFX1032-NEXT: s_mov_b32 s2, -1 3530; GFX1032-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 3531; GFX1032-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 3532; GFX1032-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 3533; GFX1032-NEXT: v_mov_b32_e32 v3, v2 3534; GFX1032-NEXT: v_permlanex16_b32 v3, v3, -1, -1 3535; GFX1032-NEXT: v_max_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 3536; GFX1032-NEXT: v_readlane_b32 s3, v2, 31 3537; 
GFX1032-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3538; GFX1032-NEXT: v_readlane_b32 s5, v2, 15 3539; GFX1032-NEXT: v_writelane_b32 v1, s5, 16 3540; GFX1032-NEXT: s_mov_b32 exec_lo, s4 3541; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 3542; GFX1032-NEXT: ; implicit-def: $vgpr0 3543; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo 3544; GFX1032-NEXT: s_cbranch_execz BB17_2 3545; GFX1032-NEXT: ; %bb.1: 3546; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo 3547; GFX1032-NEXT: v_mov_b32_e32 v4, s3 3548; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3549; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 3550; GFX1032-NEXT: ds_max_rtn_i32 v0, v7, v4 3551; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3552; GFX1032-NEXT: buffer_gl0_inv 3553; GFX1032-NEXT: buffer_gl1_inv 3554; GFX1032-NEXT: BB17_2: 3555; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 3556; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 3557; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 3558; GFX1032-NEXT: v_mov_b32_e32 v0, v1 3559; GFX1032-NEXT: v_max_i32_e32 v0, s3, v0 3560; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 3561; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3562; GFX1032-NEXT: s_nop 0 3563; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 3564; GFX1032-NEXT: s_endpgm 3565entry: 3566 %lane = call i32 @llvm.amdgcn.workitem.id.x() 3567 %old = atomicrmw max i32 addrspace(3)* @local_var32, i32 %lane acq_rel 3568 store i32 %old, i32 addrspace(1)* %out 3569 ret void 3570} 3571 3572define amdgpu_kernel void @max_i64_constant(i64 addrspace(1)* %out) { 3573; 3574; 3575; GFX7LESS-LABEL: max_i64_constant: 3576; GFX7LESS: ; %bb.0: ; %entry 3577; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 3578; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 3579; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 3580; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3581; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 3582; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc 3583; GFX7LESS-NEXT: s_cbranch_execz BB18_2 3584; GFX7LESS-NEXT: 
; %bb.1: 3585; GFX7LESS-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 3586; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5 3587; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 3588; GFX7LESS-NEXT: s_mov_b32 m0, -1 3589; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3590; GFX7LESS-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] 3591; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3592; GFX7LESS-NEXT: buffer_wbinvl1 3593; GFX7LESS-NEXT: BB18_2: 3594; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] 3595; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 3596; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 3597; GFX7LESS-NEXT: v_bfrev_b32_e32 v1, 1 3598; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 3599; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 3600; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc 3601; GFX7LESS-NEXT: v_mov_b32_e32 v2, s5 3602; GFX7LESS-NEXT: v_mov_b32_e32 v3, s4 3603; GFX7LESS-NEXT: v_cmp_gt_i64_e32 vcc, s[4:5], v[0:1] 3604; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 3605; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 3606; GFX7LESS-NEXT: s_mov_b32 s2, -1 3607; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3608; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 3609; GFX7LESS-NEXT: s_endpgm 3610; 3611; GFX8-LABEL: max_i64_constant: 3612; GFX8: ; %bb.0: ; %entry 3613; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3614; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3615; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 3616; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3617; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 3618; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 3619; GFX8-NEXT: s_cbranch_execz BB18_2 3620; GFX8-NEXT: ; %bb.1: 3621; GFX8-NEXT: v_mov_b32_e32 v0, 5 3622; GFX8-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 3623; GFX8-NEXT: v_mov_b32_e32 v1, 0 3624; GFX8-NEXT: s_mov_b32 m0, -1 3625; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3626; GFX8-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] 3627; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3628; GFX8-NEXT: buffer_wbinvl1_vol 3629; GFX8-NEXT: BB18_2: 3630; GFX8-NEXT: 
s_or_b64 exec, exec, s[2:3] 3631; GFX8-NEXT: v_readfirstlane_b32 s2, v0 3632; GFX8-NEXT: v_bfrev_b32_e32 v0, 1 3633; GFX8-NEXT: v_readfirstlane_b32 s3, v1 3634; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc 3635; GFX8-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 3636; GFX8-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1] 3637; GFX8-NEXT: v_mov_b32_e32 v2, s3 3638; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 3639; GFX8-NEXT: v_mov_b32_e32 v2, s2 3640; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 3641; GFX8-NEXT: s_mov_b32 s3, 0xf000 3642; GFX8-NEXT: s_mov_b32 s2, -1 3643; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3644; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 3645; GFX8-NEXT: s_endpgm 3646; 3647; GFX9-LABEL: max_i64_constant: 3648; GFX9: ; %bb.0: ; %entry 3649; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3650; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3651; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 3652; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3653; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 3654; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 3655; GFX9-NEXT: s_cbranch_execz BB18_2 3656; GFX9-NEXT: ; %bb.1: 3657; GFX9-NEXT: v_mov_b32_e32 v0, 5 3658; GFX9-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 3659; GFX9-NEXT: v_mov_b32_e32 v1, 0 3660; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3661; GFX9-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] 3662; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3663; GFX9-NEXT: buffer_wbinvl1_vol 3664; GFX9-NEXT: BB18_2: 3665; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 3666; GFX9-NEXT: v_readfirstlane_b32 s2, v0 3667; GFX9-NEXT: v_bfrev_b32_e32 v0, 1 3668; GFX9-NEXT: v_readfirstlane_b32 s3, v1 3669; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc 3670; GFX9-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 3671; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1] 3672; GFX9-NEXT: v_mov_b32_e32 v2, s3 3673; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 3674; GFX9-NEXT: v_mov_b32_e32 v2, s2 3675; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 3676; GFX9-NEXT: s_mov_b32 s3, 0xf000 3677; 
GFX9-NEXT: s_mov_b32 s2, -1 3678; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3679; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 3680; GFX9-NEXT: s_endpgm 3681; 3682; GFX1064-LABEL: max_i64_constant: 3683; GFX1064: ; %bb.0: ; %entry 3684; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3685; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 3686; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0 3687; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3688; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 3689; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc 3690; GFX1064-NEXT: s_cbranch_execz BB18_2 3691; GFX1064-NEXT: ; %bb.1: 3692; GFX1064-NEXT: v_mov_b32_e32 v0, 5 3693; GFX1064-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 3694; GFX1064-NEXT: v_mov_b32_e32 v1, 0 3695; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3696; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 3697; GFX1064-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] 3698; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3699; GFX1064-NEXT: buffer_gl0_inv 3700; GFX1064-NEXT: buffer_gl1_inv 3701; GFX1064-NEXT: BB18_2: 3702; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 3703; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] 3704; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 3705; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 3706; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc 3707; GFX1064-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 3708; GFX1064-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1] 3709; GFX1064-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc 3710; GFX1064-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc 3711; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 3712; GFX1064-NEXT: s_mov_b32 s2, -1 3713; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3714; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 3715; GFX1064-NEXT: s_endpgm 3716; 3717; GFX1032-LABEL: max_i64_constant: 3718; GFX1032: ; %bb.0: ; %entry 3719; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3720; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 3721; GFX1032-NEXT: ; implicit-def: $vcc_hi 3722; GFX1032-NEXT: 
v_cmp_eq_u32_e32 vcc_lo, 0, v0 3723; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 3724; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo 3725; GFX1032-NEXT: s_cbranch_execz BB18_2 3726; GFX1032-NEXT: ; %bb.1: 3727; GFX1032-NEXT: v_mov_b32_e32 v0, 5 3728; GFX1032-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 3729; GFX1032-NEXT: v_mov_b32_e32 v1, 0 3730; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3731; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 3732; GFX1032-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] 3733; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3734; GFX1032-NEXT: buffer_gl0_inv 3735; GFX1032-NEXT: buffer_gl1_inv 3736; GFX1032-NEXT: BB18_2: 3737; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 3738; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 3739; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 3740; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 3741; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc_lo 3742; GFX1032-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc_lo 3743; GFX1032-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[2:3], v[0:1] 3744; GFX1032-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo 3745; GFX1032-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo 3746; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 3747; GFX1032-NEXT: s_mov_b32 s2, -1 3748; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3749; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 3750; GFX1032-NEXT: s_endpgm 3751entry: 3752 %old = atomicrmw max i64 addrspace(3)* @local_var64, i64 5 acq_rel 3753 store i64 %old, i64 addrspace(1)* %out 3754 ret void 3755} 3756 3757define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) { 3758; 3759; 3760; GFX7LESS-LABEL: min_i32_varying: 3761; GFX7LESS: ; %bb.0: ; %entry 3762; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 3763; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 3764; GFX7LESS-NEXT: s_mov_b32 m0, -1 3765; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3766; GFX7LESS-NEXT: ds_min_rtn_i32 v0, v1, v0 3767; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3768; GFX7LESS-NEXT: buffer_wbinvl1 3769; 
GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 3770; GFX7LESS-NEXT: s_mov_b32 s2, -1 3771; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 3772; GFX7LESS-NEXT: s_endpgm 3773; 3774; GFX8-LABEL: min_i32_varying: 3775; GFX8: ; %bb.0: ; %entry 3776; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3777; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 3778; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 3779; GFX8-NEXT: v_mov_b32_e32 v2, v0 3780; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 3781; GFX8-NEXT: v_bfrev_b32_e32 v1, -2 3782; GFX8-NEXT: s_mov_b64 exec, s[2:3] 3783; GFX8-NEXT: s_not_b64 exec, exec 3784; GFX8-NEXT: v_mov_b32_e32 v2, v1 3785; GFX8-NEXT: s_not_b64 exec, exec 3786; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 3787; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3788; GFX8-NEXT: s_nop 1 3789; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 3790; GFX8-NEXT: s_nop 1 3791; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 3792; GFX8-NEXT: s_nop 1 3793; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 3794; GFX8-NEXT: s_nop 1 3795; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 3796; GFX8-NEXT: s_nop 1 3797; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 3798; GFX8-NEXT: v_readlane_b32 s2, v2, 63 3799; GFX8-NEXT: s_nop 0 3800; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 3801; GFX8-NEXT: s_mov_b64 exec, s[4:5] 3802; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 3803; GFX8-NEXT: ; implicit-def: $vgpr0 3804; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 3805; GFX8-NEXT: s_cbranch_execz BB19_2 3806; GFX8-NEXT: ; %bb.1: 3807; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 3808; GFX8-NEXT: v_mov_b32_e32 v3, s2 3809; GFX8-NEXT: s_mov_b32 m0, -1 3810; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3811; GFX8-NEXT: ds_min_rtn_i32 v0, v0, v3 3812; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3813; GFX8-NEXT: 
buffer_wbinvl1_vol 3814; GFX8-NEXT: BB19_2: 3815; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 3816; GFX8-NEXT: v_readfirstlane_b32 s2, v0 3817; GFX8-NEXT: v_mov_b32_e32 v0, v1 3818; GFX8-NEXT: v_min_i32_e32 v0, s2, v0 3819; GFX8-NEXT: s_mov_b32 s3, 0xf000 3820; GFX8-NEXT: s_mov_b32 s2, -1 3821; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3822; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 3823; GFX8-NEXT: s_endpgm 3824; 3825; GFX9-LABEL: min_i32_varying: 3826; GFX9: ; %bb.0: ; %entry 3827; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3828; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 3829; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 3830; GFX9-NEXT: v_mov_b32_e32 v2, v0 3831; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 3832; GFX9-NEXT: v_bfrev_b32_e32 v1, -2 3833; GFX9-NEXT: s_mov_b64 exec, s[2:3] 3834; GFX9-NEXT: s_not_b64 exec, exec 3835; GFX9-NEXT: v_mov_b32_e32 v2, v1 3836; GFX9-NEXT: s_not_b64 exec, exec 3837; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 3838; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3839; GFX9-NEXT: s_nop 1 3840; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 3841; GFX9-NEXT: s_nop 1 3842; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 3843; GFX9-NEXT: s_nop 1 3844; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 3845; GFX9-NEXT: s_nop 1 3846; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 3847; GFX9-NEXT: s_nop 1 3848; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 3849; GFX9-NEXT: v_readlane_b32 s2, v2, 63 3850; GFX9-NEXT: s_nop 0 3851; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 3852; GFX9-NEXT: s_mov_b64 exec, s[4:5] 3853; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 3854; GFX9-NEXT: ; implicit-def: $vgpr0 3855; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 3856; GFX9-NEXT: s_cbranch_execz BB19_2 3857; GFX9-NEXT: ; %bb.1: 3858; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 
3859; GFX9-NEXT: v_mov_b32_e32 v3, s2 3860; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3861; GFX9-NEXT: ds_min_rtn_i32 v0, v0, v3 3862; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3863; GFX9-NEXT: buffer_wbinvl1_vol 3864; GFX9-NEXT: BB19_2: 3865; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 3866; GFX9-NEXT: v_readfirstlane_b32 s2, v0 3867; GFX9-NEXT: v_mov_b32_e32 v0, v1 3868; GFX9-NEXT: v_min_i32_e32 v0, s2, v0 3869; GFX9-NEXT: s_mov_b32 s3, 0xf000 3870; GFX9-NEXT: s_mov_b32 s2, -1 3871; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3872; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 3873; GFX9-NEXT: s_endpgm 3874; 3875; GFX1064-LABEL: min_i32_varying: 3876; GFX1064: ; %bb.0: ; %entry 3877; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3878; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v4, exec_lo, 0 3879; GFX1064-NEXT: v_mov_b32_e32 v2, v0 3880; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v4, exec_hi, v4 3881; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 3882; GFX1064-NEXT: v_bfrev_b32_e32 v1, -2 3883; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 3884; GFX1064-NEXT: s_not_b64 exec, exec 3885; GFX1064-NEXT: v_mov_b32_e32 v2, v1 3886; GFX1064-NEXT: s_not_b64 exec, exec 3887; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 3888; GFX1064-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3889; GFX1064-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 3890; GFX1064-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 3891; GFX1064-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 3892; GFX1064-NEXT: v_mov_b32_e32 v3, v2 3893; GFX1064-NEXT: v_permlanex16_b32 v3, v3, -1, -1 3894; GFX1064-NEXT: v_min_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 3895; GFX1064-NEXT: v_readlane_b32 s2, v2, 31 3896; GFX1064-NEXT: v_mov_b32_e32 v3, s2 3897; GFX1064-NEXT: v_min_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 3898; GFX1064-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3899; GFX1064-NEXT: v_readlane_b32 
s2, v2, 15 3900; GFX1064-NEXT: v_readlane_b32 s3, v2, 31 3901; GFX1064-NEXT: v_readlane_b32 s6, v2, 47 3902; GFX1064-NEXT: v_writelane_b32 v1, s2, 16 3903; GFX1064-NEXT: s_mov_b32 s2, -1 3904; GFX1064-NEXT: v_writelane_b32 v1, s3, 32 3905; GFX1064-NEXT: v_readlane_b32 s3, v2, 63 3906; GFX1064-NEXT: v_writelane_b32 v1, s6, 48 3907; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 3908; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 3909; GFX1064-NEXT: ; implicit-def: $vgpr0 3910; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 3911; GFX1064-NEXT: s_cbranch_execz BB19_2 3912; GFX1064-NEXT: ; %bb.1: 3913; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo 3914; GFX1064-NEXT: v_mov_b32_e32 v4, s3 3915; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3916; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 3917; GFX1064-NEXT: ds_min_rtn_i32 v0, v7, v4 3918; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3919; GFX1064-NEXT: buffer_gl0_inv 3920; GFX1064-NEXT: buffer_gl1_inv 3921; GFX1064-NEXT: BB19_2: 3922; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 3923; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 3924; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 3925; GFX1064-NEXT: v_mov_b32_e32 v0, v1 3926; GFX1064-NEXT: v_min_i32_e32 v0, s3, v0 3927; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 3928; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3929; GFX1064-NEXT: s_nop 0 3930; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 3931; GFX1064-NEXT: s_endpgm 3932; 3933; GFX1032-LABEL: min_i32_varying: 3934; GFX1032: ; %bb.0: ; %entry 3935; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3936; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v4, exec_lo, 0 3937; GFX1032-NEXT: ; implicit-def: $vcc_hi 3938; GFX1032-NEXT: v_mov_b32_e32 v2, v0 3939; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3940; GFX1032-NEXT: v_bfrev_b32_e32 v1, -2 3941; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3942; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 3943; GFX1032-NEXT: v_mov_b32_e32 v2, v1 3944; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 3945; GFX1032-NEXT: s_or_saveexec_b32 s4, -1 3946; 
GFX1032-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3947; GFX1032-NEXT: s_mov_b32 s2, -1 3948; GFX1032-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 3949; GFX1032-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 3950; GFX1032-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 3951; GFX1032-NEXT: v_mov_b32_e32 v3, v2 3952; GFX1032-NEXT: v_permlanex16_b32 v3, v3, -1, -1 3953; GFX1032-NEXT: v_min_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 3954; GFX1032-NEXT: v_readlane_b32 s3, v2, 31 3955; GFX1032-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3956; GFX1032-NEXT: v_readlane_b32 s5, v2, 15 3957; GFX1032-NEXT: v_writelane_b32 v1, s5, 16 3958; GFX1032-NEXT: s_mov_b32 exec_lo, s4 3959; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 3960; GFX1032-NEXT: ; implicit-def: $vgpr0 3961; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo 3962; GFX1032-NEXT: s_cbranch_execz BB19_2 3963; GFX1032-NEXT: ; %bb.1: 3964; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo 3965; GFX1032-NEXT: v_mov_b32_e32 v4, s3 3966; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3967; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 3968; GFX1032-NEXT: ds_min_rtn_i32 v0, v7, v4 3969; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3970; GFX1032-NEXT: buffer_gl0_inv 3971; GFX1032-NEXT: buffer_gl1_inv 3972; GFX1032-NEXT: BB19_2: 3973; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 3974; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 3975; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 3976; GFX1032-NEXT: v_mov_b32_e32 v0, v1 3977; GFX1032-NEXT: v_min_i32_e32 v0, s3, v0 3978; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 3979; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3980; GFX1032-NEXT: s_nop 0 3981; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 3982; GFX1032-NEXT: s_endpgm 3983entry: 3984 %lane = call i32 @llvm.amdgcn.workitem.id.x() 3985 %old = atomicrmw min i32 addrspace(3)* @local_var32, i32 %lane acq_rel 3986 store i32 %old, 
i32 addrspace(1)* %out 3987 ret void 3988} 3989 3990define amdgpu_kernel void @min_i64_constant(i64 addrspace(1)* %out) { 3991; Signed i64 atomicrmw min of the constant 5 on @local_var64 (addrspace(3)) with acq_rel ordering, the returned old value is stored to %out. 3992; The assertions below are autogenerated (see the NOTE at the top of the file), in their current form every run expects one ds_min_rtn_i64 guarded by s_and_saveexec on a v_cmp_eq_u32 of the mbcnt result against 0. 3993; GFX7LESS-LABEL: min_i64_constant: 3994; GFX7LESS: ; %bb.0: ; %entry 3995; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 3996; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 3997; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 3998; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3999; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 4000; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc 4001; GFX7LESS-NEXT: s_cbranch_execz BB20_2 4002; GFX7LESS-NEXT: ; %bb.1: 4003; GFX7LESS-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 4004; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5 4005; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 4006; GFX7LESS-NEXT: s_mov_b32 m0, -1 4007; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4008; GFX7LESS-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] 4009; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4010; GFX7LESS-NEXT: buffer_wbinvl1 4011; GFX7LESS-NEXT: BB20_2: 4012; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] 4013; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 4014; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 4015; GFX7LESS-NEXT: v_bfrev_b32_e32 v1, -2 4016; GFX7LESS-NEXT: s_mov_b32 s2, -1 4017; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 4018; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc 4019; GFX7LESS-NEXT: v_mov_b32_e32 v2, s5 4020; GFX7LESS-NEXT: v_mov_b32_e32 v3, s4 4021; GFX7LESS-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] 4022; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 4023; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 4024; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 4025; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 4026; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4027; GFX7LESS-NEXT: s_endpgm 4028; 4029; GFX8-LABEL: min_i64_constant: 4030; GFX8: ; %bb.0: ; %entry 4031; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4032; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4033; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, 
exec_hi, v0 4034; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4035; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 4036; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 4037; GFX8-NEXT: s_cbranch_execz BB20_2 4038; GFX8-NEXT: ; %bb.1: 4039; GFX8-NEXT: v_mov_b32_e32 v0, 5 4040; GFX8-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 4041; GFX8-NEXT: v_mov_b32_e32 v1, 0 4042; GFX8-NEXT: s_mov_b32 m0, -1 4043; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4044; GFX8-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] 4045; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4046; GFX8-NEXT: buffer_wbinvl1_vol 4047; GFX8-NEXT: BB20_2: 4048; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 4049; GFX8-NEXT: v_readfirstlane_b32 s4, v0 4050; GFX8-NEXT: v_bfrev_b32_e32 v0, -2 4051; GFX8-NEXT: v_readfirstlane_b32 s5, v1 4052; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc 4053; GFX8-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 4054; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] 4055; GFX8-NEXT: v_mov_b32_e32 v2, s5 4056; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 4057; GFX8-NEXT: v_mov_b32_e32 v2, s4 4058; GFX8-NEXT: s_mov_b32 s2, -1 4059; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 4060; GFX8-NEXT: s_mov_b32 s3, 0xf000 4061; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4062; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4063; GFX8-NEXT: s_endpgm 4064; 4065; GFX9-LABEL: min_i64_constant: 4066; GFX9: ; %bb.0: ; %entry 4067; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4068; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4069; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 4070; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4071; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 4072; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 4073; GFX9-NEXT: s_cbranch_execz BB20_2 4074; GFX9-NEXT: ; %bb.1: 4075; GFX9-NEXT: v_mov_b32_e32 v0, 5 4076; GFX9-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 4077; GFX9-NEXT: v_mov_b32_e32 v1, 0 4078; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4079; GFX9-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] 4080; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 
4081; GFX9-NEXT: buffer_wbinvl1_vol 4082; GFX9-NEXT: BB20_2: 4083; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 4084; GFX9-NEXT: v_readfirstlane_b32 s4, v0 4085; GFX9-NEXT: v_bfrev_b32_e32 v0, -2 4086; GFX9-NEXT: v_readfirstlane_b32 s5, v1 4087; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc 4088; GFX9-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 4089; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] 4090; GFX9-NEXT: v_mov_b32_e32 v2, s5 4091; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 4092; GFX9-NEXT: v_mov_b32_e32 v2, s4 4093; GFX9-NEXT: s_mov_b32 s2, -1 4094; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 4095; GFX9-NEXT: s_mov_b32 s3, 0xf000 4096; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4097; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4098; GFX9-NEXT: s_endpgm 4099; 4100; GFX1064-LABEL: min_i64_constant: 4101; GFX1064: ; %bb.0: ; %entry 4102; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4103; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 4104; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0 4105; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4106; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 4107; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc 4108; GFX1064-NEXT: s_cbranch_execz BB20_2 4109; GFX1064-NEXT: ; %bb.1: 4110; GFX1064-NEXT: v_mov_b32_e32 v0, 5 4111; GFX1064-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 4112; GFX1064-NEXT: v_mov_b32_e32 v1, 0 4113; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4114; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 4115; GFX1064-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] 4116; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4117; GFX1064-NEXT: buffer_gl0_inv 4118; GFX1064-NEXT: buffer_gl1_inv 4119; GFX1064-NEXT: BB20_2: 4120; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 4121; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] 4122; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 4123; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 4124; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc 4125; GFX1064-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 4126; GFX1064-NEXT: 
v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1] 4127; GFX1064-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc 4128; GFX1064-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc 4129; GFX1064-NEXT: s_mov_b32 s2, -1 4130; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 4131; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 4132; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4133; GFX1064-NEXT: s_endpgm 4134; 4135; GFX1032-LABEL: min_i64_constant: 4136; GFX1032: ; %bb.0: ; %entry 4137; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4138; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 4139; GFX1032-NEXT: ; implicit-def: $vcc_hi 4140; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 4141; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 4142; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo 4143; GFX1032-NEXT: s_cbranch_execz BB20_2 4144; GFX1032-NEXT: ; %bb.1: 4145; GFX1032-NEXT: v_mov_b32_e32 v0, 5 4146; GFX1032-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 4147; GFX1032-NEXT: v_mov_b32_e32 v1, 0 4148; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4149; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 4150; GFX1032-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] 4151; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4152; GFX1032-NEXT: buffer_gl0_inv 4153; GFX1032-NEXT: buffer_gl1_inv 4154; GFX1032-NEXT: BB20_2: 4155; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 4156; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 4157; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 4158; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 4159; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc_lo 4160; GFX1032-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc_lo 4161; GFX1032-NEXT: v_cmp_lt_i64_e32 vcc_lo, s[2:3], v[0:1] 4162; GFX1032-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo 4163; GFX1032-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo 4164; GFX1032-NEXT: s_mov_b32 s2, -1 4165; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 4166; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 4167; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4168; GFX1032-NEXT: s_endpgm 4169entry: 4170 %old = atomicrmw min i64 
addrspace(3)* @local_var64, i64 5 acq_rel 4171 store i64 %old, i64 addrspace(1)* %out 4172 ret void 4173} 4174 4175define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) { 4176; Unsigned i32 atomicrmw umax on @local_var32 (addrspace(3)) with acq_rel ordering, the operand is %lane from @llvm.amdgcn.workitem.id.x and the returned old value is stored to %out. 4177; The assertions below are autogenerated (see the NOTE at the top of the file), in their current form the GFX7LESS run expects a plain ds_max_rtn_u32 while the other runs expect a v_max_u32_dpp reduction followed by one ds_max_rtn_u32 guarded by s_and_saveexec. 4178; GFX7LESS-LABEL: umax_i32_varying: 4179; GFX7LESS: ; %bb.0: ; %entry 4180; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 4181; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 4182; GFX7LESS-NEXT: s_mov_b32 m0, -1 4183; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4184; GFX7LESS-NEXT: ds_max_rtn_u32 v0, v1, v0 4185; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4186; GFX7LESS-NEXT: buffer_wbinvl1 4187; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 4188; GFX7LESS-NEXT: s_mov_b32 s2, -1 4189; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 4190; GFX7LESS-NEXT: s_endpgm 4191; 4192; GFX8-LABEL: umax_i32_varying: 4193; GFX8: ; %bb.0: ; %entry 4194; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4195; GFX8-NEXT: s_mov_b64 s[2:3], exec 4196; GFX8-NEXT: v_mov_b32_e32 v2, v0 4197; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 4198; GFX8-NEXT: v_mov_b32_e32 v1, 0 4199; GFX8-NEXT: s_mov_b64 exec, s[4:5] 4200; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 4201; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 4202; GFX8-NEXT: s_not_b64 exec, exec 4203; GFX8-NEXT: v_mov_b32_e32 v2, 0 4204; GFX8-NEXT: s_not_b64 exec, exec 4205; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 4206; GFX8-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 4207; GFX8-NEXT: s_nop 1 4208; GFX8-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 4209; GFX8-NEXT: s_nop 1 4210; GFX8-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 4211; GFX8-NEXT: s_nop 1 4212; GFX8-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 4213; GFX8-NEXT: s_nop 1 4214; GFX8-NEXT: v_max_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 4215; GFX8-NEXT: s_nop 1 4216; GFX8-NEXT: v_max_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc 
bank_mask:0xf 4217; GFX8-NEXT: v_readlane_b32 s2, v2, 63 4218; GFX8-NEXT: s_nop 0 4219; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 4220; GFX8-NEXT: s_mov_b64 exec, s[4:5] 4221; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4222; GFX8-NEXT: ; implicit-def: $vgpr0 4223; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 4224; GFX8-NEXT: s_cbranch_execz BB21_2 4225; GFX8-NEXT: ; %bb.1: 4226; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 4227; GFX8-NEXT: v_mov_b32_e32 v3, s2 4228; GFX8-NEXT: s_mov_b32 m0, -1 4229; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4230; GFX8-NEXT: ds_max_rtn_u32 v0, v0, v3 4231; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4232; GFX8-NEXT: buffer_wbinvl1_vol 4233; GFX8-NEXT: BB21_2: 4234; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 4235; GFX8-NEXT: v_readfirstlane_b32 s2, v0 4236; GFX8-NEXT: v_mov_b32_e32 v0, v1 4237; GFX8-NEXT: v_max_u32_e32 v0, s2, v0 4238; GFX8-NEXT: s_mov_b32 s3, 0xf000 4239; GFX8-NEXT: s_mov_b32 s2, -1 4240; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4241; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 4242; GFX8-NEXT: s_endpgm 4243; 4244; GFX9-LABEL: umax_i32_varying: 4245; GFX9: ; %bb.0: ; %entry 4246; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4247; GFX9-NEXT: s_mov_b64 s[2:3], exec 4248; GFX9-NEXT: v_mov_b32_e32 v2, v0 4249; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 4250; GFX9-NEXT: v_mov_b32_e32 v1, 0 4251; GFX9-NEXT: s_mov_b64 exec, s[4:5] 4252; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 4253; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 4254; GFX9-NEXT: s_not_b64 exec, exec 4255; GFX9-NEXT: v_mov_b32_e32 v2, 0 4256; GFX9-NEXT: s_not_b64 exec, exec 4257; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 4258; GFX9-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 4259; GFX9-NEXT: s_nop 1 4260; GFX9-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 4261; GFX9-NEXT: s_nop 1 4262; GFX9-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 4263; 
GFX9-NEXT: s_nop 1 4264; GFX9-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 4265; GFX9-NEXT: s_nop 1 4266; GFX9-NEXT: v_max_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 4267; GFX9-NEXT: s_nop 1 4268; GFX9-NEXT: v_max_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 4269; GFX9-NEXT: v_readlane_b32 s2, v2, 63 4270; GFX9-NEXT: s_nop 0 4271; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 4272; GFX9-NEXT: s_mov_b64 exec, s[4:5] 4273; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4274; GFX9-NEXT: ; implicit-def: $vgpr0 4275; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 4276; GFX9-NEXT: s_cbranch_execz BB21_2 4277; GFX9-NEXT: ; %bb.1: 4278; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 4279; GFX9-NEXT: v_mov_b32_e32 v3, s2 4280; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4281; GFX9-NEXT: ds_max_rtn_u32 v0, v0, v3 4282; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4283; GFX9-NEXT: buffer_wbinvl1_vol 4284; GFX9-NEXT: BB21_2: 4285; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 4286; GFX9-NEXT: v_readfirstlane_b32 s2, v0 4287; GFX9-NEXT: v_mov_b32_e32 v0, v1 4288; GFX9-NEXT: v_max_u32_e32 v0, s2, v0 4289; GFX9-NEXT: s_mov_b32 s3, 0xf000 4290; GFX9-NEXT: s_mov_b32 s2, -1 4291; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4292; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 4293; GFX9-NEXT: s_endpgm 4294; 4295; GFX1064-LABEL: umax_i32_varying: 4296; GFX1064: ; %bb.0: ; %entry 4297; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4298; GFX1064-NEXT: s_mov_b64 s[2:3], exec 4299; GFX1064-NEXT: v_mov_b32_e32 v2, v0 4300; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 4301; GFX1064-NEXT: v_mov_b32_e32 v1, 0 4302; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 4303; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 4304; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s3, v0 4305; GFX1064-NEXT: s_not_b64 exec, exec 4306; GFX1064-NEXT: v_mov_b32_e32 v2, 0 4307; GFX1064-NEXT: s_not_b64 exec, exec 4308; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 4309; 
GFX1064-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 4310; GFX1064-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 4311; GFX1064-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 4312; GFX1064-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 4313; GFX1064-NEXT: v_mov_b32_e32 v3, v2 4314; GFX1064-NEXT: v_permlanex16_b32 v3, v3, -1, -1 4315; GFX1064-NEXT: v_max_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 4316; GFX1064-NEXT: v_readlane_b32 s2, v2, 31 4317; GFX1064-NEXT: v_mov_b32_e32 v3, s2 4318; GFX1064-NEXT: v_max_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 4319; GFX1064-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf 4320; GFX1064-NEXT: v_readlane_b32 s2, v2, 15 4321; GFX1064-NEXT: v_readlane_b32 s3, v2, 31 4322; GFX1064-NEXT: v_readlane_b32 s6, v2, 47 4323; GFX1064-NEXT: v_writelane_b32 v1, s2, 16 4324; GFX1064-NEXT: s_mov_b32 s2, -1 4325; GFX1064-NEXT: v_writelane_b32 v1, s3, 32 4326; GFX1064-NEXT: v_readlane_b32 s3, v2, 63 4327; GFX1064-NEXT: v_writelane_b32 v1, s6, 48 4328; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 4329; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4330; GFX1064-NEXT: ; implicit-def: $vgpr0 4331; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 4332; GFX1064-NEXT: s_cbranch_execz BB21_2 4333; GFX1064-NEXT: ; %bb.1: 4334; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo 4335; GFX1064-NEXT: v_mov_b32_e32 v4, s3 4336; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4337; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 4338; GFX1064-NEXT: ds_max_rtn_u32 v0, v7, v4 4339; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4340; GFX1064-NEXT: buffer_gl0_inv 4341; GFX1064-NEXT: buffer_gl1_inv 4342; GFX1064-NEXT: BB21_2: 4343; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 4344; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 4345; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 4346; GFX1064-NEXT: v_mov_b32_e32 
v0, v1 4347; GFX1064-NEXT: v_max_u32_e32 v0, s3, v0 4348; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 4349; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 4350; GFX1064-NEXT: s_nop 0 4351; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 4352; GFX1064-NEXT: s_endpgm 4353; 4354; GFX1032-LABEL: umax_i32_varying: 4355; GFX1032: ; %bb.0: ; %entry 4356; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4357; GFX1032-NEXT: s_mov_b32 s2, exec_lo 4358; GFX1032-NEXT: ; implicit-def: $vcc_hi 4359; GFX1032-NEXT: v_mov_b32_e32 v2, v0 4360; GFX1032-NEXT: s_or_saveexec_b32 s3, -1 4361; GFX1032-NEXT: v_mov_b32_e32 v1, 0 4362; GFX1032-NEXT: s_mov_b32 exec_lo, s3 4363; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 4364; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 4365; GFX1032-NEXT: v_mov_b32_e32 v2, 0 4366; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 4367; GFX1032-NEXT: s_or_saveexec_b32 s4, -1 4368; GFX1032-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 4369; GFX1032-NEXT: s_mov_b32 s2, -1 4370; GFX1032-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 4371; GFX1032-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 4372; GFX1032-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 4373; GFX1032-NEXT: v_mov_b32_e32 v3, v2 4374; GFX1032-NEXT: v_permlanex16_b32 v3, v3, -1, -1 4375; GFX1032-NEXT: v_max_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 4376; GFX1032-NEXT: v_readlane_b32 s3, v2, 31 4377; GFX1032-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf 4378; GFX1032-NEXT: v_readlane_b32 s5, v2, 15 4379; GFX1032-NEXT: v_writelane_b32 v1, s5, 16 4380; GFX1032-NEXT: s_mov_b32 exec_lo, s4 4381; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 4382; GFX1032-NEXT: ; implicit-def: $vgpr0 4383; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo 4384; GFX1032-NEXT: s_cbranch_execz BB21_2 4385; GFX1032-NEXT: ; %bb.1: 4386; GFX1032-NEXT: v_mov_b32_e32 v7, 
local_var32@abs32@lo 4387; GFX1032-NEXT: v_mov_b32_e32 v4, s3 4388; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4389; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 4390; GFX1032-NEXT: ds_max_rtn_u32 v0, v7, v4 4391; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4392; GFX1032-NEXT: buffer_gl0_inv 4393; GFX1032-NEXT: buffer_gl1_inv 4394; GFX1032-NEXT: BB21_2: 4395; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 4396; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 4397; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 4398; GFX1032-NEXT: v_mov_b32_e32 v0, v1 4399; GFX1032-NEXT: v_max_u32_e32 v0, s3, v0 4400; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 4401; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 4402; GFX1032-NEXT: s_nop 0 4403; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 4404; GFX1032-NEXT: s_endpgm 4405entry: 4406 %lane = call i32 @llvm.amdgcn.workitem.id.x() 4407 %old = atomicrmw umax i32 addrspace(3)* @local_var32, i32 %lane acq_rel 4408 store i32 %old, i32 addrspace(1)* %out 4409 ret void 4410} 4411 4412define amdgpu_kernel void @umax_i64_constant(i64 addrspace(1)* %out) { 4413; Unsigned i64 atomicrmw umax of the constant 5 on @local_var64 (addrspace(3)) with acq_rel ordering, the returned old value is stored to %out. 4414; The assertions below are autogenerated (see the NOTE at the top of the file), in their current form every run expects one ds_max_rtn_u64 guarded by s_and_saveexec on a v_cmp_eq_u32 of the mbcnt result against 0. 4415; GFX7LESS-LABEL: umax_i64_constant: 4416; GFX7LESS: ; %bb.0: ; %entry 4417; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 4418; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 4419; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 4420; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4421; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 4422; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc 4423; GFX7LESS-NEXT: s_cbranch_execz BB22_2 4424; GFX7LESS-NEXT: ; %bb.1: 4425; GFX7LESS-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 4426; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5 4427; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 4428; GFX7LESS-NEXT: s_mov_b32 m0, -1 4429; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4430; GFX7LESS-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] 4431; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4432; GFX7LESS-NEXT: buffer_wbinvl1 4433; GFX7LESS-NEXT: BB22_2: 4434; GFX7LESS-NEXT: s_or_b64 exec, exec, 
s[2:3] 4435; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 4436; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 4437; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 4438; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 4439; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 4440; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4 4441; GFX7LESS-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[0:1] 4442; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 4443; GFX7LESS-NEXT: v_mov_b32_e32 v1, s5 4444; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc 4445; GFX7LESS-NEXT: s_mov_b32 s2, -1 4446; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 4447; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4448; GFX7LESS-NEXT: s_endpgm 4449; 4450; GFX8-LABEL: umax_i64_constant: 4451; GFX8: ; %bb.0: ; %entry 4452; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4453; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4454; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 4455; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4456; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 4457; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 4458; GFX8-NEXT: s_cbranch_execz BB22_2 4459; GFX8-NEXT: ; %bb.1: 4460; GFX8-NEXT: v_mov_b32_e32 v0, 5 4461; GFX8-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 4462; GFX8-NEXT: v_mov_b32_e32 v1, 0 4463; GFX8-NEXT: s_mov_b32 m0, -1 4464; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4465; GFX8-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] 4466; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4467; GFX8-NEXT: buffer_wbinvl1_vol 4468; GFX8-NEXT: BB22_2: 4469; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 4470; GFX8-NEXT: v_readfirstlane_b32 s2, v0 4471; GFX8-NEXT: v_readfirstlane_b32 s3, v1 4472; GFX8-NEXT: v_mov_b32_e32 v1, 0 4473; GFX8-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 4474; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1] 4475; GFX8-NEXT: v_mov_b32_e32 v1, s3 4476; GFX8-NEXT: v_mov_b32_e32 v2, s2 4477; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 4478; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc 4479; GFX8-NEXT: s_mov_b32 s3, 0xf000 4480; GFX8-NEXT: s_mov_b32 s2, -1 4481; 
GFX8-NEXT: s_waitcnt lgkmcnt(0) 4482; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4483; GFX8-NEXT: s_endpgm 4484; 4485; GFX9-LABEL: umax_i64_constant: 4486; GFX9: ; %bb.0: ; %entry 4487; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4488; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4489; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 4490; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4491; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 4492; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 4493; GFX9-NEXT: s_cbranch_execz BB22_2 4494; GFX9-NEXT: ; %bb.1: 4495; GFX9-NEXT: v_mov_b32_e32 v0, 5 4496; GFX9-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 4497; GFX9-NEXT: v_mov_b32_e32 v1, 0 4498; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4499; GFX9-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] 4500; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4501; GFX9-NEXT: buffer_wbinvl1_vol 4502; GFX9-NEXT: BB22_2: 4503; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 4504; GFX9-NEXT: v_readfirstlane_b32 s2, v0 4505; GFX9-NEXT: v_readfirstlane_b32 s3, v1 4506; GFX9-NEXT: v_mov_b32_e32 v1, 0 4507; GFX9-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 4508; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1] 4509; GFX9-NEXT: v_mov_b32_e32 v1, s3 4510; GFX9-NEXT: v_mov_b32_e32 v2, s2 4511; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 4512; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc 4513; GFX9-NEXT: s_mov_b32 s3, 0xf000 4514; GFX9-NEXT: s_mov_b32 s2, -1 4515; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4516; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4517; GFX9-NEXT: s_endpgm 4518; 4519; GFX1064-LABEL: umax_i64_constant: 4520; GFX1064: ; %bb.0: ; %entry 4521; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4522; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 4523; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0 4524; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4525; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 4526; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc 4527; GFX1064-NEXT: s_cbranch_execz BB22_2 4528; GFX1064-NEXT: 
; %bb.1: 4529; GFX1064-NEXT: v_mov_b32_e32 v0, 5 4530; GFX1064-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 4531; GFX1064-NEXT: v_mov_b32_e32 v1, 0 4532; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4533; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 4534; GFX1064-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] 4535; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4536; GFX1064-NEXT: buffer_gl0_inv 4537; GFX1064-NEXT: buffer_gl1_inv 4538; GFX1064-NEXT: BB22_2: 4539; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 4540; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] 4541; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 4542; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 4543; GFX1064-NEXT: v_mov_b32_e32 v1, 0 4544; GFX1064-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 4545; GFX1064-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1] 4546; GFX1064-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc 4547; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, s3, vcc 4548; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 4549; GFX1064-NEXT: s_mov_b32 s2, -1 4550; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 4551; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4552; GFX1064-NEXT: s_endpgm 4553; 4554; GFX1032-LABEL: umax_i64_constant: 4555; GFX1032: ; %bb.0: ; %entry 4556; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4557; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 4558; GFX1032-NEXT: ; implicit-def: $vcc_hi 4559; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 4560; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 4561; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo 4562; GFX1032-NEXT: s_cbranch_execz BB22_2 4563; GFX1032-NEXT: ; %bb.1: 4564; GFX1032-NEXT: v_mov_b32_e32 v0, 5 4565; GFX1032-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 4566; GFX1032-NEXT: v_mov_b32_e32 v1, 0 4567; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4568; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 4569; GFX1032-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] 4570; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4571; GFX1032-NEXT: buffer_gl0_inv 4572; GFX1032-NEXT: buffer_gl1_inv 4573; GFX1032-NEXT: BB22_2: 
; NOTE(review): this part of the file is packed onto a few very long lines, and every
; statement is preceded by an integer (4574, 4575, ...) that looks like a line number fused
; in by an extraction step rather than IR -- confirm against the upstream test before use.
; The check lines are autogenerated by utils/update_llc_test_checks.py (see the NOTE at the
; top of the file); regenerate them with that script instead of editing them by hand.
;
; The text that follows holds, in order:
;   1. The end of @umax_i64_constant: the last GFX1032 checks and the IR body
;      (atomicrmw umax i64 on @local_var64 with the constant 5, acq_rel, old value stored
;      to %out).
;   2. All of @umin_i32_varying. The IR reads llvm.amdgcn.workitem.id.x, feeds it to an
;      acq_rel atomicrmw umin on @local_var32 and stores the returned old value to %out.
;      - The GFX7LESS checks expect a plain ds_min_rtn_u32 with no mbcnt / DPP sequence.
;      - The GFX8 and GFX9 checks expect inactive lanes to be seeded with -1, a
;        v_min_u32_dpp row_shr / row_bcast scan, v_readlane_b32 of lane 63, a single
;        ds_min_rtn_u32 guarded by v_cmp_eq_u32 0 on the mbcnt result, and a final
;        v_min_u32 of the v_readfirstlane_b32 value with the wave_shr-shifted scan value.
;      - The GFX1064 and GFX1032 checks expect the same shape, but built from
;        v_permlanex16_b32 plus v_readlane_b32 / v_writelane_b32 instead of row_bcast
;        and wave_shr.
;   3. The opening of @umin_i64_constant (its define line and the first GFX7LESS checks).
4574; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 4575; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 4576; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 4577; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 4578; GFX1032-NEXT: v_mov_b32_e32 v1, 0 4579; GFX1032-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc_lo 4580; GFX1032-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[2:3], v[0:1] 4581; GFX1032-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo 4582; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, s3, vcc_lo 4583; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 4584; GFX1032-NEXT: s_mov_b32 s2, -1 4585; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 4586; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4587; GFX1032-NEXT: s_endpgm 4588entry: 4589 %old = atomicrmw umax i64 addrspace(3)* @local_var64, i64 5 acq_rel 4590 store i64 %old, i64 addrspace(1)* %out 4591 ret void 4592} 4593 4594define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) { 4595; 4596; 4597; GFX7LESS-LABEL: umin_i32_varying: 4598; GFX7LESS: ; %bb.0: ; %entry 4599; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 4600; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 4601; GFX7LESS-NEXT: s_mov_b32 m0, -1 4602; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4603; GFX7LESS-NEXT: ds_min_rtn_u32 v0, v1, v0 4604; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4605; GFX7LESS-NEXT: buffer_wbinvl1 4606; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 4607; GFX7LESS-NEXT: s_mov_b32 s2, -1 4608; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 4609; GFX7LESS-NEXT: s_endpgm 4610; 4611; GFX8-LABEL: umin_i32_varying: 4612; GFX8: ; %bb.0: ; %entry 4613; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4614; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 4615; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 4616; GFX8-NEXT: v_mov_b32_e32 v2, v0 4617; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 4618; GFX8-NEXT: v_mov_b32_e32 v1, -1 4619; GFX8-NEXT: s_mov_b64 exec, s[2:3] 4620; GFX8-NEXT: s_not_b64 exec, exec 4621; GFX8-NEXT: v_mov_b32_e32 v2, -1 4622; GFX8-NEXT: s_not_b64 exec, exec 
4623; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 4624; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 4625; GFX8-NEXT: s_nop 1 4626; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 4627; GFX8-NEXT: s_nop 1 4628; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 4629; GFX8-NEXT: s_nop 1 4630; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 4631; GFX8-NEXT: s_nop 1 4632; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 4633; GFX8-NEXT: s_nop 1 4634; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 4635; GFX8-NEXT: v_readlane_b32 s2, v2, 63 4636; GFX8-NEXT: s_nop 0 4637; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 4638; GFX8-NEXT: s_mov_b64 exec, s[4:5] 4639; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 4640; GFX8-NEXT: ; implicit-def: $vgpr0 4641; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 4642; GFX8-NEXT: s_cbranch_execz BB23_2 4643; GFX8-NEXT: ; %bb.1: 4644; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 4645; GFX8-NEXT: v_mov_b32_e32 v3, s2 4646; GFX8-NEXT: s_mov_b32 m0, -1 4647; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4648; GFX8-NEXT: ds_min_rtn_u32 v0, v0, v3 4649; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4650; GFX8-NEXT: buffer_wbinvl1_vol 4651; GFX8-NEXT: BB23_2: 4652; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 4653; GFX8-NEXT: v_readfirstlane_b32 s2, v0 4654; GFX8-NEXT: v_mov_b32_e32 v0, v1 4655; GFX8-NEXT: v_min_u32_e32 v0, s2, v0 4656; GFX8-NEXT: s_mov_b32 s3, 0xf000 4657; GFX8-NEXT: s_mov_b32 s2, -1 4658; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4659; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 4660; GFX8-NEXT: s_endpgm 4661; 4662; GFX9-LABEL: umin_i32_varying: 4663; GFX9: ; %bb.0: ; %entry 4664; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4665; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 4666; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 4667; GFX9-NEXT: v_mov_b32_e32 v2, v0 4668; 
GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 4669; GFX9-NEXT: v_mov_b32_e32 v1, -1 4670; GFX9-NEXT: s_mov_b64 exec, s[2:3] 4671; GFX9-NEXT: s_not_b64 exec, exec 4672; GFX9-NEXT: v_mov_b32_e32 v2, -1 4673; GFX9-NEXT: s_not_b64 exec, exec 4674; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 4675; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 4676; GFX9-NEXT: s_nop 1 4677; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 4678; GFX9-NEXT: s_nop 1 4679; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 4680; GFX9-NEXT: s_nop 1 4681; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 4682; GFX9-NEXT: s_nop 1 4683; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 4684; GFX9-NEXT: s_nop 1 4685; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 4686; GFX9-NEXT: v_readlane_b32 s2, v2, 63 4687; GFX9-NEXT: s_nop 0 4688; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 4689; GFX9-NEXT: s_mov_b64 exec, s[4:5] 4690; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 4691; GFX9-NEXT: ; implicit-def: $vgpr0 4692; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 4693; GFX9-NEXT: s_cbranch_execz BB23_2 4694; GFX9-NEXT: ; %bb.1: 4695; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 4696; GFX9-NEXT: v_mov_b32_e32 v3, s2 4697; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4698; GFX9-NEXT: ds_min_rtn_u32 v0, v0, v3 4699; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4700; GFX9-NEXT: buffer_wbinvl1_vol 4701; GFX9-NEXT: BB23_2: 4702; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 4703; GFX9-NEXT: v_readfirstlane_b32 s2, v0 4704; GFX9-NEXT: v_mov_b32_e32 v0, v1 4705; GFX9-NEXT: v_min_u32_e32 v0, s2, v0 4706; GFX9-NEXT: s_mov_b32 s3, 0xf000 4707; GFX9-NEXT: s_mov_b32 s2, -1 4708; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4709; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 4710; GFX9-NEXT: s_endpgm 4711; 4712; GFX1064-LABEL: umin_i32_varying: 4713; GFX1064: ; %bb.0: ; %entry 
4714; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4715; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v4, exec_lo, 0 4716; GFX1064-NEXT: v_mov_b32_e32 v2, v0 4717; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v4, exec_hi, v4 4718; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 4719; GFX1064-NEXT: v_mov_b32_e32 v1, -1 4720; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 4721; GFX1064-NEXT: s_not_b64 exec, exec 4722; GFX1064-NEXT: v_mov_b32_e32 v2, -1 4723; GFX1064-NEXT: s_not_b64 exec, exec 4724; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 4725; GFX1064-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 4726; GFX1064-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 4727; GFX1064-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 4728; GFX1064-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 4729; GFX1064-NEXT: v_mov_b32_e32 v3, v2 4730; GFX1064-NEXT: v_permlanex16_b32 v3, v3, -1, -1 4731; GFX1064-NEXT: v_min_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 4732; GFX1064-NEXT: v_readlane_b32 s2, v2, 31 4733; GFX1064-NEXT: v_mov_b32_e32 v3, s2 4734; GFX1064-NEXT: v_min_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 4735; GFX1064-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf 4736; GFX1064-NEXT: v_readlane_b32 s2, v2, 15 4737; GFX1064-NEXT: v_readlane_b32 s3, v2, 31 4738; GFX1064-NEXT: v_readlane_b32 s6, v2, 47 4739; GFX1064-NEXT: v_writelane_b32 v1, s2, 16 4740; GFX1064-NEXT: s_mov_b32 s2, -1 4741; GFX1064-NEXT: v_writelane_b32 v1, s3, 32 4742; GFX1064-NEXT: v_readlane_b32 s3, v2, 63 4743; GFX1064-NEXT: v_writelane_b32 v1, s6, 48 4744; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 4745; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 4746; GFX1064-NEXT: ; implicit-def: $vgpr0 4747; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 4748; GFX1064-NEXT: s_cbranch_execz BB23_2 4749; GFX1064-NEXT: ; %bb.1: 4750; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo 4751; GFX1064-NEXT: 
v_mov_b32_e32 v4, s3 4752; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4753; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 4754; GFX1064-NEXT: ds_min_rtn_u32 v0, v7, v4 4755; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4756; GFX1064-NEXT: buffer_gl0_inv 4757; GFX1064-NEXT: buffer_gl1_inv 4758; GFX1064-NEXT: BB23_2: 4759; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 4760; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 4761; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 4762; GFX1064-NEXT: v_mov_b32_e32 v0, v1 4763; GFX1064-NEXT: v_min_u32_e32 v0, s3, v0 4764; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 4765; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 4766; GFX1064-NEXT: s_nop 0 4767; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 4768; GFX1064-NEXT: s_endpgm 4769; 4770; GFX1032-LABEL: umin_i32_varying: 4771; GFX1032: ; %bb.0: ; %entry 4772; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4773; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v4, exec_lo, 0 4774; GFX1032-NEXT: ; implicit-def: $vcc_hi 4775; GFX1032-NEXT: v_mov_b32_e32 v2, v0 4776; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 4777; GFX1032-NEXT: v_mov_b32_e32 v1, -1 4778; GFX1032-NEXT: s_mov_b32 exec_lo, s2 4779; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 4780; GFX1032-NEXT: v_mov_b32_e32 v2, -1 4781; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 4782; GFX1032-NEXT: s_or_saveexec_b32 s4, -1 4783; GFX1032-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 4784; GFX1032-NEXT: s_mov_b32 s2, -1 4785; GFX1032-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 4786; GFX1032-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 4787; GFX1032-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 4788; GFX1032-NEXT: v_mov_b32_e32 v3, v2 4789; GFX1032-NEXT: v_permlanex16_b32 v3, v3, -1, -1 4790; GFX1032-NEXT: v_min_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 4791; GFX1032-NEXT: v_readlane_b32 s3, v2, 31 4792; GFX1032-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf 
4793; GFX1032-NEXT: v_readlane_b32 s5, v2, 15 4794; GFX1032-NEXT: v_writelane_b32 v1, s5, 16 4795; GFX1032-NEXT: s_mov_b32 exec_lo, s4 4796; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 4797; GFX1032-NEXT: ; implicit-def: $vgpr0 4798; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo 4799; GFX1032-NEXT: s_cbranch_execz BB23_2 4800; GFX1032-NEXT: ; %bb.1: 4801; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo 4802; GFX1032-NEXT: v_mov_b32_e32 v4, s3 4803; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4804; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 4805; GFX1032-NEXT: ds_min_rtn_u32 v0, v7, v4 4806; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4807; GFX1032-NEXT: buffer_gl0_inv 4808; GFX1032-NEXT: buffer_gl1_inv 4809; GFX1032-NEXT: BB23_2: 4810; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 4811; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 4812; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 4813; GFX1032-NEXT: v_mov_b32_e32 v0, v1 4814; GFX1032-NEXT: v_min_u32_e32 v0, s3, v0 4815; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 4816; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 4817; GFX1032-NEXT: s_nop 0 4818; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 4819; GFX1032-NEXT: s_endpgm 4820entry: 4821 %lane = call i32 @llvm.amdgcn.workitem.id.x() 4822 %old = atomicrmw umin i32 addrspace(3)* @local_var32, i32 %lane acq_rel 4823 store i32 %old, i32 addrspace(1)* %out 4824 ret void 4825} 4826 4827define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) { 4828; 4829; 4830; GFX7LESS-LABEL: umin_i64_constant: 4831; GFX7LESS: ; %bb.0: ; %entry 4832; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 4833; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 4834; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 4835; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4836; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 4837; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc 4838; GFX7LESS-NEXT: s_cbranch_execz BB24_2 4839; GFX7LESS-NEXT: ; %bb.1: 4840; GFX7LESS-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 
; NOTE(review): the text that follows is the remainder of @umin_i64_constant -- the rest of
; its GFX7LESS checks, the complete GFX8, GFX9, GFX1064 and GFX1032 checks, and the IR body.
; The integers fused in front of each statement (4841, 4842, ...) look like line numbers
; left by an extraction step rather than IR -- confirm against the upstream test. The check
; lines are autogenerated by utils/update_llc_test_checks.py (see the NOTE at the top of
; the file); regenerate them with that script instead of editing them by hand.
;
; IR under test: an acq_rel atomicrmw umin of the i64 constant 5 on @local_var64, whose
; returned old value is stored to %out.
;
; Shape the checks expect on every prefix shown below:
;   - one ds_min_rtn_u64 with the operand pair v[0:1] = {5, 0}, inside the region guarded
;     by s_and_saveexec on the v_cmp_eq_u32 0 result of the mbcnt lane count;
;   - after exec is restored, v_readfirstlane_b32 of both result halves;
;   - per lane, v_cndmask picks 5 / 0 or -1 / -1 from that same vcc, then v_cmp_lt_u64
;     against the readfirstlane pair and two more v_cndmask selects build the stored value.
; The GFX1064 and GFX1032 variants additionally expect s_waitcnt_vscnt before the DS op and
; buffer_gl0_inv / buffer_gl1_inv after it, where the older targets expect buffer_wbinvl1
; or buffer_wbinvl1_vol.
4841; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5 4842; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 4843; GFX7LESS-NEXT: s_mov_b32 m0, -1 4844; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4845; GFX7LESS-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1] 4846; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4847; GFX7LESS-NEXT: buffer_wbinvl1 4848; GFX7LESS-NEXT: BB24_2: 4849; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] 4850; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 4851; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 4852; GFX7LESS-NEXT: s_mov_b32 s2, -1 4853; GFX7LESS-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc 4854; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 4855; GFX7LESS-NEXT: v_mov_b32_e32 v2, s5 4856; GFX7LESS-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1] 4857; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 4858; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4 4859; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 4860; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 4861; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 4862; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4863; GFX7LESS-NEXT: s_endpgm 4864; 4865; GFX8-LABEL: umin_i64_constant: 4866; GFX8: ; %bb.0: ; %entry 4867; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4868; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4869; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 4870; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4871; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 4872; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 4873; GFX8-NEXT: s_cbranch_execz BB24_2 4874; GFX8-NEXT: ; %bb.1: 4875; GFX8-NEXT: v_mov_b32_e32 v0, 5 4876; GFX8-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 4877; GFX8-NEXT: v_mov_b32_e32 v1, 0 4878; GFX8-NEXT: s_mov_b32 m0, -1 4879; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4880; GFX8-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1] 4881; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4882; GFX8-NEXT: buffer_wbinvl1_vol 4883; GFX8-NEXT: BB24_2: 4884; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 4885; GFX8-NEXT: v_readfirstlane_b32 s5, v1 4886; GFX8-NEXT: v_readfirstlane_b32 s4, 
v0 4887; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc 4888; GFX8-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 4889; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1] 4890; GFX8-NEXT: v_mov_b32_e32 v2, s5 4891; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 4892; GFX8-NEXT: v_mov_b32_e32 v2, s4 4893; GFX8-NEXT: s_mov_b32 s2, -1 4894; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 4895; GFX8-NEXT: s_mov_b32 s3, 0xf000 4896; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4897; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4898; GFX8-NEXT: s_endpgm 4899; 4900; GFX9-LABEL: umin_i64_constant: 4901; GFX9: ; %bb.0: ; %entry 4902; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4903; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4904; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 4905; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4906; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 4907; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 4908; GFX9-NEXT: s_cbranch_execz BB24_2 4909; GFX9-NEXT: ; %bb.1: 4910; GFX9-NEXT: v_mov_b32_e32 v0, 5 4911; GFX9-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 4912; GFX9-NEXT: v_mov_b32_e32 v1, 0 4913; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4914; GFX9-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1] 4915; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4916; GFX9-NEXT: buffer_wbinvl1_vol 4917; GFX9-NEXT: BB24_2: 4918; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 4919; GFX9-NEXT: v_readfirstlane_b32 s5, v1 4920; GFX9-NEXT: v_readfirstlane_b32 s4, v0 4921; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc 4922; GFX9-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 4923; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1] 4924; GFX9-NEXT: v_mov_b32_e32 v2, s5 4925; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 4926; GFX9-NEXT: v_mov_b32_e32 v2, s4 4927; GFX9-NEXT: s_mov_b32 s2, -1 4928; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 4929; GFX9-NEXT: s_mov_b32 s3, 0xf000 4930; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4931; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4932; GFX9-NEXT: s_endpgm 4933; 4934; GFX1064-LABEL: 
umin_i64_constant: 4935; GFX1064: ; %bb.0: ; %entry 4936; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4937; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 4938; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0 4939; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4940; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 4941; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc 4942; GFX1064-NEXT: s_cbranch_execz BB24_2 4943; GFX1064-NEXT: ; %bb.1: 4944; GFX1064-NEXT: v_mov_b32_e32 v0, 5 4945; GFX1064-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 4946; GFX1064-NEXT: v_mov_b32_e32 v1, 0 4947; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4948; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 4949; GFX1064-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1] 4950; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4951; GFX1064-NEXT: buffer_gl0_inv 4952; GFX1064-NEXT: buffer_gl1_inv 4953; GFX1064-NEXT: BB24_2: 4954; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 4955; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] 4956; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 4957; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 4958; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc 4959; GFX1064-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 4960; GFX1064-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1] 4961; GFX1064-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc 4962; GFX1064-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc 4963; GFX1064-NEXT: s_mov_b32 s2, -1 4964; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 4965; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 4966; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4967; GFX1064-NEXT: s_endpgm 4968; 4969; GFX1032-LABEL: umin_i64_constant: 4970; GFX1032: ; %bb.0: ; %entry 4971; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4972; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 4973; GFX1032-NEXT: ; implicit-def: $vcc_hi 4974; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 4975; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 4976; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo 4977; GFX1032-NEXT: s_cbranch_execz BB24_2 4978; 
GFX1032-NEXT: ; %bb.1: 4979; GFX1032-NEXT: v_mov_b32_e32 v0, 5 4980; GFX1032-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 4981; GFX1032-NEXT: v_mov_b32_e32 v1, 0 4982; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4983; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 4984; GFX1032-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1] 4985; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4986; GFX1032-NEXT: buffer_gl0_inv 4987; GFX1032-NEXT: buffer_gl1_inv 4988; GFX1032-NEXT: BB24_2: 4989; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 4990; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 4991; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 4992; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 4993; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo 4994; GFX1032-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc_lo 4995; GFX1032-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[2:3], v[0:1] 4996; GFX1032-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo 4997; GFX1032-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo 4998; GFX1032-NEXT: s_mov_b32 s2, -1 4999; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 5000; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 5001; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 5002; GFX1032-NEXT: s_endpgm 5003entry: 5004 %old = atomicrmw umin i64 addrspace(3)* @local_var64, i64 5 acq_rel 5005 store i64 %old, i64 addrspace(1)* %out 5006 ret void 5007} 5008