1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX7LESS %s 3; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s 4; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s 5; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064 %s 6; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032 %s 7 8declare i32 @llvm.amdgcn.workitem.id.x() 9 10@local_var32 = addrspace(3) global i32 undef, align 4 11@local_var64 = addrspace(3) global i64 undef, align 8 12 13; Show what the atomic optimization pass will do for local pointers. 
14 15define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out) { 16; 17; 18; GFX7LESS-LABEL: add_i32_constant: 19; GFX7LESS: ; %bb.0: ; %entry 20; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec 21; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 22; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 23; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 24; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 25; GFX7LESS-NEXT: ; implicit-def: $vgpr1 26; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc 27; GFX7LESS-NEXT: s_cbranch_execz BB0_2 28; GFX7LESS-NEXT: ; %bb.1: 29; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 30; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 31; GFX7LESS-NEXT: v_mul_u32_u24_e64 v2, s2, 5 32; GFX7LESS-NEXT: s_mov_b32 m0, -1 33; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 34; GFX7LESS-NEXT: ds_add_rtn_u32 v1, v1, v2 35; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 36; GFX7LESS-NEXT: buffer_wbinvl1 37; GFX7LESS-NEXT: BB0_2: 38; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] 39; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v1 40; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 41; GFX7LESS-NEXT: v_mad_u32_u24 v0, v0, 5, s2 42; GFX7LESS-NEXT: s_mov_b32 s2, -1 43; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 44; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 45; GFX7LESS-NEXT: s_endpgm 46; 47; GFX8-LABEL: add_i32_constant: 48; GFX8: ; %bb.0: ; %entry 49; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 50; GFX8-NEXT: s_mov_b64 s[2:3], exec 51; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 52; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 53; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 54; GFX8-NEXT: ; implicit-def: $vgpr1 55; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 56; GFX8-NEXT: s_cbranch_execz BB0_2 57; GFX8-NEXT: ; %bb.1: 58; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 59; GFX8-NEXT: v_mul_u32_u24_e64 v1, s2, 5 60; GFX8-NEXT: v_mov_b32_e32 v2, local_var32@abs32@lo 61; GFX8-NEXT: s_mov_b32 m0, -1 62; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 63; GFX8-NEXT: ds_add_rtn_u32 v1, v2, v1 64; GFX8-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) 65; GFX8-NEXT: buffer_wbinvl1_vol 66; GFX8-NEXT: BB0_2: 67; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 68; GFX8-NEXT: v_readfirstlane_b32 s2, v1 69; GFX8-NEXT: v_mad_u32_u24 v0, v0, 5, s2 70; GFX8-NEXT: s_mov_b32 s3, 0xf000 71; GFX8-NEXT: s_mov_b32 s2, -1 72; GFX8-NEXT: s_nop 1 73; GFX8-NEXT: s_waitcnt lgkmcnt(0) 74; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 75; GFX8-NEXT: s_endpgm 76; 77; GFX9-LABEL: add_i32_constant: 78; GFX9: ; %bb.0: ; %entry 79; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 80; GFX9-NEXT: s_mov_b64 s[2:3], exec 81; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 82; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 83; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 84; GFX9-NEXT: ; implicit-def: $vgpr1 85; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 86; GFX9-NEXT: s_cbranch_execz BB0_2 87; GFX9-NEXT: ; %bb.1: 88; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 89; GFX9-NEXT: v_mul_u32_u24_e64 v1, s2, 5 90; GFX9-NEXT: v_mov_b32_e32 v2, local_var32@abs32@lo 91; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 92; GFX9-NEXT: ds_add_rtn_u32 v1, v2, v1 93; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 94; GFX9-NEXT: buffer_wbinvl1_vol 95; GFX9-NEXT: BB0_2: 96; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 97; GFX9-NEXT: v_readfirstlane_b32 s2, v1 98; GFX9-NEXT: v_mad_u32_u24 v0, v0, 5, s2 99; GFX9-NEXT: s_mov_b32 s3, 0xf000 100; GFX9-NEXT: s_mov_b32 s2, -1 101; GFX9-NEXT: s_nop 1 102; GFX9-NEXT: s_waitcnt lgkmcnt(0) 103; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 104; GFX9-NEXT: s_endpgm 105; 106; GFX1064-LABEL: add_i32_constant: 107; GFX1064: ; %bb.0: ; %entry 108; GFX1064-NEXT: s_mov_b64 s[2:3], exec 109; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 110; GFX1064-NEXT: ; implicit-def: $vgpr1 111; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 112; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s3, v0 113; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 114; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 115; GFX1064-NEXT: s_cbranch_execz BB0_2 116; GFX1064-NEXT: ; %bb.1: 
117; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 118; GFX1064-NEXT: v_mov_b32_e32 v2, local_var32@abs32@lo 119; GFX1064-NEXT: v_mul_u32_u24_e64 v1, s2, 5 120; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 121; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 122; GFX1064-NEXT: ds_add_rtn_u32 v1, v2, v1 123; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 124; GFX1064-NEXT: buffer_gl0_inv 125; GFX1064-NEXT: buffer_gl1_inv 126; GFX1064-NEXT: BB0_2: 127; GFX1064-NEXT: v_nop 128; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 129; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 130; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 131; GFX1064-NEXT: v_mad_u32_u24 v0, v0, 5, s2 132; GFX1064-NEXT: s_mov_b32 s2, -1 133; GFX1064-NEXT: s_nop 1 134; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 135; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 136; GFX1064-NEXT: s_endpgm 137; 138; GFX1032-LABEL: add_i32_constant: 139; GFX1032: ; %bb.0: ; %entry 140; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 141; GFX1032-NEXT: s_mov_b32 s2, exec_lo 142; GFX1032-NEXT: ; implicit-def: $vcc_hi 143; GFX1032-NEXT: ; implicit-def: $vgpr1 144; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 145; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 146; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 147; GFX1032-NEXT: s_cbranch_execz BB0_2 148; GFX1032-NEXT: ; %bb.1: 149; GFX1032-NEXT: s_bcnt1_i32_b32 s2, s2 150; GFX1032-NEXT: v_mov_b32_e32 v2, local_var32@abs32@lo 151; GFX1032-NEXT: v_mul_u32_u24_e64 v1, s2, 5 152; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 153; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 154; GFX1032-NEXT: ds_add_rtn_u32 v1, v2, v1 155; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 156; GFX1032-NEXT: buffer_gl0_inv 157; GFX1032-NEXT: buffer_gl1_inv 158; GFX1032-NEXT: BB0_2: 159; GFX1032-NEXT: v_nop 160; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 161; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 162; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 163; GFX1032-NEXT: v_mad_u32_u24 v0, v0, 5, s2 164; GFX1032-NEXT: s_mov_b32 s2, -1 165; 
GFX1032-NEXT: s_nop 1 166; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 167; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 168; GFX1032-NEXT: s_endpgm 169entry: 170 %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 5 acq_rel 171 store i32 %old, i32 addrspace(1)* %out 172 ret void 173} 174 175define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 %additive) { 176; 177; 178; GFX7LESS-LABEL: add_i32_uniform: 179; GFX7LESS: ; %bb.0: ; %entry 180; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec 181; GFX7LESS-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 182; GFX7LESS-NEXT: s_load_dword s2, s[0:1], 0xb 183; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 184; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s7, v0 185; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 186; GFX7LESS-NEXT: ; implicit-def: $vgpr1 187; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc 188; GFX7LESS-NEXT: s_cbranch_execz BB1_2 189; GFX7LESS-NEXT: ; %bb.1: 190; GFX7LESS-NEXT: s_bcnt1_i32_b64 s3, s[6:7] 191; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 192; GFX7LESS-NEXT: s_mul_i32 s3, s2, s3 193; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 194; GFX7LESS-NEXT: v_mov_b32_e32 v2, s3 195; GFX7LESS-NEXT: s_mov_b32 m0, -1 196; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 197; GFX7LESS-NEXT: ds_add_rtn_u32 v1, v1, v2 198; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 199; GFX7LESS-NEXT: buffer_wbinvl1 200; GFX7LESS-NEXT: BB1_2: 201; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1] 202; GFX7LESS-NEXT: v_readfirstlane_b32 s0, v1 203; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 204; GFX7LESS-NEXT: v_mul_lo_u32 v0, s2, v0 205; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 206; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s0, v0 207; GFX7LESS-NEXT: s_mov_b32 s6, -1 208; GFX7LESS-NEXT: buffer_store_dword v0, off, s[4:7], 0 209; GFX7LESS-NEXT: s_endpgm 210; 211; GFX8-LABEL: add_i32_uniform: 212; GFX8: ; %bb.0: ; %entry 213; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 214; GFX8-NEXT: s_load_dword s0, s[0:1], 0x2c 215; GFX8-NEXT: s_mov_b64 s[2:3], 
exec 216; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 217; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 218; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 219; GFX8-NEXT: ; implicit-def: $vgpr1 220; GFX8-NEXT: s_and_saveexec_b64 s[6:7], vcc 221; GFX8-NEXT: s_cbranch_execz BB1_2 222; GFX8-NEXT: ; %bb.1: 223; GFX8-NEXT: s_bcnt1_i32_b64 s1, s[2:3] 224; GFX8-NEXT: s_waitcnt lgkmcnt(0) 225; GFX8-NEXT: s_mul_i32 s1, s0, s1 226; GFX8-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 227; GFX8-NEXT: v_mov_b32_e32 v2, s1 228; GFX8-NEXT: s_mov_b32 m0, -1 229; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 230; GFX8-NEXT: ds_add_rtn_u32 v1, v1, v2 231; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 232; GFX8-NEXT: buffer_wbinvl1_vol 233; GFX8-NEXT: BB1_2: 234; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] 235; GFX8-NEXT: s_waitcnt lgkmcnt(0) 236; GFX8-NEXT: v_mul_lo_u32 v0, s0, v0 237; GFX8-NEXT: v_readfirstlane_b32 s0, v1 238; GFX8-NEXT: s_mov_b32 s7, 0xf000 239; GFX8-NEXT: s_mov_b32 s6, -1 240; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 241; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0 242; GFX8-NEXT: s_endpgm 243; 244; GFX9-LABEL: add_i32_uniform: 245; GFX9: ; %bb.0: ; %entry 246; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 247; GFX9-NEXT: s_load_dword s0, s[0:1], 0x2c 248; GFX9-NEXT: s_mov_b64 s[2:3], exec 249; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 250; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 251; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 252; GFX9-NEXT: ; implicit-def: $vgpr1 253; GFX9-NEXT: s_and_saveexec_b64 s[6:7], vcc 254; GFX9-NEXT: s_cbranch_execz BB1_2 255; GFX9-NEXT: ; %bb.1: 256; GFX9-NEXT: s_bcnt1_i32_b64 s1, s[2:3] 257; GFX9-NEXT: s_waitcnt lgkmcnt(0) 258; GFX9-NEXT: s_mul_i32 s1, s0, s1 259; GFX9-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 260; GFX9-NEXT: v_mov_b32_e32 v2, s1 261; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 262; GFX9-NEXT: ds_add_rtn_u32 v1, v1, v2 263; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 264; GFX9-NEXT: buffer_wbinvl1_vol 265; GFX9-NEXT: BB1_2: 266; GFX9-NEXT: s_or_b64 
exec, exec, s[6:7] 267; GFX9-NEXT: s_waitcnt lgkmcnt(0) 268; GFX9-NEXT: v_mul_lo_u32 v0, s0, v0 269; GFX9-NEXT: v_readfirstlane_b32 s0, v1 270; GFX9-NEXT: s_mov_b32 s7, 0xf000 271; GFX9-NEXT: s_mov_b32 s6, -1 272; GFX9-NEXT: v_add_u32_e32 v0, s0, v0 273; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 274; GFX9-NEXT: s_endpgm 275; 276; GFX1064-LABEL: add_i32_uniform: 277; GFX1064: ; %bb.0: ; %entry 278; GFX1064-NEXT: s_mov_b64 s[2:3], exec 279; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 280; GFX1064-NEXT: s_load_dword s0, s[0:1], 0x2c 281; GFX1064-NEXT: ; implicit-def: $vgpr1 282; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 283; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s3, v0 284; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 285; GFX1064-NEXT: s_and_saveexec_b64 s[6:7], vcc 286; GFX1064-NEXT: s_cbranch_execz BB1_2 287; GFX1064-NEXT: ; %bb.1: 288; GFX1064-NEXT: s_bcnt1_i32_b64 s1, s[2:3] 289; GFX1064-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 290; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 291; GFX1064-NEXT: s_mul_i32 s1, s0, s1 292; GFX1064-NEXT: v_mov_b32_e32 v2, s1 293; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 294; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 295; GFX1064-NEXT: ds_add_rtn_u32 v1, v1, v2 296; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 297; GFX1064-NEXT: buffer_gl0_inv 298; GFX1064-NEXT: buffer_gl1_inv 299; GFX1064-NEXT: BB1_2: 300; GFX1064-NEXT: v_nop 301; GFX1064-NEXT: s_or_b64 exec, exec, s[6:7] 302; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 303; GFX1064-NEXT: v_mul_lo_u32 v0, s0, v0 304; GFX1064-NEXT: v_readfirstlane_b32 s0, v1 305; GFX1064-NEXT: s_mov_b32 s7, 0x31016000 306; GFX1064-NEXT: s_mov_b32 s6, -1 307; GFX1064-NEXT: v_add_nc_u32_e32 v0, s0, v0 308; GFX1064-NEXT: buffer_store_dword v0, off, s[4:7], 0 309; GFX1064-NEXT: s_endpgm 310; 311; GFX1032-LABEL: add_i32_uniform: 312; GFX1032: ; %bb.0: ; %entry 313; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 314; GFX1032-NEXT: s_load_dword s0, s[0:1], 0x2c 315; GFX1032-NEXT: s_mov_b32 s2, exec_lo 
316; GFX1032-NEXT: ; implicit-def: $vcc_hi 317; GFX1032-NEXT: ; implicit-def: $vgpr1 318; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 319; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 320; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo 321; GFX1032-NEXT: s_cbranch_execz BB1_2 322; GFX1032-NEXT: ; %bb.1: 323; GFX1032-NEXT: s_bcnt1_i32_b32 s2, s2 324; GFX1032-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 325; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 326; GFX1032-NEXT: s_mul_i32 s2, s0, s2 327; GFX1032-NEXT: v_mov_b32_e32 v2, s2 328; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 329; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 330; GFX1032-NEXT: ds_add_rtn_u32 v1, v1, v2 331; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 332; GFX1032-NEXT: buffer_gl0_inv 333; GFX1032-NEXT: buffer_gl1_inv 334; GFX1032-NEXT: BB1_2: 335; GFX1032-NEXT: v_nop 336; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s1 337; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 338; GFX1032-NEXT: v_mul_lo_u32 v0, s0, v0 339; GFX1032-NEXT: v_readfirstlane_b32 s0, v1 340; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 341; GFX1032-NEXT: s_mov_b32 s6, -1 342; GFX1032-NEXT: v_add_nc_u32_e32 v0, s0, v0 343; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0 344; GFX1032-NEXT: s_endpgm 345entry: 346 %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %additive acq_rel 347 store i32 %old, i32 addrspace(1)* %out 348 ret void 349} 350 351define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out) { 352; 353; 354; GFX7LESS-LABEL: add_i32_varying: 355; GFX7LESS: ; %bb.0: ; %entry 356; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 357; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 358; GFX7LESS-NEXT: s_mov_b32 m0, -1 359; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 360; GFX7LESS-NEXT: ds_add_rtn_u32 v0, v1, v0 361; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 362; GFX7LESS-NEXT: buffer_wbinvl1 363; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 364; GFX7LESS-NEXT: s_mov_b32 s2, -1 365; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 
366; GFX7LESS-NEXT: s_endpgm 367; 368; GFX8-LABEL: add_i32_varying: 369; GFX8: ; %bb.0: ; %entry 370; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 371; GFX8-NEXT: s_mov_b64 s[2:3], exec 372; GFX8-NEXT: v_mov_b32_e32 v2, v0 373; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 374; GFX8-NEXT: v_mov_b32_e32 v1, 0 375; GFX8-NEXT: s_mov_b64 exec, s[4:5] 376; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 377; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 378; GFX8-NEXT: s_not_b64 exec, exec 379; GFX8-NEXT: v_mov_b32_e32 v2, 0 380; GFX8-NEXT: s_not_b64 exec, exec 381; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 382; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 383; GFX8-NEXT: s_nop 1 384; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 385; GFX8-NEXT: s_nop 1 386; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 387; GFX8-NEXT: s_nop 1 388; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 389; GFX8-NEXT: s_nop 1 390; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 391; GFX8-NEXT: s_nop 1 392; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 393; GFX8-NEXT: v_readlane_b32 s2, v2, 63 394; GFX8-NEXT: s_nop 0 395; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 396; GFX8-NEXT: s_mov_b64 exec, s[4:5] 397; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 398; GFX8-NEXT: ; implicit-def: $vgpr0 399; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 400; GFX8-NEXT: s_cbranch_execz BB2_2 401; GFX8-NEXT: ; %bb.1: 402; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 403; GFX8-NEXT: v_mov_b32_e32 v3, s2 404; GFX8-NEXT: s_mov_b32 m0, -1 405; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 406; GFX8-NEXT: ds_add_rtn_u32 v0, v0, v3 407; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 408; GFX8-NEXT: buffer_wbinvl1_vol 409; GFX8-NEXT: BB2_2: 410; GFX8-NEXT: s_or_b64 exec, exec, 
s[4:5] 411; GFX8-NEXT: v_readfirstlane_b32 s2, v0 412; GFX8-NEXT: v_mov_b32_e32 v0, v1 413; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0 414; GFX8-NEXT: s_mov_b32 s3, 0xf000 415; GFX8-NEXT: s_mov_b32 s2, -1 416; GFX8-NEXT: s_nop 0 417; GFX8-NEXT: s_waitcnt lgkmcnt(0) 418; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 419; GFX8-NEXT: s_endpgm 420; 421; GFX9-LABEL: add_i32_varying: 422; GFX9: ; %bb.0: ; %entry 423; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 424; GFX9-NEXT: s_mov_b64 s[2:3], exec 425; GFX9-NEXT: v_mov_b32_e32 v2, v0 426; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 427; GFX9-NEXT: v_mov_b32_e32 v1, 0 428; GFX9-NEXT: s_mov_b64 exec, s[4:5] 429; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 430; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 431; GFX9-NEXT: s_not_b64 exec, exec 432; GFX9-NEXT: v_mov_b32_e32 v2, 0 433; GFX9-NEXT: s_not_b64 exec, exec 434; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 435; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 436; GFX9-NEXT: s_nop 1 437; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 438; GFX9-NEXT: s_nop 1 439; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 440; GFX9-NEXT: s_nop 1 441; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 442; GFX9-NEXT: s_nop 1 443; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 444; GFX9-NEXT: s_nop 1 445; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 446; GFX9-NEXT: v_readlane_b32 s2, v2, 63 447; GFX9-NEXT: s_nop 0 448; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 449; GFX9-NEXT: s_mov_b64 exec, s[4:5] 450; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 451; GFX9-NEXT: ; implicit-def: $vgpr0 452; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 453; GFX9-NEXT: s_cbranch_execz BB2_2 454; GFX9-NEXT: ; %bb.1: 455; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 456; GFX9-NEXT: 
v_mov_b32_e32 v3, s2 457; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 458; GFX9-NEXT: ds_add_rtn_u32 v0, v0, v3 459; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 460; GFX9-NEXT: buffer_wbinvl1_vol 461; GFX9-NEXT: BB2_2: 462; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 463; GFX9-NEXT: v_readfirstlane_b32 s2, v0 464; GFX9-NEXT: v_mov_b32_e32 v0, v1 465; GFX9-NEXT: v_add_u32_e32 v0, s2, v0 466; GFX9-NEXT: s_mov_b32 s3, 0xf000 467; GFX9-NEXT: s_mov_b32 s2, -1 468; GFX9-NEXT: s_nop 0 469; GFX9-NEXT: s_waitcnt lgkmcnt(0) 470; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 471; GFX9-NEXT: s_endpgm 472; 473; GFX1064-LABEL: add_i32_varying: 474; GFX1064: ; %bb.0: ; %entry 475; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 476; GFX1064-NEXT: s_mov_b64 s[2:3], exec 477; GFX1064-NEXT: v_mov_b32_e32 v2, v0 478; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 479; GFX1064-NEXT: v_mov_b32_e32 v1, 0 480; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 481; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 482; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s3, v0 483; GFX1064-NEXT: s_not_b64 exec, exec 484; GFX1064-NEXT: v_mov_b32_e32 v2, 0 485; GFX1064-NEXT: s_not_b64 exec, exec 486; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 487; GFX1064-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 488; GFX1064-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 489; GFX1064-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 490; GFX1064-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 491; GFX1064-NEXT: v_mov_b32_e32 v3, v2 492; GFX1064-NEXT: v_permlanex16_b32 v3, v3, -1, -1 493; GFX1064-NEXT: v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 494; GFX1064-NEXT: v_readlane_b32 s2, v2, 31 495; GFX1064-NEXT: v_mov_b32_e32 v3, s2 496; GFX1064-NEXT: v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 497; GFX1064-NEXT: v_readlane_b32 s2, v2, 15 
498; GFX1064-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf 499; GFX1064-NEXT: v_readlane_b32 s3, v2, 31 500; GFX1064-NEXT: v_readlane_b32 s6, v2, 47 501; GFX1064-NEXT: v_writelane_b32 v1, s2, 16 502; GFX1064-NEXT: s_mov_b32 s2, -1 503; GFX1064-NEXT: v_writelane_b32 v1, s3, 32 504; GFX1064-NEXT: v_readlane_b32 s3, v2, 63 505; GFX1064-NEXT: v_writelane_b32 v1, s6, 48 506; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 507; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 508; GFX1064-NEXT: ; implicit-def: $vgpr0 509; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 510; GFX1064-NEXT: s_cbranch_execz BB2_2 511; GFX1064-NEXT: ; %bb.1: 512; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 513; GFX1064-NEXT: v_mov_b32_e32 v7, s3 514; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 515; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 516; GFX1064-NEXT: ds_add_rtn_u32 v0, v0, v7 517; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 518; GFX1064-NEXT: buffer_gl0_inv 519; GFX1064-NEXT: buffer_gl1_inv 520; GFX1064-NEXT: BB2_2: 521; GFX1064-NEXT: v_nop 522; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 523; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 524; GFX1064-NEXT: v_mov_b32_e32 v0, v1 525; GFX1064-NEXT: v_add_nc_u32_e32 v0, s3, v0 526; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 527; GFX1064-NEXT: s_nop 1 528; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 529; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 530; GFX1064-NEXT: s_endpgm 531; 532; GFX1032-LABEL: add_i32_varying: 533; GFX1032: ; %bb.0: ; %entry 534; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 535; GFX1032-NEXT: s_mov_b32 s2, exec_lo 536; GFX1032-NEXT: ; implicit-def: $vcc_hi 537; GFX1032-NEXT: v_mov_b32_e32 v2, v0 538; GFX1032-NEXT: s_or_saveexec_b32 s3, -1 539; GFX1032-NEXT: v_mov_b32_e32 v1, 0 540; GFX1032-NEXT: s_mov_b32 exec_lo, s3 541; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 542; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 543; GFX1032-NEXT: v_mov_b32_e32 v2, 0 544; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 545; GFX1032-NEXT: 
s_or_saveexec_b32 s4, -1 546; GFX1032-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 547; GFX1032-NEXT: s_mov_b32 s2, -1 548; GFX1032-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 549; GFX1032-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 550; GFX1032-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 551; GFX1032-NEXT: v_mov_b32_e32 v3, v2 552; GFX1032-NEXT: v_permlanex16_b32 v3, v3, -1, -1 553; GFX1032-NEXT: v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 554; GFX1032-NEXT: v_readlane_b32 s3, v2, 31 555; GFX1032-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf 556; GFX1032-NEXT: v_readlane_b32 s5, v2, 15 557; GFX1032-NEXT: v_writelane_b32 v1, s5, 16 558; GFX1032-NEXT: s_mov_b32 exec_lo, s4 559; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 560; GFX1032-NEXT: ; implicit-def: $vgpr0 561; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo 562; GFX1032-NEXT: s_cbranch_execz BB2_2 563; GFX1032-NEXT: ; %bb.1: 564; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 565; GFX1032-NEXT: v_mov_b32_e32 v7, s3 566; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 567; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 568; GFX1032-NEXT: ds_add_rtn_u32 v0, v0, v7 569; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 570; GFX1032-NEXT: buffer_gl0_inv 571; GFX1032-NEXT: buffer_gl1_inv 572; GFX1032-NEXT: BB2_2: 573; GFX1032-NEXT: v_nop 574; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 575; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 576; GFX1032-NEXT: v_mov_b32_e32 v0, v1 577; GFX1032-NEXT: v_add_nc_u32_e32 v0, s3, v0 578; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 579; GFX1032-NEXT: s_nop 1 580; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 581; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 582; GFX1032-NEXT: s_endpgm 583entry: 584 %lane = call i32 @llvm.amdgcn.workitem.id.x() 585 %old = atomicrmw add i32 addrspace(3)* @local_var32, 
i32 %lane acq_rel 586 store i32 %old, i32 addrspace(1)* %out 587 ret void 588} 589 590define amdgpu_kernel void @add_i32_varying_gfx1032(i32 addrspace(1)* %out) { 591; 592; 593; GFX7LESS-LABEL: add_i32_varying_gfx1032: 594; GFX7LESS: ; %bb.0: ; %entry 595; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 596; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 597; GFX7LESS-NEXT: s_mov_b32 m0, -1 598; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 599; GFX7LESS-NEXT: ds_add_rtn_u32 v0, v1, v0 600; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 601; GFX7LESS-NEXT: buffer_wbinvl1 602; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 603; GFX7LESS-NEXT: s_mov_b32 s2, -1 604; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 605; GFX7LESS-NEXT: s_endpgm 606; 607; GFX8-LABEL: add_i32_varying_gfx1032: 608; GFX8: ; %bb.0: ; %entry 609; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 610; GFX8-NEXT: s_mov_b64 s[2:3], exec 611; GFX8-NEXT: v_mov_b32_e32 v2, v0 612; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 613; GFX8-NEXT: v_mov_b32_e32 v1, 0 614; GFX8-NEXT: s_mov_b64 exec, s[4:5] 615; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 616; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 617; GFX8-NEXT: s_not_b64 exec, exec 618; GFX8-NEXT: v_mov_b32_e32 v2, 0 619; GFX8-NEXT: s_not_b64 exec, exec 620; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 621; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 622; GFX8-NEXT: s_nop 1 623; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 624; GFX8-NEXT: s_nop 1 625; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 626; GFX8-NEXT: s_nop 1 627; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 628; GFX8-NEXT: s_nop 1 629; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 630; GFX8-NEXT: s_nop 1 631; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 632; 
GFX8-NEXT: v_readlane_b32 s2, v2, 63 633; GFX8-NEXT: s_nop 0 634; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 635; GFX8-NEXT: s_mov_b64 exec, s[4:5] 636; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 637; GFX8-NEXT: ; implicit-def: $vgpr0 638; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 639; GFX8-NEXT: s_cbranch_execz BB3_2 640; GFX8-NEXT: ; %bb.1: 641; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 642; GFX8-NEXT: v_mov_b32_e32 v3, s2 643; GFX8-NEXT: s_mov_b32 m0, -1 644; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 645; GFX8-NEXT: ds_add_rtn_u32 v0, v0, v3 646; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 647; GFX8-NEXT: buffer_wbinvl1_vol 648; GFX8-NEXT: BB3_2: 649; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 650; GFX8-NEXT: v_readfirstlane_b32 s2, v0 651; GFX8-NEXT: v_mov_b32_e32 v0, v1 652; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0 653; GFX8-NEXT: s_mov_b32 s3, 0xf000 654; GFX8-NEXT: s_mov_b32 s2, -1 655; GFX8-NEXT: s_nop 0 656; GFX8-NEXT: s_waitcnt lgkmcnt(0) 657; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 658; GFX8-NEXT: s_endpgm 659; 660; GFX9-LABEL: add_i32_varying_gfx1032: 661; GFX9: ; %bb.0: ; %entry 662; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 663; GFX9-NEXT: s_mov_b64 s[2:3], exec 664; GFX9-NEXT: v_mov_b32_e32 v2, v0 665; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 666; GFX9-NEXT: v_mov_b32_e32 v1, 0 667; GFX9-NEXT: s_mov_b64 exec, s[4:5] 668; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 669; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 670; GFX9-NEXT: s_not_b64 exec, exec 671; GFX9-NEXT: v_mov_b32_e32 v2, 0 672; GFX9-NEXT: s_not_b64 exec, exec 673; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 674; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 675; GFX9-NEXT: s_nop 1 676; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 677; GFX9-NEXT: s_nop 1 678; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 679; GFX9-NEXT: s_nop 1 680; GFX9-NEXT: 
v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 681; GFX9-NEXT: s_nop 1 682; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 683; GFX9-NEXT: s_nop 1 684; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 685; GFX9-NEXT: v_readlane_b32 s2, v2, 63 686; GFX9-NEXT: s_nop 0 687; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 688; GFX9-NEXT: s_mov_b64 exec, s[4:5] 689; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 690; GFX9-NEXT: ; implicit-def: $vgpr0 691; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 692; GFX9-NEXT: s_cbranch_execz BB3_2 693; GFX9-NEXT: ; %bb.1: 694; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 695; GFX9-NEXT: v_mov_b32_e32 v3, s2 696; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 697; GFX9-NEXT: ds_add_rtn_u32 v0, v0, v3 698; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 699; GFX9-NEXT: buffer_wbinvl1_vol 700; GFX9-NEXT: BB3_2: 701; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 702; GFX9-NEXT: v_readfirstlane_b32 s2, v0 703; GFX9-NEXT: v_mov_b32_e32 v0, v1 704; GFX9-NEXT: v_add_u32_e32 v0, s2, v0 705; GFX9-NEXT: s_mov_b32 s3, 0xf000 706; GFX9-NEXT: s_mov_b32 s2, -1 707; GFX9-NEXT: s_nop 0 708; GFX9-NEXT: s_waitcnt lgkmcnt(0) 709; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 710; GFX9-NEXT: s_endpgm 711; 712; GFX1064-LABEL: add_i32_varying_gfx1032: 713; GFX1064: ; %bb.0: ; %entry 714; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 715; GFX1064-NEXT: s_mov_b64 s[2:3], exec 716; GFX1064-NEXT: v_mov_b32_e32 v2, v0 717; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 718; GFX1064-NEXT: v_mov_b32_e32 v1, 0 719; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 720; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 721; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s3, v0 722; GFX1064-NEXT: s_not_b64 exec, exec 723; GFX1064-NEXT: v_mov_b32_e32 v2, 0 724; GFX1064-NEXT: s_not_b64 exec, exec 725; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 726; GFX1064-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:1 
row_mask:0xf bank_mask:0xf bound_ctrl:0 727; GFX1064-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 728; GFX1064-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 729; GFX1064-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 730; GFX1064-NEXT: v_mov_b32_e32 v3, v2 731; GFX1064-NEXT: v_permlanex16_b32 v3, v3, -1, -1 732; GFX1064-NEXT: v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 733; GFX1064-NEXT: v_readlane_b32 s2, v2, 31 734; GFX1064-NEXT: v_mov_b32_e32 v3, s2 735; GFX1064-NEXT: v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 736; GFX1064-NEXT: v_readlane_b32 s2, v2, 15 737; GFX1064-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf 738; GFX1064-NEXT: v_readlane_b32 s3, v2, 31 739; GFX1064-NEXT: v_readlane_b32 s6, v2, 47 740; GFX1064-NEXT: v_writelane_b32 v1, s2, 16 741; GFX1064-NEXT: s_mov_b32 s2, -1 742; GFX1064-NEXT: v_writelane_b32 v1, s3, 32 743; GFX1064-NEXT: v_readlane_b32 s3, v2, 63 744; GFX1064-NEXT: v_writelane_b32 v1, s6, 48 745; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 746; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 747; GFX1064-NEXT: ; implicit-def: $vgpr0 748; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 749; GFX1064-NEXT: s_cbranch_execz BB3_2 750; GFX1064-NEXT: ; %bb.1: 751; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 752; GFX1064-NEXT: v_mov_b32_e32 v7, s3 753; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 754; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 755; GFX1064-NEXT: ds_add_rtn_u32 v0, v0, v7 756; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 757; GFX1064-NEXT: buffer_gl0_inv 758; GFX1064-NEXT: buffer_gl1_inv 759; GFX1064-NEXT: BB3_2: 760; GFX1064-NEXT: v_nop 761; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 762; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 763; GFX1064-NEXT: v_mov_b32_e32 v0, v1 764; GFX1064-NEXT: v_add_nc_u32_e32 v0, s3, v0 765; GFX1064-NEXT: s_mov_b32 s3, 
0x31016000 766; GFX1064-NEXT: s_nop 1 767; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 768; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 769; GFX1064-NEXT: s_endpgm 770; 771; GFX1032-LABEL: add_i32_varying_gfx1032: 772; GFX1032: ; %bb.0: ; %entry 773; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 774; GFX1032-NEXT: s_mov_b32 s2, exec_lo 775; GFX1032-NEXT: ; implicit-def: $vcc_hi 776; GFX1032-NEXT: v_mov_b32_e32 v2, v0 777; GFX1032-NEXT: s_or_saveexec_b32 s3, -1 778; GFX1032-NEXT: v_mov_b32_e32 v1, 0 779; GFX1032-NEXT: s_mov_b32 exec_lo, s3 780; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 781; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 782; GFX1032-NEXT: v_mov_b32_e32 v2, 0 783; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 784; GFX1032-NEXT: s_or_saveexec_b32 s4, -1 785; GFX1032-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 786; GFX1032-NEXT: s_mov_b32 s2, -1 787; GFX1032-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 788; GFX1032-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 789; GFX1032-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 790; GFX1032-NEXT: v_mov_b32_e32 v3, v2 791; GFX1032-NEXT: v_permlanex16_b32 v3, v3, -1, -1 792; GFX1032-NEXT: v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 793; GFX1032-NEXT: v_readlane_b32 s3, v2, 31 794; GFX1032-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf 795; GFX1032-NEXT: v_readlane_b32 s5, v2, 15 796; GFX1032-NEXT: v_writelane_b32 v1, s5, 16 797; GFX1032-NEXT: s_mov_b32 exec_lo, s4 798; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 799; GFX1032-NEXT: ; implicit-def: $vgpr0 800; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo 801; GFX1032-NEXT: s_cbranch_execz BB3_2 802; GFX1032-NEXT: ; %bb.1: 803; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 804; GFX1032-NEXT: v_mov_b32_e32 v7, s3 805; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 
806; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 807; GFX1032-NEXT: ds_add_rtn_u32 v0, v0, v7 808; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 809; GFX1032-NEXT: buffer_gl0_inv 810; GFX1032-NEXT: buffer_gl1_inv 811; GFX1032-NEXT: BB3_2: 812; GFX1032-NEXT: v_nop 813; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 814; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 815; GFX1032-NEXT: v_mov_b32_e32 v0, v1 816; GFX1032-NEXT: v_add_nc_u32_e32 v0, s3, v0 817; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 818; GFX1032-NEXT: s_nop 1 819; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 820; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 821; GFX1032-NEXT: s_endpgm 822entry: 823 %lane = call i32 @llvm.amdgcn.workitem.id.x() 824 %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %lane acq_rel 825 store i32 %old, i32 addrspace(1)* %out 826 ret void 827} 828 829define amdgpu_kernel void @add_i32_varying_gfx1064(i32 addrspace(1)* %out) { 830; 831; 832; GFX7LESS-LABEL: add_i32_varying_gfx1064: 833; GFX7LESS: ; %bb.0: ; %entry 834; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 835; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 836; GFX7LESS-NEXT: s_mov_b32 m0, -1 837; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 838; GFX7LESS-NEXT: ds_add_rtn_u32 v0, v1, v0 839; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 840; GFX7LESS-NEXT: buffer_wbinvl1 841; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 842; GFX7LESS-NEXT: s_mov_b32 s2, -1 843; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 844; GFX7LESS-NEXT: s_endpgm 845; 846; GFX8-LABEL: add_i32_varying_gfx1064: 847; GFX8: ; %bb.0: ; %entry 848; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 849; GFX8-NEXT: s_mov_b64 s[2:3], exec 850; GFX8-NEXT: v_mov_b32_e32 v2, v0 851; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 852; GFX8-NEXT: v_mov_b32_e32 v1, 0 853; GFX8-NEXT: s_mov_b64 exec, s[4:5] 854; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 855; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 856; GFX8-NEXT: s_not_b64 exec, exec 857; GFX8-NEXT: v_mov_b32_e32 v2, 0 858; 
GFX8-NEXT: s_not_b64 exec, exec 859; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 860; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 861; GFX8-NEXT: s_nop 1 862; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 863; GFX8-NEXT: s_nop 1 864; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 865; GFX8-NEXT: s_nop 1 866; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 867; GFX8-NEXT: s_nop 1 868; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 869; GFX8-NEXT: s_nop 1 870; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 871; GFX8-NEXT: v_readlane_b32 s2, v2, 63 872; GFX8-NEXT: s_nop 0 873; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 874; GFX8-NEXT: s_mov_b64 exec, s[4:5] 875; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 876; GFX8-NEXT: ; implicit-def: $vgpr0 877; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 878; GFX8-NEXT: s_cbranch_execz BB4_2 879; GFX8-NEXT: ; %bb.1: 880; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 881; GFX8-NEXT: v_mov_b32_e32 v3, s2 882; GFX8-NEXT: s_mov_b32 m0, -1 883; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 884; GFX8-NEXT: ds_add_rtn_u32 v0, v0, v3 885; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 886; GFX8-NEXT: buffer_wbinvl1_vol 887; GFX8-NEXT: BB4_2: 888; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 889; GFX8-NEXT: v_readfirstlane_b32 s2, v0 890; GFX8-NEXT: v_mov_b32_e32 v0, v1 891; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0 892; GFX8-NEXT: s_mov_b32 s3, 0xf000 893; GFX8-NEXT: s_mov_b32 s2, -1 894; GFX8-NEXT: s_nop 0 895; GFX8-NEXT: s_waitcnt lgkmcnt(0) 896; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 897; GFX8-NEXT: s_endpgm 898; 899; GFX9-LABEL: add_i32_varying_gfx1064: 900; GFX9: ; %bb.0: ; %entry 901; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 902; GFX9-NEXT: s_mov_b64 s[2:3], exec 903; 
GFX9-NEXT: v_mov_b32_e32 v2, v0 904; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 905; GFX9-NEXT: v_mov_b32_e32 v1, 0 906; GFX9-NEXT: s_mov_b64 exec, s[4:5] 907; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 908; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 909; GFX9-NEXT: s_not_b64 exec, exec 910; GFX9-NEXT: v_mov_b32_e32 v2, 0 911; GFX9-NEXT: s_not_b64 exec, exec 912; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 913; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 914; GFX9-NEXT: s_nop 1 915; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 916; GFX9-NEXT: s_nop 1 917; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 918; GFX9-NEXT: s_nop 1 919; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 920; GFX9-NEXT: s_nop 1 921; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 922; GFX9-NEXT: s_nop 1 923; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 924; GFX9-NEXT: v_readlane_b32 s2, v2, 63 925; GFX9-NEXT: s_nop 0 926; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 927; GFX9-NEXT: s_mov_b64 exec, s[4:5] 928; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 929; GFX9-NEXT: ; implicit-def: $vgpr0 930; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 931; GFX9-NEXT: s_cbranch_execz BB4_2 932; GFX9-NEXT: ; %bb.1: 933; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 934; GFX9-NEXT: v_mov_b32_e32 v3, s2 935; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 936; GFX9-NEXT: ds_add_rtn_u32 v0, v0, v3 937; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 938; GFX9-NEXT: buffer_wbinvl1_vol 939; GFX9-NEXT: BB4_2: 940; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 941; GFX9-NEXT: v_readfirstlane_b32 s2, v0 942; GFX9-NEXT: v_mov_b32_e32 v0, v1 943; GFX9-NEXT: v_add_u32_e32 v0, s2, v0 944; GFX9-NEXT: s_mov_b32 s3, 0xf000 945; GFX9-NEXT: s_mov_b32 s2, -1 946; GFX9-NEXT: s_nop 0 947; GFX9-NEXT: s_waitcnt lgkmcnt(0) 
948; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 949; GFX9-NEXT: s_endpgm 950; 951; GFX1064-LABEL: add_i32_varying_gfx1064: 952; GFX1064: ; %bb.0: ; %entry 953; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 954; GFX1064-NEXT: s_mov_b64 s[2:3], exec 955; GFX1064-NEXT: v_mov_b32_e32 v2, v0 956; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 957; GFX1064-NEXT: v_mov_b32_e32 v1, 0 958; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 959; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 960; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s3, v0 961; GFX1064-NEXT: s_not_b64 exec, exec 962; GFX1064-NEXT: v_mov_b32_e32 v2, 0 963; GFX1064-NEXT: s_not_b64 exec, exec 964; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 965; GFX1064-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 966; GFX1064-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 967; GFX1064-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 968; GFX1064-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 969; GFX1064-NEXT: v_mov_b32_e32 v3, v2 970; GFX1064-NEXT: v_permlanex16_b32 v3, v3, -1, -1 971; GFX1064-NEXT: v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 972; GFX1064-NEXT: v_readlane_b32 s2, v2, 31 973; GFX1064-NEXT: v_mov_b32_e32 v3, s2 974; GFX1064-NEXT: v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 975; GFX1064-NEXT: v_readlane_b32 s2, v2, 15 976; GFX1064-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf 977; GFX1064-NEXT: v_readlane_b32 s3, v2, 31 978; GFX1064-NEXT: v_readlane_b32 s6, v2, 47 979; GFX1064-NEXT: v_writelane_b32 v1, s2, 16 980; GFX1064-NEXT: s_mov_b32 s2, -1 981; GFX1064-NEXT: v_writelane_b32 v1, s3, 32 982; GFX1064-NEXT: v_readlane_b32 s3, v2, 63 983; GFX1064-NEXT: v_writelane_b32 v1, s6, 48 984; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 985; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 986; GFX1064-NEXT: ; 
implicit-def: $vgpr0 987; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 988; GFX1064-NEXT: s_cbranch_execz BB4_2 989; GFX1064-NEXT: ; %bb.1: 990; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 991; GFX1064-NEXT: v_mov_b32_e32 v7, s3 992; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 993; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 994; GFX1064-NEXT: ds_add_rtn_u32 v0, v0, v7 995; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 996; GFX1064-NEXT: buffer_gl0_inv 997; GFX1064-NEXT: buffer_gl1_inv 998; GFX1064-NEXT: BB4_2: 999; GFX1064-NEXT: v_nop 1000; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 1001; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 1002; GFX1064-NEXT: v_mov_b32_e32 v0, v1 1003; GFX1064-NEXT: v_add_nc_u32_e32 v0, s3, v0 1004; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 1005; GFX1064-NEXT: s_nop 1 1006; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1007; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 1008; GFX1064-NEXT: s_endpgm 1009; 1010; GFX1032-LABEL: add_i32_varying_gfx1064: 1011; GFX1032: ; %bb.0: ; %entry 1012; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1013; GFX1032-NEXT: s_mov_b32 s2, exec_lo 1014; GFX1032-NEXT: ; implicit-def: $vcc_hi 1015; GFX1032-NEXT: v_mov_b32_e32 v2, v0 1016; GFX1032-NEXT: s_or_saveexec_b32 s3, -1 1017; GFX1032-NEXT: v_mov_b32_e32 v1, 0 1018; GFX1032-NEXT: s_mov_b32 exec_lo, s3 1019; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 1020; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 1021; GFX1032-NEXT: v_mov_b32_e32 v2, 0 1022; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 1023; GFX1032-NEXT: s_or_saveexec_b32 s4, -1 1024; GFX1032-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 1025; GFX1032-NEXT: s_mov_b32 s2, -1 1026; GFX1032-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 1027; GFX1032-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 1028; GFX1032-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 1029; 
GFX1032-NEXT: v_mov_b32_e32 v3, v2 1030; GFX1032-NEXT: v_permlanex16_b32 v3, v3, -1, -1 1031; GFX1032-NEXT: v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 1032; GFX1032-NEXT: v_readlane_b32 s3, v2, 31 1033; GFX1032-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf 1034; GFX1032-NEXT: v_readlane_b32 s5, v2, 15 1035; GFX1032-NEXT: v_writelane_b32 v1, s5, 16 1036; GFX1032-NEXT: s_mov_b32 exec_lo, s4 1037; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 1038; GFX1032-NEXT: ; implicit-def: $vgpr0 1039; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo 1040; GFX1032-NEXT: s_cbranch_execz BB4_2 1041; GFX1032-NEXT: ; %bb.1: 1042; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 1043; GFX1032-NEXT: v_mov_b32_e32 v7, s3 1044; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1045; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 1046; GFX1032-NEXT: ds_add_rtn_u32 v0, v0, v7 1047; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1048; GFX1032-NEXT: buffer_gl0_inv 1049; GFX1032-NEXT: buffer_gl1_inv 1050; GFX1032-NEXT: BB4_2: 1051; GFX1032-NEXT: v_nop 1052; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 1053; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 1054; GFX1032-NEXT: v_mov_b32_e32 v0, v1 1055; GFX1032-NEXT: v_add_nc_u32_e32 v0, s3, v0 1056; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 1057; GFX1032-NEXT: s_nop 1 1058; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1059; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 1060; GFX1032-NEXT: s_endpgm 1061entry: 1062 %lane = call i32 @llvm.amdgcn.workitem.id.x() 1063 %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %lane acq_rel 1064 store i32 %old, i32 addrspace(1)* %out 1065 ret void 1066} 1067 1068define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) { 1069; 1070; 1071; GFX7LESS-LABEL: add_i64_constant: 1072; GFX7LESS: ; %bb.0: ; %entry 1073; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec 1074; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1075; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 1076; 
GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 1077; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1078; GFX7LESS-NEXT: ; implicit-def: $vgpr1_vgpr2 1079; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc 1080; GFX7LESS-NEXT: s_cbranch_execz BB5_2 1081; GFX7LESS-NEXT: ; %bb.1: 1082; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 1083; GFX7LESS-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 1084; GFX7LESS-NEXT: v_mul_hi_u32_u24_e64 v2, s4, 5 1085; GFX7LESS-NEXT: v_mul_u32_u24_e64 v1, s4, 5 1086; GFX7LESS-NEXT: s_mov_b32 m0, -1 1087; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1088; GFX7LESS-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] 1089; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1090; GFX7LESS-NEXT: buffer_wbinvl1 1091; GFX7LESS-NEXT: BB5_2: 1092; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] 1093; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v1 1094; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v2 1095; GFX7LESS-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v0 1096; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1097; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 1098; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4 1099; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s2, v0 1100; GFX7LESS-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc 1101; GFX7LESS-NEXT: s_mov_b32 s2, -1 1102; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1103; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1104; GFX7LESS-NEXT: s_endpgm 1105; 1106; GFX8-LABEL: add_i64_constant: 1107; GFX8: ; %bb.0: ; %entry 1108; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1109; GFX8-NEXT: s_mov_b64 s[4:5], exec 1110; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 1111; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 1112; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1113; GFX8-NEXT: ; implicit-def: $vgpr1_vgpr2 1114; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 1115; GFX8-NEXT: s_cbranch_execz BB5_2 1116; GFX8-NEXT: ; %bb.1: 1117; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 1118; GFX8-NEXT: v_mul_hi_u32_u24_e64 v2, s4, 5 1119; GFX8-NEXT: v_mul_u32_u24_e64 v1, s4, 5 1120; GFX8-NEXT: v_mov_b32_e32 v3, 
local_var64@abs32@lo 1121; GFX8-NEXT: s_mov_b32 m0, -1 1122; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1123; GFX8-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] 1124; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1125; GFX8-NEXT: buffer_wbinvl1_vol 1126; GFX8-NEXT: BB5_2: 1127; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 1128; GFX8-NEXT: v_readfirstlane_b32 s2, v1 1129; GFX8-NEXT: v_readfirstlane_b32 s3, v2 1130; GFX8-NEXT: v_mov_b32_e32 v1, s2 1131; GFX8-NEXT: v_mov_b32_e32 v2, s3 1132; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v0, 5, v[1:2] 1133; GFX8-NEXT: s_mov_b32 s3, 0xf000 1134; GFX8-NEXT: s_mov_b32 s2, -1 1135; GFX8-NEXT: s_nop 2 1136; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1137; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1138; GFX8-NEXT: s_endpgm 1139; 1140; GFX9-LABEL: add_i64_constant: 1141; GFX9: ; %bb.0: ; %entry 1142; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1143; GFX9-NEXT: s_mov_b64 s[4:5], exec 1144; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 1145; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 1146; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1147; GFX9-NEXT: ; implicit-def: $vgpr1_vgpr2 1148; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 1149; GFX9-NEXT: s_cbranch_execz BB5_2 1150; GFX9-NEXT: ; %bb.1: 1151; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 1152; GFX9-NEXT: v_mul_hi_u32_u24_e64 v2, s4, 5 1153; GFX9-NEXT: v_mul_u32_u24_e64 v1, s4, 5 1154; GFX9-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 1155; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1156; GFX9-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] 1157; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1158; GFX9-NEXT: buffer_wbinvl1_vol 1159; GFX9-NEXT: BB5_2: 1160; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 1161; GFX9-NEXT: v_readfirstlane_b32 s2, v1 1162; GFX9-NEXT: v_readfirstlane_b32 s3, v2 1163; GFX9-NEXT: v_mov_b32_e32 v1, s2 1164; GFX9-NEXT: v_mov_b32_e32 v2, s3 1165; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v0, 5, v[1:2] 1166; GFX9-NEXT: s_mov_b32 s3, 0xf000 1167; GFX9-NEXT: s_mov_b32 s2, -1 1168; GFX9-NEXT: s_nop 2 1169; GFX9-NEXT: 
s_waitcnt lgkmcnt(0) 1170; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1171; GFX9-NEXT: s_endpgm 1172; 1173; GFX1064-LABEL: add_i64_constant: 1174; GFX1064: ; %bb.0: ; %entry 1175; GFX1064-NEXT: s_mov_b64 s[4:5], exec 1176; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1177; GFX1064-NEXT: ; implicit-def: $vgpr1_vgpr2 1178; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 1179; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s5, v0 1180; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1181; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc 1182; GFX1064-NEXT: s_cbranch_execz BB5_2 1183; GFX1064-NEXT: ; %bb.1: 1184; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 1185; GFX1064-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 1186; GFX1064-NEXT: v_mul_hi_u32_u24_e64 v2, s4, 5 1187; GFX1064-NEXT: v_mul_u32_u24_e64 v1, s4, 5 1188; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1189; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 1190; GFX1064-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] 1191; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1192; GFX1064-NEXT: buffer_gl0_inv 1193; GFX1064-NEXT: buffer_gl1_inv 1194; GFX1064-NEXT: BB5_2: 1195; GFX1064-NEXT: v_nop 1196; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] 1197; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 1198; GFX1064-NEXT: v_readfirstlane_b32 s3, v2 1199; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v0, 5, s[2:3] 1200; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 1201; GFX1064-NEXT: s_mov_b32 s2, -1 1202; GFX1064-NEXT: s_nop 2 1203; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1204; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1205; GFX1064-NEXT: s_endpgm 1206; 1207; GFX1032-LABEL: add_i64_constant: 1208; GFX1032: ; %bb.0: ; %entry 1209; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1210; GFX1032-NEXT: s_mov_b32 s3, exec_lo 1211; GFX1032-NEXT: ; implicit-def: $vcc_hi 1212; GFX1032-NEXT: ; implicit-def: $vgpr1_vgpr2 1213; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s3, 0 1214; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 1215; GFX1032-NEXT: 
s_and_saveexec_b32 s2, vcc_lo 1216; GFX1032-NEXT: s_cbranch_execz BB5_2 1217; GFX1032-NEXT: ; %bb.1: 1218; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 1219; GFX1032-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 1220; GFX1032-NEXT: v_mul_hi_u32_u24_e64 v2, s3, 5 1221; GFX1032-NEXT: v_mul_u32_u24_e64 v1, s3, 5 1222; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1223; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 1224; GFX1032-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] 1225; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1226; GFX1032-NEXT: buffer_gl0_inv 1227; GFX1032-NEXT: buffer_gl1_inv 1228; GFX1032-NEXT: BB5_2: 1229; GFX1032-NEXT: v_nop 1230; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 1231; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 1232; GFX1032-NEXT: v_readfirstlane_b32 s3, v2 1233; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s2, v0, 5, s[2:3] 1234; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 1235; GFX1032-NEXT: s_mov_b32 s2, -1 1236; GFX1032-NEXT: s_nop 2 1237; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1238; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1239; GFX1032-NEXT: s_endpgm 1240entry: 1241 %old = atomicrmw add i64 addrspace(3)* @local_var64, i64 5 acq_rel 1242 store i64 %old, i64 addrspace(1)* %out 1243 ret void 1244} 1245 1246define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 %additive) { 1247; 1248; 1249; GFX7LESS-LABEL: add_i64_uniform: 1250; GFX7LESS: ; %bb.0: ; %entry 1251; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec 1252; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 1253; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 1254; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s7, v0 1255; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1256; GFX7LESS-NEXT: ; implicit-def: $vgpr1_vgpr2 1257; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc 1258; GFX7LESS-NEXT: s_cbranch_execz BB6_2 1259; GFX7LESS-NEXT: ; %bb.1: 1260; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 1261; GFX7LESS-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 1262; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1263; 
GFX7LESS-NEXT: s_mul_i32 s7, s3, s6 1264; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6 1265; GFX7LESS-NEXT: v_mul_hi_u32 v1, s2, v1 1266; GFX7LESS-NEXT: s_mul_i32 s6, s2, s6 1267; GFX7LESS-NEXT: v_add_i32_e32 v2, vcc, s7, v1 1268; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6 1269; GFX7LESS-NEXT: s_mov_b32 m0, -1 1270; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1271; GFX7LESS-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] 1272; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1273; GFX7LESS-NEXT: buffer_wbinvl1 1274; GFX7LESS-NEXT: BB6_2: 1275; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] 1276; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 1277; GFX7LESS-NEXT: s_mov_b32 s6, -1 1278; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1279; GFX7LESS-NEXT: s_mov_b32 s4, s0 1280; GFX7LESS-NEXT: s_mov_b32 s5, s1 1281; GFX7LESS-NEXT: v_readfirstlane_b32 s0, v1 1282; GFX7LESS-NEXT: v_readfirstlane_b32 s1, v2 1283; GFX7LESS-NEXT: v_mul_lo_u32 v1, s3, v0 1284; GFX7LESS-NEXT: v_mul_hi_u32 v2, s2, v0 1285; GFX7LESS-NEXT: v_mul_lo_u32 v0, s2, v0 1286; GFX7LESS-NEXT: v_add_i32_e32 v1, vcc, v2, v1 1287; GFX7LESS-NEXT: v_mov_b32_e32 v2, s1 1288; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s0, v0 1289; GFX7LESS-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc 1290; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1291; GFX7LESS-NEXT: s_endpgm 1292; 1293; GFX8-LABEL: add_i64_uniform: 1294; GFX8: ; %bb.0: ; %entry 1295; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1296; GFX8-NEXT: s_mov_b64 s[6:7], exec 1297; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 1298; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 1299; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1300; GFX8-NEXT: ; implicit-def: $vgpr1_vgpr2 1301; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 1302; GFX8-NEXT: s_cbranch_execz BB6_2 1303; GFX8-NEXT: ; %bb.1: 1304; GFX8-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 1305; GFX8-NEXT: v_mov_b32_e32 v1, s6 1306; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1307; GFX8-NEXT: v_mul_hi_u32 v1, s2, v1 1308; GFX8-NEXT: s_mul_i32 s7, s3, s6 1309; GFX8-NEXT: s_mul_i32 s6, s2, s6 1310; 
GFX8-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 1311; GFX8-NEXT: v_add_u32_e32 v2, vcc, s7, v1 1312; GFX8-NEXT: v_mov_b32_e32 v1, s6 1313; GFX8-NEXT: s_mov_b32 m0, -1 1314; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1315; GFX8-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] 1316; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1317; GFX8-NEXT: buffer_wbinvl1_vol 1318; GFX8-NEXT: BB6_2: 1319; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 1320; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1321; GFX8-NEXT: s_mov_b32 s4, s0 1322; GFX8-NEXT: v_readfirstlane_b32 s0, v1 1323; GFX8-NEXT: v_mul_lo_u32 v1, s3, v0 1324; GFX8-NEXT: v_mul_hi_u32 v3, s2, v0 1325; GFX8-NEXT: v_mul_lo_u32 v0, s2, v0 1326; GFX8-NEXT: s_mov_b32 s5, s1 1327; GFX8-NEXT: v_readfirstlane_b32 s1, v2 1328; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 1329; GFX8-NEXT: v_mov_b32_e32 v2, s1 1330; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 1331; GFX8-NEXT: s_mov_b32 s7, 0xf000 1332; GFX8-NEXT: s_mov_b32 s6, -1 1333; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc 1334; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1335; GFX8-NEXT: s_endpgm 1336; 1337; GFX9-LABEL: add_i64_uniform: 1338; GFX9: ; %bb.0: ; %entry 1339; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1340; GFX9-NEXT: s_mov_b64 s[6:7], exec 1341; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 1342; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 1343; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1344; GFX9-NEXT: ; implicit-def: $vgpr1_vgpr2 1345; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 1346; GFX9-NEXT: s_cbranch_execz BB6_2 1347; GFX9-NEXT: ; %bb.1: 1348; GFX9-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 1349; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1350; GFX9-NEXT: s_mul_i32 s7, s3, s6 1351; GFX9-NEXT: s_mul_hi_u32 s8, s2, s6 1352; GFX9-NEXT: s_add_i32 s8, s8, s7 1353; GFX9-NEXT: s_mul_i32 s6, s2, s6 1354; GFX9-NEXT: v_mov_b32_e32 v1, s6 1355; GFX9-NEXT: v_mov_b32_e32 v2, s8 1356; GFX9-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 1357; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1358; GFX9-NEXT: ds_add_rtn_u64 v[1:2], 
v3, v[1:2] 1359; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1360; GFX9-NEXT: buffer_wbinvl1_vol 1361; GFX9-NEXT: BB6_2: 1362; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 1363; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1364; GFX9-NEXT: v_mul_lo_u32 v3, s3, v0 1365; GFX9-NEXT: v_mul_hi_u32 v4, s2, v0 1366; GFX9-NEXT: v_mul_lo_u32 v0, s2, v0 1367; GFX9-NEXT: s_mov_b32 s4, s0 1368; GFX9-NEXT: v_readfirstlane_b32 s0, v1 1369; GFX9-NEXT: s_mov_b32 s5, s1 1370; GFX9-NEXT: v_readfirstlane_b32 s1, v2 1371; GFX9-NEXT: v_add_u32_e32 v1, v4, v3 1372; GFX9-NEXT: v_mov_b32_e32 v2, s1 1373; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 1374; GFX9-NEXT: s_mov_b32 s7, 0xf000 1375; GFX9-NEXT: s_mov_b32 s6, -1 1376; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc 1377; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1378; GFX9-NEXT: s_endpgm 1379; 1380; GFX1064-LABEL: add_i64_uniform: 1381; GFX1064: ; %bb.0: ; %entry 1382; GFX1064-NEXT: s_mov_b64 s[6:7], exec 1383; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1384; GFX1064-NEXT: ; implicit-def: $vgpr1_vgpr2 1385; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 1386; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s7, v0 1387; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1388; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 1389; GFX1064-NEXT: s_cbranch_execz BB6_2 1390; GFX1064-NEXT: ; %bb.1: 1391; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 1392; GFX1064-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 1393; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1394; GFX1064-NEXT: s_mul_i32 s7, s3, s6 1395; GFX1064-NEXT: s_mul_hi_u32 s8, s2, s6 1396; GFX1064-NEXT: s_mul_i32 s6, s2, s6 1397; GFX1064-NEXT: s_add_i32 s8, s8, s7 1398; GFX1064-NEXT: v_mov_b32_e32 v1, s6 1399; GFX1064-NEXT: v_mov_b32_e32 v2, s8 1400; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1401; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 1402; GFX1064-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] 1403; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1404; GFX1064-NEXT: buffer_gl0_inv 1405; GFX1064-NEXT: buffer_gl1_inv 1406; 
GFX1064-NEXT: BB6_2: 1407; GFX1064-NEXT: v_nop 1408; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 1409; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1410; GFX1064-NEXT: v_mul_lo_u32 v3, s3, v0 1411; GFX1064-NEXT: v_mul_hi_u32 v4, s2, v0 1412; GFX1064-NEXT: v_mul_lo_u32 v0, s2, v0 1413; GFX1064-NEXT: v_readfirstlane_b32 s4, v1 1414; GFX1064-NEXT: v_readfirstlane_b32 s5, v2 1415; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 1416; GFX1064-NEXT: s_mov_b32 s2, -1 1417; GFX1064-NEXT: v_add_nc_u32_e32 v1, v4, v3 1418; GFX1064-NEXT: v_add_co_u32_e64 v0, vcc, s4, v0 1419; GFX1064-NEXT: v_add_co_ci_u32_e32 v1, vcc, s5, v1, vcc 1420; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1421; GFX1064-NEXT: s_endpgm 1422; 1423; GFX1032-LABEL: add_i64_uniform: 1424; GFX1032: ; %bb.0: ; %entry 1425; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1426; GFX1032-NEXT: s_mov_b32 s5, exec_lo 1427; GFX1032-NEXT: ; implicit-def: $vcc_hi 1428; GFX1032-NEXT: ; implicit-def: $vgpr1_vgpr2 1429; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s5, 0 1430; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 1431; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo 1432; GFX1032-NEXT: s_cbranch_execz BB6_2 1433; GFX1032-NEXT: ; %bb.1: 1434; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s5 1435; GFX1032-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 1436; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1437; GFX1032-NEXT: s_mul_i32 s6, s3, s5 1438; GFX1032-NEXT: s_mul_hi_u32 s7, s2, s5 1439; GFX1032-NEXT: s_mul_i32 s5, s2, s5 1440; GFX1032-NEXT: s_add_i32 s7, s7, s6 1441; GFX1032-NEXT: v_mov_b32_e32 v1, s5 1442; GFX1032-NEXT: v_mov_b32_e32 v2, s7 1443; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1444; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 1445; GFX1032-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] 1446; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1447; GFX1032-NEXT: buffer_gl0_inv 1448; GFX1032-NEXT: buffer_gl1_inv 1449; GFX1032-NEXT: BB6_2: 1450; GFX1032-NEXT: v_nop 1451; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 1452; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 
1453; GFX1032-NEXT: v_mul_lo_u32 v3, s3, v0 1454; GFX1032-NEXT: v_mul_hi_u32 v4, s2, v0 1455; GFX1032-NEXT: v_mul_lo_u32 v0, s2, v0 1456; GFX1032-NEXT: v_readfirstlane_b32 s4, v1 1457; GFX1032-NEXT: v_readfirstlane_b32 s5, v2 1458; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 1459; GFX1032-NEXT: s_mov_b32 s2, -1 1460; GFX1032-NEXT: v_add_nc_u32_e32 v1, v4, v3 1461; GFX1032-NEXT: v_add_co_u32_e64 v0, vcc_lo, s4, v0 1462; GFX1032-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s5, v1, vcc_lo 1463; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1464; GFX1032-NEXT: s_endpgm 1465entry: 1466 %old = atomicrmw add i64 addrspace(3)* @local_var64, i64 %additive acq_rel 1467 store i64 %old, i64 addrspace(1)* %out 1468 ret void 1469} 1470 1471; The checks for add_i64_varying below show the 64-bit varying add emitted as a plain ds_add_rtn_u64 on every target: 1472; none of them contains v_mbcnt_lo_u32_b32, v_mbcnt_hi_u32_b32 or s_bcnt1_i32_b64. 1473; (The NOT directives formerly written here used a GCN prefix that no RUN line enables, so FileCheck never evaluated them.) 1474define amdgpu_kernel void @add_i64_varying(i64 addrspace(1)* %out) { 1475; 1476; 1477; GFX7LESS-LABEL: add_i64_varying: 1478; GFX7LESS: ; %bb.0: ; %entry 1479; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1480; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 1481; GFX7LESS-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 1482; GFX7LESS-NEXT: s_mov_b32 m0, -1 1483; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1484; GFX7LESS-NEXT: ds_add_rtn_u64 v[0:1], v2, v[0:1] 1485; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1486; GFX7LESS-NEXT: buffer_wbinvl1 1487; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 1488; GFX7LESS-NEXT: s_mov_b32 s2, -1 1489; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1490; GFX7LESS-NEXT: s_endpgm 1491; 1492; GFX8-LABEL: add_i64_varying: 1493; GFX8: ; %bb.0: ; %entry 1494; GFX8-NEXT: v_mov_b32_e32 v1, 0 1495; GFX8-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 1496; GFX8-NEXT: s_mov_b32 m0, -1 1497; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1498; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1499; GFX8-NEXT: ds_add_rtn_u64 v[0:1], v2, v[0:1] 1500; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1501; GFX8-NEXT: buffer_wbinvl1_vol 1502; 
GFX8-NEXT: s_mov_b32 s3, 0xf000 1503; GFX8-NEXT: s_mov_b32 s2, -1 1504; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1505; GFX8-NEXT: s_endpgm 1506; 1507; GFX9-LABEL: add_i64_varying: 1508; GFX9: ; %bb.0: ; %entry 1509; GFX9-NEXT: v_mov_b32_e32 v1, 0 1510; GFX9-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 1511; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1512; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1513; GFX9-NEXT: ds_add_rtn_u64 v[0:1], v2, v[0:1] 1514; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1515; GFX9-NEXT: buffer_wbinvl1_vol 1516; GFX9-NEXT: s_mov_b32 s3, 0xf000 1517; GFX9-NEXT: s_mov_b32 s2, -1 1518; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1519; GFX9-NEXT: s_endpgm 1520; 1521; GFX1064-LABEL: add_i64_varying: 1522; GFX1064: ; %bb.0: ; %entry 1523; GFX1064-NEXT: v_mov_b32_e32 v1, 0 1524; GFX1064-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 1525; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1526; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 1527; GFX1064-NEXT: s_mov_b32 s2, -1 1528; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1529; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 1530; GFX1064-NEXT: ds_add_rtn_u64 v[0:1], v2, v[0:1] 1531; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1532; GFX1064-NEXT: buffer_gl0_inv 1533; GFX1064-NEXT: buffer_gl1_inv 1534; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1535; GFX1064-NEXT: s_endpgm 1536; 1537; GFX1032-LABEL: add_i64_varying: 1538; GFX1032: ; %bb.0: ; %entry 1539; GFX1032-NEXT: v_mov_b32_e32 v1, 0 1540; GFX1032-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 1541; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1542; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 1543; GFX1032-NEXT: s_mov_b32 s2, -1 1544; GFX1032-NEXT: ; implicit-def: $vcc_hi 1545; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1546; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 1547; GFX1032-NEXT: ds_add_rtn_u64 v[0:1], v2, v[0:1] 1548; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1549; GFX1032-NEXT: buffer_gl0_inv 1550; GFX1032-NEXT: 
buffer_gl1_inv 1551; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1552; GFX1032-NEXT: s_endpgm 1553entry: 1554 %lane = call i32 @llvm.amdgcn.workitem.id.x() 1555 %zext = zext i32 %lane to i64 1556 %old = atomicrmw add i64 addrspace(3)* @local_var64, i64 %zext acq_rel 1557 store i64 %old, i64 addrspace(1)* %out 1558 ret void 1559} 1560 1561define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out) { 1562; 1563; 1564; GFX7LESS-LABEL: sub_i32_constant: 1565; GFX7LESS: ; %bb.0: ; %entry 1566; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec 1567; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1568; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 1569; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 1570; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1571; GFX7LESS-NEXT: ; implicit-def: $vgpr1 1572; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc 1573; GFX7LESS-NEXT: s_cbranch_execz BB8_2 1574; GFX7LESS-NEXT: ; %bb.1: 1575; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1576; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 1577; GFX7LESS-NEXT: v_mul_u32_u24_e64 v2, s2, 5 1578; GFX7LESS-NEXT: s_mov_b32 m0, -1 1579; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1580; GFX7LESS-NEXT: ds_sub_rtn_u32 v1, v1, v2 1581; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1582; GFX7LESS-NEXT: buffer_wbinvl1 1583; GFX7LESS-NEXT: BB8_2: 1584; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] 1585; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v1 1586; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1587; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 1588; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 1589; GFX7LESS-NEXT: s_mov_b32 s2, -1 1590; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1591; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 1592; GFX7LESS-NEXT: s_endpgm 1593; 1594; GFX8-LABEL: sub_i32_constant: 1595; GFX8: ; %bb.0: ; %entry 1596; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1597; GFX8-NEXT: s_mov_b64 s[2:3], exec 1598; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1599; GFX8-NEXT: v_mbcnt_hi_u32_b32 
v0, s3, v0 1600; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1601; GFX8-NEXT: ; implicit-def: $vgpr1 1602; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 1603; GFX8-NEXT: s_cbranch_execz BB8_2 1604; GFX8-NEXT: ; %bb.1: 1605; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1606; GFX8-NEXT: v_mul_u32_u24_e64 v1, s2, 5 1607; GFX8-NEXT: v_mov_b32_e32 v2, local_var32@abs32@lo 1608; GFX8-NEXT: s_mov_b32 m0, -1 1609; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1610; GFX8-NEXT: ds_sub_rtn_u32 v1, v2, v1 1611; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1612; GFX8-NEXT: buffer_wbinvl1_vol 1613; GFX8-NEXT: BB8_2: 1614; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 1615; GFX8-NEXT: v_readfirstlane_b32 s2, v1 1616; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1617; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s2, v0 1618; GFX8-NEXT: s_mov_b32 s3, 0xf000 1619; GFX8-NEXT: s_mov_b32 s2, -1 1620; GFX8-NEXT: s_nop 0 1621; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1622; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 1623; GFX8-NEXT: s_endpgm 1624; 1625; GFX9-LABEL: sub_i32_constant: 1626; GFX9: ; %bb.0: ; %entry 1627; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1628; GFX9-NEXT: s_mov_b64 s[2:3], exec 1629; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1630; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 1631; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1632; GFX9-NEXT: ; implicit-def: $vgpr1 1633; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 1634; GFX9-NEXT: s_cbranch_execz BB8_2 1635; GFX9-NEXT: ; %bb.1: 1636; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1637; GFX9-NEXT: v_mul_u32_u24_e64 v1, s2, 5 1638; GFX9-NEXT: v_mov_b32_e32 v2, local_var32@abs32@lo 1639; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1640; GFX9-NEXT: ds_sub_rtn_u32 v1, v2, v1 1641; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1642; GFX9-NEXT: buffer_wbinvl1_vol 1643; GFX9-NEXT: BB8_2: 1644; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 1645; GFX9-NEXT: v_readfirstlane_b32 s2, v1 1646; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1647; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 1648; GFX9-NEXT: s_mov_b32 s3, 0xf000 1649; 
GFX9-NEXT: s_mov_b32 s2, -1 1650; GFX9-NEXT: s_nop 0 1651; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1652; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 1653; GFX9-NEXT: s_endpgm 1654; 1655; GFX1064-LABEL: sub_i32_constant: 1656; GFX1064: ; %bb.0: ; %entry 1657; GFX1064-NEXT: s_mov_b64 s[2:3], exec 1658; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1659; GFX1064-NEXT: ; implicit-def: $vgpr1 1660; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 1661; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s3, v0 1662; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1663; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 1664; GFX1064-NEXT: s_cbranch_execz BB8_2 1665; GFX1064-NEXT: ; %bb.1: 1666; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1667; GFX1064-NEXT: v_mov_b32_e32 v2, local_var32@abs32@lo 1668; GFX1064-NEXT: v_mul_u32_u24_e64 v1, s2, 5 1669; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1670; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 1671; GFX1064-NEXT: ds_sub_rtn_u32 v1, v2, v1 1672; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1673; GFX1064-NEXT: buffer_gl0_inv 1674; GFX1064-NEXT: buffer_gl1_inv 1675; GFX1064-NEXT: BB8_2: 1676; GFX1064-NEXT: v_nop 1677; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 1678; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 1679; GFX1064-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1680; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 1681; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s2, v0 1682; GFX1064-NEXT: s_mov_b32 s2, -1 1683; GFX1064-NEXT: s_nop 0 1684; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1685; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 1686; GFX1064-NEXT: s_endpgm 1687; 1688; GFX1032-LABEL: sub_i32_constant: 1689; GFX1032: ; %bb.0: ; %entry 1690; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1691; GFX1032-NEXT: s_mov_b32 s2, exec_lo 1692; GFX1032-NEXT: ; implicit-def: $vcc_hi 1693; GFX1032-NEXT: ; implicit-def: $vgpr1 1694; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 1695; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 1696; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 1697; 
GFX1032-NEXT: s_cbranch_execz BB8_2 1698; GFX1032-NEXT: ; %bb.1: 1699; GFX1032-NEXT: s_bcnt1_i32_b32 s2, s2 1700; GFX1032-NEXT: v_mov_b32_e32 v2, local_var32@abs32@lo 1701; GFX1032-NEXT: v_mul_u32_u24_e64 v1, s2, 5 1702; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1703; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 1704; GFX1032-NEXT: ds_sub_rtn_u32 v1, v2, v1 1705; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1706; GFX1032-NEXT: buffer_gl0_inv 1707; GFX1032-NEXT: buffer_gl1_inv 1708; GFX1032-NEXT: BB8_2: 1709; GFX1032-NEXT: v_nop 1710; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 1711; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 1712; GFX1032-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1713; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 1714; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s2, v0 1715; GFX1032-NEXT: s_mov_b32 s2, -1 1716; GFX1032-NEXT: s_nop 0 1717; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1718; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 1719; GFX1032-NEXT: s_endpgm 1720entry: 1721 %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 5 acq_rel 1722 store i32 %old, i32 addrspace(1)* %out 1723 ret void 1724} 1725 1726define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 %subitive) { 1727; 1728; 1729; GFX7LESS-LABEL: sub_i32_uniform: 1730; GFX7LESS: ; %bb.0: ; %entry 1731; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec 1732; GFX7LESS-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 1733; GFX7LESS-NEXT: s_load_dword s2, s[0:1], 0xb 1734; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 1735; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s7, v0 1736; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1737; GFX7LESS-NEXT: ; implicit-def: $vgpr1 1738; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc 1739; GFX7LESS-NEXT: s_cbranch_execz BB9_2 1740; GFX7LESS-NEXT: ; %bb.1: 1741; GFX7LESS-NEXT: s_bcnt1_i32_b64 s3, s[6:7] 1742; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1743; GFX7LESS-NEXT: s_mul_i32 s3, s2, s3 1744; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 1745; GFX7LESS-NEXT: v_mov_b32_e32 v2, s3 
1746; GFX7LESS-NEXT: s_mov_b32 m0, -1 1747; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1748; GFX7LESS-NEXT: ds_sub_rtn_u32 v1, v1, v2 1749; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1750; GFX7LESS-NEXT: buffer_wbinvl1 1751; GFX7LESS-NEXT: BB9_2: 1752; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1] 1753; GFX7LESS-NEXT: v_readfirstlane_b32 s0, v1 1754; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1755; GFX7LESS-NEXT: v_mul_lo_u32 v0, s2, v0 1756; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 1757; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 1758; GFX7LESS-NEXT: s_mov_b32 s6, -1 1759; GFX7LESS-NEXT: buffer_store_dword v0, off, s[4:7], 0 1760; GFX7LESS-NEXT: s_endpgm 1761; 1762; GFX8-LABEL: sub_i32_uniform: 1763; GFX8: ; %bb.0: ; %entry 1764; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 1765; GFX8-NEXT: s_load_dword s0, s[0:1], 0x2c 1766; GFX8-NEXT: s_mov_b64 s[2:3], exec 1767; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1768; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 1769; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1770; GFX8-NEXT: ; implicit-def: $vgpr1 1771; GFX8-NEXT: s_and_saveexec_b64 s[6:7], vcc 1772; GFX8-NEXT: s_cbranch_execz BB9_2 1773; GFX8-NEXT: ; %bb.1: 1774; GFX8-NEXT: s_bcnt1_i32_b64 s1, s[2:3] 1775; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1776; GFX8-NEXT: s_mul_i32 s1, s0, s1 1777; GFX8-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 1778; GFX8-NEXT: v_mov_b32_e32 v2, s1 1779; GFX8-NEXT: s_mov_b32 m0, -1 1780; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1781; GFX8-NEXT: ds_sub_rtn_u32 v1, v1, v2 1782; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1783; GFX8-NEXT: buffer_wbinvl1_vol 1784; GFX8-NEXT: BB9_2: 1785; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] 1786; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1787; GFX8-NEXT: v_mul_lo_u32 v0, s0, v0 1788; GFX8-NEXT: v_readfirstlane_b32 s0, v1 1789; GFX8-NEXT: s_mov_b32 s7, 0xf000 1790; GFX8-NEXT: s_mov_b32 s6, -1 1791; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s0, v0 1792; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0 1793; GFX8-NEXT: s_endpgm 1794; 1795; GFX9-LABEL: 
sub_i32_uniform: 1796; GFX9: ; %bb.0: ; %entry 1797; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 1798; GFX9-NEXT: s_load_dword s0, s[0:1], 0x2c 1799; GFX9-NEXT: s_mov_b64 s[2:3], exec 1800; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1801; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 1802; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1803; GFX9-NEXT: ; implicit-def: $vgpr1 1804; GFX9-NEXT: s_and_saveexec_b64 s[6:7], vcc 1805; GFX9-NEXT: s_cbranch_execz BB9_2 1806; GFX9-NEXT: ; %bb.1: 1807; GFX9-NEXT: s_bcnt1_i32_b64 s1, s[2:3] 1808; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1809; GFX9-NEXT: s_mul_i32 s1, s0, s1 1810; GFX9-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 1811; GFX9-NEXT: v_mov_b32_e32 v2, s1 1812; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1813; GFX9-NEXT: ds_sub_rtn_u32 v1, v1, v2 1814; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1815; GFX9-NEXT: buffer_wbinvl1_vol 1816; GFX9-NEXT: BB9_2: 1817; GFX9-NEXT: s_or_b64 exec, exec, s[6:7] 1818; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1819; GFX9-NEXT: v_mul_lo_u32 v0, s0, v0 1820; GFX9-NEXT: v_readfirstlane_b32 s0, v1 1821; GFX9-NEXT: s_mov_b32 s7, 0xf000 1822; GFX9-NEXT: s_mov_b32 s6, -1 1823; GFX9-NEXT: v_sub_u32_e32 v0, s0, v0 1824; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 1825; GFX9-NEXT: s_endpgm 1826; 1827; GFX1064-LABEL: sub_i32_uniform: 1828; GFX1064: ; %bb.0: ; %entry 1829; GFX1064-NEXT: s_mov_b64 s[2:3], exec 1830; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 1831; GFX1064-NEXT: s_load_dword s0, s[0:1], 0x2c 1832; GFX1064-NEXT: ; implicit-def: $vgpr1 1833; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 1834; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s3, v0 1835; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1836; GFX1064-NEXT: s_and_saveexec_b64 s[6:7], vcc 1837; GFX1064-NEXT: s_cbranch_execz BB9_2 1838; GFX1064-NEXT: ; %bb.1: 1839; GFX1064-NEXT: s_bcnt1_i32_b64 s1, s[2:3] 1840; GFX1064-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 1841; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1842; GFX1064-NEXT: s_mul_i32 s1, s0, s1 1843; 
GFX1064-NEXT: v_mov_b32_e32 v2, s1 1844; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1845; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 1846; GFX1064-NEXT: ds_sub_rtn_u32 v1, v1, v2 1847; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1848; GFX1064-NEXT: buffer_gl0_inv 1849; GFX1064-NEXT: buffer_gl1_inv 1850; GFX1064-NEXT: BB9_2: 1851; GFX1064-NEXT: v_nop 1852; GFX1064-NEXT: s_or_b64 exec, exec, s[6:7] 1853; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1854; GFX1064-NEXT: v_mul_lo_u32 v0, s0, v0 1855; GFX1064-NEXT: v_readfirstlane_b32 s0, v1 1856; GFX1064-NEXT: s_mov_b32 s7, 0x31016000 1857; GFX1064-NEXT: s_mov_b32 s6, -1 1858; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s0, v0 1859; GFX1064-NEXT: buffer_store_dword v0, off, s[4:7], 0 1860; GFX1064-NEXT: s_endpgm 1861; 1862; GFX1032-LABEL: sub_i32_uniform: 1863; GFX1032: ; %bb.0: ; %entry 1864; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 1865; GFX1032-NEXT: s_load_dword s0, s[0:1], 0x2c 1866; GFX1032-NEXT: s_mov_b32 s2, exec_lo 1867; GFX1032-NEXT: ; implicit-def: $vcc_hi 1868; GFX1032-NEXT: ; implicit-def: $vgpr1 1869; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 1870; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 1871; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo 1872; GFX1032-NEXT: s_cbranch_execz BB9_2 1873; GFX1032-NEXT: ; %bb.1: 1874; GFX1032-NEXT: s_bcnt1_i32_b32 s2, s2 1875; GFX1032-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 1876; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1877; GFX1032-NEXT: s_mul_i32 s2, s0, s2 1878; GFX1032-NEXT: v_mov_b32_e32 v2, s2 1879; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1880; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 1881; GFX1032-NEXT: ds_sub_rtn_u32 v1, v1, v2 1882; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1883; GFX1032-NEXT: buffer_gl0_inv 1884; GFX1032-NEXT: buffer_gl1_inv 1885; GFX1032-NEXT: BB9_2: 1886; GFX1032-NEXT: v_nop 1887; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s1 1888; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1889; GFX1032-NEXT: v_mul_lo_u32 v0, s0, v0 1890; GFX1032-NEXT: v_readfirstlane_b32 
s0, v1 1891; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 1892; GFX1032-NEXT: s_mov_b32 s6, -1 1893; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s0, v0 1894; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0 1895; GFX1032-NEXT: s_endpgm 1896entry: 1897 %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 %subitive acq_rel 1898 store i32 %old, i32 addrspace(1)* %out 1899 ret void 1900} 1901 1902define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) { 1903; 1904; 1905; GFX7LESS-LABEL: sub_i32_varying: 1906; GFX7LESS: ; %bb.0: ; %entry 1907; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1908; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 1909; GFX7LESS-NEXT: s_mov_b32 m0, -1 1910; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1911; GFX7LESS-NEXT: ds_sub_rtn_u32 v0, v1, v0 1912; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1913; GFX7LESS-NEXT: buffer_wbinvl1 1914; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 1915; GFX7LESS-NEXT: s_mov_b32 s2, -1 1916; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 1917; GFX7LESS-NEXT: s_endpgm 1918; 1919; GFX8-LABEL: sub_i32_varying: 1920; GFX8: ; %bb.0: ; %entry 1921; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1922; GFX8-NEXT: s_mov_b64 s[2:3], exec 1923; GFX8-NEXT: v_mov_b32_e32 v2, v0 1924; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 1925; GFX8-NEXT: v_mov_b32_e32 v1, 0 1926; GFX8-NEXT: s_mov_b64 exec, s[4:5] 1927; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1928; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 1929; GFX8-NEXT: s_not_b64 exec, exec 1930; GFX8-NEXT: v_mov_b32_e32 v2, 0 1931; GFX8-NEXT: s_not_b64 exec, exec 1932; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 1933; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 1934; GFX8-NEXT: s_nop 1 1935; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 1936; GFX8-NEXT: s_nop 1 1937; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 1938; GFX8-NEXT: s_nop 1 1939; 
GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 1940; GFX8-NEXT: s_nop 1 1941; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 1942; GFX8-NEXT: s_nop 1 1943; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 1944; GFX8-NEXT: v_readlane_b32 s2, v2, 63 1945; GFX8-NEXT: s_nop 0 1946; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 1947; GFX8-NEXT: s_mov_b64 exec, s[4:5] 1948; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1949; GFX8-NEXT: ; implicit-def: $vgpr0 1950; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 1951; GFX8-NEXT: s_cbranch_execz BB10_2 1952; GFX8-NEXT: ; %bb.1: 1953; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 1954; GFX8-NEXT: v_mov_b32_e32 v3, s2 1955; GFX8-NEXT: s_mov_b32 m0, -1 1956; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1957; GFX8-NEXT: ds_sub_rtn_u32 v0, v0, v3 1958; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1959; GFX8-NEXT: buffer_wbinvl1_vol 1960; GFX8-NEXT: BB10_2: 1961; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 1962; GFX8-NEXT: v_readfirstlane_b32 s2, v0 1963; GFX8-NEXT: v_mov_b32_e32 v0, v1 1964; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s2, v0 1965; GFX8-NEXT: s_mov_b32 s3, 0xf000 1966; GFX8-NEXT: s_mov_b32 s2, -1 1967; GFX8-NEXT: s_nop 0 1968; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1969; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 1970; GFX8-NEXT: s_endpgm 1971; 1972; GFX9-LABEL: sub_i32_varying: 1973; GFX9: ; %bb.0: ; %entry 1974; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1975; GFX9-NEXT: s_mov_b64 s[2:3], exec 1976; GFX9-NEXT: v_mov_b32_e32 v2, v0 1977; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 1978; GFX9-NEXT: v_mov_b32_e32 v1, 0 1979; GFX9-NEXT: s_mov_b64 exec, s[4:5] 1980; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1981; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 1982; GFX9-NEXT: s_not_b64 exec, exec 1983; GFX9-NEXT: v_mov_b32_e32 v2, 0 1984; GFX9-NEXT: s_not_b64 exec, exec 1985; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 1986; 
GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 1987; GFX9-NEXT: s_nop 1 1988; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 1989; GFX9-NEXT: s_nop 1 1990; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 1991; GFX9-NEXT: s_nop 1 1992; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 1993; GFX9-NEXT: s_nop 1 1994; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 1995; GFX9-NEXT: s_nop 1 1996; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 1997; GFX9-NEXT: v_readlane_b32 s2, v2, 63 1998; GFX9-NEXT: s_nop 0 1999; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 2000; GFX9-NEXT: s_mov_b64 exec, s[4:5] 2001; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2002; GFX9-NEXT: ; implicit-def: $vgpr0 2003; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 2004; GFX9-NEXT: s_cbranch_execz BB10_2 2005; GFX9-NEXT: ; %bb.1: 2006; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 2007; GFX9-NEXT: v_mov_b32_e32 v3, s2 2008; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2009; GFX9-NEXT: ds_sub_rtn_u32 v0, v0, v3 2010; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2011; GFX9-NEXT: buffer_wbinvl1_vol 2012; GFX9-NEXT: BB10_2: 2013; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 2014; GFX9-NEXT: v_readfirstlane_b32 s2, v0 2015; GFX9-NEXT: v_mov_b32_e32 v0, v1 2016; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 2017; GFX9-NEXT: s_mov_b32 s3, 0xf000 2018; GFX9-NEXT: s_mov_b32 s2, -1 2019; GFX9-NEXT: s_nop 0 2020; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2021; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 2022; GFX9-NEXT: s_endpgm 2023; 2024; GFX1064-LABEL: sub_i32_varying: 2025; GFX1064: ; %bb.0: ; %entry 2026; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2027; GFX1064-NEXT: s_mov_b64 s[2:3], exec 2028; GFX1064-NEXT: v_mov_b32_e32 v2, v0 2029; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 2030; GFX1064-NEXT: 
v_mov_b32_e32 v1, 0 2031; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 2032; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 2033; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s3, v0 2034; GFX1064-NEXT: s_not_b64 exec, exec 2035; GFX1064-NEXT: v_mov_b32_e32 v2, 0 2036; GFX1064-NEXT: s_not_b64 exec, exec 2037; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 2038; GFX1064-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 2039; GFX1064-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 2040; GFX1064-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 2041; GFX1064-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 2042; GFX1064-NEXT: v_mov_b32_e32 v3, v2 2043; GFX1064-NEXT: v_permlanex16_b32 v3, v3, -1, -1 2044; GFX1064-NEXT: v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 2045; GFX1064-NEXT: v_readlane_b32 s2, v2, 31 2046; GFX1064-NEXT: v_mov_b32_e32 v3, s2 2047; GFX1064-NEXT: v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 2048; GFX1064-NEXT: v_readlane_b32 s2, v2, 15 2049; GFX1064-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf 2050; GFX1064-NEXT: v_readlane_b32 s3, v2, 31 2051; GFX1064-NEXT: v_readlane_b32 s6, v2, 47 2052; GFX1064-NEXT: v_writelane_b32 v1, s2, 16 2053; GFX1064-NEXT: s_mov_b32 s2, -1 2054; GFX1064-NEXT: v_writelane_b32 v1, s3, 32 2055; GFX1064-NEXT: v_readlane_b32 s3, v2, 63 2056; GFX1064-NEXT: v_writelane_b32 v1, s6, 48 2057; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 2058; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2059; GFX1064-NEXT: ; implicit-def: $vgpr0 2060; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 2061; GFX1064-NEXT: s_cbranch_execz BB10_2 2062; GFX1064-NEXT: ; %bb.1: 2063; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 2064; GFX1064-NEXT: v_mov_b32_e32 v7, s3 2065; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2066; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 
2067; GFX1064-NEXT: ds_sub_rtn_u32 v0, v0, v7 2068; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2069; GFX1064-NEXT: buffer_gl0_inv 2070; GFX1064-NEXT: buffer_gl1_inv 2071; GFX1064-NEXT: BB10_2: 2072; GFX1064-NEXT: v_nop 2073; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 2074; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 2075; GFX1064-NEXT: v_mov_b32_e32 v0, v1 2076; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s3, v0 2077; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 2078; GFX1064-NEXT: s_nop 1 2079; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2080; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 2081; GFX1064-NEXT: s_endpgm 2082; 2083; GFX1032-LABEL: sub_i32_varying: 2084; GFX1032: ; %bb.0: ; %entry 2085; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2086; GFX1032-NEXT: s_mov_b32 s2, exec_lo 2087; GFX1032-NEXT: ; implicit-def: $vcc_hi 2088; GFX1032-NEXT: v_mov_b32_e32 v2, v0 2089; GFX1032-NEXT: s_or_saveexec_b32 s3, -1 2090; GFX1032-NEXT: v_mov_b32_e32 v1, 0 2091; GFX1032-NEXT: s_mov_b32 exec_lo, s3 2092; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 2093; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 2094; GFX1032-NEXT: v_mov_b32_e32 v2, 0 2095; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 2096; GFX1032-NEXT: s_or_saveexec_b32 s4, -1 2097; GFX1032-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 2098; GFX1032-NEXT: s_mov_b32 s2, -1 2099; GFX1032-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 2100; GFX1032-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 2101; GFX1032-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 2102; GFX1032-NEXT: v_mov_b32_e32 v3, v2 2103; GFX1032-NEXT: v_permlanex16_b32 v3, v3, -1, -1 2104; GFX1032-NEXT: v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 2105; GFX1032-NEXT: v_readlane_b32 s3, v2, 31 2106; GFX1032-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf 2107; GFX1032-NEXT: 
v_readlane_b32 s5, v2, 15 2108; GFX1032-NEXT: v_writelane_b32 v1, s5, 16 2109; GFX1032-NEXT: s_mov_b32 exec_lo, s4 2110; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 2111; GFX1032-NEXT: ; implicit-def: $vgpr0 2112; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo 2113; GFX1032-NEXT: s_cbranch_execz BB10_2 2114; GFX1032-NEXT: ; %bb.1: 2115; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 2116; GFX1032-NEXT: v_mov_b32_e32 v7, s3 2117; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2118; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 2119; GFX1032-NEXT: ds_sub_rtn_u32 v0, v0, v7 2120; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2121; GFX1032-NEXT: buffer_gl0_inv 2122; GFX1032-NEXT: buffer_gl1_inv 2123; GFX1032-NEXT: BB10_2: 2124; GFX1032-NEXT: v_nop 2125; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 2126; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 2127; GFX1032-NEXT: v_mov_b32_e32 v0, v1 2128; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s3, v0 2129; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 2130; GFX1032-NEXT: s_nop 1 2131; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2132; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 2133; GFX1032-NEXT: s_endpgm 2134entry: 2135 %lane = call i32 @llvm.amdgcn.workitem.id.x() 2136 %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 %lane acq_rel 2137 store i32 %old, i32 addrspace(1)* %out 2138 ret void 2139} 2140 2141define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) { 2142; 2143; 2144; GFX7LESS-LABEL: sub_i64_constant: 2145; GFX7LESS: ; %bb.0: ; %entry 2146; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec 2147; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 2148; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 2149; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 2150; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2151; GFX7LESS-NEXT: ; implicit-def: $vgpr1_vgpr2 2152; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc 2153; GFX7LESS-NEXT: s_cbranch_execz BB11_2 2154; GFX7LESS-NEXT: ; %bb.1: 2155; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 2156; 
GFX7LESS-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 2157; GFX7LESS-NEXT: v_mul_hi_u32_u24_e64 v2, s4, 5 2158; GFX7LESS-NEXT: v_mul_u32_u24_e64 v1, s4, 5 2159; GFX7LESS-NEXT: s_mov_b32 m0, -1 2160; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2161; GFX7LESS-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] 2162; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2163; GFX7LESS-NEXT: buffer_wbinvl1 2164; GFX7LESS-NEXT: BB11_2: 2165; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] 2166; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v1 2167; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v2 2168; GFX7LESS-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v0 2169; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v0 2170; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 2171; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4 2172; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 2173; GFX7LESS-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc 2174; GFX7LESS-NEXT: s_mov_b32 s2, -1 2175; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2176; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2177; GFX7LESS-NEXT: s_endpgm 2178; 2179; GFX8-LABEL: sub_i64_constant: 2180; GFX8: ; %bb.0: ; %entry 2181; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2182; GFX8-NEXT: s_mov_b64 s[4:5], exec 2183; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 2184; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 2185; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2186; GFX8-NEXT: ; implicit-def: $vgpr1_vgpr2 2187; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 2188; GFX8-NEXT: s_cbranch_execz BB11_2 2189; GFX8-NEXT: ; %bb.1: 2190; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 2191; GFX8-NEXT: v_mul_hi_u32_u24_e64 v2, s4, 5 2192; GFX8-NEXT: v_mul_u32_u24_e64 v1, s4, 5 2193; GFX8-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 2194; GFX8-NEXT: s_mov_b32 m0, -1 2195; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2196; GFX8-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] 2197; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2198; GFX8-NEXT: buffer_wbinvl1_vol 2199; GFX8-NEXT: BB11_2: 2200; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 2201; GFX8-NEXT: 
v_readfirstlane_b32 s3, v2 2202; GFX8-NEXT: v_readfirstlane_b32 s2, v1 2203; GFX8-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v0 2204; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v0 2205; GFX8-NEXT: v_mov_b32_e32 v2, s3 2206; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s2, v0 2207; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc 2208; GFX8-NEXT: s_mov_b32 s3, 0xf000 2209; GFX8-NEXT: s_mov_b32 s2, -1 2210; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2211; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2212; GFX8-NEXT: s_endpgm 2213; 2214; GFX9-LABEL: sub_i64_constant: 2215; GFX9: ; %bb.0: ; %entry 2216; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2217; GFX9-NEXT: s_mov_b64 s[4:5], exec 2218; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 2219; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 2220; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2221; GFX9-NEXT: ; implicit-def: $vgpr1_vgpr2 2222; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 2223; GFX9-NEXT: s_cbranch_execz BB11_2 2224; GFX9-NEXT: ; %bb.1: 2225; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 2226; GFX9-NEXT: v_mul_hi_u32_u24_e64 v2, s4, 5 2227; GFX9-NEXT: v_mul_u32_u24_e64 v1, s4, 5 2228; GFX9-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 2229; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2230; GFX9-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] 2231; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2232; GFX9-NEXT: buffer_wbinvl1_vol 2233; GFX9-NEXT: BB11_2: 2234; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 2235; GFX9-NEXT: v_readfirstlane_b32 s3, v2 2236; GFX9-NEXT: v_readfirstlane_b32 s2, v1 2237; GFX9-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v0 2238; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v0 2239; GFX9-NEXT: v_mov_b32_e32 v2, s3 2240; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s2, v0 2241; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc 2242; GFX9-NEXT: s_mov_b32 s3, 0xf000 2243; GFX9-NEXT: s_mov_b32 s2, -1 2244; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2245; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2246; GFX9-NEXT: s_endpgm 2247; 2248; GFX1064-LABEL: sub_i64_constant: 2249; GFX1064: ; %bb.0: ; 
%entry 2250; GFX1064-NEXT: s_mov_b64 s[4:5], exec 2251; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2252; GFX1064-NEXT: ; implicit-def: $vgpr1_vgpr2 2253; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 2254; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s5, v0 2255; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2256; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc 2257; GFX1064-NEXT: s_cbranch_execz BB11_2 2258; GFX1064-NEXT: ; %bb.1: 2259; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 2260; GFX1064-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 2261; GFX1064-NEXT: v_mul_hi_u32_u24_e64 v2, s4, 5 2262; GFX1064-NEXT: v_mul_u32_u24_e64 v1, s4, 5 2263; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2264; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 2265; GFX1064-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] 2266; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2267; GFX1064-NEXT: buffer_gl0_inv 2268; GFX1064-NEXT: buffer_gl1_inv 2269; GFX1064-NEXT: BB11_2: 2270; GFX1064-NEXT: v_nop 2271; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] 2272; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 2273; GFX1064-NEXT: v_mul_u32_u24_e32 v1, 5, v0 2274; GFX1064-NEXT: v_readfirstlane_b32 s3, v2 2275; GFX1064-NEXT: v_mul_hi_u32_u24_e32 v2, 5, v0 2276; GFX1064-NEXT: v_sub_co_u32_e64 v0, vcc, s2, v1 2277; GFX1064-NEXT: s_mov_b32 s2, -1 2278; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v2, vcc 2279; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 2280; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2281; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2282; GFX1064-NEXT: s_endpgm 2283; 2284; GFX1032-LABEL: sub_i64_constant: 2285; GFX1032: ; %bb.0: ; %entry 2286; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2287; GFX1032-NEXT: s_mov_b32 s3, exec_lo 2288; GFX1032-NEXT: ; implicit-def: $vcc_hi 2289; GFX1032-NEXT: ; implicit-def: $vgpr1_vgpr2 2290; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s3, 0 2291; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 2292; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo 2293; GFX1032-NEXT: s_cbranch_execz BB11_2 
2294; GFX1032-NEXT: ; %bb.1: 2295; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 2296; GFX1032-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 2297; GFX1032-NEXT: v_mul_hi_u32_u24_e64 v2, s3, 5 2298; GFX1032-NEXT: v_mul_u32_u24_e64 v1, s3, 5 2299; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2300; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 2301; GFX1032-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] 2302; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2303; GFX1032-NEXT: buffer_gl0_inv 2304; GFX1032-NEXT: buffer_gl1_inv 2305; GFX1032-NEXT: BB11_2: 2306; GFX1032-NEXT: v_nop 2307; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 2308; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 2309; GFX1032-NEXT: v_mul_u32_u24_e32 v1, 5, v0 2310; GFX1032-NEXT: v_readfirstlane_b32 s3, v2 2311; GFX1032-NEXT: v_mul_hi_u32_u24_e32 v2, 5, v0 2312; GFX1032-NEXT: v_sub_co_u32_e64 v0, vcc_lo, s2, v1 2313; GFX1032-NEXT: s_mov_b32 s2, -1 2314; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v2, vcc_lo 2315; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 2316; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2317; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2318; GFX1032-NEXT: s_endpgm 2319entry: 2320 %old = atomicrmw sub i64 addrspace(3)* @local_var64, i64 5 acq_rel 2321 store i64 %old, i64 addrspace(1)* %out 2322 ret void 2323} 2324 2325define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 %subitive) { 2326; 2327; 2328; GFX7LESS-LABEL: sub_i64_uniform: 2329; GFX7LESS: ; %bb.0: ; %entry 2330; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec 2331; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 2332; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 2333; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s7, v0 2334; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2335; GFX7LESS-NEXT: ; implicit-def: $vgpr1_vgpr2 2336; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc 2337; GFX7LESS-NEXT: s_cbranch_execz BB12_2 2338; GFX7LESS-NEXT: ; %bb.1: 2339; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 2340; GFX7LESS-NEXT: v_mov_b32_e32 v3, 
local_var64@abs32@lo 2341; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2342; GFX7LESS-NEXT: s_mul_i32 s7, s3, s6 2343; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6 2344; GFX7LESS-NEXT: v_mul_hi_u32 v1, s2, v1 2345; GFX7LESS-NEXT: s_mul_i32 s6, s2, s6 2346; GFX7LESS-NEXT: v_add_i32_e32 v2, vcc, s7, v1 2347; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6 2348; GFX7LESS-NEXT: s_mov_b32 m0, -1 2349; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2350; GFX7LESS-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] 2351; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2352; GFX7LESS-NEXT: buffer_wbinvl1 2353; GFX7LESS-NEXT: BB12_2: 2354; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] 2355; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 2356; GFX7LESS-NEXT: s_mov_b32 s6, -1 2357; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2358; GFX7LESS-NEXT: s_mov_b32 s4, s0 2359; GFX7LESS-NEXT: s_mov_b32 s5, s1 2360; GFX7LESS-NEXT: v_readfirstlane_b32 s0, v1 2361; GFX7LESS-NEXT: v_readfirstlane_b32 s1, v2 2362; GFX7LESS-NEXT: v_mul_lo_u32 v1, s3, v0 2363; GFX7LESS-NEXT: v_mul_hi_u32 v2, s2, v0 2364; GFX7LESS-NEXT: v_mul_lo_u32 v0, s2, v0 2365; GFX7LESS-NEXT: v_add_i32_e32 v1, vcc, v2, v1 2366; GFX7LESS-NEXT: v_mov_b32_e32 v2, s1 2367; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 2368; GFX7LESS-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc 2369; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 2370; GFX7LESS-NEXT: s_endpgm 2371; 2372; GFX8-LABEL: sub_i64_uniform: 2373; GFX8: ; %bb.0: ; %entry 2374; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 2375; GFX8-NEXT: s_mov_b64 s[6:7], exec 2376; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 2377; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 2378; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2379; GFX8-NEXT: ; implicit-def: $vgpr1_vgpr2 2380; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 2381; GFX8-NEXT: s_cbranch_execz BB12_2 2382; GFX8-NEXT: ; %bb.1: 2383; GFX8-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 2384; GFX8-NEXT: v_mov_b32_e32 v1, s6 2385; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2386; GFX8-NEXT: v_mul_hi_u32 v1, s2, v1 2387; 
GFX8-NEXT: s_mul_i32 s7, s3, s6 2388; GFX8-NEXT: s_mul_i32 s6, s2, s6 2389; GFX8-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 2390; GFX8-NEXT: v_add_u32_e32 v2, vcc, s7, v1 2391; GFX8-NEXT: v_mov_b32_e32 v1, s6 2392; GFX8-NEXT: s_mov_b32 m0, -1 2393; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2394; GFX8-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] 2395; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2396; GFX8-NEXT: buffer_wbinvl1_vol 2397; GFX8-NEXT: BB12_2: 2398; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 2399; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2400; GFX8-NEXT: s_mov_b32 s4, s0 2401; GFX8-NEXT: v_readfirstlane_b32 s0, v1 2402; GFX8-NEXT: v_mul_lo_u32 v1, s3, v0 2403; GFX8-NEXT: v_mul_hi_u32 v3, s2, v0 2404; GFX8-NEXT: v_mul_lo_u32 v0, s2, v0 2405; GFX8-NEXT: s_mov_b32 s5, s1 2406; GFX8-NEXT: v_readfirstlane_b32 s1, v2 2407; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 2408; GFX8-NEXT: v_mov_b32_e32 v2, s1 2409; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s0, v0 2410; GFX8-NEXT: s_mov_b32 s7, 0xf000 2411; GFX8-NEXT: s_mov_b32 s6, -1 2412; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc 2413; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 2414; GFX8-NEXT: s_endpgm 2415; 2416; GFX9-LABEL: sub_i64_uniform: 2417; GFX9: ; %bb.0: ; %entry 2418; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 2419; GFX9-NEXT: s_mov_b64 s[6:7], exec 2420; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 2421; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 2422; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2423; GFX9-NEXT: ; implicit-def: $vgpr1_vgpr2 2424; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 2425; GFX9-NEXT: s_cbranch_execz BB12_2 2426; GFX9-NEXT: ; %bb.1: 2427; GFX9-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 2428; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2429; GFX9-NEXT: s_mul_i32 s7, s3, s6 2430; GFX9-NEXT: s_mul_hi_u32 s8, s2, s6 2431; GFX9-NEXT: s_add_i32 s8, s8, s7 2432; GFX9-NEXT: s_mul_i32 s6, s2, s6 2433; GFX9-NEXT: v_mov_b32_e32 v1, s6 2434; GFX9-NEXT: v_mov_b32_e32 v2, s8 2435; GFX9-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 2436; 
GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2437; GFX9-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] 2438; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2439; GFX9-NEXT: buffer_wbinvl1_vol 2440; GFX9-NEXT: BB12_2: 2441; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 2442; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2443; GFX9-NEXT: v_mul_lo_u32 v3, s3, v0 2444; GFX9-NEXT: v_mul_hi_u32 v4, s2, v0 2445; GFX9-NEXT: v_mul_lo_u32 v0, s2, v0 2446; GFX9-NEXT: s_mov_b32 s4, s0 2447; GFX9-NEXT: v_readfirstlane_b32 s0, v1 2448; GFX9-NEXT: s_mov_b32 s5, s1 2449; GFX9-NEXT: v_readfirstlane_b32 s1, v2 2450; GFX9-NEXT: v_add_u32_e32 v1, v4, v3 2451; GFX9-NEXT: v_mov_b32_e32 v2, s1 2452; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s0, v0 2453; GFX9-NEXT: s_mov_b32 s7, 0xf000 2454; GFX9-NEXT: s_mov_b32 s6, -1 2455; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc 2456; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 2457; GFX9-NEXT: s_endpgm 2458; 2459; GFX1064-LABEL: sub_i64_uniform: 2460; GFX1064: ; %bb.0: ; %entry 2461; GFX1064-NEXT: s_mov_b64 s[6:7], exec 2462; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 2463; GFX1064-NEXT: ; implicit-def: $vgpr1_vgpr2 2464; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 2465; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s7, v0 2466; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2467; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 2468; GFX1064-NEXT: s_cbranch_execz BB12_2 2469; GFX1064-NEXT: ; %bb.1: 2470; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 2471; GFX1064-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 2472; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2473; GFX1064-NEXT: s_mul_i32 s7, s3, s6 2474; GFX1064-NEXT: s_mul_hi_u32 s8, s2, s6 2475; GFX1064-NEXT: s_mul_i32 s6, s2, s6 2476; GFX1064-NEXT: s_add_i32 s8, s8, s7 2477; GFX1064-NEXT: v_mov_b32_e32 v1, s6 2478; GFX1064-NEXT: v_mov_b32_e32 v2, s8 2479; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2480; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 2481; GFX1064-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] 2482; GFX1064-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) 2483; GFX1064-NEXT: buffer_gl0_inv 2484; GFX1064-NEXT: buffer_gl1_inv 2485; GFX1064-NEXT: BB12_2: 2486; GFX1064-NEXT: v_nop 2487; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 2488; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2489; GFX1064-NEXT: v_mul_lo_u32 v3, s3, v0 2490; GFX1064-NEXT: v_mul_hi_u32 v4, s2, v0 2491; GFX1064-NEXT: v_mul_lo_u32 v0, s2, v0 2492; GFX1064-NEXT: v_readfirstlane_b32 s4, v1 2493; GFX1064-NEXT: v_readfirstlane_b32 s5, v2 2494; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 2495; GFX1064-NEXT: s_mov_b32 s2, -1 2496; GFX1064-NEXT: v_add_nc_u32_e32 v1, v4, v3 2497; GFX1064-NEXT: v_sub_co_u32_e64 v0, vcc, s4, v0 2498; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s5, v1, vcc 2499; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2500; GFX1064-NEXT: s_endpgm 2501; 2502; GFX1032-LABEL: sub_i64_uniform: 2503; GFX1032: ; %bb.0: ; %entry 2504; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 2505; GFX1032-NEXT: s_mov_b32 s5, exec_lo 2506; GFX1032-NEXT: ; implicit-def: $vcc_hi 2507; GFX1032-NEXT: ; implicit-def: $vgpr1_vgpr2 2508; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s5, 0 2509; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 2510; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo 2511; GFX1032-NEXT: s_cbranch_execz BB12_2 2512; GFX1032-NEXT: ; %bb.1: 2513; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s5 2514; GFX1032-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 2515; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2516; GFX1032-NEXT: s_mul_i32 s6, s3, s5 2517; GFX1032-NEXT: s_mul_hi_u32 s7, s2, s5 2518; GFX1032-NEXT: s_mul_i32 s5, s2, s5 2519; GFX1032-NEXT: s_add_i32 s7, s7, s6 2520; GFX1032-NEXT: v_mov_b32_e32 v1, s5 2521; GFX1032-NEXT: v_mov_b32_e32 v2, s7 2522; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2523; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 2524; GFX1032-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] 2525; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2526; GFX1032-NEXT: buffer_gl0_inv 2527; GFX1032-NEXT: buffer_gl1_inv 2528; GFX1032-NEXT: BB12_2: 2529; GFX1032-NEXT: v_nop 
 2530; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 2531; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2532; GFX1032-NEXT: v_mul_lo_u32 v3, s3, v0 2533; GFX1032-NEXT: v_mul_hi_u32 v4, s2, v0 2534; GFX1032-NEXT: v_mul_lo_u32 v0, s2, v0 2535; GFX1032-NEXT: v_readfirstlane_b32 s4, v1 2536; GFX1032-NEXT: v_readfirstlane_b32 s5, v2 2537; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 2538; GFX1032-NEXT: s_mov_b32 s2, -1 2539; GFX1032-NEXT: v_add_nc_u32_e32 v1, v4, v3 2540; GFX1032-NEXT: v_sub_co_u32_e64 v0, vcc_lo, s4, v0 2541; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s5, v1, vcc_lo 2542; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2543; GFX1032-NEXT: s_endpgm 2544entry: 2545 %old = atomicrmw sub i64 addrspace(3)* @local_var64, i64 %subitive acq_rel 2546 store i64 %old, i64 addrspace(1)* %out 2547 ret void 2548} 2549 2550; GCN-NOT: v_mbcnt_lo_u32_b32 2551; GCN-NOT: v_mbcnt_hi_u32_b32 2552; GCN-NOT: s_bcnt1_i32_b64 2553define amdgpu_kernel void @sub_i64_varying(i64 addrspace(1)* %out) { 2554; Varying operand case. Each lane subtracts its own zero-extended workitem id, and the checks for every target below expect a plain ds_sub_rtn_u64 with no v_mbcnt or s_bcnt1 sequence. 2555; Note that no RUN line enables a GCN check prefix, so the GCN-NOT lines placed before this function are never evaluated by FileCheck; only the per-target NEXT checks below enforce the absence of those instructions. 2556; GFX7LESS-LABEL: sub_i64_varying: 2557; GFX7LESS: ; %bb.0: ; %entry 2558; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 2559; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 2560; GFX7LESS-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 2561; GFX7LESS-NEXT: s_mov_b32 m0, -1 2562; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2563; GFX7LESS-NEXT: ds_sub_rtn_u64 v[0:1], v2, v[0:1] 2564; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2565; GFX7LESS-NEXT: buffer_wbinvl1 2566; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 2567; GFX7LESS-NEXT: s_mov_b32 s2, -1 2568; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2569; GFX7LESS-NEXT: s_endpgm 2570; 2571; GFX8-LABEL: sub_i64_varying: 2572; GFX8: ; %bb.0: ; %entry 2573; GFX8-NEXT: v_mov_b32_e32 v1, 0 2574; GFX8-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 2575; GFX8-NEXT: s_mov_b32 m0, -1 2576; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2577; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2578; GFX8-NEXT: ds_sub_rtn_u64 v[0:1], v2, v[0:1] 2579; 
GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2580; GFX8-NEXT: buffer_wbinvl1_vol 2581; GFX8-NEXT: s_mov_b32 s3, 0xf000 2582; GFX8-NEXT: s_mov_b32 s2, -1 2583; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2584; GFX8-NEXT: s_endpgm 2585; 2586; GFX9-LABEL: sub_i64_varying: 2587; GFX9: ; %bb.0: ; %entry 2588; GFX9-NEXT: v_mov_b32_e32 v1, 0 2589; GFX9-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 2590; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2591; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2592; GFX9-NEXT: ds_sub_rtn_u64 v[0:1], v2, v[0:1] 2593; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2594; GFX9-NEXT: buffer_wbinvl1_vol 2595; GFX9-NEXT: s_mov_b32 s3, 0xf000 2596; GFX9-NEXT: s_mov_b32 s2, -1 2597; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2598; GFX9-NEXT: s_endpgm 2599; 2600; GFX1064-LABEL: sub_i64_varying: 2601; GFX1064: ; %bb.0: ; %entry 2602; GFX1064-NEXT: v_mov_b32_e32 v1, 0 2603; GFX1064-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 2604; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2605; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 2606; GFX1064-NEXT: s_mov_b32 s2, -1 2607; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2608; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 2609; GFX1064-NEXT: ds_sub_rtn_u64 v[0:1], v2, v[0:1] 2610; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2611; GFX1064-NEXT: buffer_gl0_inv 2612; GFX1064-NEXT: buffer_gl1_inv 2613; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2614; GFX1064-NEXT: s_endpgm 2615; 2616; GFX1032-LABEL: sub_i64_varying: 2617; GFX1032: ; %bb.0: ; %entry 2618; GFX1032-NEXT: v_mov_b32_e32 v1, 0 2619; GFX1032-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 2620; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2621; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 2622; GFX1032-NEXT: s_mov_b32 s2, -1 2623; GFX1032-NEXT: ; implicit-def: $vcc_hi 2624; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2625; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 2626; GFX1032-NEXT: ds_sub_rtn_u64 v[0:1], v2, v[0:1] 2627; GFX1032-NEXT: 
 s_waitcnt vmcnt(0) lgkmcnt(0) 2628; GFX1032-NEXT: buffer_gl0_inv 2629; GFX1032-NEXT: buffer_gl1_inv 2630; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2631; GFX1032-NEXT: s_endpgm 2632entry: 2633 %lane = call i32 @llvm.amdgcn.workitem.id.x() 2634 %zext = zext i32 %lane to i64 2635 %old = atomicrmw sub i64 addrspace(3)* @local_var64, i64 %zext acq_rel 2636 store i64 %old, i64 addrspace(1)* %out 2637 ret void 2638} 2639 2640define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) { 2641; Varying i32 bitwise-and case, where each lane supplies its workitem id. GFX7LESS expects the unoptimized per-lane ds_and_rtn_b32. 2642; GFX8 and newer expect a DPP v_and_b32 scan whose inactive lanes are seeded with -1, one ds_and_rtn_b32 issued by the lane whose v_mbcnt count is 0, and a final v_and_b32 that combines the v_readfirstlane copy of the returned value with each lane's shifted scan value. 2643; GFX7LESS-LABEL: and_i32_varying: 2644; GFX7LESS: ; %bb.0: ; %entry 2645; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 2646; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 2647; GFX7LESS-NEXT: s_mov_b32 m0, -1 2648; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2649; GFX7LESS-NEXT: ds_and_rtn_b32 v0, v1, v0 2650; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2651; GFX7LESS-NEXT: buffer_wbinvl1 2652; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 2653; GFX7LESS-NEXT: s_mov_b32 s2, -1 2654; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 2655; GFX7LESS-NEXT: s_endpgm 2656; 2657; GFX8-LABEL: and_i32_varying: 2658; GFX8: ; %bb.0: ; %entry 2659; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2660; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 2661; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 2662; GFX8-NEXT: v_mov_b32_e32 v2, v0 2663; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 2664; GFX8-NEXT: v_mov_b32_e32 v1, -1 2665; GFX8-NEXT: s_mov_b64 exec, s[2:3] 2666; GFX8-NEXT: s_not_b64 exec, exec 2667; GFX8-NEXT: v_mov_b32_e32 v2, -1 2668; GFX8-NEXT: s_not_b64 exec, exec 2669; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 2670; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 2671; GFX8-NEXT: s_nop 1 2672; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 2673; GFX8-NEXT: s_nop 1 2674; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 2675; GFX8-NEXT: s_nop 1 2676; GFX8-NEXT: v_and_b32_dpp v2, 
v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 2677; GFX8-NEXT: s_nop 1 2678; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 2679; GFX8-NEXT: s_nop 1 2680; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 2681; GFX8-NEXT: v_readlane_b32 s2, v2, 63 2682; GFX8-NEXT: s_nop 0 2683; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 2684; GFX8-NEXT: s_mov_b64 exec, s[4:5] 2685; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 2686; GFX8-NEXT: ; implicit-def: $vgpr0 2687; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 2688; GFX8-NEXT: s_cbranch_execz BB14_2 2689; GFX8-NEXT: ; %bb.1: 2690; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 2691; GFX8-NEXT: v_mov_b32_e32 v3, s2 2692; GFX8-NEXT: s_mov_b32 m0, -1 2693; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2694; GFX8-NEXT: ds_and_rtn_b32 v0, v0, v3 2695; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2696; GFX8-NEXT: buffer_wbinvl1_vol 2697; GFX8-NEXT: BB14_2: 2698; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 2699; GFX8-NEXT: v_readfirstlane_b32 s2, v0 2700; GFX8-NEXT: v_mov_b32_e32 v0, v1 2701; GFX8-NEXT: v_and_b32_e32 v0, s2, v0 2702; GFX8-NEXT: s_mov_b32 s3, 0xf000 2703; GFX8-NEXT: s_mov_b32 s2, -1 2704; GFX8-NEXT: s_nop 0 2705; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2706; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 2707; GFX8-NEXT: s_endpgm 2708; 2709; GFX9-LABEL: and_i32_varying: 2710; GFX9: ; %bb.0: ; %entry 2711; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2712; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 2713; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 2714; GFX9-NEXT: v_mov_b32_e32 v2, v0 2715; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 2716; GFX9-NEXT: v_mov_b32_e32 v1, -1 2717; GFX9-NEXT: s_mov_b64 exec, s[2:3] 2718; GFX9-NEXT: s_not_b64 exec, exec 2719; GFX9-NEXT: v_mov_b32_e32 v2, -1 2720; GFX9-NEXT: s_not_b64 exec, exec 2721; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 2722; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 2723; GFX9-NEXT: 
s_nop 1 2724; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 2725; GFX9-NEXT: s_nop 1 2726; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 2727; GFX9-NEXT: s_nop 1 2728; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 2729; GFX9-NEXT: s_nop 1 2730; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 2731; GFX9-NEXT: s_nop 1 2732; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 2733; GFX9-NEXT: v_readlane_b32 s2, v2, 63 2734; GFX9-NEXT: s_nop 0 2735; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 2736; GFX9-NEXT: s_mov_b64 exec, s[4:5] 2737; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 2738; GFX9-NEXT: ; implicit-def: $vgpr0 2739; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 2740; GFX9-NEXT: s_cbranch_execz BB14_2 2741; GFX9-NEXT: ; %bb.1: 2742; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 2743; GFX9-NEXT: v_mov_b32_e32 v3, s2 2744; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2745; GFX9-NEXT: ds_and_rtn_b32 v0, v0, v3 2746; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2747; GFX9-NEXT: buffer_wbinvl1_vol 2748; GFX9-NEXT: BB14_2: 2749; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 2750; GFX9-NEXT: v_readfirstlane_b32 s2, v0 2751; GFX9-NEXT: v_mov_b32_e32 v0, v1 2752; GFX9-NEXT: v_and_b32_e32 v0, s2, v0 2753; GFX9-NEXT: s_mov_b32 s3, 0xf000 2754; GFX9-NEXT: s_mov_b32 s2, -1 2755; GFX9-NEXT: s_nop 0 2756; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2757; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 2758; GFX9-NEXT: s_endpgm 2759; 2760; GFX1064-LABEL: and_i32_varying: 2761; GFX1064: ; %bb.0: ; %entry 2762; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2763; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v4, exec_lo, 0 2764; GFX1064-NEXT: v_mov_b32_e32 v2, v0 2765; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v4, exec_hi, v4 2766; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 2767; GFX1064-NEXT: v_mov_b32_e32 v1, -1 2768; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 2769; 
GFX1064-NEXT: s_not_b64 exec, exec 2770; GFX1064-NEXT: v_mov_b32_e32 v2, -1 2771; GFX1064-NEXT: s_not_b64 exec, exec 2772; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 2773; GFX1064-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 2774; GFX1064-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 2775; GFX1064-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 2776; GFX1064-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 2777; GFX1064-NEXT: v_mov_b32_e32 v3, v2 2778; GFX1064-NEXT: v_permlanex16_b32 v3, v3, -1, -1 2779; GFX1064-NEXT: v_and_b32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 2780; GFX1064-NEXT: v_readlane_b32 s2, v2, 31 2781; GFX1064-NEXT: v_mov_b32_e32 v3, s2 2782; GFX1064-NEXT: v_and_b32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 2783; GFX1064-NEXT: v_readlane_b32 s2, v2, 15 2784; GFX1064-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf 2785; GFX1064-NEXT: v_readlane_b32 s3, v2, 31 2786; GFX1064-NEXT: v_readlane_b32 s6, v2, 47 2787; GFX1064-NEXT: v_writelane_b32 v1, s2, 16 2788; GFX1064-NEXT: s_mov_b32 s2, -1 2789; GFX1064-NEXT: v_writelane_b32 v1, s3, 32 2790; GFX1064-NEXT: v_readlane_b32 s3, v2, 63 2791; GFX1064-NEXT: v_writelane_b32 v1, s6, 48 2792; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 2793; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 2794; GFX1064-NEXT: ; implicit-def: $vgpr0 2795; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 2796; GFX1064-NEXT: s_cbranch_execz BB14_2 2797; GFX1064-NEXT: ; %bb.1: 2798; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 2799; GFX1064-NEXT: v_mov_b32_e32 v7, s3 2800; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2801; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 2802; GFX1064-NEXT: ds_and_rtn_b32 v0, v0, v7 2803; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2804; GFX1064-NEXT: buffer_gl0_inv 2805; GFX1064-NEXT: buffer_gl1_inv 2806; GFX1064-NEXT: BB14_2: 2807; GFX1064-NEXT: v_nop 2808; GFX1064-NEXT: 
s_or_b64 exec, exec, s[4:5] 2809; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 2810; GFX1064-NEXT: v_mov_b32_e32 v0, v1 2811; GFX1064-NEXT: v_and_b32_e32 v0, s3, v0 2812; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 2813; GFX1064-NEXT: s_nop 1 2814; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2815; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 2816; GFX1064-NEXT: s_endpgm 2817; 2818; GFX1032-LABEL: and_i32_varying: 2819; GFX1032: ; %bb.0: ; %entry 2820; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2821; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v4, exec_lo, 0 2822; GFX1032-NEXT: ; implicit-def: $vcc_hi 2823; GFX1032-NEXT: v_mov_b32_e32 v2, v0 2824; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 2825; GFX1032-NEXT: v_mov_b32_e32 v1, -1 2826; GFX1032-NEXT: s_mov_b32 exec_lo, s2 2827; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 2828; GFX1032-NEXT: v_mov_b32_e32 v2, -1 2829; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 2830; GFX1032-NEXT: s_or_saveexec_b32 s4, -1 2831; GFX1032-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 2832; GFX1032-NEXT: s_mov_b32 s2, -1 2833; GFX1032-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 2834; GFX1032-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 2835; GFX1032-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 2836; GFX1032-NEXT: v_mov_b32_e32 v3, v2 2837; GFX1032-NEXT: v_permlanex16_b32 v3, v3, -1, -1 2838; GFX1032-NEXT: v_and_b32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 2839; GFX1032-NEXT: v_readlane_b32 s3, v2, 31 2840; GFX1032-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf 2841; GFX1032-NEXT: v_readlane_b32 s5, v2, 15 2842; GFX1032-NEXT: v_writelane_b32 v1, s5, 16 2843; GFX1032-NEXT: s_mov_b32 exec_lo, s4 2844; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 2845; GFX1032-NEXT: ; implicit-def: $vgpr0 2846; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo 2847; GFX1032-NEXT: s_cbranch_execz BB14_2 2848; GFX1032-NEXT: ; %bb.1: 2849; GFX1032-NEXT: 
 v_mov_b32_e32 v0, local_var32@abs32@lo 2850; GFX1032-NEXT: v_mov_b32_e32 v7, s3 2851; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2852; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 2853; GFX1032-NEXT: ds_and_rtn_b32 v0, v0, v7 2854; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2855; GFX1032-NEXT: buffer_gl0_inv 2856; GFX1032-NEXT: buffer_gl1_inv 2857; GFX1032-NEXT: BB14_2: 2858; GFX1032-NEXT: v_nop 2859; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 2860; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 2861; GFX1032-NEXT: v_mov_b32_e32 v0, v1 2862; GFX1032-NEXT: v_and_b32_e32 v0, s3, v0 2863; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 2864; GFX1032-NEXT: s_nop 1 2865; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2866; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 2867; GFX1032-NEXT: s_endpgm 2868entry: 2869 %lane = call i32 @llvm.amdgcn.workitem.id.x() 2870 %old = atomicrmw and i32 addrspace(3)* @local_var32, i32 %lane acq_rel 2871 store i32 %old, i32 addrspace(1)* %out 2872 ret void 2873} 2874 2875define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) { 2876; Varying i32 bitwise-or case, where each lane supplies its workitem id. GFX7LESS expects the unoptimized per-lane ds_or_rtn_b32. 2877; GFX8 and newer expect a DPP v_or_b32 scan whose inactive lanes are seeded with 0, one ds_or_rtn_b32 issued by the lane whose v_mbcnt count is 0, and a final v_or_b32 that combines the v_readfirstlane copy of the returned value with each lane's shifted scan value. 2878; GFX7LESS-LABEL: or_i32_varying: 2879; GFX7LESS: ; %bb.0: ; %entry 2880; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 2881; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 2882; GFX7LESS-NEXT: s_mov_b32 m0, -1 2883; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2884; GFX7LESS-NEXT: ds_or_rtn_b32 v0, v1, v0 2885; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2886; GFX7LESS-NEXT: buffer_wbinvl1 2887; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 2888; GFX7LESS-NEXT: s_mov_b32 s2, -1 2889; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 2890; GFX7LESS-NEXT: s_endpgm 2891; 2892; GFX8-LABEL: or_i32_varying: 2893; GFX8: ; %bb.0: ; %entry 2894; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2895; GFX8-NEXT: s_mov_b64 s[2:3], exec 2896; GFX8-NEXT: v_mov_b32_e32 v2, v0 2897; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 2898; GFX8-NEXT: v_mov_b32_e32 v1, 0 2899; GFX8-NEXT: s_mov_b64 exec, s[4:5] 2900; GFX8-NEXT: 
v_mbcnt_lo_u32_b32 v0, s2, 0 2901; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 2902; GFX8-NEXT: s_not_b64 exec, exec 2903; GFX8-NEXT: v_mov_b32_e32 v2, 0 2904; GFX8-NEXT: s_not_b64 exec, exec 2905; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 2906; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 2907; GFX8-NEXT: s_nop 1 2908; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 2909; GFX8-NEXT: s_nop 1 2910; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 2911; GFX8-NEXT: s_nop 1 2912; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 2913; GFX8-NEXT: s_nop 1 2914; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 2915; GFX8-NEXT: s_nop 1 2916; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 2917; GFX8-NEXT: v_readlane_b32 s2, v2, 63 2918; GFX8-NEXT: s_nop 0 2919; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 2920; GFX8-NEXT: s_mov_b64 exec, s[4:5] 2921; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2922; GFX8-NEXT: ; implicit-def: $vgpr0 2923; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 2924; GFX8-NEXT: s_cbranch_execz BB15_2 2925; GFX8-NEXT: ; %bb.1: 2926; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 2927; GFX8-NEXT: v_mov_b32_e32 v3, s2 2928; GFX8-NEXT: s_mov_b32 m0, -1 2929; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2930; GFX8-NEXT: ds_or_rtn_b32 v0, v0, v3 2931; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2932; GFX8-NEXT: buffer_wbinvl1_vol 2933; GFX8-NEXT: BB15_2: 2934; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 2935; GFX8-NEXT: v_readfirstlane_b32 s2, v0 2936; GFX8-NEXT: v_mov_b32_e32 v0, v1 2937; GFX8-NEXT: v_or_b32_e32 v0, s2, v0 2938; GFX8-NEXT: s_mov_b32 s3, 0xf000 2939; GFX8-NEXT: s_mov_b32 s2, -1 2940; GFX8-NEXT: s_nop 0 2941; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2942; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 2943; GFX8-NEXT: s_endpgm 2944; 2945; 
GFX9-LABEL: or_i32_varying: 2946; GFX9: ; %bb.0: ; %entry 2947; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2948; GFX9-NEXT: s_mov_b64 s[2:3], exec 2949; GFX9-NEXT: v_mov_b32_e32 v2, v0 2950; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 2951; GFX9-NEXT: v_mov_b32_e32 v1, 0 2952; GFX9-NEXT: s_mov_b64 exec, s[4:5] 2953; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 2954; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 2955; GFX9-NEXT: s_not_b64 exec, exec 2956; GFX9-NEXT: v_mov_b32_e32 v2, 0 2957; GFX9-NEXT: s_not_b64 exec, exec 2958; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 2959; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 2960; GFX9-NEXT: s_nop 1 2961; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 2962; GFX9-NEXT: s_nop 1 2963; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 2964; GFX9-NEXT: s_nop 1 2965; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 2966; GFX9-NEXT: s_nop 1 2967; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 2968; GFX9-NEXT: s_nop 1 2969; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 2970; GFX9-NEXT: v_readlane_b32 s2, v2, 63 2971; GFX9-NEXT: s_nop 0 2972; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 2973; GFX9-NEXT: s_mov_b64 exec, s[4:5] 2974; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2975; GFX9-NEXT: ; implicit-def: $vgpr0 2976; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 2977; GFX9-NEXT: s_cbranch_execz BB15_2 2978; GFX9-NEXT: ; %bb.1: 2979; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 2980; GFX9-NEXT: v_mov_b32_e32 v3, s2 2981; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2982; GFX9-NEXT: ds_or_rtn_b32 v0, v0, v3 2983; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2984; GFX9-NEXT: buffer_wbinvl1_vol 2985; GFX9-NEXT: BB15_2: 2986; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 2987; GFX9-NEXT: v_readfirstlane_b32 s2, v0 2988; GFX9-NEXT: 
v_mov_b32_e32 v0, v1 2989; GFX9-NEXT: v_or_b32_e32 v0, s2, v0 2990; GFX9-NEXT: s_mov_b32 s3, 0xf000 2991; GFX9-NEXT: s_mov_b32 s2, -1 2992; GFX9-NEXT: s_nop 0 2993; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2994; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 2995; GFX9-NEXT: s_endpgm 2996; 2997; GFX1064-LABEL: or_i32_varying: 2998; GFX1064: ; %bb.0: ; %entry 2999; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3000; GFX1064-NEXT: s_mov_b64 s[2:3], exec 3001; GFX1064-NEXT: v_mov_b32_e32 v2, v0 3002; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 3003; GFX1064-NEXT: v_mov_b32_e32 v1, 0 3004; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 3005; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 3006; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s3, v0 3007; GFX1064-NEXT: s_not_b64 exec, exec 3008; GFX1064-NEXT: v_mov_b32_e32 v2, 0 3009; GFX1064-NEXT: s_not_b64 exec, exec 3010; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 3011; GFX1064-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 3012; GFX1064-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 3013; GFX1064-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 3014; GFX1064-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 3015; GFX1064-NEXT: v_mov_b32_e32 v3, v2 3016; GFX1064-NEXT: v_permlanex16_b32 v3, v3, -1, -1 3017; GFX1064-NEXT: v_or_b32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 3018; GFX1064-NEXT: v_readlane_b32 s2, v2, 31 3019; GFX1064-NEXT: v_mov_b32_e32 v3, s2 3020; GFX1064-NEXT: v_or_b32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 3021; GFX1064-NEXT: v_readlane_b32 s2, v2, 15 3022; GFX1064-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3023; GFX1064-NEXT: v_readlane_b32 s3, v2, 31 3024; GFX1064-NEXT: v_readlane_b32 s6, v2, 47 3025; GFX1064-NEXT: v_writelane_b32 v1, s2, 16 3026; GFX1064-NEXT: s_mov_b32 s2, -1 3027; GFX1064-NEXT: v_writelane_b32 v1, s3, 32 3028; 
GFX1064-NEXT: v_readlane_b32 s3, v2, 63 3029; GFX1064-NEXT: v_writelane_b32 v1, s6, 48 3030; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 3031; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3032; GFX1064-NEXT: ; implicit-def: $vgpr0 3033; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 3034; GFX1064-NEXT: s_cbranch_execz BB15_2 3035; GFX1064-NEXT: ; %bb.1: 3036; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 3037; GFX1064-NEXT: v_mov_b32_e32 v7, s3 3038; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3039; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 3040; GFX1064-NEXT: ds_or_rtn_b32 v0, v0, v7 3041; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3042; GFX1064-NEXT: buffer_gl0_inv 3043; GFX1064-NEXT: buffer_gl1_inv 3044; GFX1064-NEXT: BB15_2: 3045; GFX1064-NEXT: v_nop 3046; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 3047; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 3048; GFX1064-NEXT: v_mov_b32_e32 v0, v1 3049; GFX1064-NEXT: v_or_b32_e32 v0, s3, v0 3050; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 3051; GFX1064-NEXT: s_nop 1 3052; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3053; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 3054; GFX1064-NEXT: s_endpgm 3055; 3056; GFX1032-LABEL: or_i32_varying: 3057; GFX1032: ; %bb.0: ; %entry 3058; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3059; GFX1032-NEXT: s_mov_b32 s2, exec_lo 3060; GFX1032-NEXT: ; implicit-def: $vcc_hi 3061; GFX1032-NEXT: v_mov_b32_e32 v2, v0 3062; GFX1032-NEXT: s_or_saveexec_b32 s3, -1 3063; GFX1032-NEXT: v_mov_b32_e32 v1, 0 3064; GFX1032-NEXT: s_mov_b32 exec_lo, s3 3065; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 3066; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 3067; GFX1032-NEXT: v_mov_b32_e32 v2, 0 3068; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 3069; GFX1032-NEXT: s_or_saveexec_b32 s4, -1 3070; GFX1032-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 3071; GFX1032-NEXT: s_mov_b32 s2, -1 3072; GFX1032-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 3073; 
GFX1032-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 3074; GFX1032-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 3075; GFX1032-NEXT: v_mov_b32_e32 v3, v2 3076; GFX1032-NEXT: v_permlanex16_b32 v3, v3, -1, -1 3077; GFX1032-NEXT: v_or_b32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 3078; GFX1032-NEXT: v_readlane_b32 s3, v2, 31 3079; GFX1032-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3080; GFX1032-NEXT: v_readlane_b32 s5, v2, 15 3081; GFX1032-NEXT: v_writelane_b32 v1, s5, 16 3082; GFX1032-NEXT: s_mov_b32 exec_lo, s4 3083; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 3084; GFX1032-NEXT: ; implicit-def: $vgpr0 3085; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo 3086; GFX1032-NEXT: s_cbranch_execz BB15_2 3087; GFX1032-NEXT: ; %bb.1: 3088; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 3089; GFX1032-NEXT: v_mov_b32_e32 v7, s3 3090; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3091; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 3092; GFX1032-NEXT: ds_or_rtn_b32 v0, v0, v7 3093; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3094; GFX1032-NEXT: buffer_gl0_inv 3095; GFX1032-NEXT: buffer_gl1_inv 3096; GFX1032-NEXT: BB15_2: 3097; GFX1032-NEXT: v_nop 3098; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 3099; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 3100; GFX1032-NEXT: v_mov_b32_e32 v0, v1 3101; GFX1032-NEXT: v_or_b32_e32 v0, s3, v0 3102; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 3103; GFX1032-NEXT: s_nop 1 3104; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3105; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 3106; GFX1032-NEXT: s_endpgm 3107entry: 3108 %lane = call i32 @llvm.amdgcn.workitem.id.x() 3109 %old = atomicrmw or i32 addrspace(3)* @local_var32, i32 %lane acq_rel 3110 store i32 %old, i32 addrspace(1)* %out 3111 ret void 3112} 3113 3114define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) { 3115; 3116; 3117; GFX7LESS-LABEL: xor_i32_varying: 3118; GFX7LESS: ; %bb.0: ; 
%entry 3119; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 3120; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 3121; GFX7LESS-NEXT: s_mov_b32 m0, -1 3122; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3123; GFX7LESS-NEXT: ds_xor_rtn_b32 v0, v1, v0 3124; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3125; GFX7LESS-NEXT: buffer_wbinvl1 3126; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 3127; GFX7LESS-NEXT: s_mov_b32 s2, -1 3128; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 3129; GFX7LESS-NEXT: s_endpgm 3130; 3131; GFX8-LABEL: xor_i32_varying: 3132; GFX8: ; %bb.0: ; %entry 3133; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3134; GFX8-NEXT: s_mov_b64 s[2:3], exec 3135; GFX8-NEXT: v_mov_b32_e32 v2, v0 3136; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 3137; GFX8-NEXT: v_mov_b32_e32 v1, 0 3138; GFX8-NEXT: s_mov_b64 exec, s[4:5] 3139; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 3140; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 3141; GFX8-NEXT: s_not_b64 exec, exec 3142; GFX8-NEXT: v_mov_b32_e32 v2, 0 3143; GFX8-NEXT: s_not_b64 exec, exec 3144; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 3145; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 3146; GFX8-NEXT: s_nop 1 3147; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 3148; GFX8-NEXT: s_nop 1 3149; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 3150; GFX8-NEXT: s_nop 1 3151; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 3152; GFX8-NEXT: s_nop 1 3153; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 3154; GFX8-NEXT: s_nop 1 3155; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 3156; GFX8-NEXT: v_readlane_b32 s2, v2, 63 3157; GFX8-NEXT: s_nop 0 3158; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 3159; GFX8-NEXT: s_mov_b64 exec, s[4:5] 3160; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3161; GFX8-NEXT: ; 
implicit-def: $vgpr0 3162; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 3163; GFX8-NEXT: s_cbranch_execz BB16_2 3164; GFX8-NEXT: ; %bb.1: 3165; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 3166; GFX8-NEXT: v_mov_b32_e32 v3, s2 3167; GFX8-NEXT: s_mov_b32 m0, -1 3168; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3169; GFX8-NEXT: ds_xor_rtn_b32 v0, v0, v3 3170; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3171; GFX8-NEXT: buffer_wbinvl1_vol 3172; GFX8-NEXT: BB16_2: 3173; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 3174; GFX8-NEXT: v_readfirstlane_b32 s2, v0 3175; GFX8-NEXT: v_mov_b32_e32 v0, v1 3176; GFX8-NEXT: v_xor_b32_e32 v0, s2, v0 3177; GFX8-NEXT: s_mov_b32 s3, 0xf000 3178; GFX8-NEXT: s_mov_b32 s2, -1 3179; GFX8-NEXT: s_nop 0 3180; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3181; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 3182; GFX8-NEXT: s_endpgm 3183; 3184; GFX9-LABEL: xor_i32_varying: 3185; GFX9: ; %bb.0: ; %entry 3186; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3187; GFX9-NEXT: s_mov_b64 s[2:3], exec 3188; GFX9-NEXT: v_mov_b32_e32 v2, v0 3189; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 3190; GFX9-NEXT: v_mov_b32_e32 v1, 0 3191; GFX9-NEXT: s_mov_b64 exec, s[4:5] 3192; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 3193; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 3194; GFX9-NEXT: s_not_b64 exec, exec 3195; GFX9-NEXT: v_mov_b32_e32 v2, 0 3196; GFX9-NEXT: s_not_b64 exec, exec 3197; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 3198; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 3199; GFX9-NEXT: s_nop 1 3200; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 3201; GFX9-NEXT: s_nop 1 3202; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 3203; GFX9-NEXT: s_nop 1 3204; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 3205; GFX9-NEXT: s_nop 1 3206; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 3207; GFX9-NEXT: s_nop 1 
3208; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 3209; GFX9-NEXT: v_readlane_b32 s2, v2, 63 3210; GFX9-NEXT: s_nop 0 3211; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 3212; GFX9-NEXT: s_mov_b64 exec, s[4:5] 3213; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3214; GFX9-NEXT: ; implicit-def: $vgpr0 3215; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 3216; GFX9-NEXT: s_cbranch_execz BB16_2 3217; GFX9-NEXT: ; %bb.1: 3218; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 3219; GFX9-NEXT: v_mov_b32_e32 v3, s2 3220; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3221; GFX9-NEXT: ds_xor_rtn_b32 v0, v0, v3 3222; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3223; GFX9-NEXT: buffer_wbinvl1_vol 3224; GFX9-NEXT: BB16_2: 3225; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 3226; GFX9-NEXT: v_readfirstlane_b32 s2, v0 3227; GFX9-NEXT: v_mov_b32_e32 v0, v1 3228; GFX9-NEXT: v_xor_b32_e32 v0, s2, v0 3229; GFX9-NEXT: s_mov_b32 s3, 0xf000 3230; GFX9-NEXT: s_mov_b32 s2, -1 3231; GFX9-NEXT: s_nop 0 3232; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3233; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 3234; GFX9-NEXT: s_endpgm 3235; 3236; GFX1064-LABEL: xor_i32_varying: 3237; GFX1064: ; %bb.0: ; %entry 3238; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3239; GFX1064-NEXT: s_mov_b64 s[2:3], exec 3240; GFX1064-NEXT: v_mov_b32_e32 v2, v0 3241; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 3242; GFX1064-NEXT: v_mov_b32_e32 v1, 0 3243; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 3244; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 3245; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s3, v0 3246; GFX1064-NEXT: s_not_b64 exec, exec 3247; GFX1064-NEXT: v_mov_b32_e32 v2, 0 3248; GFX1064-NEXT: s_not_b64 exec, exec 3249; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 3250; GFX1064-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 3251; GFX1064-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 3252; GFX1064-NEXT: v_xor_b32_dpp v2, 
v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 3253; GFX1064-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 3254; GFX1064-NEXT: v_mov_b32_e32 v3, v2 3255; GFX1064-NEXT: v_permlanex16_b32 v3, v3, -1, -1 3256; GFX1064-NEXT: v_xor_b32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 3257; GFX1064-NEXT: v_readlane_b32 s2, v2, 31 3258; GFX1064-NEXT: v_mov_b32_e32 v3, s2 3259; GFX1064-NEXT: v_xor_b32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 3260; GFX1064-NEXT: v_readlane_b32 s2, v2, 15 3261; GFX1064-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3262; GFX1064-NEXT: v_readlane_b32 s3, v2, 31 3263; GFX1064-NEXT: v_readlane_b32 s6, v2, 47 3264; GFX1064-NEXT: v_writelane_b32 v1, s2, 16 3265; GFX1064-NEXT: s_mov_b32 s2, -1 3266; GFX1064-NEXT: v_writelane_b32 v1, s3, 32 3267; GFX1064-NEXT: v_readlane_b32 s3, v2, 63 3268; GFX1064-NEXT: v_writelane_b32 v1, s6, 48 3269; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 3270; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3271; GFX1064-NEXT: ; implicit-def: $vgpr0 3272; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 3273; GFX1064-NEXT: s_cbranch_execz BB16_2 3274; GFX1064-NEXT: ; %bb.1: 3275; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 3276; GFX1064-NEXT: v_mov_b32_e32 v7, s3 3277; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3278; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 3279; GFX1064-NEXT: ds_xor_rtn_b32 v0, v0, v7 3280; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3281; GFX1064-NEXT: buffer_gl0_inv 3282; GFX1064-NEXT: buffer_gl1_inv 3283; GFX1064-NEXT: BB16_2: 3284; GFX1064-NEXT: v_nop 3285; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 3286; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 3287; GFX1064-NEXT: v_mov_b32_e32 v0, v1 3288; GFX1064-NEXT: v_xor_b32_e32 v0, s3, v0 3289; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 3290; GFX1064-NEXT: s_nop 1 3291; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3292; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 3293; 
GFX1064-NEXT: s_endpgm 3294; 3295; GFX1032-LABEL: xor_i32_varying: 3296; GFX1032: ; %bb.0: ; %entry 3297; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3298; GFX1032-NEXT: s_mov_b32 s2, exec_lo 3299; GFX1032-NEXT: ; implicit-def: $vcc_hi 3300; GFX1032-NEXT: v_mov_b32_e32 v2, v0 3301; GFX1032-NEXT: s_or_saveexec_b32 s3, -1 3302; GFX1032-NEXT: v_mov_b32_e32 v1, 0 3303; GFX1032-NEXT: s_mov_b32 exec_lo, s3 3304; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 3305; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 3306; GFX1032-NEXT: v_mov_b32_e32 v2, 0 3307; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 3308; GFX1032-NEXT: s_or_saveexec_b32 s4, -1 3309; GFX1032-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 3310; GFX1032-NEXT: s_mov_b32 s2, -1 3311; GFX1032-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 3312; GFX1032-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 3313; GFX1032-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 3314; GFX1032-NEXT: v_mov_b32_e32 v3, v2 3315; GFX1032-NEXT: v_permlanex16_b32 v3, v3, -1, -1 3316; GFX1032-NEXT: v_xor_b32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 3317; GFX1032-NEXT: v_readlane_b32 s3, v2, 31 3318; GFX1032-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3319; GFX1032-NEXT: v_readlane_b32 s5, v2, 15 3320; GFX1032-NEXT: v_writelane_b32 v1, s5, 16 3321; GFX1032-NEXT: s_mov_b32 exec_lo, s4 3322; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 3323; GFX1032-NEXT: ; implicit-def: $vgpr0 3324; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo 3325; GFX1032-NEXT: s_cbranch_execz BB16_2 3326; GFX1032-NEXT: ; %bb.1: 3327; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 3328; GFX1032-NEXT: v_mov_b32_e32 v7, s3 3329; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3330; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 3331; GFX1032-NEXT: ds_xor_rtn_b32 v0, v0, v7 3332; GFX1032-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) 3333; GFX1032-NEXT: buffer_gl0_inv 3334; GFX1032-NEXT: buffer_gl1_inv 3335; GFX1032-NEXT: BB16_2: 3336; GFX1032-NEXT: v_nop 3337; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 3338; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 3339; GFX1032-NEXT: v_mov_b32_e32 v0, v1 3340; GFX1032-NEXT: v_xor_b32_e32 v0, s3, v0 3341; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 3342; GFX1032-NEXT: s_nop 1 3343; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3344; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 3345; GFX1032-NEXT: s_endpgm 3346entry: 3347 %lane = call i32 @llvm.amdgcn.workitem.id.x() 3348 %old = atomicrmw xor i32 addrspace(3)* @local_var32, i32 %lane acq_rel 3349 store i32 %old, i32 addrspace(1)* %out 3350 ret void 3351} 3352 3353define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) { 3354; 3355; 3356; GFX7LESS-LABEL: max_i32_varying: 3357; GFX7LESS: ; %bb.0: ; %entry 3358; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 3359; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 3360; GFX7LESS-NEXT: s_mov_b32 m0, -1 3361; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3362; GFX7LESS-NEXT: ds_max_rtn_i32 v0, v1, v0 3363; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3364; GFX7LESS-NEXT: buffer_wbinvl1 3365; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 3366; GFX7LESS-NEXT: s_mov_b32 s2, -1 3367; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 3368; GFX7LESS-NEXT: s_endpgm 3369; 3370; GFX8-LABEL: max_i32_varying: 3371; GFX8: ; %bb.0: ; %entry 3372; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3373; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 3374; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 3375; GFX8-NEXT: v_mov_b32_e32 v2, v0 3376; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 3377; GFX8-NEXT: v_bfrev_b32_e32 v1, 1 3378; GFX8-NEXT: s_mov_b64 exec, s[2:3] 3379; GFX8-NEXT: s_not_b64 exec, exec 3380; GFX8-NEXT: v_mov_b32_e32 v2, v1 3381; GFX8-NEXT: s_not_b64 exec, exec 3382; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 3383; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 
row_shr:1 row_mask:0xf bank_mask:0xf 3384; GFX8-NEXT: s_nop 1 3385; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 3386; GFX8-NEXT: s_nop 1 3387; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 3388; GFX8-NEXT: s_nop 1 3389; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 3390; GFX8-NEXT: s_nop 1 3391; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 3392; GFX8-NEXT: s_nop 1 3393; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 3394; GFX8-NEXT: v_readlane_b32 s2, v2, 63 3395; GFX8-NEXT: s_nop 0 3396; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 3397; GFX8-NEXT: s_mov_b64 exec, s[4:5] 3398; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 3399; GFX8-NEXT: ; implicit-def: $vgpr0 3400; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 3401; GFX8-NEXT: s_cbranch_execz BB17_2 3402; GFX8-NEXT: ; %bb.1: 3403; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 3404; GFX8-NEXT: v_mov_b32_e32 v3, s2 3405; GFX8-NEXT: s_mov_b32 m0, -1 3406; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3407; GFX8-NEXT: ds_max_rtn_i32 v0, v0, v3 3408; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3409; GFX8-NEXT: buffer_wbinvl1_vol 3410; GFX8-NEXT: BB17_2: 3411; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 3412; GFX8-NEXT: v_readfirstlane_b32 s2, v0 3413; GFX8-NEXT: v_mov_b32_e32 v0, v1 3414; GFX8-NEXT: v_max_i32_e32 v0, s2, v0 3415; GFX8-NEXT: s_mov_b32 s3, 0xf000 3416; GFX8-NEXT: s_mov_b32 s2, -1 3417; GFX8-NEXT: s_nop 0 3418; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3419; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 3420; GFX8-NEXT: s_endpgm 3421; 3422; GFX9-LABEL: max_i32_varying: 3423; GFX9: ; %bb.0: ; %entry 3424; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3425; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 3426; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 3427; GFX9-NEXT: v_mov_b32_e32 v2, v0 3428; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 3429; GFX9-NEXT: 
v_bfrev_b32_e32 v1, 1 3430; GFX9-NEXT: s_mov_b64 exec, s[2:3] 3431; GFX9-NEXT: s_not_b64 exec, exec 3432; GFX9-NEXT: v_mov_b32_e32 v2, v1 3433; GFX9-NEXT: s_not_b64 exec, exec 3434; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 3435; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3436; GFX9-NEXT: s_nop 1 3437; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 3438; GFX9-NEXT: s_nop 1 3439; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 3440; GFX9-NEXT: s_nop 1 3441; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 3442; GFX9-NEXT: s_nop 1 3443; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 3444; GFX9-NEXT: s_nop 1 3445; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 3446; GFX9-NEXT: v_readlane_b32 s2, v2, 63 3447; GFX9-NEXT: s_nop 0 3448; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 3449; GFX9-NEXT: s_mov_b64 exec, s[4:5] 3450; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 3451; GFX9-NEXT: ; implicit-def: $vgpr0 3452; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 3453; GFX9-NEXT: s_cbranch_execz BB17_2 3454; GFX9-NEXT: ; %bb.1: 3455; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 3456; GFX9-NEXT: v_mov_b32_e32 v3, s2 3457; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3458; GFX9-NEXT: ds_max_rtn_i32 v0, v0, v3 3459; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3460; GFX9-NEXT: buffer_wbinvl1_vol 3461; GFX9-NEXT: BB17_2: 3462; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 3463; GFX9-NEXT: v_readfirstlane_b32 s2, v0 3464; GFX9-NEXT: v_mov_b32_e32 v0, v1 3465; GFX9-NEXT: v_max_i32_e32 v0, s2, v0 3466; GFX9-NEXT: s_mov_b32 s3, 0xf000 3467; GFX9-NEXT: s_mov_b32 s2, -1 3468; GFX9-NEXT: s_nop 0 3469; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3470; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 3471; GFX9-NEXT: s_endpgm 3472; 3473; GFX1064-LABEL: max_i32_varying: 3474; GFX1064: ; %bb.0: ; %entry 3475; GFX1064-NEXT: s_load_dwordx2 
s[0:1], s[0:1], 0x24 3476; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v4, exec_lo, 0 3477; GFX1064-NEXT: v_mov_b32_e32 v2, v0 3478; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v4, exec_hi, v4 3479; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 3480; GFX1064-NEXT: v_bfrev_b32_e32 v1, 1 3481; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 3482; GFX1064-NEXT: s_not_b64 exec, exec 3483; GFX1064-NEXT: v_mov_b32_e32 v2, v1 3484; GFX1064-NEXT: s_not_b64 exec, exec 3485; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 3486; GFX1064-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3487; GFX1064-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 3488; GFX1064-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 3489; GFX1064-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 3490; GFX1064-NEXT: v_mov_b32_e32 v3, v2 3491; GFX1064-NEXT: v_permlanex16_b32 v3, v3, -1, -1 3492; GFX1064-NEXT: v_max_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 3493; GFX1064-NEXT: v_readlane_b32 s2, v2, 31 3494; GFX1064-NEXT: v_mov_b32_e32 v3, s2 3495; GFX1064-NEXT: v_max_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 3496; GFX1064-NEXT: v_readlane_b32 s2, v2, 15 3497; GFX1064-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3498; GFX1064-NEXT: v_readlane_b32 s3, v2, 31 3499; GFX1064-NEXT: v_readlane_b32 s6, v2, 47 3500; GFX1064-NEXT: v_writelane_b32 v1, s2, 16 3501; GFX1064-NEXT: s_mov_b32 s2, -1 3502; GFX1064-NEXT: v_writelane_b32 v1, s3, 32 3503; GFX1064-NEXT: v_readlane_b32 s3, v2, 63 3504; GFX1064-NEXT: v_writelane_b32 v1, s6, 48 3505; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 3506; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 3507; GFX1064-NEXT: ; implicit-def: $vgpr0 3508; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 3509; GFX1064-NEXT: s_cbranch_execz BB17_2 3510; GFX1064-NEXT: ; %bb.1: 3511; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 3512; GFX1064-NEXT: v_mov_b32_e32 v7, s3 3513; GFX1064-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) 3514; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 3515; GFX1064-NEXT: ds_max_rtn_i32 v0, v0, v7 3516; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3517; GFX1064-NEXT: buffer_gl0_inv 3518; GFX1064-NEXT: buffer_gl1_inv 3519; GFX1064-NEXT: BB17_2: 3520; GFX1064-NEXT: v_nop 3521; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 3522; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 3523; GFX1064-NEXT: v_mov_b32_e32 v0, v1 3524; GFX1064-NEXT: v_max_i32_e32 v0, s3, v0 3525; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 3526; GFX1064-NEXT: s_nop 1 3527; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3528; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 3529; GFX1064-NEXT: s_endpgm 3530; 3531; GFX1032-LABEL: max_i32_varying: 3532; GFX1032: ; %bb.0: ; %entry 3533; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3534; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v4, exec_lo, 0 3535; GFX1032-NEXT: ; implicit-def: $vcc_hi 3536; GFX1032-NEXT: v_mov_b32_e32 v2, v0 3537; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3538; GFX1032-NEXT: v_bfrev_b32_e32 v1, 1 3539; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3540; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 3541; GFX1032-NEXT: v_mov_b32_e32 v2, v1 3542; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 3543; GFX1032-NEXT: s_or_saveexec_b32 s4, -1 3544; GFX1032-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3545; GFX1032-NEXT: s_mov_b32 s2, -1 3546; GFX1032-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 3547; GFX1032-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 3548; GFX1032-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 3549; GFX1032-NEXT: v_mov_b32_e32 v3, v2 3550; GFX1032-NEXT: v_permlanex16_b32 v3, v3, -1, -1 3551; GFX1032-NEXT: v_max_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 3552; GFX1032-NEXT: v_readlane_b32 s3, v2, 31 3553; GFX1032-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3554; GFX1032-NEXT: v_readlane_b32 s5, v2, 15 3555; 
GFX1032-NEXT: v_writelane_b32 v1, s5, 16 3556; GFX1032-NEXT: s_mov_b32 exec_lo, s4 3557; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 3558; GFX1032-NEXT: ; implicit-def: $vgpr0 3559; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo 3560; GFX1032-NEXT: s_cbranch_execz BB17_2 3561; GFX1032-NEXT: ; %bb.1: 3562; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 3563; GFX1032-NEXT: v_mov_b32_e32 v7, s3 3564; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3565; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 3566; GFX1032-NEXT: ds_max_rtn_i32 v0, v0, v7 3567; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3568; GFX1032-NEXT: buffer_gl0_inv 3569; GFX1032-NEXT: buffer_gl1_inv 3570; GFX1032-NEXT: BB17_2: 3571; GFX1032-NEXT: v_nop 3572; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 3573; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 3574; GFX1032-NEXT: v_mov_b32_e32 v0, v1 3575; GFX1032-NEXT: v_max_i32_e32 v0, s3, v0 3576; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 3577; GFX1032-NEXT: s_nop 1 3578; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3579; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 3580; GFX1032-NEXT: s_endpgm 3581entry: 3582 %lane = call i32 @llvm.amdgcn.workitem.id.x() 3583 %old = atomicrmw max i32 addrspace(3)* @local_var32, i32 %lane acq_rel 3584 store i32 %old, i32 addrspace(1)* %out 3585 ret void 3586} 3587 3588define amdgpu_kernel void @max_i64_constant(i64 addrspace(1)* %out) { 3589; 3590; 3591; GFX7LESS-LABEL: max_i64_constant: 3592; GFX7LESS: ; %bb.0: ; %entry 3593; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 3594; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 3595; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 3596; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3597; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 3598; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc 3599; GFX7LESS-NEXT: s_cbranch_execz BB18_2 3600; GFX7LESS-NEXT: ; %bb.1: 3601; GFX7LESS-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 3602; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5 3603; GFX7LESS-NEXT: 
v_mov_b32_e32 v1, 0 3604; GFX7LESS-NEXT: s_mov_b32 m0, -1 3605; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3606; GFX7LESS-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] 3607; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3608; GFX7LESS-NEXT: buffer_wbinvl1 3609; GFX7LESS-NEXT: BB18_2: 3610; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] 3611; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 3612; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 3613; GFX7LESS-NEXT: v_bfrev_b32_e32 v1, 1 3614; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 3615; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 3616; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc 3617; GFX7LESS-NEXT: v_mov_b32_e32 v2, s5 3618; GFX7LESS-NEXT: v_mov_b32_e32 v3, s4 3619; GFX7LESS-NEXT: v_cmp_gt_i64_e32 vcc, s[4:5], v[0:1] 3620; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 3621; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 3622; GFX7LESS-NEXT: s_mov_b32 s2, -1 3623; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3624; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 3625; GFX7LESS-NEXT: s_endpgm 3626; 3627; GFX8-LABEL: max_i64_constant: 3628; GFX8: ; %bb.0: ; %entry 3629; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3630; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3631; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 3632; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3633; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 3634; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 3635; GFX8-NEXT: s_cbranch_execz BB18_2 3636; GFX8-NEXT: ; %bb.1: 3637; GFX8-NEXT: v_mov_b32_e32 v0, 5 3638; GFX8-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 3639; GFX8-NEXT: v_mov_b32_e32 v1, 0 3640; GFX8-NEXT: s_mov_b32 m0, -1 3641; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3642; GFX8-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] 3643; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3644; GFX8-NEXT: buffer_wbinvl1_vol 3645; GFX8-NEXT: BB18_2: 3646; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 3647; GFX8-NEXT: v_readfirstlane_b32 s2, v0 3648; GFX8-NEXT: v_bfrev_b32_e32 v0, 1 3649; GFX8-NEXT: 
v_readfirstlane_b32 s3, v1 3650; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc 3651; GFX8-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 3652; GFX8-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1] 3653; GFX8-NEXT: v_mov_b32_e32 v2, s3 3654; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 3655; GFX8-NEXT: v_mov_b32_e32 v2, s2 3656; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 3657; GFX8-NEXT: s_mov_b32 s3, 0xf000 3658; GFX8-NEXT: s_mov_b32 s2, -1 3659; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3660; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 3661; GFX8-NEXT: s_endpgm 3662; 3663; GFX9-LABEL: max_i64_constant: 3664; GFX9: ; %bb.0: ; %entry 3665; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3666; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3667; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 3668; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3669; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 3670; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 3671; GFX9-NEXT: s_cbranch_execz BB18_2 3672; GFX9-NEXT: ; %bb.1: 3673; GFX9-NEXT: v_mov_b32_e32 v0, 5 3674; GFX9-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 3675; GFX9-NEXT: v_mov_b32_e32 v1, 0 3676; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3677; GFX9-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] 3678; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3679; GFX9-NEXT: buffer_wbinvl1_vol 3680; GFX9-NEXT: BB18_2: 3681; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 3682; GFX9-NEXT: v_readfirstlane_b32 s2, v0 3683; GFX9-NEXT: v_bfrev_b32_e32 v0, 1 3684; GFX9-NEXT: v_readfirstlane_b32 s3, v1 3685; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc 3686; GFX9-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 3687; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1] 3688; GFX9-NEXT: v_mov_b32_e32 v2, s3 3689; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 3690; GFX9-NEXT: v_mov_b32_e32 v2, s2 3691; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 3692; GFX9-NEXT: s_mov_b32 s3, 0xf000 3693; GFX9-NEXT: s_mov_b32 s2, -1 3694; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3695; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 
3696; GFX9-NEXT: s_endpgm 3697; 3698; GFX1064-LABEL: max_i64_constant: 3699; GFX1064: ; %bb.0: ; %entry 3700; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3701; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 3702; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0 3703; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3704; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 3705; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc 3706; GFX1064-NEXT: s_cbranch_execz BB18_2 3707; GFX1064-NEXT: ; %bb.1: 3708; GFX1064-NEXT: v_mov_b32_e32 v0, 5 3709; GFX1064-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 3710; GFX1064-NEXT: v_mov_b32_e32 v1, 0 3711; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3712; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 3713; GFX1064-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] 3714; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3715; GFX1064-NEXT: buffer_gl0_inv 3716; GFX1064-NEXT: buffer_gl1_inv 3717; GFX1064-NEXT: BB18_2: 3718; GFX1064-NEXT: v_nop 3719; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] 3720; GFX1064-NEXT: v_readfirstlane_b32 s4, v0 3721; GFX1064-NEXT: v_readfirstlane_b32 s5, v1 3722; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc 3723; GFX1064-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 3724; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 3725; GFX1064-NEXT: s_mov_b32 s2, -1 3726; GFX1064-NEXT: v_cmp_gt_i64_e32 vcc, s[4:5], v[0:1] 3727; GFX1064-NEXT: v_cndmask_b32_e64 v1, v1, s5, vcc 3728; GFX1064-NEXT: v_cndmask_b32_e64 v0, v0, s4, vcc 3729; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3730; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 3731; GFX1064-NEXT: s_endpgm 3732; 3733; GFX1032-LABEL: max_i64_constant: 3734; GFX1032: ; %bb.0: ; %entry 3735; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3736; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 3737; GFX1032-NEXT: ; implicit-def: $vcc_hi 3738; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 3739; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 3740; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo 3741; 
GFX1032-NEXT: s_cbranch_execz BB18_2 3742; GFX1032-NEXT: ; %bb.1: 3743; GFX1032-NEXT: v_mov_b32_e32 v0, 5 3744; GFX1032-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 3745; GFX1032-NEXT: v_mov_b32_e32 v1, 0 3746; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3747; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 3748; GFX1032-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] 3749; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3750; GFX1032-NEXT: buffer_gl0_inv 3751; GFX1032-NEXT: buffer_gl1_inv 3752; GFX1032-NEXT: BB18_2: 3753; GFX1032-NEXT: v_nop 3754; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 3755; GFX1032-NEXT: v_readfirstlane_b32 s4, v0 3756; GFX1032-NEXT: v_readfirstlane_b32 s5, v1 3757; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc_lo 3758; GFX1032-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc_lo 3759; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 3760; GFX1032-NEXT: s_mov_b32 s2, -1 3761; GFX1032-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[4:5], v[0:1] 3762; GFX1032-NEXT: v_cndmask_b32_e64 v1, v1, s5, vcc_lo 3763; GFX1032-NEXT: v_cndmask_b32_e64 v0, v0, s4, vcc_lo 3764; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3765; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 3766; GFX1032-NEXT: s_endpgm 3767entry: 3768 %old = atomicrmw max i64 addrspace(3)* @local_var64, i64 5 acq_rel 3769 store i64 %old, i64 addrspace(1)* %out 3770 ret void 3771} 3772 3773define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) { 3774; 3775; 3776; GFX7LESS-LABEL: min_i32_varying: 3777; GFX7LESS: ; %bb.0: ; %entry 3778; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 3779; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 3780; GFX7LESS-NEXT: s_mov_b32 m0, -1 3781; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3782; GFX7LESS-NEXT: ds_min_rtn_i32 v0, v1, v0 3783; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3784; GFX7LESS-NEXT: buffer_wbinvl1 3785; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 3786; GFX7LESS-NEXT: s_mov_b32 s2, -1 3787; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 3788; GFX7LESS-NEXT: s_endpgm 
3789; 3790; GFX8-LABEL: min_i32_varying: 3791; GFX8: ; %bb.0: ; %entry 3792; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3793; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 3794; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 3795; GFX8-NEXT: v_mov_b32_e32 v2, v0 3796; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 3797; GFX8-NEXT: v_bfrev_b32_e32 v1, -2 3798; GFX8-NEXT: s_mov_b64 exec, s[2:3] 3799; GFX8-NEXT: s_not_b64 exec, exec 3800; GFX8-NEXT: v_mov_b32_e32 v2, v1 3801; GFX8-NEXT: s_not_b64 exec, exec 3802; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 3803; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3804; GFX8-NEXT: s_nop 1 3805; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 3806; GFX8-NEXT: s_nop 1 3807; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 3808; GFX8-NEXT: s_nop 1 3809; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 3810; GFX8-NEXT: s_nop 1 3811; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 3812; GFX8-NEXT: s_nop 1 3813; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 3814; GFX8-NEXT: v_readlane_b32 s2, v2, 63 3815; GFX8-NEXT: s_nop 0 3816; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 3817; GFX8-NEXT: s_mov_b64 exec, s[4:5] 3818; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 3819; GFX8-NEXT: ; implicit-def: $vgpr0 3820; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 3821; GFX8-NEXT: s_cbranch_execz BB19_2 3822; GFX8-NEXT: ; %bb.1: 3823; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 3824; GFX8-NEXT: v_mov_b32_e32 v3, s2 3825; GFX8-NEXT: s_mov_b32 m0, -1 3826; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3827; GFX8-NEXT: ds_min_rtn_i32 v0, v0, v3 3828; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3829; GFX8-NEXT: buffer_wbinvl1_vol 3830; GFX8-NEXT: BB19_2: 3831; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 3832; GFX8-NEXT: v_readfirstlane_b32 s2, v0 3833; GFX8-NEXT: v_mov_b32_e32 v0, v1 3834; 
GFX8-NEXT: v_min_i32_e32 v0, s2, v0 3835; GFX8-NEXT: s_mov_b32 s3, 0xf000 3836; GFX8-NEXT: s_mov_b32 s2, -1 3837; GFX8-NEXT: s_nop 0 3838; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3839; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 3840; GFX8-NEXT: s_endpgm 3841; 3842; GFX9-LABEL: min_i32_varying: 3843; GFX9: ; %bb.0: ; %entry 3844; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3845; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 3846; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 3847; GFX9-NEXT: v_mov_b32_e32 v2, v0 3848; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 3849; GFX9-NEXT: v_bfrev_b32_e32 v1, -2 3850; GFX9-NEXT: s_mov_b64 exec, s[2:3] 3851; GFX9-NEXT: s_not_b64 exec, exec 3852; GFX9-NEXT: v_mov_b32_e32 v2, v1 3853; GFX9-NEXT: s_not_b64 exec, exec 3854; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 3855; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3856; GFX9-NEXT: s_nop 1 3857; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 3858; GFX9-NEXT: s_nop 1 3859; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 3860; GFX9-NEXT: s_nop 1 3861; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 3862; GFX9-NEXT: s_nop 1 3863; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 3864; GFX9-NEXT: s_nop 1 3865; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 3866; GFX9-NEXT: v_readlane_b32 s2, v2, 63 3867; GFX9-NEXT: s_nop 0 3868; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 3869; GFX9-NEXT: s_mov_b64 exec, s[4:5] 3870; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 3871; GFX9-NEXT: ; implicit-def: $vgpr0 3872; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 3873; GFX9-NEXT: s_cbranch_execz BB19_2 3874; GFX9-NEXT: ; %bb.1: 3875; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 3876; GFX9-NEXT: v_mov_b32_e32 v3, s2 3877; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3878; GFX9-NEXT: ds_min_rtn_i32 v0, v0, v3 3879; GFX9-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) 3880; GFX9-NEXT: buffer_wbinvl1_vol 3881; GFX9-NEXT: BB19_2: 3882; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 3883; GFX9-NEXT: v_readfirstlane_b32 s2, v0 3884; GFX9-NEXT: v_mov_b32_e32 v0, v1 3885; GFX9-NEXT: v_min_i32_e32 v0, s2, v0 3886; GFX9-NEXT: s_mov_b32 s3, 0xf000 3887; GFX9-NEXT: s_mov_b32 s2, -1 3888; GFX9-NEXT: s_nop 0 3889; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3890; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 3891; GFX9-NEXT: s_endpgm 3892; 3893; GFX1064-LABEL: min_i32_varying: 3894; GFX1064: ; %bb.0: ; %entry 3895; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3896; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v4, exec_lo, 0 3897; GFX1064-NEXT: v_mov_b32_e32 v2, v0 3898; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v4, exec_hi, v4 3899; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 3900; GFX1064-NEXT: v_bfrev_b32_e32 v1, -2 3901; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 3902; GFX1064-NEXT: s_not_b64 exec, exec 3903; GFX1064-NEXT: v_mov_b32_e32 v2, v1 3904; GFX1064-NEXT: s_not_b64 exec, exec 3905; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 3906; GFX1064-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3907; GFX1064-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 3908; GFX1064-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 3909; GFX1064-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 3910; GFX1064-NEXT: v_mov_b32_e32 v3, v2 3911; GFX1064-NEXT: v_permlanex16_b32 v3, v3, -1, -1 3912; GFX1064-NEXT: v_min_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 3913; GFX1064-NEXT: v_readlane_b32 s2, v2, 31 3914; GFX1064-NEXT: v_mov_b32_e32 v3, s2 3915; GFX1064-NEXT: v_min_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 3916; GFX1064-NEXT: v_readlane_b32 s2, v2, 15 3917; GFX1064-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3918; GFX1064-NEXT: v_readlane_b32 s3, v2, 31 3919; GFX1064-NEXT: v_readlane_b32 s6, v2, 47 3920; 
GFX1064-NEXT: v_writelane_b32 v1, s2, 16 3921; GFX1064-NEXT: s_mov_b32 s2, -1 3922; GFX1064-NEXT: v_writelane_b32 v1, s3, 32 3923; GFX1064-NEXT: v_readlane_b32 s3, v2, 63 3924; GFX1064-NEXT: v_writelane_b32 v1, s6, 48 3925; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 3926; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 3927; GFX1064-NEXT: ; implicit-def: $vgpr0 3928; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 3929; GFX1064-NEXT: s_cbranch_execz BB19_2 3930; GFX1064-NEXT: ; %bb.1: 3931; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 3932; GFX1064-NEXT: v_mov_b32_e32 v7, s3 3933; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3934; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 3935; GFX1064-NEXT: ds_min_rtn_i32 v0, v0, v7 3936; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3937; GFX1064-NEXT: buffer_gl0_inv 3938; GFX1064-NEXT: buffer_gl1_inv 3939; GFX1064-NEXT: BB19_2: 3940; GFX1064-NEXT: v_nop 3941; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 3942; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 3943; GFX1064-NEXT: v_mov_b32_e32 v0, v1 3944; GFX1064-NEXT: v_min_i32_e32 v0, s3, v0 3945; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 3946; GFX1064-NEXT: s_nop 1 3947; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3948; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 3949; GFX1064-NEXT: s_endpgm 3950; 3951; GFX1032-LABEL: min_i32_varying: 3952; GFX1032: ; %bb.0: ; %entry 3953; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3954; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v4, exec_lo, 0 3955; GFX1032-NEXT: ; implicit-def: $vcc_hi 3956; GFX1032-NEXT: v_mov_b32_e32 v2, v0 3957; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3958; GFX1032-NEXT: v_bfrev_b32_e32 v1, -2 3959; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3960; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 3961; GFX1032-NEXT: v_mov_b32_e32 v2, v1 3962; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 3963; GFX1032-NEXT: s_or_saveexec_b32 s4, -1 3964; GFX1032-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3965; GFX1032-NEXT: s_mov_b32 s2, -1 3966; GFX1032-NEXT: 
v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 3967; GFX1032-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 3968; GFX1032-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 3969; GFX1032-NEXT: v_mov_b32_e32 v3, v2 3970; GFX1032-NEXT: v_permlanex16_b32 v3, v3, -1, -1 3971; GFX1032-NEXT: v_min_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 3972; GFX1032-NEXT: v_readlane_b32 s3, v2, 31 3973; GFX1032-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3974; GFX1032-NEXT: v_readlane_b32 s5, v2, 15 3975; GFX1032-NEXT: v_writelane_b32 v1, s5, 16 3976; GFX1032-NEXT: s_mov_b32 exec_lo, s4 3977; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 3978; GFX1032-NEXT: ; implicit-def: $vgpr0 3979; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo 3980; GFX1032-NEXT: s_cbranch_execz BB19_2 3981; GFX1032-NEXT: ; %bb.1: 3982; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 3983; GFX1032-NEXT: v_mov_b32_e32 v7, s3 3984; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3985; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 3986; GFX1032-NEXT: ds_min_rtn_i32 v0, v0, v7 3987; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3988; GFX1032-NEXT: buffer_gl0_inv 3989; GFX1032-NEXT: buffer_gl1_inv 3990; GFX1032-NEXT: BB19_2: 3991; GFX1032-NEXT: v_nop 3992; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 3993; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 3994; GFX1032-NEXT: v_mov_b32_e32 v0, v1 3995; GFX1032-NEXT: v_min_i32_e32 v0, s3, v0 3996; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 3997; GFX1032-NEXT: s_nop 1 3998; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3999; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 4000; GFX1032-NEXT: s_endpgm 4001entry: 4002 %lane = call i32 @llvm.amdgcn.workitem.id.x() 4003 %old = atomicrmw min i32 addrspace(3)* @local_var32, i32 %lane acq_rel 4004 store i32 %old, i32 addrspace(1)* %out 4005 ret void 4006} 4007 4008define amdgpu_kernel void @min_i64_constant(i64 addrspace(1)* %out) { 4009; 4010; 4011; 
GFX7LESS-LABEL: min_i64_constant: 4012; GFX7LESS: ; %bb.0: ; %entry 4013; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 4014; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 4015; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 4016; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4017; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 4018; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc 4019; GFX7LESS-NEXT: s_cbranch_execz BB20_2 4020; GFX7LESS-NEXT: ; %bb.1: 4021; GFX7LESS-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 4022; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5 4023; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 4024; GFX7LESS-NEXT: s_mov_b32 m0, -1 4025; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4026; GFX7LESS-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] 4027; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4028; GFX7LESS-NEXT: buffer_wbinvl1 4029; GFX7LESS-NEXT: BB20_2: 4030; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] 4031; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 4032; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 4033; GFX7LESS-NEXT: v_bfrev_b32_e32 v1, -2 4034; GFX7LESS-NEXT: s_mov_b32 s2, -1 4035; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 4036; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc 4037; GFX7LESS-NEXT: v_mov_b32_e32 v2, s5 4038; GFX7LESS-NEXT: v_mov_b32_e32 v3, s4 4039; GFX7LESS-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] 4040; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 4041; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 4042; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 4043; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 4044; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4045; GFX7LESS-NEXT: s_endpgm 4046; 4047; GFX8-LABEL: min_i64_constant: 4048; GFX8: ; %bb.0: ; %entry 4049; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4050; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4051; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 4052; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4053; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 4054; GFX8-NEXT: s_and_saveexec_b64 s[2:3], 
vcc 4055; GFX8-NEXT: s_cbranch_execz BB20_2 4056; GFX8-NEXT: ; %bb.1: 4057; GFX8-NEXT: v_mov_b32_e32 v0, 5 4058; GFX8-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 4059; GFX8-NEXT: v_mov_b32_e32 v1, 0 4060; GFX8-NEXT: s_mov_b32 m0, -1 4061; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4062; GFX8-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] 4063; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4064; GFX8-NEXT: buffer_wbinvl1_vol 4065; GFX8-NEXT: BB20_2: 4066; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 4067; GFX8-NEXT: v_readfirstlane_b32 s4, v0 4068; GFX8-NEXT: v_bfrev_b32_e32 v0, -2 4069; GFX8-NEXT: v_readfirstlane_b32 s5, v1 4070; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc 4071; GFX8-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 4072; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] 4073; GFX8-NEXT: v_mov_b32_e32 v2, s5 4074; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 4075; GFX8-NEXT: v_mov_b32_e32 v2, s4 4076; GFX8-NEXT: s_mov_b32 s2, -1 4077; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 4078; GFX8-NEXT: s_mov_b32 s3, 0xf000 4079; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4080; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4081; GFX8-NEXT: s_endpgm 4082; 4083; GFX9-LABEL: min_i64_constant: 4084; GFX9: ; %bb.0: ; %entry 4085; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4086; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4087; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 4088; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4089; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 4090; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 4091; GFX9-NEXT: s_cbranch_execz BB20_2 4092; GFX9-NEXT: ; %bb.1: 4093; GFX9-NEXT: v_mov_b32_e32 v0, 5 4094; GFX9-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 4095; GFX9-NEXT: v_mov_b32_e32 v1, 0 4096; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4097; GFX9-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] 4098; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4099; GFX9-NEXT: buffer_wbinvl1_vol 4100; GFX9-NEXT: BB20_2: 4101; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 4102; GFX9-NEXT: v_readfirstlane_b32 s4, v0 
4103; GFX9-NEXT: v_bfrev_b32_e32 v0, -2 4104; GFX9-NEXT: v_readfirstlane_b32 s5, v1 4105; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc 4106; GFX9-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 4107; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] 4108; GFX9-NEXT: v_mov_b32_e32 v2, s5 4109; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 4110; GFX9-NEXT: v_mov_b32_e32 v2, s4 4111; GFX9-NEXT: s_mov_b32 s2, -1 4112; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 4113; GFX9-NEXT: s_mov_b32 s3, 0xf000 4114; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4115; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4116; GFX9-NEXT: s_endpgm 4117; 4118; GFX1064-LABEL: min_i64_constant: 4119; GFX1064: ; %bb.0: ; %entry 4120; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4121; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 4122; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0 4123; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4124; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 4125; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc 4126; GFX1064-NEXT: s_cbranch_execz BB20_2 4127; GFX1064-NEXT: ; %bb.1: 4128; GFX1064-NEXT: v_mov_b32_e32 v0, 5 4129; GFX1064-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 4130; GFX1064-NEXT: v_mov_b32_e32 v1, 0 4131; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4132; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 4133; GFX1064-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] 4134; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4135; GFX1064-NEXT: buffer_gl0_inv 4136; GFX1064-NEXT: buffer_gl1_inv 4137; GFX1064-NEXT: BB20_2: 4138; GFX1064-NEXT: v_nop 4139; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] 4140; GFX1064-NEXT: v_readfirstlane_b32 s4, v0 4141; GFX1064-NEXT: v_readfirstlane_b32 s5, v1 4142; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc 4143; GFX1064-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 4144; GFX1064-NEXT: s_mov_b32 s2, -1 4145; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 4146; GFX1064-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] 4147; GFX1064-NEXT: v_cndmask_b32_e64 v1, v1, s5, vcc 
4148; GFX1064-NEXT: v_cndmask_b32_e64 v0, v0, s4, vcc 4149; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 4150; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4151; GFX1064-NEXT: s_endpgm 4152; 4153; GFX1032-LABEL: min_i64_constant: 4154; GFX1032: ; %bb.0: ; %entry 4155; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4156; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 4157; GFX1032-NEXT: ; implicit-def: $vcc_hi 4158; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 4159; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 4160; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo 4161; GFX1032-NEXT: s_cbranch_execz BB20_2 4162; GFX1032-NEXT: ; %bb.1: 4163; GFX1032-NEXT: v_mov_b32_e32 v0, 5 4164; GFX1032-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 4165; GFX1032-NEXT: v_mov_b32_e32 v1, 0 4166; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4167; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 4168; GFX1032-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] 4169; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4170; GFX1032-NEXT: buffer_gl0_inv 4171; GFX1032-NEXT: buffer_gl1_inv 4172; GFX1032-NEXT: BB20_2: 4173; GFX1032-NEXT: v_nop 4174; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 4175; GFX1032-NEXT: v_readfirstlane_b32 s4, v0 4176; GFX1032-NEXT: v_readfirstlane_b32 s5, v1 4177; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc_lo 4178; GFX1032-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc_lo 4179; GFX1032-NEXT: s_mov_b32 s2, -1 4180; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 4181; GFX1032-NEXT: v_cmp_lt_i64_e32 vcc_lo, s[4:5], v[0:1] 4182; GFX1032-NEXT: v_cndmask_b32_e64 v1, v1, s5, vcc_lo 4183; GFX1032-NEXT: v_cndmask_b32_e64 v0, v0, s4, vcc_lo 4184; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 4185; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4186; GFX1032-NEXT: s_endpgm 4187entry: 4188 %old = atomicrmw min i64 addrspace(3)* @local_var64, i64 5 acq_rel 4189 store i64 %old, i64 addrspace(1)* %out 4190 ret void 4191} 4192 4193define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) { 4194; 
4195; 4196; GFX7LESS-LABEL: umax_i32_varying: 4197; GFX7LESS: ; %bb.0: ; %entry 4198; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 4199; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 4200; GFX7LESS-NEXT: s_mov_b32 m0, -1 4201; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4202; GFX7LESS-NEXT: ds_max_rtn_u32 v0, v1, v0 4203; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4204; GFX7LESS-NEXT: buffer_wbinvl1 4205; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 4206; GFX7LESS-NEXT: s_mov_b32 s2, -1 4207; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 4208; GFX7LESS-NEXT: s_endpgm 4209; 4210; GFX8-LABEL: umax_i32_varying: 4211; GFX8: ; %bb.0: ; %entry 4212; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4213; GFX8-NEXT: s_mov_b64 s[2:3], exec 4214; GFX8-NEXT: v_mov_b32_e32 v2, v0 4215; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 4216; GFX8-NEXT: v_mov_b32_e32 v1, 0 4217; GFX8-NEXT: s_mov_b64 exec, s[4:5] 4218; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 4219; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 4220; GFX8-NEXT: s_not_b64 exec, exec 4221; GFX8-NEXT: v_mov_b32_e32 v2, 0 4222; GFX8-NEXT: s_not_b64 exec, exec 4223; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 4224; GFX8-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 4225; GFX8-NEXT: s_nop 1 4226; GFX8-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 4227; GFX8-NEXT: s_nop 1 4228; GFX8-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 4229; GFX8-NEXT: s_nop 1 4230; GFX8-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 4231; GFX8-NEXT: s_nop 1 4232; GFX8-NEXT: v_max_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 4233; GFX8-NEXT: s_nop 1 4234; GFX8-NEXT: v_max_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 4235; GFX8-NEXT: v_readlane_b32 s2, v2, 63 4236; GFX8-NEXT: s_nop 0 4237; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 4238; GFX8-NEXT: s_mov_b64 exec, 
s[4:5] 4239; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4240; GFX8-NEXT: ; implicit-def: $vgpr0 4241; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 4242; GFX8-NEXT: s_cbranch_execz BB21_2 4243; GFX8-NEXT: ; %bb.1: 4244; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 4245; GFX8-NEXT: v_mov_b32_e32 v3, s2 4246; GFX8-NEXT: s_mov_b32 m0, -1 4247; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4248; GFX8-NEXT: ds_max_rtn_u32 v0, v0, v3 4249; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4250; GFX8-NEXT: buffer_wbinvl1_vol 4251; GFX8-NEXT: BB21_2: 4252; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 4253; GFX8-NEXT: v_readfirstlane_b32 s2, v0 4254; GFX8-NEXT: v_mov_b32_e32 v0, v1 4255; GFX8-NEXT: v_max_u32_e32 v0, s2, v0 4256; GFX8-NEXT: s_mov_b32 s3, 0xf000 4257; GFX8-NEXT: s_mov_b32 s2, -1 4258; GFX8-NEXT: s_nop 0 4259; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4260; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 4261; GFX8-NEXT: s_endpgm 4262; 4263; GFX9-LABEL: umax_i32_varying: 4264; GFX9: ; %bb.0: ; %entry 4265; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4266; GFX9-NEXT: s_mov_b64 s[2:3], exec 4267; GFX9-NEXT: v_mov_b32_e32 v2, v0 4268; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 4269; GFX9-NEXT: v_mov_b32_e32 v1, 0 4270; GFX9-NEXT: s_mov_b64 exec, s[4:5] 4271; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 4272; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 4273; GFX9-NEXT: s_not_b64 exec, exec 4274; GFX9-NEXT: v_mov_b32_e32 v2, 0 4275; GFX9-NEXT: s_not_b64 exec, exec 4276; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 4277; GFX9-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 4278; GFX9-NEXT: s_nop 1 4279; GFX9-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 4280; GFX9-NEXT: s_nop 1 4281; GFX9-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 4282; GFX9-NEXT: s_nop 1 4283; GFX9-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 4284; GFX9-NEXT: s_nop 1 4285; GFX9-NEXT: v_max_u32_dpp v2, 
v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 4286; GFX9-NEXT: s_nop 1 4287; GFX9-NEXT: v_max_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 4288; GFX9-NEXT: v_readlane_b32 s2, v2, 63 4289; GFX9-NEXT: s_nop 0 4290; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 4291; GFX9-NEXT: s_mov_b64 exec, s[4:5] 4292; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4293; GFX9-NEXT: ; implicit-def: $vgpr0 4294; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 4295; GFX9-NEXT: s_cbranch_execz BB21_2 4296; GFX9-NEXT: ; %bb.1: 4297; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 4298; GFX9-NEXT: v_mov_b32_e32 v3, s2 4299; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4300; GFX9-NEXT: ds_max_rtn_u32 v0, v0, v3 4301; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4302; GFX9-NEXT: buffer_wbinvl1_vol 4303; GFX9-NEXT: BB21_2: 4304; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 4305; GFX9-NEXT: v_readfirstlane_b32 s2, v0 4306; GFX9-NEXT: v_mov_b32_e32 v0, v1 4307; GFX9-NEXT: v_max_u32_e32 v0, s2, v0 4308; GFX9-NEXT: s_mov_b32 s3, 0xf000 4309; GFX9-NEXT: s_mov_b32 s2, -1 4310; GFX9-NEXT: s_nop 0 4311; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4312; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 4313; GFX9-NEXT: s_endpgm 4314; 4315; GFX1064-LABEL: umax_i32_varying: 4316; GFX1064: ; %bb.0: ; %entry 4317; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4318; GFX1064-NEXT: s_mov_b64 s[2:3], exec 4319; GFX1064-NEXT: v_mov_b32_e32 v2, v0 4320; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 4321; GFX1064-NEXT: v_mov_b32_e32 v1, 0 4322; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 4323; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 4324; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s3, v0 4325; GFX1064-NEXT: s_not_b64 exec, exec 4326; GFX1064-NEXT: v_mov_b32_e32 v2, 0 4327; GFX1064-NEXT: s_not_b64 exec, exec 4328; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 4329; GFX1064-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 4330; GFX1064-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:2 
row_mask:0xf bank_mask:0xf bound_ctrl:0 4331; GFX1064-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 4332; GFX1064-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 4333; GFX1064-NEXT: v_mov_b32_e32 v3, v2 4334; GFX1064-NEXT: v_permlanex16_b32 v3, v3, -1, -1 4335; GFX1064-NEXT: v_max_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 4336; GFX1064-NEXT: v_readlane_b32 s2, v2, 31 4337; GFX1064-NEXT: v_mov_b32_e32 v3, s2 4338; GFX1064-NEXT: v_max_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 4339; GFX1064-NEXT: v_readlane_b32 s2, v2, 15 4340; GFX1064-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf 4341; GFX1064-NEXT: v_readlane_b32 s3, v2, 31 4342; GFX1064-NEXT: v_readlane_b32 s6, v2, 47 4343; GFX1064-NEXT: v_writelane_b32 v1, s2, 16 4344; GFX1064-NEXT: s_mov_b32 s2, -1 4345; GFX1064-NEXT: v_writelane_b32 v1, s3, 32 4346; GFX1064-NEXT: v_readlane_b32 s3, v2, 63 4347; GFX1064-NEXT: v_writelane_b32 v1, s6, 48 4348; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 4349; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4350; GFX1064-NEXT: ; implicit-def: $vgpr0 4351; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 4352; GFX1064-NEXT: s_cbranch_execz BB21_2 4353; GFX1064-NEXT: ; %bb.1: 4354; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 4355; GFX1064-NEXT: v_mov_b32_e32 v7, s3 4356; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4357; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 4358; GFX1064-NEXT: ds_max_rtn_u32 v0, v0, v7 4359; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4360; GFX1064-NEXT: buffer_gl0_inv 4361; GFX1064-NEXT: buffer_gl1_inv 4362; GFX1064-NEXT: BB21_2: 4363; GFX1064-NEXT: v_nop 4364; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 4365; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 4366; GFX1064-NEXT: v_mov_b32_e32 v0, v1 4367; GFX1064-NEXT: v_max_u32_e32 v0, s3, v0 4368; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 4369; GFX1064-NEXT: s_nop 1 4370; GFX1064-NEXT: s_waitcnt 
lgkmcnt(0) 4371; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 4372; GFX1064-NEXT: s_endpgm 4373; 4374; GFX1032-LABEL: umax_i32_varying: 4375; GFX1032: ; %bb.0: ; %entry 4376; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4377; GFX1032-NEXT: s_mov_b32 s2, exec_lo 4378; GFX1032-NEXT: ; implicit-def: $vcc_hi 4379; GFX1032-NEXT: v_mov_b32_e32 v2, v0 4380; GFX1032-NEXT: s_or_saveexec_b32 s3, -1 4381; GFX1032-NEXT: v_mov_b32_e32 v1, 0 4382; GFX1032-NEXT: s_mov_b32 exec_lo, s3 4383; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 4384; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 4385; GFX1032-NEXT: v_mov_b32_e32 v2, 0 4386; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 4387; GFX1032-NEXT: s_or_saveexec_b32 s4, -1 4388; GFX1032-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 4389; GFX1032-NEXT: s_mov_b32 s2, -1 4390; GFX1032-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 4391; GFX1032-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 4392; GFX1032-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 4393; GFX1032-NEXT: v_mov_b32_e32 v3, v2 4394; GFX1032-NEXT: v_permlanex16_b32 v3, v3, -1, -1 4395; GFX1032-NEXT: v_max_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 4396; GFX1032-NEXT: v_readlane_b32 s3, v2, 31 4397; GFX1032-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf 4398; GFX1032-NEXT: v_readlane_b32 s5, v2, 15 4399; GFX1032-NEXT: v_writelane_b32 v1, s5, 16 4400; GFX1032-NEXT: s_mov_b32 exec_lo, s4 4401; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 4402; GFX1032-NEXT: ; implicit-def: $vgpr0 4403; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo 4404; GFX1032-NEXT: s_cbranch_execz BB21_2 4405; GFX1032-NEXT: ; %bb.1: 4406; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 4407; GFX1032-NEXT: v_mov_b32_e32 v7, s3 4408; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4409; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 4410; 
GFX1032-NEXT: ds_max_rtn_u32 v0, v0, v7 4411; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4412; GFX1032-NEXT: buffer_gl0_inv 4413; GFX1032-NEXT: buffer_gl1_inv 4414; GFX1032-NEXT: BB21_2: 4415; GFX1032-NEXT: v_nop 4416; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 4417; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 4418; GFX1032-NEXT: v_mov_b32_e32 v0, v1 4419; GFX1032-NEXT: v_max_u32_e32 v0, s3, v0 4420; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 4421; GFX1032-NEXT: s_nop 1 4422; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 4423; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 4424; GFX1032-NEXT: s_endpgm 4425entry: 4426 %lane = call i32 @llvm.amdgcn.workitem.id.x() 4427 %old = atomicrmw umax i32 addrspace(3)* @local_var32, i32 %lane acq_rel 4428 store i32 %old, i32 addrspace(1)* %out 4429 ret void 4430} 4431 4432define amdgpu_kernel void @umax_i64_constant(i64 addrspace(1)* %out) { 4433; 4434; 4435; GFX7LESS-LABEL: umax_i64_constant: 4436; GFX7LESS: ; %bb.0: ; %entry 4437; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 4438; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 4439; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 4440; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4441; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 4442; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc 4443; GFX7LESS-NEXT: s_cbranch_execz BB22_2 4444; GFX7LESS-NEXT: ; %bb.1: 4445; GFX7LESS-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 4446; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5 4447; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 4448; GFX7LESS-NEXT: s_mov_b32 m0, -1 4449; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4450; GFX7LESS-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] 4451; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4452; GFX7LESS-NEXT: buffer_wbinvl1 4453; GFX7LESS-NEXT: BB22_2: 4454; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] 4455; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 4456; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 4457; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 4458; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, 
5, 0, vcc 4459; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 4460; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4 4461; GFX7LESS-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[0:1] 4462; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 4463; GFX7LESS-NEXT: v_mov_b32_e32 v1, s5 4464; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc 4465; GFX7LESS-NEXT: s_mov_b32 s2, -1 4466; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 4467; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4468; GFX7LESS-NEXT: s_endpgm 4469; 4470; GFX8-LABEL: umax_i64_constant: 4471; GFX8: ; %bb.0: ; %entry 4472; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4473; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4474; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 4475; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4476; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 4477; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 4478; GFX8-NEXT: s_cbranch_execz BB22_2 4479; GFX8-NEXT: ; %bb.1: 4480; GFX8-NEXT: v_mov_b32_e32 v0, 5 4481; GFX8-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 4482; GFX8-NEXT: v_mov_b32_e32 v1, 0 4483; GFX8-NEXT: s_mov_b32 m0, -1 4484; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4485; GFX8-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] 4486; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4487; GFX8-NEXT: buffer_wbinvl1_vol 4488; GFX8-NEXT: BB22_2: 4489; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 4490; GFX8-NEXT: v_readfirstlane_b32 s2, v0 4491; GFX8-NEXT: v_readfirstlane_b32 s3, v1 4492; GFX8-NEXT: v_mov_b32_e32 v1, 0 4493; GFX8-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 4494; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1] 4495; GFX8-NEXT: v_mov_b32_e32 v1, s3 4496; GFX8-NEXT: v_mov_b32_e32 v2, s2 4497; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 4498; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc 4499; GFX8-NEXT: s_mov_b32 s3, 0xf000 4500; GFX8-NEXT: s_mov_b32 s2, -1 4501; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4502; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4503; GFX8-NEXT: s_endpgm 4504; 4505; GFX9-LABEL: umax_i64_constant: 4506; GFX9: ; %bb.0: ; 
%entry 4507; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4508; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4509; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 4510; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4511; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 4512; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 4513; GFX9-NEXT: s_cbranch_execz BB22_2 4514; GFX9-NEXT: ; %bb.1: 4515; GFX9-NEXT: v_mov_b32_e32 v0, 5 4516; GFX9-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 4517; GFX9-NEXT: v_mov_b32_e32 v1, 0 4518; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4519; GFX9-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] 4520; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4521; GFX9-NEXT: buffer_wbinvl1_vol 4522; GFX9-NEXT: BB22_2: 4523; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 4524; GFX9-NEXT: v_readfirstlane_b32 s2, v0 4525; GFX9-NEXT: v_readfirstlane_b32 s3, v1 4526; GFX9-NEXT: v_mov_b32_e32 v1, 0 4527; GFX9-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 4528; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1] 4529; GFX9-NEXT: v_mov_b32_e32 v1, s3 4530; GFX9-NEXT: v_mov_b32_e32 v2, s2 4531; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 4532; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc 4533; GFX9-NEXT: s_mov_b32 s3, 0xf000 4534; GFX9-NEXT: s_mov_b32 s2, -1 4535; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4536; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4537; GFX9-NEXT: s_endpgm 4538; 4539; GFX1064-LABEL: umax_i64_constant: 4540; GFX1064: ; %bb.0: ; %entry 4541; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4542; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 4543; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0 4544; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4545; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 4546; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc 4547; GFX1064-NEXT: s_cbranch_execz BB22_2 4548; GFX1064-NEXT: ; %bb.1: 4549; GFX1064-NEXT: v_mov_b32_e32 v0, 5 4550; GFX1064-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 4551; GFX1064-NEXT: v_mov_b32_e32 v1, 0 4552; GFX1064-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) 4553; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 4554; GFX1064-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] 4555; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4556; GFX1064-NEXT: buffer_gl0_inv 4557; GFX1064-NEXT: buffer_gl1_inv 4558; GFX1064-NEXT: BB22_2: 4559; GFX1064-NEXT: v_nop 4560; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] 4561; GFX1064-NEXT: v_readfirstlane_b32 s4, v0 4562; GFX1064-NEXT: v_readfirstlane_b32 s5, v1 4563; GFX1064-NEXT: v_mov_b32_e32 v1, 0 4564; GFX1064-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 4565; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 4566; GFX1064-NEXT: s_mov_b32 s2, -1 4567; GFX1064-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[0:1] 4568; GFX1064-NEXT: v_cndmask_b32_e64 v0, v0, s4, vcc 4569; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, s5, vcc 4570; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 4571; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4572; GFX1064-NEXT: s_endpgm 4573; 4574; GFX1032-LABEL: umax_i64_constant: 4575; GFX1032: ; %bb.0: ; %entry 4576; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4577; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 4578; GFX1032-NEXT: ; implicit-def: $vcc_hi 4579; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 4580; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 4581; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo 4582; GFX1032-NEXT: s_cbranch_execz BB22_2 4583; GFX1032-NEXT: ; %bb.1: 4584; GFX1032-NEXT: v_mov_b32_e32 v0, 5 4585; GFX1032-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 4586; GFX1032-NEXT: v_mov_b32_e32 v1, 0 4587; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4588; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 4589; GFX1032-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] 4590; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4591; GFX1032-NEXT: buffer_gl0_inv 4592; GFX1032-NEXT: buffer_gl1_inv 4593; GFX1032-NEXT: BB22_2: 4594; GFX1032-NEXT: v_nop 4595; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 4596; GFX1032-NEXT: v_readfirstlane_b32 s4, v0 4597; GFX1032-NEXT: v_readfirstlane_b32 s5, v1 4598; GFX1032-NEXT: 
v_mov_b32_e32 v1, 0
; GFX1032-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc_lo
; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
; GFX1032-NEXT: s_mov_b32 s2, -1
; GFX1032-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX1032-NEXT: v_cndmask_b32_e64 v0, v0, s4, vcc_lo
; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, s5, vcc_lo
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX1032-NEXT: s_endpgm
entry:
  %old = atomicrmw umax i64 addrspace(3)* @local_var64, i64 5 acq_rel
  store i64 %old, i64 addrspace(1)* %out
  ret void
}

; Unsigned-min atomic on the local (addrspace(3)) i32 variable with an operand
; that differs per lane (the work-item id).
; The expectations below show two shapes. The llc invocation without -mcpu
; issues ds_min_rtn_u32 with no lane reduction or exec masking around it.
; The tonga, gfx900 and gfx1010 invocations first seed the lanes outside exec
; with -1, fold all lanes with v_min_u32_dpp, let only the lane whose mbcnt is
; 0 issue the DS atomic, and then rebuild each lane's result from
; v_readfirstlane plus v_min_u32.
define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) {
;
;
; GFX7LESS-LABEL: umin_i32_varying:
; GFX7LESS: ; %bb.0: ; %entry
; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo
; GFX7LESS-NEXT: s_mov_b32 m0, -1
; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7LESS-NEXT: ds_min_rtn_u32 v0, v1, v0
; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7LESS-NEXT: buffer_wbinvl1
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX7LESS-NEXT: s_endpgm
;
; GFX8-LABEL: umin_i32_varying:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX8-NEXT: v_mov_b32_e32 v1, -1
; GFX8-NEXT: s_mov_b64 exec, s[2:3]
; GFX8-NEXT: s_not_b64 exec, exec
; GFX8-NEXT: v_mov_b32_e32 v2, -1
; GFX8-NEXT: s_not_b64 exec, exec
; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX8-NEXT: s_nop 1
; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX8-NEXT: s_nop 1
; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX8-NEXT: s_nop 1
; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX8-NEXT: s_nop 1
; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
; GFX8-NEXT: s_nop 1
; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
; GFX8-NEXT: v_readlane_b32 s2, v2, 63
; GFX8-NEXT: s_nop 0
; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX8-NEXT: s_mov_b64 exec, s[4:5]
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
; GFX8-NEXT: ; implicit-def: $vgpr0
; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX8-NEXT: s_cbranch_execz BB23_2
; GFX8-NEXT: ; %bb.1:
; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
; GFX8-NEXT: v_mov_b32_e32 v3, s2
; GFX8-NEXT: s_mov_b32 m0, -1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: ds_min_rtn_u32 v0, v0, v3
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: BB23_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: v_readfirstlane_b32 s2, v0
; GFX8-NEXT: v_mov_b32_e32 v0, v1
; GFX8-NEXT: v_min_u32_e32 v0, s2, v0
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
; GFX8-NEXT: s_nop 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: umin_i32_varying:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3
; GFX9-NEXT: v_mov_b32_e32 v2, v0
; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX9-NEXT: v_mov_b32_e32 v1, -1
; GFX9-NEXT: s_mov_b64 exec, s[2:3]
; GFX9-NEXT: s_not_b64 exec, exec
; GFX9-NEXT: v_mov_b32_e32 v2, -1
; GFX9-NEXT: s_not_b64 exec, exec
; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX9-NEXT: s_nop 1
; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX9-NEXT: s_nop 1
; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX9-NEXT: s_nop 1
; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX9-NEXT: s_nop 1
; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
; GFX9-NEXT: s_nop 1
; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
; GFX9-NEXT: v_readlane_b32 s2, v2, 63
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
; GFX9-NEXT: ; implicit-def: $vgpr0
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: s_cbranch_execz BB23_2
; GFX9-NEXT: ; %bb.1:
; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
; GFX9-NEXT: v_mov_b32_e32 v3, s2
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: ds_min_rtn_u32 v0, v0, v3
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: BB23_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: v_readfirstlane_b32 s2, v0
; GFX9-NEXT: v_mov_b32_e32 v0, v1
; GFX9-NEXT: v_min_u32_e32 v0, s2, v0
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: umin_i32_varying:
; GFX1064: ; %bb.0: ; %entry
; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v4, exec_lo, 0
; GFX1064-NEXT: v_mov_b32_e32 v2, v0
; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v4, exec_hi, v4
; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX1064-NEXT: v_mov_b32_e32 v1, -1
; GFX1064-NEXT: s_mov_b64 exec, s[2:3]
; GFX1064-NEXT: s_not_b64 exec, exec
; GFX1064-NEXT: v_mov_b32_e32 v2, -1
; GFX1064-NEXT: s_not_b64 exec, exec
; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX1064-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1064-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1064-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1064-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX1064-NEXT: v_mov_b32_e32 v3, v2
; GFX1064-NEXT: v_permlanex16_b32 v3, v3, -1, -1
; GFX1064-NEXT: v_min_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1064-NEXT: v_readlane_b32 s2, v2, 31
; GFX1064-NEXT: v_mov_b32_e32 v3, s2
; GFX1064-NEXT: v_min_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
; GFX1064-NEXT: v_readlane_b32 s2, v2, 15
; GFX1064-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1064-NEXT: v_readlane_b32 s3, v2, 31
; GFX1064-NEXT: v_readlane_b32 s6, v2, 47
; GFX1064-NEXT: v_writelane_b32 v1, s2, 16
; GFX1064-NEXT: s_mov_b32 s2, -1
; GFX1064-NEXT: v_writelane_b32 v1, s3, 32
; GFX1064-NEXT: v_readlane_b32 s3, v2, 63
; GFX1064-NEXT: v_writelane_b32 v1, s6, 48
; GFX1064-NEXT: s_mov_b64 exec, s[4:5]
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
; GFX1064-NEXT: ; implicit-def: $vgpr0
; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1064-NEXT: s_cbranch_execz BB23_2
; GFX1064-NEXT: ; %bb.1:
; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
; GFX1064-NEXT: v_mov_b32_e32 v7, s3
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1064-NEXT: ds_min_rtn_u32 v0, v0, v7
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: buffer_gl1_inv
; GFX1064-NEXT: BB23_2:
; GFX1064-NEXT: v_nop
; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX1064-NEXT: v_readfirstlane_b32 s3, v0
; GFX1064-NEXT: v_mov_b32_e32 v0, v1
; GFX1064-NEXT: v_min_u32_e32 v0, s3, v0
; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
; GFX1064-NEXT: s_nop 1
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: umin_i32_varying:
; GFX1032: ; %bb.0: ; %entry
; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v4, exec_lo, 0
; GFX1032-NEXT: ; implicit-def: $vcc_hi
; GFX1032-NEXT: v_mov_b32_e32 v2, v0
; GFX1032-NEXT: s_or_saveexec_b32 s2, -1
; GFX1032-NEXT: v_mov_b32_e32 v1, -1
; GFX1032-NEXT: s_mov_b32 exec_lo, s2
; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1032-NEXT: v_mov_b32_e32 v2, -1
; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1032-NEXT: s_or_saveexec_b32 s4, -1
; GFX1032-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT: s_mov_b32 s2, -1
; GFX1032-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT: v_mov_b32_e32 v3, v2
; GFX1032-NEXT: v_permlanex16_b32 v3, v3, -1, -1
; GFX1032-NEXT: v_min_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1032-NEXT: v_readlane_b32 s3, v2, 31
; GFX1032-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT: v_readlane_b32 s5, v2, 15
; GFX1032-NEXT: v_writelane_b32 v1, s5, 16
; GFX1032-NEXT: s_mov_b32 exec_lo, s4
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4
; GFX1032-NEXT: ; implicit-def: $vgpr0
; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
; GFX1032-NEXT: s_cbranch_execz BB23_2
; GFX1032-NEXT: ; %bb.1:
; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
; GFX1032-NEXT: v_mov_b32_e32 v7, s3
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1032-NEXT: ds_min_rtn_u32 v0, v0, v7
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: buffer_gl1_inv
; GFX1032-NEXT: BB23_2:
; GFX1032-NEXT: v_nop
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX1032-NEXT: v_readfirstlane_b32 s3, v0
; GFX1032-NEXT: v_mov_b32_e32 v0, v1
; GFX1032-NEXT: v_min_u32_e32 v0, s3, v0
; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
; GFX1032-NEXT: s_nop 1
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX1032-NEXT: s_endpgm
entry:
  ; The work-item id is the per-lane operand of the atomic.
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %prev = atomicrmw umin i32 addrspace(3)* @local_var32, i32 %tid acq_rel
  ; The value returned by the atomic is written to %out.
  store i32 %prev, i32 addrspace(1)* %out
  ret void
}

; Unsigned-min atomic on the local (addrspace(3)) i64 variable with the
; constant operand 5.
; The expectations for every llc invocation have the same shape: only the lane
; whose mbcnt is 0 executes ds_min_rtn_u64, its result is read back with
; v_readfirstlane, and each lane then picks between that value and a per-lane
; constant (v_cndmask plus v_cmp_lt_u64) before the 64-bit store.
define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) {
;
;
; GFX7LESS-LABEL: umin_i64_constant:
; GFX7LESS: ; %bb.0: ; %entry
; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX7LESS-NEXT: s_cbranch_execz BB24_2
; GFX7LESS-NEXT: ; %bb.1:
; GFX7LESS-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo
; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5
; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0
; GFX7LESS-NEXT: s_mov_b32 m0, -1
; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7LESS-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1]
; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7LESS-NEXT: buffer_wbinvl1
; GFX7LESS-NEXT: BB24_2:
; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0
; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1
; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc
; GFX7LESS-NEXT: v_mov_b32_e32 v2, s5
; GFX7LESS-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1]
; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4
; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX7LESS-NEXT: s_endpgm
;
; GFX8-LABEL: umin_i64_constant:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX8-NEXT: s_cbranch_execz BB24_2
; GFX8-NEXT: ; %bb.1:
; GFX8-NEXT: v_mov_b32_e32 v0, 5
; GFX8-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo
; GFX8-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NEXT: s_mov_b32 m0, -1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: BB24_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX8-NEXT: v_readfirstlane_b32 s5, v1
; GFX8-NEXT: v_readfirstlane_b32 s4, v0
; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc
; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1]
; GFX8-NEXT: v_mov_b32_e32 v2, s5
; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
; GFX8-NEXT: v_mov_b32_e32 v2, s4
; GFX8-NEXT: s_mov_b32 s2, -1
; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: umin_i64_constant:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT: s_cbranch_execz BB24_2
; GFX9-NEXT: ; %bb.1:
; GFX9-NEXT: v_mov_b32_e32 v0, 5
; GFX9-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1]
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: BB24_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT: v_readfirstlane_b32 s5, v1
; GFX9-NEXT: v_readfirstlane_b32 s4, v0
; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc
; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1]
; GFX9-NEXT: v_mov_b32_e32 v2, s5
; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
; GFX9-NEXT: v_mov_b32_e32 v2, s4
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: umin_i64_constant:
; GFX1064: ; %bb.0: ; %entry
; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-NEXT: s_cbranch_execz BB24_2
; GFX1064-NEXT: ; %bb.1:
; GFX1064-NEXT: v_mov_b32_e32 v0, 5
; GFX1064-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo
; GFX1064-NEXT: v_mov_b32_e32 v1, 0
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1064-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1]
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: buffer_gl1_inv
; GFX1064-NEXT: BB24_2:
; GFX1064-NEXT: v_nop
; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX1064-NEXT: v_readfirstlane_b32 s4, v0
; GFX1064-NEXT: v_readfirstlane_b32 s5, v1
; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
; GFX1064-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc
; GFX1064-NEXT: s_mov_b32 s2, -1
; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
; GFX1064-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1]
; GFX1064-NEXT: v_cndmask_b32_e64 v1, v1, s5, vcc
; GFX1064-NEXT: v_cndmask_b32_e64 v0, v0, s4, vcc
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: umin_i64_constant:
; GFX1032: ; %bb.0: ; %entry
; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
; GFX1032-NEXT: ; implicit-def: $vcc_hi
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-NEXT: s_cbranch_execz BB24_2
; GFX1032-NEXT: ; %bb.1:
; GFX1032-NEXT: v_mov_b32_e32 v0, 5
; GFX1032-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo
; GFX1032-NEXT: v_mov_b32_e32 v1, 0
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1032-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1]
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: buffer_gl1_inv
; GFX1032-NEXT: BB24_2:
; GFX1032-NEXT: v_nop
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2
; GFX1032-NEXT: v_readfirstlane_b32 s4, v0
; GFX1032-NEXT: v_readfirstlane_b32 s5, v1
; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
; GFX1032-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc_lo
; GFX1032-NEXT: s_mov_b32 s2, -1
; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
; GFX1032-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[4:5], v[0:1]
; GFX1032-NEXT: v_cndmask_b32_e64 v1, v1, s5, vcc_lo
; GFX1032-NEXT: v_cndmask_b32_e64 v0, v0, s4, vcc_lo
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX1032-NEXT: s_endpgm
entry:
  ; The operand is the same constant in every lane.
  %prev = atomicrmw umin i64 addrspace(3)* @local_var64, i64 5 acq_rel
  ; The value returned by the atomic is written to %out.
  store i64 %prev, i64 addrspace(1)* %out
  ret void
}