; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --force-update
; RUN: llc -march=amdgcn -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX7LESS %s
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX1064 %s
; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX1032 %s

declare i32 @llvm.amdgcn.workitem.id.x()

@local_var32 = addrspace(3) global i32 undef, align 4
@local_var64 = addrspace(3) global i64 undef, align 8

; Show what the atomic optimization pass will do for local pointers.

; Adds the constant 5 to @local_var32 with an acq_rel atomicrmw and stores the
; returned old value to %out. Every configuration below expects one
; ds_add_rtn_u32 issued under a "v_mbcnt result == 0" guard, with the addend
; formed as s_bcnt1 of the saved exec mask multiplied by 5.
define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out) {
;
;
; GFX7LESS-LABEL: add_i32_constant:
; GFX7LESS: ; %bb.0: ; %entry
; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec
; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0
; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0
; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX7LESS-NEXT: ; implicit-def: $vgpr1
; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX7LESS-NEXT: s_cbranch_execz .LBB0_2
; GFX7LESS-NEXT: ; %bb.1:
; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX7LESS-NEXT: s_mul_i32 s2, s2, 5
; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0
; GFX7LESS-NEXT: v_mov_b32_e32 v2, s2
; GFX7LESS-NEXT: s_mov_b32 m0, -1
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: ds_add_rtn_u32 v1, v1, v2
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: .LBB0_2:
; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v1
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: v_mad_u32_u24 v0, v0, 5, s2
; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX7LESS-NEXT: s_endpgm
;
; GFX8-LABEL: add_i32_constant:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: s_mov_b64 s[2:3], exec
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX8-NEXT: ; implicit-def: $vgpr1
; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX8-NEXT: s_cbranch_execz .LBB0_2
; GFX8-NEXT: ; %bb.1:
; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX8-NEXT: s_mul_i32 s2, s2, 5
; GFX8-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NEXT: v_mov_b32_e32 v2, s2
; GFX8-NEXT: s_mov_b32 m0, -1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: ds_add_rtn_u32 v1, v1, v2
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: .LBB0_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s2, v1
; GFX8-NEXT: v_mad_u32_u24 v0, v0, 5, s2
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
; GFX8-NEXT: s_nop 1
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: add_i32_constant:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_mov_b64 s[2:3], exec
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: ; implicit-def: $vgpr1
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: s_cbranch_execz .LBB0_2
; GFX9-NEXT: ; %bb.1:
; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX9-NEXT: s_mul_i32 s2, s2, 5
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: v_mov_b32_e32 v2, s2
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: ds_add_rtn_u32 v1, v1, v2
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: .LBB0_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s2, v1
; GFX9-NEXT: v_mad_u32_u24 v0, v0, 5, s2
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: s_nop 1
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: add_i32_constant:
; GFX1064: ; %bb.0: ; %entry
; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1064-NEXT: s_mov_b64 s[2:3], exec
; GFX1064-NEXT: ; implicit-def: $vgpr1
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB0_2
; GFX1064-NEXT: ; %bb.1:
; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX1064-NEXT: v_mov_b32_e32 v1, 0
; GFX1064-NEXT: s_mul_i32 s2, s2, 5
; GFX1064-NEXT: v_mov_b32_e32 v2, s2
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1064-NEXT: ds_add_rtn_u32 v1, v1, v2
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: .LBB0_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX1064-NEXT: v_readfirstlane_b32 s2, v1
; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
; GFX1064-NEXT: v_mad_u32_u24 v0, v0, 5, s2
; GFX1064-NEXT: s_mov_b32 s2, -1
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: add_i32_constant:
; GFX1032: ; %bb.0: ; %entry
; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1032-NEXT: s_mov_b32 s3, exec_lo
; GFX1032-NEXT: ; implicit-def: $vgpr1
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB0_2
; GFX1032-NEXT: ; %bb.1:
; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3
; GFX1032-NEXT: v_mov_b32_e32 v1, 0
; GFX1032-NEXT: s_mul_i32 s3, s3, 5
; GFX1032-NEXT: v_mov_b32_e32 v2, s3
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1032-NEXT: ds_add_rtn_u32 v1, v1, v2
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: .LBB0_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2
; GFX1032-NEXT: v_readfirstlane_b32 s2, v1
; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
; GFX1032-NEXT: v_mad_u32_u24 v0, v0, 5, s2
; GFX1032-NEXT: s_mov_b32 s2, -1
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX1032-NEXT: s_endpgm
entry:
  %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 5 acq_rel
  store i32 %old, i32 addrspace(1)* %out
  ret void
}

; Same pattern with the addend taken from the kernel argument %additive.
; The expected code loads that argument into an SGPR and multiplies it by the
; s_bcnt1 result before the single guarded ds_add_rtn_u32.
define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 %additive) {
;
;
; GFX7LESS-LABEL: add_i32_uniform:
; GFX7LESS: ; %bb.0: ; %entry
; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec
; GFX7LESS-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; GFX7LESS-NEXT: s_load_dword s6, s[0:1], 0xb
; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0
; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0
; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX7LESS-NEXT: ; implicit-def: $vgpr1
; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX7LESS-NEXT: s_cbranch_execz .LBB1_2
; GFX7LESS-NEXT: ; %bb.1:
; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: s_mul_i32 s2, s6, s2
; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0
; GFX7LESS-NEXT: v_mov_b32_e32 v2, s2
; GFX7LESS-NEXT: s_mov_b32 m0, -1
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: ds_add_rtn_u32 v1, v1, v2
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: .LBB1_2:
; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: v_readfirstlane_b32 s0, v1
; GFX7LESS-NEXT: v_mul_lo_u32 v0, s6, v0
; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000
; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s0, v0
; GFX7LESS-NEXT: s_mov_b32 s6, -1
; GFX7LESS-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX7LESS-NEXT: s_endpgm
;
; GFX8-LABEL: add_i32_uniform:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX8-NEXT: s_load_dword s6, s[0:1], 0x2c
; GFX8-NEXT: s_mov_b64 s[2:3], exec
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX8-NEXT: ; implicit-def: $vgpr1
; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX8-NEXT: s_cbranch_execz .LBB1_2
; GFX8-NEXT: ; %bb.1:
; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_mul_i32 s2, s6, s2
; GFX8-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NEXT: v_mov_b32_e32 v2, s2
; GFX8-NEXT: s_mov_b32 m0, -1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: ds_add_rtn_u32 v1, v1, v2
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: .LBB1_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mul_lo_u32 v0, s6, v0
; GFX8-NEXT: v_readfirstlane_b32 s0, v1
; GFX8-NEXT: s_mov_b32 s7, 0xf000
; GFX8-NEXT: s_mov_b32 s6, -1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: add_i32_uniform:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c
; GFX9-NEXT: s_mov_b64 s[2:3], exec
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: ; implicit-def: $vgpr1
; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX9-NEXT: s_cbranch_execz .LBB1_2
; GFX9-NEXT: ; %bb.1:
; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_mul_i32 s2, s6, s2
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: v_mov_b32_e32 v2, s2
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: ds_add_rtn_u32 v1, v1, v2
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: .LBB1_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mul_lo_u32 v0, s6, v0
; GFX9-NEXT: v_readfirstlane_b32 s0, v1
; GFX9-NEXT: s_mov_b32 s7, 0xf000
; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: v_add_u32_e32 v0, s0, v0
; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: add_i32_uniform:
; GFX1064: ; %bb.0: ; %entry
; GFX1064-NEXT: s_clause 0x1
; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX1064-NEXT: s_load_dword s6, s[0:1], 0x2c
; GFX1064-NEXT: s_mov_b64 s[2:3], exec
; GFX1064-NEXT: ; implicit-def: $vgpr1
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB1_2
; GFX1064-NEXT: ; %bb.1:
; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX1064-NEXT: v_mov_b32_e32 v1, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: s_mul_i32 s2, s6, s2
; GFX1064-NEXT: v_mov_b32_e32 v2, s2
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1064-NEXT: ds_add_rtn_u32 v1, v1, v2
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: .LBB1_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: v_mul_lo_u32 v0, s6, v0
; GFX1064-NEXT: v_readfirstlane_b32 s0, v1
; GFX1064-NEXT: s_mov_b32 s7, 0x31016000
; GFX1064-NEXT: s_mov_b32 s6, -1
; GFX1064-NEXT: v_add_nc_u32_e32 v0, s0, v0
; GFX1064-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: add_i32_uniform:
; GFX1032: ; %bb.0: ; %entry
; GFX1032-NEXT: s_clause 0x1
; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x2c
; GFX1032-NEXT: s_mov_b32 s3, exec_lo
; GFX1032-NEXT: ; implicit-def: $vgpr1
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB1_2
; GFX1032-NEXT: ; %bb.1:
; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s3
; GFX1032-NEXT: v_mov_b32_e32 v1, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: s_mul_i32 s1, s2, s1
; GFX1032-NEXT: v_mov_b32_e32 v2, s1
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1032-NEXT: ds_add_rtn_u32 v1, v1, v2
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: .LBB1_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: v_mul_lo_u32 v0, s2, v0
; GFX1032-NEXT: v_readfirstlane_b32 s0, v1
; GFX1032-NEXT: s_mov_b32 s7, 0x31016000
; GFX1032-NEXT: s_mov_b32 s6, -1
; GFX1032-NEXT: v_add_nc_u32_e32 v0, s0, v0
; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX1032-NEXT: s_endpgm
entry:
  %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %additive acq_rel
  store i32 %old, i32 addrspace(1)* %out
  ret void
}

; The addend is the per-lane workitem id. For the llc invocation without
; -mcpu the expected code is an unguarded per-lane ds_add_rtn_u32; the other
; configurations expect a chain of v_add*_dpp steps followed by one guarded
; ds_add_rtn_u32.
define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out) {
;
;
; GFX7LESS-LABEL: add_i32_varying:
; GFX7LESS: ; %bb.0: ; %entry
; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0
; GFX7LESS-NEXT: s_mov_b32 m0, -1
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: ds_add_rtn_u32 v0, v1, v0
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX7LESS-NEXT: s_endpgm
;
; GFX8-LABEL: add_i32_varying:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX8-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NEXT: s_mov_b64 exec, s[2:3]
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX8-NEXT: s_not_b64 exec, exec
; GFX8-NEXT: v_mov_b32_e32 v2, 0
; GFX8-NEXT: s_not_b64 exec, exec
; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX8-NEXT: s_nop 1
; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX8-NEXT: s_nop 1
; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX8-NEXT: s_nop 1
; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX8-NEXT: s_nop 1
; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
; GFX8-NEXT: s_nop 1
; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
; GFX8-NEXT: v_readlane_b32 s4, v2, 63
; GFX8-NEXT: s_nop 0
; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX8-NEXT: s_mov_b64 exec, s[2:3]
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX8-NEXT: ; implicit-def: $vgpr0
; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX8-NEXT: s_cbranch_execz .LBB2_2
; GFX8-NEXT: ; %bb.1:
; GFX8-NEXT: v_mov_b32_e32 v0, 0
; GFX8-NEXT: v_mov_b32_e32 v3, s4
; GFX8-NEXT: s_mov_b32 m0, -1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: ds_add_rtn_u32 v0, v0, v3
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: .LBB2_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s2, v0
; GFX8-NEXT: v_mov_b32_e32 v0, v1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
; GFX8-NEXT: s_nop 0
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: add_i32_varying:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, v0
; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_mov_b64 exec, s[2:3]
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX9-NEXT: s_not_b64 exec, exec
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_not_b64 exec, exec
; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX9-NEXT: s_nop 1
; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX9-NEXT: s_nop 1
; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX9-NEXT: s_nop 1
; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX9-NEXT: s_nop 1
; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
; GFX9-NEXT: s_nop 1
; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
; GFX9-NEXT: v_readlane_b32 s4, v2, 63
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX9-NEXT: s_mov_b64 exec, s[2:3]
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: ; implicit-def: $vgpr0
; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT: s_cbranch_execz .LBB2_2
; GFX9-NEXT: ; %bb.1:
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v3, s4
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: ds_add_rtn_u32 v0, v0, v3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: .LBB2_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s2, v0
; GFX9-NEXT: v_mov_b32_e32 v0, v1
; GFX9-NEXT: v_add_u32_e32 v0, s2, v0
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: add_i32_varying:
; GFX1064: ; %bb.0: ; %entry
; GFX1064-NEXT: v_mov_b32_e32 v1, v0
; GFX1064-NEXT: s_not_b64 exec, exec
; GFX1064-NEXT: v_mov_b32_e32 v1, 0
; GFX1064-NEXT: s_not_b64 exec, exec
; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064-NEXT: v_mov_b32_e32 v3, 0
; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064-NEXT: v_mov_b32_e32 v2, v1
; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1
; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1064-NEXT: v_readlane_b32 s4, v1, 31
; GFX1064-NEXT: v_mov_b32_e32 v2, s4
; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
; GFX1064-NEXT: v_readlane_b32 s4, v1, 15
; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1064-NEXT: s_mov_b64 exec, s[2:3]
; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX1064-NEXT: v_readlane_b32 s5, v1, 31
; GFX1064-NEXT: v_writelane_b32 v3, s4, 16
; GFX1064-NEXT: s_mov_b64 exec, s[2:3]
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX1064-NEXT: v_readlane_b32 s7, v1, 63
; GFX1064-NEXT: v_readlane_b32 s6, v1, 47
; GFX1064-NEXT: v_writelane_b32 v3, s5, 32
; GFX1064-NEXT: s_mov_b64 exec, s[2:3]
; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX1064-NEXT: v_writelane_b32 v3, s6, 48
; GFX1064-NEXT: s_mov_b64 exec, s[4:5]
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT: s_mov_b32 s2, -1
; GFX1064-NEXT: ; implicit-def: $vgpr0
; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB2_2
; GFX1064-NEXT: ; %bb.1:
; GFX1064-NEXT: v_mov_b32_e32 v0, 0
; GFX1064-NEXT: v_mov_b32_e32 v4, s7
; GFX1064-NEXT: s_mov_b32 s3, s7
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1064-NEXT: ds_add_rtn_u32 v0, v0, v4
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: .LBB2_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX1064-NEXT: v_readfirstlane_b32 s3, v0
; GFX1064-NEXT: v_mov_b32_e32 v0, v3
; GFX1064-NEXT: v_add_nc_u32_e32 v0, s3, v0
; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: add_i32_varying:
; GFX1032: ; %bb.0: ; %entry
; GFX1032-NEXT: v_mov_b32_e32 v1, v0
; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1032-NEXT: v_mov_b32_e32 v1, 0
; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1032-NEXT: s_or_saveexec_b32 s2, -1
; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032-NEXT: v_mov_b32_e32 v2, v1
; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1
; GFX1032-NEXT: s_mov_b32 exec_lo, s2
; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1032-NEXT: s_or_saveexec_b32 s2, -1
; GFX1032-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1032-NEXT: v_mov_b32_e32 v3, 0
; GFX1032-NEXT: v_readlane_b32 s3, v1, 15
; GFX1032-NEXT: v_readlane_b32 s4, v1, 31
; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT: s_mov_b32 exec_lo, s2
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1032-NEXT: s_or_saveexec_b32 s2, -1
; GFX1032-NEXT: v_writelane_b32 v3, s3, 16
; GFX1032-NEXT: s_mov_b32 exec_lo, s2
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: s_mov_b32 s2, -1
; GFX1032-NEXT: ; implicit-def: $vgpr0
; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB2_2
; GFX1032-NEXT: ; %bb.1:
; GFX1032-NEXT: v_mov_b32_e32 v0, 0
; GFX1032-NEXT: v_mov_b32_e32 v4, s4
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1032-NEXT: ds_add_rtn_u32 v0, v0, v4
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: .LBB2_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3
; GFX1032-NEXT: v_readfirstlane_b32 s3, v0
; GFX1032-NEXT: v_mov_b32_e32 v0, v3
; GFX1032-NEXT: v_add_nc_u32_e32 v0, s3, v0
; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX1032-NEXT: s_endpgm
entry:
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %lane acq_rel
  store i32 %old, i32 addrspace(1)* %out
  ret void
}

; Same per-lane addend, but the atomicrmw result is never used, so every
; configuration expects the non-returning ds_add_u32 form and no store.
define amdgpu_kernel void @add_i32_varying_nouse() {
; GFX7LESS-LABEL: add_i32_varying_nouse:
; GFX7LESS: ; %bb.0: ; %entry
; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0
; GFX7LESS-NEXT: s_mov_b32 m0, -1
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: ds_add_u32 v1, v0
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: s_endpgm
;
; GFX8-LABEL: add_i32_varying_nouse:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: v_mov_b32_e32 v1, v0
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX8-NEXT: s_not_b64 exec, exec
; GFX8-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NEXT: s_not_b64 exec, exec
; GFX8-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX8-NEXT: s_nop 1
; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX8-NEXT: s_nop 1
; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX8-NEXT: s_nop 1
; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX8-NEXT: s_nop 1
; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
; GFX8-NEXT: s_nop 1
; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
; GFX8-NEXT: v_readlane_b32 s2, v1, 63
; GFX8-NEXT: s_mov_b64 exec, s[0:1]
; GFX8-NEXT: s_mov_b32 s0, s2
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX8-NEXT: s_cbranch_execz .LBB3_2
; GFX8-NEXT: ; %bb.1:
; GFX8-NEXT: v_mov_b32_e32 v0, 0
; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: s_mov_b32 m0, -1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: ds_add_u32 v0, v2
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: .LBB3_2:
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: add_i32_varying_nouse:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: v_mov_b32_e32 v1, v0
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX9-NEXT: s_not_b64 exec, exec
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_not_b64 exec, exec
; GFX9-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX9-NEXT: s_nop 1
; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX9-NEXT: s_nop 1
; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX9-NEXT: s_nop 1
; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX9-NEXT: s_nop 1
; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
; GFX9-NEXT: s_nop 1
; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
; GFX9-NEXT: v_readlane_b32 s2, v1, 63
; GFX9-NEXT: s_mov_b64 exec, s[0:1]
; GFX9-NEXT: s_mov_b32 s0, s2
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT: s_cbranch_execz .LBB3_2
; GFX9-NEXT: ; %bb.1:
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: ds_add_u32 v0, v2
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: .LBB3_2:
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: add_i32_varying_nouse:
; GFX1064: ; %bb.0: ; %entry
; GFX1064-NEXT: v_mov_b32_e32 v1, v0
; GFX1064-NEXT: s_not_b64 exec, exec
; GFX1064-NEXT: v_mov_b32_e32 v1, 0
; GFX1064-NEXT: s_not_b64 exec, exec
; GFX1064-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064-NEXT: v_mov_b32_e32 v2, v1
; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1
; GFX1064-NEXT: v_add_nc_u32_e32 v1, v1, v2
; GFX1064-NEXT: s_mov_b64 exec, s[0:1]
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1064-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064-NEXT: v_readlane_b32 s2, v1, 0
; GFX1064-NEXT: v_readlane_b32 s3, v1, 32
; GFX1064-NEXT: s_mov_b64 exec, s[0:1]
; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1064-NEXT: s_add_i32 s0, s2, s3
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB3_2
; GFX1064-NEXT: ; %bb.1:
; GFX1064-NEXT: v_mov_b32_e32 v0, 0
; GFX1064-NEXT: v_mov_b32_e32 v3, s0
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1064-NEXT: ds_add_u32 v0, v3
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: .LBB3_2:
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: add_i32_varying_nouse:
; GFX1032: ; %bb.0: ; %entry
; GFX1032-NEXT: v_mov_b32_e32 v1, v0
; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1032-NEXT: v_mov_b32_e32 v1, 0
; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1032-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032-NEXT: v_mov_b32_e32 v2, v1
; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1
; GFX1032-NEXT: v_add_nc_u32_e32 v1, v1, v2
; GFX1032-NEXT: s_mov_b32 exec_lo, s0
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
; GFX1032-NEXT: v_mov_b32_e32 v0, v1
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3
; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB3_2
; GFX1032-NEXT: ; %bb.1:
; GFX1032-NEXT: v_mov_b32_e32 v3, 0
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1032-NEXT: ds_add_u32 v3, v0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: .LBB3_2:
; GFX1032-NEXT: s_endpgm
entry:
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %lane acq_rel
  ret void
}

define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) {
;
;
; GFX7LESS-LABEL: add_i64_constant:
; GFX7LESS: ; %bb.0: ; %entry
; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec
; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0
; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s5, v0
; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX7LESS-NEXT: s_cbranch_execz .LBB4_2
; GFX7LESS-NEXT: ; %bb.1:
; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
; GFX7LESS-NEXT: s_mul_i32 s4, s4, 5
; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0
; GFX7LESS-NEXT: v_mov_b32_e32 v0, s4
; GFX7LESS-NEXT: s_mov_b32 m0, -1
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1]
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: .LBB4_2:
; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v0
; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1
; GFX7LESS-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2
; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v2
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4
; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s2, v0
; GFX7LESS-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc
; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX7LESS-NEXT: s_endpgm
;
; GFX8-LABEL: add_i64_constant:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: s_mov_b64 s[4:5], exec
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX8-NEXT: s_cbranch_execz .LBB4_2
; GFX8-NEXT: ; %bb.1:
; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
; GFX8-NEXT: s_mul_i32 s4, s4, 5
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NEXT: s_mov_b32 m0, -1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1]
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: .LBB4_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s2, v0
; GFX8-NEXT: v_readfirstlane_b32 s3, v1
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v2, 5, v[0:1]
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
; GFX8-NEXT: s_nop 2
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: add_i64_constant:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_mov_b64 s[4:5], exec
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT: s_cbranch_execz .LBB4_2
; GFX9-NEXT: ; %bb.1:
; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
; GFX9-NEXT: s_mul_i32 s4, s4, 5
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: .LBB4_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s2, v0
; GFX9-NEXT: v_readfirstlane_b32 s3, v1
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v2, 5,
v[0:1] 826; GFX9-NEXT: s_mov_b32 s3, 0xf000 827; GFX9-NEXT: s_mov_b32 s2, -1 828; GFX9-NEXT: s_nop 2 829; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 830; GFX9-NEXT: s_endpgm 831; 832; GFX1064-LABEL: add_i64_constant: 833; GFX1064: ; %bb.0: ; %entry 834; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 835; GFX1064-NEXT: s_mov_b64 s[4:5], exec 836; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 837; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 838; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 839; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 840; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc 841; GFX1064-NEXT: s_cbranch_execz .LBB4_2 842; GFX1064-NEXT: ; %bb.1: 843; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 844; GFX1064-NEXT: v_mov_b32_e32 v1, 0 845; GFX1064-NEXT: s_mul_i32 s4, s4, 5 846; GFX1064-NEXT: v_mov_b32_e32 v0, s4 847; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 848; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 849; GFX1064-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] 850; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 851; GFX1064-NEXT: buffer_gl0_inv 852; GFX1064-NEXT: .LBB4_2: 853; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 854; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] 855; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 856; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 857; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v2, 5, s[2:3] 858; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 859; GFX1064-NEXT: s_mov_b32 s2, -1 860; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 861; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 862; GFX1064-NEXT: s_endpgm 863; 864; GFX1032-LABEL: add_i64_constant: 865; GFX1032: ; %bb.0: ; %entry 866; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 867; GFX1032-NEXT: s_mov_b32 s3, exec_lo 868; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 869; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s3, 0 870; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 871; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo 872; GFX1032-NEXT: s_cbranch_execz .LBB4_2 873; GFX1032-NEXT: ; %bb.1: 874; GFX1032-NEXT: 
s_bcnt1_i32_b32 s3, s3 875; GFX1032-NEXT: v_mov_b32_e32 v1, 0 876; GFX1032-NEXT: s_mul_i32 s3, s3, 5 877; GFX1032-NEXT: v_mov_b32_e32 v0, s3 878; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 879; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 880; GFX1032-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] 881; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 882; GFX1032-NEXT: buffer_gl0_inv 883; GFX1032-NEXT: .LBB4_2: 884; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 885; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 886; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 887; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 888; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s2, v2, 5, s[2:3] 889; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 890; GFX1032-NEXT: s_mov_b32 s2, -1 891; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 892; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 893; GFX1032-NEXT: s_endpgm 894entry: 895 %old = atomicrmw add i64 addrspace(3)* @local_var64, i64 5 acq_rel 896 store i64 %old, i64 addrspace(1)* %out 897 ret void 898} 899 900define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 %additive) { 901; 902; 903; GFX7LESS-LABEL: add_i64_uniform: 904; GFX7LESS: ; %bb.0: ; %entry 905; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec 906; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 907; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 908; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s7, v0 909; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 910; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 911; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc 912; GFX7LESS-NEXT: s_cbranch_execz .LBB5_2 913; GFX7LESS-NEXT: ; %bb.1: 914; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 915; GFX7LESS-NEXT: v_mov_b32_e32 v3, 0 916; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 917; GFX7LESS-NEXT: s_mul_i32 s7, s3, s6 918; GFX7LESS-NEXT: v_mov_b32_e32 v0, s6 919; GFX7LESS-NEXT: v_mul_hi_u32 v0, s2, v0 920; GFX7LESS-NEXT: s_mul_i32 s6, s2, s6 921; GFX7LESS-NEXT: v_add_i32_e32 v1, vcc, s7, v0 922; GFX7LESS-NEXT: v_mov_b32_e32 v0, s6 923; GFX7LESS-NEXT: s_mov_b32 
m0, -1 924; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 925; GFX7LESS-NEXT: ds_add_rtn_u64 v[0:1], v3, v[0:1] 926; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 927; GFX7LESS-NEXT: .LBB5_2: 928; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] 929; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 930; GFX7LESS-NEXT: s_mov_b32 s6, -1 931; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 932; GFX7LESS-NEXT: s_mov_b32 s4, s0 933; GFX7LESS-NEXT: s_mov_b32 s5, s1 934; GFX7LESS-NEXT: v_readfirstlane_b32 s0, v0 935; GFX7LESS-NEXT: v_readfirstlane_b32 s1, v1 936; GFX7LESS-NEXT: v_mul_lo_u32 v0, s3, v2 937; GFX7LESS-NEXT: v_mul_hi_u32 v1, s2, v2 938; GFX7LESS-NEXT: v_mul_lo_u32 v2, s2, v2 939; GFX7LESS-NEXT: v_add_i32_e32 v1, vcc, v1, v0 940; GFX7LESS-NEXT: v_mov_b32_e32 v3, s1 941; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s0, v2 942; GFX7LESS-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc 943; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 944; GFX7LESS-NEXT: s_endpgm 945; 946; GFX8-LABEL: add_i64_uniform: 947; GFX8: ; %bb.0: ; %entry 948; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 949; GFX8-NEXT: s_mov_b64 s[6:7], exec 950; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 951; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 952; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 953; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 954; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 955; GFX8-NEXT: s_cbranch_execz .LBB5_2 956; GFX8-NEXT: ; %bb.1: 957; GFX8-NEXT: s_bcnt1_i32_b64 s8, s[6:7] 958; GFX8-NEXT: v_mov_b32_e32 v0, s8 959; GFX8-NEXT: s_waitcnt lgkmcnt(0) 960; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[6:7], s2, v0, 0 961; GFX8-NEXT: s_mul_i32 s6, s3, s8 962; GFX8-NEXT: v_mov_b32_e32 v3, 0 963; GFX8-NEXT: v_add_u32_e32 v1, vcc, s6, v1 964; GFX8-NEXT: s_mov_b32 m0, -1 965; GFX8-NEXT: s_waitcnt lgkmcnt(0) 966; GFX8-NEXT: ds_add_rtn_u64 v[0:1], v3, v[0:1] 967; GFX8-NEXT: s_waitcnt lgkmcnt(0) 968; GFX8-NEXT: .LBB5_2: 969; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 970; GFX8-NEXT: s_waitcnt lgkmcnt(0) 971; GFX8-NEXT: s_mov_b32 s4, s0 972; GFX8-NEXT: s_mov_b32 s5, 
s1 973; GFX8-NEXT: v_mul_lo_u32 v4, s3, v2 974; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v2, 0 975; GFX8-NEXT: v_readfirstlane_b32 s0, v0 976; GFX8-NEXT: v_readfirstlane_b32 s1, v1 977; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v4 978; GFX8-NEXT: v_mov_b32_e32 v3, s1 979; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 980; GFX8-NEXT: s_mov_b32 s7, 0xf000 981; GFX8-NEXT: s_mov_b32 s6, -1 982; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc 983; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 984; GFX8-NEXT: s_endpgm 985; 986; GFX9-LABEL: add_i64_uniform: 987; GFX9: ; %bb.0: ; %entry 988; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 989; GFX9-NEXT: s_mov_b64 s[6:7], exec 990; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 991; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 992; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 993; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 994; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 995; GFX9-NEXT: s_cbranch_execz .LBB5_2 996; GFX9-NEXT: ; %bb.1: 997; GFX9-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 998; GFX9-NEXT: s_waitcnt lgkmcnt(0) 999; GFX9-NEXT: s_mul_i32 s7, s3, s6 1000; GFX9-NEXT: s_mul_hi_u32 s8, s2, s6 1001; GFX9-NEXT: s_add_i32 s8, s8, s7 1002; GFX9-NEXT: s_mul_i32 s6, s2, s6 1003; GFX9-NEXT: v_mov_b32_e32 v0, s6 1004; GFX9-NEXT: v_mov_b32_e32 v1, s8 1005; GFX9-NEXT: v_mov_b32_e32 v3, 0 1006; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1007; GFX9-NEXT: ds_add_rtn_u64 v[0:1], v3, v[0:1] 1008; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1009; GFX9-NEXT: .LBB5_2: 1010; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 1011; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1012; GFX9-NEXT: v_mul_lo_u32 v4, s3, v2 1013; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[2:3], s2, v2, 0 1014; GFX9-NEXT: s_mov_b32 s4, s0 1015; GFX9-NEXT: s_mov_b32 s5, s1 1016; GFX9-NEXT: v_readfirstlane_b32 s0, v0 1017; GFX9-NEXT: v_readfirstlane_b32 s1, v1 1018; GFX9-NEXT: v_add_u32_e32 v1, v3, v4 1019; GFX9-NEXT: v_mov_b32_e32 v3, s1 1020; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 1021; GFX9-NEXT: s_mov_b32 s7, 0xf000 1022; GFX9-NEXT: 
s_mov_b32 s6, -1 1023; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc 1024; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1025; GFX9-NEXT: s_endpgm 1026; 1027; GFX1064-LABEL: add_i64_uniform: 1028; GFX1064: ; %bb.0: ; %entry 1029; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1030; GFX1064-NEXT: s_mov_b64 s[6:7], exec 1031; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 1032; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 1033; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 1034; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 1035; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 1036; GFX1064-NEXT: s_cbranch_execz .LBB5_2 1037; GFX1064-NEXT: ; %bb.1: 1038; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 1039; GFX1064-NEXT: v_mov_b32_e32 v3, 0 1040; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1041; GFX1064-NEXT: s_mul_i32 s7, s3, s6 1042; GFX1064-NEXT: s_mul_hi_u32 s8, s2, s6 1043; GFX1064-NEXT: s_mul_i32 s6, s2, s6 1044; GFX1064-NEXT: s_add_i32 s8, s8, s7 1045; GFX1064-NEXT: v_mov_b32_e32 v0, s6 1046; GFX1064-NEXT: v_mov_b32_e32 v1, s8 1047; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1048; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 1049; GFX1064-NEXT: ds_add_rtn_u64 v[0:1], v3, v[0:1] 1050; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1051; GFX1064-NEXT: buffer_gl0_inv 1052; GFX1064-NEXT: .LBB5_2: 1053; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 1054; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 1055; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1056; GFX1064-NEXT: v_mul_lo_u32 v4, s3, v2 1057; GFX1064-NEXT: v_mad_u64_u32 v[2:3], s[2:3], s2, v2, 0 1058; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 1059; GFX1064-NEXT: v_readfirstlane_b32 s4, v1 1060; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 1061; GFX1064-NEXT: v_add_nc_u32_e32 v1, v3, v4 1062; GFX1064-NEXT: v_add_co_u32 v0, vcc, s2, v2 1063; GFX1064-NEXT: s_mov_b32 s2, -1 1064; GFX1064-NEXT: v_add_co_ci_u32_e32 v1, vcc, s4, v1, vcc 1065; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1066; GFX1064-NEXT: s_endpgm 1067; 1068; GFX1032-LABEL: add_i64_uniform: 
1069; GFX1032: ; %bb.0: ; %entry 1070; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1071; GFX1032-NEXT: s_mov_b32 s5, exec_lo 1072; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 1073; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0 1074; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 1075; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo 1076; GFX1032-NEXT: s_cbranch_execz .LBB5_2 1077; GFX1032-NEXT: ; %bb.1: 1078; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s5 1079; GFX1032-NEXT: v_mov_b32_e32 v3, 0 1080; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1081; GFX1032-NEXT: s_mul_i32 s6, s3, s5 1082; GFX1032-NEXT: s_mul_hi_u32 s7, s2, s5 1083; GFX1032-NEXT: s_mul_i32 s5, s2, s5 1084; GFX1032-NEXT: s_add_i32 s7, s7, s6 1085; GFX1032-NEXT: v_mov_b32_e32 v0, s5 1086; GFX1032-NEXT: v_mov_b32_e32 v1, s7 1087; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1088; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 1089; GFX1032-NEXT: ds_add_rtn_u64 v[0:1], v3, v[0:1] 1090; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1091; GFX1032-NEXT: buffer_gl0_inv 1092; GFX1032-NEXT: .LBB5_2: 1093; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 1094; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 1095; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1096; GFX1032-NEXT: v_mul_lo_u32 v4, s3, v2 1097; GFX1032-NEXT: v_mad_u64_u32 v[2:3], s2, s2, v2, 0 1098; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 1099; GFX1032-NEXT: v_readfirstlane_b32 s4, v1 1100; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 1101; GFX1032-NEXT: v_add_nc_u32_e32 v1, v3, v4 1102; GFX1032-NEXT: v_add_co_u32 v0, vcc_lo, s2, v2 1103; GFX1032-NEXT: s_mov_b32 s2, -1 1104; GFX1032-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s4, v1, vcc_lo 1105; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1106; GFX1032-NEXT: s_endpgm 1107entry: 1108 %old = atomicrmw add i64 addrspace(3)* @local_var64, i64 %additive acq_rel 1109 store i64 %old, i64 addrspace(1)* %out 1110 ret void 1111} 1112 1113define amdgpu_kernel void @add_i64_varying(i64 addrspace(1)* %out) { 1114; 1115; 1116; GFX7LESS-LABEL: add_i64_varying: 1117; 
GFX7LESS: ; %bb.0: ; %entry 1118; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1119; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 1120; GFX7LESS-NEXT: s_mov_b32 m0, -1 1121; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1122; GFX7LESS-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] 1123; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1124; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 1125; GFX7LESS-NEXT: s_mov_b32 s2, -1 1126; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1127; GFX7LESS-NEXT: s_endpgm 1128; 1129; GFX8-LABEL: add_i64_varying: 1130; GFX8: ; %bb.0: ; %entry 1131; GFX8-NEXT: v_mov_b32_e32 v1, 0 1132; GFX8-NEXT: s_mov_b32 m0, -1 1133; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1134; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1135; GFX8-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] 1136; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1137; GFX8-NEXT: s_mov_b32 s3, 0xf000 1138; GFX8-NEXT: s_mov_b32 s2, -1 1139; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1140; GFX8-NEXT: s_endpgm 1141; 1142; GFX9-LABEL: add_i64_varying: 1143; GFX9: ; %bb.0: ; %entry 1144; GFX9-NEXT: v_mov_b32_e32 v1, 0 1145; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1146; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1147; GFX9-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] 1148; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1149; GFX9-NEXT: s_mov_b32 s3, 0xf000 1150; GFX9-NEXT: s_mov_b32 s2, -1 1151; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1152; GFX9-NEXT: s_endpgm 1153; 1154; GFX10-LABEL: add_i64_varying: 1155; GFX10: ; %bb.0: ; %entry 1156; GFX10-NEXT: v_mov_b32_e32 v1, 0 1157; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1158; GFX10-NEXT: s_mov_b32 s3, 0x31016000 1159; GFX10-NEXT: s_mov_b32 s2, -1 1160; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1161; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1162; GFX10-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] 1163; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1164; GFX10-NEXT: buffer_gl0_inv 1165; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1166; GFX10-NEXT: s_endpgm 1167entry: 1168 %lane = call i32 
@llvm.amdgcn.workitem.id.x() 1169 %zext = zext i32 %lane to i64 1170 %old = atomicrmw add i64 addrspace(3)* @local_var64, i64 %zext acq_rel 1171 store i64 %old, i64 addrspace(1)* %out 1172 ret void 1173} 1174 1175define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out) { 1176; 1177; 1178; GFX7LESS-LABEL: sub_i32_constant: 1179; GFX7LESS: ; %bb.0: ; %entry 1180; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec 1181; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1182; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 1183; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 1184; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1185; GFX7LESS-NEXT: ; implicit-def: $vgpr1 1186; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc 1187; GFX7LESS-NEXT: s_cbranch_execz .LBB7_2 1188; GFX7LESS-NEXT: ; %bb.1: 1189; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1190; GFX7LESS-NEXT: s_mul_i32 s2, s2, 5 1191; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 1192; GFX7LESS-NEXT: v_mov_b32_e32 v2, s2 1193; GFX7LESS-NEXT: s_mov_b32 m0, -1 1194; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1195; GFX7LESS-NEXT: ds_sub_rtn_u32 v1, v1, v2 1196; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1197; GFX7LESS-NEXT: .LBB7_2: 1198; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] 1199; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1200; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v1 1201; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1202; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 1203; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 1204; GFX7LESS-NEXT: s_mov_b32 s2, -1 1205; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 1206; GFX7LESS-NEXT: s_endpgm 1207; 1208; GFX8-LABEL: sub_i32_constant: 1209; GFX8: ; %bb.0: ; %entry 1210; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1211; GFX8-NEXT: s_mov_b64 s[2:3], exec 1212; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1213; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 1214; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1215; GFX8-NEXT: ; implicit-def: $vgpr1 1216; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 1217; GFX8-NEXT: 
s_cbranch_execz .LBB7_2 1218; GFX8-NEXT: ; %bb.1: 1219; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1220; GFX8-NEXT: s_mul_i32 s2, s2, 5 1221; GFX8-NEXT: v_mov_b32_e32 v1, 0 1222; GFX8-NEXT: v_mov_b32_e32 v2, s2 1223; GFX8-NEXT: s_mov_b32 m0, -1 1224; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1225; GFX8-NEXT: ds_sub_rtn_u32 v1, v1, v2 1226; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1227; GFX8-NEXT: .LBB7_2: 1228; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 1229; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1230; GFX8-NEXT: v_readfirstlane_b32 s2, v1 1231; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1232; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s2, v0 1233; GFX8-NEXT: s_mov_b32 s3, 0xf000 1234; GFX8-NEXT: s_mov_b32 s2, -1 1235; GFX8-NEXT: s_nop 0 1236; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 1237; GFX8-NEXT: s_endpgm 1238; 1239; GFX9-LABEL: sub_i32_constant: 1240; GFX9: ; %bb.0: ; %entry 1241; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1242; GFX9-NEXT: s_mov_b64 s[2:3], exec 1243; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1244; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 1245; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1246; GFX9-NEXT: ; implicit-def: $vgpr1 1247; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 1248; GFX9-NEXT: s_cbranch_execz .LBB7_2 1249; GFX9-NEXT: ; %bb.1: 1250; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1251; GFX9-NEXT: s_mul_i32 s2, s2, 5 1252; GFX9-NEXT: v_mov_b32_e32 v1, 0 1253; GFX9-NEXT: v_mov_b32_e32 v2, s2 1254; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1255; GFX9-NEXT: ds_sub_rtn_u32 v1, v1, v2 1256; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1257; GFX9-NEXT: .LBB7_2: 1258; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 1259; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1260; GFX9-NEXT: v_readfirstlane_b32 s2, v1 1261; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1262; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 1263; GFX9-NEXT: s_mov_b32 s3, 0xf000 1264; GFX9-NEXT: s_mov_b32 s2, -1 1265; GFX9-NEXT: s_nop 0 1266; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 1267; GFX9-NEXT: s_endpgm 1268; 1269; GFX1064-LABEL: sub_i32_constant: 1270; GFX1064: ; 
%bb.0: ; %entry 1271; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1272; GFX1064-NEXT: s_mov_b64 s[2:3], exec 1273; GFX1064-NEXT: ; implicit-def: $vgpr1 1274; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1275; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 1276; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1277; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 1278; GFX1064-NEXT: s_cbranch_execz .LBB7_2 1279; GFX1064-NEXT: ; %bb.1: 1280; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1281; GFX1064-NEXT: v_mov_b32_e32 v1, 0 1282; GFX1064-NEXT: s_mul_i32 s2, s2, 5 1283; GFX1064-NEXT: v_mov_b32_e32 v2, s2 1284; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1285; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 1286; GFX1064-NEXT: ds_sub_rtn_u32 v1, v1, v2 1287; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1288; GFX1064-NEXT: buffer_gl0_inv 1289; GFX1064-NEXT: .LBB7_2: 1290; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 1291; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 1292; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 1293; GFX1064-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1294; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 1295; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s2, v0 1296; GFX1064-NEXT: s_mov_b32 s2, -1 1297; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1298; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 1299; GFX1064-NEXT: s_endpgm 1300; 1301; GFX1032-LABEL: sub_i32_constant: 1302; GFX1032: ; %bb.0: ; %entry 1303; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1304; GFX1032-NEXT: s_mov_b32 s3, exec_lo 1305; GFX1032-NEXT: ; implicit-def: $vgpr1 1306; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 1307; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 1308; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo 1309; GFX1032-NEXT: s_cbranch_execz .LBB7_2 1310; GFX1032-NEXT: ; %bb.1: 1311; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 1312; GFX1032-NEXT: v_mov_b32_e32 v1, 0 1313; GFX1032-NEXT: s_mul_i32 s3, s3, 5 1314; GFX1032-NEXT: v_mov_b32_e32 v2, s3 1315; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1316; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 1317; 
GFX1032-NEXT: ds_sub_rtn_u32 v1, v1, v2 1318; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1319; GFX1032-NEXT: buffer_gl0_inv 1320; GFX1032-NEXT: .LBB7_2: 1321; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 1322; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 1323; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 1324; GFX1032-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1325; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 1326; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s2, v0 1327; GFX1032-NEXT: s_mov_b32 s2, -1 1328; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1329; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 1330; GFX1032-NEXT: s_endpgm 1331entry: 1332 %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 5 acq_rel 1333 store i32 %old, i32 addrspace(1)* %out 1334 ret void 1335} 1336 1337define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 %subitive) { 1338; 1339; 1340; GFX7LESS-LABEL: sub_i32_uniform: 1341; GFX7LESS: ; %bb.0: ; %entry 1342; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec 1343; GFX7LESS-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 1344; GFX7LESS-NEXT: s_load_dword s6, s[0:1], 0xb 1345; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 1346; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 1347; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1348; GFX7LESS-NEXT: ; implicit-def: $vgpr1 1349; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc 1350; GFX7LESS-NEXT: s_cbranch_execz .LBB8_2 1351; GFX7LESS-NEXT: ; %bb.1: 1352; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1353; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1354; GFX7LESS-NEXT: s_mul_i32 s2, s6, s2 1355; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 1356; GFX7LESS-NEXT: v_mov_b32_e32 v2, s2 1357; GFX7LESS-NEXT: s_mov_b32 m0, -1 1358; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1359; GFX7LESS-NEXT: ds_sub_rtn_u32 v1, v1, v2 1360; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1361; GFX7LESS-NEXT: .LBB8_2: 1362; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1] 1363; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1364; GFX7LESS-NEXT: v_readfirstlane_b32 s0, v1 1365; GFX7LESS-NEXT: v_mul_lo_u32 v0, s6, v0 1366; 
GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 1367; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 1368; GFX7LESS-NEXT: s_mov_b32 s6, -1 1369; GFX7LESS-NEXT: buffer_store_dword v0, off, s[4:7], 0 1370; GFX7LESS-NEXT: s_endpgm 1371; 1372; GFX8-LABEL: sub_i32_uniform: 1373; GFX8: ; %bb.0: ; %entry 1374; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 1375; GFX8-NEXT: s_load_dword s6, s[0:1], 0x2c 1376; GFX8-NEXT: s_mov_b64 s[2:3], exec 1377; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1378; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 1379; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1380; GFX8-NEXT: ; implicit-def: $vgpr1 1381; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc 1382; GFX8-NEXT: s_cbranch_execz .LBB8_2 1383; GFX8-NEXT: ; %bb.1: 1384; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1385; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1386; GFX8-NEXT: s_mul_i32 s2, s6, s2 1387; GFX8-NEXT: v_mov_b32_e32 v1, 0 1388; GFX8-NEXT: v_mov_b32_e32 v2, s2 1389; GFX8-NEXT: s_mov_b32 m0, -1 1390; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1391; GFX8-NEXT: ds_sub_rtn_u32 v1, v1, v2 1392; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1393; GFX8-NEXT: .LBB8_2: 1394; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] 1395; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1396; GFX8-NEXT: v_mul_lo_u32 v0, s6, v0 1397; GFX8-NEXT: v_readfirstlane_b32 s0, v1 1398; GFX8-NEXT: s_mov_b32 s7, 0xf000 1399; GFX8-NEXT: s_mov_b32 s6, -1 1400; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s0, v0 1401; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0 1402; GFX8-NEXT: s_endpgm 1403; 1404; GFX9-LABEL: sub_i32_uniform: 1405; GFX9: ; %bb.0: ; %entry 1406; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 1407; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c 1408; GFX9-NEXT: s_mov_b64 s[2:3], exec 1409; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1410; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 1411; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1412; GFX9-NEXT: ; implicit-def: $vgpr1 1413; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc 1414; GFX9-NEXT: s_cbranch_execz .LBB8_2 1415; GFX9-NEXT: ; %bb.1: 1416; GFX9-NEXT: s_bcnt1_i32_b64 
s2, s[2:3] 1417; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1418; GFX9-NEXT: s_mul_i32 s2, s6, s2 1419; GFX9-NEXT: v_mov_b32_e32 v1, 0 1420; GFX9-NEXT: v_mov_b32_e32 v2, s2 1421; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1422; GFX9-NEXT: ds_sub_rtn_u32 v1, v1, v2 1423; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1424; GFX9-NEXT: .LBB8_2: 1425; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] 1426; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1427; GFX9-NEXT: v_mul_lo_u32 v0, s6, v0 1428; GFX9-NEXT: v_readfirstlane_b32 s0, v1 1429; GFX9-NEXT: s_mov_b32 s7, 0xf000 1430; GFX9-NEXT: s_mov_b32 s6, -1 1431; GFX9-NEXT: v_sub_u32_e32 v0, s0, v0 1432; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 1433; GFX9-NEXT: s_endpgm 1434; 1435; GFX1064-LABEL: sub_i32_uniform: 1436; GFX1064: ; %bb.0: ; %entry 1437; GFX1064-NEXT: s_clause 0x1 1438; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 1439; GFX1064-NEXT: s_load_dword s6, s[0:1], 0x2c 1440; GFX1064-NEXT: s_mov_b64 s[2:3], exec 1441; GFX1064-NEXT: ; implicit-def: $vgpr1 1442; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1443; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 1444; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1445; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc 1446; GFX1064-NEXT: s_cbranch_execz .LBB8_2 1447; GFX1064-NEXT: ; %bb.1: 1448; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1449; GFX1064-NEXT: v_mov_b32_e32 v1, 0 1450; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1451; GFX1064-NEXT: s_mul_i32 s2, s6, s2 1452; GFX1064-NEXT: v_mov_b32_e32 v2, s2 1453; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1454; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 1455; GFX1064-NEXT: ds_sub_rtn_u32 v1, v1, v2 1456; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1457; GFX1064-NEXT: buffer_gl0_inv 1458; GFX1064-NEXT: .LBB8_2: 1459; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 1460; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] 1461; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1462; GFX1064-NEXT: v_mul_lo_u32 v0, s6, v0 1463; GFX1064-NEXT: v_readfirstlane_b32 s0, v1 1464; GFX1064-NEXT: s_mov_b32 s7, 0x31016000 1465; GFX1064-NEXT: s_mov_b32 
s6, -1 1466; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s0, v0 1467; GFX1064-NEXT: buffer_store_dword v0, off, s[4:7], 0 1468; GFX1064-NEXT: s_endpgm 1469; 1470; GFX1032-LABEL: sub_i32_uniform: 1471; GFX1032: ; %bb.0: ; %entry 1472; GFX1032-NEXT: s_clause 0x1 1473; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 1474; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x2c 1475; GFX1032-NEXT: s_mov_b32 s3, exec_lo 1476; GFX1032-NEXT: ; implicit-def: $vgpr1 1477; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 1478; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 1479; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo 1480; GFX1032-NEXT: s_cbranch_execz .LBB8_2 1481; GFX1032-NEXT: ; %bb.1: 1482; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s3 1483; GFX1032-NEXT: v_mov_b32_e32 v1, 0 1484; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1485; GFX1032-NEXT: s_mul_i32 s1, s2, s1 1486; GFX1032-NEXT: v_mov_b32_e32 v2, s1 1487; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1488; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 1489; GFX1032-NEXT: ds_sub_rtn_u32 v1, v1, v2 1490; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1491; GFX1032-NEXT: buffer_gl0_inv 1492; GFX1032-NEXT: .LBB8_2: 1493; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 1494; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 1495; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1496; GFX1032-NEXT: v_mul_lo_u32 v0, s2, v0 1497; GFX1032-NEXT: v_readfirstlane_b32 s0, v1 1498; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 1499; GFX1032-NEXT: s_mov_b32 s6, -1 1500; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s0, v0 1501; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0 1502; GFX1032-NEXT: s_endpgm 1503entry: 1504 %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 %subitive acq_rel 1505 store i32 %old, i32 addrspace(1)* %out 1506 ret void 1507} 1508 1509define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) { 1510; 1511; 1512; GFX7LESS-LABEL: sub_i32_varying: 1513; GFX7LESS: ; %bb.0: ; %entry 1514; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1515; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 1516; GFX7LESS-NEXT: 
s_mov_b32 m0, -1 1517; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1518; GFX7LESS-NEXT: ds_sub_rtn_u32 v0, v1, v0 1519; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1520; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 1521; GFX7LESS-NEXT: s_mov_b32 s2, -1 1522; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 1523; GFX7LESS-NEXT: s_endpgm 1524; 1525; GFX8-LABEL: sub_i32_varying: 1526; GFX8: ; %bb.0: ; %entry 1527; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1528; GFX8-NEXT: v_mov_b32_e32 v2, v0 1529; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 1530; GFX8-NEXT: v_mov_b32_e32 v1, 0 1531; GFX8-NEXT: s_mov_b64 exec, s[2:3] 1532; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 1533; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 1534; GFX8-NEXT: s_not_b64 exec, exec 1535; GFX8-NEXT: v_mov_b32_e32 v2, 0 1536; GFX8-NEXT: s_not_b64 exec, exec 1537; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 1538; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 1539; GFX8-NEXT: s_nop 1 1540; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 1541; GFX8-NEXT: s_nop 1 1542; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 1543; GFX8-NEXT: s_nop 1 1544; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 1545; GFX8-NEXT: s_nop 1 1546; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 1547; GFX8-NEXT: s_nop 1 1548; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 1549; GFX8-NEXT: v_readlane_b32 s4, v2, 63 1550; GFX8-NEXT: s_nop 0 1551; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 1552; GFX8-NEXT: s_mov_b64 exec, s[2:3] 1553; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1554; GFX8-NEXT: ; implicit-def: $vgpr0 1555; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 1556; GFX8-NEXT: s_cbranch_execz .LBB9_2 1557; GFX8-NEXT: ; %bb.1: 1558; GFX8-NEXT: v_mov_b32_e32 v0, 0 1559; GFX8-NEXT: 
v_mov_b32_e32 v3, s4 1560; GFX8-NEXT: s_mov_b32 m0, -1 1561; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1562; GFX8-NEXT: ds_sub_rtn_u32 v0, v0, v3 1563; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1564; GFX8-NEXT: .LBB9_2: 1565; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 1566; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1567; GFX8-NEXT: v_readfirstlane_b32 s2, v0 1568; GFX8-NEXT: v_mov_b32_e32 v0, v1 1569; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s2, v0 1570; GFX8-NEXT: s_mov_b32 s3, 0xf000 1571; GFX8-NEXT: s_mov_b32 s2, -1 1572; GFX8-NEXT: s_nop 0 1573; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 1574; GFX8-NEXT: s_endpgm 1575; 1576; GFX9-LABEL: sub_i32_varying: 1577; GFX9: ; %bb.0: ; %entry 1578; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1579; GFX9-NEXT: v_mov_b32_e32 v2, v0 1580; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 1581; GFX9-NEXT: v_mov_b32_e32 v1, 0 1582; GFX9-NEXT: s_mov_b64 exec, s[2:3] 1583; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 1584; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 1585; GFX9-NEXT: s_not_b64 exec, exec 1586; GFX9-NEXT: v_mov_b32_e32 v2, 0 1587; GFX9-NEXT: s_not_b64 exec, exec 1588; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 1589; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 1590; GFX9-NEXT: s_nop 1 1591; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 1592; GFX9-NEXT: s_nop 1 1593; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 1594; GFX9-NEXT: s_nop 1 1595; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 1596; GFX9-NEXT: s_nop 1 1597; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 1598; GFX9-NEXT: s_nop 1 1599; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 1600; GFX9-NEXT: v_readlane_b32 s4, v2, 63 1601; GFX9-NEXT: s_nop 0 1602; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 1603; GFX9-NEXT: s_mov_b64 exec, s[2:3] 1604; GFX9-NEXT: 
v_cmp_eq_u32_e32 vcc, 0, v0 1605; GFX9-NEXT: ; implicit-def: $vgpr0 1606; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 1607; GFX9-NEXT: s_cbranch_execz .LBB9_2 1608; GFX9-NEXT: ; %bb.1: 1609; GFX9-NEXT: v_mov_b32_e32 v0, 0 1610; GFX9-NEXT: v_mov_b32_e32 v3, s4 1611; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1612; GFX9-NEXT: ds_sub_rtn_u32 v0, v0, v3 1613; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1614; GFX9-NEXT: .LBB9_2: 1615; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 1616; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1617; GFX9-NEXT: v_readfirstlane_b32 s2, v0 1618; GFX9-NEXT: v_mov_b32_e32 v0, v1 1619; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 1620; GFX9-NEXT: s_mov_b32 s3, 0xf000 1621; GFX9-NEXT: s_mov_b32 s2, -1 1622; GFX9-NEXT: s_nop 0 1623; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 1624; GFX9-NEXT: s_endpgm 1625; 1626; GFX1064-LABEL: sub_i32_varying: 1627; GFX1064: ; %bb.0: ; %entry 1628; GFX1064-NEXT: v_mov_b32_e32 v1, v0 1629; GFX1064-NEXT: s_not_b64 exec, exec 1630; GFX1064-NEXT: v_mov_b32_e32 v1, 0 1631; GFX1064-NEXT: s_not_b64 exec, exec 1632; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 1633; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 1634; GFX1064-NEXT: v_mov_b32_e32 v3, 0 1635; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 1636; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 1637; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 1638; GFX1064-NEXT: v_mov_b32_e32 v2, v1 1639; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 1640; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 1641; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 1642; GFX1064-NEXT: v_mov_b32_e32 v2, s4 1643; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 1644; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 1645; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf 
bank_mask:0xf 1646; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 1647; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1648; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 1649; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 1650; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 1651; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 1652; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 1653; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 1654; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 1655; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 1656; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 1657; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 1658; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 1659; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 1660; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 1661; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 1662; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1663; GFX1064-NEXT: s_mov_b32 s2, -1 1664; GFX1064-NEXT: ; implicit-def: $vgpr0 1665; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 1666; GFX1064-NEXT: s_cbranch_execz .LBB9_2 1667; GFX1064-NEXT: ; %bb.1: 1668; GFX1064-NEXT: v_mov_b32_e32 v0, 0 1669; GFX1064-NEXT: v_mov_b32_e32 v4, s7 1670; GFX1064-NEXT: s_mov_b32 s3, s7 1671; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1672; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 1673; GFX1064-NEXT: ds_sub_rtn_u32 v0, v0, v4 1674; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1675; GFX1064-NEXT: buffer_gl0_inv 1676; GFX1064-NEXT: .LBB9_2: 1677; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 1678; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 1679; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 1680; GFX1064-NEXT: v_mov_b32_e32 v0, v3 1681; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s3, v0 1682; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 1683; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1684; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 1685; GFX1064-NEXT: s_endpgm 1686; 1687; GFX1032-LABEL: sub_i32_varying: 1688; GFX1032: ; %bb.0: ; %entry 1689; GFX1032-NEXT: v_mov_b32_e32 v1, v0 1690; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 1691; GFX1032-NEXT: v_mov_b32_e32 v1, 0 
1692; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 1693; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 1694; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 1695; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 1696; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 1697; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 1698; GFX1032-NEXT: v_mov_b32_e32 v2, v1 1699; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 1700; GFX1032-NEXT: s_mov_b32 exec_lo, s2 1701; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1702; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 1703; GFX1032-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 1704; GFX1032-NEXT: v_mov_b32_e32 v3, 0 1705; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 1706; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 1707; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 1708; GFX1032-NEXT: s_mov_b32 exec_lo, s2 1709; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 1710; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 1711; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 1712; GFX1032-NEXT: s_mov_b32 exec_lo, s2 1713; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 1714; GFX1032-NEXT: s_mov_b32 s2, -1 1715; GFX1032-NEXT: ; implicit-def: $vgpr0 1716; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 1717; GFX1032-NEXT: s_cbranch_execz .LBB9_2 1718; GFX1032-NEXT: ; %bb.1: 1719; GFX1032-NEXT: v_mov_b32_e32 v0, 0 1720; GFX1032-NEXT: v_mov_b32_e32 v4, s4 1721; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1722; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 1723; GFX1032-NEXT: ds_sub_rtn_u32 v0, v0, v4 1724; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1725; GFX1032-NEXT: buffer_gl0_inv 1726; GFX1032-NEXT: .LBB9_2: 1727; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 1728; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 1729; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 1730; GFX1032-NEXT: 
v_mov_b32_e32 v0, v3 1731; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s3, v0 1732; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 1733; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1734; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 1735; GFX1032-NEXT: s_endpgm 1736entry: 1737 %lane = call i32 @llvm.amdgcn.workitem.id.x() 1738 %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 %lane acq_rel 1739 store i32 %old, i32 addrspace(1)* %out 1740 ret void 1741} 1742 1743define amdgpu_kernel void @sub_i32_varying_nouse() { 1744; GFX7LESS-LABEL: sub_i32_varying_nouse: 1745; GFX7LESS: ; %bb.0: ; %entry 1746; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 1747; GFX7LESS-NEXT: s_mov_b32 m0, -1 1748; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1749; GFX7LESS-NEXT: ds_sub_u32 v1, v0 1750; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1751; GFX7LESS-NEXT: s_endpgm 1752; 1753; GFX8-LABEL: sub_i32_varying_nouse: 1754; GFX8: ; %bb.0: ; %entry 1755; GFX8-NEXT: v_mov_b32_e32 v1, v0 1756; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 1757; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 1758; GFX8-NEXT: s_not_b64 exec, exec 1759; GFX8-NEXT: v_mov_b32_e32 v1, 0 1760; GFX8-NEXT: s_not_b64 exec, exec 1761; GFX8-NEXT: s_or_saveexec_b64 s[0:1], -1 1762; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 1763; GFX8-NEXT: s_nop 1 1764; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 1765; GFX8-NEXT: s_nop 1 1766; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 1767; GFX8-NEXT: s_nop 1 1768; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 1769; GFX8-NEXT: s_nop 1 1770; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf 1771; GFX8-NEXT: s_nop 1 1772; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf 1773; GFX8-NEXT: v_readlane_b32 s2, v1, 63 1774; GFX8-NEXT: s_mov_b64 exec, s[0:1] 1775; GFX8-NEXT: s_mov_b32 s0, s2 
1776; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1777; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 1778; GFX8-NEXT: s_cbranch_execz .LBB10_2 1779; GFX8-NEXT: ; %bb.1: 1780; GFX8-NEXT: v_mov_b32_e32 v0, 0 1781; GFX8-NEXT: v_mov_b32_e32 v2, s0 1782; GFX8-NEXT: s_mov_b32 m0, -1 1783; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1784; GFX8-NEXT: ds_sub_u32 v0, v2 1785; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1786; GFX8-NEXT: .LBB10_2: 1787; GFX8-NEXT: s_endpgm 1788; 1789; GFX9-LABEL: sub_i32_varying_nouse: 1790; GFX9: ; %bb.0: ; %entry 1791; GFX9-NEXT: v_mov_b32_e32 v1, v0 1792; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 1793; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 1794; GFX9-NEXT: s_not_b64 exec, exec 1795; GFX9-NEXT: v_mov_b32_e32 v1, 0 1796; GFX9-NEXT: s_not_b64 exec, exec 1797; GFX9-NEXT: s_or_saveexec_b64 s[0:1], -1 1798; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 1799; GFX9-NEXT: s_nop 1 1800; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 1801; GFX9-NEXT: s_nop 1 1802; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 1803; GFX9-NEXT: s_nop 1 1804; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 1805; GFX9-NEXT: s_nop 1 1806; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf 1807; GFX9-NEXT: s_nop 1 1808; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf 1809; GFX9-NEXT: v_readlane_b32 s2, v1, 63 1810; GFX9-NEXT: s_mov_b64 exec, s[0:1] 1811; GFX9-NEXT: s_mov_b32 s0, s2 1812; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1813; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 1814; GFX9-NEXT: s_cbranch_execz .LBB10_2 1815; GFX9-NEXT: ; %bb.1: 1816; GFX9-NEXT: v_mov_b32_e32 v0, 0 1817; GFX9-NEXT: v_mov_b32_e32 v2, s0 1818; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1819; GFX9-NEXT: ds_sub_u32 v0, v2 1820; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1821; GFX9-NEXT: .LBB10_2: 1822; GFX9-NEXT: s_endpgm 1823; 1824; 
GFX1064-LABEL: sub_i32_varying_nouse: 1825; GFX1064: ; %bb.0: ; %entry 1826; GFX1064-NEXT: v_mov_b32_e32 v1, v0 1827; GFX1064-NEXT: s_not_b64 exec, exec 1828; GFX1064-NEXT: v_mov_b32_e32 v1, 0 1829; GFX1064-NEXT: s_not_b64 exec, exec 1830; GFX1064-NEXT: s_or_saveexec_b64 s[0:1], -1 1831; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 1832; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 1833; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 1834; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 1835; GFX1064-NEXT: v_mov_b32_e32 v2, v1 1836; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 1837; GFX1064-NEXT: v_add_nc_u32_e32 v1, v1, v2 1838; GFX1064-NEXT: s_mov_b64 exec, s[0:1] 1839; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 1840; GFX1064-NEXT: s_or_saveexec_b64 s[0:1], -1 1841; GFX1064-NEXT: v_readlane_b32 s2, v1, 0 1842; GFX1064-NEXT: v_readlane_b32 s3, v1, 32 1843; GFX1064-NEXT: s_mov_b64 exec, s[0:1] 1844; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 1845; GFX1064-NEXT: s_add_i32 s0, s2, s3 1846; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1847; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc 1848; GFX1064-NEXT: s_cbranch_execz .LBB10_2 1849; GFX1064-NEXT: ; %bb.1: 1850; GFX1064-NEXT: v_mov_b32_e32 v0, 0 1851; GFX1064-NEXT: v_mov_b32_e32 v3, s0 1852; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1853; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 1854; GFX1064-NEXT: ds_sub_u32 v0, v3 1855; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1856; GFX1064-NEXT: buffer_gl0_inv 1857; GFX1064-NEXT: .LBB10_2: 1858; GFX1064-NEXT: s_endpgm 1859; 1860; GFX1032-LABEL: sub_i32_varying_nouse: 1861; GFX1032: ; %bb.0: ; %entry 1862; GFX1032-NEXT: v_mov_b32_e32 v1, v0 1863; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 1864; GFX1032-NEXT: v_mov_b32_e32 v1, 0 1865; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 1866; 
GFX1032-NEXT: s_or_saveexec_b32 s0, -1 1867; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 1868; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 1869; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 1870; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 1871; GFX1032-NEXT: v_mov_b32_e32 v2, v1 1872; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 1873; GFX1032-NEXT: v_add_nc_u32_e32 v1, v1, v2 1874; GFX1032-NEXT: s_mov_b32 exec_lo, s0 1875; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 1876; GFX1032-NEXT: v_mov_b32_e32 v0, v1 1877; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 1878; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo 1879; GFX1032-NEXT: s_cbranch_execz .LBB10_2 1880; GFX1032-NEXT: ; %bb.1: 1881; GFX1032-NEXT: v_mov_b32_e32 v3, 0 1882; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1883; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 1884; GFX1032-NEXT: ds_sub_u32 v3, v0 1885; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1886; GFX1032-NEXT: buffer_gl0_inv 1887; GFX1032-NEXT: .LBB10_2: 1888; GFX1032-NEXT: s_endpgm 1889entry: 1890 %lane = call i32 @llvm.amdgcn.workitem.id.x() 1891 %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 %lane acq_rel 1892 ret void 1893} 1894 1895define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) { 1896; 1897; 1898; GFX7LESS-LABEL: sub_i64_constant: 1899; GFX7LESS: ; %bb.0: ; %entry 1900; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec 1901; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1902; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 1903; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s5, v0 1904; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 1905; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 1906; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc 1907; GFX7LESS-NEXT: s_cbranch_execz .LBB11_2 1908; GFX7LESS-NEXT: ; %bb.1: 1909; GFX7LESS-NEXT: s_bcnt1_i32_b64 
s4, s[4:5] 1910; GFX7LESS-NEXT: s_mul_i32 s4, s4, 5 1911; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 1912; GFX7LESS-NEXT: v_mov_b32_e32 v0, s4 1913; GFX7LESS-NEXT: s_mov_b32 m0, -1 1914; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1915; GFX7LESS-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] 1916; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1917; GFX7LESS-NEXT: .LBB11_2: 1918; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] 1919; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1920; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v0 1921; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1 1922; GFX7LESS-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 1923; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v2 1924; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 1925; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4 1926; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 1927; GFX7LESS-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc 1928; GFX7LESS-NEXT: s_mov_b32 s2, -1 1929; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1930; GFX7LESS-NEXT: s_endpgm 1931; 1932; GFX8-LABEL: sub_i64_constant: 1933; GFX8: ; %bb.0: ; %entry 1934; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1935; GFX8-NEXT: s_mov_b64 s[4:5], exec 1936; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 1937; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 1938; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 1939; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 1940; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 1941; GFX8-NEXT: s_cbranch_execz .LBB11_2 1942; GFX8-NEXT: ; %bb.1: 1943; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 1944; GFX8-NEXT: s_mul_i32 s4, s4, 5 1945; GFX8-NEXT: v_mov_b32_e32 v0, s4 1946; GFX8-NEXT: v_mov_b32_e32 v1, 0 1947; GFX8-NEXT: s_mov_b32 m0, -1 1948; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1949; GFX8-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] 1950; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1951; GFX8-NEXT: .LBB11_2: 1952; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 1953; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1954; GFX8-NEXT: v_readfirstlane_b32 s2, v0 1955; GFX8-NEXT: v_readfirstlane_b32 s3, v1 1956; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v2 1957; GFX8-NEXT: 
v_mul_hi_u32_u24_e32 v1, 5, v2 1958; GFX8-NEXT: v_mov_b32_e32 v2, s3 1959; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s2, v0 1960; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc 1961; GFX8-NEXT: s_mov_b32 s3, 0xf000 1962; GFX8-NEXT: s_mov_b32 s2, -1 1963; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1964; GFX8-NEXT: s_endpgm 1965; 1966; GFX9-LABEL: sub_i64_constant: 1967; GFX9: ; %bb.0: ; %entry 1968; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1969; GFX9-NEXT: s_mov_b64 s[4:5], exec 1970; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 1971; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 1972; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 1973; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 1974; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 1975; GFX9-NEXT: s_cbranch_execz .LBB11_2 1976; GFX9-NEXT: ; %bb.1: 1977; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 1978; GFX9-NEXT: s_mul_i32 s4, s4, 5 1979; GFX9-NEXT: v_mov_b32_e32 v0, s4 1980; GFX9-NEXT: v_mov_b32_e32 v1, 0 1981; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1982; GFX9-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] 1983; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1984; GFX9-NEXT: .LBB11_2: 1985; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 1986; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1987; GFX9-NEXT: v_readfirstlane_b32 s2, v0 1988; GFX9-NEXT: v_readfirstlane_b32 s3, v1 1989; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v2 1990; GFX9-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 1991; GFX9-NEXT: v_mov_b32_e32 v2, s3 1992; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s2, v0 1993; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc 1994; GFX9-NEXT: s_mov_b32 s3, 0xf000 1995; GFX9-NEXT: s_mov_b32 s2, -1 1996; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1997; GFX9-NEXT: s_endpgm 1998; 1999; GFX1064-LABEL: sub_i64_constant: 2000; GFX1064: ; %bb.0: ; %entry 2001; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2002; GFX1064-NEXT: s_mov_b64 s[4:5], exec 2003; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 2004; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 2005; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 
2006; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 2007; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc 2008; GFX1064-NEXT: s_cbranch_execz .LBB11_2 2009; GFX1064-NEXT: ; %bb.1: 2010; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 2011; GFX1064-NEXT: v_mov_b32_e32 v1, 0 2012; GFX1064-NEXT: s_mul_i32 s4, s4, 5 2013; GFX1064-NEXT: v_mov_b32_e32 v0, s4 2014; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2015; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 2016; GFX1064-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] 2017; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2018; GFX1064-NEXT: buffer_gl0_inv 2019; GFX1064-NEXT: .LBB11_2: 2020; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 2021; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] 2022; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 2023; GFX1064-NEXT: v_mul_u32_u24_e32 v0, 5, v2 2024; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 2025; GFX1064-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 2026; GFX1064-NEXT: v_sub_co_u32 v0, vcc, s2, v0 2027; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc 2028; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 2029; GFX1064-NEXT: s_mov_b32 s2, -1 2030; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2031; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2032; GFX1064-NEXT: s_endpgm 2033; 2034; GFX1032-LABEL: sub_i64_constant: 2035; GFX1032: ; %bb.0: ; %entry 2036; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2037; GFX1032-NEXT: s_mov_b32 s3, exec_lo 2038; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 2039; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s3, 0 2040; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 2041; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo 2042; GFX1032-NEXT: s_cbranch_execz .LBB11_2 2043; GFX1032-NEXT: ; %bb.1: 2044; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 2045; GFX1032-NEXT: v_mov_b32_e32 v1, 0 2046; GFX1032-NEXT: s_mul_i32 s3, s3, 5 2047; GFX1032-NEXT: v_mov_b32_e32 v0, s3 2048; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2049; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 2050; GFX1032-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] 2051; GFX1032-NEXT: 
s_waitcnt lgkmcnt(0) 2052; GFX1032-NEXT: buffer_gl0_inv 2053; GFX1032-NEXT: .LBB11_2: 2054; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 2055; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 2056; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 2057; GFX1032-NEXT: v_mul_u32_u24_e32 v0, 5, v2 2058; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 2059; GFX1032-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 2060; GFX1032-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v0 2061; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo 2062; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 2063; GFX1032-NEXT: s_mov_b32 s2, -1 2064; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2065; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2066; GFX1032-NEXT: s_endpgm 2067entry: 2068 %old = atomicrmw sub i64 addrspace(3)* @local_var64, i64 5 acq_rel 2069 store i64 %old, i64 addrspace(1)* %out 2070 ret void 2071} 2072 2073define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 %subitive) { 2074; 2075; 2076; GFX7LESS-LABEL: sub_i64_uniform: 2077; GFX7LESS: ; %bb.0: ; %entry 2078; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec 2079; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 2080; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 2081; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s7, v0 2082; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 2083; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 2084; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc 2085; GFX7LESS-NEXT: s_cbranch_execz .LBB12_2 2086; GFX7LESS-NEXT: ; %bb.1: 2087; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 2088; GFX7LESS-NEXT: v_mov_b32_e32 v3, 0 2089; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2090; GFX7LESS-NEXT: s_mul_i32 s7, s3, s6 2091; GFX7LESS-NEXT: v_mov_b32_e32 v0, s6 2092; GFX7LESS-NEXT: v_mul_hi_u32 v0, s2, v0 2093; GFX7LESS-NEXT: s_mul_i32 s6, s2, s6 2094; GFX7LESS-NEXT: v_add_i32_e32 v1, vcc, s7, v0 2095; GFX7LESS-NEXT: v_mov_b32_e32 v0, s6 2096; GFX7LESS-NEXT: s_mov_b32 m0, -1 2097; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2098; GFX7LESS-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1] 
2099; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2100; GFX7LESS-NEXT: .LBB12_2: 2101; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] 2102; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 2103; GFX7LESS-NEXT: s_mov_b32 s6, -1 2104; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2105; GFX7LESS-NEXT: s_mov_b32 s4, s0 2106; GFX7LESS-NEXT: s_mov_b32 s5, s1 2107; GFX7LESS-NEXT: v_readfirstlane_b32 s0, v0 2108; GFX7LESS-NEXT: v_readfirstlane_b32 s1, v1 2109; GFX7LESS-NEXT: v_mul_lo_u32 v0, s3, v2 2110; GFX7LESS-NEXT: v_mul_hi_u32 v1, s2, v2 2111; GFX7LESS-NEXT: v_mul_lo_u32 v2, s2, v2 2112; GFX7LESS-NEXT: v_add_i32_e32 v1, vcc, v1, v0 2113; GFX7LESS-NEXT: v_mov_b32_e32 v3, s1 2114; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s0, v2 2115; GFX7LESS-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc 2116; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 2117; GFX7LESS-NEXT: s_endpgm 2118; 2119; GFX8-LABEL: sub_i64_uniform: 2120; GFX8: ; %bb.0: ; %entry 2121; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 2122; GFX8-NEXT: s_mov_b64 s[6:7], exec 2123; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 2124; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 2125; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 2126; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 2127; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 2128; GFX8-NEXT: s_cbranch_execz .LBB12_2 2129; GFX8-NEXT: ; %bb.1: 2130; GFX8-NEXT: s_bcnt1_i32_b64 s8, s[6:7] 2131; GFX8-NEXT: v_mov_b32_e32 v0, s8 2132; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2133; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[6:7], s2, v0, 0 2134; GFX8-NEXT: s_mul_i32 s6, s3, s8 2135; GFX8-NEXT: v_mov_b32_e32 v3, 0 2136; GFX8-NEXT: v_add_u32_e32 v1, vcc, s6, v1 2137; GFX8-NEXT: s_mov_b32 m0, -1 2138; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2139; GFX8-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1] 2140; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2141; GFX8-NEXT: .LBB12_2: 2142; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 2143; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2144; GFX8-NEXT: s_mov_b32 s4, s0 2145; GFX8-NEXT: s_mov_b32 s5, s1 2146; GFX8-NEXT: v_mul_lo_u32 v4, s3, v2 2147; 
GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v2, 0 2148; GFX8-NEXT: v_readfirstlane_b32 s0, v0 2149; GFX8-NEXT: v_readfirstlane_b32 s1, v1 2150; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v4 2151; GFX8-NEXT: v_mov_b32_e32 v3, s1 2152; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s0, v2 2153; GFX8-NEXT: s_mov_b32 s7, 0xf000 2154; GFX8-NEXT: s_mov_b32 s6, -1 2155; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc 2156; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 2157; GFX8-NEXT: s_endpgm 2158; 2159; GFX9-LABEL: sub_i64_uniform: 2160; GFX9: ; %bb.0: ; %entry 2161; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 2162; GFX9-NEXT: s_mov_b64 s[6:7], exec 2163; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 2164; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 2165; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 2166; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 2167; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 2168; GFX9-NEXT: s_cbranch_execz .LBB12_2 2169; GFX9-NEXT: ; %bb.1: 2170; GFX9-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 2171; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2172; GFX9-NEXT: s_mul_i32 s7, s3, s6 2173; GFX9-NEXT: s_mul_hi_u32 s8, s2, s6 2174; GFX9-NEXT: s_add_i32 s8, s8, s7 2175; GFX9-NEXT: s_mul_i32 s6, s2, s6 2176; GFX9-NEXT: v_mov_b32_e32 v0, s6 2177; GFX9-NEXT: v_mov_b32_e32 v1, s8 2178; GFX9-NEXT: v_mov_b32_e32 v3, 0 2179; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2180; GFX9-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1] 2181; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2182; GFX9-NEXT: .LBB12_2: 2183; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 2184; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2185; GFX9-NEXT: v_mul_lo_u32 v4, s3, v2 2186; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[2:3], s2, v2, 0 2187; GFX9-NEXT: s_mov_b32 s4, s0 2188; GFX9-NEXT: s_mov_b32 s5, s1 2189; GFX9-NEXT: v_readfirstlane_b32 s0, v0 2190; GFX9-NEXT: v_readfirstlane_b32 s1, v1 2191; GFX9-NEXT: v_add_u32_e32 v1, v3, v4 2192; GFX9-NEXT: v_mov_b32_e32 v3, s1 2193; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s0, v2 2194; GFX9-NEXT: s_mov_b32 s7, 0xf000 2195; GFX9-NEXT: s_mov_b32 s6, -1 2196; 
GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v1, vcc 2197; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 2198; GFX9-NEXT: s_endpgm 2199; 2200; GFX1064-LABEL: sub_i64_uniform: 2201; GFX1064: ; %bb.0: ; %entry 2202; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 2203; GFX1064-NEXT: s_mov_b64 s[6:7], exec 2204; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 2205; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 2206; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 2207; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 2208; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 2209; GFX1064-NEXT: s_cbranch_execz .LBB12_2 2210; GFX1064-NEXT: ; %bb.1: 2211; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 2212; GFX1064-NEXT: v_mov_b32_e32 v3, 0 2213; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2214; GFX1064-NEXT: s_mul_i32 s7, s3, s6 2215; GFX1064-NEXT: s_mul_hi_u32 s8, s2, s6 2216; GFX1064-NEXT: s_mul_i32 s6, s2, s6 2217; GFX1064-NEXT: s_add_i32 s8, s8, s7 2218; GFX1064-NEXT: v_mov_b32_e32 v0, s6 2219; GFX1064-NEXT: v_mov_b32_e32 v1, s8 2220; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2221; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 2222; GFX1064-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1] 2223; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2224; GFX1064-NEXT: buffer_gl0_inv 2225; GFX1064-NEXT: .LBB12_2: 2226; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 2227; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 2228; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2229; GFX1064-NEXT: v_mul_lo_u32 v4, s3, v2 2230; GFX1064-NEXT: v_mad_u64_u32 v[2:3], s[2:3], s2, v2, 0 2231; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 2232; GFX1064-NEXT: v_readfirstlane_b32 s4, v1 2233; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 2234; GFX1064-NEXT: v_add_nc_u32_e32 v1, v3, v4 2235; GFX1064-NEXT: v_sub_co_u32 v0, vcc, s2, v2 2236; GFX1064-NEXT: s_mov_b32 s2, -1 2237; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s4, v1, vcc 2238; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2239; GFX1064-NEXT: s_endpgm 2240; 2241; GFX1032-LABEL: sub_i64_uniform: 2242; GFX1032: ; %bb.0: 
; %entry 2243; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 2244; GFX1032-NEXT: s_mov_b32 s5, exec_lo 2245; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 2246; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0 2247; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 2248; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo 2249; GFX1032-NEXT: s_cbranch_execz .LBB12_2 2250; GFX1032-NEXT: ; %bb.1: 2251; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s5 2252; GFX1032-NEXT: v_mov_b32_e32 v3, 0 2253; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2254; GFX1032-NEXT: s_mul_i32 s6, s3, s5 2255; GFX1032-NEXT: s_mul_hi_u32 s7, s2, s5 2256; GFX1032-NEXT: s_mul_i32 s5, s2, s5 2257; GFX1032-NEXT: s_add_i32 s7, s7, s6 2258; GFX1032-NEXT: v_mov_b32_e32 v0, s5 2259; GFX1032-NEXT: v_mov_b32_e32 v1, s7 2260; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2261; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 2262; GFX1032-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1] 2263; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2264; GFX1032-NEXT: buffer_gl0_inv 2265; GFX1032-NEXT: .LBB12_2: 2266; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 2267; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 2268; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2269; GFX1032-NEXT: v_mul_lo_u32 v4, s3, v2 2270; GFX1032-NEXT: v_mad_u64_u32 v[2:3], s2, s2, v2, 0 2271; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 2272; GFX1032-NEXT: v_readfirstlane_b32 s4, v1 2273; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 2274; GFX1032-NEXT: v_add_nc_u32_e32 v1, v3, v4 2275; GFX1032-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v2 2276; GFX1032-NEXT: s_mov_b32 s2, -1 2277; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s4, v1, vcc_lo 2278; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2279; GFX1032-NEXT: s_endpgm 2280entry: 2281 %old = atomicrmw sub i64 addrspace(3)* @local_var64, i64 %subitive acq_rel 2282 store i64 %old, i64 addrspace(1)* %out 2283 ret void 2284} 2285 2286define amdgpu_kernel void @sub_i64_varying(i64 addrspace(1)* %out) { 2287; 2288; 2289; GFX7LESS-LABEL: sub_i64_varying: 2290; GFX7LESS: ; %bb.0: ; %entry 
2291; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 2292; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 2293; GFX7LESS-NEXT: s_mov_b32 m0, -1 2294; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2295; GFX7LESS-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] 2296; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2297; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 2298; GFX7LESS-NEXT: s_mov_b32 s2, -1 2299; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2300; GFX7LESS-NEXT: s_endpgm 2301; 2302; GFX8-LABEL: sub_i64_varying: 2303; GFX8: ; %bb.0: ; %entry 2304; GFX8-NEXT: v_mov_b32_e32 v1, 0 2305; GFX8-NEXT: s_mov_b32 m0, -1 2306; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2307; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2308; GFX8-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] 2309; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2310; GFX8-NEXT: s_mov_b32 s3, 0xf000 2311; GFX8-NEXT: s_mov_b32 s2, -1 2312; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2313; GFX8-NEXT: s_endpgm 2314; 2315; GFX9-LABEL: sub_i64_varying: 2316; GFX9: ; %bb.0: ; %entry 2317; GFX9-NEXT: v_mov_b32_e32 v1, 0 2318; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2319; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2320; GFX9-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] 2321; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2322; GFX9-NEXT: s_mov_b32 s3, 0xf000 2323; GFX9-NEXT: s_mov_b32 s2, -1 2324; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2325; GFX9-NEXT: s_endpgm 2326; 2327; GFX10-LABEL: sub_i64_varying: 2328; GFX10: ; %bb.0: ; %entry 2329; GFX10-NEXT: v_mov_b32_e32 v1, 0 2330; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2331; GFX10-NEXT: s_mov_b32 s3, 0x31016000 2332; GFX10-NEXT: s_mov_b32 s2, -1 2333; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2334; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 2335; GFX10-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] 2336; GFX10-NEXT: s_waitcnt lgkmcnt(0) 2337; GFX10-NEXT: buffer_gl0_inv 2338; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2339; GFX10-NEXT: s_endpgm 2340entry: 2341 %lane = call i32 @llvm.amdgcn.workitem.id.x() 2342 %zext = 
zext i32 %lane to i64 2343 %old = atomicrmw sub i64 addrspace(3)* @local_var64, i64 %zext acq_rel 2344 store i64 %old, i64 addrspace(1)* %out 2345 ret void 2346} 2347
; and_i32_varying: atomicrmw 'and' (acq_rel) on the addrspace(3) global
; @local_var32, using the workitem id as the operand (the "varying" case);
; the returned old value is stored to %out. Per the autogenerated checks that
; follow, GFX7LESS keeps a plain ds_and_rtn_b32, while GFX8, GFX9 and both
; GFX10 wave sizes first combine lanes with v_and_b32_dpp (after moving -1 in
; under the inverted exec mask) and issue ds_and_rtn_b32 only where the mbcnt
; result compares equal to 0.
2348define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) { 2349; 2350; 2351; GFX7LESS-LABEL: and_i32_varying: 2352; GFX7LESS: ; %bb.0: ; %entry 2353; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 2354; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 2355; GFX7LESS-NEXT: s_mov_b32 m0, -1 2356; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2357; GFX7LESS-NEXT: ds_and_rtn_b32 v0, v1, v0 2358; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2359; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 2360; GFX7LESS-NEXT: s_mov_b32 s2, -1 2361; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 2362; GFX7LESS-NEXT: s_endpgm 2363; 2364; GFX8-LABEL: and_i32_varying: 2365; GFX8: ; %bb.0: ; %entry 2366; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2367; GFX8-NEXT: v_mov_b32_e32 v2, v0 2368; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 2369; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 2370; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 2371; GFX8-NEXT: v_mov_b32_e32 v1, -1 2372; GFX8-NEXT: s_mov_b64 exec, s[2:3] 2373; GFX8-NEXT: s_not_b64 exec, exec 2374; GFX8-NEXT: v_mov_b32_e32 v2, -1 2375; GFX8-NEXT: s_not_b64 exec, exec 2376; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 2377; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 2378; GFX8-NEXT: s_nop 1 2379; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 2380; GFX8-NEXT: s_nop 1 2381; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 2382; GFX8-NEXT: s_nop 1 2383; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 2384; GFX8-NEXT: s_nop 1 2385; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 2386; GFX8-NEXT: s_nop 1 2387; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 2388; GFX8-NEXT: v_readlane_b32 s4, v2, 63 2389; GFX8-NEXT: s_nop 0 2390; GFX8-NEXT: 
v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 2391; GFX8-NEXT: s_mov_b64 exec, s[2:3] 2392; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2393; GFX8-NEXT: ; implicit-def: $vgpr0 2394; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 2395; GFX8-NEXT: s_cbranch_execz .LBB14_2 2396; GFX8-NEXT: ; %bb.1: 2397; GFX8-NEXT: v_mov_b32_e32 v0, 0 2398; GFX8-NEXT: v_mov_b32_e32 v3, s4 2399; GFX8-NEXT: s_mov_b32 m0, -1 2400; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2401; GFX8-NEXT: ds_and_rtn_b32 v0, v0, v3 2402; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2403; GFX8-NEXT: .LBB14_2: 2404; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 2405; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2406; GFX8-NEXT: v_readfirstlane_b32 s2, v0 2407; GFX8-NEXT: v_mov_b32_e32 v0, v1 2408; GFX8-NEXT: v_and_b32_e32 v0, s2, v0 2409; GFX8-NEXT: s_mov_b32 s3, 0xf000 2410; GFX8-NEXT: s_mov_b32 s2, -1 2411; GFX8-NEXT: s_nop 0 2412; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 2413; GFX8-NEXT: s_endpgm 2414; 2415; GFX9-LABEL: and_i32_varying: 2416; GFX9: ; %bb.0: ; %entry 2417; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2418; GFX9-NEXT: v_mov_b32_e32 v2, v0 2419; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 2420; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 2421; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 2422; GFX9-NEXT: v_mov_b32_e32 v1, -1 2423; GFX9-NEXT: s_mov_b64 exec, s[2:3] 2424; GFX9-NEXT: s_not_b64 exec, exec 2425; GFX9-NEXT: v_mov_b32_e32 v2, -1 2426; GFX9-NEXT: s_not_b64 exec, exec 2427; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 2428; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 2429; GFX9-NEXT: s_nop 1 2430; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 2431; GFX9-NEXT: s_nop 1 2432; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 2433; GFX9-NEXT: s_nop 1 2434; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 2435; GFX9-NEXT: s_nop 1 2436; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 2437; 
GFX9-NEXT: s_nop 1 2438; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 2439; GFX9-NEXT: v_readlane_b32 s4, v2, 63 2440; GFX9-NEXT: s_nop 0 2441; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 2442; GFX9-NEXT: s_mov_b64 exec, s[2:3] 2443; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2444; GFX9-NEXT: ; implicit-def: $vgpr0 2445; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 2446; GFX9-NEXT: s_cbranch_execz .LBB14_2 2447; GFX9-NEXT: ; %bb.1: 2448; GFX9-NEXT: v_mov_b32_e32 v0, 0 2449; GFX9-NEXT: v_mov_b32_e32 v3, s4 2450; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2451; GFX9-NEXT: ds_and_rtn_b32 v0, v0, v3 2452; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2453; GFX9-NEXT: .LBB14_2: 2454; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 2455; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2456; GFX9-NEXT: v_readfirstlane_b32 s2, v0 2457; GFX9-NEXT: v_mov_b32_e32 v0, v1 2458; GFX9-NEXT: v_and_b32_e32 v0, s2, v0 2459; GFX9-NEXT: s_mov_b32 s3, 0xf000 2460; GFX9-NEXT: s_mov_b32 s2, -1 2461; GFX9-NEXT: s_nop 0 2462; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 2463; GFX9-NEXT: s_endpgm 2464; 2465; GFX1064-LABEL: and_i32_varying: 2466; GFX1064: ; %bb.0: ; %entry 2467; GFX1064-NEXT: v_mov_b32_e32 v1, v0 2468; GFX1064-NEXT: s_not_b64 exec, exec 2469; GFX1064-NEXT: v_mov_b32_e32 v1, -1 2470; GFX1064-NEXT: s_not_b64 exec, exec 2471; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 2472; GFX1064-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 2473; GFX1064-NEXT: v_mov_b32_e32 v3, -1 2474; GFX1064-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 2475; GFX1064-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 2476; GFX1064-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 2477; GFX1064-NEXT: v_mov_b32_e32 v2, v1 2478; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 2479; GFX1064-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 2480; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 2481; 
GFX1064-NEXT: v_mov_b32_e32 v2, s4 2482; GFX1064-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 2483; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 2484; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 2485; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 2486; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2487; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 2488; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 2489; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 2490; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 2491; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 2492; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 2493; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 2494; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 2495; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 2496; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 2497; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 2498; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 2499; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 2500; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 2501; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2502; GFX1064-NEXT: s_mov_b32 s2, -1 2503; GFX1064-NEXT: ; implicit-def: $vgpr0 2504; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 2505; GFX1064-NEXT: s_cbranch_execz .LBB14_2 2506; GFX1064-NEXT: ; %bb.1: 2507; GFX1064-NEXT: v_mov_b32_e32 v0, 0 2508; GFX1064-NEXT: v_mov_b32_e32 v4, s7 2509; GFX1064-NEXT: s_mov_b32 s3, s7 2510; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2511; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 2512; GFX1064-NEXT: ds_and_rtn_b32 v0, v0, v4 2513; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2514; GFX1064-NEXT: buffer_gl0_inv 2515; GFX1064-NEXT: .LBB14_2: 2516; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 2517; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 2518; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 2519; GFX1064-NEXT: v_mov_b32_e32 v0, v3 2520; GFX1064-NEXT: v_and_b32_e32 v0, s3, v0 2521; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 2522; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2523; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 
0 2524; GFX1064-NEXT: s_endpgm 2525; 2526; GFX1032-LABEL: and_i32_varying: 2527; GFX1032: ; %bb.0: ; %entry 2528; GFX1032-NEXT: v_mov_b32_e32 v1, v0 2529; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 2530; GFX1032-NEXT: v_mov_b32_e32 v1, -1 2531; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 2532; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 2533; GFX1032-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 2534; GFX1032-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 2535; GFX1032-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 2536; GFX1032-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 2537; GFX1032-NEXT: v_mov_b32_e32 v2, v1 2538; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 2539; GFX1032-NEXT: s_mov_b32 exec_lo, s2 2540; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2541; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 2542; GFX1032-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 2543; GFX1032-NEXT: v_mov_b32_e32 v3, -1 2544; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 2545; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 2546; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 2547; GFX1032-NEXT: s_mov_b32 exec_lo, s2 2548; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 2549; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 2550; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 2551; GFX1032-NEXT: s_mov_b32 exec_lo, s2 2552; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 2553; GFX1032-NEXT: s_mov_b32 s2, -1 2554; GFX1032-NEXT: ; implicit-def: $vgpr0 2555; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 2556; GFX1032-NEXT: s_cbranch_execz .LBB14_2 2557; GFX1032-NEXT: ; %bb.1: 2558; GFX1032-NEXT: v_mov_b32_e32 v0, 0 2559; GFX1032-NEXT: v_mov_b32_e32 v4, s4 2560; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2561; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 2562; GFX1032-NEXT: ds_and_rtn_b32 v0, v0, v4 2563; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2564; GFX1032-NEXT: buffer_gl0_inv 2565; 
GFX1032-NEXT: .LBB14_2: 2566; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 2567; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 2568; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 2569; GFX1032-NEXT: v_mov_b32_e32 v0, v3 2570; GFX1032-NEXT: v_and_b32_e32 v0, s3, v0 2571; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 2572; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2573; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 2574; GFX1032-NEXT: s_endpgm 2575entry: 2576 %lane = call i32 @llvm.amdgcn.workitem.id.x() 2577 %old = atomicrmw and i32 addrspace(3)* @local_var32, i32 %lane acq_rel 2578 store i32 %old, i32 addrspace(1)* %out 2579 ret void 2580} 2581
; or_i32_varying: atomicrmw 'or' (acq_rel) on the addrspace(3) global
; @local_var32, using the workitem id as the operand (the "varying" case);
; the returned old value is stored to %out. Per the autogenerated checks that
; follow, GFX7LESS keeps a plain ds_or_rtn_b32, while GFX8, GFX9 and both
; GFX10 wave sizes first combine lanes with v_or_b32_dpp (after moving 0 in
; under the inverted exec mask) and issue ds_or_rtn_b32 only where the mbcnt
; result compares equal to 0.
2582define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) { 2583; 2584; 2585; GFX7LESS-LABEL: or_i32_varying: 2586; GFX7LESS: ; %bb.0: ; %entry 2587; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 2588; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 2589; GFX7LESS-NEXT: s_mov_b32 m0, -1 2590; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2591; GFX7LESS-NEXT: ds_or_rtn_b32 v0, v1, v0 2592; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2593; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 2594; GFX7LESS-NEXT: s_mov_b32 s2, -1 2595; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 2596; GFX7LESS-NEXT: s_endpgm 2597; 2598; GFX8-LABEL: or_i32_varying: 2599; GFX8: ; %bb.0: ; %entry 2600; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2601; GFX8-NEXT: v_mov_b32_e32 v2, v0 2602; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 2603; GFX8-NEXT: v_mov_b32_e32 v1, 0 2604; GFX8-NEXT: s_mov_b64 exec, s[2:3] 2605; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 2606; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 2607; GFX8-NEXT: s_not_b64 exec, exec 2608; GFX8-NEXT: v_mov_b32_e32 v2, 0 2609; GFX8-NEXT: s_not_b64 exec, exec 2610; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 2611; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 2612; GFX8-NEXT: s_nop 1 2613; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 2614; GFX8-NEXT: s_nop 1 
2615; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 2616; GFX8-NEXT: s_nop 1 2617; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 2618; GFX8-NEXT: s_nop 1 2619; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 2620; GFX8-NEXT: s_nop 1 2621; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 2622; GFX8-NEXT: v_readlane_b32 s4, v2, 63 2623; GFX8-NEXT: s_nop 0 2624; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 2625; GFX8-NEXT: s_mov_b64 exec, s[2:3] 2626; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2627; GFX8-NEXT: ; implicit-def: $vgpr0 2628; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 2629; GFX8-NEXT: s_cbranch_execz .LBB15_2 2630; GFX8-NEXT: ; %bb.1: 2631; GFX8-NEXT: v_mov_b32_e32 v0, 0 2632; GFX8-NEXT: v_mov_b32_e32 v3, s4 2633; GFX8-NEXT: s_mov_b32 m0, -1 2634; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2635; GFX8-NEXT: ds_or_rtn_b32 v0, v0, v3 2636; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2637; GFX8-NEXT: .LBB15_2: 2638; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 2639; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2640; GFX8-NEXT: v_readfirstlane_b32 s2, v0 2641; GFX8-NEXT: v_mov_b32_e32 v0, v1 2642; GFX8-NEXT: v_or_b32_e32 v0, s2, v0 2643; GFX8-NEXT: s_mov_b32 s3, 0xf000 2644; GFX8-NEXT: s_mov_b32 s2, -1 2645; GFX8-NEXT: s_nop 0 2646; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 2647; GFX8-NEXT: s_endpgm 2648; 2649; GFX9-LABEL: or_i32_varying: 2650; GFX9: ; %bb.0: ; %entry 2651; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2652; GFX9-NEXT: v_mov_b32_e32 v2, v0 2653; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 2654; GFX9-NEXT: v_mov_b32_e32 v1, 0 2655; GFX9-NEXT: s_mov_b64 exec, s[2:3] 2656; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 2657; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 2658; GFX9-NEXT: s_not_b64 exec, exec 2659; GFX9-NEXT: v_mov_b32_e32 v2, 0 2660; GFX9-NEXT: s_not_b64 exec, exec 2661; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 2662; 
GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 2663; GFX9-NEXT: s_nop 1 2664; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 2665; GFX9-NEXT: s_nop 1 2666; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 2667; GFX9-NEXT: s_nop 1 2668; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 2669; GFX9-NEXT: s_nop 1 2670; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 2671; GFX9-NEXT: s_nop 1 2672; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 2673; GFX9-NEXT: v_readlane_b32 s4, v2, 63 2674; GFX9-NEXT: s_nop 0 2675; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 2676; GFX9-NEXT: s_mov_b64 exec, s[2:3] 2677; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2678; GFX9-NEXT: ; implicit-def: $vgpr0 2679; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 2680; GFX9-NEXT: s_cbranch_execz .LBB15_2 2681; GFX9-NEXT: ; %bb.1: 2682; GFX9-NEXT: v_mov_b32_e32 v0, 0 2683; GFX9-NEXT: v_mov_b32_e32 v3, s4 2684; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2685; GFX9-NEXT: ds_or_rtn_b32 v0, v0, v3 2686; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2687; GFX9-NEXT: .LBB15_2: 2688; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 2689; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2690; GFX9-NEXT: v_readfirstlane_b32 s2, v0 2691; GFX9-NEXT: v_mov_b32_e32 v0, v1 2692; GFX9-NEXT: v_or_b32_e32 v0, s2, v0 2693; GFX9-NEXT: s_mov_b32 s3, 0xf000 2694; GFX9-NEXT: s_mov_b32 s2, -1 2695; GFX9-NEXT: s_nop 0 2696; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 2697; GFX9-NEXT: s_endpgm 2698; 2699; GFX1064-LABEL: or_i32_varying: 2700; GFX1064: ; %bb.0: ; %entry 2701; GFX1064-NEXT: v_mov_b32_e32 v1, v0 2702; GFX1064-NEXT: s_not_b64 exec, exec 2703; GFX1064-NEXT: v_mov_b32_e32 v1, 0 2704; GFX1064-NEXT: s_not_b64 exec, exec 2705; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 2706; GFX1064-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf 
bank_mask:0xf bound_ctrl:1 2707; GFX1064-NEXT: v_mov_b32_e32 v3, 0 2708; GFX1064-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 2709; GFX1064-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 2710; GFX1064-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 2711; GFX1064-NEXT: v_mov_b32_e32 v2, v1 2712; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 2713; GFX1064-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 2714; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 2715; GFX1064-NEXT: v_mov_b32_e32 v2, s4 2716; GFX1064-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 2717; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 2718; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 2719; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 2720; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2721; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 2722; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 2723; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 2724; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 2725; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 2726; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 2727; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 2728; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 2729; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 2730; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 2731; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 2732; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 2733; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 2734; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 2735; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2736; GFX1064-NEXT: s_mov_b32 s2, -1 2737; GFX1064-NEXT: ; implicit-def: $vgpr0 2738; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 2739; GFX1064-NEXT: s_cbranch_execz .LBB15_2 2740; GFX1064-NEXT: ; %bb.1: 2741; GFX1064-NEXT: v_mov_b32_e32 v0, 0 2742; GFX1064-NEXT: v_mov_b32_e32 v4, s7 2743; GFX1064-NEXT: s_mov_b32 s3, s7 2744; GFX1064-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) 2745; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 2746; GFX1064-NEXT: ds_or_rtn_b32 v0, v0, v4 2747; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2748; GFX1064-NEXT: buffer_gl0_inv 2749; GFX1064-NEXT: .LBB15_2: 2750; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 2751; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 2752; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 2753; GFX1064-NEXT: v_mov_b32_e32 v0, v3 2754; GFX1064-NEXT: v_or_b32_e32 v0, s3, v0 2755; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 2756; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2757; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 2758; GFX1064-NEXT: s_endpgm 2759; 2760; GFX1032-LABEL: or_i32_varying: 2761; GFX1032: ; %bb.0: ; %entry 2762; GFX1032-NEXT: v_mov_b32_e32 v1, v0 2763; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 2764; GFX1032-NEXT: v_mov_b32_e32 v1, 0 2765; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 2766; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 2767; GFX1032-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 2768; GFX1032-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 2769; GFX1032-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 2770; GFX1032-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 2771; GFX1032-NEXT: v_mov_b32_e32 v2, v1 2772; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 2773; GFX1032-NEXT: s_mov_b32 exec_lo, s2 2774; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2775; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 2776; GFX1032-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 2777; GFX1032-NEXT: v_mov_b32_e32 v3, 0 2778; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 2779; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 2780; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 2781; GFX1032-NEXT: s_mov_b32 exec_lo, s2 2782; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 2783; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 2784; GFX1032-NEXT: 
v_writelane_b32 v3, s3, 16 2785; GFX1032-NEXT: s_mov_b32 exec_lo, s2 2786; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 2787; GFX1032-NEXT: s_mov_b32 s2, -1 2788; GFX1032-NEXT: ; implicit-def: $vgpr0 2789; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 2790; GFX1032-NEXT: s_cbranch_execz .LBB15_2 2791; GFX1032-NEXT: ; %bb.1: 2792; GFX1032-NEXT: v_mov_b32_e32 v0, 0 2793; GFX1032-NEXT: v_mov_b32_e32 v4, s4 2794; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2795; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 2796; GFX1032-NEXT: ds_or_rtn_b32 v0, v0, v4 2797; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2798; GFX1032-NEXT: buffer_gl0_inv 2799; GFX1032-NEXT: .LBB15_2: 2800; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 2801; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 2802; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 2803; GFX1032-NEXT: v_mov_b32_e32 v0, v3 2804; GFX1032-NEXT: v_or_b32_e32 v0, s3, v0 2805; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 2806; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2807; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 2808; GFX1032-NEXT: s_endpgm 2809entry: 2810 %lane = call i32 @llvm.amdgcn.workitem.id.x() 2811 %old = atomicrmw or i32 addrspace(3)* @local_var32, i32 %lane acq_rel 2812 store i32 %old, i32 addrspace(1)* %out 2813 ret void 2814} 2815
; xor_i32_varying: atomicrmw 'xor' (acq_rel) on the addrspace(3) global
; @local_var32, using the workitem id as the operand (the "varying" case);
; the returned old value is stored to %out. Per the autogenerated checks that
; follow, GFX7LESS keeps a plain ds_xor_rtn_b32, while GFX8, GFX9 and both
; GFX10 wave sizes first combine lanes with v_xor_b32_dpp (after moving 0 in
; under the inverted exec mask) and issue ds_xor_rtn_b32 only where the mbcnt
; result compares equal to 0.
2816define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) { 2817; 2818; 2819; GFX7LESS-LABEL: xor_i32_varying: 2820; GFX7LESS: ; %bb.0: ; %entry 2821; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 2822; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 2823; GFX7LESS-NEXT: s_mov_b32 m0, -1 2824; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2825; GFX7LESS-NEXT: ds_xor_rtn_b32 v0, v1, v0 2826; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2827; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 2828; GFX7LESS-NEXT: s_mov_b32 s2, -1 2829; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 2830; GFX7LESS-NEXT: s_endpgm 2831; 2832; GFX8-LABEL: xor_i32_varying: 2833; GFX8: ; %bb.0: ; %entry 2834; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2835; GFX8-NEXT: 
v_mov_b32_e32 v2, v0 2836; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 2837; GFX8-NEXT: v_mov_b32_e32 v1, 0 2838; GFX8-NEXT: s_mov_b64 exec, s[2:3] 2839; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 2840; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 2841; GFX8-NEXT: s_not_b64 exec, exec 2842; GFX8-NEXT: v_mov_b32_e32 v2, 0 2843; GFX8-NEXT: s_not_b64 exec, exec 2844; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 2845; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 2846; GFX8-NEXT: s_nop 1 2847; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 2848; GFX8-NEXT: s_nop 1 2849; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 2850; GFX8-NEXT: s_nop 1 2851; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 2852; GFX8-NEXT: s_nop 1 2853; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 2854; GFX8-NEXT: s_nop 1 2855; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 2856; GFX8-NEXT: v_readlane_b32 s4, v2, 63 2857; GFX8-NEXT: s_nop 0 2858; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 2859; GFX8-NEXT: s_mov_b64 exec, s[2:3] 2860; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2861; GFX8-NEXT: ; implicit-def: $vgpr0 2862; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 2863; GFX8-NEXT: s_cbranch_execz .LBB16_2 2864; GFX8-NEXT: ; %bb.1: 2865; GFX8-NEXT: v_mov_b32_e32 v0, 0 2866; GFX8-NEXT: v_mov_b32_e32 v3, s4 2867; GFX8-NEXT: s_mov_b32 m0, -1 2868; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2869; GFX8-NEXT: ds_xor_rtn_b32 v0, v0, v3 2870; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2871; GFX8-NEXT: .LBB16_2: 2872; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 2873; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2874; GFX8-NEXT: v_readfirstlane_b32 s2, v0 2875; GFX8-NEXT: v_mov_b32_e32 v0, v1 2876; GFX8-NEXT: v_xor_b32_e32 v0, s2, v0 2877; GFX8-NEXT: s_mov_b32 s3, 0xf000 2878; GFX8-NEXT: s_mov_b32 s2, -1 2879; GFX8-NEXT: 
s_nop 0 2880; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 2881; GFX8-NEXT: s_endpgm 2882; 2883; GFX9-LABEL: xor_i32_varying: 2884; GFX9: ; %bb.0: ; %entry 2885; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2886; GFX9-NEXT: v_mov_b32_e32 v2, v0 2887; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 2888; GFX9-NEXT: v_mov_b32_e32 v1, 0 2889; GFX9-NEXT: s_mov_b64 exec, s[2:3] 2890; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 2891; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 2892; GFX9-NEXT: s_not_b64 exec, exec 2893; GFX9-NEXT: v_mov_b32_e32 v2, 0 2894; GFX9-NEXT: s_not_b64 exec, exec 2895; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 2896; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 2897; GFX9-NEXT: s_nop 1 2898; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 2899; GFX9-NEXT: s_nop 1 2900; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 2901; GFX9-NEXT: s_nop 1 2902; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 2903; GFX9-NEXT: s_nop 1 2904; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 2905; GFX9-NEXT: s_nop 1 2906; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 2907; GFX9-NEXT: v_readlane_b32 s4, v2, 63 2908; GFX9-NEXT: s_nop 0 2909; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 2910; GFX9-NEXT: s_mov_b64 exec, s[2:3] 2911; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2912; GFX9-NEXT: ; implicit-def: $vgpr0 2913; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 2914; GFX9-NEXT: s_cbranch_execz .LBB16_2 2915; GFX9-NEXT: ; %bb.1: 2916; GFX9-NEXT: v_mov_b32_e32 v0, 0 2917; GFX9-NEXT: v_mov_b32_e32 v3, s4 2918; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2919; GFX9-NEXT: ds_xor_rtn_b32 v0, v0, v3 2920; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2921; GFX9-NEXT: .LBB16_2: 2922; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 2923; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2924; GFX9-NEXT: 
v_readfirstlane_b32 s2, v0 2925; GFX9-NEXT: v_mov_b32_e32 v0, v1 2926; GFX9-NEXT: v_xor_b32_e32 v0, s2, v0 2927; GFX9-NEXT: s_mov_b32 s3, 0xf000 2928; GFX9-NEXT: s_mov_b32 s2, -1 2929; GFX9-NEXT: s_nop 0 2930; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 2931; GFX9-NEXT: s_endpgm 2932; 2933; GFX1064-LABEL: xor_i32_varying: 2934; GFX1064: ; %bb.0: ; %entry 2935; GFX1064-NEXT: v_mov_b32_e32 v1, v0 2936; GFX1064-NEXT: s_not_b64 exec, exec 2937; GFX1064-NEXT: v_mov_b32_e32 v1, 0 2938; GFX1064-NEXT: s_not_b64 exec, exec 2939; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 2940; GFX1064-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 2941; GFX1064-NEXT: v_mov_b32_e32 v3, 0 2942; GFX1064-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 2943; GFX1064-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 2944; GFX1064-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 2945; GFX1064-NEXT: v_mov_b32_e32 v2, v1 2946; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 2947; GFX1064-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 2948; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 2949; GFX1064-NEXT: v_mov_b32_e32 v2, s4 2950; GFX1064-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 2951; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 2952; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 2953; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 2954; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2955; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 2956; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 2957; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 2958; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 2959; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 2960; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 2961; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 2962; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 2963; GFX1064-NEXT: v_writelane_b32 
v3, s5, 32 2964; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 2965; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 2966; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 2967; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 2968; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 2969; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2970; GFX1064-NEXT: s_mov_b32 s2, -1 2971; GFX1064-NEXT: ; implicit-def: $vgpr0 2972; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 2973; GFX1064-NEXT: s_cbranch_execz .LBB16_2 2974; GFX1064-NEXT: ; %bb.1: 2975; GFX1064-NEXT: v_mov_b32_e32 v0, 0 2976; GFX1064-NEXT: v_mov_b32_e32 v4, s7 2977; GFX1064-NEXT: s_mov_b32 s3, s7 2978; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2979; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 2980; GFX1064-NEXT: ds_xor_rtn_b32 v0, v0, v4 2981; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2982; GFX1064-NEXT: buffer_gl0_inv 2983; GFX1064-NEXT: .LBB16_2: 2984; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 2985; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 2986; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 2987; GFX1064-NEXT: v_mov_b32_e32 v0, v3 2988; GFX1064-NEXT: v_xor_b32_e32 v0, s3, v0 2989; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 2990; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2991; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 2992; GFX1064-NEXT: s_endpgm 2993; 2994; GFX1032-LABEL: xor_i32_varying: 2995; GFX1032: ; %bb.0: ; %entry 2996; GFX1032-NEXT: v_mov_b32_e32 v1, v0 2997; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 2998; GFX1032-NEXT: v_mov_b32_e32 v1, 0 2999; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 3000; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3001; GFX1032-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 3002; GFX1032-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 3003; GFX1032-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 3004; GFX1032-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 3005; GFX1032-NEXT: v_mov_b32_e32 v2, v1 3006; 
GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 3007; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3008; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3009; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3010; GFX1032-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 3011; GFX1032-NEXT: v_mov_b32_e32 v3, 0 3012; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 3013; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 3014; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 3015; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3016; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3017; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3018; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 3019; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3020; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 3021; GFX1032-NEXT: s_mov_b32 s2, -1 3022; GFX1032-NEXT: ; implicit-def: $vgpr0 3023; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 3024; GFX1032-NEXT: s_cbranch_execz .LBB16_2 3025; GFX1032-NEXT: ; %bb.1: 3026; GFX1032-NEXT: v_mov_b32_e32 v0, 0 3027; GFX1032-NEXT: v_mov_b32_e32 v4, s4 3028; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3029; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 3030; GFX1032-NEXT: ds_xor_rtn_b32 v0, v0, v4 3031; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3032; GFX1032-NEXT: buffer_gl0_inv 3033; GFX1032-NEXT: .LBB16_2: 3034; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 3035; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 3036; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 3037; GFX1032-NEXT: v_mov_b32_e32 v0, v3 3038; GFX1032-NEXT: v_xor_b32_e32 v0, s3, v0 3039; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 3040; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3041; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 3042; GFX1032-NEXT: s_endpgm 3043entry: 3044 %lane = call i32 @llvm.amdgcn.workitem.id.x() 3045 %old = atomicrmw xor i32 addrspace(3)* @local_var32, i32 %lane acq_rel 3046 store i32 %old, i32 addrspace(1)* %out 3047 ret void 3048} 3049 3050define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) { 3051; 3052; 
3053; GFX7LESS-LABEL: max_i32_varying: 3054; GFX7LESS: ; %bb.0: ; %entry 3055; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 3056; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 3057; GFX7LESS-NEXT: s_mov_b32 m0, -1 3058; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3059; GFX7LESS-NEXT: ds_max_rtn_i32 v0, v1, v0 3060; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3061; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 3062; GFX7LESS-NEXT: s_mov_b32 s2, -1 3063; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 3064; GFX7LESS-NEXT: s_endpgm 3065; 3066; GFX8-LABEL: max_i32_varying: 3067; GFX8: ; %bb.0: ; %entry 3068; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3069; GFX8-NEXT: v_mov_b32_e32 v2, v0 3070; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3071; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 3072; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 3073; GFX8-NEXT: v_bfrev_b32_e32 v1, 1 3074; GFX8-NEXT: s_mov_b64 exec, s[2:3] 3075; GFX8-NEXT: s_not_b64 exec, exec 3076; GFX8-NEXT: v_mov_b32_e32 v2, v1 3077; GFX8-NEXT: s_not_b64 exec, exec 3078; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 3079; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3080; GFX8-NEXT: s_nop 1 3081; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 3082; GFX8-NEXT: s_nop 1 3083; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 3084; GFX8-NEXT: s_nop 1 3085; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 3086; GFX8-NEXT: s_nop 1 3087; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 3088; GFX8-NEXT: s_nop 1 3089; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 3090; GFX8-NEXT: v_readlane_b32 s4, v2, 63 3091; GFX8-NEXT: s_nop 0 3092; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 3093; GFX8-NEXT: s_mov_b64 exec, s[2:3] 3094; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3095; GFX8-NEXT: ; implicit-def: $vgpr0 3096; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 3097; GFX8-NEXT: 
s_cbranch_execz .LBB17_2 3098; GFX8-NEXT: ; %bb.1: 3099; GFX8-NEXT: v_mov_b32_e32 v0, 0 3100; GFX8-NEXT: v_mov_b32_e32 v3, s4 3101; GFX8-NEXT: s_mov_b32 m0, -1 3102; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3103; GFX8-NEXT: ds_max_rtn_i32 v0, v0, v3 3104; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3105; GFX8-NEXT: .LBB17_2: 3106; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 3107; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3108; GFX8-NEXT: v_readfirstlane_b32 s2, v0 3109; GFX8-NEXT: v_mov_b32_e32 v0, v1 3110; GFX8-NEXT: v_max_i32_e32 v0, s2, v0 3111; GFX8-NEXT: s_mov_b32 s3, 0xf000 3112; GFX8-NEXT: s_mov_b32 s2, -1 3113; GFX8-NEXT: s_nop 0 3114; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 3115; GFX8-NEXT: s_endpgm 3116; 3117; GFX9-LABEL: max_i32_varying: 3118; GFX9: ; %bb.0: ; %entry 3119; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3120; GFX9-NEXT: v_mov_b32_e32 v2, v0 3121; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3122; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 3123; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 3124; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 3125; GFX9-NEXT: s_mov_b64 exec, s[2:3] 3126; GFX9-NEXT: s_not_b64 exec, exec 3127; GFX9-NEXT: v_mov_b32_e32 v2, v1 3128; GFX9-NEXT: s_not_b64 exec, exec 3129; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 3130; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3131; GFX9-NEXT: s_nop 1 3132; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 3133; GFX9-NEXT: s_nop 1 3134; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 3135; GFX9-NEXT: s_nop 1 3136; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 3137; GFX9-NEXT: s_nop 1 3138; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 3139; GFX9-NEXT: s_nop 1 3140; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 3141; GFX9-NEXT: v_readlane_b32 s4, v2, 63 3142; GFX9-NEXT: s_nop 0 3143; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 3144; 
GFX9-NEXT: s_mov_b64 exec, s[2:3] 3145; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3146; GFX9-NEXT: ; implicit-def: $vgpr0 3147; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 3148; GFX9-NEXT: s_cbranch_execz .LBB17_2 3149; GFX9-NEXT: ; %bb.1: 3150; GFX9-NEXT: v_mov_b32_e32 v0, 0 3151; GFX9-NEXT: v_mov_b32_e32 v3, s4 3152; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3153; GFX9-NEXT: ds_max_rtn_i32 v0, v0, v3 3154; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3155; GFX9-NEXT: .LBB17_2: 3156; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 3157; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3158; GFX9-NEXT: v_readfirstlane_b32 s2, v0 3159; GFX9-NEXT: v_mov_b32_e32 v0, v1 3160; GFX9-NEXT: v_max_i32_e32 v0, s2, v0 3161; GFX9-NEXT: s_mov_b32 s3, 0xf000 3162; GFX9-NEXT: s_mov_b32 s2, -1 3163; GFX9-NEXT: s_nop 0 3164; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 3165; GFX9-NEXT: s_endpgm 3166; 3167; GFX1064-LABEL: max_i32_varying: 3168; GFX1064: ; %bb.0: ; %entry 3169; GFX1064-NEXT: v_mov_b32_e32 v2, v0 3170; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 3171; GFX1064-NEXT: v_bfrev_b32_e32 v1, 1 3172; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 3173; GFX1064-NEXT: s_not_b64 exec, exec 3174; GFX1064-NEXT: v_mov_b32_e32 v2, v1 3175; GFX1064-NEXT: s_not_b64 exec, exec 3176; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 3177; GFX1064-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3178; GFX1064-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 3179; GFX1064-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 3180; GFX1064-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 3181; GFX1064-NEXT: v_mov_b32_e32 v3, v2 3182; GFX1064-NEXT: v_permlanex16_b32 v3, v3, -1, -1 3183; GFX1064-NEXT: v_max_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 3184; GFX1064-NEXT: v_readlane_b32 s4, v2, 31 3185; GFX1064-NEXT: v_mov_b32_e32 v3, s4 3186; GFX1064-NEXT: v_max_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 3187; GFX1064-NEXT: v_readlane_b32 s4, 
v2, 15 3188; GFX1064-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3189; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 3190; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3191; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 3192; GFX1064-NEXT: v_readlane_b32 s5, v2, 31 3193; GFX1064-NEXT: v_writelane_b32 v1, s4, 16 3194; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 3195; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3196; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 3197; GFX1064-NEXT: v_readlane_b32 s7, v2, 63 3198; GFX1064-NEXT: v_readlane_b32 s6, v2, 47 3199; GFX1064-NEXT: v_writelane_b32 v1, s5, 32 3200; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 3201; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 3202; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 3203; GFX1064-NEXT: v_writelane_b32 v1, s6, 48 3204; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 3205; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3206; GFX1064-NEXT: s_mov_b32 s2, -1 3207; GFX1064-NEXT: ; implicit-def: $vgpr0 3208; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 3209; GFX1064-NEXT: s_cbranch_execz .LBB17_2 3210; GFX1064-NEXT: ; %bb.1: 3211; GFX1064-NEXT: v_mov_b32_e32 v0, 0 3212; GFX1064-NEXT: v_mov_b32_e32 v4, s7 3213; GFX1064-NEXT: s_mov_b32 s3, s7 3214; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3215; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 3216; GFX1064-NEXT: ds_max_rtn_i32 v0, v0, v4 3217; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3218; GFX1064-NEXT: buffer_gl0_inv 3219; GFX1064-NEXT: .LBB17_2: 3220; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 3221; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 3222; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 3223; GFX1064-NEXT: v_mov_b32_e32 v0, v1 3224; GFX1064-NEXT: v_max_i32_e32 v0, s3, v0 3225; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 3226; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3227; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 3228; GFX1064-NEXT: s_endpgm 3229; 3230; GFX1032-LABEL: max_i32_varying: 3231; GFX1032: ; %bb.0: ; %entry 3232; GFX1032-NEXT: v_mov_b32_e32 v2, v0 3233; 
GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3234; GFX1032-NEXT: v_bfrev_b32_e32 v1, 1 3235; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3236; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 3237; GFX1032-NEXT: v_mov_b32_e32 v2, v1 3238; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 3239; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3240; GFX1032-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3241; GFX1032-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 3242; GFX1032-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 3243; GFX1032-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 3244; GFX1032-NEXT: v_mov_b32_e32 v3, v2 3245; GFX1032-NEXT: v_permlanex16_b32 v3, v3, -1, -1 3246; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3247; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3248; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3249; GFX1032-NEXT: v_max_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 3250; GFX1032-NEXT: v_readlane_b32 s3, v2, 15 3251; GFX1032-NEXT: v_readlane_b32 s4, v2, 31 3252; GFX1032-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3253; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3254; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3255; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3256; GFX1032-NEXT: v_writelane_b32 v1, s3, 16 3257; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3258; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 3259; GFX1032-NEXT: s_mov_b32 s2, -1 3260; GFX1032-NEXT: ; implicit-def: $vgpr0 3261; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 3262; GFX1032-NEXT: s_cbranch_execz .LBB17_2 3263; GFX1032-NEXT: ; %bb.1: 3264; GFX1032-NEXT: v_mov_b32_e32 v0, 0 3265; GFX1032-NEXT: v_mov_b32_e32 v4, s4 3266; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3267; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 3268; GFX1032-NEXT: ds_max_rtn_i32 v0, v0, v4 3269; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3270; GFX1032-NEXT: buffer_gl0_inv 3271; GFX1032-NEXT: .LBB17_2: 3272; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 3273; 
GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 3274; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 3275; GFX1032-NEXT: v_mov_b32_e32 v0, v1 3276; GFX1032-NEXT: v_max_i32_e32 v0, s3, v0 3277; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 3278; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3279; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 3280; GFX1032-NEXT: s_endpgm 3281entry: 3282 %lane = call i32 @llvm.amdgcn.workitem.id.x() 3283 %old = atomicrmw max i32 addrspace(3)* @local_var32, i32 %lane acq_rel 3284 store i32 %old, i32 addrspace(1)* %out 3285 ret void 3286} 3287 3288define amdgpu_kernel void @max_i64_constant(i64 addrspace(1)* %out) { 3289; 3290; 3291; GFX7LESS-LABEL: max_i64_constant: 3292; GFX7LESS: ; %bb.0: ; %entry 3293; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 3294; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 3295; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 3296; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3297; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 3298; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc 3299; GFX7LESS-NEXT: s_cbranch_execz .LBB18_2 3300; GFX7LESS-NEXT: ; %bb.1: 3301; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0 3302; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5 3303; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 3304; GFX7LESS-NEXT: s_mov_b32 m0, -1 3305; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3306; GFX7LESS-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] 3307; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3308; GFX7LESS-NEXT: .LBB18_2: 3309; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] 3310; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3311; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 3312; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 3313; GFX7LESS-NEXT: v_bfrev_b32_e32 v1, 1 3314; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 3315; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 3316; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc 3317; GFX7LESS-NEXT: v_mov_b32_e32 v2, s5 3318; GFX7LESS-NEXT: v_mov_b32_e32 v3, s4 3319; GFX7LESS-NEXT: v_cmp_gt_i64_e32 vcc, s[4:5], v[0:1] 3320; GFX7LESS-NEXT: 
v_cndmask_b32_e32 v1, v1, v2, vcc 3321; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 3322; GFX7LESS-NEXT: s_mov_b32 s2, -1 3323; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 3324; GFX7LESS-NEXT: s_endpgm 3325; 3326; GFX8-LABEL: max_i64_constant: 3327; GFX8: ; %bb.0: ; %entry 3328; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3329; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3330; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 3331; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3332; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 3333; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 3334; GFX8-NEXT: s_cbranch_execz .LBB18_2 3335; GFX8-NEXT: ; %bb.1: 3336; GFX8-NEXT: v_mov_b32_e32 v0, 5 3337; GFX8-NEXT: v_mov_b32_e32 v2, 0 3338; GFX8-NEXT: v_mov_b32_e32 v1, 0 3339; GFX8-NEXT: s_mov_b32 m0, -1 3340; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3341; GFX8-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] 3342; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3343; GFX8-NEXT: .LBB18_2: 3344; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 3345; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3346; GFX8-NEXT: v_readfirstlane_b32 s2, v0 3347; GFX8-NEXT: v_bfrev_b32_e32 v0, 1 3348; GFX8-NEXT: v_readfirstlane_b32 s3, v1 3349; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc 3350; GFX8-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 3351; GFX8-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1] 3352; GFX8-NEXT: v_mov_b32_e32 v2, s3 3353; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 3354; GFX8-NEXT: v_mov_b32_e32 v2, s2 3355; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 3356; GFX8-NEXT: s_mov_b32 s3, 0xf000 3357; GFX8-NEXT: s_mov_b32 s2, -1 3358; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 3359; GFX8-NEXT: s_endpgm 3360; 3361; GFX9-LABEL: max_i64_constant: 3362; GFX9: ; %bb.0: ; %entry 3363; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3364; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3365; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 3366; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3367; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 3368; GFX9-NEXT: 
s_and_saveexec_b64 s[2:3], vcc 3369; GFX9-NEXT: s_cbranch_execz .LBB18_2 3370; GFX9-NEXT: ; %bb.1: 3371; GFX9-NEXT: v_mov_b32_e32 v0, 5 3372; GFX9-NEXT: v_mov_b32_e32 v1, 0 3373; GFX9-NEXT: v_mov_b32_e32 v2, 0 3374; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3375; GFX9-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] 3376; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3377; GFX9-NEXT: .LBB18_2: 3378; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 3379; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3380; GFX9-NEXT: v_readfirstlane_b32 s2, v0 3381; GFX9-NEXT: v_bfrev_b32_e32 v0, 1 3382; GFX9-NEXT: v_readfirstlane_b32 s3, v1 3383; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc 3384; GFX9-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 3385; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1] 3386; GFX9-NEXT: v_mov_b32_e32 v2, s3 3387; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 3388; GFX9-NEXT: v_mov_b32_e32 v2, s2 3389; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 3390; GFX9-NEXT: s_mov_b32 s3, 0xf000 3391; GFX9-NEXT: s_mov_b32 s2, -1 3392; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 3393; GFX9-NEXT: s_endpgm 3394; 3395; GFX1064-LABEL: max_i64_constant: 3396; GFX1064: ; %bb.0: ; %entry 3397; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3398; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3399; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 3400; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3401; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 3402; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc 3403; GFX1064-NEXT: s_cbranch_execz .LBB18_2 3404; GFX1064-NEXT: ; %bb.1: 3405; GFX1064-NEXT: v_mov_b32_e32 v0, 5 3406; GFX1064-NEXT: v_mov_b32_e32 v1, 0 3407; GFX1064-NEXT: v_mov_b32_e32 v2, 0 3408; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3409; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 3410; GFX1064-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] 3411; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3412; GFX1064-NEXT: buffer_gl0_inv 3413; GFX1064-NEXT: .LBB18_2: 3414; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 3415; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] 
3416; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 3417; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 3418; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc 3419; GFX1064-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 3420; GFX1064-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1] 3421; GFX1064-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc 3422; GFX1064-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc 3423; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 3424; GFX1064-NEXT: s_mov_b32 s2, -1 3425; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3426; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 3427; GFX1064-NEXT: s_endpgm 3428; 3429; GFX1032-LABEL: max_i64_constant: 3430; GFX1032: ; %bb.0: ; %entry 3431; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3432; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3433; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 3434; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 3435; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo 3436; GFX1032-NEXT: s_cbranch_execz .LBB18_2 3437; GFX1032-NEXT: ; %bb.1: 3438; GFX1032-NEXT: v_mov_b32_e32 v0, 5 3439; GFX1032-NEXT: v_mov_b32_e32 v1, 0 3440; GFX1032-NEXT: v_mov_b32_e32 v2, 0 3441; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3442; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 3443; GFX1032-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] 3444; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3445; GFX1032-NEXT: buffer_gl0_inv 3446; GFX1032-NEXT: .LBB18_2: 3447; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 3448; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 3449; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 3450; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 3451; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc_lo 3452; GFX1032-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc_lo 3453; GFX1032-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[2:3], v[0:1] 3454; GFX1032-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo 3455; GFX1032-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo 3456; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 3457; GFX1032-NEXT: s_mov_b32 s2, -1 3458; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3459; GFX1032-NEXT: 
buffer_store_dwordx2 v[0:1], off, s[0:3], 0 3460; GFX1032-NEXT: s_endpgm 3461entry: 3462 %old = atomicrmw max i64 addrspace(3)* @local_var64, i64 5 acq_rel 3463 store i64 %old, i64 addrspace(1)* %out 3464 ret void 3465} 3466 3467define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) { 3468; 3469; 3470; GFX7LESS-LABEL: min_i32_varying: 3471; GFX7LESS: ; %bb.0: ; %entry 3472; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 3473; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 3474; GFX7LESS-NEXT: s_mov_b32 m0, -1 3475; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3476; GFX7LESS-NEXT: ds_min_rtn_i32 v0, v1, v0 3477; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3478; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 3479; GFX7LESS-NEXT: s_mov_b32 s2, -1 3480; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 3481; GFX7LESS-NEXT: s_endpgm 3482; 3483; GFX8-LABEL: min_i32_varying: 3484; GFX8: ; %bb.0: ; %entry 3485; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3486; GFX8-NEXT: v_mov_b32_e32 v2, v0 3487; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3488; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 3489; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 3490; GFX8-NEXT: v_bfrev_b32_e32 v1, -2 3491; GFX8-NEXT: s_mov_b64 exec, s[2:3] 3492; GFX8-NEXT: s_not_b64 exec, exec 3493; GFX8-NEXT: v_mov_b32_e32 v2, v1 3494; GFX8-NEXT: s_not_b64 exec, exec 3495; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 3496; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3497; GFX8-NEXT: s_nop 1 3498; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 3499; GFX8-NEXT: s_nop 1 3500; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 3501; GFX8-NEXT: s_nop 1 3502; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 3503; GFX8-NEXT: s_nop 1 3504; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 3505; GFX8-NEXT: s_nop 1 3506; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 3507; GFX8-NEXT: 
v_readlane_b32 s4, v2, 63 3508; GFX8-NEXT: s_nop 0 3509; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 3510; GFX8-NEXT: s_mov_b64 exec, s[2:3] 3511; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3512; GFX8-NEXT: ; implicit-def: $vgpr0 3513; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 3514; GFX8-NEXT: s_cbranch_execz .LBB19_2 3515; GFX8-NEXT: ; %bb.1: 3516; GFX8-NEXT: v_mov_b32_e32 v0, 0 3517; GFX8-NEXT: v_mov_b32_e32 v3, s4 3518; GFX8-NEXT: s_mov_b32 m0, -1 3519; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3520; GFX8-NEXT: ds_min_rtn_i32 v0, v0, v3 3521; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3522; GFX8-NEXT: .LBB19_2: 3523; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 3524; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3525; GFX8-NEXT: v_readfirstlane_b32 s2, v0 3526; GFX8-NEXT: v_mov_b32_e32 v0, v1 3527; GFX8-NEXT: v_min_i32_e32 v0, s2, v0 3528; GFX8-NEXT: s_mov_b32 s3, 0xf000 3529; GFX8-NEXT: s_mov_b32 s2, -1 3530; GFX8-NEXT: s_nop 0 3531; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 3532; GFX8-NEXT: s_endpgm 3533; 3534; GFX9-LABEL: min_i32_varying: 3535; GFX9: ; %bb.0: ; %entry 3536; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3537; GFX9-NEXT: v_mov_b32_e32 v2, v0 3538; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3539; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 3540; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 3541; GFX9-NEXT: v_bfrev_b32_e32 v1, -2 3542; GFX9-NEXT: s_mov_b64 exec, s[2:3] 3543; GFX9-NEXT: s_not_b64 exec, exec 3544; GFX9-NEXT: v_mov_b32_e32 v2, v1 3545; GFX9-NEXT: s_not_b64 exec, exec 3546; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 3547; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3548; GFX9-NEXT: s_nop 1 3549; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 3550; GFX9-NEXT: s_nop 1 3551; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 3552; GFX9-NEXT: s_nop 1 3553; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 3554; GFX9-NEXT: s_nop 1 3555; GFX9-NEXT: 
v_min_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 3556; GFX9-NEXT: s_nop 1 3557; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 3558; GFX9-NEXT: v_readlane_b32 s4, v2, 63 3559; GFX9-NEXT: s_nop 0 3560; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 3561; GFX9-NEXT: s_mov_b64 exec, s[2:3] 3562; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3563; GFX9-NEXT: ; implicit-def: $vgpr0 3564; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 3565; GFX9-NEXT: s_cbranch_execz .LBB19_2 3566; GFX9-NEXT: ; %bb.1: 3567; GFX9-NEXT: v_mov_b32_e32 v0, 0 3568; GFX9-NEXT: v_mov_b32_e32 v3, s4 3569; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3570; GFX9-NEXT: ds_min_rtn_i32 v0, v0, v3 3571; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3572; GFX9-NEXT: .LBB19_2: 3573; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 3574; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3575; GFX9-NEXT: v_readfirstlane_b32 s2, v0 3576; GFX9-NEXT: v_mov_b32_e32 v0, v1 3577; GFX9-NEXT: v_min_i32_e32 v0, s2, v0 3578; GFX9-NEXT: s_mov_b32 s3, 0xf000 3579; GFX9-NEXT: s_mov_b32 s2, -1 3580; GFX9-NEXT: s_nop 0 3581; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 3582; GFX9-NEXT: s_endpgm 3583; 3584; GFX1064-LABEL: min_i32_varying: 3585; GFX1064: ; %bb.0: ; %entry 3586; GFX1064-NEXT: v_mov_b32_e32 v2, v0 3587; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 3588; GFX1064-NEXT: v_bfrev_b32_e32 v1, -2 3589; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 3590; GFX1064-NEXT: s_not_b64 exec, exec 3591; GFX1064-NEXT: v_mov_b32_e32 v2, v1 3592; GFX1064-NEXT: s_not_b64 exec, exec 3593; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 3594; GFX1064-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3595; GFX1064-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 3596; GFX1064-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 3597; GFX1064-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 3598; GFX1064-NEXT: v_mov_b32_e32 v3, v2 3599; GFX1064-NEXT: v_permlanex16_b32 v3, 
v3, -1, -1 3600; GFX1064-NEXT: v_min_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 3601; GFX1064-NEXT: v_readlane_b32 s4, v2, 31 3602; GFX1064-NEXT: v_mov_b32_e32 v3, s4 3603; GFX1064-NEXT: v_min_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 3604; GFX1064-NEXT: v_readlane_b32 s4, v2, 15 3605; GFX1064-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3606; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 3607; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3608; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 3609; GFX1064-NEXT: v_readlane_b32 s5, v2, 31 3610; GFX1064-NEXT: v_writelane_b32 v1, s4, 16 3611; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 3612; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3613; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 3614; GFX1064-NEXT: v_readlane_b32 s7, v2, 63 3615; GFX1064-NEXT: v_readlane_b32 s6, v2, 47 3616; GFX1064-NEXT: v_writelane_b32 v1, s5, 32 3617; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 3618; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 3619; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 3620; GFX1064-NEXT: v_writelane_b32 v1, s6, 48 3621; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 3622; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3623; GFX1064-NEXT: s_mov_b32 s2, -1 3624; GFX1064-NEXT: ; implicit-def: $vgpr0 3625; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 3626; GFX1064-NEXT: s_cbranch_execz .LBB19_2 3627; GFX1064-NEXT: ; %bb.1: 3628; GFX1064-NEXT: v_mov_b32_e32 v0, 0 3629; GFX1064-NEXT: v_mov_b32_e32 v4, s7 3630; GFX1064-NEXT: s_mov_b32 s3, s7 3631; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3632; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 3633; GFX1064-NEXT: ds_min_rtn_i32 v0, v0, v4 3634; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3635; GFX1064-NEXT: buffer_gl0_inv 3636; GFX1064-NEXT: .LBB19_2: 3637; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 3638; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 3639; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 3640; GFX1064-NEXT: v_mov_b32_e32 v0, v1 3641; GFX1064-NEXT: 
v_min_i32_e32 v0, s3, v0 3642; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 3643; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3644; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 3645; GFX1064-NEXT: s_endpgm 3646; 3647; GFX1032-LABEL: min_i32_varying: 3648; GFX1032: ; %bb.0: ; %entry 3649; GFX1032-NEXT: v_mov_b32_e32 v2, v0 3650; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3651; GFX1032-NEXT: v_bfrev_b32_e32 v1, -2 3652; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3653; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 3654; GFX1032-NEXT: v_mov_b32_e32 v2, v1 3655; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 3656; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3657; GFX1032-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3658; GFX1032-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 3659; GFX1032-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 3660; GFX1032-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 3661; GFX1032-NEXT: v_mov_b32_e32 v3, v2 3662; GFX1032-NEXT: v_permlanex16_b32 v3, v3, -1, -1 3663; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3664; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3665; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3666; GFX1032-NEXT: v_min_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 3667; GFX1032-NEXT: v_readlane_b32 s3, v2, 15 3668; GFX1032-NEXT: v_readlane_b32 s4, v2, 31 3669; GFX1032-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3670; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3671; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3672; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3673; GFX1032-NEXT: v_writelane_b32 v1, s3, 16 3674; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3675; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 3676; GFX1032-NEXT: s_mov_b32 s2, -1 3677; GFX1032-NEXT: ; implicit-def: $vgpr0 3678; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 3679; GFX1032-NEXT: s_cbranch_execz .LBB19_2 3680; GFX1032-NEXT: ; %bb.1: 3681; GFX1032-NEXT: v_mov_b32_e32 v0, 0 3682; GFX1032-NEXT: 
v_mov_b32_e32 v4, s4 3683; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3684; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 3685; GFX1032-NEXT: ds_min_rtn_i32 v0, v0, v4 3686; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3687; GFX1032-NEXT: buffer_gl0_inv 3688; GFX1032-NEXT: .LBB19_2: 3689; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 3690; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 3691; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 3692; GFX1032-NEXT: v_mov_b32_e32 v0, v1 3693; GFX1032-NEXT: v_min_i32_e32 v0, s3, v0 3694; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 3695; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3696; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 3697; GFX1032-NEXT: s_endpgm 3698entry: 3699 %lane = call i32 @llvm.amdgcn.workitem.id.x() 3700 %old = atomicrmw min i32 addrspace(3)* @local_var32, i32 %lane acq_rel 3701 store i32 %old, i32 addrspace(1)* %out 3702 ret void 3703} 3704 3705define amdgpu_kernel void @min_i64_constant(i64 addrspace(1)* %out) { 3706; 3707; 3708; GFX7LESS-LABEL: min_i64_constant: 3709; GFX7LESS: ; %bb.0: ; %entry 3710; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 3711; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 3712; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 3713; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3714; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 3715; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc 3716; GFX7LESS-NEXT: s_cbranch_execz .LBB20_2 3717; GFX7LESS-NEXT: ; %bb.1: 3718; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0 3719; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5 3720; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 3721; GFX7LESS-NEXT: s_mov_b32 m0, -1 3722; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3723; GFX7LESS-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] 3724; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3725; GFX7LESS-NEXT: .LBB20_2: 3726; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] 3727; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3728; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 3729; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 3730; GFX7LESS-NEXT: v_bfrev_b32_e32 v1, -2 
3731; GFX7LESS-NEXT: s_mov_b32 s2, -1 3732; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 3733; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc 3734; GFX7LESS-NEXT: v_mov_b32_e32 v2, s5 3735; GFX7LESS-NEXT: v_mov_b32_e32 v3, s4 3736; GFX7LESS-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] 3737; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 3738; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 3739; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 3740; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 3741; GFX7LESS-NEXT: s_endpgm 3742; 3743; GFX8-LABEL: min_i64_constant: 3744; GFX8: ; %bb.0: ; %entry 3745; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3746; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3747; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 3748; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3749; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 3750; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 3751; GFX8-NEXT: s_cbranch_execz .LBB20_2 3752; GFX8-NEXT: ; %bb.1: 3753; GFX8-NEXT: v_mov_b32_e32 v0, 5 3754; GFX8-NEXT: v_mov_b32_e32 v2, 0 3755; GFX8-NEXT: v_mov_b32_e32 v1, 0 3756; GFX8-NEXT: s_mov_b32 m0, -1 3757; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3758; GFX8-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] 3759; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3760; GFX8-NEXT: .LBB20_2: 3761; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 3762; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3763; GFX8-NEXT: v_readfirstlane_b32 s4, v0 3764; GFX8-NEXT: v_bfrev_b32_e32 v0, -2 3765; GFX8-NEXT: v_readfirstlane_b32 s5, v1 3766; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc 3767; GFX8-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 3768; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] 3769; GFX8-NEXT: v_mov_b32_e32 v2, s5 3770; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 3771; GFX8-NEXT: v_mov_b32_e32 v2, s4 3772; GFX8-NEXT: s_mov_b32 s2, -1 3773; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 3774; GFX8-NEXT: s_mov_b32 s3, 0xf000 3775; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 3776; GFX8-NEXT: s_endpgm 3777; 3778; GFX9-LABEL: 
min_i64_constant: 3779; GFX9: ; %bb.0: ; %entry 3780; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3781; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3782; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 3783; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3784; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 3785; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 3786; GFX9-NEXT: s_cbranch_execz .LBB20_2 3787; GFX9-NEXT: ; %bb.1: 3788; GFX9-NEXT: v_mov_b32_e32 v0, 5 3789; GFX9-NEXT: v_mov_b32_e32 v1, 0 3790; GFX9-NEXT: v_mov_b32_e32 v2, 0 3791; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3792; GFX9-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] 3793; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3794; GFX9-NEXT: .LBB20_2: 3795; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 3796; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3797; GFX9-NEXT: v_readfirstlane_b32 s4, v0 3798; GFX9-NEXT: v_bfrev_b32_e32 v0, -2 3799; GFX9-NEXT: v_readfirstlane_b32 s5, v1 3800; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc 3801; GFX9-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 3802; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] 3803; GFX9-NEXT: v_mov_b32_e32 v2, s5 3804; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 3805; GFX9-NEXT: v_mov_b32_e32 v2, s4 3806; GFX9-NEXT: s_mov_b32 s2, -1 3807; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 3808; GFX9-NEXT: s_mov_b32 s3, 0xf000 3809; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 3810; GFX9-NEXT: s_endpgm 3811; 3812; GFX1064-LABEL: min_i64_constant: 3813; GFX1064: ; %bb.0: ; %entry 3814; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3815; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3816; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 3817; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3818; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 3819; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc 3820; GFX1064-NEXT: s_cbranch_execz .LBB20_2 3821; GFX1064-NEXT: ; %bb.1: 3822; GFX1064-NEXT: v_mov_b32_e32 v0, 5 3823; GFX1064-NEXT: v_mov_b32_e32 v1, 0 3824; GFX1064-NEXT: v_mov_b32_e32 v2, 0 3825; GFX1064-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) 3826; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 3827; GFX1064-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] 3828; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3829; GFX1064-NEXT: buffer_gl0_inv 3830; GFX1064-NEXT: .LBB20_2: 3831; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 3832; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] 3833; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 3834; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 3835; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc 3836; GFX1064-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 3837; GFX1064-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1] 3838; GFX1064-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc 3839; GFX1064-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc 3840; GFX1064-NEXT: s_mov_b32 s2, -1 3841; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 3842; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3843; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 3844; GFX1064-NEXT: s_endpgm 3845; 3846; GFX1032-LABEL: min_i64_constant: 3847; GFX1032: ; %bb.0: ; %entry 3848; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3849; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3850; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 3851; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 3852; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo 3853; GFX1032-NEXT: s_cbranch_execz .LBB20_2 3854; GFX1032-NEXT: ; %bb.1: 3855; GFX1032-NEXT: v_mov_b32_e32 v0, 5 3856; GFX1032-NEXT: v_mov_b32_e32 v1, 0 3857; GFX1032-NEXT: v_mov_b32_e32 v2, 0 3858; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3859; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 3860; GFX1032-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] 3861; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3862; GFX1032-NEXT: buffer_gl0_inv 3863; GFX1032-NEXT: .LBB20_2: 3864; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 3865; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 3866; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 3867; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 3868; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc_lo 3869; GFX1032-NEXT: v_cndmask_b32_e64 v0, 5, -1, 
vcc_lo
; GFX1032-NEXT:    v_cmp_lt_i64_e32 vcc_lo, s[2:3], v[0:1]
; GFX1032-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc_lo
; GFX1032-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc_lo
; GFX1032-NEXT:    s_mov_b32 s2, -1
; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX1032-NEXT:    s_endpgm
entry:
  %old = atomicrmw min i64 addrspace(3)* @local_var64, i64 5 acq_rel
  store i64 %old, i64 addrspace(1)* %out
  ret void
}

; Unsigned-max atomicrmw on @local_var32 (addrspace(3), i32) whose operand is
; the workitem id, i.e. a value that differs per lane. The old value returned
; by the atomic is stored to %out.
; Expected code, as pinned by the checks below:
;  * GFX7LESS run: a plain ds_max_rtn_u32 using the lane's own value, with no
;    cross-lane reduction.
;  * GFX8 / GFX9 runs: inactive lanes are seeded with 0, a DPP row_shr /
;    row_bcast scan combines the lanes with v_max_u32, and one ds_max_rtn_u32
;    is issued under s_and_saveexec on "mbcnt == 0". The returned value is
;    broadcast with v_readfirstlane and combined with each lane's scan result.
;  * GFX10 runs (wave64 and wave32): same shape, but the cross-row steps use
;    v_permlanex16 / v_readlane / v_writelane instead of row_bcast.
define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) {
;
;
; GFX7LESS-LABEL: umax_i32_varying:
; GFX7LESS:       ; %bb.0: ; %entry
; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
; GFX7LESS-NEXT:    s_mov_b32 m0, -1
; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT:    ds_max_rtn_u32 v0, v1, v0
; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT:    s_mov_b32 s2, -1
; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX7LESS-NEXT:    s_endpgm
;
; GFX8-LABEL: umax_i32_varying:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT:    v_mov_b32_e32 v2, v0
; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
; GFX8-NEXT:    v_mov_b32_e32 v1, 0
; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX8-NEXT:    s_not_b64 exec, exec
; GFX8-NEXT:    v_mov_b32_e32 v2, 0
; GFX8-NEXT:    s_not_b64 exec, exec
; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
; GFX8-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX8-NEXT:    s_nop 1
; GFX8-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX8-NEXT:    s_nop 1
; GFX8-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX8-NEXT:    s_nop 1
; GFX8-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX8-NEXT:    s_nop 1
; GFX8-NEXT:    v_max_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
; GFX8-NEXT:    s_nop 1
; GFX8-NEXT:    v_max_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
; GFX8-NEXT:    v_readlane_b32 s4, v2, 63
; GFX8-NEXT:    s_nop 0
; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX8-NEXT:    ; implicit-def: $vgpr0
; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
; GFX8-NEXT:    s_cbranch_execz .LBB21_2
; GFX8-NEXT:  ; %bb.1:
; GFX8-NEXT:    v_mov_b32_e32 v0, 0
; GFX8-NEXT:    v_mov_b32_e32 v3, s4
; GFX8-NEXT:    s_mov_b32 m0, -1
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    ds_max_rtn_u32 v0, v0, v3
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:  .LBB21_2:
; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
; GFX8-NEXT:    v_mov_b32_e32 v0, v1
; GFX8-NEXT:    v_max_u32_e32 v0, s2, v0
; GFX8-NEXT:    s_mov_b32 s3, 0xf000
; GFX8-NEXT:    s_mov_b32 s2, -1
; GFX8-NEXT:    s_nop 0
; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: umax_i32_varying:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT:    v_mov_b32_e32 v2, v0
; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
; GFX9-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX9-NEXT:    s_not_b64 exec, exec
; GFX9-NEXT:    v_mov_b32_e32 v2, 0
; GFX9-NEXT:    s_not_b64 exec, exec
; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
; GFX9-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX9-NEXT:    s_nop 1
; GFX9-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX9-NEXT:    s_nop 1
; GFX9-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX9-NEXT:    s_nop 1
; GFX9-NEXT:    v_max_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX9-NEXT:    s_nop 1
; GFX9-NEXT:    v_max_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
; GFX9-NEXT:    s_nop 1
; GFX9-NEXT:    v_max_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
; GFX9-NEXT:    v_readlane_b32 s4, v2, 63
; GFX9-NEXT:    s_nop 0
; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT:    ; implicit-def: $vgpr0
; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT:    s_cbranch_execz .LBB21_2
; GFX9-NEXT:  ; %bb.1:
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    v_mov_b32_e32 v3, s4
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    ds_max_rtn_u32 v0, v0, v3
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:  .LBB21_2:
; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
; GFX9-NEXT:    v_mov_b32_e32 v0, v1
; GFX9-NEXT:    v_max_u32_e32 v0, s2, v0
; GFX9-NEXT:    s_mov_b32 s3, 0xf000
; GFX9-NEXT:    s_mov_b32 s2, -1
; GFX9-NEXT:    s_nop 0
; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT:    s_endpgm
;
; GFX1064-LABEL: umax_i32_varying:
; GFX1064:       ; %bb.0: ; %entry
; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
; GFX1064-NEXT:    s_not_b64 exec, exec
; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
; GFX1064-NEXT:    s_not_b64 exec, exec
; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
; GFX1064-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
; GFX1064-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
; GFX1064-NEXT:    v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1064-NEXT:    v_readlane_b32 s4, v1, 31
; GFX1064-NEXT:    v_mov_b32_e32 v2, s4
; GFX1064-NEXT:    v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
; GFX1064-NEXT:    v_readlane_b32 s4, v1, 15
; GFX1064-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
; GFX1064-NEXT:    v_readlane_b32 s5, v1, 31
; GFX1064-NEXT:    v_writelane_b32 v3, s4, 16
; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
; GFX1064-NEXT:    v_readlane_b32 s7, v1, 63
; GFX1064-NEXT:    v_readlane_b32 s6, v1, 47
; GFX1064-NEXT:    v_writelane_b32 v3, s5, 32
; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
; GFX1064-NEXT:    v_writelane_b32 v3, s6, 48
; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT:    s_mov_b32 s2, -1
; GFX1064-NEXT:    ; implicit-def: $vgpr0
; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
; GFX1064-NEXT:    s_cbranch_execz .LBB21_2
; GFX1064-NEXT:  ; %bb.1:
; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
; GFX1064-NEXT:    v_mov_b32_e32 v4, s7
; GFX1064-NEXT:    s_mov_b32 s3, s7
; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1064-NEXT:    ds_max_rtn_u32 v0, v0, v4
; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1064-NEXT:    buffer_gl0_inv
; GFX1064-NEXT:  .LBB21_2:
; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
; GFX1064-NEXT:    v_mov_b32_e32 v0, v3
; GFX1064-NEXT:    v_max_u32_e32 v0, s3, v0
; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX1064-NEXT:    s_endpgm
;
; GFX1032-LABEL: umax_i32_varying:
; GFX1032:       ; %bb.0: ; %entry
; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
; GFX1032-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
; GFX1032-NEXT:    v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
; GFX1032-NEXT:    v_readlane_b32 s3, v1, 15
; GFX1032-NEXT:    v_readlane_b32 s4, v1, 31
; GFX1032-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
; GFX1032-NEXT:    v_writelane_b32 v3, s3, 16
; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT:    s_mov_b32 s2, -1
; GFX1032-NEXT:    ; implicit-def: $vgpr0
; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
; GFX1032-NEXT:    s_cbranch_execz .LBB21_2
; GFX1032-NEXT:  ; %bb.1:
; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
; GFX1032-NEXT:    v_mov_b32_e32 v4, s4
; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1032-NEXT:    ds_max_rtn_u32 v0, v0, v4
; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1032-NEXT:    buffer_gl0_inv
; GFX1032-NEXT:  .LBB21_2:
; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
; GFX1032-NEXT:    v_mov_b32_e32 v0, v3
; GFX1032-NEXT:    v_max_u32_e32 v0, s3, v0
; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX1032-NEXT:    s_endpgm
entry:
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %old = atomicrmw umax i32 addrspace(3)* @local_var32, i32 %lane acq_rel
  store i32 %old, i32 addrspace(1)* %out
  ret void
}

; Unsigned-max atomicrmw on @local_var64 (addrspace(3), i64) with the constant
; operand 5. The old value returned by the atomic is stored to %out.
; Every run line expects the same shape: a single ds_max_rtn_u64 issued under
; s_and_saveexec on "mbcnt == 0", then v_readfirstlane of the returned pair and
; a v_cmp_gt_u64 / v_cndmask sequence against a per-lane value whose low dword
; is selected between 5 and 0 and whose high dword is 0.
define amdgpu_kernel void @umax_i64_constant(i64 addrspace(1)* %out) {
;
;
; GFX7LESS-LABEL: umax_i64_constant:
; GFX7LESS:       ; %bb.0: ; %entry
; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX7LESS-NEXT:    ; implicit-def: $vgpr0_vgpr1
; GFX7LESS-NEXT:    s_and_saveexec_b64 s[2:3], vcc
; GFX7LESS-NEXT:    s_cbranch_execz .LBB22_2
; GFX7LESS-NEXT:  ; %bb.1:
; GFX7LESS-NEXT:    v_mov_b32_e32 v2, 0
; GFX7LESS-NEXT:    v_mov_b32_e32 v0, 5
; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
; GFX7LESS-NEXT:    s_mov_b32 m0, -1
; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT:    ds_max_rtn_u64 v[0:1], v2, v[0:1]
; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT:  .LBB22_2:
; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[2:3]
; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v0
; GFX7LESS-NEXT:    v_readfirstlane_b32 s5, v1
; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
; GFX7LESS-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s4
; GFX7LESS-NEXT:    v_cmp_gt_u64_e32 vcc, s[4:5], v[0:1]
; GFX7LESS-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s5
; GFX7LESS-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
; GFX7LESS-NEXT:    s_mov_b32 s2, -1
; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX7LESS-NEXT:    s_endpgm
;
; GFX8-LABEL: umax_i64_constant:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
; GFX8-NEXT:    s_cbranch_execz .LBB22_2
; GFX8-NEXT:  ; %bb.1:
; GFX8-NEXT:    v_mov_b32_e32 v0, 5
; GFX8-NEXT:    v_mov_b32_e32 v2, 0
; GFX8-NEXT:    v_mov_b32_e32 v1, 0
; GFX8-NEXT:    s_mov_b32 m0, -1
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    ds_max_rtn_u64 v[0:1], v2, v[0:1]
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:  .LBB22_2:
; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
; GFX8-NEXT:    v_readfirstlane_b32 s3, v1
; GFX8-NEXT:    v_mov_b32_e32 v1, 0
; GFX8-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
; GFX8-NEXT:    v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1]
; GFX8-NEXT:    v_mov_b32_e32 v2, s2
; GFX8-NEXT:    v_mov_b32_e32 v1, s3
; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX8-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
; GFX8-NEXT:    s_mov_b32 s3, 0xf000
; GFX8-NEXT:    s_mov_b32 s2, -1
; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: umax_i64_constant:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT:    s_cbranch_execz .LBB22_2
; GFX9-NEXT:  ; %bb.1:
; GFX9-NEXT:    v_mov_b32_e32 v0, 5
; GFX9-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-NEXT:    v_mov_b32_e32 v2, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    ds_max_rtn_u64 v[0:1], v2, v[0:1]
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:  .LBB22_2:
; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
; GFX9-NEXT:    v_readfirstlane_b32 s3, v1
; GFX9-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
; GFX9-NEXT:    v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1]
; GFX9-NEXT:    v_mov_b32_e32 v2, s2
; GFX9-NEXT:    v_mov_b32_e32 v1, s3
; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX9-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
; GFX9-NEXT:    s_mov_b32 s3, 0xf000
; GFX9-NEXT:    s_mov_b32 s2, -1
; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX9-NEXT:    s_endpgm
;
; GFX1064-LABEL: umax_i64_constant:
; GFX1064:       ; %bb.0: ; %entry
; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT:    ; implicit-def: $vgpr0_vgpr1
; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
; GFX1064-NEXT:    s_cbranch_execz .LBB22_2
; GFX1064-NEXT:  ; %bb.1:
; GFX1064-NEXT:    v_mov_b32_e32 v0, 5
; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1064-NEXT:    ds_max_rtn_u64 v[0:1], v2, v[0:1]
; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1064-NEXT:    buffer_gl0_inv
; GFX1064-NEXT:  .LBB22_2:
; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
; GFX1064-NEXT:    s_or_b64 exec, exec, s[2:3]
; GFX1064-NEXT:    v_readfirstlane_b32 s2, v0
; GFX1064-NEXT:    v_readfirstlane_b32 s3, v1
; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
; GFX1064-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
; GFX1064-NEXT:    v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1]
; GFX1064-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc
; GFX1064-NEXT:    v_cndmask_b32_e64 v1, 0, s3, vcc
; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
; GFX1064-NEXT:    s_mov_b32 s2, -1
; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX1064-NEXT:    s_endpgm
;
; GFX1032-LABEL: umax_i64_constant:
; GFX1032:       ; %bb.0: ; %entry
; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT:    ; implicit-def: $vgpr0_vgpr1
; GFX1032-NEXT:    s_and_saveexec_b32 s2, vcc_lo
; GFX1032-NEXT:    s_cbranch_execz .LBB22_2
; GFX1032-NEXT:  ; %bb.1:
; GFX1032-NEXT:    v_mov_b32_e32 v0, 5
; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1032-NEXT:    ds_max_rtn_u64 v[0:1], v2, v[0:1]
; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1032-NEXT:    buffer_gl0_inv
; GFX1032-NEXT:  .LBB22_2:
; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
; GFX1032-NEXT:    v_readfirstlane_b32 s2, v0
; GFX1032-NEXT:    v_readfirstlane_b32 s3, v1
; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
; GFX1032-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc_lo
; GFX1032-NEXT:    v_cmp_gt_u64_e32 vcc_lo, s[2:3], v[0:1]
; GFX1032-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc_lo
; GFX1032-NEXT:    v_cndmask_b32_e64 v1, 0, s3, vcc_lo
; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
; GFX1032-NEXT:    s_mov_b32 s2, -1
; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX1032-NEXT:    s_endpgm
entry:
  %old = atomicrmw umax i64 addrspace(3)* @local_var64, i64 5 acq_rel
  store i64 %old, i64 addrspace(1)* %out
  ret void
}

; Unsigned-min atomicrmw on @local_var32 (addrspace(3), i32) whose operand is
; the workitem id, i.e. a value that differs per lane. The old value returned
; by the atomic is stored to %out.
; Expected code, as pinned by the checks below:
;  * GFX7LESS run: a plain ds_min_rtn_u32 using the lane's own value, with no
;    cross-lane reduction.
;  * GFX8 / GFX9 runs: inactive lanes are seeded with -1, a DPP row_shr /
;    row_bcast scan combines the lanes with v_min_u32 (no bound_ctrl on these
;    DPP steps), and one ds_min_rtn_u32 is issued under s_and_saveexec on
;    "mbcnt == 0". The returned value is broadcast with v_readfirstlane and
;    combined with each lane's scan result.
;  * GFX10 runs (wave64 and wave32): same shape, but the cross-row steps use
;    v_permlanex16 / v_readlane / v_writelane instead of row_bcast.
define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) {
;
;
; GFX7LESS-LABEL: umin_i32_varying:
; GFX7LESS:       ; %bb.0: ; %entry
; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
; GFX7LESS-NEXT:    s_mov_b32 m0, -1
; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT:    ds_min_rtn_u32 v0, v1, v0
; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT:    s_mov_b32 s2, -1
; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX7LESS-NEXT:    s_endpgm
;
; GFX8-LABEL: umin_i32_varying:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT:    v_mov_b32_e32 v2, v0
; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
; GFX8-NEXT:    v_mov_b32_e32 v1, -1
; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
; GFX8-NEXT:    s_not_b64 exec, exec
; GFX8-NEXT:    v_mov_b32_e32 v2, -1
; GFX8-NEXT:    s_not_b64 exec, exec
; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
; GFX8-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX8-NEXT:    s_nop 1
; GFX8-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX8-NEXT:    s_nop 1
; GFX8-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX8-NEXT:    s_nop 1
; GFX8-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX8-NEXT:    s_nop 1
; GFX8-NEXT:    v_min_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
; GFX8-NEXT:    s_nop 1
; GFX8-NEXT:    v_min_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
; GFX8-NEXT:    v_readlane_b32 s4, v2, 63
; GFX8-NEXT:    s_nop 0
; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX8-NEXT:    ; implicit-def: $vgpr0
; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
; GFX8-NEXT:    s_cbranch_execz .LBB23_2
; GFX8-NEXT:  ; %bb.1:
; GFX8-NEXT:    v_mov_b32_e32 v0, 0
; GFX8-NEXT:    v_mov_b32_e32 v3, s4
; GFX8-NEXT:    s_mov_b32 m0, -1
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    ds_min_rtn_u32 v0, v0, v3
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:  .LBB23_2:
; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
; GFX8-NEXT:    v_mov_b32_e32 v0, v1
; GFX8-NEXT:    v_min_u32_e32 v0, s2, v0
; GFX8-NEXT:    s_mov_b32 s3, 0xf000
; GFX8-NEXT:    s_mov_b32 s2, -1
; GFX8-NEXT:    s_nop 0
; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: umin_i32_varying:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT:    v_mov_b32_e32 v2, v0
; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
; GFX9-NEXT:    v_mov_b32_e32 v1, -1
; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
; GFX9-NEXT:    s_not_b64 exec, exec
; GFX9-NEXT:    v_mov_b32_e32 v2, -1
; GFX9-NEXT:    s_not_b64 exec, exec
; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
; GFX9-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX9-NEXT:    s_nop 1
; GFX9-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX9-NEXT:    s_nop 1
; GFX9-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX9-NEXT:    s_nop 1
; GFX9-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX9-NEXT:    s_nop 1
; GFX9-NEXT:    v_min_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
; GFX9-NEXT:    s_nop 1
; GFX9-NEXT:    v_min_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
; GFX9-NEXT:    v_readlane_b32 s4, v2, 63
; GFX9-NEXT:    s_nop 0
; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT:    ; implicit-def: $vgpr0
; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT:    s_cbranch_execz .LBB23_2
; GFX9-NEXT:  ; %bb.1:
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    v_mov_b32_e32 v3, s4
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    ds_min_rtn_u32 v0, v0, v3
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:  .LBB23_2:
; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
; GFX9-NEXT:    v_mov_b32_e32 v0, v1
; GFX9-NEXT:    v_min_u32_e32 v0, s2, v0
; GFX9-NEXT:    s_mov_b32 s3, 0xf000
; GFX9-NEXT:    s_mov_b32 s2, -1
; GFX9-NEXT:    s_nop 0
; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT:    s_endpgm
;
; GFX1064-LABEL: umin_i32_varying:
; GFX1064:       ; %bb.0: ; %entry
; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
; GFX1064-NEXT:    s_not_b64 exec, exec
; GFX1064-NEXT:    v_mov_b32_e32 v1, -1
; GFX1064-NEXT:    s_not_b64 exec, exec
; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
; GFX1064-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1064-NEXT:    v_mov_b32_e32 v3, -1
; GFX1064-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1064-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1064-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
; GFX1064-NEXT:    v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1064-NEXT:    v_readlane_b32 s4, v1, 31
; GFX1064-NEXT:    v_mov_b32_e32 v2, s4
; GFX1064-NEXT:    v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
; GFX1064-NEXT:    v_readlane_b32 s4, v1, 15
; GFX1064-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
; GFX1064-NEXT:    v_readlane_b32 s5, v1, 31
; GFX1064-NEXT:    v_writelane_b32 v3, s4, 16
; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
; GFX1064-NEXT:    v_readlane_b32 s7, v1, 63
; GFX1064-NEXT:    v_readlane_b32 s6, v1, 47
; GFX1064-NEXT:    v_writelane_b32 v3, s5, 32
; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
; GFX1064-NEXT:    v_writelane_b32 v3, s6, 48
; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT:    s_mov_b32 s2, -1
; GFX1064-NEXT:    ; implicit-def: $vgpr0
; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
; GFX1064-NEXT:    s_cbranch_execz .LBB23_2
; GFX1064-NEXT:  ; %bb.1:
; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
; GFX1064-NEXT:    v_mov_b32_e32 v4, s7
; GFX1064-NEXT:    s_mov_b32 s3, s7
; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1064-NEXT:    ds_min_rtn_u32 v0, v0, v4
; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1064-NEXT:    buffer_gl0_inv
; GFX1064-NEXT:  .LBB23_2:
; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
; GFX1064-NEXT:    v_mov_b32_e32 v0, v3
; GFX1064-NEXT:    v_min_u32_e32 v0, s3, v0
; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX1064-NEXT:    s_endpgm
;
; GFX1032-LABEL: umin_i32_varying:
; GFX1032:       ; %bb.0: ; %entry
; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
; GFX1032-NEXT:    v_mov_b32_e32 v1, -1
; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
; GFX1032-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
; GFX1032-NEXT:    v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1032-NEXT:    v_mov_b32_e32 v3, -1
; GFX1032-NEXT:    v_readlane_b32 s3, v1, 15
; GFX1032-NEXT:    v_readlane_b32 s4, v1, 31
; GFX1032-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
; GFX1032-NEXT:    v_writelane_b32 v3, s3, 16
; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT:    s_mov_b32 s2, -1
; GFX1032-NEXT:    ; implicit-def: $vgpr0
; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
; GFX1032-NEXT:    s_cbranch_execz .LBB23_2
; GFX1032-NEXT:  ; %bb.1:
; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
; GFX1032-NEXT:    v_mov_b32_e32 v4, s4
; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1032-NEXT:    ds_min_rtn_u32 v0, v0, v4
; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1032-NEXT:    buffer_gl0_inv
; GFX1032-NEXT:  .LBB23_2:
; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
; GFX1032-NEXT:    v_mov_b32_e32 v0, v3
; GFX1032-NEXT:    v_min_u32_e32 v0, s3, v0
; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX1032-NEXT:    s_endpgm
entry:
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %old = atomicrmw umin i32 addrspace(3)* @local_var32, i32 %lane acq_rel
  store i32 %old, i32 addrspace(1)* %out
  ret void
}

; Unsigned-min atomicrmw on @local_var64 (addrspace(3), i64) with the constant
; operand 5. The old value returned by the atomic is stored to %out.
; Every run line expects the same shape: a single ds_min_rtn_u64 issued under
; s_and_saveexec on "mbcnt == 0", then v_readfirstlane of the returned pair and
; a v_cmp_lt_u64 / v_cndmask sequence against a per-lane value whose low dword
; is selected between 5 and -1 and whose high dword between 0 and -1.
define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) {
;
;
; GFX7LESS-LABEL: umin_i64_constant:
; GFX7LESS:       ; %bb.0: ; %entry
; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX7LESS-NEXT:    ; implicit-def: $vgpr0_vgpr1
; GFX7LESS-NEXT:    s_and_saveexec_b64 s[2:3], vcc
; GFX7LESS-NEXT:    s_cbranch_execz .LBB24_2
; GFX7LESS-NEXT:  ; %bb.1:
; GFX7LESS-NEXT:    v_mov_b32_e32 v2, 0
; GFX7LESS-NEXT:    v_mov_b32_e32 v0, 5
; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
; GFX7LESS-NEXT:    s_mov_b32 m0, -1
; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT:    ds_min_rtn_u64 v[0:1], v2, v[0:1]
; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT:  .LBB24_2:
; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[2:3]
; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v0
; GFX7LESS-NEXT:    v_readfirstlane_b32 s5, v1
; GFX7LESS-NEXT:    s_mov_b32 s2, -1
; GFX7LESS-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
; GFX7LESS-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s5
; GFX7LESS-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1]
; GFX7LESS-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s4
; GFX7LESS-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX7LESS-NEXT:    s_endpgm
;
; GFX8-LABEL: umin_i64_constant:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
; GFX8-NEXT:    s_cbranch_execz .LBB24_2
; GFX8-NEXT:  ; %bb.1:
; GFX8-NEXT:    v_mov_b32_e32 v0, 5
; GFX8-NEXT:    v_mov_b32_e32 v2, 0
; GFX8-NEXT:    v_mov_b32_e32 v1, 0
; GFX8-NEXT:    s_mov_b32 m0, -1
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    ds_min_rtn_u64 v[0:1], v2, v[0:1]
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:  .LBB24_2:
; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_readfirstlane_b32 s4, v0
; GFX8-NEXT:    v_readfirstlane_b32 s5, v1
; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
; GFX8-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
; GFX8-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1]
; GFX8-NEXT:    v_mov_b32_e32 v2, s5
; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
; GFX8-NEXT:    v_mov_b32_e32 v2, s4
; GFX8-NEXT:    s_mov_b32 s2, -1
; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX8-NEXT:    s_mov_b32 s3, 0xf000
; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: umin_i64_constant:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT:    s_cbranch_execz .LBB24_2
; GFX9-NEXT:  ; %bb.1:
; GFX9-NEXT:    v_mov_b32_e32 v0, 5
; GFX9-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-NEXT:    v_mov_b32_e32 v2, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    ds_min_rtn_u64 v[0:1], v2, v[0:1]
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:  .LBB24_2:
; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_readfirstlane_b32 s4, v0
; GFX9-NEXT:    v_readfirstlane_b32 s5, v1
; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
; GFX9-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1]
; GFX9-NEXT:    v_mov_b32_e32 v2, s5
; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
; GFX9-NEXT:    v_mov_b32_e32 v2, s4
; GFX9-NEXT:    s_mov_b32 s2, -1
; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX9-NEXT:    s_mov_b32 s3, 0xf000
; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX9-NEXT:    s_endpgm
;
; GFX1064-LABEL: umin_i64_constant:
; GFX1064:       ; %bb.0: ; %entry
; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT:    ; implicit-def: $vgpr0_vgpr1
; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
; GFX1064-NEXT:    s_cbranch_execz .LBB24_2
; GFX1064-NEXT:  ; %bb.1:
; GFX1064-NEXT:    v_mov_b32_e32 v0, 5
; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1064-NEXT:    ds_min_rtn_u64 v[0:1], v2, v[0:1]
; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1064-NEXT:    buffer_gl0_inv
; GFX1064-NEXT:  .LBB24_2:
; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
; GFX1064-NEXT:    s_or_b64 exec, exec, s[2:3]
; GFX1064-NEXT:    v_readfirstlane_b32 s2, v0
; GFX1064-NEXT:    v_readfirstlane_b32 s3, v1
; GFX1064-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
; GFX1064-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
; GFX1064-NEXT:    v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1]
; GFX1064-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc
; GFX1064-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc
; GFX1064-NEXT:    s_mov_b32 s2, -1
; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX1064-NEXT:    s_endpgm
;
; GFX1032-LABEL: umin_i64_constant:
; GFX1032:       ; %bb.0: ; %entry
; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT:    ; implicit-def: $vgpr0_vgpr1
; GFX1032-NEXT:    s_and_saveexec_b32 s2, vcc_lo
; GFX1032-NEXT:    s_cbranch_execz .LBB24_2
; GFX1032-NEXT:  ; %bb.1:
; GFX1032-NEXT:    v_mov_b32_e32 v0, 5
; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1032-NEXT:    ds_min_rtn_u64 v[0:1], v2, v[0:1]
; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1032-NEXT:    buffer_gl0_inv
; GFX1032-NEXT:  .LBB24_2:
; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
; GFX1032-NEXT:    v_readfirstlane_b32 s2, v0
; GFX1032-NEXT:    v_readfirstlane_b32 s3, v1
; GFX1032-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
; GFX1032-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc_lo
; GFX1032-NEXT:    v_cmp_lt_u64_e32 vcc_lo, s[2:3], v[0:1]
; GFX1032-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc_lo
; GFX1032-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc_lo
; GFX1032-NEXT:    s_mov_b32 s2, -1
; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX1032-NEXT:    s_endpgm
entry:
  %old = atomicrmw umin i64 addrspace(3)* @local_var64, i64 5 acq_rel
  store i64 %old, i64 addrspace(1)* %out
  ret void
}