1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --force-update 2; RUN: llc -march=amdgcn -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX7LESS %s 3; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s 4; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s 5; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX1064 %s 6; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX1032 %s 7 8declare i32 @llvm.amdgcn.workitem.id.x() 9 10@local_var32 = addrspace(3) global i32 undef, align 4 11@local_var64 = addrspace(3) global i64 undef, align 8 12 13; Show what the AMDGPU atomic optimization pass (enabled by -amdgpu-atomic-optimizations=true in the RUN lines above) does to atomicrmw operations on local (addrspace(3)) pointers. 
14 15define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out) { 16; 17; 18; GFX7LESS-LABEL: add_i32_constant: 19; GFX7LESS: ; %bb.0: ; %entry 20; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec 21; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 22; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 23; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 24; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 25; GFX7LESS-NEXT: ; implicit-def: $vgpr1 26; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc 27; GFX7LESS-NEXT: s_cbranch_execz .LBB0_2 28; GFX7LESS-NEXT: ; %bb.1: 29; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 30; GFX7LESS-NEXT: s_mul_i32 s2, s2, 5 31; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 32; GFX7LESS-NEXT: v_mov_b32_e32 v2, s2 33; GFX7LESS-NEXT: s_mov_b32 m0, -1 34; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 35; GFX7LESS-NEXT: ds_add_rtn_u32 v1, v1, v2 36; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 37; GFX7LESS-NEXT: .LBB0_2: 38; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] 39; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 40; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v1 41; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 42; GFX7LESS-NEXT: v_mad_u32_u24 v0, v0, 5, s2 43; GFX7LESS-NEXT: s_mov_b32 s2, -1 44; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 45; GFX7LESS-NEXT: s_endpgm 46; 47; GFX8-LABEL: add_i32_constant: 48; GFX8: ; %bb.0: ; %entry 49; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 50; GFX8-NEXT: s_mov_b64 s[2:3], exec 51; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 52; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 53; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 54; GFX8-NEXT: ; implicit-def: $vgpr1 55; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 56; GFX8-NEXT: s_cbranch_execz .LBB0_2 57; GFX8-NEXT: ; %bb.1: 58; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 59; GFX8-NEXT: s_mul_i32 s2, s2, 5 60; GFX8-NEXT: v_mov_b32_e32 v1, 0 61; GFX8-NEXT: v_mov_b32_e32 v2, s2 62; GFX8-NEXT: s_mov_b32 m0, -1 63; GFX8-NEXT: s_waitcnt lgkmcnt(0) 64; GFX8-NEXT: ds_add_rtn_u32 v1, v1, v2 65; GFX8-NEXT: s_waitcnt lgkmcnt(0) 66; GFX8-NEXT: 
.LBB0_2: 67; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 68; GFX8-NEXT: s_waitcnt lgkmcnt(0) 69; GFX8-NEXT: v_readfirstlane_b32 s2, v1 70; GFX8-NEXT: v_mad_u32_u24 v0, v0, 5, s2 71; GFX8-NEXT: s_mov_b32 s3, 0xf000 72; GFX8-NEXT: s_mov_b32 s2, -1 73; GFX8-NEXT: s_nop 1 74; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 75; GFX8-NEXT: s_endpgm 76; 77; GFX9-LABEL: add_i32_constant: 78; GFX9: ; %bb.0: ; %entry 79; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 80; GFX9-NEXT: s_mov_b64 s[2:3], exec 81; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 82; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 83; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 84; GFX9-NEXT: ; implicit-def: $vgpr1 85; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 86; GFX9-NEXT: s_cbranch_execz .LBB0_2 87; GFX9-NEXT: ; %bb.1: 88; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 89; GFX9-NEXT: s_mul_i32 s2, s2, 5 90; GFX9-NEXT: v_mov_b32_e32 v1, 0 91; GFX9-NEXT: v_mov_b32_e32 v2, s2 92; GFX9-NEXT: s_waitcnt lgkmcnt(0) 93; GFX9-NEXT: ds_add_rtn_u32 v1, v1, v2 94; GFX9-NEXT: s_waitcnt lgkmcnt(0) 95; GFX9-NEXT: .LBB0_2: 96; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 97; GFX9-NEXT: s_waitcnt lgkmcnt(0) 98; GFX9-NEXT: v_readfirstlane_b32 s2, v1 99; GFX9-NEXT: v_mad_u32_u24 v0, v0, 5, s2 100; GFX9-NEXT: s_mov_b32 s3, 0xf000 101; GFX9-NEXT: s_mov_b32 s2, -1 102; GFX9-NEXT: s_nop 1 103; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 104; GFX9-NEXT: s_endpgm 105; 106; GFX1064-LABEL: add_i32_constant: 107; GFX1064: ; %bb.0: ; %entry 108; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 109; GFX1064-NEXT: s_mov_b64 s[2:3], exec 110; GFX1064-NEXT: ; implicit-def: $vgpr1 111; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 112; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 113; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 114; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 115; GFX1064-NEXT: s_cbranch_execz .LBB0_2 116; GFX1064-NEXT: ; %bb.1: 117; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 118; GFX1064-NEXT: v_mov_b32_e32 v1, 0 119; GFX1064-NEXT: s_mul_i32 s2, s2, 5 
120; GFX1064-NEXT: v_mov_b32_e32 v2, s2 121; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 122; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 123; GFX1064-NEXT: ds_add_rtn_u32 v1, v1, v2 124; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 125; GFX1064-NEXT: buffer_gl0_inv 126; GFX1064-NEXT: .LBB0_2: 127; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 128; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 129; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 130; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 131; GFX1064-NEXT: v_mad_u32_u24 v0, v0, 5, s2 132; GFX1064-NEXT: s_mov_b32 s2, -1 133; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 134; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 135; GFX1064-NEXT: s_endpgm 136; 137; GFX1032-LABEL: add_i32_constant: 138; GFX1032: ; %bb.0: ; %entry 139; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 140; GFX1032-NEXT: s_mov_b32 s3, exec_lo 141; GFX1032-NEXT: ; implicit-def: $vgpr1 142; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 143; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 144; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo 145; GFX1032-NEXT: s_cbranch_execz .LBB0_2 146; GFX1032-NEXT: ; %bb.1: 147; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 148; GFX1032-NEXT: v_mov_b32_e32 v1, 0 149; GFX1032-NEXT: s_mul_i32 s3, s3, 5 150; GFX1032-NEXT: v_mov_b32_e32 v2, s3 151; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 152; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 153; GFX1032-NEXT: ds_add_rtn_u32 v1, v1, v2 154; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 155; GFX1032-NEXT: buffer_gl0_inv 156; GFX1032-NEXT: .LBB0_2: 157; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 158; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 159; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 160; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 161; GFX1032-NEXT: v_mad_u32_u24 v0, v0, 5, s2 162; GFX1032-NEXT: s_mov_b32 s2, -1 163; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 164; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 165; GFX1032-NEXT: s_endpgm 166entry: 167 %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 5 acq_rel 168 store i32 %old, i32 
addrspace(1)* %out 169 ret void 170} 171 172define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 %additive) { 173; 174; 175; GFX7LESS-LABEL: add_i32_uniform: 176; GFX7LESS: ; %bb.0: ; %entry 177; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec 178; GFX7LESS-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 179; GFX7LESS-NEXT: s_load_dword s6, s[0:1], 0xb 180; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 181; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 182; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 183; GFX7LESS-NEXT: ; implicit-def: $vgpr1 184; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc 185; GFX7LESS-NEXT: s_cbranch_execz .LBB1_2 186; GFX7LESS-NEXT: ; %bb.1: 187; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 188; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 189; GFX7LESS-NEXT: s_mul_i32 s2, s6, s2 190; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 191; GFX7LESS-NEXT: v_mov_b32_e32 v2, s2 192; GFX7LESS-NEXT: s_mov_b32 m0, -1 193; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 194; GFX7LESS-NEXT: ds_add_rtn_u32 v1, v1, v2 195; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 196; GFX7LESS-NEXT: .LBB1_2: 197; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1] 198; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 199; GFX7LESS-NEXT: v_readfirstlane_b32 s0, v1 200; GFX7LESS-NEXT: v_mul_lo_u32 v0, s6, v0 201; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 202; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s0, v0 203; GFX7LESS-NEXT: s_mov_b32 s6, -1 204; GFX7LESS-NEXT: buffer_store_dword v0, off, s[4:7], 0 205; GFX7LESS-NEXT: s_endpgm 206; 207; GFX8-LABEL: add_i32_uniform: 208; GFX8: ; %bb.0: ; %entry 209; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 210; GFX8-NEXT: s_load_dword s6, s[0:1], 0x2c 211; GFX8-NEXT: s_mov_b64 s[2:3], exec 212; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 213; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 214; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 215; GFX8-NEXT: ; implicit-def: $vgpr1 216; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc 217; GFX8-NEXT: s_cbranch_execz .LBB1_2 218; GFX8-NEXT: ; %bb.1: 219; GFX8-NEXT: s_bcnt1_i32_b64 
s2, s[2:3] 220; GFX8-NEXT: s_waitcnt lgkmcnt(0) 221; GFX8-NEXT: s_mul_i32 s2, s6, s2 222; GFX8-NEXT: v_mov_b32_e32 v1, 0 223; GFX8-NEXT: v_mov_b32_e32 v2, s2 224; GFX8-NEXT: s_mov_b32 m0, -1 225; GFX8-NEXT: s_waitcnt lgkmcnt(0) 226; GFX8-NEXT: ds_add_rtn_u32 v1, v1, v2 227; GFX8-NEXT: s_waitcnt lgkmcnt(0) 228; GFX8-NEXT: .LBB1_2: 229; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] 230; GFX8-NEXT: s_waitcnt lgkmcnt(0) 231; GFX8-NEXT: v_mul_lo_u32 v0, s6, v0 232; GFX8-NEXT: v_readfirstlane_b32 s0, v1 233; GFX8-NEXT: s_mov_b32 s7, 0xf000 234; GFX8-NEXT: s_mov_b32 s6, -1 235; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 236; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0 237; GFX8-NEXT: s_endpgm 238; 239; GFX9-LABEL: add_i32_uniform: 240; GFX9: ; %bb.0: ; %entry 241; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 242; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c 243; GFX9-NEXT: s_mov_b64 s[2:3], exec 244; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 245; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 246; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 247; GFX9-NEXT: ; implicit-def: $vgpr1 248; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc 249; GFX9-NEXT: s_cbranch_execz .LBB1_2 250; GFX9-NEXT: ; %bb.1: 251; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 252; GFX9-NEXT: s_waitcnt lgkmcnt(0) 253; GFX9-NEXT: s_mul_i32 s2, s6, s2 254; GFX9-NEXT: v_mov_b32_e32 v1, 0 255; GFX9-NEXT: v_mov_b32_e32 v2, s2 256; GFX9-NEXT: s_waitcnt lgkmcnt(0) 257; GFX9-NEXT: ds_add_rtn_u32 v1, v1, v2 258; GFX9-NEXT: s_waitcnt lgkmcnt(0) 259; GFX9-NEXT: .LBB1_2: 260; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] 261; GFX9-NEXT: s_waitcnt lgkmcnt(0) 262; GFX9-NEXT: v_mul_lo_u32 v0, s6, v0 263; GFX9-NEXT: v_readfirstlane_b32 s0, v1 264; GFX9-NEXT: s_mov_b32 s7, 0xf000 265; GFX9-NEXT: s_mov_b32 s6, -1 266; GFX9-NEXT: v_add_u32_e32 v0, s0, v0 267; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 268; GFX9-NEXT: s_endpgm 269; 270; GFX1064-LABEL: add_i32_uniform: 271; GFX1064: ; %bb.0: ; %entry 272; GFX1064-NEXT: s_clause 0x1 273; GFX1064-NEXT: 
s_load_dwordx2 s[4:5], s[0:1], 0x24 274; GFX1064-NEXT: s_load_dword s6, s[0:1], 0x2c 275; GFX1064-NEXT: s_mov_b64 s[2:3], exec 276; GFX1064-NEXT: ; implicit-def: $vgpr1 277; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 278; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 279; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 280; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc 281; GFX1064-NEXT: s_cbranch_execz .LBB1_2 282; GFX1064-NEXT: ; %bb.1: 283; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 284; GFX1064-NEXT: v_mov_b32_e32 v1, 0 285; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 286; GFX1064-NEXT: s_mul_i32 s2, s6, s2 287; GFX1064-NEXT: v_mov_b32_e32 v2, s2 288; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 289; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 290; GFX1064-NEXT: ds_add_rtn_u32 v1, v1, v2 291; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 292; GFX1064-NEXT: buffer_gl0_inv 293; GFX1064-NEXT: .LBB1_2: 294; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 295; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] 296; GFX1064-NEXT: v_readfirstlane_b32 s0, v1 297; GFX1064-NEXT: s_mov_b32 s7, 0x31016000 298; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 299; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s6, v0, s[0:1] 300; GFX1064-NEXT: s_mov_b32 s6, -1 301; GFX1064-NEXT: buffer_store_dword v0, off, s[4:7], 0 302; GFX1064-NEXT: s_endpgm 303; 304; GFX1032-LABEL: add_i32_uniform: 305; GFX1032: ; %bb.0: ; %entry 306; GFX1032-NEXT: s_clause 0x1 307; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 308; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x2c 309; GFX1032-NEXT: s_mov_b32 s3, exec_lo 310; GFX1032-NEXT: ; implicit-def: $vgpr1 311; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 312; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 313; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo 314; GFX1032-NEXT: s_cbranch_execz .LBB1_2 315; GFX1032-NEXT: ; %bb.1: 316; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s3 317; GFX1032-NEXT: v_mov_b32_e32 v1, 0 318; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 319; GFX1032-NEXT: s_mul_i32 s1, s2, s1 320; GFX1032-NEXT: v_mov_b32_e32 v2, s1 
321; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 322; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 323; GFX1032-NEXT: ds_add_rtn_u32 v1, v1, v2 324; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 325; GFX1032-NEXT: buffer_gl0_inv 326; GFX1032-NEXT: .LBB1_2: 327; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 328; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 329; GFX1032-NEXT: v_readfirstlane_b32 s0, v1 330; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 331; GFX1032-NEXT: s_mov_b32 s6, -1 332; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 333; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s0, s2, v0, s[0:1] 334; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0 335; GFX1032-NEXT: s_endpgm 336entry: 337 %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %additive acq_rel 338 store i32 %old, i32 addrspace(1)* %out 339 ret void 340} 341 342define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out) { 343; 344; 345; GFX7LESS-LABEL: add_i32_varying: 346; GFX7LESS: ; %bb.0: ; %entry 347; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 348; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 349; GFX7LESS-NEXT: s_mov_b32 m0, -1 350; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 351; GFX7LESS-NEXT: ds_add_rtn_u32 v0, v1, v0 352; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 353; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 354; GFX7LESS-NEXT: s_mov_b32 s2, -1 355; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 356; GFX7LESS-NEXT: s_endpgm 357; 358; GFX8-LABEL: add_i32_varying: 359; GFX8: ; %bb.0: ; %entry 360; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 361; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 362; GFX8-NEXT: v_mov_b32_e32 v1, 0 363; GFX8-NEXT: s_mov_b64 exec, s[2:3] 364; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 365; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 366; GFX8-NEXT: v_mov_b32_e32 v2, v0 367; GFX8-NEXT: s_not_b64 exec, exec 368; GFX8-NEXT: v_mov_b32_e32 v2, 0 369; GFX8-NEXT: s_not_b64 exec, exec 370; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 371; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 
bound_ctrl:1 372; GFX8-NEXT: s_nop 1 373; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 374; GFX8-NEXT: s_nop 1 375; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 376; GFX8-NEXT: s_nop 1 377; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 378; GFX8-NEXT: s_nop 1 379; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 380; GFX8-NEXT: s_nop 1 381; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 382; GFX8-NEXT: v_readlane_b32 s4, v2, 63 383; GFX8-NEXT: s_nop 0 384; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 385; GFX8-NEXT: s_mov_b64 exec, s[2:3] 386; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 387; GFX8-NEXT: ; implicit-def: $vgpr0 388; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 389; GFX8-NEXT: s_cbranch_execz .LBB2_2 390; GFX8-NEXT: ; %bb.1: 391; GFX8-NEXT: v_mov_b32_e32 v0, 0 392; GFX8-NEXT: v_mov_b32_e32 v3, s4 393; GFX8-NEXT: s_mov_b32 m0, -1 394; GFX8-NEXT: s_waitcnt lgkmcnt(0) 395; GFX8-NEXT: ds_add_rtn_u32 v0, v0, v3 396; GFX8-NEXT: s_waitcnt lgkmcnt(0) 397; GFX8-NEXT: .LBB2_2: 398; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 399; GFX8-NEXT: s_waitcnt lgkmcnt(0) 400; GFX8-NEXT: v_readfirstlane_b32 s2, v0 401; GFX8-NEXT: v_mov_b32_e32 v0, v1 402; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0 403; GFX8-NEXT: s_mov_b32 s3, 0xf000 404; GFX8-NEXT: s_mov_b32 s2, -1 405; GFX8-NEXT: s_nop 0 406; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 407; GFX8-NEXT: s_endpgm 408; 409; GFX9-LABEL: add_i32_varying: 410; GFX9: ; %bb.0: ; %entry 411; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 412; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 413; GFX9-NEXT: v_mov_b32_e32 v1, 0 414; GFX9-NEXT: s_mov_b64 exec, s[2:3] 415; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 416; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 417; GFX9-NEXT: v_mov_b32_e32 v2, v0 418; GFX9-NEXT: s_not_b64 
exec, exec 419; GFX9-NEXT: v_mov_b32_e32 v2, 0 420; GFX9-NEXT: s_not_b64 exec, exec 421; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 422; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 423; GFX9-NEXT: s_nop 1 424; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 425; GFX9-NEXT: s_nop 1 426; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 427; GFX9-NEXT: s_nop 1 428; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 429; GFX9-NEXT: s_nop 1 430; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 431; GFX9-NEXT: s_nop 1 432; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 433; GFX9-NEXT: v_readlane_b32 s4, v2, 63 434; GFX9-NEXT: s_nop 0 435; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 436; GFX9-NEXT: s_mov_b64 exec, s[2:3] 437; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 438; GFX9-NEXT: ; implicit-def: $vgpr0 439; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 440; GFX9-NEXT: s_cbranch_execz .LBB2_2 441; GFX9-NEXT: ; %bb.1: 442; GFX9-NEXT: v_mov_b32_e32 v0, 0 443; GFX9-NEXT: v_mov_b32_e32 v3, s4 444; GFX9-NEXT: s_waitcnt lgkmcnt(0) 445; GFX9-NEXT: ds_add_rtn_u32 v0, v0, v3 446; GFX9-NEXT: s_waitcnt lgkmcnt(0) 447; GFX9-NEXT: .LBB2_2: 448; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 449; GFX9-NEXT: s_waitcnt lgkmcnt(0) 450; GFX9-NEXT: v_readfirstlane_b32 s2, v0 451; GFX9-NEXT: v_mov_b32_e32 v0, v1 452; GFX9-NEXT: v_add_u32_e32 v0, s2, v0 453; GFX9-NEXT: s_mov_b32 s3, 0xf000 454; GFX9-NEXT: s_mov_b32 s2, -1 455; GFX9-NEXT: s_nop 0 456; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 457; GFX9-NEXT: s_endpgm 458; 459; GFX1064-LABEL: add_i32_varying: 460; GFX1064: ; %bb.0: ; %entry 461; GFX1064-NEXT: v_mov_b32_e32 v1, v0 462; GFX1064-NEXT: s_not_b64 exec, exec 463; GFX1064-NEXT: v_mov_b32_e32 v1, 0 464; GFX1064-NEXT: s_not_b64 exec, exec 465; GFX1064-NEXT: 
s_or_saveexec_b64 s[2:3], -1 466; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 467; GFX1064-NEXT: v_mov_b32_e32 v3, 0 468; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 469; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 470; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 471; GFX1064-NEXT: v_mov_b32_e32 v2, v1 472; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 473; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 474; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 475; GFX1064-NEXT: v_mov_b32_e32 v2, s4 476; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 477; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 478; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 479; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 480; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 481; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 482; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 483; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 484; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 485; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 486; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 487; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 488; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 489; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 490; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 491; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 492; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 493; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 494; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 495; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 496; GFX1064-NEXT: s_mov_b32 s2, -1 497; GFX1064-NEXT: ; implicit-def: $vgpr0 498; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 499; GFX1064-NEXT: s_cbranch_execz .LBB2_2 500; GFX1064-NEXT: ; %bb.1: 501; GFX1064-NEXT: v_mov_b32_e32 v0, 0 502; GFX1064-NEXT: 
v_mov_b32_e32 v4, s7 503; GFX1064-NEXT: s_mov_b32 s3, s7 504; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 505; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 506; GFX1064-NEXT: ds_add_rtn_u32 v0, v0, v4 507; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 508; GFX1064-NEXT: buffer_gl0_inv 509; GFX1064-NEXT: .LBB2_2: 510; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 511; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 512; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 513; GFX1064-NEXT: v_mov_b32_e32 v0, v3 514; GFX1064-NEXT: v_add_nc_u32_e32 v0, s3, v0 515; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 516; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 517; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 518; GFX1064-NEXT: s_endpgm 519; 520; GFX1032-LABEL: add_i32_varying: 521; GFX1032: ; %bb.0: ; %entry 522; GFX1032-NEXT: v_mov_b32_e32 v1, v0 523; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 524; GFX1032-NEXT: v_mov_b32_e32 v1, 0 525; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 526; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 527; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 528; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 529; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 530; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 531; GFX1032-NEXT: v_mov_b32_e32 v2, v1 532; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 533; GFX1032-NEXT: s_mov_b32 exec_lo, s2 534; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 535; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 536; GFX1032-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 537; GFX1032-NEXT: v_mov_b32_e32 v3, 0 538; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 539; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 540; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 541; GFX1032-NEXT: s_mov_b32 exec_lo, s2 542; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 543; 
GFX1032-NEXT: s_or_saveexec_b32 s2, -1 544; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 545; GFX1032-NEXT: s_mov_b32 exec_lo, s2 546; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 547; GFX1032-NEXT: s_mov_b32 s2, -1 548; GFX1032-NEXT: ; implicit-def: $vgpr0 549; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 550; GFX1032-NEXT: s_cbranch_execz .LBB2_2 551; GFX1032-NEXT: ; %bb.1: 552; GFX1032-NEXT: v_mov_b32_e32 v0, 0 553; GFX1032-NEXT: v_mov_b32_e32 v4, s4 554; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 555; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 556; GFX1032-NEXT: ds_add_rtn_u32 v0, v0, v4 557; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 558; GFX1032-NEXT: buffer_gl0_inv 559; GFX1032-NEXT: .LBB2_2: 560; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 561; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 562; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 563; GFX1032-NEXT: v_mov_b32_e32 v0, v3 564; GFX1032-NEXT: v_add_nc_u32_e32 v0, s3, v0 565; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 566; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 567; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 568; GFX1032-NEXT: s_endpgm 569entry: 570 %lane = call i32 @llvm.amdgcn.workitem.id.x() 571 %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %lane acq_rel 572 store i32 %old, i32 addrspace(1)* %out 573 ret void 574} 575 576define amdgpu_kernel void @add_i32_varying_nouse() { 577; GFX7LESS-LABEL: add_i32_varying_nouse: 578; GFX7LESS: ; %bb.0: ; %entry 579; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 580; GFX7LESS-NEXT: s_mov_b32 m0, -1 581; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 582; GFX7LESS-NEXT: ds_add_u32 v1, v0 583; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 584; GFX7LESS-NEXT: s_endpgm 585; 586; GFX8-LABEL: add_i32_varying_nouse: 587; GFX8: ; %bb.0: ; %entry 588; GFX8-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 589; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 590; GFX8-NEXT: v_mov_b32_e32 v1, v0 591; GFX8-NEXT: s_not_b64 exec, exec 592; GFX8-NEXT: v_mov_b32_e32 v1, 0 593; GFX8-NEXT: s_not_b64 exec, exec 594; GFX8-NEXT: 
s_or_saveexec_b64 s[0:1], -1 595; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 596; GFX8-NEXT: s_nop 1 597; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 598; GFX8-NEXT: s_nop 1 599; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 600; GFX8-NEXT: s_nop 1 601; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 602; GFX8-NEXT: s_nop 1 603; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf 604; GFX8-NEXT: s_nop 1 605; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf 606; GFX8-NEXT: v_readlane_b32 s2, v1, 63 607; GFX8-NEXT: s_mov_b64 exec, s[0:1] 608; GFX8-NEXT: s_mov_b32 s0, s2 609; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 610; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 611; GFX8-NEXT: s_cbranch_execz .LBB3_2 612; GFX8-NEXT: ; %bb.1: 613; GFX8-NEXT: v_mov_b32_e32 v0, 0 614; GFX8-NEXT: v_mov_b32_e32 v2, s0 615; GFX8-NEXT: s_mov_b32 m0, -1 616; GFX8-NEXT: s_waitcnt lgkmcnt(0) 617; GFX8-NEXT: ds_add_u32 v0, v2 618; GFX8-NEXT: s_waitcnt lgkmcnt(0) 619; GFX8-NEXT: .LBB3_2: 620; GFX8-NEXT: s_endpgm 621; 622; GFX9-LABEL: add_i32_varying_nouse: 623; GFX9: ; %bb.0: ; %entry 624; GFX9-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 625; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 626; GFX9-NEXT: v_mov_b32_e32 v1, v0 627; GFX9-NEXT: s_not_b64 exec, exec 628; GFX9-NEXT: v_mov_b32_e32 v1, 0 629; GFX9-NEXT: s_not_b64 exec, exec 630; GFX9-NEXT: s_or_saveexec_b64 s[0:1], -1 631; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 632; GFX9-NEXT: s_nop 1 633; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 634; GFX9-NEXT: s_nop 1 635; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 636; GFX9-NEXT: s_nop 1 637; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 
row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 638; GFX9-NEXT: s_nop 1 639; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf 640; GFX9-NEXT: s_nop 1 641; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf 642; GFX9-NEXT: v_readlane_b32 s2, v1, 63 643; GFX9-NEXT: s_mov_b64 exec, s[0:1] 644; GFX9-NEXT: s_mov_b32 s0, s2 645; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 646; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 647; GFX9-NEXT: s_cbranch_execz .LBB3_2 648; GFX9-NEXT: ; %bb.1: 649; GFX9-NEXT: v_mov_b32_e32 v0, 0 650; GFX9-NEXT: v_mov_b32_e32 v2, s0 651; GFX9-NEXT: s_waitcnt lgkmcnt(0) 652; GFX9-NEXT: ds_add_u32 v0, v2 653; GFX9-NEXT: s_waitcnt lgkmcnt(0) 654; GFX9-NEXT: .LBB3_2: 655; GFX9-NEXT: s_endpgm 656; 657; GFX1064-LABEL: add_i32_varying_nouse: 658; GFX1064: ; %bb.0: ; %entry 659; GFX1064-NEXT: v_mov_b32_e32 v1, v0 660; GFX1064-NEXT: s_not_b64 exec, exec 661; GFX1064-NEXT: v_mov_b32_e32 v1, 0 662; GFX1064-NEXT: s_not_b64 exec, exec 663; GFX1064-NEXT: s_or_saveexec_b64 s[0:1], -1 664; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 665; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 666; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 667; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 668; GFX1064-NEXT: v_mov_b32_e32 v2, v1 669; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 670; GFX1064-NEXT: v_add_nc_u32_e32 v1, v1, v2 671; GFX1064-NEXT: s_mov_b64 exec, s[0:1] 672; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 673; GFX1064-NEXT: s_or_saveexec_b64 s[0:1], -1 674; GFX1064-NEXT: v_readlane_b32 s2, v1, 0 675; GFX1064-NEXT: v_readlane_b32 s3, v1, 32 676; GFX1064-NEXT: s_mov_b64 exec, s[0:1] 677; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 678; GFX1064-NEXT: s_add_i32 s0, s2, s3 679; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 
680; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc 681; GFX1064-NEXT: s_cbranch_execz .LBB3_2 682; GFX1064-NEXT: ; %bb.1: 683; GFX1064-NEXT: v_mov_b32_e32 v0, 0 684; GFX1064-NEXT: v_mov_b32_e32 v3, s0 685; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 686; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 687; GFX1064-NEXT: ds_add_u32 v0, v3 688; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 689; GFX1064-NEXT: buffer_gl0_inv 690; GFX1064-NEXT: .LBB3_2: 691; GFX1064-NEXT: s_endpgm 692; 693; GFX1032-LABEL: add_i32_varying_nouse: 694; GFX1032: ; %bb.0: ; %entry 695; GFX1032-NEXT: v_mov_b32_e32 v1, v0 696; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 697; GFX1032-NEXT: v_mov_b32_e32 v1, 0 698; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 699; GFX1032-NEXT: s_or_saveexec_b32 s0, -1 700; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 701; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 702; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 703; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 704; GFX1032-NEXT: v_mov_b32_e32 v2, v1 705; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 706; GFX1032-NEXT: v_add_nc_u32_e32 v1, v1, v2 707; GFX1032-NEXT: s_mov_b32 exec_lo, s0 708; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 709; GFX1032-NEXT: v_mov_b32_e32 v0, v1 710; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 711; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo 712; GFX1032-NEXT: s_cbranch_execz .LBB3_2 713; GFX1032-NEXT: ; %bb.1: 714; GFX1032-NEXT: v_mov_b32_e32 v3, 0 715; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 716; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 717; GFX1032-NEXT: ds_add_u32 v3, v0 718; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 719; GFX1032-NEXT: buffer_gl0_inv 720; GFX1032-NEXT: .LBB3_2: 721; GFX1032-NEXT: s_endpgm 722entry: 723 %lane = call i32 @llvm.amdgcn.workitem.id.x() 724 %old = atomicrmw add i32 addrspace(3)* 
@local_var32, i32 %lane acq_rel 725 ret void 726} 727 728define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) { 729; Local-memory (addrspace(3)) i64 atomicrmw add of the constant 5, with the returned old value stored to %out. 730; The checks expect only the lane whose mbcnt result is 0 to issue one ds_add_rtn_u64 of 5 * popcount(exec), and every lane to then add 5 * its mbcnt value to the readfirstlane'd result. 731; GFX7LESS-LABEL: add_i64_constant: 732; GFX7LESS: ; %bb.0: ; %entry 733; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec 734; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 735; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 736; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s5, v0 737; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 738; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 739; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc 740; GFX7LESS-NEXT: s_cbranch_execz .LBB4_2 741; GFX7LESS-NEXT: ; %bb.1: 742; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 743; GFX7LESS-NEXT: s_mul_i32 s4, s4, 5 744; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 745; GFX7LESS-NEXT: v_mov_b32_e32 v0, s4 746; GFX7LESS-NEXT: s_mov_b32 m0, -1 747; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 748; GFX7LESS-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] 749; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 750; GFX7LESS-NEXT: .LBB4_2: 751; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] 752; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 753; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v0 754; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1 755; GFX7LESS-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 756; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v2 757; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 758; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4 759; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s2, v0 760; GFX7LESS-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc 761; GFX7LESS-NEXT: s_mov_b32 s2, -1 762; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 763; GFX7LESS-NEXT: s_endpgm 764; 765; GFX8-LABEL: add_i64_constant: 766; GFX8: ; %bb.0: ; %entry 767; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 768; GFX8-NEXT: s_mov_b64 s[4:5], exec 769; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 770; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 771; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 772; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 773; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 774; 
GFX8-NEXT: s_cbranch_execz .LBB4_2 775; GFX8-NEXT: ; %bb.1: 776; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 777; GFX8-NEXT: s_mul_i32 s4, s4, 5 778; GFX8-NEXT: v_mov_b32_e32 v0, s4 779; GFX8-NEXT: v_mov_b32_e32 v1, 0 780; GFX8-NEXT: s_mov_b32 m0, -1 781; GFX8-NEXT: s_waitcnt lgkmcnt(0) 782; GFX8-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] 783; GFX8-NEXT: s_waitcnt lgkmcnt(0) 784; GFX8-NEXT: .LBB4_2: 785; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 786; GFX8-NEXT: s_waitcnt lgkmcnt(0) 787; GFX8-NEXT: v_readfirstlane_b32 s2, v0 788; GFX8-NEXT: v_readfirstlane_b32 s3, v1 789; GFX8-NEXT: v_mov_b32_e32 v0, s2 790; GFX8-NEXT: v_mov_b32_e32 v1, s3 791; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v2, 5, v[0:1] 792; GFX8-NEXT: s_mov_b32 s3, 0xf000 793; GFX8-NEXT: s_mov_b32 s2, -1 794; GFX8-NEXT: s_nop 2 795; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 796; GFX8-NEXT: s_endpgm 797; 798; GFX9-LABEL: add_i64_constant: 799; GFX9: ; %bb.0: ; %entry 800; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 801; GFX9-NEXT: s_mov_b64 s[4:5], exec 802; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 803; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 804; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 805; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 806; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 807; GFX9-NEXT: s_cbranch_execz .LBB4_2 808; GFX9-NEXT: ; %bb.1: 809; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 810; GFX9-NEXT: s_mul_i32 s4, s4, 5 811; GFX9-NEXT: v_mov_b32_e32 v0, s4 812; GFX9-NEXT: v_mov_b32_e32 v1, 0 813; GFX9-NEXT: s_waitcnt lgkmcnt(0) 814; GFX9-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] 815; GFX9-NEXT: s_waitcnt lgkmcnt(0) 816; GFX9-NEXT: .LBB4_2: 817; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 818; GFX9-NEXT: s_waitcnt lgkmcnt(0) 819; GFX9-NEXT: v_readfirstlane_b32 s2, v0 820; GFX9-NEXT: v_readfirstlane_b32 s3, v1 821; GFX9-NEXT: v_mov_b32_e32 v0, s2 822; GFX9-NEXT: v_mov_b32_e32 v1, s3 823; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v2, 5, v[0:1] 824; GFX9-NEXT: s_mov_b32 s3, 0xf000 825; GFX9-NEXT: s_mov_b32 s2, -1 
826; GFX9-NEXT: s_nop 2 827; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 828; GFX9-NEXT: s_endpgm 829; 830; GFX1064-LABEL: add_i64_constant: 831; GFX1064: ; %bb.0: ; %entry 832; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 833; GFX1064-NEXT: s_mov_b64 s[4:5], exec 834; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 835; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 836; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 837; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 838; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc 839; GFX1064-NEXT: s_cbranch_execz .LBB4_2 840; GFX1064-NEXT: ; %bb.1: 841; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 842; GFX1064-NEXT: v_mov_b32_e32 v1, 0 843; GFX1064-NEXT: s_mul_i32 s4, s4, 5 844; GFX1064-NEXT: v_mov_b32_e32 v0, s4 845; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 846; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 847; GFX1064-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] 848; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 849; GFX1064-NEXT: buffer_gl0_inv 850; GFX1064-NEXT: .LBB4_2: 851; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 852; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] 853; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 854; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 855; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v2, 5, s[2:3] 856; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 857; GFX1064-NEXT: s_mov_b32 s2, -1 858; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 859; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 860; GFX1064-NEXT: s_endpgm 861; 862; GFX1032-LABEL: add_i64_constant: 863; GFX1032: ; %bb.0: ; %entry 864; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 865; GFX1032-NEXT: s_mov_b32 s3, exec_lo 866; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 867; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s3, 0 868; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 869; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo 870; GFX1032-NEXT: s_cbranch_execz .LBB4_2 871; GFX1032-NEXT: ; %bb.1: 872; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 873; GFX1032-NEXT: v_mov_b32_e32 v1, 0 874; GFX1032-NEXT: 
s_mul_i32 s3, s3, 5 875; GFX1032-NEXT: v_mov_b32_e32 v0, s3 876; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 877; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 878; GFX1032-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] 879; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 880; GFX1032-NEXT: buffer_gl0_inv 881; GFX1032-NEXT: .LBB4_2: 882; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 883; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 884; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 885; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 886; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s2, v2, 5, s[2:3] 887; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 888; GFX1032-NEXT: s_mov_b32 s2, -1 889; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 890; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 891; GFX1032-NEXT: s_endpgm 892entry: 893 %old = atomicrmw add i64 addrspace(3)* @local_var64, i64 5 acq_rel 894 store i64 %old, i64 addrspace(1)* %out 895 ret void 896} 897 898define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 %additive) { 899; Local-memory (addrspace(3)) i64 atomicrmw add of the kernel argument %additive, with the returned old value stored to %out. 900; The checks expect only the lane whose mbcnt result is 0 to issue one ds_add_rtn_u64 of %additive * popcount(exec), built from 32-bit multiply pieces, and every lane to then add %additive * its mbcnt value to the readfirstlane'd result. 901; GFX7LESS-LABEL: add_i64_uniform: 902; GFX7LESS: ; %bb.0: ; %entry 903; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec 904; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 905; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 906; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s7, v0 907; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 908; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 909; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc 910; GFX7LESS-NEXT: s_cbranch_execz .LBB5_2 911; GFX7LESS-NEXT: ; %bb.1: 912; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 913; GFX7LESS-NEXT: v_mov_b32_e32 v3, 0 914; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 915; GFX7LESS-NEXT: s_mul_i32 s7, s3, s6 916; GFX7LESS-NEXT: v_mov_b32_e32 v0, s6 917; GFX7LESS-NEXT: v_mul_hi_u32 v0, s2, v0 918; GFX7LESS-NEXT: s_mul_i32 s6, s2, s6 919; GFX7LESS-NEXT: v_add_i32_e32 v1, vcc, s7, v0 920; GFX7LESS-NEXT: v_mov_b32_e32 v0, s6 921; GFX7LESS-NEXT: s_mov_b32 m0, -1 922; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 923; GFX7LESS-NEXT: 
ds_add_rtn_u64 v[0:1], v3, v[0:1] 924; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 925; GFX7LESS-NEXT: .LBB5_2: 926; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] 927; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 928; GFX7LESS-NEXT: s_mov_b32 s6, -1 929; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 930; GFX7LESS-NEXT: s_mov_b32 s4, s0 931; GFX7LESS-NEXT: s_mov_b32 s5, s1 932; GFX7LESS-NEXT: v_readfirstlane_b32 s0, v0 933; GFX7LESS-NEXT: v_readfirstlane_b32 s1, v1 934; GFX7LESS-NEXT: v_mul_lo_u32 v0, s3, v2 935; GFX7LESS-NEXT: v_mul_hi_u32 v1, s2, v2 936; GFX7LESS-NEXT: v_mul_lo_u32 v2, s2, v2 937; GFX7LESS-NEXT: v_add_i32_e32 v1, vcc, v1, v0 938; GFX7LESS-NEXT: v_mov_b32_e32 v3, s1 939; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s0, v2 940; GFX7LESS-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc 941; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 942; GFX7LESS-NEXT: s_endpgm 943; 944; GFX8-LABEL: add_i64_uniform: 945; GFX8: ; %bb.0: ; %entry 946; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 947; GFX8-NEXT: s_mov_b64 s[6:7], exec 948; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 949; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 950; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 951; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 952; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 953; GFX8-NEXT: s_cbranch_execz .LBB5_2 954; GFX8-NEXT: ; %bb.1: 955; GFX8-NEXT: s_bcnt1_i32_b64 s8, s[6:7] 956; GFX8-NEXT: v_mov_b32_e32 v0, s8 957; GFX8-NEXT: s_waitcnt lgkmcnt(0) 958; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[6:7], s2, v0, 0 959; GFX8-NEXT: s_mul_i32 s6, s3, s8 960; GFX8-NEXT: v_mov_b32_e32 v3, 0 961; GFX8-NEXT: v_add_u32_e32 v1, vcc, s6, v1 962; GFX8-NEXT: s_mov_b32 m0, -1 963; GFX8-NEXT: s_waitcnt lgkmcnt(0) 964; GFX8-NEXT: ds_add_rtn_u64 v[0:1], v3, v[0:1] 965; GFX8-NEXT: s_waitcnt lgkmcnt(0) 966; GFX8-NEXT: .LBB5_2: 967; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 968; GFX8-NEXT: s_waitcnt lgkmcnt(0) 969; GFX8-NEXT: v_readfirstlane_b32 s4, v0 970; GFX8-NEXT: v_readfirstlane_b32 s5, v1 971; GFX8-NEXT: v_mov_b32_e32 v0, s4 972; 
GFX8-NEXT: v_mov_b32_e32 v1, s5 973; GFX8-NEXT: v_mul_lo_u32 v3, s3, v2 974; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[2:3], s2, v2, v[0:1] 975; GFX8-NEXT: s_mov_b32 s7, 0xf000 976; GFX8-NEXT: s_mov_b32 s6, -1 977; GFX8-NEXT: s_mov_b32 s4, s0 978; GFX8-NEXT: s_mov_b32 s5, s1 979; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 980; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 981; GFX8-NEXT: s_endpgm 982; 983; GFX9-LABEL: add_i64_uniform: 984; GFX9: ; %bb.0: ; %entry 985; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 986; GFX9-NEXT: s_mov_b64 s[6:7], exec 987; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 988; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 989; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 990; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 991; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 992; GFX9-NEXT: s_cbranch_execz .LBB5_2 993; GFX9-NEXT: ; %bb.1: 994; GFX9-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 995; GFX9-NEXT: s_waitcnt lgkmcnt(0) 996; GFX9-NEXT: s_mul_i32 s7, s3, s6 997; GFX9-NEXT: s_mul_hi_u32 s8, s2, s6 998; GFX9-NEXT: s_add_i32 s8, s8, s7 999; GFX9-NEXT: s_mul_i32 s6, s2, s6 1000; GFX9-NEXT: v_mov_b32_e32 v0, s6 1001; GFX9-NEXT: v_mov_b32_e32 v1, s8 1002; GFX9-NEXT: v_mov_b32_e32 v3, 0 1003; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1004; GFX9-NEXT: ds_add_rtn_u64 v[0:1], v3, v[0:1] 1005; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1006; GFX9-NEXT: .LBB5_2: 1007; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 1008; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1009; GFX9-NEXT: v_readfirstlane_b32 s4, v0 1010; GFX9-NEXT: v_readfirstlane_b32 s5, v1 1011; GFX9-NEXT: v_mov_b32_e32 v0, s4 1012; GFX9-NEXT: v_mov_b32_e32 v1, s5 1013; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s2, v2, v[0:1] 1014; GFX9-NEXT: s_mov_b32 s7, 0xf000 1015; GFX9-NEXT: s_mov_b32 s6, -1 1016; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[2:3], s3, v2, v[1:2] 1017; GFX9-NEXT: s_mov_b32 s4, s0 1018; GFX9-NEXT: s_mov_b32 s5, s1 1019; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1020; GFX9-NEXT: s_endpgm 1021; 1022; GFX1064-LABEL: add_i64_uniform: 1023; 
GFX1064: ; %bb.0: ; %entry 1024; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1025; GFX1064-NEXT: s_mov_b64 s[6:7], exec 1026; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 1027; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 1028; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 1029; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 1030; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 1031; GFX1064-NEXT: s_cbranch_execz .LBB5_2 1032; GFX1064-NEXT: ; %bb.1: 1033; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 1034; GFX1064-NEXT: v_mov_b32_e32 v3, 0 1035; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1036; GFX1064-NEXT: s_mul_i32 s7, s3, s6 1037; GFX1064-NEXT: s_mul_hi_u32 s8, s2, s6 1038; GFX1064-NEXT: s_mul_i32 s6, s2, s6 1039; GFX1064-NEXT: s_add_i32 s8, s8, s7 1040; GFX1064-NEXT: v_mov_b32_e32 v0, s6 1041; GFX1064-NEXT: v_mov_b32_e32 v1, s8 1042; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1043; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 1044; GFX1064-NEXT: ds_add_rtn_u64 v[0:1], v3, v[0:1] 1045; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1046; GFX1064-NEXT: buffer_gl0_inv 1047; GFX1064-NEXT: .LBB5_2: 1048; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 1049; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 1050; GFX1064-NEXT: v_readfirstlane_b32 s4, v0 1051; GFX1064-NEXT: v_readfirstlane_b32 s5, v1 1052; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1053; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s2, v2, s[4:5] 1054; GFX1064-NEXT: v_mad_u64_u32 v[1:2], s[2:3], s3, v2, v[1:2] 1055; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 1056; GFX1064-NEXT: s_mov_b32 s2, -1 1057; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1058; GFX1064-NEXT: s_endpgm 1059; 1060; GFX1032-LABEL: add_i64_uniform: 1061; GFX1032: ; %bb.0: ; %entry 1062; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1063; GFX1032-NEXT: s_mov_b32 s5, exec_lo 1064; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 1065; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0 1066; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 1067; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo 1068; 
GFX1032-NEXT: s_cbranch_execz .LBB5_2 1069; GFX1032-NEXT: ; %bb.1: 1070; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s5 1071; GFX1032-NEXT: v_mov_b32_e32 v3, 0 1072; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1073; GFX1032-NEXT: s_mul_i32 s6, s3, s5 1074; GFX1032-NEXT: s_mul_hi_u32 s7, s2, s5 1075; GFX1032-NEXT: s_mul_i32 s5, s2, s5 1076; GFX1032-NEXT: s_add_i32 s7, s7, s6 1077; GFX1032-NEXT: v_mov_b32_e32 v0, s5 1078; GFX1032-NEXT: v_mov_b32_e32 v1, s7 1079; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1080; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 1081; GFX1032-NEXT: ds_add_rtn_u64 v[0:1], v3, v[0:1] 1082; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1083; GFX1032-NEXT: buffer_gl0_inv 1084; GFX1032-NEXT: .LBB5_2: 1085; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 1086; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 1087; GFX1032-NEXT: v_readfirstlane_b32 s4, v0 1088; GFX1032-NEXT: v_readfirstlane_b32 s5, v1 1089; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1090; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s2, s2, v2, s[4:5] 1091; GFX1032-NEXT: v_mad_u64_u32 v[1:2], s2, s3, v2, v[1:2] 1092; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 1093; GFX1032-NEXT: s_mov_b32 s2, -1 1094; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1095; GFX1032-NEXT: s_endpgm 1096entry: 1097 %old = atomicrmw add i64 addrspace(3)* @local_var64, i64 %additive acq_rel 1098 store i64 %old, i64 addrspace(1)* %out 1099 ret void 1100} 1101 1102define amdgpu_kernel void @add_i64_varying(i64 addrspace(1)* %out) { 1103; Local-memory (addrspace(3)) i64 atomicrmw add of the zero-extended workitem id, a value that differs per lane, with the returned old value stored to %out. 1104; Every check prefix for this function expects a direct ds_add_rtn_u64 followed by the store, with no mbcnt, saveexec or DPP sequence around it. 1105; GFX7LESS-LABEL: add_i64_varying: 1106; GFX7LESS: ; %bb.0: ; %entry 1107; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1108; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 1109; GFX7LESS-NEXT: s_mov_b32 m0, -1 1110; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1111; GFX7LESS-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] 1112; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1113; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 1114; GFX7LESS-NEXT: s_mov_b32 s2, -1 1115; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1116; GFX7LESS-NEXT: s_endpgm 1117; 1118; 
GFX8-LABEL: add_i64_varying: 1119; GFX8: ; %bb.0: ; %entry 1120; GFX8-NEXT: v_mov_b32_e32 v1, 0 1121; GFX8-NEXT: s_mov_b32 m0, -1 1122; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1123; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1124; GFX8-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] 1125; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1126; GFX8-NEXT: s_mov_b32 s3, 0xf000 1127; GFX8-NEXT: s_mov_b32 s2, -1 1128; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1129; GFX8-NEXT: s_endpgm 1130; 1131; GFX9-LABEL: add_i64_varying: 1132; GFX9: ; %bb.0: ; %entry 1133; GFX9-NEXT: v_mov_b32_e32 v1, 0 1134; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1135; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1136; GFX9-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] 1137; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1138; GFX9-NEXT: s_mov_b32 s3, 0xf000 1139; GFX9-NEXT: s_mov_b32 s2, -1 1140; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1141; GFX9-NEXT: s_endpgm 1142; 1143; GFX10-LABEL: add_i64_varying: 1144; GFX10: ; %bb.0: ; %entry 1145; GFX10-NEXT: v_mov_b32_e32 v1, 0 1146; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1147; GFX10-NEXT: s_mov_b32 s3, 0x31016000 1148; GFX10-NEXT: s_mov_b32 s2, -1 1149; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1150; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1151; GFX10-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] 1152; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1153; GFX10-NEXT: buffer_gl0_inv 1154; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1155; GFX10-NEXT: s_endpgm 1156entry: 1157 %lane = call i32 @llvm.amdgcn.workitem.id.x() 1158 %zext = zext i32 %lane to i64 1159 %old = atomicrmw add i64 addrspace(3)* @local_var64, i64 %zext acq_rel 1160 store i64 %old, i64 addrspace(1)* %out 1161 ret void 1162} 1163 1164define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out) { 1165; Local-memory (addrspace(3)) i32 atomicrmw sub of the constant 5, with the returned old value stored to %out. 1166; The checks expect only the lane whose mbcnt result is 0 to issue one ds_sub_rtn_u32 of 5 * popcount(exec), and every lane to then subtract 5 * its mbcnt value from the readfirstlane'd result. 1167; GFX7LESS-LABEL: sub_i32_constant: 1168; GFX7LESS: ; %bb.0: ; %entry 1169; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec 1170; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1171; GFX7LESS-NEXT: 
v_mbcnt_lo_u32_b32_e64 v0, s2, 0 1172; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 1173; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1174; GFX7LESS-NEXT: ; implicit-def: $vgpr1 1175; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc 1176; GFX7LESS-NEXT: s_cbranch_execz .LBB7_2 1177; GFX7LESS-NEXT: ; %bb.1: 1178; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1179; GFX7LESS-NEXT: s_mul_i32 s2, s2, 5 1180; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 1181; GFX7LESS-NEXT: v_mov_b32_e32 v2, s2 1182; GFX7LESS-NEXT: s_mov_b32 m0, -1 1183; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1184; GFX7LESS-NEXT: ds_sub_rtn_u32 v1, v1, v2 1185; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1186; GFX7LESS-NEXT: .LBB7_2: 1187; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] 1188; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1189; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v1 1190; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1191; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 1192; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 1193; GFX7LESS-NEXT: s_mov_b32 s2, -1 1194; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 1195; GFX7LESS-NEXT: s_endpgm 1196; 1197; GFX8-LABEL: sub_i32_constant: 1198; GFX8: ; %bb.0: ; %entry 1199; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1200; GFX8-NEXT: s_mov_b64 s[2:3], exec 1201; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1202; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 1203; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1204; GFX8-NEXT: ; implicit-def: $vgpr1 1205; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 1206; GFX8-NEXT: s_cbranch_execz .LBB7_2 1207; GFX8-NEXT: ; %bb.1: 1208; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1209; GFX8-NEXT: s_mul_i32 s2, s2, 5 1210; GFX8-NEXT: v_mov_b32_e32 v1, 0 1211; GFX8-NEXT: v_mov_b32_e32 v2, s2 1212; GFX8-NEXT: s_mov_b32 m0, -1 1213; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1214; GFX8-NEXT: ds_sub_rtn_u32 v1, v1, v2 1215; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1216; GFX8-NEXT: .LBB7_2: 1217; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 1218; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1219; GFX8-NEXT: v_readfirstlane_b32 
s2, v1 1220; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1221; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s2, v0 1222; GFX8-NEXT: s_mov_b32 s3, 0xf000 1223; GFX8-NEXT: s_mov_b32 s2, -1 1224; GFX8-NEXT: s_nop 0 1225; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 1226; GFX8-NEXT: s_endpgm 1227; 1228; GFX9-LABEL: sub_i32_constant: 1229; GFX9: ; %bb.0: ; %entry 1230; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1231; GFX9-NEXT: s_mov_b64 s[2:3], exec 1232; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1233; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 1234; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1235; GFX9-NEXT: ; implicit-def: $vgpr1 1236; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 1237; GFX9-NEXT: s_cbranch_execz .LBB7_2 1238; GFX9-NEXT: ; %bb.1: 1239; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1240; GFX9-NEXT: s_mul_i32 s2, s2, 5 1241; GFX9-NEXT: v_mov_b32_e32 v1, 0 1242; GFX9-NEXT: v_mov_b32_e32 v2, s2 1243; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1244; GFX9-NEXT: ds_sub_rtn_u32 v1, v1, v2 1245; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1246; GFX9-NEXT: .LBB7_2: 1247; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 1248; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1249; GFX9-NEXT: v_readfirstlane_b32 s2, v1 1250; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1251; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 1252; GFX9-NEXT: s_mov_b32 s3, 0xf000 1253; GFX9-NEXT: s_mov_b32 s2, -1 1254; GFX9-NEXT: s_nop 0 1255; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 1256; GFX9-NEXT: s_endpgm 1257; 1258; GFX1064-LABEL: sub_i32_constant: 1259; GFX1064: ; %bb.0: ; %entry 1260; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1261; GFX1064-NEXT: s_mov_b64 s[2:3], exec 1262; GFX1064-NEXT: ; implicit-def: $vgpr1 1263; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1264; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 1265; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1266; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 1267; GFX1064-NEXT: s_cbranch_execz .LBB7_2 1268; GFX1064-NEXT: ; %bb.1: 1269; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1270; GFX1064-NEXT: v_mov_b32_e32 
v1, 0 1271; GFX1064-NEXT: s_mul_i32 s2, s2, 5 1272; GFX1064-NEXT: v_mov_b32_e32 v2, s2 1273; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1274; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 1275; GFX1064-NEXT: ds_sub_rtn_u32 v1, v1, v2 1276; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1277; GFX1064-NEXT: buffer_gl0_inv 1278; GFX1064-NEXT: .LBB7_2: 1279; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 1280; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 1281; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 1282; GFX1064-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1283; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 1284; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s2, v0 1285; GFX1064-NEXT: s_mov_b32 s2, -1 1286; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1287; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 1288; GFX1064-NEXT: s_endpgm 1289; 1290; GFX1032-LABEL: sub_i32_constant: 1291; GFX1032: ; %bb.0: ; %entry 1292; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1293; GFX1032-NEXT: s_mov_b32 s3, exec_lo 1294; GFX1032-NEXT: ; implicit-def: $vgpr1 1295; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 1296; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 1297; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo 1298; GFX1032-NEXT: s_cbranch_execz .LBB7_2 1299; GFX1032-NEXT: ; %bb.1: 1300; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 1301; GFX1032-NEXT: v_mov_b32_e32 v1, 0 1302; GFX1032-NEXT: s_mul_i32 s3, s3, 5 1303; GFX1032-NEXT: v_mov_b32_e32 v2, s3 1304; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1305; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 1306; GFX1032-NEXT: ds_sub_rtn_u32 v1, v1, v2 1307; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1308; GFX1032-NEXT: buffer_gl0_inv 1309; GFX1032-NEXT: .LBB7_2: 1310; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 1311; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 1312; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 1313; GFX1032-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1314; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 1315; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s2, v0 1316; GFX1032-NEXT: s_mov_b32 s2, -1 1317; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1318; 
GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 1319; GFX1032-NEXT: s_endpgm 1320entry: 1321 %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 5 acq_rel 1322 store i32 %old, i32 addrspace(1)* %out 1323 ret void 1324} 1325 1326define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 %subitive) { 1327; Local-memory (addrspace(3)) i32 atomicrmw sub of the kernel argument %subitive, with the returned old value stored to %out. 1328; The checks expect only the lane whose mbcnt result is 0 to issue one ds_sub_rtn_u32 of %subitive * popcount(exec), and every lane to then subtract %subitive * its mbcnt value from the readfirstlane'd result. 1329; GFX7LESS-LABEL: sub_i32_uniform: 1330; GFX7LESS: ; %bb.0: ; %entry 1331; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec 1332; GFX7LESS-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 1333; GFX7LESS-NEXT: s_load_dword s6, s[0:1], 0xb 1334; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 1335; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 1336; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1337; GFX7LESS-NEXT: ; implicit-def: $vgpr1 1338; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc 1339; GFX7LESS-NEXT: s_cbranch_execz .LBB8_2 1340; GFX7LESS-NEXT: ; %bb.1: 1341; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1342; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1343; GFX7LESS-NEXT: s_mul_i32 s2, s6, s2 1344; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 1345; GFX7LESS-NEXT: v_mov_b32_e32 v2, s2 1346; GFX7LESS-NEXT: s_mov_b32 m0, -1 1347; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1348; GFX7LESS-NEXT: ds_sub_rtn_u32 v1, v1, v2 1349; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1350; GFX7LESS-NEXT: .LBB8_2: 1351; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1] 1352; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1353; GFX7LESS-NEXT: v_readfirstlane_b32 s0, v1 1354; GFX7LESS-NEXT: v_mul_lo_u32 v0, s6, v0 1355; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 1356; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 1357; GFX7LESS-NEXT: s_mov_b32 s6, -1 1358; GFX7LESS-NEXT: buffer_store_dword v0, off, s[4:7], 0 1359; GFX7LESS-NEXT: s_endpgm 1360; 1361; GFX8-LABEL: sub_i32_uniform: 1362; GFX8: ; %bb.0: ; %entry 1363; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 1364; GFX8-NEXT: s_load_dword s6, s[0:1], 0x2c 1365; GFX8-NEXT: s_mov_b64 s[2:3], exec 1366; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1367; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, 
s3, v0 1368; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1369; GFX8-NEXT: ; implicit-def: $vgpr1 1370; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc 1371; GFX8-NEXT: s_cbranch_execz .LBB8_2 1372; GFX8-NEXT: ; %bb.1: 1373; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1374; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1375; GFX8-NEXT: s_mul_i32 s2, s6, s2 1376; GFX8-NEXT: v_mov_b32_e32 v1, 0 1377; GFX8-NEXT: v_mov_b32_e32 v2, s2 1378; GFX8-NEXT: s_mov_b32 m0, -1 1379; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1380; GFX8-NEXT: ds_sub_rtn_u32 v1, v1, v2 1381; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1382; GFX8-NEXT: .LBB8_2: 1383; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] 1384; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1385; GFX8-NEXT: v_mul_lo_u32 v0, s6, v0 1386; GFX8-NEXT: v_readfirstlane_b32 s0, v1 1387; GFX8-NEXT: s_mov_b32 s7, 0xf000 1388; GFX8-NEXT: s_mov_b32 s6, -1 1389; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s0, v0 1390; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0 1391; GFX8-NEXT: s_endpgm 1392; 1393; GFX9-LABEL: sub_i32_uniform: 1394; GFX9: ; %bb.0: ; %entry 1395; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 1396; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c 1397; GFX9-NEXT: s_mov_b64 s[2:3], exec 1398; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1399; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 1400; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1401; GFX9-NEXT: ; implicit-def: $vgpr1 1402; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc 1403; GFX9-NEXT: s_cbranch_execz .LBB8_2 1404; GFX9-NEXT: ; %bb.1: 1405; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1406; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1407; GFX9-NEXT: s_mul_i32 s2, s6, s2 1408; GFX9-NEXT: v_mov_b32_e32 v1, 0 1409; GFX9-NEXT: v_mov_b32_e32 v2, s2 1410; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1411; GFX9-NEXT: ds_sub_rtn_u32 v1, v1, v2 1412; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1413; GFX9-NEXT: .LBB8_2: 1414; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] 1415; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1416; GFX9-NEXT: v_mul_lo_u32 v0, s6, v0 1417; GFX9-NEXT: v_readfirstlane_b32 s0, v1 1418; GFX9-NEXT: s_mov_b32 s7, 0xf000 
1419; GFX9-NEXT: s_mov_b32 s6, -1 1420; GFX9-NEXT: v_sub_u32_e32 v0, s0, v0 1421; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 1422; GFX9-NEXT: s_endpgm 1423; 1424; GFX1064-LABEL: sub_i32_uniform: 1425; GFX1064: ; %bb.0: ; %entry 1426; GFX1064-NEXT: s_clause 0x1 1427; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 1428; GFX1064-NEXT: s_load_dword s6, s[0:1], 0x2c 1429; GFX1064-NEXT: s_mov_b64 s[2:3], exec 1430; GFX1064-NEXT: ; implicit-def: $vgpr1 1431; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1432; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 1433; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1434; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc 1435; GFX1064-NEXT: s_cbranch_execz .LBB8_2 1436; GFX1064-NEXT: ; %bb.1: 1437; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1438; GFX1064-NEXT: v_mov_b32_e32 v1, 0 1439; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1440; GFX1064-NEXT: s_mul_i32 s2, s6, s2 1441; GFX1064-NEXT: v_mov_b32_e32 v2, s2 1442; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1443; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 1444; GFX1064-NEXT: ds_sub_rtn_u32 v1, v1, v2 1445; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1446; GFX1064-NEXT: buffer_gl0_inv 1447; GFX1064-NEXT: .LBB8_2: 1448; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 1449; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] 1450; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1451; GFX1064-NEXT: v_mul_lo_u32 v0, s6, v0 1452; GFX1064-NEXT: v_readfirstlane_b32 s0, v1 1453; GFX1064-NEXT: s_mov_b32 s7, 0x31016000 1454; GFX1064-NEXT: s_mov_b32 s6, -1 1455; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s0, v0 1456; GFX1064-NEXT: buffer_store_dword v0, off, s[4:7], 0 1457; GFX1064-NEXT: s_endpgm 1458; 1459; GFX1032-LABEL: sub_i32_uniform: 1460; GFX1032: ; %bb.0: ; %entry 1461; GFX1032-NEXT: s_clause 0x1 1462; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 1463; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x2c 1464; GFX1032-NEXT: s_mov_b32 s3, exec_lo 1465; GFX1032-NEXT: ; implicit-def: $vgpr1 1466; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 1467; GFX1032-NEXT: 
v_cmp_eq_u32_e32 vcc_lo, 0, v0 1468; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo 1469; GFX1032-NEXT: s_cbranch_execz .LBB8_2 1470; GFX1032-NEXT: ; %bb.1: 1471; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s3 1472; GFX1032-NEXT: v_mov_b32_e32 v1, 0 1473; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1474; GFX1032-NEXT: s_mul_i32 s1, s2, s1 1475; GFX1032-NEXT: v_mov_b32_e32 v2, s1 1476; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1477; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 1478; GFX1032-NEXT: ds_sub_rtn_u32 v1, v1, v2 1479; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1480; GFX1032-NEXT: buffer_gl0_inv 1481; GFX1032-NEXT: .LBB8_2: 1482; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 1483; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 1484; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1485; GFX1032-NEXT: v_mul_lo_u32 v0, s2, v0 1486; GFX1032-NEXT: v_readfirstlane_b32 s0, v1 1487; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 1488; GFX1032-NEXT: s_mov_b32 s6, -1 1489; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s0, v0 1490; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0 1491; GFX1032-NEXT: s_endpgm 1492entry: 1493 %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 %subitive acq_rel 1494 store i32 %old, i32 addrspace(1)* %out 1495 ret void 1496} 1497 1498define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) { 1499; 1500; 1501; GFX7LESS-LABEL: sub_i32_varying: 1502; GFX7LESS: ; %bb.0: ; %entry 1503; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1504; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 1505; GFX7LESS-NEXT: s_mov_b32 m0, -1 1506; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1507; GFX7LESS-NEXT: ds_sub_rtn_u32 v0, v1, v0 1508; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1509; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 1510; GFX7LESS-NEXT: s_mov_b32 s2, -1 1511; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 1512; GFX7LESS-NEXT: s_endpgm 1513; 1514; GFX8-LABEL: sub_i32_varying: 1515; GFX8: ; %bb.0: ; %entry 1516; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1517; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 1518; GFX8-NEXT: 
v_mov_b32_e32 v1, 0 1519; GFX8-NEXT: s_mov_b64 exec, s[2:3] 1520; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 1521; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 1522; GFX8-NEXT: v_mov_b32_e32 v2, v0 1523; GFX8-NEXT: s_not_b64 exec, exec 1524; GFX8-NEXT: v_mov_b32_e32 v2, 0 1525; GFX8-NEXT: s_not_b64 exec, exec 1526; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 1527; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 1528; GFX8-NEXT: s_nop 1 1529; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 1530; GFX8-NEXT: s_nop 1 1531; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 1532; GFX8-NEXT: s_nop 1 1533; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 1534; GFX8-NEXT: s_nop 1 1535; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 1536; GFX8-NEXT: s_nop 1 1537; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 1538; GFX8-NEXT: v_readlane_b32 s4, v2, 63 1539; GFX8-NEXT: s_nop 0 1540; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 1541; GFX8-NEXT: s_mov_b64 exec, s[2:3] 1542; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 1543; GFX8-NEXT: ; implicit-def: $vgpr0 1544; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 1545; GFX8-NEXT: s_cbranch_execz .LBB9_2 1546; GFX8-NEXT: ; %bb.1: 1547; GFX8-NEXT: v_mov_b32_e32 v0, 0 1548; GFX8-NEXT: v_mov_b32_e32 v3, s4 1549; GFX8-NEXT: s_mov_b32 m0, -1 1550; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1551; GFX8-NEXT: ds_sub_rtn_u32 v0, v0, v3 1552; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1553; GFX8-NEXT: .LBB9_2: 1554; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 1555; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1556; GFX8-NEXT: v_readfirstlane_b32 s2, v0 1557; GFX8-NEXT: v_mov_b32_e32 v0, v1 1558; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s2, v0 1559; GFX8-NEXT: s_mov_b32 s3, 0xf000 1560; GFX8-NEXT: s_mov_b32 s2, -1 1561; GFX8-NEXT: s_nop 0 1562; 
GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 1563; GFX8-NEXT: s_endpgm 1564; 1565; GFX9-LABEL: sub_i32_varying: 1566; GFX9: ; %bb.0: ; %entry 1567; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1568; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 1569; GFX9-NEXT: v_mov_b32_e32 v1, 0 1570; GFX9-NEXT: s_mov_b64 exec, s[2:3] 1571; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 1572; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 1573; GFX9-NEXT: v_mov_b32_e32 v2, v0 1574; GFX9-NEXT: s_not_b64 exec, exec 1575; GFX9-NEXT: v_mov_b32_e32 v2, 0 1576; GFX9-NEXT: s_not_b64 exec, exec 1577; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 1578; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 1579; GFX9-NEXT: s_nop 1 1580; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 1581; GFX9-NEXT: s_nop 1 1582; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 1583; GFX9-NEXT: s_nop 1 1584; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 1585; GFX9-NEXT: s_nop 1 1586; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 1587; GFX9-NEXT: s_nop 1 1588; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 1589; GFX9-NEXT: v_readlane_b32 s4, v2, 63 1590; GFX9-NEXT: s_nop 0 1591; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 1592; GFX9-NEXT: s_mov_b64 exec, s[2:3] 1593; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 1594; GFX9-NEXT: ; implicit-def: $vgpr0 1595; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 1596; GFX9-NEXT: s_cbranch_execz .LBB9_2 1597; GFX9-NEXT: ; %bb.1: 1598; GFX9-NEXT: v_mov_b32_e32 v0, 0 1599; GFX9-NEXT: v_mov_b32_e32 v3, s4 1600; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1601; GFX9-NEXT: ds_sub_rtn_u32 v0, v0, v3 1602; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1603; GFX9-NEXT: .LBB9_2: 1604; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 1605; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1606; GFX9-NEXT: 
v_readfirstlane_b32 s2, v0 1607; GFX9-NEXT: v_mov_b32_e32 v0, v1 1608; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 1609; GFX9-NEXT: s_mov_b32 s3, 0xf000 1610; GFX9-NEXT: s_mov_b32 s2, -1 1611; GFX9-NEXT: s_nop 0 1612; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 1613; GFX9-NEXT: s_endpgm 1614; 1615; GFX1064-LABEL: sub_i32_varying: 1616; GFX1064: ; %bb.0: ; %entry 1617; GFX1064-NEXT: v_mov_b32_e32 v1, v0 1618; GFX1064-NEXT: s_not_b64 exec, exec 1619; GFX1064-NEXT: v_mov_b32_e32 v1, 0 1620; GFX1064-NEXT: s_not_b64 exec, exec 1621; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 1622; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 1623; GFX1064-NEXT: v_mov_b32_e32 v3, 0 1624; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 1625; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 1626; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 1627; GFX1064-NEXT: v_mov_b32_e32 v2, v1 1628; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 1629; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 1630; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 1631; GFX1064-NEXT: v_mov_b32_e32 v2, s4 1632; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 1633; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 1634; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 1635; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 1636; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1637; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 1638; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 1639; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 1640; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 1641; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 1642; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 1643; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 1644; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 1645; GFX1064-NEXT: 
v_writelane_b32 v3, s5, 32 1646; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 1647; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 1648; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 1649; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 1650; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 1651; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1652; GFX1064-NEXT: s_mov_b32 s2, -1 1653; GFX1064-NEXT: ; implicit-def: $vgpr0 1654; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 1655; GFX1064-NEXT: s_cbranch_execz .LBB9_2 1656; GFX1064-NEXT: ; %bb.1: 1657; GFX1064-NEXT: v_mov_b32_e32 v0, 0 1658; GFX1064-NEXT: v_mov_b32_e32 v4, s7 1659; GFX1064-NEXT: s_mov_b32 s3, s7 1660; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1661; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 1662; GFX1064-NEXT: ds_sub_rtn_u32 v0, v0, v4 1663; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1664; GFX1064-NEXT: buffer_gl0_inv 1665; GFX1064-NEXT: .LBB9_2: 1666; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 1667; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 1668; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 1669; GFX1064-NEXT: v_mov_b32_e32 v0, v3 1670; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s3, v0 1671; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 1672; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1673; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 1674; GFX1064-NEXT: s_endpgm 1675; 1676; GFX1032-LABEL: sub_i32_varying: 1677; GFX1032: ; %bb.0: ; %entry 1678; GFX1032-NEXT: v_mov_b32_e32 v1, v0 1679; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 1680; GFX1032-NEXT: v_mov_b32_e32 v1, 0 1681; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 1682; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 1683; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 1684; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 1685; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 1686; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 1687; GFX1032-NEXT: 
v_mov_b32_e32 v2, v1 1688; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 1689; GFX1032-NEXT: s_mov_b32 exec_lo, s2 1690; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1691; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 1692; GFX1032-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 1693; GFX1032-NEXT: v_mov_b32_e32 v3, 0 1694; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 1695; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 1696; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 1697; GFX1032-NEXT: s_mov_b32 exec_lo, s2 1698; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 1699; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 1700; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 1701; GFX1032-NEXT: s_mov_b32 exec_lo, s2 1702; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 1703; GFX1032-NEXT: s_mov_b32 s2, -1 1704; GFX1032-NEXT: ; implicit-def: $vgpr0 1705; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 1706; GFX1032-NEXT: s_cbranch_execz .LBB9_2 1707; GFX1032-NEXT: ; %bb.1: 1708; GFX1032-NEXT: v_mov_b32_e32 v0, 0 1709; GFX1032-NEXT: v_mov_b32_e32 v4, s4 1710; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1711; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 1712; GFX1032-NEXT: ds_sub_rtn_u32 v0, v0, v4 1713; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1714; GFX1032-NEXT: buffer_gl0_inv 1715; GFX1032-NEXT: .LBB9_2: 1716; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 1717; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 1718; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 1719; GFX1032-NEXT: v_mov_b32_e32 v0, v3 1720; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s3, v0 1721; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 1722; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1723; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 1724; GFX1032-NEXT: s_endpgm 1725entry: 1726 %lane = call i32 @llvm.amdgcn.workitem.id.x() 1727 %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 %lane acq_rel 1728 store i32 %old, i32 addrspace(1)* %out 1729 ret void 1730} 1731 1732define amdgpu_kernel void @sub_i32_varying_nouse() 
{ 1733; GFX7LESS-LABEL: sub_i32_varying_nouse: 1734; GFX7LESS: ; %bb.0: ; %entry 1735; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 1736; GFX7LESS-NEXT: s_mov_b32 m0, -1 1737; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1738; GFX7LESS-NEXT: ds_sub_u32 v1, v0 1739; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1740; GFX7LESS-NEXT: s_endpgm 1741; 1742; GFX8-LABEL: sub_i32_varying_nouse: 1743; GFX8: ; %bb.0: ; %entry 1744; GFX8-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 1745; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 1746; GFX8-NEXT: v_mov_b32_e32 v1, v0 1747; GFX8-NEXT: s_not_b64 exec, exec 1748; GFX8-NEXT: v_mov_b32_e32 v1, 0 1749; GFX8-NEXT: s_not_b64 exec, exec 1750; GFX8-NEXT: s_or_saveexec_b64 s[0:1], -1 1751; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 1752; GFX8-NEXT: s_nop 1 1753; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 1754; GFX8-NEXT: s_nop 1 1755; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 1756; GFX8-NEXT: s_nop 1 1757; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 1758; GFX8-NEXT: s_nop 1 1759; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf 1760; GFX8-NEXT: s_nop 1 1761; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf 1762; GFX8-NEXT: v_readlane_b32 s2, v1, 63 1763; GFX8-NEXT: s_mov_b64 exec, s[0:1] 1764; GFX8-NEXT: s_mov_b32 s0, s2 1765; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 1766; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 1767; GFX8-NEXT: s_cbranch_execz .LBB10_2 1768; GFX8-NEXT: ; %bb.1: 1769; GFX8-NEXT: v_mov_b32_e32 v0, 0 1770; GFX8-NEXT: v_mov_b32_e32 v2, s0 1771; GFX8-NEXT: s_mov_b32 m0, -1 1772; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1773; GFX8-NEXT: ds_sub_u32 v0, v2 1774; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1775; GFX8-NEXT: .LBB10_2: 1776; GFX8-NEXT: s_endpgm 1777; 1778; GFX9-LABEL: sub_i32_varying_nouse: 1779; GFX9: ; %bb.0: ; 
%entry 1780; GFX9-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 1781; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 1782; GFX9-NEXT: v_mov_b32_e32 v1, v0 1783; GFX9-NEXT: s_not_b64 exec, exec 1784; GFX9-NEXT: v_mov_b32_e32 v1, 0 1785; GFX9-NEXT: s_not_b64 exec, exec 1786; GFX9-NEXT: s_or_saveexec_b64 s[0:1], -1 1787; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 1788; GFX9-NEXT: s_nop 1 1789; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 1790; GFX9-NEXT: s_nop 1 1791; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 1792; GFX9-NEXT: s_nop 1 1793; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 1794; GFX9-NEXT: s_nop 1 1795; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf 1796; GFX9-NEXT: s_nop 1 1797; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf 1798; GFX9-NEXT: v_readlane_b32 s2, v1, 63 1799; GFX9-NEXT: s_mov_b64 exec, s[0:1] 1800; GFX9-NEXT: s_mov_b32 s0, s2 1801; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 1802; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 1803; GFX9-NEXT: s_cbranch_execz .LBB10_2 1804; GFX9-NEXT: ; %bb.1: 1805; GFX9-NEXT: v_mov_b32_e32 v0, 0 1806; GFX9-NEXT: v_mov_b32_e32 v2, s0 1807; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1808; GFX9-NEXT: ds_sub_u32 v0, v2 1809; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1810; GFX9-NEXT: .LBB10_2: 1811; GFX9-NEXT: s_endpgm 1812; 1813; GFX1064-LABEL: sub_i32_varying_nouse: 1814; GFX1064: ; %bb.0: ; %entry 1815; GFX1064-NEXT: v_mov_b32_e32 v1, v0 1816; GFX1064-NEXT: s_not_b64 exec, exec 1817; GFX1064-NEXT: v_mov_b32_e32 v1, 0 1818; GFX1064-NEXT: s_not_b64 exec, exec 1819; GFX1064-NEXT: s_or_saveexec_b64 s[0:1], -1 1820; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 1821; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 1822; GFX1064-NEXT: 
v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 1823; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 1824; GFX1064-NEXT: v_mov_b32_e32 v2, v1 1825; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 1826; GFX1064-NEXT: v_add_nc_u32_e32 v1, v1, v2 1827; GFX1064-NEXT: s_mov_b64 exec, s[0:1] 1828; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 1829; GFX1064-NEXT: s_or_saveexec_b64 s[0:1], -1 1830; GFX1064-NEXT: v_readlane_b32 s2, v1, 0 1831; GFX1064-NEXT: v_readlane_b32 s3, v1, 32 1832; GFX1064-NEXT: s_mov_b64 exec, s[0:1] 1833; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 1834; GFX1064-NEXT: s_add_i32 s0, s2, s3 1835; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1836; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc 1837; GFX1064-NEXT: s_cbranch_execz .LBB10_2 1838; GFX1064-NEXT: ; %bb.1: 1839; GFX1064-NEXT: v_mov_b32_e32 v0, 0 1840; GFX1064-NEXT: v_mov_b32_e32 v3, s0 1841; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1842; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 1843; GFX1064-NEXT: ds_sub_u32 v0, v3 1844; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1845; GFX1064-NEXT: buffer_gl0_inv 1846; GFX1064-NEXT: .LBB10_2: 1847; GFX1064-NEXT: s_endpgm 1848; 1849; GFX1032-LABEL: sub_i32_varying_nouse: 1850; GFX1032: ; %bb.0: ; %entry 1851; GFX1032-NEXT: v_mov_b32_e32 v1, v0 1852; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 1853; GFX1032-NEXT: v_mov_b32_e32 v1, 0 1854; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 1855; GFX1032-NEXT: s_or_saveexec_b32 s0, -1 1856; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 1857; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 1858; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 1859; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 1860; GFX1032-NEXT: v_mov_b32_e32 v2, v1 1861; GFX1032-NEXT: 
v_permlanex16_b32 v2, v2, -1, -1 1862; GFX1032-NEXT: v_add_nc_u32_e32 v1, v1, v2 1863; GFX1032-NEXT: s_mov_b32 exec_lo, s0 1864; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 1865; GFX1032-NEXT: v_mov_b32_e32 v0, v1 1866; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 1867; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo 1868; GFX1032-NEXT: s_cbranch_execz .LBB10_2 1869; GFX1032-NEXT: ; %bb.1: 1870; GFX1032-NEXT: v_mov_b32_e32 v3, 0 1871; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1872; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 1873; GFX1032-NEXT: ds_sub_u32 v3, v0 1874; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1875; GFX1032-NEXT: buffer_gl0_inv 1876; GFX1032-NEXT: .LBB10_2: 1877; GFX1032-NEXT: s_endpgm 1878entry: 1879 %lane = call i32 @llvm.amdgcn.workitem.id.x() 1880 %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 %lane acq_rel 1881 ret void 1882} 1883 1884define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) { 1885; 1886; 1887; GFX7LESS-LABEL: sub_i64_constant: 1888; GFX7LESS: ; %bb.0: ; %entry 1889; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec 1890; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1891; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 1892; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s5, v0 1893; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 1894; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 1895; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc 1896; GFX7LESS-NEXT: s_cbranch_execz .LBB11_2 1897; GFX7LESS-NEXT: ; %bb.1: 1898; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 1899; GFX7LESS-NEXT: s_mul_i32 s4, s4, 5 1900; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 1901; GFX7LESS-NEXT: v_mov_b32_e32 v0, s4 1902; GFX7LESS-NEXT: s_mov_b32 m0, -1 1903; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1904; GFX7LESS-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] 1905; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1906; GFX7LESS-NEXT: .LBB11_2: 1907; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] 1908; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1909; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v0 1910; 
GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1 1911; GFX7LESS-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 1912; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v2 1913; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 1914; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4 1915; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 1916; GFX7LESS-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc 1917; GFX7LESS-NEXT: s_mov_b32 s2, -1 1918; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1919; GFX7LESS-NEXT: s_endpgm 1920; 1921; GFX8-LABEL: sub_i64_constant: 1922; GFX8: ; %bb.0: ; %entry 1923; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1924; GFX8-NEXT: s_mov_b64 s[4:5], exec 1925; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 1926; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 1927; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 1928; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 1929; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 1930; GFX8-NEXT: s_cbranch_execz .LBB11_2 1931; GFX8-NEXT: ; %bb.1: 1932; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 1933; GFX8-NEXT: s_mul_i32 s4, s4, 5 1934; GFX8-NEXT: v_mov_b32_e32 v0, s4 1935; GFX8-NEXT: v_mov_b32_e32 v1, 0 1936; GFX8-NEXT: s_mov_b32 m0, -1 1937; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1938; GFX8-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] 1939; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1940; GFX8-NEXT: .LBB11_2: 1941; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 1942; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1943; GFX8-NEXT: v_readfirstlane_b32 s2, v0 1944; GFX8-NEXT: v_readfirstlane_b32 s3, v1 1945; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v2 1946; GFX8-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 1947; GFX8-NEXT: v_mov_b32_e32 v2, s3 1948; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s2, v0 1949; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc 1950; GFX8-NEXT: s_mov_b32 s3, 0xf000 1951; GFX8-NEXT: s_mov_b32 s2, -1 1952; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1953; GFX8-NEXT: s_endpgm 1954; 1955; GFX9-LABEL: sub_i64_constant: 1956; GFX9: ; %bb.0: ; %entry 1957; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1958; GFX9-NEXT: s_mov_b64 s[4:5], 
exec 1959; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 1960; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 1961; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 1962; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 1963; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 1964; GFX9-NEXT: s_cbranch_execz .LBB11_2 1965; GFX9-NEXT: ; %bb.1: 1966; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 1967; GFX9-NEXT: s_mul_i32 s4, s4, 5 1968; GFX9-NEXT: v_mov_b32_e32 v0, s4 1969; GFX9-NEXT: v_mov_b32_e32 v1, 0 1970; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1971; GFX9-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] 1972; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1973; GFX9-NEXT: .LBB11_2: 1974; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 1975; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1976; GFX9-NEXT: v_readfirstlane_b32 s2, v0 1977; GFX9-NEXT: v_readfirstlane_b32 s3, v1 1978; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v2 1979; GFX9-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 1980; GFX9-NEXT: v_mov_b32_e32 v2, s3 1981; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s2, v0 1982; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc 1983; GFX9-NEXT: s_mov_b32 s3, 0xf000 1984; GFX9-NEXT: s_mov_b32 s2, -1 1985; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1986; GFX9-NEXT: s_endpgm 1987; 1988; GFX1064-LABEL: sub_i64_constant: 1989; GFX1064: ; %bb.0: ; %entry 1990; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1991; GFX1064-NEXT: s_mov_b64 s[4:5], exec 1992; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 1993; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 1994; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 1995; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 1996; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc 1997; GFX1064-NEXT: s_cbranch_execz .LBB11_2 1998; GFX1064-NEXT: ; %bb.1: 1999; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 2000; GFX1064-NEXT: v_mov_b32_e32 v1, 0 2001; GFX1064-NEXT: s_mul_i32 s4, s4, 5 2002; GFX1064-NEXT: v_mov_b32_e32 v0, s4 2003; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2004; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 2005; GFX1064-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] 
2006; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2007; GFX1064-NEXT: buffer_gl0_inv 2008; GFX1064-NEXT: .LBB11_2: 2009; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 2010; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] 2011; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 2012; GFX1064-NEXT: v_mul_u32_u24_e32 v0, 5, v2 2013; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 2014; GFX1064-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 2015; GFX1064-NEXT: v_sub_co_u32 v0, vcc, s2, v0 2016; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc 2017; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 2018; GFX1064-NEXT: s_mov_b32 s2, -1 2019; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2020; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2021; GFX1064-NEXT: s_endpgm 2022; 2023; GFX1032-LABEL: sub_i64_constant: 2024; GFX1032: ; %bb.0: ; %entry 2025; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2026; GFX1032-NEXT: s_mov_b32 s3, exec_lo 2027; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 2028; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s3, 0 2029; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 2030; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo 2031; GFX1032-NEXT: s_cbranch_execz .LBB11_2 2032; GFX1032-NEXT: ; %bb.1: 2033; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 2034; GFX1032-NEXT: v_mov_b32_e32 v1, 0 2035; GFX1032-NEXT: s_mul_i32 s3, s3, 5 2036; GFX1032-NEXT: v_mov_b32_e32 v0, s3 2037; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2038; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 2039; GFX1032-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] 2040; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2041; GFX1032-NEXT: buffer_gl0_inv 2042; GFX1032-NEXT: .LBB11_2: 2043; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 2044; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 2045; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 2046; GFX1032-NEXT: v_mul_u32_u24_e32 v0, 5, v2 2047; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 2048; GFX1032-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 2049; GFX1032-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v0 2050; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo 2051; 
GFX1032-NEXT: s_mov_b32 s3, 0x31016000 2052; GFX1032-NEXT: s_mov_b32 s2, -1 2053; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2054; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2055; GFX1032-NEXT: s_endpgm 2056entry: 2057 %old = atomicrmw sub i64 addrspace(3)* @local_var64, i64 5 acq_rel 2058 store i64 %old, i64 addrspace(1)* %out 2059 ret void 2060} 2061 2062define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 %subitive) { 2063; 2064; 2065; GFX7LESS-LABEL: sub_i64_uniform: 2066; GFX7LESS: ; %bb.0: ; %entry 2067; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec 2068; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 2069; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 2070; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s7, v0 2071; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 2072; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 2073; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc 2074; GFX7LESS-NEXT: s_cbranch_execz .LBB12_2 2075; GFX7LESS-NEXT: ; %bb.1: 2076; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 2077; GFX7LESS-NEXT: v_mov_b32_e32 v3, 0 2078; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2079; GFX7LESS-NEXT: s_mul_i32 s7, s3, s6 2080; GFX7LESS-NEXT: v_mov_b32_e32 v0, s6 2081; GFX7LESS-NEXT: v_mul_hi_u32 v0, s2, v0 2082; GFX7LESS-NEXT: s_mul_i32 s6, s2, s6 2083; GFX7LESS-NEXT: v_add_i32_e32 v1, vcc, s7, v0 2084; GFX7LESS-NEXT: v_mov_b32_e32 v0, s6 2085; GFX7LESS-NEXT: s_mov_b32 m0, -1 2086; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2087; GFX7LESS-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1] 2088; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2089; GFX7LESS-NEXT: .LBB12_2: 2090; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] 2091; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 2092; GFX7LESS-NEXT: s_mov_b32 s6, -1 2093; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2094; GFX7LESS-NEXT: s_mov_b32 s4, s0 2095; GFX7LESS-NEXT: s_mov_b32 s5, s1 2096; GFX7LESS-NEXT: v_readfirstlane_b32 s0, v0 2097; GFX7LESS-NEXT: v_readfirstlane_b32 s1, v1 2098; GFX7LESS-NEXT: v_mul_lo_u32 v0, s3, v2 2099; GFX7LESS-NEXT: v_mul_hi_u32 v1, 
s2, v2 2100; GFX7LESS-NEXT: v_mul_lo_u32 v2, s2, v2 2101; GFX7LESS-NEXT: v_add_i32_e32 v1, vcc, v1, v0 2102; GFX7LESS-NEXT: v_mov_b32_e32 v3, s1 2103; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s0, v2 2104; GFX7LESS-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc 2105; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 2106; GFX7LESS-NEXT: s_endpgm 2107; 2108; GFX8-LABEL: sub_i64_uniform: 2109; GFX8: ; %bb.0: ; %entry 2110; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 2111; GFX8-NEXT: s_mov_b64 s[6:7], exec 2112; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 2113; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 2114; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 2115; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 2116; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 2117; GFX8-NEXT: s_cbranch_execz .LBB12_2 2118; GFX8-NEXT: ; %bb.1: 2119; GFX8-NEXT: s_bcnt1_i32_b64 s8, s[6:7] 2120; GFX8-NEXT: v_mov_b32_e32 v0, s8 2121; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2122; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[6:7], s2, v0, 0 2123; GFX8-NEXT: s_mul_i32 s6, s3, s8 2124; GFX8-NEXT: v_mov_b32_e32 v3, 0 2125; GFX8-NEXT: v_add_u32_e32 v1, vcc, s6, v1 2126; GFX8-NEXT: s_mov_b32 m0, -1 2127; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2128; GFX8-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1] 2129; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2130; GFX8-NEXT: .LBB12_2: 2131; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 2132; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2133; GFX8-NEXT: s_mov_b32 s4, s0 2134; GFX8-NEXT: s_mov_b32 s5, s1 2135; GFX8-NEXT: v_mul_lo_u32 v4, s3, v2 2136; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v2, 0 2137; GFX8-NEXT: v_readfirstlane_b32 s0, v0 2138; GFX8-NEXT: v_readfirstlane_b32 s1, v1 2139; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v4 2140; GFX8-NEXT: v_mov_b32_e32 v3, s1 2141; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s0, v2 2142; GFX8-NEXT: s_mov_b32 s7, 0xf000 2143; GFX8-NEXT: s_mov_b32 s6, -1 2144; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc 2145; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 2146; GFX8-NEXT: s_endpgm 2147; 2148; 
GFX9-LABEL: sub_i64_uniform: 2149; GFX9: ; %bb.0: ; %entry 2150; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 2151; GFX9-NEXT: s_mov_b64 s[6:7], exec 2152; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 2153; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 2154; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 2155; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 2156; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 2157; GFX9-NEXT: s_cbranch_execz .LBB12_2 2158; GFX9-NEXT: ; %bb.1: 2159; GFX9-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 2160; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2161; GFX9-NEXT: s_mul_i32 s7, s3, s6 2162; GFX9-NEXT: s_mul_hi_u32 s8, s2, s6 2163; GFX9-NEXT: s_add_i32 s8, s8, s7 2164; GFX9-NEXT: s_mul_i32 s6, s2, s6 2165; GFX9-NEXT: v_mov_b32_e32 v0, s6 2166; GFX9-NEXT: v_mov_b32_e32 v1, s8 2167; GFX9-NEXT: v_mov_b32_e32 v3, 0 2168; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2169; GFX9-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1] 2170; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2171; GFX9-NEXT: .LBB12_2: 2172; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 2173; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2174; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], s2, v2, 0 2175; GFX9-NEXT: s_mov_b32 s4, s0 2176; GFX9-NEXT: s_mov_b32 s5, s1 2177; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[2:3], s3, v2, v[4:5] 2178; GFX9-NEXT: v_readfirstlane_b32 s0, v0 2179; GFX9-NEXT: v_readfirstlane_b32 s1, v1 2180; GFX9-NEXT: v_mov_b32_e32 v1, v4 2181; GFX9-NEXT: v_mov_b32_e32 v2, s1 2182; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s0, v3 2183; GFX9-NEXT: s_mov_b32 s7, 0xf000 2184; GFX9-NEXT: s_mov_b32 s6, -1 2185; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc 2186; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 2187; GFX9-NEXT: s_endpgm 2188; 2189; GFX1064-LABEL: sub_i64_uniform: 2190; GFX1064: ; %bb.0: ; %entry 2191; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 2192; GFX1064-NEXT: s_mov_b64 s[6:7], exec 2193; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 2194; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 2195; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 2196; GFX1064-NEXT: 
v_cmp_eq_u32_e32 vcc, 0, v2 2197; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 2198; GFX1064-NEXT: s_cbranch_execz .LBB12_2 2199; GFX1064-NEXT: ; %bb.1: 2200; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 2201; GFX1064-NEXT: v_mov_b32_e32 v3, 0 2202; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2203; GFX1064-NEXT: s_mul_i32 s7, s3, s6 2204; GFX1064-NEXT: s_mul_hi_u32 s8, s2, s6 2205; GFX1064-NEXT: s_mul_i32 s6, s2, s6 2206; GFX1064-NEXT: s_add_i32 s8, s8, s7 2207; GFX1064-NEXT: v_mov_b32_e32 v0, s6 2208; GFX1064-NEXT: v_mov_b32_e32 v1, s8 2209; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2210; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 2211; GFX1064-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1] 2212; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2213; GFX1064-NEXT: buffer_gl0_inv 2214; GFX1064-NEXT: .LBB12_2: 2215; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 2216; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 2217; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2218; GFX1064-NEXT: v_mad_u64_u32 v[3:4], s[4:5], s2, v2, 0 2219; GFX1064-NEXT: v_readfirstlane_b32 s4, v1 2220; GFX1064-NEXT: v_mad_u64_u32 v[4:5], s[2:3], s3, v2, v[4:5] 2221; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 2222; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 2223; GFX1064-NEXT: v_sub_co_u32 v0, vcc, s2, v3 2224; GFX1064-NEXT: v_mov_b32_e32 v1, v4 2225; GFX1064-NEXT: s_mov_b32 s2, -1 2226; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s4, v1, vcc 2227; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2228; GFX1064-NEXT: s_endpgm 2229; 2230; GFX1032-LABEL: sub_i64_uniform: 2231; GFX1032: ; %bb.0: ; %entry 2232; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 2233; GFX1032-NEXT: s_mov_b32 s5, exec_lo 2234; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 2235; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0 2236; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 2237; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo 2238; GFX1032-NEXT: s_cbranch_execz .LBB12_2 2239; GFX1032-NEXT: ; %bb.1: 2240; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s5 2241; GFX1032-NEXT: v_mov_b32_e32 v3, 0 
2242; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2243; GFX1032-NEXT: s_mul_i32 s6, s3, s5 2244; GFX1032-NEXT: s_mul_hi_u32 s7, s2, s5 2245; GFX1032-NEXT: s_mul_i32 s5, s2, s5 2246; GFX1032-NEXT: s_add_i32 s7, s7, s6 2247; GFX1032-NEXT: v_mov_b32_e32 v0, s5 2248; GFX1032-NEXT: v_mov_b32_e32 v1, s7 2249; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2250; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 2251; GFX1032-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1] 2252; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2253; GFX1032-NEXT: buffer_gl0_inv 2254; GFX1032-NEXT: .LBB12_2: 2255; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 2256; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 2257; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2258; GFX1032-NEXT: v_mad_u64_u32 v[3:4], s2, s2, v2, 0 2259; GFX1032-NEXT: v_readfirstlane_b32 s4, v1 2260; GFX1032-NEXT: v_mad_u64_u32 v[4:5], s2, s3, v2, v[4:5] 2261; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 2262; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 2263; GFX1032-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v3 2264; GFX1032-NEXT: v_mov_b32_e32 v1, v4 2265; GFX1032-NEXT: s_mov_b32 s2, -1 2266; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s4, v1, vcc_lo 2267; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2268; GFX1032-NEXT: s_endpgm 2269entry: 2270 %old = atomicrmw sub i64 addrspace(3)* @local_var64, i64 %subitive acq_rel 2271 store i64 %old, i64 addrspace(1)* %out 2272 ret void 2273} 2274 2275define amdgpu_kernel void @sub_i64_varying(i64 addrspace(1)* %out) { 2276; 2277; 2278; GFX7LESS-LABEL: sub_i64_varying: 2279; GFX7LESS: ; %bb.0: ; %entry 2280; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 2281; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 2282; GFX7LESS-NEXT: s_mov_b32 m0, -1 2283; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2284; GFX7LESS-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] 2285; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2286; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 2287; GFX7LESS-NEXT: s_mov_b32 s2, -1 2288; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2289; GFX7LESS-NEXT: s_endpgm 2290; 
2291; GFX8-LABEL: sub_i64_varying: 2292; GFX8: ; %bb.0: ; %entry 2293; GFX8-NEXT: v_mov_b32_e32 v1, 0 2294; GFX8-NEXT: s_mov_b32 m0, -1 2295; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2296; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2297; GFX8-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] 2298; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2299; GFX8-NEXT: s_mov_b32 s3, 0xf000 2300; GFX8-NEXT: s_mov_b32 s2, -1 2301; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2302; GFX8-NEXT: s_endpgm 2303; 2304; GFX9-LABEL: sub_i64_varying: 2305; GFX9: ; %bb.0: ; %entry 2306; GFX9-NEXT: v_mov_b32_e32 v1, 0 2307; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2308; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2309; GFX9-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] 2310; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2311; GFX9-NEXT: s_mov_b32 s3, 0xf000 2312; GFX9-NEXT: s_mov_b32 s2, -1 2313; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2314; GFX9-NEXT: s_endpgm 2315; 2316; GFX10-LABEL: sub_i64_varying: 2317; GFX10: ; %bb.0: ; %entry 2318; GFX10-NEXT: v_mov_b32_e32 v1, 0 2319; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2320; GFX10-NEXT: s_mov_b32 s3, 0x31016000 2321; GFX10-NEXT: s_mov_b32 s2, -1 2322; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2323; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 2324; GFX10-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] 2325; GFX10-NEXT: s_waitcnt lgkmcnt(0) 2326; GFX10-NEXT: buffer_gl0_inv 2327; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2328; GFX10-NEXT: s_endpgm 2329entry: 2330 %lane = call i32 @llvm.amdgcn.workitem.id.x() 2331 %zext = zext i32 %lane to i64 2332 %old = atomicrmw sub i64 addrspace(3)* @local_var64, i64 %zext acq_rel 2333 store i64 %old, i64 addrspace(1)* %out 2334 ret void 2335} 2336 2337define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) { 2338; 2339; 2340; GFX7LESS-LABEL: and_i32_varying: 2341; GFX7LESS: ; %bb.0: ; %entry 2342; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 2343; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 2344; GFX7LESS-NEXT: s_mov_b32 
m0, -1 2345; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2346; GFX7LESS-NEXT: ds_and_rtn_b32 v0, v1, v0 2347; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2348; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 2349; GFX7LESS-NEXT: s_mov_b32 s2, -1 2350; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 2351; GFX7LESS-NEXT: s_endpgm 2352; 2353; GFX8-LABEL: and_i32_varying: 2354; GFX8: ; %bb.0: ; %entry 2355; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2356; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 2357; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 2358; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 2359; GFX8-NEXT: v_mov_b32_e32 v1, -1 2360; GFX8-NEXT: s_mov_b64 exec, s[2:3] 2361; GFX8-NEXT: v_mov_b32_e32 v2, v0 2362; GFX8-NEXT: s_not_b64 exec, exec 2363; GFX8-NEXT: v_mov_b32_e32 v2, -1 2364; GFX8-NEXT: s_not_b64 exec, exec 2365; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 2366; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 2367; GFX8-NEXT: s_nop 1 2368; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 2369; GFX8-NEXT: s_nop 1 2370; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 2371; GFX8-NEXT: s_nop 1 2372; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 2373; GFX8-NEXT: s_nop 1 2374; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 2375; GFX8-NEXT: s_nop 1 2376; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 2377; GFX8-NEXT: v_readlane_b32 s4, v2, 63 2378; GFX8-NEXT: s_nop 0 2379; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 2380; GFX8-NEXT: s_mov_b64 exec, s[2:3] 2381; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 2382; GFX8-NEXT: ; implicit-def: $vgpr0 2383; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 2384; GFX8-NEXT: s_cbranch_execz .LBB14_2 2385; GFX8-NEXT: ; %bb.1: 2386; GFX8-NEXT: v_mov_b32_e32 v0, 0 2387; GFX8-NEXT: v_mov_b32_e32 v3, s4 2388; GFX8-NEXT: s_mov_b32 m0, -1 2389; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2390; 
GFX8-NEXT: ds_and_rtn_b32 v0, v0, v3 2391; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2392; GFX8-NEXT: .LBB14_2: 2393; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 2394; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2395; GFX8-NEXT: v_readfirstlane_b32 s2, v0 2396; GFX8-NEXT: v_mov_b32_e32 v0, v1 2397; GFX8-NEXT: v_and_b32_e32 v0, s2, v0 2398; GFX8-NEXT: s_mov_b32 s3, 0xf000 2399; GFX8-NEXT: s_mov_b32 s2, -1 2400; GFX8-NEXT: s_nop 0 2401; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 2402; GFX8-NEXT: s_endpgm 2403; 2404; GFX9-LABEL: and_i32_varying: 2405; GFX9: ; %bb.0: ; %entry 2406; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2407; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 2408; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 2409; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 2410; GFX9-NEXT: v_mov_b32_e32 v1, -1 2411; GFX9-NEXT: s_mov_b64 exec, s[2:3] 2412; GFX9-NEXT: v_mov_b32_e32 v2, v0 2413; GFX9-NEXT: s_not_b64 exec, exec 2414; GFX9-NEXT: v_mov_b32_e32 v2, -1 2415; GFX9-NEXT: s_not_b64 exec, exec 2416; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 2417; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 2418; GFX9-NEXT: s_nop 1 2419; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 2420; GFX9-NEXT: s_nop 1 2421; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 2422; GFX9-NEXT: s_nop 1 2423; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 2424; GFX9-NEXT: s_nop 1 2425; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 2426; GFX9-NEXT: s_nop 1 2427; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 2428; GFX9-NEXT: v_readlane_b32 s4, v2, 63 2429; GFX9-NEXT: s_nop 0 2430; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 2431; GFX9-NEXT: s_mov_b64 exec, s[2:3] 2432; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 2433; GFX9-NEXT: ; implicit-def: $vgpr0 2434; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 2435; GFX9-NEXT: s_cbranch_execz .LBB14_2 
2436; GFX9-NEXT: ; %bb.1: 2437; GFX9-NEXT: v_mov_b32_e32 v0, 0 2438; GFX9-NEXT: v_mov_b32_e32 v3, s4 2439; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2440; GFX9-NEXT: ds_and_rtn_b32 v0, v0, v3 2441; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2442; GFX9-NEXT: .LBB14_2: 2443; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 2444; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2445; GFX9-NEXT: v_readfirstlane_b32 s2, v0 2446; GFX9-NEXT: v_mov_b32_e32 v0, v1 2447; GFX9-NEXT: v_and_b32_e32 v0, s2, v0 2448; GFX9-NEXT: s_mov_b32 s3, 0xf000 2449; GFX9-NEXT: s_mov_b32 s2, -1 2450; GFX9-NEXT: s_nop 0 2451; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 2452; GFX9-NEXT: s_endpgm 2453; 2454; GFX1064-LABEL: and_i32_varying: 2455; GFX1064: ; %bb.0: ; %entry 2456; GFX1064-NEXT: v_mov_b32_e32 v1, v0 2457; GFX1064-NEXT: s_not_b64 exec, exec 2458; GFX1064-NEXT: v_mov_b32_e32 v1, -1 2459; GFX1064-NEXT: s_not_b64 exec, exec 2460; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 2461; GFX1064-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 2462; GFX1064-NEXT: v_mov_b32_e32 v3, -1 2463; GFX1064-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 2464; GFX1064-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 2465; GFX1064-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 2466; GFX1064-NEXT: v_mov_b32_e32 v2, v1 2467; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 2468; GFX1064-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 2469; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 2470; GFX1064-NEXT: v_mov_b32_e32 v2, s4 2471; GFX1064-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 2472; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 2473; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 2474; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 2475; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2476; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 2477; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 2478; GFX1064-NEXT: 
v_writelane_b32 v3, s4, 16 2479; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 2480; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 2481; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 2482; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 2483; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 2484; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 2485; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 2486; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 2487; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 2488; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 2489; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 2490; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2491; GFX1064-NEXT: s_mov_b32 s2, -1 2492; GFX1064-NEXT: ; implicit-def: $vgpr0 2493; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 2494; GFX1064-NEXT: s_cbranch_execz .LBB14_2 2495; GFX1064-NEXT: ; %bb.1: 2496; GFX1064-NEXT: v_mov_b32_e32 v0, 0 2497; GFX1064-NEXT: v_mov_b32_e32 v4, s7 2498; GFX1064-NEXT: s_mov_b32 s3, s7 2499; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2500; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 2501; GFX1064-NEXT: ds_and_rtn_b32 v0, v0, v4 2502; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2503; GFX1064-NEXT: buffer_gl0_inv 2504; GFX1064-NEXT: .LBB14_2: 2505; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 2506; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 2507; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 2508; GFX1064-NEXT: v_mov_b32_e32 v0, v3 2509; GFX1064-NEXT: v_and_b32_e32 v0, s3, v0 2510; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 2511; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2512; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 2513; GFX1064-NEXT: s_endpgm 2514; 2515; GFX1032-LABEL: and_i32_varying: 2516; GFX1032: ; %bb.0: ; %entry 2517; GFX1032-NEXT: v_mov_b32_e32 v1, v0 2518; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 2519; GFX1032-NEXT: v_mov_b32_e32 v1, -1 2520; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 2521; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 2522; GFX1032-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 2523; GFX1032-NEXT: v_and_b32_dpp v1, v1, v1 
row_shr:2 row_mask:0xf bank_mask:0xf 2524; GFX1032-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 2525; GFX1032-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 2526; GFX1032-NEXT: v_mov_b32_e32 v2, v1 2527; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 2528; GFX1032-NEXT: s_mov_b32 exec_lo, s2 2529; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2530; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 2531; GFX1032-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 2532; GFX1032-NEXT: v_mov_b32_e32 v3, -1 2533; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 2534; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 2535; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 2536; GFX1032-NEXT: s_mov_b32 exec_lo, s2 2537; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 2538; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 2539; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 2540; GFX1032-NEXT: s_mov_b32 exec_lo, s2 2541; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 2542; GFX1032-NEXT: s_mov_b32 s2, -1 2543; GFX1032-NEXT: ; implicit-def: $vgpr0 2544; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 2545; GFX1032-NEXT: s_cbranch_execz .LBB14_2 2546; GFX1032-NEXT: ; %bb.1: 2547; GFX1032-NEXT: v_mov_b32_e32 v0, 0 2548; GFX1032-NEXT: v_mov_b32_e32 v4, s4 2549; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2550; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 2551; GFX1032-NEXT: ds_and_rtn_b32 v0, v0, v4 2552; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2553; GFX1032-NEXT: buffer_gl0_inv 2554; GFX1032-NEXT: .LBB14_2: 2555; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 2556; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 2557; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 2558; GFX1032-NEXT: v_mov_b32_e32 v0, v3 2559; GFX1032-NEXT: v_and_b32_e32 v0, s3, v0 2560; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 2561; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2562; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 2563; GFX1032-NEXT: s_endpgm 2564entry: 2565 %lane = call i32 
@llvm.amdgcn.workitem.id.x() 2566 %old = atomicrmw and i32 addrspace(3)* @local_var32, i32 %lane acq_rel 2567 store i32 %old, i32 addrspace(1)* %out 2568 ret void 2569} 2570 2571define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) { 2572; 2573; 2574; GFX7LESS-LABEL: or_i32_varying: 2575; GFX7LESS: ; %bb.0: ; %entry 2576; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 2577; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 2578; GFX7LESS-NEXT: s_mov_b32 m0, -1 2579; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2580; GFX7LESS-NEXT: ds_or_rtn_b32 v0, v1, v0 2581; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2582; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 2583; GFX7LESS-NEXT: s_mov_b32 s2, -1 2584; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 2585; GFX7LESS-NEXT: s_endpgm 2586; 2587; GFX8-LABEL: or_i32_varying: 2588; GFX8: ; %bb.0: ; %entry 2589; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2590; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 2591; GFX8-NEXT: v_mov_b32_e32 v1, 0 2592; GFX8-NEXT: s_mov_b64 exec, s[2:3] 2593; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 2594; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 2595; GFX8-NEXT: v_mov_b32_e32 v2, v0 2596; GFX8-NEXT: s_not_b64 exec, exec 2597; GFX8-NEXT: v_mov_b32_e32 v2, 0 2598; GFX8-NEXT: s_not_b64 exec, exec 2599; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 2600; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 2601; GFX8-NEXT: s_nop 1 2602; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 2603; GFX8-NEXT: s_nop 1 2604; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 2605; GFX8-NEXT: s_nop 1 2606; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 2607; GFX8-NEXT: s_nop 1 2608; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 2609; GFX8-NEXT: s_nop 1 2610; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 2611; GFX8-NEXT: v_readlane_b32 s4, 
v2, 63 2612; GFX8-NEXT: s_nop 0 2613; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 2614; GFX8-NEXT: s_mov_b64 exec, s[2:3] 2615; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 2616; GFX8-NEXT: ; implicit-def: $vgpr0 2617; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 2618; GFX8-NEXT: s_cbranch_execz .LBB15_2 2619; GFX8-NEXT: ; %bb.1: 2620; GFX8-NEXT: v_mov_b32_e32 v0, 0 2621; GFX8-NEXT: v_mov_b32_e32 v3, s4 2622; GFX8-NEXT: s_mov_b32 m0, -1 2623; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2624; GFX8-NEXT: ds_or_rtn_b32 v0, v0, v3 2625; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2626; GFX8-NEXT: .LBB15_2: 2627; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 2628; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2629; GFX8-NEXT: v_readfirstlane_b32 s2, v0 2630; GFX8-NEXT: v_mov_b32_e32 v0, v1 2631; GFX8-NEXT: v_or_b32_e32 v0, s2, v0 2632; GFX8-NEXT: s_mov_b32 s3, 0xf000 2633; GFX8-NEXT: s_mov_b32 s2, -1 2634; GFX8-NEXT: s_nop 0 2635; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 2636; GFX8-NEXT: s_endpgm 2637; 2638; GFX9-LABEL: or_i32_varying: 2639; GFX9: ; %bb.0: ; %entry 2640; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2641; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 2642; GFX9-NEXT: v_mov_b32_e32 v1, 0 2643; GFX9-NEXT: s_mov_b64 exec, s[2:3] 2644; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 2645; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 2646; GFX9-NEXT: v_mov_b32_e32 v2, v0 2647; GFX9-NEXT: s_not_b64 exec, exec 2648; GFX9-NEXT: v_mov_b32_e32 v2, 0 2649; GFX9-NEXT: s_not_b64 exec, exec 2650; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 2651; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 2652; GFX9-NEXT: s_nop 1 2653; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 2654; GFX9-NEXT: s_nop 1 2655; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 2656; GFX9-NEXT: s_nop 1 2657; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 2658; GFX9-NEXT: s_nop 1 
2659; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 2660; GFX9-NEXT: s_nop 1 2661; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 2662; GFX9-NEXT: v_readlane_b32 s4, v2, 63 2663; GFX9-NEXT: s_nop 0 2664; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 2665; GFX9-NEXT: s_mov_b64 exec, s[2:3] 2666; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 2667; GFX9-NEXT: ; implicit-def: $vgpr0 2668; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 2669; GFX9-NEXT: s_cbranch_execz .LBB15_2 2670; GFX9-NEXT: ; %bb.1: 2671; GFX9-NEXT: v_mov_b32_e32 v0, 0 2672; GFX9-NEXT: v_mov_b32_e32 v3, s4 2673; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2674; GFX9-NEXT: ds_or_rtn_b32 v0, v0, v3 2675; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2676; GFX9-NEXT: .LBB15_2: 2677; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 2678; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2679; GFX9-NEXT: v_readfirstlane_b32 s2, v0 2680; GFX9-NEXT: v_mov_b32_e32 v0, v1 2681; GFX9-NEXT: v_or_b32_e32 v0, s2, v0 2682; GFX9-NEXT: s_mov_b32 s3, 0xf000 2683; GFX9-NEXT: s_mov_b32 s2, -1 2684; GFX9-NEXT: s_nop 0 2685; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 2686; GFX9-NEXT: s_endpgm 2687; 2688; GFX1064-LABEL: or_i32_varying: 2689; GFX1064: ; %bb.0: ; %entry 2690; GFX1064-NEXT: v_mov_b32_e32 v1, v0 2691; GFX1064-NEXT: s_not_b64 exec, exec 2692; GFX1064-NEXT: v_mov_b32_e32 v1, 0 2693; GFX1064-NEXT: s_not_b64 exec, exec 2694; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 2695; GFX1064-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 2696; GFX1064-NEXT: v_mov_b32_e32 v3, 0 2697; GFX1064-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 2698; GFX1064-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 2699; GFX1064-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 2700; GFX1064-NEXT: v_mov_b32_e32 v2, v1 2701; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 2702; GFX1064-NEXT: 
v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 2703; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 2704; GFX1064-NEXT: v_mov_b32_e32 v2, s4 2705; GFX1064-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 2706; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 2707; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 2708; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 2709; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2710; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 2711; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 2712; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 2713; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 2714; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 2715; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 2716; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 2717; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 2718; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 2719; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 2720; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 2721; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 2722; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 2723; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 2724; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2725; GFX1064-NEXT: s_mov_b32 s2, -1 2726; GFX1064-NEXT: ; implicit-def: $vgpr0 2727; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 2728; GFX1064-NEXT: s_cbranch_execz .LBB15_2 2729; GFX1064-NEXT: ; %bb.1: 2730; GFX1064-NEXT: v_mov_b32_e32 v0, 0 2731; GFX1064-NEXT: v_mov_b32_e32 v4, s7 2732; GFX1064-NEXT: s_mov_b32 s3, s7 2733; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2734; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 2735; GFX1064-NEXT: ds_or_rtn_b32 v0, v0, v4 2736; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2737; GFX1064-NEXT: buffer_gl0_inv 2738; GFX1064-NEXT: .LBB15_2: 2739; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 2740; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 2741; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 2742; GFX1064-NEXT: v_mov_b32_e32 v0, v3 2743; GFX1064-NEXT: v_or_b32_e32 v0, s3, v0 2744; GFX1064-NEXT: 
s_mov_b32 s3, 0x31016000 2745; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2746; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 2747; GFX1064-NEXT: s_endpgm 2748; 2749; GFX1032-LABEL: or_i32_varying: 2750; GFX1032: ; %bb.0: ; %entry 2751; GFX1032-NEXT: v_mov_b32_e32 v1, v0 2752; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 2753; GFX1032-NEXT: v_mov_b32_e32 v1, 0 2754; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 2755; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 2756; GFX1032-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 2757; GFX1032-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 2758; GFX1032-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 2759; GFX1032-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 2760; GFX1032-NEXT: v_mov_b32_e32 v2, v1 2761; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 2762; GFX1032-NEXT: s_mov_b32 exec_lo, s2 2763; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2764; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 2765; GFX1032-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 2766; GFX1032-NEXT: v_mov_b32_e32 v3, 0 2767; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 2768; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 2769; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 2770; GFX1032-NEXT: s_mov_b32 exec_lo, s2 2771; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 2772; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 2773; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 2774; GFX1032-NEXT: s_mov_b32 exec_lo, s2 2775; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 2776; GFX1032-NEXT: s_mov_b32 s2, -1 2777; GFX1032-NEXT: ; implicit-def: $vgpr0 2778; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 2779; GFX1032-NEXT: s_cbranch_execz .LBB15_2 2780; GFX1032-NEXT: ; %bb.1: 2781; GFX1032-NEXT: v_mov_b32_e32 v0, 0 2782; GFX1032-NEXT: v_mov_b32_e32 v4, s4 2783; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2784; GFX1032-NEXT: 
s_waitcnt_vscnt null, 0x0 2785; GFX1032-NEXT: ds_or_rtn_b32 v0, v0, v4 2786; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2787; GFX1032-NEXT: buffer_gl0_inv 2788; GFX1032-NEXT: .LBB15_2: 2789; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 2790; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 2791; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 2792; GFX1032-NEXT: v_mov_b32_e32 v0, v3 2793; GFX1032-NEXT: v_or_b32_e32 v0, s3, v0 2794; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 2795; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2796; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 2797; GFX1032-NEXT: s_endpgm 2798entry: 2799 %lane = call i32 @llvm.amdgcn.workitem.id.x() 2800 %old = atomicrmw or i32 addrspace(3)* @local_var32, i32 %lane acq_rel 2801 store i32 %old, i32 addrspace(1)* %out 2802 ret void 2803} 2804 2805define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) { 2806; 2807; 2808; GFX7LESS-LABEL: xor_i32_varying: 2809; GFX7LESS: ; %bb.0: ; %entry 2810; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 2811; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 2812; GFX7LESS-NEXT: s_mov_b32 m0, -1 2813; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2814; GFX7LESS-NEXT: ds_xor_rtn_b32 v0, v1, v0 2815; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2816; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 2817; GFX7LESS-NEXT: s_mov_b32 s2, -1 2818; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 2819; GFX7LESS-NEXT: s_endpgm 2820; 2821; GFX8-LABEL: xor_i32_varying: 2822; GFX8: ; %bb.0: ; %entry 2823; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2824; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 2825; GFX8-NEXT: v_mov_b32_e32 v1, 0 2826; GFX8-NEXT: s_mov_b64 exec, s[2:3] 2827; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 2828; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 2829; GFX8-NEXT: v_mov_b32_e32 v2, v0 2830; GFX8-NEXT: s_not_b64 exec, exec 2831; GFX8-NEXT: v_mov_b32_e32 v2, 0 2832; GFX8-NEXT: s_not_b64 exec, exec 2833; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 2834; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf 
bank_mask:0xf bound_ctrl:1 2835; GFX8-NEXT: s_nop 1 2836; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 2837; GFX8-NEXT: s_nop 1 2838; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 2839; GFX8-NEXT: s_nop 1 2840; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 2841; GFX8-NEXT: s_nop 1 2842; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 2843; GFX8-NEXT: s_nop 1 2844; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 2845; GFX8-NEXT: v_readlane_b32 s4, v2, 63 2846; GFX8-NEXT: s_nop 0 2847; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 2848; GFX8-NEXT: s_mov_b64 exec, s[2:3] 2849; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 2850; GFX8-NEXT: ; implicit-def: $vgpr0 2851; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 2852; GFX8-NEXT: s_cbranch_execz .LBB16_2 2853; GFX8-NEXT: ; %bb.1: 2854; GFX8-NEXT: v_mov_b32_e32 v0, 0 2855; GFX8-NEXT: v_mov_b32_e32 v3, s4 2856; GFX8-NEXT: s_mov_b32 m0, -1 2857; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2858; GFX8-NEXT: ds_xor_rtn_b32 v0, v0, v3 2859; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2860; GFX8-NEXT: .LBB16_2: 2861; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 2862; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2863; GFX8-NEXT: v_readfirstlane_b32 s2, v0 2864; GFX8-NEXT: v_mov_b32_e32 v0, v1 2865; GFX8-NEXT: v_xor_b32_e32 v0, s2, v0 2866; GFX8-NEXT: s_mov_b32 s3, 0xf000 2867; GFX8-NEXT: s_mov_b32 s2, -1 2868; GFX8-NEXT: s_nop 0 2869; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 2870; GFX8-NEXT: s_endpgm 2871; 2872; GFX9-LABEL: xor_i32_varying: 2873; GFX9: ; %bb.0: ; %entry 2874; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2875; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 2876; GFX9-NEXT: v_mov_b32_e32 v1, 0 2877; GFX9-NEXT: s_mov_b64 exec, s[2:3] 2878; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 2879; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 2880; GFX9-NEXT: v_mov_b32_e32 
v2, v0 2881; GFX9-NEXT: s_not_b64 exec, exec 2882; GFX9-NEXT: v_mov_b32_e32 v2, 0 2883; GFX9-NEXT: s_not_b64 exec, exec 2884; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 2885; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 2886; GFX9-NEXT: s_nop 1 2887; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 2888; GFX9-NEXT: s_nop 1 2889; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 2890; GFX9-NEXT: s_nop 1 2891; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 2892; GFX9-NEXT: s_nop 1 2893; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 2894; GFX9-NEXT: s_nop 1 2895; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 2896; GFX9-NEXT: v_readlane_b32 s4, v2, 63 2897; GFX9-NEXT: s_nop 0 2898; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 2899; GFX9-NEXT: s_mov_b64 exec, s[2:3] 2900; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 2901; GFX9-NEXT: ; implicit-def: $vgpr0 2902; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 2903; GFX9-NEXT: s_cbranch_execz .LBB16_2 2904; GFX9-NEXT: ; %bb.1: 2905; GFX9-NEXT: v_mov_b32_e32 v0, 0 2906; GFX9-NEXT: v_mov_b32_e32 v3, s4 2907; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2908; GFX9-NEXT: ds_xor_rtn_b32 v0, v0, v3 2909; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2910; GFX9-NEXT: .LBB16_2: 2911; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 2912; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2913; GFX9-NEXT: v_readfirstlane_b32 s2, v0 2914; GFX9-NEXT: v_mov_b32_e32 v0, v1 2915; GFX9-NEXT: v_xor_b32_e32 v0, s2, v0 2916; GFX9-NEXT: s_mov_b32 s3, 0xf000 2917; GFX9-NEXT: s_mov_b32 s2, -1 2918; GFX9-NEXT: s_nop 0 2919; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 2920; GFX9-NEXT: s_endpgm 2921; 2922; GFX1064-LABEL: xor_i32_varying: 2923; GFX1064: ; %bb.0: ; %entry 2924; GFX1064-NEXT: v_mov_b32_e32 v1, v0 2925; GFX1064-NEXT: s_not_b64 exec, exec 2926; GFX1064-NEXT: 
v_mov_b32_e32 v1, 0 2927; GFX1064-NEXT: s_not_b64 exec, exec 2928; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 2929; GFX1064-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 2930; GFX1064-NEXT: v_mov_b32_e32 v3, 0 2931; GFX1064-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 2932; GFX1064-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 2933; GFX1064-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 2934; GFX1064-NEXT: v_mov_b32_e32 v2, v1 2935; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 2936; GFX1064-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 2937; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 2938; GFX1064-NEXT: v_mov_b32_e32 v2, s4 2939; GFX1064-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 2940; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 2941; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 2942; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 2943; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2944; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 2945; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 2946; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 2947; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 2948; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 2949; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 2950; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 2951; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 2952; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 2953; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 2954; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 2955; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 2956; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 2957; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 2958; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2959; GFX1064-NEXT: s_mov_b32 s2, -1 2960; GFX1064-NEXT: ; implicit-def: $vgpr0 2961; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 2962; GFX1064-NEXT: s_cbranch_execz 
.LBB16_2 2963; GFX1064-NEXT: ; %bb.1: 2964; GFX1064-NEXT: v_mov_b32_e32 v0, 0 2965; GFX1064-NEXT: v_mov_b32_e32 v4, s7 2966; GFX1064-NEXT: s_mov_b32 s3, s7 2967; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2968; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 2969; GFX1064-NEXT: ds_xor_rtn_b32 v0, v0, v4 2970; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2971; GFX1064-NEXT: buffer_gl0_inv 2972; GFX1064-NEXT: .LBB16_2: 2973; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 2974; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 2975; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 2976; GFX1064-NEXT: v_mov_b32_e32 v0, v3 2977; GFX1064-NEXT: v_xor_b32_e32 v0, s3, v0 2978; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 2979; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2980; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 2981; GFX1064-NEXT: s_endpgm 2982; 2983; GFX1032-LABEL: xor_i32_varying: 2984; GFX1032: ; %bb.0: ; %entry 2985; GFX1032-NEXT: v_mov_b32_e32 v1, v0 2986; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 2987; GFX1032-NEXT: v_mov_b32_e32 v1, 0 2988; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 2989; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 2990; GFX1032-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 2991; GFX1032-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 2992; GFX1032-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 2993; GFX1032-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 2994; GFX1032-NEXT: v_mov_b32_e32 v2, v1 2995; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 2996; GFX1032-NEXT: s_mov_b32 exec_lo, s2 2997; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2998; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 2999; GFX1032-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 3000; GFX1032-NEXT: v_mov_b32_e32 v3, 0 3001; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 3002; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 3003; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 
row_mask:0xf bank_mask:0xf 3004; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3005; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3006; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3007; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 3008; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3009; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 3010; GFX1032-NEXT: s_mov_b32 s2, -1 3011; GFX1032-NEXT: ; implicit-def: $vgpr0 3012; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 3013; GFX1032-NEXT: s_cbranch_execz .LBB16_2 3014; GFX1032-NEXT: ; %bb.1: 3015; GFX1032-NEXT: v_mov_b32_e32 v0, 0 3016; GFX1032-NEXT: v_mov_b32_e32 v4, s4 3017; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3018; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 3019; GFX1032-NEXT: ds_xor_rtn_b32 v0, v0, v4 3020; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3021; GFX1032-NEXT: buffer_gl0_inv 3022; GFX1032-NEXT: .LBB16_2: 3023; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 3024; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 3025; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 3026; GFX1032-NEXT: v_mov_b32_e32 v0, v3 3027; GFX1032-NEXT: v_xor_b32_e32 v0, s3, v0 3028; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 3029; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3030; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 3031; GFX1032-NEXT: s_endpgm 3032entry: 3033 %lane = call i32 @llvm.amdgcn.workitem.id.x() 3034 %old = atomicrmw xor i32 addrspace(3)* @local_var32, i32 %lane acq_rel 3035 store i32 %old, i32 addrspace(1)* %out 3036 ret void 3037} 3038 3039define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) { 3040; 3041; 3042; GFX7LESS-LABEL: max_i32_varying: 3043; GFX7LESS: ; %bb.0: ; %entry 3044; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 3045; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 3046; GFX7LESS-NEXT: s_mov_b32 m0, -1 3047; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3048; GFX7LESS-NEXT: ds_max_rtn_i32 v0, v1, v0 3049; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3050; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 3051; GFX7LESS-NEXT: s_mov_b32 s2, -1 3052; GFX7LESS-NEXT: buffer_store_dword v0, 
off, s[0:3], 0 3053; GFX7LESS-NEXT: s_endpgm 3054; 3055; GFX8-LABEL: max_i32_varying: 3056; GFX8: ; %bb.0: ; %entry 3057; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3058; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 3059; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 3060; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 3061; GFX8-NEXT: v_bfrev_b32_e32 v1, 1 3062; GFX8-NEXT: s_mov_b64 exec, s[2:3] 3063; GFX8-NEXT: v_mov_b32_e32 v2, v0 3064; GFX8-NEXT: s_not_b64 exec, exec 3065; GFX8-NEXT: v_bfrev_b32_e32 v2, 1 3066; GFX8-NEXT: s_not_b64 exec, exec 3067; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 3068; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3069; GFX8-NEXT: s_nop 1 3070; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 3071; GFX8-NEXT: s_nop 1 3072; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 3073; GFX8-NEXT: s_nop 1 3074; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 3075; GFX8-NEXT: s_nop 1 3076; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 3077; GFX8-NEXT: s_nop 1 3078; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 3079; GFX8-NEXT: v_readlane_b32 s4, v2, 63 3080; GFX8-NEXT: s_nop 0 3081; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 3082; GFX8-NEXT: s_mov_b64 exec, s[2:3] 3083; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 3084; GFX8-NEXT: ; implicit-def: $vgpr0 3085; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 3086; GFX8-NEXT: s_cbranch_execz .LBB17_2 3087; GFX8-NEXT: ; %bb.1: 3088; GFX8-NEXT: v_mov_b32_e32 v0, 0 3089; GFX8-NEXT: v_mov_b32_e32 v3, s4 3090; GFX8-NEXT: s_mov_b32 m0, -1 3091; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3092; GFX8-NEXT: ds_max_rtn_i32 v0, v0, v3 3093; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3094; GFX8-NEXT: .LBB17_2: 3095; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 3096; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3097; GFX8-NEXT: v_readfirstlane_b32 s2, v0 3098; GFX8-NEXT: v_mov_b32_e32 
v0, v1 3099; GFX8-NEXT: v_max_i32_e32 v0, s2, v0 3100; GFX8-NEXT: s_mov_b32 s3, 0xf000 3101; GFX8-NEXT: s_mov_b32 s2, -1 3102; GFX8-NEXT: s_nop 0 3103; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 3104; GFX8-NEXT: s_endpgm 3105; 3106; GFX9-LABEL: max_i32_varying: 3107; GFX9: ; %bb.0: ; %entry 3108; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3109; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 3110; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 3111; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 3112; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 3113; GFX9-NEXT: s_mov_b64 exec, s[2:3] 3114; GFX9-NEXT: v_mov_b32_e32 v2, v0 3115; GFX9-NEXT: s_not_b64 exec, exec 3116; GFX9-NEXT: v_bfrev_b32_e32 v2, 1 3117; GFX9-NEXT: s_not_b64 exec, exec 3118; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 3119; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3120; GFX9-NEXT: s_nop 1 3121; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 3122; GFX9-NEXT: s_nop 1 3123; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 3124; GFX9-NEXT: s_nop 1 3125; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 3126; GFX9-NEXT: s_nop 1 3127; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 3128; GFX9-NEXT: s_nop 1 3129; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 3130; GFX9-NEXT: v_readlane_b32 s4, v2, 63 3131; GFX9-NEXT: s_nop 0 3132; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 3133; GFX9-NEXT: s_mov_b64 exec, s[2:3] 3134; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 3135; GFX9-NEXT: ; implicit-def: $vgpr0 3136; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 3137; GFX9-NEXT: s_cbranch_execz .LBB17_2 3138; GFX9-NEXT: ; %bb.1: 3139; GFX9-NEXT: v_mov_b32_e32 v0, 0 3140; GFX9-NEXT: v_mov_b32_e32 v3, s4 3141; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3142; GFX9-NEXT: ds_max_rtn_i32 v0, v0, v3 3143; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3144; GFX9-NEXT: .LBB17_2: 3145; 
GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 3146; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3147; GFX9-NEXT: v_readfirstlane_b32 s2, v0 3148; GFX9-NEXT: v_mov_b32_e32 v0, v1 3149; GFX9-NEXT: v_max_i32_e32 v0, s2, v0 3150; GFX9-NEXT: s_mov_b32 s3, 0xf000 3151; GFX9-NEXT: s_mov_b32 s2, -1 3152; GFX9-NEXT: s_nop 0 3153; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 3154; GFX9-NEXT: s_endpgm 3155; 3156; GFX1064-LABEL: max_i32_varying: 3157; GFX1064: ; %bb.0: ; %entry 3158; GFX1064-NEXT: v_mov_b32_e32 v1, v0 3159; GFX1064-NEXT: s_not_b64 exec, exec 3160; GFX1064-NEXT: v_bfrev_b32_e32 v1, 1 3161; GFX1064-NEXT: s_not_b64 exec, exec 3162; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 3163; GFX1064-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 3164; GFX1064-NEXT: v_bfrev_b32_e32 v3, 1 3165; GFX1064-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 3166; GFX1064-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 3167; GFX1064-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 3168; GFX1064-NEXT: v_mov_b32_e32 v2, v1 3169; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 3170; GFX1064-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 3171; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 3172; GFX1064-NEXT: v_mov_b32_e32 v2, s4 3173; GFX1064-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 3174; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 3175; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 3176; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 3177; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3178; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 3179; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 3180; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 3181; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 3182; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3183; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 3184; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 3185; GFX1064-NEXT: v_readlane_b32 
s6, v1, 47 3186; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 3187; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 3188; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 3189; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 3190; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 3191; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 3192; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3193; GFX1064-NEXT: s_mov_b32 s2, -1 3194; GFX1064-NEXT: ; implicit-def: $vgpr0 3195; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 3196; GFX1064-NEXT: s_cbranch_execz .LBB17_2 3197; GFX1064-NEXT: ; %bb.1: 3198; GFX1064-NEXT: v_mov_b32_e32 v0, 0 3199; GFX1064-NEXT: v_mov_b32_e32 v4, s7 3200; GFX1064-NEXT: s_mov_b32 s3, s7 3201; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3202; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 3203; GFX1064-NEXT: ds_max_rtn_i32 v0, v0, v4 3204; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3205; GFX1064-NEXT: buffer_gl0_inv 3206; GFX1064-NEXT: .LBB17_2: 3207; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 3208; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 3209; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 3210; GFX1064-NEXT: v_mov_b32_e32 v0, v3 3211; GFX1064-NEXT: v_max_i32_e32 v0, s3, v0 3212; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 3213; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3214; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 3215; GFX1064-NEXT: s_endpgm 3216; 3217; GFX1032-LABEL: max_i32_varying: 3218; GFX1032: ; %bb.0: ; %entry 3219; GFX1032-NEXT: v_mov_b32_e32 v1, v0 3220; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 3221; GFX1032-NEXT: v_bfrev_b32_e32 v1, 1 3222; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 3223; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3224; GFX1032-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 3225; GFX1032-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 3226; GFX1032-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 3227; GFX1032-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 3228; GFX1032-NEXT: v_mov_b32_e32 v2, v1 3229; GFX1032-NEXT: 
v_permlanex16_b32 v2, v2, -1, -1 3230; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3231; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3232; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3233; GFX1032-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 3234; GFX1032-NEXT: v_bfrev_b32_e32 v3, 1 3235; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 3236; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 3237; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 3238; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3239; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3240; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3241; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 3242; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3243; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 3244; GFX1032-NEXT: s_mov_b32 s2, -1 3245; GFX1032-NEXT: ; implicit-def: $vgpr0 3246; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 3247; GFX1032-NEXT: s_cbranch_execz .LBB17_2 3248; GFX1032-NEXT: ; %bb.1: 3249; GFX1032-NEXT: v_mov_b32_e32 v0, 0 3250; GFX1032-NEXT: v_mov_b32_e32 v4, s4 3251; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3252; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 3253; GFX1032-NEXT: ds_max_rtn_i32 v0, v0, v4 3254; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3255; GFX1032-NEXT: buffer_gl0_inv 3256; GFX1032-NEXT: .LBB17_2: 3257; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 3258; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 3259; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 3260; GFX1032-NEXT: v_mov_b32_e32 v0, v3 3261; GFX1032-NEXT: v_max_i32_e32 v0, s3, v0 3262; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 3263; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3264; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 3265; GFX1032-NEXT: s_endpgm 3266entry: 3267 %lane = call i32 @llvm.amdgcn.workitem.id.x() 3268 %old = atomicrmw max i32 addrspace(3)* @local_var32, i32 %lane acq_rel 3269 store i32 %old, i32 addrspace(1)* %out 3270 ret void 3271} 3272 3273define amdgpu_kernel void @max_i64_constant(i64 addrspace(1)* %out) { 3274; 3275; 3276; 
GFX7LESS-LABEL: max_i64_constant: 3277; GFX7LESS: ; %bb.0: ; %entry 3278; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 3279; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 3280; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 3281; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3282; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 3283; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc 3284; GFX7LESS-NEXT: s_cbranch_execz .LBB18_2 3285; GFX7LESS-NEXT: ; %bb.1: 3286; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0 3287; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5 3288; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 3289; GFX7LESS-NEXT: s_mov_b32 m0, -1 3290; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3291; GFX7LESS-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] 3292; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3293; GFX7LESS-NEXT: .LBB18_2: 3294; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] 3295; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3296; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 3297; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 3298; GFX7LESS-NEXT: v_bfrev_b32_e32 v1, 1 3299; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 3300; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 3301; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc 3302; GFX7LESS-NEXT: v_mov_b32_e32 v2, s5 3303; GFX7LESS-NEXT: v_mov_b32_e32 v3, s4 3304; GFX7LESS-NEXT: v_cmp_gt_i64_e32 vcc, s[4:5], v[0:1] 3305; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 3306; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 3307; GFX7LESS-NEXT: s_mov_b32 s2, -1 3308; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 3309; GFX7LESS-NEXT: s_endpgm 3310; 3311; GFX8-LABEL: max_i64_constant: 3312; GFX8: ; %bb.0: ; %entry 3313; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3314; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3315; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 3316; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3317; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 3318; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 3319; GFX8-NEXT: s_cbranch_execz .LBB18_2 3320; GFX8-NEXT: ; %bb.1: 
3321; GFX8-NEXT: v_mov_b32_e32 v0, 5 3322; GFX8-NEXT: v_mov_b32_e32 v2, 0 3323; GFX8-NEXT: v_mov_b32_e32 v1, 0 3324; GFX8-NEXT: s_mov_b32 m0, -1 3325; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3326; GFX8-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] 3327; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3328; GFX8-NEXT: .LBB18_2: 3329; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 3330; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3331; GFX8-NEXT: v_readfirstlane_b32 s2, v0 3332; GFX8-NEXT: v_bfrev_b32_e32 v0, 1 3333; GFX8-NEXT: v_readfirstlane_b32 s3, v1 3334; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc 3335; GFX8-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 3336; GFX8-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1] 3337; GFX8-NEXT: v_mov_b32_e32 v2, s3 3338; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 3339; GFX8-NEXT: v_mov_b32_e32 v2, s2 3340; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 3341; GFX8-NEXT: s_mov_b32 s3, 0xf000 3342; GFX8-NEXT: s_mov_b32 s2, -1 3343; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 3344; GFX8-NEXT: s_endpgm 3345; 3346; GFX9-LABEL: max_i64_constant: 3347; GFX9: ; %bb.0: ; %entry 3348; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3349; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3350; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 3351; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3352; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 3353; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 3354; GFX9-NEXT: s_cbranch_execz .LBB18_2 3355; GFX9-NEXT: ; %bb.1: 3356; GFX9-NEXT: v_mov_b32_e32 v0, 5 3357; GFX9-NEXT: v_mov_b32_e32 v1, 0 3358; GFX9-NEXT: v_mov_b32_e32 v2, 0 3359; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3360; GFX9-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] 3361; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3362; GFX9-NEXT: .LBB18_2: 3363; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 3364; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3365; GFX9-NEXT: v_readfirstlane_b32 s2, v0 3366; GFX9-NEXT: v_bfrev_b32_e32 v0, 1 3367; GFX9-NEXT: v_readfirstlane_b32 s3, v1 3368; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc 3369; GFX9-NEXT: v_cndmask_b32_e64 v0, 
5, 0, vcc 3370; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1] 3371; GFX9-NEXT: v_mov_b32_e32 v2, s3 3372; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 3373; GFX9-NEXT: v_mov_b32_e32 v2, s2 3374; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 3375; GFX9-NEXT: s_mov_b32 s3, 0xf000 3376; GFX9-NEXT: s_mov_b32 s2, -1 3377; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 3378; GFX9-NEXT: s_endpgm 3379; 3380; GFX1064-LABEL: max_i64_constant: 3381; GFX1064: ; %bb.0: ; %entry 3382; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3383; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3384; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 3385; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3386; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 3387; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc 3388; GFX1064-NEXT: s_cbranch_execz .LBB18_2 3389; GFX1064-NEXT: ; %bb.1: 3390; GFX1064-NEXT: v_mov_b32_e32 v0, 5 3391; GFX1064-NEXT: v_mov_b32_e32 v1, 0 3392; GFX1064-NEXT: v_mov_b32_e32 v2, 0 3393; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3394; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 3395; GFX1064-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] 3396; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3397; GFX1064-NEXT: buffer_gl0_inv 3398; GFX1064-NEXT: .LBB18_2: 3399; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 3400; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] 3401; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 3402; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 3403; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc 3404; GFX1064-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 3405; GFX1064-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1] 3406; GFX1064-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc 3407; GFX1064-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc 3408; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 3409; GFX1064-NEXT: s_mov_b32 s2, -1 3410; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3411; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 3412; GFX1064-NEXT: s_endpgm 3413; 3414; GFX1032-LABEL: max_i64_constant: 3415; GFX1032: ; %bb.0: ; %entry 
3416; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3417; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3418; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 3419; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 3420; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo 3421; GFX1032-NEXT: s_cbranch_execz .LBB18_2 3422; GFX1032-NEXT: ; %bb.1: 3423; GFX1032-NEXT: v_mov_b32_e32 v0, 5 3424; GFX1032-NEXT: v_mov_b32_e32 v1, 0 3425; GFX1032-NEXT: v_mov_b32_e32 v2, 0 3426; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3427; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 3428; GFX1032-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] 3429; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3430; GFX1032-NEXT: buffer_gl0_inv 3431; GFX1032-NEXT: .LBB18_2: 3432; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 3433; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 3434; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 3435; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 3436; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc_lo 3437; GFX1032-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc_lo 3438; GFX1032-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[2:3], v[0:1] 3439; GFX1032-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo 3440; GFX1032-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo 3441; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 3442; GFX1032-NEXT: s_mov_b32 s2, -1 3443; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3444; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 3445; GFX1032-NEXT: s_endpgm 3446entry: 3447 %old = atomicrmw max i64 addrspace(3)* @local_var64, i64 5 acq_rel 3448 store i64 %old, i64 addrspace(1)* %out 3449 ret void 3450} 3451 3452define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) { 3453; 3454; 3455; GFX7LESS-LABEL: min_i32_varying: 3456; GFX7LESS: ; %bb.0: ; %entry 3457; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 3458; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 3459; GFX7LESS-NEXT: s_mov_b32 m0, -1 3460; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3461; GFX7LESS-NEXT: ds_min_rtn_i32 v0, v1, v0 3462; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3463; 
GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 3464; GFX7LESS-NEXT: s_mov_b32 s2, -1 3465; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 3466; GFX7LESS-NEXT: s_endpgm 3467; 3468; GFX8-LABEL: min_i32_varying: 3469; GFX8: ; %bb.0: ; %entry 3470; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3471; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 3472; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 3473; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 3474; GFX8-NEXT: v_bfrev_b32_e32 v1, -2 3475; GFX8-NEXT: s_mov_b64 exec, s[2:3] 3476; GFX8-NEXT: v_mov_b32_e32 v2, v0 3477; GFX8-NEXT: s_not_b64 exec, exec 3478; GFX8-NEXT: v_bfrev_b32_e32 v2, -2 3479; GFX8-NEXT: s_not_b64 exec, exec 3480; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 3481; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3482; GFX8-NEXT: s_nop 1 3483; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 3484; GFX8-NEXT: s_nop 1 3485; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 3486; GFX8-NEXT: s_nop 1 3487; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 3488; GFX8-NEXT: s_nop 1 3489; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 3490; GFX8-NEXT: s_nop 1 3491; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 3492; GFX8-NEXT: v_readlane_b32 s4, v2, 63 3493; GFX8-NEXT: s_nop 0 3494; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 3495; GFX8-NEXT: s_mov_b64 exec, s[2:3] 3496; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 3497; GFX8-NEXT: ; implicit-def: $vgpr0 3498; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 3499; GFX8-NEXT: s_cbranch_execz .LBB19_2 3500; GFX8-NEXT: ; %bb.1: 3501; GFX8-NEXT: v_mov_b32_e32 v0, 0 3502; GFX8-NEXT: v_mov_b32_e32 v3, s4 3503; GFX8-NEXT: s_mov_b32 m0, -1 3504; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3505; GFX8-NEXT: ds_min_rtn_i32 v0, v0, v3 3506; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3507; GFX8-NEXT: .LBB19_2: 3508; GFX8-NEXT: s_or_b64 exec, exec, 
s[2:3] 3509; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3510; GFX8-NEXT: v_readfirstlane_b32 s2, v0 3511; GFX8-NEXT: v_mov_b32_e32 v0, v1 3512; GFX8-NEXT: v_min_i32_e32 v0, s2, v0 3513; GFX8-NEXT: s_mov_b32 s3, 0xf000 3514; GFX8-NEXT: s_mov_b32 s2, -1 3515; GFX8-NEXT: s_nop 0 3516; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 3517; GFX8-NEXT: s_endpgm 3518; 3519; GFX9-LABEL: min_i32_varying: 3520; GFX9: ; %bb.0: ; %entry 3521; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3522; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 3523; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 3524; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 3525; GFX9-NEXT: v_bfrev_b32_e32 v1, -2 3526; GFX9-NEXT: s_mov_b64 exec, s[2:3] 3527; GFX9-NEXT: v_mov_b32_e32 v2, v0 3528; GFX9-NEXT: s_not_b64 exec, exec 3529; GFX9-NEXT: v_bfrev_b32_e32 v2, -2 3530; GFX9-NEXT: s_not_b64 exec, exec 3531; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 3532; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3533; GFX9-NEXT: s_nop 1 3534; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 3535; GFX9-NEXT: s_nop 1 3536; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 3537; GFX9-NEXT: s_nop 1 3538; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 3539; GFX9-NEXT: s_nop 1 3540; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 3541; GFX9-NEXT: s_nop 1 3542; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 3543; GFX9-NEXT: v_readlane_b32 s4, v2, 63 3544; GFX9-NEXT: s_nop 0 3545; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 3546; GFX9-NEXT: s_mov_b64 exec, s[2:3] 3547; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 3548; GFX9-NEXT: ; implicit-def: $vgpr0 3549; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 3550; GFX9-NEXT: s_cbranch_execz .LBB19_2 3551; GFX9-NEXT: ; %bb.1: 3552; GFX9-NEXT: v_mov_b32_e32 v0, 0 3553; GFX9-NEXT: v_mov_b32_e32 v3, s4 3554; GFX9-NEXT: s_waitcnt 
lgkmcnt(0) 3555; GFX9-NEXT: ds_min_rtn_i32 v0, v0, v3 3556; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3557; GFX9-NEXT: .LBB19_2: 3558; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 3559; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3560; GFX9-NEXT: v_readfirstlane_b32 s2, v0 3561; GFX9-NEXT: v_mov_b32_e32 v0, v1 3562; GFX9-NEXT: v_min_i32_e32 v0, s2, v0 3563; GFX9-NEXT: s_mov_b32 s3, 0xf000 3564; GFX9-NEXT: s_mov_b32 s2, -1 3565; GFX9-NEXT: s_nop 0 3566; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 3567; GFX9-NEXT: s_endpgm 3568; 3569; GFX1064-LABEL: min_i32_varying: 3570; GFX1064: ; %bb.0: ; %entry 3571; GFX1064-NEXT: v_mov_b32_e32 v1, v0 3572; GFX1064-NEXT: s_not_b64 exec, exec 3573; GFX1064-NEXT: v_bfrev_b32_e32 v1, -2 3574; GFX1064-NEXT: s_not_b64 exec, exec 3575; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 3576; GFX1064-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 3577; GFX1064-NEXT: v_bfrev_b32_e32 v3, -2 3578; GFX1064-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 3579; GFX1064-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 3580; GFX1064-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 3581; GFX1064-NEXT: v_mov_b32_e32 v2, v1 3582; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 3583; GFX1064-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 3584; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 3585; GFX1064-NEXT: v_mov_b32_e32 v2, s4 3586; GFX1064-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 3587; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 3588; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 3589; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 3590; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3591; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 3592; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 3593; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 3594; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 3595; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3596; 
GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 3597; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 3598; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 3599; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 3600; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 3601; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 3602; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 3603; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 3604; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 3605; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3606; GFX1064-NEXT: s_mov_b32 s2, -1 3607; GFX1064-NEXT: ; implicit-def: $vgpr0 3608; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 3609; GFX1064-NEXT: s_cbranch_execz .LBB19_2 3610; GFX1064-NEXT: ; %bb.1: 3611; GFX1064-NEXT: v_mov_b32_e32 v0, 0 3612; GFX1064-NEXT: v_mov_b32_e32 v4, s7 3613; GFX1064-NEXT: s_mov_b32 s3, s7 3614; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3615; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 3616; GFX1064-NEXT: ds_min_rtn_i32 v0, v0, v4 3617; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3618; GFX1064-NEXT: buffer_gl0_inv 3619; GFX1064-NEXT: .LBB19_2: 3620; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 3621; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 3622; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 3623; GFX1064-NEXT: v_mov_b32_e32 v0, v3 3624; GFX1064-NEXT: v_min_i32_e32 v0, s3, v0 3625; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 3626; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3627; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 3628; GFX1064-NEXT: s_endpgm 3629; 3630; GFX1032-LABEL: min_i32_varying: 3631; GFX1032: ; %bb.0: ; %entry 3632; GFX1032-NEXT: v_mov_b32_e32 v1, v0 3633; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 3634; GFX1032-NEXT: v_bfrev_b32_e32 v1, -2 3635; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 3636; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3637; GFX1032-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 3638; GFX1032-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 3639; GFX1032-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 3640; 
GFX1032-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 3641; GFX1032-NEXT: v_mov_b32_e32 v2, v1 3642; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 3643; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3644; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3645; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3646; GFX1032-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 3647; GFX1032-NEXT: v_bfrev_b32_e32 v3, -2 3648; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 3649; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 3650; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 3651; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3652; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3653; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3654; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 3655; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3656; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 3657; GFX1032-NEXT: s_mov_b32 s2, -1 3658; GFX1032-NEXT: ; implicit-def: $vgpr0 3659; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 3660; GFX1032-NEXT: s_cbranch_execz .LBB19_2 3661; GFX1032-NEXT: ; %bb.1: 3662; GFX1032-NEXT: v_mov_b32_e32 v0, 0 3663; GFX1032-NEXT: v_mov_b32_e32 v4, s4 3664; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3665; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 3666; GFX1032-NEXT: ds_min_rtn_i32 v0, v0, v4 3667; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3668; GFX1032-NEXT: buffer_gl0_inv 3669; GFX1032-NEXT: .LBB19_2: 3670; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 3671; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 3672; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 3673; GFX1032-NEXT: v_mov_b32_e32 v0, v3 3674; GFX1032-NEXT: v_min_i32_e32 v0, s3, v0 3675; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 3676; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3677; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 3678; GFX1032-NEXT: s_endpgm 3679entry: 3680 %lane = call i32 @llvm.amdgcn.workitem.id.x() 3681 %old = atomicrmw min i32 addrspace(3)* @local_var32, i32 %lane acq_rel 3682 store i32 %old, i32 
addrspace(1)* %out 3683 ret void 3684} 3685 3686define amdgpu_kernel void @min_i64_constant(i64 addrspace(1)* %out) { 3687; 3688; 3689; GFX7LESS-LABEL: min_i64_constant: 3690; GFX7LESS: ; %bb.0: ; %entry 3691; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 3692; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 3693; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 3694; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3695; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 3696; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc 3697; GFX7LESS-NEXT: s_cbranch_execz .LBB20_2 3698; GFX7LESS-NEXT: ; %bb.1: 3699; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0 3700; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5 3701; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 3702; GFX7LESS-NEXT: s_mov_b32 m0, -1 3703; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3704; GFX7LESS-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] 3705; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3706; GFX7LESS-NEXT: .LBB20_2: 3707; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] 3708; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3709; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 3710; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 3711; GFX7LESS-NEXT: v_bfrev_b32_e32 v1, -2 3712; GFX7LESS-NEXT: s_mov_b32 s2, -1 3713; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 3714; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc 3715; GFX7LESS-NEXT: v_mov_b32_e32 v2, s5 3716; GFX7LESS-NEXT: v_mov_b32_e32 v3, s4 3717; GFX7LESS-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] 3718; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 3719; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 3720; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 3721; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 3722; GFX7LESS-NEXT: s_endpgm 3723; 3724; GFX8-LABEL: min_i64_constant: 3725; GFX8: ; %bb.0: ; %entry 3726; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3727; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3728; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 3729; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3730; GFX8-NEXT: ; 
implicit-def: $vgpr0_vgpr1 3731; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 3732; GFX8-NEXT: s_cbranch_execz .LBB20_2 3733; GFX8-NEXT: ; %bb.1: 3734; GFX8-NEXT: v_mov_b32_e32 v0, 5 3735; GFX8-NEXT: v_mov_b32_e32 v2, 0 3736; GFX8-NEXT: v_mov_b32_e32 v1, 0 3737; GFX8-NEXT: s_mov_b32 m0, -1 3738; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3739; GFX8-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] 3740; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3741; GFX8-NEXT: .LBB20_2: 3742; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 3743; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3744; GFX8-NEXT: v_readfirstlane_b32 s4, v0 3745; GFX8-NEXT: v_bfrev_b32_e32 v0, -2 3746; GFX8-NEXT: v_readfirstlane_b32 s5, v1 3747; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc 3748; GFX8-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 3749; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] 3750; GFX8-NEXT: v_mov_b32_e32 v2, s5 3751; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 3752; GFX8-NEXT: v_mov_b32_e32 v2, s4 3753; GFX8-NEXT: s_mov_b32 s2, -1 3754; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 3755; GFX8-NEXT: s_mov_b32 s3, 0xf000 3756; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 3757; GFX8-NEXT: s_endpgm 3758; 3759; GFX9-LABEL: min_i64_constant: 3760; GFX9: ; %bb.0: ; %entry 3761; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3762; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3763; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 3764; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3765; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 3766; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 3767; GFX9-NEXT: s_cbranch_execz .LBB20_2 3768; GFX9-NEXT: ; %bb.1: 3769; GFX9-NEXT: v_mov_b32_e32 v0, 5 3770; GFX9-NEXT: v_mov_b32_e32 v1, 0 3771; GFX9-NEXT: v_mov_b32_e32 v2, 0 3772; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3773; GFX9-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] 3774; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3775; GFX9-NEXT: .LBB20_2: 3776; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 3777; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3778; GFX9-NEXT: v_readfirstlane_b32 s4, v0 3779; GFX9-NEXT: 
v_bfrev_b32_e32 v0, -2 3780; GFX9-NEXT: v_readfirstlane_b32 s5, v1 3781; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc 3782; GFX9-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 3783; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] 3784; GFX9-NEXT: v_mov_b32_e32 v2, s5 3785; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 3786; GFX9-NEXT: v_mov_b32_e32 v2, s4 3787; GFX9-NEXT: s_mov_b32 s2, -1 3788; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 3789; GFX9-NEXT: s_mov_b32 s3, 0xf000 3790; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 3791; GFX9-NEXT: s_endpgm 3792; 3793; GFX1064-LABEL: min_i64_constant: 3794; GFX1064: ; %bb.0: ; %entry 3795; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3796; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3797; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 3798; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3799; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 3800; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc 3801; GFX1064-NEXT: s_cbranch_execz .LBB20_2 3802; GFX1064-NEXT: ; %bb.1: 3803; GFX1064-NEXT: v_mov_b32_e32 v0, 5 3804; GFX1064-NEXT: v_mov_b32_e32 v1, 0 3805; GFX1064-NEXT: v_mov_b32_e32 v2, 0 3806; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3807; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 3808; GFX1064-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] 3809; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3810; GFX1064-NEXT: buffer_gl0_inv 3811; GFX1064-NEXT: .LBB20_2: 3812; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 3813; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] 3814; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 3815; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 3816; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc 3817; GFX1064-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 3818; GFX1064-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1] 3819; GFX1064-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc 3820; GFX1064-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc 3821; GFX1064-NEXT: s_mov_b32 s2, -1 3822; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 3823; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3824; 
GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: min_i64_constant:
; GFX1032: ; %bb.0: ; %entry
; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB20_2
; GFX1032-NEXT: ; %bb.1:
; GFX1032-NEXT: v_mov_b32_e32 v0, 5
; GFX1032-NEXT: v_mov_b32_e32 v1, 0
; GFX1032-NEXT: v_mov_b32_e32 v2, 0
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1032-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1]
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: .LBB20_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2
; GFX1032-NEXT: v_readfirstlane_b32 s2, v0
; GFX1032-NEXT: v_readfirstlane_b32 s3, v1
; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc_lo
; GFX1032-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc_lo
; GFX1032-NEXT: v_cmp_lt_i64_e32 vcc_lo, s[2:3], v[0:1]
; GFX1032-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo
; GFX1032-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo
; GFX1032-NEXT: s_mov_b32 s2, -1
; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX1032-NEXT: s_endpgm
entry:
  %old = atomicrmw min i64 addrspace(3)* @local_var64, i64 5 acq_rel
  store i64 %old, i64 addrspace(1)* %out
  ret void
}

; Unsigned 32-bit max with a per-lane operand. The kernel performs an acq_rel
; atomicrmw umax on @local_var32 (addrspace(3)) using the workitem id x as the
; operand, then stores the value returned by the atomic to %out.
; Expected code per the assertions below.
;  - The GFX7LESS run emits ds_max_rtn_u32 directly on the incoming value, with
;    no reduction sequence and no branch.
;    NOTE(review) presumably because DPP is not available there; confirm.
;  - The GFX8, GFX9 and both GFX10 runs emit a v_max_u32_dpp reduction, issue a
;    single ds_max_rtn_u32 inside a branch guarded by the mbcnt compare with 0,
;    and finish with v_readfirstlane_b32 plus v_max_u32_e32 to form the
;    per-lane result.
define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) {
;
;
; GFX7LESS-LABEL: umax_i32_varying:
; GFX7LESS: ; %bb.0: ; %entry
; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0
; GFX7LESS-NEXT: s_mov_b32 m0, -1
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: ds_max_rtn_u32 v0, v1, v0
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX7LESS-NEXT: s_endpgm
;
; GFX8-LABEL: umax_i32_varying:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX8-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NEXT: s_mov_b64 exec, s[2:3]
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_not_b64 exec, exec
; GFX8-NEXT: v_mov_b32_e32 v2, 0
; GFX8-NEXT: s_not_b64 exec, exec
; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX8-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX8-NEXT: s_nop 1
; GFX8-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX8-NEXT: s_nop 1
; GFX8-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX8-NEXT: s_nop 1
; GFX8-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX8-NEXT: s_nop 1
; GFX8-NEXT: v_max_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
; GFX8-NEXT: s_nop 1
; GFX8-NEXT: v_max_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
; GFX8-NEXT: v_readlane_b32 s4, v2, 63
; GFX8-NEXT: s_nop 0
; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX8-NEXT: s_mov_b64 exec, s[2:3]
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
; GFX8-NEXT: ; implicit-def: $vgpr0
; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX8-NEXT: s_cbranch_execz .LBB21_2
; GFX8-NEXT: ; %bb.1:
; GFX8-NEXT: v_mov_b32_e32 v0, 0
; GFX8-NEXT: v_mov_b32_e32 v3, s4
; GFX8-NEXT: s_mov_b32 m0, -1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: ds_max_rtn_u32 v0, v0, v3
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: .LBB21_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s2, v0
; GFX8-NEXT: v_mov_b32_e32 v0, v1
; GFX8-NEXT: v_max_u32_e32 v0, s2, v0
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
; GFX8-NEXT: s_nop 0
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: umax_i32_varying:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_mov_b64 exec, s[2:3]
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3
; GFX9-NEXT: v_mov_b32_e32 v2, v0
; GFX9-NEXT: s_not_b64 exec, exec
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_not_b64 exec, exec
; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX9-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX9-NEXT: s_nop 1
; GFX9-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX9-NEXT: s_nop 1
; GFX9-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX9-NEXT: s_nop 1
; GFX9-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX9-NEXT: s_nop 1
; GFX9-NEXT: v_max_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
; GFX9-NEXT: s_nop 1
; GFX9-NEXT: v_max_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
; GFX9-NEXT: v_readlane_b32 s4, v2, 63
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX9-NEXT: s_mov_b64 exec, s[2:3]
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
; GFX9-NEXT: ; implicit-def: $vgpr0
; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT: s_cbranch_execz .LBB21_2
; GFX9-NEXT: ; %bb.1:
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v3, s4
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: ds_max_rtn_u32 v0, v0, v3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: .LBB21_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s2, v0
; GFX9-NEXT: v_mov_b32_e32 v0, v1
; GFX9-NEXT: v_max_u32_e32 v0, s2, v0
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: umax_i32_varying:
; GFX1064: ; %bb.0: ; %entry
; GFX1064-NEXT: v_mov_b32_e32 v1, v0
; GFX1064-NEXT: s_not_b64 exec, exec
; GFX1064-NEXT: v_mov_b32_e32 v1, 0
; GFX1064-NEXT: s_not_b64 exec, exec
; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX1064-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064-NEXT: v_mov_b32_e32 v3, 0
; GFX1064-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064-NEXT: v_mov_b32_e32 v2, v1
; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1
; GFX1064-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1064-NEXT: v_readlane_b32 s4, v1, 31
; GFX1064-NEXT: v_mov_b32_e32 v2, s4
; GFX1064-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
; GFX1064-NEXT: v_readlane_b32 s4, v1, 15
; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1064-NEXT: s_mov_b64 exec, s[2:3]
; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX1064-NEXT: v_readlane_b32 s5, v1, 31
; GFX1064-NEXT: v_writelane_b32 v3, s4, 16
; GFX1064-NEXT: s_mov_b64 exec, s[2:3]
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX1064-NEXT: v_readlane_b32 s7, v1, 63
; GFX1064-NEXT: v_readlane_b32 s6, v1, 47
; GFX1064-NEXT: v_writelane_b32 v3, s5, 32
; GFX1064-NEXT: s_mov_b64 exec, s[2:3]
; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX1064-NEXT: v_writelane_b32 v3, s6, 48
; GFX1064-NEXT: s_mov_b64 exec, s[4:5]
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT: s_mov_b32 s2, -1
; GFX1064-NEXT: ; implicit-def: $vgpr0
; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB21_2
; GFX1064-NEXT: ; %bb.1:
; GFX1064-NEXT: v_mov_b32_e32 v0, 0
; GFX1064-NEXT: v_mov_b32_e32 v4, s7
; GFX1064-NEXT: s_mov_b32 s3, s7
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1064-NEXT: ds_max_rtn_u32 v0, v0, v4
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: .LBB21_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX1064-NEXT: v_readfirstlane_b32 s3, v0
; GFX1064-NEXT: v_mov_b32_e32 v0, v3
; GFX1064-NEXT: v_max_u32_e32 v0, s3, v0
; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: umax_i32_varying:
; GFX1032: ; %bb.0: ; %entry
; GFX1032-NEXT: v_mov_b32_e32 v1, v0
; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1032-NEXT: v_mov_b32_e32 v1, 0
; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1032-NEXT: s_or_saveexec_b32 s2, -1
; GFX1032-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032-NEXT: v_mov_b32_e32 v2, v1
; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1
; GFX1032-NEXT: s_mov_b32 exec_lo, s2
; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1032-NEXT: s_or_saveexec_b32 s2, -1
; GFX1032-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1032-NEXT: v_mov_b32_e32 v3, 0
; GFX1032-NEXT: v_readlane_b32 s3, v1, 15
; GFX1032-NEXT: v_readlane_b32 s4, v1, 31
; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT: s_mov_b32 exec_lo, s2
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1032-NEXT: s_or_saveexec_b32 s2, -1
; GFX1032-NEXT: v_writelane_b32 v3, s3, 16
; GFX1032-NEXT: s_mov_b32 exec_lo, s2
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: s_mov_b32 s2, -1
; GFX1032-NEXT: ; implicit-def: $vgpr0
; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB21_2
; GFX1032-NEXT: ; %bb.1:
; GFX1032-NEXT: v_mov_b32_e32 v0, 0
; GFX1032-NEXT: v_mov_b32_e32 v4, s4
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1032-NEXT: ds_max_rtn_u32 v0, v0, v4
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: .LBB21_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3
; GFX1032-NEXT: v_readfirstlane_b32 s3, v0
; GFX1032-NEXT: v_mov_b32_e32 v0, v3
; GFX1032-NEXT: v_max_u32_e32 v0, s3, v0
; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX1032-NEXT: s_endpgm
entry:
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %old = atomicrmw umax i32 addrspace(3)* @local_var32, i32 %lane acq_rel
  store i32 %old, i32 addrspace(1)* %out
  ret void
}

; Unsigned 64-bit max with a uniform operand. The kernel performs an acq_rel
; atomicrmw umax of the constant 5 on @local_var64 (addrspace(3)) and stores
; the value returned by the atomic to %out.
; Expected code per the assertions below, for every run line. A single
; ds_max_rtn_u64 with the value 5 is issued inside a branch guarded by the
; mbcnt compare with 0. Both halves of the result are then read with
; v_readfirstlane_b32 and combined per lane through v_cmp_gt_u64 and
; v_cndmask_b32. No DPP reduction is expected.
define amdgpu_kernel void @umax_i64_constant(i64 addrspace(1)* %out) {
;
;
; GFX7LESS-LABEL: umax_i64_constant:
; GFX7LESS: ; %bb.0: ; %entry
; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX7LESS-NEXT: s_cbranch_execz .LBB22_2
; GFX7LESS-NEXT: ; %bb.1:
; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0
; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5
; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0
; GFX7LESS-NEXT: s_mov_b32 m0, -1
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1]
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: .LBB22_2:
; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0
; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1
; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0
; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4
; GFX7LESS-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[0:1]
; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX7LESS-NEXT: v_mov_b32_e32 v1, s5
; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX7LESS-NEXT: s_endpgm
;
; GFX8-LABEL: umax_i64_constant:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX8-NEXT: s_cbranch_execz .LBB22_2
; GFX8-NEXT: ; %bb.1:
; GFX8-NEXT: v_mov_b32_e32 v0, 5
; GFX8-NEXT: v_mov_b32_e32 v2, 0
; GFX8-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NEXT: s_mov_b32 m0, -1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1]
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: .LBB22_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s2, v0
; GFX8-NEXT: v_readfirstlane_b32 s3, v1
; GFX8-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc
; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1]
; GFX8-NEXT: v_mov_b32_e32 v2, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: umax_i64_constant:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT: s_cbranch_execz .LBB22_2
; GFX9-NEXT: ; %bb.1:
; GFX9-NEXT: v_mov_b32_e32 v0, 5
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: .LBB22_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s2, v0
; GFX9-NEXT: v_readfirstlane_b32 s3, v1
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc
; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1]
; GFX9-NEXT: v_mov_b32_e32 v2, s2
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: umax_i64_constant:
; GFX1064: ; %bb.0: ; %entry
; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB22_2
; GFX1064-NEXT: ; %bb.1:
; GFX1064-NEXT: v_mov_b32_e32 v0, 5
; GFX1064-NEXT: v_mov_b32_e32 v1, 0
; GFX1064-NEXT: v_mov_b32_e32 v2, 0
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1064-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1]
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: .LBB22_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX1064-NEXT: v_readfirstlane_b32 s2, v0
; GFX1064-NEXT: v_readfirstlane_b32 s3, v1
; GFX1064-NEXT: v_mov_b32_e32 v1, 0
; GFX1064-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc
; GFX1064-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1]
; GFX1064-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc
; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, s3, vcc
; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
; GFX1064-NEXT: s_mov_b32 s2, -1
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: umax_i64_constant:
; GFX1032: ; %bb.0: ; %entry
; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB22_2
; GFX1032-NEXT: ; %bb.1:
; GFX1032-NEXT: v_mov_b32_e32 v0, 5
; GFX1032-NEXT: v_mov_b32_e32 v1, 0
; GFX1032-NEXT: v_mov_b32_e32 v2, 0
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1032-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1]
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: .LBB22_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2
; GFX1032-NEXT: v_readfirstlane_b32 s2, v0
; GFX1032-NEXT: v_readfirstlane_b32 s3, v1
; GFX1032-NEXT: v_mov_b32_e32 v1, 0
; GFX1032-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc_lo
; GFX1032-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[2:3], v[0:1]
; GFX1032-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo
; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, s3, vcc_lo
; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
; GFX1032-NEXT: s_mov_b32 s2, -1
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX1032-NEXT: s_endpgm
entry:
  %old = atomicrmw umax i64 addrspace(3)* @local_var64, i64 5 acq_rel
  store i64 %old, i64 addrspace(1)* %out
  ret void
}

; Unsigned 32-bit min with a per-lane operand. The kernel performs an acq_rel
; atomicrmw umin on @local_var32 (addrspace(3)) using the workitem id x as the
; operand, then stores the value returned by the atomic to %out.
; Expected code per the assertions below.
;  - The GFX7LESS run emits ds_min_rtn_u32 directly on the incoming value, with
;    no reduction sequence and no branch.
;    NOTE(review) presumably because DPP is not available there; confirm.
;  - The GFX8, GFX9 and both GFX10 runs emit a v_min_u32_dpp reduction whose
;    fill registers are loaded with -1, issue a single ds_min_rtn_u32 inside a
;    branch guarded by the mbcnt compare with 0, and finish with
;    v_readfirstlane_b32 plus v_min_u32_e32 to form the per-lane result.
define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) {
;
;
; GFX7LESS-LABEL: umin_i32_varying:
; GFX7LESS: ; %bb.0: ; %entry
; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0
; GFX7LESS-NEXT: s_mov_b32 m0, -1
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: ds_min_rtn_u32 v0, v1, v0
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX7LESS-NEXT: s_endpgm
;
; GFX8-LABEL: umin_i32_varying:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3
; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX8-NEXT: v_mov_b32_e32 v1, -1
; GFX8-NEXT: s_mov_b64 exec, s[2:3]
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_not_b64 exec, exec
; GFX8-NEXT: v_mov_b32_e32 v2, -1
; GFX8-NEXT: s_not_b64 exec, exec
; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX8-NEXT: s_nop 1
; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX8-NEXT: s_nop 1
; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX8-NEXT: s_nop 1
; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX8-NEXT: s_nop 1
; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
; GFX8-NEXT: s_nop 1
; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
; GFX8-NEXT: v_readlane_b32 s4, v2, 63
; GFX8-NEXT: s_nop 0
; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX8-NEXT: s_mov_b64 exec, s[2:3]
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
; GFX8-NEXT: ; implicit-def: $vgpr0
; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX8-NEXT: s_cbranch_execz .LBB23_2
; GFX8-NEXT: ; %bb.1:
; GFX8-NEXT: v_mov_b32_e32 v0, 0
; GFX8-NEXT: v_mov_b32_e32 v3, s4
; GFX8-NEXT: s_mov_b32 m0, -1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: ds_min_rtn_u32 v0, v0, v3
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: .LBB23_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s2, v0
; GFX8-NEXT: v_mov_b32_e32 v0, v1
; GFX8-NEXT: v_min_u32_e32 v0, s2, v0
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
; GFX8-NEXT: s_nop 0
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: umin_i32_varying:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3
; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX9-NEXT: v_mov_b32_e32 v1, -1
; GFX9-NEXT: s_mov_b64 exec, s[2:3]
; GFX9-NEXT: v_mov_b32_e32 v2, v0
; GFX9-NEXT: s_not_b64 exec, exec
; GFX9-NEXT: v_mov_b32_e32 v2, -1
; GFX9-NEXT: s_not_b64 exec, exec
; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX9-NEXT: s_nop 1
; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX9-NEXT: s_nop 1
; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX9-NEXT: s_nop 1
; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX9-NEXT: s_nop 1
; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
; GFX9-NEXT: s_nop 1
; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
; GFX9-NEXT: v_readlane_b32 s4, v2, 63
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX9-NEXT: s_mov_b64 exec, s[2:3]
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
; GFX9-NEXT: ; implicit-def: $vgpr0
; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT: s_cbranch_execz .LBB23_2
; GFX9-NEXT: ; %bb.1:
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v3, s4
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: ds_min_rtn_u32 v0, v0, v3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: .LBB23_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s2, v0
; GFX9-NEXT: v_mov_b32_e32 v0, v1
; GFX9-NEXT: v_min_u32_e32 v0, s2, v0
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: umin_i32_varying:
; GFX1064: ; %bb.0: ; %entry
; GFX1064-NEXT: v_mov_b32_e32 v1, v0
; GFX1064-NEXT: s_not_b64 exec, exec
; GFX1064-NEXT: v_mov_b32_e32 v1, -1
; GFX1064-NEXT: s_not_b64 exec, exec
; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX1064-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1064-NEXT: v_mov_b32_e32 v3, -1
; GFX1064-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1064-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1064-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX1064-NEXT: v_mov_b32_e32 v2, v1
; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1
; GFX1064-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1064-NEXT: v_readlane_b32 s4, v1, 31
; GFX1064-NEXT: v_mov_b32_e32 v2, s4
; GFX1064-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
; GFX1064-NEXT: v_readlane_b32 s4, v1, 15
; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1064-NEXT: s_mov_b64 exec, s[2:3]
; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX1064-NEXT: v_readlane_b32 s5, v1, 31
; GFX1064-NEXT: v_writelane_b32 v3, s4, 16
; GFX1064-NEXT: s_mov_b64 exec, s[2:3]
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX1064-NEXT: v_readlane_b32 s7, v1, 63
; GFX1064-NEXT: v_readlane_b32 s6, v1, 47
; GFX1064-NEXT: v_writelane_b32 v3, s5, 32
; GFX1064-NEXT: s_mov_b64 exec, s[2:3]
; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX1064-NEXT: v_writelane_b32 v3, s6, 48
; GFX1064-NEXT: s_mov_b64 exec, s[4:5]
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT: s_mov_b32 s2, -1
; GFX1064-NEXT: ; implicit-def: $vgpr0
; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB23_2
; GFX1064-NEXT: ; %bb.1:
; GFX1064-NEXT: v_mov_b32_e32 v0, 0
; GFX1064-NEXT: v_mov_b32_e32 v4, s7
; GFX1064-NEXT: s_mov_b32 s3, s7
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1064-NEXT: ds_min_rtn_u32 v0, v0, v4
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: .LBB23_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX1064-NEXT: v_readfirstlane_b32 s3, v0
; GFX1064-NEXT: v_mov_b32_e32 v0, v3
; GFX1064-NEXT: v_min_u32_e32 v0, s3, v0
; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: umin_i32_varying:
; GFX1032: ; %bb.0: ; %entry
; GFX1032-NEXT: v_mov_b32_e32 v1, v0
; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1032-NEXT: v_mov_b32_e32 v1, -1
; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1032-NEXT: s_or_saveexec_b32 s2, -1
; GFX1032-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT: v_mov_b32_e32 v2, v1
; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1
; GFX1032-NEXT: s_mov_b32 exec_lo, s2
; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1032-NEXT: s_or_saveexec_b32 s2, -1
; GFX1032-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1032-NEXT: v_mov_b32_e32 v3, -1
; GFX1032-NEXT: v_readlane_b32 s3, v1, 15
; GFX1032-NEXT: v_readlane_b32 s4, v1, 31
; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT: s_mov_b32 exec_lo, s2
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1032-NEXT: s_or_saveexec_b32 s2, -1
; GFX1032-NEXT: v_writelane_b32 v3, s3, 16
; GFX1032-NEXT: s_mov_b32 exec_lo, s2
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: s_mov_b32 s2, -1
; GFX1032-NEXT: ; implicit-def: $vgpr0
; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB23_2
; GFX1032-NEXT: ; %bb.1:
; GFX1032-NEXT: v_mov_b32_e32 v0, 0
; GFX1032-NEXT: v_mov_b32_e32 v4, s4
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1032-NEXT: ds_min_rtn_u32 v0, v0, v4
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: .LBB23_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3
; GFX1032-NEXT: v_readfirstlane_b32 s3, v0
; GFX1032-NEXT: v_mov_b32_e32 v0, v3
; GFX1032-NEXT: v_min_u32_e32 v0, s3, v0
; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX1032-NEXT: s_endpgm
entry:
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %old = atomicrmw umin i32 addrspace(3)* @local_var32, i32 %lane acq_rel
  store i32 %old, i32 addrspace(1)* %out
  ret void
}

; Unsigned 64-bit min with a uniform operand. The kernel performs an acq_rel
; atomicrmw umin of the constant 5 on @local_var64 (addrspace(3)) and stores
; the value returned by the atomic to %out.
; Expected code per the assertions below, for every run line. A single
; ds_min_rtn_u64 with the value 5 is issued inside a branch guarded by the
; mbcnt compare with 0. Both halves of the result are then read with
; v_readfirstlane_b32 and combined per lane through v_cmp_lt_u64 and
; v_cndmask_b32. No DPP reduction is expected.
define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) {
;
;
; GFX7LESS-LABEL: umin_i64_constant:
; GFX7LESS: ; %bb.0: ; %entry
; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX7LESS-NEXT: s_cbranch_execz .LBB24_2
; GFX7LESS-NEXT: ; %bb.1:
; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0
; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5
; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0
; GFX7LESS-NEXT: s_mov_b32 m0, -1
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1]
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: .LBB24_2:
; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0
; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1
; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc
; GFX7LESS-NEXT: v_mov_b32_e32 v2, s5
; GFX7LESS-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1]
; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4
; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX7LESS-NEXT: s_endpgm
;
; GFX8-LABEL: umin_i64_constant:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX8-NEXT: s_cbranch_execz .LBB24_2
; GFX8-NEXT: ; %bb.1:
; GFX8-NEXT: v_mov_b32_e32 v0, 5
; GFX8-NEXT: v_mov_b32_e32 v2, 0
; GFX8-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NEXT: s_mov_b32 m0, -1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1]
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: .LBB24_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s4, v0
; GFX8-NEXT: v_readfirstlane_b32 s5, v1
; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc
; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1]
; GFX8-NEXT: v_mov_b32_e32 v2, s5
; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
; GFX8-NEXT: v_mov_b32_e32 v2, s4
; GFX8-NEXT: s_mov_b32 s2, -1
; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: umin_i64_constant:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT: s_cbranch_execz .LBB24_2
; GFX9-NEXT: ; %bb.1:
; GFX9-NEXT: v_mov_b32_e32 v0, 5
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: .LBB24_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s4, v0
; GFX9-NEXT: v_readfirstlane_b32 s5, v1
; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc
; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1]
; GFX9-NEXT: v_mov_b32_e32 v2, s5
; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
; GFX9-NEXT: v_mov_b32_e32 v2, s4
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: umin_i64_constant:
; GFX1064: ; %bb.0: ; %entry
; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB24_2
; GFX1064-NEXT: ; %bb.1:
; GFX1064-NEXT: v_mov_b32_e32 v0, 5
; GFX1064-NEXT: v_mov_b32_e32 v1, 0
; GFX1064-NEXT: v_mov_b32_e32 v2, 0
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1064-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1]
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: .LBB24_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX1064-NEXT: v_readfirstlane_b32 s2, v0
; GFX1064-NEXT: v_readfirstlane_b32 s3, v1
; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
; GFX1064-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc
; GFX1064-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1]
; GFX1064-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc
; GFX1064-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc
; GFX1064-NEXT: s_mov_b32 s2, -1
; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: umin_i64_constant:
; GFX1032: ; %bb.0: ; %entry
; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB24_2
; GFX1032-NEXT: ; %bb.1:
; GFX1032-NEXT: v_mov_b32_e32 v0, 5
; GFX1032-NEXT: v_mov_b32_e32 v1, 0
; GFX1032-NEXT: v_mov_b32_e32 v2, 0
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1032-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1]
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: .LBB24_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2
; GFX1032-NEXT: v_readfirstlane_b32 s2, v0
; GFX1032-NEXT: v_readfirstlane_b32 s3, v1
; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
; GFX1032-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc_lo
; GFX1032-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[2:3], v[0:1]
; GFX1032-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo
; GFX1032-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo
; GFX1032-NEXT: s_mov_b32 s2, -1
; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX1032-NEXT: s_endpgm
entry:
  %old = atomicrmw umin i64 addrspace(3)* @local_var64, i64 5 acq_rel
  store i64 %old, i64 addrspace(1)* %out
  ret void
}