; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --force-update
; RUN: llc -march=amdgcn -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX7LESS %s
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX1064 %s
; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX1032 %s

; All five llc invocations above pass -amdgpu-atomic-optimizations=true and
; -verify-machineinstrs; they differ in the subtarget options (-mcpu / -mattr):
; no explicit -mcpu, tonga, gfx900, and gfx1010 with either wavefrontsize64 or
; wavefrontsize32 enabled. Each invocation is matched against its own FileCheck
; prefix set. The assertion lines in this file are produced by the script named
; in the NOTE line, so regenerate them with that script instead of editing them
; by hand.

; Intrinsic called by the add_i32_varying* tests; its result is used as the
; atomicrmw operand there.
declare i32 @llvm.amdgcn.workitem.id.x()

; Address space 3 globals used as the atomicrmw targets in the tests below.
; i32 target.
@local_var32 = addrspace(3) global i32 undef, align 4
; i64 counterpart of @local_var32.
@local_var64 = addrspace(3) global i64 undef, align 8

; Show what the atomic optimization pass will do for local pointers.
14 15define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out) { 16; 17; 18; GFX7LESS-LABEL: add_i32_constant: 19; GFX7LESS: ; %bb.0: ; %entry 20; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec 21; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 22; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 23; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 24; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 25; GFX7LESS-NEXT: ; implicit-def: $vgpr1 26; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc 27; GFX7LESS-NEXT: s_cbranch_execz .LBB0_2 28; GFX7LESS-NEXT: ; %bb.1: 29; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 30; GFX7LESS-NEXT: s_mul_i32 s2, s2, 5 31; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 32; GFX7LESS-NEXT: v_mov_b32_e32 v2, s2 33; GFX7LESS-NEXT: s_mov_b32 m0, -1 34; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 35; GFX7LESS-NEXT: ds_add_rtn_u32 v1, v1, v2 36; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 37; GFX7LESS-NEXT: .LBB0_2: 38; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] 39; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 40; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v1 41; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 42; GFX7LESS-NEXT: v_mad_u32_u24 v0, v0, 5, s2 43; GFX7LESS-NEXT: s_mov_b32 s2, -1 44; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 45; GFX7LESS-NEXT: s_endpgm 46; 47; GFX8-LABEL: add_i32_constant: 48; GFX8: ; %bb.0: ; %entry 49; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 50; GFX8-NEXT: s_mov_b64 s[2:3], exec 51; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 52; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 53; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 54; GFX8-NEXT: ; implicit-def: $vgpr1 55; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 56; GFX8-NEXT: s_cbranch_execz .LBB0_2 57; GFX8-NEXT: ; %bb.1: 58; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 59; GFX8-NEXT: s_mul_i32 s2, s2, 5 60; GFX8-NEXT: v_mov_b32_e32 v1, 0 61; GFX8-NEXT: v_mov_b32_e32 v2, s2 62; GFX8-NEXT: s_mov_b32 m0, -1 63; GFX8-NEXT: s_waitcnt lgkmcnt(0) 64; GFX8-NEXT: ds_add_rtn_u32 v1, v1, v2 65; GFX8-NEXT: s_waitcnt lgkmcnt(0) 66; GFX8-NEXT: 
.LBB0_2: 67; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 68; GFX8-NEXT: s_waitcnt lgkmcnt(0) 69; GFX8-NEXT: v_readfirstlane_b32 s2, v1 70; GFX8-NEXT: v_mad_u32_u24 v0, v0, 5, s2 71; GFX8-NEXT: s_mov_b32 s3, 0xf000 72; GFX8-NEXT: s_mov_b32 s2, -1 73; GFX8-NEXT: s_nop 1 74; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 75; GFX8-NEXT: s_endpgm 76; 77; GFX9-LABEL: add_i32_constant: 78; GFX9: ; %bb.0: ; %entry 79; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 80; GFX9-NEXT: s_mov_b64 s[2:3], exec 81; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 82; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 83; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 84; GFX9-NEXT: ; implicit-def: $vgpr1 85; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 86; GFX9-NEXT: s_cbranch_execz .LBB0_2 87; GFX9-NEXT: ; %bb.1: 88; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 89; GFX9-NEXT: s_mul_i32 s2, s2, 5 90; GFX9-NEXT: v_mov_b32_e32 v1, 0 91; GFX9-NEXT: v_mov_b32_e32 v2, s2 92; GFX9-NEXT: s_waitcnt lgkmcnt(0) 93; GFX9-NEXT: ds_add_rtn_u32 v1, v1, v2 94; GFX9-NEXT: s_waitcnt lgkmcnt(0) 95; GFX9-NEXT: .LBB0_2: 96; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 97; GFX9-NEXT: s_waitcnt lgkmcnt(0) 98; GFX9-NEXT: v_readfirstlane_b32 s2, v1 99; GFX9-NEXT: v_mad_u32_u24 v0, v0, 5, s2 100; GFX9-NEXT: s_mov_b32 s3, 0xf000 101; GFX9-NEXT: s_mov_b32 s2, -1 102; GFX9-NEXT: s_nop 1 103; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 104; GFX9-NEXT: s_endpgm 105; 106; GFX1064-LABEL: add_i32_constant: 107; GFX1064: ; %bb.0: ; %entry 108; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 109; GFX1064-NEXT: s_mov_b64 s[2:3], exec 110; GFX1064-NEXT: ; implicit-def: $vgpr1 111; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 112; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 113; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 114; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 115; GFX1064-NEXT: s_cbranch_execz .LBB0_2 116; GFX1064-NEXT: ; %bb.1: 117; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 118; GFX1064-NEXT: v_mov_b32_e32 v1, 0 119; GFX1064-NEXT: s_mul_i32 s2, s2, 5 
120; GFX1064-NEXT: v_mov_b32_e32 v2, s2 121; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 122; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 123; GFX1064-NEXT: ds_add_rtn_u32 v1, v1, v2 124; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 125; GFX1064-NEXT: buffer_gl0_inv 126; GFX1064-NEXT: .LBB0_2: 127; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 128; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 129; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 130; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 131; GFX1064-NEXT: v_mad_u32_u24 v0, v0, 5, s2 132; GFX1064-NEXT: s_mov_b32 s2, -1 133; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 134; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 135; GFX1064-NEXT: s_endpgm 136; 137; GFX1032-LABEL: add_i32_constant: 138; GFX1032: ; %bb.0: ; %entry 139; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 140; GFX1032-NEXT: s_mov_b32 s3, exec_lo 141; GFX1032-NEXT: ; implicit-def: $vgpr1 142; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 143; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 144; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo 145; GFX1032-NEXT: s_cbranch_execz .LBB0_2 146; GFX1032-NEXT: ; %bb.1: 147; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 148; GFX1032-NEXT: v_mov_b32_e32 v1, 0 149; GFX1032-NEXT: s_mul_i32 s3, s3, 5 150; GFX1032-NEXT: v_mov_b32_e32 v2, s3 151; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 152; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 153; GFX1032-NEXT: ds_add_rtn_u32 v1, v1, v2 154; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 155; GFX1032-NEXT: buffer_gl0_inv 156; GFX1032-NEXT: .LBB0_2: 157; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 158; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 159; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 160; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 161; GFX1032-NEXT: v_mad_u32_u24 v0, v0, 5, s2 162; GFX1032-NEXT: s_mov_b32 s2, -1 163; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 164; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 165; GFX1032-NEXT: s_endpgm 166entry: 167 %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 5 acq_rel 168 store i32 %old, i32 
addrspace(1)* %out 169 ret void 170} 171 172define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 %additive) { 173; 174; 175; GFX7LESS-LABEL: add_i32_uniform: 176; GFX7LESS: ; %bb.0: ; %entry 177; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec 178; GFX7LESS-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 179; GFX7LESS-NEXT: s_load_dword s6, s[0:1], 0xb 180; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 181; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 182; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 183; GFX7LESS-NEXT: ; implicit-def: $vgpr1 184; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc 185; GFX7LESS-NEXT: s_cbranch_execz .LBB1_2 186; GFX7LESS-NEXT: ; %bb.1: 187; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 188; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 189; GFX7LESS-NEXT: s_mul_i32 s2, s6, s2 190; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 191; GFX7LESS-NEXT: v_mov_b32_e32 v2, s2 192; GFX7LESS-NEXT: s_mov_b32 m0, -1 193; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 194; GFX7LESS-NEXT: ds_add_rtn_u32 v1, v1, v2 195; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 196; GFX7LESS-NEXT: .LBB1_2: 197; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1] 198; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 199; GFX7LESS-NEXT: v_readfirstlane_b32 s0, v1 200; GFX7LESS-NEXT: v_mul_lo_u32 v0, s6, v0 201; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 202; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s0, v0 203; GFX7LESS-NEXT: s_mov_b32 s6, -1 204; GFX7LESS-NEXT: buffer_store_dword v0, off, s[4:7], 0 205; GFX7LESS-NEXT: s_endpgm 206; 207; GFX8-LABEL: add_i32_uniform: 208; GFX8: ; %bb.0: ; %entry 209; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 210; GFX8-NEXT: s_load_dword s6, s[0:1], 0x2c 211; GFX8-NEXT: s_mov_b64 s[2:3], exec 212; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 213; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 214; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 215; GFX8-NEXT: ; implicit-def: $vgpr1 216; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc 217; GFX8-NEXT: s_cbranch_execz .LBB1_2 218; GFX8-NEXT: ; %bb.1: 219; GFX8-NEXT: s_bcnt1_i32_b64 
s2, s[2:3] 220; GFX8-NEXT: s_waitcnt lgkmcnt(0) 221; GFX8-NEXT: s_mul_i32 s2, s6, s2 222; GFX8-NEXT: v_mov_b32_e32 v1, 0 223; GFX8-NEXT: v_mov_b32_e32 v2, s2 224; GFX8-NEXT: s_mov_b32 m0, -1 225; GFX8-NEXT: s_waitcnt lgkmcnt(0) 226; GFX8-NEXT: ds_add_rtn_u32 v1, v1, v2 227; GFX8-NEXT: s_waitcnt lgkmcnt(0) 228; GFX8-NEXT: .LBB1_2: 229; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] 230; GFX8-NEXT: s_waitcnt lgkmcnt(0) 231; GFX8-NEXT: v_mul_lo_u32 v0, s6, v0 232; GFX8-NEXT: v_readfirstlane_b32 s0, v1 233; GFX8-NEXT: s_mov_b32 s7, 0xf000 234; GFX8-NEXT: s_mov_b32 s6, -1 235; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 236; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0 237; GFX8-NEXT: s_endpgm 238; 239; GFX9-LABEL: add_i32_uniform: 240; GFX9: ; %bb.0: ; %entry 241; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 242; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c 243; GFX9-NEXT: s_mov_b64 s[2:3], exec 244; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 245; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 246; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 247; GFX9-NEXT: ; implicit-def: $vgpr1 248; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc 249; GFX9-NEXT: s_cbranch_execz .LBB1_2 250; GFX9-NEXT: ; %bb.1: 251; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 252; GFX9-NEXT: s_waitcnt lgkmcnt(0) 253; GFX9-NEXT: s_mul_i32 s2, s6, s2 254; GFX9-NEXT: v_mov_b32_e32 v1, 0 255; GFX9-NEXT: v_mov_b32_e32 v2, s2 256; GFX9-NEXT: s_waitcnt lgkmcnt(0) 257; GFX9-NEXT: ds_add_rtn_u32 v1, v1, v2 258; GFX9-NEXT: s_waitcnt lgkmcnt(0) 259; GFX9-NEXT: .LBB1_2: 260; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] 261; GFX9-NEXT: s_waitcnt lgkmcnt(0) 262; GFX9-NEXT: v_mul_lo_u32 v0, s6, v0 263; GFX9-NEXT: v_readfirstlane_b32 s0, v1 264; GFX9-NEXT: s_mov_b32 s7, 0xf000 265; GFX9-NEXT: s_mov_b32 s6, -1 266; GFX9-NEXT: v_add_u32_e32 v0, s0, v0 267; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 268; GFX9-NEXT: s_endpgm 269; 270; GFX1064-LABEL: add_i32_uniform: 271; GFX1064: ; %bb.0: ; %entry 272; GFX1064-NEXT: s_clause 0x1 273; GFX1064-NEXT: 
s_load_dwordx2 s[4:5], s[0:1], 0x24 274; GFX1064-NEXT: s_load_dword s6, s[0:1], 0x2c 275; GFX1064-NEXT: s_mov_b64 s[2:3], exec 276; GFX1064-NEXT: ; implicit-def: $vgpr1 277; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 278; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 279; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 280; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc 281; GFX1064-NEXT: s_cbranch_execz .LBB1_2 282; GFX1064-NEXT: ; %bb.1: 283; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 284; GFX1064-NEXT: v_mov_b32_e32 v1, 0 285; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 286; GFX1064-NEXT: s_mul_i32 s2, s6, s2 287; GFX1064-NEXT: v_mov_b32_e32 v2, s2 288; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 289; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 290; GFX1064-NEXT: ds_add_rtn_u32 v1, v1, v2 291; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 292; GFX1064-NEXT: buffer_gl0_inv 293; GFX1064-NEXT: .LBB1_2: 294; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 295; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] 296; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 297; GFX1064-NEXT: v_mul_lo_u32 v0, s6, v0 298; GFX1064-NEXT: v_readfirstlane_b32 s0, v1 299; GFX1064-NEXT: s_mov_b32 s7, 0x31016000 300; GFX1064-NEXT: s_mov_b32 s6, -1 301; GFX1064-NEXT: v_add_nc_u32_e32 v0, s0, v0 302; GFX1064-NEXT: buffer_store_dword v0, off, s[4:7], 0 303; GFX1064-NEXT: s_endpgm 304; 305; GFX1032-LABEL: add_i32_uniform: 306; GFX1032: ; %bb.0: ; %entry 307; GFX1032-NEXT: s_clause 0x1 308; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 309; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x2c 310; GFX1032-NEXT: s_mov_b32 s3, exec_lo 311; GFX1032-NEXT: ; implicit-def: $vgpr1 312; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 313; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 314; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo 315; GFX1032-NEXT: s_cbranch_execz .LBB1_2 316; GFX1032-NEXT: ; %bb.1: 317; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s3 318; GFX1032-NEXT: v_mov_b32_e32 v1, 0 319; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 320; GFX1032-NEXT: s_mul_i32 s1, s2, s1 321; 
GFX1032-NEXT: v_mov_b32_e32 v2, s1 322; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 323; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 324; GFX1032-NEXT: ds_add_rtn_u32 v1, v1, v2 325; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 326; GFX1032-NEXT: buffer_gl0_inv 327; GFX1032-NEXT: .LBB1_2: 328; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 329; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 330; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 331; GFX1032-NEXT: v_mul_lo_u32 v0, s2, v0 332; GFX1032-NEXT: v_readfirstlane_b32 s0, v1 333; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 334; GFX1032-NEXT: s_mov_b32 s6, -1 335; GFX1032-NEXT: v_add_nc_u32_e32 v0, s0, v0 336; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0 337; GFX1032-NEXT: s_endpgm 338entry: 339 %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %additive acq_rel 340 store i32 %old, i32 addrspace(1)* %out 341 ret void 342} 343 344define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out) { 345; 346; 347; GFX7LESS-LABEL: add_i32_varying: 348; GFX7LESS: ; %bb.0: ; %entry 349; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 350; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 351; GFX7LESS-NEXT: s_mov_b32 m0, -1 352; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 353; GFX7LESS-NEXT: ds_add_rtn_u32 v0, v1, v0 354; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 355; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 356; GFX7LESS-NEXT: s_mov_b32 s2, -1 357; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 358; GFX7LESS-NEXT: s_endpgm 359; 360; GFX8-LABEL: add_i32_varying: 361; GFX8: ; %bb.0: ; %entry 362; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 363; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 364; GFX8-NEXT: v_mov_b32_e32 v1, 0 365; GFX8-NEXT: s_mov_b64 exec, s[2:3] 366; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 367; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 368; GFX8-NEXT: v_mov_b32_e32 v2, v0 369; GFX8-NEXT: s_not_b64 exec, exec 370; GFX8-NEXT: v_mov_b32_e32 v2, 0 371; GFX8-NEXT: s_not_b64 exec, exec 372; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 373; GFX8-NEXT: 
v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 374; GFX8-NEXT: s_nop 1 375; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 376; GFX8-NEXT: s_nop 1 377; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 378; GFX8-NEXT: s_nop 1 379; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 380; GFX8-NEXT: s_nop 1 381; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 382; GFX8-NEXT: s_nop 1 383; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 384; GFX8-NEXT: v_readlane_b32 s4, v2, 63 385; GFX8-NEXT: s_nop 0 386; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 387; GFX8-NEXT: s_mov_b64 exec, s[2:3] 388; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 389; GFX8-NEXT: ; implicit-def: $vgpr0 390; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 391; GFX8-NEXT: s_cbranch_execz .LBB2_2 392; GFX8-NEXT: ; %bb.1: 393; GFX8-NEXT: v_mov_b32_e32 v0, 0 394; GFX8-NEXT: v_mov_b32_e32 v3, s4 395; GFX8-NEXT: s_mov_b32 m0, -1 396; GFX8-NEXT: s_waitcnt lgkmcnt(0) 397; GFX8-NEXT: ds_add_rtn_u32 v0, v0, v3 398; GFX8-NEXT: s_waitcnt lgkmcnt(0) 399; GFX8-NEXT: .LBB2_2: 400; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 401; GFX8-NEXT: s_waitcnt lgkmcnt(0) 402; GFX8-NEXT: v_readfirstlane_b32 s2, v0 403; GFX8-NEXT: v_mov_b32_e32 v0, v1 404; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0 405; GFX8-NEXT: s_mov_b32 s3, 0xf000 406; GFX8-NEXT: s_mov_b32 s2, -1 407; GFX8-NEXT: s_nop 0 408; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 409; GFX8-NEXT: s_endpgm 410; 411; GFX9-LABEL: add_i32_varying: 412; GFX9: ; %bb.0: ; %entry 413; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 414; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 415; GFX9-NEXT: v_mov_b32_e32 v1, 0 416; GFX9-NEXT: s_mov_b64 exec, s[2:3] 417; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 418; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, 
v3 419; GFX9-NEXT: v_mov_b32_e32 v2, v0 420; GFX9-NEXT: s_not_b64 exec, exec 421; GFX9-NEXT: v_mov_b32_e32 v2, 0 422; GFX9-NEXT: s_not_b64 exec, exec 423; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 424; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 425; GFX9-NEXT: s_nop 1 426; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 427; GFX9-NEXT: s_nop 1 428; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 429; GFX9-NEXT: s_nop 1 430; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 431; GFX9-NEXT: s_nop 1 432; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 433; GFX9-NEXT: s_nop 1 434; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 435; GFX9-NEXT: v_readlane_b32 s4, v2, 63 436; GFX9-NEXT: s_nop 0 437; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 438; GFX9-NEXT: s_mov_b64 exec, s[2:3] 439; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 440; GFX9-NEXT: ; implicit-def: $vgpr0 441; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 442; GFX9-NEXT: s_cbranch_execz .LBB2_2 443; GFX9-NEXT: ; %bb.1: 444; GFX9-NEXT: v_mov_b32_e32 v0, 0 445; GFX9-NEXT: v_mov_b32_e32 v3, s4 446; GFX9-NEXT: s_waitcnt lgkmcnt(0) 447; GFX9-NEXT: ds_add_rtn_u32 v0, v0, v3 448; GFX9-NEXT: s_waitcnt lgkmcnt(0) 449; GFX9-NEXT: .LBB2_2: 450; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 451; GFX9-NEXT: s_waitcnt lgkmcnt(0) 452; GFX9-NEXT: v_readfirstlane_b32 s2, v0 453; GFX9-NEXT: v_mov_b32_e32 v0, v1 454; GFX9-NEXT: v_add_u32_e32 v0, s2, v0 455; GFX9-NEXT: s_mov_b32 s3, 0xf000 456; GFX9-NEXT: s_mov_b32 s2, -1 457; GFX9-NEXT: s_nop 0 458; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 459; GFX9-NEXT: s_endpgm 460; 461; GFX1064-LABEL: add_i32_varying: 462; GFX1064: ; %bb.0: ; %entry 463; GFX1064-NEXT: v_mov_b32_e32 v1, v0 464; GFX1064-NEXT: s_not_b64 exec, exec 465; GFX1064-NEXT: v_mov_b32_e32 v1, 0 
466; GFX1064-NEXT: s_not_b64 exec, exec 467; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 468; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 469; GFX1064-NEXT: v_mov_b32_e32 v3, 0 470; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 471; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 472; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 473; GFX1064-NEXT: v_mov_b32_e32 v2, v1 474; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 475; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 476; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 477; GFX1064-NEXT: v_mov_b32_e32 v2, s4 478; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 479; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 480; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 481; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 482; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 483; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 484; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 485; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 486; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 487; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 488; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 489; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 490; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 491; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 492; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 493; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 494; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 495; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 496; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 497; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 498; GFX1064-NEXT: s_mov_b32 s2, -1 499; GFX1064-NEXT: ; implicit-def: $vgpr0 500; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 501; GFX1064-NEXT: s_cbranch_execz .LBB2_2 502; GFX1064-NEXT: ; %bb.1: 503; 
GFX1064-NEXT: v_mov_b32_e32 v0, 0 504; GFX1064-NEXT: v_mov_b32_e32 v4, s7 505; GFX1064-NEXT: s_mov_b32 s3, s7 506; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 507; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 508; GFX1064-NEXT: ds_add_rtn_u32 v0, v0, v4 509; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 510; GFX1064-NEXT: buffer_gl0_inv 511; GFX1064-NEXT: .LBB2_2: 512; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 513; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 514; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 515; GFX1064-NEXT: v_mov_b32_e32 v0, v3 516; GFX1064-NEXT: v_add_nc_u32_e32 v0, s3, v0 517; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 518; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 519; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 520; GFX1064-NEXT: s_endpgm 521; 522; GFX1032-LABEL: add_i32_varying: 523; GFX1032: ; %bb.0: ; %entry 524; GFX1032-NEXT: v_mov_b32_e32 v1, v0 525; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 526; GFX1032-NEXT: v_mov_b32_e32 v1, 0 527; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 528; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 529; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 530; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 531; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 532; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 533; GFX1032-NEXT: v_mov_b32_e32 v2, v1 534; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 535; GFX1032-NEXT: s_mov_b32 exec_lo, s2 536; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 537; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 538; GFX1032-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 539; GFX1032-NEXT: v_mov_b32_e32 v3, 0 540; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 541; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 542; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 543; GFX1032-NEXT: s_mov_b32 exec_lo, s2 544; 
GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 545; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 546; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 547; GFX1032-NEXT: s_mov_b32 exec_lo, s2 548; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 549; GFX1032-NEXT: s_mov_b32 s2, -1 550; GFX1032-NEXT: ; implicit-def: $vgpr0 551; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 552; GFX1032-NEXT: s_cbranch_execz .LBB2_2 553; GFX1032-NEXT: ; %bb.1: 554; GFX1032-NEXT: v_mov_b32_e32 v0, 0 555; GFX1032-NEXT: v_mov_b32_e32 v4, s4 556; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 557; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 558; GFX1032-NEXT: ds_add_rtn_u32 v0, v0, v4 559; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 560; GFX1032-NEXT: buffer_gl0_inv 561; GFX1032-NEXT: .LBB2_2: 562; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 563; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 564; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 565; GFX1032-NEXT: v_mov_b32_e32 v0, v3 566; GFX1032-NEXT: v_add_nc_u32_e32 v0, s3, v0 567; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 568; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 569; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 570; GFX1032-NEXT: s_endpgm 571entry: 572 %lane = call i32 @llvm.amdgcn.workitem.id.x() 573 %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %lane acq_rel 574 store i32 %old, i32 addrspace(1)* %out 575 ret void 576} 577 578define amdgpu_kernel void @add_i32_varying_nouse() { 579; GFX7LESS-LABEL: add_i32_varying_nouse: 580; GFX7LESS: ; %bb.0: ; %entry 581; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 582; GFX7LESS-NEXT: s_mov_b32 m0, -1 583; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 584; GFX7LESS-NEXT: ds_add_u32 v1, v0 585; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 586; GFX7LESS-NEXT: s_endpgm 587; 588; GFX8-LABEL: add_i32_varying_nouse: 589; GFX8: ; %bb.0: ; %entry 590; GFX8-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 591; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 592; GFX8-NEXT: v_mov_b32_e32 v1, v0 593; GFX8-NEXT: s_not_b64 exec, exec 594; GFX8-NEXT: v_mov_b32_e32 v1, 0 595; 
GFX8-NEXT: s_not_b64 exec, exec 596; GFX8-NEXT: s_or_saveexec_b64 s[0:1], -1 597; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 598; GFX8-NEXT: s_nop 1 599; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 600; GFX8-NEXT: s_nop 1 601; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 602; GFX8-NEXT: s_nop 1 603; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 604; GFX8-NEXT: s_nop 1 605; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf 606; GFX8-NEXT: s_nop 1 607; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf 608; GFX8-NEXT: v_readlane_b32 s2, v1, 63 609; GFX8-NEXT: s_mov_b64 exec, s[0:1] 610; GFX8-NEXT: s_mov_b32 s0, s2 611; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 612; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 613; GFX8-NEXT: s_cbranch_execz .LBB3_2 614; GFX8-NEXT: ; %bb.1: 615; GFX8-NEXT: v_mov_b32_e32 v0, 0 616; GFX8-NEXT: v_mov_b32_e32 v2, s0 617; GFX8-NEXT: s_mov_b32 m0, -1 618; GFX8-NEXT: s_waitcnt lgkmcnt(0) 619; GFX8-NEXT: ds_add_u32 v0, v2 620; GFX8-NEXT: s_waitcnt lgkmcnt(0) 621; GFX8-NEXT: .LBB3_2: 622; GFX8-NEXT: s_endpgm 623; 624; GFX9-LABEL: add_i32_varying_nouse: 625; GFX9: ; %bb.0: ; %entry 626; GFX9-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 627; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 628; GFX9-NEXT: v_mov_b32_e32 v1, v0 629; GFX9-NEXT: s_not_b64 exec, exec 630; GFX9-NEXT: v_mov_b32_e32 v1, 0 631; GFX9-NEXT: s_not_b64 exec, exec 632; GFX9-NEXT: s_or_saveexec_b64 s[0:1], -1 633; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 634; GFX9-NEXT: s_nop 1 635; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 636; GFX9-NEXT: s_nop 1 637; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 638; GFX9-NEXT: s_nop 1 
639; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 640; GFX9-NEXT: s_nop 1 641; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf 642; GFX9-NEXT: s_nop 1 643; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf 644; GFX9-NEXT: v_readlane_b32 s2, v1, 63 645; GFX9-NEXT: s_mov_b64 exec, s[0:1] 646; GFX9-NEXT: s_mov_b32 s0, s2 647; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 648; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 649; GFX9-NEXT: s_cbranch_execz .LBB3_2 650; GFX9-NEXT: ; %bb.1: 651; GFX9-NEXT: v_mov_b32_e32 v0, 0 652; GFX9-NEXT: v_mov_b32_e32 v2, s0 653; GFX9-NEXT: s_waitcnt lgkmcnt(0) 654; GFX9-NEXT: ds_add_u32 v0, v2 655; GFX9-NEXT: s_waitcnt lgkmcnt(0) 656; GFX9-NEXT: .LBB3_2: 657; GFX9-NEXT: s_endpgm 658; 659; GFX1064-LABEL: add_i32_varying_nouse: 660; GFX1064: ; %bb.0: ; %entry 661; GFX1064-NEXT: v_mov_b32_e32 v1, v0 662; GFX1064-NEXT: s_not_b64 exec, exec 663; GFX1064-NEXT: v_mov_b32_e32 v1, 0 664; GFX1064-NEXT: s_not_b64 exec, exec 665; GFX1064-NEXT: s_or_saveexec_b64 s[0:1], -1 666; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 667; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 668; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 669; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 670; GFX1064-NEXT: v_mov_b32_e32 v2, v1 671; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 672; GFX1064-NEXT: v_add_nc_u32_e32 v1, v1, v2 673; GFX1064-NEXT: s_mov_b64 exec, s[0:1] 674; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 675; GFX1064-NEXT: s_or_saveexec_b64 s[0:1], -1 676; GFX1064-NEXT: v_readlane_b32 s2, v1, 0 677; GFX1064-NEXT: v_readlane_b32 s3, v1, 32 678; GFX1064-NEXT: s_mov_b64 exec, s[0:1] 679; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 680; GFX1064-NEXT: s_add_i32 s0, s2, s3 681; 
GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 682; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc 683; GFX1064-NEXT: s_cbranch_execz .LBB3_2 684; GFX1064-NEXT: ; %bb.1: 685; GFX1064-NEXT: v_mov_b32_e32 v0, 0 686; GFX1064-NEXT: v_mov_b32_e32 v3, s0 687; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 688; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 689; GFX1064-NEXT: ds_add_u32 v0, v3 690; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 691; GFX1064-NEXT: buffer_gl0_inv 692; GFX1064-NEXT: .LBB3_2: 693; GFX1064-NEXT: s_endpgm 694; 695; GFX1032-LABEL: add_i32_varying_nouse: 696; GFX1032: ; %bb.0: ; %entry 697; GFX1032-NEXT: v_mov_b32_e32 v1, v0 698; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 699; GFX1032-NEXT: v_mov_b32_e32 v1, 0 700; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 701; GFX1032-NEXT: s_or_saveexec_b32 s0, -1 702; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 703; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 704; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 705; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 706; GFX1032-NEXT: v_mov_b32_e32 v2, v1 707; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 708; GFX1032-NEXT: v_add_nc_u32_e32 v1, v1, v2 709; GFX1032-NEXT: s_mov_b32 exec_lo, s0 710; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 711; GFX1032-NEXT: v_mov_b32_e32 v0, v1 712; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 713; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo 714; GFX1032-NEXT: s_cbranch_execz .LBB3_2 715; GFX1032-NEXT: ; %bb.1: 716; GFX1032-NEXT: v_mov_b32_e32 v3, 0 717; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 718; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 719; GFX1032-NEXT: ds_add_u32 v3, v0 720; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 721; GFX1032-NEXT: buffer_gl0_inv 722; GFX1032-NEXT: .LBB3_2: 723; GFX1032-NEXT: s_endpgm 724entry: 725 %lane = call i32 @llvm.amdgcn.workitem.id.x() 
726 %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %lane acq_rel 727 ret void 728} 729 730define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) { 731; 732; 733; GFX7LESS-LABEL: add_i64_constant: 734; GFX7LESS: ; %bb.0: ; %entry 735; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec 736; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 737; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 738; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s5, v0 739; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 740; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 741; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc 742; GFX7LESS-NEXT: s_cbranch_execz .LBB4_2 743; GFX7LESS-NEXT: ; %bb.1: 744; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 745; GFX7LESS-NEXT: s_mul_i32 s4, s4, 5 746; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 747; GFX7LESS-NEXT: v_mov_b32_e32 v0, s4 748; GFX7LESS-NEXT: s_mov_b32 m0, -1 749; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 750; GFX7LESS-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] 751; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 752; GFX7LESS-NEXT: .LBB4_2: 753; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] 754; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 755; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v0 756; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1 757; GFX7LESS-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 758; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v2 759; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 760; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4 761; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s2, v0 762; GFX7LESS-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc 763; GFX7LESS-NEXT: s_mov_b32 s2, -1 764; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 765; GFX7LESS-NEXT: s_endpgm 766; 767; GFX8-LABEL: add_i64_constant: 768; GFX8: ; %bb.0: ; %entry 769; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 770; GFX8-NEXT: s_mov_b64 s[4:5], exec 771; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 772; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 773; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 774; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 775; 
GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 776; GFX8-NEXT: s_cbranch_execz .LBB4_2 777; GFX8-NEXT: ; %bb.1: 778; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 779; GFX8-NEXT: s_mul_i32 s4, s4, 5 780; GFX8-NEXT: v_mov_b32_e32 v0, s4 781; GFX8-NEXT: v_mov_b32_e32 v1, 0 782; GFX8-NEXT: s_mov_b32 m0, -1 783; GFX8-NEXT: s_waitcnt lgkmcnt(0) 784; GFX8-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] 785; GFX8-NEXT: s_waitcnt lgkmcnt(0) 786; GFX8-NEXT: .LBB4_2: 787; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 788; GFX8-NEXT: s_waitcnt lgkmcnt(0) 789; GFX8-NEXT: v_readfirstlane_b32 s2, v0 790; GFX8-NEXT: v_readfirstlane_b32 s3, v1 791; GFX8-NEXT: v_mov_b32_e32 v0, s2 792; GFX8-NEXT: v_mov_b32_e32 v1, s3 793; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v2, 5, v[0:1] 794; GFX8-NEXT: s_mov_b32 s3, 0xf000 795; GFX8-NEXT: s_mov_b32 s2, -1 796; GFX8-NEXT: s_nop 2 797; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 798; GFX8-NEXT: s_endpgm 799; 800; GFX9-LABEL: add_i64_constant: 801; GFX9: ; %bb.0: ; %entry 802; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 803; GFX9-NEXT: s_mov_b64 s[4:5], exec 804; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 805; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 806; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 807; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 808; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 809; GFX9-NEXT: s_cbranch_execz .LBB4_2 810; GFX9-NEXT: ; %bb.1: 811; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 812; GFX9-NEXT: s_mul_i32 s4, s4, 5 813; GFX9-NEXT: v_mov_b32_e32 v0, s4 814; GFX9-NEXT: v_mov_b32_e32 v1, 0 815; GFX9-NEXT: s_waitcnt lgkmcnt(0) 816; GFX9-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] 817; GFX9-NEXT: s_waitcnt lgkmcnt(0) 818; GFX9-NEXT: .LBB4_2: 819; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 820; GFX9-NEXT: s_waitcnt lgkmcnt(0) 821; GFX9-NEXT: v_readfirstlane_b32 s2, v0 822; GFX9-NEXT: v_readfirstlane_b32 s3, v1 823; GFX9-NEXT: v_mov_b32_e32 v0, s2 824; GFX9-NEXT: v_mov_b32_e32 v1, s3 825; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v2, 5, v[0:1] 826; GFX9-NEXT: 
s_mov_b32 s3, 0xf000 827; GFX9-NEXT: s_mov_b32 s2, -1 828; GFX9-NEXT: s_nop 2 829; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 830; GFX9-NEXT: s_endpgm 831; 832; GFX1064-LABEL: add_i64_constant: 833; GFX1064: ; %bb.0: ; %entry 834; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 835; GFX1064-NEXT: s_mov_b64 s[4:5], exec 836; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 837; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 838; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 839; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 840; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc 841; GFX1064-NEXT: s_cbranch_execz .LBB4_2 842; GFX1064-NEXT: ; %bb.1: 843; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 844; GFX1064-NEXT: v_mov_b32_e32 v1, 0 845; GFX1064-NEXT: s_mul_i32 s4, s4, 5 846; GFX1064-NEXT: v_mov_b32_e32 v0, s4 847; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 848; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 849; GFX1064-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] 850; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 851; GFX1064-NEXT: buffer_gl0_inv 852; GFX1064-NEXT: .LBB4_2: 853; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 854; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] 855; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 856; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 857; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v2, 5, s[2:3] 858; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 859; GFX1064-NEXT: s_mov_b32 s2, -1 860; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 861; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 862; GFX1064-NEXT: s_endpgm 863; 864; GFX1032-LABEL: add_i64_constant: 865; GFX1032: ; %bb.0: ; %entry 866; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 867; GFX1032-NEXT: s_mov_b32 s3, exec_lo 868; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 869; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s3, 0 870; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 871; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo 872; GFX1032-NEXT: s_cbranch_execz .LBB4_2 873; GFX1032-NEXT: ; %bb.1: 874; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 
875; GFX1032-NEXT: v_mov_b32_e32 v1, 0 876; GFX1032-NEXT: s_mul_i32 s3, s3, 5 877; GFX1032-NEXT: v_mov_b32_e32 v0, s3 878; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 879; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 880; GFX1032-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] 881; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 882; GFX1032-NEXT: buffer_gl0_inv 883; GFX1032-NEXT: .LBB4_2: 884; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 885; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 886; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 887; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 888; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s2, v2, 5, s[2:3] 889; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 890; GFX1032-NEXT: s_mov_b32 s2, -1 891; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 892; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 893; GFX1032-NEXT: s_endpgm 894entry: 895 %old = atomicrmw add i64 addrspace(3)* @local_var64, i64 5 acq_rel 896 store i64 %old, i64 addrspace(1)* %out 897 ret void 898} 899 900define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 %additive) { 901; 902; 903; GFX7LESS-LABEL: add_i64_uniform: 904; GFX7LESS: ; %bb.0: ; %entry 905; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec 906; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 907; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 908; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s7, v0 909; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 910; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 911; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc 912; GFX7LESS-NEXT: s_cbranch_execz .LBB5_2 913; GFX7LESS-NEXT: ; %bb.1: 914; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 915; GFX7LESS-NEXT: v_mov_b32_e32 v3, 0 916; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 917; GFX7LESS-NEXT: s_mul_i32 s7, s3, s6 918; GFX7LESS-NEXT: v_mov_b32_e32 v0, s6 919; GFX7LESS-NEXT: v_mul_hi_u32 v0, s2, v0 920; GFX7LESS-NEXT: s_mul_i32 s6, s2, s6 921; GFX7LESS-NEXT: v_add_i32_e32 v1, vcc, s7, v0 922; GFX7LESS-NEXT: v_mov_b32_e32 v0, s6 923; GFX7LESS-NEXT: s_mov_b32 m0, -1 924; 
GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 925; GFX7LESS-NEXT: ds_add_rtn_u64 v[0:1], v3, v[0:1] 926; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 927; GFX7LESS-NEXT: .LBB5_2: 928; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] 929; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 930; GFX7LESS-NEXT: s_mov_b32 s6, -1 931; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 932; GFX7LESS-NEXT: s_mov_b32 s4, s0 933; GFX7LESS-NEXT: s_mov_b32 s5, s1 934; GFX7LESS-NEXT: v_readfirstlane_b32 s0, v0 935; GFX7LESS-NEXT: v_readfirstlane_b32 s1, v1 936; GFX7LESS-NEXT: v_mul_lo_u32 v0, s3, v2 937; GFX7LESS-NEXT: v_mul_hi_u32 v1, s2, v2 938; GFX7LESS-NEXT: v_mul_lo_u32 v2, s2, v2 939; GFX7LESS-NEXT: v_add_i32_e32 v1, vcc, v1, v0 940; GFX7LESS-NEXT: v_mov_b32_e32 v3, s1 941; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s0, v2 942; GFX7LESS-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc 943; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 944; GFX7LESS-NEXT: s_endpgm 945; 946; GFX8-LABEL: add_i64_uniform: 947; GFX8: ; %bb.0: ; %entry 948; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 949; GFX8-NEXT: s_mov_b64 s[6:7], exec 950; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 951; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 952; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 953; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 954; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 955; GFX8-NEXT: s_cbranch_execz .LBB5_2 956; GFX8-NEXT: ; %bb.1: 957; GFX8-NEXT: s_bcnt1_i32_b64 s8, s[6:7] 958; GFX8-NEXT: v_mov_b32_e32 v0, s8 959; GFX8-NEXT: s_waitcnt lgkmcnt(0) 960; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[6:7], s2, v0, 0 961; GFX8-NEXT: s_mul_i32 s6, s3, s8 962; GFX8-NEXT: v_mov_b32_e32 v3, 0 963; GFX8-NEXT: v_add_u32_e32 v1, vcc, s6, v1 964; GFX8-NEXT: s_mov_b32 m0, -1 965; GFX8-NEXT: s_waitcnt lgkmcnt(0) 966; GFX8-NEXT: ds_add_rtn_u64 v[0:1], v3, v[0:1] 967; GFX8-NEXT: s_waitcnt lgkmcnt(0) 968; GFX8-NEXT: .LBB5_2: 969; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 970; GFX8-NEXT: s_waitcnt lgkmcnt(0) 971; GFX8-NEXT: v_readfirstlane_b32 s4, v0 972; GFX8-NEXT: 
v_readfirstlane_b32 s5, v1 973; GFX8-NEXT: v_mov_b32_e32 v0, s4 974; GFX8-NEXT: v_mov_b32_e32 v1, s5 975; GFX8-NEXT: v_mul_lo_u32 v3, s3, v2 976; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[2:3], s2, v2, v[0:1] 977; GFX8-NEXT: s_mov_b32 s7, 0xf000 978; GFX8-NEXT: s_mov_b32 s6, -1 979; GFX8-NEXT: s_mov_b32 s4, s0 980; GFX8-NEXT: s_mov_b32 s5, s1 981; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 982; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 983; GFX8-NEXT: s_endpgm 984; 985; GFX9-LABEL: add_i64_uniform: 986; GFX9: ; %bb.0: ; %entry 987; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 988; GFX9-NEXT: s_mov_b64 s[6:7], exec 989; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 990; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 991; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 992; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 993; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 994; GFX9-NEXT: s_cbranch_execz .LBB5_2 995; GFX9-NEXT: ; %bb.1: 996; GFX9-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 997; GFX9-NEXT: s_waitcnt lgkmcnt(0) 998; GFX9-NEXT: s_mul_i32 s7, s3, s6 999; GFX9-NEXT: s_mul_hi_u32 s8, s2, s6 1000; GFX9-NEXT: s_add_i32 s8, s8, s7 1001; GFX9-NEXT: s_mul_i32 s6, s2, s6 1002; GFX9-NEXT: v_mov_b32_e32 v0, s6 1003; GFX9-NEXT: v_mov_b32_e32 v1, s8 1004; GFX9-NEXT: v_mov_b32_e32 v3, 0 1005; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1006; GFX9-NEXT: ds_add_rtn_u64 v[0:1], v3, v[0:1] 1007; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1008; GFX9-NEXT: .LBB5_2: 1009; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 1010; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1011; GFX9-NEXT: v_readfirstlane_b32 s4, v0 1012; GFX9-NEXT: v_readfirstlane_b32 s5, v1 1013; GFX9-NEXT: v_mov_b32_e32 v0, s4 1014; GFX9-NEXT: v_mov_b32_e32 v1, s5 1015; GFX9-NEXT: v_mul_lo_u32 v3, s3, v2 1016; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[2:3], s2, v2, v[0:1] 1017; GFX9-NEXT: s_mov_b32 s7, 0xf000 1018; GFX9-NEXT: s_mov_b32 s6, -1 1019; GFX9-NEXT: s_mov_b32 s4, s0 1020; GFX9-NEXT: s_mov_b32 s5, s1 1021; GFX9-NEXT: v_add_u32_e32 v1, v3, v1 1022; GFX9-NEXT: buffer_store_dwordx2 v[0:1], 
off, s[4:7], 0 1023; GFX9-NEXT: s_endpgm 1024; 1025; GFX1064-LABEL: add_i64_uniform: 1026; GFX1064: ; %bb.0: ; %entry 1027; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1028; GFX1064-NEXT: s_mov_b64 s[6:7], exec 1029; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 1030; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 1031; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 1032; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 1033; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 1034; GFX1064-NEXT: s_cbranch_execz .LBB5_2 1035; GFX1064-NEXT: ; %bb.1: 1036; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 1037; GFX1064-NEXT: v_mov_b32_e32 v3, 0 1038; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1039; GFX1064-NEXT: s_mul_i32 s7, s3, s6 1040; GFX1064-NEXT: s_mul_hi_u32 s8, s2, s6 1041; GFX1064-NEXT: s_mul_i32 s6, s2, s6 1042; GFX1064-NEXT: s_add_i32 s8, s8, s7 1043; GFX1064-NEXT: v_mov_b32_e32 v0, s6 1044; GFX1064-NEXT: v_mov_b32_e32 v1, s8 1045; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1046; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 1047; GFX1064-NEXT: ds_add_rtn_u64 v[0:1], v3, v[0:1] 1048; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1049; GFX1064-NEXT: buffer_gl0_inv 1050; GFX1064-NEXT: .LBB5_2: 1051; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 1052; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 1053; GFX1064-NEXT: v_readfirstlane_b32 s4, v0 1054; GFX1064-NEXT: v_readfirstlane_b32 s5, v1 1055; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1056; GFX1064-NEXT: v_mul_lo_u32 v3, s3, v2 1057; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[2:3], s2, v2, s[4:5] 1058; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 1059; GFX1064-NEXT: s_mov_b32 s2, -1 1060; GFX1064-NEXT: v_add_nc_u32_e32 v1, v3, v1 1061; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1062; GFX1064-NEXT: s_endpgm 1063; 1064; GFX1032-LABEL: add_i64_uniform: 1065; GFX1032: ; %bb.0: ; %entry 1066; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1067; GFX1032-NEXT: s_mov_b32 s5, exec_lo 1068; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 1069; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 
0 1070; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 1071; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo 1072; GFX1032-NEXT: s_cbranch_execz .LBB5_2 1073; GFX1032-NEXT: ; %bb.1: 1074; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s5 1075; GFX1032-NEXT: v_mov_b32_e32 v3, 0 1076; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1077; GFX1032-NEXT: s_mul_i32 s6, s3, s5 1078; GFX1032-NEXT: s_mul_hi_u32 s7, s2, s5 1079; GFX1032-NEXT: s_mul_i32 s5, s2, s5 1080; GFX1032-NEXT: s_add_i32 s7, s7, s6 1081; GFX1032-NEXT: v_mov_b32_e32 v0, s5 1082; GFX1032-NEXT: v_mov_b32_e32 v1, s7 1083; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1084; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 1085; GFX1032-NEXT: ds_add_rtn_u64 v[0:1], v3, v[0:1] 1086; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1087; GFX1032-NEXT: buffer_gl0_inv 1088; GFX1032-NEXT: .LBB5_2: 1089; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 1090; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 1091; GFX1032-NEXT: v_readfirstlane_b32 s4, v0 1092; GFX1032-NEXT: v_readfirstlane_b32 s5, v1 1093; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1094; GFX1032-NEXT: v_mul_lo_u32 v3, s3, v2 1095; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 1096; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s2, s2, v2, s[4:5] 1097; GFX1032-NEXT: s_mov_b32 s2, -1 1098; GFX1032-NEXT: v_add_nc_u32_e32 v1, v3, v1 1099; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1100; GFX1032-NEXT: s_endpgm 1101entry: 1102 %old = atomicrmw add i64 addrspace(3)* @local_var64, i64 %additive acq_rel 1103 store i64 %old, i64 addrspace(1)* %out 1104 ret void 1105} 1106 1107define amdgpu_kernel void @add_i64_varying(i64 addrspace(1)* %out) { 1108; 1109; 1110; GFX7LESS-LABEL: add_i64_varying: 1111; GFX7LESS: ; %bb.0: ; %entry 1112; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1113; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 1114; GFX7LESS-NEXT: s_mov_b32 m0, -1 1115; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1116; GFX7LESS-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] 1117; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1118; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 
1119; GFX7LESS-NEXT: s_mov_b32 s2, -1 1120; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1121; GFX7LESS-NEXT: s_endpgm 1122; 1123; GFX8-LABEL: add_i64_varying: 1124; GFX8: ; %bb.0: ; %entry 1125; GFX8-NEXT: v_mov_b32_e32 v1, 0 1126; GFX8-NEXT: s_mov_b32 m0, -1 1127; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1128; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1129; GFX8-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] 1130; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1131; GFX8-NEXT: s_mov_b32 s3, 0xf000 1132; GFX8-NEXT: s_mov_b32 s2, -1 1133; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1134; GFX8-NEXT: s_endpgm 1135; 1136; GFX9-LABEL: add_i64_varying: 1137; GFX9: ; %bb.0: ; %entry 1138; GFX9-NEXT: v_mov_b32_e32 v1, 0 1139; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1140; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1141; GFX9-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] 1142; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1143; GFX9-NEXT: s_mov_b32 s3, 0xf000 1144; GFX9-NEXT: s_mov_b32 s2, -1 1145; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1146; GFX9-NEXT: s_endpgm 1147; 1148; GFX10-LABEL: add_i64_varying: 1149; GFX10: ; %bb.0: ; %entry 1150; GFX10-NEXT: v_mov_b32_e32 v1, 0 1151; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1152; GFX10-NEXT: s_mov_b32 s3, 0x31016000 1153; GFX10-NEXT: s_mov_b32 s2, -1 1154; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1155; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1156; GFX10-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] 1157; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1158; GFX10-NEXT: buffer_gl0_inv 1159; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1160; GFX10-NEXT: s_endpgm 1161entry: 1162 %lane = call i32 @llvm.amdgcn.workitem.id.x() 1163 %zext = zext i32 %lane to i64 1164 %old = atomicrmw add i64 addrspace(3)* @local_var64, i64 %zext acq_rel 1165 store i64 %old, i64 addrspace(1)* %out 1166 ret void 1167} 1168 1169define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out) { 1170; 1171; 1172; GFX7LESS-LABEL: sub_i32_constant: 1173; GFX7LESS: ; 
%bb.0: ; %entry 1174; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec 1175; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1176; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 1177; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 1178; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1179; GFX7LESS-NEXT: ; implicit-def: $vgpr1 1180; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc 1181; GFX7LESS-NEXT: s_cbranch_execz .LBB7_2 1182; GFX7LESS-NEXT: ; %bb.1: 1183; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1184; GFX7LESS-NEXT: s_mul_i32 s2, s2, 5 1185; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 1186; GFX7LESS-NEXT: v_mov_b32_e32 v2, s2 1187; GFX7LESS-NEXT: s_mov_b32 m0, -1 1188; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1189; GFX7LESS-NEXT: ds_sub_rtn_u32 v1, v1, v2 1190; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1191; GFX7LESS-NEXT: .LBB7_2: 1192; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] 1193; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1194; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v1 1195; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1196; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 1197; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 1198; GFX7LESS-NEXT: s_mov_b32 s2, -1 1199; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 1200; GFX7LESS-NEXT: s_endpgm 1201; 1202; GFX8-LABEL: sub_i32_constant: 1203; GFX8: ; %bb.0: ; %entry 1204; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1205; GFX8-NEXT: s_mov_b64 s[2:3], exec 1206; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1207; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 1208; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1209; GFX8-NEXT: ; implicit-def: $vgpr1 1210; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 1211; GFX8-NEXT: s_cbranch_execz .LBB7_2 1212; GFX8-NEXT: ; %bb.1: 1213; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1214; GFX8-NEXT: s_mul_i32 s2, s2, 5 1215; GFX8-NEXT: v_mov_b32_e32 v1, 0 1216; GFX8-NEXT: v_mov_b32_e32 v2, s2 1217; GFX8-NEXT: s_mov_b32 m0, -1 1218; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1219; GFX8-NEXT: ds_sub_rtn_u32 v1, v1, v2 1220; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1221; 
GFX8-NEXT: .LBB7_2: 1222; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 1223; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1224; GFX8-NEXT: v_readfirstlane_b32 s2, v1 1225; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1226; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s2, v0 1227; GFX8-NEXT: s_mov_b32 s3, 0xf000 1228; GFX8-NEXT: s_mov_b32 s2, -1 1229; GFX8-NEXT: s_nop 0 1230; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 1231; GFX8-NEXT: s_endpgm 1232; 1233; GFX9-LABEL: sub_i32_constant: 1234; GFX9: ; %bb.0: ; %entry 1235; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1236; GFX9-NEXT: s_mov_b64 s[2:3], exec 1237; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1238; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 1239; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1240; GFX9-NEXT: ; implicit-def: $vgpr1 1241; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 1242; GFX9-NEXT: s_cbranch_execz .LBB7_2 1243; GFX9-NEXT: ; %bb.1: 1244; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1245; GFX9-NEXT: s_mul_i32 s2, s2, 5 1246; GFX9-NEXT: v_mov_b32_e32 v1, 0 1247; GFX9-NEXT: v_mov_b32_e32 v2, s2 1248; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1249; GFX9-NEXT: ds_sub_rtn_u32 v1, v1, v2 1250; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1251; GFX9-NEXT: .LBB7_2: 1252; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 1253; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1254; GFX9-NEXT: v_readfirstlane_b32 s2, v1 1255; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1256; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 1257; GFX9-NEXT: s_mov_b32 s3, 0xf000 1258; GFX9-NEXT: s_mov_b32 s2, -1 1259; GFX9-NEXT: s_nop 0 1260; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 1261; GFX9-NEXT: s_endpgm 1262; 1263; GFX1064-LABEL: sub_i32_constant: 1264; GFX1064: ; %bb.0: ; %entry 1265; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1266; GFX1064-NEXT: s_mov_b64 s[2:3], exec 1267; GFX1064-NEXT: ; implicit-def: $vgpr1 1268; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1269; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 1270; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1271; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 1272; 
GFX1064-NEXT: s_cbranch_execz .LBB7_2 1273; GFX1064-NEXT: ; %bb.1: 1274; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1275; GFX1064-NEXT: v_mov_b32_e32 v1, 0 1276; GFX1064-NEXT: s_mul_i32 s2, s2, 5 1277; GFX1064-NEXT: v_mov_b32_e32 v2, s2 1278; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1279; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 1280; GFX1064-NEXT: ds_sub_rtn_u32 v1, v1, v2 1281; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1282; GFX1064-NEXT: buffer_gl0_inv 1283; GFX1064-NEXT: .LBB7_2: 1284; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 1285; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 1286; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 1287; GFX1064-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1288; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 1289; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s2, v0 1290; GFX1064-NEXT: s_mov_b32 s2, -1 1291; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1292; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 1293; GFX1064-NEXT: s_endpgm 1294; 1295; GFX1032-LABEL: sub_i32_constant: 1296; GFX1032: ; %bb.0: ; %entry 1297; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1298; GFX1032-NEXT: s_mov_b32 s3, exec_lo 1299; GFX1032-NEXT: ; implicit-def: $vgpr1 1300; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 1301; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 1302; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo 1303; GFX1032-NEXT: s_cbranch_execz .LBB7_2 1304; GFX1032-NEXT: ; %bb.1: 1305; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 1306; GFX1032-NEXT: v_mov_b32_e32 v1, 0 1307; GFX1032-NEXT: s_mul_i32 s3, s3, 5 1308; GFX1032-NEXT: v_mov_b32_e32 v2, s3 1309; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1310; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 1311; GFX1032-NEXT: ds_sub_rtn_u32 v1, v1, v2 1312; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1313; GFX1032-NEXT: buffer_gl0_inv 1314; GFX1032-NEXT: .LBB7_2: 1315; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 1316; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 1317; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 1318; GFX1032-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1319; GFX1032-NEXT: s_mov_b32 
s3, 0x31016000 1320; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s2, v0 1321; GFX1032-NEXT: s_mov_b32 s2, -1 1322; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1323; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 1324; GFX1032-NEXT: s_endpgm 1325entry: 1326 %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 5 acq_rel 1327 store i32 %old, i32 addrspace(1)* %out 1328 ret void 1329} 1330 1331define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 %subitive) { 1332; 1333; 1334; GFX7LESS-LABEL: sub_i32_uniform: 1335; GFX7LESS: ; %bb.0: ; %entry 1336; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec 1337; GFX7LESS-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 1338; GFX7LESS-NEXT: s_load_dword s6, s[0:1], 0xb 1339; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 1340; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 1341; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1342; GFX7LESS-NEXT: ; implicit-def: $vgpr1 1343; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc 1344; GFX7LESS-NEXT: s_cbranch_execz .LBB8_2 1345; GFX7LESS-NEXT: ; %bb.1: 1346; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1347; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1348; GFX7LESS-NEXT: s_mul_i32 s2, s6, s2 1349; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 1350; GFX7LESS-NEXT: v_mov_b32_e32 v2, s2 1351; GFX7LESS-NEXT: s_mov_b32 m0, -1 1352; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1353; GFX7LESS-NEXT: ds_sub_rtn_u32 v1, v1, v2 1354; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1355; GFX7LESS-NEXT: .LBB8_2: 1356; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1] 1357; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1358; GFX7LESS-NEXT: v_readfirstlane_b32 s0, v1 1359; GFX7LESS-NEXT: v_mul_lo_u32 v0, s6, v0 1360; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 1361; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 1362; GFX7LESS-NEXT: s_mov_b32 s6, -1 1363; GFX7LESS-NEXT: buffer_store_dword v0, off, s[4:7], 0 1364; GFX7LESS-NEXT: s_endpgm 1365; 1366; GFX8-LABEL: sub_i32_uniform: 1367; GFX8: ; %bb.0: ; %entry 1368; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 1369; GFX8-NEXT: 
s_load_dword s6, s[0:1], 0x2c 1370; GFX8-NEXT: s_mov_b64 s[2:3], exec 1371; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1372; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 1373; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1374; GFX8-NEXT: ; implicit-def: $vgpr1 1375; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc 1376; GFX8-NEXT: s_cbranch_execz .LBB8_2 1377; GFX8-NEXT: ; %bb.1: 1378; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1379; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1380; GFX8-NEXT: s_mul_i32 s2, s6, s2 1381; GFX8-NEXT: v_mov_b32_e32 v1, 0 1382; GFX8-NEXT: v_mov_b32_e32 v2, s2 1383; GFX8-NEXT: s_mov_b32 m0, -1 1384; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1385; GFX8-NEXT: ds_sub_rtn_u32 v1, v1, v2 1386; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1387; GFX8-NEXT: .LBB8_2: 1388; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] 1389; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1390; GFX8-NEXT: v_mul_lo_u32 v0, s6, v0 1391; GFX8-NEXT: v_readfirstlane_b32 s0, v1 1392; GFX8-NEXT: s_mov_b32 s7, 0xf000 1393; GFX8-NEXT: s_mov_b32 s6, -1 1394; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s0, v0 1395; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0 1396; GFX8-NEXT: s_endpgm 1397; 1398; GFX9-LABEL: sub_i32_uniform: 1399; GFX9: ; %bb.0: ; %entry 1400; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 1401; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c 1402; GFX9-NEXT: s_mov_b64 s[2:3], exec 1403; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1404; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 1405; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1406; GFX9-NEXT: ; implicit-def: $vgpr1 1407; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc 1408; GFX9-NEXT: s_cbranch_execz .LBB8_2 1409; GFX9-NEXT: ; %bb.1: 1410; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1411; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1412; GFX9-NEXT: s_mul_i32 s2, s6, s2 1413; GFX9-NEXT: v_mov_b32_e32 v1, 0 1414; GFX9-NEXT: v_mov_b32_e32 v2, s2 1415; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1416; GFX9-NEXT: ds_sub_rtn_u32 v1, v1, v2 1417; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1418; GFX9-NEXT: .LBB8_2: 1419; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] 1420; 
GFX9-NEXT: s_waitcnt lgkmcnt(0) 1421; GFX9-NEXT: v_mul_lo_u32 v0, s6, v0 1422; GFX9-NEXT: v_readfirstlane_b32 s0, v1 1423; GFX9-NEXT: s_mov_b32 s7, 0xf000 1424; GFX9-NEXT: s_mov_b32 s6, -1 1425; GFX9-NEXT: v_sub_u32_e32 v0, s0, v0 1426; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 1427; GFX9-NEXT: s_endpgm 1428; 1429; GFX1064-LABEL: sub_i32_uniform: 1430; GFX1064: ; %bb.0: ; %entry 1431; GFX1064-NEXT: s_clause 0x1 1432; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 1433; GFX1064-NEXT: s_load_dword s6, s[0:1], 0x2c 1434; GFX1064-NEXT: s_mov_b64 s[2:3], exec 1435; GFX1064-NEXT: ; implicit-def: $vgpr1 1436; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1437; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 1438; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1439; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc 1440; GFX1064-NEXT: s_cbranch_execz .LBB8_2 1441; GFX1064-NEXT: ; %bb.1: 1442; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1443; GFX1064-NEXT: v_mov_b32_e32 v1, 0 1444; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1445; GFX1064-NEXT: s_mul_i32 s2, s6, s2 1446; GFX1064-NEXT: v_mov_b32_e32 v2, s2 1447; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1448; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 1449; GFX1064-NEXT: ds_sub_rtn_u32 v1, v1, v2 1450; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1451; GFX1064-NEXT: buffer_gl0_inv 1452; GFX1064-NEXT: .LBB8_2: 1453; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 1454; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] 1455; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1456; GFX1064-NEXT: v_mul_lo_u32 v0, s6, v0 1457; GFX1064-NEXT: v_readfirstlane_b32 s0, v1 1458; GFX1064-NEXT: s_mov_b32 s7, 0x31016000 1459; GFX1064-NEXT: s_mov_b32 s6, -1 1460; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s0, v0 1461; GFX1064-NEXT: buffer_store_dword v0, off, s[4:7], 0 1462; GFX1064-NEXT: s_endpgm 1463; 1464; GFX1032-LABEL: sub_i32_uniform: 1465; GFX1032: ; %bb.0: ; %entry 1466; GFX1032-NEXT: s_clause 0x1 1467; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 1468; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x2c 
1469; GFX1032-NEXT: s_mov_b32 s3, exec_lo 1470; GFX1032-NEXT: ; implicit-def: $vgpr1 1471; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 1472; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 1473; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo 1474; GFX1032-NEXT: s_cbranch_execz .LBB8_2 1475; GFX1032-NEXT: ; %bb.1: 1476; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s3 1477; GFX1032-NEXT: v_mov_b32_e32 v1, 0 1478; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1479; GFX1032-NEXT: s_mul_i32 s1, s2, s1 1480; GFX1032-NEXT: v_mov_b32_e32 v2, s1 1481; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1482; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 1483; GFX1032-NEXT: ds_sub_rtn_u32 v1, v1, v2 1484; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1485; GFX1032-NEXT: buffer_gl0_inv 1486; GFX1032-NEXT: .LBB8_2: 1487; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 1488; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 1489; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1490; GFX1032-NEXT: v_mul_lo_u32 v0, s2, v0 1491; GFX1032-NEXT: v_readfirstlane_b32 s0, v1 1492; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 1493; GFX1032-NEXT: s_mov_b32 s6, -1 1494; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s0, v0 1495; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0 1496; GFX1032-NEXT: s_endpgm 1497entry: 1498 %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 %subitive acq_rel 1499 store i32 %old, i32 addrspace(1)* %out 1500 ret void 1501} 1502 1503define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) { 1504; 1505; 1506; GFX7LESS-LABEL: sub_i32_varying: 1507; GFX7LESS: ; %bb.0: ; %entry 1508; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1509; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 1510; GFX7LESS-NEXT: s_mov_b32 m0, -1 1511; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1512; GFX7LESS-NEXT: ds_sub_rtn_u32 v0, v1, v0 1513; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1514; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 1515; GFX7LESS-NEXT: s_mov_b32 s2, -1 1516; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 1517; GFX7LESS-NEXT: s_endpgm 1518; 1519; GFX8-LABEL: sub_i32_varying: 
1520; GFX8: ; %bb.0: ; %entry 1521; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1522; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 1523; GFX8-NEXT: v_mov_b32_e32 v1, 0 1524; GFX8-NEXT: s_mov_b64 exec, s[2:3] 1525; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 1526; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 1527; GFX8-NEXT: v_mov_b32_e32 v2, v0 1528; GFX8-NEXT: s_not_b64 exec, exec 1529; GFX8-NEXT: v_mov_b32_e32 v2, 0 1530; GFX8-NEXT: s_not_b64 exec, exec 1531; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 1532; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 1533; GFX8-NEXT: s_nop 1 1534; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 1535; GFX8-NEXT: s_nop 1 1536; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 1537; GFX8-NEXT: s_nop 1 1538; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 1539; GFX8-NEXT: s_nop 1 1540; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 1541; GFX8-NEXT: s_nop 1 1542; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 1543; GFX8-NEXT: v_readlane_b32 s4, v2, 63 1544; GFX8-NEXT: s_nop 0 1545; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 1546; GFX8-NEXT: s_mov_b64 exec, s[2:3] 1547; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 1548; GFX8-NEXT: ; implicit-def: $vgpr0 1549; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 1550; GFX8-NEXT: s_cbranch_execz .LBB9_2 1551; GFX8-NEXT: ; %bb.1: 1552; GFX8-NEXT: v_mov_b32_e32 v0, 0 1553; GFX8-NEXT: v_mov_b32_e32 v3, s4 1554; GFX8-NEXT: s_mov_b32 m0, -1 1555; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1556; GFX8-NEXT: ds_sub_rtn_u32 v0, v0, v3 1557; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1558; GFX8-NEXT: .LBB9_2: 1559; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 1560; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1561; GFX8-NEXT: v_readfirstlane_b32 s2, v0 1562; GFX8-NEXT: v_mov_b32_e32 v0, v1 
1563; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s2, v0 1564; GFX8-NEXT: s_mov_b32 s3, 0xf000 1565; GFX8-NEXT: s_mov_b32 s2, -1 1566; GFX8-NEXT: s_nop 0 1567; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 1568; GFX8-NEXT: s_endpgm 1569; 1570; GFX9-LABEL: sub_i32_varying: 1571; GFX9: ; %bb.0: ; %entry 1572; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1573; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 1574; GFX9-NEXT: v_mov_b32_e32 v1, 0 1575; GFX9-NEXT: s_mov_b64 exec, s[2:3] 1576; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 1577; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 1578; GFX9-NEXT: v_mov_b32_e32 v2, v0 1579; GFX9-NEXT: s_not_b64 exec, exec 1580; GFX9-NEXT: v_mov_b32_e32 v2, 0 1581; GFX9-NEXT: s_not_b64 exec, exec 1582; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 1583; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 1584; GFX9-NEXT: s_nop 1 1585; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 1586; GFX9-NEXT: s_nop 1 1587; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 1588; GFX9-NEXT: s_nop 1 1589; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 1590; GFX9-NEXT: s_nop 1 1591; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 1592; GFX9-NEXT: s_nop 1 1593; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 1594; GFX9-NEXT: v_readlane_b32 s4, v2, 63 1595; GFX9-NEXT: s_nop 0 1596; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 1597; GFX9-NEXT: s_mov_b64 exec, s[2:3] 1598; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 1599; GFX9-NEXT: ; implicit-def: $vgpr0 1600; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 1601; GFX9-NEXT: s_cbranch_execz .LBB9_2 1602; GFX9-NEXT: ; %bb.1: 1603; GFX9-NEXT: v_mov_b32_e32 v0, 0 1604; GFX9-NEXT: v_mov_b32_e32 v3, s4 1605; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1606; GFX9-NEXT: ds_sub_rtn_u32 v0, v0, v3 1607; GFX9-NEXT: s_waitcnt 
lgkmcnt(0) 1608; GFX9-NEXT: .LBB9_2: 1609; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 1610; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1611; GFX9-NEXT: v_readfirstlane_b32 s2, v0 1612; GFX9-NEXT: v_mov_b32_e32 v0, v1 1613; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 1614; GFX9-NEXT: s_mov_b32 s3, 0xf000 1615; GFX9-NEXT: s_mov_b32 s2, -1 1616; GFX9-NEXT: s_nop 0 1617; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 1618; GFX9-NEXT: s_endpgm 1619; 1620; GFX1064-LABEL: sub_i32_varying: 1621; GFX1064: ; %bb.0: ; %entry 1622; GFX1064-NEXT: v_mov_b32_e32 v1, v0 1623; GFX1064-NEXT: s_not_b64 exec, exec 1624; GFX1064-NEXT: v_mov_b32_e32 v1, 0 1625; GFX1064-NEXT: s_not_b64 exec, exec 1626; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 1627; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 1628; GFX1064-NEXT: v_mov_b32_e32 v3, 0 1629; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 1630; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 1631; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 1632; GFX1064-NEXT: v_mov_b32_e32 v2, v1 1633; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 1634; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 1635; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 1636; GFX1064-NEXT: v_mov_b32_e32 v2, s4 1637; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 1638; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 1639; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 1640; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 1641; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1642; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 1643; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 1644; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 1645; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 1646; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 1647; GFX1064-NEXT: 
s_or_saveexec_b64 s[2:3], -1 1648; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 1649; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 1650; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 1651; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 1652; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 1653; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 1654; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 1655; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 1656; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1657; GFX1064-NEXT: s_mov_b32 s2, -1 1658; GFX1064-NEXT: ; implicit-def: $vgpr0 1659; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 1660; GFX1064-NEXT: s_cbranch_execz .LBB9_2 1661; GFX1064-NEXT: ; %bb.1: 1662; GFX1064-NEXT: v_mov_b32_e32 v0, 0 1663; GFX1064-NEXT: v_mov_b32_e32 v4, s7 1664; GFX1064-NEXT: s_mov_b32 s3, s7 1665; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1666; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 1667; GFX1064-NEXT: ds_sub_rtn_u32 v0, v0, v4 1668; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1669; GFX1064-NEXT: buffer_gl0_inv 1670; GFX1064-NEXT: .LBB9_2: 1671; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 1672; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 1673; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 1674; GFX1064-NEXT: v_mov_b32_e32 v0, v3 1675; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s3, v0 1676; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 1677; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1678; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 1679; GFX1064-NEXT: s_endpgm 1680; 1681; GFX1032-LABEL: sub_i32_varying: 1682; GFX1032: ; %bb.0: ; %entry 1683; GFX1032-NEXT: v_mov_b32_e32 v1, v0 1684; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 1685; GFX1032-NEXT: v_mov_b32_e32 v1, 0 1686; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 1687; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 1688; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 1689; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 1690; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf 
bank_mask:0xf bound_ctrl:1 1691; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 1692; GFX1032-NEXT: v_mov_b32_e32 v2, v1 1693; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 1694; GFX1032-NEXT: s_mov_b32 exec_lo, s2 1695; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1696; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 1697; GFX1032-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 1698; GFX1032-NEXT: v_mov_b32_e32 v3, 0 1699; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 1700; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 1701; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 1702; GFX1032-NEXT: s_mov_b32 exec_lo, s2 1703; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 1704; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 1705; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 1706; GFX1032-NEXT: s_mov_b32 exec_lo, s2 1707; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 1708; GFX1032-NEXT: s_mov_b32 s2, -1 1709; GFX1032-NEXT: ; implicit-def: $vgpr0 1710; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 1711; GFX1032-NEXT: s_cbranch_execz .LBB9_2 1712; GFX1032-NEXT: ; %bb.1: 1713; GFX1032-NEXT: v_mov_b32_e32 v0, 0 1714; GFX1032-NEXT: v_mov_b32_e32 v4, s4 1715; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1716; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 1717; GFX1032-NEXT: ds_sub_rtn_u32 v0, v0, v4 1718; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1719; GFX1032-NEXT: buffer_gl0_inv 1720; GFX1032-NEXT: .LBB9_2: 1721; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 1722; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 1723; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 1724; GFX1032-NEXT: v_mov_b32_e32 v0, v3 1725; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s3, v0 1726; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 1727; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1728; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 1729; GFX1032-NEXT: s_endpgm 1730entry: 1731 %lane = call i32 @llvm.amdgcn.workitem.id.x() 1732 %old = atomicrmw sub i32 addrspace(3)* 
@local_var32, i32 %lane acq_rel 1733 store i32 %old, i32 addrspace(1)* %out 1734 ret void 1735} 1736 1737define amdgpu_kernel void @sub_i32_varying_nouse() { 1738; GFX7LESS-LABEL: sub_i32_varying_nouse: 1739; GFX7LESS: ; %bb.0: ; %entry 1740; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 1741; GFX7LESS-NEXT: s_mov_b32 m0, -1 1742; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1743; GFX7LESS-NEXT: ds_sub_u32 v1, v0 1744; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1745; GFX7LESS-NEXT: s_endpgm 1746; 1747; GFX8-LABEL: sub_i32_varying_nouse: 1748; GFX8: ; %bb.0: ; %entry 1749; GFX8-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 1750; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 1751; GFX8-NEXT: v_mov_b32_e32 v1, v0 1752; GFX8-NEXT: s_not_b64 exec, exec 1753; GFX8-NEXT: v_mov_b32_e32 v1, 0 1754; GFX8-NEXT: s_not_b64 exec, exec 1755; GFX8-NEXT: s_or_saveexec_b64 s[0:1], -1 1756; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 1757; GFX8-NEXT: s_nop 1 1758; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 1759; GFX8-NEXT: s_nop 1 1760; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 1761; GFX8-NEXT: s_nop 1 1762; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 1763; GFX8-NEXT: s_nop 1 1764; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf 1765; GFX8-NEXT: s_nop 1 1766; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf 1767; GFX8-NEXT: v_readlane_b32 s2, v1, 63 1768; GFX8-NEXT: s_mov_b64 exec, s[0:1] 1769; GFX8-NEXT: s_mov_b32 s0, s2 1770; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 1771; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 1772; GFX8-NEXT: s_cbranch_execz .LBB10_2 1773; GFX8-NEXT: ; %bb.1: 1774; GFX8-NEXT: v_mov_b32_e32 v0, 0 1775; GFX8-NEXT: v_mov_b32_e32 v2, s0 1776; GFX8-NEXT: s_mov_b32 m0, -1 1777; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1778; GFX8-NEXT: ds_sub_u32 v0, v2 1779; 
GFX8-NEXT: s_waitcnt lgkmcnt(0) 1780; GFX8-NEXT: .LBB10_2: 1781; GFX8-NEXT: s_endpgm 1782; 1783; GFX9-LABEL: sub_i32_varying_nouse: 1784; GFX9: ; %bb.0: ; %entry 1785; GFX9-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 1786; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 1787; GFX9-NEXT: v_mov_b32_e32 v1, v0 1788; GFX9-NEXT: s_not_b64 exec, exec 1789; GFX9-NEXT: v_mov_b32_e32 v1, 0 1790; GFX9-NEXT: s_not_b64 exec, exec 1791; GFX9-NEXT: s_or_saveexec_b64 s[0:1], -1 1792; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 1793; GFX9-NEXT: s_nop 1 1794; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 1795; GFX9-NEXT: s_nop 1 1796; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 1797; GFX9-NEXT: s_nop 1 1798; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 1799; GFX9-NEXT: s_nop 1 1800; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf 1801; GFX9-NEXT: s_nop 1 1802; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf 1803; GFX9-NEXT: v_readlane_b32 s2, v1, 63 1804; GFX9-NEXT: s_mov_b64 exec, s[0:1] 1805; GFX9-NEXT: s_mov_b32 s0, s2 1806; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 1807; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 1808; GFX9-NEXT: s_cbranch_execz .LBB10_2 1809; GFX9-NEXT: ; %bb.1: 1810; GFX9-NEXT: v_mov_b32_e32 v0, 0 1811; GFX9-NEXT: v_mov_b32_e32 v2, s0 1812; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1813; GFX9-NEXT: ds_sub_u32 v0, v2 1814; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1815; GFX9-NEXT: .LBB10_2: 1816; GFX9-NEXT: s_endpgm 1817; 1818; GFX1064-LABEL: sub_i32_varying_nouse: 1819; GFX1064: ; %bb.0: ; %entry 1820; GFX1064-NEXT: v_mov_b32_e32 v1, v0 1821; GFX1064-NEXT: s_not_b64 exec, exec 1822; GFX1064-NEXT: v_mov_b32_e32 v1, 0 1823; GFX1064-NEXT: s_not_b64 exec, exec 1824; GFX1064-NEXT: s_or_saveexec_b64 s[0:1], -1 1825; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 
row_mask:0xf bank_mask:0xf bound_ctrl:1 1826; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 1827; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 1828; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 1829; GFX1064-NEXT: v_mov_b32_e32 v2, v1 1830; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 1831; GFX1064-NEXT: v_add_nc_u32_e32 v1, v1, v2 1832; GFX1064-NEXT: s_mov_b64 exec, s[0:1] 1833; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 1834; GFX1064-NEXT: s_or_saveexec_b64 s[0:1], -1 1835; GFX1064-NEXT: v_readlane_b32 s2, v1, 0 1836; GFX1064-NEXT: v_readlane_b32 s3, v1, 32 1837; GFX1064-NEXT: s_mov_b64 exec, s[0:1] 1838; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 1839; GFX1064-NEXT: s_add_i32 s0, s2, s3 1840; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1841; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc 1842; GFX1064-NEXT: s_cbranch_execz .LBB10_2 1843; GFX1064-NEXT: ; %bb.1: 1844; GFX1064-NEXT: v_mov_b32_e32 v0, 0 1845; GFX1064-NEXT: v_mov_b32_e32 v3, s0 1846; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1847; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 1848; GFX1064-NEXT: ds_sub_u32 v0, v3 1849; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1850; GFX1064-NEXT: buffer_gl0_inv 1851; GFX1064-NEXT: .LBB10_2: 1852; GFX1064-NEXT: s_endpgm 1853; 1854; GFX1032-LABEL: sub_i32_varying_nouse: 1855; GFX1032: ; %bb.0: ; %entry 1856; GFX1032-NEXT: v_mov_b32_e32 v1, v0 1857; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 1858; GFX1032-NEXT: v_mov_b32_e32 v1, 0 1859; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 1860; GFX1032-NEXT: s_or_saveexec_b32 s0, -1 1861; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 1862; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 1863; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 1864; 
GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 1865; GFX1032-NEXT: v_mov_b32_e32 v2, v1 1866; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 1867; GFX1032-NEXT: v_add_nc_u32_e32 v1, v1, v2 1868; GFX1032-NEXT: s_mov_b32 exec_lo, s0 1869; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 1870; GFX1032-NEXT: v_mov_b32_e32 v0, v1 1871; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 1872; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo 1873; GFX1032-NEXT: s_cbranch_execz .LBB10_2 1874; GFX1032-NEXT: ; %bb.1: 1875; GFX1032-NEXT: v_mov_b32_e32 v3, 0 1876; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1877; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 1878; GFX1032-NEXT: ds_sub_u32 v3, v0 1879; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1880; GFX1032-NEXT: buffer_gl0_inv 1881; GFX1032-NEXT: .LBB10_2: 1882; GFX1032-NEXT: s_endpgm 1883entry: 1884 %lane = call i32 @llvm.amdgcn.workitem.id.x() 1885 %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 %lane acq_rel 1886 ret void 1887} 1888 1889define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) { 1890; 1891; 1892; GFX7LESS-LABEL: sub_i64_constant: 1893; GFX7LESS: ; %bb.0: ; %entry 1894; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec 1895; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1896; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 1897; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s5, v0 1898; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 1899; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 1900; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc 1901; GFX7LESS-NEXT: s_cbranch_execz .LBB11_2 1902; GFX7LESS-NEXT: ; %bb.1: 1903; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 1904; GFX7LESS-NEXT: s_mul_i32 s4, s4, 5 1905; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 1906; GFX7LESS-NEXT: v_mov_b32_e32 v0, s4 1907; GFX7LESS-NEXT: s_mov_b32 m0, -1 1908; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1909; GFX7LESS-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] 1910; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1911; GFX7LESS-NEXT: .LBB11_2: 
1912; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] 1913; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1914; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v0 1915; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1 1916; GFX7LESS-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 1917; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v2 1918; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 1919; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4 1920; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 1921; GFX7LESS-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc 1922; GFX7LESS-NEXT: s_mov_b32 s2, -1 1923; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1924; GFX7LESS-NEXT: s_endpgm 1925; 1926; GFX8-LABEL: sub_i64_constant: 1927; GFX8: ; %bb.0: ; %entry 1928; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1929; GFX8-NEXT: s_mov_b64 s[4:5], exec 1930; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 1931; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 1932; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 1933; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 1934; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 1935; GFX8-NEXT: s_cbranch_execz .LBB11_2 1936; GFX8-NEXT: ; %bb.1: 1937; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 1938; GFX8-NEXT: s_mul_i32 s4, s4, 5 1939; GFX8-NEXT: v_mov_b32_e32 v0, s4 1940; GFX8-NEXT: v_mov_b32_e32 v1, 0 1941; GFX8-NEXT: s_mov_b32 m0, -1 1942; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1943; GFX8-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] 1944; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1945; GFX8-NEXT: .LBB11_2: 1946; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 1947; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1948; GFX8-NEXT: v_readfirstlane_b32 s2, v0 1949; GFX8-NEXT: v_readfirstlane_b32 s3, v1 1950; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v2 1951; GFX8-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 1952; GFX8-NEXT: v_mov_b32_e32 v2, s3 1953; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s2, v0 1954; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc 1955; GFX8-NEXT: s_mov_b32 s3, 0xf000 1956; GFX8-NEXT: s_mov_b32 s2, -1 1957; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1958; GFX8-NEXT: s_endpgm 1959; 1960; 
GFX9-LABEL: sub_i64_constant: 1961; GFX9: ; %bb.0: ; %entry 1962; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1963; GFX9-NEXT: s_mov_b64 s[4:5], exec 1964; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 1965; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 1966; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 1967; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 1968; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 1969; GFX9-NEXT: s_cbranch_execz .LBB11_2 1970; GFX9-NEXT: ; %bb.1: 1971; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 1972; GFX9-NEXT: s_mul_i32 s4, s4, 5 1973; GFX9-NEXT: v_mov_b32_e32 v0, s4 1974; GFX9-NEXT: v_mov_b32_e32 v1, 0 1975; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1976; GFX9-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] 1977; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1978; GFX9-NEXT: .LBB11_2: 1979; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 1980; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1981; GFX9-NEXT: v_readfirstlane_b32 s2, v0 1982; GFX9-NEXT: v_readfirstlane_b32 s3, v1 1983; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v2 1984; GFX9-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 1985; GFX9-NEXT: v_mov_b32_e32 v2, s3 1986; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s2, v0 1987; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc 1988; GFX9-NEXT: s_mov_b32 s3, 0xf000 1989; GFX9-NEXT: s_mov_b32 s2, -1 1990; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1991; GFX9-NEXT: s_endpgm 1992; 1993; GFX1064-LABEL: sub_i64_constant: 1994; GFX1064: ; %bb.0: ; %entry 1995; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1996; GFX1064-NEXT: s_mov_b64 s[4:5], exec 1997; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 1998; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 1999; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 2000; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 2001; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc 2002; GFX1064-NEXT: s_cbranch_execz .LBB11_2 2003; GFX1064-NEXT: ; %bb.1: 2004; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 2005; GFX1064-NEXT: v_mov_b32_e32 v1, 0 2006; GFX1064-NEXT: s_mul_i32 s4, s4, 5 2007; GFX1064-NEXT: v_mov_b32_e32 v0, s4 
2008; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2009; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 2010; GFX1064-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] 2011; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2012; GFX1064-NEXT: buffer_gl0_inv 2013; GFX1064-NEXT: .LBB11_2: 2014; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 2015; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] 2016; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 2017; GFX1064-NEXT: v_mul_u32_u24_e32 v0, 5, v2 2018; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 2019; GFX1064-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 2020; GFX1064-NEXT: v_sub_co_u32 v0, vcc, s2, v0 2021; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc 2022; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 2023; GFX1064-NEXT: s_mov_b32 s2, -1 2024; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2025; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2026; GFX1064-NEXT: s_endpgm 2027; 2028; GFX1032-LABEL: sub_i64_constant: 2029; GFX1032: ; %bb.0: ; %entry 2030; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2031; GFX1032-NEXT: s_mov_b32 s3, exec_lo 2032; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 2033; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s3, 0 2034; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 2035; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo 2036; GFX1032-NEXT: s_cbranch_execz .LBB11_2 2037; GFX1032-NEXT: ; %bb.1: 2038; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 2039; GFX1032-NEXT: v_mov_b32_e32 v1, 0 2040; GFX1032-NEXT: s_mul_i32 s3, s3, 5 2041; GFX1032-NEXT: v_mov_b32_e32 v0, s3 2042; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2043; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 2044; GFX1032-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] 2045; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2046; GFX1032-NEXT: buffer_gl0_inv 2047; GFX1032-NEXT: .LBB11_2: 2048; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 2049; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 2050; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 2051; GFX1032-NEXT: v_mul_u32_u24_e32 v0, 5, v2 2052; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 2053; GFX1032-NEXT: 
v_mul_hi_u32_u24_e32 v1, 5, v2 2054; GFX1032-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v0 2055; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo 2056; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 2057; GFX1032-NEXT: s_mov_b32 s2, -1 2058; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2059; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2060; GFX1032-NEXT: s_endpgm 2061entry: 2062 %old = atomicrmw sub i64 addrspace(3)* @local_var64, i64 5 acq_rel 2063 store i64 %old, i64 addrspace(1)* %out 2064 ret void 2065} 2066 2067define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 %subitive) { 2068; 2069; 2070; GFX7LESS-LABEL: sub_i64_uniform: 2071; GFX7LESS: ; %bb.0: ; %entry 2072; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec 2073; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 2074; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 2075; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s7, v0 2076; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 2077; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 2078; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc 2079; GFX7LESS-NEXT: s_cbranch_execz .LBB12_2 2080; GFX7LESS-NEXT: ; %bb.1: 2081; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 2082; GFX7LESS-NEXT: v_mov_b32_e32 v3, 0 2083; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2084; GFX7LESS-NEXT: s_mul_i32 s7, s3, s6 2085; GFX7LESS-NEXT: v_mov_b32_e32 v0, s6 2086; GFX7LESS-NEXT: v_mul_hi_u32 v0, s2, v0 2087; GFX7LESS-NEXT: s_mul_i32 s6, s2, s6 2088; GFX7LESS-NEXT: v_add_i32_e32 v1, vcc, s7, v0 2089; GFX7LESS-NEXT: v_mov_b32_e32 v0, s6 2090; GFX7LESS-NEXT: s_mov_b32 m0, -1 2091; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2092; GFX7LESS-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1] 2093; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2094; GFX7LESS-NEXT: .LBB12_2: 2095; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] 2096; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 2097; GFX7LESS-NEXT: s_mov_b32 s6, -1 2098; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2099; GFX7LESS-NEXT: s_mov_b32 s4, s0 2100; GFX7LESS-NEXT: s_mov_b32 s5, s1 2101; GFX7LESS-NEXT: 
v_readfirstlane_b32 s0, v0 2102; GFX7LESS-NEXT: v_readfirstlane_b32 s1, v1 2103; GFX7LESS-NEXT: v_mul_lo_u32 v0, s3, v2 2104; GFX7LESS-NEXT: v_mul_hi_u32 v1, s2, v2 2105; GFX7LESS-NEXT: v_mul_lo_u32 v2, s2, v2 2106; GFX7LESS-NEXT: v_add_i32_e32 v1, vcc, v1, v0 2107; GFX7LESS-NEXT: v_mov_b32_e32 v3, s1 2108; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s0, v2 2109; GFX7LESS-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc 2110; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 2111; GFX7LESS-NEXT: s_endpgm 2112; 2113; GFX8-LABEL: sub_i64_uniform: 2114; GFX8: ; %bb.0: ; %entry 2115; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 2116; GFX8-NEXT: s_mov_b64 s[6:7], exec 2117; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 2118; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 2119; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 2120; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 2121; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 2122; GFX8-NEXT: s_cbranch_execz .LBB12_2 2123; GFX8-NEXT: ; %bb.1: 2124; GFX8-NEXT: s_bcnt1_i32_b64 s8, s[6:7] 2125; GFX8-NEXT: v_mov_b32_e32 v0, s8 2126; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2127; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[6:7], s2, v0, 0 2128; GFX8-NEXT: s_mul_i32 s6, s3, s8 2129; GFX8-NEXT: v_mov_b32_e32 v3, 0 2130; GFX8-NEXT: v_add_u32_e32 v1, vcc, s6, v1 2131; GFX8-NEXT: s_mov_b32 m0, -1 2132; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2133; GFX8-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1] 2134; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2135; GFX8-NEXT: .LBB12_2: 2136; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 2137; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2138; GFX8-NEXT: s_mov_b32 s4, s0 2139; GFX8-NEXT: s_mov_b32 s5, s1 2140; GFX8-NEXT: v_mul_lo_u32 v4, s3, v2 2141; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v2, 0 2142; GFX8-NEXT: v_readfirstlane_b32 s0, v0 2143; GFX8-NEXT: v_readfirstlane_b32 s1, v1 2144; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v4 2145; GFX8-NEXT: v_mov_b32_e32 v3, s1 2146; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s0, v2 2147; GFX8-NEXT: s_mov_b32 s7, 0xf000 2148; GFX8-NEXT: s_mov_b32 s6, -1 
2149; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc 2150; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 2151; GFX8-NEXT: s_endpgm 2152; 2153; GFX9-LABEL: sub_i64_uniform: 2154; GFX9: ; %bb.0: ; %entry 2155; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 2156; GFX9-NEXT: s_mov_b64 s[6:7], exec 2157; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 2158; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 2159; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 2160; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 2161; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 2162; GFX9-NEXT: s_cbranch_execz .LBB12_2 2163; GFX9-NEXT: ; %bb.1: 2164; GFX9-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 2165; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2166; GFX9-NEXT: s_mul_i32 s7, s3, s6 2167; GFX9-NEXT: s_mul_hi_u32 s8, s2, s6 2168; GFX9-NEXT: s_add_i32 s8, s8, s7 2169; GFX9-NEXT: s_mul_i32 s6, s2, s6 2170; GFX9-NEXT: v_mov_b32_e32 v0, s6 2171; GFX9-NEXT: v_mov_b32_e32 v1, s8 2172; GFX9-NEXT: v_mov_b32_e32 v3, 0 2173; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2174; GFX9-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1] 2175; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2176; GFX9-NEXT: .LBB12_2: 2177; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 2178; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2179; GFX9-NEXT: v_mul_lo_u32 v4, s3, v2 2180; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[2:3], s2, v2, 0 2181; GFX9-NEXT: s_mov_b32 s4, s0 2182; GFX9-NEXT: s_mov_b32 s5, s1 2183; GFX9-NEXT: v_readfirstlane_b32 s0, v0 2184; GFX9-NEXT: v_readfirstlane_b32 s1, v1 2185; GFX9-NEXT: v_add_u32_e32 v1, v3, v4 2186; GFX9-NEXT: v_mov_b32_e32 v3, s1 2187; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s0, v2 2188; GFX9-NEXT: s_mov_b32 s7, 0xf000 2189; GFX9-NEXT: s_mov_b32 s6, -1 2190; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v1, vcc 2191; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 2192; GFX9-NEXT: s_endpgm 2193; 2194; GFX1064-LABEL: sub_i64_uniform: 2195; GFX1064: ; %bb.0: ; %entry 2196; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 2197; GFX1064-NEXT: s_mov_b64 s[6:7], exec 2198; GFX1064-NEXT: 
v_mbcnt_lo_u32_b32 v0, s6, 0 2199; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 2200; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 2201; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 2202; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 2203; GFX1064-NEXT: s_cbranch_execz .LBB12_2 2204; GFX1064-NEXT: ; %bb.1: 2205; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 2206; GFX1064-NEXT: v_mov_b32_e32 v3, 0 2207; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2208; GFX1064-NEXT: s_mul_i32 s7, s3, s6 2209; GFX1064-NEXT: s_mul_hi_u32 s8, s2, s6 2210; GFX1064-NEXT: s_mul_i32 s6, s2, s6 2211; GFX1064-NEXT: s_add_i32 s8, s8, s7 2212; GFX1064-NEXT: v_mov_b32_e32 v0, s6 2213; GFX1064-NEXT: v_mov_b32_e32 v1, s8 2214; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2215; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 2216; GFX1064-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1] 2217; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2218; GFX1064-NEXT: buffer_gl0_inv 2219; GFX1064-NEXT: .LBB12_2: 2220; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 2221; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 2222; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2223; GFX1064-NEXT: v_mul_lo_u32 v4, s3, v2 2224; GFX1064-NEXT: v_mad_u64_u32 v[2:3], s[2:3], s2, v2, 0 2225; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 2226; GFX1064-NEXT: v_readfirstlane_b32 s4, v1 2227; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 2228; GFX1064-NEXT: v_add_nc_u32_e32 v1, v3, v4 2229; GFX1064-NEXT: v_sub_co_u32 v0, vcc, s2, v2 2230; GFX1064-NEXT: s_mov_b32 s2, -1 2231; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s4, v1, vcc 2232; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2233; GFX1064-NEXT: s_endpgm 2234; 2235; GFX1032-LABEL: sub_i64_uniform: 2236; GFX1032: ; %bb.0: ; %entry 2237; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 2238; GFX1032-NEXT: s_mov_b32 s5, exec_lo 2239; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 2240; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0 2241; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 2242; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo 2243; GFX1032-NEXT: 
s_cbranch_execz .LBB12_2 2244; GFX1032-NEXT: ; %bb.1: 2245; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s5 2246; GFX1032-NEXT: v_mov_b32_e32 v3, 0 2247; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2248; GFX1032-NEXT: s_mul_i32 s6, s3, s5 2249; GFX1032-NEXT: s_mul_hi_u32 s7, s2, s5 2250; GFX1032-NEXT: s_mul_i32 s5, s2, s5 2251; GFX1032-NEXT: s_add_i32 s7, s7, s6 2252; GFX1032-NEXT: v_mov_b32_e32 v0, s5 2253; GFX1032-NEXT: v_mov_b32_e32 v1, s7 2254; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2255; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 2256; GFX1032-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1] 2257; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2258; GFX1032-NEXT: buffer_gl0_inv 2259; GFX1032-NEXT: .LBB12_2: 2260; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 2261; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 2262; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2263; GFX1032-NEXT: v_mul_lo_u32 v4, s3, v2 2264; GFX1032-NEXT: v_mad_u64_u32 v[2:3], s2, s2, v2, 0 2265; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 2266; GFX1032-NEXT: v_readfirstlane_b32 s4, v1 2267; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 2268; GFX1032-NEXT: v_add_nc_u32_e32 v1, v3, v4 2269; GFX1032-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v2 2270; GFX1032-NEXT: s_mov_b32 s2, -1 2271; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s4, v1, vcc_lo 2272; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2273; GFX1032-NEXT: s_endpgm 2274entry: 2275 %old = atomicrmw sub i64 addrspace(3)* @local_var64, i64 %subitive acq_rel 2276 store i64 %old, i64 addrspace(1)* %out 2277 ret void 2278} 2279 2280define amdgpu_kernel void @sub_i64_varying(i64 addrspace(1)* %out) { 2281; 2282; 2283; GFX7LESS-LABEL: sub_i64_varying: 2284; GFX7LESS: ; %bb.0: ; %entry 2285; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 2286; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 2287; GFX7LESS-NEXT: s_mov_b32 m0, -1 2288; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2289; GFX7LESS-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] 2290; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2291; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 2292; 
GFX7LESS-NEXT: s_mov_b32 s2, -1 2293; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2294; GFX7LESS-NEXT: s_endpgm 2295; 2296; GFX8-LABEL: sub_i64_varying: 2297; GFX8: ; %bb.0: ; %entry 2298; GFX8-NEXT: v_mov_b32_e32 v1, 0 2299; GFX8-NEXT: s_mov_b32 m0, -1 2300; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2301; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2302; GFX8-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] 2303; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2304; GFX8-NEXT: s_mov_b32 s3, 0xf000 2305; GFX8-NEXT: s_mov_b32 s2, -1 2306; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2307; GFX8-NEXT: s_endpgm 2308; 2309; GFX9-LABEL: sub_i64_varying: 2310; GFX9: ; %bb.0: ; %entry 2311; GFX9-NEXT: v_mov_b32_e32 v1, 0 2312; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2313; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2314; GFX9-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] 2315; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2316; GFX9-NEXT: s_mov_b32 s3, 0xf000 2317; GFX9-NEXT: s_mov_b32 s2, -1 2318; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2319; GFX9-NEXT: s_endpgm 2320; 2321; GFX10-LABEL: sub_i64_varying: 2322; GFX10: ; %bb.0: ; %entry 2323; GFX10-NEXT: v_mov_b32_e32 v1, 0 2324; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2325; GFX10-NEXT: s_mov_b32 s3, 0x31016000 2326; GFX10-NEXT: s_mov_b32 s2, -1 2327; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2328; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 2329; GFX10-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] 2330; GFX10-NEXT: s_waitcnt lgkmcnt(0) 2331; GFX10-NEXT: buffer_gl0_inv 2332; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2333; GFX10-NEXT: s_endpgm 2334entry: 2335 %lane = call i32 @llvm.amdgcn.workitem.id.x() 2336 %zext = zext i32 %lane to i64 2337 %old = atomicrmw sub i64 addrspace(3)* @local_var64, i64 %zext acq_rel 2338 store i64 %old, i64 addrspace(1)* %out 2339 ret void 2340} 2341 2342define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) { 2343; 2344; 2345; GFX7LESS-LABEL: and_i32_varying: 2346; GFX7LESS: ; %bb.0: ; 
%entry 2347; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 2348; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 2349; GFX7LESS-NEXT: s_mov_b32 m0, -1 2350; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2351; GFX7LESS-NEXT: ds_and_rtn_b32 v0, v1, v0 2352; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2353; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 2354; GFX7LESS-NEXT: s_mov_b32 s2, -1 2355; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 2356; GFX7LESS-NEXT: s_endpgm 2357; 2358; GFX8-LABEL: and_i32_varying: 2359; GFX8: ; %bb.0: ; %entry 2360; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2361; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 2362; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 2363; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 2364; GFX8-NEXT: v_mov_b32_e32 v1, -1 2365; GFX8-NEXT: s_mov_b64 exec, s[2:3] 2366; GFX8-NEXT: v_mov_b32_e32 v2, v0 2367; GFX8-NEXT: s_not_b64 exec, exec 2368; GFX8-NEXT: v_mov_b32_e32 v2, -1 2369; GFX8-NEXT: s_not_b64 exec, exec 2370; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 2371; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 2372; GFX8-NEXT: s_nop 1 2373; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 2374; GFX8-NEXT: s_nop 1 2375; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 2376; GFX8-NEXT: s_nop 1 2377; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 2378; GFX8-NEXT: s_nop 1 2379; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 2380; GFX8-NEXT: s_nop 1 2381; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 2382; GFX8-NEXT: v_readlane_b32 s4, v2, 63 2383; GFX8-NEXT: s_nop 0 2384; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 2385; GFX8-NEXT: s_mov_b64 exec, s[2:3] 2386; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 2387; GFX8-NEXT: ; implicit-def: $vgpr0 2388; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 2389; GFX8-NEXT: s_cbranch_execz .LBB14_2 2390; GFX8-NEXT: ; %bb.1: 2391; GFX8-NEXT: 
v_mov_b32_e32 v0, 0 2392; GFX8-NEXT: v_mov_b32_e32 v3, s4 2393; GFX8-NEXT: s_mov_b32 m0, -1 2394; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2395; GFX8-NEXT: ds_and_rtn_b32 v0, v0, v3 2396; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2397; GFX8-NEXT: .LBB14_2: 2398; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 2399; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2400; GFX8-NEXT: v_readfirstlane_b32 s2, v0 2401; GFX8-NEXT: v_mov_b32_e32 v0, v1 2402; GFX8-NEXT: v_and_b32_e32 v0, s2, v0 2403; GFX8-NEXT: s_mov_b32 s3, 0xf000 2404; GFX8-NEXT: s_mov_b32 s2, -1 2405; GFX8-NEXT: s_nop 0 2406; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 2407; GFX8-NEXT: s_endpgm 2408; 2409; GFX9-LABEL: and_i32_varying: 2410; GFX9: ; %bb.0: ; %entry 2411; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2412; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 2413; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 2414; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 2415; GFX9-NEXT: v_mov_b32_e32 v1, -1 2416; GFX9-NEXT: s_mov_b64 exec, s[2:3] 2417; GFX9-NEXT: v_mov_b32_e32 v2, v0 2418; GFX9-NEXT: s_not_b64 exec, exec 2419; GFX9-NEXT: v_mov_b32_e32 v2, -1 2420; GFX9-NEXT: s_not_b64 exec, exec 2421; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 2422; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 2423; GFX9-NEXT: s_nop 1 2424; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 2425; GFX9-NEXT: s_nop 1 2426; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 2427; GFX9-NEXT: s_nop 1 2428; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 2429; GFX9-NEXT: s_nop 1 2430; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 2431; GFX9-NEXT: s_nop 1 2432; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 2433; GFX9-NEXT: v_readlane_b32 s4, v2, 63 2434; GFX9-NEXT: s_nop 0 2435; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 2436; GFX9-NEXT: s_mov_b64 exec, s[2:3] 2437; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 
0, v3 2438; GFX9-NEXT: ; implicit-def: $vgpr0 2439; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 2440; GFX9-NEXT: s_cbranch_execz .LBB14_2 2441; GFX9-NEXT: ; %bb.1: 2442; GFX9-NEXT: v_mov_b32_e32 v0, 0 2443; GFX9-NEXT: v_mov_b32_e32 v3, s4 2444; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2445; GFX9-NEXT: ds_and_rtn_b32 v0, v0, v3 2446; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2447; GFX9-NEXT: .LBB14_2: 2448; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 2449; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2450; GFX9-NEXT: v_readfirstlane_b32 s2, v0 2451; GFX9-NEXT: v_mov_b32_e32 v0, v1 2452; GFX9-NEXT: v_and_b32_e32 v0, s2, v0 2453; GFX9-NEXT: s_mov_b32 s3, 0xf000 2454; GFX9-NEXT: s_mov_b32 s2, -1 2455; GFX9-NEXT: s_nop 0 2456; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 2457; GFX9-NEXT: s_endpgm 2458; 2459; GFX1064-LABEL: and_i32_varying: 2460; GFX1064: ; %bb.0: ; %entry 2461; GFX1064-NEXT: v_mov_b32_e32 v1, v0 2462; GFX1064-NEXT: s_not_b64 exec, exec 2463; GFX1064-NEXT: v_mov_b32_e32 v1, -1 2464; GFX1064-NEXT: s_not_b64 exec, exec 2465; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 2466; GFX1064-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 2467; GFX1064-NEXT: v_mov_b32_e32 v3, -1 2468; GFX1064-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 2469; GFX1064-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 2470; GFX1064-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 2471; GFX1064-NEXT: v_mov_b32_e32 v2, v1 2472; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 2473; GFX1064-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 2474; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 2475; GFX1064-NEXT: v_mov_b32_e32 v2, s4 2476; GFX1064-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 2477; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 2478; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 2479; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 2480; GFX1064-NEXT: s_load_dwordx2 
s[0:1], s[0:1], 0x24 2481; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 2482; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 2483; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 2484; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 2485; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 2486; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 2487; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 2488; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 2489; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 2490; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 2491; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 2492; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 2493; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 2494; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 2495; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2496; GFX1064-NEXT: s_mov_b32 s2, -1 2497; GFX1064-NEXT: ; implicit-def: $vgpr0 2498; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 2499; GFX1064-NEXT: s_cbranch_execz .LBB14_2 2500; GFX1064-NEXT: ; %bb.1: 2501; GFX1064-NEXT: v_mov_b32_e32 v0, 0 2502; GFX1064-NEXT: v_mov_b32_e32 v4, s7 2503; GFX1064-NEXT: s_mov_b32 s3, s7 2504; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2505; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 2506; GFX1064-NEXT: ds_and_rtn_b32 v0, v0, v4 2507; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2508; GFX1064-NEXT: buffer_gl0_inv 2509; GFX1064-NEXT: .LBB14_2: 2510; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 2511; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 2512; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 2513; GFX1064-NEXT: v_mov_b32_e32 v0, v3 2514; GFX1064-NEXT: v_and_b32_e32 v0, s3, v0 2515; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 2516; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2517; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 2518; GFX1064-NEXT: s_endpgm 2519; 2520; GFX1032-LABEL: and_i32_varying: 2521; GFX1032: ; %bb.0: ; %entry 2522; GFX1032-NEXT: v_mov_b32_e32 v1, v0 2523; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 2524; GFX1032-NEXT: v_mov_b32_e32 v1, -1 2525; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 2526; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 
2527; GFX1032-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 2528; GFX1032-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 2529; GFX1032-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 2530; GFX1032-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 2531; GFX1032-NEXT: v_mov_b32_e32 v2, v1 2532; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 2533; GFX1032-NEXT: s_mov_b32 exec_lo, s2 2534; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2535; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 2536; GFX1032-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 2537; GFX1032-NEXT: v_mov_b32_e32 v3, -1 2538; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 2539; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 2540; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 2541; GFX1032-NEXT: s_mov_b32 exec_lo, s2 2542; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 2543; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 2544; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 2545; GFX1032-NEXT: s_mov_b32 exec_lo, s2 2546; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 2547; GFX1032-NEXT: s_mov_b32 s2, -1 2548; GFX1032-NEXT: ; implicit-def: $vgpr0 2549; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 2550; GFX1032-NEXT: s_cbranch_execz .LBB14_2 2551; GFX1032-NEXT: ; %bb.1: 2552; GFX1032-NEXT: v_mov_b32_e32 v0, 0 2553; GFX1032-NEXT: v_mov_b32_e32 v4, s4 2554; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2555; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 2556; GFX1032-NEXT: ds_and_rtn_b32 v0, v0, v4 2557; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2558; GFX1032-NEXT: buffer_gl0_inv 2559; GFX1032-NEXT: .LBB14_2: 2560; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 2561; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 2562; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 2563; GFX1032-NEXT: v_mov_b32_e32 v0, v3 2564; GFX1032-NEXT: v_and_b32_e32 v0, s3, v0 2565; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 2566; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 
2567; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 2568; GFX1032-NEXT: s_endpgm 2569entry: 2570 %lane = call i32 @llvm.amdgcn.workitem.id.x() 2571 %old = atomicrmw and i32 addrspace(3)* @local_var32, i32 %lane acq_rel 2572 store i32 %old, i32 addrspace(1)* %out 2573 ret void 2574} 2575 2576define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) { 2577; 2578; 2579; GFX7LESS-LABEL: or_i32_varying: 2580; GFX7LESS: ; %bb.0: ; %entry 2581; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 2582; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 2583; GFX7LESS-NEXT: s_mov_b32 m0, -1 2584; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2585; GFX7LESS-NEXT: ds_or_rtn_b32 v0, v1, v0 2586; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2587; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 2588; GFX7LESS-NEXT: s_mov_b32 s2, -1 2589; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 2590; GFX7LESS-NEXT: s_endpgm 2591; 2592; GFX8-LABEL: or_i32_varying: 2593; GFX8: ; %bb.0: ; %entry 2594; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2595; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 2596; GFX8-NEXT: v_mov_b32_e32 v1, 0 2597; GFX8-NEXT: s_mov_b64 exec, s[2:3] 2598; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 2599; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 2600; GFX8-NEXT: v_mov_b32_e32 v2, v0 2601; GFX8-NEXT: s_not_b64 exec, exec 2602; GFX8-NEXT: v_mov_b32_e32 v2, 0 2603; GFX8-NEXT: s_not_b64 exec, exec 2604; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 2605; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 2606; GFX8-NEXT: s_nop 1 2607; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 2608; GFX8-NEXT: s_nop 1 2609; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 2610; GFX8-NEXT: s_nop 1 2611; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 2612; GFX8-NEXT: s_nop 1 2613; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 2614; GFX8-NEXT: s_nop 1 
2615; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 2616; GFX8-NEXT: v_readlane_b32 s4, v2, 63 2617; GFX8-NEXT: s_nop 0 2618; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 2619; GFX8-NEXT: s_mov_b64 exec, s[2:3] 2620; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 2621; GFX8-NEXT: ; implicit-def: $vgpr0 2622; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 2623; GFX8-NEXT: s_cbranch_execz .LBB15_2 2624; GFX8-NEXT: ; %bb.1: 2625; GFX8-NEXT: v_mov_b32_e32 v0, 0 2626; GFX8-NEXT: v_mov_b32_e32 v3, s4 2627; GFX8-NEXT: s_mov_b32 m0, -1 2628; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2629; GFX8-NEXT: ds_or_rtn_b32 v0, v0, v3 2630; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2631; GFX8-NEXT: .LBB15_2: 2632; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 2633; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2634; GFX8-NEXT: v_readfirstlane_b32 s2, v0 2635; GFX8-NEXT: v_mov_b32_e32 v0, v1 2636; GFX8-NEXT: v_or_b32_e32 v0, s2, v0 2637; GFX8-NEXT: s_mov_b32 s3, 0xf000 2638; GFX8-NEXT: s_mov_b32 s2, -1 2639; GFX8-NEXT: s_nop 0 2640; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 2641; GFX8-NEXT: s_endpgm 2642; 2643; GFX9-LABEL: or_i32_varying: 2644; GFX9: ; %bb.0: ; %entry 2645; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2646; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 2647; GFX9-NEXT: v_mov_b32_e32 v1, 0 2648; GFX9-NEXT: s_mov_b64 exec, s[2:3] 2649; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 2650; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 2651; GFX9-NEXT: v_mov_b32_e32 v2, v0 2652; GFX9-NEXT: s_not_b64 exec, exec 2653; GFX9-NEXT: v_mov_b32_e32 v2, 0 2654; GFX9-NEXT: s_not_b64 exec, exec 2655; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 2656; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 2657; GFX9-NEXT: s_nop 1 2658; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 2659; GFX9-NEXT: s_nop 1 2660; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 2661; GFX9-NEXT: s_nop 1 
2662; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 2663; GFX9-NEXT: s_nop 1 2664; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 2665; GFX9-NEXT: s_nop 1 2666; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 2667; GFX9-NEXT: v_readlane_b32 s4, v2, 63 2668; GFX9-NEXT: s_nop 0 2669; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 2670; GFX9-NEXT: s_mov_b64 exec, s[2:3] 2671; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 2672; GFX9-NEXT: ; implicit-def: $vgpr0 2673; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 2674; GFX9-NEXT: s_cbranch_execz .LBB15_2 2675; GFX9-NEXT: ; %bb.1: 2676; GFX9-NEXT: v_mov_b32_e32 v0, 0 2677; GFX9-NEXT: v_mov_b32_e32 v3, s4 2678; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2679; GFX9-NEXT: ds_or_rtn_b32 v0, v0, v3 2680; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2681; GFX9-NEXT: .LBB15_2: 2682; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 2683; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2684; GFX9-NEXT: v_readfirstlane_b32 s2, v0 2685; GFX9-NEXT: v_mov_b32_e32 v0, v1 2686; GFX9-NEXT: v_or_b32_e32 v0, s2, v0 2687; GFX9-NEXT: s_mov_b32 s3, 0xf000 2688; GFX9-NEXT: s_mov_b32 s2, -1 2689; GFX9-NEXT: s_nop 0 2690; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 2691; GFX9-NEXT: s_endpgm 2692; 2693; GFX1064-LABEL: or_i32_varying: 2694; GFX1064: ; %bb.0: ; %entry 2695; GFX1064-NEXT: v_mov_b32_e32 v1, v0 2696; GFX1064-NEXT: s_not_b64 exec, exec 2697; GFX1064-NEXT: v_mov_b32_e32 v1, 0 2698; GFX1064-NEXT: s_not_b64 exec, exec 2699; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 2700; GFX1064-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 2701; GFX1064-NEXT: v_mov_b32_e32 v3, 0 2702; GFX1064-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 2703; GFX1064-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 2704; GFX1064-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 
2705; GFX1064-NEXT: v_mov_b32_e32 v2, v1 2706; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 2707; GFX1064-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 2708; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 2709; GFX1064-NEXT: v_mov_b32_e32 v2, s4 2710; GFX1064-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 2711; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 2712; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 2713; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 2714; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2715; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 2716; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 2717; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 2718; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 2719; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 2720; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 2721; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 2722; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 2723; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 2724; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 2725; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 2726; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 2727; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 2728; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 2729; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2730; GFX1064-NEXT: s_mov_b32 s2, -1 2731; GFX1064-NEXT: ; implicit-def: $vgpr0 2732; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 2733; GFX1064-NEXT: s_cbranch_execz .LBB15_2 2734; GFX1064-NEXT: ; %bb.1: 2735; GFX1064-NEXT: v_mov_b32_e32 v0, 0 2736; GFX1064-NEXT: v_mov_b32_e32 v4, s7 2737; GFX1064-NEXT: s_mov_b32 s3, s7 2738; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2739; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 2740; GFX1064-NEXT: ds_or_rtn_b32 v0, v0, v4 2741; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2742; GFX1064-NEXT: buffer_gl0_inv 2743; GFX1064-NEXT: .LBB15_2: 2744; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 2745; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 2746; GFX1064-NEXT: v_readfirstlane_b32 
s3, v0 2747; GFX1064-NEXT: v_mov_b32_e32 v0, v3 2748; GFX1064-NEXT: v_or_b32_e32 v0, s3, v0 2749; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 2750; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2751; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 2752; GFX1064-NEXT: s_endpgm 2753; 2754; GFX1032-LABEL: or_i32_varying: 2755; GFX1032: ; %bb.0: ; %entry 2756; GFX1032-NEXT: v_mov_b32_e32 v1, v0 2757; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 2758; GFX1032-NEXT: v_mov_b32_e32 v1, 0 2759; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 2760; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 2761; GFX1032-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 2762; GFX1032-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 2763; GFX1032-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 2764; GFX1032-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 2765; GFX1032-NEXT: v_mov_b32_e32 v2, v1 2766; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 2767; GFX1032-NEXT: s_mov_b32 exec_lo, s2 2768; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2769; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 2770; GFX1032-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 2771; GFX1032-NEXT: v_mov_b32_e32 v3, 0 2772; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 2773; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 2774; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 2775; GFX1032-NEXT: s_mov_b32 exec_lo, s2 2776; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 2777; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 2778; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 2779; GFX1032-NEXT: s_mov_b32 exec_lo, s2 2780; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 2781; GFX1032-NEXT: s_mov_b32 s2, -1 2782; GFX1032-NEXT: ; implicit-def: $vgpr0 2783; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 2784; GFX1032-NEXT: s_cbranch_execz .LBB15_2 2785; GFX1032-NEXT: ; %bb.1: 2786; GFX1032-NEXT: v_mov_b32_e32 v0, 0 
2787; GFX1032-NEXT: v_mov_b32_e32 v4, s4 2788; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2789; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 2790; GFX1032-NEXT: ds_or_rtn_b32 v0, v0, v4 2791; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2792; GFX1032-NEXT: buffer_gl0_inv 2793; GFX1032-NEXT: .LBB15_2: 2794; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 2795; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 2796; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 2797; GFX1032-NEXT: v_mov_b32_e32 v0, v3 2798; GFX1032-NEXT: v_or_b32_e32 v0, s3, v0 2799; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 2800; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2801; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 2802; GFX1032-NEXT: s_endpgm 2803entry: 2804 %lane = call i32 @llvm.amdgcn.workitem.id.x() 2805 %old = atomicrmw or i32 addrspace(3)* @local_var32, i32 %lane acq_rel 2806 store i32 %old, i32 addrspace(1)* %out 2807 ret void 2808} 2809 2810define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) { 2811; 2812; 2813; GFX7LESS-LABEL: xor_i32_varying: 2814; GFX7LESS: ; %bb.0: ; %entry 2815; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 2816; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 2817; GFX7LESS-NEXT: s_mov_b32 m0, -1 2818; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2819; GFX7LESS-NEXT: ds_xor_rtn_b32 v0, v1, v0 2820; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2821; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 2822; GFX7LESS-NEXT: s_mov_b32 s2, -1 2823; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 2824; GFX7LESS-NEXT: s_endpgm 2825; 2826; GFX8-LABEL: xor_i32_varying: 2827; GFX8: ; %bb.0: ; %entry 2828; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2829; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 2830; GFX8-NEXT: v_mov_b32_e32 v1, 0 2831; GFX8-NEXT: s_mov_b64 exec, s[2:3] 2832; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 2833; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 2834; GFX8-NEXT: v_mov_b32_e32 v2, v0 2835; GFX8-NEXT: s_not_b64 exec, exec 2836; GFX8-NEXT: v_mov_b32_e32 v2, 0 2837; GFX8-NEXT: s_not_b64 exec, exec 2838; 
GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 2839; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 2840; GFX8-NEXT: s_nop 1 2841; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 2842; GFX8-NEXT: s_nop 1 2843; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 2844; GFX8-NEXT: s_nop 1 2845; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 2846; GFX8-NEXT: s_nop 1 2847; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 2848; GFX8-NEXT: s_nop 1 2849; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 2850; GFX8-NEXT: v_readlane_b32 s4, v2, 63 2851; GFX8-NEXT: s_nop 0 2852; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 2853; GFX8-NEXT: s_mov_b64 exec, s[2:3] 2854; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 2855; GFX8-NEXT: ; implicit-def: $vgpr0 2856; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 2857; GFX8-NEXT: s_cbranch_execz .LBB16_2 2858; GFX8-NEXT: ; %bb.1: 2859; GFX8-NEXT: v_mov_b32_e32 v0, 0 2860; GFX8-NEXT: v_mov_b32_e32 v3, s4 2861; GFX8-NEXT: s_mov_b32 m0, -1 2862; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2863; GFX8-NEXT: ds_xor_rtn_b32 v0, v0, v3 2864; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2865; GFX8-NEXT: .LBB16_2: 2866; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 2867; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2868; GFX8-NEXT: v_readfirstlane_b32 s2, v0 2869; GFX8-NEXT: v_mov_b32_e32 v0, v1 2870; GFX8-NEXT: v_xor_b32_e32 v0, s2, v0 2871; GFX8-NEXT: s_mov_b32 s3, 0xf000 2872; GFX8-NEXT: s_mov_b32 s2, -1 2873; GFX8-NEXT: s_nop 0 2874; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 2875; GFX8-NEXT: s_endpgm 2876; 2877; GFX9-LABEL: xor_i32_varying: 2878; GFX9: ; %bb.0: ; %entry 2879; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2880; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 2881; GFX9-NEXT: v_mov_b32_e32 v1, 0 2882; GFX9-NEXT: s_mov_b64 exec, s[2:3] 2883; GFX9-NEXT: 
v_mbcnt_lo_u32_b32 v3, exec_lo, 0 2884; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 2885; GFX9-NEXT: v_mov_b32_e32 v2, v0 2886; GFX9-NEXT: s_not_b64 exec, exec 2887; GFX9-NEXT: v_mov_b32_e32 v2, 0 2888; GFX9-NEXT: s_not_b64 exec, exec 2889; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 2890; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 2891; GFX9-NEXT: s_nop 1 2892; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 2893; GFX9-NEXT: s_nop 1 2894; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 2895; GFX9-NEXT: s_nop 1 2896; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 2897; GFX9-NEXT: s_nop 1 2898; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 2899; GFX9-NEXT: s_nop 1 2900; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 2901; GFX9-NEXT: v_readlane_b32 s4, v2, 63 2902; GFX9-NEXT: s_nop 0 2903; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 2904; GFX9-NEXT: s_mov_b64 exec, s[2:3] 2905; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 2906; GFX9-NEXT: ; implicit-def: $vgpr0 2907; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 2908; GFX9-NEXT: s_cbranch_execz .LBB16_2 2909; GFX9-NEXT: ; %bb.1: 2910; GFX9-NEXT: v_mov_b32_e32 v0, 0 2911; GFX9-NEXT: v_mov_b32_e32 v3, s4 2912; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2913; GFX9-NEXT: ds_xor_rtn_b32 v0, v0, v3 2914; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2915; GFX9-NEXT: .LBB16_2: 2916; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 2917; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2918; GFX9-NEXT: v_readfirstlane_b32 s2, v0 2919; GFX9-NEXT: v_mov_b32_e32 v0, v1 2920; GFX9-NEXT: v_xor_b32_e32 v0, s2, v0 2921; GFX9-NEXT: s_mov_b32 s3, 0xf000 2922; GFX9-NEXT: s_mov_b32 s2, -1 2923; GFX9-NEXT: s_nop 0 2924; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 2925; GFX9-NEXT: s_endpgm 2926; 2927; GFX1064-LABEL: xor_i32_varying: 2928; GFX1064: ; %bb.0: 
; %entry 2929; GFX1064-NEXT: v_mov_b32_e32 v1, v0 2930; GFX1064-NEXT: s_not_b64 exec, exec 2931; GFX1064-NEXT: v_mov_b32_e32 v1, 0 2932; GFX1064-NEXT: s_not_b64 exec, exec 2933; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 2934; GFX1064-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 2935; GFX1064-NEXT: v_mov_b32_e32 v3, 0 2936; GFX1064-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 2937; GFX1064-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 2938; GFX1064-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 2939; GFX1064-NEXT: v_mov_b32_e32 v2, v1 2940; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 2941; GFX1064-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 2942; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 2943; GFX1064-NEXT: v_mov_b32_e32 v2, s4 2944; GFX1064-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 2945; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 2946; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 2947; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 2948; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2949; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 2950; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 2951; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 2952; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 2953; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 2954; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 2955; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 2956; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 2957; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 2958; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 2959; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 2960; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 2961; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 2962; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 2963; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2964; GFX1064-NEXT: s_mov_b32 s2, -1 2965; GFX1064-NEXT: ; 
implicit-def: $vgpr0 2966; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 2967; GFX1064-NEXT: s_cbranch_execz .LBB16_2 2968; GFX1064-NEXT: ; %bb.1: 2969; GFX1064-NEXT: v_mov_b32_e32 v0, 0 2970; GFX1064-NEXT: v_mov_b32_e32 v4, s7 2971; GFX1064-NEXT: s_mov_b32 s3, s7 2972; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2973; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 2974; GFX1064-NEXT: ds_xor_rtn_b32 v0, v0, v4 2975; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2976; GFX1064-NEXT: buffer_gl0_inv 2977; GFX1064-NEXT: .LBB16_2: 2978; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 2979; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 2980; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 2981; GFX1064-NEXT: v_mov_b32_e32 v0, v3 2982; GFX1064-NEXT: v_xor_b32_e32 v0, s3, v0 2983; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 2984; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2985; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 2986; GFX1064-NEXT: s_endpgm 2987; 2988; GFX1032-LABEL: xor_i32_varying: 2989; GFX1032: ; %bb.0: ; %entry 2990; GFX1032-NEXT: v_mov_b32_e32 v1, v0 2991; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 2992; GFX1032-NEXT: v_mov_b32_e32 v1, 0 2993; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 2994; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 2995; GFX1032-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 2996; GFX1032-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 2997; GFX1032-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 2998; GFX1032-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 2999; GFX1032-NEXT: v_mov_b32_e32 v2, v1 3000; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 3001; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3002; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3003; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3004; GFX1032-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 3005; GFX1032-NEXT: v_mov_b32_e32 v3, 0 3006; GFX1032-NEXT: v_readlane_b32 s3, v1, 
15 3007; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 3008; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 3009; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3010; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3011; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3012; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 3013; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3014; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 3015; GFX1032-NEXT: s_mov_b32 s2, -1 3016; GFX1032-NEXT: ; implicit-def: $vgpr0 3017; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 3018; GFX1032-NEXT: s_cbranch_execz .LBB16_2 3019; GFX1032-NEXT: ; %bb.1: 3020; GFX1032-NEXT: v_mov_b32_e32 v0, 0 3021; GFX1032-NEXT: v_mov_b32_e32 v4, s4 3022; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3023; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 3024; GFX1032-NEXT: ds_xor_rtn_b32 v0, v0, v4 3025; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3026; GFX1032-NEXT: buffer_gl0_inv 3027; GFX1032-NEXT: .LBB16_2: 3028; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 3029; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 3030; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 3031; GFX1032-NEXT: v_mov_b32_e32 v0, v3 3032; GFX1032-NEXT: v_xor_b32_e32 v0, s3, v0 3033; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 3034; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3035; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 3036; GFX1032-NEXT: s_endpgm 3037entry: 3038 %lane = call i32 @llvm.amdgcn.workitem.id.x() 3039 %old = atomicrmw xor i32 addrspace(3)* @local_var32, i32 %lane acq_rel 3040 store i32 %old, i32 addrspace(1)* %out 3041 ret void 3042} 3043 3044define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) { 3045; 3046; 3047; GFX7LESS-LABEL: max_i32_varying: 3048; GFX7LESS: ; %bb.0: ; %entry 3049; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 3050; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 3051; GFX7LESS-NEXT: s_mov_b32 m0, -1 3052; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3053; GFX7LESS-NEXT: ds_max_rtn_i32 v0, v1, v0 3054; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3055; GFX7LESS-NEXT: 
s_mov_b32 s3, 0xf000 3056; GFX7LESS-NEXT: s_mov_b32 s2, -1 3057; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 3058; GFX7LESS-NEXT: s_endpgm 3059; 3060; GFX8-LABEL: max_i32_varying: 3061; GFX8: ; %bb.0: ; %entry 3062; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3063; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 3064; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 3065; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 3066; GFX8-NEXT: v_bfrev_b32_e32 v1, 1 3067; GFX8-NEXT: s_mov_b64 exec, s[2:3] 3068; GFX8-NEXT: v_mov_b32_e32 v2, v0 3069; GFX8-NEXT: s_not_b64 exec, exec 3070; GFX8-NEXT: v_bfrev_b32_e32 v2, 1 3071; GFX8-NEXT: s_not_b64 exec, exec 3072; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 3073; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3074; GFX8-NEXT: s_nop 1 3075; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 3076; GFX8-NEXT: s_nop 1 3077; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 3078; GFX8-NEXT: s_nop 1 3079; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 3080; GFX8-NEXT: s_nop 1 3081; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 3082; GFX8-NEXT: s_nop 1 3083; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 3084; GFX8-NEXT: v_readlane_b32 s4, v2, 63 3085; GFX8-NEXT: s_nop 0 3086; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 3087; GFX8-NEXT: s_mov_b64 exec, s[2:3] 3088; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 3089; GFX8-NEXT: ; implicit-def: $vgpr0 3090; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 3091; GFX8-NEXT: s_cbranch_execz .LBB17_2 3092; GFX8-NEXT: ; %bb.1: 3093; GFX8-NEXT: v_mov_b32_e32 v0, 0 3094; GFX8-NEXT: v_mov_b32_e32 v3, s4 3095; GFX8-NEXT: s_mov_b32 m0, -1 3096; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3097; GFX8-NEXT: ds_max_rtn_i32 v0, v0, v3 3098; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3099; GFX8-NEXT: .LBB17_2: 3100; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 3101; 
GFX8-NEXT: s_waitcnt lgkmcnt(0) 3102; GFX8-NEXT: v_readfirstlane_b32 s2, v0 3103; GFX8-NEXT: v_mov_b32_e32 v0, v1 3104; GFX8-NEXT: v_max_i32_e32 v0, s2, v0 3105; GFX8-NEXT: s_mov_b32 s3, 0xf000 3106; GFX8-NEXT: s_mov_b32 s2, -1 3107; GFX8-NEXT: s_nop 0 3108; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 3109; GFX8-NEXT: s_endpgm 3110; 3111; GFX9-LABEL: max_i32_varying: 3112; GFX9: ; %bb.0: ; %entry 3113; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3114; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 3115; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 3116; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 3117; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 3118; GFX9-NEXT: s_mov_b64 exec, s[2:3] 3119; GFX9-NEXT: v_mov_b32_e32 v2, v0 3120; GFX9-NEXT: s_not_b64 exec, exec 3121; GFX9-NEXT: v_bfrev_b32_e32 v2, 1 3122; GFX9-NEXT: s_not_b64 exec, exec 3123; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 3124; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3125; GFX9-NEXT: s_nop 1 3126; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 3127; GFX9-NEXT: s_nop 1 3128; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 3129; GFX9-NEXT: s_nop 1 3130; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 3131; GFX9-NEXT: s_nop 1 3132; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 3133; GFX9-NEXT: s_nop 1 3134; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 3135; GFX9-NEXT: v_readlane_b32 s4, v2, 63 3136; GFX9-NEXT: s_nop 0 3137; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 3138; GFX9-NEXT: s_mov_b64 exec, s[2:3] 3139; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 3140; GFX9-NEXT: ; implicit-def: $vgpr0 3141; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 3142; GFX9-NEXT: s_cbranch_execz .LBB17_2 3143; GFX9-NEXT: ; %bb.1: 3144; GFX9-NEXT: v_mov_b32_e32 v0, 0 3145; GFX9-NEXT: v_mov_b32_e32 v3, s4 3146; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3147; 
GFX9-NEXT: ds_max_rtn_i32 v0, v0, v3 3148; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3149; GFX9-NEXT: .LBB17_2: 3150; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 3151; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3152; GFX9-NEXT: v_readfirstlane_b32 s2, v0 3153; GFX9-NEXT: v_mov_b32_e32 v0, v1 3154; GFX9-NEXT: v_max_i32_e32 v0, s2, v0 3155; GFX9-NEXT: s_mov_b32 s3, 0xf000 3156; GFX9-NEXT: s_mov_b32 s2, -1 3157; GFX9-NEXT: s_nop 0 3158; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 3159; GFX9-NEXT: s_endpgm 3160; 3161; GFX1064-LABEL: max_i32_varying: 3162; GFX1064: ; %bb.0: ; %entry 3163; GFX1064-NEXT: v_mov_b32_e32 v1, v0 3164; GFX1064-NEXT: s_not_b64 exec, exec 3165; GFX1064-NEXT: v_bfrev_b32_e32 v1, 1 3166; GFX1064-NEXT: s_not_b64 exec, exec 3167; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 3168; GFX1064-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 3169; GFX1064-NEXT: v_bfrev_b32_e32 v3, 1 3170; GFX1064-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 3171; GFX1064-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 3172; GFX1064-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 3173; GFX1064-NEXT: v_mov_b32_e32 v2, v1 3174; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 3175; GFX1064-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 3176; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 3177; GFX1064-NEXT: v_mov_b32_e32 v2, s4 3178; GFX1064-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 3179; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 3180; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 3181; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 3182; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3183; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 3184; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 3185; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 3186; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 3187; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3188; GFX1064-NEXT: 
s_or_saveexec_b64 s[2:3], -1 3189; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 3190; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 3191; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 3192; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 3193; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 3194; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 3195; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 3196; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 3197; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3198; GFX1064-NEXT: s_mov_b32 s2, -1 3199; GFX1064-NEXT: ; implicit-def: $vgpr0 3200; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 3201; GFX1064-NEXT: s_cbranch_execz .LBB17_2 3202; GFX1064-NEXT: ; %bb.1: 3203; GFX1064-NEXT: v_mov_b32_e32 v0, 0 3204; GFX1064-NEXT: v_mov_b32_e32 v4, s7 3205; GFX1064-NEXT: s_mov_b32 s3, s7 3206; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3207; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 3208; GFX1064-NEXT: ds_max_rtn_i32 v0, v0, v4 3209; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3210; GFX1064-NEXT: buffer_gl0_inv 3211; GFX1064-NEXT: .LBB17_2: 3212; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 3213; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 3214; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 3215; GFX1064-NEXT: v_mov_b32_e32 v0, v3 3216; GFX1064-NEXT: v_max_i32_e32 v0, s3, v0 3217; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 3218; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3219; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 3220; GFX1064-NEXT: s_endpgm 3221; 3222; GFX1032-LABEL: max_i32_varying: 3223; GFX1032: ; %bb.0: ; %entry 3224; GFX1032-NEXT: v_mov_b32_e32 v1, v0 3225; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 3226; GFX1032-NEXT: v_bfrev_b32_e32 v1, 1 3227; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 3228; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3229; GFX1032-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 3230; GFX1032-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 3231; GFX1032-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 3232; GFX1032-NEXT: 
v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 3233; GFX1032-NEXT: v_mov_b32_e32 v2, v1 3234; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 3235; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3236; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3237; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3238; GFX1032-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 3239; GFX1032-NEXT: v_bfrev_b32_e32 v3, 1 3240; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 3241; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 3242; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 3243; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3244; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3245; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3246; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 3247; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3248; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 3249; GFX1032-NEXT: s_mov_b32 s2, -1 3250; GFX1032-NEXT: ; implicit-def: $vgpr0 3251; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 3252; GFX1032-NEXT: s_cbranch_execz .LBB17_2 3253; GFX1032-NEXT: ; %bb.1: 3254; GFX1032-NEXT: v_mov_b32_e32 v0, 0 3255; GFX1032-NEXT: v_mov_b32_e32 v4, s4 3256; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3257; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 3258; GFX1032-NEXT: ds_max_rtn_i32 v0, v0, v4 3259; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3260; GFX1032-NEXT: buffer_gl0_inv 3261; GFX1032-NEXT: .LBB17_2: 3262; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 3263; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 3264; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 3265; GFX1032-NEXT: v_mov_b32_e32 v0, v3 3266; GFX1032-NEXT: v_max_i32_e32 v0, s3, v0 3267; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 3268; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3269; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 3270; GFX1032-NEXT: s_endpgm 3271entry: 3272 %lane = call i32 @llvm.amdgcn.workitem.id.x() 3273 %old = atomicrmw max i32 addrspace(3)* @local_var32, i32 %lane acq_rel 3274 store i32 %old, i32 addrspace(1)* %out 
3275 ret void 3276} 3277 3278define amdgpu_kernel void @max_i64_constant(i64 addrspace(1)* %out) { 3279; 3280; 3281; GFX7LESS-LABEL: max_i64_constant: 3282; GFX7LESS: ; %bb.0: ; %entry 3283; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 3284; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 3285; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 3286; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3287; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 3288; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc 3289; GFX7LESS-NEXT: s_cbranch_execz .LBB18_2 3290; GFX7LESS-NEXT: ; %bb.1: 3291; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0 3292; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5 3293; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 3294; GFX7LESS-NEXT: s_mov_b32 m0, -1 3295; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3296; GFX7LESS-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] 3297; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3298; GFX7LESS-NEXT: .LBB18_2: 3299; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] 3300; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3301; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 3302; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 3303; GFX7LESS-NEXT: v_bfrev_b32_e32 v1, 1 3304; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 3305; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 3306; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc 3307; GFX7LESS-NEXT: v_mov_b32_e32 v2, s5 3308; GFX7LESS-NEXT: v_mov_b32_e32 v3, s4 3309; GFX7LESS-NEXT: v_cmp_gt_i64_e32 vcc, s[4:5], v[0:1] 3310; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 3311; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 3312; GFX7LESS-NEXT: s_mov_b32 s2, -1 3313; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 3314; GFX7LESS-NEXT: s_endpgm 3315; 3316; GFX8-LABEL: max_i64_constant: 3317; GFX8: ; %bb.0: ; %entry 3318; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3319; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3320; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 3321; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3322; GFX8-NEXT: ; implicit-def: 
$vgpr0_vgpr1 3323; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 3324; GFX8-NEXT: s_cbranch_execz .LBB18_2 3325; GFX8-NEXT: ; %bb.1: 3326; GFX8-NEXT: v_mov_b32_e32 v0, 5 3327; GFX8-NEXT: v_mov_b32_e32 v2, 0 3328; GFX8-NEXT: v_mov_b32_e32 v1, 0 3329; GFX8-NEXT: s_mov_b32 m0, -1 3330; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3331; GFX8-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] 3332; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3333; GFX8-NEXT: .LBB18_2: 3334; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 3335; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3336; GFX8-NEXT: v_readfirstlane_b32 s2, v0 3337; GFX8-NEXT: v_bfrev_b32_e32 v0, 1 3338; GFX8-NEXT: v_readfirstlane_b32 s3, v1 3339; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc 3340; GFX8-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 3341; GFX8-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1] 3342; GFX8-NEXT: v_mov_b32_e32 v2, s3 3343; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 3344; GFX8-NEXT: v_mov_b32_e32 v2, s2 3345; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 3346; GFX8-NEXT: s_mov_b32 s3, 0xf000 3347; GFX8-NEXT: s_mov_b32 s2, -1 3348; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 3349; GFX8-NEXT: s_endpgm 3350; 3351; GFX9-LABEL: max_i64_constant: 3352; GFX9: ; %bb.0: ; %entry 3353; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3354; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3355; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 3356; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3357; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 3358; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 3359; GFX9-NEXT: s_cbranch_execz .LBB18_2 3360; GFX9-NEXT: ; %bb.1: 3361; GFX9-NEXT: v_mov_b32_e32 v0, 5 3362; GFX9-NEXT: v_mov_b32_e32 v1, 0 3363; GFX9-NEXT: v_mov_b32_e32 v2, 0 3364; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3365; GFX9-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] 3366; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3367; GFX9-NEXT: .LBB18_2: 3368; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 3369; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3370; GFX9-NEXT: v_readfirstlane_b32 s2, v0 3371; GFX9-NEXT: v_bfrev_b32_e32 v0, 1 3372; 
GFX9-NEXT: v_readfirstlane_b32 s3, v1 3373; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc 3374; GFX9-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 3375; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1] 3376; GFX9-NEXT: v_mov_b32_e32 v2, s3 3377; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 3378; GFX9-NEXT: v_mov_b32_e32 v2, s2 3379; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 3380; GFX9-NEXT: s_mov_b32 s3, 0xf000 3381; GFX9-NEXT: s_mov_b32 s2, -1 3382; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 3383; GFX9-NEXT: s_endpgm 3384; 3385; GFX1064-LABEL: max_i64_constant: 3386; GFX1064: ; %bb.0: ; %entry 3387; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3388; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3389; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 3390; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3391; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 3392; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc 3393; GFX1064-NEXT: s_cbranch_execz .LBB18_2 3394; GFX1064-NEXT: ; %bb.1: 3395; GFX1064-NEXT: v_mov_b32_e32 v0, 5 3396; GFX1064-NEXT: v_mov_b32_e32 v1, 0 3397; GFX1064-NEXT: v_mov_b32_e32 v2, 0 3398; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3399; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 3400; GFX1064-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] 3401; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3402; GFX1064-NEXT: buffer_gl0_inv 3403; GFX1064-NEXT: .LBB18_2: 3404; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 3405; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] 3406; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 3407; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 3408; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc 3409; GFX1064-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 3410; GFX1064-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1] 3411; GFX1064-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc 3412; GFX1064-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc 3413; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 3414; GFX1064-NEXT: s_mov_b32 s2, -1 3415; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3416; GFX1064-NEXT: buffer_store_dwordx2 
v[0:1], off, s[0:3], 0 3417; GFX1064-NEXT: s_endpgm 3418; 3419; GFX1032-LABEL: max_i64_constant: 3420; GFX1032: ; %bb.0: ; %entry 3421; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3422; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3423; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 3424; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 3425; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo 3426; GFX1032-NEXT: s_cbranch_execz .LBB18_2 3427; GFX1032-NEXT: ; %bb.1: 3428; GFX1032-NEXT: v_mov_b32_e32 v0, 5 3429; GFX1032-NEXT: v_mov_b32_e32 v1, 0 3430; GFX1032-NEXT: v_mov_b32_e32 v2, 0 3431; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3432; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 3433; GFX1032-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] 3434; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3435; GFX1032-NEXT: buffer_gl0_inv 3436; GFX1032-NEXT: .LBB18_2: 3437; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 3438; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 3439; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 3440; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 3441; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc_lo 3442; GFX1032-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc_lo 3443; GFX1032-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[2:3], v[0:1] 3444; GFX1032-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo 3445; GFX1032-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo 3446; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 3447; GFX1032-NEXT: s_mov_b32 s2, -1 3448; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3449; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 3450; GFX1032-NEXT: s_endpgm 3451entry: 3452 %old = atomicrmw max i64 addrspace(3)* @local_var64, i64 5 acq_rel 3453 store i64 %old, i64 addrspace(1)* %out 3454 ret void 3455} 3456 3457define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) { 3458; 3459; 3460; GFX7LESS-LABEL: min_i32_varying: 3461; GFX7LESS: ; %bb.0: ; %entry 3462; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 3463; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 3464; GFX7LESS-NEXT: s_mov_b32 m0, -1 3465; 
GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3466; GFX7LESS-NEXT: ds_min_rtn_i32 v0, v1, v0 3467; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3468; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 3469; GFX7LESS-NEXT: s_mov_b32 s2, -1 3470; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 3471; GFX7LESS-NEXT: s_endpgm 3472; 3473; GFX8-LABEL: min_i32_varying: 3474; GFX8: ; %bb.0: ; %entry 3475; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3476; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 3477; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 3478; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 3479; GFX8-NEXT: v_bfrev_b32_e32 v1, -2 3480; GFX8-NEXT: s_mov_b64 exec, s[2:3] 3481; GFX8-NEXT: v_mov_b32_e32 v2, v0 3482; GFX8-NEXT: s_not_b64 exec, exec 3483; GFX8-NEXT: v_bfrev_b32_e32 v2, -2 3484; GFX8-NEXT: s_not_b64 exec, exec 3485; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 3486; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3487; GFX8-NEXT: s_nop 1 3488; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 3489; GFX8-NEXT: s_nop 1 3490; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 3491; GFX8-NEXT: s_nop 1 3492; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 3493; GFX8-NEXT: s_nop 1 3494; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 3495; GFX8-NEXT: s_nop 1 3496; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 3497; GFX8-NEXT: v_readlane_b32 s4, v2, 63 3498; GFX8-NEXT: s_nop 0 3499; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 3500; GFX8-NEXT: s_mov_b64 exec, s[2:3] 3501; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 3502; GFX8-NEXT: ; implicit-def: $vgpr0 3503; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 3504; GFX8-NEXT: s_cbranch_execz .LBB19_2 3505; GFX8-NEXT: ; %bb.1: 3506; GFX8-NEXT: v_mov_b32_e32 v0, 0 3507; GFX8-NEXT: v_mov_b32_e32 v3, s4 3508; GFX8-NEXT: s_mov_b32 m0, -1 3509; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3510; 
GFX8-NEXT: ds_min_rtn_i32 v0, v0, v3 3511; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3512; GFX8-NEXT: .LBB19_2: 3513; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 3514; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3515; GFX8-NEXT: v_readfirstlane_b32 s2, v0 3516; GFX8-NEXT: v_mov_b32_e32 v0, v1 3517; GFX8-NEXT: v_min_i32_e32 v0, s2, v0 3518; GFX8-NEXT: s_mov_b32 s3, 0xf000 3519; GFX8-NEXT: s_mov_b32 s2, -1 3520; GFX8-NEXT: s_nop 0 3521; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 3522; GFX8-NEXT: s_endpgm 3523; 3524; GFX9-LABEL: min_i32_varying: 3525; GFX9: ; %bb.0: ; %entry 3526; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3527; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 3528; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 3529; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 3530; GFX9-NEXT: v_bfrev_b32_e32 v1, -2 3531; GFX9-NEXT: s_mov_b64 exec, s[2:3] 3532; GFX9-NEXT: v_mov_b32_e32 v2, v0 3533; GFX9-NEXT: s_not_b64 exec, exec 3534; GFX9-NEXT: v_bfrev_b32_e32 v2, -2 3535; GFX9-NEXT: s_not_b64 exec, exec 3536; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 3537; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3538; GFX9-NEXT: s_nop 1 3539; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 3540; GFX9-NEXT: s_nop 1 3541; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 3542; GFX9-NEXT: s_nop 1 3543; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 3544; GFX9-NEXT: s_nop 1 3545; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 3546; GFX9-NEXT: s_nop 1 3547; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 3548; GFX9-NEXT: v_readlane_b32 s4, v2, 63 3549; GFX9-NEXT: s_nop 0 3550; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 3551; GFX9-NEXT: s_mov_b64 exec, s[2:3] 3552; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 3553; GFX9-NEXT: ; implicit-def: $vgpr0 3554; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 3555; GFX9-NEXT: s_cbranch_execz 
.LBB19_2 3556; GFX9-NEXT: ; %bb.1: 3557; GFX9-NEXT: v_mov_b32_e32 v0, 0 3558; GFX9-NEXT: v_mov_b32_e32 v3, s4 3559; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3560; GFX9-NEXT: ds_min_rtn_i32 v0, v0, v3 3561; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3562; GFX9-NEXT: .LBB19_2: 3563; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 3564; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3565; GFX9-NEXT: v_readfirstlane_b32 s2, v0 3566; GFX9-NEXT: v_mov_b32_e32 v0, v1 3567; GFX9-NEXT: v_min_i32_e32 v0, s2, v0 3568; GFX9-NEXT: s_mov_b32 s3, 0xf000 3569; GFX9-NEXT: s_mov_b32 s2, -1 3570; GFX9-NEXT: s_nop 0 3571; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 3572; GFX9-NEXT: s_endpgm 3573; 3574; GFX1064-LABEL: min_i32_varying: 3575; GFX1064: ; %bb.0: ; %entry 3576; GFX1064-NEXT: v_mov_b32_e32 v1, v0 3577; GFX1064-NEXT: s_not_b64 exec, exec 3578; GFX1064-NEXT: v_bfrev_b32_e32 v1, -2 3579; GFX1064-NEXT: s_not_b64 exec, exec 3580; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 3581; GFX1064-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 3582; GFX1064-NEXT: v_bfrev_b32_e32 v3, -2 3583; GFX1064-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 3584; GFX1064-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 3585; GFX1064-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 3586; GFX1064-NEXT: v_mov_b32_e32 v2, v1 3587; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 3588; GFX1064-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 3589; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 3590; GFX1064-NEXT: v_mov_b32_e32 v2, s4 3591; GFX1064-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 3592; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 3593; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 3594; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 3595; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3596; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 3597; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 3598; 
GFX1064-NEXT: v_writelane_b32 v3, s4, 16 3599; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 3600; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3601; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 3602; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 3603; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 3604; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 3605; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 3606; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 3607; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 3608; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 3609; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 3610; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3611; GFX1064-NEXT: s_mov_b32 s2, -1 3612; GFX1064-NEXT: ; implicit-def: $vgpr0 3613; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 3614; GFX1064-NEXT: s_cbranch_execz .LBB19_2 3615; GFX1064-NEXT: ; %bb.1: 3616; GFX1064-NEXT: v_mov_b32_e32 v0, 0 3617; GFX1064-NEXT: v_mov_b32_e32 v4, s7 3618; GFX1064-NEXT: s_mov_b32 s3, s7 3619; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3620; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 3621; GFX1064-NEXT: ds_min_rtn_i32 v0, v0, v4 3622; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3623; GFX1064-NEXT: buffer_gl0_inv 3624; GFX1064-NEXT: .LBB19_2: 3625; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 3626; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 3627; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 3628; GFX1064-NEXT: v_mov_b32_e32 v0, v3 3629; GFX1064-NEXT: v_min_i32_e32 v0, s3, v0 3630; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 3631; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3632; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 3633; GFX1064-NEXT: s_endpgm 3634; 3635; GFX1032-LABEL: min_i32_varying: 3636; GFX1032: ; %bb.0: ; %entry 3637; GFX1032-NEXT: v_mov_b32_e32 v1, v0 3638; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 3639; GFX1032-NEXT: v_bfrev_b32_e32 v1, -2 3640; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 3641; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3642; GFX1032-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 3643; GFX1032-NEXT: v_min_i32_dpp v1, 
v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 3644; GFX1032-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 3645; GFX1032-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 3646; GFX1032-NEXT: v_mov_b32_e32 v2, v1 3647; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 3648; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3649; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3650; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3651; GFX1032-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 3652; GFX1032-NEXT: v_bfrev_b32_e32 v3, -2 3653; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 3654; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 3655; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 3656; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3657; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3658; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3659; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 3660; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3661; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 3662; GFX1032-NEXT: s_mov_b32 s2, -1 3663; GFX1032-NEXT: ; implicit-def: $vgpr0 3664; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 3665; GFX1032-NEXT: s_cbranch_execz .LBB19_2 3666; GFX1032-NEXT: ; %bb.1: 3667; GFX1032-NEXT: v_mov_b32_e32 v0, 0 3668; GFX1032-NEXT: v_mov_b32_e32 v4, s4 3669; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3670; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 3671; GFX1032-NEXT: ds_min_rtn_i32 v0, v0, v4 3672; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3673; GFX1032-NEXT: buffer_gl0_inv 3674; GFX1032-NEXT: .LBB19_2: 3675; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 3676; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 3677; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 3678; GFX1032-NEXT: v_mov_b32_e32 v0, v3 3679; GFX1032-NEXT: v_min_i32_e32 v0, s3, v0 3680; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 3681; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3682; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 3683; GFX1032-NEXT: s_endpgm 3684entry: 3685 %lane = call i32 
@llvm.amdgcn.workitem.id.x() 3686 %old = atomicrmw min i32 addrspace(3)* @local_var32, i32 %lane acq_rel 3687 store i32 %old, i32 addrspace(1)* %out 3688 ret void 3689} 3690 3691define amdgpu_kernel void @min_i64_constant(i64 addrspace(1)* %out) { 3692; 3693; 3694; GFX7LESS-LABEL: min_i64_constant: 3695; GFX7LESS: ; %bb.0: ; %entry 3696; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 3697; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 3698; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 3699; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3700; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 3701; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc 3702; GFX7LESS-NEXT: s_cbranch_execz .LBB20_2 3703; GFX7LESS-NEXT: ; %bb.1: 3704; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0 3705; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5 3706; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 3707; GFX7LESS-NEXT: s_mov_b32 m0, -1 3708; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3709; GFX7LESS-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] 3710; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3711; GFX7LESS-NEXT: .LBB20_2: 3712; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] 3713; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3714; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 3715; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 3716; GFX7LESS-NEXT: v_bfrev_b32_e32 v1, -2 3717; GFX7LESS-NEXT: s_mov_b32 s2, -1 3718; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 3719; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc 3720; GFX7LESS-NEXT: v_mov_b32_e32 v2, s5 3721; GFX7LESS-NEXT: v_mov_b32_e32 v3, s4 3722; GFX7LESS-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] 3723; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 3724; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 3725; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 3726; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 3727; GFX7LESS-NEXT: s_endpgm 3728; 3729; GFX8-LABEL: min_i64_constant: 3730; GFX8: ; %bb.0: ; %entry 3731; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3732; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, 
exec_lo, 0 3733; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 3734; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3735; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 3736; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 3737; GFX8-NEXT: s_cbranch_execz .LBB20_2 3738; GFX8-NEXT: ; %bb.1: 3739; GFX8-NEXT: v_mov_b32_e32 v0, 5 3740; GFX8-NEXT: v_mov_b32_e32 v2, 0 3741; GFX8-NEXT: v_mov_b32_e32 v1, 0 3742; GFX8-NEXT: s_mov_b32 m0, -1 3743; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3744; GFX8-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] 3745; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3746; GFX8-NEXT: .LBB20_2: 3747; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 3748; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3749; GFX8-NEXT: v_readfirstlane_b32 s4, v0 3750; GFX8-NEXT: v_bfrev_b32_e32 v0, -2 3751; GFX8-NEXT: v_readfirstlane_b32 s5, v1 3752; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc 3753; GFX8-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 3754; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] 3755; GFX8-NEXT: v_mov_b32_e32 v2, s5 3756; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 3757; GFX8-NEXT: v_mov_b32_e32 v2, s4 3758; GFX8-NEXT: s_mov_b32 s2, -1 3759; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 3760; GFX8-NEXT: s_mov_b32 s3, 0xf000 3761; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 3762; GFX8-NEXT: s_endpgm 3763; 3764; GFX9-LABEL: min_i64_constant: 3765; GFX9: ; %bb.0: ; %entry 3766; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3767; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3768; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 3769; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3770; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 3771; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 3772; GFX9-NEXT: s_cbranch_execz .LBB20_2 3773; GFX9-NEXT: ; %bb.1: 3774; GFX9-NEXT: v_mov_b32_e32 v0, 5 3775; GFX9-NEXT: v_mov_b32_e32 v1, 0 3776; GFX9-NEXT: v_mov_b32_e32 v2, 0 3777; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3778; GFX9-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] 3779; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3780; GFX9-NEXT: .LBB20_2: 3781; GFX9-NEXT: s_or_b64 
exec, exec, s[2:3] 3782; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3783; GFX9-NEXT: v_readfirstlane_b32 s4, v0 3784; GFX9-NEXT: v_bfrev_b32_e32 v0, -2 3785; GFX9-NEXT: v_readfirstlane_b32 s5, v1 3786; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc 3787; GFX9-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 3788; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] 3789; GFX9-NEXT: v_mov_b32_e32 v2, s5 3790; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 3791; GFX9-NEXT: v_mov_b32_e32 v2, s4 3792; GFX9-NEXT: s_mov_b32 s2, -1 3793; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 3794; GFX9-NEXT: s_mov_b32 s3, 0xf000 3795; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 3796; GFX9-NEXT: s_endpgm 3797; 3798; GFX1064-LABEL: min_i64_constant: 3799; GFX1064: ; %bb.0: ; %entry 3800; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3801; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3802; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 3803; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3804; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 3805; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc 3806; GFX1064-NEXT: s_cbranch_execz .LBB20_2 3807; GFX1064-NEXT: ; %bb.1: 3808; GFX1064-NEXT: v_mov_b32_e32 v0, 5 3809; GFX1064-NEXT: v_mov_b32_e32 v1, 0 3810; GFX1064-NEXT: v_mov_b32_e32 v2, 0 3811; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3812; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 3813; GFX1064-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] 3814; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3815; GFX1064-NEXT: buffer_gl0_inv 3816; GFX1064-NEXT: .LBB20_2: 3817; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 3818; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] 3819; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 3820; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 3821; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc 3822; GFX1064-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 3823; GFX1064-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1] 3824; GFX1064-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc 3825; GFX1064-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc 3826; 
GFX1064-NEXT: s_mov_b32 s2, -1 3827; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 3828; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3829; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 3830; GFX1064-NEXT: s_endpgm 3831; 3832; GFX1032-LABEL: min_i64_constant: 3833; GFX1032: ; %bb.0: ; %entry 3834; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3835; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3836; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 3837; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 3838; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo 3839; GFX1032-NEXT: s_cbranch_execz .LBB20_2 3840; GFX1032-NEXT: ; %bb.1: 3841; GFX1032-NEXT: v_mov_b32_e32 v0, 5 3842; GFX1032-NEXT: v_mov_b32_e32 v1, 0 3843; GFX1032-NEXT: v_mov_b32_e32 v2, 0 3844; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3845; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 3846; GFX1032-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] 3847; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3848; GFX1032-NEXT: buffer_gl0_inv 3849; GFX1032-NEXT: .LBB20_2: 3850; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 3851; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 3852; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 3853; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 3854; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc_lo 3855; GFX1032-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc_lo 3856; GFX1032-NEXT: v_cmp_lt_i64_e32 vcc_lo, s[2:3], v[0:1] 3857; GFX1032-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo 3858; GFX1032-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo 3859; GFX1032-NEXT: s_mov_b32 s2, -1 3860; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 3861; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3862; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 3863; GFX1032-NEXT: s_endpgm 3864entry: 3865 %old = atomicrmw min i64 addrspace(3)* @local_var64, i64 5 acq_rel 3866 store i64 %old, i64 addrspace(1)* %out 3867 ret void 3868} 3869 3870define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) { 3871; 3872; 3873; GFX7LESS-LABEL: umax_i32_varying: 3874; GFX7LESS: ; %bb.0: 
; %entry 3875; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 3876; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 3877; GFX7LESS-NEXT: s_mov_b32 m0, -1 3878; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3879; GFX7LESS-NEXT: ds_max_rtn_u32 v0, v1, v0 3880; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3881; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 3882; GFX7LESS-NEXT: s_mov_b32 s2, -1 3883; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 3884; GFX7LESS-NEXT: s_endpgm 3885; 3886; GFX8-LABEL: umax_i32_varying: 3887; GFX8: ; %bb.0: ; %entry 3888; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3889; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 3890; GFX8-NEXT: v_mov_b32_e32 v1, 0 3891; GFX8-NEXT: s_mov_b64 exec, s[2:3] 3892; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 3893; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 3894; GFX8-NEXT: v_mov_b32_e32 v2, v0 3895; GFX8-NEXT: s_not_b64 exec, exec 3896; GFX8-NEXT: v_mov_b32_e32 v2, 0 3897; GFX8-NEXT: s_not_b64 exec, exec 3898; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 3899; GFX8-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 3900; GFX8-NEXT: s_nop 1 3901; GFX8-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 3902; GFX8-NEXT: s_nop 1 3903; GFX8-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 3904; GFX8-NEXT: s_nop 1 3905; GFX8-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 3906; GFX8-NEXT: s_nop 1 3907; GFX8-NEXT: v_max_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 3908; GFX8-NEXT: s_nop 1 3909; GFX8-NEXT: v_max_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 3910; GFX8-NEXT: v_readlane_b32 s4, v2, 63 3911; GFX8-NEXT: s_nop 0 3912; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 3913; GFX8-NEXT: s_mov_b64 exec, s[2:3] 3914; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 3915; GFX8-NEXT: ; implicit-def: $vgpr0 3916; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 3917; GFX8-NEXT: s_cbranch_execz 
.LBB21_2 3918; GFX8-NEXT: ; %bb.1: 3919; GFX8-NEXT: v_mov_b32_e32 v0, 0 3920; GFX8-NEXT: v_mov_b32_e32 v3, s4 3921; GFX8-NEXT: s_mov_b32 m0, -1 3922; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3923; GFX8-NEXT: ds_max_rtn_u32 v0, v0, v3 3924; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3925; GFX8-NEXT: .LBB21_2: 3926; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 3927; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3928; GFX8-NEXT: v_readfirstlane_b32 s2, v0 3929; GFX8-NEXT: v_mov_b32_e32 v0, v1 3930; GFX8-NEXT: v_max_u32_e32 v0, s2, v0 3931; GFX8-NEXT: s_mov_b32 s3, 0xf000 3932; GFX8-NEXT: s_mov_b32 s2, -1 3933; GFX8-NEXT: s_nop 0 3934; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 3935; GFX8-NEXT: s_endpgm 3936; 3937; GFX9-LABEL: umax_i32_varying: 3938; GFX9: ; %bb.0: ; %entry 3939; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3940; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 3941; GFX9-NEXT: v_mov_b32_e32 v1, 0 3942; GFX9-NEXT: s_mov_b64 exec, s[2:3] 3943; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 3944; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 3945; GFX9-NEXT: v_mov_b32_e32 v2, v0 3946; GFX9-NEXT: s_not_b64 exec, exec 3947; GFX9-NEXT: v_mov_b32_e32 v2, 0 3948; GFX9-NEXT: s_not_b64 exec, exec 3949; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 3950; GFX9-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 3951; GFX9-NEXT: s_nop 1 3952; GFX9-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 3953; GFX9-NEXT: s_nop 1 3954; GFX9-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 3955; GFX9-NEXT: s_nop 1 3956; GFX9-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 3957; GFX9-NEXT: s_nop 1 3958; GFX9-NEXT: v_max_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 3959; GFX9-NEXT: s_nop 1 3960; GFX9-NEXT: v_max_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 3961; GFX9-NEXT: v_readlane_b32 s4, v2, 63 3962; GFX9-NEXT: s_nop 0 3963; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 
row_mask:0xf bank_mask:0xf 3964; GFX9-NEXT: s_mov_b64 exec, s[2:3] 3965; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 3966; GFX9-NEXT: ; implicit-def: $vgpr0 3967; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 3968; GFX9-NEXT: s_cbranch_execz .LBB21_2 3969; GFX9-NEXT: ; %bb.1: 3970; GFX9-NEXT: v_mov_b32_e32 v0, 0 3971; GFX9-NEXT: v_mov_b32_e32 v3, s4 3972; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3973; GFX9-NEXT: ds_max_rtn_u32 v0, v0, v3 3974; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3975; GFX9-NEXT: .LBB21_2: 3976; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 3977; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3978; GFX9-NEXT: v_readfirstlane_b32 s2, v0 3979; GFX9-NEXT: v_mov_b32_e32 v0, v1 3980; GFX9-NEXT: v_max_u32_e32 v0, s2, v0 3981; GFX9-NEXT: s_mov_b32 s3, 0xf000 3982; GFX9-NEXT: s_mov_b32 s2, -1 3983; GFX9-NEXT: s_nop 0 3984; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 3985; GFX9-NEXT: s_endpgm 3986; 3987; GFX1064-LABEL: umax_i32_varying: 3988; GFX1064: ; %bb.0: ; %entry 3989; GFX1064-NEXT: v_mov_b32_e32 v1, v0 3990; GFX1064-NEXT: s_not_b64 exec, exec 3991; GFX1064-NEXT: v_mov_b32_e32 v1, 0 3992; GFX1064-NEXT: s_not_b64 exec, exec 3993; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 3994; GFX1064-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 3995; GFX1064-NEXT: v_mov_b32_e32 v3, 0 3996; GFX1064-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 3997; GFX1064-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 3998; GFX1064-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 3999; GFX1064-NEXT: v_mov_b32_e32 v2, v1 4000; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 4001; GFX1064-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 4002; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 4003; GFX1064-NEXT: v_mov_b32_e32 v2, s4 4004; GFX1064-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 4005; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 
4006; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 4007; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 4008; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4009; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 4010; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 4011; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 4012; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 4013; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4014; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 4015; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 4016; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 4017; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 4018; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 4019; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 4020; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 4021; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 4022; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 4023; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4024; GFX1064-NEXT: s_mov_b32 s2, -1 4025; GFX1064-NEXT: ; implicit-def: $vgpr0 4026; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 4027; GFX1064-NEXT: s_cbranch_execz .LBB21_2 4028; GFX1064-NEXT: ; %bb.1: 4029; GFX1064-NEXT: v_mov_b32_e32 v0, 0 4030; GFX1064-NEXT: v_mov_b32_e32 v4, s7 4031; GFX1064-NEXT: s_mov_b32 s3, s7 4032; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4033; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 4034; GFX1064-NEXT: ds_max_rtn_u32 v0, v0, v4 4035; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 4036; GFX1064-NEXT: buffer_gl0_inv 4037; GFX1064-NEXT: .LBB21_2: 4038; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 4039; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 4040; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 4041; GFX1064-NEXT: v_mov_b32_e32 v0, v3 4042; GFX1064-NEXT: v_max_u32_e32 v0, s3, v0 4043; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 4044; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 4045; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 4046; GFX1064-NEXT: s_endpgm 4047; 4048; GFX1032-LABEL: umax_i32_varying: 4049; GFX1032: ; %bb.0: ; %entry 4050; GFX1032-NEXT: v_mov_b32_e32 v1, v0 4051; GFX1032-NEXT: 
s_not_b32 exec_lo, exec_lo 4052; GFX1032-NEXT: v_mov_b32_e32 v1, 0 4053; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 4054; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 4055; GFX1032-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 4056; GFX1032-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 4057; GFX1032-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 4058; GFX1032-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 4059; GFX1032-NEXT: v_mov_b32_e32 v2, v1 4060; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 4061; GFX1032-NEXT: s_mov_b32 exec_lo, s2 4062; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4063; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 4064; GFX1032-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 4065; GFX1032-NEXT: v_mov_b32_e32 v3, 0 4066; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 4067; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 4068; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 4069; GFX1032-NEXT: s_mov_b32 exec_lo, s2 4070; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4071; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 4072; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 4073; GFX1032-NEXT: s_mov_b32 exec_lo, s2 4074; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 4075; GFX1032-NEXT: s_mov_b32 s2, -1 4076; GFX1032-NEXT: ; implicit-def: $vgpr0 4077; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 4078; GFX1032-NEXT: s_cbranch_execz .LBB21_2 4079; GFX1032-NEXT: ; %bb.1: 4080; GFX1032-NEXT: v_mov_b32_e32 v0, 0 4081; GFX1032-NEXT: v_mov_b32_e32 v4, s4 4082; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4083; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 4084; GFX1032-NEXT: ds_max_rtn_u32 v0, v0, v4 4085; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 4086; GFX1032-NEXT: buffer_gl0_inv 4087; GFX1032-NEXT: .LBB21_2: 4088; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 4089; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 4090; 
GFX1032-NEXT: v_readfirstlane_b32 s3, v0
; GFX1032-NEXT: v_mov_b32_e32 v0, v3
; GFX1032-NEXT: v_max_u32_e32 v0, s3, v0
; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX1032-NEXT: s_endpgm
entry:
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %old = atomicrmw umax i32 addrspace(3)* @local_var32, i32 %lane acq_rel
  store i32 %old, i32 addrspace(1)* %out
  ret void
}

; umax_i64_constant performs an acq_rel atomic unsigned-max of the uniform
; constant 5 into the 64-bit LDS variable @local_var64 and stores the returned
; old value to %out.
; In every checked configuration the ds_max_rtn_u64 is issued only under the
; "mbcnt result == 0" condition. Its result is broadcast with
; v_readfirstlane_b32, and each lane then combines it with either 0 or 5
; (picked by v_cndmask on that same condition) through an unsigned 64-bit
; compare-and-select. 0 is presumably the identity value for unsigned max.
define amdgpu_kernel void @umax_i64_constant(i64 addrspace(1)* %out) {
;
;
; GFX7LESS-LABEL: umax_i64_constant:
; GFX7LESS: ; %bb.0: ; %entry
; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX7LESS-NEXT: s_cbranch_execz .LBB22_2
; GFX7LESS-NEXT: ; %bb.1:
; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0
; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5
; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0
; GFX7LESS-NEXT: s_mov_b32 m0, -1
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1]
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: .LBB22_2:
; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0
; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1
; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0
; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4
; GFX7LESS-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[0:1]
; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX7LESS-NEXT: v_mov_b32_e32 v1, s5
; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX7LESS-NEXT: s_endpgm
;
; GFX8-LABEL: umax_i64_constant:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX8-NEXT: s_cbranch_execz .LBB22_2
; GFX8-NEXT: ; %bb.1:
; GFX8-NEXT: v_mov_b32_e32 v0, 5
; GFX8-NEXT: v_mov_b32_e32 v2, 0
; GFX8-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NEXT: s_mov_b32 m0, -1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1]
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: .LBB22_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s2, v0
; GFX8-NEXT: v_readfirstlane_b32 s3, v1
; GFX8-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc
; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1]
; GFX8-NEXT: v_mov_b32_e32 v2, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: umax_i64_constant:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT: s_cbranch_execz .LBB22_2
; GFX9-NEXT: ; %bb.1:
; GFX9-NEXT: v_mov_b32_e32 v0, 5
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: .LBB22_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s2, v0
; GFX9-NEXT: v_readfirstlane_b32 s3, v1
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc
; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1]
; GFX9-NEXT: v_mov_b32_e32 v2, s2
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: umax_i64_constant:
; GFX1064: ; %bb.0: ; %entry
; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB22_2
; GFX1064-NEXT: ; %bb.1:
; GFX1064-NEXT: v_mov_b32_e32 v0, 5
; GFX1064-NEXT: v_mov_b32_e32 v1, 0
; GFX1064-NEXT: v_mov_b32_e32 v2, 0
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1064-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1]
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: .LBB22_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX1064-NEXT: v_readfirstlane_b32 s2, v0
; GFX1064-NEXT: v_readfirstlane_b32 s3, v1
; GFX1064-NEXT: v_mov_b32_e32 v1, 0
; GFX1064-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc
; GFX1064-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1]
; GFX1064-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc
; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, s3, vcc
; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
; GFX1064-NEXT: s_mov_b32 s2, -1
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: umax_i64_constant:
; GFX1032: ; %bb.0: ; %entry
; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB22_2
; GFX1032-NEXT: ; %bb.1:
; GFX1032-NEXT: v_mov_b32_e32 v0, 5
; GFX1032-NEXT: v_mov_b32_e32 v1, 0
; GFX1032-NEXT: v_mov_b32_e32 v2, 0
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1032-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1]
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: .LBB22_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2
; GFX1032-NEXT: v_readfirstlane_b32 s2, v0
; GFX1032-NEXT: v_readfirstlane_b32 s3, v1
; GFX1032-NEXT: v_mov_b32_e32 v1, 0
; GFX1032-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc_lo
; GFX1032-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[2:3], v[0:1]
; GFX1032-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo
; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, s3, vcc_lo
; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
; GFX1032-NEXT: s_mov_b32 s2, -1
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX1032-NEXT: s_endpgm
entry:
  %old = atomicrmw umax i64 addrspace(3)* @local_var64, i64 5 acq_rel
  store i64 %old, i64 addrspace(1)* %out
  ret void
}

define amdgpu_kernel
void @umin_i32_varying(i32 addrspace(1)* %out) {
; umin_i32_varying performs an acq_rel atomic unsigned-min of the per-lane
; value returned by llvm.amdgcn.workitem.id.x() into the 32-bit LDS variable
; @local_var32 and stores the returned old value to %out.
; The GFX7LESS checks expect a plain per-lane ds_min_rtn_u32 with no
; cross-lane sequence. The GFX8, GFX9, GFX1064 and GFX1032 checks expect
; inactive lanes to be seeded with -1, a v_min_u32_dpp cross-lane sequence,
; a single ds_min_rtn_u32 issued under the "mbcnt result == 0" condition, and
; a final v_min_u32 that combines the v_readfirstlane_b32 broadcast with the
; per-lane value produced by the DPP sequence.
;
;
; GFX7LESS-LABEL: umin_i32_varying:
; GFX7LESS: ; %bb.0: ; %entry
; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0
; GFX7LESS-NEXT: s_mov_b32 m0, -1
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: ds_min_rtn_u32 v0, v1, v0
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX7LESS-NEXT: s_endpgm
;
; GFX8-LABEL: umin_i32_varying:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3
; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX8-NEXT: v_mov_b32_e32 v1, -1
; GFX8-NEXT: s_mov_b64 exec, s[2:3]
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_not_b64 exec, exec
; GFX8-NEXT: v_mov_b32_e32 v2, -1
; GFX8-NEXT: s_not_b64 exec, exec
; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX8-NEXT: s_nop 1
; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX8-NEXT: s_nop 1
; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX8-NEXT: s_nop 1
; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX8-NEXT: s_nop 1
; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
; GFX8-NEXT: s_nop 1
; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
; GFX8-NEXT: v_readlane_b32 s4, v2, 63
; GFX8-NEXT: s_nop 0
; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX8-NEXT: s_mov_b64 exec, s[2:3]
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
; GFX8-NEXT: ; implicit-def: $vgpr0
; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX8-NEXT: s_cbranch_execz .LBB23_2
; GFX8-NEXT: ; %bb.1:
; GFX8-NEXT: v_mov_b32_e32 v0, 0
; GFX8-NEXT: v_mov_b32_e32 v3, s4
; GFX8-NEXT: s_mov_b32 m0, -1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: ds_min_rtn_u32 v0, v0, v3
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: .LBB23_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s2, v0
; GFX8-NEXT: v_mov_b32_e32 v0, v1
; GFX8-NEXT: v_min_u32_e32 v0, s2, v0
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
; GFX8-NEXT: s_nop 0
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: umin_i32_varying:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3
; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX9-NEXT: v_mov_b32_e32 v1, -1
; GFX9-NEXT: s_mov_b64 exec, s[2:3]
; GFX9-NEXT: v_mov_b32_e32 v2, v0
; GFX9-NEXT: s_not_b64 exec, exec
; GFX9-NEXT: v_mov_b32_e32 v2, -1
; GFX9-NEXT: s_not_b64 exec, exec
; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX9-NEXT: s_nop 1
; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX9-NEXT: s_nop 1
; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX9-NEXT: s_nop 1
; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX9-NEXT: s_nop 1
; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
; GFX9-NEXT: s_nop 1
; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
; GFX9-NEXT: v_readlane_b32 s4, v2, 63
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX9-NEXT: s_mov_b64 exec, s[2:3]
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
; GFX9-NEXT: ; implicit-def: $vgpr0
; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT: s_cbranch_execz .LBB23_2
; GFX9-NEXT: ; %bb.1:
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v3, s4
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: ds_min_rtn_u32 v0, v0, v3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: .LBB23_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s2, v0
; GFX9-NEXT: v_mov_b32_e32 v0, v1
; GFX9-NEXT: v_min_u32_e32 v0, s2, v0
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: umin_i32_varying:
; GFX1064: ; %bb.0: ; %entry
; GFX1064-NEXT: v_mov_b32_e32 v1, v0
; GFX1064-NEXT: s_not_b64 exec, exec
; GFX1064-NEXT: v_mov_b32_e32 v1, -1
; GFX1064-NEXT: s_not_b64 exec, exec
; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX1064-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1064-NEXT: v_mov_b32_e32 v3, -1
; GFX1064-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1064-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1064-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX1064-NEXT: v_mov_b32_e32 v2, v1
; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1
; GFX1064-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1064-NEXT: v_readlane_b32 s4, v1, 31
; GFX1064-NEXT: v_mov_b32_e32 v2, s4
; GFX1064-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
; GFX1064-NEXT: v_readlane_b32 s4, v1, 15
; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1064-NEXT: s_mov_b64 exec, s[2:3]
; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX1064-NEXT: v_readlane_b32 s5, v1, 31
; GFX1064-NEXT: v_writelane_b32 v3, s4, 16
; GFX1064-NEXT: s_mov_b64 exec, s[2:3]
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX1064-NEXT: v_readlane_b32 s7, v1, 63
; GFX1064-NEXT: v_readlane_b32 s6, v1, 47
; GFX1064-NEXT: v_writelane_b32 v3, s5, 32
; GFX1064-NEXT: s_mov_b64 exec, s[2:3]
; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX1064-NEXT: v_writelane_b32 v3, s6, 48
; GFX1064-NEXT: s_mov_b64 exec, s[4:5]
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT: s_mov_b32 s2, -1
; GFX1064-NEXT: ; implicit-def: $vgpr0
; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB23_2
; GFX1064-NEXT: ; %bb.1:
; GFX1064-NEXT: v_mov_b32_e32 v0, 0
; GFX1064-NEXT: v_mov_b32_e32 v4, s7
; GFX1064-NEXT: s_mov_b32 s3, s7
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1064-NEXT: ds_min_rtn_u32 v0, v0, v4
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: .LBB23_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX1064-NEXT: v_readfirstlane_b32 s3, v0
; GFX1064-NEXT: v_mov_b32_e32 v0, v3
; GFX1064-NEXT: v_min_u32_e32 v0, s3, v0
; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: umin_i32_varying:
; GFX1032: ; %bb.0: ; %entry
; GFX1032-NEXT: v_mov_b32_e32 v1, v0
; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1032-NEXT: v_mov_b32_e32 v1, -1
; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1032-NEXT: s_or_saveexec_b32 s2, -1
; GFX1032-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT: v_mov_b32_e32 v2, v1
; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1
; GFX1032-NEXT: s_mov_b32 exec_lo, s2
; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1032-NEXT: s_or_saveexec_b32 s2, -1
; GFX1032-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1032-NEXT: v_mov_b32_e32 v3, -1
; GFX1032-NEXT: v_readlane_b32 s3, v1, 15
; GFX1032-NEXT: v_readlane_b32 s4, v1, 31
; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT: s_mov_b32 exec_lo, s2
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1032-NEXT: s_or_saveexec_b32 s2, -1
; GFX1032-NEXT: v_writelane_b32 v3, s3, 16
; GFX1032-NEXT: s_mov_b32 exec_lo, s2
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: s_mov_b32 s2, -1
; GFX1032-NEXT: ; implicit-def: $vgpr0
; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB23_2
; GFX1032-NEXT: ; %bb.1:
; GFX1032-NEXT: v_mov_b32_e32 v0, 0
; GFX1032-NEXT: v_mov_b32_e32 v4, s4
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1032-NEXT: ds_min_rtn_u32 v0, v0, v4
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: .LBB23_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3
; GFX1032-NEXT: v_readfirstlane_b32 s3, v0
; GFX1032-NEXT: v_mov_b32_e32
v0, v3
; GFX1032-NEXT: v_min_u32_e32 v0, s3, v0
; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX1032-NEXT: s_endpgm
entry:
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %old = atomicrmw umin i32 addrspace(3)* @local_var32, i32 %lane acq_rel
  store i32 %old, i32 addrspace(1)* %out
  ret void
}

; umin_i64_constant performs an acq_rel atomic unsigned-min of the uniform
; constant 5 into the 64-bit LDS variable @local_var64 and stores the returned
; old value to %out.
; In every checked configuration the ds_min_rtn_u64 is issued only under the
; "mbcnt result == 0" condition. Its result is broadcast with
; v_readfirstlane_b32, and each lane then combines it with either an all-ones
; value (-1 in both halves) or 5 (picked by v_cndmask on that same condition)
; through an unsigned 64-bit compare-and-select. All-ones is presumably the
; identity value for unsigned min.
define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) {
;
;
; GFX7LESS-LABEL: umin_i64_constant:
; GFX7LESS: ; %bb.0: ; %entry
; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX7LESS-NEXT: s_cbranch_execz .LBB24_2
; GFX7LESS-NEXT: ; %bb.1:
; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0
; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5
; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0
; GFX7LESS-NEXT: s_mov_b32 m0, -1
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1]
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: .LBB24_2:
; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0
; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1
; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc
; GFX7LESS-NEXT: v_mov_b32_e32 v2, s5
; GFX7LESS-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1]
; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4
; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX7LESS-NEXT: s_endpgm
;
; GFX8-LABEL: umin_i64_constant:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX8-NEXT: s_cbranch_execz .LBB24_2
; GFX8-NEXT: ; %bb.1:
; GFX8-NEXT: v_mov_b32_e32 v0, 5
; GFX8-NEXT: v_mov_b32_e32 v2, 0
; GFX8-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NEXT: s_mov_b32 m0, -1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1]
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: .LBB24_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s4, v0
; GFX8-NEXT: v_readfirstlane_b32 s5, v1
; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc
; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1]
; GFX8-NEXT: v_mov_b32_e32 v2, s5
; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
; GFX8-NEXT: v_mov_b32_e32 v2, s4
; GFX8-NEXT: s_mov_b32 s2, -1
; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: umin_i64_constant:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT: s_cbranch_execz .LBB24_2
; GFX9-NEXT: ; %bb.1:
; GFX9-NEXT: v_mov_b32_e32 v0, 5
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: .LBB24_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s4, v0
; GFX9-NEXT: v_readfirstlane_b32 s5, v1
; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc
; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1]
; GFX9-NEXT: v_mov_b32_e32 v2, s5
; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
; GFX9-NEXT: v_mov_b32_e32 v2, s4
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: umin_i64_constant:
; GFX1064: ; %bb.0: ; %entry
; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB24_2
; GFX1064-NEXT: ; %bb.1:
; GFX1064-NEXT: v_mov_b32_e32 v0, 5
; GFX1064-NEXT: v_mov_b32_e32 v1, 0
; GFX1064-NEXT: v_mov_b32_e32 v2, 0
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1064-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1]
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: .LBB24_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX1064-NEXT: v_readfirstlane_b32 s2, v0
; GFX1064-NEXT: v_readfirstlane_b32 s3, v1
; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
; GFX1064-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc
; GFX1064-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1]
; GFX1064-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc
; GFX1064-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc
; GFX1064-NEXT: s_mov_b32 s2, -1
; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: umin_i64_constant:
; GFX1032: ; %bb.0: ; %entry
; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB24_2
; GFX1032-NEXT: ; %bb.1:
; GFX1032-NEXT: v_mov_b32_e32 v0, 5
; GFX1032-NEXT: v_mov_b32_e32 v1, 0
; GFX1032-NEXT: v_mov_b32_e32 v2, 0
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1032-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1]
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: .LBB24_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2
; GFX1032-NEXT: v_readfirstlane_b32 s2, v0
; GFX1032-NEXT: v_readfirstlane_b32 s3, v1
; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
; GFX1032-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc_lo
; GFX1032-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[2:3], v[0:1]
; GFX1032-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo
; GFX1032-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo
; GFX1032-NEXT: s_mov_b32 s2, -1
; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX1032-NEXT: s_endpgm
entry:
  %old = atomicrmw umin i64 addrspace(3)* @local_var64, i64 5 acq_rel
  store i64 %old, i64 addrspace(1)* %out
  ret void
}