; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX7LESS %s
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064 %s
; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032 %s

; Intrinsic called by the *_varying tests below; its result is used as the
; atomicrmw operand there.
declare i32 @llvm.amdgcn.workitem.id.x()

; addrspace(3) globals that the atomicrmw operations in this file target
; (32-bit and 64-bit variants).
@local_var32 = addrspace(3) global i32 undef, align 4
@local_var64 = addrspace(3) global i64 undef, align 8

; Show what the atomic optimization pass will do for local pointers.

; atomicrmw add of the constant 5 to @local_var32 (acq_rel); the returned value
; is stored to %out. The expected code for every run issues one ds_add_rtn_u32
; under s_and_saveexec, adding 5 * s_bcnt1 of the saved exec mask, and then
; rebuilds the per-lane result from v_readfirstlane_b32 and the v_mbcnt count.
define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out) {
;
;
; GFX7LESS-LABEL: add_i32_constant:
; GFX7LESS: ; %bb.0: ; %entry
; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec
; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0
; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0
; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX7LESS-NEXT: ; implicit-def: $vgpr1
; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX7LESS-NEXT: s_cbranch_execz BB0_2
; GFX7LESS-NEXT: ; %bb.1:
; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX7LESS-NEXT: s_mul_i32 s2, s2, 5
; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo
; GFX7LESS-NEXT: v_mov_b32_e32 v2, s2
; GFX7LESS-NEXT: s_mov_b32 m0, -1
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: ds_add_rtn_u32 v1, v1, v2
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: BB0_2:
; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v1
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: v_mad_u32_u24 v0, v0, 5, s2
; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX7LESS-NEXT: s_endpgm
;
; GFX8-LABEL: add_i32_constant:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: s_mov_b64 s[2:3], exec
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX8-NEXT: ; implicit-def: $vgpr1
; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX8-NEXT: s_cbranch_execz BB0_2
; GFX8-NEXT: ; %bb.1:
; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX8-NEXT: s_mul_i32 s2, s2, 5
; GFX8-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo
; GFX8-NEXT: v_mov_b32_e32 v2, s2
; GFX8-NEXT: s_mov_b32 m0, -1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: ds_add_rtn_u32 v1, v1, v2
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: BB0_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s2, v1
; GFX8-NEXT: v_mad_u32_u24 v0, v0, 5, s2
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
; GFX8-NEXT: s_nop 1
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: add_i32_constant:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_mov_b64 s[2:3], exec
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: ; implicit-def: $vgpr1
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: s_cbranch_execz BB0_2
; GFX9-NEXT: ; %bb.1:
; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX9-NEXT: s_mul_i32 s2, s2, 5
; GFX9-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo
; GFX9-NEXT: v_mov_b32_e32 v2, s2
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: ds_add_rtn_u32 v1, v1, v2
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: BB0_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s2, v1
; GFX9-NEXT: v_mad_u32_u24 v0, v0, 5, s2
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: s_nop 1
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: add_i32_constant:
; GFX1064: ; %bb.0: ; %entry
; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1064-NEXT: s_mov_b64 s[2:3], exec
; GFX1064-NEXT: ; implicit-def: $vgpr1
; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0
; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s3, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1064-NEXT: s_cbranch_execz BB0_2
; GFX1064-NEXT: ; %bb.1:
; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX1064-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo
; GFX1064-NEXT: s_mul_i32 s2, s2, 5
; GFX1064-NEXT: v_mov_b32_e32 v2, s2
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1064-NEXT: ds_add_rtn_u32 v1, v1, v2
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: BB0_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX1064-NEXT: v_readfirstlane_b32 s2, v1
; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
; GFX1064-NEXT: v_mad_u32_u24 v0, v0, 5, s2
; GFX1064-NEXT: s_mov_b32 s2, -1
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: add_i32_constant:
; GFX1032: ; %bb.0: ; %entry
; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1032-NEXT: s_mov_b32 s3, exec_lo
; GFX1032-NEXT: ; implicit-def: $vgpr1
; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s3, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-NEXT: s_cbranch_execz BB0_2
; GFX1032-NEXT: ; %bb.1:
; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3
; GFX1032-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo
; GFX1032-NEXT: s_mul_i32 s3, s3, 5
; GFX1032-NEXT: v_mov_b32_e32 v2, s3
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1032-NEXT: ds_add_rtn_u32 v1, v1, v2
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: BB0_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2
; GFX1032-NEXT: v_readfirstlane_b32 s2, v1
; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
; GFX1032-NEXT: v_mad_u32_u24 v0, v0, 5, s2
; GFX1032-NEXT: s_mov_b32 s2, -1
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX1032-NEXT: s_endpgm
entry:
  %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 5 acq_rel
  store i32 %old, i32 addrspace(1)* %out
  ret void
}

; atomicrmw add of the kernel argument %additive to @local_var32 (acq_rel);
; the returned value is stored to %out. The expected code multiplies the loaded
; argument by s_bcnt1 of the saved exec mask for the single ds_add_rtn_u32, and
; rebuilds the per-lane result with v_mul_lo_u32 plus an add of the
; v_readfirstlane_b32 value.
define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 %additive) {
;
;
; GFX7LESS-LABEL: add_i32_uniform:
; GFX7LESS: ; %bb.0: ; %entry
; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec
; GFX7LESS-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; GFX7LESS-NEXT: s_load_dword s0, s[0:1], 0xb
; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0
; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0
; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX7LESS-NEXT: ; implicit-def: $vgpr1
; GFX7LESS-NEXT: s_and_saveexec_b64 s[6:7], vcc
; GFX7LESS-NEXT: s_cbranch_execz BB1_2
; GFX7LESS-NEXT: ; %bb.1:
; GFX7LESS-NEXT: s_bcnt1_i32_b64 s1, s[2:3]
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: s_mul_i32 s1, s0, s1
; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo
; GFX7LESS-NEXT: v_mov_b32_e32 v2, s1
; GFX7LESS-NEXT: s_mov_b32 m0, -1
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: ds_add_rtn_u32 v1, v1, v2
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: BB1_2:
; GFX7LESS-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: v_readfirstlane_b32 s1, v1
; GFX7LESS-NEXT: v_mul_lo_u32 v0, s0, v0
; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000
; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s1, v0
; GFX7LESS-NEXT: s_mov_b32 s6, -1
; GFX7LESS-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX7LESS-NEXT: s_endpgm
;
; GFX8-LABEL: add_i32_uniform:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX8-NEXT: s_load_dword s0, s[0:1], 0x2c
; GFX8-NEXT: s_mov_b64 s[2:3], exec
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX8-NEXT: ; implicit-def: $vgpr1
; GFX8-NEXT: s_and_saveexec_b64 s[6:7], vcc
; GFX8-NEXT: s_cbranch_execz BB1_2
; GFX8-NEXT: ; %bb.1:
; GFX8-NEXT: s_bcnt1_i32_b64 s1, s[2:3]
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_mul_i32 s1, s0, s1
; GFX8-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo
; GFX8-NEXT: v_mov_b32_e32 v2, s1
; GFX8-NEXT: s_mov_b32 m0, -1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: ds_add_rtn_u32 v1, v1, v2
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: BB1_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mul_lo_u32 v0, s0, v0
; GFX8-NEXT: v_readfirstlane_b32 s0, v1
; GFX8-NEXT: s_mov_b32 s7, 0xf000
; GFX8-NEXT: s_mov_b32 s6, -1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: add_i32_uniform:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
; GFX9-NEXT: s_mov_b64 s[6:7], exec
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: ; implicit-def: $vgpr1
; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX9-NEXT: s_cbranch_execz BB1_2
; GFX9-NEXT: ; %bb.1:
; GFX9-NEXT: s_bcnt1_i32_b64 s3, s[6:7]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_mul_i32 s3, s2, s3
; GFX9-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo
; GFX9-NEXT: v_mov_b32_e32 v2, s3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: ds_add_rtn_u32 v1, v1, v2
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: BB1_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mul_lo_u32 v0, s2, v0
; GFX9-NEXT: v_readfirstlane_b32 s0, v1
; GFX9-NEXT: s_mov_b32 s7, 0xf000
; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: v_add_u32_e32 v0, s0, v0
; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: add_i32_uniform:
; GFX1064: ; %bb.0: ; %entry
; GFX1064-NEXT: s_clause 0x1
; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x2c
; GFX1064-NEXT: s_mov_b64 s[6:7], exec
; GFX1064-NEXT: ; implicit-def: $vgpr1
; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0
; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s7, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX1064-NEXT: s_cbranch_execz BB1_2
; GFX1064-NEXT: ; %bb.1:
; GFX1064-NEXT: s_bcnt1_i32_b64 s3, s[6:7]
; GFX1064-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: s_mul_i32 s3, s2, s3
; GFX1064-NEXT: v_mov_b32_e32 v2, s3
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1064-NEXT: ds_add_rtn_u32 v1, v1, v2
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: BB1_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: v_mul_lo_u32 v0, s2, v0
; GFX1064-NEXT: v_readfirstlane_b32 s0, v1
; GFX1064-NEXT: s_mov_b32 s7, 0x31016000
; GFX1064-NEXT: s_mov_b32 s6, -1
; GFX1064-NEXT: v_add_nc_u32_e32 v0, s0, v0
; GFX1064-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: add_i32_uniform:
; GFX1032: ; %bb.0: ; %entry
; GFX1032-NEXT: s_clause 0x1
; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x2c
; GFX1032-NEXT: s_mov_b32 s3, exec_lo
; GFX1032-NEXT: ; implicit-def: $vgpr1
; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s3, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1032-NEXT: s_cbranch_execz BB1_2
; GFX1032-NEXT: ; %bb.1:
; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s3
; GFX1032-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: s_mul_i32 s1, s2, s1
; GFX1032-NEXT: v_mov_b32_e32 v2, s1
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1032-NEXT: ds_add_rtn_u32 v1, v1, v2
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: BB1_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: v_mul_lo_u32 v0, s2, v0
; GFX1032-NEXT: v_readfirstlane_b32 s0, v1
; GFX1032-NEXT: s_mov_b32 s7, 0x31016000
; GFX1032-NEXT: s_mov_b32 s6, -1
; GFX1032-NEXT: v_add_nc_u32_e32 v0, s0, v0
; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX1032-NEXT: s_endpgm
entry:
  %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %additive acq_rel
  store i32 %old, i32 addrspace(1)* %out
  ret void
}

; atomicrmw add of the workitem id (a per-lane value) to @local_var32
; (acq_rel); the returned value is stored to %out. The GFX7LESS checks show the
; atomic issued directly (ds_add_rtn_u32 v0, v1, v0) with no exec-mask
; manipulation. The other runs show a chain of DPP adds followed by one
; ds_add_rtn_u32 under s_and_saveexec and a final add of the
; v_readfirstlane_b32 value.
define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out) {
;
;
; GFX7LESS-LABEL: add_i32_varying:
; GFX7LESS: ; %bb.0: ; %entry
; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo
; GFX7LESS-NEXT: s_mov_b32 m0, -1
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: ds_add_rtn_u32 v0, v1, v0
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX7LESS-NEXT: s_endpgm
;
; GFX8-LABEL: add_i32_varying:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX8-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NEXT: s_mov_b64 exec, s[2:3]
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX8-NEXT: s_not_b64 exec, exec
; GFX8-NEXT: v_mov_b32_e32 v2, 0
; GFX8-NEXT: s_not_b64 exec, exec
; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX8-NEXT: s_nop 1
; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX8-NEXT: s_nop 1
; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX8-NEXT: s_nop 1
; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX8-NEXT: s_nop 1
; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
; GFX8-NEXT: s_nop 1
; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
; GFX8-NEXT: v_readlane_b32 s4, v2, 63
; GFX8-NEXT: s_nop 0
; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX8-NEXT: s_mov_b64 exec, s[2:3]
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX8-NEXT: ; implicit-def: $vgpr0
; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX8-NEXT: s_cbranch_execz BB2_2
; GFX8-NEXT: ; %bb.1:
; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
; GFX8-NEXT: v_mov_b32_e32 v3, s4
; GFX8-NEXT: s_mov_b32 m0, -1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: ds_add_rtn_u32 v0, v0, v3
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: BB2_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s2, v0
; GFX8-NEXT: v_mov_b32_e32 v0, v1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
; GFX8-NEXT: s_nop 0
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: add_i32_varying:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, v0
; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_mov_b64 exec, s[2:3]
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX9-NEXT: s_not_b64 exec, exec
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_not_b64 exec, exec
; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX9-NEXT: s_nop 1
; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX9-NEXT: s_nop 1
; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX9-NEXT: s_nop 1
; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX9-NEXT: s_nop 1
; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
; GFX9-NEXT: s_nop 1
; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
; GFX9-NEXT: v_readlane_b32 s4, v2, 63
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX9-NEXT: s_mov_b64 exec, s[2:3]
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: ; implicit-def: $vgpr0
; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT: s_cbranch_execz BB2_2
; GFX9-NEXT: ; %bb.1:
; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
; GFX9-NEXT: v_mov_b32_e32 v3, s4
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: ds_add_rtn_u32 v0, v0, v3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: BB2_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s2, v0
; GFX9-NEXT: v_mov_b32_e32 v0, v1
; GFX9-NEXT: v_add_u32_e32 v0, s2, v0
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: add_i32_varying:
; GFX1064: ; %bb.0: ; %entry
; GFX1064-NEXT: v_mov_b32_e32 v1, v0
; GFX1064-NEXT: s_not_b64 exec, exec
; GFX1064-NEXT: v_mov_b32_e32 v1, 0
; GFX1064-NEXT: s_not_b64 exec, exec
; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064-NEXT: v_mov_b32_e32 v3, 0
; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064-NEXT: v_mov_b32_e32 v2, v1
; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1
; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1064-NEXT: v_readlane_b32 s4, v1, 31
; GFX1064-NEXT: v_mov_b32_e32 v2, s4
; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
; GFX1064-NEXT: v_readlane_b32 s4, v1, 15
; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1064-NEXT: s_mov_b64 exec, s[2:3]
; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX1064-NEXT: v_readlane_b32 s5, v1, 31
; GFX1064-NEXT: v_writelane_b32 v3, s4, 16
; GFX1064-NEXT: s_mov_b64 exec, s[2:3]
; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX1064-NEXT: v_readlane_b32 s7, v1, 63
; GFX1064-NEXT: v_readlane_b32 s6, v1, 47
; GFX1064-NEXT: v_writelane_b32 v3, s5, 32
; GFX1064-NEXT: s_mov_b64 exec, s[2:3]
; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0
; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX1064-NEXT: v_writelane_b32 v3, s6, 48
; GFX1064-NEXT: s_mov_b64 exec, s[4:5]
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT: s_mov_b32 s2, -1
; GFX1064-NEXT: ; implicit-def: $vgpr0
; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1064-NEXT: s_cbranch_execz BB2_2
; GFX1064-NEXT: ; %bb.1:
; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo
; GFX1064-NEXT: v_mov_b32_e32 v4, s7
; GFX1064-NEXT: s_mov_b32 s3, s7
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1064-NEXT: ds_add_rtn_u32 v0, v7, v4
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: BB2_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX1064-NEXT: v_readfirstlane_b32 s3, v0
; GFX1064-NEXT: v_mov_b32_e32 v0, v3
; GFX1064-NEXT: v_add_nc_u32_e32 v0, s3, v0
; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: add_i32_varying:
; GFX1032: ; %bb.0: ; %entry
; GFX1032-NEXT: v_mov_b32_e32 v1, v0
; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1032-NEXT: v_mov_b32_e32 v1, 0
; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1032-NEXT: s_or_saveexec_b32 s2, -1
; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032-NEXT: v_mov_b32_e32 v2, v1
; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1
; GFX1032-NEXT: s_mov_b32 exec_lo, s2
; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1032-NEXT: s_or_saveexec_b32 s2, -1
; GFX1032-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1032-NEXT: v_mov_b32_e32 v3, 0
; GFX1032-NEXT: v_readlane_b32 s3, v1, 15
; GFX1032-NEXT: v_readlane_b32 s4, v1, 31
; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT: s_mov_b32 exec_lo, s2
; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
; GFX1032-NEXT: s_or_saveexec_b32 s2, -1
; GFX1032-NEXT: v_writelane_b32 v3, s3, 16
; GFX1032-NEXT: s_mov_b32 exec_lo, s2
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: s_mov_b32 s2, -1
; GFX1032-NEXT: ; implicit-def: $vgpr0
; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
; GFX1032-NEXT: s_cbranch_execz BB2_2
; GFX1032-NEXT: ; %bb.1:
; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo
; GFX1032-NEXT: v_mov_b32_e32 v4, s4
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1032-NEXT: ds_add_rtn_u32 v0, v7, v4
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: BB2_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3
; GFX1032-NEXT: v_readfirstlane_b32 s3, v0
; GFX1032-NEXT: v_mov_b32_e32 v0, v3
; GFX1032-NEXT: v_add_nc_u32_e32 v0, s3, v0
; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX1032-NEXT: s_endpgm
entry:
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %lane acq_rel
  store i32 %old, i32 addrspace(1)* %out
  ret void
}

; Same atomicrmw add of the workitem id to @local_var32 as add_i32_varying, but
; the kernel takes no arguments and %old is never used. The checks expect the
; non-returning ds_add_u32 form, with no v_readfirstlane_b32 and no store.
define amdgpu_kernel void @add_i32_varying_nouse() {
; GFX7LESS-LABEL: add_i32_varying_nouse:
; GFX7LESS: ; %bb.0: ; %entry
; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo
; GFX7LESS-NEXT: s_mov_b32 m0, -1
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: ds_add_u32 v1, v0
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: s_endpgm
;
; GFX8-LABEL: add_i32_varying_nouse:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: v_mov_b32_e32 v1, v0
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX8-NEXT: s_not_b64 exec, exec
; GFX8-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NEXT: s_not_b64 exec, exec
; GFX8-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX8-NEXT: s_nop 1
; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX8-NEXT: s_nop 1
; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX8-NEXT: s_nop 1
; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX8-NEXT: s_nop 1
; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
; GFX8-NEXT: s_nop 1
; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
; GFX8-NEXT: v_readlane_b32 s2, v1, 63
; GFX8-NEXT: s_mov_b64 exec, s[0:1]
; GFX8-NEXT: s_mov_b32 s0, s2
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX8-NEXT: s_cbranch_execz BB3_2
; GFX8-NEXT: ; %bb.1:
; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: s_mov_b32 m0, -1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: ds_add_u32 v0, v2
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: BB3_2:
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: add_i32_varying_nouse:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: v_mov_b32_e32 v1, v0
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX9-NEXT: s_not_b64 exec, exec
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_not_b64 exec, exec
; GFX9-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX9-NEXT: s_nop 1
; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX9-NEXT: s_nop 1
; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX9-NEXT: s_nop 1
; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX9-NEXT: s_nop 1
; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
; GFX9-NEXT: s_nop 1
; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
; GFX9-NEXT: v_readlane_b32 s2, v1, 63
; GFX9-NEXT: s_mov_b64 exec, s[0:1]
; GFX9-NEXT: s_mov_b32 s0, s2
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT: s_cbranch_execz BB3_2
; GFX9-NEXT: ; %bb.1:
; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: ds_add_u32 v0, v2
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: BB3_2:
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: add_i32_varying_nouse:
; GFX1064: ; %bb.0: ; %entry
; GFX1064-NEXT: v_mov_b32_e32 v1, v0
; GFX1064-NEXT: s_not_b64 exec, exec
; GFX1064-NEXT: v_mov_b32_e32 v1, 0
; GFX1064-NEXT: s_not_b64 exec, exec
; GFX1064-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064-NEXT: v_mov_b32_e32 v2, v1
; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1
; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1064-NEXT: v_readlane_b32 s2, v1, 31
; GFX1064-NEXT: v_mov_b32_e32 v2, s2
; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
; GFX1064-NEXT: s_mov_b64 exec, s[0:1]
; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
; GFX1064-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064-NEXT: v_readlane_b32 s2, v1, 63
; GFX1064-NEXT: s_mov_b64 exec, s[0:1]
; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0
; GFX1064-NEXT: s_mov_b32 s0, s2
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-NEXT: s_cbranch_execz BB3_2
; GFX1064-NEXT: ; %bb.1:
; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
; GFX1064-NEXT: v_mov_b32_e32 v3, s0
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1064-NEXT: ds_add_u32 v0, v3
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: BB3_2:
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: add_i32_varying_nouse:
; GFX1032: ; %bb.0: ; %entry
; GFX1032-NEXT: v_mov_b32_e32 v1, v0
; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1032-NEXT: v_mov_b32_e32 v1, 0
; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1032-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032-NEXT: v_mov_b32_e32 v2, v1
; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1
; GFX1032-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1032-NEXT: v_readlane_b32 s1, v1, 31
; GFX1032-NEXT: s_mov_b32 exec_lo, s0
; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
; GFX1032-NEXT: s_mov_b32 s0, s1
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX1032-NEXT: s_cbranch_execz BB3_2
; GFX1032-NEXT: ; %bb.1:
; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
; GFX1032-NEXT: v_mov_b32_e32 v3, s0
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1032-NEXT: ds_add_u32 v0, v3
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: BB3_2:
; GFX1032-NEXT: s_endpgm
entry:
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %lane acq_rel
  ret void
}

define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) {
;
;
; GFX7LESS-LABEL: add_i64_constant:
; GFX7LESS: ; %bb.0: ; %entry
; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec
; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0
; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0
; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX7LESS-NEXT: ; implicit-def: $vgpr1_vgpr2
; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX7LESS-NEXT: s_cbranch_execz BB4_2
; GFX7LESS-NEXT: ; %bb.1:
; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0
; GFX7LESS-NEXT: s_mul_i32 s4, s4, 5
; GFX7LESS-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo
; GFX7LESS-NEXT: v_mov_b32_e32 v1, s4
; GFX7LESS-NEXT: s_mov_b32 m0, -1
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2]
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: BB4_2:
; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v1
; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v2
; GFX7LESS-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v0
763; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v0 764; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 765; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4 766; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s2, v0 767; GFX7LESS-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc 768; GFX7LESS-NEXT: s_mov_b32 s2, -1 769; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 770; GFX7LESS-NEXT: s_endpgm 771; 772; GFX8-LABEL: add_i64_constant: 773; GFX8: ; %bb.0: ; %entry 774; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 775; GFX8-NEXT: s_mov_b64 s[4:5], exec 776; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 777; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 778; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 779; GFX8-NEXT: ; implicit-def: $vgpr1_vgpr2 780; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 781; GFX8-NEXT: s_cbranch_execz BB4_2 782; GFX8-NEXT: ; %bb.1: 783; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 784; GFX8-NEXT: s_mul_i32 s4, s4, 5 785; GFX8-NEXT: v_mov_b32_e32 v1, s4 786; GFX8-NEXT: v_mov_b32_e32 v2, 0 787; GFX8-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 788; GFX8-NEXT: s_mov_b32 m0, -1 789; GFX8-NEXT: s_waitcnt lgkmcnt(0) 790; GFX8-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] 791; GFX8-NEXT: s_waitcnt lgkmcnt(0) 792; GFX8-NEXT: BB4_2: 793; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 794; GFX8-NEXT: s_waitcnt lgkmcnt(0) 795; GFX8-NEXT: v_readfirstlane_b32 s2, v1 796; GFX8-NEXT: v_readfirstlane_b32 s3, v2 797; GFX8-NEXT: v_mov_b32_e32 v1, s2 798; GFX8-NEXT: v_mov_b32_e32 v2, s3 799; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v0, 5, v[1:2] 800; GFX8-NEXT: s_mov_b32 s3, 0xf000 801; GFX8-NEXT: s_mov_b32 s2, -1 802; GFX8-NEXT: s_nop 2 803; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 804; GFX8-NEXT: s_endpgm 805; 806; GFX9-LABEL: add_i64_constant: 807; GFX9: ; %bb.0: ; %entry 808; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 809; GFX9-NEXT: s_mov_b64 s[4:5], exec 810; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 811; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 812; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 813; GFX9-NEXT: ; 
implicit-def: $vgpr1_vgpr2 814; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 815; GFX9-NEXT: s_cbranch_execz BB4_2 816; GFX9-NEXT: ; %bb.1: 817; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 818; GFX9-NEXT: s_mul_i32 s4, s4, 5 819; GFX9-NEXT: v_mov_b32_e32 v1, s4 820; GFX9-NEXT: v_mov_b32_e32 v2, 0 821; GFX9-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 822; GFX9-NEXT: s_waitcnt lgkmcnt(0) 823; GFX9-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] 824; GFX9-NEXT: s_waitcnt lgkmcnt(0) 825; GFX9-NEXT: BB4_2: 826; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 827; GFX9-NEXT: s_waitcnt lgkmcnt(0) 828; GFX9-NEXT: v_readfirstlane_b32 s2, v1 829; GFX9-NEXT: v_readfirstlane_b32 s3, v2 830; GFX9-NEXT: v_mov_b32_e32 v1, s2 831; GFX9-NEXT: v_mov_b32_e32 v2, s3 832; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v0, 5, v[1:2] 833; GFX9-NEXT: s_mov_b32 s3, 0xf000 834; GFX9-NEXT: s_mov_b32 s2, -1 835; GFX9-NEXT: s_nop 2 836; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 837; GFX9-NEXT: s_endpgm 838; 839; GFX1064-LABEL: add_i64_constant: 840; GFX1064: ; %bb.0: ; %entry 841; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 842; GFX1064-NEXT: s_mov_b64 s[4:5], exec 843; GFX1064-NEXT: ; implicit-def: $vgpr1_vgpr2 844; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 845; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s5, v0 846; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 847; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc 848; GFX1064-NEXT: s_cbranch_execz BB4_2 849; GFX1064-NEXT: ; %bb.1: 850; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 851; GFX1064-NEXT: v_mov_b32_e32 v2, 0 852; GFX1064-NEXT: s_mul_i32 s4, s4, 5 853; GFX1064-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 854; GFX1064-NEXT: v_mov_b32_e32 v1, s4 855; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 856; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 857; GFX1064-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] 858; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 859; GFX1064-NEXT: buffer_gl0_inv 860; GFX1064-NEXT: BB4_2: 861; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 862; GFX1064-NEXT: 
s_or_b64 exec, exec, s[2:3] 863; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 864; GFX1064-NEXT: v_readfirstlane_b32 s3, v2 865; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v0, 5, s[2:3] 866; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 867; GFX1064-NEXT: s_mov_b32 s2, -1 868; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 869; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 870; GFX1064-NEXT: s_endpgm 871; 872; GFX1032-LABEL: add_i64_constant: 873; GFX1032: ; %bb.0: ; %entry 874; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 875; GFX1032-NEXT: s_mov_b32 s3, exec_lo 876; GFX1032-NEXT: ; implicit-def: $vgpr1_vgpr2 877; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s3, 0 878; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 879; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo 880; GFX1032-NEXT: s_cbranch_execz BB4_2 881; GFX1032-NEXT: ; %bb.1: 882; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 883; GFX1032-NEXT: v_mov_b32_e32 v2, 0 884; GFX1032-NEXT: s_mul_i32 s3, s3, 5 885; GFX1032-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 886; GFX1032-NEXT: v_mov_b32_e32 v1, s3 887; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 888; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 889; GFX1032-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] 890; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 891; GFX1032-NEXT: buffer_gl0_inv 892; GFX1032-NEXT: BB4_2: 893; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 894; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 895; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 896; GFX1032-NEXT: v_readfirstlane_b32 s3, v2 897; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s2, v0, 5, s[2:3] 898; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 899; GFX1032-NEXT: s_mov_b32 s2, -1 900; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 901; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 902; GFX1032-NEXT: s_endpgm 903entry: 904 %old = atomicrmw add i64 addrspace(3)* @local_var64, i64 5 acq_rel 905 store i64 %old, i64 addrspace(1)* %out 906 ret void 907} 908 909define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 %additive) { 910; 911; 912; 
GFX7LESS-LABEL: add_i64_uniform: 913; GFX7LESS: ; %bb.0: ; %entry 914; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec 915; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 916; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 917; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s7, v0 918; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 919; GFX7LESS-NEXT: ; implicit-def: $vgpr1_vgpr2 920; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc 921; GFX7LESS-NEXT: s_cbranch_execz BB5_2 922; GFX7LESS-NEXT: ; %bb.1: 923; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 924; GFX7LESS-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 925; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 926; GFX7LESS-NEXT: s_mul_i32 s7, s3, s6 927; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6 928; GFX7LESS-NEXT: v_mul_hi_u32 v1, s2, v1 929; GFX7LESS-NEXT: s_mul_i32 s6, s2, s6 930; GFX7LESS-NEXT: v_add_i32_e32 v2, vcc, s7, v1 931; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6 932; GFX7LESS-NEXT: s_mov_b32 m0, -1 933; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 934; GFX7LESS-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] 935; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 936; GFX7LESS-NEXT: BB5_2: 937; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] 938; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 939; GFX7LESS-NEXT: s_mov_b32 s6, -1 940; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 941; GFX7LESS-NEXT: s_mov_b32 s4, s0 942; GFX7LESS-NEXT: s_mov_b32 s5, s1 943; GFX7LESS-NEXT: v_readfirstlane_b32 s0, v1 944; GFX7LESS-NEXT: v_readfirstlane_b32 s1, v2 945; GFX7LESS-NEXT: v_mul_lo_u32 v1, s3, v0 946; GFX7LESS-NEXT: v_mul_hi_u32 v2, s2, v0 947; GFX7LESS-NEXT: v_mul_lo_u32 v0, s2, v0 948; GFX7LESS-NEXT: v_add_i32_e32 v1, vcc, v2, v1 949; GFX7LESS-NEXT: v_mov_b32_e32 v2, s1 950; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s0, v0 951; GFX7LESS-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc 952; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 953; GFX7LESS-NEXT: s_endpgm 954; 955; GFX8-LABEL: add_i64_uniform: 956; GFX8: ; %bb.0: ; %entry 957; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 958; GFX8-NEXT: s_mov_b64 
s[6:7], exec 959; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 960; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 961; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 962; GFX8-NEXT: ; implicit-def: $vgpr1_vgpr2 963; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 964; GFX8-NEXT: s_cbranch_execz BB5_2 965; GFX8-NEXT: ; %bb.1: 966; GFX8-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 967; GFX8-NEXT: v_mov_b32_e32 v1, s6 968; GFX8-NEXT: s_waitcnt lgkmcnt(0) 969; GFX8-NEXT: v_mul_hi_u32 v1, s2, v1 970; GFX8-NEXT: s_mul_i32 s7, s3, s6 971; GFX8-NEXT: s_mul_i32 s6, s2, s6 972; GFX8-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 973; GFX8-NEXT: v_add_u32_e32 v2, vcc, s7, v1 974; GFX8-NEXT: v_mov_b32_e32 v1, s6 975; GFX8-NEXT: s_mov_b32 m0, -1 976; GFX8-NEXT: s_waitcnt lgkmcnt(0) 977; GFX8-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] 978; GFX8-NEXT: s_waitcnt lgkmcnt(0) 979; GFX8-NEXT: BB5_2: 980; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 981; GFX8-NEXT: s_waitcnt lgkmcnt(0) 982; GFX8-NEXT: s_mov_b32 s4, s0 983; GFX8-NEXT: v_readfirstlane_b32 s0, v1 984; GFX8-NEXT: v_mul_lo_u32 v1, s3, v0 985; GFX8-NEXT: v_mul_hi_u32 v3, s2, v0 986; GFX8-NEXT: v_mul_lo_u32 v0, s2, v0 987; GFX8-NEXT: s_mov_b32 s5, s1 988; GFX8-NEXT: v_readfirstlane_b32 s1, v2 989; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 990; GFX8-NEXT: v_mov_b32_e32 v2, s1 991; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 992; GFX8-NEXT: s_mov_b32 s7, 0xf000 993; GFX8-NEXT: s_mov_b32 s6, -1 994; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc 995; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 996; GFX8-NEXT: s_endpgm 997; 998; GFX9-LABEL: add_i64_uniform: 999; GFX9: ; %bb.0: ; %entry 1000; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1001; GFX9-NEXT: s_mov_b64 s[6:7], exec 1002; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 1003; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 1004; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1005; GFX9-NEXT: ; implicit-def: $vgpr1_vgpr2 1006; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 1007; GFX9-NEXT: s_cbranch_execz BB5_2 1008; GFX9-NEXT: ; %bb.1: 
1009; GFX9-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 1010; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1011; GFX9-NEXT: s_mul_i32 s7, s3, s6 1012; GFX9-NEXT: s_mul_hi_u32 s8, s2, s6 1013; GFX9-NEXT: s_add_i32 s8, s8, s7 1014; GFX9-NEXT: s_mul_i32 s6, s2, s6 1015; GFX9-NEXT: v_mov_b32_e32 v1, s6 1016; GFX9-NEXT: v_mov_b32_e32 v2, s8 1017; GFX9-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 1018; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1019; GFX9-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] 1020; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1021; GFX9-NEXT: BB5_2: 1022; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 1023; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1024; GFX9-NEXT: v_mul_lo_u32 v3, s3, v0 1025; GFX9-NEXT: v_mul_hi_u32 v4, s2, v0 1026; GFX9-NEXT: v_mul_lo_u32 v0, s2, v0 1027; GFX9-NEXT: s_mov_b32 s4, s0 1028; GFX9-NEXT: v_readfirstlane_b32 s0, v1 1029; GFX9-NEXT: s_mov_b32 s5, s1 1030; GFX9-NEXT: v_readfirstlane_b32 s1, v2 1031; GFX9-NEXT: v_add_u32_e32 v1, v4, v3 1032; GFX9-NEXT: v_mov_b32_e32 v2, s1 1033; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 1034; GFX9-NEXT: s_mov_b32 s7, 0xf000 1035; GFX9-NEXT: s_mov_b32 s6, -1 1036; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc 1037; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1038; GFX9-NEXT: s_endpgm 1039; 1040; GFX1064-LABEL: add_i64_uniform: 1041; GFX1064: ; %bb.0: ; %entry 1042; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1043; GFX1064-NEXT: s_mov_b64 s[6:7], exec 1044; GFX1064-NEXT: ; implicit-def: $vgpr1_vgpr2 1045; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 1046; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s7, v0 1047; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1048; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 1049; GFX1064-NEXT: s_cbranch_execz BB5_2 1050; GFX1064-NEXT: ; %bb.1: 1051; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 1052; GFX1064-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 1053; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1054; GFX1064-NEXT: s_mul_i32 s7, s3, s6 1055; GFX1064-NEXT: s_mul_hi_u32 s8, s2, s6 1056; GFX1064-NEXT: s_mul_i32 s6, s2, 
s6 1057; GFX1064-NEXT: s_add_i32 s8, s8, s7 1058; GFX1064-NEXT: v_mov_b32_e32 v1, s6 1059; GFX1064-NEXT: v_mov_b32_e32 v2, s8 1060; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1061; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 1062; GFX1064-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] 1063; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1064; GFX1064-NEXT: buffer_gl0_inv 1065; GFX1064-NEXT: BB5_2: 1066; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 1067; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 1068; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1069; GFX1064-NEXT: v_mul_lo_u32 v3, s3, v0 1070; GFX1064-NEXT: v_mul_hi_u32 v4, s2, v0 1071; GFX1064-NEXT: v_mul_lo_u32 v0, s2, v0 1072; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 1073; GFX1064-NEXT: v_readfirstlane_b32 s4, v2 1074; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 1075; GFX1064-NEXT: v_add_nc_u32_e32 v1, v4, v3 1076; GFX1064-NEXT: v_add_co_u32_e64 v0, vcc, s2, v0 1077; GFX1064-NEXT: s_mov_b32 s2, -1 1078; GFX1064-NEXT: v_add_co_ci_u32_e32 v1, vcc, s4, v1, vcc 1079; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1080; GFX1064-NEXT: s_endpgm 1081; 1082; GFX1032-LABEL: add_i64_uniform: 1083; GFX1032: ; %bb.0: ; %entry 1084; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1085; GFX1032-NEXT: s_mov_b32 s5, exec_lo 1086; GFX1032-NEXT: ; implicit-def: $vgpr1_vgpr2 1087; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s5, 0 1088; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 1089; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo 1090; GFX1032-NEXT: s_cbranch_execz BB5_2 1091; GFX1032-NEXT: ; %bb.1: 1092; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s5 1093; GFX1032-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 1094; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1095; GFX1032-NEXT: s_mul_i32 s6, s3, s5 1096; GFX1032-NEXT: s_mul_hi_u32 s7, s2, s5 1097; GFX1032-NEXT: s_mul_i32 s5, s2, s5 1098; GFX1032-NEXT: s_add_i32 s7, s7, s6 1099; GFX1032-NEXT: v_mov_b32_e32 v1, s5 1100; GFX1032-NEXT: v_mov_b32_e32 v2, s7 1101; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1102; GFX1032-NEXT: 
s_waitcnt_vscnt null, 0x0 1103; GFX1032-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] 1104; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1105; GFX1032-NEXT: buffer_gl0_inv 1106; GFX1032-NEXT: BB5_2: 1107; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 1108; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 1109; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1110; GFX1032-NEXT: v_mul_lo_u32 v3, s3, v0 1111; GFX1032-NEXT: v_mul_hi_u32 v4, s2, v0 1112; GFX1032-NEXT: v_mul_lo_u32 v0, s2, v0 1113; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 1114; GFX1032-NEXT: v_readfirstlane_b32 s4, v2 1115; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 1116; GFX1032-NEXT: v_add_nc_u32_e32 v1, v4, v3 1117; GFX1032-NEXT: v_add_co_u32_e64 v0, vcc_lo, s2, v0 1118; GFX1032-NEXT: s_mov_b32 s2, -1 1119; GFX1032-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s4, v1, vcc_lo 1120; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1121; GFX1032-NEXT: s_endpgm 1122entry: 1123 %old = atomicrmw add i64 addrspace(3)* @local_var64, i64 %additive acq_rel 1124 store i64 %old, i64 addrspace(1)* %out 1125 ret void 1126} 1127 1128define amdgpu_kernel void @add_i64_varying(i64 addrspace(1)* %out) { 1129; 1130; 1131; GFX7LESS-LABEL: add_i64_varying: 1132; GFX7LESS: ; %bb.0: ; %entry 1133; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1134; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 1135; GFX7LESS-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 1136; GFX7LESS-NEXT: s_mov_b32 m0, -1 1137; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1138; GFX7LESS-NEXT: ds_add_rtn_u64 v[0:1], v2, v[0:1] 1139; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1140; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 1141; GFX7LESS-NEXT: s_mov_b32 s2, -1 1142; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1143; GFX7LESS-NEXT: s_endpgm 1144; 1145; GFX8-LABEL: add_i64_varying: 1146; GFX8: ; %bb.0: ; %entry 1147; GFX8-NEXT: v_mov_b32_e32 v1, 0 1148; GFX8-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 1149; GFX8-NEXT: s_mov_b32 m0, -1 1150; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1151; GFX8-NEXT: s_waitcnt 
lgkmcnt(0) 1152; GFX8-NEXT: ds_add_rtn_u64 v[0:1], v2, v[0:1] 1153; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1154; GFX8-NEXT: s_mov_b32 s3, 0xf000 1155; GFX8-NEXT: s_mov_b32 s2, -1 1156; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1157; GFX8-NEXT: s_endpgm 1158; 1159; GFX9-LABEL: add_i64_varying: 1160; GFX9: ; %bb.0: ; %entry 1161; GFX9-NEXT: v_mov_b32_e32 v1, 0 1162; GFX9-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 1163; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1164; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1165; GFX9-NEXT: ds_add_rtn_u64 v[0:1], v2, v[0:1] 1166; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1167; GFX9-NEXT: s_mov_b32 s3, 0xf000 1168; GFX9-NEXT: s_mov_b32 s2, -1 1169; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1170; GFX9-NEXT: s_endpgm 1171; 1172; GFX1064-LABEL: add_i64_varying: 1173; GFX1064: ; %bb.0: ; %entry 1174; GFX1064-NEXT: v_mov_b32_e32 v1, 0 1175; GFX1064-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 1176; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1177; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 1178; GFX1064-NEXT: s_mov_b32 s2, -1 1179; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1180; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 1181; GFX1064-NEXT: ds_add_rtn_u64 v[0:1], v2, v[0:1] 1182; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1183; GFX1064-NEXT: buffer_gl0_inv 1184; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1185; GFX1064-NEXT: s_endpgm 1186; 1187; GFX1032-LABEL: add_i64_varying: 1188; GFX1032: ; %bb.0: ; %entry 1189; GFX1032-NEXT: v_mov_b32_e32 v1, 0 1190; GFX1032-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 1191; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1192; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 1193; GFX1032-NEXT: s_mov_b32 s2, -1 1194; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1195; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 1196; GFX1032-NEXT: ds_add_rtn_u64 v[0:1], v2, v[0:1] 1197; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1198; GFX1032-NEXT: buffer_gl0_inv 1199; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 
1200; GFX1032-NEXT: s_endpgm 1201entry: 1202 %lane = call i32 @llvm.amdgcn.workitem.id.x() 1203 %zext = zext i32 %lane to i64 1204 %old = atomicrmw add i64 addrspace(3)* @local_var64, i64 %zext acq_rel 1205 store i64 %old, i64 addrspace(1)* %out 1206 ret void 1207} 1208 1209define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out) { 1210; 1211; 1212; GFX7LESS-LABEL: sub_i32_constant: 1213; GFX7LESS: ; %bb.0: ; %entry 1214; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec 1215; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1216; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 1217; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 1218; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1219; GFX7LESS-NEXT: ; implicit-def: $vgpr1 1220; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc 1221; GFX7LESS-NEXT: s_cbranch_execz BB7_2 1222; GFX7LESS-NEXT: ; %bb.1: 1223; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1224; GFX7LESS-NEXT: s_mul_i32 s2, s2, 5 1225; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 1226; GFX7LESS-NEXT: v_mov_b32_e32 v2, s2 1227; GFX7LESS-NEXT: s_mov_b32 m0, -1 1228; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1229; GFX7LESS-NEXT: ds_sub_rtn_u32 v1, v1, v2 1230; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1231; GFX7LESS-NEXT: BB7_2: 1232; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] 1233; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1234; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v1 1235; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1236; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 1237; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 1238; GFX7LESS-NEXT: s_mov_b32 s2, -1 1239; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 1240; GFX7LESS-NEXT: s_endpgm 1241; 1242; GFX8-LABEL: sub_i32_constant: 1243; GFX8: ; %bb.0: ; %entry 1244; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1245; GFX8-NEXT: s_mov_b64 s[2:3], exec 1246; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1247; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 1248; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1249; GFX8-NEXT: ; implicit-def: $vgpr1 
1250; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 1251; GFX8-NEXT: s_cbranch_execz BB7_2 1252; GFX8-NEXT: ; %bb.1: 1253; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1254; GFX8-NEXT: s_mul_i32 s2, s2, 5 1255; GFX8-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 1256; GFX8-NEXT: v_mov_b32_e32 v2, s2 1257; GFX8-NEXT: s_mov_b32 m0, -1 1258; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1259; GFX8-NEXT: ds_sub_rtn_u32 v1, v1, v2 1260; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1261; GFX8-NEXT: BB7_2: 1262; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 1263; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1264; GFX8-NEXT: v_readfirstlane_b32 s2, v1 1265; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1266; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s2, v0 1267; GFX8-NEXT: s_mov_b32 s3, 0xf000 1268; GFX8-NEXT: s_mov_b32 s2, -1 1269; GFX8-NEXT: s_nop 0 1270; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 1271; GFX8-NEXT: s_endpgm 1272; 1273; GFX9-LABEL: sub_i32_constant: 1274; GFX9: ; %bb.0: ; %entry 1275; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1276; GFX9-NEXT: s_mov_b64 s[2:3], exec 1277; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1278; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 1279; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1280; GFX9-NEXT: ; implicit-def: $vgpr1 1281; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 1282; GFX9-NEXT: s_cbranch_execz BB7_2 1283; GFX9-NEXT: ; %bb.1: 1284; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1285; GFX9-NEXT: s_mul_i32 s2, s2, 5 1286; GFX9-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 1287; GFX9-NEXT: v_mov_b32_e32 v2, s2 1288; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1289; GFX9-NEXT: ds_sub_rtn_u32 v1, v1, v2 1290; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1291; GFX9-NEXT: BB7_2: 1292; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 1293; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1294; GFX9-NEXT: v_readfirstlane_b32 s2, v1 1295; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1296; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 1297; GFX9-NEXT: s_mov_b32 s3, 0xf000 1298; GFX9-NEXT: s_mov_b32 s2, -1 1299; GFX9-NEXT: s_nop 0 1300; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 
0 1301; GFX9-NEXT: s_endpgm 1302; 1303; GFX1064-LABEL: sub_i32_constant: 1304; GFX1064: ; %bb.0: ; %entry 1305; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1306; GFX1064-NEXT: s_mov_b64 s[2:3], exec 1307; GFX1064-NEXT: ; implicit-def: $vgpr1 1308; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 1309; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s3, v0 1310; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1311; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 1312; GFX1064-NEXT: s_cbranch_execz BB7_2 1313; GFX1064-NEXT: ; %bb.1: 1314; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1315; GFX1064-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 1316; GFX1064-NEXT: s_mul_i32 s2, s2, 5 1317; GFX1064-NEXT: v_mov_b32_e32 v2, s2 1318; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1319; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 1320; GFX1064-NEXT: ds_sub_rtn_u32 v1, v1, v2 1321; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1322; GFX1064-NEXT: buffer_gl0_inv 1323; GFX1064-NEXT: BB7_2: 1324; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 1325; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 1326; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 1327; GFX1064-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1328; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 1329; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s2, v0 1330; GFX1064-NEXT: s_mov_b32 s2, -1 1331; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1332; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 1333; GFX1064-NEXT: s_endpgm 1334; 1335; GFX1032-LABEL: sub_i32_constant: 1336; GFX1032: ; %bb.0: ; %entry 1337; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1338; GFX1032-NEXT: s_mov_b32 s3, exec_lo 1339; GFX1032-NEXT: ; implicit-def: $vgpr1 1340; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s3, 0 1341; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 1342; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo 1343; GFX1032-NEXT: s_cbranch_execz BB7_2 1344; GFX1032-NEXT: ; %bb.1: 1345; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 1346; GFX1032-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 1347; GFX1032-NEXT: s_mul_i32 s3, s3, 5 1348; 
GFX1032-NEXT: v_mov_b32_e32 v2, s3 1349; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1350; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 1351; GFX1032-NEXT: ds_sub_rtn_u32 v1, v1, v2 1352; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1353; GFX1032-NEXT: buffer_gl0_inv 1354; GFX1032-NEXT: BB7_2: 1355; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 1356; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 1357; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 1358; GFX1032-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1359; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 1360; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s2, v0 1361; GFX1032-NEXT: s_mov_b32 s2, -1 1362; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1363; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 1364; GFX1032-NEXT: s_endpgm 1365entry: 1366 %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 5 acq_rel 1367 store i32 %old, i32 addrspace(1)* %out 1368 ret void 1369} 1370 1371define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 %subitive) { 1372; 1373; 1374; GFX7LESS-LABEL: sub_i32_uniform: 1375; GFX7LESS: ; %bb.0: ; %entry 1376; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec 1377; GFX7LESS-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 1378; GFX7LESS-NEXT: s_load_dword s0, s[0:1], 0xb 1379; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 1380; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 1381; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1382; GFX7LESS-NEXT: ; implicit-def: $vgpr1 1383; GFX7LESS-NEXT: s_and_saveexec_b64 s[6:7], vcc 1384; GFX7LESS-NEXT: s_cbranch_execz BB8_2 1385; GFX7LESS-NEXT: ; %bb.1: 1386; GFX7LESS-NEXT: s_bcnt1_i32_b64 s1, s[2:3] 1387; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1388; GFX7LESS-NEXT: s_mul_i32 s1, s0, s1 1389; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 1390; GFX7LESS-NEXT: v_mov_b32_e32 v2, s1 1391; GFX7LESS-NEXT: s_mov_b32 m0, -1 1392; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1393; GFX7LESS-NEXT: ds_sub_rtn_u32 v1, v1, v2 1394; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1395; GFX7LESS-NEXT: BB8_2: 1396; GFX7LESS-NEXT: s_or_b64 exec, 
exec, s[6:7] 1397; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1398; GFX7LESS-NEXT: v_readfirstlane_b32 s1, v1 1399; GFX7LESS-NEXT: v_mul_lo_u32 v0, s0, v0 1400; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 1401; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s1, v0 1402; GFX7LESS-NEXT: s_mov_b32 s6, -1 1403; GFX7LESS-NEXT: buffer_store_dword v0, off, s[4:7], 0 1404; GFX7LESS-NEXT: s_endpgm 1405; 1406; GFX8-LABEL: sub_i32_uniform: 1407; GFX8: ; %bb.0: ; %entry 1408; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 1409; GFX8-NEXT: s_load_dword s0, s[0:1], 0x2c 1410; GFX8-NEXT: s_mov_b64 s[2:3], exec 1411; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1412; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 1413; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1414; GFX8-NEXT: ; implicit-def: $vgpr1 1415; GFX8-NEXT: s_and_saveexec_b64 s[6:7], vcc 1416; GFX8-NEXT: s_cbranch_execz BB8_2 1417; GFX8-NEXT: ; %bb.1: 1418; GFX8-NEXT: s_bcnt1_i32_b64 s1, s[2:3] 1419; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1420; GFX8-NEXT: s_mul_i32 s1, s0, s1 1421; GFX8-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 1422; GFX8-NEXT: v_mov_b32_e32 v2, s1 1423; GFX8-NEXT: s_mov_b32 m0, -1 1424; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1425; GFX8-NEXT: ds_sub_rtn_u32 v1, v1, v2 1426; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1427; GFX8-NEXT: BB8_2: 1428; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] 1429; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1430; GFX8-NEXT: v_mul_lo_u32 v0, s0, v0 1431; GFX8-NEXT: v_readfirstlane_b32 s0, v1 1432; GFX8-NEXT: s_mov_b32 s7, 0xf000 1433; GFX8-NEXT: s_mov_b32 s6, -1 1434; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s0, v0 1435; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0 1436; GFX8-NEXT: s_endpgm 1437; 1438; GFX9-LABEL: sub_i32_uniform: 1439; GFX9: ; %bb.0: ; %entry 1440; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 1441; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c 1442; GFX9-NEXT: s_mov_b64 s[6:7], exec 1443; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 1444; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 1445; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1446; GFX9-NEXT: ; 
implicit-def: $vgpr1 1447; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc 1448; GFX9-NEXT: s_cbranch_execz BB8_2 1449; GFX9-NEXT: ; %bb.1: 1450; GFX9-NEXT: s_bcnt1_i32_b64 s3, s[6:7] 1451; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1452; GFX9-NEXT: s_mul_i32 s3, s2, s3 1453; GFX9-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 1454; GFX9-NEXT: v_mov_b32_e32 v2, s3 1455; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1456; GFX9-NEXT: ds_sub_rtn_u32 v1, v1, v2 1457; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1458; GFX9-NEXT: BB8_2: 1459; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] 1460; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1461; GFX9-NEXT: v_mul_lo_u32 v0, s2, v0 1462; GFX9-NEXT: v_readfirstlane_b32 s0, v1 1463; GFX9-NEXT: s_mov_b32 s7, 0xf000 1464; GFX9-NEXT: s_mov_b32 s6, -1 1465; GFX9-NEXT: v_sub_u32_e32 v0, s0, v0 1466; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 1467; GFX9-NEXT: s_endpgm 1468; 1469; GFX1064-LABEL: sub_i32_uniform: 1470; GFX1064: ; %bb.0: ; %entry 1471; GFX1064-NEXT: s_clause 0x1 1472; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 1473; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x2c 1474; GFX1064-NEXT: s_mov_b64 s[6:7], exec 1475; GFX1064-NEXT: ; implicit-def: $vgpr1 1476; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 1477; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s7, v0 1478; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1479; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc 1480; GFX1064-NEXT: s_cbranch_execz BB8_2 1481; GFX1064-NEXT: ; %bb.1: 1482; GFX1064-NEXT: s_bcnt1_i32_b64 s3, s[6:7] 1483; GFX1064-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 1484; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1485; GFX1064-NEXT: s_mul_i32 s3, s2, s3 1486; GFX1064-NEXT: v_mov_b32_e32 v2, s3 1487; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1488; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 1489; GFX1064-NEXT: ds_sub_rtn_u32 v1, v1, v2 1490; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1491; GFX1064-NEXT: buffer_gl0_inv 1492; GFX1064-NEXT: BB8_2: 1493; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 1494; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] 
1495; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1496; GFX1064-NEXT: v_mul_lo_u32 v0, s2, v0 1497; GFX1064-NEXT: v_readfirstlane_b32 s0, v1 1498; GFX1064-NEXT: s_mov_b32 s7, 0x31016000 1499; GFX1064-NEXT: s_mov_b32 s6, -1 1500; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s0, v0 1501; GFX1064-NEXT: buffer_store_dword v0, off, s[4:7], 0 1502; GFX1064-NEXT: s_endpgm 1503; 1504; GFX1032-LABEL: sub_i32_uniform: 1505; GFX1032: ; %bb.0: ; %entry 1506; GFX1032-NEXT: s_clause 0x1 1507; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 1508; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x2c 1509; GFX1032-NEXT: s_mov_b32 s3, exec_lo 1510; GFX1032-NEXT: ; implicit-def: $vgpr1 1511; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s3, 0 1512; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 1513; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo 1514; GFX1032-NEXT: s_cbranch_execz BB8_2 1515; GFX1032-NEXT: ; %bb.1: 1516; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s3 1517; GFX1032-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 1518; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1519; GFX1032-NEXT: s_mul_i32 s1, s2, s1 1520; GFX1032-NEXT: v_mov_b32_e32 v2, s1 1521; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1522; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 1523; GFX1032-NEXT: ds_sub_rtn_u32 v1, v1, v2 1524; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1525; GFX1032-NEXT: buffer_gl0_inv 1526; GFX1032-NEXT: BB8_2: 1527; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 1528; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 1529; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1530; GFX1032-NEXT: v_mul_lo_u32 v0, s2, v0 1531; GFX1032-NEXT: v_readfirstlane_b32 s0, v1 1532; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 1533; GFX1032-NEXT: s_mov_b32 s6, -1 1534; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s0, v0 1535; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0 1536; GFX1032-NEXT: s_endpgm 1537entry: 1538 %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 %subitive acq_rel 1539 store i32 %old, i32 addrspace(1)* %out 1540 ret void 1541} 1542 1543define amdgpu_kernel void @sub_i32_varying(i32 
addrspace(1)* %out) { 1544; 1545; 1546; GFX7LESS-LABEL: sub_i32_varying: 1547; GFX7LESS: ; %bb.0: ; %entry 1548; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1549; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 1550; GFX7LESS-NEXT: s_mov_b32 m0, -1 1551; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1552; GFX7LESS-NEXT: ds_sub_rtn_u32 v0, v1, v0 1553; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1554; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 1555; GFX7LESS-NEXT: s_mov_b32 s2, -1 1556; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 1557; GFX7LESS-NEXT: s_endpgm 1558; 1559; GFX8-LABEL: sub_i32_varying: 1560; GFX8: ; %bb.0: ; %entry 1561; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1562; GFX8-NEXT: v_mov_b32_e32 v2, v0 1563; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 1564; GFX8-NEXT: v_mov_b32_e32 v1, 0 1565; GFX8-NEXT: s_mov_b64 exec, s[2:3] 1566; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 1567; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 1568; GFX8-NEXT: s_not_b64 exec, exec 1569; GFX8-NEXT: v_mov_b32_e32 v2, 0 1570; GFX8-NEXT: s_not_b64 exec, exec 1571; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 1572; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 1573; GFX8-NEXT: s_nop 1 1574; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 1575; GFX8-NEXT: s_nop 1 1576; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 1577; GFX8-NEXT: s_nop 1 1578; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 1579; GFX8-NEXT: s_nop 1 1580; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 1581; GFX8-NEXT: s_nop 1 1582; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 1583; GFX8-NEXT: v_readlane_b32 s4, v2, 63 1584; GFX8-NEXT: s_nop 0 1585; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 1586; GFX8-NEXT: s_mov_b64 exec, s[2:3] 1587; GFX8-NEXT: 
v_cmp_eq_u32_e32 vcc, 0, v0 1588; GFX8-NEXT: ; implicit-def: $vgpr0 1589; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 1590; GFX8-NEXT: s_cbranch_execz BB9_2 1591; GFX8-NEXT: ; %bb.1: 1592; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 1593; GFX8-NEXT: v_mov_b32_e32 v3, s4 1594; GFX8-NEXT: s_mov_b32 m0, -1 1595; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1596; GFX8-NEXT: ds_sub_rtn_u32 v0, v0, v3 1597; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1598; GFX8-NEXT: BB9_2: 1599; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 1600; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1601; GFX8-NEXT: v_readfirstlane_b32 s2, v0 1602; GFX8-NEXT: v_mov_b32_e32 v0, v1 1603; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s2, v0 1604; GFX8-NEXT: s_mov_b32 s3, 0xf000 1605; GFX8-NEXT: s_mov_b32 s2, -1 1606; GFX8-NEXT: s_nop 0 1607; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 1608; GFX8-NEXT: s_endpgm 1609; 1610; GFX9-LABEL: sub_i32_varying: 1611; GFX9: ; %bb.0: ; %entry 1612; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1613; GFX9-NEXT: v_mov_b32_e32 v2, v0 1614; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 1615; GFX9-NEXT: v_mov_b32_e32 v1, 0 1616; GFX9-NEXT: s_mov_b64 exec, s[2:3] 1617; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 1618; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 1619; GFX9-NEXT: s_not_b64 exec, exec 1620; GFX9-NEXT: v_mov_b32_e32 v2, 0 1621; GFX9-NEXT: s_not_b64 exec, exec 1622; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 1623; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 1624; GFX9-NEXT: s_nop 1 1625; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 1626; GFX9-NEXT: s_nop 1 1627; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 1628; GFX9-NEXT: s_nop 1 1629; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 1630; GFX9-NEXT: s_nop 1 1631; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 1632; GFX9-NEXT: s_nop 1 1633; GFX9-NEXT: v_add_u32_dpp v2, 
v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 1634; GFX9-NEXT: v_readlane_b32 s4, v2, 63 1635; GFX9-NEXT: s_nop 0 1636; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 1637; GFX9-NEXT: s_mov_b64 exec, s[2:3] 1638; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1639; GFX9-NEXT: ; implicit-def: $vgpr0 1640; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 1641; GFX9-NEXT: s_cbranch_execz BB9_2 1642; GFX9-NEXT: ; %bb.1: 1643; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 1644; GFX9-NEXT: v_mov_b32_e32 v3, s4 1645; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1646; GFX9-NEXT: ds_sub_rtn_u32 v0, v0, v3 1647; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1648; GFX9-NEXT: BB9_2: 1649; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 1650; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1651; GFX9-NEXT: v_readfirstlane_b32 s2, v0 1652; GFX9-NEXT: v_mov_b32_e32 v0, v1 1653; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 1654; GFX9-NEXT: s_mov_b32 s3, 0xf000 1655; GFX9-NEXT: s_mov_b32 s2, -1 1656; GFX9-NEXT: s_nop 0 1657; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 1658; GFX9-NEXT: s_endpgm 1659; 1660; GFX1064-LABEL: sub_i32_varying: 1661; GFX1064: ; %bb.0: ; %entry 1662; GFX1064-NEXT: v_mov_b32_e32 v1, v0 1663; GFX1064-NEXT: s_not_b64 exec, exec 1664; GFX1064-NEXT: v_mov_b32_e32 v1, 0 1665; GFX1064-NEXT: s_not_b64 exec, exec 1666; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 1667; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 1668; GFX1064-NEXT: v_mov_b32_e32 v3, 0 1669; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 1670; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 1671; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 1672; GFX1064-NEXT: v_mov_b32_e32 v2, v1 1673; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 1674; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 1675; GFX1064-NEXT: v_readlane_b32 s4, 
v1, 31 1676; GFX1064-NEXT: v_mov_b32_e32 v2, s4 1677; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 1678; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 1679; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 1680; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 1681; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1682; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 1683; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 1684; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 1685; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 1686; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 1687; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 1688; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 1689; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 1690; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 1691; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 1692; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0 1693; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 1694; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 1695; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 1696; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1697; GFX1064-NEXT: s_mov_b32 s2, -1 1698; GFX1064-NEXT: ; implicit-def: $vgpr0 1699; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 1700; GFX1064-NEXT: s_cbranch_execz BB9_2 1701; GFX1064-NEXT: ; %bb.1: 1702; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo 1703; GFX1064-NEXT: v_mov_b32_e32 v4, s7 1704; GFX1064-NEXT: s_mov_b32 s3, s7 1705; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1706; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 1707; GFX1064-NEXT: ds_sub_rtn_u32 v0, v7, v4 1708; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1709; GFX1064-NEXT: buffer_gl0_inv 1710; GFX1064-NEXT: BB9_2: 1711; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 1712; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 1713; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 1714; GFX1064-NEXT: v_mov_b32_e32 v0, v3 1715; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s3, v0 1716; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 1717; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1718; 
GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 1719; GFX1064-NEXT: s_endpgm 1720; 1721; GFX1032-LABEL: sub_i32_varying: 1722; GFX1032: ; %bb.0: ; %entry 1723; GFX1032-NEXT: v_mov_b32_e32 v1, v0 1724; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 1725; GFX1032-NEXT: v_mov_b32_e32 v1, 0 1726; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 1727; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 1728; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 1729; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 1730; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 1731; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 1732; GFX1032-NEXT: v_mov_b32_e32 v2, v1 1733; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 1734; GFX1032-NEXT: s_mov_b32 exec_lo, s2 1735; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1736; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 1737; GFX1032-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 1738; GFX1032-NEXT: v_mov_b32_e32 v3, 0 1739; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 1740; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 1741; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 1742; GFX1032-NEXT: s_mov_b32 exec_lo, s2 1743; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 1744; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 1745; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 1746; GFX1032-NEXT: s_mov_b32 exec_lo, s2 1747; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 1748; GFX1032-NEXT: s_mov_b32 s2, -1 1749; GFX1032-NEXT: ; implicit-def: $vgpr0 1750; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 1751; GFX1032-NEXT: s_cbranch_execz BB9_2 1752; GFX1032-NEXT: ; %bb.1: 1753; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo 1754; GFX1032-NEXT: v_mov_b32_e32 v4, s4 1755; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1756; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 1757; 
GFX1032-NEXT: ds_sub_rtn_u32 v0, v7, v4 1758; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1759; GFX1032-NEXT: buffer_gl0_inv 1760; GFX1032-NEXT: BB9_2: 1761; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 1762; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 1763; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 1764; GFX1032-NEXT: v_mov_b32_e32 v0, v3 1765; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s3, v0 1766; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 1767; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1768; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 1769; GFX1032-NEXT: s_endpgm 1770entry: 1771 %lane = call i32 @llvm.amdgcn.workitem.id.x() 1772 %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 %lane acq_rel 1773 store i32 %old, i32 addrspace(1)* %out 1774 ret void 1775} 1776 1777define amdgpu_kernel void @sub_i32_varying_nouse() { 1778; GFX7LESS-LABEL: sub_i32_varying_nouse: 1779; GFX7LESS: ; %bb.0: ; %entry 1780; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 1781; GFX7LESS-NEXT: s_mov_b32 m0, -1 1782; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1783; GFX7LESS-NEXT: ds_sub_u32 v1, v0 1784; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1785; GFX7LESS-NEXT: s_endpgm 1786; 1787; GFX8-LABEL: sub_i32_varying_nouse: 1788; GFX8: ; %bb.0: ; %entry 1789; GFX8-NEXT: v_mov_b32_e32 v1, v0 1790; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 1791; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 1792; GFX8-NEXT: s_not_b64 exec, exec 1793; GFX8-NEXT: v_mov_b32_e32 v1, 0 1794; GFX8-NEXT: s_not_b64 exec, exec 1795; GFX8-NEXT: s_or_saveexec_b64 s[0:1], -1 1796; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 1797; GFX8-NEXT: s_nop 1 1798; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 1799; GFX8-NEXT: s_nop 1 1800; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 1801; GFX8-NEXT: s_nop 1 1802; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 1803; GFX8-NEXT: 
s_nop 1 1804; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf 1805; GFX8-NEXT: s_nop 1 1806; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf 1807; GFX8-NEXT: v_readlane_b32 s2, v1, 63 1808; GFX8-NEXT: s_mov_b64 exec, s[0:1] 1809; GFX8-NEXT: s_mov_b32 s0, s2 1810; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1811; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 1812; GFX8-NEXT: s_cbranch_execz BB10_2 1813; GFX8-NEXT: ; %bb.1: 1814; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 1815; GFX8-NEXT: v_mov_b32_e32 v2, s0 1816; GFX8-NEXT: s_mov_b32 m0, -1 1817; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1818; GFX8-NEXT: ds_sub_u32 v0, v2 1819; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1820; GFX8-NEXT: BB10_2: 1821; GFX8-NEXT: s_endpgm 1822; 1823; GFX9-LABEL: sub_i32_varying_nouse: 1824; GFX9: ; %bb.0: ; %entry 1825; GFX9-NEXT: v_mov_b32_e32 v1, v0 1826; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 1827; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 1828; GFX9-NEXT: s_not_b64 exec, exec 1829; GFX9-NEXT: v_mov_b32_e32 v1, 0 1830; GFX9-NEXT: s_not_b64 exec, exec 1831; GFX9-NEXT: s_or_saveexec_b64 s[0:1], -1 1832; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 1833; GFX9-NEXT: s_nop 1 1834; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 1835; GFX9-NEXT: s_nop 1 1836; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 1837; GFX9-NEXT: s_nop 1 1838; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 1839; GFX9-NEXT: s_nop 1 1840; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf 1841; GFX9-NEXT: s_nop 1 1842; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf 1843; GFX9-NEXT: v_readlane_b32 s2, v1, 63 1844; GFX9-NEXT: s_mov_b64 exec, s[0:1] 1845; GFX9-NEXT: s_mov_b32 s0, s2 1846; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1847; GFX9-NEXT: 
s_and_saveexec_b64 s[2:3], vcc 1848; GFX9-NEXT: s_cbranch_execz BB10_2 1849; GFX9-NEXT: ; %bb.1: 1850; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 1851; GFX9-NEXT: v_mov_b32_e32 v2, s0 1852; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1853; GFX9-NEXT: ds_sub_u32 v0, v2 1854; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1855; GFX9-NEXT: BB10_2: 1856; GFX9-NEXT: s_endpgm 1857; 1858; GFX1064-LABEL: sub_i32_varying_nouse: 1859; GFX1064: ; %bb.0: ; %entry 1860; GFX1064-NEXT: v_mov_b32_e32 v1, v0 1861; GFX1064-NEXT: s_not_b64 exec, exec 1862; GFX1064-NEXT: v_mov_b32_e32 v1, 0 1863; GFX1064-NEXT: s_not_b64 exec, exec 1864; GFX1064-NEXT: s_or_saveexec_b64 s[0:1], -1 1865; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 1866; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 1867; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 1868; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 1869; GFX1064-NEXT: v_mov_b32_e32 v2, v1 1870; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 1871; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 1872; GFX1064-NEXT: v_readlane_b32 s2, v1, 31 1873; GFX1064-NEXT: v_mov_b32_e32 v2, s2 1874; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 1875; GFX1064-NEXT: s_mov_b64 exec, s[0:1] 1876; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 1877; GFX1064-NEXT: s_or_saveexec_b64 s[0:1], -1 1878; GFX1064-NEXT: v_readlane_b32 s2, v1, 63 1879; GFX1064-NEXT: s_mov_b64 exec, s[0:1] 1880; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0 1881; GFX1064-NEXT: s_mov_b32 s0, s2 1882; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1883; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc 1884; GFX1064-NEXT: s_cbranch_execz BB10_2 1885; GFX1064-NEXT: ; %bb.1: 1886; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 1887; GFX1064-NEXT: 
v_mov_b32_e32 v3, s0 1888; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1889; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 1890; GFX1064-NEXT: ds_sub_u32 v0, v3 1891; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1892; GFX1064-NEXT: buffer_gl0_inv 1893; GFX1064-NEXT: BB10_2: 1894; GFX1064-NEXT: s_endpgm 1895; 1896; GFX1032-LABEL: sub_i32_varying_nouse: 1897; GFX1032: ; %bb.0: ; %entry 1898; GFX1032-NEXT: v_mov_b32_e32 v1, v0 1899; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 1900; GFX1032-NEXT: v_mov_b32_e32 v1, 0 1901; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 1902; GFX1032-NEXT: s_or_saveexec_b32 s0, -1 1903; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 1904; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 1905; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 1906; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 1907; GFX1032-NEXT: v_mov_b32_e32 v2, v1 1908; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 1909; GFX1032-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 1910; GFX1032-NEXT: v_readlane_b32 s1, v1, 31 1911; GFX1032-NEXT: s_mov_b32 exec_lo, s0 1912; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 1913; GFX1032-NEXT: s_mov_b32 s0, s1 1914; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 1915; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo 1916; GFX1032-NEXT: s_cbranch_execz BB10_2 1917; GFX1032-NEXT: ; %bb.1: 1918; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 1919; GFX1032-NEXT: v_mov_b32_e32 v3, s0 1920; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1921; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 1922; GFX1032-NEXT: ds_sub_u32 v0, v3 1923; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1924; GFX1032-NEXT: buffer_gl0_inv 1925; GFX1032-NEXT: BB10_2: 1926; GFX1032-NEXT: s_endpgm 1927entry: 1928 %lane = call i32 @llvm.amdgcn.workitem.id.x() 1929 %old = atomicrmw sub i32 
addrspace(3)* @local_var32, i32 %lane acq_rel 1930 ret void 1931} 1932 1933define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) { 1934; 1935; 1936; GFX7LESS-LABEL: sub_i64_constant: 1937; GFX7LESS: ; %bb.0: ; %entry 1938; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec 1939; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1940; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 1941; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 1942; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1943; GFX7LESS-NEXT: ; implicit-def: $vgpr1_vgpr2 1944; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc 1945; GFX7LESS-NEXT: s_cbranch_execz BB11_2 1946; GFX7LESS-NEXT: ; %bb.1: 1947; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 1948; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0 1949; GFX7LESS-NEXT: s_mul_i32 s4, s4, 5 1950; GFX7LESS-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 1951; GFX7LESS-NEXT: v_mov_b32_e32 v1, s4 1952; GFX7LESS-NEXT: s_mov_b32 m0, -1 1953; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1954; GFX7LESS-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] 1955; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1956; GFX7LESS-NEXT: BB11_2: 1957; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] 1958; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1959; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v1 1960; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v2 1961; GFX7LESS-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v0 1962; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1963; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 1964; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4 1965; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 1966; GFX7LESS-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc 1967; GFX7LESS-NEXT: s_mov_b32 s2, -1 1968; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1969; GFX7LESS-NEXT: s_endpgm 1970; 1971; GFX8-LABEL: sub_i64_constant: 1972; GFX8: ; %bb.0: ; %entry 1973; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1974; GFX8-NEXT: s_mov_b64 s[4:5], exec 1975; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 1976; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 1977; GFX8-NEXT: 
v_cmp_eq_u32_e32 vcc, 0, v0 1978; GFX8-NEXT: ; implicit-def: $vgpr1_vgpr2 1979; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 1980; GFX8-NEXT: s_cbranch_execz BB11_2 1981; GFX8-NEXT: ; %bb.1: 1982; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 1983; GFX8-NEXT: s_mul_i32 s4, s4, 5 1984; GFX8-NEXT: v_mov_b32_e32 v1, s4 1985; GFX8-NEXT: v_mov_b32_e32 v2, 0 1986; GFX8-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 1987; GFX8-NEXT: s_mov_b32 m0, -1 1988; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1989; GFX8-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] 1990; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1991; GFX8-NEXT: BB11_2: 1992; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 1993; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1994; GFX8-NEXT: v_readfirstlane_b32 s3, v2 1995; GFX8-NEXT: v_readfirstlane_b32 s2, v1 1996; GFX8-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v0 1997; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1998; GFX8-NEXT: v_mov_b32_e32 v2, s3 1999; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s2, v0 2000; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc 2001; GFX8-NEXT: s_mov_b32 s3, 0xf000 2002; GFX8-NEXT: s_mov_b32 s2, -1 2003; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2004; GFX8-NEXT: s_endpgm 2005; 2006; GFX9-LABEL: sub_i64_constant: 2007; GFX9: ; %bb.0: ; %entry 2008; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2009; GFX9-NEXT: s_mov_b64 s[4:5], exec 2010; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 2011; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 2012; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2013; GFX9-NEXT: ; implicit-def: $vgpr1_vgpr2 2014; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 2015; GFX9-NEXT: s_cbranch_execz BB11_2 2016; GFX9-NEXT: ; %bb.1: 2017; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 2018; GFX9-NEXT: s_mul_i32 s4, s4, 5 2019; GFX9-NEXT: v_mov_b32_e32 v1, s4 2020; GFX9-NEXT: v_mov_b32_e32 v2, 0 2021; GFX9-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 2022; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2023; GFX9-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] 2024; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2025; GFX9-NEXT: BB11_2: 2026; GFX9-NEXT: s_or_b64 
exec, exec, s[2:3] 2027; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2028; GFX9-NEXT: v_readfirstlane_b32 s3, v2 2029; GFX9-NEXT: v_readfirstlane_b32 s2, v1 2030; GFX9-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v0 2031; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v0 2032; GFX9-NEXT: v_mov_b32_e32 v2, s3 2033; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s2, v0 2034; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc 2035; GFX9-NEXT: s_mov_b32 s3, 0xf000 2036; GFX9-NEXT: s_mov_b32 s2, -1 2037; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2038; GFX9-NEXT: s_endpgm 2039; 2040; GFX1064-LABEL: sub_i64_constant: 2041; GFX1064: ; %bb.0: ; %entry 2042; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2043; GFX1064-NEXT: s_mov_b64 s[4:5], exec 2044; GFX1064-NEXT: ; implicit-def: $vgpr1_vgpr2 2045; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 2046; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s5, v0 2047; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2048; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc 2049; GFX1064-NEXT: s_cbranch_execz BB11_2 2050; GFX1064-NEXT: ; %bb.1: 2051; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 2052; GFX1064-NEXT: v_mov_b32_e32 v2, 0 2053; GFX1064-NEXT: s_mul_i32 s4, s4, 5 2054; GFX1064-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 2055; GFX1064-NEXT: v_mov_b32_e32 v1, s4 2056; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2057; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 2058; GFX1064-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] 2059; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2060; GFX1064-NEXT: buffer_gl0_inv 2061; GFX1064-NEXT: BB11_2: 2062; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 2063; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] 2064; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 2065; GFX1064-NEXT: v_mul_u32_u24_e32 v1, 5, v0 2066; GFX1064-NEXT: v_readfirstlane_b32 s3, v2 2067; GFX1064-NEXT: v_mul_hi_u32_u24_e32 v2, 5, v0 2068; GFX1064-NEXT: v_sub_co_u32_e64 v0, vcc, s2, v1 2069; GFX1064-NEXT: s_mov_b32 s2, -1 2070; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v2, vcc 2071; GFX1064-NEXT: s_mov_b32 s3, 
0x31016000 2072; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2073; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2074; GFX1064-NEXT: s_endpgm 2075; 2076; GFX1032-LABEL: sub_i64_constant: 2077; GFX1032: ; %bb.0: ; %entry 2078; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2079; GFX1032-NEXT: s_mov_b32 s3, exec_lo 2080; GFX1032-NEXT: ; implicit-def: $vgpr1_vgpr2 2081; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s3, 0 2082; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 2083; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo 2084; GFX1032-NEXT: s_cbranch_execz BB11_2 2085; GFX1032-NEXT: ; %bb.1: 2086; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 2087; GFX1032-NEXT: v_mov_b32_e32 v2, 0 2088; GFX1032-NEXT: s_mul_i32 s3, s3, 5 2089; GFX1032-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 2090; GFX1032-NEXT: v_mov_b32_e32 v1, s3 2091; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2092; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 2093; GFX1032-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] 2094; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2095; GFX1032-NEXT: buffer_gl0_inv 2096; GFX1032-NEXT: BB11_2: 2097; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 2098; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 2099; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 2100; GFX1032-NEXT: v_mul_u32_u24_e32 v1, 5, v0 2101; GFX1032-NEXT: v_readfirstlane_b32 s3, v2 2102; GFX1032-NEXT: v_mul_hi_u32_u24_e32 v2, 5, v0 2103; GFX1032-NEXT: v_sub_co_u32_e64 v0, vcc_lo, s2, v1 2104; GFX1032-NEXT: s_mov_b32 s2, -1 2105; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v2, vcc_lo 2106; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 2107; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2108; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2109; GFX1032-NEXT: s_endpgm 2110entry: 2111 %old = atomicrmw sub i64 addrspace(3)* @local_var64, i64 5 acq_rel 2112 store i64 %old, i64 addrspace(1)* %out 2113 ret void 2114} 2115 2116define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 %subitive) { 2117; 2118; 2119; GFX7LESS-LABEL: sub_i64_uniform: 2120; 
GFX7LESS: ; %bb.0: ; %entry 2121; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec 2122; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 2123; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 2124; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s7, v0 2125; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2126; GFX7LESS-NEXT: ; implicit-def: $vgpr1_vgpr2 2127; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc 2128; GFX7LESS-NEXT: s_cbranch_execz BB12_2 2129; GFX7LESS-NEXT: ; %bb.1: 2130; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 2131; GFX7LESS-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 2132; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2133; GFX7LESS-NEXT: s_mul_i32 s7, s3, s6 2134; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6 2135; GFX7LESS-NEXT: v_mul_hi_u32 v1, s2, v1 2136; GFX7LESS-NEXT: s_mul_i32 s6, s2, s6 2137; GFX7LESS-NEXT: v_add_i32_e32 v2, vcc, s7, v1 2138; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6 2139; GFX7LESS-NEXT: s_mov_b32 m0, -1 2140; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2141; GFX7LESS-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] 2142; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2143; GFX7LESS-NEXT: BB12_2: 2144; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] 2145; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 2146; GFX7LESS-NEXT: s_mov_b32 s6, -1 2147; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2148; GFX7LESS-NEXT: s_mov_b32 s4, s0 2149; GFX7LESS-NEXT: s_mov_b32 s5, s1 2150; GFX7LESS-NEXT: v_readfirstlane_b32 s0, v1 2151; GFX7LESS-NEXT: v_readfirstlane_b32 s1, v2 2152; GFX7LESS-NEXT: v_mul_lo_u32 v1, s3, v0 2153; GFX7LESS-NEXT: v_mul_hi_u32 v2, s2, v0 2154; GFX7LESS-NEXT: v_mul_lo_u32 v0, s2, v0 2155; GFX7LESS-NEXT: v_add_i32_e32 v1, vcc, v2, v1 2156; GFX7LESS-NEXT: v_mov_b32_e32 v2, s1 2157; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 2158; GFX7LESS-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc 2159; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 2160; GFX7LESS-NEXT: s_endpgm 2161; 2162; GFX8-LABEL: sub_i64_uniform: 2163; GFX8: ; %bb.0: ; %entry 2164; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 2165; GFX8-NEXT: 
s_mov_b64 s[6:7], exec 2166; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 2167; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 2168; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2169; GFX8-NEXT: ; implicit-def: $vgpr1_vgpr2 2170; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 2171; GFX8-NEXT: s_cbranch_execz BB12_2 2172; GFX8-NEXT: ; %bb.1: 2173; GFX8-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 2174; GFX8-NEXT: v_mov_b32_e32 v1, s6 2175; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2176; GFX8-NEXT: v_mul_hi_u32 v1, s2, v1 2177; GFX8-NEXT: s_mul_i32 s7, s3, s6 2178; GFX8-NEXT: s_mul_i32 s6, s2, s6 2179; GFX8-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 2180; GFX8-NEXT: v_add_u32_e32 v2, vcc, s7, v1 2181; GFX8-NEXT: v_mov_b32_e32 v1, s6 2182; GFX8-NEXT: s_mov_b32 m0, -1 2183; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2184; GFX8-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] 2185; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2186; GFX8-NEXT: BB12_2: 2187; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 2188; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2189; GFX8-NEXT: s_mov_b32 s4, s0 2190; GFX8-NEXT: v_readfirstlane_b32 s0, v1 2191; GFX8-NEXT: v_mul_lo_u32 v1, s3, v0 2192; GFX8-NEXT: v_mul_hi_u32 v3, s2, v0 2193; GFX8-NEXT: v_mul_lo_u32 v0, s2, v0 2194; GFX8-NEXT: s_mov_b32 s5, s1 2195; GFX8-NEXT: v_readfirstlane_b32 s1, v2 2196; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 2197; GFX8-NEXT: v_mov_b32_e32 v2, s1 2198; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s0, v0 2199; GFX8-NEXT: s_mov_b32 s7, 0xf000 2200; GFX8-NEXT: s_mov_b32 s6, -1 2201; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc 2202; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 2203; GFX8-NEXT: s_endpgm 2204; 2205; GFX9-LABEL: sub_i64_uniform: 2206; GFX9: ; %bb.0: ; %entry 2207; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 2208; GFX9-NEXT: s_mov_b64 s[6:7], exec 2209; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 2210; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 2211; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2212; GFX9-NEXT: ; implicit-def: $vgpr1_vgpr2 2213; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 2214; 
GFX9-NEXT: s_cbranch_execz BB12_2 2215; GFX9-NEXT: ; %bb.1: 2216; GFX9-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 2217; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2218; GFX9-NEXT: s_mul_i32 s7, s3, s6 2219; GFX9-NEXT: s_mul_hi_u32 s8, s2, s6 2220; GFX9-NEXT: s_add_i32 s8, s8, s7 2221; GFX9-NEXT: s_mul_i32 s6, s2, s6 2222; GFX9-NEXT: v_mov_b32_e32 v1, s6 2223; GFX9-NEXT: v_mov_b32_e32 v2, s8 2224; GFX9-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 2225; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2226; GFX9-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] 2227; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2228; GFX9-NEXT: BB12_2: 2229; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 2230; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2231; GFX9-NEXT: v_mul_lo_u32 v3, s3, v0 2232; GFX9-NEXT: v_mul_hi_u32 v4, s2, v0 2233; GFX9-NEXT: v_mul_lo_u32 v0, s2, v0 2234; GFX9-NEXT: s_mov_b32 s4, s0 2235; GFX9-NEXT: v_readfirstlane_b32 s0, v1 2236; GFX9-NEXT: s_mov_b32 s5, s1 2237; GFX9-NEXT: v_readfirstlane_b32 s1, v2 2238; GFX9-NEXT: v_add_u32_e32 v1, v4, v3 2239; GFX9-NEXT: v_mov_b32_e32 v2, s1 2240; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s0, v0 2241; GFX9-NEXT: s_mov_b32 s7, 0xf000 2242; GFX9-NEXT: s_mov_b32 s6, -1 2243; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc 2244; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 2245; GFX9-NEXT: s_endpgm 2246; 2247; GFX1064-LABEL: sub_i64_uniform: 2248; GFX1064: ; %bb.0: ; %entry 2249; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 2250; GFX1064-NEXT: s_mov_b64 s[6:7], exec 2251; GFX1064-NEXT: ; implicit-def: $vgpr1_vgpr2 2252; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 2253; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s7, v0 2254; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2255; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 2256; GFX1064-NEXT: s_cbranch_execz BB12_2 2257; GFX1064-NEXT: ; %bb.1: 2258; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 2259; GFX1064-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 2260; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2261; GFX1064-NEXT: s_mul_i32 s7, s3, s6 2262; GFX1064-NEXT: 
s_mul_hi_u32 s8, s2, s6 2263; GFX1064-NEXT: s_mul_i32 s6, s2, s6 2264; GFX1064-NEXT: s_add_i32 s8, s8, s7 2265; GFX1064-NEXT: v_mov_b32_e32 v1, s6 2266; GFX1064-NEXT: v_mov_b32_e32 v2, s8 2267; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2268; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 2269; GFX1064-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] 2270; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2271; GFX1064-NEXT: buffer_gl0_inv 2272; GFX1064-NEXT: BB12_2: 2273; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 2274; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 2275; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2276; GFX1064-NEXT: v_mul_lo_u32 v3, s3, v0 2277; GFX1064-NEXT: v_mul_hi_u32 v4, s2, v0 2278; GFX1064-NEXT: v_mul_lo_u32 v0, s2, v0 2279; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 2280; GFX1064-NEXT: v_readfirstlane_b32 s4, v2 2281; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 2282; GFX1064-NEXT: v_add_nc_u32_e32 v1, v4, v3 2283; GFX1064-NEXT: v_sub_co_u32_e64 v0, vcc, s2, v0 2284; GFX1064-NEXT: s_mov_b32 s2, -1 2285; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s4, v1, vcc 2286; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2287; GFX1064-NEXT: s_endpgm 2288; 2289; GFX1032-LABEL: sub_i64_uniform: 2290; GFX1032: ; %bb.0: ; %entry 2291; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 2292; GFX1032-NEXT: s_mov_b32 s5, exec_lo 2293; GFX1032-NEXT: ; implicit-def: $vgpr1_vgpr2 2294; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s5, 0 2295; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 2296; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo 2297; GFX1032-NEXT: s_cbranch_execz BB12_2 2298; GFX1032-NEXT: ; %bb.1: 2299; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s5 2300; GFX1032-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 2301; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2302; GFX1032-NEXT: s_mul_i32 s6, s3, s5 2303; GFX1032-NEXT: s_mul_hi_u32 s7, s2, s5 2304; GFX1032-NEXT: s_mul_i32 s5, s2, s5 2305; GFX1032-NEXT: s_add_i32 s7, s7, s6 2306; GFX1032-NEXT: v_mov_b32_e32 v1, s5 2307; GFX1032-NEXT: v_mov_b32_e32 v2, s7 2308; GFX1032-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) 2309; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 2310; GFX1032-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] 2311; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2312; GFX1032-NEXT: buffer_gl0_inv 2313; GFX1032-NEXT: BB12_2: 2314; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 2315; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 2316; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2317; GFX1032-NEXT: v_mul_lo_u32 v3, s3, v0 2318; GFX1032-NEXT: v_mul_hi_u32 v4, s2, v0 2319; GFX1032-NEXT: v_mul_lo_u32 v0, s2, v0 2320; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 2321; GFX1032-NEXT: v_readfirstlane_b32 s4, v2 2322; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 2323; GFX1032-NEXT: v_add_nc_u32_e32 v1, v4, v3 2324; GFX1032-NEXT: v_sub_co_u32_e64 v0, vcc_lo, s2, v0 2325; GFX1032-NEXT: s_mov_b32 s2, -1 2326; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s4, v1, vcc_lo 2327; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2328; GFX1032-NEXT: s_endpgm 2329entry: 2330 %old = atomicrmw sub i64 addrspace(3)* @local_var64, i64 %subitive acq_rel 2331 store i64 %old, i64 addrspace(1)* %out 2332 ret void 2333} 2334 2335define amdgpu_kernel void @sub_i64_varying(i64 addrspace(1)* %out) { 2336; 2337; 2338; GFX7LESS-LABEL: sub_i64_varying: 2339; GFX7LESS: ; %bb.0: ; %entry 2340; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 2341; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 2342; GFX7LESS-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 2343; GFX7LESS-NEXT: s_mov_b32 m0, -1 2344; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2345; GFX7LESS-NEXT: ds_sub_rtn_u64 v[0:1], v2, v[0:1] 2346; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2347; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 2348; GFX7LESS-NEXT: s_mov_b32 s2, -1 2349; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2350; GFX7LESS-NEXT: s_endpgm 2351; 2352; GFX8-LABEL: sub_i64_varying: 2353; GFX8: ; %bb.0: ; %entry 2354; GFX8-NEXT: v_mov_b32_e32 v1, 0 2355; GFX8-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 2356; GFX8-NEXT: s_mov_b32 m0, -1 2357; GFX8-NEXT: s_load_dwordx2 
s[0:1], s[0:1], 0x24 2358; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2359; GFX8-NEXT: ds_sub_rtn_u64 v[0:1], v2, v[0:1] 2360; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2361; GFX8-NEXT: s_mov_b32 s3, 0xf000 2362; GFX8-NEXT: s_mov_b32 s2, -1 2363; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2364; GFX8-NEXT: s_endpgm 2365; 2366; GFX9-LABEL: sub_i64_varying: 2367; GFX9: ; %bb.0: ; %entry 2368; GFX9-NEXT: v_mov_b32_e32 v1, 0 2369; GFX9-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 2370; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2371; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2372; GFX9-NEXT: ds_sub_rtn_u64 v[0:1], v2, v[0:1] 2373; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2374; GFX9-NEXT: s_mov_b32 s3, 0xf000 2375; GFX9-NEXT: s_mov_b32 s2, -1 2376; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2377; GFX9-NEXT: s_endpgm 2378; 2379; GFX1064-LABEL: sub_i64_varying: 2380; GFX1064: ; %bb.0: ; %entry 2381; GFX1064-NEXT: v_mov_b32_e32 v1, 0 2382; GFX1064-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 2383; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2384; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 2385; GFX1064-NEXT: s_mov_b32 s2, -1 2386; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2387; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 2388; GFX1064-NEXT: ds_sub_rtn_u64 v[0:1], v2, v[0:1] 2389; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2390; GFX1064-NEXT: buffer_gl0_inv 2391; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2392; GFX1064-NEXT: s_endpgm 2393; 2394; GFX1032-LABEL: sub_i64_varying: 2395; GFX1032: ; %bb.0: ; %entry 2396; GFX1032-NEXT: v_mov_b32_e32 v1, 0 2397; GFX1032-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 2398; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2399; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 2400; GFX1032-NEXT: s_mov_b32 s2, -1 2401; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2402; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 2403; GFX1032-NEXT: ds_sub_rtn_u64 v[0:1], v2, v[0:1] 2404; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2405; GFX1032-NEXT: buffer_gl0_inv 2406; 
GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2407; GFX1032-NEXT: s_endpgm 2408entry: 2409 %lane = call i32 @llvm.amdgcn.workitem.id.x() 2410 %zext = zext i32 %lane to i64 2411 %old = atomicrmw sub i64 addrspace(3)* @local_var64, i64 %zext acq_rel 2412 store i64 %old, i64 addrspace(1)* %out 2413 ret void 2414} 2415 2416define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) { 2417; 2418; 2419; GFX7LESS-LABEL: and_i32_varying: 2420; GFX7LESS: ; %bb.0: ; %entry 2421; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 2422; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 2423; GFX7LESS-NEXT: s_mov_b32 m0, -1 2424; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2425; GFX7LESS-NEXT: ds_and_rtn_b32 v0, v1, v0 2426; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2427; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 2428; GFX7LESS-NEXT: s_mov_b32 s2, -1 2429; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 2430; GFX7LESS-NEXT: s_endpgm 2431; 2432; GFX8-LABEL: and_i32_varying: 2433; GFX8: ; %bb.0: ; %entry 2434; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2435; GFX8-NEXT: v_mov_b32_e32 v2, v0 2436; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 2437; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 2438; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 2439; GFX8-NEXT: v_mov_b32_e32 v1, -1 2440; GFX8-NEXT: s_mov_b64 exec, s[2:3] 2441; GFX8-NEXT: s_not_b64 exec, exec 2442; GFX8-NEXT: v_mov_b32_e32 v2, -1 2443; GFX8-NEXT: s_not_b64 exec, exec 2444; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 2445; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 2446; GFX8-NEXT: s_nop 1 2447; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 2448; GFX8-NEXT: s_nop 1 2449; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 2450; GFX8-NEXT: s_nop 1 2451; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 2452; GFX8-NEXT: s_nop 1 2453; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 2454; 
GFX8-NEXT: s_nop 1 2455; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 2456; GFX8-NEXT: v_readlane_b32 s4, v2, 63 2457; GFX8-NEXT: s_nop 0 2458; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 2459; GFX8-NEXT: s_mov_b64 exec, s[2:3] 2460; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2461; GFX8-NEXT: ; implicit-def: $vgpr0 2462; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 2463; GFX8-NEXT: s_cbranch_execz BB14_2 2464; GFX8-NEXT: ; %bb.1: 2465; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 2466; GFX8-NEXT: v_mov_b32_e32 v3, s4 2467; GFX8-NEXT: s_mov_b32 m0, -1 2468; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2469; GFX8-NEXT: ds_and_rtn_b32 v0, v0, v3 2470; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2471; GFX8-NEXT: BB14_2: 2472; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 2473; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2474; GFX8-NEXT: v_readfirstlane_b32 s2, v0 2475; GFX8-NEXT: v_mov_b32_e32 v0, v1 2476; GFX8-NEXT: v_and_b32_e32 v0, s2, v0 2477; GFX8-NEXT: s_mov_b32 s3, 0xf000 2478; GFX8-NEXT: s_mov_b32 s2, -1 2479; GFX8-NEXT: s_nop 0 2480; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 2481; GFX8-NEXT: s_endpgm 2482; 2483; GFX9-LABEL: and_i32_varying: 2484; GFX9: ; %bb.0: ; %entry 2485; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2486; GFX9-NEXT: v_mov_b32_e32 v2, v0 2487; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 2488; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 2489; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 2490; GFX9-NEXT: v_mov_b32_e32 v1, -1 2491; GFX9-NEXT: s_mov_b64 exec, s[2:3] 2492; GFX9-NEXT: s_not_b64 exec, exec 2493; GFX9-NEXT: v_mov_b32_e32 v2, -1 2494; GFX9-NEXT: s_not_b64 exec, exec 2495; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 2496; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 2497; GFX9-NEXT: s_nop 1 2498; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 2499; GFX9-NEXT: s_nop 1 2500; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 2501; GFX9-NEXT: s_nop 
1 2502; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 2503; GFX9-NEXT: s_nop 1 2504; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 2505; GFX9-NEXT: s_nop 1 2506; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 2507; GFX9-NEXT: v_readlane_b32 s4, v2, 63 2508; GFX9-NEXT: s_nop 0 2509; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 2510; GFX9-NEXT: s_mov_b64 exec, s[2:3] 2511; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2512; GFX9-NEXT: ; implicit-def: $vgpr0 2513; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 2514; GFX9-NEXT: s_cbranch_execz BB14_2 2515; GFX9-NEXT: ; %bb.1: 2516; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 2517; GFX9-NEXT: v_mov_b32_e32 v3, s4 2518; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2519; GFX9-NEXT: ds_and_rtn_b32 v0, v0, v3 2520; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2521; GFX9-NEXT: BB14_2: 2522; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 2523; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2524; GFX9-NEXT: v_readfirstlane_b32 s2, v0 2525; GFX9-NEXT: v_mov_b32_e32 v0, v1 2526; GFX9-NEXT: v_and_b32_e32 v0, s2, v0 2527; GFX9-NEXT: s_mov_b32 s3, 0xf000 2528; GFX9-NEXT: s_mov_b32 s2, -1 2529; GFX9-NEXT: s_nop 0 2530; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 2531; GFX9-NEXT: s_endpgm 2532; 2533; GFX1064-LABEL: and_i32_varying: 2534; GFX1064: ; %bb.0: ; %entry 2535; GFX1064-NEXT: v_mov_b32_e32 v1, v0 2536; GFX1064-NEXT: s_not_b64 exec, exec 2537; GFX1064-NEXT: v_mov_b32_e32 v1, -1 2538; GFX1064-NEXT: s_not_b64 exec, exec 2539; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 2540; GFX1064-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 2541; GFX1064-NEXT: v_mov_b32_e32 v3, -1 2542; GFX1064-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 2543; GFX1064-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 2544; GFX1064-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 2545; GFX1064-NEXT: v_mov_b32_e32 v2, 
v1 2546; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 2547; GFX1064-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 2548; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 2549; GFX1064-NEXT: v_mov_b32_e32 v2, s4 2550; GFX1064-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 2551; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 2552; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 2553; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 2554; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2555; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 2556; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 2557; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 2558; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 2559; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 2560; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 2561; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 2562; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 2563; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 2564; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 2565; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0 2566; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 2567; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 2568; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 2569; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2570; GFX1064-NEXT: s_mov_b32 s2, -1 2571; GFX1064-NEXT: ; implicit-def: $vgpr0 2572; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 2573; GFX1064-NEXT: s_cbranch_execz BB14_2 2574; GFX1064-NEXT: ; %bb.1: 2575; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo 2576; GFX1064-NEXT: v_mov_b32_e32 v4, s7 2577; GFX1064-NEXT: s_mov_b32 s3, s7 2578; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2579; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 2580; GFX1064-NEXT: ds_and_rtn_b32 v0, v7, v4 2581; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2582; GFX1064-NEXT: buffer_gl0_inv 2583; GFX1064-NEXT: BB14_2: 2584; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 2585; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 2586; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 
2587; GFX1064-NEXT: v_mov_b32_e32 v0, v3 2588; GFX1064-NEXT: v_and_b32_e32 v0, s3, v0 2589; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 2590; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2591; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 2592; GFX1064-NEXT: s_endpgm 2593; 2594; GFX1032-LABEL: and_i32_varying: 2595; GFX1032: ; %bb.0: ; %entry 2596; GFX1032-NEXT: v_mov_b32_e32 v1, v0 2597; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 2598; GFX1032-NEXT: v_mov_b32_e32 v1, -1 2599; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 2600; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 2601; GFX1032-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 2602; GFX1032-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 2603; GFX1032-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 2604; GFX1032-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 2605; GFX1032-NEXT: v_mov_b32_e32 v2, v1 2606; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 2607; GFX1032-NEXT: s_mov_b32 exec_lo, s2 2608; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2609; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 2610; GFX1032-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 2611; GFX1032-NEXT: v_mov_b32_e32 v3, -1 2612; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 2613; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 2614; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 2615; GFX1032-NEXT: s_mov_b32 exec_lo, s2 2616; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 2617; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 2618; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 2619; GFX1032-NEXT: s_mov_b32 exec_lo, s2 2620; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 2621; GFX1032-NEXT: s_mov_b32 s2, -1 2622; GFX1032-NEXT: ; implicit-def: $vgpr0 2623; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 2624; GFX1032-NEXT: s_cbranch_execz BB14_2 2625; GFX1032-NEXT: ; %bb.1: 2626; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo 2627; GFX1032-NEXT: 
v_mov_b32_e32 v4, s4 2628; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2629; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 2630; GFX1032-NEXT: ds_and_rtn_b32 v0, v7, v4 2631; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2632; GFX1032-NEXT: buffer_gl0_inv 2633; GFX1032-NEXT: BB14_2: 2634; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 2635; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 2636; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 2637; GFX1032-NEXT: v_mov_b32_e32 v0, v3 2638; GFX1032-NEXT: v_and_b32_e32 v0, s3, v0 2639; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 2640; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2641; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 2642; GFX1032-NEXT: s_endpgm 2643entry: 2644 %lane = call i32 @llvm.amdgcn.workitem.id.x() 2645 %old = atomicrmw and i32 addrspace(3)* @local_var32, i32 %lane acq_rel 2646 store i32 %old, i32 addrspace(1)* %out 2647 ret void 2648} 2649 2650define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) { 2651; 2652; 2653; GFX7LESS-LABEL: or_i32_varying: 2654; GFX7LESS: ; %bb.0: ; %entry 2655; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 2656; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 2657; GFX7LESS-NEXT: s_mov_b32 m0, -1 2658; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2659; GFX7LESS-NEXT: ds_or_rtn_b32 v0, v1, v0 2660; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2661; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 2662; GFX7LESS-NEXT: s_mov_b32 s2, -1 2663; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 2664; GFX7LESS-NEXT: s_endpgm 2665; 2666; GFX8-LABEL: or_i32_varying: 2667; GFX8: ; %bb.0: ; %entry 2668; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2669; GFX8-NEXT: v_mov_b32_e32 v2, v0 2670; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 2671; GFX8-NEXT: v_mov_b32_e32 v1, 0 2672; GFX8-NEXT: s_mov_b64 exec, s[2:3] 2673; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 2674; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 2675; GFX8-NEXT: s_not_b64 exec, exec 2676; GFX8-NEXT: v_mov_b32_e32 v2, 0 2677; GFX8-NEXT: s_not_b64 exec, exec 2678; GFX8-NEXT: 
s_or_saveexec_b64 s[2:3], -1 2679; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 2680; GFX8-NEXT: s_nop 1 2681; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 2682; GFX8-NEXT: s_nop 1 2683; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 2684; GFX8-NEXT: s_nop 1 2685; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 2686; GFX8-NEXT: s_nop 1 2687; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 2688; GFX8-NEXT: s_nop 1 2689; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 2690; GFX8-NEXT: v_readlane_b32 s4, v2, 63 2691; GFX8-NEXT: s_nop 0 2692; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 2693; GFX8-NEXT: s_mov_b64 exec, s[2:3] 2694; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2695; GFX8-NEXT: ; implicit-def: $vgpr0 2696; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 2697; GFX8-NEXT: s_cbranch_execz BB15_2 2698; GFX8-NEXT: ; %bb.1: 2699; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 2700; GFX8-NEXT: v_mov_b32_e32 v3, s4 2701; GFX8-NEXT: s_mov_b32 m0, -1 2702; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2703; GFX8-NEXT: ds_or_rtn_b32 v0, v0, v3 2704; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2705; GFX8-NEXT: BB15_2: 2706; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 2707; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2708; GFX8-NEXT: v_readfirstlane_b32 s2, v0 2709; GFX8-NEXT: v_mov_b32_e32 v0, v1 2710; GFX8-NEXT: v_or_b32_e32 v0, s2, v0 2711; GFX8-NEXT: s_mov_b32 s3, 0xf000 2712; GFX8-NEXT: s_mov_b32 s2, -1 2713; GFX8-NEXT: s_nop 0 2714; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 2715; GFX8-NEXT: s_endpgm 2716; 2717; GFX9-LABEL: or_i32_varying: 2718; GFX9: ; %bb.0: ; %entry 2719; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2720; GFX9-NEXT: v_mov_b32_e32 v2, v0 2721; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 2722; GFX9-NEXT: v_mov_b32_e32 v1, 0 2723; GFX9-NEXT: s_mov_b64 exec, 
s[2:3] 2724; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 2725; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 2726; GFX9-NEXT: s_not_b64 exec, exec 2727; GFX9-NEXT: v_mov_b32_e32 v2, 0 2728; GFX9-NEXT: s_not_b64 exec, exec 2729; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 2730; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 2731; GFX9-NEXT: s_nop 1 2732; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 2733; GFX9-NEXT: s_nop 1 2734; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 2735; GFX9-NEXT: s_nop 1 2736; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 2737; GFX9-NEXT: s_nop 1 2738; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 2739; GFX9-NEXT: s_nop 1 2740; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 2741; GFX9-NEXT: v_readlane_b32 s4, v2, 63 2742; GFX9-NEXT: s_nop 0 2743; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 2744; GFX9-NEXT: s_mov_b64 exec, s[2:3] 2745; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2746; GFX9-NEXT: ; implicit-def: $vgpr0 2747; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 2748; GFX9-NEXT: s_cbranch_execz BB15_2 2749; GFX9-NEXT: ; %bb.1: 2750; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 2751; GFX9-NEXT: v_mov_b32_e32 v3, s4 2752; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2753; GFX9-NEXT: ds_or_rtn_b32 v0, v0, v3 2754; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2755; GFX9-NEXT: BB15_2: 2756; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 2757; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2758; GFX9-NEXT: v_readfirstlane_b32 s2, v0 2759; GFX9-NEXT: v_mov_b32_e32 v0, v1 2760; GFX9-NEXT: v_or_b32_e32 v0, s2, v0 2761; GFX9-NEXT: s_mov_b32 s3, 0xf000 2762; GFX9-NEXT: s_mov_b32 s2, -1 2763; GFX9-NEXT: s_nop 0 2764; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 2765; GFX9-NEXT: s_endpgm 2766; 2767; GFX1064-LABEL: or_i32_varying: 2768; GFX1064: ; %bb.0: ; %entry 
2769; GFX1064-NEXT: v_mov_b32_e32 v1, v0 2770; GFX1064-NEXT: s_not_b64 exec, exec 2771; GFX1064-NEXT: v_mov_b32_e32 v1, 0 2772; GFX1064-NEXT: s_not_b64 exec, exec 2773; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 2774; GFX1064-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 2775; GFX1064-NEXT: v_mov_b32_e32 v3, 0 2776; GFX1064-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 2777; GFX1064-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 2778; GFX1064-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 2779; GFX1064-NEXT: v_mov_b32_e32 v2, v1 2780; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 2781; GFX1064-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 2782; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 2783; GFX1064-NEXT: v_mov_b32_e32 v2, s4 2784; GFX1064-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 2785; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 2786; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 2787; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 2788; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2789; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 2790; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 2791; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 2792; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 2793; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 2794; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 2795; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 2796; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 2797; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 2798; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 2799; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0 2800; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 2801; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 2802; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 2803; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2804; GFX1064-NEXT: s_mov_b32 s2, -1 2805; GFX1064-NEXT: ; 
implicit-def: $vgpr0 2806; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 2807; GFX1064-NEXT: s_cbranch_execz BB15_2 2808; GFX1064-NEXT: ; %bb.1: 2809; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo 2810; GFX1064-NEXT: v_mov_b32_e32 v4, s7 2811; GFX1064-NEXT: s_mov_b32 s3, s7 2812; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2813; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 2814; GFX1064-NEXT: ds_or_rtn_b32 v0, v7, v4 2815; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2816; GFX1064-NEXT: buffer_gl0_inv 2817; GFX1064-NEXT: BB15_2: 2818; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 2819; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 2820; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 2821; GFX1064-NEXT: v_mov_b32_e32 v0, v3 2822; GFX1064-NEXT: v_or_b32_e32 v0, s3, v0 2823; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 2824; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2825; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 2826; GFX1064-NEXT: s_endpgm 2827; 2828; GFX1032-LABEL: or_i32_varying: 2829; GFX1032: ; %bb.0: ; %entry 2830; GFX1032-NEXT: v_mov_b32_e32 v1, v0 2831; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 2832; GFX1032-NEXT: v_mov_b32_e32 v1, 0 2833; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 2834; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 2835; GFX1032-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 2836; GFX1032-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 2837; GFX1032-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 2838; GFX1032-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 2839; GFX1032-NEXT: v_mov_b32_e32 v2, v1 2840; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 2841; GFX1032-NEXT: s_mov_b32 exec_lo, s2 2842; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2843; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 2844; GFX1032-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 2845; GFX1032-NEXT: v_mov_b32_e32 v3, 0 2846; GFX1032-NEXT: v_readlane_b32 
s3, v1, 15 2847; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 2848; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 2849; GFX1032-NEXT: s_mov_b32 exec_lo, s2 2850; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 2851; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 2852; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 2853; GFX1032-NEXT: s_mov_b32 exec_lo, s2 2854; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 2855; GFX1032-NEXT: s_mov_b32 s2, -1 2856; GFX1032-NEXT: ; implicit-def: $vgpr0 2857; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 2858; GFX1032-NEXT: s_cbranch_execz BB15_2 2859; GFX1032-NEXT: ; %bb.1: 2860; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo 2861; GFX1032-NEXT: v_mov_b32_e32 v4, s4 2862; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2863; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 2864; GFX1032-NEXT: ds_or_rtn_b32 v0, v7, v4 2865; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2866; GFX1032-NEXT: buffer_gl0_inv 2867; GFX1032-NEXT: BB15_2: 2868; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 2869; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 2870; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 2871; GFX1032-NEXT: v_mov_b32_e32 v0, v3 2872; GFX1032-NEXT: v_or_b32_e32 v0, s3, v0 2873; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 2874; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2875; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 2876; GFX1032-NEXT: s_endpgm 2877entry: 2878 %lane = call i32 @llvm.amdgcn.workitem.id.x() 2879 %old = atomicrmw or i32 addrspace(3)* @local_var32, i32 %lane acq_rel 2880 store i32 %old, i32 addrspace(1)* %out 2881 ret void 2882} 2883 2884define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) { 2885; 2886; 2887; GFX7LESS-LABEL: xor_i32_varying: 2888; GFX7LESS: ; %bb.0: ; %entry 2889; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 2890; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 2891; GFX7LESS-NEXT: s_mov_b32 m0, -1 2892; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2893; GFX7LESS-NEXT: ds_xor_rtn_b32 v0, v1, v0 2894; GFX7LESS-NEXT: 
s_waitcnt lgkmcnt(0) 2895; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 2896; GFX7LESS-NEXT: s_mov_b32 s2, -1 2897; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 2898; GFX7LESS-NEXT: s_endpgm 2899; 2900; GFX8-LABEL: xor_i32_varying: 2901; GFX8: ; %bb.0: ; %entry 2902; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2903; GFX8-NEXT: v_mov_b32_e32 v2, v0 2904; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 2905; GFX8-NEXT: v_mov_b32_e32 v1, 0 2906; GFX8-NEXT: s_mov_b64 exec, s[2:3] 2907; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 2908; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 2909; GFX8-NEXT: s_not_b64 exec, exec 2910; GFX8-NEXT: v_mov_b32_e32 v2, 0 2911; GFX8-NEXT: s_not_b64 exec, exec 2912; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 2913; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 2914; GFX8-NEXT: s_nop 1 2915; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 2916; GFX8-NEXT: s_nop 1 2917; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 2918; GFX8-NEXT: s_nop 1 2919; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 2920; GFX8-NEXT: s_nop 1 2921; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 2922; GFX8-NEXT: s_nop 1 2923; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 2924; GFX8-NEXT: v_readlane_b32 s4, v2, 63 2925; GFX8-NEXT: s_nop 0 2926; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 2927; GFX8-NEXT: s_mov_b64 exec, s[2:3] 2928; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2929; GFX8-NEXT: ; implicit-def: $vgpr0 2930; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 2931; GFX8-NEXT: s_cbranch_execz BB16_2 2932; GFX8-NEXT: ; %bb.1: 2933; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 2934; GFX8-NEXT: v_mov_b32_e32 v3, s4 2935; GFX8-NEXT: s_mov_b32 m0, -1 2936; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2937; GFX8-NEXT: ds_xor_rtn_b32 v0, v0, v3 2938; 
GFX8-NEXT: s_waitcnt lgkmcnt(0) 2939; GFX8-NEXT: BB16_2: 2940; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 2941; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2942; GFX8-NEXT: v_readfirstlane_b32 s2, v0 2943; GFX8-NEXT: v_mov_b32_e32 v0, v1 2944; GFX8-NEXT: v_xor_b32_e32 v0, s2, v0 2945; GFX8-NEXT: s_mov_b32 s3, 0xf000 2946; GFX8-NEXT: s_mov_b32 s2, -1 2947; GFX8-NEXT: s_nop 0 2948; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 2949; GFX8-NEXT: s_endpgm 2950; 2951; GFX9-LABEL: xor_i32_varying: 2952; GFX9: ; %bb.0: ; %entry 2953; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2954; GFX9-NEXT: v_mov_b32_e32 v2, v0 2955; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 2956; GFX9-NEXT: v_mov_b32_e32 v1, 0 2957; GFX9-NEXT: s_mov_b64 exec, s[2:3] 2958; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 2959; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 2960; GFX9-NEXT: s_not_b64 exec, exec 2961; GFX9-NEXT: v_mov_b32_e32 v2, 0 2962; GFX9-NEXT: s_not_b64 exec, exec 2963; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 2964; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 2965; GFX9-NEXT: s_nop 1 2966; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 2967; GFX9-NEXT: s_nop 1 2968; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 2969; GFX9-NEXT: s_nop 1 2970; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 2971; GFX9-NEXT: s_nop 1 2972; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 2973; GFX9-NEXT: s_nop 1 2974; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 2975; GFX9-NEXT: v_readlane_b32 s4, v2, 63 2976; GFX9-NEXT: s_nop 0 2977; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 2978; GFX9-NEXT: s_mov_b64 exec, s[2:3] 2979; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2980; GFX9-NEXT: ; implicit-def: $vgpr0 2981; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 2982; GFX9-NEXT: s_cbranch_execz 
BB16_2 2983; GFX9-NEXT: ; %bb.1: 2984; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 2985; GFX9-NEXT: v_mov_b32_e32 v3, s4 2986; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2987; GFX9-NEXT: ds_xor_rtn_b32 v0, v0, v3 2988; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2989; GFX9-NEXT: BB16_2: 2990; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 2991; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2992; GFX9-NEXT: v_readfirstlane_b32 s2, v0 2993; GFX9-NEXT: v_mov_b32_e32 v0, v1 2994; GFX9-NEXT: v_xor_b32_e32 v0, s2, v0 2995; GFX9-NEXT: s_mov_b32 s3, 0xf000 2996; GFX9-NEXT: s_mov_b32 s2, -1 2997; GFX9-NEXT: s_nop 0 2998; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 2999; GFX9-NEXT: s_endpgm 3000; 3001; GFX1064-LABEL: xor_i32_varying: 3002; GFX1064: ; %bb.0: ; %entry 3003; GFX1064-NEXT: v_mov_b32_e32 v1, v0 3004; GFX1064-NEXT: s_not_b64 exec, exec 3005; GFX1064-NEXT: v_mov_b32_e32 v1, 0 3006; GFX1064-NEXT: s_not_b64 exec, exec 3007; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 3008; GFX1064-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 3009; GFX1064-NEXT: v_mov_b32_e32 v3, 0 3010; GFX1064-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 3011; GFX1064-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 3012; GFX1064-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 3013; GFX1064-NEXT: v_mov_b32_e32 v2, v1 3014; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 3015; GFX1064-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 3016; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 3017; GFX1064-NEXT: v_mov_b32_e32 v2, s4 3018; GFX1064-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 3019; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 3020; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 3021; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 3022; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3023; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], 
-1 3024; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 3025; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 3026; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 3027; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 3028; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 3029; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 3030; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 3031; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 3032; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 3033; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0 3034; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 3035; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 3036; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 3037; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3038; GFX1064-NEXT: s_mov_b32 s2, -1 3039; GFX1064-NEXT: ; implicit-def: $vgpr0 3040; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 3041; GFX1064-NEXT: s_cbranch_execz BB16_2 3042; GFX1064-NEXT: ; %bb.1: 3043; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo 3044; GFX1064-NEXT: v_mov_b32_e32 v4, s7 3045; GFX1064-NEXT: s_mov_b32 s3, s7 3046; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3047; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 3048; GFX1064-NEXT: ds_xor_rtn_b32 v0, v7, v4 3049; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3050; GFX1064-NEXT: buffer_gl0_inv 3051; GFX1064-NEXT: BB16_2: 3052; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 3053; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 3054; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 3055; GFX1064-NEXT: v_mov_b32_e32 v0, v3 3056; GFX1064-NEXT: v_xor_b32_e32 v0, s3, v0 3057; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 3058; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3059; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 3060; GFX1064-NEXT: s_endpgm 3061; 3062; GFX1032-LABEL: xor_i32_varying: 3063; GFX1032: ; %bb.0: ; %entry 3064; GFX1032-NEXT: v_mov_b32_e32 v1, v0 3065; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 3066; GFX1032-NEXT: v_mov_b32_e32 v1, 0 3067; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 3068; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3069; GFX1032-NEXT: v_xor_b32_dpp v1, v1, v1 
row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 3070; GFX1032-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 3071; GFX1032-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 3072; GFX1032-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 3073; GFX1032-NEXT: v_mov_b32_e32 v2, v1 3074; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 3075; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3076; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3077; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3078; GFX1032-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 3079; GFX1032-NEXT: v_mov_b32_e32 v3, 0 3080; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 3081; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 3082; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 3083; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3084; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 3085; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3086; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 3087; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3088; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 3089; GFX1032-NEXT: s_mov_b32 s2, -1 3090; GFX1032-NEXT: ; implicit-def: $vgpr0 3091; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 3092; GFX1032-NEXT: s_cbranch_execz BB16_2 3093; GFX1032-NEXT: ; %bb.1: 3094; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo 3095; GFX1032-NEXT: v_mov_b32_e32 v4, s4 3096; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3097; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 3098; GFX1032-NEXT: ds_xor_rtn_b32 v0, v7, v4 3099; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3100; GFX1032-NEXT: buffer_gl0_inv 3101; GFX1032-NEXT: BB16_2: 3102; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 3103; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 3104; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 3105; GFX1032-NEXT: v_mov_b32_e32 v0, v3 3106; GFX1032-NEXT: v_xor_b32_e32 v0, s3, v0 3107; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 3108; GFX1032-NEXT: 
s_waitcnt lgkmcnt(0) 3109; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 3110; GFX1032-NEXT: s_endpgm 3111entry: 3112 %lane = call i32 @llvm.amdgcn.workitem.id.x() 3113 %old = atomicrmw xor i32 addrspace(3)* @local_var32, i32 %lane acq_rel 3114 store i32 %old, i32 addrspace(1)* %out 3115 ret void 3116} 3117 3118define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) { 3119; Signed 32-bit atomicrmw max on @local_var32 (addrspace 3) whose operand is the per-lane workitem id, so the value differs between lanes. 3120; In the checks below GFX7LESS issues ds_max_rtn_i32 directly, while GFX8 and later first combine lanes with a v_max_i32_dpp sequence seeded with 0x80000000 (v_bfrev_b32 of 1) and only the lane whose mbcnt is 0 performs the ds atomic. 3121; GFX7LESS-LABEL: max_i32_varying: 3122; GFX7LESS: ; %bb.0: ; %entry 3123; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 3124; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 3125; GFX7LESS-NEXT: s_mov_b32 m0, -1 3126; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3127; GFX7LESS-NEXT: ds_max_rtn_i32 v0, v1, v0 3128; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3129; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 3130; GFX7LESS-NEXT: s_mov_b32 s2, -1 3131; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 3132; GFX7LESS-NEXT: s_endpgm 3133; 3134; GFX8-LABEL: max_i32_varying: 3135; GFX8: ; %bb.0: ; %entry 3136; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3137; GFX8-NEXT: v_mov_b32_e32 v2, v0 3138; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3139; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 3140; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 3141; GFX8-NEXT: v_bfrev_b32_e32 v1, 1 3142; GFX8-NEXT: s_mov_b64 exec, s[2:3] 3143; GFX8-NEXT: s_not_b64 exec, exec 3144; GFX8-NEXT: v_mov_b32_e32 v2, v1 3145; GFX8-NEXT: s_not_b64 exec, exec 3146; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 3147; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3148; GFX8-NEXT: s_nop 1 3149; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 3150; GFX8-NEXT: s_nop 1 3151; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 3152; GFX8-NEXT: s_nop 1 3153; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 3154; GFX8-NEXT: s_nop 1 3155; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 3156; GFX8-NEXT: s_nop 1
3157; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 3158; GFX8-NEXT: v_readlane_b32 s4, v2, 63 3159; GFX8-NEXT: s_nop 0 3160; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 3161; GFX8-NEXT: s_mov_b64 exec, s[2:3] 3162; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3163; GFX8-NEXT: ; implicit-def: $vgpr0 3164; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 3165; GFX8-NEXT: s_cbranch_execz BB17_2 3166; GFX8-NEXT: ; %bb.1: 3167; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 3168; GFX8-NEXT: v_mov_b32_e32 v3, s4 3169; GFX8-NEXT: s_mov_b32 m0, -1 3170; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3171; GFX8-NEXT: ds_max_rtn_i32 v0, v0, v3 3172; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3173; GFX8-NEXT: BB17_2: 3174; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 3175; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3176; GFX8-NEXT: v_readfirstlane_b32 s2, v0 3177; GFX8-NEXT: v_mov_b32_e32 v0, v1 3178; GFX8-NEXT: v_max_i32_e32 v0, s2, v0 3179; GFX8-NEXT: s_mov_b32 s3, 0xf000 3180; GFX8-NEXT: s_mov_b32 s2, -1 3181; GFX8-NEXT: s_nop 0 3182; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 3183; GFX8-NEXT: s_endpgm 3184; 3185; GFX9-LABEL: max_i32_varying: 3186; GFX9: ; %bb.0: ; %entry 3187; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3188; GFX9-NEXT: v_mov_b32_e32 v2, v0 3189; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3190; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 3191; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 3192; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 3193; GFX9-NEXT: s_mov_b64 exec, s[2:3] 3194; GFX9-NEXT: s_not_b64 exec, exec 3195; GFX9-NEXT: v_mov_b32_e32 v2, v1 3196; GFX9-NEXT: s_not_b64 exec, exec 3197; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 3198; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3199; GFX9-NEXT: s_nop 1 3200; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 3201; GFX9-NEXT: s_nop 1 3202; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 3203; GFX9-NEXT: s_nop 1 3204;
GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 3205; GFX9-NEXT: s_nop 1 3206; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 3207; GFX9-NEXT: s_nop 1 3208; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 3209; GFX9-NEXT: v_readlane_b32 s4, v2, 63 3210; GFX9-NEXT: s_nop 0 3211; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 3212; GFX9-NEXT: s_mov_b64 exec, s[2:3] 3213; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3214; GFX9-NEXT: ; implicit-def: $vgpr0 3215; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 3216; GFX9-NEXT: s_cbranch_execz BB17_2 3217; GFX9-NEXT: ; %bb.1: 3218; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 3219; GFX9-NEXT: v_mov_b32_e32 v3, s4 3220; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3221; GFX9-NEXT: ds_max_rtn_i32 v0, v0, v3 3222; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3223; GFX9-NEXT: BB17_2: 3224; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 3225; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3226; GFX9-NEXT: v_readfirstlane_b32 s2, v0 3227; GFX9-NEXT: v_mov_b32_e32 v0, v1 3228; GFX9-NEXT: v_max_i32_e32 v0, s2, v0 3229; GFX9-NEXT: s_mov_b32 s3, 0xf000 3230; GFX9-NEXT: s_mov_b32 s2, -1 3231; GFX9-NEXT: s_nop 0 3232; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 3233; GFX9-NEXT: s_endpgm 3234; 3235; GFX1064-LABEL: max_i32_varying: 3236; GFX1064: ; %bb.0: ; %entry 3237; GFX1064-NEXT: v_mov_b32_e32 v2, v0 3238; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 3239; GFX1064-NEXT: v_bfrev_b32_e32 v1, 1 3240; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 3241; GFX1064-NEXT: s_not_b64 exec, exec 3242; GFX1064-NEXT: v_mov_b32_e32 v2, v1 3243; GFX1064-NEXT: s_not_b64 exec, exec 3244; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 3245; GFX1064-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3246; GFX1064-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 3247; GFX1064-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 3248; GFX1064-NEXT: v_max_i32_dpp
v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 3249; GFX1064-NEXT: v_mov_b32_e32 v3, v2 3250; GFX1064-NEXT: v_permlanex16_b32 v3, v3, -1, -1 3251; GFX1064-NEXT: v_max_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 3252; GFX1064-NEXT: v_readlane_b32 s4, v2, 31 3253; GFX1064-NEXT: v_mov_b32_e32 v3, s4 3254; GFX1064-NEXT: v_max_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 3255; GFX1064-NEXT: v_readlane_b32 s4, v2, 15 3256; GFX1064-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3257; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 3258; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3259; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 3260; GFX1064-NEXT: v_readlane_b32 s5, v2, 31 3261; GFX1064-NEXT: v_writelane_b32 v1, s4, 16 3262; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 3263; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 3264; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 3265; GFX1064-NEXT: v_readlane_b32 s7, v2, 63 3266; GFX1064-NEXT: v_readlane_b32 s6, v2, 47 3267; GFX1064-NEXT: v_writelane_b32 v1, s5, 32 3268; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 3269; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0 3270; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 3271; GFX1064-NEXT: v_writelane_b32 v1, s6, 48 3272; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 3273; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3274; GFX1064-NEXT: s_mov_b32 s2, -1 3275; GFX1064-NEXT: ; implicit-def: $vgpr0 3276; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 3277; GFX1064-NEXT: s_cbranch_execz BB17_2 3278; GFX1064-NEXT: ; %bb.1: 3279; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo 3280; GFX1064-NEXT: v_mov_b32_e32 v4, s7 3281; GFX1064-NEXT: s_mov_b32 s3, s7 3282; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3283; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 3284; GFX1064-NEXT: ds_max_rtn_i32 v0, v7, v4 3285; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3286; GFX1064-NEXT: buffer_gl0_inv 3287; GFX1064-NEXT: BB17_2: 3288; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 3289;
GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 3290; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 3291; GFX1064-NEXT: v_mov_b32_e32 v0, v1 3292; GFX1064-NEXT: v_max_i32_e32 v0, s3, v0 3293; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 3294; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3295; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 3296; GFX1064-NEXT: s_endpgm 3297; 3298; GFX1032-LABEL: max_i32_varying: 3299; GFX1032: ; %bb.0: ; %entry 3300; GFX1032-NEXT: v_mov_b32_e32 v2, v0 3301; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3302; GFX1032-NEXT: v_bfrev_b32_e32 v1, 1 3303; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3304; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 3305; GFX1032-NEXT: v_mov_b32_e32 v2, v1 3306; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 3307; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3308; GFX1032-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3309; GFX1032-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 3310; GFX1032-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 3311; GFX1032-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 3312; GFX1032-NEXT: v_mov_b32_e32 v3, v2 3313; GFX1032-NEXT: v_permlanex16_b32 v3, v3, -1, -1 3314; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3315; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3316; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3317; GFX1032-NEXT: v_max_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 3318; GFX1032-NEXT: v_readlane_b32 s3, v2, 15 3319; GFX1032-NEXT: v_readlane_b32 s4, v2, 31 3320; GFX1032-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3321; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3322; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 3323; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3324; GFX1032-NEXT: v_writelane_b32 v1, s3, 16 3325; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3326; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 3327; GFX1032-NEXT: s_mov_b32 s2, -1 3328; GFX1032-NEXT: ; implicit-def: $vgpr0 3329; GFX1032-NEXT:
s_and_saveexec_b32 s3, vcc_lo 3330; GFX1032-NEXT: s_cbranch_execz BB17_2 3331; GFX1032-NEXT: ; %bb.1: 3332; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo 3333; GFX1032-NEXT: v_mov_b32_e32 v4, s4 3334; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3335; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 3336; GFX1032-NEXT: ds_max_rtn_i32 v0, v7, v4 3337; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3338; GFX1032-NEXT: buffer_gl0_inv 3339; GFX1032-NEXT: BB17_2: 3340; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 3341; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 3342; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 3343; GFX1032-NEXT: v_mov_b32_e32 v0, v1 3344; GFX1032-NEXT: v_max_i32_e32 v0, s3, v0 3345; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 3346; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3347; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 3348; GFX1032-NEXT: s_endpgm 3349entry: 3350 %lane = call i32 @llvm.amdgcn.workitem.id.x() 3351 %old = atomicrmw max i32 addrspace(3)* @local_var32, i32 %lane acq_rel 3352 store i32 %old, i32 addrspace(1)* %out 3353 ret void 3354} 3355 3356define amdgpu_kernel void @max_i64_constant(i64 addrspace(1)* %out) { 3357; Signed 64-bit atomicrmw max on @local_var64 (addrspace 3) with the constant 5 as operand, so every lane contributes the same value. 3358; In every configuration checked below only the lane whose mbcnt is 0 executes ds_max_rtn_i64. Afterwards the returned value is compared (v_cmp_gt_i64) against 0x8000000000000000 in that lane and against 5 in the others, and v_cndmask selects the larger one for the store. 3359; GFX7LESS-LABEL: max_i64_constant: 3360; GFX7LESS: ; %bb.0: ; %entry 3361; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 3362; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 3363; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 3364; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3365; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 3366; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc 3367; GFX7LESS-NEXT: s_cbranch_execz BB18_2 3368; GFX7LESS-NEXT: ; %bb.1: 3369; GFX7LESS-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 3370; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5 3371; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 3372; GFX7LESS-NEXT: s_mov_b32 m0, -1 3373; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3374; GFX7LESS-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] 3375; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3376; GFX7LESS-NEXT: BB18_2: 3377; GFX7LESS-NEXT: s_or_b64 exec,
exec, s[2:3] 3378; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3379; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 3380; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 3381; GFX7LESS-NEXT: v_bfrev_b32_e32 v1, 1 3382; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 3383; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 3384; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc 3385; GFX7LESS-NEXT: v_mov_b32_e32 v2, s5 3386; GFX7LESS-NEXT: v_mov_b32_e32 v3, s4 3387; GFX7LESS-NEXT: v_cmp_gt_i64_e32 vcc, s[4:5], v[0:1] 3388; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 3389; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 3390; GFX7LESS-NEXT: s_mov_b32 s2, -1 3391; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 3392; GFX7LESS-NEXT: s_endpgm 3393; 3394; GFX8-LABEL: max_i64_constant: 3395; GFX8: ; %bb.0: ; %entry 3396; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3397; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3398; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 3399; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3400; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 3401; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 3402; GFX8-NEXT: s_cbranch_execz BB18_2 3403; GFX8-NEXT: ; %bb.1: 3404; GFX8-NEXT: v_mov_b32_e32 v0, 5 3405; GFX8-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 3406; GFX8-NEXT: v_mov_b32_e32 v1, 0 3407; GFX8-NEXT: s_mov_b32 m0, -1 3408; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3409; GFX8-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] 3410; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3411; GFX8-NEXT: BB18_2: 3412; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 3413; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3414; GFX8-NEXT: v_readfirstlane_b32 s2, v0 3415; GFX8-NEXT: v_bfrev_b32_e32 v0, 1 3416; GFX8-NEXT: v_readfirstlane_b32 s3, v1 3417; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc 3418; GFX8-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 3419; GFX8-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1] 3420; GFX8-NEXT: v_mov_b32_e32 v2, s3 3421; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 3422; GFX8-NEXT: v_mov_b32_e32 v2, s2 3423; GFX8-NEXT:
v_cndmask_b32_e32 v0, v0, v2, vcc 3424; GFX8-NEXT: s_mov_b32 s3, 0xf000 3425; GFX8-NEXT: s_mov_b32 s2, -1 3426; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 3427; GFX8-NEXT: s_endpgm 3428; 3429; GFX9-LABEL: max_i64_constant: 3430; GFX9: ; %bb.0: ; %entry 3431; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3432; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3433; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 3434; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3435; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 3436; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 3437; GFX9-NEXT: s_cbranch_execz BB18_2 3438; GFX9-NEXT: ; %bb.1: 3439; GFX9-NEXT: v_mov_b32_e32 v0, 5 3440; GFX9-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 3441; GFX9-NEXT: v_mov_b32_e32 v1, 0 3442; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3443; GFX9-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] 3444; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3445; GFX9-NEXT: BB18_2: 3446; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 3447; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3448; GFX9-NEXT: v_readfirstlane_b32 s2, v0 3449; GFX9-NEXT: v_bfrev_b32_e32 v0, 1 3450; GFX9-NEXT: v_readfirstlane_b32 s3, v1 3451; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc 3452; GFX9-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 3453; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1] 3454; GFX9-NEXT: v_mov_b32_e32 v2, s3 3455; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 3456; GFX9-NEXT: v_mov_b32_e32 v2, s2 3457; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 3458; GFX9-NEXT: s_mov_b32 s3, 0xf000 3459; GFX9-NEXT: s_mov_b32 s2, -1 3460; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 3461; GFX9-NEXT: s_endpgm 3462; 3463; GFX1064-LABEL: max_i64_constant: 3464; GFX1064: ; %bb.0: ; %entry 3465; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3466; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 3467; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0 3468; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3469; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 3470; GFX1064-NEXT: s_and_saveexec_b64
s[2:3], vcc 3471; GFX1064-NEXT: s_cbranch_execz BB18_2 3472; GFX1064-NEXT: ; %bb.1: 3473; GFX1064-NEXT: v_mov_b32_e32 v0, 5 3474; GFX1064-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 3475; GFX1064-NEXT: v_mov_b32_e32 v1, 0 3476; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3477; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 3478; GFX1064-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] 3479; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3480; GFX1064-NEXT: buffer_gl0_inv 3481; GFX1064-NEXT: BB18_2: 3482; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 3483; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] 3484; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 3485; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 3486; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc 3487; GFX1064-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 3488; GFX1064-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1] 3489; GFX1064-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc 3490; GFX1064-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc 3491; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 3492; GFX1064-NEXT: s_mov_b32 s2, -1 3493; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3494; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 3495; GFX1064-NEXT: s_endpgm 3496; 3497; GFX1032-LABEL: max_i64_constant: 3498; GFX1032: ; %bb.0: ; %entry 3499; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3500; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 3501; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 3502; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 3503; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo 3504; GFX1032-NEXT: s_cbranch_execz BB18_2 3505; GFX1032-NEXT: ; %bb.1: 3506; GFX1032-NEXT: v_mov_b32_e32 v0, 5 3507; GFX1032-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 3508; GFX1032-NEXT: v_mov_b32_e32 v1, 0 3509; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3510; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 3511; GFX1032-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] 3512; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3513; GFX1032-NEXT: buffer_gl0_inv 3514; GFX1032-NEXT: BB18_2: 3515; GFX1032-NEXT:
s_waitcnt_depctr 0xffe3 3516; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 3517; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 3518; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 3519; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc_lo 3520; GFX1032-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc_lo 3521; GFX1032-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[2:3], v[0:1] 3522; GFX1032-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo 3523; GFX1032-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo 3524; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 3525; GFX1032-NEXT: s_mov_b32 s2, -1 3526; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3527; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 3528; GFX1032-NEXT: s_endpgm 3529entry: 3530 %old = atomicrmw max i64 addrspace(3)* @local_var64, i64 5 acq_rel 3531 store i64 %old, i64 addrspace(1)* %out 3532 ret void 3533} 3534 3535define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) { 3536; Signed 32-bit atomicrmw min on @local_var32 (addrspace 3) whose operand is the per-lane workitem id, so the value differs between lanes. 3537; In the checks below GFX7LESS issues ds_min_rtn_i32 directly, while GFX8 and later first combine lanes with a v_min_i32_dpp sequence seeded with 0x7fffffff (v_bfrev_b32 of -2) and only the lane whose mbcnt is 0 performs the ds atomic. 3538; GFX7LESS-LABEL: min_i32_varying: 3539; GFX7LESS: ; %bb.0: ; %entry 3540; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 3541; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 3542; GFX7LESS-NEXT: s_mov_b32 m0, -1 3543; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3544; GFX7LESS-NEXT: ds_min_rtn_i32 v0, v1, v0 3545; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3546; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 3547; GFX7LESS-NEXT: s_mov_b32 s2, -1 3548; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 3549; GFX7LESS-NEXT: s_endpgm 3550; 3551; GFX8-LABEL: min_i32_varying: 3552; GFX8: ; %bb.0: ; %entry 3553; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3554; GFX8-NEXT: v_mov_b32_e32 v2, v0 3555; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3556; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 3557; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 3558; GFX8-NEXT: v_bfrev_b32_e32 v1, -2 3559; GFX8-NEXT: s_mov_b64 exec, s[2:3] 3560; GFX8-NEXT: s_not_b64 exec, exec 3561; GFX8-NEXT: v_mov_b32_e32 v2, v1 3562; GFX8-NEXT: s_not_b64 exec, exec 3563; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 3564;
GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3565; GFX8-NEXT: s_nop 1 3566; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 3567; GFX8-NEXT: s_nop 1 3568; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 3569; GFX8-NEXT: s_nop 1 3570; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 3571; GFX8-NEXT: s_nop 1 3572; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 3573; GFX8-NEXT: s_nop 1 3574; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 3575; GFX8-NEXT: v_readlane_b32 s4, v2, 63 3576; GFX8-NEXT: s_nop 0 3577; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 3578; GFX8-NEXT: s_mov_b64 exec, s[2:3] 3579; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3580; GFX8-NEXT: ; implicit-def: $vgpr0 3581; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 3582; GFX8-NEXT: s_cbranch_execz BB19_2 3583; GFX8-NEXT: ; %bb.1: 3584; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 3585; GFX8-NEXT: v_mov_b32_e32 v3, s4 3586; GFX8-NEXT: s_mov_b32 m0, -1 3587; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3588; GFX8-NEXT: ds_min_rtn_i32 v0, v0, v3 3589; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3590; GFX8-NEXT: BB19_2: 3591; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 3592; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3593; GFX8-NEXT: v_readfirstlane_b32 s2, v0 3594; GFX8-NEXT: v_mov_b32_e32 v0, v1 3595; GFX8-NEXT: v_min_i32_e32 v0, s2, v0 3596; GFX8-NEXT: s_mov_b32 s3, 0xf000 3597; GFX8-NEXT: s_mov_b32 s2, -1 3598; GFX8-NEXT: s_nop 0 3599; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 3600; GFX8-NEXT: s_endpgm 3601; 3602; GFX9-LABEL: min_i32_varying: 3603; GFX9: ; %bb.0: ; %entry 3604; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3605; GFX9-NEXT: v_mov_b32_e32 v2, v0 3606; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3607; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 3608; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 3609; GFX9-NEXT: v_bfrev_b32_e32 v1, -2 3610;
GFX9-NEXT: s_mov_b64 exec, s[2:3] 3611; GFX9-NEXT: s_not_b64 exec, exec 3612; GFX9-NEXT: v_mov_b32_e32 v2, v1 3613; GFX9-NEXT: s_not_b64 exec, exec 3614; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 3615; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3616; GFX9-NEXT: s_nop 1 3617; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 3618; GFX9-NEXT: s_nop 1 3619; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 3620; GFX9-NEXT: s_nop 1 3621; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 3622; GFX9-NEXT: s_nop 1 3623; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 3624; GFX9-NEXT: s_nop 1 3625; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 3626; GFX9-NEXT: v_readlane_b32 s4, v2, 63 3627; GFX9-NEXT: s_nop 0 3628; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 3629; GFX9-NEXT: s_mov_b64 exec, s[2:3] 3630; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3631; GFX9-NEXT: ; implicit-def: $vgpr0 3632; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 3633; GFX9-NEXT: s_cbranch_execz BB19_2 3634; GFX9-NEXT: ; %bb.1: 3635; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 3636; GFX9-NEXT: v_mov_b32_e32 v3, s4 3637; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3638; GFX9-NEXT: ds_min_rtn_i32 v0, v0, v3 3639; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3640; GFX9-NEXT: BB19_2: 3641; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 3642; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3643; GFX9-NEXT: v_readfirstlane_b32 s2, v0 3644; GFX9-NEXT: v_mov_b32_e32 v0, v1 3645; GFX9-NEXT: v_min_i32_e32 v0, s2, v0 3646; GFX9-NEXT: s_mov_b32 s3, 0xf000 3647; GFX9-NEXT: s_mov_b32 s2, -1 3648; GFX9-NEXT: s_nop 0 3649; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 3650; GFX9-NEXT: s_endpgm 3651; 3652; GFX1064-LABEL: min_i32_varying: 3653; GFX1064: ; %bb.0: ; %entry 3654; GFX1064-NEXT: v_mov_b32_e32 v2, v0 3655; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 3656; GFX1064-NEXT:
v_bfrev_b32_e32 v1, -2 3657; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 3658; GFX1064-NEXT: s_not_b64 exec, exec 3659; GFX1064-NEXT: v_mov_b32_e32 v2, v1 3660; GFX1064-NEXT: s_not_b64 exec, exec 3661; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 3662; GFX1064-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3663; GFX1064-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 3664; GFX1064-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 3665; GFX1064-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 3666; GFX1064-NEXT: v_mov_b32_e32 v3, v2 3667; GFX1064-NEXT: v_permlanex16_b32 v3, v3, -1, -1 3668; GFX1064-NEXT: v_min_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 3669; GFX1064-NEXT: v_readlane_b32 s4, v2, 31 3670; GFX1064-NEXT: v_mov_b32_e32 v3, s4 3671; GFX1064-NEXT: v_min_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 3672; GFX1064-NEXT: v_readlane_b32 s4, v2, 15 3673; GFX1064-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3674; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 3675; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3676; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 3677; GFX1064-NEXT: v_readlane_b32 s5, v2, 31 3678; GFX1064-NEXT: v_writelane_b32 v1, s4, 16 3679; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 3680; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 3681; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 3682; GFX1064-NEXT: v_readlane_b32 s7, v2, 63 3683; GFX1064-NEXT: v_readlane_b32 s6, v2, 47 3684; GFX1064-NEXT: v_writelane_b32 v1, s5, 32 3685; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 3686; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0 3687; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 3688; GFX1064-NEXT: v_writelane_b32 v1, s6, 48 3689; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 3690; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3691; GFX1064-NEXT: s_mov_b32 s2, -1 3692; GFX1064-NEXT: ; implicit-def: $vgpr0 3693; GFX1064-NEXT: s_and_saveexec_b64
s[4:5], vcc 3694; GFX1064-NEXT: s_cbranch_execz BB19_2 3695; GFX1064-NEXT: ; %bb.1: 3696; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo 3697; GFX1064-NEXT: v_mov_b32_e32 v4, s7 3698; GFX1064-NEXT: s_mov_b32 s3, s7 3699; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3700; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 3701; GFX1064-NEXT: ds_min_rtn_i32 v0, v7, v4 3702; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3703; GFX1064-NEXT: buffer_gl0_inv 3704; GFX1064-NEXT: BB19_2: 3705; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 3706; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 3707; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 3708; GFX1064-NEXT: v_mov_b32_e32 v0, v1 3709; GFX1064-NEXT: v_min_i32_e32 v0, s3, v0 3710; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 3711; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3712; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 3713; GFX1064-NEXT: s_endpgm 3714; 3715; GFX1032-LABEL: min_i32_varying: 3716; GFX1032: ; %bb.0: ; %entry 3717; GFX1032-NEXT: v_mov_b32_e32 v2, v0 3718; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3719; GFX1032-NEXT: v_bfrev_b32_e32 v1, -2 3720; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3721; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 3722; GFX1032-NEXT: v_mov_b32_e32 v2, v1 3723; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 3724; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3725; GFX1032-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3726; GFX1032-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 3727; GFX1032-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 3728; GFX1032-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 3729; GFX1032-NEXT: v_mov_b32_e32 v3, v2 3730; GFX1032-NEXT: v_permlanex16_b32 v3, v3, -1, -1 3731; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3732; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3733; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3734; GFX1032-NEXT: v_min_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 3735; GFX1032-NEXT: v_readlane_b32 s3, v2, 15
3736; GFX1032-NEXT: v_readlane_b32 s4, v2, 31 3737; GFX1032-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3738; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3739; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 3740; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3741; GFX1032-NEXT: v_writelane_b32 v1, s3, 16 3742; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3743; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 3744; GFX1032-NEXT: s_mov_b32 s2, -1 3745; GFX1032-NEXT: ; implicit-def: $vgpr0 3746; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 3747; GFX1032-NEXT: s_cbranch_execz BB19_2 3748; GFX1032-NEXT: ; %bb.1: 3749; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo 3750; GFX1032-NEXT: v_mov_b32_e32 v4, s4 3751; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3752; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 3753; GFX1032-NEXT: ds_min_rtn_i32 v0, v7, v4 3754; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3755; GFX1032-NEXT: buffer_gl0_inv 3756; GFX1032-NEXT: BB19_2: 3757; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 3758; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 3759; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 3760; GFX1032-NEXT: v_mov_b32_e32 v0, v1 3761; GFX1032-NEXT: v_min_i32_e32 v0, s3, v0 3762; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 3763; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3764; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 3765; GFX1032-NEXT: s_endpgm 3766entry: 3767 %lane = call i32 @llvm.amdgcn.workitem.id.x() 3768 %old = atomicrmw min i32 addrspace(3)* @local_var32, i32 %lane acq_rel 3769 store i32 %old, i32 addrspace(1)* %out 3770 ret void 3771} 3772 3773define amdgpu_kernel void @min_i64_constant(i64 addrspace(1)* %out) { 3774; 3775; 3776; GFX7LESS-LABEL: min_i64_constant: 3777; GFX7LESS: ; %bb.0: ; %entry 3778; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 3779; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 3780; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 3781; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3782; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 
3783; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc 3784; GFX7LESS-NEXT: s_cbranch_execz BB20_2 3785; GFX7LESS-NEXT: ; %bb.1: 3786; GFX7LESS-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 3787; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5 3788; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 3789; GFX7LESS-NEXT: s_mov_b32 m0, -1 3790; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3791; GFX7LESS-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] 3792; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3793; GFX7LESS-NEXT: BB20_2: 3794; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] 3795; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3796; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 3797; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 3798; GFX7LESS-NEXT: v_bfrev_b32_e32 v1, -2 3799; GFX7LESS-NEXT: s_mov_b32 s2, -1 3800; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 3801; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc 3802; GFX7LESS-NEXT: v_mov_b32_e32 v2, s5 3803; GFX7LESS-NEXT: v_mov_b32_e32 v3, s4 3804; GFX7LESS-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] 3805; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 3806; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 3807; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 3808; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 3809; GFX7LESS-NEXT: s_endpgm 3810; 3811; GFX8-LABEL: min_i64_constant: 3812; GFX8: ; %bb.0: ; %entry 3813; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3814; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3815; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 3816; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3817; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 3818; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 3819; GFX8-NEXT: s_cbranch_execz BB20_2 3820; GFX8-NEXT: ; %bb.1: 3821; GFX8-NEXT: v_mov_b32_e32 v0, 5 3822; GFX8-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 3823; GFX8-NEXT: v_mov_b32_e32 v1, 0 3824; GFX8-NEXT: s_mov_b32 m0, -1 3825; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3826; GFX8-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] 3827; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3828; GFX8-NEXT: BB20_2: 3829; 
GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 3830; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3831; GFX8-NEXT: v_readfirstlane_b32 s4, v0 3832; GFX8-NEXT: v_bfrev_b32_e32 v0, -2 3833; GFX8-NEXT: v_readfirstlane_b32 s5, v1 3834; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc 3835; GFX8-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 3836; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] 3837; GFX8-NEXT: v_mov_b32_e32 v2, s5 3838; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 3839; GFX8-NEXT: v_mov_b32_e32 v2, s4 3840; GFX8-NEXT: s_mov_b32 s2, -1 3841; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 3842; GFX8-NEXT: s_mov_b32 s3, 0xf000 3843; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 3844; GFX8-NEXT: s_endpgm 3845; 3846; GFX9-LABEL: min_i64_constant: 3847; GFX9: ; %bb.0: ; %entry 3848; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3849; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3850; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 3851; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3852; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 3853; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 3854; GFX9-NEXT: s_cbranch_execz BB20_2 3855; GFX9-NEXT: ; %bb.1: 3856; GFX9-NEXT: v_mov_b32_e32 v0, 5 3857; GFX9-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 3858; GFX9-NEXT: v_mov_b32_e32 v1, 0 3859; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3860; GFX9-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] 3861; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3862; GFX9-NEXT: BB20_2: 3863; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 3864; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3865; GFX9-NEXT: v_readfirstlane_b32 s4, v0 3866; GFX9-NEXT: v_bfrev_b32_e32 v0, -2 3867; GFX9-NEXT: v_readfirstlane_b32 s5, v1 3868; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc 3869; GFX9-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 3870; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] 3871; GFX9-NEXT: v_mov_b32_e32 v2, s5 3872; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 3873; GFX9-NEXT: v_mov_b32_e32 v2, s4 3874; GFX9-NEXT: s_mov_b32 s2, -1 3875; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 3876; 
GFX9-NEXT: s_mov_b32 s3, 0xf000 3877; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 3878; GFX9-NEXT: s_endpgm 3879; 3880; GFX1064-LABEL: min_i64_constant: 3881; GFX1064: ; %bb.0: ; %entry 3882; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3883; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 3884; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0 3885; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3886; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 3887; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc 3888; GFX1064-NEXT: s_cbranch_execz BB20_2 3889; GFX1064-NEXT: ; %bb.1: 3890; GFX1064-NEXT: v_mov_b32_e32 v0, 5 3891; GFX1064-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 3892; GFX1064-NEXT: v_mov_b32_e32 v1, 0 3893; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3894; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 3895; GFX1064-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] 3896; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3897; GFX1064-NEXT: buffer_gl0_inv 3898; GFX1064-NEXT: BB20_2: 3899; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 3900; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] 3901; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 3902; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 3903; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc 3904; GFX1064-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 3905; GFX1064-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1] 3906; GFX1064-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc 3907; GFX1064-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc 3908; GFX1064-NEXT: s_mov_b32 s2, -1 3909; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 3910; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3911; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 3912; GFX1064-NEXT: s_endpgm 3913; 3914; GFX1032-LABEL: min_i64_constant: 3915; GFX1032: ; %bb.0: ; %entry 3916; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3917; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 3918; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 3919; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 3920; GFX1032-NEXT: s_and_saveexec_b32 s2, 
vcc_lo 3921; GFX1032-NEXT: s_cbranch_execz BB20_2 3922; GFX1032-NEXT: ; %bb.1: 3923; GFX1032-NEXT: v_mov_b32_e32 v0, 5 3924; GFX1032-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 3925; GFX1032-NEXT: v_mov_b32_e32 v1, 0 3926; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3927; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 3928; GFX1032-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] 3929; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3930; GFX1032-NEXT: buffer_gl0_inv 3931; GFX1032-NEXT: BB20_2: 3932; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 3933; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 3934; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 3935; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 3936; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc_lo 3937; GFX1032-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc_lo 3938; GFX1032-NEXT: v_cmp_lt_i64_e32 vcc_lo, s[2:3], v[0:1] 3939; GFX1032-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo 3940; GFX1032-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo 3941; GFX1032-NEXT: s_mov_b32 s2, -1 3942; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 3943; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3944; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 3945; GFX1032-NEXT: s_endpgm 3946entry: 3947 %old = atomicrmw min i64 addrspace(3)* @local_var64, i64 5 acq_rel 3948 store i64 %old, i64 addrspace(1)* %out 3949 ret void 3950} 3951 3952define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) { 3953; 3954; 3955; GFX7LESS-LABEL: umax_i32_varying: 3956; GFX7LESS: ; %bb.0: ; %entry 3957; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 3958; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 3959; GFX7LESS-NEXT: s_mov_b32 m0, -1 3960; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3961; GFX7LESS-NEXT: ds_max_rtn_u32 v0, v1, v0 3962; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3963; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 3964; GFX7LESS-NEXT: s_mov_b32 s2, -1 3965; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 3966; GFX7LESS-NEXT: s_endpgm 3967; 3968; GFX8-LABEL: umax_i32_varying: 3969; GFX8: ; %bb.0: 
; %entry 3970; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3971; GFX8-NEXT: v_mov_b32_e32 v2, v0 3972; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 3973; GFX8-NEXT: v_mov_b32_e32 v1, 0 3974; GFX8-NEXT: s_mov_b64 exec, s[2:3] 3975; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3976; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 3977; GFX8-NEXT: s_not_b64 exec, exec 3978; GFX8-NEXT: v_mov_b32_e32 v2, 0 3979; GFX8-NEXT: s_not_b64 exec, exec 3980; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 3981; GFX8-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 3982; GFX8-NEXT: s_nop 1 3983; GFX8-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 3984; GFX8-NEXT: s_nop 1 3985; GFX8-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 3986; GFX8-NEXT: s_nop 1 3987; GFX8-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 3988; GFX8-NEXT: s_nop 1 3989; GFX8-NEXT: v_max_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 3990; GFX8-NEXT: s_nop 1 3991; GFX8-NEXT: v_max_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 3992; GFX8-NEXT: v_readlane_b32 s4, v2, 63 3993; GFX8-NEXT: s_nop 0 3994; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 3995; GFX8-NEXT: s_mov_b64 exec, s[2:3] 3996; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3997; GFX8-NEXT: ; implicit-def: $vgpr0 3998; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 3999; GFX8-NEXT: s_cbranch_execz BB21_2 4000; GFX8-NEXT: ; %bb.1: 4001; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 4002; GFX8-NEXT: v_mov_b32_e32 v3, s4 4003; GFX8-NEXT: s_mov_b32 m0, -1 4004; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4005; GFX8-NEXT: ds_max_rtn_u32 v0, v0, v3 4006; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4007; GFX8-NEXT: BB21_2: 4008; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 4009; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4010; GFX8-NEXT: v_readfirstlane_b32 s2, v0 4011; GFX8-NEXT: v_mov_b32_e32 v0, v1 4012; GFX8-NEXT: v_max_u32_e32 v0, 
s2, v0 4013; GFX8-NEXT: s_mov_b32 s3, 0xf000 4014; GFX8-NEXT: s_mov_b32 s2, -1 4015; GFX8-NEXT: s_nop 0 4016; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 4017; GFX8-NEXT: s_endpgm 4018; 4019; GFX9-LABEL: umax_i32_varying: 4020; GFX9: ; %bb.0: ; %entry 4021; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4022; GFX9-NEXT: v_mov_b32_e32 v2, v0 4023; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 4024; GFX9-NEXT: v_mov_b32_e32 v1, 0 4025; GFX9-NEXT: s_mov_b64 exec, s[2:3] 4026; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4027; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 4028; GFX9-NEXT: s_not_b64 exec, exec 4029; GFX9-NEXT: v_mov_b32_e32 v2, 0 4030; GFX9-NEXT: s_not_b64 exec, exec 4031; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 4032; GFX9-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 4033; GFX9-NEXT: s_nop 1 4034; GFX9-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 4035; GFX9-NEXT: s_nop 1 4036; GFX9-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 4037; GFX9-NEXT: s_nop 1 4038; GFX9-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 4039; GFX9-NEXT: s_nop 1 4040; GFX9-NEXT: v_max_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 4041; GFX9-NEXT: s_nop 1 4042; GFX9-NEXT: v_max_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 4043; GFX9-NEXT: v_readlane_b32 s4, v2, 63 4044; GFX9-NEXT: s_nop 0 4045; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 4046; GFX9-NEXT: s_mov_b64 exec, s[2:3] 4047; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4048; GFX9-NEXT: ; implicit-def: $vgpr0 4049; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 4050; GFX9-NEXT: s_cbranch_execz BB21_2 4051; GFX9-NEXT: ; %bb.1: 4052; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 4053; GFX9-NEXT: v_mov_b32_e32 v3, s4 4054; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4055; GFX9-NEXT: ds_max_rtn_u32 v0, v0, v3 4056; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4057; 
GFX9-NEXT: BB21_2: 4058; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 4059; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4060; GFX9-NEXT: v_readfirstlane_b32 s2, v0 4061; GFX9-NEXT: v_mov_b32_e32 v0, v1 4062; GFX9-NEXT: v_max_u32_e32 v0, s2, v0 4063; GFX9-NEXT: s_mov_b32 s3, 0xf000 4064; GFX9-NEXT: s_mov_b32 s2, -1 4065; GFX9-NEXT: s_nop 0 4066; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 4067; GFX9-NEXT: s_endpgm 4068; 4069; GFX1064-LABEL: umax_i32_varying: 4070; GFX1064: ; %bb.0: ; %entry 4071; GFX1064-NEXT: v_mov_b32_e32 v1, v0 4072; GFX1064-NEXT: s_not_b64 exec, exec 4073; GFX1064-NEXT: v_mov_b32_e32 v1, 0 4074; GFX1064-NEXT: s_not_b64 exec, exec 4075; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 4076; GFX1064-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 4077; GFX1064-NEXT: v_mov_b32_e32 v3, 0 4078; GFX1064-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 4079; GFX1064-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 4080; GFX1064-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 4081; GFX1064-NEXT: v_mov_b32_e32 v2, v1 4082; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 4083; GFX1064-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 4084; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 4085; GFX1064-NEXT: v_mov_b32_e32 v2, s4 4086; GFX1064-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 4087; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 4088; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 4089; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 4090; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4091; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 4092; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 4093; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 4094; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 4095; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 4096; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 4097; 
GFX1064-NEXT: v_readlane_b32 s7, v1, 63 4098; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 4099; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 4100; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 4101; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0 4102; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 4103; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 4104; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 4105; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4106; GFX1064-NEXT: s_mov_b32 s2, -1 4107; GFX1064-NEXT: ; implicit-def: $vgpr0 4108; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 4109; GFX1064-NEXT: s_cbranch_execz BB21_2 4110; GFX1064-NEXT: ; %bb.1: 4111; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo 4112; GFX1064-NEXT: v_mov_b32_e32 v4, s7 4113; GFX1064-NEXT: s_mov_b32 s3, s7 4114; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4115; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 4116; GFX1064-NEXT: ds_max_rtn_u32 v0, v7, v4 4117; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 4118; GFX1064-NEXT: buffer_gl0_inv 4119; GFX1064-NEXT: BB21_2: 4120; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 4121; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 4122; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 4123; GFX1064-NEXT: v_mov_b32_e32 v0, v3 4124; GFX1064-NEXT: v_max_u32_e32 v0, s3, v0 4125; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 4126; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 4127; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 4128; GFX1064-NEXT: s_endpgm 4129; 4130; GFX1032-LABEL: umax_i32_varying: 4131; GFX1032: ; %bb.0: ; %entry 4132; GFX1032-NEXT: v_mov_b32_e32 v1, v0 4133; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 4134; GFX1032-NEXT: v_mov_b32_e32 v1, 0 4135; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 4136; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 4137; GFX1032-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 4138; GFX1032-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 4139; GFX1032-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 4140; 
GFX1032-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 4141; GFX1032-NEXT: v_mov_b32_e32 v2, v1 4142; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 4143; GFX1032-NEXT: s_mov_b32 exec_lo, s2 4144; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4145; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 4146; GFX1032-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 4147; GFX1032-NEXT: v_mov_b32_e32 v3, 0 4148; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 4149; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 4150; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 4151; GFX1032-NEXT: s_mov_b32 exec_lo, s2 4152; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 4153; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 4154; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 4155; GFX1032-NEXT: s_mov_b32 exec_lo, s2 4156; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 4157; GFX1032-NEXT: s_mov_b32 s2, -1 4158; GFX1032-NEXT: ; implicit-def: $vgpr0 4159; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 4160; GFX1032-NEXT: s_cbranch_execz BB21_2 4161; GFX1032-NEXT: ; %bb.1: 4162; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo 4163; GFX1032-NEXT: v_mov_b32_e32 v4, s4 4164; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4165; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 4166; GFX1032-NEXT: ds_max_rtn_u32 v0, v7, v4 4167; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 4168; GFX1032-NEXT: buffer_gl0_inv 4169; GFX1032-NEXT: BB21_2: 4170; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 4171; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 4172; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 4173; GFX1032-NEXT: v_mov_b32_e32 v0, v3 4174; GFX1032-NEXT: v_max_u32_e32 v0, s3, v0 4175; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 4176; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 4177; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 4178; GFX1032-NEXT: s_endpgm 4179entry: 4180 %lane = call i32 @llvm.amdgcn.workitem.id.x() 4181 %old = atomicrmw umax i32 addrspace(3)* @local_var32, i32 %lane acq_rel 
4182 store i32 %old, i32 addrspace(1)* %out 4183 ret void 4184} 4185 4186define amdgpu_kernel void @umax_i64_constant(i64 addrspace(1)* %out) { 4187; 4188; 4189; GFX7LESS-LABEL: umax_i64_constant: 4190; GFX7LESS: ; %bb.0: ; %entry 4191; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 4192; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 4193; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 4194; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4195; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 4196; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc 4197; GFX7LESS-NEXT: s_cbranch_execz BB22_2 4198; GFX7LESS-NEXT: ; %bb.1: 4199; GFX7LESS-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 4200; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5 4201; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 4202; GFX7LESS-NEXT: s_mov_b32 m0, -1 4203; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 4204; GFX7LESS-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] 4205; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 4206; GFX7LESS-NEXT: BB22_2: 4207; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] 4208; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 4209; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 4210; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 4211; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 4212; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 4213; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 4214; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4 4215; GFX7LESS-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[0:1] 4216; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 4217; GFX7LESS-NEXT: v_mov_b32_e32 v1, s5 4218; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc 4219; GFX7LESS-NEXT: s_mov_b32 s2, -1 4220; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4221; GFX7LESS-NEXT: s_endpgm 4222; 4223; GFX8-LABEL: umax_i64_constant: 4224; GFX8: ; %bb.0: ; %entry 4225; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4226; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4227; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 4228; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4229; GFX8-NEXT: ; implicit-def: 
$vgpr0_vgpr1 4230; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 4231; GFX8-NEXT: s_cbranch_execz BB22_2 4232; GFX8-NEXT: ; %bb.1: 4233; GFX8-NEXT: v_mov_b32_e32 v0, 5 4234; GFX8-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 4235; GFX8-NEXT: v_mov_b32_e32 v1, 0 4236; GFX8-NEXT: s_mov_b32 m0, -1 4237; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4238; GFX8-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] 4239; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4240; GFX8-NEXT: BB22_2: 4241; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 4242; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4243; GFX8-NEXT: v_readfirstlane_b32 s2, v0 4244; GFX8-NEXT: v_readfirstlane_b32 s3, v1 4245; GFX8-NEXT: v_mov_b32_e32 v1, 0 4246; GFX8-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 4247; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1] 4248; GFX8-NEXT: v_mov_b32_e32 v1, s3 4249; GFX8-NEXT: v_mov_b32_e32 v2, s2 4250; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 4251; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc 4252; GFX8-NEXT: s_mov_b32 s3, 0xf000 4253; GFX8-NEXT: s_mov_b32 s2, -1 4254; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4255; GFX8-NEXT: s_endpgm 4256; 4257; GFX9-LABEL: umax_i64_constant: 4258; GFX9: ; %bb.0: ; %entry 4259; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4260; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4261; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 4262; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4263; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 4264; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 4265; GFX9-NEXT: s_cbranch_execz BB22_2 4266; GFX9-NEXT: ; %bb.1: 4267; GFX9-NEXT: v_mov_b32_e32 v0, 5 4268; GFX9-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 4269; GFX9-NEXT: v_mov_b32_e32 v1, 0 4270; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4271; GFX9-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] 4272; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4273; GFX9-NEXT: BB22_2: 4274; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 4275; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4276; GFX9-NEXT: v_readfirstlane_b32 s2, v0 4277; GFX9-NEXT: v_readfirstlane_b32 s3, v1 4278; GFX9-NEXT: 
v_mov_b32_e32 v1, 0 4279; GFX9-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 4280; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1] 4281; GFX9-NEXT: v_mov_b32_e32 v1, s3 4282; GFX9-NEXT: v_mov_b32_e32 v2, s2 4283; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 4284; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc 4285; GFX9-NEXT: s_mov_b32 s3, 0xf000 4286; GFX9-NEXT: s_mov_b32 s2, -1 4287; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4288; GFX9-NEXT: s_endpgm 4289; 4290; GFX1064-LABEL: umax_i64_constant: 4291; GFX1064: ; %bb.0: ; %entry 4292; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4293; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 4294; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0 4295; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4296; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 4297; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc 4298; GFX1064-NEXT: s_cbranch_execz BB22_2 4299; GFX1064-NEXT: ; %bb.1: 4300; GFX1064-NEXT: v_mov_b32_e32 v0, 5 4301; GFX1064-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 4302; GFX1064-NEXT: v_mov_b32_e32 v1, 0 4303; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4304; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 4305; GFX1064-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] 4306; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 4307; GFX1064-NEXT: buffer_gl0_inv 4308; GFX1064-NEXT: BB22_2: 4309; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 4310; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] 4311; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 4312; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 4313; GFX1064-NEXT: v_mov_b32_e32 v1, 0 4314; GFX1064-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 4315; GFX1064-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1] 4316; GFX1064-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc 4317; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, s3, vcc 4318; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 4319; GFX1064-NEXT: s_mov_b32 s2, -1 4320; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 4321; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4322; GFX1064-NEXT: s_endpgm 4323; 4324; 
GFX1032-LABEL: umax_i64_constant: 4325; GFX1032: ; %bb.0: ; %entry 4326; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4327; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 4328; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 4329; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 4330; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo 4331; GFX1032-NEXT: s_cbranch_execz BB22_2 4332; GFX1032-NEXT: ; %bb.1: 4333; GFX1032-NEXT: v_mov_b32_e32 v0, 5 4334; GFX1032-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 4335; GFX1032-NEXT: v_mov_b32_e32 v1, 0 4336; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4337; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 4338; GFX1032-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] 4339; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 4340; GFX1032-NEXT: buffer_gl0_inv 4341; GFX1032-NEXT: BB22_2: 4342; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 4343; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 4344; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 4345; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 4346; GFX1032-NEXT: v_mov_b32_e32 v1, 0 4347; GFX1032-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc_lo 4348; GFX1032-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[2:3], v[0:1] 4349; GFX1032-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo 4350; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, s3, vcc_lo 4351; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 4352; GFX1032-NEXT: s_mov_b32 s2, -1 4353; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 4354; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4355; GFX1032-NEXT: s_endpgm 4356entry: 4357 %old = atomicrmw umax i64 addrspace(3)* @local_var64, i64 5 acq_rel 4358 store i64 %old, i64 addrspace(1)* %out 4359 ret void 4360} 4361 4362define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) { 4363; 4364; 4365; GFX7LESS-LABEL: umin_i32_varying: 4366; GFX7LESS: ; %bb.0: ; %entry 4367; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 4368; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 4369; GFX7LESS-NEXT: s_mov_b32 m0, -1 4370; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 4371; GFX7LESS-NEXT: 
ds_min_rtn_u32 v0, v1, v0 4372; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 4373; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 4374; GFX7LESS-NEXT: s_mov_b32 s2, -1 4375; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 4376; GFX7LESS-NEXT: s_endpgm 4377; 4378; GFX8-LABEL: umin_i32_varying: 4379; GFX8: ; %bb.0: ; %entry 4380; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4381; GFX8-NEXT: v_mov_b32_e32 v2, v0 4382; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4383; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 4384; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 4385; GFX8-NEXT: v_mov_b32_e32 v1, -1 4386; GFX8-NEXT: s_mov_b64 exec, s[2:3] 4387; GFX8-NEXT: s_not_b64 exec, exec 4388; GFX8-NEXT: v_mov_b32_e32 v2, -1 4389; GFX8-NEXT: s_not_b64 exec, exec 4390; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 4391; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 4392; GFX8-NEXT: s_nop 1 4393; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 4394; GFX8-NEXT: s_nop 1 4395; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 4396; GFX8-NEXT: s_nop 1 4397; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 4398; GFX8-NEXT: s_nop 1 4399; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 4400; GFX8-NEXT: s_nop 1 4401; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 4402; GFX8-NEXT: v_readlane_b32 s4, v2, 63 4403; GFX8-NEXT: s_nop 0 4404; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 4405; GFX8-NEXT: s_mov_b64 exec, s[2:3] 4406; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4407; GFX8-NEXT: ; implicit-def: $vgpr0 4408; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 4409; GFX8-NEXT: s_cbranch_execz BB23_2 4410; GFX8-NEXT: ; %bb.1: 4411; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 4412; GFX8-NEXT: v_mov_b32_e32 v3, s4 4413; GFX8-NEXT: s_mov_b32 m0, -1 4414; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4415; GFX8-NEXT: ds_min_rtn_u32 v0, v0, v3 4416; 
GFX8-NEXT: s_waitcnt lgkmcnt(0) 4417; GFX8-NEXT: BB23_2: 4418; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 4419; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4420; GFX8-NEXT: v_readfirstlane_b32 s2, v0 4421; GFX8-NEXT: v_mov_b32_e32 v0, v1 4422; GFX8-NEXT: v_min_u32_e32 v0, s2, v0 4423; GFX8-NEXT: s_mov_b32 s3, 0xf000 4424; GFX8-NEXT: s_mov_b32 s2, -1 4425; GFX8-NEXT: s_nop 0 4426; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 4427; GFX8-NEXT: s_endpgm 4428; 4429; GFX9-LABEL: umin_i32_varying: 4430; GFX9: ; %bb.0: ; %entry 4431; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4432; GFX9-NEXT: v_mov_b32_e32 v2, v0 4433; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4434; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 4435; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 4436; GFX9-NEXT: v_mov_b32_e32 v1, -1 4437; GFX9-NEXT: s_mov_b64 exec, s[2:3] 4438; GFX9-NEXT: s_not_b64 exec, exec 4439; GFX9-NEXT: v_mov_b32_e32 v2, -1 4440; GFX9-NEXT: s_not_b64 exec, exec 4441; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 4442; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 4443; GFX9-NEXT: s_nop 1 4444; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 4445; GFX9-NEXT: s_nop 1 4446; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 4447; GFX9-NEXT: s_nop 1 4448; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 4449; GFX9-NEXT: s_nop 1 4450; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 4451; GFX9-NEXT: s_nop 1 4452; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 4453; GFX9-NEXT: v_readlane_b32 s4, v2, 63 4454; GFX9-NEXT: s_nop 0 4455; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 4456; GFX9-NEXT: s_mov_b64 exec, s[2:3] 4457; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4458; GFX9-NEXT: ; implicit-def: $vgpr0 4459; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 4460; GFX9-NEXT: s_cbranch_execz BB23_2 4461; GFX9-NEXT: ; %bb.1: 4462; GFX9-NEXT: 
v_mov_b32_e32 v0, local_var32@abs32@lo 4463; GFX9-NEXT: v_mov_b32_e32 v3, s4 4464; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4465; GFX9-NEXT: ds_min_rtn_u32 v0, v0, v3 4466; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4467; GFX9-NEXT: BB23_2: 4468; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 4469; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4470; GFX9-NEXT: v_readfirstlane_b32 s2, v0 4471; GFX9-NEXT: v_mov_b32_e32 v0, v1 4472; GFX9-NEXT: v_min_u32_e32 v0, s2, v0 4473; GFX9-NEXT: s_mov_b32 s3, 0xf000 4474; GFX9-NEXT: s_mov_b32 s2, -1 4475; GFX9-NEXT: s_nop 0 4476; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 4477; GFX9-NEXT: s_endpgm 4478; 4479; GFX1064-LABEL: umin_i32_varying: 4480; GFX1064: ; %bb.0: ; %entry 4481; GFX1064-NEXT: v_mov_b32_e32 v1, v0 4482; GFX1064-NEXT: s_not_b64 exec, exec 4483; GFX1064-NEXT: v_mov_b32_e32 v1, -1 4484; GFX1064-NEXT: s_not_b64 exec, exec 4485; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 4486; GFX1064-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 4487; GFX1064-NEXT: v_mov_b32_e32 v3, -1 4488; GFX1064-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 4489; GFX1064-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 4490; GFX1064-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 4491; GFX1064-NEXT: v_mov_b32_e32 v2, v1 4492; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 4493; GFX1064-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 4494; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 4495; GFX1064-NEXT: v_mov_b32_e32 v2, s4 4496; GFX1064-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 4497; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 4498; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 4499; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 4500; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4501; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 4502; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 4503; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 
4504; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 4505; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 4506; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 4507; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 4508; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 4509; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 4510; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 4511; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0 4512; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 4513; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 4514; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 4515; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4516; GFX1064-NEXT: s_mov_b32 s2, -1 4517; GFX1064-NEXT: ; implicit-def: $vgpr0 4518; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 4519; GFX1064-NEXT: s_cbranch_execz BB23_2 4520; GFX1064-NEXT: ; %bb.1: 4521; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo 4522; GFX1064-NEXT: v_mov_b32_e32 v4, s7 4523; GFX1064-NEXT: s_mov_b32 s3, s7 4524; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4525; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 4526; GFX1064-NEXT: ds_min_rtn_u32 v0, v7, v4 4527; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 4528; GFX1064-NEXT: buffer_gl0_inv 4529; GFX1064-NEXT: BB23_2: 4530; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 4531; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 4532; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 4533; GFX1064-NEXT: v_mov_b32_e32 v0, v3 4534; GFX1064-NEXT: v_min_u32_e32 v0, s3, v0 4535; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 4536; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 4537; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 4538; GFX1064-NEXT: s_endpgm 4539; 4540; GFX1032-LABEL: umin_i32_varying: 4541; GFX1032: ; %bb.0: ; %entry 4542; GFX1032-NEXT: v_mov_b32_e32 v1, v0 4543; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 4544; GFX1032-NEXT: v_mov_b32_e32 v1, -1 4545; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 4546; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 4547; GFX1032-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 4548; GFX1032-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:2 
row_mask:0xf bank_mask:0xf 4549; GFX1032-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 4550; GFX1032-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 4551; GFX1032-NEXT: v_mov_b32_e32 v2, v1 4552; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 4553; GFX1032-NEXT: s_mov_b32 exec_lo, s2 4554; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4555; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 4556; GFX1032-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 4557; GFX1032-NEXT: v_mov_b32_e32 v3, -1 4558; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 4559; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 4560; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 4561; GFX1032-NEXT: s_mov_b32 exec_lo, s2 4562; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 4563; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 4564; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 4565; GFX1032-NEXT: s_mov_b32 exec_lo, s2 4566; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 4567; GFX1032-NEXT: s_mov_b32 s2, -1 4568; GFX1032-NEXT: ; implicit-def: $vgpr0 4569; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 4570; GFX1032-NEXT: s_cbranch_execz BB23_2 4571; GFX1032-NEXT: ; %bb.1: 4572; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo 4573; GFX1032-NEXT: v_mov_b32_e32 v4, s4 4574; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4575; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 4576; GFX1032-NEXT: ds_min_rtn_u32 v0, v7, v4 4577; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 4578; GFX1032-NEXT: buffer_gl0_inv 4579; GFX1032-NEXT: BB23_2: 4580; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 4581; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 4582; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 4583; GFX1032-NEXT: v_mov_b32_e32 v0, v3 4584; GFX1032-NEXT: v_min_u32_e32 v0, s3, v0 4585; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 4586; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 4587; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 4588; GFX1032-NEXT: s_endpgm 4589entry: 4590 %lane = call i32 
@llvm.amdgcn.workitem.id.x()
  %old = atomicrmw umin i32 addrspace(3)* @local_var32, i32 %lane acq_rel
  store i32 %old, i32 addrspace(1)* %out
  ret void
}

; umin_i64_constant: unsigned 64-bit atomic min of the constant 5 against the
; addrspace(3) global @local_var64 (acq_rel); the value returned by the atomic
; is stored through %out.
;
; What the autogenerated assertions below pin down, for every run line:
;   * the ds_min_rtn_u64 sits in block %bb.1, entered under s_and_saveexec with
;     the result of comparing the mbcnt lane count against 0 (the wave32 run
;     only needs v_mbcnt_lo);
;   * after exec is restored, both result halves are read with
;     v_readfirstlane_b32;
;   * each lane then selects between -1 and 5 using that same compare result
;     and combines it with the read-back value via v_cmp_lt_u64 + v_cndmask
;     before the buffer_store_dwordx2.
; The assertion lines are regenerated by utils/update_llc_test_checks.py (see
; the note on the first line of this file); do not edit them by hand.
define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) {
;
;
; GFX7LESS-LABEL: umin_i64_constant:
; GFX7LESS: ; %bb.0: ; %entry
; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX7LESS-NEXT: s_cbranch_execz BB24_2
; GFX7LESS-NEXT: ; %bb.1:
; GFX7LESS-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo
; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5
; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0
; GFX7LESS-NEXT: s_mov_b32 m0, -1
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1]
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: BB24_2:
; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0
; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1
; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc
; GFX7LESS-NEXT: v_mov_b32_e32 v2, s5
; GFX7LESS-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1]
; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4
; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX7LESS-NEXT: s_endpgm
;
; GFX8-LABEL: umin_i64_constant:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX8-NEXT: s_cbranch_execz BB24_2
; GFX8-NEXT: ; %bb.1:
; GFX8-NEXT: v_mov_b32_e32 v0, 5
; GFX8-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo
; GFX8-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NEXT: s_mov_b32 m0, -1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1]
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: BB24_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s5, v1
; GFX8-NEXT: v_readfirstlane_b32 s4, v0
; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc
; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1]
; GFX8-NEXT: v_mov_b32_e32 v2, s5
; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
; GFX8-NEXT: v_mov_b32_e32 v2, s4
; GFX8-NEXT: s_mov_b32 s2, -1
; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: umin_i64_constant:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT: s_cbranch_execz BB24_2
; GFX9-NEXT: ; %bb.1:
; GFX9-NEXT: v_mov_b32_e32 v0, 5
; GFX9-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: BB24_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s5, v1
; GFX9-NEXT: v_readfirstlane_b32 s4, v0
; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc
; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1]
; GFX9-NEXT: v_mov_b32_e32 v2, s5
; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
; GFX9-NEXT: v_mov_b32_e32 v2, s4
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: umin_i64_constant:
; GFX1064: ; %bb.0: ; %entry
; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-NEXT: s_cbranch_execz BB24_2
; GFX1064-NEXT: ; %bb.1:
; GFX1064-NEXT: v_mov_b32_e32 v0, 5
; GFX1064-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo
; GFX1064-NEXT: v_mov_b32_e32 v1, 0
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1064-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1]
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: BB24_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX1064-NEXT: v_readfirstlane_b32 s2, v0
; GFX1064-NEXT: v_readfirstlane_b32 s3, v1
; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
; GFX1064-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc
; GFX1064-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1]
; GFX1064-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc
; GFX1064-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc
; GFX1064-NEXT: s_mov_b32 s2, -1
; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: umin_i64_constant:
; GFX1032: ; %bb.0: ; %entry
; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-NEXT: s_cbranch_execz BB24_2
; GFX1032-NEXT: ; %bb.1:
; GFX1032-NEXT: v_mov_b32_e32 v0, 5
; GFX1032-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo
; GFX1032-NEXT: v_mov_b32_e32 v1, 0
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1032-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1]
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: BB24_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2
; GFX1032-NEXT: v_readfirstlane_b32 s2, v0
; GFX1032-NEXT: v_readfirstlane_b32 s3, v1
; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
; GFX1032-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc_lo
; GFX1032-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[2:3], v[0:1]
; GFX1032-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo
; GFX1032-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo
; GFX1032-NEXT: s_mov_b32 s2, -1
; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX1032-NEXT: s_endpgm
entry:
  %old = atomicrmw umin i64 addrspace(3)* @local_var64, i64 5 acq_rel
  store i64 %old, i64 addrspace(1)* %out
  ret void
}