1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc -march=amdgcn -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX7LESS %s 3; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s 4; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s 5; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX1064 %s 6; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX1032 %s 7 8declare i32 @llvm.amdgcn.workitem.id.x() 9 10@local_var32 = addrspace(3) global i32 undef, align 4 11@local_var64 = addrspace(3) global i64 undef, align 8 12 13; Show what the atomic optimization pass will do for local pointers. 
14 15define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out) { 16; 17; 18; GFX7LESS-LABEL: add_i32_constant: 19; GFX7LESS: ; %bb.0: ; %entry 20; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec 21; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 22; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 23; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 24; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 25; GFX7LESS-NEXT: ; implicit-def: $vgpr1 26; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc 27; GFX7LESS-NEXT: s_cbranch_execz BB0_2 28; GFX7LESS-NEXT: ; %bb.1: 29; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 30; GFX7LESS-NEXT: s_mul_i32 s2, s2, 5 31; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 32; GFX7LESS-NEXT: v_mov_b32_e32 v2, s2 33; GFX7LESS-NEXT: s_mov_b32 m0, -1 34; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 35; GFX7LESS-NEXT: ds_add_rtn_u32 v1, v1, v2 36; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 37; GFX7LESS-NEXT: BB0_2: 38; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] 39; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 40; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v1 41; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 42; GFX7LESS-NEXT: v_mad_u32_u24 v0, v0, 5, s2 43; GFX7LESS-NEXT: s_mov_b32 s2, -1 44; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 45; GFX7LESS-NEXT: s_endpgm 46; 47; GFX8-LABEL: add_i32_constant: 48; GFX8: ; %bb.0: ; %entry 49; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 50; GFX8-NEXT: s_mov_b64 s[2:3], exec 51; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 52; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 53; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 54; GFX8-NEXT: ; implicit-def: $vgpr1 55; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 56; GFX8-NEXT: s_cbranch_execz BB0_2 57; GFX8-NEXT: ; %bb.1: 58; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 59; GFX8-NEXT: s_mul_i32 s2, s2, 5 60; GFX8-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 61; GFX8-NEXT: v_mov_b32_e32 v2, s2 62; GFX8-NEXT: s_mov_b32 m0, -1 63; GFX8-NEXT: s_waitcnt lgkmcnt(0) 64; GFX8-NEXT: ds_add_rtn_u32 v1, v1, v2 65; GFX8-NEXT: 
s_waitcnt lgkmcnt(0) 66; GFX8-NEXT: BB0_2: 67; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 68; GFX8-NEXT: s_waitcnt lgkmcnt(0) 69; GFX8-NEXT: v_readfirstlane_b32 s2, v1 70; GFX8-NEXT: v_mad_u32_u24 v0, v0, 5, s2 71; GFX8-NEXT: s_mov_b32 s3, 0xf000 72; GFX8-NEXT: s_mov_b32 s2, -1 73; GFX8-NEXT: s_nop 1 74; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 75; GFX8-NEXT: s_endpgm 76; 77; GFX9-LABEL: add_i32_constant: 78; GFX9: ; %bb.0: ; %entry 79; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 80; GFX9-NEXT: s_mov_b64 s[2:3], exec 81; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 82; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 83; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 84; GFX9-NEXT: ; implicit-def: $vgpr1 85; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 86; GFX9-NEXT: s_cbranch_execz BB0_2 87; GFX9-NEXT: ; %bb.1: 88; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 89; GFX9-NEXT: s_mul_i32 s2, s2, 5 90; GFX9-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 91; GFX9-NEXT: v_mov_b32_e32 v2, s2 92; GFX9-NEXT: s_waitcnt lgkmcnt(0) 93; GFX9-NEXT: ds_add_rtn_u32 v1, v1, v2 94; GFX9-NEXT: s_waitcnt lgkmcnt(0) 95; GFX9-NEXT: BB0_2: 96; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 97; GFX9-NEXT: s_waitcnt lgkmcnt(0) 98; GFX9-NEXT: v_readfirstlane_b32 s2, v1 99; GFX9-NEXT: v_mad_u32_u24 v0, v0, 5, s2 100; GFX9-NEXT: s_mov_b32 s3, 0xf000 101; GFX9-NEXT: s_mov_b32 s2, -1 102; GFX9-NEXT: s_nop 1 103; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 104; GFX9-NEXT: s_endpgm 105; 106; GFX1064-LABEL: add_i32_constant: 107; GFX1064: ; %bb.0: ; %entry 108; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 109; GFX1064-NEXT: s_mov_b64 s[2:3], exec 110; GFX1064-NEXT: ; implicit-def: $vgpr1 111; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 112; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s3, v0 113; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 114; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 115; GFX1064-NEXT: s_cbranch_execz BB0_2 116; GFX1064-NEXT: ; %bb.1: 117; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 118; GFX1064-NEXT: 
v_mov_b32_e32 v1, local_var32@abs32@lo 119; GFX1064-NEXT: s_mul_i32 s2, s2, 5 120; GFX1064-NEXT: v_mov_b32_e32 v2, s2 121; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 122; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 123; GFX1064-NEXT: ds_add_rtn_u32 v1, v1, v2 124; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 125; GFX1064-NEXT: buffer_gl0_inv 126; GFX1064-NEXT: BB0_2: 127; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 128; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 129; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 130; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 131; GFX1064-NEXT: v_mad_u32_u24 v0, v0, 5, s2 132; GFX1064-NEXT: s_mov_b32 s2, -1 133; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 134; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 135; GFX1064-NEXT: s_endpgm 136; 137; GFX1032-LABEL: add_i32_constant: 138; GFX1032: ; %bb.0: ; %entry 139; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 140; GFX1032-NEXT: s_mov_b32 s3, exec_lo 141; GFX1032-NEXT: ; implicit-def: $vgpr1 142; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s3, 0 143; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 144; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo 145; GFX1032-NEXT: s_cbranch_execz BB0_2 146; GFX1032-NEXT: ; %bb.1: 147; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 148; GFX1032-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 149; GFX1032-NEXT: s_mul_i32 s3, s3, 5 150; GFX1032-NEXT: v_mov_b32_e32 v2, s3 151; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 152; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 153; GFX1032-NEXT: ds_add_rtn_u32 v1, v1, v2 154; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 155; GFX1032-NEXT: buffer_gl0_inv 156; GFX1032-NEXT: BB0_2: 157; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 158; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 159; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 160; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 161; GFX1032-NEXT: v_mad_u32_u24 v0, v0, 5, s2 162; GFX1032-NEXT: s_mov_b32 s2, -1 163; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 164; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 165; GFX1032-NEXT: s_endpgm 166entry: 167 
%old = atomicrmw add i32 addrspace(3)* @local_var32, i32 5 acq_rel 168 store i32 %old, i32 addrspace(1)* %out 169 ret void 170} 171 172define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 %additive) { 173; 174; 175; GFX7LESS-LABEL: add_i32_uniform: 176; GFX7LESS: ; %bb.0: ; %entry 177; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec 178; GFX7LESS-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 179; GFX7LESS-NEXT: s_load_dword s0, s[0:1], 0xb 180; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 181; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 182; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 183; GFX7LESS-NEXT: ; implicit-def: $vgpr1 184; GFX7LESS-NEXT: s_and_saveexec_b64 s[6:7], vcc 185; GFX7LESS-NEXT: s_cbranch_execz BB1_2 186; GFX7LESS-NEXT: ; %bb.1: 187; GFX7LESS-NEXT: s_bcnt1_i32_b64 s1, s[2:3] 188; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 189; GFX7LESS-NEXT: s_mul_i32 s1, s0, s1 190; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 191; GFX7LESS-NEXT: v_mov_b32_e32 v2, s1 192; GFX7LESS-NEXT: s_mov_b32 m0, -1 193; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 194; GFX7LESS-NEXT: ds_add_rtn_u32 v1, v1, v2 195; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 196; GFX7LESS-NEXT: BB1_2: 197; GFX7LESS-NEXT: s_or_b64 exec, exec, s[6:7] 198; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 199; GFX7LESS-NEXT: v_readfirstlane_b32 s1, v1 200; GFX7LESS-NEXT: v_mul_lo_u32 v0, s0, v0 201; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 202; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s1, v0 203; GFX7LESS-NEXT: s_mov_b32 s6, -1 204; GFX7LESS-NEXT: buffer_store_dword v0, off, s[4:7], 0 205; GFX7LESS-NEXT: s_endpgm 206; 207; GFX8-LABEL: add_i32_uniform: 208; GFX8: ; %bb.0: ; %entry 209; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 210; GFX8-NEXT: s_load_dword s0, s[0:1], 0x2c 211; GFX8-NEXT: s_mov_b64 s[2:3], exec 212; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 213; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 214; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 215; GFX8-NEXT: ; implicit-def: $vgpr1 216; GFX8-NEXT: s_and_saveexec_b64 
s[6:7], vcc 217; GFX8-NEXT: s_cbranch_execz BB1_2 218; GFX8-NEXT: ; %bb.1: 219; GFX8-NEXT: s_bcnt1_i32_b64 s1, s[2:3] 220; GFX8-NEXT: s_waitcnt lgkmcnt(0) 221; GFX8-NEXT: s_mul_i32 s1, s0, s1 222; GFX8-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 223; GFX8-NEXT: v_mov_b32_e32 v2, s1 224; GFX8-NEXT: s_mov_b32 m0, -1 225; GFX8-NEXT: s_waitcnt lgkmcnt(0) 226; GFX8-NEXT: ds_add_rtn_u32 v1, v1, v2 227; GFX8-NEXT: s_waitcnt lgkmcnt(0) 228; GFX8-NEXT: BB1_2: 229; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] 230; GFX8-NEXT: s_waitcnt lgkmcnt(0) 231; GFX8-NEXT: v_mul_lo_u32 v0, s0, v0 232; GFX8-NEXT: v_readfirstlane_b32 s0, v1 233; GFX8-NEXT: s_mov_b32 s7, 0xf000 234; GFX8-NEXT: s_mov_b32 s6, -1 235; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 236; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0 237; GFX8-NEXT: s_endpgm 238; 239; GFX9-LABEL: add_i32_uniform: 240; GFX9: ; %bb.0: ; %entry 241; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 242; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c 243; GFX9-NEXT: s_mov_b64 s[6:7], exec 244; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 245; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 246; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 247; GFX9-NEXT: ; implicit-def: $vgpr1 248; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc 249; GFX9-NEXT: s_cbranch_execz BB1_2 250; GFX9-NEXT: ; %bb.1: 251; GFX9-NEXT: s_bcnt1_i32_b64 s3, s[6:7] 252; GFX9-NEXT: s_waitcnt lgkmcnt(0) 253; GFX9-NEXT: s_mul_i32 s3, s2, s3 254; GFX9-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 255; GFX9-NEXT: v_mov_b32_e32 v2, s3 256; GFX9-NEXT: s_waitcnt lgkmcnt(0) 257; GFX9-NEXT: ds_add_rtn_u32 v1, v1, v2 258; GFX9-NEXT: s_waitcnt lgkmcnt(0) 259; GFX9-NEXT: BB1_2: 260; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] 261; GFX9-NEXT: s_waitcnt lgkmcnt(0) 262; GFX9-NEXT: v_mul_lo_u32 v0, s2, v0 263; GFX9-NEXT: v_readfirstlane_b32 s0, v1 264; GFX9-NEXT: s_mov_b32 s7, 0xf000 265; GFX9-NEXT: s_mov_b32 s6, -1 266; GFX9-NEXT: v_add_u32_e32 v0, s0, v0 267; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 268; GFX9-NEXT: 
s_endpgm 269; 270; GFX1064-LABEL: add_i32_uniform: 271; GFX1064: ; %bb.0: ; %entry 272; GFX1064-NEXT: s_clause 0x1 273; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 274; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x2c 275; GFX1064-NEXT: s_mov_b64 s[6:7], exec 276; GFX1064-NEXT: ; implicit-def: $vgpr1 277; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 278; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s7, v0 279; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 280; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc 281; GFX1064-NEXT: s_cbranch_execz BB1_2 282; GFX1064-NEXT: ; %bb.1: 283; GFX1064-NEXT: s_bcnt1_i32_b64 s3, s[6:7] 284; GFX1064-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 285; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 286; GFX1064-NEXT: s_mul_i32 s3, s2, s3 287; GFX1064-NEXT: v_mov_b32_e32 v2, s3 288; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 289; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 290; GFX1064-NEXT: ds_add_rtn_u32 v1, v1, v2 291; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 292; GFX1064-NEXT: buffer_gl0_inv 293; GFX1064-NEXT: BB1_2: 294; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 295; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] 296; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 297; GFX1064-NEXT: v_mul_lo_u32 v0, s2, v0 298; GFX1064-NEXT: v_readfirstlane_b32 s0, v1 299; GFX1064-NEXT: s_mov_b32 s7, 0x31016000 300; GFX1064-NEXT: s_mov_b32 s6, -1 301; GFX1064-NEXT: v_add_nc_u32_e32 v0, s0, v0 302; GFX1064-NEXT: buffer_store_dword v0, off, s[4:7], 0 303; GFX1064-NEXT: s_endpgm 304; 305; GFX1032-LABEL: add_i32_uniform: 306; GFX1032: ; %bb.0: ; %entry 307; GFX1032-NEXT: s_clause 0x1 308; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 309; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x2c 310; GFX1032-NEXT: s_mov_b32 s3, exec_lo 311; GFX1032-NEXT: ; implicit-def: $vgpr1 312; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s3, 0 313; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 314; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo 315; GFX1032-NEXT: s_cbranch_execz BB1_2 316; GFX1032-NEXT: ; %bb.1: 317; GFX1032-NEXT: 
s_bcnt1_i32_b32 s1, s3 318; GFX1032-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 319; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 320; GFX1032-NEXT: s_mul_i32 s1, s2, s1 321; GFX1032-NEXT: v_mov_b32_e32 v2, s1 322; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 323; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 324; GFX1032-NEXT: ds_add_rtn_u32 v1, v1, v2 325; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 326; GFX1032-NEXT: buffer_gl0_inv 327; GFX1032-NEXT: BB1_2: 328; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 329; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 330; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 331; GFX1032-NEXT: v_mul_lo_u32 v0, s2, v0 332; GFX1032-NEXT: v_readfirstlane_b32 s0, v1 333; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 334; GFX1032-NEXT: s_mov_b32 s6, -1 335; GFX1032-NEXT: v_add_nc_u32_e32 v0, s0, v0 336; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0 337; GFX1032-NEXT: s_endpgm 338entry: 339 %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %additive acq_rel 340 store i32 %old, i32 addrspace(1)* %out 341 ret void 342} 343 344define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out) { 345; 346; 347; GFX7LESS-LABEL: add_i32_varying: 348; GFX7LESS: ; %bb.0: ; %entry 349; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 350; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 351; GFX7LESS-NEXT: s_mov_b32 m0, -1 352; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 353; GFX7LESS-NEXT: ds_add_rtn_u32 v0, v1, v0 354; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 355; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 356; GFX7LESS-NEXT: s_mov_b32 s2, -1 357; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 358; GFX7LESS-NEXT: s_endpgm 359; 360; GFX8-LABEL: add_i32_varying: 361; GFX8: ; %bb.0: ; %entry 362; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 363; GFX8-NEXT: v_mov_b32_e32 v2, v0 364; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 365; GFX8-NEXT: v_mov_b32_e32 v1, 0 366; GFX8-NEXT: s_mov_b64 exec, s[2:3] 367; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 368; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, 
v0 369; GFX8-NEXT: s_not_b64 exec, exec 370; GFX8-NEXT: v_mov_b32_e32 v2, 0 371; GFX8-NEXT: s_not_b64 exec, exec 372; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 373; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 374; GFX8-NEXT: s_nop 1 375; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 376; GFX8-NEXT: s_nop 1 377; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 378; GFX8-NEXT: s_nop 1 379; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 380; GFX8-NEXT: s_nop 1 381; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 382; GFX8-NEXT: s_nop 1 383; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 384; GFX8-NEXT: v_readlane_b32 s4, v2, 63 385; GFX8-NEXT: s_nop 0 386; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 387; GFX8-NEXT: s_mov_b64 exec, s[2:3] 388; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 389; GFX8-NEXT: ; implicit-def: $vgpr0 390; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 391; GFX8-NEXT: s_cbranch_execz BB2_2 392; GFX8-NEXT: ; %bb.1: 393; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 394; GFX8-NEXT: v_mov_b32_e32 v3, s4 395; GFX8-NEXT: s_mov_b32 m0, -1 396; GFX8-NEXT: s_waitcnt lgkmcnt(0) 397; GFX8-NEXT: ds_add_rtn_u32 v0, v0, v3 398; GFX8-NEXT: s_waitcnt lgkmcnt(0) 399; GFX8-NEXT: BB2_2: 400; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 401; GFX8-NEXT: s_waitcnt lgkmcnt(0) 402; GFX8-NEXT: v_readfirstlane_b32 s2, v0 403; GFX8-NEXT: v_mov_b32_e32 v0, v1 404; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0 405; GFX8-NEXT: s_mov_b32 s3, 0xf000 406; GFX8-NEXT: s_mov_b32 s2, -1 407; GFX8-NEXT: s_nop 0 408; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 409; GFX8-NEXT: s_endpgm 410; 411; GFX9-LABEL: add_i32_varying: 412; GFX9: ; %bb.0: ; %entry 413; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 414; GFX9-NEXT: 
v_mov_b32_e32 v2, v0 415; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 416; GFX9-NEXT: v_mov_b32_e32 v1, 0 417; GFX9-NEXT: s_mov_b64 exec, s[2:3] 418; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 419; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 420; GFX9-NEXT: s_not_b64 exec, exec 421; GFX9-NEXT: v_mov_b32_e32 v2, 0 422; GFX9-NEXT: s_not_b64 exec, exec 423; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 424; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 425; GFX9-NEXT: s_nop 1 426; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 427; GFX9-NEXT: s_nop 1 428; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 429; GFX9-NEXT: s_nop 1 430; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 431; GFX9-NEXT: s_nop 1 432; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 433; GFX9-NEXT: s_nop 1 434; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 435; GFX9-NEXT: v_readlane_b32 s4, v2, 63 436; GFX9-NEXT: s_nop 0 437; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 438; GFX9-NEXT: s_mov_b64 exec, s[2:3] 439; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 440; GFX9-NEXT: ; implicit-def: $vgpr0 441; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 442; GFX9-NEXT: s_cbranch_execz BB2_2 443; GFX9-NEXT: ; %bb.1: 444; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 445; GFX9-NEXT: v_mov_b32_e32 v3, s4 446; GFX9-NEXT: s_waitcnt lgkmcnt(0) 447; GFX9-NEXT: ds_add_rtn_u32 v0, v0, v3 448; GFX9-NEXT: s_waitcnt lgkmcnt(0) 449; GFX9-NEXT: BB2_2: 450; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 451; GFX9-NEXT: s_waitcnt lgkmcnt(0) 452; GFX9-NEXT: v_readfirstlane_b32 s2, v0 453; GFX9-NEXT: v_mov_b32_e32 v0, v1 454; GFX9-NEXT: v_add_u32_e32 v0, s2, v0 455; GFX9-NEXT: s_mov_b32 s3, 0xf000 456; GFX9-NEXT: s_mov_b32 s2, -1 457; GFX9-NEXT: s_nop 0 458; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 
459; GFX9-NEXT: s_endpgm 460; 461; GFX1064-LABEL: add_i32_varying: 462; GFX1064: ; %bb.0: ; %entry 463; GFX1064-NEXT: v_mov_b32_e32 v1, v0 464; GFX1064-NEXT: s_not_b64 exec, exec 465; GFX1064-NEXT: v_mov_b32_e32 v1, 0 466; GFX1064-NEXT: s_not_b64 exec, exec 467; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 468; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 469; GFX1064-NEXT: v_mov_b32_e32 v3, 0 470; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 471; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 472; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 473; GFX1064-NEXT: v_mov_b32_e32 v2, v1 474; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 475; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 476; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 477; GFX1064-NEXT: v_mov_b32_e32 v2, s4 478; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 479; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 480; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 481; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 482; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 483; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 484; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 485; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 486; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 487; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 488; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 489; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 490; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 491; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 492; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 493; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0 494; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 495; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 496; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 497; GFX1064-NEXT: 
v_cmp_eq_u32_e32 vcc, 0, v0 498; GFX1064-NEXT: s_mov_b32 s2, -1 499; GFX1064-NEXT: ; implicit-def: $vgpr0 500; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 501; GFX1064-NEXT: s_cbranch_execz BB2_2 502; GFX1064-NEXT: ; %bb.1: 503; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo 504; GFX1064-NEXT: v_mov_b32_e32 v4, s7 505; GFX1064-NEXT: s_mov_b32 s3, s7 506; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 507; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 508; GFX1064-NEXT: ds_add_rtn_u32 v0, v7, v4 509; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 510; GFX1064-NEXT: buffer_gl0_inv 511; GFX1064-NEXT: BB2_2: 512; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 513; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 514; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 515; GFX1064-NEXT: v_mov_b32_e32 v0, v3 516; GFX1064-NEXT: v_add_nc_u32_e32 v0, s3, v0 517; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 518; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 519; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 520; GFX1064-NEXT: s_endpgm 521; 522; GFX1032-LABEL: add_i32_varying: 523; GFX1032: ; %bb.0: ; %entry 524; GFX1032-NEXT: v_mov_b32_e32 v1, v0 525; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 526; GFX1032-NEXT: v_mov_b32_e32 v1, 0 527; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 528; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 529; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 530; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 531; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 532; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 533; GFX1032-NEXT: v_mov_b32_e32 v2, v1 534; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 535; GFX1032-NEXT: s_mov_b32 exec_lo, s2 536; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 537; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 538; GFX1032-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 539; 
GFX1032-NEXT: v_mov_b32_e32 v3, 0 540; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 541; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 542; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 543; GFX1032-NEXT: s_mov_b32 exec_lo, s2 544; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 545; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 546; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 547; GFX1032-NEXT: s_mov_b32 exec_lo, s2 548; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 549; GFX1032-NEXT: s_mov_b32 s2, -1 550; GFX1032-NEXT: ; implicit-def: $vgpr0 551; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 552; GFX1032-NEXT: s_cbranch_execz BB2_2 553; GFX1032-NEXT: ; %bb.1: 554; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo 555; GFX1032-NEXT: v_mov_b32_e32 v4, s4 556; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 557; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 558; GFX1032-NEXT: ds_add_rtn_u32 v0, v7, v4 559; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 560; GFX1032-NEXT: buffer_gl0_inv 561; GFX1032-NEXT: BB2_2: 562; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 563; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 564; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 565; GFX1032-NEXT: v_mov_b32_e32 v0, v3 566; GFX1032-NEXT: v_add_nc_u32_e32 v0, s3, v0 567; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 568; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 569; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 570; GFX1032-NEXT: s_endpgm 571entry: 572 %lane = call i32 @llvm.amdgcn.workitem.id.x() 573 %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %lane acq_rel 574 store i32 %old, i32 addrspace(1)* %out 575 ret void 576} 577 578define amdgpu_kernel void @add_i32_varying_nouse() { 579; GFX7LESS-LABEL: add_i32_varying_nouse: 580; GFX7LESS: ; %bb.0: ; %entry 581; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 582; GFX7LESS-NEXT: s_mov_b32 m0, -1 583; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 584; GFX7LESS-NEXT: ds_add_u32 v1, v0 585; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 586; GFX7LESS-NEXT: s_endpgm 587; 588; 
GFX8-LABEL: add_i32_varying_nouse: 589; GFX8: ; %bb.0: ; %entry 590; GFX8-NEXT: v_mov_b32_e32 v1, v0 591; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 592; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 593; GFX8-NEXT: s_not_b64 exec, exec 594; GFX8-NEXT: v_mov_b32_e32 v1, 0 595; GFX8-NEXT: s_not_b64 exec, exec 596; GFX8-NEXT: s_or_saveexec_b64 s[0:1], -1 597; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 598; GFX8-NEXT: s_nop 1 599; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 600; GFX8-NEXT: s_nop 1 601; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 602; GFX8-NEXT: s_nop 1 603; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 604; GFX8-NEXT: s_nop 1 605; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf 606; GFX8-NEXT: s_nop 1 607; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf 608; GFX8-NEXT: v_readlane_b32 s2, v1, 63 609; GFX8-NEXT: s_mov_b64 exec, s[0:1] 610; GFX8-NEXT: s_mov_b32 s0, s2 611; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 612; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 613; GFX8-NEXT: s_cbranch_execz BB3_2 614; GFX8-NEXT: ; %bb.1: 615; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 616; GFX8-NEXT: v_mov_b32_e32 v2, s0 617; GFX8-NEXT: s_mov_b32 m0, -1 618; GFX8-NEXT: s_waitcnt lgkmcnt(0) 619; GFX8-NEXT: ds_add_u32 v0, v2 620; GFX8-NEXT: s_waitcnt lgkmcnt(0) 621; GFX8-NEXT: BB3_2: 622; GFX8-NEXT: s_endpgm 623; 624; GFX9-LABEL: add_i32_varying_nouse: 625; GFX9: ; %bb.0: ; %entry 626; GFX9-NEXT: v_mov_b32_e32 v1, v0 627; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 628; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 629; GFX9-NEXT: s_not_b64 exec, exec 630; GFX9-NEXT: v_mov_b32_e32 v1, 0 631; GFX9-NEXT: s_not_b64 exec, exec 632; GFX9-NEXT: s_or_saveexec_b64 s[0:1], -1 633; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 
row_mask:0xf bank_mask:0xf bound_ctrl:1 634; GFX9-NEXT: s_nop 1 635; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 636; GFX9-NEXT: s_nop 1 637; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 638; GFX9-NEXT: s_nop 1 639; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 640; GFX9-NEXT: s_nop 1 641; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf 642; GFX9-NEXT: s_nop 1 643; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf 644; GFX9-NEXT: v_readlane_b32 s2, v1, 63 645; GFX9-NEXT: s_mov_b64 exec, s[0:1] 646; GFX9-NEXT: s_mov_b32 s0, s2 647; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 648; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 649; GFX9-NEXT: s_cbranch_execz BB3_2 650; GFX9-NEXT: ; %bb.1: 651; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 652; GFX9-NEXT: v_mov_b32_e32 v2, s0 653; GFX9-NEXT: s_waitcnt lgkmcnt(0) 654; GFX9-NEXT: ds_add_u32 v0, v2 655; GFX9-NEXT: s_waitcnt lgkmcnt(0) 656; GFX9-NEXT: BB3_2: 657; GFX9-NEXT: s_endpgm 658; 659; GFX1064-LABEL: add_i32_varying_nouse: 660; GFX1064: ; %bb.0: ; %entry 661; GFX1064-NEXT: v_mov_b32_e32 v1, v0 662; GFX1064-NEXT: s_not_b64 exec, exec 663; GFX1064-NEXT: v_mov_b32_e32 v1, 0 664; GFX1064-NEXT: s_not_b64 exec, exec 665; GFX1064-NEXT: s_or_saveexec_b64 s[0:1], -1 666; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 667; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 668; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 669; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 670; GFX1064-NEXT: v_mov_b32_e32 v2, v1 671; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 672; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 673; GFX1064-NEXT: 
v_readlane_b32 s2, v1, 31 674; GFX1064-NEXT: v_mov_b32_e32 v2, s2 675; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 676; GFX1064-NEXT: s_mov_b64 exec, s[0:1] 677; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 678; GFX1064-NEXT: s_or_saveexec_b64 s[0:1], -1 679; GFX1064-NEXT: v_readlane_b32 s2, v1, 63 680; GFX1064-NEXT: s_mov_b64 exec, s[0:1] 681; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0 682; GFX1064-NEXT: s_mov_b32 s0, s2 683; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 684; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc 685; GFX1064-NEXT: s_cbranch_execz BB3_2 686; GFX1064-NEXT: ; %bb.1: 687; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 688; GFX1064-NEXT: v_mov_b32_e32 v3, s0 689; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 690; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 691; GFX1064-NEXT: ds_add_u32 v0, v3 692; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 693; GFX1064-NEXT: buffer_gl0_inv 694; GFX1064-NEXT: BB3_2: 695; GFX1064-NEXT: s_endpgm 696; 697; GFX1032-LABEL: add_i32_varying_nouse: 698; GFX1032: ; %bb.0: ; %entry 699; GFX1032-NEXT: v_mov_b32_e32 v1, v0 700; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 701; GFX1032-NEXT: v_mov_b32_e32 v1, 0 702; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 703; GFX1032-NEXT: s_or_saveexec_b32 s0, -1 704; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 705; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 706; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 707; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 708; GFX1032-NEXT: v_mov_b32_e32 v2, v1 709; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 710; GFX1032-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 711; GFX1032-NEXT: v_readlane_b32 s1, v1, 31 712; GFX1032-NEXT: s_mov_b32 exec_lo, s0 713; GFX1032-NEXT: 
v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 714; GFX1032-NEXT: s_mov_b32 s0, s1 715; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 716; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo 717; GFX1032-NEXT: s_cbranch_execz BB3_2 718; GFX1032-NEXT: ; %bb.1: 719; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 720; GFX1032-NEXT: v_mov_b32_e32 v3, s0 721; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 722; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 723; GFX1032-NEXT: ds_add_u32 v0, v3 724; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 725; GFX1032-NEXT: buffer_gl0_inv 726; GFX1032-NEXT: BB3_2: 727; GFX1032-NEXT: s_endpgm 728entry: 729 %lane = call i32 @llvm.amdgcn.workitem.id.x() 730 %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %lane acq_rel 731 ret void 732} 733 734define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) { 735; 736; 737; GFX7LESS-LABEL: add_i64_constant: 738; GFX7LESS: ; %bb.0: ; %entry 739; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec 740; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 741; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 742; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 743; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 744; GFX7LESS-NEXT: ; implicit-def: $vgpr1_vgpr2 745; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc 746; GFX7LESS-NEXT: s_cbranch_execz BB4_2 747; GFX7LESS-NEXT: ; %bb.1: 748; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 749; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0 750; GFX7LESS-NEXT: s_mul_i32 s4, s4, 5 751; GFX7LESS-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 752; GFX7LESS-NEXT: v_mov_b32_e32 v1, s4 753; GFX7LESS-NEXT: s_mov_b32 m0, -1 754; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 755; GFX7LESS-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] 756; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 757; GFX7LESS-NEXT: BB4_2: 758; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] 759; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 760; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v1 761; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v2 762; GFX7LESS-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v0 
763; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v0 764; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 765; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4 766; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s2, v0 767; GFX7LESS-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc 768; GFX7LESS-NEXT: s_mov_b32 s2, -1 769; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 770; GFX7LESS-NEXT: s_endpgm 771; 772; GFX8-LABEL: add_i64_constant: 773; GFX8: ; %bb.0: ; %entry 774; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 775; GFX8-NEXT: s_mov_b64 s[4:5], exec 776; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 777; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 778; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 779; GFX8-NEXT: ; implicit-def: $vgpr1_vgpr2 780; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 781; GFX8-NEXT: s_cbranch_execz BB4_2 782; GFX8-NEXT: ; %bb.1: 783; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 784; GFX8-NEXT: s_mul_i32 s4, s4, 5 785; GFX8-NEXT: v_mov_b32_e32 v1, s4 786; GFX8-NEXT: v_mov_b32_e32 v2, 0 787; GFX8-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 788; GFX8-NEXT: s_mov_b32 m0, -1 789; GFX8-NEXT: s_waitcnt lgkmcnt(0) 790; GFX8-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] 791; GFX8-NEXT: s_waitcnt lgkmcnt(0) 792; GFX8-NEXT: BB4_2: 793; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 794; GFX8-NEXT: s_waitcnt lgkmcnt(0) 795; GFX8-NEXT: v_readfirstlane_b32 s2, v1 796; GFX8-NEXT: v_readfirstlane_b32 s3, v2 797; GFX8-NEXT: v_mov_b32_e32 v1, s2 798; GFX8-NEXT: v_mov_b32_e32 v2, s3 799; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v0, 5, v[1:2] 800; GFX8-NEXT: s_mov_b32 s3, 0xf000 801; GFX8-NEXT: s_mov_b32 s2, -1 802; GFX8-NEXT: s_nop 2 803; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 804; GFX8-NEXT: s_endpgm 805; 806; GFX9-LABEL: add_i64_constant: 807; GFX9: ; %bb.0: ; %entry 808; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 809; GFX9-NEXT: s_mov_b64 s[4:5], exec 810; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 811; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 812; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 813; GFX9-NEXT: ; 
implicit-def: $vgpr1_vgpr2 814; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 815; GFX9-NEXT: s_cbranch_execz BB4_2 816; GFX9-NEXT: ; %bb.1: 817; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 818; GFX9-NEXT: s_mul_i32 s4, s4, 5 819; GFX9-NEXT: v_mov_b32_e32 v1, s4 820; GFX9-NEXT: v_mov_b32_e32 v2, 0 821; GFX9-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 822; GFX9-NEXT: s_waitcnt lgkmcnt(0) 823; GFX9-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] 824; GFX9-NEXT: s_waitcnt lgkmcnt(0) 825; GFX9-NEXT: BB4_2: 826; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 827; GFX9-NEXT: s_waitcnt lgkmcnt(0) 828; GFX9-NEXT: v_readfirstlane_b32 s2, v1 829; GFX9-NEXT: v_readfirstlane_b32 s3, v2 830; GFX9-NEXT: v_mov_b32_e32 v1, s2 831; GFX9-NEXT: v_mov_b32_e32 v2, s3 832; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v0, 5, v[1:2] 833; GFX9-NEXT: s_mov_b32 s3, 0xf000 834; GFX9-NEXT: s_mov_b32 s2, -1 835; GFX9-NEXT: s_nop 2 836; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 837; GFX9-NEXT: s_endpgm 838; 839; GFX1064-LABEL: add_i64_constant: 840; GFX1064: ; %bb.0: ; %entry 841; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 842; GFX1064-NEXT: s_mov_b64 s[4:5], exec 843; GFX1064-NEXT: ; implicit-def: $vgpr1_vgpr2 844; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 845; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s5, v0 846; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 847; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc 848; GFX1064-NEXT: s_cbranch_execz BB4_2 849; GFX1064-NEXT: ; %bb.1: 850; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 851; GFX1064-NEXT: v_mov_b32_e32 v2, 0 852; GFX1064-NEXT: s_mul_i32 s4, s4, 5 853; GFX1064-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 854; GFX1064-NEXT: v_mov_b32_e32 v1, s4 855; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 856; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 857; GFX1064-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] 858; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 859; GFX1064-NEXT: buffer_gl0_inv 860; GFX1064-NEXT: BB4_2: 861; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 862; GFX1064-NEXT: 
s_or_b64 exec, exec, s[2:3] 863; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 864; GFX1064-NEXT: v_readfirstlane_b32 s3, v2 865; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v0, 5, s[2:3] 866; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 867; GFX1064-NEXT: s_mov_b32 s2, -1 868; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 869; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 870; GFX1064-NEXT: s_endpgm 871; 872; GFX1032-LABEL: add_i64_constant: 873; GFX1032: ; %bb.0: ; %entry 874; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 875; GFX1032-NEXT: s_mov_b32 s3, exec_lo 876; GFX1032-NEXT: ; implicit-def: $vgpr1_vgpr2 877; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s3, 0 878; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 879; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo 880; GFX1032-NEXT: s_cbranch_execz BB4_2 881; GFX1032-NEXT: ; %bb.1: 882; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 883; GFX1032-NEXT: v_mov_b32_e32 v2, 0 884; GFX1032-NEXT: s_mul_i32 s3, s3, 5 885; GFX1032-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 886; GFX1032-NEXT: v_mov_b32_e32 v1, s3 887; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 888; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 889; GFX1032-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] 890; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 891; GFX1032-NEXT: buffer_gl0_inv 892; GFX1032-NEXT: BB4_2: 893; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 894; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 895; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 896; GFX1032-NEXT: v_readfirstlane_b32 s3, v2 897; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s2, v0, 5, s[2:3] 898; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 899; GFX1032-NEXT: s_mov_b32 s2, -1 900; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 901; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 902; GFX1032-NEXT: s_endpgm 903entry: 904 %old = atomicrmw add i64 addrspace(3)* @local_var64, i64 5 acq_rel 905 store i64 %old, i64 addrspace(1)* %out 906 ret void 907} 908 909define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 %additive) { 910; 911; 912; 
GFX7LESS-LABEL: add_i64_uniform: 913; GFX7LESS: ; %bb.0: ; %entry 914; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec 915; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 916; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 917; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s7, v0 918; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 919; GFX7LESS-NEXT: ; implicit-def: $vgpr1_vgpr2 920; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc 921; GFX7LESS-NEXT: s_cbranch_execz BB5_2 922; GFX7LESS-NEXT: ; %bb.1: 923; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 924; GFX7LESS-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 925; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 926; GFX7LESS-NEXT: s_mul_i32 s7, s3, s6 927; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6 928; GFX7LESS-NEXT: v_mul_hi_u32 v1, s2, v1 929; GFX7LESS-NEXT: s_mul_i32 s6, s2, s6 930; GFX7LESS-NEXT: v_add_i32_e32 v2, vcc, s7, v1 931; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6 932; GFX7LESS-NEXT: s_mov_b32 m0, -1 933; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 934; GFX7LESS-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] 935; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 936; GFX7LESS-NEXT: BB5_2: 937; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] 938; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 939; GFX7LESS-NEXT: s_mov_b32 s6, -1 940; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 941; GFX7LESS-NEXT: s_mov_b32 s4, s0 942; GFX7LESS-NEXT: s_mov_b32 s5, s1 943; GFX7LESS-NEXT: v_readfirstlane_b32 s0, v1 944; GFX7LESS-NEXT: v_readfirstlane_b32 s1, v2 945; GFX7LESS-NEXT: v_mul_lo_u32 v1, s3, v0 946; GFX7LESS-NEXT: v_mul_hi_u32 v2, s2, v0 947; GFX7LESS-NEXT: v_mul_lo_u32 v0, s2, v0 948; GFX7LESS-NEXT: v_add_i32_e32 v1, vcc, v2, v1 949; GFX7LESS-NEXT: v_mov_b32_e32 v2, s1 950; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s0, v0 951; GFX7LESS-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc 952; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 953; GFX7LESS-NEXT: s_endpgm 954; 955; GFX8-LABEL: add_i64_uniform: 956; GFX8: ; %bb.0: ; %entry 957; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 958; GFX8-NEXT: s_mov_b64 
s[6:7], exec 959; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 960; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 961; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 962; GFX8-NEXT: ; implicit-def: $vgpr1_vgpr2 963; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 964; GFX8-NEXT: s_cbranch_execz BB5_2 965; GFX8-NEXT: ; %bb.1: 966; GFX8-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 967; GFX8-NEXT: v_mov_b32_e32 v1, s6 968; GFX8-NEXT: s_waitcnt lgkmcnt(0) 969; GFX8-NEXT: v_mul_hi_u32 v1, s2, v1 970; GFX8-NEXT: s_mul_i32 s7, s3, s6 971; GFX8-NEXT: s_mul_i32 s6, s2, s6 972; GFX8-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 973; GFX8-NEXT: v_add_u32_e32 v2, vcc, s7, v1 974; GFX8-NEXT: v_mov_b32_e32 v1, s6 975; GFX8-NEXT: s_mov_b32 m0, -1 976; GFX8-NEXT: s_waitcnt lgkmcnt(0) 977; GFX8-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] 978; GFX8-NEXT: s_waitcnt lgkmcnt(0) 979; GFX8-NEXT: BB5_2: 980; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 981; GFX8-NEXT: s_waitcnt lgkmcnt(0) 982; GFX8-NEXT: s_mov_b32 s4, s0 983; GFX8-NEXT: v_readfirstlane_b32 s0, v1 984; GFX8-NEXT: v_mul_lo_u32 v1, s3, v0 985; GFX8-NEXT: v_mul_hi_u32 v3, s2, v0 986; GFX8-NEXT: v_mul_lo_u32 v0, s2, v0 987; GFX8-NEXT: s_mov_b32 s5, s1 988; GFX8-NEXT: v_readfirstlane_b32 s1, v2 989; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 990; GFX8-NEXT: v_mov_b32_e32 v2, s1 991; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 992; GFX8-NEXT: s_mov_b32 s7, 0xf000 993; GFX8-NEXT: s_mov_b32 s6, -1 994; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc 995; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 996; GFX8-NEXT: s_endpgm 997; 998; GFX9-LABEL: add_i64_uniform: 999; GFX9: ; %bb.0: ; %entry 1000; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1001; GFX9-NEXT: s_mov_b64 s[6:7], exec 1002; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 1003; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 1004; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1005; GFX9-NEXT: ; implicit-def: $vgpr1_vgpr2 1006; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 1007; GFX9-NEXT: s_cbranch_execz BB5_2 1008; GFX9-NEXT: ; %bb.1: 
1009; GFX9-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 1010; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1011; GFX9-NEXT: s_mul_i32 s7, s3, s6 1012; GFX9-NEXT: s_mul_hi_u32 s8, s2, s6 1013; GFX9-NEXT: s_add_i32 s8, s8, s7 1014; GFX9-NEXT: s_mul_i32 s6, s2, s6 1015; GFX9-NEXT: v_mov_b32_e32 v1, s6 1016; GFX9-NEXT: v_mov_b32_e32 v2, s8 1017; GFX9-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 1018; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1019; GFX9-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] 1020; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1021; GFX9-NEXT: BB5_2: 1022; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 1023; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1024; GFX9-NEXT: v_mul_lo_u32 v3, s3, v0 1025; GFX9-NEXT: v_mul_hi_u32 v4, s2, v0 1026; GFX9-NEXT: v_mul_lo_u32 v0, s2, v0 1027; GFX9-NEXT: s_mov_b32 s4, s0 1028; GFX9-NEXT: v_readfirstlane_b32 s0, v1 1029; GFX9-NEXT: s_mov_b32 s5, s1 1030; GFX9-NEXT: v_readfirstlane_b32 s1, v2 1031; GFX9-NEXT: v_add_u32_e32 v1, v4, v3 1032; GFX9-NEXT: v_mov_b32_e32 v2, s1 1033; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 1034; GFX9-NEXT: s_mov_b32 s7, 0xf000 1035; GFX9-NEXT: s_mov_b32 s6, -1 1036; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc 1037; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1038; GFX9-NEXT: s_endpgm 1039; 1040; GFX1064-LABEL: add_i64_uniform: 1041; GFX1064: ; %bb.0: ; %entry 1042; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1043; GFX1064-NEXT: s_mov_b64 s[6:7], exec 1044; GFX1064-NEXT: ; implicit-def: $vgpr1_vgpr2 1045; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 1046; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s7, v0 1047; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1048; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 1049; GFX1064-NEXT: s_cbranch_execz BB5_2 1050; GFX1064-NEXT: ; %bb.1: 1051; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 1052; GFX1064-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 1053; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1054; GFX1064-NEXT: s_mul_i32 s7, s3, s6 1055; GFX1064-NEXT: s_mul_hi_u32 s8, s2, s6 1056; GFX1064-NEXT: s_mul_i32 s6, s2, 
s6 1057; GFX1064-NEXT: s_add_i32 s8, s8, s7 1058; GFX1064-NEXT: v_mov_b32_e32 v1, s6 1059; GFX1064-NEXT: v_mov_b32_e32 v2, s8 1060; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1061; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 1062; GFX1064-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] 1063; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1064; GFX1064-NEXT: buffer_gl0_inv 1065; GFX1064-NEXT: BB5_2: 1066; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 1067; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 1068; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1069; GFX1064-NEXT: v_mul_lo_u32 v3, s3, v0 1070; GFX1064-NEXT: v_mul_hi_u32 v4, s2, v0 1071; GFX1064-NEXT: v_mul_lo_u32 v0, s2, v0 1072; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 1073; GFX1064-NEXT: v_readfirstlane_b32 s4, v2 1074; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 1075; GFX1064-NEXT: v_add_nc_u32_e32 v1, v4, v3 1076; GFX1064-NEXT: v_add_co_u32_e64 v0, vcc, s2, v0 1077; GFX1064-NEXT: s_mov_b32 s2, -1 1078; GFX1064-NEXT: v_add_co_ci_u32_e32 v1, vcc, s4, v1, vcc 1079; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1080; GFX1064-NEXT: s_endpgm 1081; 1082; GFX1032-LABEL: add_i64_uniform: 1083; GFX1032: ; %bb.0: ; %entry 1084; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1085; GFX1032-NEXT: s_mov_b32 s5, exec_lo 1086; GFX1032-NEXT: ; implicit-def: $vgpr1_vgpr2 1087; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s5, 0 1088; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 1089; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo 1090; GFX1032-NEXT: s_cbranch_execz BB5_2 1091; GFX1032-NEXT: ; %bb.1: 1092; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s5 1093; GFX1032-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 1094; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1095; GFX1032-NEXT: s_mul_i32 s6, s3, s5 1096; GFX1032-NEXT: s_mul_hi_u32 s7, s2, s5 1097; GFX1032-NEXT: s_mul_i32 s5, s2, s5 1098; GFX1032-NEXT: s_add_i32 s7, s7, s6 1099; GFX1032-NEXT: v_mov_b32_e32 v1, s5 1100; GFX1032-NEXT: v_mov_b32_e32 v2, s7 1101; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1102; GFX1032-NEXT: 
s_waitcnt_vscnt null, 0x0 1103; GFX1032-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] 1104; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1105; GFX1032-NEXT: buffer_gl0_inv 1106; GFX1032-NEXT: BB5_2: 1107; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 1108; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 1109; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1110; GFX1032-NEXT: v_mul_lo_u32 v3, s3, v0 1111; GFX1032-NEXT: v_mul_hi_u32 v4, s2, v0 1112; GFX1032-NEXT: v_mul_lo_u32 v0, s2, v0 1113; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 1114; GFX1032-NEXT: v_readfirstlane_b32 s4, v2 1115; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 1116; GFX1032-NEXT: v_add_nc_u32_e32 v1, v4, v3 1117; GFX1032-NEXT: v_add_co_u32_e64 v0, vcc_lo, s2, v0 1118; GFX1032-NEXT: s_mov_b32 s2, -1 1119; GFX1032-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s4, v1, vcc_lo 1120; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1121; GFX1032-NEXT: s_endpgm 1122entry: 1123 %old = atomicrmw add i64 addrspace(3)* @local_var64, i64 %additive acq_rel 1124 store i64 %old, i64 addrspace(1)* %out 1125 ret void 1126} 1127 1128define amdgpu_kernel void @add_i64_varying(i64 addrspace(1)* %out) { 1129; 1130; 1131; GFX7LESS-LABEL: add_i64_varying: 1132; GFX7LESS: ; %bb.0: ; %entry 1133; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1134; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 1135; GFX7LESS-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 1136; GFX7LESS-NEXT: s_mov_b32 m0, -1 1137; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1138; GFX7LESS-NEXT: ds_add_rtn_u64 v[0:1], v2, v[0:1] 1139; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1140; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 1141; GFX7LESS-NEXT: s_mov_b32 s2, -1 1142; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1143; GFX7LESS-NEXT: s_endpgm 1144; 1145; GFX8-LABEL: add_i64_varying: 1146; GFX8: ; %bb.0: ; %entry 1147; GFX8-NEXT: v_mov_b32_e32 v1, 0 1148; GFX8-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 1149; GFX8-NEXT: s_mov_b32 m0, -1 1150; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1151; GFX8-NEXT: s_waitcnt 
lgkmcnt(0) 1152; GFX8-NEXT: ds_add_rtn_u64 v[0:1], v2, v[0:1] 1153; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1154; GFX8-NEXT: s_mov_b32 s3, 0xf000 1155; GFX8-NEXT: s_mov_b32 s2, -1 1156; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1157; GFX8-NEXT: s_endpgm 1158; 1159; GFX9-LABEL: add_i64_varying: 1160; GFX9: ; %bb.0: ; %entry 1161; GFX9-NEXT: v_mov_b32_e32 v1, 0 1162; GFX9-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 1163; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1164; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1165; GFX9-NEXT: ds_add_rtn_u64 v[0:1], v2, v[0:1] 1166; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1167; GFX9-NEXT: s_mov_b32 s3, 0xf000 1168; GFX9-NEXT: s_mov_b32 s2, -1 1169; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1170; GFX9-NEXT: s_endpgm 1171; 1172; GFX10-LABEL: add_i64_varying: 1173; GFX10: ; %bb.0: ; %entry 1174; GFX10-NEXT: v_mov_b32_e32 v1, 0 1175; GFX10-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 1176; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1177; GFX10-NEXT: s_mov_b32 s3, 0x31016000 1178; GFX10-NEXT: s_mov_b32 s2, -1 1179; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1180; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1181; GFX10-NEXT: ds_add_rtn_u64 v[0:1], v2, v[0:1] 1182; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1183; GFX10-NEXT: buffer_gl0_inv 1184; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1185; GFX10-NEXT: s_endpgm 1186entry: 1187 %lane = call i32 @llvm.amdgcn.workitem.id.x() 1188 %zext = zext i32 %lane to i64 1189 %old = atomicrmw add i64 addrspace(3)* @local_var64, i64 %zext acq_rel 1190 store i64 %old, i64 addrspace(1)* %out 1191 ret void 1192} 1193 1194define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out) { 1195; 1196; 1197; GFX7LESS-LABEL: sub_i32_constant: 1198; GFX7LESS: ; %bb.0: ; %entry 1199; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec 1200; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1201; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 1202; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 1203; GFX7LESS-NEXT: 
v_cmp_eq_u32_e32 vcc, 0, v0 1204; GFX7LESS-NEXT: ; implicit-def: $vgpr1 1205; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc 1206; GFX7LESS-NEXT: s_cbranch_execz BB7_2 1207; GFX7LESS-NEXT: ; %bb.1: 1208; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1209; GFX7LESS-NEXT: s_mul_i32 s2, s2, 5 1210; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 1211; GFX7LESS-NEXT: v_mov_b32_e32 v2, s2 1212; GFX7LESS-NEXT: s_mov_b32 m0, -1 1213; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1214; GFX7LESS-NEXT: ds_sub_rtn_u32 v1, v1, v2 1215; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1216; GFX7LESS-NEXT: BB7_2: 1217; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] 1218; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1219; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v1 1220; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1221; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 1222; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 1223; GFX7LESS-NEXT: s_mov_b32 s2, -1 1224; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 1225; GFX7LESS-NEXT: s_endpgm 1226; 1227; GFX8-LABEL: sub_i32_constant: 1228; GFX8: ; %bb.0: ; %entry 1229; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1230; GFX8-NEXT: s_mov_b64 s[2:3], exec 1231; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1232; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 1233; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1234; GFX8-NEXT: ; implicit-def: $vgpr1 1235; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 1236; GFX8-NEXT: s_cbranch_execz BB7_2 1237; GFX8-NEXT: ; %bb.1: 1238; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1239; GFX8-NEXT: s_mul_i32 s2, s2, 5 1240; GFX8-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 1241; GFX8-NEXT: v_mov_b32_e32 v2, s2 1242; GFX8-NEXT: s_mov_b32 m0, -1 1243; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1244; GFX8-NEXT: ds_sub_rtn_u32 v1, v1, v2 1245; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1246; GFX8-NEXT: BB7_2: 1247; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 1248; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1249; GFX8-NEXT: v_readfirstlane_b32 s2, v1 1250; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1251; GFX8-NEXT: 
v_sub_u32_e32 v0, vcc, s2, v0 1252; GFX8-NEXT: s_mov_b32 s3, 0xf000 1253; GFX8-NEXT: s_mov_b32 s2, -1 1254; GFX8-NEXT: s_nop 0 1255; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 1256; GFX8-NEXT: s_endpgm 1257; 1258; GFX9-LABEL: sub_i32_constant: 1259; GFX9: ; %bb.0: ; %entry 1260; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1261; GFX9-NEXT: s_mov_b64 s[2:3], exec 1262; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1263; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 1264; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1265; GFX9-NEXT: ; implicit-def: $vgpr1 1266; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 1267; GFX9-NEXT: s_cbranch_execz BB7_2 1268; GFX9-NEXT: ; %bb.1: 1269; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1270; GFX9-NEXT: s_mul_i32 s2, s2, 5 1271; GFX9-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 1272; GFX9-NEXT: v_mov_b32_e32 v2, s2 1273; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1274; GFX9-NEXT: ds_sub_rtn_u32 v1, v1, v2 1275; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1276; GFX9-NEXT: BB7_2: 1277; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 1278; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1279; GFX9-NEXT: v_readfirstlane_b32 s2, v1 1280; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1281; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 1282; GFX9-NEXT: s_mov_b32 s3, 0xf000 1283; GFX9-NEXT: s_mov_b32 s2, -1 1284; GFX9-NEXT: s_nop 0 1285; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 1286; GFX9-NEXT: s_endpgm 1287; 1288; GFX1064-LABEL: sub_i32_constant: 1289; GFX1064: ; %bb.0: ; %entry 1290; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1291; GFX1064-NEXT: s_mov_b64 s[2:3], exec 1292; GFX1064-NEXT: ; implicit-def: $vgpr1 1293; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 1294; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s3, v0 1295; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1296; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 1297; GFX1064-NEXT: s_cbranch_execz BB7_2 1298; GFX1064-NEXT: ; %bb.1: 1299; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1300; GFX1064-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 1301; GFX1064-NEXT: 
s_mul_i32 s2, s2, 5 1302; GFX1064-NEXT: v_mov_b32_e32 v2, s2 1303; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1304; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 1305; GFX1064-NEXT: ds_sub_rtn_u32 v1, v1, v2 1306; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1307; GFX1064-NEXT: buffer_gl0_inv 1308; GFX1064-NEXT: BB7_2: 1309; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 1310; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 1311; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 1312; GFX1064-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1313; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 1314; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s2, v0 1315; GFX1064-NEXT: s_mov_b32 s2, -1 1316; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1317; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 1318; GFX1064-NEXT: s_endpgm 1319; 1320; GFX1032-LABEL: sub_i32_constant: 1321; GFX1032: ; %bb.0: ; %entry 1322; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1323; GFX1032-NEXT: s_mov_b32 s3, exec_lo 1324; GFX1032-NEXT: ; implicit-def: $vgpr1 1325; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s3, 0 1326; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 1327; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo 1328; GFX1032-NEXT: s_cbranch_execz BB7_2 1329; GFX1032-NEXT: ; %bb.1: 1330; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 1331; GFX1032-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 1332; GFX1032-NEXT: s_mul_i32 s3, s3, 5 1333; GFX1032-NEXT: v_mov_b32_e32 v2, s3 1334; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1335; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 1336; GFX1032-NEXT: ds_sub_rtn_u32 v1, v1, v2 1337; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1338; GFX1032-NEXT: buffer_gl0_inv 1339; GFX1032-NEXT: BB7_2: 1340; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 1341; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 1342; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 1343; GFX1032-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1344; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 1345; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s2, v0 1346; GFX1032-NEXT: s_mov_b32 s2, -1 1347; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1348; 
GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 1349; GFX1032-NEXT: s_endpgm 1350entry: 1351 %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 5 acq_rel 1352 store i32 %old, i32 addrspace(1)* %out 1353 ret void 1354} 1355 1356define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 %subitive) { 1357; 1358; 1359; GFX7LESS-LABEL: sub_i32_uniform: 1360; GFX7LESS: ; %bb.0: ; %entry 1361; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec 1362; GFX7LESS-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 1363; GFX7LESS-NEXT: s_load_dword s0, s[0:1], 0xb 1364; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 1365; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 1366; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1367; GFX7LESS-NEXT: ; implicit-def: $vgpr1 1368; GFX7LESS-NEXT: s_and_saveexec_b64 s[6:7], vcc 1369; GFX7LESS-NEXT: s_cbranch_execz BB8_2 1370; GFX7LESS-NEXT: ; %bb.1: 1371; GFX7LESS-NEXT: s_bcnt1_i32_b64 s1, s[2:3] 1372; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1373; GFX7LESS-NEXT: s_mul_i32 s1, s0, s1 1374; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 1375; GFX7LESS-NEXT: v_mov_b32_e32 v2, s1 1376; GFX7LESS-NEXT: s_mov_b32 m0, -1 1377; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1378; GFX7LESS-NEXT: ds_sub_rtn_u32 v1, v1, v2 1379; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1380; GFX7LESS-NEXT: BB8_2: 1381; GFX7LESS-NEXT: s_or_b64 exec, exec, s[6:7] 1382; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1383; GFX7LESS-NEXT: v_readfirstlane_b32 s1, v1 1384; GFX7LESS-NEXT: v_mul_lo_u32 v0, s0, v0 1385; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 1386; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s1, v0 1387; GFX7LESS-NEXT: s_mov_b32 s6, -1 1388; GFX7LESS-NEXT: buffer_store_dword v0, off, s[4:7], 0 1389; GFX7LESS-NEXT: s_endpgm 1390; 1391; GFX8-LABEL: sub_i32_uniform: 1392; GFX8: ; %bb.0: ; %entry 1393; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 1394; GFX8-NEXT: s_load_dword s0, s[0:1], 0x2c 1395; GFX8-NEXT: s_mov_b64 s[2:3], exec 1396; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1397; GFX8-NEXT: 
v_mbcnt_hi_u32_b32 v0, s3, v0 1398; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1399; GFX8-NEXT: ; implicit-def: $vgpr1 1400; GFX8-NEXT: s_and_saveexec_b64 s[6:7], vcc 1401; GFX8-NEXT: s_cbranch_execz BB8_2 1402; GFX8-NEXT: ; %bb.1: 1403; GFX8-NEXT: s_bcnt1_i32_b64 s1, s[2:3] 1404; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1405; GFX8-NEXT: s_mul_i32 s1, s0, s1 1406; GFX8-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 1407; GFX8-NEXT: v_mov_b32_e32 v2, s1 1408; GFX8-NEXT: s_mov_b32 m0, -1 1409; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1410; GFX8-NEXT: ds_sub_rtn_u32 v1, v1, v2 1411; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1412; GFX8-NEXT: BB8_2: 1413; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] 1414; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1415; GFX8-NEXT: v_mul_lo_u32 v0, s0, v0 1416; GFX8-NEXT: v_readfirstlane_b32 s0, v1 1417; GFX8-NEXT: s_mov_b32 s7, 0xf000 1418; GFX8-NEXT: s_mov_b32 s6, -1 1419; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s0, v0 1420; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0 1421; GFX8-NEXT: s_endpgm 1422; 1423; GFX9-LABEL: sub_i32_uniform: 1424; GFX9: ; %bb.0: ; %entry 1425; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 1426; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c 1427; GFX9-NEXT: s_mov_b64 s[6:7], exec 1428; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 1429; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 1430; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1431; GFX9-NEXT: ; implicit-def: $vgpr1 1432; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc 1433; GFX9-NEXT: s_cbranch_execz BB8_2 1434; GFX9-NEXT: ; %bb.1: 1435; GFX9-NEXT: s_bcnt1_i32_b64 s3, s[6:7] 1436; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1437; GFX9-NEXT: s_mul_i32 s3, s2, s3 1438; GFX9-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 1439; GFX9-NEXT: v_mov_b32_e32 v2, s3 1440; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1441; GFX9-NEXT: ds_sub_rtn_u32 v1, v1, v2 1442; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1443; GFX9-NEXT: BB8_2: 1444; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] 1445; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1446; GFX9-NEXT: v_mul_lo_u32 v0, s2, v0 1447; GFX9-NEXT: 
v_readfirstlane_b32 s0, v1 1448; GFX9-NEXT: s_mov_b32 s7, 0xf000 1449; GFX9-NEXT: s_mov_b32 s6, -1 1450; GFX9-NEXT: v_sub_u32_e32 v0, s0, v0 1451; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 1452; GFX9-NEXT: s_endpgm 1453; 1454; GFX1064-LABEL: sub_i32_uniform: 1455; GFX1064: ; %bb.0: ; %entry 1456; GFX1064-NEXT: s_clause 0x1 1457; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 1458; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x2c 1459; GFX1064-NEXT: s_mov_b64 s[6:7], exec 1460; GFX1064-NEXT: ; implicit-def: $vgpr1 1461; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 1462; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s7, v0 1463; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1464; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc 1465; GFX1064-NEXT: s_cbranch_execz BB8_2 1466; GFX1064-NEXT: ; %bb.1: 1467; GFX1064-NEXT: s_bcnt1_i32_b64 s3, s[6:7] 1468; GFX1064-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 1469; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1470; GFX1064-NEXT: s_mul_i32 s3, s2, s3 1471; GFX1064-NEXT: v_mov_b32_e32 v2, s3 1472; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1473; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 1474; GFX1064-NEXT: ds_sub_rtn_u32 v1, v1, v2 1475; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1476; GFX1064-NEXT: buffer_gl0_inv 1477; GFX1064-NEXT: BB8_2: 1478; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 1479; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] 1480; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1481; GFX1064-NEXT: v_mul_lo_u32 v0, s2, v0 1482; GFX1064-NEXT: v_readfirstlane_b32 s0, v1 1483; GFX1064-NEXT: s_mov_b32 s7, 0x31016000 1484; GFX1064-NEXT: s_mov_b32 s6, -1 1485; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s0, v0 1486; GFX1064-NEXT: buffer_store_dword v0, off, s[4:7], 0 1487; GFX1064-NEXT: s_endpgm 1488; 1489; GFX1032-LABEL: sub_i32_uniform: 1490; GFX1032: ; %bb.0: ; %entry 1491; GFX1032-NEXT: s_clause 0x1 1492; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 1493; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x2c 1494; GFX1032-NEXT: s_mov_b32 s3, exec_lo 1495; GFX1032-NEXT: ; 
implicit-def: $vgpr1 1496; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s3, 0 1497; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 1498; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo 1499; GFX1032-NEXT: s_cbranch_execz BB8_2 1500; GFX1032-NEXT: ; %bb.1: 1501; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s3 1502; GFX1032-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 1503; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1504; GFX1032-NEXT: s_mul_i32 s1, s2, s1 1505; GFX1032-NEXT: v_mov_b32_e32 v2, s1 1506; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1507; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 1508; GFX1032-NEXT: ds_sub_rtn_u32 v1, v1, v2 1509; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1510; GFX1032-NEXT: buffer_gl0_inv 1511; GFX1032-NEXT: BB8_2: 1512; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 1513; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 1514; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1515; GFX1032-NEXT: v_mul_lo_u32 v0, s2, v0 1516; GFX1032-NEXT: v_readfirstlane_b32 s0, v1 1517; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 1518; GFX1032-NEXT: s_mov_b32 s6, -1 1519; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s0, v0 1520; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0 1521; GFX1032-NEXT: s_endpgm 1522entry: 1523 %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 %subitive acq_rel 1524 store i32 %old, i32 addrspace(1)* %out 1525 ret void 1526} 1527 1528define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) { 1529; 1530; 1531; GFX7LESS-LABEL: sub_i32_varying: 1532; GFX7LESS: ; %bb.0: ; %entry 1533; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1534; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 1535; GFX7LESS-NEXT: s_mov_b32 m0, -1 1536; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1537; GFX7LESS-NEXT: ds_sub_rtn_u32 v0, v1, v0 1538; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1539; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 1540; GFX7LESS-NEXT: s_mov_b32 s2, -1 1541; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 1542; GFX7LESS-NEXT: s_endpgm 1543; 1544; GFX8-LABEL: sub_i32_varying: 1545; GFX8: ; %bb.0: ; 
%entry 1546; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1547; GFX8-NEXT: v_mov_b32_e32 v2, v0 1548; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 1549; GFX8-NEXT: v_mov_b32_e32 v1, 0 1550; GFX8-NEXT: s_mov_b64 exec, s[2:3] 1551; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 1552; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 1553; GFX8-NEXT: s_not_b64 exec, exec 1554; GFX8-NEXT: v_mov_b32_e32 v2, 0 1555; GFX8-NEXT: s_not_b64 exec, exec 1556; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 1557; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 1558; GFX8-NEXT: s_nop 1 1559; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 1560; GFX8-NEXT: s_nop 1 1561; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 1562; GFX8-NEXT: s_nop 1 1563; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 1564; GFX8-NEXT: s_nop 1 1565; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 1566; GFX8-NEXT: s_nop 1 1567; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 1568; GFX8-NEXT: v_readlane_b32 s4, v2, 63 1569; GFX8-NEXT: s_nop 0 1570; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 1571; GFX8-NEXT: s_mov_b64 exec, s[2:3] 1572; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1573; GFX8-NEXT: ; implicit-def: $vgpr0 1574; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 1575; GFX8-NEXT: s_cbranch_execz BB9_2 1576; GFX8-NEXT: ; %bb.1: 1577; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 1578; GFX8-NEXT: v_mov_b32_e32 v3, s4 1579; GFX8-NEXT: s_mov_b32 m0, -1 1580; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1581; GFX8-NEXT: ds_sub_rtn_u32 v0, v0, v3 1582; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1583; GFX8-NEXT: BB9_2: 1584; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 1585; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1586; GFX8-NEXT: v_readfirstlane_b32 s2, v0 1587; GFX8-NEXT: v_mov_b32_e32 v0, v1 1588; 
GFX8-NEXT: v_sub_u32_e32 v0, vcc, s2, v0 1589; GFX8-NEXT: s_mov_b32 s3, 0xf000 1590; GFX8-NEXT: s_mov_b32 s2, -1 1591; GFX8-NEXT: s_nop 0 1592; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 1593; GFX8-NEXT: s_endpgm 1594; 1595; GFX9-LABEL: sub_i32_varying: 1596; GFX9: ; %bb.0: ; %entry 1597; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1598; GFX9-NEXT: v_mov_b32_e32 v2, v0 1599; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 1600; GFX9-NEXT: v_mov_b32_e32 v1, 0 1601; GFX9-NEXT: s_mov_b64 exec, s[2:3] 1602; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 1603; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 1604; GFX9-NEXT: s_not_b64 exec, exec 1605; GFX9-NEXT: v_mov_b32_e32 v2, 0 1606; GFX9-NEXT: s_not_b64 exec, exec 1607; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 1608; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 1609; GFX9-NEXT: s_nop 1 1610; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 1611; GFX9-NEXT: s_nop 1 1612; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 1613; GFX9-NEXT: s_nop 1 1614; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 1615; GFX9-NEXT: s_nop 1 1616; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 1617; GFX9-NEXT: s_nop 1 1618; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 1619; GFX9-NEXT: v_readlane_b32 s4, v2, 63 1620; GFX9-NEXT: s_nop 0 1621; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 1622; GFX9-NEXT: s_mov_b64 exec, s[2:3] 1623; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1624; GFX9-NEXT: ; implicit-def: $vgpr0 1625; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 1626; GFX9-NEXT: s_cbranch_execz BB9_2 1627; GFX9-NEXT: ; %bb.1: 1628; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 1629; GFX9-NEXT: v_mov_b32_e32 v3, s4 1630; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1631; GFX9-NEXT: ds_sub_rtn_u32 v0, v0, v3 1632; GFX9-NEXT: 
s_waitcnt lgkmcnt(0) 1633; GFX9-NEXT: BB9_2: 1634; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 1635; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1636; GFX9-NEXT: v_readfirstlane_b32 s2, v0 1637; GFX9-NEXT: v_mov_b32_e32 v0, v1 1638; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 1639; GFX9-NEXT: s_mov_b32 s3, 0xf000 1640; GFX9-NEXT: s_mov_b32 s2, -1 1641; GFX9-NEXT: s_nop 0 1642; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 1643; GFX9-NEXT: s_endpgm 1644; 1645; GFX1064-LABEL: sub_i32_varying: 1646; GFX1064: ; %bb.0: ; %entry 1647; GFX1064-NEXT: v_mov_b32_e32 v1, v0 1648; GFX1064-NEXT: s_not_b64 exec, exec 1649; GFX1064-NEXT: v_mov_b32_e32 v1, 0 1650; GFX1064-NEXT: s_not_b64 exec, exec 1651; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 1652; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 1653; GFX1064-NEXT: v_mov_b32_e32 v3, 0 1654; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 1655; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 1656; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 1657; GFX1064-NEXT: v_mov_b32_e32 v2, v1 1658; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 1659; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 1660; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 1661; GFX1064-NEXT: v_mov_b32_e32 v2, s4 1662; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 1663; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 1664; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 1665; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 1666; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1667; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 1668; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 1669; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 1670; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 1671; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 1672; 
GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 1673; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 1674; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 1675; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 1676; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 1677; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0 1678; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 1679; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 1680; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 1681; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1682; GFX1064-NEXT: s_mov_b32 s2, -1 1683; GFX1064-NEXT: ; implicit-def: $vgpr0 1684; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 1685; GFX1064-NEXT: s_cbranch_execz BB9_2 1686; GFX1064-NEXT: ; %bb.1: 1687; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo 1688; GFX1064-NEXT: v_mov_b32_e32 v4, s7 1689; GFX1064-NEXT: s_mov_b32 s3, s7 1690; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1691; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 1692; GFX1064-NEXT: ds_sub_rtn_u32 v0, v7, v4 1693; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1694; GFX1064-NEXT: buffer_gl0_inv 1695; GFX1064-NEXT: BB9_2: 1696; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 1697; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 1698; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 1699; GFX1064-NEXT: v_mov_b32_e32 v0, v3 1700; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s3, v0 1701; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 1702; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1703; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 1704; GFX1064-NEXT: s_endpgm 1705; 1706; GFX1032-LABEL: sub_i32_varying: 1707; GFX1032: ; %bb.0: ; %entry 1708; GFX1032-NEXT: v_mov_b32_e32 v1, v0 1709; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 1710; GFX1032-NEXT: v_mov_b32_e32 v1, 0 1711; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 1712; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 1713; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 1714; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 1715; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 
row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 1716; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 1717; GFX1032-NEXT: v_mov_b32_e32 v2, v1 1718; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 1719; GFX1032-NEXT: s_mov_b32 exec_lo, s2 1720; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1721; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 1722; GFX1032-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 1723; GFX1032-NEXT: v_mov_b32_e32 v3, 0 1724; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 1725; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 1726; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 1727; GFX1032-NEXT: s_mov_b32 exec_lo, s2 1728; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 1729; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 1730; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 1731; GFX1032-NEXT: s_mov_b32 exec_lo, s2 1732; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 1733; GFX1032-NEXT: s_mov_b32 s2, -1 1734; GFX1032-NEXT: ; implicit-def: $vgpr0 1735; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 1736; GFX1032-NEXT: s_cbranch_execz BB9_2 1737; GFX1032-NEXT: ; %bb.1: 1738; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo 1739; GFX1032-NEXT: v_mov_b32_e32 v4, s4 1740; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1741; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 1742; GFX1032-NEXT: ds_sub_rtn_u32 v0, v7, v4 1743; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1744; GFX1032-NEXT: buffer_gl0_inv 1745; GFX1032-NEXT: BB9_2: 1746; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 1747; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 1748; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 1749; GFX1032-NEXT: v_mov_b32_e32 v0, v3 1750; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s3, v0 1751; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 1752; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1753; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 1754; GFX1032-NEXT: s_endpgm 1755entry: 1756 %lane = call i32 @llvm.amdgcn.workitem.id.x() 1757 %old = 
atomicrmw sub i32 addrspace(3)* @local_var32, i32 %lane acq_rel 1758 store i32 %old, i32 addrspace(1)* %out 1759 ret void 1760} 1761 1762define amdgpu_kernel void @sub_i32_varying_nouse() { 1763; GFX7LESS-LABEL: sub_i32_varying_nouse: 1764; GFX7LESS: ; %bb.0: ; %entry 1765; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 1766; GFX7LESS-NEXT: s_mov_b32 m0, -1 1767; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1768; GFX7LESS-NEXT: ds_sub_u32 v1, v0 1769; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1770; GFX7LESS-NEXT: s_endpgm 1771; 1772; GFX8-LABEL: sub_i32_varying_nouse: 1773; GFX8: ; %bb.0: ; %entry 1774; GFX8-NEXT: v_mov_b32_e32 v1, v0 1775; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 1776; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 1777; GFX8-NEXT: s_not_b64 exec, exec 1778; GFX8-NEXT: v_mov_b32_e32 v1, 0 1779; GFX8-NEXT: s_not_b64 exec, exec 1780; GFX8-NEXT: s_or_saveexec_b64 s[0:1], -1 1781; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 1782; GFX8-NEXT: s_nop 1 1783; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 1784; GFX8-NEXT: s_nop 1 1785; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 1786; GFX8-NEXT: s_nop 1 1787; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 1788; GFX8-NEXT: s_nop 1 1789; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf 1790; GFX8-NEXT: s_nop 1 1791; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf 1792; GFX8-NEXT: v_readlane_b32 s2, v1, 63 1793; GFX8-NEXT: s_mov_b64 exec, s[0:1] 1794; GFX8-NEXT: s_mov_b32 s0, s2 1795; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1796; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 1797; GFX8-NEXT: s_cbranch_execz BB10_2 1798; GFX8-NEXT: ; %bb.1: 1799; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 1800; GFX8-NEXT: v_mov_b32_e32 v2, s0 1801; GFX8-NEXT: s_mov_b32 m0, -1 1802; 
GFX8-NEXT: s_waitcnt lgkmcnt(0) 1803; GFX8-NEXT: ds_sub_u32 v0, v2 1804; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1805; GFX8-NEXT: BB10_2: 1806; GFX8-NEXT: s_endpgm 1807; 1808; GFX9-LABEL: sub_i32_varying_nouse: 1809; GFX9: ; %bb.0: ; %entry 1810; GFX9-NEXT: v_mov_b32_e32 v1, v0 1811; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 1812; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 1813; GFX9-NEXT: s_not_b64 exec, exec 1814; GFX9-NEXT: v_mov_b32_e32 v1, 0 1815; GFX9-NEXT: s_not_b64 exec, exec 1816; GFX9-NEXT: s_or_saveexec_b64 s[0:1], -1 1817; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 1818; GFX9-NEXT: s_nop 1 1819; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 1820; GFX9-NEXT: s_nop 1 1821; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 1822; GFX9-NEXT: s_nop 1 1823; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 1824; GFX9-NEXT: s_nop 1 1825; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf 1826; GFX9-NEXT: s_nop 1 1827; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf 1828; GFX9-NEXT: v_readlane_b32 s2, v1, 63 1829; GFX9-NEXT: s_mov_b64 exec, s[0:1] 1830; GFX9-NEXT: s_mov_b32 s0, s2 1831; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1832; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 1833; GFX9-NEXT: s_cbranch_execz BB10_2 1834; GFX9-NEXT: ; %bb.1: 1835; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 1836; GFX9-NEXT: v_mov_b32_e32 v2, s0 1837; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1838; GFX9-NEXT: ds_sub_u32 v0, v2 1839; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1840; GFX9-NEXT: BB10_2: 1841; GFX9-NEXT: s_endpgm 1842; 1843; GFX1064-LABEL: sub_i32_varying_nouse: 1844; GFX1064: ; %bb.0: ; %entry 1845; GFX1064-NEXT: v_mov_b32_e32 v1, v0 1846; GFX1064-NEXT: s_not_b64 exec, exec 1847; GFX1064-NEXT: v_mov_b32_e32 v1, 0 1848; GFX1064-NEXT: s_not_b64 exec, exec 1849; GFX1064-NEXT: 
s_or_saveexec_b64 s[0:1], -1 1850; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 1851; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 1852; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 1853; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 1854; GFX1064-NEXT: v_mov_b32_e32 v2, v1 1855; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 1856; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 1857; GFX1064-NEXT: v_readlane_b32 s2, v1, 31 1858; GFX1064-NEXT: v_mov_b32_e32 v2, s2 1859; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 1860; GFX1064-NEXT: s_mov_b64 exec, s[0:1] 1861; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 1862; GFX1064-NEXT: s_or_saveexec_b64 s[0:1], -1 1863; GFX1064-NEXT: v_readlane_b32 s2, v1, 63 1864; GFX1064-NEXT: s_mov_b64 exec, s[0:1] 1865; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0 1866; GFX1064-NEXT: s_mov_b32 s0, s2 1867; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1868; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc 1869; GFX1064-NEXT: s_cbranch_execz BB10_2 1870; GFX1064-NEXT: ; %bb.1: 1871; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 1872; GFX1064-NEXT: v_mov_b32_e32 v3, s0 1873; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1874; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 1875; GFX1064-NEXT: ds_sub_u32 v0, v3 1876; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1877; GFX1064-NEXT: buffer_gl0_inv 1878; GFX1064-NEXT: BB10_2: 1879; GFX1064-NEXT: s_endpgm 1880; 1881; GFX1032-LABEL: sub_i32_varying_nouse: 1882; GFX1032: ; %bb.0: ; %entry 1883; GFX1032-NEXT: v_mov_b32_e32 v1, v0 1884; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 1885; GFX1032-NEXT: v_mov_b32_e32 v1, 0 1886; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 1887; GFX1032-NEXT: s_or_saveexec_b32 s0, -1 1888; GFX1032-NEXT: 
v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 1889; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 1890; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 1891; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 1892; GFX1032-NEXT: v_mov_b32_e32 v2, v1 1893; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 1894; GFX1032-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 1895; GFX1032-NEXT: v_readlane_b32 s1, v1, 31 1896; GFX1032-NEXT: s_mov_b32 exec_lo, s0 1897; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 1898; GFX1032-NEXT: s_mov_b32 s0, s1 1899; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 1900; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo 1901; GFX1032-NEXT: s_cbranch_execz BB10_2 1902; GFX1032-NEXT: ; %bb.1: 1903; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 1904; GFX1032-NEXT: v_mov_b32_e32 v3, s0 1905; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1906; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 1907; GFX1032-NEXT: ds_sub_u32 v0, v3 1908; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1909; GFX1032-NEXT: buffer_gl0_inv 1910; GFX1032-NEXT: BB10_2: 1911; GFX1032-NEXT: s_endpgm 1912entry: 1913 %lane = call i32 @llvm.amdgcn.workitem.id.x() 1914 %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 %lane acq_rel 1915 ret void 1916} 1917 1918define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) { 1919; 1920; 1921; GFX7LESS-LABEL: sub_i64_constant: 1922; GFX7LESS: ; %bb.0: ; %entry 1923; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec 1924; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1925; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 1926; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 1927; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1928; GFX7LESS-NEXT: ; implicit-def: $vgpr1_vgpr2 1929; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc 1930; GFX7LESS-NEXT: 
s_cbranch_execz BB11_2 1931; GFX7LESS-NEXT: ; %bb.1: 1932; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 1933; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0 1934; GFX7LESS-NEXT: s_mul_i32 s4, s4, 5 1935; GFX7LESS-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 1936; GFX7LESS-NEXT: v_mov_b32_e32 v1, s4 1937; GFX7LESS-NEXT: s_mov_b32 m0, -1 1938; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1939; GFX7LESS-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] 1940; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1941; GFX7LESS-NEXT: BB11_2: 1942; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] 1943; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1944; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v1 1945; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v2 1946; GFX7LESS-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v0 1947; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1948; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 1949; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4 1950; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 1951; GFX7LESS-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc 1952; GFX7LESS-NEXT: s_mov_b32 s2, -1 1953; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1954; GFX7LESS-NEXT: s_endpgm 1955; 1956; GFX8-LABEL: sub_i64_constant: 1957; GFX8: ; %bb.0: ; %entry 1958; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1959; GFX8-NEXT: s_mov_b64 s[4:5], exec 1960; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 1961; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 1962; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1963; GFX8-NEXT: ; implicit-def: $vgpr1_vgpr2 1964; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 1965; GFX8-NEXT: s_cbranch_execz BB11_2 1966; GFX8-NEXT: ; %bb.1: 1967; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 1968; GFX8-NEXT: s_mul_i32 s4, s4, 5 1969; GFX8-NEXT: v_mov_b32_e32 v1, s4 1970; GFX8-NEXT: v_mov_b32_e32 v2, 0 1971; GFX8-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 1972; GFX8-NEXT: s_mov_b32 m0, -1 1973; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1974; GFX8-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] 1975; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1976; GFX8-NEXT: BB11_2: 1977; GFX8-NEXT: s_or_b64 exec, 
exec, s[2:3] 1978; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1979; GFX8-NEXT: v_readfirstlane_b32 s3, v2 1980; GFX8-NEXT: v_readfirstlane_b32 s2, v1 1981; GFX8-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v0 1982; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1983; GFX8-NEXT: v_mov_b32_e32 v2, s3 1984; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s2, v0 1985; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc 1986; GFX8-NEXT: s_mov_b32 s3, 0xf000 1987; GFX8-NEXT: s_mov_b32 s2, -1 1988; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1989; GFX8-NEXT: s_endpgm 1990; 1991; GFX9-LABEL: sub_i64_constant: 1992; GFX9: ; %bb.0: ; %entry 1993; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1994; GFX9-NEXT: s_mov_b64 s[4:5], exec 1995; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 1996; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 1997; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1998; GFX9-NEXT: ; implicit-def: $vgpr1_vgpr2 1999; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 2000; GFX9-NEXT: s_cbranch_execz BB11_2 2001; GFX9-NEXT: ; %bb.1: 2002; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 2003; GFX9-NEXT: s_mul_i32 s4, s4, 5 2004; GFX9-NEXT: v_mov_b32_e32 v1, s4 2005; GFX9-NEXT: v_mov_b32_e32 v2, 0 2006; GFX9-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 2007; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2008; GFX9-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] 2009; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2010; GFX9-NEXT: BB11_2: 2011; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 2012; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2013; GFX9-NEXT: v_readfirstlane_b32 s3, v2 2014; GFX9-NEXT: v_readfirstlane_b32 s2, v1 2015; GFX9-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v0 2016; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v0 2017; GFX9-NEXT: v_mov_b32_e32 v2, s3 2018; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s2, v0 2019; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc 2020; GFX9-NEXT: s_mov_b32 s3, 0xf000 2021; GFX9-NEXT: s_mov_b32 s2, -1 2022; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2023; GFX9-NEXT: s_endpgm 2024; 2025; GFX1064-LABEL: sub_i64_constant: 2026; GFX1064: ; %bb.0: ; 
%entry 2027; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2028; GFX1064-NEXT: s_mov_b64 s[4:5], exec 2029; GFX1064-NEXT: ; implicit-def: $vgpr1_vgpr2 2030; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 2031; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s5, v0 2032; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2033; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc 2034; GFX1064-NEXT: s_cbranch_execz BB11_2 2035; GFX1064-NEXT: ; %bb.1: 2036; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 2037; GFX1064-NEXT: v_mov_b32_e32 v2, 0 2038; GFX1064-NEXT: s_mul_i32 s4, s4, 5 2039; GFX1064-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 2040; GFX1064-NEXT: v_mov_b32_e32 v1, s4 2041; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2042; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 2043; GFX1064-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] 2044; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2045; GFX1064-NEXT: buffer_gl0_inv 2046; GFX1064-NEXT: BB11_2: 2047; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 2048; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] 2049; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 2050; GFX1064-NEXT: v_mul_u32_u24_e32 v1, 5, v0 2051; GFX1064-NEXT: v_readfirstlane_b32 s3, v2 2052; GFX1064-NEXT: v_mul_hi_u32_u24_e32 v2, 5, v0 2053; GFX1064-NEXT: v_sub_co_u32_e64 v0, vcc, s2, v1 2054; GFX1064-NEXT: s_mov_b32 s2, -1 2055; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v2, vcc 2056; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 2057; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2058; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2059; GFX1064-NEXT: s_endpgm 2060; 2061; GFX1032-LABEL: sub_i64_constant: 2062; GFX1032: ; %bb.0: ; %entry 2063; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2064; GFX1032-NEXT: s_mov_b32 s3, exec_lo 2065; GFX1032-NEXT: ; implicit-def: $vgpr1_vgpr2 2066; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s3, 0 2067; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 2068; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo 2069; GFX1032-NEXT: s_cbranch_execz BB11_2 2070; GFX1032-NEXT: ; %bb.1: 2071; GFX1032-NEXT: 
s_bcnt1_i32_b32 s3, s3 2072; GFX1032-NEXT: v_mov_b32_e32 v2, 0 2073; GFX1032-NEXT: s_mul_i32 s3, s3, 5 2074; GFX1032-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 2075; GFX1032-NEXT: v_mov_b32_e32 v1, s3 2076; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2077; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 2078; GFX1032-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] 2079; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2080; GFX1032-NEXT: buffer_gl0_inv 2081; GFX1032-NEXT: BB11_2: 2082; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 2083; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 2084; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 2085; GFX1032-NEXT: v_mul_u32_u24_e32 v1, 5, v0 2086; GFX1032-NEXT: v_readfirstlane_b32 s3, v2 2087; GFX1032-NEXT: v_mul_hi_u32_u24_e32 v2, 5, v0 2088; GFX1032-NEXT: v_sub_co_u32_e64 v0, vcc_lo, s2, v1 2089; GFX1032-NEXT: s_mov_b32 s2, -1 2090; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v2, vcc_lo 2091; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 2092; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2093; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2094; GFX1032-NEXT: s_endpgm 2095entry: 2096 %old = atomicrmw sub i64 addrspace(3)* @local_var64, i64 5 acq_rel 2097 store i64 %old, i64 addrspace(1)* %out 2098 ret void 2099} 2100 2101define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 %subitive) { 2102; 2103; 2104; GFX7LESS-LABEL: sub_i64_uniform: 2105; GFX7LESS: ; %bb.0: ; %entry 2106; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec 2107; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 2108; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 2109; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s7, v0 2110; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2111; GFX7LESS-NEXT: ; implicit-def: $vgpr1_vgpr2 2112; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc 2113; GFX7LESS-NEXT: s_cbranch_execz BB12_2 2114; GFX7LESS-NEXT: ; %bb.1: 2115; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 2116; GFX7LESS-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 2117; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2118; 
GFX7LESS-NEXT: s_mul_i32 s7, s3, s6 2119; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6 2120; GFX7LESS-NEXT: v_mul_hi_u32 v1, s2, v1 2121; GFX7LESS-NEXT: s_mul_i32 s6, s2, s6 2122; GFX7LESS-NEXT: v_add_i32_e32 v2, vcc, s7, v1 2123; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6 2124; GFX7LESS-NEXT: s_mov_b32 m0, -1 2125; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2126; GFX7LESS-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] 2127; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2128; GFX7LESS-NEXT: BB12_2: 2129; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] 2130; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 2131; GFX7LESS-NEXT: s_mov_b32 s6, -1 2132; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2133; GFX7LESS-NEXT: s_mov_b32 s4, s0 2134; GFX7LESS-NEXT: s_mov_b32 s5, s1 2135; GFX7LESS-NEXT: v_readfirstlane_b32 s0, v1 2136; GFX7LESS-NEXT: v_readfirstlane_b32 s1, v2 2137; GFX7LESS-NEXT: v_mul_lo_u32 v1, s3, v0 2138; GFX7LESS-NEXT: v_mul_hi_u32 v2, s2, v0 2139; GFX7LESS-NEXT: v_mul_lo_u32 v0, s2, v0 2140; GFX7LESS-NEXT: v_add_i32_e32 v1, vcc, v2, v1 2141; GFX7LESS-NEXT: v_mov_b32_e32 v2, s1 2142; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 2143; GFX7LESS-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc 2144; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 2145; GFX7LESS-NEXT: s_endpgm 2146; 2147; GFX8-LABEL: sub_i64_uniform: 2148; GFX8: ; %bb.0: ; %entry 2149; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 2150; GFX8-NEXT: s_mov_b64 s[6:7], exec 2151; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 2152; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 2153; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2154; GFX8-NEXT: ; implicit-def: $vgpr1_vgpr2 2155; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 2156; GFX8-NEXT: s_cbranch_execz BB12_2 2157; GFX8-NEXT: ; %bb.1: 2158; GFX8-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 2159; GFX8-NEXT: v_mov_b32_e32 v1, s6 2160; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2161; GFX8-NEXT: v_mul_hi_u32 v1, s2, v1 2162; GFX8-NEXT: s_mul_i32 s7, s3, s6 2163; GFX8-NEXT: s_mul_i32 s6, s2, s6 2164; GFX8-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 2165; 
GFX8-NEXT: v_add_u32_e32 v2, vcc, s7, v1 2166; GFX8-NEXT: v_mov_b32_e32 v1, s6 2167; GFX8-NEXT: s_mov_b32 m0, -1 2168; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2169; GFX8-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] 2170; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2171; GFX8-NEXT: BB12_2: 2172; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 2173; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2174; GFX8-NEXT: s_mov_b32 s4, s0 2175; GFX8-NEXT: v_readfirstlane_b32 s0, v1 2176; GFX8-NEXT: v_mul_lo_u32 v1, s3, v0 2177; GFX8-NEXT: v_mul_hi_u32 v3, s2, v0 2178; GFX8-NEXT: v_mul_lo_u32 v0, s2, v0 2179; GFX8-NEXT: s_mov_b32 s5, s1 2180; GFX8-NEXT: v_readfirstlane_b32 s1, v2 2181; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 2182; GFX8-NEXT: v_mov_b32_e32 v2, s1 2183; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s0, v0 2184; GFX8-NEXT: s_mov_b32 s7, 0xf000 2185; GFX8-NEXT: s_mov_b32 s6, -1 2186; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc 2187; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 2188; GFX8-NEXT: s_endpgm 2189; 2190; GFX9-LABEL: sub_i64_uniform: 2191; GFX9: ; %bb.0: ; %entry 2192; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 2193; GFX9-NEXT: s_mov_b64 s[6:7], exec 2194; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 2195; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 2196; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2197; GFX9-NEXT: ; implicit-def: $vgpr1_vgpr2 2198; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 2199; GFX9-NEXT: s_cbranch_execz BB12_2 2200; GFX9-NEXT: ; %bb.1: 2201; GFX9-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 2202; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2203; GFX9-NEXT: s_mul_i32 s7, s3, s6 2204; GFX9-NEXT: s_mul_hi_u32 s8, s2, s6 2205; GFX9-NEXT: s_add_i32 s8, s8, s7 2206; GFX9-NEXT: s_mul_i32 s6, s2, s6 2207; GFX9-NEXT: v_mov_b32_e32 v1, s6 2208; GFX9-NEXT: v_mov_b32_e32 v2, s8 2209; GFX9-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 2210; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2211; GFX9-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] 2212; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2213; GFX9-NEXT: BB12_2: 2214; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 
2215; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2216; GFX9-NEXT: v_mul_lo_u32 v3, s3, v0 2217; GFX9-NEXT: v_mul_hi_u32 v4, s2, v0 2218; GFX9-NEXT: v_mul_lo_u32 v0, s2, v0 2219; GFX9-NEXT: s_mov_b32 s4, s0 2220; GFX9-NEXT: v_readfirstlane_b32 s0, v1 2221; GFX9-NEXT: s_mov_b32 s5, s1 2222; GFX9-NEXT: v_readfirstlane_b32 s1, v2 2223; GFX9-NEXT: v_add_u32_e32 v1, v4, v3 2224; GFX9-NEXT: v_mov_b32_e32 v2, s1 2225; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s0, v0 2226; GFX9-NEXT: s_mov_b32 s7, 0xf000 2227; GFX9-NEXT: s_mov_b32 s6, -1 2228; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc 2229; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 2230; GFX9-NEXT: s_endpgm 2231; 2232; GFX1064-LABEL: sub_i64_uniform: 2233; GFX1064: ; %bb.0: ; %entry 2234; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 2235; GFX1064-NEXT: s_mov_b64 s[6:7], exec 2236; GFX1064-NEXT: ; implicit-def: $vgpr1_vgpr2 2237; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 2238; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s7, v0 2239; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2240; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 2241; GFX1064-NEXT: s_cbranch_execz BB12_2 2242; GFX1064-NEXT: ; %bb.1: 2243; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 2244; GFX1064-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 2245; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2246; GFX1064-NEXT: s_mul_i32 s7, s3, s6 2247; GFX1064-NEXT: s_mul_hi_u32 s8, s2, s6 2248; GFX1064-NEXT: s_mul_i32 s6, s2, s6 2249; GFX1064-NEXT: s_add_i32 s8, s8, s7 2250; GFX1064-NEXT: v_mov_b32_e32 v1, s6 2251; GFX1064-NEXT: v_mov_b32_e32 v2, s8 2252; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2253; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 2254; GFX1064-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] 2255; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2256; GFX1064-NEXT: buffer_gl0_inv 2257; GFX1064-NEXT: BB12_2: 2258; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 2259; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 2260; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2261; GFX1064-NEXT: v_mul_lo_u32 v3, s3, v0 2262; 
GFX1064-NEXT: v_mul_hi_u32 v4, s2, v0 2263; GFX1064-NEXT: v_mul_lo_u32 v0, s2, v0 2264; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 2265; GFX1064-NEXT: v_readfirstlane_b32 s4, v2 2266; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 2267; GFX1064-NEXT: v_add_nc_u32_e32 v1, v4, v3 2268; GFX1064-NEXT: v_sub_co_u32_e64 v0, vcc, s2, v0 2269; GFX1064-NEXT: s_mov_b32 s2, -1 2270; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s4, v1, vcc 2271; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2272; GFX1064-NEXT: s_endpgm 2273; 2274; GFX1032-LABEL: sub_i64_uniform: 2275; GFX1032: ; %bb.0: ; %entry 2276; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 2277; GFX1032-NEXT: s_mov_b32 s5, exec_lo 2278; GFX1032-NEXT: ; implicit-def: $vgpr1_vgpr2 2279; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s5, 0 2280; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 2281; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo 2282; GFX1032-NEXT: s_cbranch_execz BB12_2 2283; GFX1032-NEXT: ; %bb.1: 2284; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s5 2285; GFX1032-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 2286; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2287; GFX1032-NEXT: s_mul_i32 s6, s3, s5 2288; GFX1032-NEXT: s_mul_hi_u32 s7, s2, s5 2289; GFX1032-NEXT: s_mul_i32 s5, s2, s5 2290; GFX1032-NEXT: s_add_i32 s7, s7, s6 2291; GFX1032-NEXT: v_mov_b32_e32 v1, s5 2292; GFX1032-NEXT: v_mov_b32_e32 v2, s7 2293; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2294; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 2295; GFX1032-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] 2296; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2297; GFX1032-NEXT: buffer_gl0_inv 2298; GFX1032-NEXT: BB12_2: 2299; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 2300; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 2301; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2302; GFX1032-NEXT: v_mul_lo_u32 v3, s3, v0 2303; GFX1032-NEXT: v_mul_hi_u32 v4, s2, v0 2304; GFX1032-NEXT: v_mul_lo_u32 v0, s2, v0 2305; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 2306; GFX1032-NEXT: v_readfirstlane_b32 s4, v2 2307; GFX1032-NEXT: s_mov_b32 
s3, 0x31016000
; GFX1032-NEXT: v_add_nc_u32_e32 v1, v4, v3
; GFX1032-NEXT: v_sub_co_u32_e64 v0, vcc_lo, s2, v0
; GFX1032-NEXT: s_mov_b32 s2, -1
; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s4, v1, vcc_lo
; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX1032-NEXT: s_endpgm
entry:
  %old = atomicrmw sub i64 addrspace(3)* @local_var64, i64 %subitive acq_rel
  store i64 %old, i64 addrspace(1)* %out
  ret void
}

; sub_i64_varying - 64-bit atomic sub on @local_var64 (addrspace(3)) with a
; per-lane operand: each work-item zero-extends its workitem.id.x to i64,
; subtracts it with acq_rel ordering and stores the returned old value to %out.
; In the expected output of every run below there is no DPP step and no
; single-lane s_and_saveexec branch; each run shows one ds_sub_rtn_u64 whose
; data operand is v[0:1], with v1 set to 0 beforehand.
define amdgpu_kernel void @sub_i64_varying(i64 addrspace(1)* %out) {
;
;
; GFX7LESS-LABEL: sub_i64_varying:
; GFX7LESS: ; %bb.0: ; %entry
; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0
; GFX7LESS-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo
; GFX7LESS-NEXT: s_mov_b32 m0, -1
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: ds_sub_rtn_u64 v[0:1], v2, v[0:1]
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX7LESS-NEXT: s_endpgm
;
; GFX8-LABEL: sub_i64_varying:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo
; GFX8-NEXT: s_mov_b32 m0, -1
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: ds_sub_rtn_u64 v[0:1], v2, v[0:1]
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: sub_i64_varying:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: ds_sub_rtn_u64 v[0:1], v2, v[0:1]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: sub_i64_varying:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: s_mov_b32 s3, 0x31016000
; GFX10-NEXT: s_mov_b32 s2, -1
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: ds_sub_rtn_u64 v[0:1], v2, v[0:1]
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX10-NEXT: s_endpgm
entry:
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %zext = zext i32 %lane to i64
  %old = atomicrmw sub i64 addrspace(3)* @local_var64, i64 %zext acq_rel
  store i64 %old, i64 addrspace(1)* %out
  ret void
}

; and_i32_varying - 32-bit atomic AND on @local_var32 (addrspace(3)) with a
; per-lane operand: each work-item ANDs its workitem.id.x into the variable
; with acq_rel ordering and stores the returned old value to %out.
; Expected output: the GFX7LESS run issues ds_and_rtn_b32 directly on v0.
; The GFX8, GFX9 and both GFX10 wave-size runs instead write -1 into the lanes
; selected by the inverted exec mask, combine lanes through a chain of
; v_and_b32_dpp steps (the GFX10 runs also use v_permlanex16_b32 and
; v_readlane/v_writelane), execute a single ds_and_rtn_b32 inside an
; s_and_saveexec region entered when the v_mbcnt result equals 0, and finally
; AND the v_readfirstlane_b32 result with a per-lane value before the store.
define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) {
;
;
; GFX7LESS-LABEL: and_i32_varying:
; GFX7LESS: ; %bb.0: ; %entry
; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo
; GFX7LESS-NEXT: s_mov_b32 m0, -1
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: ds_and_rtn_b32 v0, v1, v0
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX7LESS-NEXT: s_endpgm
;
; GFX8-LABEL: and_i32_varying:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX8-NEXT: v_mov_b32_e32 v1, -1
; GFX8-NEXT: s_mov_b64 exec, s[2:3]
; GFX8-NEXT: s_not_b64 exec, exec
; GFX8-NEXT: v_mov_b32_e32 v2, -1
; GFX8-NEXT: s_not_b64 exec, exec
; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX8-NEXT: s_nop 1
; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX8-NEXT: s_nop 1
; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX8-NEXT: s_nop 1
; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX8-NEXT: s_nop 1
; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
; GFX8-NEXT: s_nop 1
; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
; GFX8-NEXT: v_readlane_b32 s4, v2, 63
; GFX8-NEXT: s_nop 0
; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX8-NEXT: s_mov_b64 exec, s[2:3]
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX8-NEXT: ; implicit-def: $vgpr0
; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX8-NEXT: s_cbranch_execz BB14_2
; GFX8-NEXT: ; %bb.1:
; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
; GFX8-NEXT: v_mov_b32_e32 v3, s4
; GFX8-NEXT: s_mov_b32 m0, -1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: ds_and_rtn_b32 v0, v0, v3
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: BB14_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s2, v0
; GFX8-NEXT: v_mov_b32_e32 v0, v1
; GFX8-NEXT: v_and_b32_e32 v0, s2, v0
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
; GFX8-NEXT: s_nop 0
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: and_i32_varying:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, v0
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX9-NEXT: v_mov_b32_e32 v1, -1
; GFX9-NEXT: s_mov_b64 exec, s[2:3]
; GFX9-NEXT: s_not_b64 exec, exec
; GFX9-NEXT: v_mov_b32_e32 v2, -1
; GFX9-NEXT: s_not_b64 exec, exec
; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX9-NEXT: s_nop 1
; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX9-NEXT: s_nop 1
; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX9-NEXT: s_nop 1
; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX9-NEXT: s_nop 1
; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
; GFX9-NEXT: s_nop 1
; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
; GFX9-NEXT: v_readlane_b32 s4, v2, 63
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX9-NEXT: s_mov_b64 exec, s[2:3]
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: ; implicit-def: $vgpr0
; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT: s_cbranch_execz BB14_2
; GFX9-NEXT: ; %bb.1:
; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
; GFX9-NEXT: v_mov_b32_e32 v3, s4
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: ds_and_rtn_b32 v0, v0, v3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: BB14_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s2, v0
; GFX9-NEXT: v_mov_b32_e32 v0, v1
; GFX9-NEXT: v_and_b32_e32 v0, s2, v0
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: and_i32_varying:
; GFX1064: ; %bb.0: ; %entry
; GFX1064-NEXT: v_mov_b32_e32 v1, v0
; GFX1064-NEXT: s_not_b64 exec, exec
; GFX1064-NEXT: v_mov_b32_e32 v1, -1
; GFX1064-NEXT: s_not_b64 exec, exec
; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX1064-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1064-NEXT: v_mov_b32_e32 v3, -1
; GFX1064-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1064-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1064-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX1064-NEXT: v_mov_b32_e32 v2, v1
; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1
; GFX1064-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1064-NEXT: v_readlane_b32 s4, v1, 31
; GFX1064-NEXT: v_mov_b32_e32 v2, s4
; GFX1064-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
; GFX1064-NEXT: v_readlane_b32 s4, v1, 15
; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1064-NEXT: s_mov_b64 exec, s[2:3]
; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX1064-NEXT: v_readlane_b32 s5, v1, 31
; GFX1064-NEXT: v_writelane_b32 v3, s4, 16
; GFX1064-NEXT: s_mov_b64 exec, s[2:3]
; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX1064-NEXT: v_readlane_b32 s7, v1, 63
; GFX1064-NEXT: v_readlane_b32 s6, v1, 47
; GFX1064-NEXT: v_writelane_b32 v3, s5, 32
; GFX1064-NEXT: s_mov_b64 exec, s[2:3]
; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0
; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX1064-NEXT: v_writelane_b32 v3, s6, 48
; GFX1064-NEXT: s_mov_b64 exec, s[4:5]
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT: s_mov_b32 s2, -1
; GFX1064-NEXT: ; implicit-def: $vgpr0
; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1064-NEXT: s_cbranch_execz BB14_2
; GFX1064-NEXT: ; %bb.1:
; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo
; GFX1064-NEXT: v_mov_b32_e32 v4, s7
; GFX1064-NEXT: s_mov_b32 s3, s7
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1064-NEXT: ds_and_rtn_b32 v0, v7, v4
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: BB14_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX1064-NEXT: v_readfirstlane_b32 s3, v0
; GFX1064-NEXT: v_mov_b32_e32 v0, v3
; GFX1064-NEXT: v_and_b32_e32 v0, s3, v0
; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: and_i32_varying:
; GFX1032: ; %bb.0: ; %entry
; GFX1032-NEXT: v_mov_b32_e32 v1, v0
; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1032-NEXT: v_mov_b32_e32 v1, -1
; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1032-NEXT: s_or_saveexec_b32 s2, -1
; GFX1032-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT: v_mov_b32_e32 v2, v1
; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1
; GFX1032-NEXT: s_mov_b32 exec_lo, s2
; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1032-NEXT: s_or_saveexec_b32 s2, -1
; GFX1032-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1032-NEXT: v_mov_b32_e32 v3, -1
; GFX1032-NEXT: v_readlane_b32 s3, v1, 15
; GFX1032-NEXT: v_readlane_b32 s4, v1, 31
; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT: s_mov_b32 exec_lo, s2
; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
; GFX1032-NEXT: s_or_saveexec_b32 s2, -1
; GFX1032-NEXT: v_writelane_b32 v3, s3, 16
; GFX1032-NEXT: s_mov_b32 exec_lo, s2
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: s_mov_b32 s2, -1
; GFX1032-NEXT: ; implicit-def: $vgpr0
; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
; GFX1032-NEXT: s_cbranch_execz BB14_2
; GFX1032-NEXT: ; %bb.1:
; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo
; GFX1032-NEXT: v_mov_b32_e32 v4, s4
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1032-NEXT: ds_and_rtn_b32 v0, v7, v4
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: BB14_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3
; GFX1032-NEXT: v_readfirstlane_b32 s3, v0
; GFX1032-NEXT: v_mov_b32_e32 v0, v3
; GFX1032-NEXT: v_and_b32_e32 v0, s3, v0
; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX1032-NEXT: s_endpgm
entry:
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %old = atomicrmw and i32 addrspace(3)* @local_var32, i32 %lane acq_rel
  store i32 %old, i32 addrspace(1)* %out
  ret void
}

; or_i32_varying - 32-bit atomic OR on @local_var32 (addrspace(3)) with a
; per-lane operand: each work-item ORs its workitem.id.x into the variable
; with acq_rel ordering and stores the returned old value to %out.
; Expected output: the GFX7LESS run issues ds_or_rtn_b32 directly on v0.
; The GFX8, GFX9 and both GFX10 wave-size runs instead write 0 into the lanes
; selected by the inverted exec mask, combine lanes through a chain of
; v_or_b32_dpp steps, execute a single ds_or_rtn_b32 inside an s_and_saveexec
; region entered when the v_mbcnt result equals 0, and finally OR the
; v_readfirstlane_b32 result with a per-lane value before the store.
define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) {
;
;
; GFX7LESS-LABEL: or_i32_varying:
; GFX7LESS: ; %bb.0: ; %entry
; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo
; GFX7LESS-NEXT: s_mov_b32 m0, -1
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: ds_or_rtn_b32 v0, v1, v0
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX7LESS-NEXT: s_endpgm
;
; GFX8-LABEL: or_i32_varying:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX8-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NEXT: s_mov_b64 exec, s[2:3]
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX8-NEXT: s_not_b64 exec, exec
; GFX8-NEXT: v_mov_b32_e32 v2, 0
; GFX8-NEXT: s_not_b64 exec, exec
; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX8-NEXT: s_nop 1
; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX8-NEXT: s_nop 1
; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX8-NEXT: s_nop 1
; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX8-NEXT: s_nop 1
; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
; GFX8-NEXT: s_nop 1
; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
; GFX8-NEXT: v_readlane_b32 s4, v2, 63
; GFX8-NEXT: s_nop 0
; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX8-NEXT: s_mov_b64 exec, s[2:3]
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX8-NEXT: ; implicit-def: $vgpr0
; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX8-NEXT: s_cbranch_execz BB15_2
; GFX8-NEXT: ; %bb.1:
; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
; GFX8-NEXT: v_mov_b32_e32 v3, s4
; GFX8-NEXT: s_mov_b32 m0, -1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: ds_or_rtn_b32 v0, v0, v3
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: BB15_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s2, v0
; GFX8-NEXT: v_mov_b32_e32 v0, v1
; GFX8-NEXT: v_or_b32_e32 v0, s2, v0
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
; GFX8-NEXT: s_nop 0
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: or_i32_varying:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, v0
; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_mov_b64 exec, s[2:3]
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX9-NEXT: s_not_b64 exec, exec
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_not_b64 exec, exec
; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX9-NEXT: s_nop 1
; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX9-NEXT: s_nop 1
; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX9-NEXT: s_nop 1
; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX9-NEXT: s_nop 1
; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
; GFX9-NEXT: s_nop 1
; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
; GFX9-NEXT: v_readlane_b32 s4, v2, 63
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX9-NEXT: s_mov_b64 exec, s[2:3]
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: ; implicit-def: $vgpr0
; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT: s_cbranch_execz BB15_2
; GFX9-NEXT: ; %bb.1:
; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
; GFX9-NEXT: v_mov_b32_e32 v3, s4
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: ds_or_rtn_b32 v0, v0, v3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: BB15_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s2, v0
; GFX9-NEXT: v_mov_b32_e32 v0, v1
; GFX9-NEXT: v_or_b32_e32 v0, s2, v0
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: or_i32_varying:
; GFX1064: ; %bb.0: ; %entry
; GFX1064-NEXT: v_mov_b32_e32 v1, v0
; GFX1064-NEXT: s_not_b64 exec, exec
; GFX1064-NEXT: v_mov_b32_e32 v1, 0
; GFX1064-NEXT: s_not_b64 exec, exec
; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX1064-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064-NEXT: v_mov_b32_e32 v3, 0
; GFX1064-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064-NEXT: v_mov_b32_e32 v2, v1
; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1
; GFX1064-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1064-NEXT: v_readlane_b32 s4, v1, 31
; GFX1064-NEXT: v_mov_b32_e32 v2, s4
; GFX1064-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
; GFX1064-NEXT: v_readlane_b32 s4, v1, 15
; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1064-NEXT: s_mov_b64 exec, s[2:3]
; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX1064-NEXT: v_readlane_b32 s5, v1, 31
; GFX1064-NEXT: v_writelane_b32 v3, s4, 16
; GFX1064-NEXT: s_mov_b64 exec, s[2:3]
; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX1064-NEXT: v_readlane_b32 s7, v1, 63
; GFX1064-NEXT: v_readlane_b32 s6, v1, 47
; GFX1064-NEXT: v_writelane_b32 v3, s5, 32
; GFX1064-NEXT: s_mov_b64 exec, s[2:3]
; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0
; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX1064-NEXT: v_writelane_b32 v3, s6, 48
; GFX1064-NEXT: s_mov_b64 exec, s[4:5]
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT: s_mov_b32 s2, -1
; GFX1064-NEXT: ; implicit-def: $vgpr0
; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1064-NEXT: s_cbranch_execz BB15_2
; GFX1064-NEXT: ; %bb.1:
; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo
; GFX1064-NEXT: v_mov_b32_e32 v4, s7
; GFX1064-NEXT: s_mov_b32 s3, s7
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1064-NEXT: ds_or_rtn_b32 v0, v7, v4
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: BB15_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX1064-NEXT: v_readfirstlane_b32 s3, v0
; GFX1064-NEXT: v_mov_b32_e32 v0, v3
; GFX1064-NEXT: v_or_b32_e32 v0, s3, v0
; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: or_i32_varying:
; GFX1032: ; %bb.0: ; %entry
; GFX1032-NEXT: v_mov_b32_e32 v1, v0
; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1032-NEXT: v_mov_b32_e32 v1, 0
; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1032-NEXT: s_or_saveexec_b32 s2, -1
; GFX1032-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032-NEXT: v_mov_b32_e32 v2, v1
; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1
; GFX1032-NEXT: s_mov_b32 exec_lo, s2
; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1032-NEXT: s_or_saveexec_b32 s2, -1
; GFX1032-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1032-NEXT: v_mov_b32_e32 v3, 0
; GFX1032-NEXT: v_readlane_b32 s3, v1, 15
; GFX1032-NEXT: v_readlane_b32 s4, v1, 31
; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT: s_mov_b32 exec_lo, s2
; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
; GFX1032-NEXT: s_or_saveexec_b32 s2, -1
; GFX1032-NEXT: v_writelane_b32 v3, s3, 16
; GFX1032-NEXT: s_mov_b32 exec_lo, s2
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: s_mov_b32 s2, -1
; GFX1032-NEXT: ; implicit-def: $vgpr0
; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
; GFX1032-NEXT: s_cbranch_execz BB15_2
; GFX1032-NEXT: ; %bb.1:
; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo
; GFX1032-NEXT: v_mov_b32_e32 v4, s4
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1032-NEXT: ds_or_rtn_b32 v0, v7, v4
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: BB15_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3
; GFX1032-NEXT: v_readfirstlane_b32 s3, v0
; GFX1032-NEXT: v_mov_b32_e32 v0, v3
; GFX1032-NEXT: v_or_b32_e32 v0, s3, v0
; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX1032-NEXT: s_endpgm
entry:
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %old = atomicrmw or i32 addrspace(3)* @local_var32, i32 %lane acq_rel
  store i32 %old, i32 addrspace(1)* %out
  ret void
}

; xor_i32_varying - 32-bit atomic XOR on @local_var32 (addrspace(3)) with a
; per-lane operand: each work-item XORs its workitem.id.x into the variable
; with acq_rel ordering and stores the returned old value to %out.
; Expected output: the GFX7LESS run issues ds_xor_rtn_b32 directly on v0.
; The GFX8, GFX9 and both GFX10 wave-size runs instead write 0 into the lanes
; selected by the inverted exec mask, combine lanes through a chain of
; v_xor_b32_dpp steps, execute a single ds_xor_rtn_b32 inside an s_and_saveexec
; region entered when the v_mbcnt result equals 0, and finally XOR the
; v_readfirstlane_b32 result with a per-lane value before the store.
define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) {
;
;
; GFX7LESS-LABEL: xor_i32_varying:
; GFX7LESS: ; %bb.0: ; %entry
; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo
; GFX7LESS-NEXT: s_mov_b32 m0, -1
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: ds_xor_rtn_b32 v0, v1, v0
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX7LESS-NEXT: s_endpgm
;
; GFX8-LABEL: xor_i32_varying:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX8-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NEXT: s_mov_b64 exec, s[2:3]
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX8-NEXT: s_not_b64 exec, exec
; GFX8-NEXT: v_mov_b32_e32 v2, 0
; GFX8-NEXT: s_not_b64 exec, exec
; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX8-NEXT: s_nop 1
; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX8-NEXT: s_nop 1
; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX8-NEXT: s_nop 1
; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX8-NEXT: s_nop 1
; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
; GFX8-NEXT: s_nop 1
; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
; GFX8-NEXT: v_readlane_b32 s4, v2, 63
; GFX8-NEXT: s_nop 0
; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX8-NEXT: s_mov_b64 exec, s[2:3]
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX8-NEXT: ; implicit-def: $vgpr0
; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX8-NEXT: s_cbranch_execz BB16_2
; GFX8-NEXT: ; %bb.1:
; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
; GFX8-NEXT: v_mov_b32_e32 v3, s4
; GFX8-NEXT: s_mov_b32 m0, -1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: ds_xor_rtn_b32 v0, v0, v3
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: BB16_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s2, v0
; GFX8-NEXT: v_mov_b32_e32 v0, v1
; GFX8-NEXT: v_xor_b32_e32 v0, s2, v0
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
; GFX8-NEXT: s_nop 0
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: xor_i32_varying:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, v0
; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_mov_b64 exec, s[2:3]
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX9-NEXT: s_not_b64 exec, exec
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_not_b64 exec, exec
; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX9-NEXT: s_nop 1
; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX9-NEXT: s_nop 1
; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX9-NEXT: s_nop 1
; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX9-NEXT: s_nop 1
; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
; GFX9-NEXT: s_nop 1
; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
; GFX9-NEXT: v_readlane_b32 s4, v2, 63
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX9-NEXT: s_mov_b64 exec, s[2:3]
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: ; implicit-def: $vgpr0
; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT: s_cbranch_execz BB16_2
; GFX9-NEXT: ; %bb.1:
; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
; GFX9-NEXT: v_mov_b32_e32 v3, s4
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: ds_xor_rtn_b32 v0, v0, v3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: BB16_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s2, v0
; GFX9-NEXT: v_mov_b32_e32 v0, v1
; GFX9-NEXT: v_xor_b32_e32 v0, s2, v0
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: xor_i32_varying:
; GFX1064: ; %bb.0: ; %entry
; GFX1064-NEXT: v_mov_b32_e32 v1, v0
; GFX1064-NEXT: s_not_b64 exec, exec
; GFX1064-NEXT: v_mov_b32_e32 v1, 0
; GFX1064-NEXT: s_not_b64 exec, exec
; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX1064-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064-NEXT: v_mov_b32_e32 v3, 0
; GFX1064-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064-NEXT: v_mov_b32_e32 v2, v1
; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1
; GFX1064-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1064-NEXT: v_readlane_b32 s4, v1, 31
; GFX1064-NEXT: v_mov_b32_e32 v2, s4
; GFX1064-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
; GFX1064-NEXT: v_readlane_b32 s4, v1, 15
; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1064-NEXT: s_mov_b64 exec, s[2:3]
; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX1064-NEXT: v_readlane_b32 s5, v1, 31
; GFX1064-NEXT: v_writelane_b32 v3, s4, 16
; GFX1064-NEXT: s_mov_b64 exec, s[2:3]
; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX1064-NEXT: v_readlane_b32 s7, v1, 63
; GFX1064-NEXT: v_readlane_b32 s6, v1, 47
; GFX1064-NEXT: v_writelane_b32 v3, s5, 32
; GFX1064-NEXT: s_mov_b64 exec, s[2:3]
; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0
; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX1064-NEXT: v_writelane_b32 v3, s6, 48
; GFX1064-NEXT: s_mov_b64 exec, s[4:5]
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT: s_mov_b32 s2, -1
; GFX1064-NEXT: ; implicit-def: $vgpr0
; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1064-NEXT: s_cbranch_execz BB16_2
; GFX1064-NEXT: ; %bb.1:
; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo
; GFX1064-NEXT: v_mov_b32_e32 v4, s7
; GFX1064-NEXT: s_mov_b32 s3, s7
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1064-NEXT: ds_xor_rtn_b32 v0, v7, v4
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: BB16_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX1064-NEXT: v_readfirstlane_b32 s3, v0
; GFX1064-NEXT: v_mov_b32_e32 v0, v3
; GFX1064-NEXT: v_xor_b32_e32 v0, s3, v0
; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: xor_i32_varying:
; GFX1032: ; %bb.0: ; %entry
; GFX1032-NEXT: v_mov_b32_e32 v1, v0
; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1032-NEXT: v_mov_b32_e32 v1, 0
; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1032-NEXT: s_or_saveexec_b32 s2, -1
; GFX1032-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032-NEXT: v_mov_b32_e32 v2, v1
; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1
; GFX1032-NEXT: s_mov_b32 exec_lo, s2
; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1032-NEXT: s_or_saveexec_b32 s2, -1
; GFX1032-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1032-NEXT: v_mov_b32_e32 v3, 0
; GFX1032-NEXT: v_readlane_b32 s3, v1, 15
; GFX1032-NEXT: v_readlane_b32 s4, v1, 31
; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT: s_mov_b32 exec_lo, s2
; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
; GFX1032-NEXT: s_or_saveexec_b32 s2, -1
; GFX1032-NEXT: v_writelane_b32 v3, s3, 16
; GFX1032-NEXT: s_mov_b32 exec_lo, s2
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: s_mov_b32 s2, -1
; GFX1032-NEXT: ; implicit-def: $vgpr0
; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
; GFX1032-NEXT: s_cbranch_execz BB16_2
; GFX1032-NEXT: ; %bb.1:
; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo
; GFX1032-NEXT: v_mov_b32_e32 v4, s4
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1032-NEXT: ds_xor_rtn_b32 v0, v7, v4
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: BB16_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3
; GFX1032-NEXT: v_readfirstlane_b32 s3, v0
; GFX1032-NEXT: v_mov_b32_e32 v0, v3
; GFX1032-NEXT: v_xor_b32_e32 v0, s3, v0
; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX1032-NEXT: s_endpgm
entry:
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %old = atomicrmw xor i32 addrspace(3)* @local_var32, i32 %lane acq_rel
  store i32 %old, i32 addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) {
;
;
; GFX7LESS-LABEL: max_i32_varying:
; GFX7LESS: ; %bb.0: ; %entry
; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo
; GFX7LESS-NEXT: s_mov_b32 m0, -1
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: ds_max_rtn_i32 v0, v1, v0
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX7LESS-NEXT: s_endpgm
;
; GFX8-LABEL: max_i32_varying:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX8-NEXT: v_bfrev_b32_e32 v1, 1
; GFX8-NEXT:
s_mov_b64 exec, s[2:3] 3113; GFX8-NEXT: s_not_b64 exec, exec 3114; GFX8-NEXT: v_mov_b32_e32 v2, v1 3115; GFX8-NEXT: s_not_b64 exec, exec 3116; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 3117; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3118; GFX8-NEXT: s_nop 1 3119; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 3120; GFX8-NEXT: s_nop 1 3121; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 3122; GFX8-NEXT: s_nop 1 3123; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 3124; GFX8-NEXT: s_nop 1 3125; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 3126; GFX8-NEXT: s_nop 1 3127; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 3128; GFX8-NEXT: v_readlane_b32 s4, v2, 63 3129; GFX8-NEXT: s_nop 0 3130; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 3131; GFX8-NEXT: s_mov_b64 exec, s[2:3] 3132; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3133; GFX8-NEXT: ; implicit-def: $vgpr0 3134; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 3135; GFX8-NEXT: s_cbranch_execz BB17_2 3136; GFX8-NEXT: ; %bb.1: 3137; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 3138; GFX8-NEXT: v_mov_b32_e32 v3, s4 3139; GFX8-NEXT: s_mov_b32 m0, -1 3140; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3141; GFX8-NEXT: ds_max_rtn_i32 v0, v0, v3 3142; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3143; GFX8-NEXT: BB17_2: 3144; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 3145; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3146; GFX8-NEXT: v_readfirstlane_b32 s2, v0 3147; GFX8-NEXT: v_mov_b32_e32 v0, v1 3148; GFX8-NEXT: v_max_i32_e32 v0, s2, v0 3149; GFX8-NEXT: s_mov_b32 s3, 0xf000 3150; GFX8-NEXT: s_mov_b32 s2, -1 3151; GFX8-NEXT: s_nop 0 3152; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 3153; GFX8-NEXT: s_endpgm 3154; 3155; GFX9-LABEL: max_i32_varying: 3156; GFX9: ; %bb.0: ; %entry 3157; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3158; GFX9-NEXT: v_mov_b32_e32 v2, v0 3159; 
GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3160; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 3161; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 3162; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 3163; GFX9-NEXT: s_mov_b64 exec, s[2:3] 3164; GFX9-NEXT: s_not_b64 exec, exec 3165; GFX9-NEXT: v_mov_b32_e32 v2, v1 3166; GFX9-NEXT: s_not_b64 exec, exec 3167; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 3168; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3169; GFX9-NEXT: s_nop 1 3170; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 3171; GFX9-NEXT: s_nop 1 3172; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 3173; GFX9-NEXT: s_nop 1 3174; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 3175; GFX9-NEXT: s_nop 1 3176; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 3177; GFX9-NEXT: s_nop 1 3178; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 3179; GFX9-NEXT: v_readlane_b32 s4, v2, 63 3180; GFX9-NEXT: s_nop 0 3181; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 3182; GFX9-NEXT: s_mov_b64 exec, s[2:3] 3183; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3184; GFX9-NEXT: ; implicit-def: $vgpr0 3185; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 3186; GFX9-NEXT: s_cbranch_execz BB17_2 3187; GFX9-NEXT: ; %bb.1: 3188; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 3189; GFX9-NEXT: v_mov_b32_e32 v3, s4 3190; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3191; GFX9-NEXT: ds_max_rtn_i32 v0, v0, v3 3192; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3193; GFX9-NEXT: BB17_2: 3194; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 3195; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3196; GFX9-NEXT: v_readfirstlane_b32 s2, v0 3197; GFX9-NEXT: v_mov_b32_e32 v0, v1 3198; GFX9-NEXT: v_max_i32_e32 v0, s2, v0 3199; GFX9-NEXT: s_mov_b32 s3, 0xf000 3200; GFX9-NEXT: s_mov_b32 s2, -1 3201; GFX9-NEXT: s_nop 0 3202; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 3203; GFX9-NEXT: s_endpgm 3204; 
3205; GFX1064-LABEL: max_i32_varying: 3206; GFX1064: ; %bb.0: ; %entry 3207; GFX1064-NEXT: v_mov_b32_e32 v2, v0 3208; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 3209; GFX1064-NEXT: v_bfrev_b32_e32 v1, 1 3210; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 3211; GFX1064-NEXT: s_not_b64 exec, exec 3212; GFX1064-NEXT: v_mov_b32_e32 v2, v1 3213; GFX1064-NEXT: s_not_b64 exec, exec 3214; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 3215; GFX1064-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3216; GFX1064-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 3217; GFX1064-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 3218; GFX1064-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 3219; GFX1064-NEXT: v_mov_b32_e32 v3, v2 3220; GFX1064-NEXT: v_permlanex16_b32 v3, v3, -1, -1 3221; GFX1064-NEXT: v_max_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 3222; GFX1064-NEXT: v_readlane_b32 s4, v2, 31 3223; GFX1064-NEXT: v_mov_b32_e32 v3, s4 3224; GFX1064-NEXT: v_max_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 3225; GFX1064-NEXT: v_readlane_b32 s4, v2, 15 3226; GFX1064-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3227; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 3228; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3229; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 3230; GFX1064-NEXT: v_readlane_b32 s5, v2, 31 3231; GFX1064-NEXT: v_writelane_b32 v1, s4, 16 3232; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 3233; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 3234; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 3235; GFX1064-NEXT: v_readlane_b32 s7, v2, 63 3236; GFX1064-NEXT: v_readlane_b32 s6, v2, 47 3237; GFX1064-NEXT: v_writelane_b32 v1, s5, 32 3238; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 3239; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0 3240; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 3241; GFX1064-NEXT: v_writelane_b32 v1, s6, 48 3242; GFX1064-NEXT: s_mov_b64 exec, 
s[4:5] 3243; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3244; GFX1064-NEXT: s_mov_b32 s2, -1 3245; GFX1064-NEXT: ; implicit-def: $vgpr0 3246; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 3247; GFX1064-NEXT: s_cbranch_execz BB17_2 3248; GFX1064-NEXT: ; %bb.1: 3249; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo 3250; GFX1064-NEXT: v_mov_b32_e32 v4, s7 3251; GFX1064-NEXT: s_mov_b32 s3, s7 3252; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3253; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 3254; GFX1064-NEXT: ds_max_rtn_i32 v0, v7, v4 3255; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3256; GFX1064-NEXT: buffer_gl0_inv 3257; GFX1064-NEXT: BB17_2: 3258; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 3259; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 3260; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 3261; GFX1064-NEXT: v_mov_b32_e32 v0, v1 3262; GFX1064-NEXT: v_max_i32_e32 v0, s3, v0 3263; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 3264; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3265; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 3266; GFX1064-NEXT: s_endpgm 3267; 3268; GFX1032-LABEL: max_i32_varying: 3269; GFX1032: ; %bb.0: ; %entry 3270; GFX1032-NEXT: v_mov_b32_e32 v2, v0 3271; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3272; GFX1032-NEXT: v_bfrev_b32_e32 v1, 1 3273; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3274; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 3275; GFX1032-NEXT: v_mov_b32_e32 v2, v1 3276; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 3277; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3278; GFX1032-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3279; GFX1032-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 3280; GFX1032-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 3281; GFX1032-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 3282; GFX1032-NEXT: v_mov_b32_e32 v3, v2 3283; GFX1032-NEXT: v_permlanex16_b32 v3, v3, -1, -1 3284; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3285; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3286; 
GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3287; GFX1032-NEXT: v_max_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 3288; GFX1032-NEXT: v_readlane_b32 s3, v2, 15 3289; GFX1032-NEXT: v_readlane_b32 s4, v2, 31 3290; GFX1032-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3291; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3292; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 3293; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3294; GFX1032-NEXT: v_writelane_b32 v1, s3, 16 3295; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3296; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 3297; GFX1032-NEXT: s_mov_b32 s2, -1 3298; GFX1032-NEXT: ; implicit-def: $vgpr0 3299; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 3300; GFX1032-NEXT: s_cbranch_execz BB17_2 3301; GFX1032-NEXT: ; %bb.1: 3302; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo 3303; GFX1032-NEXT: v_mov_b32_e32 v4, s4 3304; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3305; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 3306; GFX1032-NEXT: ds_max_rtn_i32 v0, v7, v4 3307; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3308; GFX1032-NEXT: buffer_gl0_inv 3309; GFX1032-NEXT: BB17_2: 3310; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 3311; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 3312; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 3313; GFX1032-NEXT: v_mov_b32_e32 v0, v1 3314; GFX1032-NEXT: v_max_i32_e32 v0, s3, v0 3315; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 3316; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3317; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 3318; GFX1032-NEXT: s_endpgm 3319entry: 3320 %lane = call i32 @llvm.amdgcn.workitem.id.x() 3321 %old = atomicrmw max i32 addrspace(3)* @local_var32, i32 %lane acq_rel 3322 store i32 %old, i32 addrspace(1)* %out 3323 ret void 3324} 3325 3326define amdgpu_kernel void @max_i64_constant(i64 addrspace(1)* %out) { 3327; 3328; 3329; GFX7LESS-LABEL: max_i64_constant: 3330; GFX7LESS: ; %bb.0: ; %entry 3331; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 3332; GFX7LESS-NEXT: 
v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 3333; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 3334; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3335; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 3336; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc 3337; GFX7LESS-NEXT: s_cbranch_execz BB18_2 3338; GFX7LESS-NEXT: ; %bb.1: 3339; GFX7LESS-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 3340; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5 3341; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 3342; GFX7LESS-NEXT: s_mov_b32 m0, -1 3343; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3344; GFX7LESS-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] 3345; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3346; GFX7LESS-NEXT: BB18_2: 3347; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] 3348; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3349; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 3350; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 3351; GFX7LESS-NEXT: v_bfrev_b32_e32 v1, 1 3352; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 3353; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 3354; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc 3355; GFX7LESS-NEXT: v_mov_b32_e32 v2, s5 3356; GFX7LESS-NEXT: v_mov_b32_e32 v3, s4 3357; GFX7LESS-NEXT: v_cmp_gt_i64_e32 vcc, s[4:5], v[0:1] 3358; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 3359; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 3360; GFX7LESS-NEXT: s_mov_b32 s2, -1 3361; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 3362; GFX7LESS-NEXT: s_endpgm 3363; 3364; GFX8-LABEL: max_i64_constant: 3365; GFX8: ; %bb.0: ; %entry 3366; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3367; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3368; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 3369; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3370; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 3371; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 3372; GFX8-NEXT: s_cbranch_execz BB18_2 3373; GFX8-NEXT: ; %bb.1: 3374; GFX8-NEXT: v_mov_b32_e32 v0, 5 3375; GFX8-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 3376; GFX8-NEXT: v_mov_b32_e32 v1, 0 
3377; GFX8-NEXT: s_mov_b32 m0, -1 3378; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3379; GFX8-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] 3380; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3381; GFX8-NEXT: BB18_2: 3382; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 3383; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3384; GFX8-NEXT: v_readfirstlane_b32 s2, v0 3385; GFX8-NEXT: v_bfrev_b32_e32 v0, 1 3386; GFX8-NEXT: v_readfirstlane_b32 s3, v1 3387; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc 3388; GFX8-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 3389; GFX8-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1] 3390; GFX8-NEXT: v_mov_b32_e32 v2, s3 3391; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 3392; GFX8-NEXT: v_mov_b32_e32 v2, s2 3393; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 3394; GFX8-NEXT: s_mov_b32 s3, 0xf000 3395; GFX8-NEXT: s_mov_b32 s2, -1 3396; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 3397; GFX8-NEXT: s_endpgm 3398; 3399; GFX9-LABEL: max_i64_constant: 3400; GFX9: ; %bb.0: ; %entry 3401; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3402; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3403; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 3404; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3405; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 3406; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 3407; GFX9-NEXT: s_cbranch_execz BB18_2 3408; GFX9-NEXT: ; %bb.1: 3409; GFX9-NEXT: v_mov_b32_e32 v0, 5 3410; GFX9-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 3411; GFX9-NEXT: v_mov_b32_e32 v1, 0 3412; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3413; GFX9-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] 3414; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3415; GFX9-NEXT: BB18_2: 3416; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 3417; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3418; GFX9-NEXT: v_readfirstlane_b32 s2, v0 3419; GFX9-NEXT: v_bfrev_b32_e32 v0, 1 3420; GFX9-NEXT: v_readfirstlane_b32 s3, v1 3421; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc 3422; GFX9-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 3423; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1] 3424; GFX9-NEXT: v_mov_b32_e32 v2, 
s3 3425; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 3426; GFX9-NEXT: v_mov_b32_e32 v2, s2 3427; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 3428; GFX9-NEXT: s_mov_b32 s3, 0xf000 3429; GFX9-NEXT: s_mov_b32 s2, -1 3430; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 3431; GFX9-NEXT: s_endpgm 3432; 3433; GFX1064-LABEL: max_i64_constant: 3434; GFX1064: ; %bb.0: ; %entry 3435; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3436; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 3437; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0 3438; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3439; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 3440; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc 3441; GFX1064-NEXT: s_cbranch_execz BB18_2 3442; GFX1064-NEXT: ; %bb.1: 3443; GFX1064-NEXT: v_mov_b32_e32 v0, 5 3444; GFX1064-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 3445; GFX1064-NEXT: v_mov_b32_e32 v1, 0 3446; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3447; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 3448; GFX1064-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] 3449; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3450; GFX1064-NEXT: buffer_gl0_inv 3451; GFX1064-NEXT: BB18_2: 3452; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 3453; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] 3454; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 3455; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 3456; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc 3457; GFX1064-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 3458; GFX1064-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1] 3459; GFX1064-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc 3460; GFX1064-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc 3461; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 3462; GFX1064-NEXT: s_mov_b32 s2, -1 3463; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3464; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 3465; GFX1064-NEXT: s_endpgm 3466; 3467; GFX1032-LABEL: max_i64_constant: 3468; GFX1032: ; %bb.0: ; %entry 3469; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3470; GFX1032-NEXT: 
v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 3471; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 3472; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 3473; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo 3474; GFX1032-NEXT: s_cbranch_execz BB18_2 3475; GFX1032-NEXT: ; %bb.1: 3476; GFX1032-NEXT: v_mov_b32_e32 v0, 5 3477; GFX1032-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 3478; GFX1032-NEXT: v_mov_b32_e32 v1, 0 3479; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3480; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 3481; GFX1032-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] 3482; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3483; GFX1032-NEXT: buffer_gl0_inv 3484; GFX1032-NEXT: BB18_2: 3485; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 3486; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 3487; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 3488; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 3489; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc_lo 3490; GFX1032-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc_lo 3491; GFX1032-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[2:3], v[0:1] 3492; GFX1032-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo 3493; GFX1032-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo 3494; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 3495; GFX1032-NEXT: s_mov_b32 s2, -1 3496; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3497; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 3498; GFX1032-NEXT: s_endpgm 3499entry: 3500 %old = atomicrmw max i64 addrspace(3)* @local_var64, i64 5 acq_rel 3501 store i64 %old, i64 addrspace(1)* %out 3502 ret void 3503} 3504 3505define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) { 3506; 3507; 3508; GFX7LESS-LABEL: min_i32_varying: 3509; GFX7LESS: ; %bb.0: ; %entry 3510; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 3511; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 3512; GFX7LESS-NEXT: s_mov_b32 m0, -1 3513; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3514; GFX7LESS-NEXT: ds_min_rtn_i32 v0, v1, v0 3515; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3516; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 3517; 
GFX7LESS-NEXT: s_mov_b32 s2, -1 3518; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 3519; GFX7LESS-NEXT: s_endpgm 3520; 3521; GFX8-LABEL: min_i32_varying: 3522; GFX8: ; %bb.0: ; %entry 3523; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3524; GFX8-NEXT: v_mov_b32_e32 v2, v0 3525; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3526; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 3527; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 3528; GFX8-NEXT: v_bfrev_b32_e32 v1, -2 3529; GFX8-NEXT: s_mov_b64 exec, s[2:3] 3530; GFX8-NEXT: s_not_b64 exec, exec 3531; GFX8-NEXT: v_mov_b32_e32 v2, v1 3532; GFX8-NEXT: s_not_b64 exec, exec 3533; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 3534; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3535; GFX8-NEXT: s_nop 1 3536; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 3537; GFX8-NEXT: s_nop 1 3538; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 3539; GFX8-NEXT: s_nop 1 3540; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 3541; GFX8-NEXT: s_nop 1 3542; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 3543; GFX8-NEXT: s_nop 1 3544; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 3545; GFX8-NEXT: v_readlane_b32 s4, v2, 63 3546; GFX8-NEXT: s_nop 0 3547; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 3548; GFX8-NEXT: s_mov_b64 exec, s[2:3] 3549; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3550; GFX8-NEXT: ; implicit-def: $vgpr0 3551; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 3552; GFX8-NEXT: s_cbranch_execz BB19_2 3553; GFX8-NEXT: ; %bb.1: 3554; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 3555; GFX8-NEXT: v_mov_b32_e32 v3, s4 3556; GFX8-NEXT: s_mov_b32 m0, -1 3557; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3558; GFX8-NEXT: ds_min_rtn_i32 v0, v0, v3 3559; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3560; GFX8-NEXT: BB19_2: 3561; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 3562; GFX8-NEXT: 
s_waitcnt lgkmcnt(0) 3563; GFX8-NEXT: v_readfirstlane_b32 s2, v0 3564; GFX8-NEXT: v_mov_b32_e32 v0, v1 3565; GFX8-NEXT: v_min_i32_e32 v0, s2, v0 3566; GFX8-NEXT: s_mov_b32 s3, 0xf000 3567; GFX8-NEXT: s_mov_b32 s2, -1 3568; GFX8-NEXT: s_nop 0 3569; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 3570; GFX8-NEXT: s_endpgm 3571; 3572; GFX9-LABEL: min_i32_varying: 3573; GFX9: ; %bb.0: ; %entry 3574; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3575; GFX9-NEXT: v_mov_b32_e32 v2, v0 3576; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3577; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 3578; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 3579; GFX9-NEXT: v_bfrev_b32_e32 v1, -2 3580; GFX9-NEXT: s_mov_b64 exec, s[2:3] 3581; GFX9-NEXT: s_not_b64 exec, exec 3582; GFX9-NEXT: v_mov_b32_e32 v2, v1 3583; GFX9-NEXT: s_not_b64 exec, exec 3584; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 3585; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3586; GFX9-NEXT: s_nop 1 3587; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 3588; GFX9-NEXT: s_nop 1 3589; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 3590; GFX9-NEXT: s_nop 1 3591; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 3592; GFX9-NEXT: s_nop 1 3593; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 3594; GFX9-NEXT: s_nop 1 3595; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 3596; GFX9-NEXT: v_readlane_b32 s4, v2, 63 3597; GFX9-NEXT: s_nop 0 3598; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 3599; GFX9-NEXT: s_mov_b64 exec, s[2:3] 3600; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3601; GFX9-NEXT: ; implicit-def: $vgpr0 3602; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 3603; GFX9-NEXT: s_cbranch_execz BB19_2 3604; GFX9-NEXT: ; %bb.1: 3605; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 3606; GFX9-NEXT: v_mov_b32_e32 v3, s4 3607; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3608; 
GFX9-NEXT: ds_min_rtn_i32 v0, v0, v3 3609; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3610; GFX9-NEXT: BB19_2: 3611; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 3612; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3613; GFX9-NEXT: v_readfirstlane_b32 s2, v0 3614; GFX9-NEXT: v_mov_b32_e32 v0, v1 3615; GFX9-NEXT: v_min_i32_e32 v0, s2, v0 3616; GFX9-NEXT: s_mov_b32 s3, 0xf000 3617; GFX9-NEXT: s_mov_b32 s2, -1 3618; GFX9-NEXT: s_nop 0 3619; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 3620; GFX9-NEXT: s_endpgm 3621; 3622; GFX1064-LABEL: min_i32_varying: 3623; GFX1064: ; %bb.0: ; %entry 3624; GFX1064-NEXT: v_mov_b32_e32 v2, v0 3625; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 3626; GFX1064-NEXT: v_bfrev_b32_e32 v1, -2 3627; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 3628; GFX1064-NEXT: s_not_b64 exec, exec 3629; GFX1064-NEXT: v_mov_b32_e32 v2, v1 3630; GFX1064-NEXT: s_not_b64 exec, exec 3631; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 3632; GFX1064-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3633; GFX1064-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 3634; GFX1064-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 3635; GFX1064-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 3636; GFX1064-NEXT: v_mov_b32_e32 v3, v2 3637; GFX1064-NEXT: v_permlanex16_b32 v3, v3, -1, -1 3638; GFX1064-NEXT: v_min_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 3639; GFX1064-NEXT: v_readlane_b32 s4, v2, 31 3640; GFX1064-NEXT: v_mov_b32_e32 v3, s4 3641; GFX1064-NEXT: v_min_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 3642; GFX1064-NEXT: v_readlane_b32 s4, v2, 15 3643; GFX1064-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3644; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 3645; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3646; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 3647; GFX1064-NEXT: v_readlane_b32 s5, v2, 31 3648; GFX1064-NEXT: v_writelane_b32 v1, s4, 16 3649; GFX1064-NEXT: s_mov_b64 
exec, s[2:3] 3650; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 3651; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 3652; GFX1064-NEXT: v_readlane_b32 s7, v2, 63 3653; GFX1064-NEXT: v_readlane_b32 s6, v2, 47 3654; GFX1064-NEXT: v_writelane_b32 v1, s5, 32 3655; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 3656; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0 3657; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 3658; GFX1064-NEXT: v_writelane_b32 v1, s6, 48 3659; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 3660; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3661; GFX1064-NEXT: s_mov_b32 s2, -1 3662; GFX1064-NEXT: ; implicit-def: $vgpr0 3663; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 3664; GFX1064-NEXT: s_cbranch_execz BB19_2 3665; GFX1064-NEXT: ; %bb.1: 3666; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo 3667; GFX1064-NEXT: v_mov_b32_e32 v4, s7 3668; GFX1064-NEXT: s_mov_b32 s3, s7 3669; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3670; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 3671; GFX1064-NEXT: ds_min_rtn_i32 v0, v7, v4 3672; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3673; GFX1064-NEXT: buffer_gl0_inv 3674; GFX1064-NEXT: BB19_2: 3675; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 3676; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 3677; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 3678; GFX1064-NEXT: v_mov_b32_e32 v0, v1 3679; GFX1064-NEXT: v_min_i32_e32 v0, s3, v0 3680; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 3681; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3682; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 3683; GFX1064-NEXT: s_endpgm 3684; 3685; GFX1032-LABEL: min_i32_varying: 3686; GFX1032: ; %bb.0: ; %entry 3687; GFX1032-NEXT: v_mov_b32_e32 v2, v0 3688; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3689; GFX1032-NEXT: v_bfrev_b32_e32 v1, -2 3690; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3691; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 3692; GFX1032-NEXT: v_mov_b32_e32 v2, v1 3693; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 3694; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3695; GFX1032-NEXT: v_min_i32_dpp v2, 
v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3696; GFX1032-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 3697; GFX1032-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 3698; GFX1032-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 3699; GFX1032-NEXT: v_mov_b32_e32 v3, v2 3700; GFX1032-NEXT: v_permlanex16_b32 v3, v3, -1, -1 3701; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3702; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3703; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3704; GFX1032-NEXT: v_min_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 3705; GFX1032-NEXT: v_readlane_b32 s3, v2, 15 3706; GFX1032-NEXT: v_readlane_b32 s4, v2, 31 3707; GFX1032-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3708; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3709; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 3710; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3711; GFX1032-NEXT: v_writelane_b32 v1, s3, 16 3712; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3713; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 3714; GFX1032-NEXT: s_mov_b32 s2, -1 3715; GFX1032-NEXT: ; implicit-def: $vgpr0 3716; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 3717; GFX1032-NEXT: s_cbranch_execz BB19_2 3718; GFX1032-NEXT: ; %bb.1: 3719; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo 3720; GFX1032-NEXT: v_mov_b32_e32 v4, s4 3721; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3722; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 3723; GFX1032-NEXT: ds_min_rtn_i32 v0, v7, v4 3724; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3725; GFX1032-NEXT: buffer_gl0_inv 3726; GFX1032-NEXT: BB19_2: 3727; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 3728; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 3729; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 3730; GFX1032-NEXT: v_mov_b32_e32 v0, v1 3731; GFX1032-NEXT: v_min_i32_e32 v0, s3, v0 3732; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 3733; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3734; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 3735; 
GFX1032-NEXT: s_endpgm 3736entry: 3737 %lane = call i32 @llvm.amdgcn.workitem.id.x() 3738 %old = atomicrmw min i32 addrspace(3)* @local_var32, i32 %lane acq_rel 3739 store i32 %old, i32 addrspace(1)* %out 3740 ret void 3741} 3742 3743define amdgpu_kernel void @min_i64_constant(i64 addrspace(1)* %out) { 3744; 3745; 3746; GFX7LESS-LABEL: min_i64_constant: 3747; GFX7LESS: ; %bb.0: ; %entry 3748; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 3749; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 3750; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 3751; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3752; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 3753; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc 3754; GFX7LESS-NEXT: s_cbranch_execz BB20_2 3755; GFX7LESS-NEXT: ; %bb.1: 3756; GFX7LESS-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 3757; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5 3758; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 3759; GFX7LESS-NEXT: s_mov_b32 m0, -1 3760; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3761; GFX7LESS-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] 3762; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3763; GFX7LESS-NEXT: BB20_2: 3764; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] 3765; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3766; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 3767; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 3768; GFX7LESS-NEXT: v_bfrev_b32_e32 v1, -2 3769; GFX7LESS-NEXT: s_mov_b32 s2, -1 3770; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 3771; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc 3772; GFX7LESS-NEXT: v_mov_b32_e32 v2, s5 3773; GFX7LESS-NEXT: v_mov_b32_e32 v3, s4 3774; GFX7LESS-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] 3775; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 3776; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 3777; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 3778; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 3779; GFX7LESS-NEXT: s_endpgm 3780; 3781; GFX8-LABEL: min_i64_constant: 3782; GFX8: ; %bb.0: ; %entry 3783; GFX8-NEXT: 
s_load_dwordx2 s[0:1], s[0:1], 0x24 3784; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3785; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 3786; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3787; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 3788; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 3789; GFX8-NEXT: s_cbranch_execz BB20_2 3790; GFX8-NEXT: ; %bb.1: 3791; GFX8-NEXT: v_mov_b32_e32 v0, 5 3792; GFX8-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 3793; GFX8-NEXT: v_mov_b32_e32 v1, 0 3794; GFX8-NEXT: s_mov_b32 m0, -1 3795; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3796; GFX8-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] 3797; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3798; GFX8-NEXT: BB20_2: 3799; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 3800; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3801; GFX8-NEXT: v_readfirstlane_b32 s4, v0 3802; GFX8-NEXT: v_bfrev_b32_e32 v0, -2 3803; GFX8-NEXT: v_readfirstlane_b32 s5, v1 3804; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc 3805; GFX8-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 3806; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] 3807; GFX8-NEXT: v_mov_b32_e32 v2, s5 3808; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 3809; GFX8-NEXT: v_mov_b32_e32 v2, s4 3810; GFX8-NEXT: s_mov_b32 s2, -1 3811; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 3812; GFX8-NEXT: s_mov_b32 s3, 0xf000 3813; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 3814; GFX8-NEXT: s_endpgm 3815; 3816; GFX9-LABEL: min_i64_constant: 3817; GFX9: ; %bb.0: ; %entry 3818; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3819; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3820; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 3821; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3822; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 3823; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 3824; GFX9-NEXT: s_cbranch_execz BB20_2 3825; GFX9-NEXT: ; %bb.1: 3826; GFX9-NEXT: v_mov_b32_e32 v0, 5 3827; GFX9-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 3828; GFX9-NEXT: v_mov_b32_e32 v1, 0 3829; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3830; GFX9-NEXT: ds_min_rtn_i64 
v[0:1], v2, v[0:1] 3831; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3832; GFX9-NEXT: BB20_2: 3833; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 3834; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3835; GFX9-NEXT: v_readfirstlane_b32 s4, v0 3836; GFX9-NEXT: v_bfrev_b32_e32 v0, -2 3837; GFX9-NEXT: v_readfirstlane_b32 s5, v1 3838; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc 3839; GFX9-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 3840; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] 3841; GFX9-NEXT: v_mov_b32_e32 v2, s5 3842; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 3843; GFX9-NEXT: v_mov_b32_e32 v2, s4 3844; GFX9-NEXT: s_mov_b32 s2, -1 3845; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 3846; GFX9-NEXT: s_mov_b32 s3, 0xf000 3847; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 3848; GFX9-NEXT: s_endpgm 3849; 3850; GFX1064-LABEL: min_i64_constant: 3851; GFX1064: ; %bb.0: ; %entry 3852; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3853; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 3854; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0 3855; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3856; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 3857; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc 3858; GFX1064-NEXT: s_cbranch_execz BB20_2 3859; GFX1064-NEXT: ; %bb.1: 3860; GFX1064-NEXT: v_mov_b32_e32 v0, 5 3861; GFX1064-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 3862; GFX1064-NEXT: v_mov_b32_e32 v1, 0 3863; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3864; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 3865; GFX1064-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] 3866; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3867; GFX1064-NEXT: buffer_gl0_inv 3868; GFX1064-NEXT: BB20_2: 3869; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 3870; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] 3871; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 3872; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 3873; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc 3874; GFX1064-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 3875; GFX1064-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], 
v[0:1] 3876; GFX1064-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc 3877; GFX1064-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc 3878; GFX1064-NEXT: s_mov_b32 s2, -1 3879; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 3880; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3881; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 3882; GFX1064-NEXT: s_endpgm 3883; 3884; GFX1032-LABEL: min_i64_constant: 3885; GFX1032: ; %bb.0: ; %entry 3886; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3887; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 3888; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 3889; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 3890; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo 3891; GFX1032-NEXT: s_cbranch_execz BB20_2 3892; GFX1032-NEXT: ; %bb.1: 3893; GFX1032-NEXT: v_mov_b32_e32 v0, 5 3894; GFX1032-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 3895; GFX1032-NEXT: v_mov_b32_e32 v1, 0 3896; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3897; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 3898; GFX1032-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] 3899; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3900; GFX1032-NEXT: buffer_gl0_inv 3901; GFX1032-NEXT: BB20_2: 3902; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 3903; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 3904; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 3905; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 3906; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc_lo 3907; GFX1032-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc_lo 3908; GFX1032-NEXT: v_cmp_lt_i64_e32 vcc_lo, s[2:3], v[0:1] 3909; GFX1032-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo 3910; GFX1032-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo 3911; GFX1032-NEXT: s_mov_b32 s2, -1 3912; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 3913; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3914; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 3915; GFX1032-NEXT: s_endpgm 3916entry: 3917 %old = atomicrmw min i64 addrspace(3)* @local_var64, i64 5 acq_rel 3918 store i64 %old, i64 addrspace(1)* %out 3919 ret void 3920} 3921 3922define 
amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) { 3923; 3924; 3925; GFX7LESS-LABEL: umax_i32_varying: 3926; GFX7LESS: ; %bb.0: ; %entry 3927; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 3928; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 3929; GFX7LESS-NEXT: s_mov_b32 m0, -1 3930; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3931; GFX7LESS-NEXT: ds_max_rtn_u32 v0, v1, v0 3932; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3933; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 3934; GFX7LESS-NEXT: s_mov_b32 s2, -1 3935; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 3936; GFX7LESS-NEXT: s_endpgm 3937; 3938; GFX8-LABEL: umax_i32_varying: 3939; GFX8: ; %bb.0: ; %entry 3940; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3941; GFX8-NEXT: v_mov_b32_e32 v2, v0 3942; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 3943; GFX8-NEXT: v_mov_b32_e32 v1, 0 3944; GFX8-NEXT: s_mov_b64 exec, s[2:3] 3945; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3946; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 3947; GFX8-NEXT: s_not_b64 exec, exec 3948; GFX8-NEXT: v_mov_b32_e32 v2, 0 3949; GFX8-NEXT: s_not_b64 exec, exec 3950; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 3951; GFX8-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 3952; GFX8-NEXT: s_nop 1 3953; GFX8-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 3954; GFX8-NEXT: s_nop 1 3955; GFX8-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 3956; GFX8-NEXT: s_nop 1 3957; GFX8-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 3958; GFX8-NEXT: s_nop 1 3959; GFX8-NEXT: v_max_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 3960; GFX8-NEXT: s_nop 1 3961; GFX8-NEXT: v_max_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 3962; GFX8-NEXT: v_readlane_b32 s4, v2, 63 3963; GFX8-NEXT: s_nop 0 3964; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 3965; GFX8-NEXT: s_mov_b64 exec, s[2:3] 3966; 
GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3967; GFX8-NEXT: ; implicit-def: $vgpr0 3968; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 3969; GFX8-NEXT: s_cbranch_execz BB21_2 3970; GFX8-NEXT: ; %bb.1: 3971; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 3972; GFX8-NEXT: v_mov_b32_e32 v3, s4 3973; GFX8-NEXT: s_mov_b32 m0, -1 3974; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3975; GFX8-NEXT: ds_max_rtn_u32 v0, v0, v3 3976; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3977; GFX8-NEXT: BB21_2: 3978; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 3979; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3980; GFX8-NEXT: v_readfirstlane_b32 s2, v0 3981; GFX8-NEXT: v_mov_b32_e32 v0, v1 3982; GFX8-NEXT: v_max_u32_e32 v0, s2, v0 3983; GFX8-NEXT: s_mov_b32 s3, 0xf000 3984; GFX8-NEXT: s_mov_b32 s2, -1 3985; GFX8-NEXT: s_nop 0 3986; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 3987; GFX8-NEXT: s_endpgm 3988; 3989; GFX9-LABEL: umax_i32_varying: 3990; GFX9: ; %bb.0: ; %entry 3991; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3992; GFX9-NEXT: v_mov_b32_e32 v2, v0 3993; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 3994; GFX9-NEXT: v_mov_b32_e32 v1, 0 3995; GFX9-NEXT: s_mov_b64 exec, s[2:3] 3996; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3997; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 3998; GFX9-NEXT: s_not_b64 exec, exec 3999; GFX9-NEXT: v_mov_b32_e32 v2, 0 4000; GFX9-NEXT: s_not_b64 exec, exec 4001; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 4002; GFX9-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 4003; GFX9-NEXT: s_nop 1 4004; GFX9-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 4005; GFX9-NEXT: s_nop 1 4006; GFX9-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 4007; GFX9-NEXT: s_nop 1 4008; GFX9-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 4009; GFX9-NEXT: s_nop 1 4010; GFX9-NEXT: v_max_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 4011; GFX9-NEXT: s_nop 1 4012; GFX9-NEXT: 
v_max_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 4013; GFX9-NEXT: v_readlane_b32 s4, v2, 63 4014; GFX9-NEXT: s_nop 0 4015; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 4016; GFX9-NEXT: s_mov_b64 exec, s[2:3] 4017; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4018; GFX9-NEXT: ; implicit-def: $vgpr0 4019; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 4020; GFX9-NEXT: s_cbranch_execz BB21_2 4021; GFX9-NEXT: ; %bb.1: 4022; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 4023; GFX9-NEXT: v_mov_b32_e32 v3, s4 4024; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4025; GFX9-NEXT: ds_max_rtn_u32 v0, v0, v3 4026; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4027; GFX9-NEXT: BB21_2: 4028; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 4029; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4030; GFX9-NEXT: v_readfirstlane_b32 s2, v0 4031; GFX9-NEXT: v_mov_b32_e32 v0, v1 4032; GFX9-NEXT: v_max_u32_e32 v0, s2, v0 4033; GFX9-NEXT: s_mov_b32 s3, 0xf000 4034; GFX9-NEXT: s_mov_b32 s2, -1 4035; GFX9-NEXT: s_nop 0 4036; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 4037; GFX9-NEXT: s_endpgm 4038; 4039; GFX1064-LABEL: umax_i32_varying: 4040; GFX1064: ; %bb.0: ; %entry 4041; GFX1064-NEXT: v_mov_b32_e32 v1, v0 4042; GFX1064-NEXT: s_not_b64 exec, exec 4043; GFX1064-NEXT: v_mov_b32_e32 v1, 0 4044; GFX1064-NEXT: s_not_b64 exec, exec 4045; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 4046; GFX1064-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 4047; GFX1064-NEXT: v_mov_b32_e32 v3, 0 4048; GFX1064-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 4049; GFX1064-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 4050; GFX1064-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 4051; GFX1064-NEXT: v_mov_b32_e32 v2, v1 4052; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 4053; GFX1064-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 4054; GFX1064-NEXT: 
v_readlane_b32 s4, v1, 31 4055; GFX1064-NEXT: v_mov_b32_e32 v2, s4 4056; GFX1064-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 4057; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 4058; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 4059; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 4060; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4061; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 4062; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 4063; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 4064; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 4065; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 4066; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 4067; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 4068; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 4069; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 4070; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 4071; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0 4072; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 4073; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 4074; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 4075; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4076; GFX1064-NEXT: s_mov_b32 s2, -1 4077; GFX1064-NEXT: ; implicit-def: $vgpr0 4078; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 4079; GFX1064-NEXT: s_cbranch_execz BB21_2 4080; GFX1064-NEXT: ; %bb.1: 4081; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo 4082; GFX1064-NEXT: v_mov_b32_e32 v4, s7 4083; GFX1064-NEXT: s_mov_b32 s3, s7 4084; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4085; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 4086; GFX1064-NEXT: ds_max_rtn_u32 v0, v7, v4 4087; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 4088; GFX1064-NEXT: buffer_gl0_inv 4089; GFX1064-NEXT: BB21_2: 4090; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 4091; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 4092; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 4093; GFX1064-NEXT: v_mov_b32_e32 v0, v3 4094; GFX1064-NEXT: v_max_u32_e32 v0, s3, v0 4095; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 4096; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 
4097; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 4098; GFX1064-NEXT: s_endpgm 4099; 4100; GFX1032-LABEL: umax_i32_varying: 4101; GFX1032: ; %bb.0: ; %entry 4102; GFX1032-NEXT: v_mov_b32_e32 v1, v0 4103; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 4104; GFX1032-NEXT: v_mov_b32_e32 v1, 0 4105; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 4106; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 4107; GFX1032-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 4108; GFX1032-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 4109; GFX1032-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 4110; GFX1032-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 4111; GFX1032-NEXT: v_mov_b32_e32 v2, v1 4112; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 4113; GFX1032-NEXT: s_mov_b32 exec_lo, s2 4114; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4115; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 4116; GFX1032-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 4117; GFX1032-NEXT: v_mov_b32_e32 v3, 0 4118; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 4119; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 4120; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 4121; GFX1032-NEXT: s_mov_b32 exec_lo, s2 4122; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 4123; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 4124; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 4125; GFX1032-NEXT: s_mov_b32 exec_lo, s2 4126; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 4127; GFX1032-NEXT: s_mov_b32 s2, -1 4128; GFX1032-NEXT: ; implicit-def: $vgpr0 4129; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 4130; GFX1032-NEXT: s_cbranch_execz BB21_2 4131; GFX1032-NEXT: ; %bb.1: 4132; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo 4133; GFX1032-NEXT: v_mov_b32_e32 v4, s4 4134; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4135; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 4136; 
GFX1032-NEXT: ds_max_rtn_u32 v0, v7, v4 4137; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 4138; GFX1032-NEXT: buffer_gl0_inv 4139; GFX1032-NEXT: BB21_2: 4140; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 4141; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 4142; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 4143; GFX1032-NEXT: v_mov_b32_e32 v0, v3 4144; GFX1032-NEXT: v_max_u32_e32 v0, s3, v0 4145; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 4146; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 4147; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 4148; GFX1032-NEXT: s_endpgm 4149entry: 4150 %lane = call i32 @llvm.amdgcn.workitem.id.x() 4151 %old = atomicrmw umax i32 addrspace(3)* @local_var32, i32 %lane acq_rel 4152 store i32 %old, i32 addrspace(1)* %out 4153 ret void 4154} 4155 4156define amdgpu_kernel void @umax_i64_constant(i64 addrspace(1)* %out) { 4157; 4158; 4159; GFX7LESS-LABEL: umax_i64_constant: 4160; GFX7LESS: ; %bb.0: ; %entry 4161; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 4162; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 4163; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 4164; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4165; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 4166; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc 4167; GFX7LESS-NEXT: s_cbranch_execz BB22_2 4168; GFX7LESS-NEXT: ; %bb.1: 4169; GFX7LESS-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 4170; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5 4171; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 4172; GFX7LESS-NEXT: s_mov_b32 m0, -1 4173; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 4174; GFX7LESS-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] 4175; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 4176; GFX7LESS-NEXT: BB22_2: 4177; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] 4178; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 4179; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 4180; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 4181; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 4182; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 4183; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 4184; 
GFX7LESS-NEXT: v_mov_b32_e32 v2, s4 4185; GFX7LESS-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[0:1] 4186; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 4187; GFX7LESS-NEXT: v_mov_b32_e32 v1, s5 4188; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc 4189; GFX7LESS-NEXT: s_mov_b32 s2, -1 4190; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4191; GFX7LESS-NEXT: s_endpgm 4192; 4193; GFX8-LABEL: umax_i64_constant: 4194; GFX8: ; %bb.0: ; %entry 4195; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4196; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4197; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 4198; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4199; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 4200; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 4201; GFX8-NEXT: s_cbranch_execz BB22_2 4202; GFX8-NEXT: ; %bb.1: 4203; GFX8-NEXT: v_mov_b32_e32 v0, 5 4204; GFX8-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 4205; GFX8-NEXT: v_mov_b32_e32 v1, 0 4206; GFX8-NEXT: s_mov_b32 m0, -1 4207; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4208; GFX8-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] 4209; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4210; GFX8-NEXT: BB22_2: 4211; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 4212; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4213; GFX8-NEXT: v_readfirstlane_b32 s2, v0 4214; GFX8-NEXT: v_readfirstlane_b32 s3, v1 4215; GFX8-NEXT: v_mov_b32_e32 v1, 0 4216; GFX8-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 4217; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1] 4218; GFX8-NEXT: v_mov_b32_e32 v1, s3 4219; GFX8-NEXT: v_mov_b32_e32 v2, s2 4220; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 4221; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc 4222; GFX8-NEXT: s_mov_b32 s3, 0xf000 4223; GFX8-NEXT: s_mov_b32 s2, -1 4224; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4225; GFX8-NEXT: s_endpgm 4226; 4227; GFX9-LABEL: umax_i64_constant: 4228; GFX9: ; %bb.0: ; %entry 4229; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4230; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4231; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, 
exec_hi, v0 4232; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4233; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 4234; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 4235; GFX9-NEXT: s_cbranch_execz BB22_2 4236; GFX9-NEXT: ; %bb.1: 4237; GFX9-NEXT: v_mov_b32_e32 v0, 5 4238; GFX9-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 4239; GFX9-NEXT: v_mov_b32_e32 v1, 0 4240; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4241; GFX9-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] 4242; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4243; GFX9-NEXT: BB22_2: 4244; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 4245; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4246; GFX9-NEXT: v_readfirstlane_b32 s2, v0 4247; GFX9-NEXT: v_readfirstlane_b32 s3, v1 4248; GFX9-NEXT: v_mov_b32_e32 v1, 0 4249; GFX9-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 4250; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1] 4251; GFX9-NEXT: v_mov_b32_e32 v1, s3 4252; GFX9-NEXT: v_mov_b32_e32 v2, s2 4253; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 4254; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc 4255; GFX9-NEXT: s_mov_b32 s3, 0xf000 4256; GFX9-NEXT: s_mov_b32 s2, -1 4257; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4258; GFX9-NEXT: s_endpgm 4259; 4260; GFX1064-LABEL: umax_i64_constant: 4261; GFX1064: ; %bb.0: ; %entry 4262; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4263; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 4264; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0 4265; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4266; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 4267; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc 4268; GFX1064-NEXT: s_cbranch_execz BB22_2 4269; GFX1064-NEXT: ; %bb.1: 4270; GFX1064-NEXT: v_mov_b32_e32 v0, 5 4271; GFX1064-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 4272; GFX1064-NEXT: v_mov_b32_e32 v1, 0 4273; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4274; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 4275; GFX1064-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] 4276; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 4277; GFX1064-NEXT: buffer_gl0_inv 4278; 
GFX1064-NEXT: BB22_2: 4279; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 4280; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] 4281; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 4282; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 4283; GFX1064-NEXT: v_mov_b32_e32 v1, 0 4284; GFX1064-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 4285; GFX1064-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1] 4286; GFX1064-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc 4287; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, s3, vcc 4288; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 4289; GFX1064-NEXT: s_mov_b32 s2, -1 4290; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 4291; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4292; GFX1064-NEXT: s_endpgm 4293; 4294; GFX1032-LABEL: umax_i64_constant: 4295; GFX1032: ; %bb.0: ; %entry 4296; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4297; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 4298; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 4299; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 4300; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo 4301; GFX1032-NEXT: s_cbranch_execz BB22_2 4302; GFX1032-NEXT: ; %bb.1: 4303; GFX1032-NEXT: v_mov_b32_e32 v0, 5 4304; GFX1032-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 4305; GFX1032-NEXT: v_mov_b32_e32 v1, 0 4306; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4307; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 4308; GFX1032-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] 4309; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 4310; GFX1032-NEXT: buffer_gl0_inv 4311; GFX1032-NEXT: BB22_2: 4312; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 4313; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 4314; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 4315; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 4316; GFX1032-NEXT: v_mov_b32_e32 v1, 0 4317; GFX1032-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc_lo 4318; GFX1032-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[2:3], v[0:1] 4319; GFX1032-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo 4320; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, s3, vcc_lo 4321; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 4322; 
GFX1032-NEXT: s_mov_b32 s2, -1 4323; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 4324; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4325; GFX1032-NEXT: s_endpgm 4326entry: 4327 %old = atomicrmw umax i64 addrspace(3)* @local_var64, i64 5 acq_rel 4328 store i64 %old, i64 addrspace(1)* %out 4329 ret void 4330} 4331 4332define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) { 4333; 4334; 4335; GFX7LESS-LABEL: umin_i32_varying: 4336; GFX7LESS: ; %bb.0: ; %entry 4337; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 4338; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 4339; GFX7LESS-NEXT: s_mov_b32 m0, -1 4340; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 4341; GFX7LESS-NEXT: ds_min_rtn_u32 v0, v1, v0 4342; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 4343; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 4344; GFX7LESS-NEXT: s_mov_b32 s2, -1 4345; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 4346; GFX7LESS-NEXT: s_endpgm 4347; 4348; GFX8-LABEL: umin_i32_varying: 4349; GFX8: ; %bb.0: ; %entry 4350; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4351; GFX8-NEXT: v_mov_b32_e32 v2, v0 4352; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4353; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 4354; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 4355; GFX8-NEXT: v_mov_b32_e32 v1, -1 4356; GFX8-NEXT: s_mov_b64 exec, s[2:3] 4357; GFX8-NEXT: s_not_b64 exec, exec 4358; GFX8-NEXT: v_mov_b32_e32 v2, -1 4359; GFX8-NEXT: s_not_b64 exec, exec 4360; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 4361; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 4362; GFX8-NEXT: s_nop 1 4363; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 4364; GFX8-NEXT: s_nop 1 4365; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 4366; GFX8-NEXT: s_nop 1 4367; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 4368; GFX8-NEXT: s_nop 1 4369; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 4370; GFX8-NEXT: 
s_nop 1 4371; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 4372; GFX8-NEXT: v_readlane_b32 s4, v2, 63 4373; GFX8-NEXT: s_nop 0 4374; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 4375; GFX8-NEXT: s_mov_b64 exec, s[2:3] 4376; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4377; GFX8-NEXT: ; implicit-def: $vgpr0 4378; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 4379; GFX8-NEXT: s_cbranch_execz BB23_2 4380; GFX8-NEXT: ; %bb.1: 4381; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 4382; GFX8-NEXT: v_mov_b32_e32 v3, s4 4383; GFX8-NEXT: s_mov_b32 m0, -1 4384; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4385; GFX8-NEXT: ds_min_rtn_u32 v0, v0, v3 4386; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4387; GFX8-NEXT: BB23_2: 4388; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 4389; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4390; GFX8-NEXT: v_readfirstlane_b32 s2, v0 4391; GFX8-NEXT: v_mov_b32_e32 v0, v1 4392; GFX8-NEXT: v_min_u32_e32 v0, s2, v0 4393; GFX8-NEXT: s_mov_b32 s3, 0xf000 4394; GFX8-NEXT: s_mov_b32 s2, -1 4395; GFX8-NEXT: s_nop 0 4396; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 4397; GFX8-NEXT: s_endpgm 4398; 4399; GFX9-LABEL: umin_i32_varying: 4400; GFX9: ; %bb.0: ; %entry 4401; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4402; GFX9-NEXT: v_mov_b32_e32 v2, v0 4403; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4404; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 4405; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 4406; GFX9-NEXT: v_mov_b32_e32 v1, -1 4407; GFX9-NEXT: s_mov_b64 exec, s[2:3] 4408; GFX9-NEXT: s_not_b64 exec, exec 4409; GFX9-NEXT: v_mov_b32_e32 v2, -1 4410; GFX9-NEXT: s_not_b64 exec, exec 4411; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 4412; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 4413; GFX9-NEXT: s_nop 1 4414; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 4415; GFX9-NEXT: s_nop 1 4416; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 4417; GFX9-NEXT: s_nop 1 4418; 
GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 4419; GFX9-NEXT: s_nop 1 4420; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 4421; GFX9-NEXT: s_nop 1 4422; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 4423; GFX9-NEXT: v_readlane_b32 s4, v2, 63 4424; GFX9-NEXT: s_nop 0 4425; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 4426; GFX9-NEXT: s_mov_b64 exec, s[2:3] 4427; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4428; GFX9-NEXT: ; implicit-def: $vgpr0 4429; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 4430; GFX9-NEXT: s_cbranch_execz BB23_2 4431; GFX9-NEXT: ; %bb.1: 4432; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 4433; GFX9-NEXT: v_mov_b32_e32 v3, s4 4434; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4435; GFX9-NEXT: ds_min_rtn_u32 v0, v0, v3 4436; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4437; GFX9-NEXT: BB23_2: 4438; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 4439; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4440; GFX9-NEXT: v_readfirstlane_b32 s2, v0 4441; GFX9-NEXT: v_mov_b32_e32 v0, v1 4442; GFX9-NEXT: v_min_u32_e32 v0, s2, v0 4443; GFX9-NEXT: s_mov_b32 s3, 0xf000 4444; GFX9-NEXT: s_mov_b32 s2, -1 4445; GFX9-NEXT: s_nop 0 4446; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 4447; GFX9-NEXT: s_endpgm 4448; 4449; GFX1064-LABEL: umin_i32_varying: 4450; GFX1064: ; %bb.0: ; %entry 4451; GFX1064-NEXT: v_mov_b32_e32 v1, v0 4452; GFX1064-NEXT: s_not_b64 exec, exec 4453; GFX1064-NEXT: v_mov_b32_e32 v1, -1 4454; GFX1064-NEXT: s_not_b64 exec, exec 4455; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 4456; GFX1064-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 4457; GFX1064-NEXT: v_mov_b32_e32 v3, -1 4458; GFX1064-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 4459; GFX1064-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 4460; GFX1064-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 4461; GFX1064-NEXT: v_mov_b32_e32 v2, v1 4462; 
GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 4463; GFX1064-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 4464; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 4465; GFX1064-NEXT: v_mov_b32_e32 v2, s4 4466; GFX1064-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 4467; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 4468; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 4469; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 4470; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4471; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 4472; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 4473; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 4474; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 4475; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 4476; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 4477; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 4478; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 4479; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 4480; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 4481; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0 4482; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 4483; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 4484; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 4485; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4486; GFX1064-NEXT: s_mov_b32 s2, -1 4487; GFX1064-NEXT: ; implicit-def: $vgpr0 4488; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 4489; GFX1064-NEXT: s_cbranch_execz BB23_2 4490; GFX1064-NEXT: ; %bb.1: 4491; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo 4492; GFX1064-NEXT: v_mov_b32_e32 v4, s7 4493; GFX1064-NEXT: s_mov_b32 s3, s7 4494; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4495; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 4496; GFX1064-NEXT: ds_min_rtn_u32 v0, v7, v4 4497; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 4498; GFX1064-NEXT: buffer_gl0_inv 4499; GFX1064-NEXT: BB23_2: 4500; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 4501; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 4502; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 4503; 
GFX1064-NEXT: v_mov_b32_e32 v0, v3 4504; GFX1064-NEXT: v_min_u32_e32 v0, s3, v0 4505; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 4506; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 4507; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 4508; GFX1064-NEXT: s_endpgm 4509; 4510; GFX1032-LABEL: umin_i32_varying: 4511; GFX1032: ; %bb.0: ; %entry 4512; GFX1032-NEXT: v_mov_b32_e32 v1, v0 4513; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 4514; GFX1032-NEXT: v_mov_b32_e32 v1, -1 4515; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 4516; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 4517; GFX1032-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 4518; GFX1032-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 4519; GFX1032-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 4520; GFX1032-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 4521; GFX1032-NEXT: v_mov_b32_e32 v2, v1 4522; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 4523; GFX1032-NEXT: s_mov_b32 exec_lo, s2 4524; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4525; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 4526; GFX1032-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 4527; GFX1032-NEXT: v_mov_b32_e32 v3, -1 4528; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 4529; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 4530; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 4531; GFX1032-NEXT: s_mov_b32 exec_lo, s2 4532; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 4533; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 4534; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 4535; GFX1032-NEXT: s_mov_b32 exec_lo, s2 4536; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 4537; GFX1032-NEXT: s_mov_b32 s2, -1 4538; GFX1032-NEXT: ; implicit-def: $vgpr0 4539; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 4540; GFX1032-NEXT: s_cbranch_execz BB23_2 4541; GFX1032-NEXT: ; %bb.1: 4542; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo 4543; GFX1032-NEXT: v_mov_b32_e32 
v4, s4 4544; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4545; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 4546; GFX1032-NEXT: ds_min_rtn_u32 v0, v7, v4 4547; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 4548; GFX1032-NEXT: buffer_gl0_inv 4549; GFX1032-NEXT: BB23_2: 4550; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 4551; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 4552; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 4553; GFX1032-NEXT: v_mov_b32_e32 v0, v3 4554; GFX1032-NEXT: v_min_u32_e32 v0, s3, v0 4555; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 4556; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 4557; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 4558; GFX1032-NEXT: s_endpgm 4559entry: 4560 %lane = call i32 @llvm.amdgcn.workitem.id.x() 4561 %old = atomicrmw umin i32 addrspace(3)* @local_var32, i32 %lane acq_rel 4562 store i32 %old, i32 addrspace(1)* %out 4563 ret void 4564} 4565 4566define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) { 4567; 4568; 4569; GFX7LESS-LABEL: umin_i64_constant: 4570; GFX7LESS: ; %bb.0: ; %entry 4571; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 4572; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 4573; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 4574; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4575; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 4576; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc 4577; GFX7LESS-NEXT: s_cbranch_execz BB24_2 4578; GFX7LESS-NEXT: ; %bb.1: 4579; GFX7LESS-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 4580; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5 4581; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 4582; GFX7LESS-NEXT: s_mov_b32 m0, -1 4583; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 4584; GFX7LESS-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1] 4585; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 4586; GFX7LESS-NEXT: BB24_2: 4587; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] 4588; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 4589; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 4590; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 4591; GFX7LESS-NEXT: s_mov_b32 s2, -1 
4592; GFX7LESS-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc 4593; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 4594; GFX7LESS-NEXT: v_mov_b32_e32 v2, s5 4595; GFX7LESS-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1] 4596; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 4597; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4 4598; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 4599; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 4600; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4601; GFX7LESS-NEXT: s_endpgm 4602; 4603; GFX8-LABEL: umin_i64_constant: 4604; GFX8: ; %bb.0: ; %entry 4605; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4606; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4607; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 4608; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4609; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 4610; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 4611; GFX8-NEXT: s_cbranch_execz BB24_2 4612; GFX8-NEXT: ; %bb.1: 4613; GFX8-NEXT: v_mov_b32_e32 v0, 5 4614; GFX8-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 4615; GFX8-NEXT: v_mov_b32_e32 v1, 0 4616; GFX8-NEXT: s_mov_b32 m0, -1 4617; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4618; GFX8-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1] 4619; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4620; GFX8-NEXT: BB24_2: 4621; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 4622; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4623; GFX8-NEXT: v_readfirstlane_b32 s5, v1 4624; GFX8-NEXT: v_readfirstlane_b32 s4, v0 4625; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc 4626; GFX8-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 4627; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1] 4628; GFX8-NEXT: v_mov_b32_e32 v2, s5 4629; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 4630; GFX8-NEXT: v_mov_b32_e32 v2, s4 4631; GFX8-NEXT: s_mov_b32 s2, -1 4632; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 4633; GFX8-NEXT: s_mov_b32 s3, 0xf000 4634; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4635; GFX8-NEXT: s_endpgm 4636; 4637; GFX9-LABEL: umin_i64_constant: 4638; GFX9: ; %bb.0: ; %entry 4639; GFX9-NEXT: 
s_load_dwordx2 s[0:1], s[0:1], 0x24 4640; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4641; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 4642; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4643; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 4644; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 4645; GFX9-NEXT: s_cbranch_execz BB24_2 4646; GFX9-NEXT: ; %bb.1: 4647; GFX9-NEXT: v_mov_b32_e32 v0, 5 4648; GFX9-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 4649; GFX9-NEXT: v_mov_b32_e32 v1, 0 4650; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4651; GFX9-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1] 4652; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4653; GFX9-NEXT: BB24_2: 4654; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 4655; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4656; GFX9-NEXT: v_readfirstlane_b32 s5, v1 4657; GFX9-NEXT: v_readfirstlane_b32 s4, v0 4658; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc 4659; GFX9-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 4660; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1] 4661; GFX9-NEXT: v_mov_b32_e32 v2, s5 4662; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 4663; GFX9-NEXT: v_mov_b32_e32 v2, s4 4664; GFX9-NEXT: s_mov_b32 s2, -1 4665; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 4666; GFX9-NEXT: s_mov_b32 s3, 0xf000 4667; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4668; GFX9-NEXT: s_endpgm 4669; 4670; GFX1064-LABEL: umin_i64_constant: 4671; GFX1064: ; %bb.0: ; %entry 4672; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4673; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 4674; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0 4675; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4676; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 4677; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc 4678; GFX1064-NEXT: s_cbranch_execz BB24_2 4679; GFX1064-NEXT: ; %bb.1: 4680; GFX1064-NEXT: v_mov_b32_e32 v0, 5 4681; GFX1064-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 4682; GFX1064-NEXT: v_mov_b32_e32 v1, 0 4683; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4684; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 
4685; GFX1064-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1] 4686; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 4687; GFX1064-NEXT: buffer_gl0_inv 4688; GFX1064-NEXT: BB24_2: 4689; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 4690; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] 4691; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 4692; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 4693; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc 4694; GFX1064-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 4695; GFX1064-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1] 4696; GFX1064-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc 4697; GFX1064-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc 4698; GFX1064-NEXT: s_mov_b32 s2, -1 4699; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 4700; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 4701; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4702; GFX1064-NEXT: s_endpgm 4703; 4704; GFX1032-LABEL: umin_i64_constant: 4705; GFX1032: ; %bb.0: ; %entry 4706; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4707; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 4708; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 4709; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 4710; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo 4711; GFX1032-NEXT: s_cbranch_execz BB24_2 4712; GFX1032-NEXT: ; %bb.1: 4713; GFX1032-NEXT: v_mov_b32_e32 v0, 5 4714; GFX1032-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 4715; GFX1032-NEXT: v_mov_b32_e32 v1, 0 4716; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4717; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 4718; GFX1032-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1] 4719; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 4720; GFX1032-NEXT: buffer_gl0_inv 4721; GFX1032-NEXT: BB24_2: 4722; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 4723; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 4724; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 4725; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 4726; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo 4727; GFX1032-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc_lo 4728; GFX1032-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[2:3], v[0:1] 
4729; GFX1032-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo 4730; GFX1032-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo 4731; GFX1032-NEXT: s_mov_b32 s2, -1 4732; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 4733; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 4734; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4735; GFX1032-NEXT: s_endpgm 4736entry: 4737 %old = atomicrmw umin i64 addrspace(3)* @local_var64, i64 5 acq_rel 4738 store i64 %old, i64 addrspace(1)* %out 4739 ret void 4740} 4741