; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX7LESS %s
; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s
; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064 %s
; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032 %s

; Every llc invocation above enables -amdgpu-atomic-optimizations and is matched
; against its own FileCheck prefix: the default subtarget (GFX7LESS), tonga
; (GFX8), gfx900 (GFX9), and gfx1010 with the wavefrontsize64 feature (GFX1064)
; or the wavefrontsize32 feature (GFX1032).
; The check lines in each function are generated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.

; Returns the work-item id in x; the *_varying tests use it as a per-lane
; (non-uniform) atomic operand.
declare i32 @llvm.amdgcn.workitem.id.x()

; addrspace(3) globals that serve as the atomic targets. @local_var32 is the
; operand of the i32 atomicrmw tests; @local_var64 is presumably used by i64
; variants later in the file (not visible from this section).
@local_var32 = addrspace(3) global i32 undef, align 4
@local_var64 = addrspace(3) global i64 undef, align 8

; Show what the atomic optimization pass will do for local pointers.
14 15define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out) { 16; 17; 18; GFX7LESS-LABEL: add_i32_constant: 19; GFX7LESS: ; %bb.0: ; %entry 20; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec 21; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 22; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 23; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 24; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 25; GFX7LESS-NEXT: ; implicit-def: $vgpr1 26; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc 27; GFX7LESS-NEXT: s_cbranch_execz BB0_2 28; GFX7LESS-NEXT: ; %bb.1: 29; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 30; GFX7LESS-NEXT: s_mul_i32 s2, s2, 5 31; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 32; GFX7LESS-NEXT: v_mov_b32_e32 v2, s2 33; GFX7LESS-NEXT: s_mov_b32 m0, -1 34; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 35; GFX7LESS-NEXT: ds_add_rtn_u32 v1, v1, v2 36; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 37; GFX7LESS-NEXT: BB0_2: 38; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] 39; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 40; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v1 41; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 42; GFX7LESS-NEXT: v_mad_u32_u24 v0, v0, 5, s2 43; GFX7LESS-NEXT: s_mov_b32 s2, -1 44; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 45; GFX7LESS-NEXT: s_endpgm 46; 47; GFX8-LABEL: add_i32_constant: 48; GFX8: ; %bb.0: ; %entry 49; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 50; GFX8-NEXT: s_mov_b64 s[2:3], exec 51; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 52; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 53; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 54; GFX8-NEXT: ; implicit-def: $vgpr1 55; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 56; GFX8-NEXT: s_cbranch_execz BB0_2 57; GFX8-NEXT: ; %bb.1: 58; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 59; GFX8-NEXT: s_mul_i32 s2, s2, 5 60; GFX8-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 61; GFX8-NEXT: v_mov_b32_e32 v2, s2 62; GFX8-NEXT: s_mov_b32 m0, -1 63; GFX8-NEXT: s_waitcnt lgkmcnt(0) 64; GFX8-NEXT: ds_add_rtn_u32 v1, v1, v2 65; GFX8-NEXT: 
s_waitcnt lgkmcnt(0) 66; GFX8-NEXT: BB0_2: 67; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 68; GFX8-NEXT: s_waitcnt lgkmcnt(0) 69; GFX8-NEXT: v_readfirstlane_b32 s2, v1 70; GFX8-NEXT: v_mad_u32_u24 v0, v0, 5, s2 71; GFX8-NEXT: s_mov_b32 s3, 0xf000 72; GFX8-NEXT: s_mov_b32 s2, -1 73; GFX8-NEXT: s_nop 1 74; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 75; GFX8-NEXT: s_endpgm 76; 77; GFX9-LABEL: add_i32_constant: 78; GFX9: ; %bb.0: ; %entry 79; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 80; GFX9-NEXT: s_mov_b64 s[2:3], exec 81; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 82; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 83; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 84; GFX9-NEXT: ; implicit-def: $vgpr1 85; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 86; GFX9-NEXT: s_cbranch_execz BB0_2 87; GFX9-NEXT: ; %bb.1: 88; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 89; GFX9-NEXT: s_mul_i32 s2, s2, 5 90; GFX9-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 91; GFX9-NEXT: v_mov_b32_e32 v2, s2 92; GFX9-NEXT: s_waitcnt lgkmcnt(0) 93; GFX9-NEXT: ds_add_rtn_u32 v1, v1, v2 94; GFX9-NEXT: s_waitcnt lgkmcnt(0) 95; GFX9-NEXT: BB0_2: 96; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 97; GFX9-NEXT: s_waitcnt lgkmcnt(0) 98; GFX9-NEXT: v_readfirstlane_b32 s2, v1 99; GFX9-NEXT: v_mad_u32_u24 v0, v0, 5, s2 100; GFX9-NEXT: s_mov_b32 s3, 0xf000 101; GFX9-NEXT: s_mov_b32 s2, -1 102; GFX9-NEXT: s_nop 1 103; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 104; GFX9-NEXT: s_endpgm 105; 106; GFX1064-LABEL: add_i32_constant: 107; GFX1064: ; %bb.0: ; %entry 108; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 109; GFX1064-NEXT: s_mov_b64 s[2:3], exec 110; GFX1064-NEXT: ; implicit-def: $vgpr1 111; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 112; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s3, v0 113; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 114; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 115; GFX1064-NEXT: s_cbranch_execz BB0_2 116; GFX1064-NEXT: ; %bb.1: 117; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 118; GFX1064-NEXT: 
v_mov_b32_e32 v1, local_var32@abs32@lo 119; GFX1064-NEXT: s_mul_i32 s2, s2, 5 120; GFX1064-NEXT: v_mov_b32_e32 v2, s2 121; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 122; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 123; GFX1064-NEXT: ds_add_rtn_u32 v1, v1, v2 124; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 125; GFX1064-NEXT: buffer_gl0_inv 126; GFX1064-NEXT: BB0_2: 127; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 128; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 129; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 130; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 131; GFX1064-NEXT: v_mad_u32_u24 v0, v0, 5, s2 132; GFX1064-NEXT: s_mov_b32 s2, -1 133; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 134; GFX1064-NEXT: s_nop 0 135; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 136; GFX1064-NEXT: s_endpgm 137; 138; GFX1032-LABEL: add_i32_constant: 139; GFX1032: ; %bb.0: ; %entry 140; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 141; GFX1032-NEXT: s_mov_b32 s3, exec_lo 142; GFX1032-NEXT: ; implicit-def: $vgpr1 143; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s3, 0 144; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 145; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo 146; GFX1032-NEXT: s_cbranch_execz BB0_2 147; GFX1032-NEXT: ; %bb.1: 148; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 149; GFX1032-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 150; GFX1032-NEXT: s_mul_i32 s3, s3, 5 151; GFX1032-NEXT: v_mov_b32_e32 v2, s3 152; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 153; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 154; GFX1032-NEXT: ds_add_rtn_u32 v1, v1, v2 155; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 156; GFX1032-NEXT: buffer_gl0_inv 157; GFX1032-NEXT: BB0_2: 158; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 159; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 160; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 161; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 162; GFX1032-NEXT: v_mad_u32_u24 v0, v0, 5, s2 163; GFX1032-NEXT: s_mov_b32 s2, -1 164; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 165; GFX1032-NEXT: s_nop 0 166; GFX1032-NEXT: buffer_store_dword v0, 
off, s[0:3], 0 167; GFX1032-NEXT: s_endpgm 168entry: 169 %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 5 acq_rel 170 store i32 %old, i32 addrspace(1)* %out 171 ret void 172} 173 174define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 %additive) { 175; 176; 177; GFX7LESS-LABEL: add_i32_uniform: 178; GFX7LESS: ; %bb.0: ; %entry 179; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec 180; GFX7LESS-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 181; GFX7LESS-NEXT: s_load_dword s0, s[0:1], 0xb 182; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 183; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 184; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 185; GFX7LESS-NEXT: ; implicit-def: $vgpr1 186; GFX7LESS-NEXT: s_and_saveexec_b64 s[6:7], vcc 187; GFX7LESS-NEXT: s_cbranch_execz BB1_2 188; GFX7LESS-NEXT: ; %bb.1: 189; GFX7LESS-NEXT: s_bcnt1_i32_b64 s1, s[2:3] 190; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 191; GFX7LESS-NEXT: s_mul_i32 s1, s0, s1 192; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 193; GFX7LESS-NEXT: v_mov_b32_e32 v2, s1 194; GFX7LESS-NEXT: s_mov_b32 m0, -1 195; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 196; GFX7LESS-NEXT: ds_add_rtn_u32 v1, v1, v2 197; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 198; GFX7LESS-NEXT: BB1_2: 199; GFX7LESS-NEXT: s_or_b64 exec, exec, s[6:7] 200; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 201; GFX7LESS-NEXT: v_readfirstlane_b32 s1, v1 202; GFX7LESS-NEXT: v_mul_lo_u32 v0, s0, v0 203; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 204; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s1, v0 205; GFX7LESS-NEXT: s_mov_b32 s6, -1 206; GFX7LESS-NEXT: buffer_store_dword v0, off, s[4:7], 0 207; GFX7LESS-NEXT: s_endpgm 208; 209; GFX8-LABEL: add_i32_uniform: 210; GFX8: ; %bb.0: ; %entry 211; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 212; GFX8-NEXT: s_load_dword s0, s[0:1], 0x2c 213; GFX8-NEXT: s_mov_b64 s[2:3], exec 214; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 215; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 216; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 217; GFX8-NEXT: ; 
implicit-def: $vgpr1 218; GFX8-NEXT: s_and_saveexec_b64 s[6:7], vcc 219; GFX8-NEXT: s_cbranch_execz BB1_2 220; GFX8-NEXT: ; %bb.1: 221; GFX8-NEXT: s_bcnt1_i32_b64 s1, s[2:3] 222; GFX8-NEXT: s_waitcnt lgkmcnt(0) 223; GFX8-NEXT: s_mul_i32 s1, s0, s1 224; GFX8-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 225; GFX8-NEXT: v_mov_b32_e32 v2, s1 226; GFX8-NEXT: s_mov_b32 m0, -1 227; GFX8-NEXT: s_waitcnt lgkmcnt(0) 228; GFX8-NEXT: ds_add_rtn_u32 v1, v1, v2 229; GFX8-NEXT: s_waitcnt lgkmcnt(0) 230; GFX8-NEXT: BB1_2: 231; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] 232; GFX8-NEXT: s_waitcnt lgkmcnt(0) 233; GFX8-NEXT: v_mul_lo_u32 v0, s0, v0 234; GFX8-NEXT: v_readfirstlane_b32 s0, v1 235; GFX8-NEXT: s_mov_b32 s7, 0xf000 236; GFX8-NEXT: s_mov_b32 s6, -1 237; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 238; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0 239; GFX8-NEXT: s_endpgm 240; 241; GFX9-LABEL: add_i32_uniform: 242; GFX9: ; %bb.0: ; %entry 243; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 244; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c 245; GFX9-NEXT: s_mov_b64 s[6:7], exec 246; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 247; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 248; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 249; GFX9-NEXT: ; implicit-def: $vgpr1 250; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc 251; GFX9-NEXT: s_cbranch_execz BB1_2 252; GFX9-NEXT: ; %bb.1: 253; GFX9-NEXT: s_bcnt1_i32_b64 s3, s[6:7] 254; GFX9-NEXT: s_waitcnt lgkmcnt(0) 255; GFX9-NEXT: s_mul_i32 s3, s2, s3 256; GFX9-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 257; GFX9-NEXT: v_mov_b32_e32 v2, s3 258; GFX9-NEXT: s_waitcnt lgkmcnt(0) 259; GFX9-NEXT: ds_add_rtn_u32 v1, v1, v2 260; GFX9-NEXT: s_waitcnt lgkmcnt(0) 261; GFX9-NEXT: BB1_2: 262; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] 263; GFX9-NEXT: s_waitcnt lgkmcnt(0) 264; GFX9-NEXT: v_mul_lo_u32 v0, s2, v0 265; GFX9-NEXT: v_readfirstlane_b32 s0, v1 266; GFX9-NEXT: s_mov_b32 s7, 0xf000 267; GFX9-NEXT: s_mov_b32 s6, -1 268; GFX9-NEXT: v_add_u32_e32 v0, s0, v0 269; GFX9-NEXT: 
buffer_store_dword v0, off, s[4:7], 0 270; GFX9-NEXT: s_endpgm 271; 272; GFX1064-LABEL: add_i32_uniform: 273; GFX1064: ; %bb.0: ; %entry 274; GFX1064-NEXT: s_clause 0x1 275; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 276; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x2c 277; GFX1064-NEXT: s_mov_b64 s[6:7], exec 278; GFX1064-NEXT: ; implicit-def: $vgpr1 279; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 280; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s7, v0 281; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 282; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc 283; GFX1064-NEXT: s_cbranch_execz BB1_2 284; GFX1064-NEXT: ; %bb.1: 285; GFX1064-NEXT: s_bcnt1_i32_b64 s3, s[6:7] 286; GFX1064-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 287; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 288; GFX1064-NEXT: s_mul_i32 s3, s2, s3 289; GFX1064-NEXT: v_mov_b32_e32 v2, s3 290; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 291; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 292; GFX1064-NEXT: ds_add_rtn_u32 v1, v1, v2 293; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 294; GFX1064-NEXT: buffer_gl0_inv 295; GFX1064-NEXT: BB1_2: 296; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 297; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] 298; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 299; GFX1064-NEXT: v_mul_lo_u32 v0, s2, v0 300; GFX1064-NEXT: v_readfirstlane_b32 s0, v1 301; GFX1064-NEXT: s_mov_b32 s7, 0x31016000 302; GFX1064-NEXT: s_mov_b32 s6, -1 303; GFX1064-NEXT: v_add_nc_u32_e32 v0, s0, v0 304; GFX1064-NEXT: buffer_store_dword v0, off, s[4:7], 0 305; GFX1064-NEXT: s_endpgm 306; 307; GFX1032-LABEL: add_i32_uniform: 308; GFX1032: ; %bb.0: ; %entry 309; GFX1032-NEXT: s_clause 0x1 310; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 311; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x2c 312; GFX1032-NEXT: s_mov_b32 s3, exec_lo 313; GFX1032-NEXT: ; implicit-def: $vgpr1 314; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s3, 0 315; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 316; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo 317; GFX1032-NEXT: 
s_cbranch_execz BB1_2 318; GFX1032-NEXT: ; %bb.1: 319; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s3 320; GFX1032-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 321; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 322; GFX1032-NEXT: s_mul_i32 s1, s2, s1 323; GFX1032-NEXT: v_mov_b32_e32 v2, s1 324; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 325; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 326; GFX1032-NEXT: ds_add_rtn_u32 v1, v1, v2 327; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 328; GFX1032-NEXT: buffer_gl0_inv 329; GFX1032-NEXT: BB1_2: 330; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 331; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 332; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 333; GFX1032-NEXT: v_mul_lo_u32 v0, s2, v0 334; GFX1032-NEXT: v_readfirstlane_b32 s0, v1 335; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 336; GFX1032-NEXT: s_mov_b32 s6, -1 337; GFX1032-NEXT: v_add_nc_u32_e32 v0, s0, v0 338; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0 339; GFX1032-NEXT: s_endpgm 340entry: 341 %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %additive acq_rel 342 store i32 %old, i32 addrspace(1)* %out 343 ret void 344} 345 346define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out) { 347; 348; 349; GFX7LESS-LABEL: add_i32_varying: 350; GFX7LESS: ; %bb.0: ; %entry 351; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 352; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 353; GFX7LESS-NEXT: s_mov_b32 m0, -1 354; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 355; GFX7LESS-NEXT: ds_add_rtn_u32 v0, v1, v0 356; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 357; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 358; GFX7LESS-NEXT: s_mov_b32 s2, -1 359; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 360; GFX7LESS-NEXT: s_endpgm 361; 362; GFX8-LABEL: add_i32_varying: 363; GFX8: ; %bb.0: ; %entry 364; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 365; GFX8-NEXT: v_mov_b32_e32 v2, v0 366; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 367; GFX8-NEXT: v_mov_b32_e32 v1, 0 368; GFX8-NEXT: s_mov_b64 exec, s[2:3] 369; GFX8-NEXT: 
v_mbcnt_lo_u32_b32 v0, exec_lo, 0 370; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 371; GFX8-NEXT: s_not_b64 exec, exec 372; GFX8-NEXT: v_mov_b32_e32 v2, 0 373; GFX8-NEXT: s_not_b64 exec, exec 374; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 375; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 376; GFX8-NEXT: s_nop 1 377; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 378; GFX8-NEXT: s_nop 1 379; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 380; GFX8-NEXT: s_nop 1 381; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 382; GFX8-NEXT: s_nop 1 383; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 384; GFX8-NEXT: s_nop 1 385; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 386; GFX8-NEXT: v_readlane_b32 s4, v2, 63 387; GFX8-NEXT: s_nop 0 388; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 389; GFX8-NEXT: s_mov_b64 exec, s[2:3] 390; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 391; GFX8-NEXT: ; implicit-def: $vgpr0 392; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 393; GFX8-NEXT: s_cbranch_execz BB2_2 394; GFX8-NEXT: ; %bb.1: 395; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 396; GFX8-NEXT: v_mov_b32_e32 v3, s4 397; GFX8-NEXT: s_mov_b32 m0, -1 398; GFX8-NEXT: s_waitcnt lgkmcnt(0) 399; GFX8-NEXT: ds_add_rtn_u32 v0, v0, v3 400; GFX8-NEXT: s_waitcnt lgkmcnt(0) 401; GFX8-NEXT: BB2_2: 402; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 403; GFX8-NEXT: s_waitcnt lgkmcnt(0) 404; GFX8-NEXT: v_readfirstlane_b32 s2, v0 405; GFX8-NEXT: v_mov_b32_e32 v0, v1 406; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0 407; GFX8-NEXT: s_mov_b32 s3, 0xf000 408; GFX8-NEXT: s_mov_b32 s2, -1 409; GFX8-NEXT: s_nop 0 410; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 411; GFX8-NEXT: s_endpgm 412; 413; GFX9-LABEL: add_i32_varying: 414; GFX9: ; %bb.0: ; 
%entry 415; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 416; GFX9-NEXT: v_mov_b32_e32 v2, v0 417; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 418; GFX9-NEXT: v_mov_b32_e32 v1, 0 419; GFX9-NEXT: s_mov_b64 exec, s[2:3] 420; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 421; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 422; GFX9-NEXT: s_not_b64 exec, exec 423; GFX9-NEXT: v_mov_b32_e32 v2, 0 424; GFX9-NEXT: s_not_b64 exec, exec 425; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 426; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 427; GFX9-NEXT: s_nop 1 428; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 429; GFX9-NEXT: s_nop 1 430; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 431; GFX9-NEXT: s_nop 1 432; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 433; GFX9-NEXT: s_nop 1 434; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 435; GFX9-NEXT: s_nop 1 436; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 437; GFX9-NEXT: v_readlane_b32 s4, v2, 63 438; GFX9-NEXT: s_nop 0 439; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 440; GFX9-NEXT: s_mov_b64 exec, s[2:3] 441; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 442; GFX9-NEXT: ; implicit-def: $vgpr0 443; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 444; GFX9-NEXT: s_cbranch_execz BB2_2 445; GFX9-NEXT: ; %bb.1: 446; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 447; GFX9-NEXT: v_mov_b32_e32 v3, s4 448; GFX9-NEXT: s_waitcnt lgkmcnt(0) 449; GFX9-NEXT: ds_add_rtn_u32 v0, v0, v3 450; GFX9-NEXT: s_waitcnt lgkmcnt(0) 451; GFX9-NEXT: BB2_2: 452; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 453; GFX9-NEXT: s_waitcnt lgkmcnt(0) 454; GFX9-NEXT: v_readfirstlane_b32 s2, v0 455; GFX9-NEXT: v_mov_b32_e32 v0, v1 456; GFX9-NEXT: v_add_u32_e32 v0, s2, v0 457; GFX9-NEXT: s_mov_b32 s3, 0xf000 458; GFX9-NEXT: s_mov_b32 s2, -1 459; 
GFX9-NEXT: s_nop 0 460; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 461; GFX9-NEXT: s_endpgm 462; 463; GFX1064-LABEL: add_i32_varying: 464; GFX1064: ; %bb.0: ; %entry 465; GFX1064-NEXT: v_mov_b32_e32 v1, v0 466; GFX1064-NEXT: s_not_b64 exec, exec 467; GFX1064-NEXT: v_mov_b32_e32 v1, 0 468; GFX1064-NEXT: s_not_b64 exec, exec 469; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 470; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 471; GFX1064-NEXT: v_mov_b32_e32 v3, 0 472; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 473; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 474; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 475; GFX1064-NEXT: v_mov_b32_e32 v2, v1 476; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 477; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 478; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 479; GFX1064-NEXT: v_mov_b32_e32 v2, s4 480; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 481; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 482; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 483; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 484; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 485; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 486; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 487; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 488; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 489; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 490; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 491; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 492; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 493; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 494; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 495; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0 496; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 497; GFX1064-NEXT: v_writelane_b32 v3, 
s6, 48 498; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 499; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 500; GFX1064-NEXT: s_mov_b32 s2, -1 501; GFX1064-NEXT: ; implicit-def: $vgpr0 502; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 503; GFX1064-NEXT: s_cbranch_execz BB2_2 504; GFX1064-NEXT: ; %bb.1: 505; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo 506; GFX1064-NEXT: v_mov_b32_e32 v4, s7 507; GFX1064-NEXT: s_mov_b32 s3, s7 508; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 509; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 510; GFX1064-NEXT: ds_add_rtn_u32 v0, v7, v4 511; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 512; GFX1064-NEXT: buffer_gl0_inv 513; GFX1064-NEXT: BB2_2: 514; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 515; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 516; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 517; GFX1064-NEXT: v_mov_b32_e32 v0, v3 518; GFX1064-NEXT: v_add_nc_u32_e32 v0, s3, v0 519; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 520; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 521; GFX1064-NEXT: s_nop 0 522; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 523; GFX1064-NEXT: s_endpgm 524; 525; GFX1032-LABEL: add_i32_varying: 526; GFX1032: ; %bb.0: ; %entry 527; GFX1032-NEXT: v_mov_b32_e32 v1, v0 528; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 529; GFX1032-NEXT: v_mov_b32_e32 v1, 0 530; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 531; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 532; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 533; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 534; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 535; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 536; GFX1032-NEXT: v_mov_b32_e32 v2, v1 537; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 538; GFX1032-NEXT: s_mov_b32 exec_lo, s2 539; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 540; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 541; 
GFX1032-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 542; GFX1032-NEXT: v_mov_b32_e32 v3, 0 543; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 544; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 545; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 546; GFX1032-NEXT: s_mov_b32 exec_lo, s2 547; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 548; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 549; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 550; GFX1032-NEXT: s_mov_b32 exec_lo, s2 551; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 552; GFX1032-NEXT: s_mov_b32 s2, -1 553; GFX1032-NEXT: ; implicit-def: $vgpr0 554; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 555; GFX1032-NEXT: s_cbranch_execz BB2_2 556; GFX1032-NEXT: ; %bb.1: 557; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo 558; GFX1032-NEXT: v_mov_b32_e32 v4, s4 559; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 560; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 561; GFX1032-NEXT: ds_add_rtn_u32 v0, v7, v4 562; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 563; GFX1032-NEXT: buffer_gl0_inv 564; GFX1032-NEXT: BB2_2: 565; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 566; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 567; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 568; GFX1032-NEXT: v_mov_b32_e32 v0, v3 569; GFX1032-NEXT: v_add_nc_u32_e32 v0, s3, v0 570; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 571; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 572; GFX1032-NEXT: s_nop 0 573; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 574; GFX1032-NEXT: s_endpgm 575entry: 576 %lane = call i32 @llvm.amdgcn.workitem.id.x() 577 %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %lane acq_rel 578 store i32 %old, i32 addrspace(1)* %out 579 ret void 580} 581 582define amdgpu_kernel void @add_i32_varying_gfx1032(i32 addrspace(1)* %out) { 583; 584; 585; GFX7LESS-LABEL: add_i32_varying_gfx1032: 586; GFX7LESS: ; %bb.0: ; %entry 587; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 588; GFX7LESS-NEXT: v_mov_b32_e32 v1, 
local_var32@abs32@lo 589; GFX7LESS-NEXT: s_mov_b32 m0, -1 590; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 591; GFX7LESS-NEXT: ds_add_rtn_u32 v0, v1, v0 592; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 593; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 594; GFX7LESS-NEXT: s_mov_b32 s2, -1 595; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 596; GFX7LESS-NEXT: s_endpgm 597; 598; GFX8-LABEL: add_i32_varying_gfx1032: 599; GFX8: ; %bb.0: ; %entry 600; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 601; GFX8-NEXT: v_mov_b32_e32 v2, v0 602; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 603; GFX8-NEXT: v_mov_b32_e32 v1, 0 604; GFX8-NEXT: s_mov_b64 exec, s[2:3] 605; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 606; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 607; GFX8-NEXT: s_not_b64 exec, exec 608; GFX8-NEXT: v_mov_b32_e32 v2, 0 609; GFX8-NEXT: s_not_b64 exec, exec 610; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 611; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 612; GFX8-NEXT: s_nop 1 613; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 614; GFX8-NEXT: s_nop 1 615; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 616; GFX8-NEXT: s_nop 1 617; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 618; GFX8-NEXT: s_nop 1 619; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 620; GFX8-NEXT: s_nop 1 621; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 622; GFX8-NEXT: v_readlane_b32 s4, v2, 63 623; GFX8-NEXT: s_nop 0 624; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 625; GFX8-NEXT: s_mov_b64 exec, s[2:3] 626; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 627; GFX8-NEXT: ; implicit-def: $vgpr0 628; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 629; GFX8-NEXT: s_cbranch_execz BB3_2 630; GFX8-NEXT: ; %bb.1: 631; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 
632; GFX8-NEXT: v_mov_b32_e32 v3, s4 633; GFX8-NEXT: s_mov_b32 m0, -1 634; GFX8-NEXT: s_waitcnt lgkmcnt(0) 635; GFX8-NEXT: ds_add_rtn_u32 v0, v0, v3 636; GFX8-NEXT: s_waitcnt lgkmcnt(0) 637; GFX8-NEXT: BB3_2: 638; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 639; GFX8-NEXT: s_waitcnt lgkmcnt(0) 640; GFX8-NEXT: v_readfirstlane_b32 s2, v0 641; GFX8-NEXT: v_mov_b32_e32 v0, v1 642; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0 643; GFX8-NEXT: s_mov_b32 s3, 0xf000 644; GFX8-NEXT: s_mov_b32 s2, -1 645; GFX8-NEXT: s_nop 0 646; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 647; GFX8-NEXT: s_endpgm 648; 649; GFX9-LABEL: add_i32_varying_gfx1032: 650; GFX9: ; %bb.0: ; %entry 651; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 652; GFX9-NEXT: v_mov_b32_e32 v2, v0 653; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 654; GFX9-NEXT: v_mov_b32_e32 v1, 0 655; GFX9-NEXT: s_mov_b64 exec, s[2:3] 656; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 657; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 658; GFX9-NEXT: s_not_b64 exec, exec 659; GFX9-NEXT: v_mov_b32_e32 v2, 0 660; GFX9-NEXT: s_not_b64 exec, exec 661; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 662; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 663; GFX9-NEXT: s_nop 1 664; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 665; GFX9-NEXT: s_nop 1 666; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 667; GFX9-NEXT: s_nop 1 668; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 669; GFX9-NEXT: s_nop 1 670; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 671; GFX9-NEXT: s_nop 1 672; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 673; GFX9-NEXT: v_readlane_b32 s4, v2, 63 674; GFX9-NEXT: s_nop 0 675; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 676; GFX9-NEXT: s_mov_b64 exec, s[2:3] 677; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 
678; GFX9-NEXT: ; implicit-def: $vgpr0 679; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 680; GFX9-NEXT: s_cbranch_execz BB3_2 681; GFX9-NEXT: ; %bb.1: 682; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 683; GFX9-NEXT: v_mov_b32_e32 v3, s4 684; GFX9-NEXT: s_waitcnt lgkmcnt(0) 685; GFX9-NEXT: ds_add_rtn_u32 v0, v0, v3 686; GFX9-NEXT: s_waitcnt lgkmcnt(0) 687; GFX9-NEXT: BB3_2: 688; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 689; GFX9-NEXT: s_waitcnt lgkmcnt(0) 690; GFX9-NEXT: v_readfirstlane_b32 s2, v0 691; GFX9-NEXT: v_mov_b32_e32 v0, v1 692; GFX9-NEXT: v_add_u32_e32 v0, s2, v0 693; GFX9-NEXT: s_mov_b32 s3, 0xf000 694; GFX9-NEXT: s_mov_b32 s2, -1 695; GFX9-NEXT: s_nop 0 696; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 697; GFX9-NEXT: s_endpgm 698; 699; GFX1064-LABEL: add_i32_varying_gfx1032: 700; GFX1064: ; %bb.0: ; %entry 701; GFX1064-NEXT: v_mov_b32_e32 v1, v0 702; GFX1064-NEXT: s_not_b64 exec, exec 703; GFX1064-NEXT: v_mov_b32_e32 v1, 0 704; GFX1064-NEXT: s_not_b64 exec, exec 705; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 706; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 707; GFX1064-NEXT: v_mov_b32_e32 v3, 0 708; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 709; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 710; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 711; GFX1064-NEXT: v_mov_b32_e32 v2, v1 712; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 713; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 714; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 715; GFX1064-NEXT: v_mov_b32_e32 v2, s4 716; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 717; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 718; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 719; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 
720; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 721; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 722; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 723; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 724; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 725; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 726; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 727; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 728; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 729; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 730; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 731; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0 732; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 733; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 734; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 735; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 736; GFX1064-NEXT: s_mov_b32 s2, -1 737; GFX1064-NEXT: ; implicit-def: $vgpr0 738; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 739; GFX1064-NEXT: s_cbranch_execz BB3_2 740; GFX1064-NEXT: ; %bb.1: 741; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo 742; GFX1064-NEXT: v_mov_b32_e32 v4, s7 743; GFX1064-NEXT: s_mov_b32 s3, s7 744; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 745; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 746; GFX1064-NEXT: ds_add_rtn_u32 v0, v7, v4 747; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 748; GFX1064-NEXT: buffer_gl0_inv 749; GFX1064-NEXT: BB3_2: 750; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 751; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 752; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 753; GFX1064-NEXT: v_mov_b32_e32 v0, v3 754; GFX1064-NEXT: v_add_nc_u32_e32 v0, s3, v0 755; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 756; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 757; GFX1064-NEXT: s_nop 0 758; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 759; GFX1064-NEXT: s_endpgm 760; 761; GFX1032-LABEL: add_i32_varying_gfx1032: 762; GFX1032: ; %bb.0: ; %entry 763; GFX1032-NEXT: v_mov_b32_e32 v1, v0 764; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 765; GFX1032-NEXT: v_mov_b32_e32 v1, 0 766; GFX1032-NEXT: s_not_b32 exec_lo, 
exec_lo 767; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 768; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 769; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 770; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 771; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 772; GFX1032-NEXT: v_mov_b32_e32 v2, v1 773; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 774; GFX1032-NEXT: s_mov_b32 exec_lo, s2 775; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 776; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 777; GFX1032-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 778; GFX1032-NEXT: v_mov_b32_e32 v3, 0 779; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 780; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 781; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 782; GFX1032-NEXT: s_mov_b32 exec_lo, s2 783; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 784; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 785; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 786; GFX1032-NEXT: s_mov_b32 exec_lo, s2 787; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 788; GFX1032-NEXT: s_mov_b32 s2, -1 789; GFX1032-NEXT: ; implicit-def: $vgpr0 790; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 791; GFX1032-NEXT: s_cbranch_execz BB3_2 792; GFX1032-NEXT: ; %bb.1: 793; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo 794; GFX1032-NEXT: v_mov_b32_e32 v4, s4 795; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 796; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 797; GFX1032-NEXT: ds_add_rtn_u32 v0, v7, v4 798; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 799; GFX1032-NEXT: buffer_gl0_inv 800; GFX1032-NEXT: BB3_2: 801; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 802; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 803; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 804; GFX1032-NEXT: v_mov_b32_e32 v0, v3 805; GFX1032-NEXT: v_add_nc_u32_e32 
v0, s3, v0 806; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 807; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 808; GFX1032-NEXT: s_nop 0 809; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 810; GFX1032-NEXT: s_endpgm 811entry: 812 %lane = call i32 @llvm.amdgcn.workitem.id.x() 813 %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %lane acq_rel 814 store i32 %old, i32 addrspace(1)* %out 815 ret void 816} 817 818define amdgpu_kernel void @add_i32_varying_gfx1064(i32 addrspace(1)* %out) { 819; 820; 821; GFX7LESS-LABEL: add_i32_varying_gfx1064: 822; GFX7LESS: ; %bb.0: ; %entry 823; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 824; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 825; GFX7LESS-NEXT: s_mov_b32 m0, -1 826; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 827; GFX7LESS-NEXT: ds_add_rtn_u32 v0, v1, v0 828; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 829; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 830; GFX7LESS-NEXT: s_mov_b32 s2, -1 831; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 832; GFX7LESS-NEXT: s_endpgm 833; 834; GFX8-LABEL: add_i32_varying_gfx1064: 835; GFX8: ; %bb.0: ; %entry 836; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 837; GFX8-NEXT: v_mov_b32_e32 v2, v0 838; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 839; GFX8-NEXT: v_mov_b32_e32 v1, 0 840; GFX8-NEXT: s_mov_b64 exec, s[2:3] 841; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 842; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 843; GFX8-NEXT: s_not_b64 exec, exec 844; GFX8-NEXT: v_mov_b32_e32 v2, 0 845; GFX8-NEXT: s_not_b64 exec, exec 846; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 847; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 848; GFX8-NEXT: s_nop 1 849; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 850; GFX8-NEXT: s_nop 1 851; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 852; GFX8-NEXT: s_nop 1 853; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf 
bank_mask:0xf bound_ctrl:1 854; GFX8-NEXT: s_nop 1 855; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 856; GFX8-NEXT: s_nop 1 857; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 858; GFX8-NEXT: v_readlane_b32 s4, v2, 63 859; GFX8-NEXT: s_nop 0 860; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 861; GFX8-NEXT: s_mov_b64 exec, s[2:3] 862; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 863; GFX8-NEXT: ; implicit-def: $vgpr0 864; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 865; GFX8-NEXT: s_cbranch_execz BB4_2 866; GFX8-NEXT: ; %bb.1: 867; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 868; GFX8-NEXT: v_mov_b32_e32 v3, s4 869; GFX8-NEXT: s_mov_b32 m0, -1 870; GFX8-NEXT: s_waitcnt lgkmcnt(0) 871; GFX8-NEXT: ds_add_rtn_u32 v0, v0, v3 872; GFX8-NEXT: s_waitcnt lgkmcnt(0) 873; GFX8-NEXT: BB4_2: 874; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 875; GFX8-NEXT: s_waitcnt lgkmcnt(0) 876; GFX8-NEXT: v_readfirstlane_b32 s2, v0 877; GFX8-NEXT: v_mov_b32_e32 v0, v1 878; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0 879; GFX8-NEXT: s_mov_b32 s3, 0xf000 880; GFX8-NEXT: s_mov_b32 s2, -1 881; GFX8-NEXT: s_nop 0 882; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 883; GFX8-NEXT: s_endpgm 884; 885; GFX9-LABEL: add_i32_varying_gfx1064: 886; GFX9: ; %bb.0: ; %entry 887; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 888; GFX9-NEXT: v_mov_b32_e32 v2, v0 889; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 890; GFX9-NEXT: v_mov_b32_e32 v1, 0 891; GFX9-NEXT: s_mov_b64 exec, s[2:3] 892; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 893; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 894; GFX9-NEXT: s_not_b64 exec, exec 895; GFX9-NEXT: v_mov_b32_e32 v2, 0 896; GFX9-NEXT: s_not_b64 exec, exec 897; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 898; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 899; GFX9-NEXT: s_nop 1 900; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 
bound_ctrl:1 901; GFX9-NEXT: s_nop 1 902; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 903; GFX9-NEXT: s_nop 1 904; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 905; GFX9-NEXT: s_nop 1 906; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 907; GFX9-NEXT: s_nop 1 908; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 909; GFX9-NEXT: v_readlane_b32 s4, v2, 63 910; GFX9-NEXT: s_nop 0 911; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 912; GFX9-NEXT: s_mov_b64 exec, s[2:3] 913; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 914; GFX9-NEXT: ; implicit-def: $vgpr0 915; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 916; GFX9-NEXT: s_cbranch_execz BB4_2 917; GFX9-NEXT: ; %bb.1: 918; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 919; GFX9-NEXT: v_mov_b32_e32 v3, s4 920; GFX9-NEXT: s_waitcnt lgkmcnt(0) 921; GFX9-NEXT: ds_add_rtn_u32 v0, v0, v3 922; GFX9-NEXT: s_waitcnt lgkmcnt(0) 923; GFX9-NEXT: BB4_2: 924; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 925; GFX9-NEXT: s_waitcnt lgkmcnt(0) 926; GFX9-NEXT: v_readfirstlane_b32 s2, v0 927; GFX9-NEXT: v_mov_b32_e32 v0, v1 928; GFX9-NEXT: v_add_u32_e32 v0, s2, v0 929; GFX9-NEXT: s_mov_b32 s3, 0xf000 930; GFX9-NEXT: s_mov_b32 s2, -1 931; GFX9-NEXT: s_nop 0 932; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 933; GFX9-NEXT: s_endpgm 934; 935; GFX1064-LABEL: add_i32_varying_gfx1064: 936; GFX1064: ; %bb.0: ; %entry 937; GFX1064-NEXT: v_mov_b32_e32 v1, v0 938; GFX1064-NEXT: s_not_b64 exec, exec 939; GFX1064-NEXT: v_mov_b32_e32 v1, 0 940; GFX1064-NEXT: s_not_b64 exec, exec 941; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 942; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 943; GFX1064-NEXT: v_mov_b32_e32 v3, 0 944; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 945; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, 
v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 946; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 947; GFX1064-NEXT: v_mov_b32_e32 v2, v1 948; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 949; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 950; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 951; GFX1064-NEXT: v_mov_b32_e32 v2, s4 952; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 953; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 954; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 955; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 956; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 957; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 958; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 959; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 960; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 961; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 962; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 963; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 964; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 965; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 966; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 967; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0 968; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 969; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 970; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 971; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 972; GFX1064-NEXT: s_mov_b32 s2, -1 973; GFX1064-NEXT: ; implicit-def: $vgpr0 974; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 975; GFX1064-NEXT: s_cbranch_execz BB4_2 976; GFX1064-NEXT: ; %bb.1: 977; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo 978; GFX1064-NEXT: v_mov_b32_e32 v4, s7 979; GFX1064-NEXT: s_mov_b32 s3, s7 980; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 981; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 982; GFX1064-NEXT: ds_add_rtn_u32 v0, v7, v4 983; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 984; GFX1064-NEXT: buffer_gl0_inv 985; 
GFX1064-NEXT: BB4_2: 986; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 987; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 988; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 989; GFX1064-NEXT: v_mov_b32_e32 v0, v3 990; GFX1064-NEXT: v_add_nc_u32_e32 v0, s3, v0 991; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 992; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 993; GFX1064-NEXT: s_nop 0 994; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 995; GFX1064-NEXT: s_endpgm 996; 997; GFX1032-LABEL: add_i32_varying_gfx1064: 998; GFX1032: ; %bb.0: ; %entry 999; GFX1032-NEXT: v_mov_b32_e32 v1, v0 1000; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 1001; GFX1032-NEXT: v_mov_b32_e32 v1, 0 1002; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 1003; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 1004; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 1005; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 1006; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 1007; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 1008; GFX1032-NEXT: v_mov_b32_e32 v2, v1 1009; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 1010; GFX1032-NEXT: s_mov_b32 exec_lo, s2 1011; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1012; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 1013; GFX1032-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 1014; GFX1032-NEXT: v_mov_b32_e32 v3, 0 1015; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 1016; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 1017; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 1018; GFX1032-NEXT: s_mov_b32 exec_lo, s2 1019; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 1020; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 1021; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 1022; GFX1032-NEXT: s_mov_b32 exec_lo, s2 1023; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 1024; GFX1032-NEXT: s_mov_b32 s2, -1 
1025; GFX1032-NEXT: ; implicit-def: $vgpr0 1026; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 1027; GFX1032-NEXT: s_cbranch_execz BB4_2 1028; GFX1032-NEXT: ; %bb.1: 1029; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo 1030; GFX1032-NEXT: v_mov_b32_e32 v4, s4 1031; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1032; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 1033; GFX1032-NEXT: ds_add_rtn_u32 v0, v7, v4 1034; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1035; GFX1032-NEXT: buffer_gl0_inv 1036; GFX1032-NEXT: BB4_2: 1037; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 1038; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 1039; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 1040; GFX1032-NEXT: v_mov_b32_e32 v0, v3 1041; GFX1032-NEXT: v_add_nc_u32_e32 v0, s3, v0 1042; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 1043; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1044; GFX1032-NEXT: s_nop 0 1045; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 1046; GFX1032-NEXT: s_endpgm 1047entry: 1048 %lane = call i32 @llvm.amdgcn.workitem.id.x() 1049 %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %lane acq_rel 1050 store i32 %old, i32 addrspace(1)* %out 1051 ret void 1052} 1053 1054define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) { 1055; 1056; 1057; GFX7LESS-LABEL: add_i64_constant: 1058; GFX7LESS: ; %bb.0: ; %entry 1059; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec 1060; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1061; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 1062; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 1063; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1064; GFX7LESS-NEXT: ; implicit-def: $vgpr1_vgpr2 1065; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc 1066; GFX7LESS-NEXT: s_cbranch_execz BB5_2 1067; GFX7LESS-NEXT: ; %bb.1: 1068; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 1069; GFX7LESS-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 1070; GFX7LESS-NEXT: s_mul_i32 s5, s4, 5 1071; GFX7LESS-NEXT: v_mul_hi_u32_u24_e64 v2, s4, 5 1072; GFX7LESS-NEXT: v_mov_b32_e32 v1, s5 1073; 
GFX7LESS-NEXT: s_mov_b32 m0, -1 1074; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1075; GFX7LESS-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] 1076; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1077; GFX7LESS-NEXT: BB5_2: 1078; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] 1079; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1080; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v1 1081; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v2 1082; GFX7LESS-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v0 1083; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1084; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 1085; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4 1086; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s2, v0 1087; GFX7LESS-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc 1088; GFX7LESS-NEXT: s_mov_b32 s2, -1 1089; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1090; GFX7LESS-NEXT: s_endpgm 1091; 1092; GFX8-LABEL: add_i64_constant: 1093; GFX8: ; %bb.0: ; %entry 1094; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1095; GFX8-NEXT: s_mov_b64 s[4:5], exec 1096; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 1097; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 1098; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1099; GFX8-NEXT: ; implicit-def: $vgpr1_vgpr2 1100; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 1101; GFX8-NEXT: s_cbranch_execz BB5_2 1102; GFX8-NEXT: ; %bb.1: 1103; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 1104; GFX8-NEXT: v_mul_hi_u32_u24_e64 v2, s4, 5 1105; GFX8-NEXT: s_mul_i32 s4, s4, 5 1106; GFX8-NEXT: v_mov_b32_e32 v1, s4 1107; GFX8-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 1108; GFX8-NEXT: s_mov_b32 m0, -1 1109; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1110; GFX8-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] 1111; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1112; GFX8-NEXT: BB5_2: 1113; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 1114; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1115; GFX8-NEXT: v_readfirstlane_b32 s2, v1 1116; GFX8-NEXT: v_readfirstlane_b32 s3, v2 1117; GFX8-NEXT: v_mov_b32_e32 v1, s2 1118; GFX8-NEXT: v_mov_b32_e32 v2, s3 1119; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v0, 5, v[1:2] 1120; 
GFX8-NEXT: s_mov_b32 s3, 0xf000 1121; GFX8-NEXT: s_mov_b32 s2, -1 1122; GFX8-NEXT: s_nop 2 1123; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1124; GFX8-NEXT: s_endpgm 1125; 1126; GFX9-LABEL: add_i64_constant: 1127; GFX9: ; %bb.0: ; %entry 1128; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1129; GFX9-NEXT: s_mov_b64 s[4:5], exec 1130; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 1131; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 1132; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1133; GFX9-NEXT: ; implicit-def: $vgpr1_vgpr2 1134; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 1135; GFX9-NEXT: s_cbranch_execz BB5_2 1136; GFX9-NEXT: ; %bb.1: 1137; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 1138; GFX9-NEXT: v_mul_hi_u32_u24_e64 v2, s4, 5 1139; GFX9-NEXT: s_mul_i32 s4, s4, 5 1140; GFX9-NEXT: v_mov_b32_e32 v1, s4 1141; GFX9-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 1142; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1143; GFX9-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] 1144; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1145; GFX9-NEXT: BB5_2: 1146; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 1147; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1148; GFX9-NEXT: v_readfirstlane_b32 s2, v1 1149; GFX9-NEXT: v_readfirstlane_b32 s3, v2 1150; GFX9-NEXT: v_mov_b32_e32 v1, s2 1151; GFX9-NEXT: v_mov_b32_e32 v2, s3 1152; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v0, 5, v[1:2] 1153; GFX9-NEXT: s_mov_b32 s3, 0xf000 1154; GFX9-NEXT: s_mov_b32 s2, -1 1155; GFX9-NEXT: s_nop 2 1156; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1157; GFX9-NEXT: s_endpgm 1158; 1159; GFX1064-LABEL: add_i64_constant: 1160; GFX1064: ; %bb.0: ; %entry 1161; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1162; GFX1064-NEXT: s_mov_b64 s[4:5], exec 1163; GFX1064-NEXT: ; implicit-def: $vgpr1_vgpr2 1164; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 1165; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s5, v0 1166; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1167; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc 1168; GFX1064-NEXT: s_cbranch_execz BB5_2 1169; GFX1064-NEXT: 
; %bb.1: 1170; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 1171; GFX1064-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 1172; GFX1064-NEXT: s_mul_i32 s5, s4, 5 1173; GFX1064-NEXT: v_mul_hi_u32_u24_e64 v2, s4, 5 1174; GFX1064-NEXT: v_mov_b32_e32 v1, s5 1175; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1176; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 1177; GFX1064-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] 1178; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1179; GFX1064-NEXT: buffer_gl0_inv 1180; GFX1064-NEXT: BB5_2: 1181; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 1182; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] 1183; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 1184; GFX1064-NEXT: v_readfirstlane_b32 s3, v2 1185; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v0, 5, s[2:3] 1186; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 1187; GFX1064-NEXT: s_mov_b32 s2, -1 1188; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1189; GFX1064-NEXT: s_nop 1 1190; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1191; GFX1064-NEXT: s_endpgm 1192; 1193; GFX1032-LABEL: add_i64_constant: 1194; GFX1032: ; %bb.0: ; %entry 1195; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1196; GFX1032-NEXT: s_mov_b32 s3, exec_lo 1197; GFX1032-NEXT: ; implicit-def: $vgpr1_vgpr2 1198; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s3, 0 1199; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 1200; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo 1201; GFX1032-NEXT: s_cbranch_execz BB5_2 1202; GFX1032-NEXT: ; %bb.1: 1203; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 1204; GFX1032-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 1205; GFX1032-NEXT: s_mul_i32 s4, s3, 5 1206; GFX1032-NEXT: v_mul_hi_u32_u24_e64 v2, s3, 5 1207; GFX1032-NEXT: v_mov_b32_e32 v1, s4 1208; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1209; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 1210; GFX1032-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] 1211; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1212; GFX1032-NEXT: buffer_gl0_inv 1213; GFX1032-NEXT: BB5_2: 1214; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 1215; GFX1032-NEXT: 
s_or_b32 exec_lo, exec_lo, s2 1216; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 1217; GFX1032-NEXT: v_readfirstlane_b32 s3, v2 1218; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s2, v0, 5, s[2:3] 1219; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 1220; GFX1032-NEXT: s_mov_b32 s2, -1 1221; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1222; GFX1032-NEXT: s_nop 1 1223; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1224; GFX1032-NEXT: s_endpgm 1225entry: 1226 %old = atomicrmw add i64 addrspace(3)* @local_var64, i64 5 acq_rel 1227 store i64 %old, i64 addrspace(1)* %out 1228 ret void 1229} 1230 1231define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 %additive) { 1232; 1233; 1234; GFX7LESS-LABEL: add_i64_uniform: 1235; GFX7LESS: ; %bb.0: ; %entry 1236; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec 1237; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 1238; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 1239; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s7, v0 1240; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1241; GFX7LESS-NEXT: ; implicit-def: $vgpr1_vgpr2 1242; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc 1243; GFX7LESS-NEXT: s_cbranch_execz BB6_2 1244; GFX7LESS-NEXT: ; %bb.1: 1245; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 1246; GFX7LESS-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 1247; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1248; GFX7LESS-NEXT: s_mul_i32 s7, s3, s6 1249; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6 1250; GFX7LESS-NEXT: v_mul_hi_u32 v1, s2, v1 1251; GFX7LESS-NEXT: s_mul_i32 s6, s2, s6 1252; GFX7LESS-NEXT: v_add_i32_e32 v2, vcc, s7, v1 1253; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6 1254; GFX7LESS-NEXT: s_mov_b32 m0, -1 1255; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1256; GFX7LESS-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] 1257; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1258; GFX7LESS-NEXT: BB6_2: 1259; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] 1260; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 1261; GFX7LESS-NEXT: s_mov_b32 s6, -1 1262; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1263; GFX7LESS-NEXT: 
s_mov_b32 s4, s0 1264; GFX7LESS-NEXT: s_mov_b32 s5, s1 1265; GFX7LESS-NEXT: v_readfirstlane_b32 s0, v1 1266; GFX7LESS-NEXT: v_readfirstlane_b32 s1, v2 1267; GFX7LESS-NEXT: v_mul_lo_u32 v1, s3, v0 1268; GFX7LESS-NEXT: v_mul_hi_u32 v2, s2, v0 1269; GFX7LESS-NEXT: v_mul_lo_u32 v0, s2, v0 1270; GFX7LESS-NEXT: v_add_i32_e32 v1, vcc, v2, v1 1271; GFX7LESS-NEXT: v_mov_b32_e32 v2, s1 1272; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s0, v0 1273; GFX7LESS-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc 1274; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1275; GFX7LESS-NEXT: s_endpgm 1276; 1277; GFX8-LABEL: add_i64_uniform: 1278; GFX8: ; %bb.0: ; %entry 1279; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1280; GFX8-NEXT: s_mov_b64 s[6:7], exec 1281; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 1282; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 1283; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1284; GFX8-NEXT: ; implicit-def: $vgpr1_vgpr2 1285; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 1286; GFX8-NEXT: s_cbranch_execz BB6_2 1287; GFX8-NEXT: ; %bb.1: 1288; GFX8-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 1289; GFX8-NEXT: v_mov_b32_e32 v1, s6 1290; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1291; GFX8-NEXT: v_mul_hi_u32 v1, s2, v1 1292; GFX8-NEXT: s_mul_i32 s7, s3, s6 1293; GFX8-NEXT: s_mul_i32 s6, s2, s6 1294; GFX8-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 1295; GFX8-NEXT: v_add_u32_e32 v2, vcc, s7, v1 1296; GFX8-NEXT: v_mov_b32_e32 v1, s6 1297; GFX8-NEXT: s_mov_b32 m0, -1 1298; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1299; GFX8-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] 1300; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1301; GFX8-NEXT: BB6_2: 1302; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 1303; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1304; GFX8-NEXT: s_mov_b32 s4, s0 1305; GFX8-NEXT: v_readfirstlane_b32 s0, v1 1306; GFX8-NEXT: v_mul_lo_u32 v1, s3, v0 1307; GFX8-NEXT: v_mul_hi_u32 v3, s2, v0 1308; GFX8-NEXT: v_mul_lo_u32 v0, s2, v0 1309; GFX8-NEXT: s_mov_b32 s5, s1 1310; GFX8-NEXT: v_readfirstlane_b32 s1, v2 1311; GFX8-NEXT: v_add_u32_e32 
v1, vcc, v3, v1 1312; GFX8-NEXT: v_mov_b32_e32 v2, s1 1313; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 1314; GFX8-NEXT: s_mov_b32 s7, 0xf000 1315; GFX8-NEXT: s_mov_b32 s6, -1 1316; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc 1317; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1318; GFX8-NEXT: s_endpgm 1319; 1320; GFX9-LABEL: add_i64_uniform: 1321; GFX9: ; %bb.0: ; %entry 1322; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1323; GFX9-NEXT: s_mov_b64 s[6:7], exec 1324; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 1325; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 1326; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1327; GFX9-NEXT: ; implicit-def: $vgpr1_vgpr2 1328; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 1329; GFX9-NEXT: s_cbranch_execz BB6_2 1330; GFX9-NEXT: ; %bb.1: 1331; GFX9-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 1332; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1333; GFX9-NEXT: s_mul_i32 s7, s3, s6 1334; GFX9-NEXT: s_mul_hi_u32 s8, s2, s6 1335; GFX9-NEXT: s_add_i32 s8, s8, s7 1336; GFX9-NEXT: s_mul_i32 s6, s2, s6 1337; GFX9-NEXT: v_mov_b32_e32 v1, s6 1338; GFX9-NEXT: v_mov_b32_e32 v2, s8 1339; GFX9-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 1340; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1341; GFX9-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] 1342; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1343; GFX9-NEXT: BB6_2: 1344; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 1345; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1346; GFX9-NEXT: v_mul_lo_u32 v3, s3, v0 1347; GFX9-NEXT: v_mul_hi_u32 v4, s2, v0 1348; GFX9-NEXT: v_mul_lo_u32 v0, s2, v0 1349; GFX9-NEXT: s_mov_b32 s4, s0 1350; GFX9-NEXT: v_readfirstlane_b32 s0, v1 1351; GFX9-NEXT: s_mov_b32 s5, s1 1352; GFX9-NEXT: v_readfirstlane_b32 s1, v2 1353; GFX9-NEXT: v_add_u32_e32 v1, v4, v3 1354; GFX9-NEXT: v_mov_b32_e32 v2, s1 1355; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 1356; GFX9-NEXT: s_mov_b32 s7, 0xf000 1357; GFX9-NEXT: s_mov_b32 s6, -1 1358; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc 1359; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1360; GFX9-NEXT: 
s_endpgm 1361; 1362; GFX1064-LABEL: add_i64_uniform: 1363; GFX1064: ; %bb.0: ; %entry 1364; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1365; GFX1064-NEXT: s_mov_b64 s[6:7], exec 1366; GFX1064-NEXT: ; implicit-def: $vgpr1_vgpr2 1367; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 1368; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s7, v0 1369; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1370; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 1371; GFX1064-NEXT: s_cbranch_execz BB6_2 1372; GFX1064-NEXT: ; %bb.1: 1373; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 1374; GFX1064-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 1375; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1376; GFX1064-NEXT: s_mul_i32 s7, s3, s6 1377; GFX1064-NEXT: s_mul_hi_u32 s8, s2, s6 1378; GFX1064-NEXT: s_mul_i32 s6, s2, s6 1379; GFX1064-NEXT: s_add_i32 s8, s8, s7 1380; GFX1064-NEXT: v_mov_b32_e32 v1, s6 1381; GFX1064-NEXT: v_mov_b32_e32 v2, s8 1382; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1383; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 1384; GFX1064-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] 1385; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1386; GFX1064-NEXT: buffer_gl0_inv 1387; GFX1064-NEXT: BB6_2: 1388; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 1389; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 1390; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1391; GFX1064-NEXT: v_mul_lo_u32 v3, s3, v0 1392; GFX1064-NEXT: v_mul_hi_u32 v4, s2, v0 1393; GFX1064-NEXT: v_mul_lo_u32 v0, s2, v0 1394; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 1395; GFX1064-NEXT: v_readfirstlane_b32 s4, v2 1396; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 1397; GFX1064-NEXT: v_add_nc_u32_e32 v1, v4, v3 1398; GFX1064-NEXT: v_add_co_u32_e64 v0, vcc, s2, v0 1399; GFX1064-NEXT: s_mov_b32 s2, -1 1400; GFX1064-NEXT: v_add_co_ci_u32_e32 v1, vcc, s4, v1, vcc 1401; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1402; GFX1064-NEXT: s_endpgm 1403; 1404; GFX1032-LABEL: add_i64_uniform: 1405; GFX1032: ; %bb.0: ; %entry 1406; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1407; 
GFX1032-NEXT: s_mov_b32 s5, exec_lo 1408; GFX1032-NEXT: ; implicit-def: $vgpr1_vgpr2 1409; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s5, 0 1410; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 1411; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo 1412; GFX1032-NEXT: s_cbranch_execz BB6_2 1413; GFX1032-NEXT: ; %bb.1: 1414; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s5 1415; GFX1032-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 1416; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1417; GFX1032-NEXT: s_mul_i32 s6, s3, s5 1418; GFX1032-NEXT: s_mul_hi_u32 s7, s2, s5 1419; GFX1032-NEXT: s_mul_i32 s5, s2, s5 1420; GFX1032-NEXT: s_add_i32 s7, s7, s6 1421; GFX1032-NEXT: v_mov_b32_e32 v1, s5 1422; GFX1032-NEXT: v_mov_b32_e32 v2, s7 1423; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1424; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 1425; GFX1032-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] 1426; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1427; GFX1032-NEXT: buffer_gl0_inv 1428; GFX1032-NEXT: BB6_2: 1429; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 1430; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 1431; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1432; GFX1032-NEXT: v_mul_lo_u32 v3, s3, v0 1433; GFX1032-NEXT: v_mul_hi_u32 v4, s2, v0 1434; GFX1032-NEXT: v_mul_lo_u32 v0, s2, v0 1435; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 1436; GFX1032-NEXT: v_readfirstlane_b32 s4, v2 1437; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 1438; GFX1032-NEXT: v_add_nc_u32_e32 v1, v4, v3 1439; GFX1032-NEXT: v_add_co_u32_e64 v0, vcc_lo, s2, v0 1440; GFX1032-NEXT: s_mov_b32 s2, -1 1441; GFX1032-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s4, v1, vcc_lo 1442; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1443; GFX1032-NEXT: s_endpgm 1444entry: 1445 %old = atomicrmw add i64 addrspace(3)* @local_var64, i64 %additive acq_rel 1446 store i64 %old, i64 addrspace(1)* %out 1447 ret void 1448} 1449 1450; The 64-bit varying add below is expected to stay unoptimized: its output must 1451; not contain v_mbcnt_lo_u32_b32, v_mbcnt_hi_u32_b32 or s_bcnt1_i32_b64. 1452; The autogenerated -NEXT checks in the function enforce this; no RUN line enables a GCN check prefix, so NOT directives under it were never evaluated. 1453define amdgpu_kernel void @add_i64_varying(i64 addrspace(1)* %out) { 1454; 
1455; 1456; GFX7LESS-LABEL: add_i64_varying: 1457; GFX7LESS: ; %bb.0: ; %entry 1458; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1459; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 1460; GFX7LESS-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 1461; GFX7LESS-NEXT: s_mov_b32 m0, -1 1462; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1463; GFX7LESS-NEXT: ds_add_rtn_u64 v[0:1], v2, v[0:1] 1464; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1465; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 1466; GFX7LESS-NEXT: s_mov_b32 s2, -1 1467; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1468; GFX7LESS-NEXT: s_endpgm 1469; 1470; GFX8-LABEL: add_i64_varying: 1471; GFX8: ; %bb.0: ; %entry 1472; GFX8-NEXT: v_mov_b32_e32 v1, 0 1473; GFX8-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 1474; GFX8-NEXT: s_mov_b32 m0, -1 1475; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1476; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1477; GFX8-NEXT: ds_add_rtn_u64 v[0:1], v2, v[0:1] 1478; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1479; GFX8-NEXT: s_mov_b32 s3, 0xf000 1480; GFX8-NEXT: s_mov_b32 s2, -1 1481; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1482; GFX8-NEXT: s_endpgm 1483; 1484; GFX9-LABEL: add_i64_varying: 1485; GFX9: ; %bb.0: ; %entry 1486; GFX9-NEXT: v_mov_b32_e32 v1, 0 1487; GFX9-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 1488; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1489; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1490; GFX9-NEXT: ds_add_rtn_u64 v[0:1], v2, v[0:1] 1491; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1492; GFX9-NEXT: s_mov_b32 s3, 0xf000 1493; GFX9-NEXT: s_mov_b32 s2, -1 1494; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1495; GFX9-NEXT: s_endpgm 1496; 1497; GFX1064-LABEL: add_i64_varying: 1498; GFX1064: ; %bb.0: ; %entry 1499; GFX1064-NEXT: v_mov_b32_e32 v1, 0 1500; GFX1064-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 1501; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1502; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 1503; GFX1064-NEXT: s_mov_b32 s2, -1 1504; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1505; 
GFX1064-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1064-NEXT: ds_add_rtn_u64 v[0:1], v2, v[0:1]
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: add_i64_varying:
; GFX1032: ; %bb.0: ; %entry
; GFX1032-NEXT: v_mov_b32_e32 v1, 0
; GFX1032-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo
; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
; GFX1032-NEXT: s_mov_b32 s2, -1
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1032-NEXT: ds_add_rtn_u64 v[0:1], v2, v[0:1]
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX1032-NEXT: s_endpgm
entry:
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %zext = zext i32 %lane to i64
  %old = atomicrmw add i64 addrspace(3)* @local_var64, i64 %zext acq_rel
  store i64 %old, i64 addrspace(1)* %out
  ret void
}

; Test: 32-bit `atomicrmw sub` of the constant 5 on the addrspace(3) global
; @local_var32 (acq_rel); the value returned by the atomic is stored to %out.
; In the expected assembly for every RUN configuration the ds_sub_rtn_u32 sits
; in a block that is skipped unless the lane's mbcnt result is 0, its operand
; is s_bcnt1(exec) * 5, and the stored value is readfirstlane(result) minus
; 5 * mbcnt.
define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out) {
;
;
; GFX7LESS-LABEL: sub_i32_constant:
; GFX7LESS: ; %bb.0: ; %entry
; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec
; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0
; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0
; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX7LESS-NEXT: ; implicit-def: $vgpr1
; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX7LESS-NEXT: s_cbranch_execz BB8_2
; GFX7LESS-NEXT: ; %bb.1:
; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX7LESS-NEXT: s_mul_i32 s2, s2, 5
; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo
; GFX7LESS-NEXT: v_mov_b32_e32 v2, s2
; GFX7LESS-NEXT: s_mov_b32 m0, -1
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: ds_sub_rtn_u32 v1, v1, v2
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: BB8_2:
; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v1
; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v0
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s2, v0
; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX7LESS-NEXT: s_endpgm
;
; GFX8-LABEL: sub_i32_constant:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: s_mov_b64 s[2:3], exec
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX8-NEXT: ; implicit-def: $vgpr1
; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX8-NEXT: s_cbranch_execz BB8_2
; GFX8-NEXT: ; %bb.1:
; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX8-NEXT: s_mul_i32 s2, s2, 5
; GFX8-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo
; GFX8-NEXT: v_mov_b32_e32 v2, s2
; GFX8-NEXT: s_mov_b32 m0, -1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: ds_sub_rtn_u32 v1, v1, v2
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: BB8_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s2, v1
; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v0
; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s2, v0
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
; GFX8-NEXT: s_nop 0
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: sub_i32_constant:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_mov_b64 s[2:3], exec
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: ; implicit-def: $vgpr1
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: s_cbranch_execz BB8_2
; GFX9-NEXT: ; %bb.1:
; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX9-NEXT: s_mul_i32 s2, s2, 5
; GFX9-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo
; GFX9-NEXT: v_mov_b32_e32 v2, s2
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: ds_sub_rtn_u32 v1, v1, v2
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: BB8_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s2, v1
; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v0
; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: sub_i32_constant:
; GFX1064: ; %bb.0: ; %entry
; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1064-NEXT: s_mov_b64 s[2:3], exec
; GFX1064-NEXT: ; implicit-def: $vgpr1
; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0
; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s3, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1064-NEXT: s_cbranch_execz BB8_2
; GFX1064-NEXT: ; %bb.1:
; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX1064-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo
; GFX1064-NEXT: s_mul_i32 s2, s2, 5
; GFX1064-NEXT: v_mov_b32_e32 v2, s2
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1064-NEXT: ds_sub_rtn_u32 v1, v1, v2
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: BB8_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX1064-NEXT: v_readfirstlane_b32 s2, v1
; GFX1064-NEXT: v_mul_u32_u24_e32 v0, 5, v0
; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s2, v0
; GFX1064-NEXT: s_mov_b32 s2, -1
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: sub_i32_constant:
; GFX1032: ; %bb.0: ; %entry
; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1032-NEXT: s_mov_b32 s3, exec_lo
; GFX1032-NEXT: ; implicit-def: $vgpr1
; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s3, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-NEXT: s_cbranch_execz BB8_2
; GFX1032-NEXT: ; %bb.1:
; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3
; GFX1032-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo
; GFX1032-NEXT: s_mul_i32 s3, s3, 5
; GFX1032-NEXT: v_mov_b32_e32 v2, s3
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1032-NEXT: ds_sub_rtn_u32 v1, v1, v2
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: BB8_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2
; GFX1032-NEXT: v_readfirstlane_b32 s2, v1
; GFX1032-NEXT: v_mul_u32_u24_e32 v0, 5, v0
; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s2, v0
; GFX1032-NEXT: s_mov_b32 s2, -1
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX1032-NEXT: s_endpgm
entry:
  %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 5 acq_rel
  store i32 %old, i32 addrspace(1)* %out
  ret void
}

; Test: 32-bit `atomicrmw sub` on @local_var32 whose operand is the kernel
; argument %subitive (acq_rel); the returned value is stored to %out.
; The expected assembly guards one ds_sub_rtn_u32 on mbcnt == 0, gives it the
; operand %subitive * s_bcnt1(exec) (s_mul_i32), and subtracts
; %subitive * mbcnt (v_mul_lo_u32) from readfirstlane(result) for the store.
define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 %subitive) {
;
;
; GFX7LESS-LABEL: sub_i32_uniform:
; GFX7LESS: ; %bb.0: ; %entry
; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec
; GFX7LESS-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; GFX7LESS-NEXT: s_load_dword s0, s[0:1], 0xb
; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0
; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0
; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX7LESS-NEXT: ; implicit-def: $vgpr1
; GFX7LESS-NEXT: s_and_saveexec_b64 s[6:7], vcc
; GFX7LESS-NEXT: s_cbranch_execz BB9_2
; GFX7LESS-NEXT: ; %bb.1:
; GFX7LESS-NEXT: s_bcnt1_i32_b64 s1, s[2:3]
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: s_mul_i32 s1, s0, s1
; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo
; GFX7LESS-NEXT: v_mov_b32_e32 v2, s1
; GFX7LESS-NEXT: s_mov_b32 m0, -1
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: ds_sub_rtn_u32 v1, v1, v2
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: BB9_2:
; GFX7LESS-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: v_readfirstlane_b32 s1, v1
; GFX7LESS-NEXT: v_mul_lo_u32 v0, s0, v0
; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000
; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s1, v0
; GFX7LESS-NEXT: s_mov_b32 s6, -1
; GFX7LESS-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX7LESS-NEXT: s_endpgm
;
; GFX8-LABEL: sub_i32_uniform:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX8-NEXT: s_load_dword s0, s[0:1], 0x2c
; GFX8-NEXT: s_mov_b64 s[2:3], exec
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX8-NEXT: ; implicit-def: $vgpr1
; GFX8-NEXT: s_and_saveexec_b64 s[6:7], vcc
; GFX8-NEXT: s_cbranch_execz BB9_2
; GFX8-NEXT: ; %bb.1:
; GFX8-NEXT: s_bcnt1_i32_b64 s1, s[2:3]
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_mul_i32 s1, s0, s1
; GFX8-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo
; GFX8-NEXT: v_mov_b32_e32 v2, s1
; GFX8-NEXT: s_mov_b32 m0, -1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: ds_sub_rtn_u32 v1, v1, v2
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: BB9_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mul_lo_u32 v0, s0, v0
; GFX8-NEXT: v_readfirstlane_b32 s0, v1
; GFX8-NEXT: s_mov_b32 s7, 0xf000
; GFX8-NEXT: s_mov_b32 s6, -1
; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s0, v0
; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: sub_i32_uniform:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
; GFX9-NEXT: s_mov_b64 s[6:7], exec
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: ; implicit-def: $vgpr1
; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX9-NEXT: s_cbranch_execz BB9_2
; GFX9-NEXT: ; %bb.1:
; GFX9-NEXT: s_bcnt1_i32_b64 s3, s[6:7]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_mul_i32 s3, s2, s3
; GFX9-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo
; GFX9-NEXT: v_mov_b32_e32 v2, s3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: ds_sub_rtn_u32 v1, v1, v2
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: BB9_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mul_lo_u32 v0, s2, v0
; GFX9-NEXT: v_readfirstlane_b32 s0, v1
; GFX9-NEXT: s_mov_b32 s7, 0xf000
; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: v_sub_u32_e32 v0, s0, v0
; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: sub_i32_uniform:
; GFX1064: ; %bb.0: ; %entry
; GFX1064-NEXT: s_clause 0x1
; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x2c
; GFX1064-NEXT: s_mov_b64 s[6:7], exec
; GFX1064-NEXT: ; implicit-def: $vgpr1
; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0
; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s7, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX1064-NEXT: s_cbranch_execz BB9_2
; GFX1064-NEXT: ; %bb.1:
; GFX1064-NEXT: s_bcnt1_i32_b64 s3, s[6:7]
; GFX1064-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: s_mul_i32 s3, s2, s3
; GFX1064-NEXT: v_mov_b32_e32 v2, s3
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1064-NEXT: ds_sub_rtn_u32 v1, v1, v2
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: BB9_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: v_mul_lo_u32 v0, s2, v0
; GFX1064-NEXT: v_readfirstlane_b32 s0, v1
; GFX1064-NEXT: s_mov_b32 s7, 0x31016000
; GFX1064-NEXT: s_mov_b32 s6, -1
; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s0, v0
; GFX1064-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: sub_i32_uniform:
; GFX1032: ; %bb.0: ; %entry
; GFX1032-NEXT: s_clause 0x1
; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x2c
; GFX1032-NEXT: s_mov_b32 s3, exec_lo
; GFX1032-NEXT: ; implicit-def: $vgpr1
; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s3, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1032-NEXT: s_cbranch_execz BB9_2
; GFX1032-NEXT: ; %bb.1:
; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s3
; GFX1032-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: s_mul_i32 s1, s2, s1
; GFX1032-NEXT: v_mov_b32_e32 v2, s1
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1032-NEXT: ds_sub_rtn_u32 v1, v1, v2
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: BB9_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: v_mul_lo_u32 v0, s2, v0
; GFX1032-NEXT: v_readfirstlane_b32 s0, v1
; GFX1032-NEXT: s_mov_b32 s7, 0x31016000
; GFX1032-NEXT: s_mov_b32 s6, -1
; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s0, v0
; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX1032-NEXT: s_endpgm
entry:
  %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 %subitive acq_rel
  store i32 %old, i32 addrspace(1)* %out
  ret void
}

; Test: 32-bit `atomicrmw sub` on @local_var32 whose operand is the result of
; @llvm.amdgcn.workitem.id.x (acq_rel); the returned value is stored to %out.
; The GFX7LESS checks expect a plain ds_sub_rtn_u32 on v0 with no mbcnt guard.
; The other configurations expect a DPP add sequence (v_add_u32_dpp or
; v_add_nc_u32_dpp), then one ds_sub_rtn_u32 guarded on mbcnt == 0, then a
; per-lane subtract from readfirstlane(result).
define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) {
;
;
; GFX7LESS-LABEL: sub_i32_varying:
; GFX7LESS: ; %bb.0: ; %entry
; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo
; GFX7LESS-NEXT: s_mov_b32 m0, -1
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: ds_sub_rtn_u32 v0, v1, v0
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX7LESS-NEXT: s_endpgm
;
; GFX8-LABEL: sub_i32_varying:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX8-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NEXT: s_mov_b64 exec, s[2:3]
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX8-NEXT: s_not_b64 exec, exec
; GFX8-NEXT: v_mov_b32_e32 v2, 0
; GFX8-NEXT: s_not_b64 exec, exec
; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX8-NEXT: s_nop 1
; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX8-NEXT: s_nop 1
; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX8-NEXT: s_nop 1
; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX8-NEXT: s_nop 1
; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
; GFX8-NEXT: s_nop 1
; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
; GFX8-NEXT: v_readlane_b32 s4, v2, 63
; GFX8-NEXT: s_nop 0
; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX8-NEXT: s_mov_b64 exec, s[2:3]
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX8-NEXT: ; implicit-def: $vgpr0
; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX8-NEXT: s_cbranch_execz BB10_2
; GFX8-NEXT: ; %bb.1:
; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
; GFX8-NEXT: v_mov_b32_e32 v3, s4
; GFX8-NEXT: s_mov_b32 m0, -1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: ds_sub_rtn_u32 v0, v0, v3
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: BB10_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s2, v0
; GFX8-NEXT: v_mov_b32_e32 v0, v1
; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s2, v0
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
; GFX8-NEXT: s_nop 0
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: sub_i32_varying:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, v0
; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_mov_b64 exec, s[2:3]
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX9-NEXT: s_not_b64 exec, exec
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_not_b64 exec, exec
; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX9-NEXT: s_nop 1
; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX9-NEXT: s_nop 1
; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX9-NEXT: s_nop 1
; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX9-NEXT: s_nop 1
; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
; GFX9-NEXT: s_nop 1
; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
; GFX9-NEXT: v_readlane_b32 s4, v2, 63
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX9-NEXT: s_mov_b64 exec, s[2:3]
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: ; implicit-def: $vgpr0
; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT: s_cbranch_execz BB10_2
; GFX9-NEXT: ; %bb.1:
; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
; GFX9-NEXT: v_mov_b32_e32 v3, s4
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: ds_sub_rtn_u32 v0, v0, v3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: BB10_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s2, v0
; GFX9-NEXT: v_mov_b32_e32 v0, v1
; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: sub_i32_varying:
; GFX1064: ; %bb.0: ; %entry
; GFX1064-NEXT: v_mov_b32_e32 v1, v0
; GFX1064-NEXT: s_not_b64 exec, exec
; GFX1064-NEXT: v_mov_b32_e32 v1, 0
; GFX1064-NEXT: s_not_b64 exec, exec
; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064-NEXT: v_mov_b32_e32 v3, 0
; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064-NEXT: v_mov_b32_e32 v2, v1
; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1
; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1064-NEXT: v_readlane_b32 s4, v1, 31
; GFX1064-NEXT: v_mov_b32_e32 v2, s4
; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
; GFX1064-NEXT: v_readlane_b32 s4, v1, 15
; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1064-NEXT: s_mov_b64 exec, s[2:3]
; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX1064-NEXT: v_readlane_b32 s5, v1, 31
; GFX1064-NEXT: v_writelane_b32 v3, s4, 16
; GFX1064-NEXT: s_mov_b64 exec, s[2:3]
; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX1064-NEXT: v_readlane_b32 s7, v1, 63
; GFX1064-NEXT: v_readlane_b32 s6, v1, 47
; GFX1064-NEXT: v_writelane_b32 v3, s5, 32
; GFX1064-NEXT: s_mov_b64 exec, s[2:3]
; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0
; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX1064-NEXT: v_writelane_b32 v3, s6, 48
; GFX1064-NEXT: s_mov_b64 exec, s[4:5]
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT: s_mov_b32 s2, -1
; GFX1064-NEXT: ; implicit-def: $vgpr0
; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1064-NEXT: s_cbranch_execz BB10_2
; GFX1064-NEXT: ; %bb.1:
; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo
; GFX1064-NEXT: v_mov_b32_e32 v4, s7
; GFX1064-NEXT: s_mov_b32 s3, s7
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1064-NEXT: ds_sub_rtn_u32 v0, v7, v4
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: BB10_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX1064-NEXT: v_readfirstlane_b32 s3, v0
; GFX1064-NEXT: v_mov_b32_e32 v0, v3
; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s3, v0
; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: s_nop 0
; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: sub_i32_varying:
; GFX1032: ; %bb.0: ; %entry
; GFX1032-NEXT: v_mov_b32_e32 v1, v0
; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1032-NEXT: v_mov_b32_e32 v1, 0
; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1032-NEXT: s_or_saveexec_b32 s2, -1
; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032-NEXT: v_mov_b32_e32 v2, v1
; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1
; GFX1032-NEXT: s_mov_b32 exec_lo, s2
; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1032-NEXT: s_or_saveexec_b32 s2, -1
; GFX1032-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1032-NEXT: v_mov_b32_e32 v3, 0
; GFX1032-NEXT: v_readlane_b32 s3, v1, 15
; GFX1032-NEXT: v_readlane_b32 s4, v1, 31
; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT: s_mov_b32 exec_lo, s2
; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
; GFX1032-NEXT: s_or_saveexec_b32 s2, -1
; GFX1032-NEXT: v_writelane_b32 v3, s3, 16
; GFX1032-NEXT: s_mov_b32 exec_lo, s2
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: s_mov_b32 s2, -1
; GFX1032-NEXT: ; implicit-def: $vgpr0
; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
; GFX1032-NEXT: s_cbranch_execz BB10_2
; GFX1032-NEXT: ; %bb.1:
; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo
; GFX1032-NEXT: v_mov_b32_e32 v4, s4
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1032-NEXT: ds_sub_rtn_u32 v0, v7, v4
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: BB10_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3
; GFX1032-NEXT: v_readfirstlane_b32 s3, v0
; GFX1032-NEXT: v_mov_b32_e32 v0, v3
; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s3, v0
; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: s_nop 0
; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX1032-NEXT: s_endpgm
entry:
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 %lane acq_rel
  store i32 %old, i32 addrspace(1)* %out
  ret void
}

; Test: 64-bit `atomicrmw sub` of the constant 5 on the addrspace(3) global
; @local_var64 (acq_rel); the returned value is stored to %out.
; The expected assembly guards one ds_sub_rtn_u64 on mbcnt == 0 and builds its
; 64-bit operand from s_bcnt1(exec) with s_mul_i32 (low half) and
; v_mul_hi_u32_u24 (high half). It then subtracts 5 * mbcnt from the
; readfirstlane'd result with a sub / sub-with-carry pair before the store.
define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) {
;
;
; GFX7LESS-LABEL: sub_i64_constant:
; GFX7LESS: ; %bb.0: ; %entry
; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec
; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0
; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0
; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX7LESS-NEXT: ; implicit-def: $vgpr1_vgpr2
; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX7LESS-NEXT: s_cbranch_execz BB11_2
; GFX7LESS-NEXT: ; %bb.1:
; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
; GFX7LESS-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo
; GFX7LESS-NEXT: s_mul_i32 s5, s4, 5
; GFX7LESS-NEXT: v_mul_hi_u32_u24_e64 v2, s4, 5
; GFX7LESS-NEXT: v_mov_b32_e32 v1, s5
; GFX7LESS-NEXT: s_mov_b32 m0, -1
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2]
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: BB11_2:
; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v1
; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v2
; GFX7LESS-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v0
; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v0
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4
; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s2, v0
; GFX7LESS-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc
; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX7LESS-NEXT: s_endpgm
;
; GFX8-LABEL: sub_i64_constant:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: s_mov_b64 s[4:5], exec
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX8-NEXT: ; implicit-def: $vgpr1_vgpr2
; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX8-NEXT: s_cbranch_execz BB11_2
; GFX8-NEXT: ; %bb.1:
; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
; GFX8-NEXT: v_mul_hi_u32_u24_e64 v2, s4, 5
; GFX8-NEXT: s_mul_i32 s4, s4, 5
; GFX8-NEXT: v_mov_b32_e32 v1, s4
; GFX8-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo
; GFX8-NEXT: s_mov_b32 m0, -1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2]
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: BB11_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s3, v2
; GFX8-NEXT: v_readfirstlane_b32 s2, v1
; GFX8-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v0
; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v0
; GFX8-NEXT: v_mov_b32_e32 v2, s3
; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s2, v0
; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: sub_i64_constant:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_mov_b64 s[4:5], exec
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: ; implicit-def: $vgpr1_vgpr2
; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT: s_cbranch_execz BB11_2
; GFX9-NEXT: ; %bb.1:
; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
; GFX9-NEXT: v_mul_hi_u32_u24_e64 v2, s4, 5
; GFX9-NEXT: s_mul_i32 s4, s4, 5
; GFX9-NEXT: v_mov_b32_e32 v1, s4
; GFX9-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: BB11_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s3, v2
; GFX9-NEXT: v_readfirstlane_b32 s2, v1
; GFX9-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v0
; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v0
; GFX9-NEXT: v_mov_b32_e32 v2, s3
; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s2, v0
; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: sub_i64_constant:
; GFX1064: ; %bb.0: ; %entry
; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1064-NEXT: s_mov_b64 s[4:5], exec
; GFX1064-NEXT: ; implicit-def: $vgpr1_vgpr2
; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0
; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s5, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-NEXT: s_cbranch_execz BB11_2
; GFX1064-NEXT: ; %bb.1:
; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
; GFX1064-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo
; GFX1064-NEXT: s_mul_i32 s5, s4, 5
; GFX1064-NEXT: v_mul_hi_u32_u24_e64 v2, s4, 5
; GFX1064-NEXT: v_mov_b32_e32 v1, s5
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1064-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2]
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: BB11_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX1064-NEXT: v_readfirstlane_b32 s2, v1
; GFX1064-NEXT: v_mul_u32_u24_e32 v1, 5, v0
; GFX1064-NEXT: v_readfirstlane_b32 s3, v2
; GFX1064-NEXT: v_mul_hi_u32_u24_e32 v2, 5, v0
; GFX1064-NEXT: v_sub_co_u32_e64 v0, vcc, s2, v1
; GFX1064-NEXT: s_mov_b32 s2, -1
; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v2, vcc
; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: sub_i64_constant:
; GFX1032: ; %bb.0: ; %entry
; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1032-NEXT: s_mov_b32 s3, exec_lo
; GFX1032-NEXT: ; implicit-def: $vgpr1_vgpr2
; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s3, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-NEXT: s_cbranch_execz BB11_2
; GFX1032-NEXT: ; %bb.1:
; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3
; GFX1032-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo
; GFX1032-NEXT: s_mul_i32 s4, s3, 5
; GFX1032-NEXT: v_mul_hi_u32_u24_e64 v2, s3, 5
; GFX1032-NEXT: v_mov_b32_e32 v1, s4
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1032-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2]
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: BB11_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2
; GFX1032-NEXT: v_readfirstlane_b32 s2, v1
; GFX1032-NEXT: v_mul_u32_u24_e32 v1, 5, v0
; GFX1032-NEXT: v_readfirstlane_b32 s3, v2
; GFX1032-NEXT: v_mul_hi_u32_u24_e32 v2, 5, v0
; GFX1032-NEXT: v_sub_co_u32_e64 v0, vcc_lo, s2, v1
; GFX1032-NEXT: s_mov_b32 s2, -1
; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v2, vcc_lo
; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX1032-NEXT: s_endpgm
entry:
  %old = atomicrmw sub i64 addrspace(3)* @local_var64, i64 5 acq_rel
  store i64 %old, i64 addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 %subitive) {
;
;
; GFX7LESS-LABEL: sub_i64_uniform:
; GFX7LESS: ; %bb.0: ; %entry
; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec
; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0
; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s7, v0
; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
;
GFX7LESS-NEXT: ; implicit-def: $vgpr1_vgpr2 2298; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc 2299; GFX7LESS-NEXT: s_cbranch_execz BB12_2 2300; GFX7LESS-NEXT: ; %bb.1: 2301; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 2302; GFX7LESS-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 2303; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2304; GFX7LESS-NEXT: s_mul_i32 s7, s3, s6 2305; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6 2306; GFX7LESS-NEXT: v_mul_hi_u32 v1, s2, v1 2307; GFX7LESS-NEXT: s_mul_i32 s6, s2, s6 2308; GFX7LESS-NEXT: v_add_i32_e32 v2, vcc, s7, v1 2309; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6 2310; GFX7LESS-NEXT: s_mov_b32 m0, -1 2311; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2312; GFX7LESS-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] 2313; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2314; GFX7LESS-NEXT: BB12_2: 2315; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] 2316; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 2317; GFX7LESS-NEXT: s_mov_b32 s6, -1 2318; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2319; GFX7LESS-NEXT: s_mov_b32 s4, s0 2320; GFX7LESS-NEXT: s_mov_b32 s5, s1 2321; GFX7LESS-NEXT: v_readfirstlane_b32 s0, v1 2322; GFX7LESS-NEXT: v_readfirstlane_b32 s1, v2 2323; GFX7LESS-NEXT: v_mul_lo_u32 v1, s3, v0 2324; GFX7LESS-NEXT: v_mul_hi_u32 v2, s2, v0 2325; GFX7LESS-NEXT: v_mul_lo_u32 v0, s2, v0 2326; GFX7LESS-NEXT: v_add_i32_e32 v1, vcc, v2, v1 2327; GFX7LESS-NEXT: v_mov_b32_e32 v2, s1 2328; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 2329; GFX7LESS-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc 2330; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 2331; GFX7LESS-NEXT: s_endpgm 2332; 2333; GFX8-LABEL: sub_i64_uniform: 2334; GFX8: ; %bb.0: ; %entry 2335; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 2336; GFX8-NEXT: s_mov_b64 s[6:7], exec 2337; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 2338; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 2339; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2340; GFX8-NEXT: ; implicit-def: $vgpr1_vgpr2 2341; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 2342; GFX8-NEXT: s_cbranch_execz BB12_2 
2343; GFX8-NEXT: ; %bb.1: 2344; GFX8-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 2345; GFX8-NEXT: v_mov_b32_e32 v1, s6 2346; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2347; GFX8-NEXT: v_mul_hi_u32 v1, s2, v1 2348; GFX8-NEXT: s_mul_i32 s7, s3, s6 2349; GFX8-NEXT: s_mul_i32 s6, s2, s6 2350; GFX8-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 2351; GFX8-NEXT: v_add_u32_e32 v2, vcc, s7, v1 2352; GFX8-NEXT: v_mov_b32_e32 v1, s6 2353; GFX8-NEXT: s_mov_b32 m0, -1 2354; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2355; GFX8-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] 2356; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2357; GFX8-NEXT: BB12_2: 2358; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 2359; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2360; GFX8-NEXT: s_mov_b32 s4, s0 2361; GFX8-NEXT: v_readfirstlane_b32 s0, v1 2362; GFX8-NEXT: v_mul_lo_u32 v1, s3, v0 2363; GFX8-NEXT: v_mul_hi_u32 v3, s2, v0 2364; GFX8-NEXT: v_mul_lo_u32 v0, s2, v0 2365; GFX8-NEXT: s_mov_b32 s5, s1 2366; GFX8-NEXT: v_readfirstlane_b32 s1, v2 2367; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 2368; GFX8-NEXT: v_mov_b32_e32 v2, s1 2369; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s0, v0 2370; GFX8-NEXT: s_mov_b32 s7, 0xf000 2371; GFX8-NEXT: s_mov_b32 s6, -1 2372; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc 2373; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 2374; GFX8-NEXT: s_endpgm 2375; 2376; GFX9-LABEL: sub_i64_uniform: 2377; GFX9: ; %bb.0: ; %entry 2378; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 2379; GFX9-NEXT: s_mov_b64 s[6:7], exec 2380; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 2381; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 2382; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2383; GFX9-NEXT: ; implicit-def: $vgpr1_vgpr2 2384; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 2385; GFX9-NEXT: s_cbranch_execz BB12_2 2386; GFX9-NEXT: ; %bb.1: 2387; GFX9-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 2388; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2389; GFX9-NEXT: s_mul_i32 s7, s3, s6 2390; GFX9-NEXT: s_mul_hi_u32 s8, s2, s6 2391; GFX9-NEXT: s_add_i32 s8, s8, s7 2392; GFX9-NEXT: s_mul_i32 s6, s2, s6 
2393; GFX9-NEXT: v_mov_b32_e32 v1, s6 2394; GFX9-NEXT: v_mov_b32_e32 v2, s8 2395; GFX9-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 2396; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2397; GFX9-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] 2398; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2399; GFX9-NEXT: BB12_2: 2400; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 2401; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2402; GFX9-NEXT: v_mul_lo_u32 v3, s3, v0 2403; GFX9-NEXT: v_mul_hi_u32 v4, s2, v0 2404; GFX9-NEXT: v_mul_lo_u32 v0, s2, v0 2405; GFX9-NEXT: s_mov_b32 s4, s0 2406; GFX9-NEXT: v_readfirstlane_b32 s0, v1 2407; GFX9-NEXT: s_mov_b32 s5, s1 2408; GFX9-NEXT: v_readfirstlane_b32 s1, v2 2409; GFX9-NEXT: v_add_u32_e32 v1, v4, v3 2410; GFX9-NEXT: v_mov_b32_e32 v2, s1 2411; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s0, v0 2412; GFX9-NEXT: s_mov_b32 s7, 0xf000 2413; GFX9-NEXT: s_mov_b32 s6, -1 2414; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc 2415; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 2416; GFX9-NEXT: s_endpgm 2417; 2418; GFX1064-LABEL: sub_i64_uniform: 2419; GFX1064: ; %bb.0: ; %entry 2420; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 2421; GFX1064-NEXT: s_mov_b64 s[6:7], exec 2422; GFX1064-NEXT: ; implicit-def: $vgpr1_vgpr2 2423; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 2424; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s7, v0 2425; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2426; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 2427; GFX1064-NEXT: s_cbranch_execz BB12_2 2428; GFX1064-NEXT: ; %bb.1: 2429; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 2430; GFX1064-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 2431; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2432; GFX1064-NEXT: s_mul_i32 s7, s3, s6 2433; GFX1064-NEXT: s_mul_hi_u32 s8, s2, s6 2434; GFX1064-NEXT: s_mul_i32 s6, s2, s6 2435; GFX1064-NEXT: s_add_i32 s8, s8, s7 2436; GFX1064-NEXT: v_mov_b32_e32 v1, s6 2437; GFX1064-NEXT: v_mov_b32_e32 v2, s8 2438; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2439; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 2440; 
GFX1064-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] 2441; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2442; GFX1064-NEXT: buffer_gl0_inv 2443; GFX1064-NEXT: BB12_2: 2444; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 2445; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 2446; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2447; GFX1064-NEXT: v_mul_lo_u32 v3, s3, v0 2448; GFX1064-NEXT: v_mul_hi_u32 v4, s2, v0 2449; GFX1064-NEXT: v_mul_lo_u32 v0, s2, v0 2450; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 2451; GFX1064-NEXT: v_readfirstlane_b32 s4, v2 2452; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 2453; GFX1064-NEXT: v_add_nc_u32_e32 v1, v4, v3 2454; GFX1064-NEXT: v_sub_co_u32_e64 v0, vcc, s2, v0 2455; GFX1064-NEXT: s_mov_b32 s2, -1 2456; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s4, v1, vcc 2457; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2458; GFX1064-NEXT: s_endpgm 2459; 2460; GFX1032-LABEL: sub_i64_uniform: 2461; GFX1032: ; %bb.0: ; %entry 2462; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 2463; GFX1032-NEXT: s_mov_b32 s5, exec_lo 2464; GFX1032-NEXT: ; implicit-def: $vgpr1_vgpr2 2465; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s5, 0 2466; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 2467; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo 2468; GFX1032-NEXT: s_cbranch_execz BB12_2 2469; GFX1032-NEXT: ; %bb.1: 2470; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s5 2471; GFX1032-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 2472; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2473; GFX1032-NEXT: s_mul_i32 s6, s3, s5 2474; GFX1032-NEXT: s_mul_hi_u32 s7, s2, s5 2475; GFX1032-NEXT: s_mul_i32 s5, s2, s5 2476; GFX1032-NEXT: s_add_i32 s7, s7, s6 2477; GFX1032-NEXT: v_mov_b32_e32 v1, s5 2478; GFX1032-NEXT: v_mov_b32_e32 v2, s7 2479; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2480; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 2481; GFX1032-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] 2482; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2483; GFX1032-NEXT: buffer_gl0_inv 2484; GFX1032-NEXT: BB12_2: 2485; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 2486; 
GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 2487; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2488; GFX1032-NEXT: v_mul_lo_u32 v3, s3, v0 2489; GFX1032-NEXT: v_mul_hi_u32 v4, s2, v0 2490; GFX1032-NEXT: v_mul_lo_u32 v0, s2, v0 2491; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 2492; GFX1032-NEXT: v_readfirstlane_b32 s4, v2 2493; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 2494; GFX1032-NEXT: v_add_nc_u32_e32 v1, v4, v3 2495; GFX1032-NEXT: v_sub_co_u32_e64 v0, vcc_lo, s2, v0 2496; GFX1032-NEXT: s_mov_b32 s2, -1 2497; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s4, v1, vcc_lo 2498; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2499; GFX1032-NEXT: s_endpgm 2500entry: 2501 %old = atomicrmw sub i64 addrspace(3)* @local_var64, i64 %subitive acq_rel 2502 store i64 %old, i64 addrspace(1)* %out 2503 ret void 2504} 2505 2506; The 64-bit varying subtract below is expected to stay a plain per-lane DS atomic: 2507; its autogenerated checks contain no v_mbcnt_lo_u32_b32, v_mbcnt_hi_u32_b32 or 2508; s_bcnt1_i32_b64. (Formerly GCN-NOT directives; no RUN line enables a GCN prefix.) 2509define amdgpu_kernel void @sub_i64_varying(i64 addrspace(1)* %out) { 2510; 2511; 2512; GFX7LESS-LABEL: sub_i64_varying: 2513; GFX7LESS: ; %bb.0: ; %entry 2514; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 2515; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 2516; GFX7LESS-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 2517; GFX7LESS-NEXT: s_mov_b32 m0, -1 2518; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2519; GFX7LESS-NEXT: ds_sub_rtn_u64 v[0:1], v2, v[0:1] 2520; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2521; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 2522; GFX7LESS-NEXT: s_mov_b32 s2, -1 2523; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2524; GFX7LESS-NEXT: s_endpgm 2525; 2526; GFX8-LABEL: sub_i64_varying: 2527; GFX8: ; %bb.0: ; %entry 2528; GFX8-NEXT: v_mov_b32_e32 v1, 0 2529; GFX8-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 2530; GFX8-NEXT: s_mov_b32 m0, -1 2531; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2532; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2533; GFX8-NEXT: ds_sub_rtn_u64 v[0:1], v2, v[0:1] 2534; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2535; GFX8-NEXT: s_mov_b32 s3, 0xf000
2536; GFX8-NEXT: s_mov_b32 s2, -1 2537; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2538; GFX8-NEXT: s_endpgm 2539; 2540; GFX9-LABEL: sub_i64_varying: 2541; GFX9: ; %bb.0: ; %entry 2542; GFX9-NEXT: v_mov_b32_e32 v1, 0 2543; GFX9-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 2544; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2545; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2546; GFX9-NEXT: ds_sub_rtn_u64 v[0:1], v2, v[0:1] 2547; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2548; GFX9-NEXT: s_mov_b32 s3, 0xf000 2549; GFX9-NEXT: s_mov_b32 s2, -1 2550; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2551; GFX9-NEXT: s_endpgm 2552; 2553; GFX1064-LABEL: sub_i64_varying: 2554; GFX1064: ; %bb.0: ; %entry 2555; GFX1064-NEXT: v_mov_b32_e32 v1, 0 2556; GFX1064-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 2557; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2558; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 2559; GFX1064-NEXT: s_mov_b32 s2, -1 2560; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2561; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 2562; GFX1064-NEXT: ds_sub_rtn_u64 v[0:1], v2, v[0:1] 2563; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2564; GFX1064-NEXT: buffer_gl0_inv 2565; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2566; GFX1064-NEXT: s_endpgm 2567; 2568; GFX1032-LABEL: sub_i64_varying: 2569; GFX1032: ; %bb.0: ; %entry 2570; GFX1032-NEXT: v_mov_b32_e32 v1, 0 2571; GFX1032-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 2572; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2573; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 2574; GFX1032-NEXT: s_mov_b32 s2, -1 2575; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2576; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 2577; GFX1032-NEXT: ds_sub_rtn_u64 v[0:1], v2, v[0:1] 2578; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2579; GFX1032-NEXT: buffer_gl0_inv 2580; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2581; GFX1032-NEXT: s_endpgm 2582entry: 2583 %lane = call i32 @llvm.amdgcn.workitem.id.x() 2584 %zext = zext i32 %lane to i64 2585 %old = 
atomicrmw sub i64 addrspace(3)* @local_var64, i64 %zext acq_rel 2586 store i64 %old, i64 addrspace(1)* %out 2587 ret void 2588} 2589 2590define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) { 2591; 2592; 2593; GFX7LESS-LABEL: and_i32_varying: 2594; GFX7LESS: ; %bb.0: ; %entry 2595; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 2596; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 2597; GFX7LESS-NEXT: s_mov_b32 m0, -1 2598; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2599; GFX7LESS-NEXT: ds_and_rtn_b32 v0, v1, v0 2600; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2601; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 2602; GFX7LESS-NEXT: s_mov_b32 s2, -1 2603; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 2604; GFX7LESS-NEXT: s_endpgm 2605; 2606; GFX8-LABEL: and_i32_varying: 2607; GFX8: ; %bb.0: ; %entry 2608; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2609; GFX8-NEXT: v_mov_b32_e32 v2, v0 2610; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 2611; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 2612; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 2613; GFX8-NEXT: v_mov_b32_e32 v1, -1 2614; GFX8-NEXT: s_mov_b64 exec, s[2:3] 2615; GFX8-NEXT: s_not_b64 exec, exec 2616; GFX8-NEXT: v_mov_b32_e32 v2, -1 2617; GFX8-NEXT: s_not_b64 exec, exec 2618; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 2619; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 2620; GFX8-NEXT: s_nop 1 2621; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 2622; GFX8-NEXT: s_nop 1 2623; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 2624; GFX8-NEXT: s_nop 1 2625; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 2626; GFX8-NEXT: s_nop 1 2627; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 2628; GFX8-NEXT: s_nop 1 2629; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 2630; GFX8-NEXT: v_readlane_b32 s4, v2, 63 2631; GFX8-NEXT: s_nop 0 2632; GFX8-NEXT: v_mov_b32_dpp 
v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 2633; GFX8-NEXT: s_mov_b64 exec, s[2:3] 2634; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2635; GFX8-NEXT: ; implicit-def: $vgpr0 2636; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 2637; GFX8-NEXT: s_cbranch_execz BB14_2 2638; GFX8-NEXT: ; %bb.1: 2639; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 2640; GFX8-NEXT: v_mov_b32_e32 v3, s4 2641; GFX8-NEXT: s_mov_b32 m0, -1 2642; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2643; GFX8-NEXT: ds_and_rtn_b32 v0, v0, v3 2644; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2645; GFX8-NEXT: BB14_2: 2646; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 2647; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2648; GFX8-NEXT: v_readfirstlane_b32 s2, v0 2649; GFX8-NEXT: v_mov_b32_e32 v0, v1 2650; GFX8-NEXT: v_and_b32_e32 v0, s2, v0 2651; GFX8-NEXT: s_mov_b32 s3, 0xf000 2652; GFX8-NEXT: s_mov_b32 s2, -1 2653; GFX8-NEXT: s_nop 0 2654; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 2655; GFX8-NEXT: s_endpgm 2656; 2657; GFX9-LABEL: and_i32_varying: 2658; GFX9: ; %bb.0: ; %entry 2659; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2660; GFX9-NEXT: v_mov_b32_e32 v2, v0 2661; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 2662; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 2663; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 2664; GFX9-NEXT: v_mov_b32_e32 v1, -1 2665; GFX9-NEXT: s_mov_b64 exec, s[2:3] 2666; GFX9-NEXT: s_not_b64 exec, exec 2667; GFX9-NEXT: v_mov_b32_e32 v2, -1 2668; GFX9-NEXT: s_not_b64 exec, exec 2669; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 2670; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 2671; GFX9-NEXT: s_nop 1 2672; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 2673; GFX9-NEXT: s_nop 1 2674; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 2675; GFX9-NEXT: s_nop 1 2676; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 2677; GFX9-NEXT: s_nop 1 2678; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 2679; 
GFX9-NEXT: s_nop 1 2680; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 2681; GFX9-NEXT: v_readlane_b32 s4, v2, 63 2682; GFX9-NEXT: s_nop 0 2683; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 2684; GFX9-NEXT: s_mov_b64 exec, s[2:3] 2685; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2686; GFX9-NEXT: ; implicit-def: $vgpr0 2687; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 2688; GFX9-NEXT: s_cbranch_execz BB14_2 2689; GFX9-NEXT: ; %bb.1: 2690; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 2691; GFX9-NEXT: v_mov_b32_e32 v3, s4 2692; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2693; GFX9-NEXT: ds_and_rtn_b32 v0, v0, v3 2694; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2695; GFX9-NEXT: BB14_2: 2696; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 2697; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2698; GFX9-NEXT: v_readfirstlane_b32 s2, v0 2699; GFX9-NEXT: v_mov_b32_e32 v0, v1 2700; GFX9-NEXT: v_and_b32_e32 v0, s2, v0 2701; GFX9-NEXT: s_mov_b32 s3, 0xf000 2702; GFX9-NEXT: s_mov_b32 s2, -1 2703; GFX9-NEXT: s_nop 0 2704; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 2705; GFX9-NEXT: s_endpgm 2706; 2707; GFX1064-LABEL: and_i32_varying: 2708; GFX1064: ; %bb.0: ; %entry 2709; GFX1064-NEXT: v_mov_b32_e32 v1, v0 2710; GFX1064-NEXT: s_not_b64 exec, exec 2711; GFX1064-NEXT: v_mov_b32_e32 v1, -1 2712; GFX1064-NEXT: s_not_b64 exec, exec 2713; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 2714; GFX1064-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 2715; GFX1064-NEXT: v_mov_b32_e32 v3, -1 2716; GFX1064-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 2717; GFX1064-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 2718; GFX1064-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 2719; GFX1064-NEXT: v_mov_b32_e32 v2, v1 2720; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 2721; GFX1064-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 2722; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 
2723; GFX1064-NEXT: v_mov_b32_e32 v2, s4 2724; GFX1064-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 2725; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 2726; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 2727; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 2728; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2729; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 2730; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 2731; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 2732; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 2733; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 2734; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 2735; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 2736; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 2737; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 2738; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 2739; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0 2740; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 2741; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 2742; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 2743; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2744; GFX1064-NEXT: s_mov_b32 s2, -1 2745; GFX1064-NEXT: ; implicit-def: $vgpr0 2746; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 2747; GFX1064-NEXT: s_cbranch_execz BB14_2 2748; GFX1064-NEXT: ; %bb.1: 2749; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo 2750; GFX1064-NEXT: v_mov_b32_e32 v4, s7 2751; GFX1064-NEXT: s_mov_b32 s3, s7 2752; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2753; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 2754; GFX1064-NEXT: ds_and_rtn_b32 v0, v7, v4 2755; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2756; GFX1064-NEXT: buffer_gl0_inv 2757; GFX1064-NEXT: BB14_2: 2758; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 2759; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 2760; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 2761; GFX1064-NEXT: v_mov_b32_e32 v0, v3 2762; GFX1064-NEXT: v_and_b32_e32 v0, s3, v0 2763; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 2764; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2765; GFX1064-NEXT: s_nop 
0 2766; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 2767; GFX1064-NEXT: s_endpgm 2768; 2769; GFX1032-LABEL: and_i32_varying: 2770; GFX1032: ; %bb.0: ; %entry 2771; GFX1032-NEXT: v_mov_b32_e32 v1, v0 2772; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 2773; GFX1032-NEXT: v_mov_b32_e32 v1, -1 2774; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 2775; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 2776; GFX1032-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 2777; GFX1032-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 2778; GFX1032-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 2779; GFX1032-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 2780; GFX1032-NEXT: v_mov_b32_e32 v2, v1 2781; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 2782; GFX1032-NEXT: s_mov_b32 exec_lo, s2 2783; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2784; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 2785; GFX1032-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 2786; GFX1032-NEXT: v_mov_b32_e32 v3, -1 2787; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 2788; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 2789; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 2790; GFX1032-NEXT: s_mov_b32 exec_lo, s2 2791; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 2792; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 2793; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 2794; GFX1032-NEXT: s_mov_b32 exec_lo, s2 2795; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 2796; GFX1032-NEXT: s_mov_b32 s2, -1 2797; GFX1032-NEXT: ; implicit-def: $vgpr0 2798; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 2799; GFX1032-NEXT: s_cbranch_execz BB14_2 2800; GFX1032-NEXT: ; %bb.1: 2801; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo 2802; GFX1032-NEXT: v_mov_b32_e32 v4, s4 2803; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2804; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 2805; GFX1032-NEXT: ds_and_rtn_b32 v0, v7, v4 2806; 
GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2807; GFX1032-NEXT: buffer_gl0_inv 2808; GFX1032-NEXT: BB14_2: 2809; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 2810; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 2811; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 2812; GFX1032-NEXT: v_mov_b32_e32 v0, v3 2813; GFX1032-NEXT: v_and_b32_e32 v0, s3, v0 2814; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 2815; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2816; GFX1032-NEXT: s_nop 0 2817; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 2818; GFX1032-NEXT: s_endpgm 2819entry: 2820 %lane = call i32 @llvm.amdgcn.workitem.id.x() 2821 %old = atomicrmw and i32 addrspace(3)* @local_var32, i32 %lane acq_rel 2822 store i32 %old, i32 addrspace(1)* %out 2823 ret void 2824} 2825 2826define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) { 2827; 2828; 2829; GFX7LESS-LABEL: or_i32_varying: 2830; GFX7LESS: ; %bb.0: ; %entry 2831; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 2832; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 2833; GFX7LESS-NEXT: s_mov_b32 m0, -1 2834; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2835; GFX7LESS-NEXT: ds_or_rtn_b32 v0, v1, v0 2836; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2837; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 2838; GFX7LESS-NEXT: s_mov_b32 s2, -1 2839; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 2840; GFX7LESS-NEXT: s_endpgm 2841; 2842; GFX8-LABEL: or_i32_varying: 2843; GFX8: ; %bb.0: ; %entry 2844; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2845; GFX8-NEXT: v_mov_b32_e32 v2, v0 2846; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 2847; GFX8-NEXT: v_mov_b32_e32 v1, 0 2848; GFX8-NEXT: s_mov_b64 exec, s[2:3] 2849; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 2850; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 2851; GFX8-NEXT: s_not_b64 exec, exec 2852; GFX8-NEXT: v_mov_b32_e32 v2, 0 2853; GFX8-NEXT: s_not_b64 exec, exec 2854; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 2855; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 2856; GFX8-NEXT: 
s_nop 1 2857; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 2858; GFX8-NEXT: s_nop 1 2859; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 2860; GFX8-NEXT: s_nop 1 2861; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 2862; GFX8-NEXT: s_nop 1 2863; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 2864; GFX8-NEXT: s_nop 1 2865; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 2866; GFX8-NEXT: v_readlane_b32 s4, v2, 63 2867; GFX8-NEXT: s_nop 0 2868; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 2869; GFX8-NEXT: s_mov_b64 exec, s[2:3] 2870; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2871; GFX8-NEXT: ; implicit-def: $vgpr0 2872; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 2873; GFX8-NEXT: s_cbranch_execz BB15_2 2874; GFX8-NEXT: ; %bb.1: 2875; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 2876; GFX8-NEXT: v_mov_b32_e32 v3, s4 2877; GFX8-NEXT: s_mov_b32 m0, -1 2878; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2879; GFX8-NEXT: ds_or_rtn_b32 v0, v0, v3 2880; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2881; GFX8-NEXT: BB15_2: 2882; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 2883; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2884; GFX8-NEXT: v_readfirstlane_b32 s2, v0 2885; GFX8-NEXT: v_mov_b32_e32 v0, v1 2886; GFX8-NEXT: v_or_b32_e32 v0, s2, v0 2887; GFX8-NEXT: s_mov_b32 s3, 0xf000 2888; GFX8-NEXT: s_mov_b32 s2, -1 2889; GFX8-NEXT: s_nop 0 2890; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 2891; GFX8-NEXT: s_endpgm 2892; 2893; GFX9-LABEL: or_i32_varying: 2894; GFX9: ; %bb.0: ; %entry 2895; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2896; GFX9-NEXT: v_mov_b32_e32 v2, v0 2897; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 2898; GFX9-NEXT: v_mov_b32_e32 v1, 0 2899; GFX9-NEXT: s_mov_b64 exec, s[2:3] 2900; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 2901; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 2902; GFX9-NEXT: s_not_b64 
exec, exec 2903; GFX9-NEXT: v_mov_b32_e32 v2, 0 2904; GFX9-NEXT: s_not_b64 exec, exec 2905; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 2906; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 2907; GFX9-NEXT: s_nop 1 2908; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 2909; GFX9-NEXT: s_nop 1 2910; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 2911; GFX9-NEXT: s_nop 1 2912; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 2913; GFX9-NEXT: s_nop 1 2914; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 2915; GFX9-NEXT: s_nop 1 2916; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 2917; GFX9-NEXT: v_readlane_b32 s4, v2, 63 2918; GFX9-NEXT: s_nop 0 2919; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 2920; GFX9-NEXT: s_mov_b64 exec, s[2:3] 2921; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2922; GFX9-NEXT: ; implicit-def: $vgpr0 2923; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 2924; GFX9-NEXT: s_cbranch_execz BB15_2 2925; GFX9-NEXT: ; %bb.1: 2926; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 2927; GFX9-NEXT: v_mov_b32_e32 v3, s4 2928; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2929; GFX9-NEXT: ds_or_rtn_b32 v0, v0, v3 2930; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2931; GFX9-NEXT: BB15_2: 2932; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 2933; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2934; GFX9-NEXT: v_readfirstlane_b32 s2, v0 2935; GFX9-NEXT: v_mov_b32_e32 v0, v1 2936; GFX9-NEXT: v_or_b32_e32 v0, s2, v0 2937; GFX9-NEXT: s_mov_b32 s3, 0xf000 2938; GFX9-NEXT: s_mov_b32 s2, -1 2939; GFX9-NEXT: s_nop 0 2940; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 2941; GFX9-NEXT: s_endpgm 2942; 2943; GFX1064-LABEL: or_i32_varying: 2944; GFX1064: ; %bb.0: ; %entry 2945; GFX1064-NEXT: v_mov_b32_e32 v1, v0 2946; GFX1064-NEXT: s_not_b64 exec, exec 2947; GFX1064-NEXT: v_mov_b32_e32 v1, 0 2948; 
GFX1064-NEXT: s_not_b64 exec, exec 2949; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 2950; GFX1064-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 2951; GFX1064-NEXT: v_mov_b32_e32 v3, 0 2952; GFX1064-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 2953; GFX1064-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 2954; GFX1064-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 2955; GFX1064-NEXT: v_mov_b32_e32 v2, v1 2956; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 2957; GFX1064-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 2958; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 2959; GFX1064-NEXT: v_mov_b32_e32 v2, s4 2960; GFX1064-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 2961; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 2962; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 2963; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 2964; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2965; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 2966; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 2967; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 2968; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 2969; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 2970; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 2971; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 2972; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 2973; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 2974; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 2975; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0 2976; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 2977; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 2978; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 2979; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2980; GFX1064-NEXT: s_mov_b32 s2, -1 2981; GFX1064-NEXT: ; implicit-def: $vgpr0 2982; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 2983; GFX1064-NEXT: s_cbranch_execz BB15_2 2984; GFX1064-NEXT: 
; %bb.1: 2985; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo 2986; GFX1064-NEXT: v_mov_b32_e32 v4, s7 2987; GFX1064-NEXT: s_mov_b32 s3, s7 2988; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2989; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 2990; GFX1064-NEXT: ds_or_rtn_b32 v0, v7, v4 2991; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2992; GFX1064-NEXT: buffer_gl0_inv 2993; GFX1064-NEXT: BB15_2: 2994; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 2995; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 2996; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 2997; GFX1064-NEXT: v_mov_b32_e32 v0, v3 2998; GFX1064-NEXT: v_or_b32_e32 v0, s3, v0 2999; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 3000; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3001; GFX1064-NEXT: s_nop 0 3002; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 3003; GFX1064-NEXT: s_endpgm 3004; 3005; GFX1032-LABEL: or_i32_varying: 3006; GFX1032: ; %bb.0: ; %entry 3007; GFX1032-NEXT: v_mov_b32_e32 v1, v0 3008; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 3009; GFX1032-NEXT: v_mov_b32_e32 v1, 0 3010; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 3011; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3012; GFX1032-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 3013; GFX1032-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 3014; GFX1032-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 3015; GFX1032-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 3016; GFX1032-NEXT: v_mov_b32_e32 v2, v1 3017; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 3018; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3019; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3020; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3021; GFX1032-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 3022; GFX1032-NEXT: v_mov_b32_e32 v3, 0 3023; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 3024; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 3025; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 
row_mask:0xf bank_mask:0xf 3026; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3027; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 3028; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3029; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 3030; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3031; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 3032; GFX1032-NEXT: s_mov_b32 s2, -1 3033; GFX1032-NEXT: ; implicit-def: $vgpr0 3034; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 3035; GFX1032-NEXT: s_cbranch_execz BB15_2 3036; GFX1032-NEXT: ; %bb.1: 3037; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo 3038; GFX1032-NEXT: v_mov_b32_e32 v4, s4 3039; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3040; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 3041; GFX1032-NEXT: ds_or_rtn_b32 v0, v7, v4 3042; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3043; GFX1032-NEXT: buffer_gl0_inv 3044; GFX1032-NEXT: BB15_2: 3045; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 3046; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 3047; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 3048; GFX1032-NEXT: v_mov_b32_e32 v0, v3 3049; GFX1032-NEXT: v_or_b32_e32 v0, s3, v0 3050; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 3051; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3052; GFX1032-NEXT: s_nop 0 3053; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 3054; GFX1032-NEXT: s_endpgm 3055entry: 3056 %lane = call i32 @llvm.amdgcn.workitem.id.x() 3057 %old = atomicrmw or i32 addrspace(3)* @local_var32, i32 %lane acq_rel 3058 store i32 %old, i32 addrspace(1)* %out 3059 ret void 3060} 3061 3062define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) { 3063; 3064; 3065; GFX7LESS-LABEL: xor_i32_varying: 3066; GFX7LESS: ; %bb.0: ; %entry 3067; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 3068; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 3069; GFX7LESS-NEXT: s_mov_b32 m0, -1 3070; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3071; GFX7LESS-NEXT: ds_xor_rtn_b32 v0, v1, v0 3072; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3073; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 3074; GFX7LESS-NEXT: 
s_mov_b32 s2, -1 3075; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 3076; GFX7LESS-NEXT: s_endpgm 3077; 3078; GFX8-LABEL: xor_i32_varying: 3079; GFX8: ; %bb.0: ; %entry 3080; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3081; GFX8-NEXT: v_mov_b32_e32 v2, v0 3082; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 3083; GFX8-NEXT: v_mov_b32_e32 v1, 0 3084; GFX8-NEXT: s_mov_b64 exec, s[2:3] 3085; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3086; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 3087; GFX8-NEXT: s_not_b64 exec, exec 3088; GFX8-NEXT: v_mov_b32_e32 v2, 0 3089; GFX8-NEXT: s_not_b64 exec, exec 3090; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 3091; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 3092; GFX8-NEXT: s_nop 1 3093; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 3094; GFX8-NEXT: s_nop 1 3095; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 3096; GFX8-NEXT: s_nop 1 3097; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 3098; GFX8-NEXT: s_nop 1 3099; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 3100; GFX8-NEXT: s_nop 1 3101; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 3102; GFX8-NEXT: v_readlane_b32 s4, v2, 63 3103; GFX8-NEXT: s_nop 0 3104; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 3105; GFX8-NEXT: s_mov_b64 exec, s[2:3] 3106; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3107; GFX8-NEXT: ; implicit-def: $vgpr0 3108; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 3109; GFX8-NEXT: s_cbranch_execz BB16_2 3110; GFX8-NEXT: ; %bb.1: 3111; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 3112; GFX8-NEXT: v_mov_b32_e32 v3, s4 3113; GFX8-NEXT: s_mov_b32 m0, -1 3114; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3115; GFX8-NEXT: ds_xor_rtn_b32 v0, v0, v3 3116; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3117; GFX8-NEXT: BB16_2: 3118; GFX8-NEXT: s_or_b64 exec, 
exec, s[2:3] 3119; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3120; GFX8-NEXT: v_readfirstlane_b32 s2, v0 3121; GFX8-NEXT: v_mov_b32_e32 v0, v1 3122; GFX8-NEXT: v_xor_b32_e32 v0, s2, v0 3123; GFX8-NEXT: s_mov_b32 s3, 0xf000 3124; GFX8-NEXT: s_mov_b32 s2, -1 3125; GFX8-NEXT: s_nop 0 3126; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 3127; GFX8-NEXT: s_endpgm 3128; 3129; GFX9-LABEL: xor_i32_varying: 3130; GFX9: ; %bb.0: ; %entry 3131; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3132; GFX9-NEXT: v_mov_b32_e32 v2, v0 3133; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 3134; GFX9-NEXT: v_mov_b32_e32 v1, 0 3135; GFX9-NEXT: s_mov_b64 exec, s[2:3] 3136; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3137; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 3138; GFX9-NEXT: s_not_b64 exec, exec 3139; GFX9-NEXT: v_mov_b32_e32 v2, 0 3140; GFX9-NEXT: s_not_b64 exec, exec 3141; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 3142; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 3143; GFX9-NEXT: s_nop 1 3144; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 3145; GFX9-NEXT: s_nop 1 3146; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 3147; GFX9-NEXT: s_nop 1 3148; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 3149; GFX9-NEXT: s_nop 1 3150; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 3151; GFX9-NEXT: s_nop 1 3152; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 3153; GFX9-NEXT: v_readlane_b32 s4, v2, 63 3154; GFX9-NEXT: s_nop 0 3155; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 3156; GFX9-NEXT: s_mov_b64 exec, s[2:3] 3157; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3158; GFX9-NEXT: ; implicit-def: $vgpr0 3159; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 3160; GFX9-NEXT: s_cbranch_execz BB16_2 3161; GFX9-NEXT: ; %bb.1: 3162; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 
3163; GFX9-NEXT: v_mov_b32_e32 v3, s4 3164; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3165; GFX9-NEXT: ds_xor_rtn_b32 v0, v0, v3 3166; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3167; GFX9-NEXT: BB16_2: 3168; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 3169; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3170; GFX9-NEXT: v_readfirstlane_b32 s2, v0 3171; GFX9-NEXT: v_mov_b32_e32 v0, v1 3172; GFX9-NEXT: v_xor_b32_e32 v0, s2, v0 3173; GFX9-NEXT: s_mov_b32 s3, 0xf000 3174; GFX9-NEXT: s_mov_b32 s2, -1 3175; GFX9-NEXT: s_nop 0 3176; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 3177; GFX9-NEXT: s_endpgm 3178; 3179; GFX1064-LABEL: xor_i32_varying: 3180; GFX1064: ; %bb.0: ; %entry 3181; GFX1064-NEXT: v_mov_b32_e32 v1, v0 3182; GFX1064-NEXT: s_not_b64 exec, exec 3183; GFX1064-NEXT: v_mov_b32_e32 v1, 0 3184; GFX1064-NEXT: s_not_b64 exec, exec 3185; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 3186; GFX1064-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 3187; GFX1064-NEXT: v_mov_b32_e32 v3, 0 3188; GFX1064-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 3189; GFX1064-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 3190; GFX1064-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 3191; GFX1064-NEXT: v_mov_b32_e32 v2, v1 3192; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 3193; GFX1064-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 3194; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 3195; GFX1064-NEXT: v_mov_b32_e32 v2, s4 3196; GFX1064-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 3197; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 3198; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 3199; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 3200; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3201; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 3202; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 3203; GFX1064-NEXT: v_writelane_b32 v3, 
s4, 16 3204; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 3205; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 3206; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 3207; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 3208; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 3209; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 3210; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 3211; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0 3212; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 3213; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 3214; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 3215; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3216; GFX1064-NEXT: s_mov_b32 s2, -1 3217; GFX1064-NEXT: ; implicit-def: $vgpr0 3218; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 3219; GFX1064-NEXT: s_cbranch_execz BB16_2 3220; GFX1064-NEXT: ; %bb.1: 3221; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo 3222; GFX1064-NEXT: v_mov_b32_e32 v4, s7 3223; GFX1064-NEXT: s_mov_b32 s3, s7 3224; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3225; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 3226; GFX1064-NEXT: ds_xor_rtn_b32 v0, v7, v4 3227; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3228; GFX1064-NEXT: buffer_gl0_inv 3229; GFX1064-NEXT: BB16_2: 3230; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 3231; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 3232; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 3233; GFX1064-NEXT: v_mov_b32_e32 v0, v3 3234; GFX1064-NEXT: v_xor_b32_e32 v0, s3, v0 3235; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 3236; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3237; GFX1064-NEXT: s_nop 0 3238; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 3239; GFX1064-NEXT: s_endpgm 3240; 3241; GFX1032-LABEL: xor_i32_varying: 3242; GFX1032: ; %bb.0: ; %entry 3243; GFX1032-NEXT: v_mov_b32_e32 v1, v0 3244; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 3245; GFX1032-NEXT: v_mov_b32_e32 v1, 0 3246; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 3247; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3248; GFX1032-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 3249; 
GFX1032-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 3250; GFX1032-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 3251; GFX1032-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 3252; GFX1032-NEXT: v_mov_b32_e32 v2, v1 3253; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 3254; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3255; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3256; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3257; GFX1032-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 3258; GFX1032-NEXT: v_mov_b32_e32 v3, 0 3259; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 3260; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 3261; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 3262; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3263; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 3264; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3265; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 3266; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3267; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 3268; GFX1032-NEXT: s_mov_b32 s2, -1 3269; GFX1032-NEXT: ; implicit-def: $vgpr0 3270; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 3271; GFX1032-NEXT: s_cbranch_execz BB16_2 3272; GFX1032-NEXT: ; %bb.1: 3273; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo 3274; GFX1032-NEXT: v_mov_b32_e32 v4, s4 3275; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3276; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 3277; GFX1032-NEXT: ds_xor_rtn_b32 v0, v7, v4 3278; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3279; GFX1032-NEXT: buffer_gl0_inv 3280; GFX1032-NEXT: BB16_2: 3281; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 3282; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 3283; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 3284; GFX1032-NEXT: v_mov_b32_e32 v0, v3 3285; GFX1032-NEXT: v_xor_b32_e32 v0, s3, v0 3286; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 3287; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3288; GFX1032-NEXT: s_nop 0 3289; 
GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 3290; GFX1032-NEXT: s_endpgm 3291entry: 3292 %lane = call i32 @llvm.amdgcn.workitem.id.x() 3293 %old = atomicrmw xor i32 addrspace(3)* @local_var32, i32 %lane acq_rel 3294 store i32 %old, i32 addrspace(1)* %out 3295 ret void 3296} 3297 3298define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) { 3299; 3300; 3301; GFX7LESS-LABEL: max_i32_varying: 3302; GFX7LESS: ; %bb.0: ; %entry 3303; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 3304; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 3305; GFX7LESS-NEXT: s_mov_b32 m0, -1 3306; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3307; GFX7LESS-NEXT: ds_max_rtn_i32 v0, v1, v0 3308; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3309; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 3310; GFX7LESS-NEXT: s_mov_b32 s2, -1 3311; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 3312; GFX7LESS-NEXT: s_endpgm 3313; 3314; GFX8-LABEL: max_i32_varying: 3315; GFX8: ; %bb.0: ; %entry 3316; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3317; GFX8-NEXT: v_mov_b32_e32 v2, v0 3318; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3319; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 3320; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 3321; GFX8-NEXT: v_bfrev_b32_e32 v1, 1 3322; GFX8-NEXT: s_mov_b64 exec, s[2:3] 3323; GFX8-NEXT: s_not_b64 exec, exec 3324; GFX8-NEXT: v_mov_b32_e32 v2, v1 3325; GFX8-NEXT: s_not_b64 exec, exec 3326; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 3327; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3328; GFX8-NEXT: s_nop 1 3329; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 3330; GFX8-NEXT: s_nop 1 3331; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 3332; GFX8-NEXT: s_nop 1 3333; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 3334; GFX8-NEXT: s_nop 1 3335; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 3336; GFX8-NEXT: s_nop 1 3337; GFX8-NEXT: 
v_max_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 3338; GFX8-NEXT: v_readlane_b32 s4, v2, 63 3339; GFX8-NEXT: s_nop 0 3340; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 3341; GFX8-NEXT: s_mov_b64 exec, s[2:3] 3342; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3343; GFX8-NEXT: ; implicit-def: $vgpr0 3344; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 3345; GFX8-NEXT: s_cbranch_execz BB17_2 3346; GFX8-NEXT: ; %bb.1: 3347; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 3348; GFX8-NEXT: v_mov_b32_e32 v3, s4 3349; GFX8-NEXT: s_mov_b32 m0, -1 3350; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3351; GFX8-NEXT: ds_max_rtn_i32 v0, v0, v3 3352; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3353; GFX8-NEXT: BB17_2: 3354; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 3355; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3356; GFX8-NEXT: v_readfirstlane_b32 s2, v0 3357; GFX8-NEXT: v_mov_b32_e32 v0, v1 3358; GFX8-NEXT: v_max_i32_e32 v0, s2, v0 3359; GFX8-NEXT: s_mov_b32 s3, 0xf000 3360; GFX8-NEXT: s_mov_b32 s2, -1 3361; GFX8-NEXT: s_nop 0 3362; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 3363; GFX8-NEXT: s_endpgm 3364; 3365; GFX9-LABEL: max_i32_varying: 3366; GFX9: ; %bb.0: ; %entry 3367; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3368; GFX9-NEXT: v_mov_b32_e32 v2, v0 3369; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3370; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 3371; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 3372; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 3373; GFX9-NEXT: s_mov_b64 exec, s[2:3] 3374; GFX9-NEXT: s_not_b64 exec, exec 3375; GFX9-NEXT: v_mov_b32_e32 v2, v1 3376; GFX9-NEXT: s_not_b64 exec, exec 3377; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 3378; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3379; GFX9-NEXT: s_nop 1 3380; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 3381; GFX9-NEXT: s_nop 1 3382; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 3383; GFX9-NEXT: s_nop 1 3384; GFX9-NEXT: v_max_i32_dpp 
v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 3385; GFX9-NEXT: s_nop 1 3386; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 3387; GFX9-NEXT: s_nop 1 3388; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 3389; GFX9-NEXT: v_readlane_b32 s4, v2, 63 3390; GFX9-NEXT: s_nop 0 3391; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 3392; GFX9-NEXT: s_mov_b64 exec, s[2:3] 3393; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3394; GFX9-NEXT: ; implicit-def: $vgpr0 3395; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 3396; GFX9-NEXT: s_cbranch_execz BB17_2 3397; GFX9-NEXT: ; %bb.1: 3398; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 3399; GFX9-NEXT: v_mov_b32_e32 v3, s4 3400; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3401; GFX9-NEXT: ds_max_rtn_i32 v0, v0, v3 3402; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3403; GFX9-NEXT: BB17_2: 3404; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 3405; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3406; GFX9-NEXT: v_readfirstlane_b32 s2, v0 3407; GFX9-NEXT: v_mov_b32_e32 v0, v1 3408; GFX9-NEXT: v_max_i32_e32 v0, s2, v0 3409; GFX9-NEXT: s_mov_b32 s3, 0xf000 3410; GFX9-NEXT: s_mov_b32 s2, -1 3411; GFX9-NEXT: s_nop 0 3412; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 3413; GFX9-NEXT: s_endpgm 3414; 3415; GFX1064-LABEL: max_i32_varying: 3416; GFX1064: ; %bb.0: ; %entry 3417; GFX1064-NEXT: v_mov_b32_e32 v2, v0 3418; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 3419; GFX1064-NEXT: v_bfrev_b32_e32 v1, 1 3420; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 3421; GFX1064-NEXT: s_not_b64 exec, exec 3422; GFX1064-NEXT: v_mov_b32_e32 v2, v1 3423; GFX1064-NEXT: s_not_b64 exec, exec 3424; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 3425; GFX1064-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3426; GFX1064-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 3427; GFX1064-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 3428; GFX1064-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:8 
row_mask:0xf bank_mask:0xf 3429; GFX1064-NEXT: v_mov_b32_e32 v3, v2 3430; GFX1064-NEXT: v_permlanex16_b32 v3, v3, -1, -1 3431; GFX1064-NEXT: v_max_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 3432; GFX1064-NEXT: v_readlane_b32 s4, v2, 31 3433; GFX1064-NEXT: v_mov_b32_e32 v3, s4 3434; GFX1064-NEXT: v_max_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 3435; GFX1064-NEXT: v_readlane_b32 s4, v2, 15 3436; GFX1064-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3437; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 3438; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3439; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 3440; GFX1064-NEXT: v_readlane_b32 s5, v2, 31 3441; GFX1064-NEXT: v_writelane_b32 v1, s4, 16 3442; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 3443; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 3444; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 3445; GFX1064-NEXT: v_readlane_b32 s7, v2, 63 3446; GFX1064-NEXT: v_readlane_b32 s6, v2, 47 3447; GFX1064-NEXT: v_writelane_b32 v1, s5, 32 3448; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 3449; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0 3450; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 3451; GFX1064-NEXT: v_writelane_b32 v1, s6, 48 3452; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 3453; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3454; GFX1064-NEXT: s_mov_b32 s2, -1 3455; GFX1064-NEXT: ; implicit-def: $vgpr0 3456; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 3457; GFX1064-NEXT: s_cbranch_execz BB17_2 3458; GFX1064-NEXT: ; %bb.1: 3459; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo 3460; GFX1064-NEXT: v_mov_b32_e32 v4, s7 3461; GFX1064-NEXT: s_mov_b32 s3, s7 3462; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3463; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 3464; GFX1064-NEXT: ds_max_rtn_i32 v0, v7, v4 3465; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3466; GFX1064-NEXT: buffer_gl0_inv 3467; GFX1064-NEXT: BB17_2: 3468; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 3469; GFX1064-NEXT: s_or_b64 exec, 
exec, s[4:5] 3470; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 3471; GFX1064-NEXT: v_mov_b32_e32 v0, v1 3472; GFX1064-NEXT: v_max_i32_e32 v0, s3, v0 3473; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 3474; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3475; GFX1064-NEXT: s_nop 0 3476; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 3477; GFX1064-NEXT: s_endpgm 3478; 3479; GFX1032-LABEL: max_i32_varying: 3480; GFX1032: ; %bb.0: ; %entry 3481; GFX1032-NEXT: v_mov_b32_e32 v2, v0 3482; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3483; GFX1032-NEXT: v_bfrev_b32_e32 v1, 1 3484; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3485; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 3486; GFX1032-NEXT: v_mov_b32_e32 v2, v1 3487; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 3488; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3489; GFX1032-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3490; GFX1032-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 3491; GFX1032-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 3492; GFX1032-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 3493; GFX1032-NEXT: v_mov_b32_e32 v3, v2 3494; GFX1032-NEXT: v_permlanex16_b32 v3, v3, -1, -1 3495; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3496; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3497; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3498; GFX1032-NEXT: v_max_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 3499; GFX1032-NEXT: v_readlane_b32 s3, v2, 15 3500; GFX1032-NEXT: v_readlane_b32 s4, v2, 31 3501; GFX1032-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3502; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3503; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 3504; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3505; GFX1032-NEXT: v_writelane_b32 v1, s3, 16 3506; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3507; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 3508; GFX1032-NEXT: s_mov_b32 s2, -1 3509; GFX1032-NEXT: ; implicit-def: $vgpr0 3510; GFX1032-NEXT: 
s_and_saveexec_b32 s3, vcc_lo 3511; GFX1032-NEXT: s_cbranch_execz BB17_2 3512; GFX1032-NEXT: ; %bb.1: 3513; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo 3514; GFX1032-NEXT: v_mov_b32_e32 v4, s4 3515; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3516; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 3517; GFX1032-NEXT: ds_max_rtn_i32 v0, v7, v4 3518; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3519; GFX1032-NEXT: buffer_gl0_inv 3520; GFX1032-NEXT: BB17_2: 3521; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 3522; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 3523; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 3524; GFX1032-NEXT: v_mov_b32_e32 v0, v1 3525; GFX1032-NEXT: v_max_i32_e32 v0, s3, v0 3526; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 3527; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3528; GFX1032-NEXT: s_nop 0 3529; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 3530; GFX1032-NEXT: s_endpgm 3531entry: 3532 %lane = call i32 @llvm.amdgcn.workitem.id.x() 3533 %old = atomicrmw max i32 addrspace(3)* @local_var32, i32 %lane acq_rel 3534 store i32 %old, i32 addrspace(1)* %out 3535 ret void 3536} 3537 3538define amdgpu_kernel void @max_i64_constant(i64 addrspace(1)* %out) { 3539; 3540; 3541; GFX7LESS-LABEL: max_i64_constant: 3542; GFX7LESS: ; %bb.0: ; %entry 3543; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 3544; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 3545; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 3546; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3547; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 3548; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc 3549; GFX7LESS-NEXT: s_cbranch_execz BB18_2 3550; GFX7LESS-NEXT: ; %bb.1: 3551; GFX7LESS-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 3552; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5 3553; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 3554; GFX7LESS-NEXT: s_mov_b32 m0, -1 3555; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3556; GFX7LESS-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] 3557; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3558; GFX7LESS-NEXT: BB18_2: 3559; 
GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] 3560; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3561; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 3562; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 3563; GFX7LESS-NEXT: v_bfrev_b32_e32 v1, 1 3564; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 3565; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 3566; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc 3567; GFX7LESS-NEXT: v_mov_b32_e32 v2, s5 3568; GFX7LESS-NEXT: v_mov_b32_e32 v3, s4 3569; GFX7LESS-NEXT: v_cmp_gt_i64_e32 vcc, s[4:5], v[0:1] 3570; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 3571; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 3572; GFX7LESS-NEXT: s_mov_b32 s2, -1 3573; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 3574; GFX7LESS-NEXT: s_endpgm 3575; 3576; GFX8-LABEL: max_i64_constant: 3577; GFX8: ; %bb.0: ; %entry 3578; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3579; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3580; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 3581; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3582; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 3583; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 3584; GFX8-NEXT: s_cbranch_execz BB18_2 3585; GFX8-NEXT: ; %bb.1: 3586; GFX8-NEXT: v_mov_b32_e32 v0, 5 3587; GFX8-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 3588; GFX8-NEXT: v_mov_b32_e32 v1, 0 3589; GFX8-NEXT: s_mov_b32 m0, -1 3590; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3591; GFX8-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] 3592; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3593; GFX8-NEXT: BB18_2: 3594; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 3595; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3596; GFX8-NEXT: v_readfirstlane_b32 s2, v0 3597; GFX8-NEXT: v_bfrev_b32_e32 v0, 1 3598; GFX8-NEXT: v_readfirstlane_b32 s3, v1 3599; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc 3600; GFX8-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 3601; GFX8-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1] 3602; GFX8-NEXT: v_mov_b32_e32 v2, s3 3603; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 3604; GFX8-NEXT: v_mov_b32_e32 v2, s2 
3605; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 3606; GFX8-NEXT: s_mov_b32 s3, 0xf000 3607; GFX8-NEXT: s_mov_b32 s2, -1 3608; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 3609; GFX8-NEXT: s_endpgm 3610; 3611; GFX9-LABEL: max_i64_constant: 3612; GFX9: ; %bb.0: ; %entry 3613; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3614; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3615; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 3616; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3617; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 3618; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 3619; GFX9-NEXT: s_cbranch_execz BB18_2 3620; GFX9-NEXT: ; %bb.1: 3621; GFX9-NEXT: v_mov_b32_e32 v0, 5 3622; GFX9-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 3623; GFX9-NEXT: v_mov_b32_e32 v1, 0 3624; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3625; GFX9-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] 3626; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3627; GFX9-NEXT: BB18_2: 3628; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 3629; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3630; GFX9-NEXT: v_readfirstlane_b32 s2, v0 3631; GFX9-NEXT: v_bfrev_b32_e32 v0, 1 3632; GFX9-NEXT: v_readfirstlane_b32 s3, v1 3633; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc 3634; GFX9-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 3635; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1] 3636; GFX9-NEXT: v_mov_b32_e32 v2, s3 3637; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 3638; GFX9-NEXT: v_mov_b32_e32 v2, s2 3639; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 3640; GFX9-NEXT: s_mov_b32 s3, 0xf000 3641; GFX9-NEXT: s_mov_b32 s2, -1 3642; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 3643; GFX9-NEXT: s_endpgm 3644; 3645; GFX1064-LABEL: max_i64_constant: 3646; GFX1064: ; %bb.0: ; %entry 3647; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3648; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 3649; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0 3650; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3651; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 3652; GFX1064-NEXT: 
s_and_saveexec_b64 s[2:3], vcc 3653; GFX1064-NEXT: s_cbranch_execz BB18_2 3654; GFX1064-NEXT: ; %bb.1: 3655; GFX1064-NEXT: v_mov_b32_e32 v0, 5 3656; GFX1064-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 3657; GFX1064-NEXT: v_mov_b32_e32 v1, 0 3658; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3659; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 3660; GFX1064-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] 3661; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3662; GFX1064-NEXT: buffer_gl0_inv 3663; GFX1064-NEXT: BB18_2: 3664; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 3665; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] 3666; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 3667; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 3668; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc 3669; GFX1064-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 3670; GFX1064-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1] 3671; GFX1064-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc 3672; GFX1064-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc 3673; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 3674; GFX1064-NEXT: s_mov_b32 s2, -1 3675; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3676; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 3677; GFX1064-NEXT: s_endpgm 3678; 3679; GFX1032-LABEL: max_i64_constant: 3680; GFX1032: ; %bb.0: ; %entry 3681; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3682; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 3683; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 3684; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 3685; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo 3686; GFX1032-NEXT: s_cbranch_execz BB18_2 3687; GFX1032-NEXT: ; %bb.1: 3688; GFX1032-NEXT: v_mov_b32_e32 v0, 5 3689; GFX1032-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 3690; GFX1032-NEXT: v_mov_b32_e32 v1, 0 3691; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3692; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 3693; GFX1032-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] 3694; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3695; GFX1032-NEXT: buffer_gl0_inv 3696; GFX1032-NEXT: BB18_2: 3697; 
GFX1032-NEXT: s_waitcnt_depctr 0xffe3 3698; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 3699; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 3700; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 3701; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc_lo 3702; GFX1032-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc_lo 3703; GFX1032-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[2:3], v[0:1] 3704; GFX1032-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo 3705; GFX1032-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo 3706; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 3707; GFX1032-NEXT: s_mov_b32 s2, -1 3708; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3709; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 3710; GFX1032-NEXT: s_endpgm 3711entry: 3712 %old = atomicrmw max i64 addrspace(3)* @local_var64, i64 5 acq_rel 3713 store i64 %old, i64 addrspace(1)* %out 3714 ret void 3715} 3716 3717define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) { 3718; 3719; 3720; GFX7LESS-LABEL: min_i32_varying: 3721; GFX7LESS: ; %bb.0: ; %entry 3722; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 3723; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 3724; GFX7LESS-NEXT: s_mov_b32 m0, -1 3725; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3726; GFX7LESS-NEXT: ds_min_rtn_i32 v0, v1, v0 3727; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3728; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 3729; GFX7LESS-NEXT: s_mov_b32 s2, -1 3730; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 3731; GFX7LESS-NEXT: s_endpgm 3732; 3733; GFX8-LABEL: min_i32_varying: 3734; GFX8: ; %bb.0: ; %entry 3735; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3736; GFX8-NEXT: v_mov_b32_e32 v2, v0 3737; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3738; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 3739; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 3740; GFX8-NEXT: v_bfrev_b32_e32 v1, -2 3741; GFX8-NEXT: s_mov_b64 exec, s[2:3] 3742; GFX8-NEXT: s_not_b64 exec, exec 3743; GFX8-NEXT: v_mov_b32_e32 v2, v1 3744; GFX8-NEXT: s_not_b64 exec, exec 3745; GFX8-NEXT: s_or_saveexec_b64 
s[2:3], -1 3746; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3747; GFX8-NEXT: s_nop 1 3748; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 3749; GFX8-NEXT: s_nop 1 3750; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 3751; GFX8-NEXT: s_nop 1 3752; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 3753; GFX8-NEXT: s_nop 1 3754; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 3755; GFX8-NEXT: s_nop 1 3756; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 3757; GFX8-NEXT: v_readlane_b32 s4, v2, 63 3758; GFX8-NEXT: s_nop 0 3759; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 3760; GFX8-NEXT: s_mov_b64 exec, s[2:3] 3761; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3762; GFX8-NEXT: ; implicit-def: $vgpr0 3763; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 3764; GFX8-NEXT: s_cbranch_execz BB19_2 3765; GFX8-NEXT: ; %bb.1: 3766; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 3767; GFX8-NEXT: v_mov_b32_e32 v3, s4 3768; GFX8-NEXT: s_mov_b32 m0, -1 3769; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3770; GFX8-NEXT: ds_min_rtn_i32 v0, v0, v3 3771; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3772; GFX8-NEXT: BB19_2: 3773; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 3774; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3775; GFX8-NEXT: v_readfirstlane_b32 s2, v0 3776; GFX8-NEXT: v_mov_b32_e32 v0, v1 3777; GFX8-NEXT: v_min_i32_e32 v0, s2, v0 3778; GFX8-NEXT: s_mov_b32 s3, 0xf000 3779; GFX8-NEXT: s_mov_b32 s2, -1 3780; GFX8-NEXT: s_nop 0 3781; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 3782; GFX8-NEXT: s_endpgm 3783; 3784; GFX9-LABEL: min_i32_varying: 3785; GFX9: ; %bb.0: ; %entry 3786; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3787; GFX9-NEXT: v_mov_b32_e32 v2, v0 3788; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3789; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 3790; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 3791; GFX9-NEXT: v_bfrev_b32_e32 
v1, -2 3792; GFX9-NEXT: s_mov_b64 exec, s[2:3] 3793; GFX9-NEXT: s_not_b64 exec, exec 3794; GFX9-NEXT: v_mov_b32_e32 v2, v1 3795; GFX9-NEXT: s_not_b64 exec, exec 3796; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 3797; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3798; GFX9-NEXT: s_nop 1 3799; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 3800; GFX9-NEXT: s_nop 1 3801; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 3802; GFX9-NEXT: s_nop 1 3803; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 3804; GFX9-NEXT: s_nop 1 3805; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 3806; GFX9-NEXT: s_nop 1 3807; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 3808; GFX9-NEXT: v_readlane_b32 s4, v2, 63 3809; GFX9-NEXT: s_nop 0 3810; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 3811; GFX9-NEXT: s_mov_b64 exec, s[2:3] 3812; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3813; GFX9-NEXT: ; implicit-def: $vgpr0 3814; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 3815; GFX9-NEXT: s_cbranch_execz BB19_2 3816; GFX9-NEXT: ; %bb.1: 3817; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 3818; GFX9-NEXT: v_mov_b32_e32 v3, s4 3819; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3820; GFX9-NEXT: ds_min_rtn_i32 v0, v0, v3 3821; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3822; GFX9-NEXT: BB19_2: 3823; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 3824; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3825; GFX9-NEXT: v_readfirstlane_b32 s2, v0 3826; GFX9-NEXT: v_mov_b32_e32 v0, v1 3827; GFX9-NEXT: v_min_i32_e32 v0, s2, v0 3828; GFX9-NEXT: s_mov_b32 s3, 0xf000 3829; GFX9-NEXT: s_mov_b32 s2, -1 3830; GFX9-NEXT: s_nop 0 3831; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 3832; GFX9-NEXT: s_endpgm 3833; 3834; GFX1064-LABEL: min_i32_varying: 3835; GFX1064: ; %bb.0: ; %entry 3836; GFX1064-NEXT: v_mov_b32_e32 v2, v0 3837; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 3838; 
GFX1064-NEXT: v_bfrev_b32_e32 v1, -2 3839; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 3840; GFX1064-NEXT: s_not_b64 exec, exec 3841; GFX1064-NEXT: v_mov_b32_e32 v2, v1 3842; GFX1064-NEXT: s_not_b64 exec, exec 3843; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 3844; GFX1064-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3845; GFX1064-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 3846; GFX1064-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 3847; GFX1064-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 3848; GFX1064-NEXT: v_mov_b32_e32 v3, v2 3849; GFX1064-NEXT: v_permlanex16_b32 v3, v3, -1, -1 3850; GFX1064-NEXT: v_min_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 3851; GFX1064-NEXT: v_readlane_b32 s4, v2, 31 3852; GFX1064-NEXT: v_mov_b32_e32 v3, s4 3853; GFX1064-NEXT: v_min_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 3854; GFX1064-NEXT: v_readlane_b32 s4, v2, 15 3855; GFX1064-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3856; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 3857; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3858; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 3859; GFX1064-NEXT: v_readlane_b32 s5, v2, 31 3860; GFX1064-NEXT: v_writelane_b32 v1, s4, 16 3861; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 3862; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 3863; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 3864; GFX1064-NEXT: v_readlane_b32 s7, v2, 63 3865; GFX1064-NEXT: v_readlane_b32 s6, v2, 47 3866; GFX1064-NEXT: v_writelane_b32 v1, s5, 32 3867; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 3868; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0 3869; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 3870; GFX1064-NEXT: v_writelane_b32 v1, s6, 48 3871; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 3872; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3873; GFX1064-NEXT: s_mov_b32 s2, -1 3874; GFX1064-NEXT: ; implicit-def: $vgpr0 3875; GFX1064-NEXT: 
s_and_saveexec_b64 s[4:5], vcc 3876; GFX1064-NEXT: s_cbranch_execz BB19_2 3877; GFX1064-NEXT: ; %bb.1: 3878; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo 3879; GFX1064-NEXT: v_mov_b32_e32 v4, s7 3880; GFX1064-NEXT: s_mov_b32 s3, s7 3881; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3882; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 3883; GFX1064-NEXT: ds_min_rtn_i32 v0, v7, v4 3884; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3885; GFX1064-NEXT: buffer_gl0_inv 3886; GFX1064-NEXT: BB19_2: 3887; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 3888; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 3889; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 3890; GFX1064-NEXT: v_mov_b32_e32 v0, v1 3891; GFX1064-NEXT: v_min_i32_e32 v0, s3, v0 3892; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 3893; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3894; GFX1064-NEXT: s_nop 0 3895; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 3896; GFX1064-NEXT: s_endpgm 3897; 3898; GFX1032-LABEL: min_i32_varying: 3899; GFX1032: ; %bb.0: ; %entry 3900; GFX1032-NEXT: v_mov_b32_e32 v2, v0 3901; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3902; GFX1032-NEXT: v_bfrev_b32_e32 v1, -2 3903; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3904; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 3905; GFX1032-NEXT: v_mov_b32_e32 v2, v1 3906; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 3907; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3908; GFX1032-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3909; GFX1032-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 3910; GFX1032-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 3911; GFX1032-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 3912; GFX1032-NEXT: v_mov_b32_e32 v3, v2 3913; GFX1032-NEXT: v_permlanex16_b32 v3, v3, -1, -1 3914; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3915; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3916; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3917; GFX1032-NEXT: v_min_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 
3918; GFX1032-NEXT: v_readlane_b32 s3, v2, 15 3919; GFX1032-NEXT: v_readlane_b32 s4, v2, 31 3920; GFX1032-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3921; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3922; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 3923; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3924; GFX1032-NEXT: v_writelane_b32 v1, s3, 16 3925; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3926; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 3927; GFX1032-NEXT: s_mov_b32 s2, -1 3928; GFX1032-NEXT: ; implicit-def: $vgpr0 3929; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 3930; GFX1032-NEXT: s_cbranch_execz BB19_2 3931; GFX1032-NEXT: ; %bb.1: 3932; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo 3933; GFX1032-NEXT: v_mov_b32_e32 v4, s4 3934; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3935; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 3936; GFX1032-NEXT: ds_min_rtn_i32 v0, v7, v4 3937; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3938; GFX1032-NEXT: buffer_gl0_inv 3939; GFX1032-NEXT: BB19_2: 3940; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 3941; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 3942; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 3943; GFX1032-NEXT: v_mov_b32_e32 v0, v1 3944; GFX1032-NEXT: v_min_i32_e32 v0, s3, v0 3945; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 3946; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3947; GFX1032-NEXT: s_nop 0 3948; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 3949; GFX1032-NEXT: s_endpgm 3950entry: 3951 %lane = call i32 @llvm.amdgcn.workitem.id.x() 3952 %old = atomicrmw min i32 addrspace(3)* @local_var32, i32 %lane acq_rel 3953 store i32 %old, i32 addrspace(1)* %out 3954 ret void 3955} 3956 3957define amdgpu_kernel void @min_i64_constant(i64 addrspace(1)* %out) { 3958; 3959; 3960; GFX7LESS-LABEL: min_i64_constant: 3961; GFX7LESS: ; %bb.0: ; %entry 3962; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 3963; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 3964; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 3965; GFX7LESS-NEXT: 
v_cmp_eq_u32_e32 vcc, 0, v0 3966; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 3967; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc 3968; GFX7LESS-NEXT: s_cbranch_execz BB20_2 3969; GFX7LESS-NEXT: ; %bb.1: 3970; GFX7LESS-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 3971; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5 3972; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 3973; GFX7LESS-NEXT: s_mov_b32 m0, -1 3974; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3975; GFX7LESS-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] 3976; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3977; GFX7LESS-NEXT: BB20_2: 3978; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] 3979; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3980; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 3981; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 3982; GFX7LESS-NEXT: v_bfrev_b32_e32 v1, -2 3983; GFX7LESS-NEXT: s_mov_b32 s2, -1 3984; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 3985; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc 3986; GFX7LESS-NEXT: v_mov_b32_e32 v2, s5 3987; GFX7LESS-NEXT: v_mov_b32_e32 v3, s4 3988; GFX7LESS-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] 3989; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 3990; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 3991; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 3992; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 3993; GFX7LESS-NEXT: s_endpgm 3994; 3995; GFX8-LABEL: min_i64_constant: 3996; GFX8: ; %bb.0: ; %entry 3997; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3998; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3999; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 4000; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4001; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 4002; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 4003; GFX8-NEXT: s_cbranch_execz BB20_2 4004; GFX8-NEXT: ; %bb.1: 4005; GFX8-NEXT: v_mov_b32_e32 v0, 5 4006; GFX8-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 4007; GFX8-NEXT: v_mov_b32_e32 v1, 0 4008; GFX8-NEXT: s_mov_b32 m0, -1 4009; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4010; GFX8-NEXT: ds_min_rtn_i64 v[0:1], v2, 
v[0:1] 4011; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4012; GFX8-NEXT: BB20_2: 4013; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 4014; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4015; GFX8-NEXT: v_readfirstlane_b32 s4, v0 4016; GFX8-NEXT: v_bfrev_b32_e32 v0, -2 4017; GFX8-NEXT: v_readfirstlane_b32 s5, v1 4018; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc 4019; GFX8-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 4020; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] 4021; GFX8-NEXT: v_mov_b32_e32 v2, s5 4022; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 4023; GFX8-NEXT: v_mov_b32_e32 v2, s4 4024; GFX8-NEXT: s_mov_b32 s2, -1 4025; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 4026; GFX8-NEXT: s_mov_b32 s3, 0xf000 4027; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4028; GFX8-NEXT: s_endpgm 4029; 4030; GFX9-LABEL: min_i64_constant: 4031; GFX9: ; %bb.0: ; %entry 4032; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4033; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4034; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 4035; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4036; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 4037; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 4038; GFX9-NEXT: s_cbranch_execz BB20_2 4039; GFX9-NEXT: ; %bb.1: 4040; GFX9-NEXT: v_mov_b32_e32 v0, 5 4041; GFX9-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 4042; GFX9-NEXT: v_mov_b32_e32 v1, 0 4043; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4044; GFX9-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] 4045; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4046; GFX9-NEXT: BB20_2: 4047; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 4048; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4049; GFX9-NEXT: v_readfirstlane_b32 s4, v0 4050; GFX9-NEXT: v_bfrev_b32_e32 v0, -2 4051; GFX9-NEXT: v_readfirstlane_b32 s5, v1 4052; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc 4053; GFX9-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 4054; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] 4055; GFX9-NEXT: v_mov_b32_e32 v2, s5 4056; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 4057; GFX9-NEXT: v_mov_b32_e32 v2, s4 4058; GFX9-NEXT: 
s_mov_b32 s2, -1 4059; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 4060; GFX9-NEXT: s_mov_b32 s3, 0xf000 4061; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4062; GFX9-NEXT: s_endpgm 4063; 4064; GFX1064-LABEL: min_i64_constant: 4065; GFX1064: ; %bb.0: ; %entry 4066; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4067; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 4068; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0 4069; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4070; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 4071; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc 4072; GFX1064-NEXT: s_cbranch_execz BB20_2 4073; GFX1064-NEXT: ; %bb.1: 4074; GFX1064-NEXT: v_mov_b32_e32 v0, 5 4075; GFX1064-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 4076; GFX1064-NEXT: v_mov_b32_e32 v1, 0 4077; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4078; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 4079; GFX1064-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] 4080; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 4081; GFX1064-NEXT: buffer_gl0_inv 4082; GFX1064-NEXT: BB20_2: 4083; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 4084; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] 4085; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 4086; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 4087; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc 4088; GFX1064-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 4089; GFX1064-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1] 4090; GFX1064-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc 4091; GFX1064-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc 4092; GFX1064-NEXT: s_mov_b32 s2, -1 4093; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 4094; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 4095; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4096; GFX1064-NEXT: s_endpgm 4097; 4098; GFX1032-LABEL: min_i64_constant: 4099; GFX1032: ; %bb.0: ; %entry 4100; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4101; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 4102; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 4103; 
GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 4104; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo 4105; GFX1032-NEXT: s_cbranch_execz BB20_2 4106; GFX1032-NEXT: ; %bb.1: 4107; GFX1032-NEXT: v_mov_b32_e32 v0, 5 4108; GFX1032-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 4109; GFX1032-NEXT: v_mov_b32_e32 v1, 0 4110; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4111; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 4112; GFX1032-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] 4113; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 4114; GFX1032-NEXT: buffer_gl0_inv 4115; GFX1032-NEXT: BB20_2: 4116; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 4117; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 4118; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 4119; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 4120; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc_lo 4121; GFX1032-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc_lo 4122; GFX1032-NEXT: v_cmp_lt_i64_e32 vcc_lo, s[2:3], v[0:1] 4123; GFX1032-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo 4124; GFX1032-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo 4125; GFX1032-NEXT: s_mov_b32 s2, -1 4126; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 4127; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 4128; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4129; GFX1032-NEXT: s_endpgm 4130entry: 4131 %old = atomicrmw min i64 addrspace(3)* @local_var64, i64 5 acq_rel 4132 store i64 %old, i64 addrspace(1)* %out 4133 ret void 4134} 4135 4136define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) { 4137; 4138; 4139; GFX7LESS-LABEL: umax_i32_varying: 4140; GFX7LESS: ; %bb.0: ; %entry 4141; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 4142; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 4143; GFX7LESS-NEXT: s_mov_b32 m0, -1 4144; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 4145; GFX7LESS-NEXT: ds_max_rtn_u32 v0, v1, v0 4146; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 4147; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 4148; GFX7LESS-NEXT: s_mov_b32 s2, -1 4149; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 4150; 
GFX7LESS-NEXT: s_endpgm 4151; 4152; GFX8-LABEL: umax_i32_varying: 4153; GFX8: ; %bb.0: ; %entry 4154; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4155; GFX8-NEXT: v_mov_b32_e32 v2, v0 4156; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 4157; GFX8-NEXT: v_mov_b32_e32 v1, 0 4158; GFX8-NEXT: s_mov_b64 exec, s[2:3] 4159; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4160; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 4161; GFX8-NEXT: s_not_b64 exec, exec 4162; GFX8-NEXT: v_mov_b32_e32 v2, 0 4163; GFX8-NEXT: s_not_b64 exec, exec 4164; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 4165; GFX8-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 4166; GFX8-NEXT: s_nop 1 4167; GFX8-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 4168; GFX8-NEXT: s_nop 1 4169; GFX8-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 4170; GFX8-NEXT: s_nop 1 4171; GFX8-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 4172; GFX8-NEXT: s_nop 1 4173; GFX8-NEXT: v_max_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 4174; GFX8-NEXT: s_nop 1 4175; GFX8-NEXT: v_max_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 4176; GFX8-NEXT: v_readlane_b32 s4, v2, 63 4177; GFX8-NEXT: s_nop 0 4178; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 4179; GFX8-NEXT: s_mov_b64 exec, s[2:3] 4180; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4181; GFX8-NEXT: ; implicit-def: $vgpr0 4182; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 4183; GFX8-NEXT: s_cbranch_execz BB21_2 4184; GFX8-NEXT: ; %bb.1: 4185; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 4186; GFX8-NEXT: v_mov_b32_e32 v3, s4 4187; GFX8-NEXT: s_mov_b32 m0, -1 4188; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4189; GFX8-NEXT: ds_max_rtn_u32 v0, v0, v3 4190; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4191; GFX8-NEXT: BB21_2: 4192; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 4193; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4194; GFX8-NEXT: 
v_readfirstlane_b32 s2, v0 4195; GFX8-NEXT: v_mov_b32_e32 v0, v1 4196; GFX8-NEXT: v_max_u32_e32 v0, s2, v0 4197; GFX8-NEXT: s_mov_b32 s3, 0xf000 4198; GFX8-NEXT: s_mov_b32 s2, -1 4199; GFX8-NEXT: s_nop 0 4200; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 4201; GFX8-NEXT: s_endpgm 4202; 4203; GFX9-LABEL: umax_i32_varying: 4204; GFX9: ; %bb.0: ; %entry 4205; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4206; GFX9-NEXT: v_mov_b32_e32 v2, v0 4207; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 4208; GFX9-NEXT: v_mov_b32_e32 v1, 0 4209; GFX9-NEXT: s_mov_b64 exec, s[2:3] 4210; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4211; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 4212; GFX9-NEXT: s_not_b64 exec, exec 4213; GFX9-NEXT: v_mov_b32_e32 v2, 0 4214; GFX9-NEXT: s_not_b64 exec, exec 4215; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 4216; GFX9-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 4217; GFX9-NEXT: s_nop 1 4218; GFX9-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 4219; GFX9-NEXT: s_nop 1 4220; GFX9-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 4221; GFX9-NEXT: s_nop 1 4222; GFX9-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 4223; GFX9-NEXT: s_nop 1 4224; GFX9-NEXT: v_max_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 4225; GFX9-NEXT: s_nop 1 4226; GFX9-NEXT: v_max_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 4227; GFX9-NEXT: v_readlane_b32 s4, v2, 63 4228; GFX9-NEXT: s_nop 0 4229; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 4230; GFX9-NEXT: s_mov_b64 exec, s[2:3] 4231; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4232; GFX9-NEXT: ; implicit-def: $vgpr0 4233; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 4234; GFX9-NEXT: s_cbranch_execz BB21_2 4235; GFX9-NEXT: ; %bb.1: 4236; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 4237; GFX9-NEXT: v_mov_b32_e32 v3, s4 4238; GFX9-NEXT: s_waitcnt 
lgkmcnt(0) 4239; GFX9-NEXT: ds_max_rtn_u32 v0, v0, v3 4240; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4241; GFX9-NEXT: BB21_2: 4242; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 4243; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4244; GFX9-NEXT: v_readfirstlane_b32 s2, v0 4245; GFX9-NEXT: v_mov_b32_e32 v0, v1 4246; GFX9-NEXT: v_max_u32_e32 v0, s2, v0 4247; GFX9-NEXT: s_mov_b32 s3, 0xf000 4248; GFX9-NEXT: s_mov_b32 s2, -1 4249; GFX9-NEXT: s_nop 0 4250; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 4251; GFX9-NEXT: s_endpgm 4252; 4253; GFX1064-LABEL: umax_i32_varying: 4254; GFX1064: ; %bb.0: ; %entry 4255; GFX1064-NEXT: v_mov_b32_e32 v1, v0 4256; GFX1064-NEXT: s_not_b64 exec, exec 4257; GFX1064-NEXT: v_mov_b32_e32 v1, 0 4258; GFX1064-NEXT: s_not_b64 exec, exec 4259; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 4260; GFX1064-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 4261; GFX1064-NEXT: v_mov_b32_e32 v3, 0 4262; GFX1064-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 4263; GFX1064-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 4264; GFX1064-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 4265; GFX1064-NEXT: v_mov_b32_e32 v2, v1 4266; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 4267; GFX1064-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 4268; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 4269; GFX1064-NEXT: v_mov_b32_e32 v2, s4 4270; GFX1064-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 4271; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 4272; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 4273; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 4274; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4275; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 4276; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 4277; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 4278; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 4279; 
GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 4280; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 4281; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 4282; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 4283; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 4284; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 4285; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0 4286; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 4287; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 4288; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 4289; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4290; GFX1064-NEXT: s_mov_b32 s2, -1 4291; GFX1064-NEXT: ; implicit-def: $vgpr0 4292; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 4293; GFX1064-NEXT: s_cbranch_execz BB21_2 4294; GFX1064-NEXT: ; %bb.1: 4295; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo 4296; GFX1064-NEXT: v_mov_b32_e32 v4, s7 4297; GFX1064-NEXT: s_mov_b32 s3, s7 4298; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4299; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 4300; GFX1064-NEXT: ds_max_rtn_u32 v0, v7, v4 4301; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 4302; GFX1064-NEXT: buffer_gl0_inv 4303; GFX1064-NEXT: BB21_2: 4304; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 4305; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 4306; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 4307; GFX1064-NEXT: v_mov_b32_e32 v0, v3 4308; GFX1064-NEXT: v_max_u32_e32 v0, s3, v0 4309; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 4310; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 4311; GFX1064-NEXT: s_nop 0 4312; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 4313; GFX1064-NEXT: s_endpgm 4314; 4315; GFX1032-LABEL: umax_i32_varying: 4316; GFX1032: ; %bb.0: ; %entry 4317; GFX1032-NEXT: v_mov_b32_e32 v1, v0 4318; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 4319; GFX1032-NEXT: v_mov_b32_e32 v1, 0 4320; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 4321; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 4322; GFX1032-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 4323; GFX1032-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 
row_mask:0xf bank_mask:0xf bound_ctrl:1 4324; GFX1032-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 4325; GFX1032-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 4326; GFX1032-NEXT: v_mov_b32_e32 v2, v1 4327; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 4328; GFX1032-NEXT: s_mov_b32 exec_lo, s2 4329; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4330; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 4331; GFX1032-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 4332; GFX1032-NEXT: v_mov_b32_e32 v3, 0 4333; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 4334; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 4335; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 4336; GFX1032-NEXT: s_mov_b32 exec_lo, s2 4337; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 4338; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 4339; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 4340; GFX1032-NEXT: s_mov_b32 exec_lo, s2 4341; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 4342; GFX1032-NEXT: s_mov_b32 s2, -1 4343; GFX1032-NEXT: ; implicit-def: $vgpr0 4344; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 4345; GFX1032-NEXT: s_cbranch_execz BB21_2 4346; GFX1032-NEXT: ; %bb.1: 4347; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo 4348; GFX1032-NEXT: v_mov_b32_e32 v4, s4 4349; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4350; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 4351; GFX1032-NEXT: ds_max_rtn_u32 v0, v7, v4 4352; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 4353; GFX1032-NEXT: buffer_gl0_inv 4354; GFX1032-NEXT: BB21_2: 4355; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 4356; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 4357; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 4358; GFX1032-NEXT: v_mov_b32_e32 v0, v3 4359; GFX1032-NEXT: v_max_u32_e32 v0, s3, v0 4360; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 4361; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 4362; GFX1032-NEXT: s_nop 0 4363; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 
0 4364; GFX1032-NEXT: s_endpgm 4365entry: 4366 %lane = call i32 @llvm.amdgcn.workitem.id.x() 4367 %old = atomicrmw umax i32 addrspace(3)* @local_var32, i32 %lane acq_rel 4368 store i32 %old, i32 addrspace(1)* %out 4369 ret void 4370} 4371 4372define amdgpu_kernel void @umax_i64_constant(i64 addrspace(1)* %out) { 4373; 4374; 4375; GFX7LESS-LABEL: umax_i64_constant: 4376; GFX7LESS: ; %bb.0: ; %entry 4377; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 4378; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 4379; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 4380; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4381; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 4382; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc 4383; GFX7LESS-NEXT: s_cbranch_execz BB22_2 4384; GFX7LESS-NEXT: ; %bb.1: 4385; GFX7LESS-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 4386; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5 4387; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 4388; GFX7LESS-NEXT: s_mov_b32 m0, -1 4389; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 4390; GFX7LESS-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] 4391; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 4392; GFX7LESS-NEXT: BB22_2: 4393; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] 4394; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 4395; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 4396; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 4397; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 4398; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 4399; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 4400; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4 4401; GFX7LESS-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[0:1] 4402; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 4403; GFX7LESS-NEXT: v_mov_b32_e32 v1, s5 4404; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc 4405; GFX7LESS-NEXT: s_mov_b32 s2, -1 4406; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4407; GFX7LESS-NEXT: s_endpgm 4408; 4409; GFX8-LABEL: umax_i64_constant: 4410; GFX8: ; %bb.0: ; %entry 4411; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4412; GFX8-NEXT: 
v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4413; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 4414; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4415; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 4416; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 4417; GFX8-NEXT: s_cbranch_execz BB22_2 4418; GFX8-NEXT: ; %bb.1: 4419; GFX8-NEXT: v_mov_b32_e32 v0, 5 4420; GFX8-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 4421; GFX8-NEXT: v_mov_b32_e32 v1, 0 4422; GFX8-NEXT: s_mov_b32 m0, -1 4423; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4424; GFX8-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] 4425; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4426; GFX8-NEXT: BB22_2: 4427; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 4428; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4429; GFX8-NEXT: v_readfirstlane_b32 s2, v0 4430; GFX8-NEXT: v_readfirstlane_b32 s3, v1 4431; GFX8-NEXT: v_mov_b32_e32 v1, 0 4432; GFX8-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 4433; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1] 4434; GFX8-NEXT: v_mov_b32_e32 v1, s3 4435; GFX8-NEXT: v_mov_b32_e32 v2, s2 4436; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 4437; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc 4438; GFX8-NEXT: s_mov_b32 s3, 0xf000 4439; GFX8-NEXT: s_mov_b32 s2, -1 4440; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4441; GFX8-NEXT: s_endpgm 4442; 4443; GFX9-LABEL: umax_i64_constant: 4444; GFX9: ; %bb.0: ; %entry 4445; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4446; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4447; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 4448; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4449; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 4450; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 4451; GFX9-NEXT: s_cbranch_execz BB22_2 4452; GFX9-NEXT: ; %bb.1: 4453; GFX9-NEXT: v_mov_b32_e32 v0, 5 4454; GFX9-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 4455; GFX9-NEXT: v_mov_b32_e32 v1, 0 4456; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4457; GFX9-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] 4458; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4459; GFX9-NEXT: BB22_2: 4460; GFX9-NEXT: s_or_b64 
exec, exec, s[2:3] 4461; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4462; GFX9-NEXT: v_readfirstlane_b32 s2, v0 4463; GFX9-NEXT: v_readfirstlane_b32 s3, v1 4464; GFX9-NEXT: v_mov_b32_e32 v1, 0 4465; GFX9-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 4466; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1] 4467; GFX9-NEXT: v_mov_b32_e32 v1, s3 4468; GFX9-NEXT: v_mov_b32_e32 v2, s2 4469; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 4470; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc 4471; GFX9-NEXT: s_mov_b32 s3, 0xf000 4472; GFX9-NEXT: s_mov_b32 s2, -1 4473; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4474; GFX9-NEXT: s_endpgm 4475; 4476; GFX1064-LABEL: umax_i64_constant: 4477; GFX1064: ; %bb.0: ; %entry 4478; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4479; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 4480; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0 4481; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4482; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 4483; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc 4484; GFX1064-NEXT: s_cbranch_execz BB22_2 4485; GFX1064-NEXT: ; %bb.1: 4486; GFX1064-NEXT: v_mov_b32_e32 v0, 5 4487; GFX1064-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 4488; GFX1064-NEXT: v_mov_b32_e32 v1, 0 4489; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4490; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 4491; GFX1064-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] 4492; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 4493; GFX1064-NEXT: buffer_gl0_inv 4494; GFX1064-NEXT: BB22_2: 4495; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 4496; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] 4497; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 4498; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 4499; GFX1064-NEXT: v_mov_b32_e32 v1, 0 4500; GFX1064-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 4501; GFX1064-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1] 4502; GFX1064-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc 4503; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, s3, vcc 4504; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 4505; GFX1064-NEXT: 
s_mov_b32 s2, -1 4506; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 4507; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4508; GFX1064-NEXT: s_endpgm 4509; 4510; GFX1032-LABEL: umax_i64_constant: 4511; GFX1032: ; %bb.0: ; %entry 4512; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4513; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 4514; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 4515; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 4516; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo 4517; GFX1032-NEXT: s_cbranch_execz BB22_2 4518; GFX1032-NEXT: ; %bb.1: 4519; GFX1032-NEXT: v_mov_b32_e32 v0, 5 4520; GFX1032-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 4521; GFX1032-NEXT: v_mov_b32_e32 v1, 0 4522; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4523; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 4524; GFX1032-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] 4525; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 4526; GFX1032-NEXT: buffer_gl0_inv 4527; GFX1032-NEXT: BB22_2: 4528; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 4529; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 4530; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 4531; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 4532; GFX1032-NEXT: v_mov_b32_e32 v1, 0 4533; GFX1032-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc_lo 4534; GFX1032-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[2:3], v[0:1] 4535; GFX1032-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo 4536; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, s3, vcc_lo 4537; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 4538; GFX1032-NEXT: s_mov_b32 s2, -1 4539; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 4540; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4541; GFX1032-NEXT: s_endpgm 4542entry: 4543 %old = atomicrmw umax i64 addrspace(3)* @local_var64, i64 5 acq_rel 4544 store i64 %old, i64 addrspace(1)* %out 4545 ret void 4546} 4547 4548define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) { 4549; 4550; 4551; GFX7LESS-LABEL: umin_i32_varying: 4552; GFX7LESS: ; %bb.0: ; %entry 4553; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 
4554; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 4555; GFX7LESS-NEXT: s_mov_b32 m0, -1 4556; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 4557; GFX7LESS-NEXT: ds_min_rtn_u32 v0, v1, v0 4558; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 4559; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 4560; GFX7LESS-NEXT: s_mov_b32 s2, -1 4561; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 4562; GFX7LESS-NEXT: s_endpgm 4563; 4564; GFX8-LABEL: umin_i32_varying: 4565; GFX8: ; %bb.0: ; %entry 4566; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4567; GFX8-NEXT: v_mov_b32_e32 v2, v0 4568; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4569; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 4570; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 4571; GFX8-NEXT: v_mov_b32_e32 v1, -1 4572; GFX8-NEXT: s_mov_b64 exec, s[2:3] 4573; GFX8-NEXT: s_not_b64 exec, exec 4574; GFX8-NEXT: v_mov_b32_e32 v2, -1 4575; GFX8-NEXT: s_not_b64 exec, exec 4576; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 4577; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 4578; GFX8-NEXT: s_nop 1 4579; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 4580; GFX8-NEXT: s_nop 1 4581; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 4582; GFX8-NEXT: s_nop 1 4583; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 4584; GFX8-NEXT: s_nop 1 4585; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 4586; GFX8-NEXT: s_nop 1 4587; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 4588; GFX8-NEXT: v_readlane_b32 s4, v2, 63 4589; GFX8-NEXT: s_nop 0 4590; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 4591; GFX8-NEXT: s_mov_b64 exec, s[2:3] 4592; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4593; GFX8-NEXT: ; implicit-def: $vgpr0 4594; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 4595; GFX8-NEXT: s_cbranch_execz BB23_2 4596; GFX8-NEXT: ; %bb.1: 4597; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 4598; 
GFX8-NEXT: v_mov_b32_e32 v3, s4 4599; GFX8-NEXT: s_mov_b32 m0, -1 4600; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4601; GFX8-NEXT: ds_min_rtn_u32 v0, v0, v3 4602; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4603; GFX8-NEXT: BB23_2: 4604; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 4605; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4606; GFX8-NEXT: v_readfirstlane_b32 s2, v0 4607; GFX8-NEXT: v_mov_b32_e32 v0, v1 4608; GFX8-NEXT: v_min_u32_e32 v0, s2, v0 4609; GFX8-NEXT: s_mov_b32 s3, 0xf000 4610; GFX8-NEXT: s_mov_b32 s2, -1 4611; GFX8-NEXT: s_nop 0 4612; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 4613; GFX8-NEXT: s_endpgm 4614; 4615; GFX9-LABEL: umin_i32_varying: 4616; GFX9: ; %bb.0: ; %entry 4617; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4618; GFX9-NEXT: v_mov_b32_e32 v2, v0 4619; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4620; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 4621; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 4622; GFX9-NEXT: v_mov_b32_e32 v1, -1 4623; GFX9-NEXT: s_mov_b64 exec, s[2:3] 4624; GFX9-NEXT: s_not_b64 exec, exec 4625; GFX9-NEXT: v_mov_b32_e32 v2, -1 4626; GFX9-NEXT: s_not_b64 exec, exec 4627; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 4628; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 4629; GFX9-NEXT: s_nop 1 4630; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 4631; GFX9-NEXT: s_nop 1 4632; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 4633; GFX9-NEXT: s_nop 1 4634; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 4635; GFX9-NEXT: s_nop 1 4636; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 4637; GFX9-NEXT: s_nop 1 4638; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 4639; GFX9-NEXT: v_readlane_b32 s4, v2, 63 4640; GFX9-NEXT: s_nop 0 4641; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 4642; GFX9-NEXT: s_mov_b64 exec, s[2:3] 4643; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4644; GFX9-NEXT: ; 
implicit-def: $vgpr0 4645; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 4646; GFX9-NEXT: s_cbranch_execz BB23_2 4647; GFX9-NEXT: ; %bb.1: 4648; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 4649; GFX9-NEXT: v_mov_b32_e32 v3, s4 4650; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4651; GFX9-NEXT: ds_min_rtn_u32 v0, v0, v3 4652; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4653; GFX9-NEXT: BB23_2: 4654; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 4655; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4656; GFX9-NEXT: v_readfirstlane_b32 s2, v0 4657; GFX9-NEXT: v_mov_b32_e32 v0, v1 4658; GFX9-NEXT: v_min_u32_e32 v0, s2, v0 4659; GFX9-NEXT: s_mov_b32 s3, 0xf000 4660; GFX9-NEXT: s_mov_b32 s2, -1 4661; GFX9-NEXT: s_nop 0 4662; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 4663; GFX9-NEXT: s_endpgm 4664; 4665; GFX1064-LABEL: umin_i32_varying: 4666; GFX1064: ; %bb.0: ; %entry 4667; GFX1064-NEXT: v_mov_b32_e32 v1, v0 4668; GFX1064-NEXT: s_not_b64 exec, exec 4669; GFX1064-NEXT: v_mov_b32_e32 v1, -1 4670; GFX1064-NEXT: s_not_b64 exec, exec 4671; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 4672; GFX1064-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 4673; GFX1064-NEXT: v_mov_b32_e32 v3, -1 4674; GFX1064-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 4675; GFX1064-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 4676; GFX1064-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 4677; GFX1064-NEXT: v_mov_b32_e32 v2, v1 4678; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 4679; GFX1064-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 4680; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 4681; GFX1064-NEXT: v_mov_b32_e32 v2, s4 4682; GFX1064-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 4683; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 4684; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 4685; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 4686; GFX1064-NEXT: s_load_dwordx2 s[0:1], 
s[0:1], 0x24 4687; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 4688; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 4689; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 4690; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 4691; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 4692; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 4693; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 4694; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 4695; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 4696; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 4697; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0 4698; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 4699; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 4700; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 4701; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4702; GFX1064-NEXT: s_mov_b32 s2, -1 4703; GFX1064-NEXT: ; implicit-def: $vgpr0 4704; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 4705; GFX1064-NEXT: s_cbranch_execz BB23_2 4706; GFX1064-NEXT: ; %bb.1: 4707; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo 4708; GFX1064-NEXT: v_mov_b32_e32 v4, s7 4709; GFX1064-NEXT: s_mov_b32 s3, s7 4710; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4711; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 4712; GFX1064-NEXT: ds_min_rtn_u32 v0, v7, v4 4713; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 4714; GFX1064-NEXT: buffer_gl0_inv 4715; GFX1064-NEXT: BB23_2: 4716; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 4717; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 4718; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 4719; GFX1064-NEXT: v_mov_b32_e32 v0, v3 4720; GFX1064-NEXT: v_min_u32_e32 v0, s3, v0 4721; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 4722; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 4723; GFX1064-NEXT: s_nop 0 4724; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 4725; GFX1064-NEXT: s_endpgm 4726; 4727; GFX1032-LABEL: umin_i32_varying: 4728; GFX1032: ; %bb.0: ; %entry 4729; GFX1032-NEXT: v_mov_b32_e32 v1, v0 4730; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 4731; GFX1032-NEXT: v_mov_b32_e32 v1, -1 4732; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 
4733; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 4734; GFX1032-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 4735; GFX1032-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 4736; GFX1032-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 4737; GFX1032-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 4738; GFX1032-NEXT: v_mov_b32_e32 v2, v1 4739; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 4740; GFX1032-NEXT: s_mov_b32 exec_lo, s2 4741; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4742; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 4743; GFX1032-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 4744; GFX1032-NEXT: v_mov_b32_e32 v3, -1 4745; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 4746; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 4747; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 4748; GFX1032-NEXT: s_mov_b32 exec_lo, s2 4749; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 4750; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 4751; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 4752; GFX1032-NEXT: s_mov_b32 exec_lo, s2 4753; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 4754; GFX1032-NEXT: s_mov_b32 s2, -1 4755; GFX1032-NEXT: ; implicit-def: $vgpr0 4756; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 4757; GFX1032-NEXT: s_cbranch_execz BB23_2 4758; GFX1032-NEXT: ; %bb.1: 4759; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo 4760; GFX1032-NEXT: v_mov_b32_e32 v4, s4 4761; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4762; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 4763; GFX1032-NEXT: ds_min_rtn_u32 v0, v7, v4 4764; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 4765; GFX1032-NEXT: buffer_gl0_inv 4766; GFX1032-NEXT: BB23_2: 4767; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 4768; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 4769; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 4770; GFX1032-NEXT: v_mov_b32_e32 v0, v3 4771; GFX1032-NEXT: v_min_u32_e32 v0, s3, v0 4772; GFX1032-NEXT: 
s_mov_b32 s3, 0x31016000 4773; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 4774; GFX1032-NEXT: s_nop 0 4775; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 4776; GFX1032-NEXT: s_endpgm 4777entry: 4778 %lane = call i32 @llvm.amdgcn.workitem.id.x() 4779 %old = atomicrmw umin i32 addrspace(3)* @local_var32, i32 %lane acq_rel 4780 store i32 %old, i32 addrspace(1)* %out 4781 ret void 4782} 4783
; umin_i64_constant -- every invocation executes
;     atomicrmw umin i64 addrspace(3)* @local_var64, i64 5 acq_rel
; and stores the value returned by the atomic to %out.
;
; What the expected output below shows for each run line:
;   * v_mbcnt_* followed by v_cmp_eq_u32 against 0 and s_and_saveexec limit
;     the ds_min_rtn_u64 (operand 5) to lanes whose mbcnt result is 0.
;   * Once exec is restored, v_readfirstlane_b32 copies the returned 64-bit
;     value into SGPRs. v_cndmask then selects -1 (all ones) in lanes where
;     vcc is still set from the mbcnt compare and the constant 5 elsewhere,
;     and v_cmp_lt_u64 + v_cndmask keep the smaller of that per-lane value
;     and the returned one before it is stored.
;   * The expectations under the GFX1032 prefix (the +wavefrontsize32 run)
;     use only v_mbcnt_lo and 32-bit exec operations
;     (s_and_saveexec_b32, s_or_b32 exec_lo).
;
; The assertions are produced by utils/update_llc_test_checks.py; regenerate
; them with that script instead of editing them by hand.
4784define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) { 4785; 4786; 4787; GFX7LESS-LABEL: umin_i64_constant: 4788; GFX7LESS: ; %bb.0: ; %entry 4789; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 4790; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 4791; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 4792; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4793; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 4794; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc 4795; GFX7LESS-NEXT: s_cbranch_execz BB24_2 4796; GFX7LESS-NEXT: ; %bb.1: 4797; GFX7LESS-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 4798; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5 4799; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 4800; GFX7LESS-NEXT: s_mov_b32 m0, -1 4801; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 4802; GFX7LESS-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1] 4803; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 4804; GFX7LESS-NEXT: BB24_2: 4805; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] 4806; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 4807; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 4808; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 4809; GFX7LESS-NEXT: s_mov_b32 s2, -1 4810; GFX7LESS-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc 4811; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 4812; GFX7LESS-NEXT: v_mov_b32_e32 v2, s5 4813; GFX7LESS-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1] 4814; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 4815; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4 4816; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 4817; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 4818; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4819; 
GFX7LESS-NEXT: s_endpgm 4820; 4821; GFX8-LABEL: umin_i64_constant: 4822; GFX8: ; %bb.0: ; %entry 4823; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4824; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4825; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 4826; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4827; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 4828; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 4829; GFX8-NEXT: s_cbranch_execz BB24_2 4830; GFX8-NEXT: ; %bb.1: 4831; GFX8-NEXT: v_mov_b32_e32 v0, 5 4832; GFX8-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 4833; GFX8-NEXT: v_mov_b32_e32 v1, 0 4834; GFX8-NEXT: s_mov_b32 m0, -1 4835; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4836; GFX8-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1] 4837; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4838; GFX8-NEXT: BB24_2: 4839; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 4840; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4841; GFX8-NEXT: v_readfirstlane_b32 s5, v1 4842; GFX8-NEXT: v_readfirstlane_b32 s4, v0 4843; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc 4844; GFX8-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 4845; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1] 4846; GFX8-NEXT: v_mov_b32_e32 v2, s5 4847; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 4848; GFX8-NEXT: v_mov_b32_e32 v2, s4 4849; GFX8-NEXT: s_mov_b32 s2, -1 4850; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 4851; GFX8-NEXT: s_mov_b32 s3, 0xf000 4852; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4853; GFX8-NEXT: s_endpgm 4854; 4855; GFX9-LABEL: umin_i64_constant: 4856; GFX9: ; %bb.0: ; %entry 4857; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4858; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4859; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 4860; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4861; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 4862; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 4863; GFX9-NEXT: s_cbranch_execz BB24_2 4864; GFX9-NEXT: ; %bb.1: 4865; GFX9-NEXT: v_mov_b32_e32 v0, 5 4866; GFX9-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 4867; GFX9-NEXT: v_mov_b32_e32 v1, 0 
4868; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4869; GFX9-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1] 4870; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4871; GFX9-NEXT: BB24_2: 4872; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 4873; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4874; GFX9-NEXT: v_readfirstlane_b32 s5, v1 4875; GFX9-NEXT: v_readfirstlane_b32 s4, v0 4876; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc 4877; GFX9-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 4878; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1] 4879; GFX9-NEXT: v_mov_b32_e32 v2, s5 4880; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 4881; GFX9-NEXT: v_mov_b32_e32 v2, s4 4882; GFX9-NEXT: s_mov_b32 s2, -1 4883; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 4884; GFX9-NEXT: s_mov_b32 s3, 0xf000 4885; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4886; GFX9-NEXT: s_endpgm 4887; 4888; GFX1064-LABEL: umin_i64_constant: 4889; GFX1064: ; %bb.0: ; %entry 4890; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4891; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 4892; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0 4893; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4894; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 4895; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc 4896; GFX1064-NEXT: s_cbranch_execz BB24_2 4897; GFX1064-NEXT: ; %bb.1: 4898; GFX1064-NEXT: v_mov_b32_e32 v0, 5 4899; GFX1064-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 4900; GFX1064-NEXT: v_mov_b32_e32 v1, 0 4901; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4902; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 4903; GFX1064-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1] 4904; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 4905; GFX1064-NEXT: buffer_gl0_inv 4906; GFX1064-NEXT: BB24_2: 4907; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 4908; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] 4909; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 4910; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 4911; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc 4912; GFX1064-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 4913; GFX1064-NEXT: 
v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1] 4914; GFX1064-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc 4915; GFX1064-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc 4916; GFX1064-NEXT: s_mov_b32 s2, -1 4917; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 4918; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 4919; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4920; GFX1064-NEXT: s_endpgm 4921; 4922; GFX1032-LABEL: umin_i64_constant: 4923; GFX1032: ; %bb.0: ; %entry 4924; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4925; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 4926; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 4927; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 4928; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo 4929; GFX1032-NEXT: s_cbranch_execz BB24_2 4930; GFX1032-NEXT: ; %bb.1: 4931; GFX1032-NEXT: v_mov_b32_e32 v0, 5 4932; GFX1032-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 4933; GFX1032-NEXT: v_mov_b32_e32 v1, 0 4934; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4935; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 4936; GFX1032-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1] 4937; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 4938; GFX1032-NEXT: buffer_gl0_inv 4939; GFX1032-NEXT: BB24_2: 4940; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 4941; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 4942; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 4943; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 4944; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo 4945; GFX1032-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc_lo 4946; GFX1032-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[2:3], v[0:1] 4947; GFX1032-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo 4948; GFX1032-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo 4949; GFX1032-NEXT: s_mov_b32 s2, -1 4950; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 4951; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 4952; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4953; GFX1032-NEXT: s_endpgm 4954entry: 4955 %old = atomicrmw umin i64 addrspace(3)* @local_var64, i64 5 acq_rel 4956 store i64 %old, i64 addrspace(1)* %out 4957 ret 
void 4958} 4959