; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX7LESS %s
; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s
; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064 %s
; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032 %s

; Configurations exercised by the llc invocations above (all of them pass
; -amdgpu-atomic-optimizations=true):
;   GFX7LESS - no -mcpu given (default subtarget)
;   GFX8     - -mcpu=tonga
;   GFX9     - -mcpu=gfx900
;   GFX1064  - -mcpu=gfx1010 with +wavefrontsize64 / -wavefrontsize32
;   GFX1032  - -mcpu=gfx1010 with +wavefrontsize32 / -wavefrontsize64
; The check lines in each function are generated output; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.

; Work-item id intrinsic; the *_varying tests use its result as the
; per-lane (non-uniform) operand of the atomic.
declare i32 @llvm.amdgcn.workitem.id.x()

; Address space 3 globals that serve as the atomic targets. @local_var32 is the
; pointer operand of the i32 atomicrmw tests; @local_var64 is presumably the
; i64 counterpart used further down the file (verify against the i64 tests).
@local_var32 = addrspace(3) global i32 undef, align 4
@local_var64 = addrspace(3) global i64 undef, align 8

; Show what the atomic optimization pass will do for local pointers.
14 15define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out) { 16; 17; 18; GFX7LESS-LABEL: add_i32_constant: 19; GFX7LESS: ; %bb.0: ; %entry 20; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec 21; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 22; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 23; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 24; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 25; GFX7LESS-NEXT: ; implicit-def: $vgpr1 26; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc 27; GFX7LESS-NEXT: s_cbranch_execz BB0_2 28; GFX7LESS-NEXT: ; %bb.1: 29; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 30; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 31; GFX7LESS-NEXT: v_mul_u32_u24_e64 v2, s2, 5 32; GFX7LESS-NEXT: s_mov_b32 m0, -1 33; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 34; GFX7LESS-NEXT: ds_add_rtn_u32 v1, v1, v2 35; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 36; GFX7LESS-NEXT: BB0_2: 37; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] 38; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 39; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v1 40; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 41; GFX7LESS-NEXT: v_mad_u32_u24 v0, v0, 5, s2 42; GFX7LESS-NEXT: s_mov_b32 s2, -1 43; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 44; GFX7LESS-NEXT: s_endpgm 45; 46; GFX8-LABEL: add_i32_constant: 47; GFX8: ; %bb.0: ; %entry 48; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 49; GFX8-NEXT: s_mov_b64 s[2:3], exec 50; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 51; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 52; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 53; GFX8-NEXT: ; implicit-def: $vgpr1 54; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 55; GFX8-NEXT: s_cbranch_execz BB0_2 56; GFX8-NEXT: ; %bb.1: 57; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 58; GFX8-NEXT: v_mul_u32_u24_e64 v1, s2, 5 59; GFX8-NEXT: v_mov_b32_e32 v2, local_var32@abs32@lo 60; GFX8-NEXT: s_mov_b32 m0, -1 61; GFX8-NEXT: s_waitcnt lgkmcnt(0) 62; GFX8-NEXT: ds_add_rtn_u32 v1, v2, v1 63; GFX8-NEXT: s_waitcnt lgkmcnt(0) 64; GFX8-NEXT: BB0_2: 65; GFX8-NEXT: 
s_or_b64 exec, exec, s[4:5] 66; GFX8-NEXT: s_waitcnt lgkmcnt(0) 67; GFX8-NEXT: v_readfirstlane_b32 s2, v1 68; GFX8-NEXT: v_mad_u32_u24 v0, v0, 5, s2 69; GFX8-NEXT: s_mov_b32 s3, 0xf000 70; GFX8-NEXT: s_mov_b32 s2, -1 71; GFX8-NEXT: s_nop 1 72; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 73; GFX8-NEXT: s_endpgm 74; 75; GFX9-LABEL: add_i32_constant: 76; GFX9: ; %bb.0: ; %entry 77; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 78; GFX9-NEXT: s_mov_b64 s[2:3], exec 79; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 80; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 81; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 82; GFX9-NEXT: ; implicit-def: $vgpr1 83; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 84; GFX9-NEXT: s_cbranch_execz BB0_2 85; GFX9-NEXT: ; %bb.1: 86; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 87; GFX9-NEXT: v_mul_u32_u24_e64 v1, s2, 5 88; GFX9-NEXT: v_mov_b32_e32 v2, local_var32@abs32@lo 89; GFX9-NEXT: s_waitcnt lgkmcnt(0) 90; GFX9-NEXT: ds_add_rtn_u32 v1, v2, v1 91; GFX9-NEXT: s_waitcnt lgkmcnt(0) 92; GFX9-NEXT: BB0_2: 93; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 94; GFX9-NEXT: s_waitcnt lgkmcnt(0) 95; GFX9-NEXT: v_readfirstlane_b32 s2, v1 96; GFX9-NEXT: v_mad_u32_u24 v0, v0, 5, s2 97; GFX9-NEXT: s_mov_b32 s3, 0xf000 98; GFX9-NEXT: s_mov_b32 s2, -1 99; GFX9-NEXT: s_nop 1 100; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 101; GFX9-NEXT: s_endpgm 102; 103; GFX1064-LABEL: add_i32_constant: 104; GFX1064: ; %bb.0: ; %entry 105; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 106; GFX1064-NEXT: s_mov_b64 s[2:3], exec 107; GFX1064-NEXT: ; implicit-def: $vgpr1 108; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 109; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s3, v0 110; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 111; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 112; GFX1064-NEXT: s_cbranch_execz BB0_2 113; GFX1064-NEXT: ; %bb.1: 114; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 115; GFX1064-NEXT: v_mov_b32_e32 v2, local_var32@abs32@lo 116; GFX1064-NEXT: v_mul_u32_u24_e64 v1, s2, 5 117; 
GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 118; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 119; GFX1064-NEXT: ds_add_rtn_u32 v1, v2, v1 120; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 121; GFX1064-NEXT: buffer_gl0_inv 122; GFX1064-NEXT: BB0_2: 123; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 124; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 125; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 126; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 127; GFX1064-NEXT: v_mad_u32_u24 v0, v0, 5, s2 128; GFX1064-NEXT: s_mov_b32 s2, -1 129; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 130; GFX1064-NEXT: s_nop 0 131; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 132; GFX1064-NEXT: s_endpgm 133; 134; GFX1032-LABEL: add_i32_constant: 135; GFX1032: ; %bb.0: ; %entry 136; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 137; GFX1032-NEXT: s_mov_b32 s2, exec_lo 138; GFX1032-NEXT: ; implicit-def: $vgpr1 139; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 140; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 141; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 142; GFX1032-NEXT: s_cbranch_execz BB0_2 143; GFX1032-NEXT: ; %bb.1: 144; GFX1032-NEXT: s_bcnt1_i32_b32 s2, s2 145; GFX1032-NEXT: v_mov_b32_e32 v2, local_var32@abs32@lo 146; GFX1032-NEXT: v_mul_u32_u24_e64 v1, s2, 5 147; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 148; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 149; GFX1032-NEXT: ds_add_rtn_u32 v1, v2, v1 150; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 151; GFX1032-NEXT: buffer_gl0_inv 152; GFX1032-NEXT: BB0_2: 153; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 154; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 155; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 156; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 157; GFX1032-NEXT: v_mad_u32_u24 v0, v0, 5, s2 158; GFX1032-NEXT: s_mov_b32 s2, -1 159; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 160; GFX1032-NEXT: s_nop 0 161; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 162; GFX1032-NEXT: s_endpgm 163entry: 164 %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 5 acq_rel 165 store i32 %old, i32 
addrspace(1)* %out 166 ret void 167} 168 169define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 %additive) { 170; 171; 172; GFX7LESS-LABEL: add_i32_uniform: 173; GFX7LESS: ; %bb.0: ; %entry 174; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec 175; GFX7LESS-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 176; GFX7LESS-NEXT: s_load_dword s0, s[0:1], 0xb 177; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 178; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 179; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 180; GFX7LESS-NEXT: ; implicit-def: $vgpr1 181; GFX7LESS-NEXT: s_and_saveexec_b64 s[6:7], vcc 182; GFX7LESS-NEXT: s_cbranch_execz BB1_2 183; GFX7LESS-NEXT: ; %bb.1: 184; GFX7LESS-NEXT: s_bcnt1_i32_b64 s1, s[2:3] 185; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 186; GFX7LESS-NEXT: s_mul_i32 s1, s0, s1 187; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 188; GFX7LESS-NEXT: v_mov_b32_e32 v2, s1 189; GFX7LESS-NEXT: s_mov_b32 m0, -1 190; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 191; GFX7LESS-NEXT: ds_add_rtn_u32 v1, v1, v2 192; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 193; GFX7LESS-NEXT: BB1_2: 194; GFX7LESS-NEXT: s_or_b64 exec, exec, s[6:7] 195; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 196; GFX7LESS-NEXT: v_readfirstlane_b32 s1, v1 197; GFX7LESS-NEXT: v_mul_lo_u32 v0, s0, v0 198; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 199; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s1, v0 200; GFX7LESS-NEXT: s_mov_b32 s6, -1 201; GFX7LESS-NEXT: buffer_store_dword v0, off, s[4:7], 0 202; GFX7LESS-NEXT: s_endpgm 203; 204; GFX8-LABEL: add_i32_uniform: 205; GFX8: ; %bb.0: ; %entry 206; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 207; GFX8-NEXT: s_load_dword s0, s[0:1], 0x2c 208; GFX8-NEXT: s_mov_b64 s[2:3], exec 209; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 210; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 211; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 212; GFX8-NEXT: ; implicit-def: $vgpr1 213; GFX8-NEXT: s_and_saveexec_b64 s[6:7], vcc 214; GFX8-NEXT: s_cbranch_execz BB1_2 215; GFX8-NEXT: ; %bb.1: 216; GFX8-NEXT: 
s_bcnt1_i32_b64 s1, s[2:3] 217; GFX8-NEXT: s_waitcnt lgkmcnt(0) 218; GFX8-NEXT: s_mul_i32 s1, s0, s1 219; GFX8-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 220; GFX8-NEXT: v_mov_b32_e32 v2, s1 221; GFX8-NEXT: s_mov_b32 m0, -1 222; GFX8-NEXT: s_waitcnt lgkmcnt(0) 223; GFX8-NEXT: ds_add_rtn_u32 v1, v1, v2 224; GFX8-NEXT: s_waitcnt lgkmcnt(0) 225; GFX8-NEXT: BB1_2: 226; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] 227; GFX8-NEXT: s_waitcnt lgkmcnt(0) 228; GFX8-NEXT: v_mul_lo_u32 v0, s0, v0 229; GFX8-NEXT: v_readfirstlane_b32 s0, v1 230; GFX8-NEXT: s_mov_b32 s7, 0xf000 231; GFX8-NEXT: s_mov_b32 s6, -1 232; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 233; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0 234; GFX8-NEXT: s_endpgm 235; 236; GFX9-LABEL: add_i32_uniform: 237; GFX9: ; %bb.0: ; %entry 238; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 239; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c 240; GFX9-NEXT: s_mov_b64 s[6:7], exec 241; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 242; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 243; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 244; GFX9-NEXT: ; implicit-def: $vgpr1 245; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc 246; GFX9-NEXT: s_cbranch_execz BB1_2 247; GFX9-NEXT: ; %bb.1: 248; GFX9-NEXT: s_bcnt1_i32_b64 s3, s[6:7] 249; GFX9-NEXT: s_waitcnt lgkmcnt(0) 250; GFX9-NEXT: s_mul_i32 s3, s2, s3 251; GFX9-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 252; GFX9-NEXT: v_mov_b32_e32 v2, s3 253; GFX9-NEXT: s_waitcnt lgkmcnt(0) 254; GFX9-NEXT: ds_add_rtn_u32 v1, v1, v2 255; GFX9-NEXT: s_waitcnt lgkmcnt(0) 256; GFX9-NEXT: BB1_2: 257; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] 258; GFX9-NEXT: s_waitcnt lgkmcnt(0) 259; GFX9-NEXT: v_mul_lo_u32 v0, s2, v0 260; GFX9-NEXT: v_readfirstlane_b32 s0, v1 261; GFX9-NEXT: s_mov_b32 s7, 0xf000 262; GFX9-NEXT: s_mov_b32 s6, -1 263; GFX9-NEXT: v_add_u32_e32 v0, s0, v0 264; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 265; GFX9-NEXT: s_endpgm 266; 267; GFX1064-LABEL: add_i32_uniform: 268; GFX1064: ; %bb.0: ; %entry 269; 
GFX1064-NEXT: s_clause 0x1 270; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 271; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x2c 272; GFX1064-NEXT: s_mov_b64 s[6:7], exec 273; GFX1064-NEXT: ; implicit-def: $vgpr1 274; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 275; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s7, v0 276; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 277; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc 278; GFX1064-NEXT: s_cbranch_execz BB1_2 279; GFX1064-NEXT: ; %bb.1: 280; GFX1064-NEXT: s_bcnt1_i32_b64 s3, s[6:7] 281; GFX1064-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 282; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 283; GFX1064-NEXT: s_mul_i32 s3, s2, s3 284; GFX1064-NEXT: v_mov_b32_e32 v2, s3 285; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 286; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 287; GFX1064-NEXT: ds_add_rtn_u32 v1, v1, v2 288; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 289; GFX1064-NEXT: buffer_gl0_inv 290; GFX1064-NEXT: BB1_2: 291; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 292; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] 293; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 294; GFX1064-NEXT: v_mul_lo_u32 v0, s2, v0 295; GFX1064-NEXT: v_readfirstlane_b32 s0, v1 296; GFX1064-NEXT: s_mov_b32 s7, 0x31016000 297; GFX1064-NEXT: s_mov_b32 s6, -1 298; GFX1064-NEXT: v_add_nc_u32_e32 v0, s0, v0 299; GFX1064-NEXT: buffer_store_dword v0, off, s[4:7], 0 300; GFX1064-NEXT: s_endpgm 301; 302; GFX1032-LABEL: add_i32_uniform: 303; GFX1032: ; %bb.0: ; %entry 304; GFX1032-NEXT: s_clause 0x1 305; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 306; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x2c 307; GFX1032-NEXT: s_mov_b32 s3, exec_lo 308; GFX1032-NEXT: ; implicit-def: $vgpr1 309; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s3, 0 310; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 311; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo 312; GFX1032-NEXT: s_cbranch_execz BB1_2 313; GFX1032-NEXT: ; %bb.1: 314; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s3 315; GFX1032-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 316; 
GFX1032-NEXT: s_waitcnt lgkmcnt(0) 317; GFX1032-NEXT: s_mul_i32 s1, s2, s1 318; GFX1032-NEXT: v_mov_b32_e32 v2, s1 319; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 320; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 321; GFX1032-NEXT: ds_add_rtn_u32 v1, v1, v2 322; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 323; GFX1032-NEXT: buffer_gl0_inv 324; GFX1032-NEXT: BB1_2: 325; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 326; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 327; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 328; GFX1032-NEXT: v_mul_lo_u32 v0, s2, v0 329; GFX1032-NEXT: v_readfirstlane_b32 s0, v1 330; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 331; GFX1032-NEXT: s_mov_b32 s6, -1 332; GFX1032-NEXT: v_add_nc_u32_e32 v0, s0, v0 333; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0 334; GFX1032-NEXT: s_endpgm 335entry: 336 %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %additive acq_rel 337 store i32 %old, i32 addrspace(1)* %out 338 ret void 339} 340 341define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out) { 342; 343; 344; GFX7LESS-LABEL: add_i32_varying: 345; GFX7LESS: ; %bb.0: ; %entry 346; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 347; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 348; GFX7LESS-NEXT: s_mov_b32 m0, -1 349; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 350; GFX7LESS-NEXT: ds_add_rtn_u32 v0, v1, v0 351; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 352; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 353; GFX7LESS-NEXT: s_mov_b32 s2, -1 354; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 355; GFX7LESS-NEXT: s_endpgm 356; 357; GFX8-LABEL: add_i32_varying: 358; GFX8: ; %bb.0: ; %entry 359; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 360; GFX8-NEXT: v_mov_b32_e32 v2, v0 361; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 362; GFX8-NEXT: v_mov_b32_e32 v1, 0 363; GFX8-NEXT: s_mov_b64 exec, s[2:3] 364; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 365; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 366; GFX8-NEXT: s_not_b64 exec, exec 367; GFX8-NEXT: v_mov_b32_e32 v2, 0 368; 
GFX8-NEXT: s_not_b64 exec, exec 369; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 370; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 371; GFX8-NEXT: s_nop 1 372; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 373; GFX8-NEXT: s_nop 1 374; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 375; GFX8-NEXT: s_nop 1 376; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 377; GFX8-NEXT: s_nop 1 378; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 379; GFX8-NEXT: s_nop 1 380; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 381; GFX8-NEXT: v_readlane_b32 s4, v2, 63 382; GFX8-NEXT: s_nop 0 383; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 384; GFX8-NEXT: s_mov_b64 exec, s[2:3] 385; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 386; GFX8-NEXT: ; implicit-def: $vgpr0 387; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 388; GFX8-NEXT: s_cbranch_execz BB2_2 389; GFX8-NEXT: ; %bb.1: 390; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 391; GFX8-NEXT: v_mov_b32_e32 v3, s4 392; GFX8-NEXT: s_mov_b32 m0, -1 393; GFX8-NEXT: s_waitcnt lgkmcnt(0) 394; GFX8-NEXT: ds_add_rtn_u32 v0, v0, v3 395; GFX8-NEXT: s_waitcnt lgkmcnt(0) 396; GFX8-NEXT: BB2_2: 397; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 398; GFX8-NEXT: s_waitcnt lgkmcnt(0) 399; GFX8-NEXT: v_readfirstlane_b32 s2, v0 400; GFX8-NEXT: v_mov_b32_e32 v0, v1 401; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0 402; GFX8-NEXT: s_mov_b32 s3, 0xf000 403; GFX8-NEXT: s_mov_b32 s2, -1 404; GFX8-NEXT: s_nop 0 405; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 406; GFX8-NEXT: s_endpgm 407; 408; GFX9-LABEL: add_i32_varying: 409; GFX9: ; %bb.0: ; %entry 410; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 411; GFX9-NEXT: v_mov_b32_e32 v2, v0 412; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 413; GFX9-NEXT: 
v_mov_b32_e32 v1, 0 414; GFX9-NEXT: s_mov_b64 exec, s[2:3] 415; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 416; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 417; GFX9-NEXT: s_not_b64 exec, exec 418; GFX9-NEXT: v_mov_b32_e32 v2, 0 419; GFX9-NEXT: s_not_b64 exec, exec 420; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 421; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 422; GFX9-NEXT: s_nop 1 423; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 424; GFX9-NEXT: s_nop 1 425; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 426; GFX9-NEXT: s_nop 1 427; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 428; GFX9-NEXT: s_nop 1 429; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 430; GFX9-NEXT: s_nop 1 431; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 432; GFX9-NEXT: v_readlane_b32 s4, v2, 63 433; GFX9-NEXT: s_nop 0 434; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 435; GFX9-NEXT: s_mov_b64 exec, s[2:3] 436; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 437; GFX9-NEXT: ; implicit-def: $vgpr0 438; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 439; GFX9-NEXT: s_cbranch_execz BB2_2 440; GFX9-NEXT: ; %bb.1: 441; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 442; GFX9-NEXT: v_mov_b32_e32 v3, s4 443; GFX9-NEXT: s_waitcnt lgkmcnt(0) 444; GFX9-NEXT: ds_add_rtn_u32 v0, v0, v3 445; GFX9-NEXT: s_waitcnt lgkmcnt(0) 446; GFX9-NEXT: BB2_2: 447; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 448; GFX9-NEXT: s_waitcnt lgkmcnt(0) 449; GFX9-NEXT: v_readfirstlane_b32 s2, v0 450; GFX9-NEXT: v_mov_b32_e32 v0, v1 451; GFX9-NEXT: v_add_u32_e32 v0, s2, v0 452; GFX9-NEXT: s_mov_b32 s3, 0xf000 453; GFX9-NEXT: s_mov_b32 s2, -1 454; GFX9-NEXT: s_nop 0 455; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 456; GFX9-NEXT: s_endpgm 457; 458; GFX1064-LABEL: add_i32_varying: 459; GFX1064: ; 
%bb.0: ; %entry 460; GFX1064-NEXT: v_mov_b32_e32 v1, v0 461; GFX1064-NEXT: s_not_b64 exec, exec 462; GFX1064-NEXT: v_mov_b32_e32 v1, 0 463; GFX1064-NEXT: s_not_b64 exec, exec 464; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 465; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 466; GFX1064-NEXT: v_mov_b32_e32 v3, 0 467; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 468; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 469; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 470; GFX1064-NEXT: v_mov_b32_e32 v2, v1 471; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 472; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 473; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 474; GFX1064-NEXT: v_mov_b32_e32 v2, s4 475; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 476; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 477; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 478; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 479; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 480; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 481; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 482; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 483; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 484; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 485; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 486; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 487; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 488; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 489; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 490; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0 491; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 492; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 493; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 494; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 495; GFX1064-NEXT: s_mov_b32 s2, -1 496; GFX1064-NEXT: ; 
implicit-def: $vgpr0 497; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 498; GFX1064-NEXT: s_cbranch_execz BB2_2 499; GFX1064-NEXT: ; %bb.1: 500; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo 501; GFX1064-NEXT: v_mov_b32_e32 v4, s7 502; GFX1064-NEXT: s_mov_b32 s3, s7 503; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 504; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 505; GFX1064-NEXT: ds_add_rtn_u32 v0, v7, v4 506; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 507; GFX1064-NEXT: buffer_gl0_inv 508; GFX1064-NEXT: BB2_2: 509; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 510; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 511; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 512; GFX1064-NEXT: v_mov_b32_e32 v0, v3 513; GFX1064-NEXT: v_add_nc_u32_e32 v0, s3, v0 514; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 515; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 516; GFX1064-NEXT: s_nop 0 517; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 518; GFX1064-NEXT: s_endpgm 519; 520; GFX1032-LABEL: add_i32_varying: 521; GFX1032: ; %bb.0: ; %entry 522; GFX1032-NEXT: v_mov_b32_e32 v1, v0 523; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 524; GFX1032-NEXT: v_mov_b32_e32 v1, 0 525; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 526; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 527; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 528; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 529; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 530; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 531; GFX1032-NEXT: v_mov_b32_e32 v2, v1 532; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 533; GFX1032-NEXT: s_mov_b32 exec_lo, s2 534; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 535; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 536; GFX1032-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 537; GFX1032-NEXT: v_mov_b32_e32 v3, 0 538; GFX1032-NEXT: 
v_readlane_b32 s3, v1, 15 539; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 540; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 541; GFX1032-NEXT: s_mov_b32 exec_lo, s2 542; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 543; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 544; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 545; GFX1032-NEXT: s_mov_b32 exec_lo, s2 546; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 547; GFX1032-NEXT: s_mov_b32 s2, -1 548; GFX1032-NEXT: ; implicit-def: $vgpr0 549; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 550; GFX1032-NEXT: s_cbranch_execz BB2_2 551; GFX1032-NEXT: ; %bb.1: 552; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo 553; GFX1032-NEXT: v_mov_b32_e32 v4, s4 554; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 555; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 556; GFX1032-NEXT: ds_add_rtn_u32 v0, v7, v4 557; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 558; GFX1032-NEXT: buffer_gl0_inv 559; GFX1032-NEXT: BB2_2: 560; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 561; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 562; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 563; GFX1032-NEXT: v_mov_b32_e32 v0, v3 564; GFX1032-NEXT: v_add_nc_u32_e32 v0, s3, v0 565; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 566; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 567; GFX1032-NEXT: s_nop 0 568; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 569; GFX1032-NEXT: s_endpgm 570entry: 571 %lane = call i32 @llvm.amdgcn.workitem.id.x() 572 %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %lane acq_rel 573 store i32 %old, i32 addrspace(1)* %out 574 ret void 575} 576 577define amdgpu_kernel void @add_i32_varying_gfx1032(i32 addrspace(1)* %out) { 578; 579; 580; GFX7LESS-LABEL: add_i32_varying_gfx1032: 581; GFX7LESS: ; %bb.0: ; %entry 582; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 583; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 584; GFX7LESS-NEXT: s_mov_b32 m0, -1 585; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 586; GFX7LESS-NEXT: ds_add_rtn_u32 v0, v1, v0 587; 
GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 588; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 589; GFX7LESS-NEXT: s_mov_b32 s2, -1 590; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 591; GFX7LESS-NEXT: s_endpgm 592; 593; GFX8-LABEL: add_i32_varying_gfx1032: 594; GFX8: ; %bb.0: ; %entry 595; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 596; GFX8-NEXT: v_mov_b32_e32 v2, v0 597; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 598; GFX8-NEXT: v_mov_b32_e32 v1, 0 599; GFX8-NEXT: s_mov_b64 exec, s[2:3] 600; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 601; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 602; GFX8-NEXT: s_not_b64 exec, exec 603; GFX8-NEXT: v_mov_b32_e32 v2, 0 604; GFX8-NEXT: s_not_b64 exec, exec 605; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 606; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 607; GFX8-NEXT: s_nop 1 608; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 609; GFX8-NEXT: s_nop 1 610; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 611; GFX8-NEXT: s_nop 1 612; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 613; GFX8-NEXT: s_nop 1 614; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 615; GFX8-NEXT: s_nop 1 616; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 617; GFX8-NEXT: v_readlane_b32 s4, v2, 63 618; GFX8-NEXT: s_nop 0 619; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 620; GFX8-NEXT: s_mov_b64 exec, s[2:3] 621; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 622; GFX8-NEXT: ; implicit-def: $vgpr0 623; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 624; GFX8-NEXT: s_cbranch_execz BB3_2 625; GFX8-NEXT: ; %bb.1: 626; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 627; GFX8-NEXT: v_mov_b32_e32 v3, s4 628; GFX8-NEXT: s_mov_b32 m0, -1 629; GFX8-NEXT: s_waitcnt lgkmcnt(0) 630; GFX8-NEXT: ds_add_rtn_u32 v0, v0, v3 631; 
GFX8-NEXT: s_waitcnt lgkmcnt(0) 632; GFX8-NEXT: BB3_2: 633; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 634; GFX8-NEXT: s_waitcnt lgkmcnt(0) 635; GFX8-NEXT: v_readfirstlane_b32 s2, v0 636; GFX8-NEXT: v_mov_b32_e32 v0, v1 637; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0 638; GFX8-NEXT: s_mov_b32 s3, 0xf000 639; GFX8-NEXT: s_mov_b32 s2, -1 640; GFX8-NEXT: s_nop 0 641; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 642; GFX8-NEXT: s_endpgm 643; 644; GFX9-LABEL: add_i32_varying_gfx1032: 645; GFX9: ; %bb.0: ; %entry 646; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 647; GFX9-NEXT: v_mov_b32_e32 v2, v0 648; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 649; GFX9-NEXT: v_mov_b32_e32 v1, 0 650; GFX9-NEXT: s_mov_b64 exec, s[2:3] 651; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 652; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 653; GFX9-NEXT: s_not_b64 exec, exec 654; GFX9-NEXT: v_mov_b32_e32 v2, 0 655; GFX9-NEXT: s_not_b64 exec, exec 656; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 657; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 658; GFX9-NEXT: s_nop 1 659; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 660; GFX9-NEXT: s_nop 1 661; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 662; GFX9-NEXT: s_nop 1 663; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 664; GFX9-NEXT: s_nop 1 665; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 666; GFX9-NEXT: s_nop 1 667; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 668; GFX9-NEXT: v_readlane_b32 s4, v2, 63 669; GFX9-NEXT: s_nop 0 670; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 671; GFX9-NEXT: s_mov_b64 exec, s[2:3] 672; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 673; GFX9-NEXT: ; implicit-def: $vgpr0 674; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 675; GFX9-NEXT: s_cbranch_execz BB3_2 676; GFX9-NEXT: ; %bb.1: 677; 
GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 678; GFX9-NEXT: v_mov_b32_e32 v3, s4 679; GFX9-NEXT: s_waitcnt lgkmcnt(0) 680; GFX9-NEXT: ds_add_rtn_u32 v0, v0, v3 681; GFX9-NEXT: s_waitcnt lgkmcnt(0) 682; GFX9-NEXT: BB3_2: 683; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 684; GFX9-NEXT: s_waitcnt lgkmcnt(0) 685; GFX9-NEXT: v_readfirstlane_b32 s2, v0 686; GFX9-NEXT: v_mov_b32_e32 v0, v1 687; GFX9-NEXT: v_add_u32_e32 v0, s2, v0 688; GFX9-NEXT: s_mov_b32 s3, 0xf000 689; GFX9-NEXT: s_mov_b32 s2, -1 690; GFX9-NEXT: s_nop 0 691; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 692; GFX9-NEXT: s_endpgm 693; 694; GFX1064-LABEL: add_i32_varying_gfx1032: 695; GFX1064: ; %bb.0: ; %entry 696; GFX1064-NEXT: v_mov_b32_e32 v1, v0 697; GFX1064-NEXT: s_not_b64 exec, exec 698; GFX1064-NEXT: v_mov_b32_e32 v1, 0 699; GFX1064-NEXT: s_not_b64 exec, exec 700; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 701; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 702; GFX1064-NEXT: v_mov_b32_e32 v3, 0 703; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 704; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 705; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 706; GFX1064-NEXT: v_mov_b32_e32 v2, v1 707; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 708; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 709; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 710; GFX1064-NEXT: v_mov_b32_e32 v2, s4 711; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 712; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 713; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 714; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 715; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 716; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 717; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 718; 
GFX1064-NEXT: v_writelane_b32 v3, s4, 16 719; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 720; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 721; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 722; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 723; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 724; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 725; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 726; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0 727; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 728; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 729; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 730; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 731; GFX1064-NEXT: s_mov_b32 s2, -1 732; GFX1064-NEXT: ; implicit-def: $vgpr0 733; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 734; GFX1064-NEXT: s_cbranch_execz BB3_2 735; GFX1064-NEXT: ; %bb.1: 736; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo 737; GFX1064-NEXT: v_mov_b32_e32 v4, s7 738; GFX1064-NEXT: s_mov_b32 s3, s7 739; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 740; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 741; GFX1064-NEXT: ds_add_rtn_u32 v0, v7, v4 742; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 743; GFX1064-NEXT: buffer_gl0_inv 744; GFX1064-NEXT: BB3_2: 745; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 746; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 747; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 748; GFX1064-NEXT: v_mov_b32_e32 v0, v3 749; GFX1064-NEXT: v_add_nc_u32_e32 v0, s3, v0 750; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 751; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 752; GFX1064-NEXT: s_nop 0 753; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 754; GFX1064-NEXT: s_endpgm 755; 756; GFX1032-LABEL: add_i32_varying_gfx1032: 757; GFX1032: ; %bb.0: ; %entry 758; GFX1032-NEXT: v_mov_b32_e32 v1, v0 759; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 760; GFX1032-NEXT: v_mov_b32_e32 v1, 0 761; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 762; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 763; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 764; 
GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 765; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 766; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 767; GFX1032-NEXT: v_mov_b32_e32 v2, v1 768; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 769; GFX1032-NEXT: s_mov_b32 exec_lo, s2 770; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 771; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 772; GFX1032-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 773; GFX1032-NEXT: v_mov_b32_e32 v3, 0 774; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 775; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 776; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 777; GFX1032-NEXT: s_mov_b32 exec_lo, s2 778; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 779; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 780; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 781; GFX1032-NEXT: s_mov_b32 exec_lo, s2 782; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 783; GFX1032-NEXT: s_mov_b32 s2, -1 784; GFX1032-NEXT: ; implicit-def: $vgpr0 785; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 786; GFX1032-NEXT: s_cbranch_execz BB3_2 787; GFX1032-NEXT: ; %bb.1: 788; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo 789; GFX1032-NEXT: v_mov_b32_e32 v4, s4 790; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 791; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 792; GFX1032-NEXT: ds_add_rtn_u32 v0, v7, v4 793; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 794; GFX1032-NEXT: buffer_gl0_inv 795; GFX1032-NEXT: BB3_2: 796; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 797; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 798; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 799; GFX1032-NEXT: v_mov_b32_e32 v0, v3 800; GFX1032-NEXT: v_add_nc_u32_e32 v0, s3, v0 801; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 802; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 803; GFX1032-NEXT: s_nop 0 804; GFX1032-NEXT: 
buffer_store_dword v0, off, s[0:3], 0 805; GFX1032-NEXT: s_endpgm 806entry: 807 %lane = call i32 @llvm.amdgcn.workitem.id.x() 808 %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %lane acq_rel 809 store i32 %old, i32 addrspace(1)* %out 810 ret void 811} 812 813define amdgpu_kernel void @add_i32_varying_gfx1064(i32 addrspace(1)* %out) { 814; 815; 816; GFX7LESS-LABEL: add_i32_varying_gfx1064: 817; GFX7LESS: ; %bb.0: ; %entry 818; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 819; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 820; GFX7LESS-NEXT: s_mov_b32 m0, -1 821; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 822; GFX7LESS-NEXT: ds_add_rtn_u32 v0, v1, v0 823; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 824; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 825; GFX7LESS-NEXT: s_mov_b32 s2, -1 826; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 827; GFX7LESS-NEXT: s_endpgm 828; 829; GFX8-LABEL: add_i32_varying_gfx1064: 830; GFX8: ; %bb.0: ; %entry 831; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 832; GFX8-NEXT: v_mov_b32_e32 v2, v0 833; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 834; GFX8-NEXT: v_mov_b32_e32 v1, 0 835; GFX8-NEXT: s_mov_b64 exec, s[2:3] 836; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 837; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 838; GFX8-NEXT: s_not_b64 exec, exec 839; GFX8-NEXT: v_mov_b32_e32 v2, 0 840; GFX8-NEXT: s_not_b64 exec, exec 841; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 842; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 843; GFX8-NEXT: s_nop 1 844; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 845; GFX8-NEXT: s_nop 1 846; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 847; GFX8-NEXT: s_nop 1 848; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 849; GFX8-NEXT: s_nop 1 850; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 851; 
GFX8-NEXT: s_nop 1 852; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 853; GFX8-NEXT: v_readlane_b32 s4, v2, 63 854; GFX8-NEXT: s_nop 0 855; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 856; GFX8-NEXT: s_mov_b64 exec, s[2:3] 857; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 858; GFX8-NEXT: ; implicit-def: $vgpr0 859; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 860; GFX8-NEXT: s_cbranch_execz BB4_2 861; GFX8-NEXT: ; %bb.1: 862; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 863; GFX8-NEXT: v_mov_b32_e32 v3, s4 864; GFX8-NEXT: s_mov_b32 m0, -1 865; GFX8-NEXT: s_waitcnt lgkmcnt(0) 866; GFX8-NEXT: ds_add_rtn_u32 v0, v0, v3 867; GFX8-NEXT: s_waitcnt lgkmcnt(0) 868; GFX8-NEXT: BB4_2: 869; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 870; GFX8-NEXT: s_waitcnt lgkmcnt(0) 871; GFX8-NEXT: v_readfirstlane_b32 s2, v0 872; GFX8-NEXT: v_mov_b32_e32 v0, v1 873; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0 874; GFX8-NEXT: s_mov_b32 s3, 0xf000 875; GFX8-NEXT: s_mov_b32 s2, -1 876; GFX8-NEXT: s_nop 0 877; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 878; GFX8-NEXT: s_endpgm 879; 880; GFX9-LABEL: add_i32_varying_gfx1064: 881; GFX9: ; %bb.0: ; %entry 882; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 883; GFX9-NEXT: v_mov_b32_e32 v2, v0 884; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 885; GFX9-NEXT: v_mov_b32_e32 v1, 0 886; GFX9-NEXT: s_mov_b64 exec, s[2:3] 887; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 888; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 889; GFX9-NEXT: s_not_b64 exec, exec 890; GFX9-NEXT: v_mov_b32_e32 v2, 0 891; GFX9-NEXT: s_not_b64 exec, exec 892; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 893; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 894; GFX9-NEXT: s_nop 1 895; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 896; GFX9-NEXT: s_nop 1 897; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 898; GFX9-NEXT: 
s_nop 1 899; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 900; GFX9-NEXT: s_nop 1 901; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 902; GFX9-NEXT: s_nop 1 903; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 904; GFX9-NEXT: v_readlane_b32 s4, v2, 63 905; GFX9-NEXT: s_nop 0 906; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 907; GFX9-NEXT: s_mov_b64 exec, s[2:3] 908; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 909; GFX9-NEXT: ; implicit-def: $vgpr0 910; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 911; GFX9-NEXT: s_cbranch_execz BB4_2 912; GFX9-NEXT: ; %bb.1: 913; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 914; GFX9-NEXT: v_mov_b32_e32 v3, s4 915; GFX9-NEXT: s_waitcnt lgkmcnt(0) 916; GFX9-NEXT: ds_add_rtn_u32 v0, v0, v3 917; GFX9-NEXT: s_waitcnt lgkmcnt(0) 918; GFX9-NEXT: BB4_2: 919; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 920; GFX9-NEXT: s_waitcnt lgkmcnt(0) 921; GFX9-NEXT: v_readfirstlane_b32 s2, v0 922; GFX9-NEXT: v_mov_b32_e32 v0, v1 923; GFX9-NEXT: v_add_u32_e32 v0, s2, v0 924; GFX9-NEXT: s_mov_b32 s3, 0xf000 925; GFX9-NEXT: s_mov_b32 s2, -1 926; GFX9-NEXT: s_nop 0 927; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 928; GFX9-NEXT: s_endpgm 929; 930; GFX1064-LABEL: add_i32_varying_gfx1064: 931; GFX1064: ; %bb.0: ; %entry 932; GFX1064-NEXT: v_mov_b32_e32 v1, v0 933; GFX1064-NEXT: s_not_b64 exec, exec 934; GFX1064-NEXT: v_mov_b32_e32 v1, 0 935; GFX1064-NEXT: s_not_b64 exec, exec 936; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 937; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 938; GFX1064-NEXT: v_mov_b32_e32 v3, 0 939; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 940; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 941; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 
bound_ctrl:0 942; GFX1064-NEXT: v_mov_b32_e32 v2, v1 943; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 944; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 945; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 946; GFX1064-NEXT: v_mov_b32_e32 v2, s4 947; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 948; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 949; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 950; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 951; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 952; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 953; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 954; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 955; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 956; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 957; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 958; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 959; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 960; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 961; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 962; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0 963; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 964; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 965; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 966; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 967; GFX1064-NEXT: s_mov_b32 s2, -1 968; GFX1064-NEXT: ; implicit-def: $vgpr0 969; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 970; GFX1064-NEXT: s_cbranch_execz BB4_2 971; GFX1064-NEXT: ; %bb.1: 972; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo 973; GFX1064-NEXT: v_mov_b32_e32 v4, s7 974; GFX1064-NEXT: s_mov_b32 s3, s7 975; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 976; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 977; GFX1064-NEXT: ds_add_rtn_u32 v0, v7, v4 978; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 979; GFX1064-NEXT: buffer_gl0_inv 980; GFX1064-NEXT: BB4_2: 981; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 982; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 983; GFX1064-NEXT: 
v_readfirstlane_b32 s3, v0 984; GFX1064-NEXT: v_mov_b32_e32 v0, v3 985; GFX1064-NEXT: v_add_nc_u32_e32 v0, s3, v0 986; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 987; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 988; GFX1064-NEXT: s_nop 0 989; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 990; GFX1064-NEXT: s_endpgm 991; 992; GFX1032-LABEL: add_i32_varying_gfx1064: 993; GFX1032: ; %bb.0: ; %entry 994; GFX1032-NEXT: v_mov_b32_e32 v1, v0 995; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 996; GFX1032-NEXT: v_mov_b32_e32 v1, 0 997; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 998; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 999; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 1000; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 1001; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 1002; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 1003; GFX1032-NEXT: v_mov_b32_e32 v2, v1 1004; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 1005; GFX1032-NEXT: s_mov_b32 exec_lo, s2 1006; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1007; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 1008; GFX1032-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 1009; GFX1032-NEXT: v_mov_b32_e32 v3, 0 1010; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 1011; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 1012; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 1013; GFX1032-NEXT: s_mov_b32 exec_lo, s2 1014; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 1015; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 1016; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 1017; GFX1032-NEXT: s_mov_b32 exec_lo, s2 1018; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 1019; GFX1032-NEXT: s_mov_b32 s2, -1 1020; GFX1032-NEXT: ; implicit-def: $vgpr0 1021; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 1022; GFX1032-NEXT: s_cbranch_execz BB4_2 
1023; GFX1032-NEXT: ; %bb.1: 1024; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo 1025; GFX1032-NEXT: v_mov_b32_e32 v4, s4 1026; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1027; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 1028; GFX1032-NEXT: ds_add_rtn_u32 v0, v7, v4 1029; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1030; GFX1032-NEXT: buffer_gl0_inv 1031; GFX1032-NEXT: BB4_2: 1032; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 1033; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 1034; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 1035; GFX1032-NEXT: v_mov_b32_e32 v0, v3 1036; GFX1032-NEXT: v_add_nc_u32_e32 v0, s3, v0 1037; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 1038; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1039; GFX1032-NEXT: s_nop 0 1040; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 1041; GFX1032-NEXT: s_endpgm 1042entry: 1043 %lane = call i32 @llvm.amdgcn.workitem.id.x() 1044 %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %lane acq_rel 1045 store i32 %old, i32 addrspace(1)* %out 1046 ret void 1047} 1048 1049define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) { 1050; 1051; 1052; GFX7LESS-LABEL: add_i64_constant: 1053; GFX7LESS: ; %bb.0: ; %entry 1054; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec 1055; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1056; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 1057; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 1058; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1059; GFX7LESS-NEXT: ; implicit-def: $vgpr1_vgpr2 1060; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc 1061; GFX7LESS-NEXT: s_cbranch_execz BB5_2 1062; GFX7LESS-NEXT: ; %bb.1: 1063; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 1064; GFX7LESS-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 1065; GFX7LESS-NEXT: v_mul_hi_u32_u24_e64 v2, s4, 5 1066; GFX7LESS-NEXT: v_mul_u32_u24_e64 v1, s4, 5 1067; GFX7LESS-NEXT: s_mov_b32 m0, -1 1068; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1069; GFX7LESS-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] 1070; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1071; 
GFX7LESS-NEXT: BB5_2: 1072; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] 1073; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1074; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v1 1075; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v2 1076; GFX7LESS-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v0 1077; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1078; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 1079; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4 1080; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s2, v0 1081; GFX7LESS-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc 1082; GFX7LESS-NEXT: s_mov_b32 s2, -1 1083; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1084; GFX7LESS-NEXT: s_endpgm 1085; 1086; GFX8-LABEL: add_i64_constant: 1087; GFX8: ; %bb.0: ; %entry 1088; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1089; GFX8-NEXT: s_mov_b64 s[4:5], exec 1090; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 1091; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 1092; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1093; GFX8-NEXT: ; implicit-def: $vgpr1_vgpr2 1094; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 1095; GFX8-NEXT: s_cbranch_execz BB5_2 1096; GFX8-NEXT: ; %bb.1: 1097; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 1098; GFX8-NEXT: v_mul_hi_u32_u24_e64 v2, s4, 5 1099; GFX8-NEXT: v_mul_u32_u24_e64 v1, s4, 5 1100; GFX8-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 1101; GFX8-NEXT: s_mov_b32 m0, -1 1102; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1103; GFX8-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] 1104; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1105; GFX8-NEXT: BB5_2: 1106; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 1107; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1108; GFX8-NEXT: v_readfirstlane_b32 s2, v1 1109; GFX8-NEXT: v_readfirstlane_b32 s3, v2 1110; GFX8-NEXT: v_mov_b32_e32 v1, s2 1111; GFX8-NEXT: v_mov_b32_e32 v2, s3 1112; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v0, 5, v[1:2] 1113; GFX8-NEXT: s_mov_b32 s3, 0xf000 1114; GFX8-NEXT: s_mov_b32 s2, -1 1115; GFX8-NEXT: s_nop 2 1116; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1117; GFX8-NEXT: s_endpgm 1118; 1119; GFX9-LABEL: 
add_i64_constant: 1120; GFX9: ; %bb.0: ; %entry 1121; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1122; GFX9-NEXT: s_mov_b64 s[4:5], exec 1123; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 1124; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 1125; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1126; GFX9-NEXT: ; implicit-def: $vgpr1_vgpr2 1127; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 1128; GFX9-NEXT: s_cbranch_execz BB5_2 1129; GFX9-NEXT: ; %bb.1: 1130; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 1131; GFX9-NEXT: v_mul_hi_u32_u24_e64 v2, s4, 5 1132; GFX9-NEXT: v_mul_u32_u24_e64 v1, s4, 5 1133; GFX9-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 1134; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1135; GFX9-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] 1136; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1137; GFX9-NEXT: BB5_2: 1138; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 1139; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1140; GFX9-NEXT: v_readfirstlane_b32 s2, v1 1141; GFX9-NEXT: v_readfirstlane_b32 s3, v2 1142; GFX9-NEXT: v_mov_b32_e32 v1, s2 1143; GFX9-NEXT: v_mov_b32_e32 v2, s3 1144; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v0, 5, v[1:2] 1145; GFX9-NEXT: s_mov_b32 s3, 0xf000 1146; GFX9-NEXT: s_mov_b32 s2, -1 1147; GFX9-NEXT: s_nop 2 1148; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1149; GFX9-NEXT: s_endpgm 1150; 1151; GFX1064-LABEL: add_i64_constant: 1152; GFX1064: ; %bb.0: ; %entry 1153; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1154; GFX1064-NEXT: s_mov_b64 s[4:5], exec 1155; GFX1064-NEXT: ; implicit-def: $vgpr1_vgpr2 1156; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 1157; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s5, v0 1158; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1159; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc 1160; GFX1064-NEXT: s_cbranch_execz BB5_2 1161; GFX1064-NEXT: ; %bb.1: 1162; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 1163; GFX1064-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 1164; GFX1064-NEXT: v_mul_hi_u32_u24_e64 v2, s4, 5 1165; GFX1064-NEXT: v_mul_u32_u24_e64 v1, s4, 5 1166; 
GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1167; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 1168; GFX1064-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] 1169; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1170; GFX1064-NEXT: buffer_gl0_inv 1171; GFX1064-NEXT: BB5_2: 1172; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 1173; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] 1174; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 1175; GFX1064-NEXT: v_readfirstlane_b32 s3, v2 1176; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v0, 5, s[2:3] 1177; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 1178; GFX1064-NEXT: s_mov_b32 s2, -1 1179; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1180; GFX1064-NEXT: s_nop 1 1181; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1182; GFX1064-NEXT: s_endpgm 1183; 1184; GFX1032-LABEL: add_i64_constant: 1185; GFX1032: ; %bb.0: ; %entry 1186; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1187; GFX1032-NEXT: s_mov_b32 s3, exec_lo 1188; GFX1032-NEXT: ; implicit-def: $vgpr1_vgpr2 1189; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s3, 0 1190; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 1191; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo 1192; GFX1032-NEXT: s_cbranch_execz BB5_2 1193; GFX1032-NEXT: ; %bb.1: 1194; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 1195; GFX1032-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 1196; GFX1032-NEXT: v_mul_hi_u32_u24_e64 v2, s3, 5 1197; GFX1032-NEXT: v_mul_u32_u24_e64 v1, s3, 5 1198; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1199; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 1200; GFX1032-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] 1201; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1202; GFX1032-NEXT: buffer_gl0_inv 1203; GFX1032-NEXT: BB5_2: 1204; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 1205; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 1206; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 1207; GFX1032-NEXT: v_readfirstlane_b32 s3, v2 1208; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s2, v0, 5, s[2:3] 1209; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 1210; GFX1032-NEXT: s_mov_b32 s2, -1 1211; GFX1032-NEXT: 
s_waitcnt lgkmcnt(0) 1212; GFX1032-NEXT: s_nop 1 1213; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1214; GFX1032-NEXT: s_endpgm 1215entry: 1216 %old = atomicrmw add i64 addrspace(3)* @local_var64, i64 5 acq_rel 1217 store i64 %old, i64 addrspace(1)* %out 1218 ret void 1219} 1220 1221define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 %additive) { 1222; 1223; 1224; GFX7LESS-LABEL: add_i64_uniform: 1225; GFX7LESS: ; %bb.0: ; %entry 1226; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec 1227; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 1228; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 1229; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s7, v0 1230; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1231; GFX7LESS-NEXT: ; implicit-def: $vgpr1_vgpr2 1232; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc 1233; GFX7LESS-NEXT: s_cbranch_execz BB6_2 1234; GFX7LESS-NEXT: ; %bb.1: 1235; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 1236; GFX7LESS-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 1237; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1238; GFX7LESS-NEXT: s_mul_i32 s7, s3, s6 1239; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6 1240; GFX7LESS-NEXT: v_mul_hi_u32 v1, s2, v1 1241; GFX7LESS-NEXT: s_mul_i32 s6, s2, s6 1242; GFX7LESS-NEXT: v_add_i32_e32 v2, vcc, s7, v1 1243; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6 1244; GFX7LESS-NEXT: s_mov_b32 m0, -1 1245; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1246; GFX7LESS-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] 1247; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1248; GFX7LESS-NEXT: BB6_2: 1249; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] 1250; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 1251; GFX7LESS-NEXT: s_mov_b32 s6, -1 1252; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1253; GFX7LESS-NEXT: s_mov_b32 s4, s0 1254; GFX7LESS-NEXT: s_mov_b32 s5, s1 1255; GFX7LESS-NEXT: v_readfirstlane_b32 s0, v1 1256; GFX7LESS-NEXT: v_readfirstlane_b32 s1, v2 1257; GFX7LESS-NEXT: v_mul_lo_u32 v1, s3, v0 1258; GFX7LESS-NEXT: v_mul_hi_u32 v2, s2, v0 1259; GFX7LESS-NEXT: v_mul_lo_u32 v0, s2, v0 
1260; GFX7LESS-NEXT: v_add_i32_e32 v1, vcc, v2, v1 1261; GFX7LESS-NEXT: v_mov_b32_e32 v2, s1 1262; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s0, v0 1263; GFX7LESS-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc 1264; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1265; GFX7LESS-NEXT: s_endpgm 1266; 1267; GFX8-LABEL: add_i64_uniform: 1268; GFX8: ; %bb.0: ; %entry 1269; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1270; GFX8-NEXT: s_mov_b64 s[6:7], exec 1271; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 1272; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 1273; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1274; GFX8-NEXT: ; implicit-def: $vgpr1_vgpr2 1275; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 1276; GFX8-NEXT: s_cbranch_execz BB6_2 1277; GFX8-NEXT: ; %bb.1: 1278; GFX8-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 1279; GFX8-NEXT: v_mov_b32_e32 v1, s6 1280; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1281; GFX8-NEXT: v_mul_hi_u32 v1, s2, v1 1282; GFX8-NEXT: s_mul_i32 s7, s3, s6 1283; GFX8-NEXT: s_mul_i32 s6, s2, s6 1284; GFX8-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 1285; GFX8-NEXT: v_add_u32_e32 v2, vcc, s7, v1 1286; GFX8-NEXT: v_mov_b32_e32 v1, s6 1287; GFX8-NEXT: s_mov_b32 m0, -1 1288; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1289; GFX8-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] 1290; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1291; GFX8-NEXT: BB6_2: 1292; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 1293; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1294; GFX8-NEXT: s_mov_b32 s4, s0 1295; GFX8-NEXT: v_readfirstlane_b32 s0, v1 1296; GFX8-NEXT: v_mul_lo_u32 v1, s3, v0 1297; GFX8-NEXT: v_mul_hi_u32 v3, s2, v0 1298; GFX8-NEXT: v_mul_lo_u32 v0, s2, v0 1299; GFX8-NEXT: s_mov_b32 s5, s1 1300; GFX8-NEXT: v_readfirstlane_b32 s1, v2 1301; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 1302; GFX8-NEXT: v_mov_b32_e32 v2, s1 1303; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 1304; GFX8-NEXT: s_mov_b32 s7, 0xf000 1305; GFX8-NEXT: s_mov_b32 s6, -1 1306; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc 1307; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 
1308; GFX8-NEXT: s_endpgm 1309; 1310; GFX9-LABEL: add_i64_uniform: 1311; GFX9: ; %bb.0: ; %entry 1312; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1313; GFX9-NEXT: s_mov_b64 s[6:7], exec 1314; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 1315; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 1316; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1317; GFX9-NEXT: ; implicit-def: $vgpr1_vgpr2 1318; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 1319; GFX9-NEXT: s_cbranch_execz BB6_2 1320; GFX9-NEXT: ; %bb.1: 1321; GFX9-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 1322; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1323; GFX9-NEXT: s_mul_i32 s7, s3, s6 1324; GFX9-NEXT: s_mul_hi_u32 s8, s2, s6 1325; GFX9-NEXT: s_add_i32 s8, s8, s7 1326; GFX9-NEXT: s_mul_i32 s6, s2, s6 1327; GFX9-NEXT: v_mov_b32_e32 v1, s6 1328; GFX9-NEXT: v_mov_b32_e32 v2, s8 1329; GFX9-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 1330; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1331; GFX9-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] 1332; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1333; GFX9-NEXT: BB6_2: 1334; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 1335; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1336; GFX9-NEXT: v_mul_lo_u32 v3, s3, v0 1337; GFX9-NEXT: v_mul_hi_u32 v4, s2, v0 1338; GFX9-NEXT: v_mul_lo_u32 v0, s2, v0 1339; GFX9-NEXT: s_mov_b32 s4, s0 1340; GFX9-NEXT: v_readfirstlane_b32 s0, v1 1341; GFX9-NEXT: s_mov_b32 s5, s1 1342; GFX9-NEXT: v_readfirstlane_b32 s1, v2 1343; GFX9-NEXT: v_add_u32_e32 v1, v4, v3 1344; GFX9-NEXT: v_mov_b32_e32 v2, s1 1345; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 1346; GFX9-NEXT: s_mov_b32 s7, 0xf000 1347; GFX9-NEXT: s_mov_b32 s6, -1 1348; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc 1349; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1350; GFX9-NEXT: s_endpgm 1351; 1352; GFX1064-LABEL: add_i64_uniform: 1353; GFX1064: ; %bb.0: ; %entry 1354; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1355; GFX1064-NEXT: s_mov_b64 s[6:7], exec 1356; GFX1064-NEXT: ; implicit-def: $vgpr1_vgpr2 1357; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 1358; 
GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s7, v0 1359; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1360; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 1361; GFX1064-NEXT: s_cbranch_execz BB6_2 1362; GFX1064-NEXT: ; %bb.1: 1363; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 1364; GFX1064-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 1365; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1366; GFX1064-NEXT: s_mul_i32 s7, s3, s6 1367; GFX1064-NEXT: s_mul_hi_u32 s8, s2, s6 1368; GFX1064-NEXT: s_mul_i32 s6, s2, s6 1369; GFX1064-NEXT: s_add_i32 s8, s8, s7 1370; GFX1064-NEXT: v_mov_b32_e32 v1, s6 1371; GFX1064-NEXT: v_mov_b32_e32 v2, s8 1372; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1373; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 1374; GFX1064-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] 1375; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1376; GFX1064-NEXT: buffer_gl0_inv 1377; GFX1064-NEXT: BB6_2: 1378; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 1379; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 1380; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1381; GFX1064-NEXT: v_mul_lo_u32 v3, s3, v0 1382; GFX1064-NEXT: v_mul_hi_u32 v4, s2, v0 1383; GFX1064-NEXT: v_mul_lo_u32 v0, s2, v0 1384; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 1385; GFX1064-NEXT: v_readfirstlane_b32 s4, v2 1386; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 1387; GFX1064-NEXT: v_add_nc_u32_e32 v1, v4, v3 1388; GFX1064-NEXT: v_add_co_u32_e64 v0, vcc, s2, v0 1389; GFX1064-NEXT: s_mov_b32 s2, -1 1390; GFX1064-NEXT: v_add_co_ci_u32_e32 v1, vcc, s4, v1, vcc 1391; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1392; GFX1064-NEXT: s_endpgm 1393; 1394; GFX1032-LABEL: add_i64_uniform: 1395; GFX1032: ; %bb.0: ; %entry 1396; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1397; GFX1032-NEXT: s_mov_b32 s5, exec_lo 1398; GFX1032-NEXT: ; implicit-def: $vgpr1_vgpr2 1399; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s5, 0 1400; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 1401; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo 1402; GFX1032-NEXT: s_cbranch_execz BB6_2 1403; 
GFX1032-NEXT: ; %bb.1: 1404; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s5 1405; GFX1032-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 1406; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1407; GFX1032-NEXT: s_mul_i32 s6, s3, s5 1408; GFX1032-NEXT: s_mul_hi_u32 s7, s2, s5 1409; GFX1032-NEXT: s_mul_i32 s5, s2, s5 1410; GFX1032-NEXT: s_add_i32 s7, s7, s6 1411; GFX1032-NEXT: v_mov_b32_e32 v1, s5 1412; GFX1032-NEXT: v_mov_b32_e32 v2, s7 1413; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1414; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 1415; GFX1032-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] 1416; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1417; GFX1032-NEXT: buffer_gl0_inv 1418; GFX1032-NEXT: BB6_2: 1419; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 1420; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 1421; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1422; GFX1032-NEXT: v_mul_lo_u32 v3, s3, v0 1423; GFX1032-NEXT: v_mul_hi_u32 v4, s2, v0 1424; GFX1032-NEXT: v_mul_lo_u32 v0, s2, v0 1425; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 1426; GFX1032-NEXT: v_readfirstlane_b32 s4, v2 1427; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 1428; GFX1032-NEXT: v_add_nc_u32_e32 v1, v4, v3 1429; GFX1032-NEXT: v_add_co_u32_e64 v0, vcc_lo, s2, v0 1430; GFX1032-NEXT: s_mov_b32 s2, -1 1431; GFX1032-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s4, v1, vcc_lo 1432; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1433; GFX1032-NEXT: s_endpgm 1434entry: 1435 %old = atomicrmw add i64 addrspace(3)* @local_var64, i64 %additive acq_rel 1436 store i64 %old, i64 addrspace(1)* %out 1437 ret void 1438} 1439; NOTE(review): no RUN line in this file enables a GCN check prefix, so the three GCN-NOT directives below are never evaluated by FileCheck. The exhaustive per-target -NEXT checks for add_i64_varying already contain none of these instructions. 1440; GCN-NOT: v_mbcnt_lo_u32_b32 1441; GCN-NOT: v_mbcnt_hi_u32_b32 1442; GCN-NOT: s_bcnt1_i32_b64 1443define amdgpu_kernel void @add_i64_varying(i64 addrspace(1)* %out) { 1444; 1445; 1446; GFX7LESS-LABEL: add_i64_varying: 1447; GFX7LESS: ; %bb.0: ; %entry 1448; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1449; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 1450; GFX7LESS-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 1451; GFX7LESS-NEXT: s_mov_b32 m0, -1 1452; 
GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1453; GFX7LESS-NEXT: ds_add_rtn_u64 v[0:1], v2, v[0:1] 1454; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1455; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 1456; GFX7LESS-NEXT: s_mov_b32 s2, -1 1457; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1458; GFX7LESS-NEXT: s_endpgm 1459; 1460; GFX8-LABEL: add_i64_varying: 1461; GFX8: ; %bb.0: ; %entry 1462; GFX8-NEXT: v_mov_b32_e32 v1, 0 1463; GFX8-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 1464; GFX8-NEXT: s_mov_b32 m0, -1 1465; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1466; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1467; GFX8-NEXT: ds_add_rtn_u64 v[0:1], v2, v[0:1] 1468; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1469; GFX8-NEXT: s_mov_b32 s3, 0xf000 1470; GFX8-NEXT: s_mov_b32 s2, -1 1471; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1472; GFX8-NEXT: s_endpgm 1473; 1474; GFX9-LABEL: add_i64_varying: 1475; GFX9: ; %bb.0: ; %entry 1476; GFX9-NEXT: v_mov_b32_e32 v1, 0 1477; GFX9-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 1478; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1479; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1480; GFX9-NEXT: ds_add_rtn_u64 v[0:1], v2, v[0:1] 1481; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1482; GFX9-NEXT: s_mov_b32 s3, 0xf000 1483; GFX9-NEXT: s_mov_b32 s2, -1 1484; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1485; GFX9-NEXT: s_endpgm 1486; 1487; GFX1064-LABEL: add_i64_varying: 1488; GFX1064: ; %bb.0: ; %entry 1489; GFX1064-NEXT: v_mov_b32_e32 v1, 0 1490; GFX1064-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 1491; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1492; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 1493; GFX1064-NEXT: s_mov_b32 s2, -1 1494; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1495; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 1496; GFX1064-NEXT: ds_add_rtn_u64 v[0:1], v2, v[0:1] 1497; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1498; GFX1064-NEXT: buffer_gl0_inv 1499; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1500; GFX1064-NEXT: s_endpgm 1501; 1502; 
GFX1032-LABEL: add_i64_varying: 1503; GFX1032: ; %bb.0: ; %entry 1504; GFX1032-NEXT: v_mov_b32_e32 v1, 0 1505; GFX1032-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 1506; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1507; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 1508; GFX1032-NEXT: s_mov_b32 s2, -1 1509; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1510; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 1511; GFX1032-NEXT: ds_add_rtn_u64 v[0:1], v2, v[0:1] 1512; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1513; GFX1032-NEXT: buffer_gl0_inv 1514; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1515; GFX1032-NEXT: s_endpgm 1516entry: 1517 %lane = call i32 @llvm.amdgcn.workitem.id.x() 1518 %zext = zext i32 %lane to i64 1519 %old = atomicrmw add i64 addrspace(3)* @local_var64, i64 %zext acq_rel 1520 store i64 %old, i64 addrspace(1)* %out 1521 ret void 1522} 1523 1524define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out) { 1525; 1526; 1527; GFX7LESS-LABEL: sub_i32_constant: 1528; GFX7LESS: ; %bb.0: ; %entry 1529; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec 1530; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1531; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 1532; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 1533; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1534; GFX7LESS-NEXT: ; implicit-def: $vgpr1 1535; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc 1536; GFX7LESS-NEXT: s_cbranch_execz BB8_2 1537; GFX7LESS-NEXT: ; %bb.1: 1538; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1539; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 1540; GFX7LESS-NEXT: v_mul_u32_u24_e64 v2, s2, 5 1541; GFX7LESS-NEXT: s_mov_b32 m0, -1 1542; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1543; GFX7LESS-NEXT: ds_sub_rtn_u32 v1, v1, v2 1544; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1545; GFX7LESS-NEXT: BB8_2: 1546; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] 1547; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1548; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v1 1549; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1550; 
GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 1551; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 1552; GFX7LESS-NEXT: s_mov_b32 s2, -1 1553; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 1554; GFX7LESS-NEXT: s_endpgm 1555; 1556; GFX8-LABEL: sub_i32_constant: 1557; GFX8: ; %bb.0: ; %entry 1558; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1559; GFX8-NEXT: s_mov_b64 s[2:3], exec 1560; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1561; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 1562; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1563; GFX8-NEXT: ; implicit-def: $vgpr1 1564; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 1565; GFX8-NEXT: s_cbranch_execz BB8_2 1566; GFX8-NEXT: ; %bb.1: 1567; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1568; GFX8-NEXT: v_mul_u32_u24_e64 v1, s2, 5 1569; GFX8-NEXT: v_mov_b32_e32 v2, local_var32@abs32@lo 1570; GFX8-NEXT: s_mov_b32 m0, -1 1571; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1572; GFX8-NEXT: ds_sub_rtn_u32 v1, v2, v1 1573; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1574; GFX8-NEXT: BB8_2: 1575; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 1576; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1577; GFX8-NEXT: v_readfirstlane_b32 s2, v1 1578; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1579; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s2, v0 1580; GFX8-NEXT: s_mov_b32 s3, 0xf000 1581; GFX8-NEXT: s_mov_b32 s2, -1 1582; GFX8-NEXT: s_nop 0 1583; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 1584; GFX8-NEXT: s_endpgm 1585; 1586; GFX9-LABEL: sub_i32_constant: 1587; GFX9: ; %bb.0: ; %entry 1588; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1589; GFX9-NEXT: s_mov_b64 s[2:3], exec 1590; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1591; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 1592; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1593; GFX9-NEXT: ; implicit-def: $vgpr1 1594; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 1595; GFX9-NEXT: s_cbranch_execz BB8_2 1596; GFX9-NEXT: ; %bb.1: 1597; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1598; GFX9-NEXT: v_mul_u32_u24_e64 v1, s2, 5 1599; GFX9-NEXT: v_mov_b32_e32 v2, local_var32@abs32@lo 1600; 
GFX9-NEXT: s_waitcnt lgkmcnt(0) 1601; GFX9-NEXT: ds_sub_rtn_u32 v1, v2, v1 1602; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1603; GFX9-NEXT: BB8_2: 1604; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 1605; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1606; GFX9-NEXT: v_readfirstlane_b32 s2, v1 1607; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1608; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 1609; GFX9-NEXT: s_mov_b32 s3, 0xf000 1610; GFX9-NEXT: s_mov_b32 s2, -1 1611; GFX9-NEXT: s_nop 0 1612; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 1613; GFX9-NEXT: s_endpgm 1614; 1615; GFX1064-LABEL: sub_i32_constant: 1616; GFX1064: ; %bb.0: ; %entry 1617; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1618; GFX1064-NEXT: s_mov_b64 s[2:3], exec 1619; GFX1064-NEXT: ; implicit-def: $vgpr1 1620; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 1621; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s3, v0 1622; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1623; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 1624; GFX1064-NEXT: s_cbranch_execz BB8_2 1625; GFX1064-NEXT: ; %bb.1: 1626; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1627; GFX1064-NEXT: v_mov_b32_e32 v2, local_var32@abs32@lo 1628; GFX1064-NEXT: v_mul_u32_u24_e64 v1, s2, 5 1629; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1630; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 1631; GFX1064-NEXT: ds_sub_rtn_u32 v1, v2, v1 1632; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1633; GFX1064-NEXT: buffer_gl0_inv 1634; GFX1064-NEXT: BB8_2: 1635; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 1636; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 1637; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 1638; GFX1064-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1639; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 1640; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s2, v0 1641; GFX1064-NEXT: s_mov_b32 s2, -1 1642; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1643; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 1644; GFX1064-NEXT: s_endpgm 1645; 1646; GFX1032-LABEL: sub_i32_constant: 1647; GFX1032: ; %bb.0: ; %entry 1648; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 
0x24 1649; GFX1032-NEXT: s_mov_b32 s2, exec_lo 1650; GFX1032-NEXT: ; implicit-def: $vgpr1 1651; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 1652; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 1653; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 1654; GFX1032-NEXT: s_cbranch_execz BB8_2 1655; GFX1032-NEXT: ; %bb.1: 1656; GFX1032-NEXT: s_bcnt1_i32_b32 s2, s2 1657; GFX1032-NEXT: v_mov_b32_e32 v2, local_var32@abs32@lo 1658; GFX1032-NEXT: v_mul_u32_u24_e64 v1, s2, 5 1659; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1660; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 1661; GFX1032-NEXT: ds_sub_rtn_u32 v1, v2, v1 1662; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1663; GFX1032-NEXT: buffer_gl0_inv 1664; GFX1032-NEXT: BB8_2: 1665; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 1666; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 1667; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 1668; GFX1032-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1669; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 1670; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s2, v0 1671; GFX1032-NEXT: s_mov_b32 s2, -1 1672; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1673; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 1674; GFX1032-NEXT: s_endpgm 1675entry: 1676 %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 5 acq_rel 1677 store i32 %old, i32 addrspace(1)* %out 1678 ret void 1679} 1680 1681define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 %subitive) { 1682; 1683; 1684; GFX7LESS-LABEL: sub_i32_uniform: 1685; GFX7LESS: ; %bb.0: ; %entry 1686; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec 1687; GFX7LESS-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 1688; GFX7LESS-NEXT: s_load_dword s0, s[0:1], 0xb 1689; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 1690; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 1691; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1692; GFX7LESS-NEXT: ; implicit-def: $vgpr1 1693; GFX7LESS-NEXT: s_and_saveexec_b64 s[6:7], vcc 1694; GFX7LESS-NEXT: s_cbranch_execz BB9_2 1695; GFX7LESS-NEXT: ; %bb.1: 1696; GFX7LESS-NEXT: s_bcnt1_i32_b64 s1, 
s[2:3] 1697; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1698; GFX7LESS-NEXT: s_mul_i32 s1, s0, s1 1699; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 1700; GFX7LESS-NEXT: v_mov_b32_e32 v2, s1 1701; GFX7LESS-NEXT: s_mov_b32 m0, -1 1702; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1703; GFX7LESS-NEXT: ds_sub_rtn_u32 v1, v1, v2 1704; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1705; GFX7LESS-NEXT: BB9_2: 1706; GFX7LESS-NEXT: s_or_b64 exec, exec, s[6:7] 1707; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1708; GFX7LESS-NEXT: v_readfirstlane_b32 s1, v1 1709; GFX7LESS-NEXT: v_mul_lo_u32 v0, s0, v0 1710; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 1711; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s1, v0 1712; GFX7LESS-NEXT: s_mov_b32 s6, -1 1713; GFX7LESS-NEXT: buffer_store_dword v0, off, s[4:7], 0 1714; GFX7LESS-NEXT: s_endpgm 1715; 1716; GFX8-LABEL: sub_i32_uniform: 1717; GFX8: ; %bb.0: ; %entry 1718; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 1719; GFX8-NEXT: s_load_dword s0, s[0:1], 0x2c 1720; GFX8-NEXT: s_mov_b64 s[2:3], exec 1721; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1722; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 1723; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1724; GFX8-NEXT: ; implicit-def: $vgpr1 1725; GFX8-NEXT: s_and_saveexec_b64 s[6:7], vcc 1726; GFX8-NEXT: s_cbranch_execz BB9_2 1727; GFX8-NEXT: ; %bb.1: 1728; GFX8-NEXT: s_bcnt1_i32_b64 s1, s[2:3] 1729; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1730; GFX8-NEXT: s_mul_i32 s1, s0, s1 1731; GFX8-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 1732; GFX8-NEXT: v_mov_b32_e32 v2, s1 1733; GFX8-NEXT: s_mov_b32 m0, -1 1734; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1735; GFX8-NEXT: ds_sub_rtn_u32 v1, v1, v2 1736; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1737; GFX8-NEXT: BB9_2: 1738; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] 1739; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1740; GFX8-NEXT: v_mul_lo_u32 v0, s0, v0 1741; GFX8-NEXT: v_readfirstlane_b32 s0, v1 1742; GFX8-NEXT: s_mov_b32 s7, 0xf000 1743; GFX8-NEXT: s_mov_b32 s6, -1 1744; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s0, v0 1745; GFX8-NEXT: 
buffer_store_dword v0, off, s[4:7], 0 1746; GFX8-NEXT: s_endpgm 1747; 1748; GFX9-LABEL: sub_i32_uniform: 1749; GFX9: ; %bb.0: ; %entry 1750; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 1751; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c 1752; GFX9-NEXT: s_mov_b64 s[6:7], exec 1753; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 1754; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 1755; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1756; GFX9-NEXT: ; implicit-def: $vgpr1 1757; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc 1758; GFX9-NEXT: s_cbranch_execz BB9_2 1759; GFX9-NEXT: ; %bb.1: 1760; GFX9-NEXT: s_bcnt1_i32_b64 s3, s[6:7] 1761; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1762; GFX9-NEXT: s_mul_i32 s3, s2, s3 1763; GFX9-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 1764; GFX9-NEXT: v_mov_b32_e32 v2, s3 1765; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1766; GFX9-NEXT: ds_sub_rtn_u32 v1, v1, v2 1767; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1768; GFX9-NEXT: BB9_2: 1769; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] 1770; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1771; GFX9-NEXT: v_mul_lo_u32 v0, s2, v0 1772; GFX9-NEXT: v_readfirstlane_b32 s0, v1 1773; GFX9-NEXT: s_mov_b32 s7, 0xf000 1774; GFX9-NEXT: s_mov_b32 s6, -1 1775; GFX9-NEXT: v_sub_u32_e32 v0, s0, v0 1776; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 1777; GFX9-NEXT: s_endpgm 1778; 1779; GFX1064-LABEL: sub_i32_uniform: 1780; GFX1064: ; %bb.0: ; %entry 1781; GFX1064-NEXT: s_clause 0x1 1782; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 1783; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x2c 1784; GFX1064-NEXT: s_mov_b64 s[6:7], exec 1785; GFX1064-NEXT: ; implicit-def: $vgpr1 1786; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 1787; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s7, v0 1788; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1789; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc 1790; GFX1064-NEXT: s_cbranch_execz BB9_2 1791; GFX1064-NEXT: ; %bb.1: 1792; GFX1064-NEXT: s_bcnt1_i32_b64 s3, s[6:7] 1793; GFX1064-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 1794; GFX1064-NEXT: 
s_waitcnt lgkmcnt(0) 1795; GFX1064-NEXT: s_mul_i32 s3, s2, s3 1796; GFX1064-NEXT: v_mov_b32_e32 v2, s3 1797; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1798; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 1799; GFX1064-NEXT: ds_sub_rtn_u32 v1, v1, v2 1800; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1801; GFX1064-NEXT: buffer_gl0_inv 1802; GFX1064-NEXT: BB9_2: 1803; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 1804; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] 1805; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1806; GFX1064-NEXT: v_mul_lo_u32 v0, s2, v0 1807; GFX1064-NEXT: v_readfirstlane_b32 s0, v1 1808; GFX1064-NEXT: s_mov_b32 s7, 0x31016000 1809; GFX1064-NEXT: s_mov_b32 s6, -1 1810; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s0, v0 1811; GFX1064-NEXT: buffer_store_dword v0, off, s[4:7], 0 1812; GFX1064-NEXT: s_endpgm 1813; 1814; GFX1032-LABEL: sub_i32_uniform: 1815; GFX1032: ; %bb.0: ; %entry 1816; GFX1032-NEXT: s_clause 0x1 1817; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 1818; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x2c 1819; GFX1032-NEXT: s_mov_b32 s3, exec_lo 1820; GFX1032-NEXT: ; implicit-def: $vgpr1 1821; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s3, 0 1822; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 1823; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo 1824; GFX1032-NEXT: s_cbranch_execz BB9_2 1825; GFX1032-NEXT: ; %bb.1: 1826; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s3 1827; GFX1032-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 1828; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1829; GFX1032-NEXT: s_mul_i32 s1, s2, s1 1830; GFX1032-NEXT: v_mov_b32_e32 v2, s1 1831; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1832; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 1833; GFX1032-NEXT: ds_sub_rtn_u32 v1, v1, v2 1834; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1835; GFX1032-NEXT: buffer_gl0_inv 1836; GFX1032-NEXT: BB9_2: 1837; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 1838; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 1839; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1840; GFX1032-NEXT: v_mul_lo_u32 v0, s2, v0 1841; GFX1032-NEXT: 
v_readfirstlane_b32 s0, v1 1842; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 1843; GFX1032-NEXT: s_mov_b32 s6, -1 1844; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s0, v0 1845; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0 1846; GFX1032-NEXT: s_endpgm 1847entry: 1848 %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 %subitive acq_rel 1849 store i32 %old, i32 addrspace(1)* %out 1850 ret void 1851} 1852 1853define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) { 1854; 1855; 1856; GFX7LESS-LABEL: sub_i32_varying: 1857; GFX7LESS: ; %bb.0: ; %entry 1858; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1859; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 1860; GFX7LESS-NEXT: s_mov_b32 m0, -1 1861; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1862; GFX7LESS-NEXT: ds_sub_rtn_u32 v0, v1, v0 1863; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1864; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 1865; GFX7LESS-NEXT: s_mov_b32 s2, -1 1866; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 1867; GFX7LESS-NEXT: s_endpgm 1868; 1869; GFX8-LABEL: sub_i32_varying: 1870; GFX8: ; %bb.0: ; %entry 1871; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1872; GFX8-NEXT: v_mov_b32_e32 v2, v0 1873; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 1874; GFX8-NEXT: v_mov_b32_e32 v1, 0 1875; GFX8-NEXT: s_mov_b64 exec, s[2:3] 1876; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 1877; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 1878; GFX8-NEXT: s_not_b64 exec, exec 1879; GFX8-NEXT: v_mov_b32_e32 v2, 0 1880; GFX8-NEXT: s_not_b64 exec, exec 1881; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 1882; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 1883; GFX8-NEXT: s_nop 1 1884; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 1885; GFX8-NEXT: s_nop 1 1886; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 1887; GFX8-NEXT: s_nop 1 1888; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf 
bank_mask:0xf bound_ctrl:0 1889; GFX8-NEXT: s_nop 1 1890; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 1891; GFX8-NEXT: s_nop 1 1892; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 1893; GFX8-NEXT: v_readlane_b32 s4, v2, 63 1894; GFX8-NEXT: s_nop 0 1895; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 1896; GFX8-NEXT: s_mov_b64 exec, s[2:3] 1897; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1898; GFX8-NEXT: ; implicit-def: $vgpr0 1899; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 1900; GFX8-NEXT: s_cbranch_execz BB10_2 1901; GFX8-NEXT: ; %bb.1: 1902; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 1903; GFX8-NEXT: v_mov_b32_e32 v3, s4 1904; GFX8-NEXT: s_mov_b32 m0, -1 1905; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1906; GFX8-NEXT: ds_sub_rtn_u32 v0, v0, v3 1907; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1908; GFX8-NEXT: BB10_2: 1909; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 1910; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1911; GFX8-NEXT: v_readfirstlane_b32 s2, v0 1912; GFX8-NEXT: v_mov_b32_e32 v0, v1 1913; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s2, v0 1914; GFX8-NEXT: s_mov_b32 s3, 0xf000 1915; GFX8-NEXT: s_mov_b32 s2, -1 1916; GFX8-NEXT: s_nop 0 1917; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 1918; GFX8-NEXT: s_endpgm 1919; 1920; GFX9-LABEL: sub_i32_varying: 1921; GFX9: ; %bb.0: ; %entry 1922; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1923; GFX9-NEXT: v_mov_b32_e32 v2, v0 1924; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 1925; GFX9-NEXT: v_mov_b32_e32 v1, 0 1926; GFX9-NEXT: s_mov_b64 exec, s[2:3] 1927; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 1928; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 1929; GFX9-NEXT: s_not_b64 exec, exec 1930; GFX9-NEXT: v_mov_b32_e32 v2, 0 1931; GFX9-NEXT: s_not_b64 exec, exec 1932; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 1933; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 1934; GFX9-NEXT: s_nop 1 1935; GFX9-NEXT: v_add_u32_dpp v2, v2, 
v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 1936; GFX9-NEXT: s_nop 1 1937; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 1938; GFX9-NEXT: s_nop 1 1939; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 1940; GFX9-NEXT: s_nop 1 1941; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 1942; GFX9-NEXT: s_nop 1 1943; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 1944; GFX9-NEXT: v_readlane_b32 s4, v2, 63 1945; GFX9-NEXT: s_nop 0 1946; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 1947; GFX9-NEXT: s_mov_b64 exec, s[2:3] 1948; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1949; GFX9-NEXT: ; implicit-def: $vgpr0 1950; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 1951; GFX9-NEXT: s_cbranch_execz BB10_2 1952; GFX9-NEXT: ; %bb.1: 1953; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 1954; GFX9-NEXT: v_mov_b32_e32 v3, s4 1955; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1956; GFX9-NEXT: ds_sub_rtn_u32 v0, v0, v3 1957; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1958; GFX9-NEXT: BB10_2: 1959; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 1960; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1961; GFX9-NEXT: v_readfirstlane_b32 s2, v0 1962; GFX9-NEXT: v_mov_b32_e32 v0, v1 1963; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 1964; GFX9-NEXT: s_mov_b32 s3, 0xf000 1965; GFX9-NEXT: s_mov_b32 s2, -1 1966; GFX9-NEXT: s_nop 0 1967; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 1968; GFX9-NEXT: s_endpgm 1969; 1970; GFX1064-LABEL: sub_i32_varying: 1971; GFX1064: ; %bb.0: ; %entry 1972; GFX1064-NEXT: v_mov_b32_e32 v1, v0 1973; GFX1064-NEXT: s_not_b64 exec, exec 1974; GFX1064-NEXT: v_mov_b32_e32 v1, 0 1975; GFX1064-NEXT: s_not_b64 exec, exec 1976; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 1977; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 1978; GFX1064-NEXT: v_mov_b32_e32 v3, 0 1979; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 
row_mask:0xf bank_mask:0xf bound_ctrl:0 1980; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 1981; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 1982; GFX1064-NEXT: v_mov_b32_e32 v2, v1 1983; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 1984; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 1985; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 1986; GFX1064-NEXT: v_mov_b32_e32 v2, s4 1987; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 1988; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 1989; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 1990; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 1991; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1992; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 1993; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 1994; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 1995; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 1996; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 1997; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 1998; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 1999; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 2000; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 2001; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 2002; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0 2003; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 2004; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 2005; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 2006; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2007; GFX1064-NEXT: s_mov_b32 s2, -1 2008; GFX1064-NEXT: ; implicit-def: $vgpr0 2009; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 2010; GFX1064-NEXT: s_cbranch_execz BB10_2 2011; GFX1064-NEXT: ; %bb.1: 2012; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo 2013; GFX1064-NEXT: v_mov_b32_e32 v4, s7 2014; GFX1064-NEXT: s_mov_b32 s3, s7 2015; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2016; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 2017; 
GFX1064-NEXT: ds_sub_rtn_u32 v0, v7, v4 2018; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2019; GFX1064-NEXT: buffer_gl0_inv 2020; GFX1064-NEXT: BB10_2: 2021; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 2022; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 2023; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 2024; GFX1064-NEXT: v_mov_b32_e32 v0, v3 2025; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s3, v0 2026; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 2027; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2028; GFX1064-NEXT: s_nop 0 2029; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 2030; GFX1064-NEXT: s_endpgm 2031; 2032; GFX1032-LABEL: sub_i32_varying: 2033; GFX1032: ; %bb.0: ; %entry 2034; GFX1032-NEXT: v_mov_b32_e32 v1, v0 2035; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 2036; GFX1032-NEXT: v_mov_b32_e32 v1, 0 2037; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 2038; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 2039; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 2040; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 2041; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 2042; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 2043; GFX1032-NEXT: v_mov_b32_e32 v2, v1 2044; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 2045; GFX1032-NEXT: s_mov_b32 exec_lo, s2 2046; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2047; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 2048; GFX1032-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 2049; GFX1032-NEXT: v_mov_b32_e32 v3, 0 2050; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 2051; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 2052; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 2053; GFX1032-NEXT: s_mov_b32 exec_lo, s2 2054; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 2055; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 2056; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 
2057; GFX1032-NEXT: s_mov_b32 exec_lo, s2 2058; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 2059; GFX1032-NEXT: s_mov_b32 s2, -1 2060; GFX1032-NEXT: ; implicit-def: $vgpr0 2061; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 2062; GFX1032-NEXT: s_cbranch_execz BB10_2 2063; GFX1032-NEXT: ; %bb.1: 2064; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo 2065; GFX1032-NEXT: v_mov_b32_e32 v4, s4 2066; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2067; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 2068; GFX1032-NEXT: ds_sub_rtn_u32 v0, v7, v4 2069; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2070; GFX1032-NEXT: buffer_gl0_inv 2071; GFX1032-NEXT: BB10_2: 2072; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 2073; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 2074; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 2075; GFX1032-NEXT: v_mov_b32_e32 v0, v3 2076; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s3, v0 2077; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 2078; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2079; GFX1032-NEXT: s_nop 0 2080; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 2081; GFX1032-NEXT: s_endpgm 2082entry: 2083 %lane = call i32 @llvm.amdgcn.workitem.id.x() 2084 %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 %lane acq_rel 2085 store i32 %old, i32 addrspace(1)* %out 2086 ret void 2087} 2088 2089define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) { 2090; 2091; 2092; GFX7LESS-LABEL: sub_i64_constant: 2093; GFX7LESS: ; %bb.0: ; %entry 2094; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec 2095; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 2096; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 2097; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 2098; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2099; GFX7LESS-NEXT: ; implicit-def: $vgpr1_vgpr2 2100; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc 2101; GFX7LESS-NEXT: s_cbranch_execz BB11_2 2102; GFX7LESS-NEXT: ; %bb.1: 2103; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 2104; GFX7LESS-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 2105; 
GFX7LESS-NEXT: v_mul_hi_u32_u24_e64 v2, s4, 5 2106; GFX7LESS-NEXT: v_mul_u32_u24_e64 v1, s4, 5 2107; GFX7LESS-NEXT: s_mov_b32 m0, -1 2108; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2109; GFX7LESS-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] 2110; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2111; GFX7LESS-NEXT: BB11_2: 2112; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] 2113; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2114; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v1 2115; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v2 2116; GFX7LESS-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v0 2117; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v0 2118; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 2119; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4 2120; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 2121; GFX7LESS-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc 2122; GFX7LESS-NEXT: s_mov_b32 s2, -1 2123; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2124; GFX7LESS-NEXT: s_endpgm 2125; 2126; GFX8-LABEL: sub_i64_constant: 2127; GFX8: ; %bb.0: ; %entry 2128; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2129; GFX8-NEXT: s_mov_b64 s[4:5], exec 2130; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 2131; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 2132; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2133; GFX8-NEXT: ; implicit-def: $vgpr1_vgpr2 2134; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 2135; GFX8-NEXT: s_cbranch_execz BB11_2 2136; GFX8-NEXT: ; %bb.1: 2137; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 2138; GFX8-NEXT: v_mul_hi_u32_u24_e64 v2, s4, 5 2139; GFX8-NEXT: v_mul_u32_u24_e64 v1, s4, 5 2140; GFX8-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 2141; GFX8-NEXT: s_mov_b32 m0, -1 2142; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2143; GFX8-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] 2144; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2145; GFX8-NEXT: BB11_2: 2146; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 2147; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2148; GFX8-NEXT: v_readfirstlane_b32 s3, v2 2149; GFX8-NEXT: v_readfirstlane_b32 s2, v1 2150; GFX8-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v0 2151; GFX8-NEXT: 
v_mul_u32_u24_e32 v0, 5, v0 2152; GFX8-NEXT: v_mov_b32_e32 v2, s3 2153; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s2, v0 2154; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc 2155; GFX8-NEXT: s_mov_b32 s3, 0xf000 2156; GFX8-NEXT: s_mov_b32 s2, -1 2157; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2158; GFX8-NEXT: s_endpgm 2159; 2160; GFX9-LABEL: sub_i64_constant: 2161; GFX9: ; %bb.0: ; %entry 2162; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2163; GFX9-NEXT: s_mov_b64 s[4:5], exec 2164; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 2165; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 2166; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2167; GFX9-NEXT: ; implicit-def: $vgpr1_vgpr2 2168; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 2169; GFX9-NEXT: s_cbranch_execz BB11_2 2170; GFX9-NEXT: ; %bb.1: 2171; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 2172; GFX9-NEXT: v_mul_hi_u32_u24_e64 v2, s4, 5 2173; GFX9-NEXT: v_mul_u32_u24_e64 v1, s4, 5 2174; GFX9-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 2175; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2176; GFX9-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] 2177; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2178; GFX9-NEXT: BB11_2: 2179; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 2180; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2181; GFX9-NEXT: v_readfirstlane_b32 s3, v2 2182; GFX9-NEXT: v_readfirstlane_b32 s2, v1 2183; GFX9-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v0 2184; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v0 2185; GFX9-NEXT: v_mov_b32_e32 v2, s3 2186; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s2, v0 2187; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc 2188; GFX9-NEXT: s_mov_b32 s3, 0xf000 2189; GFX9-NEXT: s_mov_b32 s2, -1 2190; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2191; GFX9-NEXT: s_endpgm 2192; 2193; GFX1064-LABEL: sub_i64_constant: 2194; GFX1064: ; %bb.0: ; %entry 2195; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2196; GFX1064-NEXT: s_mov_b64 s[4:5], exec 2197; GFX1064-NEXT: ; implicit-def: $vgpr1_vgpr2 2198; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 2199; GFX1064-NEXT: 
v_mbcnt_hi_u32_b32_e64 v0, s5, v0 2200; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2201; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc 2202; GFX1064-NEXT: s_cbranch_execz BB11_2 2203; GFX1064-NEXT: ; %bb.1: 2204; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 2205; GFX1064-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 2206; GFX1064-NEXT: v_mul_hi_u32_u24_e64 v2, s4, 5 2207; GFX1064-NEXT: v_mul_u32_u24_e64 v1, s4, 5 2208; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2209; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 2210; GFX1064-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] 2211; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2212; GFX1064-NEXT: buffer_gl0_inv 2213; GFX1064-NEXT: BB11_2: 2214; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 2215; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] 2216; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 2217; GFX1064-NEXT: v_mul_u32_u24_e32 v1, 5, v0 2218; GFX1064-NEXT: v_readfirstlane_b32 s3, v2 2219; GFX1064-NEXT: v_mul_hi_u32_u24_e32 v2, 5, v0 2220; GFX1064-NEXT: v_sub_co_u32_e64 v0, vcc, s2, v1 2221; GFX1064-NEXT: s_mov_b32 s2, -1 2222; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v2, vcc 2223; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 2224; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2225; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2226; GFX1064-NEXT: s_endpgm 2227; 2228; GFX1032-LABEL: sub_i64_constant: 2229; GFX1032: ; %bb.0: ; %entry 2230; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2231; GFX1032-NEXT: s_mov_b32 s3, exec_lo 2232; GFX1032-NEXT: ; implicit-def: $vgpr1_vgpr2 2233; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s3, 0 2234; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 2235; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo 2236; GFX1032-NEXT: s_cbranch_execz BB11_2 2237; GFX1032-NEXT: ; %bb.1: 2238; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 2239; GFX1032-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 2240; GFX1032-NEXT: v_mul_hi_u32_u24_e64 v2, s3, 5 2241; GFX1032-NEXT: v_mul_u32_u24_e64 v1, s3, 5 2242; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2243; 
GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 2244; GFX1032-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] 2245; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2246; GFX1032-NEXT: buffer_gl0_inv 2247; GFX1032-NEXT: BB11_2: 2248; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 2249; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 2250; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 2251; GFX1032-NEXT: v_mul_u32_u24_e32 v1, 5, v0 2252; GFX1032-NEXT: v_readfirstlane_b32 s3, v2 2253; GFX1032-NEXT: v_mul_hi_u32_u24_e32 v2, 5, v0 2254; GFX1032-NEXT: v_sub_co_u32_e64 v0, vcc_lo, s2, v1 2255; GFX1032-NEXT: s_mov_b32 s2, -1 2256; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v2, vcc_lo 2257; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 2258; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2259; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2260; GFX1032-NEXT: s_endpgm 2261entry: 2262 %old = atomicrmw sub i64 addrspace(3)* @local_var64, i64 5 acq_rel 2263 store i64 %old, i64 addrspace(1)* %out 2264 ret void 2265} 2266 2267define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 %subitive) { 2268; 2269; 2270; GFX7LESS-LABEL: sub_i64_uniform: 2271; GFX7LESS: ; %bb.0: ; %entry 2272; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec 2273; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 2274; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 2275; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s7, v0 2276; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2277; GFX7LESS-NEXT: ; implicit-def: $vgpr1_vgpr2 2278; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc 2279; GFX7LESS-NEXT: s_cbranch_execz BB12_2 2280; GFX7LESS-NEXT: ; %bb.1: 2281; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 2282; GFX7LESS-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 2283; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2284; GFX7LESS-NEXT: s_mul_i32 s7, s3, s6 2285; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6 2286; GFX7LESS-NEXT: v_mul_hi_u32 v1, s2, v1 2287; GFX7LESS-NEXT: s_mul_i32 s6, s2, s6 2288; GFX7LESS-NEXT: v_add_i32_e32 v2, vcc, s7, v1 2289; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6 
2290; GFX7LESS-NEXT: s_mov_b32 m0, -1 2291; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2292; GFX7LESS-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] 2293; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2294; GFX7LESS-NEXT: BB12_2: 2295; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] 2296; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 2297; GFX7LESS-NEXT: s_mov_b32 s6, -1 2298; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2299; GFX7LESS-NEXT: s_mov_b32 s4, s0 2300; GFX7LESS-NEXT: s_mov_b32 s5, s1 2301; GFX7LESS-NEXT: v_readfirstlane_b32 s0, v1 2302; GFX7LESS-NEXT: v_readfirstlane_b32 s1, v2 2303; GFX7LESS-NEXT: v_mul_lo_u32 v1, s3, v0 2304; GFX7LESS-NEXT: v_mul_hi_u32 v2, s2, v0 2305; GFX7LESS-NEXT: v_mul_lo_u32 v0, s2, v0 2306; GFX7LESS-NEXT: v_add_i32_e32 v1, vcc, v2, v1 2307; GFX7LESS-NEXT: v_mov_b32_e32 v2, s1 2308; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 2309; GFX7LESS-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc 2310; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 2311; GFX7LESS-NEXT: s_endpgm 2312; 2313; GFX8-LABEL: sub_i64_uniform: 2314; GFX8: ; %bb.0: ; %entry 2315; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 2316; GFX8-NEXT: s_mov_b64 s[6:7], exec 2317; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 2318; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 2319; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2320; GFX8-NEXT: ; implicit-def: $vgpr1_vgpr2 2321; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 2322; GFX8-NEXT: s_cbranch_execz BB12_2 2323; GFX8-NEXT: ; %bb.1: 2324; GFX8-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 2325; GFX8-NEXT: v_mov_b32_e32 v1, s6 2326; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2327; GFX8-NEXT: v_mul_hi_u32 v1, s2, v1 2328; GFX8-NEXT: s_mul_i32 s7, s3, s6 2329; GFX8-NEXT: s_mul_i32 s6, s2, s6 2330; GFX8-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 2331; GFX8-NEXT: v_add_u32_e32 v2, vcc, s7, v1 2332; GFX8-NEXT: v_mov_b32_e32 v1, s6 2333; GFX8-NEXT: s_mov_b32 m0, -1 2334; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2335; GFX8-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] 2336; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2337; GFX8-NEXT: 
BB12_2: 2338; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 2339; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2340; GFX8-NEXT: s_mov_b32 s4, s0 2341; GFX8-NEXT: v_readfirstlane_b32 s0, v1 2342; GFX8-NEXT: v_mul_lo_u32 v1, s3, v0 2343; GFX8-NEXT: v_mul_hi_u32 v3, s2, v0 2344; GFX8-NEXT: v_mul_lo_u32 v0, s2, v0 2345; GFX8-NEXT: s_mov_b32 s5, s1 2346; GFX8-NEXT: v_readfirstlane_b32 s1, v2 2347; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 2348; GFX8-NEXT: v_mov_b32_e32 v2, s1 2349; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s0, v0 2350; GFX8-NEXT: s_mov_b32 s7, 0xf000 2351; GFX8-NEXT: s_mov_b32 s6, -1 2352; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc 2353; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 2354; GFX8-NEXT: s_endpgm 2355; 2356; GFX9-LABEL: sub_i64_uniform: 2357; GFX9: ; %bb.0: ; %entry 2358; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 2359; GFX9-NEXT: s_mov_b64 s[6:7], exec 2360; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 2361; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 2362; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2363; GFX9-NEXT: ; implicit-def: $vgpr1_vgpr2 2364; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 2365; GFX9-NEXT: s_cbranch_execz BB12_2 2366; GFX9-NEXT: ; %bb.1: 2367; GFX9-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 2368; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2369; GFX9-NEXT: s_mul_i32 s7, s3, s6 2370; GFX9-NEXT: s_mul_hi_u32 s8, s2, s6 2371; GFX9-NEXT: s_add_i32 s8, s8, s7 2372; GFX9-NEXT: s_mul_i32 s6, s2, s6 2373; GFX9-NEXT: v_mov_b32_e32 v1, s6 2374; GFX9-NEXT: v_mov_b32_e32 v2, s8 2375; GFX9-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 2376; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2377; GFX9-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] 2378; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2379; GFX9-NEXT: BB12_2: 2380; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 2381; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2382; GFX9-NEXT: v_mul_lo_u32 v3, s3, v0 2383; GFX9-NEXT: v_mul_hi_u32 v4, s2, v0 2384; GFX9-NEXT: v_mul_lo_u32 v0, s2, v0 2385; GFX9-NEXT: s_mov_b32 s4, s0 2386; GFX9-NEXT: v_readfirstlane_b32 s0, v1 2387; GFX9-NEXT: 
s_mov_b32 s5, s1 2388; GFX9-NEXT: v_readfirstlane_b32 s1, v2 2389; GFX9-NEXT: v_add_u32_e32 v1, v4, v3 2390; GFX9-NEXT: v_mov_b32_e32 v2, s1 2391; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s0, v0 2392; GFX9-NEXT: s_mov_b32 s7, 0xf000 2393; GFX9-NEXT: s_mov_b32 s6, -1 2394; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc 2395; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 2396; GFX9-NEXT: s_endpgm 2397; 2398; GFX1064-LABEL: sub_i64_uniform: 2399; GFX1064: ; %bb.0: ; %entry 2400; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 2401; GFX1064-NEXT: s_mov_b64 s[6:7], exec 2402; GFX1064-NEXT: ; implicit-def: $vgpr1_vgpr2 2403; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 2404; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s7, v0 2405; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2406; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 2407; GFX1064-NEXT: s_cbranch_execz BB12_2 2408; GFX1064-NEXT: ; %bb.1: 2409; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 2410; GFX1064-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 2411; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2412; GFX1064-NEXT: s_mul_i32 s7, s3, s6 2413; GFX1064-NEXT: s_mul_hi_u32 s8, s2, s6 2414; GFX1064-NEXT: s_mul_i32 s6, s2, s6 2415; GFX1064-NEXT: s_add_i32 s8, s8, s7 2416; GFX1064-NEXT: v_mov_b32_e32 v1, s6 2417; GFX1064-NEXT: v_mov_b32_e32 v2, s8 2418; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2419; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 2420; GFX1064-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] 2421; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2422; GFX1064-NEXT: buffer_gl0_inv 2423; GFX1064-NEXT: BB12_2: 2424; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 2425; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 2426; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2427; GFX1064-NEXT: v_mul_lo_u32 v3, s3, v0 2428; GFX1064-NEXT: v_mul_hi_u32 v4, s2, v0 2429; GFX1064-NEXT: v_mul_lo_u32 v0, s2, v0 2430; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 2431; GFX1064-NEXT: v_readfirstlane_b32 s4, v2 2432; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 2433; GFX1064-NEXT: v_add_nc_u32_e32 
v1, v4, v3 2434; GFX1064-NEXT: v_sub_co_u32_e64 v0, vcc, s2, v0 2435; GFX1064-NEXT: s_mov_b32 s2, -1 2436; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s4, v1, vcc 2437; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2438; GFX1064-NEXT: s_endpgm 2439; 2440; GFX1032-LABEL: sub_i64_uniform: 2441; GFX1032: ; %bb.0: ; %entry 2442; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 2443; GFX1032-NEXT: s_mov_b32 s5, exec_lo 2444; GFX1032-NEXT: ; implicit-def: $vgpr1_vgpr2 2445; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s5, 0 2446; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 2447; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo 2448; GFX1032-NEXT: s_cbranch_execz BB12_2 2449; GFX1032-NEXT: ; %bb.1: 2450; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s5 2451; GFX1032-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 2452; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2453; GFX1032-NEXT: s_mul_i32 s6, s3, s5 2454; GFX1032-NEXT: s_mul_hi_u32 s7, s2, s5 2455; GFX1032-NEXT: s_mul_i32 s5, s2, s5 2456; GFX1032-NEXT: s_add_i32 s7, s7, s6 2457; GFX1032-NEXT: v_mov_b32_e32 v1, s5 2458; GFX1032-NEXT: v_mov_b32_e32 v2, s7 2459; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2460; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 2461; GFX1032-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] 2462; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2463; GFX1032-NEXT: buffer_gl0_inv 2464; GFX1032-NEXT: BB12_2: 2465; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 2466; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 2467; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2468; GFX1032-NEXT: v_mul_lo_u32 v3, s3, v0 2469; GFX1032-NEXT: v_mul_hi_u32 v4, s2, v0 2470; GFX1032-NEXT: v_mul_lo_u32 v0, s2, v0 2471; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 2472; GFX1032-NEXT: v_readfirstlane_b32 s4, v2 2473; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 2474; GFX1032-NEXT: v_add_nc_u32_e32 v1, v4, v3 2475; GFX1032-NEXT: v_sub_co_u32_e64 v0, vcc_lo, s2, v0 2476; GFX1032-NEXT: s_mov_b32 s2, -1 2477; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s4, v1, vcc_lo 2478; GFX1032-NEXT: 
buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2479; GFX1032-NEXT: s_endpgm 2480entry: 2481 %old = atomicrmw sub i64 addrspace(3)* @local_var64, i64 %subitive acq_rel 2482 store i64 %old, i64 addrspace(1)* %out 2483 ret void 2484} 2485; REVIEW: no RUN line at the top of this file enables a GCN check prefix (only GFX7LESS, GFX8, GFX9, GFX1064 and GFX1032 are passed to FileCheck), so the three GCN-NOT lines below are never evaluated; the per-target check sequences in @sub_i64_varying list every instruction from %bb.0 to s_endpgm and contain no v_mbcnt or s_bcnt1. 2486; GCN-NOT: v_mbcnt_lo_u32_b32 2487; GCN-NOT: v_mbcnt_hi_u32_b32 2488; GCN-NOT: s_bcnt1_i32_b64 2489define amdgpu_kernel void @sub_i64_varying(i64 addrspace(1)* %out) { 2490; Varying 64-bit operand: each lane subtracts its own zero-extended workitem id from @local_var64 (acq_rel) and stores the returned old value to %out. 2491; 2492; GFX7LESS-LABEL: sub_i64_varying: 2493; GFX7LESS: ; %bb.0: ; %entry 2494; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 2495; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 2496; GFX7LESS-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 2497; GFX7LESS-NEXT: s_mov_b32 m0, -1 2498; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2499; GFX7LESS-NEXT: ds_sub_rtn_u64 v[0:1], v2, v[0:1] 2500; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2501; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 2502; GFX7LESS-NEXT: s_mov_b32 s2, -1 2503; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2504; GFX7LESS-NEXT: s_endpgm 2505; 2506; GFX8-LABEL: sub_i64_varying: 2507; GFX8: ; %bb.0: ; %entry 2508; GFX8-NEXT: v_mov_b32_e32 v1, 0 2509; GFX8-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 2510; GFX8-NEXT: s_mov_b32 m0, -1 2511; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2512; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2513; GFX8-NEXT: ds_sub_rtn_u64 v[0:1], v2, v[0:1] 2514; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2515; GFX8-NEXT: s_mov_b32 s3, 0xf000 2516; GFX8-NEXT: s_mov_b32 s2, -1 2517; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2518; GFX8-NEXT: s_endpgm 2519; 2520; GFX9-LABEL: sub_i64_varying: 2521; GFX9: ; %bb.0: ; %entry 2522; GFX9-NEXT: v_mov_b32_e32 v1, 0 2523; GFX9-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 2524; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2525; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2526; GFX9-NEXT: ds_sub_rtn_u64 v[0:1], v2, v[0:1] 2527; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2528; GFX9-NEXT: s_mov_b32 s3, 0xf000 2529; GFX9-NEXT: s_mov_b32 s2, -1 2530; GFX9-NEXT: buffer_store_dwordx2 v[0:1], 
off, s[0:3], 0 2531; GFX9-NEXT: s_endpgm 2532; 2533; GFX1064-LABEL: sub_i64_varying: 2534; GFX1064: ; %bb.0: ; %entry 2535; GFX1064-NEXT: v_mov_b32_e32 v1, 0 2536; GFX1064-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 2537; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2538; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 2539; GFX1064-NEXT: s_mov_b32 s2, -1 2540; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2541; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 2542; GFX1064-NEXT: ds_sub_rtn_u64 v[0:1], v2, v[0:1] 2543; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2544; GFX1064-NEXT: buffer_gl0_inv 2545; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2546; GFX1064-NEXT: s_endpgm 2547; 2548; GFX1032-LABEL: sub_i64_varying: 2549; GFX1032: ; %bb.0: ; %entry 2550; GFX1032-NEXT: v_mov_b32_e32 v1, 0 2551; GFX1032-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 2552; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2553; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 2554; GFX1032-NEXT: s_mov_b32 s2, -1 2555; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2556; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 2557; GFX1032-NEXT: ds_sub_rtn_u64 v[0:1], v2, v[0:1] 2558; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2559; GFX1032-NEXT: buffer_gl0_inv 2560; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2561; GFX1032-NEXT: s_endpgm 2562entry: 2563 %lane = call i32 @llvm.amdgcn.workitem.id.x() 2564 %zext = zext i32 %lane to i64 2565 %old = atomicrmw sub i64 addrspace(3)* @local_var64, i64 %zext acq_rel 2566 store i64 %old, i64 addrspace(1)* %out 2567 ret void 2568} 2569 2570define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) { 2571; 2572; 2573; GFX7LESS-LABEL: and_i32_varying: 2574; GFX7LESS: ; %bb.0: ; %entry 2575; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 2576; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 2577; GFX7LESS-NEXT: s_mov_b32 m0, -1 2578; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2579; GFX7LESS-NEXT: ds_and_rtn_b32 v0, v1, v0 2580; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2581; 
GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 2582; GFX7LESS-NEXT: s_mov_b32 s2, -1 2583; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 2584; GFX7LESS-NEXT: s_endpgm 2585; 2586; GFX8-LABEL: and_i32_varying: 2587; GFX8: ; %bb.0: ; %entry 2588; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2589; GFX8-NEXT: v_mov_b32_e32 v2, v0 2590; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 2591; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 2592; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 2593; GFX8-NEXT: v_mov_b32_e32 v1, -1 2594; GFX8-NEXT: s_mov_b64 exec, s[2:3] 2595; GFX8-NEXT: s_not_b64 exec, exec 2596; GFX8-NEXT: v_mov_b32_e32 v2, -1 2597; GFX8-NEXT: s_not_b64 exec, exec 2598; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 2599; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 2600; GFX8-NEXT: s_nop 1 2601; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 2602; GFX8-NEXT: s_nop 1 2603; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 2604; GFX8-NEXT: s_nop 1 2605; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 2606; GFX8-NEXT: s_nop 1 2607; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 2608; GFX8-NEXT: s_nop 1 2609; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 2610; GFX8-NEXT: v_readlane_b32 s4, v2, 63 2611; GFX8-NEXT: s_nop 0 2612; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 2613; GFX8-NEXT: s_mov_b64 exec, s[2:3] 2614; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2615; GFX8-NEXT: ; implicit-def: $vgpr0 2616; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 2617; GFX8-NEXT: s_cbranch_execz BB14_2 2618; GFX8-NEXT: ; %bb.1: 2619; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 2620; GFX8-NEXT: v_mov_b32_e32 v3, s4 2621; GFX8-NEXT: s_mov_b32 m0, -1 2622; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2623; GFX8-NEXT: ds_and_rtn_b32 v0, v0, v3 2624; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2625; GFX8-NEXT: BB14_2: 2626; GFX8-NEXT: s_or_b64 
exec, exec, s[2:3] 2627; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2628; GFX8-NEXT: v_readfirstlane_b32 s2, v0 2629; GFX8-NEXT: v_mov_b32_e32 v0, v1 2630; GFX8-NEXT: v_and_b32_e32 v0, s2, v0 2631; GFX8-NEXT: s_mov_b32 s3, 0xf000 2632; GFX8-NEXT: s_mov_b32 s2, -1 2633; GFX8-NEXT: s_nop 0 2634; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 2635; GFX8-NEXT: s_endpgm 2636; 2637; GFX9-LABEL: and_i32_varying: 2638; GFX9: ; %bb.0: ; %entry 2639; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2640; GFX9-NEXT: v_mov_b32_e32 v2, v0 2641; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 2642; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 2643; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 2644; GFX9-NEXT: v_mov_b32_e32 v1, -1 2645; GFX9-NEXT: s_mov_b64 exec, s[2:3] 2646; GFX9-NEXT: s_not_b64 exec, exec 2647; GFX9-NEXT: v_mov_b32_e32 v2, -1 2648; GFX9-NEXT: s_not_b64 exec, exec 2649; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 2650; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 2651; GFX9-NEXT: s_nop 1 2652; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 2653; GFX9-NEXT: s_nop 1 2654; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 2655; GFX9-NEXT: s_nop 1 2656; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 2657; GFX9-NEXT: s_nop 1 2658; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 2659; GFX9-NEXT: s_nop 1 2660; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 2661; GFX9-NEXT: v_readlane_b32 s4, v2, 63 2662; GFX9-NEXT: s_nop 0 2663; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 2664; GFX9-NEXT: s_mov_b64 exec, s[2:3] 2665; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2666; GFX9-NEXT: ; implicit-def: $vgpr0 2667; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 2668; GFX9-NEXT: s_cbranch_execz BB14_2 2669; GFX9-NEXT: ; %bb.1: 2670; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 2671; GFX9-NEXT: v_mov_b32_e32 v3, s4 2672; 
GFX9-NEXT: s_waitcnt lgkmcnt(0) 2673; GFX9-NEXT: ds_and_rtn_b32 v0, v0, v3 2674; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2675; GFX9-NEXT: BB14_2: 2676; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 2677; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2678; GFX9-NEXT: v_readfirstlane_b32 s2, v0 2679; GFX9-NEXT: v_mov_b32_e32 v0, v1 2680; GFX9-NEXT: v_and_b32_e32 v0, s2, v0 2681; GFX9-NEXT: s_mov_b32 s3, 0xf000 2682; GFX9-NEXT: s_mov_b32 s2, -1 2683; GFX9-NEXT: s_nop 0 2684; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 2685; GFX9-NEXT: s_endpgm 2686; 2687; GFX1064-LABEL: and_i32_varying: 2688; GFX1064: ; %bb.0: ; %entry 2689; GFX1064-NEXT: v_mov_b32_e32 v1, v0 2690; GFX1064-NEXT: s_not_b64 exec, exec 2691; GFX1064-NEXT: v_mov_b32_e32 v1, -1 2692; GFX1064-NEXT: s_not_b64 exec, exec 2693; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 2694; GFX1064-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 2695; GFX1064-NEXT: v_mov_b32_e32 v3, -1 2696; GFX1064-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 2697; GFX1064-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 2698; GFX1064-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 2699; GFX1064-NEXT: v_mov_b32_e32 v2, v1 2700; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 2701; GFX1064-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 2702; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 2703; GFX1064-NEXT: v_mov_b32_e32 v2, s4 2704; GFX1064-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 2705; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 2706; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 2707; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 2708; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2709; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 2710; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 2711; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 2712; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 2713; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 
v0, exec_lo, 0 2714; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 2715; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 2716; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 2717; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 2718; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 2719; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0 2720; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 2721; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 2722; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 2723; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2724; GFX1064-NEXT: s_mov_b32 s2, -1 2725; GFX1064-NEXT: ; implicit-def: $vgpr0 2726; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 2727; GFX1064-NEXT: s_cbranch_execz BB14_2 2728; GFX1064-NEXT: ; %bb.1: 2729; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo 2730; GFX1064-NEXT: v_mov_b32_e32 v4, s7 2731; GFX1064-NEXT: s_mov_b32 s3, s7 2732; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2733; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 2734; GFX1064-NEXT: ds_and_rtn_b32 v0, v7, v4 2735; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2736; GFX1064-NEXT: buffer_gl0_inv 2737; GFX1064-NEXT: BB14_2: 2738; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 2739; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 2740; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 2741; GFX1064-NEXT: v_mov_b32_e32 v0, v3 2742; GFX1064-NEXT: v_and_b32_e32 v0, s3, v0 2743; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 2744; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2745; GFX1064-NEXT: s_nop 0 2746; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 2747; GFX1064-NEXT: s_endpgm 2748; 2749; GFX1032-LABEL: and_i32_varying: 2750; GFX1032: ; %bb.0: ; %entry 2751; GFX1032-NEXT: v_mov_b32_e32 v1, v0 2752; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 2753; GFX1032-NEXT: v_mov_b32_e32 v1, -1 2754; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 2755; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 2756; GFX1032-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 2757; GFX1032-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 2758; GFX1032-NEXT: v_and_b32_dpp 
v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 2759; GFX1032-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 2760; GFX1032-NEXT: v_mov_b32_e32 v2, v1 2761; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 2762; GFX1032-NEXT: s_mov_b32 exec_lo, s2 2763; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2764; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 2765; GFX1032-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 2766; GFX1032-NEXT: v_mov_b32_e32 v3, -1 2767; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 2768; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 2769; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 2770; GFX1032-NEXT: s_mov_b32 exec_lo, s2 2771; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 2772; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 2773; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 2774; GFX1032-NEXT: s_mov_b32 exec_lo, s2 2775; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 2776; GFX1032-NEXT: s_mov_b32 s2, -1 2777; GFX1032-NEXT: ; implicit-def: $vgpr0 2778; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 2779; GFX1032-NEXT: s_cbranch_execz BB14_2 2780; GFX1032-NEXT: ; %bb.1: 2781; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo 2782; GFX1032-NEXT: v_mov_b32_e32 v4, s4 2783; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2784; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 2785; GFX1032-NEXT: ds_and_rtn_b32 v0, v7, v4 2786; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2787; GFX1032-NEXT: buffer_gl0_inv 2788; GFX1032-NEXT: BB14_2: 2789; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 2790; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 2791; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 2792; GFX1032-NEXT: v_mov_b32_e32 v0, v3 2793; GFX1032-NEXT: v_and_b32_e32 v0, s3, v0 2794; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 2795; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2796; GFX1032-NEXT: s_nop 0 2797; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 2798; GFX1032-NEXT: s_endpgm 2799entry: 2800 %lane = call i32 @llvm.amdgcn.workitem.id.x() 2801 
%old = atomicrmw and i32 addrspace(3)* @local_var32, i32 %lane acq_rel 2802 store i32 %old, i32 addrspace(1)* %out 2803 ret void 2804} 2805 2806define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) { 2807; 2808; 2809; GFX7LESS-LABEL: or_i32_varying: 2810; GFX7LESS: ; %bb.0: ; %entry 2811; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 2812; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 2813; GFX7LESS-NEXT: s_mov_b32 m0, -1 2814; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2815; GFX7LESS-NEXT: ds_or_rtn_b32 v0, v1, v0 2816; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2817; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 2818; GFX7LESS-NEXT: s_mov_b32 s2, -1 2819; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 2820; GFX7LESS-NEXT: s_endpgm 2821; 2822; GFX8-LABEL: or_i32_varying: 2823; GFX8: ; %bb.0: ; %entry 2824; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2825; GFX8-NEXT: v_mov_b32_e32 v2, v0 2826; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 2827; GFX8-NEXT: v_mov_b32_e32 v1, 0 2828; GFX8-NEXT: s_mov_b64 exec, s[2:3] 2829; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 2830; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 2831; GFX8-NEXT: s_not_b64 exec, exec 2832; GFX8-NEXT: v_mov_b32_e32 v2, 0 2833; GFX8-NEXT: s_not_b64 exec, exec 2834; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 2835; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 2836; GFX8-NEXT: s_nop 1 2837; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 2838; GFX8-NEXT: s_nop 1 2839; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 2840; GFX8-NEXT: s_nop 1 2841; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 2842; GFX8-NEXT: s_nop 1 2843; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 2844; GFX8-NEXT: s_nop 1 2845; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 2846; GFX8-NEXT: v_readlane_b32 s4, v2, 63 2847; 
GFX8-NEXT: s_nop 0 2848; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 2849; GFX8-NEXT: s_mov_b64 exec, s[2:3] 2850; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2851; GFX8-NEXT: ; implicit-def: $vgpr0 2852; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 2853; GFX8-NEXT: s_cbranch_execz BB15_2 2854; GFX8-NEXT: ; %bb.1: 2855; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 2856; GFX8-NEXT: v_mov_b32_e32 v3, s4 2857; GFX8-NEXT: s_mov_b32 m0, -1 2858; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2859; GFX8-NEXT: ds_or_rtn_b32 v0, v0, v3 2860; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2861; GFX8-NEXT: BB15_2: 2862; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 2863; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2864; GFX8-NEXT: v_readfirstlane_b32 s2, v0 2865; GFX8-NEXT: v_mov_b32_e32 v0, v1 2866; GFX8-NEXT: v_or_b32_e32 v0, s2, v0 2867; GFX8-NEXT: s_mov_b32 s3, 0xf000 2868; GFX8-NEXT: s_mov_b32 s2, -1 2869; GFX8-NEXT: s_nop 0 2870; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 2871; GFX8-NEXT: s_endpgm 2872; 2873; GFX9-LABEL: or_i32_varying: 2874; GFX9: ; %bb.0: ; %entry 2875; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2876; GFX9-NEXT: v_mov_b32_e32 v2, v0 2877; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 2878; GFX9-NEXT: v_mov_b32_e32 v1, 0 2879; GFX9-NEXT: s_mov_b64 exec, s[2:3] 2880; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 2881; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 2882; GFX9-NEXT: s_not_b64 exec, exec 2883; GFX9-NEXT: v_mov_b32_e32 v2, 0 2884; GFX9-NEXT: s_not_b64 exec, exec 2885; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 2886; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 2887; GFX9-NEXT: s_nop 1 2888; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 2889; GFX9-NEXT: s_nop 1 2890; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 2891; GFX9-NEXT: s_nop 1 2892; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 2893; GFX9-NEXT: s_nop 1 
2894; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 2895; GFX9-NEXT: s_nop 1 2896; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 2897; GFX9-NEXT: v_readlane_b32 s4, v2, 63 2898; GFX9-NEXT: s_nop 0 2899; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 2900; GFX9-NEXT: s_mov_b64 exec, s[2:3] 2901; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2902; GFX9-NEXT: ; implicit-def: $vgpr0 2903; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 2904; GFX9-NEXT: s_cbranch_execz BB15_2 2905; GFX9-NEXT: ; %bb.1: 2906; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 2907; GFX9-NEXT: v_mov_b32_e32 v3, s4 2908; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2909; GFX9-NEXT: ds_or_rtn_b32 v0, v0, v3 2910; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2911; GFX9-NEXT: BB15_2: 2912; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 2913; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2914; GFX9-NEXT: v_readfirstlane_b32 s2, v0 2915; GFX9-NEXT: v_mov_b32_e32 v0, v1 2916; GFX9-NEXT: v_or_b32_e32 v0, s2, v0 2917; GFX9-NEXT: s_mov_b32 s3, 0xf000 2918; GFX9-NEXT: s_mov_b32 s2, -1 2919; GFX9-NEXT: s_nop 0 2920; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 2921; GFX9-NEXT: s_endpgm 2922; 2923; GFX1064-LABEL: or_i32_varying: 2924; GFX1064: ; %bb.0: ; %entry 2925; GFX1064-NEXT: v_mov_b32_e32 v1, v0 2926; GFX1064-NEXT: s_not_b64 exec, exec 2927; GFX1064-NEXT: v_mov_b32_e32 v1, 0 2928; GFX1064-NEXT: s_not_b64 exec, exec 2929; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 2930; GFX1064-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 2931; GFX1064-NEXT: v_mov_b32_e32 v3, 0 2932; GFX1064-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 2933; GFX1064-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 2934; GFX1064-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 2935; GFX1064-NEXT: v_mov_b32_e32 v2, v1 2936; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 2937; 
GFX1064-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 2938; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 2939; GFX1064-NEXT: v_mov_b32_e32 v2, s4 2940; GFX1064-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 2941; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 2942; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 2943; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 2944; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2945; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 2946; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 2947; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 2948; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 2949; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 2950; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 2951; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 2952; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 2953; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 2954; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 2955; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0 2956; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 2957; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 2958; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 2959; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2960; GFX1064-NEXT: s_mov_b32 s2, -1 2961; GFX1064-NEXT: ; implicit-def: $vgpr0 2962; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 2963; GFX1064-NEXT: s_cbranch_execz BB15_2 2964; GFX1064-NEXT: ; %bb.1: 2965; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo 2966; GFX1064-NEXT: v_mov_b32_e32 v4, s7 2967; GFX1064-NEXT: s_mov_b32 s3, s7 2968; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2969; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 2970; GFX1064-NEXT: ds_or_rtn_b32 v0, v7, v4 2971; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2972; GFX1064-NEXT: buffer_gl0_inv 2973; GFX1064-NEXT: BB15_2: 2974; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 2975; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 2976; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 2977; GFX1064-NEXT: v_mov_b32_e32 v0, v3 2978; GFX1064-NEXT: 
v_or_b32_e32 v0, s3, v0 2979; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 2980; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2981; GFX1064-NEXT: s_nop 0 2982; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 2983; GFX1064-NEXT: s_endpgm 2984; 2985; GFX1032-LABEL: or_i32_varying: 2986; GFX1032: ; %bb.0: ; %entry 2987; GFX1032-NEXT: v_mov_b32_e32 v1, v0 2988; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 2989; GFX1032-NEXT: v_mov_b32_e32 v1, 0 2990; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 2991; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 2992; GFX1032-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 2993; GFX1032-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 2994; GFX1032-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 2995; GFX1032-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 2996; GFX1032-NEXT: v_mov_b32_e32 v2, v1 2997; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 2998; GFX1032-NEXT: s_mov_b32 exec_lo, s2 2999; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3000; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3001; GFX1032-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 3002; GFX1032-NEXT: v_mov_b32_e32 v3, 0 3003; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 3004; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 3005; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 3006; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3007; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 3008; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3009; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 3010; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3011; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 3012; GFX1032-NEXT: s_mov_b32 s2, -1 3013; GFX1032-NEXT: ; implicit-def: $vgpr0 3014; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 3015; GFX1032-NEXT: s_cbranch_execz BB15_2 3016; GFX1032-NEXT: ; %bb.1: 3017; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo 3018; 
GFX1032-NEXT: v_mov_b32_e32 v4, s4 3019; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3020; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 3021; GFX1032-NEXT: ds_or_rtn_b32 v0, v7, v4 3022; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3023; GFX1032-NEXT: buffer_gl0_inv 3024; GFX1032-NEXT: BB15_2: 3025; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 3026; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 3027; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 3028; GFX1032-NEXT: v_mov_b32_e32 v0, v3 3029; GFX1032-NEXT: v_or_b32_e32 v0, s3, v0 3030; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 3031; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3032; GFX1032-NEXT: s_nop 0 3033; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 3034; GFX1032-NEXT: s_endpgm 3035entry: 3036 %lane = call i32 @llvm.amdgcn.workitem.id.x() 3037 %old = atomicrmw or i32 addrspace(3)* @local_var32, i32 %lane acq_rel 3038 store i32 %old, i32 addrspace(1)* %out 3039 ret void 3040} 3041 3042define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) { 3043; 3044; 3045; GFX7LESS-LABEL: xor_i32_varying: 3046; GFX7LESS: ; %bb.0: ; %entry 3047; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 3048; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 3049; GFX7LESS-NEXT: s_mov_b32 m0, -1 3050; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3051; GFX7LESS-NEXT: ds_xor_rtn_b32 v0, v1, v0 3052; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3053; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 3054; GFX7LESS-NEXT: s_mov_b32 s2, -1 3055; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 3056; GFX7LESS-NEXT: s_endpgm 3057; 3058; GFX8-LABEL: xor_i32_varying: 3059; GFX8: ; %bb.0: ; %entry 3060; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3061; GFX8-NEXT: v_mov_b32_e32 v2, v0 3062; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 3063; GFX8-NEXT: v_mov_b32_e32 v1, 0 3064; GFX8-NEXT: s_mov_b64 exec, s[2:3] 3065; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3066; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 3067; GFX8-NEXT: s_not_b64 exec, exec 3068; GFX8-NEXT: v_mov_b32_e32 v2, 0 3069; 
GFX8-NEXT: s_not_b64 exec, exec 3070; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 3071; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 3072; GFX8-NEXT: s_nop 1 3073; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 3074; GFX8-NEXT: s_nop 1 3075; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 3076; GFX8-NEXT: s_nop 1 3077; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 3078; GFX8-NEXT: s_nop 1 3079; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 3080; GFX8-NEXT: s_nop 1 3081; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 3082; GFX8-NEXT: v_readlane_b32 s4, v2, 63 3083; GFX8-NEXT: s_nop 0 3084; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 3085; GFX8-NEXT: s_mov_b64 exec, s[2:3] 3086; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3087; GFX8-NEXT: ; implicit-def: $vgpr0 3088; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 3089; GFX8-NEXT: s_cbranch_execz BB16_2 3090; GFX8-NEXT: ; %bb.1: 3091; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 3092; GFX8-NEXT: v_mov_b32_e32 v3, s4 3093; GFX8-NEXT: s_mov_b32 m0, -1 3094; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3095; GFX8-NEXT: ds_xor_rtn_b32 v0, v0, v3 3096; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3097; GFX8-NEXT: BB16_2: 3098; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 3099; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3100; GFX8-NEXT: v_readfirstlane_b32 s2, v0 3101; GFX8-NEXT: v_mov_b32_e32 v0, v1 3102; GFX8-NEXT: v_xor_b32_e32 v0, s2, v0 3103; GFX8-NEXT: s_mov_b32 s3, 0xf000 3104; GFX8-NEXT: s_mov_b32 s2, -1 3105; GFX8-NEXT: s_nop 0 3106; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 3107; GFX8-NEXT: s_endpgm 3108; 3109; GFX9-LABEL: xor_i32_varying: 3110; GFX9: ; %bb.0: ; %entry 3111; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3112; GFX9-NEXT: v_mov_b32_e32 v2, v0 3113; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 3114; GFX9-NEXT: 
v_mov_b32_e32 v1, 0 3115; GFX9-NEXT: s_mov_b64 exec, s[2:3] 3116; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3117; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 3118; GFX9-NEXT: s_not_b64 exec, exec 3119; GFX9-NEXT: v_mov_b32_e32 v2, 0 3120; GFX9-NEXT: s_not_b64 exec, exec 3121; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 3122; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 3123; GFX9-NEXT: s_nop 1 3124; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 3125; GFX9-NEXT: s_nop 1 3126; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 3127; GFX9-NEXT: s_nop 1 3128; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 3129; GFX9-NEXT: s_nop 1 3130; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 3131; GFX9-NEXT: s_nop 1 3132; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 3133; GFX9-NEXT: v_readlane_b32 s4, v2, 63 3134; GFX9-NEXT: s_nop 0 3135; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 3136; GFX9-NEXT: s_mov_b64 exec, s[2:3] 3137; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3138; GFX9-NEXT: ; implicit-def: $vgpr0 3139; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 3140; GFX9-NEXT: s_cbranch_execz BB16_2 3141; GFX9-NEXT: ; %bb.1: 3142; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 3143; GFX9-NEXT: v_mov_b32_e32 v3, s4 3144; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3145; GFX9-NEXT: ds_xor_rtn_b32 v0, v0, v3 3146; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3147; GFX9-NEXT: BB16_2: 3148; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 3149; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3150; GFX9-NEXT: v_readfirstlane_b32 s2, v0 3151; GFX9-NEXT: v_mov_b32_e32 v0, v1 3152; GFX9-NEXT: v_xor_b32_e32 v0, s2, v0 3153; GFX9-NEXT: s_mov_b32 s3, 0xf000 3154; GFX9-NEXT: s_mov_b32 s2, -1 3155; GFX9-NEXT: s_nop 0 3156; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 3157; GFX9-NEXT: s_endpgm 3158; 3159; 
GFX1064-LABEL: xor_i32_varying: 3160; GFX1064: ; %bb.0: ; %entry 3161; GFX1064-NEXT: v_mov_b32_e32 v1, v0 3162; GFX1064-NEXT: s_not_b64 exec, exec 3163; GFX1064-NEXT: v_mov_b32_e32 v1, 0 3164; GFX1064-NEXT: s_not_b64 exec, exec 3165; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 3166; GFX1064-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 3167; GFX1064-NEXT: v_mov_b32_e32 v3, 0 3168; GFX1064-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 3169; GFX1064-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 3170; GFX1064-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 3171; GFX1064-NEXT: v_mov_b32_e32 v2, v1 3172; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 3173; GFX1064-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 3174; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 3175; GFX1064-NEXT: v_mov_b32_e32 v2, s4 3176; GFX1064-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 3177; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 3178; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 3179; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 3180; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3181; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 3182; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 3183; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 3184; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 3185; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 3186; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 3187; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 3188; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 3189; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 3190; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 3191; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0 3192; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 3193; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 3194; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 3195; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 
0, v0 3196; GFX1064-NEXT: s_mov_b32 s2, -1 3197; GFX1064-NEXT: ; implicit-def: $vgpr0 3198; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 3199; GFX1064-NEXT: s_cbranch_execz BB16_2 3200; GFX1064-NEXT: ; %bb.1: 3201; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo 3202; GFX1064-NEXT: v_mov_b32_e32 v4, s7 3203; GFX1064-NEXT: s_mov_b32 s3, s7 3204; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3205; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 3206; GFX1064-NEXT: ds_xor_rtn_b32 v0, v7, v4 3207; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3208; GFX1064-NEXT: buffer_gl0_inv 3209; GFX1064-NEXT: BB16_2: 3210; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 3211; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 3212; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 3213; GFX1064-NEXT: v_mov_b32_e32 v0, v3 3214; GFX1064-NEXT: v_xor_b32_e32 v0, s3, v0 3215; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 3216; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3217; GFX1064-NEXT: s_nop 0 3218; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 3219; GFX1064-NEXT: s_endpgm 3220; 3221; GFX1032-LABEL: xor_i32_varying: 3222; GFX1032: ; %bb.0: ; %entry 3223; GFX1032-NEXT: v_mov_b32_e32 v1, v0 3224; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 3225; GFX1032-NEXT: v_mov_b32_e32 v1, 0 3226; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 3227; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3228; GFX1032-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 3229; GFX1032-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 3230; GFX1032-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 3231; GFX1032-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 3232; GFX1032-NEXT: v_mov_b32_e32 v2, v1 3233; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 3234; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3235; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3236; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3237; GFX1032-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] 
row_mask:0xa bank_mask:0xf 3238; GFX1032-NEXT: v_mov_b32_e32 v3, 0 3239; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 3240; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 3241; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 3242; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3243; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 3244; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3245; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 3246; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3247; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 3248; GFX1032-NEXT: s_mov_b32 s2, -1 3249; GFX1032-NEXT: ; implicit-def: $vgpr0 3250; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 3251; GFX1032-NEXT: s_cbranch_execz BB16_2 3252; GFX1032-NEXT: ; %bb.1: 3253; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo 3254; GFX1032-NEXT: v_mov_b32_e32 v4, s4 3255; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3256; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 3257; GFX1032-NEXT: ds_xor_rtn_b32 v0, v7, v4 3258; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3259; GFX1032-NEXT: buffer_gl0_inv 3260; GFX1032-NEXT: BB16_2: 3261; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 3262; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 3263; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 3264; GFX1032-NEXT: v_mov_b32_e32 v0, v3 3265; GFX1032-NEXT: v_xor_b32_e32 v0, s3, v0 3266; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 3267; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3268; GFX1032-NEXT: s_nop 0 3269; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 3270; GFX1032-NEXT: s_endpgm 3271entry: 3272 %lane = call i32 @llvm.amdgcn.workitem.id.x() 3273 %old = atomicrmw xor i32 addrspace(3)* @local_var32, i32 %lane acq_rel 3274 store i32 %old, i32 addrspace(1)* %out 3275 ret void 3276} 3277 3278define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) { 3279; 3280; 3281; GFX7LESS-LABEL: max_i32_varying: 3282; GFX7LESS: ; %bb.0: ; %entry 3283; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 3284; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 3285; GFX7LESS-NEXT: 
s_mov_b32 m0, -1 3286; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3287; GFX7LESS-NEXT: ds_max_rtn_i32 v0, v1, v0 3288; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3289; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 3290; GFX7LESS-NEXT: s_mov_b32 s2, -1 3291; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 3292; GFX7LESS-NEXT: s_endpgm 3293; 3294; GFX8-LABEL: max_i32_varying: 3295; GFX8: ; %bb.0: ; %entry 3296; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3297; GFX8-NEXT: v_mov_b32_e32 v2, v0 3298; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3299; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 3300; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 3301; GFX8-NEXT: v_bfrev_b32_e32 v1, 1 3302; GFX8-NEXT: s_mov_b64 exec, s[2:3] 3303; GFX8-NEXT: s_not_b64 exec, exec 3304; GFX8-NEXT: v_mov_b32_e32 v2, v1 3305; GFX8-NEXT: s_not_b64 exec, exec 3306; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 3307; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3308; GFX8-NEXT: s_nop 1 3309; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 3310; GFX8-NEXT: s_nop 1 3311; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 3312; GFX8-NEXT: s_nop 1 3313; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 3314; GFX8-NEXT: s_nop 1 3315; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 3316; GFX8-NEXT: s_nop 1 3317; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 3318; GFX8-NEXT: v_readlane_b32 s4, v2, 63 3319; GFX8-NEXT: s_nop 0 3320; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 3321; GFX8-NEXT: s_mov_b64 exec, s[2:3] 3322; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3323; GFX8-NEXT: ; implicit-def: $vgpr0 3324; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 3325; GFX8-NEXT: s_cbranch_execz BB17_2 3326; GFX8-NEXT: ; %bb.1: 3327; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 3328; GFX8-NEXT: v_mov_b32_e32 v3, s4 3329; GFX8-NEXT: s_mov_b32 m0, -1 3330; GFX8-NEXT: 
s_waitcnt lgkmcnt(0) 3331; GFX8-NEXT: ds_max_rtn_i32 v0, v0, v3 3332; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3333; GFX8-NEXT: BB17_2: 3334; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 3335; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3336; GFX8-NEXT: v_readfirstlane_b32 s2, v0 3337; GFX8-NEXT: v_mov_b32_e32 v0, v1 3338; GFX8-NEXT: v_max_i32_e32 v0, s2, v0 3339; GFX8-NEXT: s_mov_b32 s3, 0xf000 3340; GFX8-NEXT: s_mov_b32 s2, -1 3341; GFX8-NEXT: s_nop 0 3342; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 3343; GFX8-NEXT: s_endpgm 3344; 3345; GFX9-LABEL: max_i32_varying: 3346; GFX9: ; %bb.0: ; %entry 3347; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3348; GFX9-NEXT: v_mov_b32_e32 v2, v0 3349; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3350; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 3351; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 3352; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 3353; GFX9-NEXT: s_mov_b64 exec, s[2:3] 3354; GFX9-NEXT: s_not_b64 exec, exec 3355; GFX9-NEXT: v_mov_b32_e32 v2, v1 3356; GFX9-NEXT: s_not_b64 exec, exec 3357; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 3358; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3359; GFX9-NEXT: s_nop 1 3360; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 3361; GFX9-NEXT: s_nop 1 3362; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 3363; GFX9-NEXT: s_nop 1 3364; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 3365; GFX9-NEXT: s_nop 1 3366; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 3367; GFX9-NEXT: s_nop 1 3368; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 3369; GFX9-NEXT: v_readlane_b32 s4, v2, 63 3370; GFX9-NEXT: s_nop 0 3371; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 3372; GFX9-NEXT: s_mov_b64 exec, s[2:3] 3373; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3374; GFX9-NEXT: ; implicit-def: $vgpr0 3375; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 3376; 
GFX9-NEXT: s_cbranch_execz BB17_2 3377; GFX9-NEXT: ; %bb.1: 3378; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 3379; GFX9-NEXT: v_mov_b32_e32 v3, s4 3380; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3381; GFX9-NEXT: ds_max_rtn_i32 v0, v0, v3 3382; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3383; GFX9-NEXT: BB17_2: 3384; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 3385; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3386; GFX9-NEXT: v_readfirstlane_b32 s2, v0 3387; GFX9-NEXT: v_mov_b32_e32 v0, v1 3388; GFX9-NEXT: v_max_i32_e32 v0, s2, v0 3389; GFX9-NEXT: s_mov_b32 s3, 0xf000 3390; GFX9-NEXT: s_mov_b32 s2, -1 3391; GFX9-NEXT: s_nop 0 3392; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 3393; GFX9-NEXT: s_endpgm 3394; 3395; GFX1064-LABEL: max_i32_varying: 3396; GFX1064: ; %bb.0: ; %entry 3397; GFX1064-NEXT: v_mov_b32_e32 v2, v0 3398; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 3399; GFX1064-NEXT: v_bfrev_b32_e32 v1, 1 3400; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 3401; GFX1064-NEXT: s_not_b64 exec, exec 3402; GFX1064-NEXT: v_mov_b32_e32 v2, v1 3403; GFX1064-NEXT: s_not_b64 exec, exec 3404; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 3405; GFX1064-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3406; GFX1064-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 3407; GFX1064-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 3408; GFX1064-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 3409; GFX1064-NEXT: v_mov_b32_e32 v3, v2 3410; GFX1064-NEXT: v_permlanex16_b32 v3, v3, -1, -1 3411; GFX1064-NEXT: v_max_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 3412; GFX1064-NEXT: v_readlane_b32 s4, v2, 31 3413; GFX1064-NEXT: v_mov_b32_e32 v3, s4 3414; GFX1064-NEXT: v_max_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 3415; GFX1064-NEXT: v_readlane_b32 s4, v2, 15 3416; GFX1064-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3417; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 3418; GFX1064-NEXT: 
s_load_dwordx2 s[0:1], s[0:1], 0x24 3419; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 3420; GFX1064-NEXT: v_readlane_b32 s5, v2, 31 3421; GFX1064-NEXT: v_writelane_b32 v1, s4, 16 3422; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 3423; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 3424; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 3425; GFX1064-NEXT: v_readlane_b32 s7, v2, 63 3426; GFX1064-NEXT: v_readlane_b32 s6, v2, 47 3427; GFX1064-NEXT: v_writelane_b32 v1, s5, 32 3428; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 3429; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0 3430; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 3431; GFX1064-NEXT: v_writelane_b32 v1, s6, 48 3432; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 3433; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3434; GFX1064-NEXT: s_mov_b32 s2, -1 3435; GFX1064-NEXT: ; implicit-def: $vgpr0 3436; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 3437; GFX1064-NEXT: s_cbranch_execz BB17_2 3438; GFX1064-NEXT: ; %bb.1: 3439; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo 3440; GFX1064-NEXT: v_mov_b32_e32 v4, s7 3441; GFX1064-NEXT: s_mov_b32 s3, s7 3442; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3443; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 3444; GFX1064-NEXT: ds_max_rtn_i32 v0, v7, v4 3445; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3446; GFX1064-NEXT: buffer_gl0_inv 3447; GFX1064-NEXT: BB17_2: 3448; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 3449; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 3450; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 3451; GFX1064-NEXT: v_mov_b32_e32 v0, v1 3452; GFX1064-NEXT: v_max_i32_e32 v0, s3, v0 3453; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 3454; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3455; GFX1064-NEXT: s_nop 0 3456; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 3457; GFX1064-NEXT: s_endpgm 3458; 3459; GFX1032-LABEL: max_i32_varying: 3460; GFX1032: ; %bb.0: ; %entry 3461; GFX1032-NEXT: v_mov_b32_e32 v2, v0 3462; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3463; GFX1032-NEXT: v_bfrev_b32_e32 v1, 1 3464; GFX1032-NEXT: 
s_mov_b32 exec_lo, s2 3465; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 3466; GFX1032-NEXT: v_mov_b32_e32 v2, v1 3467; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 3468; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3469; GFX1032-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3470; GFX1032-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 3471; GFX1032-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 3472; GFX1032-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 3473; GFX1032-NEXT: v_mov_b32_e32 v3, v2 3474; GFX1032-NEXT: v_permlanex16_b32 v3, v3, -1, -1 3475; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3476; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3477; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3478; GFX1032-NEXT: v_max_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 3479; GFX1032-NEXT: v_readlane_b32 s3, v2, 15 3480; GFX1032-NEXT: v_readlane_b32 s4, v2, 31 3481; GFX1032-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3482; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3483; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 3484; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3485; GFX1032-NEXT: v_writelane_b32 v1, s3, 16 3486; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3487; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 3488; GFX1032-NEXT: s_mov_b32 s2, -1 3489; GFX1032-NEXT: ; implicit-def: $vgpr0 3490; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 3491; GFX1032-NEXT: s_cbranch_execz BB17_2 3492; GFX1032-NEXT: ; %bb.1: 3493; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo 3494; GFX1032-NEXT: v_mov_b32_e32 v4, s4 3495; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3496; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 3497; GFX1032-NEXT: ds_max_rtn_i32 v0, v7, v4 3498; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3499; GFX1032-NEXT: buffer_gl0_inv 3500; GFX1032-NEXT: BB17_2: 3501; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 3502; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 3503; GFX1032-NEXT: v_readfirstlane_b32 s3, 
v0 3504; GFX1032-NEXT: v_mov_b32_e32 v0, v1 3505; GFX1032-NEXT: v_max_i32_e32 v0, s3, v0 3506; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 3507; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3508; GFX1032-NEXT: s_nop 0 3509; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 3510; GFX1032-NEXT: s_endpgm 3511entry: 3512 %lane = call i32 @llvm.amdgcn.workitem.id.x() 3513 %old = atomicrmw max i32 addrspace(3)* @local_var32, i32 %lane acq_rel 3514 store i32 %old, i32 addrspace(1)* %out 3515 ret void 3516} 3517 3518define amdgpu_kernel void @max_i64_constant(i64 addrspace(1)* %out) { 3519; 3520; 3521; GFX7LESS-LABEL: max_i64_constant: 3522; GFX7LESS: ; %bb.0: ; %entry 3523; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 3524; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 3525; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 3526; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3527; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 3528; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc 3529; GFX7LESS-NEXT: s_cbranch_execz BB18_2 3530; GFX7LESS-NEXT: ; %bb.1: 3531; GFX7LESS-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 3532; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5 3533; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 3534; GFX7LESS-NEXT: s_mov_b32 m0, -1 3535; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3536; GFX7LESS-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] 3537; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3538; GFX7LESS-NEXT: BB18_2: 3539; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] 3540; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3541; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 3542; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 3543; GFX7LESS-NEXT: v_bfrev_b32_e32 v1, 1 3544; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 3545; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 3546; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc 3547; GFX7LESS-NEXT: v_mov_b32_e32 v2, s5 3548; GFX7LESS-NEXT: v_mov_b32_e32 v3, s4 3549; GFX7LESS-NEXT: v_cmp_gt_i64_e32 vcc, s[4:5], v[0:1] 3550; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 3551; 
GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 3552; GFX7LESS-NEXT: s_mov_b32 s2, -1 3553; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 3554; GFX7LESS-NEXT: s_endpgm 3555; 3556; GFX8-LABEL: max_i64_constant: 3557; GFX8: ; %bb.0: ; %entry 3558; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3559; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3560; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 3561; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3562; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 3563; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 3564; GFX8-NEXT: s_cbranch_execz BB18_2 3565; GFX8-NEXT: ; %bb.1: 3566; GFX8-NEXT: v_mov_b32_e32 v0, 5 3567; GFX8-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 3568; GFX8-NEXT: v_mov_b32_e32 v1, 0 3569; GFX8-NEXT: s_mov_b32 m0, -1 3570; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3571; GFX8-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] 3572; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3573; GFX8-NEXT: BB18_2: 3574; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 3575; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3576; GFX8-NEXT: v_readfirstlane_b32 s2, v0 3577; GFX8-NEXT: v_bfrev_b32_e32 v0, 1 3578; GFX8-NEXT: v_readfirstlane_b32 s3, v1 3579; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc 3580; GFX8-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 3581; GFX8-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1] 3582; GFX8-NEXT: v_mov_b32_e32 v2, s3 3583; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 3584; GFX8-NEXT: v_mov_b32_e32 v2, s2 3585; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 3586; GFX8-NEXT: s_mov_b32 s3, 0xf000 3587; GFX8-NEXT: s_mov_b32 s2, -1 3588; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 3589; GFX8-NEXT: s_endpgm 3590; 3591; GFX9-LABEL: max_i64_constant: 3592; GFX9: ; %bb.0: ; %entry 3593; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3594; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3595; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 3596; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3597; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 3598; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 
3599; GFX9-NEXT: s_cbranch_execz BB18_2 3600; GFX9-NEXT: ; %bb.1: 3601; GFX9-NEXT: v_mov_b32_e32 v0, 5 3602; GFX9-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 3603; GFX9-NEXT: v_mov_b32_e32 v1, 0 3604; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3605; GFX9-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] 3606; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3607; GFX9-NEXT: BB18_2: 3608; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 3609; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3610; GFX9-NEXT: v_readfirstlane_b32 s2, v0 3611; GFX9-NEXT: v_bfrev_b32_e32 v0, 1 3612; GFX9-NEXT: v_readfirstlane_b32 s3, v1 3613; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc 3614; GFX9-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 3615; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1] 3616; GFX9-NEXT: v_mov_b32_e32 v2, s3 3617; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 3618; GFX9-NEXT: v_mov_b32_e32 v2, s2 3619; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 3620; GFX9-NEXT: s_mov_b32 s3, 0xf000 3621; GFX9-NEXT: s_mov_b32 s2, -1 3622; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 3623; GFX9-NEXT: s_endpgm 3624; 3625; GFX1064-LABEL: max_i64_constant: 3626; GFX1064: ; %bb.0: ; %entry 3627; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3628; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 3629; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0 3630; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3631; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 3632; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc 3633; GFX1064-NEXT: s_cbranch_execz BB18_2 3634; GFX1064-NEXT: ; %bb.1: 3635; GFX1064-NEXT: v_mov_b32_e32 v0, 5 3636; GFX1064-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 3637; GFX1064-NEXT: v_mov_b32_e32 v1, 0 3638; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3639; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 3640; GFX1064-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] 3641; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3642; GFX1064-NEXT: buffer_gl0_inv 3643; GFX1064-NEXT: BB18_2: 3644; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 3645; GFX1064-NEXT: s_or_b64 exec, exec, 
s[2:3] 3646; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 3647; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 3648; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc 3649; GFX1064-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 3650; GFX1064-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1] 3651; GFX1064-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc 3652; GFX1064-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc 3653; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 3654; GFX1064-NEXT: s_mov_b32 s2, -1 3655; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3656; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 3657; GFX1064-NEXT: s_endpgm 3658; 3659; GFX1032-LABEL: max_i64_constant: 3660; GFX1032: ; %bb.0: ; %entry 3661; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3662; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 3663; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 3664; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 3665; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo 3666; GFX1032-NEXT: s_cbranch_execz BB18_2 3667; GFX1032-NEXT: ; %bb.1: 3668; GFX1032-NEXT: v_mov_b32_e32 v0, 5 3669; GFX1032-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 3670; GFX1032-NEXT: v_mov_b32_e32 v1, 0 3671; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3672; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 3673; GFX1032-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] 3674; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3675; GFX1032-NEXT: buffer_gl0_inv 3676; GFX1032-NEXT: BB18_2: 3677; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 3678; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 3679; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 3680; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 3681; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc_lo 3682; GFX1032-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc_lo 3683; GFX1032-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[2:3], v[0:1] 3684; GFX1032-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo 3685; GFX1032-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo 3686; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 3687; GFX1032-NEXT: s_mov_b32 s2, -1 3688; GFX1032-NEXT: s_waitcnt 
lgkmcnt(0) 3689; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 3690; GFX1032-NEXT: s_endpgm 3691entry: 3692 %old = atomicrmw max i64 addrspace(3)* @local_var64, i64 5 acq_rel 3693 store i64 %old, i64 addrspace(1)* %out 3694 ret void 3695} 3696 3697define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) { 3698; 3699; 3700; GFX7LESS-LABEL: min_i32_varying: 3701; GFX7LESS: ; %bb.0: ; %entry 3702; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 3703; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 3704; GFX7LESS-NEXT: s_mov_b32 m0, -1 3705; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3706; GFX7LESS-NEXT: ds_min_rtn_i32 v0, v1, v0 3707; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3708; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 3709; GFX7LESS-NEXT: s_mov_b32 s2, -1 3710; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 3711; GFX7LESS-NEXT: s_endpgm 3712; 3713; GFX8-LABEL: min_i32_varying: 3714; GFX8: ; %bb.0: ; %entry 3715; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3716; GFX8-NEXT: v_mov_b32_e32 v2, v0 3717; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3718; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 3719; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 3720; GFX8-NEXT: v_bfrev_b32_e32 v1, -2 3721; GFX8-NEXT: s_mov_b64 exec, s[2:3] 3722; GFX8-NEXT: s_not_b64 exec, exec 3723; GFX8-NEXT: v_mov_b32_e32 v2, v1 3724; GFX8-NEXT: s_not_b64 exec, exec 3725; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 3726; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3727; GFX8-NEXT: s_nop 1 3728; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 3729; GFX8-NEXT: s_nop 1 3730; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 3731; GFX8-NEXT: s_nop 1 3732; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 3733; GFX8-NEXT: s_nop 1 3734; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 3735; GFX8-NEXT: s_nop 1 3736; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_bcast:31 
row_mask:0xc bank_mask:0xf 3737; GFX8-NEXT: v_readlane_b32 s4, v2, 63 3738; GFX8-NEXT: s_nop 0 3739; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 3740; GFX8-NEXT: s_mov_b64 exec, s[2:3] 3741; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3742; GFX8-NEXT: ; implicit-def: $vgpr0 3743; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 3744; GFX8-NEXT: s_cbranch_execz BB19_2 3745; GFX8-NEXT: ; %bb.1: 3746; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 3747; GFX8-NEXT: v_mov_b32_e32 v3, s4 3748; GFX8-NEXT: s_mov_b32 m0, -1 3749; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3750; GFX8-NEXT: ds_min_rtn_i32 v0, v0, v3 3751; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3752; GFX8-NEXT: BB19_2: 3753; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 3754; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3755; GFX8-NEXT: v_readfirstlane_b32 s2, v0 3756; GFX8-NEXT: v_mov_b32_e32 v0, v1 3757; GFX8-NEXT: v_min_i32_e32 v0, s2, v0 3758; GFX8-NEXT: s_mov_b32 s3, 0xf000 3759; GFX8-NEXT: s_mov_b32 s2, -1 3760; GFX8-NEXT: s_nop 0 3761; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 3762; GFX8-NEXT: s_endpgm 3763; 3764; GFX9-LABEL: min_i32_varying: 3765; GFX9: ; %bb.0: ; %entry 3766; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3767; GFX9-NEXT: v_mov_b32_e32 v2, v0 3768; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3769; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 3770; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 3771; GFX9-NEXT: v_bfrev_b32_e32 v1, -2 3772; GFX9-NEXT: s_mov_b64 exec, s[2:3] 3773; GFX9-NEXT: s_not_b64 exec, exec 3774; GFX9-NEXT: v_mov_b32_e32 v2, v1 3775; GFX9-NEXT: s_not_b64 exec, exec 3776; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 3777; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3778; GFX9-NEXT: s_nop 1 3779; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 3780; GFX9-NEXT: s_nop 1 3781; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 3782; GFX9-NEXT: s_nop 1 3783; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf 
bank_mask:0xf 3784; GFX9-NEXT: s_nop 1 3785; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 3786; GFX9-NEXT: s_nop 1 3787; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 3788; GFX9-NEXT: v_readlane_b32 s4, v2, 63 3789; GFX9-NEXT: s_nop 0 3790; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 3791; GFX9-NEXT: s_mov_b64 exec, s[2:3] 3792; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3793; GFX9-NEXT: ; implicit-def: $vgpr0 3794; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 3795; GFX9-NEXT: s_cbranch_execz BB19_2 3796; GFX9-NEXT: ; %bb.1: 3797; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 3798; GFX9-NEXT: v_mov_b32_e32 v3, s4 3799; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3800; GFX9-NEXT: ds_min_rtn_i32 v0, v0, v3 3801; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3802; GFX9-NEXT: BB19_2: 3803; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 3804; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3805; GFX9-NEXT: v_readfirstlane_b32 s2, v0 3806; GFX9-NEXT: v_mov_b32_e32 v0, v1 3807; GFX9-NEXT: v_min_i32_e32 v0, s2, v0 3808; GFX9-NEXT: s_mov_b32 s3, 0xf000 3809; GFX9-NEXT: s_mov_b32 s2, -1 3810; GFX9-NEXT: s_nop 0 3811; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 3812; GFX9-NEXT: s_endpgm 3813; 3814; GFX1064-LABEL: min_i32_varying: 3815; GFX1064: ; %bb.0: ; %entry 3816; GFX1064-NEXT: v_mov_b32_e32 v2, v0 3817; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 3818; GFX1064-NEXT: v_bfrev_b32_e32 v1, -2 3819; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 3820; GFX1064-NEXT: s_not_b64 exec, exec 3821; GFX1064-NEXT: v_mov_b32_e32 v2, v1 3822; GFX1064-NEXT: s_not_b64 exec, exec 3823; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 3824; GFX1064-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3825; GFX1064-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 3826; GFX1064-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 3827; GFX1064-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 3828; 
GFX1064-NEXT: v_mov_b32_e32 v3, v2 3829; GFX1064-NEXT: v_permlanex16_b32 v3, v3, -1, -1 3830; GFX1064-NEXT: v_min_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 3831; GFX1064-NEXT: v_readlane_b32 s4, v2, 31 3832; GFX1064-NEXT: v_mov_b32_e32 v3, s4 3833; GFX1064-NEXT: v_min_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 3834; GFX1064-NEXT: v_readlane_b32 s4, v2, 15 3835; GFX1064-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3836; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 3837; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3838; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 3839; GFX1064-NEXT: v_readlane_b32 s5, v2, 31 3840; GFX1064-NEXT: v_writelane_b32 v1, s4, 16 3841; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 3842; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 3843; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 3844; GFX1064-NEXT: v_readlane_b32 s7, v2, 63 3845; GFX1064-NEXT: v_readlane_b32 s6, v2, 47 3846; GFX1064-NEXT: v_writelane_b32 v1, s5, 32 3847; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 3848; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0 3849; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 3850; GFX1064-NEXT: v_writelane_b32 v1, s6, 48 3851; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 3852; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3853; GFX1064-NEXT: s_mov_b32 s2, -1 3854; GFX1064-NEXT: ; implicit-def: $vgpr0 3855; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 3856; GFX1064-NEXT: s_cbranch_execz BB19_2 3857; GFX1064-NEXT: ; %bb.1: 3858; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo 3859; GFX1064-NEXT: v_mov_b32_e32 v4, s7 3860; GFX1064-NEXT: s_mov_b32 s3, s7 3861; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3862; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 3863; GFX1064-NEXT: ds_min_rtn_i32 v0, v7, v4 3864; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3865; GFX1064-NEXT: buffer_gl0_inv 3866; GFX1064-NEXT: BB19_2: 3867; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 3868; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 3869; GFX1064-NEXT: 
v_readfirstlane_b32 s3, v0 3870; GFX1064-NEXT: v_mov_b32_e32 v0, v1 3871; GFX1064-NEXT: v_min_i32_e32 v0, s3, v0 3872; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 3873; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3874; GFX1064-NEXT: s_nop 0 3875; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 3876; GFX1064-NEXT: s_endpgm 3877; 3878; GFX1032-LABEL: min_i32_varying: 3879; GFX1032: ; %bb.0: ; %entry 3880; GFX1032-NEXT: v_mov_b32_e32 v2, v0 3881; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3882; GFX1032-NEXT: v_bfrev_b32_e32 v1, -2 3883; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3884; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 3885; GFX1032-NEXT: v_mov_b32_e32 v2, v1 3886; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 3887; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3888; GFX1032-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3889; GFX1032-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 3890; GFX1032-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 3891; GFX1032-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 3892; GFX1032-NEXT: v_mov_b32_e32 v3, v2 3893; GFX1032-NEXT: v_permlanex16_b32 v3, v3, -1, -1 3894; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3895; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3896; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3897; GFX1032-NEXT: v_min_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 3898; GFX1032-NEXT: v_readlane_b32 s3, v2, 15 3899; GFX1032-NEXT: v_readlane_b32 s4, v2, 31 3900; GFX1032-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3901; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3902; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 3903; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3904; GFX1032-NEXT: v_writelane_b32 v1, s3, 16 3905; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3906; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 3907; GFX1032-NEXT: s_mov_b32 s2, -1 3908; GFX1032-NEXT: ; implicit-def: $vgpr0 3909; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 3910; 
GFX1032-NEXT: s_cbranch_execz BB19_2 3911; GFX1032-NEXT: ; %bb.1: 3912; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo 3913; GFX1032-NEXT: v_mov_b32_e32 v4, s4 3914; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3915; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 3916; GFX1032-NEXT: ds_min_rtn_i32 v0, v7, v4 3917; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3918; GFX1032-NEXT: buffer_gl0_inv 3919; GFX1032-NEXT: BB19_2: 3920; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 3921; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 3922; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 3923; GFX1032-NEXT: v_mov_b32_e32 v0, v1 3924; GFX1032-NEXT: v_min_i32_e32 v0, s3, v0 3925; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 3926; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3927; GFX1032-NEXT: s_nop 0 3928; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 3929; GFX1032-NEXT: s_endpgm 3930entry: 3931 %lane = call i32 @llvm.amdgcn.workitem.id.x() 3932 %old = atomicrmw min i32 addrspace(3)* @local_var32, i32 %lane acq_rel 3933 store i32 %old, i32 addrspace(1)* %out 3934 ret void 3935} 3936 3937define amdgpu_kernel void @min_i64_constant(i64 addrspace(1)* %out) { 3938; 3939; 3940; GFX7LESS-LABEL: min_i64_constant: 3941; GFX7LESS: ; %bb.0: ; %entry 3942; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 3943; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 3944; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 3945; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3946; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 3947; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc 3948; GFX7LESS-NEXT: s_cbranch_execz BB20_2 3949; GFX7LESS-NEXT: ; %bb.1: 3950; GFX7LESS-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 3951; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5 3952; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 3953; GFX7LESS-NEXT: s_mov_b32 m0, -1 3954; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3955; GFX7LESS-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] 3956; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3957; GFX7LESS-NEXT: BB20_2: 3958; GFX7LESS-NEXT: s_or_b64 exec, exec, 
s[2:3] 3959; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3960; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 3961; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 3962; GFX7LESS-NEXT: v_bfrev_b32_e32 v1, -2 3963; GFX7LESS-NEXT: s_mov_b32 s2, -1 3964; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 3965; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc 3966; GFX7LESS-NEXT: v_mov_b32_e32 v2, s5 3967; GFX7LESS-NEXT: v_mov_b32_e32 v3, s4 3968; GFX7LESS-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] 3969; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 3970; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 3971; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 3972; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 3973; GFX7LESS-NEXT: s_endpgm 3974; 3975; GFX8-LABEL: min_i64_constant: 3976; GFX8: ; %bb.0: ; %entry 3977; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3978; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3979; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 3980; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3981; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 3982; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 3983; GFX8-NEXT: s_cbranch_execz BB20_2 3984; GFX8-NEXT: ; %bb.1: 3985; GFX8-NEXT: v_mov_b32_e32 v0, 5 3986; GFX8-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 3987; GFX8-NEXT: v_mov_b32_e32 v1, 0 3988; GFX8-NEXT: s_mov_b32 m0, -1 3989; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3990; GFX8-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] 3991; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3992; GFX8-NEXT: BB20_2: 3993; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 3994; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3995; GFX8-NEXT: v_readfirstlane_b32 s4, v0 3996; GFX8-NEXT: v_bfrev_b32_e32 v0, -2 3997; GFX8-NEXT: v_readfirstlane_b32 s5, v1 3998; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc 3999; GFX8-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 4000; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] 4001; GFX8-NEXT: v_mov_b32_e32 v2, s5 4002; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 4003; GFX8-NEXT: v_mov_b32_e32 v2, s4 4004; GFX8-NEXT: s_mov_b32 s2, -1 
4005; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 4006; GFX8-NEXT: s_mov_b32 s3, 0xf000 4007; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4008; GFX8-NEXT: s_endpgm 4009; 4010; GFX9-LABEL: min_i64_constant: 4011; GFX9: ; %bb.0: ; %entry 4012; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4013; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4014; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 4015; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4016; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 4017; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 4018; GFX9-NEXT: s_cbranch_execz BB20_2 4019; GFX9-NEXT: ; %bb.1: 4020; GFX9-NEXT: v_mov_b32_e32 v0, 5 4021; GFX9-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 4022; GFX9-NEXT: v_mov_b32_e32 v1, 0 4023; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4024; GFX9-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] 4025; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4026; GFX9-NEXT: BB20_2: 4027; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 4028; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4029; GFX9-NEXT: v_readfirstlane_b32 s4, v0 4030; GFX9-NEXT: v_bfrev_b32_e32 v0, -2 4031; GFX9-NEXT: v_readfirstlane_b32 s5, v1 4032; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc 4033; GFX9-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 4034; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] 4035; GFX9-NEXT: v_mov_b32_e32 v2, s5 4036; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 4037; GFX9-NEXT: v_mov_b32_e32 v2, s4 4038; GFX9-NEXT: s_mov_b32 s2, -1 4039; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 4040; GFX9-NEXT: s_mov_b32 s3, 0xf000 4041; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4042; GFX9-NEXT: s_endpgm 4043; 4044; GFX1064-LABEL: min_i64_constant: 4045; GFX1064: ; %bb.0: ; %entry 4046; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4047; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 4048; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0 4049; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4050; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 4051; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc 4052; 
GFX1064-NEXT: s_cbranch_execz BB20_2 4053; GFX1064-NEXT: ; %bb.1: 4054; GFX1064-NEXT: v_mov_b32_e32 v0, 5 4055; GFX1064-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 4056; GFX1064-NEXT: v_mov_b32_e32 v1, 0 4057; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4058; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 4059; GFX1064-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] 4060; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 4061; GFX1064-NEXT: buffer_gl0_inv 4062; GFX1064-NEXT: BB20_2: 4063; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 4064; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] 4065; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 4066; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 4067; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc 4068; GFX1064-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 4069; GFX1064-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1] 4070; GFX1064-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc 4071; GFX1064-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc 4072; GFX1064-NEXT: s_mov_b32 s2, -1 4073; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 4074; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 4075; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4076; GFX1064-NEXT: s_endpgm 4077; 4078; GFX1032-LABEL: min_i64_constant: 4079; GFX1032: ; %bb.0: ; %entry 4080; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4081; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 4082; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 4083; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 4084; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo 4085; GFX1032-NEXT: s_cbranch_execz BB20_2 4086; GFX1032-NEXT: ; %bb.1: 4087; GFX1032-NEXT: v_mov_b32_e32 v0, 5 4088; GFX1032-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 4089; GFX1032-NEXT: v_mov_b32_e32 v1, 0 4090; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4091; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 4092; GFX1032-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] 4093; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 4094; GFX1032-NEXT: buffer_gl0_inv 4095; GFX1032-NEXT: BB20_2: 4096; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 4097; 
GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 4098; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 4099; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 4100; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc_lo 4101; GFX1032-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc_lo 4102; GFX1032-NEXT: v_cmp_lt_i64_e32 vcc_lo, s[2:3], v[0:1] 4103; GFX1032-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo 4104; GFX1032-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo 4105; GFX1032-NEXT: s_mov_b32 s2, -1 4106; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 4107; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 4108; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4109; GFX1032-NEXT: s_endpgm 4110entry: 4111 %old = atomicrmw min i64 addrspace(3)* @local_var64, i64 5 acq_rel 4112 store i64 %old, i64 addrspace(1)* %out 4113 ret void 4114} 4115 4116define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) { 4117; 4118; 4119; GFX7LESS-LABEL: umax_i32_varying: 4120; GFX7LESS: ; %bb.0: ; %entry 4121; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 4122; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 4123; GFX7LESS-NEXT: s_mov_b32 m0, -1 4124; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 4125; GFX7LESS-NEXT: ds_max_rtn_u32 v0, v1, v0 4126; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 4127; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 4128; GFX7LESS-NEXT: s_mov_b32 s2, -1 4129; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 4130; GFX7LESS-NEXT: s_endpgm 4131; 4132; GFX8-LABEL: umax_i32_varying: 4133; GFX8: ; %bb.0: ; %entry 4134; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4135; GFX8-NEXT: v_mov_b32_e32 v2, v0 4136; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 4137; GFX8-NEXT: v_mov_b32_e32 v1, 0 4138; GFX8-NEXT: s_mov_b64 exec, s[2:3] 4139; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4140; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 4141; GFX8-NEXT: s_not_b64 exec, exec 4142; GFX8-NEXT: v_mov_b32_e32 v2, 0 4143; GFX8-NEXT: s_not_b64 exec, exec 4144; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 4145; GFX8-NEXT: v_max_u32_dpp v2, 
v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 4146; GFX8-NEXT: s_nop 1 4147; GFX8-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 4148; GFX8-NEXT: s_nop 1 4149; GFX8-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 4150; GFX8-NEXT: s_nop 1 4151; GFX8-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 4152; GFX8-NEXT: s_nop 1 4153; GFX8-NEXT: v_max_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 4154; GFX8-NEXT: s_nop 1 4155; GFX8-NEXT: v_max_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 4156; GFX8-NEXT: v_readlane_b32 s4, v2, 63 4157; GFX8-NEXT: s_nop 0 4158; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 4159; GFX8-NEXT: s_mov_b64 exec, s[2:3] 4160; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4161; GFX8-NEXT: ; implicit-def: $vgpr0 4162; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 4163; GFX8-NEXT: s_cbranch_execz BB21_2 4164; GFX8-NEXT: ; %bb.1: 4165; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 4166; GFX8-NEXT: v_mov_b32_e32 v3, s4 4167; GFX8-NEXT: s_mov_b32 m0, -1 4168; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4169; GFX8-NEXT: ds_max_rtn_u32 v0, v0, v3 4170; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4171; GFX8-NEXT: BB21_2: 4172; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 4173; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4174; GFX8-NEXT: v_readfirstlane_b32 s2, v0 4175; GFX8-NEXT: v_mov_b32_e32 v0, v1 4176; GFX8-NEXT: v_max_u32_e32 v0, s2, v0 4177; GFX8-NEXT: s_mov_b32 s3, 0xf000 4178; GFX8-NEXT: s_mov_b32 s2, -1 4179; GFX8-NEXT: s_nop 0 4180; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 4181; GFX8-NEXT: s_endpgm 4182; 4183; GFX9-LABEL: umax_i32_varying: 4184; GFX9: ; %bb.0: ; %entry 4185; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4186; GFX9-NEXT: v_mov_b32_e32 v2, v0 4187; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 4188; GFX9-NEXT: v_mov_b32_e32 v1, 0 4189; GFX9-NEXT: s_mov_b64 exec, s[2:3] 4190; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 
4191; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 4192; GFX9-NEXT: s_not_b64 exec, exec 4193; GFX9-NEXT: v_mov_b32_e32 v2, 0 4194; GFX9-NEXT: s_not_b64 exec, exec 4195; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 4196; GFX9-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 4197; GFX9-NEXT: s_nop 1 4198; GFX9-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 4199; GFX9-NEXT: s_nop 1 4200; GFX9-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 4201; GFX9-NEXT: s_nop 1 4202; GFX9-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 4203; GFX9-NEXT: s_nop 1 4204; GFX9-NEXT: v_max_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 4205; GFX9-NEXT: s_nop 1 4206; GFX9-NEXT: v_max_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 4207; GFX9-NEXT: v_readlane_b32 s4, v2, 63 4208; GFX9-NEXT: s_nop 0 4209; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 4210; GFX9-NEXT: s_mov_b64 exec, s[2:3] 4211; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4212; GFX9-NEXT: ; implicit-def: $vgpr0 4213; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 4214; GFX9-NEXT: s_cbranch_execz BB21_2 4215; GFX9-NEXT: ; %bb.1: 4216; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 4217; GFX9-NEXT: v_mov_b32_e32 v3, s4 4218; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4219; GFX9-NEXT: ds_max_rtn_u32 v0, v0, v3 4220; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4221; GFX9-NEXT: BB21_2: 4222; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 4223; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4224; GFX9-NEXT: v_readfirstlane_b32 s2, v0 4225; GFX9-NEXT: v_mov_b32_e32 v0, v1 4226; GFX9-NEXT: v_max_u32_e32 v0, s2, v0 4227; GFX9-NEXT: s_mov_b32 s3, 0xf000 4228; GFX9-NEXT: s_mov_b32 s2, -1 4229; GFX9-NEXT: s_nop 0 4230; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 4231; GFX9-NEXT: s_endpgm 4232; 4233; GFX1064-LABEL: umax_i32_varying: 4234; GFX1064: ; %bb.0: ; %entry 4235; GFX1064-NEXT: v_mov_b32_e32 v1, v0 4236; 
GFX1064-NEXT: s_not_b64 exec, exec 4237; GFX1064-NEXT: v_mov_b32_e32 v1, 0 4238; GFX1064-NEXT: s_not_b64 exec, exec 4239; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 4240; GFX1064-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 4241; GFX1064-NEXT: v_mov_b32_e32 v3, 0 4242; GFX1064-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 4243; GFX1064-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 4244; GFX1064-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 4245; GFX1064-NEXT: v_mov_b32_e32 v2, v1 4246; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 4247; GFX1064-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 4248; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 4249; GFX1064-NEXT: v_mov_b32_e32 v2, s4 4250; GFX1064-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 4251; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 4252; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 4253; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 4254; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4255; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 4256; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 4257; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 4258; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 4259; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 4260; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 4261; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 4262; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 4263; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 4264; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 4265; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0 4266; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 4267; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 4268; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 4269; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4270; GFX1064-NEXT: s_mov_b32 s2, -1 4271; GFX1064-NEXT: ; implicit-def: $vgpr0 4272; GFX1064-NEXT: 
s_and_saveexec_b64 s[4:5], vcc 4273; GFX1064-NEXT: s_cbranch_execz BB21_2 4274; GFX1064-NEXT: ; %bb.1: 4275; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo 4276; GFX1064-NEXT: v_mov_b32_e32 v4, s7 4277; GFX1064-NEXT: s_mov_b32 s3, s7 4278; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4279; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 4280; GFX1064-NEXT: ds_max_rtn_u32 v0, v7, v4 4281; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 4282; GFX1064-NEXT: buffer_gl0_inv 4283; GFX1064-NEXT: BB21_2: 4284; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 4285; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 4286; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 4287; GFX1064-NEXT: v_mov_b32_e32 v0, v3 4288; GFX1064-NEXT: v_max_u32_e32 v0, s3, v0 4289; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 4290; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 4291; GFX1064-NEXT: s_nop 0 4292; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 4293; GFX1064-NEXT: s_endpgm 4294; 4295; GFX1032-LABEL: umax_i32_varying: 4296; GFX1032: ; %bb.0: ; %entry 4297; GFX1032-NEXT: v_mov_b32_e32 v1, v0 4298; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 4299; GFX1032-NEXT: v_mov_b32_e32 v1, 0 4300; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 4301; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 4302; GFX1032-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 4303; GFX1032-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 4304; GFX1032-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0 4305; GFX1032-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0 4306; GFX1032-NEXT: v_mov_b32_e32 v2, v1 4307; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 4308; GFX1032-NEXT: s_mov_b32 exec_lo, s2 4309; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4310; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 4311; GFX1032-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 4312; GFX1032-NEXT: v_mov_b32_e32 v3, 0 4313; GFX1032-NEXT: v_readlane_b32 s3, 
v1, 15 4314; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 4315; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 4316; GFX1032-NEXT: s_mov_b32 exec_lo, s2 4317; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 4318; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 4319; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 4320; GFX1032-NEXT: s_mov_b32 exec_lo, s2 4321; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 4322; GFX1032-NEXT: s_mov_b32 s2, -1 4323; GFX1032-NEXT: ; implicit-def: $vgpr0 4324; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 4325; GFX1032-NEXT: s_cbranch_execz BB21_2 4326; GFX1032-NEXT: ; %bb.1: 4327; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo 4328; GFX1032-NEXT: v_mov_b32_e32 v4, s4 4329; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4330; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 4331; GFX1032-NEXT: ds_max_rtn_u32 v0, v7, v4 4332; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 4333; GFX1032-NEXT: buffer_gl0_inv 4334; GFX1032-NEXT: BB21_2: 4335; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 4336; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 4337; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 4338; GFX1032-NEXT: v_mov_b32_e32 v0, v3 4339; GFX1032-NEXT: v_max_u32_e32 v0, s3, v0 4340; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 4341; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 4342; GFX1032-NEXT: s_nop 0 4343; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 4344; GFX1032-NEXT: s_endpgm 4345entry: 4346 %lane = call i32 @llvm.amdgcn.workitem.id.x() 4347 %old = atomicrmw umax i32 addrspace(3)* @local_var32, i32 %lane acq_rel 4348 store i32 %old, i32 addrspace(1)* %out 4349 ret void 4350} 4351 4352define amdgpu_kernel void @umax_i64_constant(i64 addrspace(1)* %out) { 4353; 4354; 4355; GFX7LESS-LABEL: umax_i64_constant: 4356; GFX7LESS: ; %bb.0: ; %entry 4357; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 4358; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 4359; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 4360; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4361; 
GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 4362; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc 4363; GFX7LESS-NEXT: s_cbranch_execz BB22_2 4364; GFX7LESS-NEXT: ; %bb.1: 4365; GFX7LESS-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 4366; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5 4367; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 4368; GFX7LESS-NEXT: s_mov_b32 m0, -1 4369; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 4370; GFX7LESS-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] 4371; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 4372; GFX7LESS-NEXT: BB22_2: 4373; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] 4374; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 4375; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 4376; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 4377; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 4378; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 4379; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 4380; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4 4381; GFX7LESS-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[0:1] 4382; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 4383; GFX7LESS-NEXT: v_mov_b32_e32 v1, s5 4384; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc 4385; GFX7LESS-NEXT: s_mov_b32 s2, -1 4386; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4387; GFX7LESS-NEXT: s_endpgm 4388; 4389; GFX8-LABEL: umax_i64_constant: 4390; GFX8: ; %bb.0: ; %entry 4391; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4392; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4393; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 4394; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4395; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 4396; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 4397; GFX8-NEXT: s_cbranch_execz BB22_2 4398; GFX8-NEXT: ; %bb.1: 4399; GFX8-NEXT: v_mov_b32_e32 v0, 5 4400; GFX8-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 4401; GFX8-NEXT: v_mov_b32_e32 v1, 0 4402; GFX8-NEXT: s_mov_b32 m0, -1 4403; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4404; GFX8-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] 4405; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4406; GFX8-NEXT: BB22_2: 4407; GFX8-NEXT: 
s_or_b64 exec, exec, s[2:3] 4408; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4409; GFX8-NEXT: v_readfirstlane_b32 s2, v0 4410; GFX8-NEXT: v_readfirstlane_b32 s3, v1 4411; GFX8-NEXT: v_mov_b32_e32 v1, 0 4412; GFX8-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 4413; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1] 4414; GFX8-NEXT: v_mov_b32_e32 v1, s3 4415; GFX8-NEXT: v_mov_b32_e32 v2, s2 4416; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 4417; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc 4418; GFX8-NEXT: s_mov_b32 s3, 0xf000 4419; GFX8-NEXT: s_mov_b32 s2, -1 4420; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4421; GFX8-NEXT: s_endpgm 4422; 4423; GFX9-LABEL: umax_i64_constant: 4424; GFX9: ; %bb.0: ; %entry 4425; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4426; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4427; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 4428; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4429; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 4430; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 4431; GFX9-NEXT: s_cbranch_execz BB22_2 4432; GFX9-NEXT: ; %bb.1: 4433; GFX9-NEXT: v_mov_b32_e32 v0, 5 4434; GFX9-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 4435; GFX9-NEXT: v_mov_b32_e32 v1, 0 4436; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4437; GFX9-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] 4438; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4439; GFX9-NEXT: BB22_2: 4440; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 4441; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4442; GFX9-NEXT: v_readfirstlane_b32 s2, v0 4443; GFX9-NEXT: v_readfirstlane_b32 s3, v1 4444; GFX9-NEXT: v_mov_b32_e32 v1, 0 4445; GFX9-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 4446; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1] 4447; GFX9-NEXT: v_mov_b32_e32 v1, s3 4448; GFX9-NEXT: v_mov_b32_e32 v2, s2 4449; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 4450; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc 4451; GFX9-NEXT: s_mov_b32 s3, 0xf000 4452; GFX9-NEXT: s_mov_b32 s2, -1 4453; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4454; GFX9-NEXT: s_endpgm 4455; 
4456; GFX1064-LABEL: umax_i64_constant: 4457; GFX1064: ; %bb.0: ; %entry 4458; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4459; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 4460; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0 4461; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4462; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 4463; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc 4464; GFX1064-NEXT: s_cbranch_execz BB22_2 4465; GFX1064-NEXT: ; %bb.1: 4466; GFX1064-NEXT: v_mov_b32_e32 v0, 5 4467; GFX1064-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 4468; GFX1064-NEXT: v_mov_b32_e32 v1, 0 4469; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4470; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 4471; GFX1064-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] 4472; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 4473; GFX1064-NEXT: buffer_gl0_inv 4474; GFX1064-NEXT: BB22_2: 4475; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 4476; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] 4477; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 4478; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 4479; GFX1064-NEXT: v_mov_b32_e32 v1, 0 4480; GFX1064-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 4481; GFX1064-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1] 4482; GFX1064-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc 4483; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, s3, vcc 4484; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 4485; GFX1064-NEXT: s_mov_b32 s2, -1 4486; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 4487; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4488; GFX1064-NEXT: s_endpgm 4489; 4490; GFX1032-LABEL: umax_i64_constant: 4491; GFX1032: ; %bb.0: ; %entry 4492; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4493; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 4494; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 4495; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 4496; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo 4497; GFX1032-NEXT: s_cbranch_execz BB22_2 4498; GFX1032-NEXT: ; %bb.1: 4499; GFX1032-NEXT: v_mov_b32_e32 v0, 5 4500; GFX1032-NEXT: 
v_mov_b32_e32 v2, local_var64@abs32@lo 4501; GFX1032-NEXT: v_mov_b32_e32 v1, 0 4502; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4503; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 4504; GFX1032-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] 4505; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 4506; GFX1032-NEXT: buffer_gl0_inv 4507; GFX1032-NEXT: BB22_2: 4508; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 4509; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 4510; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 4511; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 4512; GFX1032-NEXT: v_mov_b32_e32 v1, 0 4513; GFX1032-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc_lo 4514; GFX1032-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[2:3], v[0:1] 4515; GFX1032-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo 4516; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, s3, vcc_lo 4517; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 4518; GFX1032-NEXT: s_mov_b32 s2, -1 4519; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 4520; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4521; GFX1032-NEXT: s_endpgm 4522entry: 4523 %old = atomicrmw umax i64 addrspace(3)* @local_var64, i64 5 acq_rel 4524 store i64 %old, i64 addrspace(1)* %out 4525 ret void 4526} 4527 4528define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) { 4529; 4530; 4531; GFX7LESS-LABEL: umin_i32_varying: 4532; GFX7LESS: ; %bb.0: ; %entry 4533; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 4534; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 4535; GFX7LESS-NEXT: s_mov_b32 m0, -1 4536; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 4537; GFX7LESS-NEXT: ds_min_rtn_u32 v0, v1, v0 4538; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 4539; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 4540; GFX7LESS-NEXT: s_mov_b32 s2, -1 4541; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 4542; GFX7LESS-NEXT: s_endpgm 4543; 4544; GFX8-LABEL: umin_i32_varying: 4545; GFX8: ; %bb.0: ; %entry 4546; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4547; GFX8-NEXT: v_mov_b32_e32 v2, v0 4548; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4549; 
GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 4550; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 4551; GFX8-NEXT: v_mov_b32_e32 v1, -1 4552; GFX8-NEXT: s_mov_b64 exec, s[2:3] 4553; GFX8-NEXT: s_not_b64 exec, exec 4554; GFX8-NEXT: v_mov_b32_e32 v2, -1 4555; GFX8-NEXT: s_not_b64 exec, exec 4556; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 4557; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 4558; GFX8-NEXT: s_nop 1 4559; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 4560; GFX8-NEXT: s_nop 1 4561; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 4562; GFX8-NEXT: s_nop 1 4563; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 4564; GFX8-NEXT: s_nop 1 4565; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 4566; GFX8-NEXT: s_nop 1 4567; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 4568; GFX8-NEXT: v_readlane_b32 s4, v2, 63 4569; GFX8-NEXT: s_nop 0 4570; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 4571; GFX8-NEXT: s_mov_b64 exec, s[2:3] 4572; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4573; GFX8-NEXT: ; implicit-def: $vgpr0 4574; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 4575; GFX8-NEXT: s_cbranch_execz BB23_2 4576; GFX8-NEXT: ; %bb.1: 4577; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 4578; GFX8-NEXT: v_mov_b32_e32 v3, s4 4579; GFX8-NEXT: s_mov_b32 m0, -1 4580; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4581; GFX8-NEXT: ds_min_rtn_u32 v0, v0, v3 4582; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4583; GFX8-NEXT: BB23_2: 4584; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 4585; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4586; GFX8-NEXT: v_readfirstlane_b32 s2, v0 4587; GFX8-NEXT: v_mov_b32_e32 v0, v1 4588; GFX8-NEXT: v_min_u32_e32 v0, s2, v0 4589; GFX8-NEXT: s_mov_b32 s3, 0xf000 4590; GFX8-NEXT: s_mov_b32 s2, -1 4591; GFX8-NEXT: s_nop 0 4592; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 4593; GFX8-NEXT: s_endpgm 4594; 4595; GFX9-LABEL: 
umin_i32_varying: 4596; GFX9: ; %bb.0: ; %entry 4597; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4598; GFX9-NEXT: v_mov_b32_e32 v2, v0 4599; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4600; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 4601; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 4602; GFX9-NEXT: v_mov_b32_e32 v1, -1 4603; GFX9-NEXT: s_mov_b64 exec, s[2:3] 4604; GFX9-NEXT: s_not_b64 exec, exec 4605; GFX9-NEXT: v_mov_b32_e32 v2, -1 4606; GFX9-NEXT: s_not_b64 exec, exec 4607; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 4608; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 4609; GFX9-NEXT: s_nop 1 4610; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 4611; GFX9-NEXT: s_nop 1 4612; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 4613; GFX9-NEXT: s_nop 1 4614; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 4615; GFX9-NEXT: s_nop 1 4616; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 4617; GFX9-NEXT: s_nop 1 4618; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 4619; GFX9-NEXT: v_readlane_b32 s4, v2, 63 4620; GFX9-NEXT: s_nop 0 4621; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 4622; GFX9-NEXT: s_mov_b64 exec, s[2:3] 4623; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4624; GFX9-NEXT: ; implicit-def: $vgpr0 4625; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 4626; GFX9-NEXT: s_cbranch_execz BB23_2 4627; GFX9-NEXT: ; %bb.1: 4628; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 4629; GFX9-NEXT: v_mov_b32_e32 v3, s4 4630; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4631; GFX9-NEXT: ds_min_rtn_u32 v0, v0, v3 4632; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4633; GFX9-NEXT: BB23_2: 4634; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 4635; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4636; GFX9-NEXT: v_readfirstlane_b32 s2, v0 4637; GFX9-NEXT: v_mov_b32_e32 v0, v1 4638; GFX9-NEXT: v_min_u32_e32 v0, s2, v0 4639; GFX9-NEXT: s_mov_b32 s3, 0xf000 
4640; GFX9-NEXT: s_mov_b32 s2, -1 4641; GFX9-NEXT: s_nop 0 4642; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 4643; GFX9-NEXT: s_endpgm 4644; 4645; GFX1064-LABEL: umin_i32_varying: 4646; GFX1064: ; %bb.0: ; %entry 4647; GFX1064-NEXT: v_mov_b32_e32 v1, v0 4648; GFX1064-NEXT: s_not_b64 exec, exec 4649; GFX1064-NEXT: v_mov_b32_e32 v1, -1 4650; GFX1064-NEXT: s_not_b64 exec, exec 4651; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 4652; GFX1064-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 4653; GFX1064-NEXT: v_mov_b32_e32 v3, -1 4654; GFX1064-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 4655; GFX1064-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 4656; GFX1064-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 4657; GFX1064-NEXT: v_mov_b32_e32 v2, v1 4658; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 4659; GFX1064-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 4660; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 4661; GFX1064-NEXT: v_mov_b32_e32 v2, s4 4662; GFX1064-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 4663; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 4664; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 4665; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 4666; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4667; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 4668; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 4669; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 4670; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 4671; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 4672; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 4673; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 4674; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 4675; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 4676; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 4677; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0 4678; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 4679; GFX1064-NEXT: 
v_writelane_b32 v3, s6, 48 4680; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 4681; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4682; GFX1064-NEXT: s_mov_b32 s2, -1 4683; GFX1064-NEXT: ; implicit-def: $vgpr0 4684; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 4685; GFX1064-NEXT: s_cbranch_execz BB23_2 4686; GFX1064-NEXT: ; %bb.1: 4687; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo 4688; GFX1064-NEXT: v_mov_b32_e32 v4, s7 4689; GFX1064-NEXT: s_mov_b32 s3, s7 4690; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4691; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 4692; GFX1064-NEXT: ds_min_rtn_u32 v0, v7, v4 4693; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 4694; GFX1064-NEXT: buffer_gl0_inv 4695; GFX1064-NEXT: BB23_2: 4696; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 4697; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 4698; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 4699; GFX1064-NEXT: v_mov_b32_e32 v0, v3 4700; GFX1064-NEXT: v_min_u32_e32 v0, s3, v0 4701; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 4702; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 4703; GFX1064-NEXT: s_nop 0 4704; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 4705; GFX1064-NEXT: s_endpgm 4706; 4707; GFX1032-LABEL: umin_i32_varying: 4708; GFX1032: ; %bb.0: ; %entry 4709; GFX1032-NEXT: v_mov_b32_e32 v1, v0 4710; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 4711; GFX1032-NEXT: v_mov_b32_e32 v1, -1 4712; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 4713; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 4714; GFX1032-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 4715; GFX1032-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 4716; GFX1032-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 4717; GFX1032-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 4718; GFX1032-NEXT: v_mov_b32_e32 v2, v1 4719; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 4720; GFX1032-NEXT: s_mov_b32 exec_lo, s2 4721; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4722; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 4723; 
GFX1032-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 4724; GFX1032-NEXT: v_mov_b32_e32 v3, -1 4725; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 4726; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 4727; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 4728; GFX1032-NEXT: s_mov_b32 exec_lo, s2 4729; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 4730; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 4731; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 4732; GFX1032-NEXT: s_mov_b32 exec_lo, s2 4733; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 4734; GFX1032-NEXT: s_mov_b32 s2, -1 4735; GFX1032-NEXT: ; implicit-def: $vgpr0 4736; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 4737; GFX1032-NEXT: s_cbranch_execz BB23_2 4738; GFX1032-NEXT: ; %bb.1: 4739; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo 4740; GFX1032-NEXT: v_mov_b32_e32 v4, s4 4741; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4742; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 4743; GFX1032-NEXT: ds_min_rtn_u32 v0, v7, v4 4744; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 4745; GFX1032-NEXT: buffer_gl0_inv 4746; GFX1032-NEXT: BB23_2: 4747; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 4748; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 4749; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 4750; GFX1032-NEXT: v_mov_b32_e32 v0, v3 4751; GFX1032-NEXT: v_min_u32_e32 v0, s3, v0 4752; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 4753; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 4754; GFX1032-NEXT: s_nop 0 4755; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 4756; GFX1032-NEXT: s_endpgm 4757entry: 4758 %lane = call i32 @llvm.amdgcn.workitem.id.x() 4759 %old = atomicrmw umin i32 addrspace(3)* @local_var32, i32 %lane acq_rel 4760 store i32 %old, i32 addrspace(1)* %out 4761 ret void 4762} 4763 4764define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) { 4765; 4766; 4767; GFX7LESS-LABEL: umin_i64_constant: 4768; GFX7LESS: ; %bb.0: ; %entry 4769; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 4770; 
GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 4771; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 4772; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4773; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 4774; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc 4775; GFX7LESS-NEXT: s_cbranch_execz BB24_2 4776; GFX7LESS-NEXT: ; %bb.1: 4777; GFX7LESS-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 4778; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5 4779; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 4780; GFX7LESS-NEXT: s_mov_b32 m0, -1 4781; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 4782; GFX7LESS-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1] 4783; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 4784; GFX7LESS-NEXT: BB24_2: 4785; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] 4786; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 4787; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 4788; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 4789; GFX7LESS-NEXT: s_mov_b32 s2, -1 4790; GFX7LESS-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc 4791; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 4792; GFX7LESS-NEXT: v_mov_b32_e32 v2, s5 4793; GFX7LESS-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1] 4794; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 4795; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4 4796; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 4797; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 4798; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4799; GFX7LESS-NEXT: s_endpgm 4800; 4801; GFX8-LABEL: umin_i64_constant: 4802; GFX8: ; %bb.0: ; %entry 4803; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4804; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4805; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 4806; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4807; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 4808; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 4809; GFX8-NEXT: s_cbranch_execz BB24_2 4810; GFX8-NEXT: ; %bb.1: 4811; GFX8-NEXT: v_mov_b32_e32 v0, 5 4812; GFX8-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 4813; GFX8-NEXT: v_mov_b32_e32 v1, 0 4814; GFX8-NEXT: s_mov_b32 
m0, -1 4815; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4816; GFX8-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1] 4817; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4818; GFX8-NEXT: BB24_2: 4819; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 4820; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4821; GFX8-NEXT: v_readfirstlane_b32 s5, v1 4822; GFX8-NEXT: v_readfirstlane_b32 s4, v0 4823; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc 4824; GFX8-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 4825; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1] 4826; GFX8-NEXT: v_mov_b32_e32 v2, s5 4827; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 4828; GFX8-NEXT: v_mov_b32_e32 v2, s4 4829; GFX8-NEXT: s_mov_b32 s2, -1 4830; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 4831; GFX8-NEXT: s_mov_b32 s3, 0xf000 4832; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4833; GFX8-NEXT: s_endpgm 4834; 4835; GFX9-LABEL: umin_i64_constant: 4836; GFX9: ; %bb.0: ; %entry 4837; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4838; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4839; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 4840; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4841; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 4842; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 4843; GFX9-NEXT: s_cbranch_execz BB24_2 4844; GFX9-NEXT: ; %bb.1: 4845; GFX9-NEXT: v_mov_b32_e32 v0, 5 4846; GFX9-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 4847; GFX9-NEXT: v_mov_b32_e32 v1, 0 4848; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4849; GFX9-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1] 4850; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4851; GFX9-NEXT: BB24_2: 4852; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 4853; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4854; GFX9-NEXT: v_readfirstlane_b32 s5, v1 4855; GFX9-NEXT: v_readfirstlane_b32 s4, v0 4856; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc 4857; GFX9-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 4858; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1] 4859; GFX9-NEXT: v_mov_b32_e32 v2, s5 4860; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 4861; GFX9-NEXT: v_mov_b32_e32 v2, s4 4862; 
GFX9-NEXT: s_mov_b32 s2, -1 4863; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 4864; GFX9-NEXT: s_mov_b32 s3, 0xf000 4865; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4866; GFX9-NEXT: s_endpgm 4867; 4868; GFX1064-LABEL: umin_i64_constant: 4869; GFX1064: ; %bb.0: ; %entry 4870; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4871; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 4872; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0 4873; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4874; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 4875; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc 4876; GFX1064-NEXT: s_cbranch_execz BB24_2 4877; GFX1064-NEXT: ; %bb.1: 4878; GFX1064-NEXT: v_mov_b32_e32 v0, 5 4879; GFX1064-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 4880; GFX1064-NEXT: v_mov_b32_e32 v1, 0 4881; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4882; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 4883; GFX1064-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1] 4884; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 4885; GFX1064-NEXT: buffer_gl0_inv 4886; GFX1064-NEXT: BB24_2: 4887; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 4888; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] 4889; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 4890; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 4891; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc 4892; GFX1064-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 4893; GFX1064-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1] 4894; GFX1064-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc 4895; GFX1064-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc 4896; GFX1064-NEXT: s_mov_b32 s2, -1 4897; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 4898; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 4899; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4900; GFX1064-NEXT: s_endpgm 4901; 4902; GFX1032-LABEL: umin_i64_constant: 4903; GFX1032: ; %bb.0: ; %entry 4904; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4905; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 4906; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 4907; 
GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 4908; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo 4909; GFX1032-NEXT: s_cbranch_execz BB24_2 4910; GFX1032-NEXT: ; %bb.1: 4911; GFX1032-NEXT: v_mov_b32_e32 v0, 5 4912; GFX1032-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 4913; GFX1032-NEXT: v_mov_b32_e32 v1, 0 4914; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4915; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 4916; GFX1032-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1] 4917; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 4918; GFX1032-NEXT: buffer_gl0_inv 4919; GFX1032-NEXT: BB24_2: 4920; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 4921; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 4922; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 4923; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 4924; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo 4925; GFX1032-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc_lo 4926; GFX1032-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[2:3], v[0:1] 4927; GFX1032-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo 4928; GFX1032-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo 4929; GFX1032-NEXT: s_mov_b32 s2, -1 4930; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 4931; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 4932; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4933; GFX1032-NEXT: s_endpgm 4934entry: 4935 %old = atomicrmw umin i64 addrspace(3)* @local_var64, i64 5 acq_rel 4936 store i64 %old, i64 addrspace(1)* %out 4937 ret void 4938} 4939
; Summary of the @umin_i64_constant test above. It is written as a trailing
; comment so that the autogenerated check lines stay untouched.
;
; The kernel issues one "atomicrmw umin" of the constant 5 on the i64
; addrspace(3) global @local_var64 with acq_rel ordering, then stores the
; returned old value through %out.
;
; With -amdgpu-atomic-optimizations=true the expectations for every checked
; target share the same overall shape
;  - v_mbcnt_* on exec, a v_cmp_eq_u32 of that count against 0 and an
;    s_and_saveexec on the result, so the DS atomic sits in a block guarded by
;    that compare,
;  - a single ds_min_rtn_u64 whose data operand pair is loaded with (5, 0),
;  - v_readfirstlane_b32 on both halves of the returned value once exec has
;    been restored,
;  - a v_cndmask choice between the 64-bit values 5 and -1 keyed on vcc, then
;    v_cmp_lt_u64 plus v_cndmask to form the value written by
;    buffer_store_dwordx2.
;
; Visible per-target differences - the GFX7LESS and GFX8 expectations write -1
; to m0 before the DS instruction, the two gfx1010 runs place s_waitcnt_vscnt
; before it and buffer_gl0_inv after it, and the wave32 run uses only
; v_mbcnt_lo together with 32-bit exec operations.
;
; The assertions come from utils/update_llc_test_checks.py, so regenerate them
; with that script instead of editing them by hand.