1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc -march=amdgcn -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX7LESS %s 3; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s 4; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s 5; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX1064 %s 6; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX1032 %s 7 8declare i32 @llvm.amdgcn.workitem.id.x() 9 10@local_var32 = addrspace(3) global i32 undef, align 4 11@local_var64 = addrspace(3) global i64 undef, align 8 12 13; Show what the atomic optimization pass (enabled by -amdgpu-atomic-optimizations=true in every RUN line above) will do for atomicrmw operations on local pointers, i.e. on the addrspace(3) globals declared above. The CHECK lines in this file are autogenerated; regenerate them with utils/update_llc_test_checks.py instead of editing them by hand. 
14 15define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out) { 16; 17; 18; GFX7LESS-LABEL: add_i32_constant: 19; GFX7LESS: ; %bb.0: ; %entry 20; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec 21; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 22; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 23; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 24; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 25; GFX7LESS-NEXT: ; implicit-def: $vgpr1 26; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc 27; GFX7LESS-NEXT: s_cbranch_execz BB0_2 28; GFX7LESS-NEXT: ; %bb.1: 29; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 30; GFX7LESS-NEXT: s_mul_i32 s2, s2, 5 31; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 32; GFX7LESS-NEXT: v_mov_b32_e32 v2, s2 33; GFX7LESS-NEXT: s_mov_b32 m0, -1 34; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 35; GFX7LESS-NEXT: ds_add_rtn_u32 v1, v1, v2 36; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 37; GFX7LESS-NEXT: BB0_2: 38; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] 39; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 40; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v1 41; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 42; GFX7LESS-NEXT: v_mad_u32_u24 v0, v0, 5, s2 43; GFX7LESS-NEXT: s_mov_b32 s2, -1 44; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 45; GFX7LESS-NEXT: s_endpgm 46; 47; GFX8-LABEL: add_i32_constant: 48; GFX8: ; %bb.0: ; %entry 49; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 50; GFX8-NEXT: s_mov_b64 s[2:3], exec 51; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 52; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 53; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 54; GFX8-NEXT: ; implicit-def: $vgpr1 55; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 56; GFX8-NEXT: s_cbranch_execz BB0_2 57; GFX8-NEXT: ; %bb.1: 58; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 59; GFX8-NEXT: s_mul_i32 s2, s2, 5 60; GFX8-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 61; GFX8-NEXT: v_mov_b32_e32 v2, s2 62; GFX8-NEXT: s_mov_b32 m0, -1 63; GFX8-NEXT: s_waitcnt lgkmcnt(0) 64; GFX8-NEXT: ds_add_rtn_u32 v1, v1, v2 65; GFX8-NEXT: 
s_waitcnt lgkmcnt(0) 66; GFX8-NEXT: BB0_2: 67; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 68; GFX8-NEXT: s_waitcnt lgkmcnt(0) 69; GFX8-NEXT: v_readfirstlane_b32 s2, v1 70; GFX8-NEXT: v_mad_u32_u24 v0, v0, 5, s2 71; GFX8-NEXT: s_mov_b32 s3, 0xf000 72; GFX8-NEXT: s_mov_b32 s2, -1 73; GFX8-NEXT: s_nop 1 74; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 75; GFX8-NEXT: s_endpgm 76; 77; GFX9-LABEL: add_i32_constant: 78; GFX9: ; %bb.0: ; %entry 79; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 80; GFX9-NEXT: s_mov_b64 s[2:3], exec 81; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 82; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 83; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 84; GFX9-NEXT: ; implicit-def: $vgpr1 85; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 86; GFX9-NEXT: s_cbranch_execz BB0_2 87; GFX9-NEXT: ; %bb.1: 88; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 89; GFX9-NEXT: s_mul_i32 s2, s2, 5 90; GFX9-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 91; GFX9-NEXT: v_mov_b32_e32 v2, s2 92; GFX9-NEXT: s_waitcnt lgkmcnt(0) 93; GFX9-NEXT: ds_add_rtn_u32 v1, v1, v2 94; GFX9-NEXT: s_waitcnt lgkmcnt(0) 95; GFX9-NEXT: BB0_2: 96; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 97; GFX9-NEXT: s_waitcnt lgkmcnt(0) 98; GFX9-NEXT: v_readfirstlane_b32 s2, v1 99; GFX9-NEXT: v_mad_u32_u24 v0, v0, 5, s2 100; GFX9-NEXT: s_mov_b32 s3, 0xf000 101; GFX9-NEXT: s_mov_b32 s2, -1 102; GFX9-NEXT: s_nop 1 103; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 104; GFX9-NEXT: s_endpgm 105; 106; GFX1064-LABEL: add_i32_constant: 107; GFX1064: ; %bb.0: ; %entry 108; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 109; GFX1064-NEXT: s_mov_b64 s[2:3], exec 110; GFX1064-NEXT: ; implicit-def: $vgpr1 111; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 112; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s3, v0 113; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 114; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 115; GFX1064-NEXT: s_cbranch_execz BB0_2 116; GFX1064-NEXT: ; %bb.1: 117; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 118; GFX1064-NEXT: 
v_mov_b32_e32 v1, local_var32@abs32@lo 119; GFX1064-NEXT: s_mul_i32 s2, s2, 5 120; GFX1064-NEXT: v_mov_b32_e32 v2, s2 121; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 122; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 123; GFX1064-NEXT: ds_add_rtn_u32 v1, v1, v2 124; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 125; GFX1064-NEXT: buffer_gl0_inv 126; GFX1064-NEXT: BB0_2: 127; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 128; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 129; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 130; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 131; GFX1064-NEXT: v_mad_u32_u24 v0, v0, 5, s2 132; GFX1064-NEXT: s_mov_b32 s2, -1 133; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 134; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 135; GFX1064-NEXT: s_endpgm 136; 137; GFX1032-LABEL: add_i32_constant: 138; GFX1032: ; %bb.0: ; %entry 139; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 140; GFX1032-NEXT: s_mov_b32 s3, exec_lo 141; GFX1032-NEXT: ; implicit-def: $vgpr1 142; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s3, 0 143; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 144; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo 145; GFX1032-NEXT: s_cbranch_execz BB0_2 146; GFX1032-NEXT: ; %bb.1: 147; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 148; GFX1032-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 149; GFX1032-NEXT: s_mul_i32 s3, s3, 5 150; GFX1032-NEXT: v_mov_b32_e32 v2, s3 151; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 152; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 153; GFX1032-NEXT: ds_add_rtn_u32 v1, v1, v2 154; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 155; GFX1032-NEXT: buffer_gl0_inv 156; GFX1032-NEXT: BB0_2: 157; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 158; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 159; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 160; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 161; GFX1032-NEXT: v_mad_u32_u24 v0, v0, 5, s2 162; GFX1032-NEXT: s_mov_b32 s2, -1 163; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 164; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 165; GFX1032-NEXT: s_endpgm 166entry: 167 
%old = atomicrmw add i32 addrspace(3)* @local_var32, i32 5 acq_rel 168 store i32 %old, i32 addrspace(1)* %out 169 ret void 170} 171 172define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 %additive) { 173; 174; 175; GFX7LESS-LABEL: add_i32_uniform: 176; GFX7LESS: ; %bb.0: ; %entry 177; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec 178; GFX7LESS-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 179; GFX7LESS-NEXT: s_load_dword s0, s[0:1], 0xb 180; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 181; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 182; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 183; GFX7LESS-NEXT: ; implicit-def: $vgpr1 184; GFX7LESS-NEXT: s_and_saveexec_b64 s[6:7], vcc 185; GFX7LESS-NEXT: s_cbranch_execz BB1_2 186; GFX7LESS-NEXT: ; %bb.1: 187; GFX7LESS-NEXT: s_bcnt1_i32_b64 s1, s[2:3] 188; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 189; GFX7LESS-NEXT: s_mul_i32 s1, s0, s1 190; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 191; GFX7LESS-NEXT: v_mov_b32_e32 v2, s1 192; GFX7LESS-NEXT: s_mov_b32 m0, -1 193; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 194; GFX7LESS-NEXT: ds_add_rtn_u32 v1, v1, v2 195; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 196; GFX7LESS-NEXT: BB1_2: 197; GFX7LESS-NEXT: s_or_b64 exec, exec, s[6:7] 198; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 199; GFX7LESS-NEXT: v_readfirstlane_b32 s1, v1 200; GFX7LESS-NEXT: v_mul_lo_u32 v0, s0, v0 201; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 202; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s1, v0 203; GFX7LESS-NEXT: s_mov_b32 s6, -1 204; GFX7LESS-NEXT: buffer_store_dword v0, off, s[4:7], 0 205; GFX7LESS-NEXT: s_endpgm 206; 207; GFX8-LABEL: add_i32_uniform: 208; GFX8: ; %bb.0: ; %entry 209; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 210; GFX8-NEXT: s_load_dword s0, s[0:1], 0x2c 211; GFX8-NEXT: s_mov_b64 s[2:3], exec 212; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 213; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 214; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 215; GFX8-NEXT: ; implicit-def: $vgpr1 216; GFX8-NEXT: s_and_saveexec_b64 
s[6:7], vcc 217; GFX8-NEXT: s_cbranch_execz BB1_2 218; GFX8-NEXT: ; %bb.1: 219; GFX8-NEXT: s_bcnt1_i32_b64 s1, s[2:3] 220; GFX8-NEXT: s_waitcnt lgkmcnt(0) 221; GFX8-NEXT: s_mul_i32 s1, s0, s1 222; GFX8-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 223; GFX8-NEXT: v_mov_b32_e32 v2, s1 224; GFX8-NEXT: s_mov_b32 m0, -1 225; GFX8-NEXT: s_waitcnt lgkmcnt(0) 226; GFX8-NEXT: ds_add_rtn_u32 v1, v1, v2 227; GFX8-NEXT: s_waitcnt lgkmcnt(0) 228; GFX8-NEXT: BB1_2: 229; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] 230; GFX8-NEXT: s_waitcnt lgkmcnt(0) 231; GFX8-NEXT: v_mul_lo_u32 v0, s0, v0 232; GFX8-NEXT: v_readfirstlane_b32 s0, v1 233; GFX8-NEXT: s_mov_b32 s7, 0xf000 234; GFX8-NEXT: s_mov_b32 s6, -1 235; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 236; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0 237; GFX8-NEXT: s_endpgm 238; 239; GFX9-LABEL: add_i32_uniform: 240; GFX9: ; %bb.0: ; %entry 241; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 242; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c 243; GFX9-NEXT: s_mov_b64 s[6:7], exec 244; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 245; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 246; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 247; GFX9-NEXT: ; implicit-def: $vgpr1 248; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc 249; GFX9-NEXT: s_cbranch_execz BB1_2 250; GFX9-NEXT: ; %bb.1: 251; GFX9-NEXT: s_bcnt1_i32_b64 s3, s[6:7] 252; GFX9-NEXT: s_waitcnt lgkmcnt(0) 253; GFX9-NEXT: s_mul_i32 s3, s2, s3 254; GFX9-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 255; GFX9-NEXT: v_mov_b32_e32 v2, s3 256; GFX9-NEXT: s_waitcnt lgkmcnt(0) 257; GFX9-NEXT: ds_add_rtn_u32 v1, v1, v2 258; GFX9-NEXT: s_waitcnt lgkmcnt(0) 259; GFX9-NEXT: BB1_2: 260; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] 261; GFX9-NEXT: s_waitcnt lgkmcnt(0) 262; GFX9-NEXT: v_mul_lo_u32 v0, s2, v0 263; GFX9-NEXT: v_readfirstlane_b32 s0, v1 264; GFX9-NEXT: s_mov_b32 s7, 0xf000 265; GFX9-NEXT: s_mov_b32 s6, -1 266; GFX9-NEXT: v_add_u32_e32 v0, s0, v0 267; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 268; GFX9-NEXT: 
s_endpgm 269; 270; GFX1064-LABEL: add_i32_uniform: 271; GFX1064: ; %bb.0: ; %entry 272; GFX1064-NEXT: s_clause 0x1 273; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 274; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x2c 275; GFX1064-NEXT: s_mov_b64 s[6:7], exec 276; GFX1064-NEXT: ; implicit-def: $vgpr1 277; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 278; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s7, v0 279; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 280; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc 281; GFX1064-NEXT: s_cbranch_execz BB1_2 282; GFX1064-NEXT: ; %bb.1: 283; GFX1064-NEXT: s_bcnt1_i32_b64 s3, s[6:7] 284; GFX1064-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 285; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 286; GFX1064-NEXT: s_mul_i32 s3, s2, s3 287; GFX1064-NEXT: v_mov_b32_e32 v2, s3 288; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 289; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 290; GFX1064-NEXT: ds_add_rtn_u32 v1, v1, v2 291; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 292; GFX1064-NEXT: buffer_gl0_inv 293; GFX1064-NEXT: BB1_2: 294; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 295; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] 296; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 297; GFX1064-NEXT: v_mul_lo_u32 v0, s2, v0 298; GFX1064-NEXT: v_readfirstlane_b32 s0, v1 299; GFX1064-NEXT: s_mov_b32 s7, 0x31016000 300; GFX1064-NEXT: s_mov_b32 s6, -1 301; GFX1064-NEXT: v_add_nc_u32_e32 v0, s0, v0 302; GFX1064-NEXT: buffer_store_dword v0, off, s[4:7], 0 303; GFX1064-NEXT: s_endpgm 304; 305; GFX1032-LABEL: add_i32_uniform: 306; GFX1032: ; %bb.0: ; %entry 307; GFX1032-NEXT: s_clause 0x1 308; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 309; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x2c 310; GFX1032-NEXT: s_mov_b32 s3, exec_lo 311; GFX1032-NEXT: ; implicit-def: $vgpr1 312; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s3, 0 313; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 314; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo 315; GFX1032-NEXT: s_cbranch_execz BB1_2 316; GFX1032-NEXT: ; %bb.1: 317; GFX1032-NEXT: 
s_bcnt1_i32_b32 s1, s3 318; GFX1032-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 319; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 320; GFX1032-NEXT: s_mul_i32 s1, s2, s1 321; GFX1032-NEXT: v_mov_b32_e32 v2, s1 322; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 323; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 324; GFX1032-NEXT: ds_add_rtn_u32 v1, v1, v2 325; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 326; GFX1032-NEXT: buffer_gl0_inv 327; GFX1032-NEXT: BB1_2: 328; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 329; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 330; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 331; GFX1032-NEXT: v_mul_lo_u32 v0, s2, v0 332; GFX1032-NEXT: v_readfirstlane_b32 s0, v1 333; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 334; GFX1032-NEXT: s_mov_b32 s6, -1 335; GFX1032-NEXT: v_add_nc_u32_e32 v0, s0, v0 336; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0 337; GFX1032-NEXT: s_endpgm 338entry: 339 %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %additive acq_rel 340 store i32 %old, i32 addrspace(1)* %out 341 ret void 342} 343 344define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out) { 345; 346; 347; GFX7LESS-LABEL: add_i32_varying: 348; GFX7LESS: ; %bb.0: ; %entry 349; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 350; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 351; GFX7LESS-NEXT: s_mov_b32 m0, -1 352; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 353; GFX7LESS-NEXT: ds_add_rtn_u32 v0, v1, v0 354; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 355; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 356; GFX7LESS-NEXT: s_mov_b32 s2, -1 357; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 358; GFX7LESS-NEXT: s_endpgm 359; 360; GFX8-LABEL: add_i32_varying: 361; GFX8: ; %bb.0: ; %entry 362; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 363; GFX8-NEXT: v_mov_b32_e32 v2, v0 364; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 365; GFX8-NEXT: v_mov_b32_e32 v1, 0 366; GFX8-NEXT: s_mov_b64 exec, s[2:3] 367; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 368; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, 
v0 369; GFX8-NEXT: s_not_b64 exec, exec 370; GFX8-NEXT: v_mov_b32_e32 v2, 0 371; GFX8-NEXT: s_not_b64 exec, exec 372; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 373; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 374; GFX8-NEXT: s_nop 1 375; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 376; GFX8-NEXT: s_nop 1 377; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 378; GFX8-NEXT: s_nop 1 379; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 380; GFX8-NEXT: s_nop 1 381; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 382; GFX8-NEXT: s_nop 1 383; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 384; GFX8-NEXT: v_readlane_b32 s4, v2, 63 385; GFX8-NEXT: s_nop 0 386; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 387; GFX8-NEXT: s_mov_b64 exec, s[2:3] 388; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 389; GFX8-NEXT: ; implicit-def: $vgpr0 390; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 391; GFX8-NEXT: s_cbranch_execz BB2_2 392; GFX8-NEXT: ; %bb.1: 393; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 394; GFX8-NEXT: v_mov_b32_e32 v3, s4 395; GFX8-NEXT: s_mov_b32 m0, -1 396; GFX8-NEXT: s_waitcnt lgkmcnt(0) 397; GFX8-NEXT: ds_add_rtn_u32 v0, v0, v3 398; GFX8-NEXT: s_waitcnt lgkmcnt(0) 399; GFX8-NEXT: BB2_2: 400; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 401; GFX8-NEXT: s_waitcnt lgkmcnt(0) 402; GFX8-NEXT: v_readfirstlane_b32 s2, v0 403; GFX8-NEXT: v_mov_b32_e32 v0, v1 404; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0 405; GFX8-NEXT: s_mov_b32 s3, 0xf000 406; GFX8-NEXT: s_mov_b32 s2, -1 407; GFX8-NEXT: s_nop 0 408; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 409; GFX8-NEXT: s_endpgm 410; 411; GFX9-LABEL: add_i32_varying: 412; GFX9: ; %bb.0: ; %entry 413; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 414; GFX9-NEXT: 
v_mov_b32_e32 v2, v0 415; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 416; GFX9-NEXT: v_mov_b32_e32 v1, 0 417; GFX9-NEXT: s_mov_b64 exec, s[2:3] 418; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 419; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 420; GFX9-NEXT: s_not_b64 exec, exec 421; GFX9-NEXT: v_mov_b32_e32 v2, 0 422; GFX9-NEXT: s_not_b64 exec, exec 423; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 424; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 425; GFX9-NEXT: s_nop 1 426; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 427; GFX9-NEXT: s_nop 1 428; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 429; GFX9-NEXT: s_nop 1 430; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 431; GFX9-NEXT: s_nop 1 432; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 433; GFX9-NEXT: s_nop 1 434; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 435; GFX9-NEXT: v_readlane_b32 s4, v2, 63 436; GFX9-NEXT: s_nop 0 437; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 438; GFX9-NEXT: s_mov_b64 exec, s[2:3] 439; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 440; GFX9-NEXT: ; implicit-def: $vgpr0 441; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 442; GFX9-NEXT: s_cbranch_execz BB2_2 443; GFX9-NEXT: ; %bb.1: 444; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 445; GFX9-NEXT: v_mov_b32_e32 v3, s4 446; GFX9-NEXT: s_waitcnt lgkmcnt(0) 447; GFX9-NEXT: ds_add_rtn_u32 v0, v0, v3 448; GFX9-NEXT: s_waitcnt lgkmcnt(0) 449; GFX9-NEXT: BB2_2: 450; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 451; GFX9-NEXT: s_waitcnt lgkmcnt(0) 452; GFX9-NEXT: v_readfirstlane_b32 s2, v0 453; GFX9-NEXT: v_mov_b32_e32 v0, v1 454; GFX9-NEXT: v_add_u32_e32 v0, s2, v0 455; GFX9-NEXT: s_mov_b32 s3, 0xf000 456; GFX9-NEXT: s_mov_b32 s2, -1 457; GFX9-NEXT: s_nop 0 458; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 
459; GFX9-NEXT: s_endpgm 460; 461; GFX1064-LABEL: add_i32_varying: 462; GFX1064: ; %bb.0: ; %entry 463; GFX1064-NEXT: v_mov_b32_e32 v1, v0 464; GFX1064-NEXT: s_not_b64 exec, exec 465; GFX1064-NEXT: v_mov_b32_e32 v1, 0 466; GFX1064-NEXT: s_not_b64 exec, exec 467; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 468; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 469; GFX1064-NEXT: v_mov_b32_e32 v3, 0 470; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 471; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 472; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 473; GFX1064-NEXT: v_mov_b32_e32 v2, v1 474; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 475; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 476; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 477; GFX1064-NEXT: v_mov_b32_e32 v2, s4 478; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 479; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 480; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 481; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 482; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 483; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 484; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 485; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 486; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 487; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 488; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 489; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 490; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 491; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 492; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 493; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0 494; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 495; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 496; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 497; GFX1064-NEXT: 
v_cmp_eq_u32_e32 vcc, 0, v0 498; GFX1064-NEXT: s_mov_b32 s2, -1 499; GFX1064-NEXT: ; implicit-def: $vgpr0 500; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 501; GFX1064-NEXT: s_cbranch_execz BB2_2 502; GFX1064-NEXT: ; %bb.1: 503; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo 504; GFX1064-NEXT: v_mov_b32_e32 v4, s7 505; GFX1064-NEXT: s_mov_b32 s3, s7 506; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 507; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 508; GFX1064-NEXT: ds_add_rtn_u32 v0, v7, v4 509; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 510; GFX1064-NEXT: buffer_gl0_inv 511; GFX1064-NEXT: BB2_2: 512; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 513; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 514; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 515; GFX1064-NEXT: v_mov_b32_e32 v0, v3 516; GFX1064-NEXT: v_add_nc_u32_e32 v0, s3, v0 517; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 518; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 519; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 520; GFX1064-NEXT: s_endpgm 521; 522; GFX1032-LABEL: add_i32_varying: 523; GFX1032: ; %bb.0: ; %entry 524; GFX1032-NEXT: v_mov_b32_e32 v1, v0 525; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 526; GFX1032-NEXT: v_mov_b32_e32 v1, 0 527; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 528; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 529; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 530; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 531; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 532; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 533; GFX1032-NEXT: v_mov_b32_e32 v2, v1 534; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 535; GFX1032-NEXT: s_mov_b32 exec_lo, s2 536; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 537; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 538; GFX1032-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 539; 
GFX1032-NEXT: v_mov_b32_e32 v3, 0 540; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 541; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 542; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 543; GFX1032-NEXT: s_mov_b32 exec_lo, s2 544; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 545; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 546; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 547; GFX1032-NEXT: s_mov_b32 exec_lo, s2 548; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 549; GFX1032-NEXT: s_mov_b32 s2, -1 550; GFX1032-NEXT: ; implicit-def: $vgpr0 551; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 552; GFX1032-NEXT: s_cbranch_execz BB2_2 553; GFX1032-NEXT: ; %bb.1: 554; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo 555; GFX1032-NEXT: v_mov_b32_e32 v4, s4 556; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 557; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 558; GFX1032-NEXT: ds_add_rtn_u32 v0, v7, v4 559; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 560; GFX1032-NEXT: buffer_gl0_inv 561; GFX1032-NEXT: BB2_2: 562; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 563; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 564; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 565; GFX1032-NEXT: v_mov_b32_e32 v0, v3 566; GFX1032-NEXT: v_add_nc_u32_e32 v0, s3, v0 567; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 568; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 569; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 570; GFX1032-NEXT: s_endpgm 571entry: 572 %lane = call i32 @llvm.amdgcn.workitem.id.x() 573 %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %lane acq_rel 574 store i32 %old, i32 addrspace(1)* %out 575 ret void 576} 577 578define amdgpu_kernel void @add_i32_varying_nouse() { 579; GFX7LESS-LABEL: add_i32_varying_nouse: 580; GFX7LESS: ; %bb.0: ; %entry 581; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 582; GFX7LESS-NEXT: s_mov_b32 m0, -1 583; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 584; GFX7LESS-NEXT: ds_add_u32 v1, v0 585; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 586; GFX7LESS-NEXT: s_endpgm 587; 588; 
GFX8-LABEL: add_i32_varying_nouse: 589; GFX8: ; %bb.0: ; %entry 590; GFX8-NEXT: v_mov_b32_e32 v1, v0 591; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 592; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 593; GFX8-NEXT: s_not_b64 exec, exec 594; GFX8-NEXT: v_mov_b32_e32 v1, 0 595; GFX8-NEXT: s_not_b64 exec, exec 596; GFX8-NEXT: s_or_saveexec_b64 s[0:1], -1 597; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 598; GFX8-NEXT: s_nop 1 599; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 600; GFX8-NEXT: s_nop 1 601; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 602; GFX8-NEXT: s_nop 1 603; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 604; GFX8-NEXT: s_nop 1 605; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf 606; GFX8-NEXT: s_nop 1 607; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf 608; GFX8-NEXT: v_readlane_b32 s2, v1, 63 609; GFX8-NEXT: s_mov_b64 exec, s[0:1] 610; GFX8-NEXT: s_mov_b32 s0, s2 611; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 612; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 613; GFX8-NEXT: s_cbranch_execz BB3_2 614; GFX8-NEXT: ; %bb.1: 615; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 616; GFX8-NEXT: v_mov_b32_e32 v2, s0 617; GFX8-NEXT: s_mov_b32 m0, -1 618; GFX8-NEXT: s_waitcnt lgkmcnt(0) 619; GFX8-NEXT: ds_add_u32 v0, v2 620; GFX8-NEXT: s_waitcnt lgkmcnt(0) 621; GFX8-NEXT: BB3_2: 622; GFX8-NEXT: s_endpgm 623; 624; GFX9-LABEL: add_i32_varying_nouse: 625; GFX9: ; %bb.0: ; %entry 626; GFX9-NEXT: v_mov_b32_e32 v1, v0 627; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 628; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 629; GFX9-NEXT: s_not_b64 exec, exec 630; GFX9-NEXT: v_mov_b32_e32 v1, 0 631; GFX9-NEXT: s_not_b64 exec, exec 632; GFX9-NEXT: s_or_saveexec_b64 s[0:1], -1 633; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 
row_mask:0xf bank_mask:0xf bound_ctrl:1 634; GFX9-NEXT: s_nop 1 635; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 636; GFX9-NEXT: s_nop 1 637; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 638; GFX9-NEXT: s_nop 1 639; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 640; GFX9-NEXT: s_nop 1 641; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf 642; GFX9-NEXT: s_nop 1 643; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf 644; GFX9-NEXT: v_readlane_b32 s2, v1, 63 645; GFX9-NEXT: s_mov_b64 exec, s[0:1] 646; GFX9-NEXT: s_mov_b32 s0, s2 647; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 648; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 649; GFX9-NEXT: s_cbranch_execz BB3_2 650; GFX9-NEXT: ; %bb.1: 651; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 652; GFX9-NEXT: v_mov_b32_e32 v2, s0 653; GFX9-NEXT: s_waitcnt lgkmcnt(0) 654; GFX9-NEXT: ds_add_u32 v0, v2 655; GFX9-NEXT: s_waitcnt lgkmcnt(0) 656; GFX9-NEXT: BB3_2: 657; GFX9-NEXT: s_endpgm 658; 659; GFX1064-LABEL: add_i32_varying_nouse: 660; GFX1064: ; %bb.0: ; %entry 661; GFX1064-NEXT: v_mov_b32_e32 v1, v0 662; GFX1064-NEXT: s_not_b64 exec, exec 663; GFX1064-NEXT: v_mov_b32_e32 v1, 0 664; GFX1064-NEXT: s_not_b64 exec, exec 665; GFX1064-NEXT: s_or_saveexec_b64 s[0:1], -1 666; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 667; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 668; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 669; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 670; GFX1064-NEXT: v_mov_b32_e32 v2, v1 671; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 672; GFX1064-NEXT: v_add_nc_u32_e32 v1, v1, v2 673; GFX1064-NEXT: s_mov_b64 exec, s[0:1] 674; GFX1064-NEXT: 
v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 675; GFX1064-NEXT: s_or_saveexec_b64 s[0:1], -1 676; GFX1064-NEXT: v_readlane_b32 s2, v1, 0 677; GFX1064-NEXT: v_readlane_b32 s3, v1, 32 678; GFX1064-NEXT: s_mov_b64 exec, s[0:1] 679; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0 680; GFX1064-NEXT: s_add_i32 s0, s2, s3 681; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 682; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc 683; GFX1064-NEXT: s_cbranch_execz BB3_2 684; GFX1064-NEXT: ; %bb.1: 685; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 686; GFX1064-NEXT: v_mov_b32_e32 v3, s0 687; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 688; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 689; GFX1064-NEXT: ds_add_u32 v0, v3 690; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 691; GFX1064-NEXT: buffer_gl0_inv 692; GFX1064-NEXT: BB3_2: 693; GFX1064-NEXT: s_endpgm 694; 695; GFX1032-LABEL: add_i32_varying_nouse: 696; GFX1032: ; %bb.0: ; %entry 697; GFX1032-NEXT: v_mov_b32_e32 v1, v0 698; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 699; GFX1032-NEXT: v_mov_b32_e32 v1, 0 700; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 701; GFX1032-NEXT: s_or_saveexec_b32 s0, -1 702; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 703; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 704; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 705; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 706; GFX1032-NEXT: v_mov_b32_e32 v2, v1 707; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 708; GFX1032-NEXT: v_add_nc_u32_e32 v1, v1, v2 709; GFX1032-NEXT: s_mov_b32 exec_lo, s0 710; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v3, exec_lo, 0 711; GFX1032-NEXT: v_mov_b32_e32 v0, v1 712; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 713; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo 714; GFX1032-NEXT: s_cbranch_execz BB3_2 715; GFX1032-NEXT: ; %bb.1: 716; GFX1032-NEXT: 
v_mov_b32_e32 v3, local_var32@abs32@lo 717; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 718; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 719; GFX1032-NEXT: ds_add_u32 v3, v0 720; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 721; GFX1032-NEXT: buffer_gl0_inv 722; GFX1032-NEXT: BB3_2: 723; GFX1032-NEXT: s_endpgm 724entry: 725 %lane = call i32 @llvm.amdgcn.workitem.id.x() 726 %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %lane acq_rel 727 ret void 728} 729 730define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) { 731; 732; 733; GFX7LESS-LABEL: add_i64_constant: 734; GFX7LESS: ; %bb.0: ; %entry 735; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec 736; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 737; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 738; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 739; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 740; GFX7LESS-NEXT: ; implicit-def: $vgpr1_vgpr2 741; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc 742; GFX7LESS-NEXT: s_cbranch_execz BB4_2 743; GFX7LESS-NEXT: ; %bb.1: 744; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 745; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0 746; GFX7LESS-NEXT: s_mul_i32 s4, s4, 5 747; GFX7LESS-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 748; GFX7LESS-NEXT: v_mov_b32_e32 v1, s4 749; GFX7LESS-NEXT: s_mov_b32 m0, -1 750; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 751; GFX7LESS-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] 752; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 753; GFX7LESS-NEXT: BB4_2: 754; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] 755; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 756; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v1 757; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v2 758; GFX7LESS-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v0 759; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v0 760; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 761; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4 762; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s2, v0 763; GFX7LESS-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc 764; GFX7LESS-NEXT: s_mov_b32 s2, -1 765; GFX7LESS-NEXT: 
buffer_store_dwordx2 v[0:1], off, s[0:3], 0 766; GFX7LESS-NEXT: s_endpgm 767; 768; GFX8-LABEL: add_i64_constant: 769; GFX8: ; %bb.0: ; %entry 770; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 771; GFX8-NEXT: s_mov_b64 s[4:5], exec 772; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 773; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 774; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 775; GFX8-NEXT: ; implicit-def: $vgpr1_vgpr2 776; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 777; GFX8-NEXT: s_cbranch_execz BB4_2 778; GFX8-NEXT: ; %bb.1: 779; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 780; GFX8-NEXT: s_mul_i32 s4, s4, 5 781; GFX8-NEXT: v_mov_b32_e32 v1, s4 782; GFX8-NEXT: v_mov_b32_e32 v2, 0 783; GFX8-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 784; GFX8-NEXT: s_mov_b32 m0, -1 785; GFX8-NEXT: s_waitcnt lgkmcnt(0) 786; GFX8-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] 787; GFX8-NEXT: s_waitcnt lgkmcnt(0) 788; GFX8-NEXT: BB4_2: 789; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 790; GFX8-NEXT: s_waitcnt lgkmcnt(0) 791; GFX8-NEXT: v_readfirstlane_b32 s2, v1 792; GFX8-NEXT: v_readfirstlane_b32 s3, v2 793; GFX8-NEXT: v_mov_b32_e32 v1, s2 794; GFX8-NEXT: v_mov_b32_e32 v2, s3 795; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v0, 5, v[1:2] 796; GFX8-NEXT: s_mov_b32 s3, 0xf000 797; GFX8-NEXT: s_mov_b32 s2, -1 798; GFX8-NEXT: s_nop 2 799; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 800; GFX8-NEXT: s_endpgm 801; 802; GFX9-LABEL: add_i64_constant: 803; GFX9: ; %bb.0: ; %entry 804; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 805; GFX9-NEXT: s_mov_b64 s[4:5], exec 806; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 807; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 808; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 809; GFX9-NEXT: ; implicit-def: $vgpr1_vgpr2 810; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 811; GFX9-NEXT: s_cbranch_execz BB4_2 812; GFX9-NEXT: ; %bb.1: 813; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 814; GFX9-NEXT: s_mul_i32 s4, s4, 5 815; GFX9-NEXT: v_mov_b32_e32 v1, s4 816; GFX9-NEXT: v_mov_b32_e32 v2, 0 817; 
GFX9-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 818; GFX9-NEXT: s_waitcnt lgkmcnt(0) 819; GFX9-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] 820; GFX9-NEXT: s_waitcnt lgkmcnt(0) 821; GFX9-NEXT: BB4_2: 822; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 823; GFX9-NEXT: s_waitcnt lgkmcnt(0) 824; GFX9-NEXT: v_readfirstlane_b32 s2, v1 825; GFX9-NEXT: v_readfirstlane_b32 s3, v2 826; GFX9-NEXT: v_mov_b32_e32 v1, s2 827; GFX9-NEXT: v_mov_b32_e32 v2, s3 828; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v0, 5, v[1:2] 829; GFX9-NEXT: s_mov_b32 s3, 0xf000 830; GFX9-NEXT: s_mov_b32 s2, -1 831; GFX9-NEXT: s_nop 2 832; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 833; GFX9-NEXT: s_endpgm 834; 835; GFX1064-LABEL: add_i64_constant: 836; GFX1064: ; %bb.0: ; %entry 837; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 838; GFX1064-NEXT: s_mov_b64 s[4:5], exec 839; GFX1064-NEXT: ; implicit-def: $vgpr1_vgpr2 840; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 841; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s5, v0 842; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 843; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc 844; GFX1064-NEXT: s_cbranch_execz BB4_2 845; GFX1064-NEXT: ; %bb.1: 846; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 847; GFX1064-NEXT: v_mov_b32_e32 v2, 0 848; GFX1064-NEXT: s_mul_i32 s4, s4, 5 849; GFX1064-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 850; GFX1064-NEXT: v_mov_b32_e32 v1, s4 851; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 852; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 853; GFX1064-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] 854; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 855; GFX1064-NEXT: buffer_gl0_inv 856; GFX1064-NEXT: BB4_2: 857; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 858; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] 859; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 860; GFX1064-NEXT: v_readfirstlane_b32 s3, v2 861; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v0, 5, s[2:3] 862; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 863; GFX1064-NEXT: s_mov_b32 s2, -1 864; GFX1064-NEXT: s_waitcnt 
lgkmcnt(0)
; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX1064-NEXT:    s_endpgm
;
; GFX1032-LABEL: add_i64_constant:
; GFX1032:       ; %bb.0: ; %entry
; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1032-NEXT:    s_mov_b32 s3, exec_lo
; GFX1032-NEXT:    ; implicit-def: $vgpr1_vgpr2
; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s3, 0
; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT:    s_and_saveexec_b32 s2, vcc_lo
; GFX1032-NEXT:    s_cbranch_execz BB4_2
; GFX1032-NEXT:  ; %bb.1:
; GFX1032-NEXT:    s_bcnt1_i32_b32 s3, s3
; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
; GFX1032-NEXT:    s_mul_i32 s3, s3, 5
; GFX1032-NEXT:    v_mov_b32_e32 v3, local_var64@abs32@lo
; GFX1032-NEXT:    v_mov_b32_e32 v1, s3
; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1032-NEXT:    ds_add_rtn_u64 v[1:2], v3, v[1:2]
; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1032-NEXT:    buffer_gl0_inv
; GFX1032-NEXT:  BB4_2:
; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
; GFX1032-NEXT:    v_readfirstlane_b32 s2, v1
; GFX1032-NEXT:    v_readfirstlane_b32 s3, v2
; GFX1032-NEXT:    v_mad_u64_u32 v[0:1], s2, v0, 5, s[2:3]
; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
; GFX1032-NEXT:    s_mov_b32 s2, -1
; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX1032-NEXT:    s_endpgm
entry:
  %old = atomicrmw add i64 addrspace(3)* @local_var64, i64 5 acq_rel
  store i64 %old, i64 addrspace(1)* %out
  ret void
}

; add_i64_uniform: the kernel issues `atomicrmw add` of the kernel argument
; %additive to the addrspace(3) global @local_var64 with acq_rel ordering and
; stores the returned old value to %out.
; The autogenerated checks for each run line expect a single ds_add_rtn_u64
; inside a region guarded by v_mbcnt_* / v_cmp_eq_u32 0 / s_and_saveexec. The
; added value is a 64-bit product of s_bcnt1 of the saved exec mask and the
; loaded argument pair s[2:3] (presumably %additive -- confirm against the
; kernarg layout). After the region the v_readfirstlane result is added to
; s[2:3] times the v_mbcnt_* value with a carry-propagating add.
; Do not hand-edit the check lines below; regenerate them with
; utils/update_llc_test_checks.py.
define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 %additive) {
;
;
; GFX7LESS-LABEL: add_i64_uniform:
; GFX7LESS:       ; %bb.0: ; %entry
; GFX7LESS-NEXT:    s_mov_b64 s[6:7], exec
; GFX7LESS-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s6, 0
; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s7, v0
; GFX7LESS-NEXT:
v_cmp_eq_u32_e32 vcc, 0, v0 915; GFX7LESS-NEXT: ; implicit-def: $vgpr1_vgpr2 916; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc 917; GFX7LESS-NEXT: s_cbranch_execz BB5_2 918; GFX7LESS-NEXT: ; %bb.1: 919; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 920; GFX7LESS-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 921; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 922; GFX7LESS-NEXT: s_mul_i32 s7, s3, s6 923; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6 924; GFX7LESS-NEXT: v_mul_hi_u32 v1, s2, v1 925; GFX7LESS-NEXT: s_mul_i32 s6, s2, s6 926; GFX7LESS-NEXT: v_add_i32_e32 v2, vcc, s7, v1 927; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6 928; GFX7LESS-NEXT: s_mov_b32 m0, -1 929; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 930; GFX7LESS-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] 931; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 932; GFX7LESS-NEXT: BB5_2: 933; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] 934; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 935; GFX7LESS-NEXT: s_mov_b32 s6, -1 936; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 937; GFX7LESS-NEXT: s_mov_b32 s4, s0 938; GFX7LESS-NEXT: s_mov_b32 s5, s1 939; GFX7LESS-NEXT: v_readfirstlane_b32 s0, v1 940; GFX7LESS-NEXT: v_readfirstlane_b32 s1, v2 941; GFX7LESS-NEXT: v_mul_lo_u32 v1, s3, v0 942; GFX7LESS-NEXT: v_mul_hi_u32 v2, s2, v0 943; GFX7LESS-NEXT: v_mul_lo_u32 v0, s2, v0 944; GFX7LESS-NEXT: v_add_i32_e32 v1, vcc, v2, v1 945; GFX7LESS-NEXT: v_mov_b32_e32 v2, s1 946; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s0, v0 947; GFX7LESS-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc 948; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 949; GFX7LESS-NEXT: s_endpgm 950; 951; GFX8-LABEL: add_i64_uniform: 952; GFX8: ; %bb.0: ; %entry 953; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 954; GFX8-NEXT: s_mov_b64 s[6:7], exec 955; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 956; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 957; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 958; GFX8-NEXT: ; implicit-def: $vgpr1_vgpr2 959; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 960; GFX8-NEXT: s_cbranch_execz BB5_2 961; 
GFX8-NEXT: ; %bb.1: 962; GFX8-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 963; GFX8-NEXT: v_mov_b32_e32 v1, s6 964; GFX8-NEXT: s_waitcnt lgkmcnt(0) 965; GFX8-NEXT: v_mul_hi_u32 v1, s2, v1 966; GFX8-NEXT: s_mul_i32 s7, s3, s6 967; GFX8-NEXT: s_mul_i32 s6, s2, s6 968; GFX8-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 969; GFX8-NEXT: v_add_u32_e32 v2, vcc, s7, v1 970; GFX8-NEXT: v_mov_b32_e32 v1, s6 971; GFX8-NEXT: s_mov_b32 m0, -1 972; GFX8-NEXT: s_waitcnt lgkmcnt(0) 973; GFX8-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] 974; GFX8-NEXT: s_waitcnt lgkmcnt(0) 975; GFX8-NEXT: BB5_2: 976; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 977; GFX8-NEXT: s_waitcnt lgkmcnt(0) 978; GFX8-NEXT: s_mov_b32 s4, s0 979; GFX8-NEXT: v_readfirstlane_b32 s0, v1 980; GFX8-NEXT: v_mul_lo_u32 v1, s3, v0 981; GFX8-NEXT: v_mul_hi_u32 v3, s2, v0 982; GFX8-NEXT: v_mul_lo_u32 v0, s2, v0 983; GFX8-NEXT: s_mov_b32 s5, s1 984; GFX8-NEXT: v_readfirstlane_b32 s1, v2 985; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 986; GFX8-NEXT: v_mov_b32_e32 v2, s1 987; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 988; GFX8-NEXT: s_mov_b32 s7, 0xf000 989; GFX8-NEXT: s_mov_b32 s6, -1 990; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc 991; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 992; GFX8-NEXT: s_endpgm 993; 994; GFX9-LABEL: add_i64_uniform: 995; GFX9: ; %bb.0: ; %entry 996; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 997; GFX9-NEXT: s_mov_b64 s[6:7], exec 998; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 999; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 1000; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1001; GFX9-NEXT: ; implicit-def: $vgpr1_vgpr2 1002; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 1003; GFX9-NEXT: s_cbranch_execz BB5_2 1004; GFX9-NEXT: ; %bb.1: 1005; GFX9-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 1006; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1007; GFX9-NEXT: s_mul_i32 s7, s3, s6 1008; GFX9-NEXT: s_mul_hi_u32 s8, s2, s6 1009; GFX9-NEXT: s_add_i32 s8, s8, s7 1010; GFX9-NEXT: s_mul_i32 s6, s2, s6 1011; GFX9-NEXT: v_mov_b32_e32 v1, s6 1012; 
GFX9-NEXT: v_mov_b32_e32 v2, s8 1013; GFX9-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 1014; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1015; GFX9-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] 1016; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1017; GFX9-NEXT: BB5_2: 1018; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 1019; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1020; GFX9-NEXT: v_mul_lo_u32 v3, s3, v0 1021; GFX9-NEXT: v_mul_hi_u32 v4, s2, v0 1022; GFX9-NEXT: v_mul_lo_u32 v0, s2, v0 1023; GFX9-NEXT: s_mov_b32 s4, s0 1024; GFX9-NEXT: v_readfirstlane_b32 s0, v1 1025; GFX9-NEXT: s_mov_b32 s5, s1 1026; GFX9-NEXT: v_readfirstlane_b32 s1, v2 1027; GFX9-NEXT: v_add_u32_e32 v1, v4, v3 1028; GFX9-NEXT: v_mov_b32_e32 v2, s1 1029; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 1030; GFX9-NEXT: s_mov_b32 s7, 0xf000 1031; GFX9-NEXT: s_mov_b32 s6, -1 1032; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc 1033; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1034; GFX9-NEXT: s_endpgm 1035; 1036; GFX1064-LABEL: add_i64_uniform: 1037; GFX1064: ; %bb.0: ; %entry 1038; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1039; GFX1064-NEXT: s_mov_b64 s[6:7], exec 1040; GFX1064-NEXT: ; implicit-def: $vgpr1_vgpr2 1041; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 1042; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s7, v0 1043; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1044; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 1045; GFX1064-NEXT: s_cbranch_execz BB5_2 1046; GFX1064-NEXT: ; %bb.1: 1047; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 1048; GFX1064-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 1049; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1050; GFX1064-NEXT: s_mul_i32 s7, s3, s6 1051; GFX1064-NEXT: s_mul_hi_u32 s8, s2, s6 1052; GFX1064-NEXT: s_mul_i32 s6, s2, s6 1053; GFX1064-NEXT: s_add_i32 s8, s8, s7 1054; GFX1064-NEXT: v_mov_b32_e32 v1, s6 1055; GFX1064-NEXT: v_mov_b32_e32 v2, s8 1056; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1057; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 1058; GFX1064-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] 1059; 
GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1060; GFX1064-NEXT: buffer_gl0_inv 1061; GFX1064-NEXT: BB5_2: 1062; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 1063; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 1064; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1065; GFX1064-NEXT: v_mul_lo_u32 v3, s3, v0 1066; GFX1064-NEXT: v_mul_hi_u32 v4, s2, v0 1067; GFX1064-NEXT: v_mul_lo_u32 v0, s2, v0 1068; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 1069; GFX1064-NEXT: v_readfirstlane_b32 s4, v2 1070; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 1071; GFX1064-NEXT: v_add_nc_u32_e32 v1, v4, v3 1072; GFX1064-NEXT: v_add_co_u32_e64 v0, vcc, s2, v0 1073; GFX1064-NEXT: s_mov_b32 s2, -1 1074; GFX1064-NEXT: v_add_co_ci_u32_e32 v1, vcc, s4, v1, vcc 1075; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1076; GFX1064-NEXT: s_endpgm 1077; 1078; GFX1032-LABEL: add_i64_uniform: 1079; GFX1032: ; %bb.0: ; %entry 1080; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1081; GFX1032-NEXT: s_mov_b32 s5, exec_lo 1082; GFX1032-NEXT: ; implicit-def: $vgpr1_vgpr2 1083; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s5, 0 1084; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 1085; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo 1086; GFX1032-NEXT: s_cbranch_execz BB5_2 1087; GFX1032-NEXT: ; %bb.1: 1088; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s5 1089; GFX1032-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 1090; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1091; GFX1032-NEXT: s_mul_i32 s6, s3, s5 1092; GFX1032-NEXT: s_mul_hi_u32 s7, s2, s5 1093; GFX1032-NEXT: s_mul_i32 s5, s2, s5 1094; GFX1032-NEXT: s_add_i32 s7, s7, s6 1095; GFX1032-NEXT: v_mov_b32_e32 v1, s5 1096; GFX1032-NEXT: v_mov_b32_e32 v2, s7 1097; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1098; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 1099; GFX1032-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] 1100; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1101; GFX1032-NEXT: buffer_gl0_inv 1102; GFX1032-NEXT: BB5_2: 1103; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 1104; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 1105; 
GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT:    v_mul_lo_u32 v3, s3, v0
; GFX1032-NEXT:    v_mul_hi_u32 v4, s2, v0
; GFX1032-NEXT:    v_mul_lo_u32 v0, s2, v0
; GFX1032-NEXT:    v_readfirstlane_b32 s2, v1
; GFX1032-NEXT:    v_readfirstlane_b32 s4, v2
; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
; GFX1032-NEXT:    v_add_nc_u32_e32 v1, v4, v3
; GFX1032-NEXT:    v_add_co_u32_e64 v0, vcc_lo, s2, v0
; GFX1032-NEXT:    s_mov_b32 s2, -1
; GFX1032-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, s4, v1, vcc_lo
; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX1032-NEXT:    s_endpgm
entry:
  %old = atomicrmw add i64 addrspace(3)* @local_var64, i64 %additive acq_rel
  store i64 %old, i64 addrspace(1)* %out
  ret void
}

; add_i64_varying: the kernel zero-extends llvm.amdgcn.workitem.id.x to i64,
; issues `atomicrmw add` of that per-work-item value to the addrspace(3)
; global @local_var64 with acq_rel ordering, and stores the returned old value
; to %out.
; Unlike the constant and uniform i64 cases in this file, the autogenerated
; checks for every run line expect the ds_add_rtn_u64 to be issued directly on
; v[0:1], with no v_mbcnt_* / s_and_saveexec guard and no cross-lane
; reduction sequence.
; Do not hand-edit the check lines below; regenerate them with
; utils/update_llc_test_checks.py.
define amdgpu_kernel void @add_i64_varying(i64 addrspace(1)* %out) {
;
;
; GFX7LESS-LABEL: add_i64_varying:
; GFX7LESS:       ; %bb.0: ; %entry
; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
; GFX7LESS-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
; GFX7LESS-NEXT:    s_mov_b32 m0, -1
; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT:    ds_add_rtn_u64 v[0:1], v2, v[0:1]
; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT:    s_mov_b32 s2, -1
; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX7LESS-NEXT:    s_endpgm
;
; GFX8-LABEL: add_i64_varying:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    v_mov_b32_e32 v1, 0
; GFX8-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
; GFX8-NEXT:    s_mov_b32 m0, -1
; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    ds_add_rtn_u64 v[0:1], v2, v[0:1]
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    s_mov_b32 s3, 0xf000
; GFX8-NEXT:    s_mov_b32 s2, -1
; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL:
add_i64_varying:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    ds_add_rtn_u64 v[0:1], v2, v[0:1]
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s3, 0xf000
; GFX9-NEXT:    s_mov_b32 s2, -1
; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: add_i64_varying:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:    v_mov_b32_e32 v2, local_var64@abs32@lo
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
; GFX10-NEXT:    s_mov_b32 s2, -1
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    ds_add_rtn_u64 v[0:1], v2, v[0:1]
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    buffer_gl0_inv
; GFX10-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX10-NEXT:    s_endpgm
entry:
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %zext = zext i32 %lane to i64
  %old = atomicrmw add i64 addrspace(3)* @local_var64, i64 %zext acq_rel
  store i64 %old, i64 addrspace(1)* %out
  ret void
}

; sub_i32_constant: the kernel issues `atomicrmw sub` of the constant 5 from
; the addrspace(3) global @local_var32 with acq_rel ordering and stores the
; returned old value to %out.
; The autogenerated checks for each run line expect a single ds_sub_rtn_u32
; inside a region guarded by v_mbcnt_* / v_cmp_eq_u32 0 / s_and_saveexec, with
; the subtracted value formed as s_bcnt1 of the saved exec mask times 5. After
; the region, 5 times the v_mbcnt_* value is subtracted from the
; v_readfirstlane result before the store.
; Do not hand-edit the check lines below; regenerate them with
; utils/update_llc_test_checks.py.
define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out) {
;
;
; GFX7LESS-LABEL: sub_i32_constant:
; GFX7LESS:       ; %bb.0: ; %entry
; GFX7LESS-NEXT:    s_mov_b64 s[2:3], exec
; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s3, v0
; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX7LESS-NEXT:    ; implicit-def: $vgpr1
; GFX7LESS-NEXT:    s_and_saveexec_b64 s[4:5], vcc
; GFX7LESS-NEXT:    s_cbranch_execz BB7_2
; GFX7LESS-NEXT:  ; %bb.1:
; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
; GFX7LESS-NEXT:    s_mul_i32 s2, s2,
5 1206; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 1207; GFX7LESS-NEXT: v_mov_b32_e32 v2, s2 1208; GFX7LESS-NEXT: s_mov_b32 m0, -1 1209; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1210; GFX7LESS-NEXT: ds_sub_rtn_u32 v1, v1, v2 1211; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1212; GFX7LESS-NEXT: BB7_2: 1213; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] 1214; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1215; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v1 1216; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1217; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 1218; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 1219; GFX7LESS-NEXT: s_mov_b32 s2, -1 1220; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 1221; GFX7LESS-NEXT: s_endpgm 1222; 1223; GFX8-LABEL: sub_i32_constant: 1224; GFX8: ; %bb.0: ; %entry 1225; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1226; GFX8-NEXT: s_mov_b64 s[2:3], exec 1227; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1228; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 1229; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1230; GFX8-NEXT: ; implicit-def: $vgpr1 1231; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 1232; GFX8-NEXT: s_cbranch_execz BB7_2 1233; GFX8-NEXT: ; %bb.1: 1234; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1235; GFX8-NEXT: s_mul_i32 s2, s2, 5 1236; GFX8-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 1237; GFX8-NEXT: v_mov_b32_e32 v2, s2 1238; GFX8-NEXT: s_mov_b32 m0, -1 1239; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1240; GFX8-NEXT: ds_sub_rtn_u32 v1, v1, v2 1241; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1242; GFX8-NEXT: BB7_2: 1243; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 1244; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1245; GFX8-NEXT: v_readfirstlane_b32 s2, v1 1246; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1247; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s2, v0 1248; GFX8-NEXT: s_mov_b32 s3, 0xf000 1249; GFX8-NEXT: s_mov_b32 s2, -1 1250; GFX8-NEXT: s_nop 0 1251; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 1252; GFX8-NEXT: s_endpgm 1253; 1254; GFX9-LABEL: sub_i32_constant: 1255; GFX9: ; %bb.0: ; %entry 1256; 
GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1257; GFX9-NEXT: s_mov_b64 s[2:3], exec 1258; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1259; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 1260; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1261; GFX9-NEXT: ; implicit-def: $vgpr1 1262; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 1263; GFX9-NEXT: s_cbranch_execz BB7_2 1264; GFX9-NEXT: ; %bb.1: 1265; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1266; GFX9-NEXT: s_mul_i32 s2, s2, 5 1267; GFX9-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 1268; GFX9-NEXT: v_mov_b32_e32 v2, s2 1269; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1270; GFX9-NEXT: ds_sub_rtn_u32 v1, v1, v2 1271; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1272; GFX9-NEXT: BB7_2: 1273; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 1274; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1275; GFX9-NEXT: v_readfirstlane_b32 s2, v1 1276; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1277; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 1278; GFX9-NEXT: s_mov_b32 s3, 0xf000 1279; GFX9-NEXT: s_mov_b32 s2, -1 1280; GFX9-NEXT: s_nop 0 1281; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 1282; GFX9-NEXT: s_endpgm 1283; 1284; GFX1064-LABEL: sub_i32_constant: 1285; GFX1064: ; %bb.0: ; %entry 1286; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1287; GFX1064-NEXT: s_mov_b64 s[2:3], exec 1288; GFX1064-NEXT: ; implicit-def: $vgpr1 1289; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 1290; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s3, v0 1291; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1292; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 1293; GFX1064-NEXT: s_cbranch_execz BB7_2 1294; GFX1064-NEXT: ; %bb.1: 1295; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1296; GFX1064-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 1297; GFX1064-NEXT: s_mul_i32 s2, s2, 5 1298; GFX1064-NEXT: v_mov_b32_e32 v2, s2 1299; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1300; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 1301; GFX1064-NEXT: ds_sub_rtn_u32 v1, v1, v2 1302; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1303; GFX1064-NEXT: buffer_gl0_inv 1304; 
GFX1064-NEXT: BB7_2:
; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
; GFX1064-NEXT:    v_readfirstlane_b32 s2, v1
; GFX1064-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
; GFX1064-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
; GFX1064-NEXT:    s_mov_b32 s2, -1
; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX1064-NEXT:    s_endpgm
;
; GFX1032-LABEL: sub_i32_constant:
; GFX1032:       ; %bb.0: ; %entry
; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1032-NEXT:    s_mov_b32 s3, exec_lo
; GFX1032-NEXT:    ; implicit-def: $vgpr1
; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s3, 0
; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT:    s_and_saveexec_b32 s2, vcc_lo
; GFX1032-NEXT:    s_cbranch_execz BB7_2
; GFX1032-NEXT:  ; %bb.1:
; GFX1032-NEXT:    s_bcnt1_i32_b32 s3, s3
; GFX1032-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
; GFX1032-NEXT:    s_mul_i32 s3, s3, 5
; GFX1032-NEXT:    v_mov_b32_e32 v2, s3
; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1032-NEXT:    ds_sub_rtn_u32 v1, v1, v2
; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1032-NEXT:    buffer_gl0_inv
; GFX1032-NEXT:  BB7_2:
; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
; GFX1032-NEXT:    v_readfirstlane_b32 s2, v1
; GFX1032-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
; GFX1032-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
; GFX1032-NEXT:    s_mov_b32 s2, -1
; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX1032-NEXT:    s_endpgm
entry:
  %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 5 acq_rel
  store i32 %old, i32 addrspace(1)* %out
  ret void
}

; sub_i32_uniform: the kernel issues `atomicrmw sub` of the kernel argument
; %subitive from the addrspace(3) global @local_var32 with acq_rel ordering
; and stores the returned old value to %out.
; The autogenerated checks for each run line expect a single ds_sub_rtn_u32
; inside a region guarded by v_mbcnt_* / v_cmp_eq_u32 0 / s_and_saveexec. The
; subtracted value is s_mul_i32 of s_bcnt1 of the saved exec mask and the
; scalar loaded by the second s_load_dword (presumably %subitive -- confirm
; against the kernarg layout). After the region, that scalar times the
; v_mbcnt_* value (v_mul_lo_u32) is subtracted from the v_readfirstlane
; result before the store.
; Do not hand-edit the check lines below; regenerate them with
; utils/update_llc_test_checks.py.
define amdgpu_kernel void @sub_i32_uniform(i32
addrspace(1)* %out, i32 %subitive) { 1353; 1354; 1355; GFX7LESS-LABEL: sub_i32_uniform: 1356; GFX7LESS: ; %bb.0: ; %entry 1357; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec 1358; GFX7LESS-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 1359; GFX7LESS-NEXT: s_load_dword s0, s[0:1], 0xb 1360; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 1361; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 1362; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1363; GFX7LESS-NEXT: ; implicit-def: $vgpr1 1364; GFX7LESS-NEXT: s_and_saveexec_b64 s[6:7], vcc 1365; GFX7LESS-NEXT: s_cbranch_execz BB8_2 1366; GFX7LESS-NEXT: ; %bb.1: 1367; GFX7LESS-NEXT: s_bcnt1_i32_b64 s1, s[2:3] 1368; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1369; GFX7LESS-NEXT: s_mul_i32 s1, s0, s1 1370; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 1371; GFX7LESS-NEXT: v_mov_b32_e32 v2, s1 1372; GFX7LESS-NEXT: s_mov_b32 m0, -1 1373; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1374; GFX7LESS-NEXT: ds_sub_rtn_u32 v1, v1, v2 1375; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1376; GFX7LESS-NEXT: BB8_2: 1377; GFX7LESS-NEXT: s_or_b64 exec, exec, s[6:7] 1378; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1379; GFX7LESS-NEXT: v_readfirstlane_b32 s1, v1 1380; GFX7LESS-NEXT: v_mul_lo_u32 v0, s0, v0 1381; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 1382; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s1, v0 1383; GFX7LESS-NEXT: s_mov_b32 s6, -1 1384; GFX7LESS-NEXT: buffer_store_dword v0, off, s[4:7], 0 1385; GFX7LESS-NEXT: s_endpgm 1386; 1387; GFX8-LABEL: sub_i32_uniform: 1388; GFX8: ; %bb.0: ; %entry 1389; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 1390; GFX8-NEXT: s_load_dword s0, s[0:1], 0x2c 1391; GFX8-NEXT: s_mov_b64 s[2:3], exec 1392; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1393; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 1394; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1395; GFX8-NEXT: ; implicit-def: $vgpr1 1396; GFX8-NEXT: s_and_saveexec_b64 s[6:7], vcc 1397; GFX8-NEXT: s_cbranch_execz BB8_2 1398; GFX8-NEXT: ; %bb.1: 1399; GFX8-NEXT: s_bcnt1_i32_b64 s1, s[2:3] 1400; GFX8-NEXT: 
s_waitcnt lgkmcnt(0) 1401; GFX8-NEXT: s_mul_i32 s1, s0, s1 1402; GFX8-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 1403; GFX8-NEXT: v_mov_b32_e32 v2, s1 1404; GFX8-NEXT: s_mov_b32 m0, -1 1405; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1406; GFX8-NEXT: ds_sub_rtn_u32 v1, v1, v2 1407; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1408; GFX8-NEXT: BB8_2: 1409; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] 1410; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1411; GFX8-NEXT: v_mul_lo_u32 v0, s0, v0 1412; GFX8-NEXT: v_readfirstlane_b32 s0, v1 1413; GFX8-NEXT: s_mov_b32 s7, 0xf000 1414; GFX8-NEXT: s_mov_b32 s6, -1 1415; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s0, v0 1416; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0 1417; GFX8-NEXT: s_endpgm 1418; 1419; GFX9-LABEL: sub_i32_uniform: 1420; GFX9: ; %bb.0: ; %entry 1421; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 1422; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c 1423; GFX9-NEXT: s_mov_b64 s[6:7], exec 1424; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 1425; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 1426; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1427; GFX9-NEXT: ; implicit-def: $vgpr1 1428; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc 1429; GFX9-NEXT: s_cbranch_execz BB8_2 1430; GFX9-NEXT: ; %bb.1: 1431; GFX9-NEXT: s_bcnt1_i32_b64 s3, s[6:7] 1432; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1433; GFX9-NEXT: s_mul_i32 s3, s2, s3 1434; GFX9-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 1435; GFX9-NEXT: v_mov_b32_e32 v2, s3 1436; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1437; GFX9-NEXT: ds_sub_rtn_u32 v1, v1, v2 1438; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1439; GFX9-NEXT: BB8_2: 1440; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] 1441; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1442; GFX9-NEXT: v_mul_lo_u32 v0, s2, v0 1443; GFX9-NEXT: v_readfirstlane_b32 s0, v1 1444; GFX9-NEXT: s_mov_b32 s7, 0xf000 1445; GFX9-NEXT: s_mov_b32 s6, -1 1446; GFX9-NEXT: v_sub_u32_e32 v0, s0, v0 1447; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 1448; GFX9-NEXT: s_endpgm 1449; 1450; GFX1064-LABEL: sub_i32_uniform: 1451; GFX1064: ; %bb.0: ; %entry 
1452; GFX1064-NEXT: s_clause 0x1 1453; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 1454; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x2c 1455; GFX1064-NEXT: s_mov_b64 s[6:7], exec 1456; GFX1064-NEXT: ; implicit-def: $vgpr1 1457; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 1458; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s7, v0 1459; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1460; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc 1461; GFX1064-NEXT: s_cbranch_execz BB8_2 1462; GFX1064-NEXT: ; %bb.1: 1463; GFX1064-NEXT: s_bcnt1_i32_b64 s3, s[6:7] 1464; GFX1064-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 1465; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1466; GFX1064-NEXT: s_mul_i32 s3, s2, s3 1467; GFX1064-NEXT: v_mov_b32_e32 v2, s3 1468; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1469; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 1470; GFX1064-NEXT: ds_sub_rtn_u32 v1, v1, v2 1471; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1472; GFX1064-NEXT: buffer_gl0_inv 1473; GFX1064-NEXT: BB8_2: 1474; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 1475; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] 1476; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1477; GFX1064-NEXT: v_mul_lo_u32 v0, s2, v0 1478; GFX1064-NEXT: v_readfirstlane_b32 s0, v1 1479; GFX1064-NEXT: s_mov_b32 s7, 0x31016000 1480; GFX1064-NEXT: s_mov_b32 s6, -1 1481; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s0, v0 1482; GFX1064-NEXT: buffer_store_dword v0, off, s[4:7], 0 1483; GFX1064-NEXT: s_endpgm 1484; 1485; GFX1032-LABEL: sub_i32_uniform: 1486; GFX1032: ; %bb.0: ; %entry 1487; GFX1032-NEXT: s_clause 0x1 1488; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 1489; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x2c 1490; GFX1032-NEXT: s_mov_b32 s3, exec_lo 1491; GFX1032-NEXT: ; implicit-def: $vgpr1 1492; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s3, 0 1493; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 1494; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo 1495; GFX1032-NEXT: s_cbranch_execz BB8_2 1496; GFX1032-NEXT: ; %bb.1: 1497; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s3 1498; 
GFX1032-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 1499; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1500; GFX1032-NEXT: s_mul_i32 s1, s2, s1 1501; GFX1032-NEXT: v_mov_b32_e32 v2, s1 1502; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1503; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 1504; GFX1032-NEXT: ds_sub_rtn_u32 v1, v1, v2 1505; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1506; GFX1032-NEXT: buffer_gl0_inv 1507; GFX1032-NEXT: BB8_2: 1508; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 1509; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 1510; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1511; GFX1032-NEXT: v_mul_lo_u32 v0, s2, v0 1512; GFX1032-NEXT: v_readfirstlane_b32 s0, v1 1513; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 1514; GFX1032-NEXT: s_mov_b32 s6, -1 1515; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s0, v0 1516; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0 1517; GFX1032-NEXT: s_endpgm 1518entry: 1519 %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 %subitive acq_rel 1520 store i32 %old, i32 addrspace(1)* %out 1521 ret void 1522} 1523 1524define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) { 1525; 1526; 1527; GFX7LESS-LABEL: sub_i32_varying: 1528; GFX7LESS: ; %bb.0: ; %entry 1529; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1530; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 1531; GFX7LESS-NEXT: s_mov_b32 m0, -1 1532; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1533; GFX7LESS-NEXT: ds_sub_rtn_u32 v0, v1, v0 1534; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1535; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 1536; GFX7LESS-NEXT: s_mov_b32 s2, -1 1537; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 1538; GFX7LESS-NEXT: s_endpgm 1539; 1540; GFX8-LABEL: sub_i32_varying: 1541; GFX8: ; %bb.0: ; %entry 1542; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1543; GFX8-NEXT: v_mov_b32_e32 v2, v0 1544; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 1545; GFX8-NEXT: v_mov_b32_e32 v1, 0 1546; GFX8-NEXT: s_mov_b64 exec, s[2:3] 1547; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 1548; GFX8-NEXT: 
v_mbcnt_hi_u32_b32 v0, exec_hi, v0 1549; GFX8-NEXT: s_not_b64 exec, exec 1550; GFX8-NEXT: v_mov_b32_e32 v2, 0 1551; GFX8-NEXT: s_not_b64 exec, exec 1552; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 1553; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 1554; GFX8-NEXT: s_nop 1 1555; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 1556; GFX8-NEXT: s_nop 1 1557; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 1558; GFX8-NEXT: s_nop 1 1559; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 1560; GFX8-NEXT: s_nop 1 1561; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 1562; GFX8-NEXT: s_nop 1 1563; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 1564; GFX8-NEXT: v_readlane_b32 s4, v2, 63 1565; GFX8-NEXT: s_nop 0 1566; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 1567; GFX8-NEXT: s_mov_b64 exec, s[2:3] 1568; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1569; GFX8-NEXT: ; implicit-def: $vgpr0 1570; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 1571; GFX8-NEXT: s_cbranch_execz BB9_2 1572; GFX8-NEXT: ; %bb.1: 1573; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 1574; GFX8-NEXT: v_mov_b32_e32 v3, s4 1575; GFX8-NEXT: s_mov_b32 m0, -1 1576; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1577; GFX8-NEXT: ds_sub_rtn_u32 v0, v0, v3 1578; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1579; GFX8-NEXT: BB9_2: 1580; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 1581; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1582; GFX8-NEXT: v_readfirstlane_b32 s2, v0 1583; GFX8-NEXT: v_mov_b32_e32 v0, v1 1584; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s2, v0 1585; GFX8-NEXT: s_mov_b32 s3, 0xf000 1586; GFX8-NEXT: s_mov_b32 s2, -1 1587; GFX8-NEXT: s_nop 0 1588; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 1589; GFX8-NEXT: s_endpgm 1590; 1591; GFX9-LABEL: sub_i32_varying: 1592; GFX9: ; %bb.0: ; %entry 
1593; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1594; GFX9-NEXT: v_mov_b32_e32 v2, v0 1595; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 1596; GFX9-NEXT: v_mov_b32_e32 v1, 0 1597; GFX9-NEXT: s_mov_b64 exec, s[2:3] 1598; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 1599; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 1600; GFX9-NEXT: s_not_b64 exec, exec 1601; GFX9-NEXT: v_mov_b32_e32 v2, 0 1602; GFX9-NEXT: s_not_b64 exec, exec 1603; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 1604; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 1605; GFX9-NEXT: s_nop 1 1606; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 1607; GFX9-NEXT: s_nop 1 1608; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 1609; GFX9-NEXT: s_nop 1 1610; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 1611; GFX9-NEXT: s_nop 1 1612; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 1613; GFX9-NEXT: s_nop 1 1614; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 1615; GFX9-NEXT: v_readlane_b32 s4, v2, 63 1616; GFX9-NEXT: s_nop 0 1617; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 1618; GFX9-NEXT: s_mov_b64 exec, s[2:3] 1619; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1620; GFX9-NEXT: ; implicit-def: $vgpr0 1621; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 1622; GFX9-NEXT: s_cbranch_execz BB9_2 1623; GFX9-NEXT: ; %bb.1: 1624; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 1625; GFX9-NEXT: v_mov_b32_e32 v3, s4 1626; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1627; GFX9-NEXT: ds_sub_rtn_u32 v0, v0, v3 1628; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1629; GFX9-NEXT: BB9_2: 1630; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 1631; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1632; GFX9-NEXT: v_readfirstlane_b32 s2, v0 1633; GFX9-NEXT: v_mov_b32_e32 v0, v1 1634; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 1635; GFX9-NEXT: s_mov_b32 s3, 0xf000 
1636; GFX9-NEXT: s_mov_b32 s2, -1 1637; GFX9-NEXT: s_nop 0 1638; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 1639; GFX9-NEXT: s_endpgm 1640; 1641; GFX1064-LABEL: sub_i32_varying: 1642; GFX1064: ; %bb.0: ; %entry 1643; GFX1064-NEXT: v_mov_b32_e32 v1, v0 1644; GFX1064-NEXT: s_not_b64 exec, exec 1645; GFX1064-NEXT: v_mov_b32_e32 v1, 0 1646; GFX1064-NEXT: s_not_b64 exec, exec 1647; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 1648; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 1649; GFX1064-NEXT: v_mov_b32_e32 v3, 0 1650; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 1651; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 1652; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 1653; GFX1064-NEXT: v_mov_b32_e32 v2, v1 1654; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 1655; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 1656; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 1657; GFX1064-NEXT: v_mov_b32_e32 v2, s4 1658; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 1659; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 1660; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 1661; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 1662; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1663; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 1664; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 1665; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 1666; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 1667; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 1668; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 1669; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 1670; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 1671; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 1672; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 1673; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0 1674; 
GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 1675; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 1676; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 1677; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1678; GFX1064-NEXT: s_mov_b32 s2, -1 1679; GFX1064-NEXT: ; implicit-def: $vgpr0 1680; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 1681; GFX1064-NEXT: s_cbranch_execz BB9_2 1682; GFX1064-NEXT: ; %bb.1: 1683; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo 1684; GFX1064-NEXT: v_mov_b32_e32 v4, s7 1685; GFX1064-NEXT: s_mov_b32 s3, s7 1686; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1687; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 1688; GFX1064-NEXT: ds_sub_rtn_u32 v0, v7, v4 1689; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1690; GFX1064-NEXT: buffer_gl0_inv 1691; GFX1064-NEXT: BB9_2: 1692; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 1693; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 1694; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 1695; GFX1064-NEXT: v_mov_b32_e32 v0, v3 1696; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s3, v0 1697; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 1698; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1699; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 1700; GFX1064-NEXT: s_endpgm 1701; 1702; GFX1032-LABEL: sub_i32_varying: 1703; GFX1032: ; %bb.0: ; %entry 1704; GFX1032-NEXT: v_mov_b32_e32 v1, v0 1705; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 1706; GFX1032-NEXT: v_mov_b32_e32 v1, 0 1707; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 1708; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 1709; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 1710; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 1711; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 1712; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 1713; GFX1032-NEXT: v_mov_b32_e32 v2, v1 1714; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 1715; GFX1032-NEXT: s_mov_b32 exec_lo, s2 1716; 
GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1717; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 1718; GFX1032-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 1719; GFX1032-NEXT: v_mov_b32_e32 v3, 0 1720; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 1721; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 1722; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 1723; GFX1032-NEXT: s_mov_b32 exec_lo, s2 1724; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 1725; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 1726; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 1727; GFX1032-NEXT: s_mov_b32 exec_lo, s2 1728; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 1729; GFX1032-NEXT: s_mov_b32 s2, -1 1730; GFX1032-NEXT: ; implicit-def: $vgpr0 1731; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 1732; GFX1032-NEXT: s_cbranch_execz BB9_2 1733; GFX1032-NEXT: ; %bb.1: 1734; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo 1735; GFX1032-NEXT: v_mov_b32_e32 v4, s4 1736; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1737; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 1738; GFX1032-NEXT: ds_sub_rtn_u32 v0, v7, v4 1739; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1740; GFX1032-NEXT: buffer_gl0_inv 1741; GFX1032-NEXT: BB9_2: 1742; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 1743; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 1744; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 1745; GFX1032-NEXT: v_mov_b32_e32 v0, v3 1746; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s3, v0 1747; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 1748; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1749; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 1750; GFX1032-NEXT: s_endpgm 1751entry: 1752 %lane = call i32 @llvm.amdgcn.workitem.id.x() 1753 %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 %lane acq_rel 1754 store i32 %old, i32 addrspace(1)* %out 1755 ret void 1756} 1757 1758define amdgpu_kernel void @sub_i32_varying_nouse() { 1759; GFX7LESS-LABEL: sub_i32_varying_nouse: 1760; GFX7LESS: ; %bb.0: ; %entry 1761; GFX7LESS-NEXT: 
v_mov_b32_e32 v1, local_var32@abs32@lo 1762; GFX7LESS-NEXT: s_mov_b32 m0, -1 1763; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1764; GFX7LESS-NEXT: ds_sub_u32 v1, v0 1765; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1766; GFX7LESS-NEXT: s_endpgm 1767; 1768; GFX8-LABEL: sub_i32_varying_nouse: 1769; GFX8: ; %bb.0: ; %entry 1770; GFX8-NEXT: v_mov_b32_e32 v1, v0 1771; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 1772; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 1773; GFX8-NEXT: s_not_b64 exec, exec 1774; GFX8-NEXT: v_mov_b32_e32 v1, 0 1775; GFX8-NEXT: s_not_b64 exec, exec 1776; GFX8-NEXT: s_or_saveexec_b64 s[0:1], -1 1777; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 1778; GFX8-NEXT: s_nop 1 1779; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 1780; GFX8-NEXT: s_nop 1 1781; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 1782; GFX8-NEXT: s_nop 1 1783; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 1784; GFX8-NEXT: s_nop 1 1785; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf 1786; GFX8-NEXT: s_nop 1 1787; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf 1788; GFX8-NEXT: v_readlane_b32 s2, v1, 63 1789; GFX8-NEXT: s_mov_b64 exec, s[0:1] 1790; GFX8-NEXT: s_mov_b32 s0, s2 1791; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1792; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 1793; GFX8-NEXT: s_cbranch_execz BB10_2 1794; GFX8-NEXT: ; %bb.1: 1795; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 1796; GFX8-NEXT: v_mov_b32_e32 v2, s0 1797; GFX8-NEXT: s_mov_b32 m0, -1 1798; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1799; GFX8-NEXT: ds_sub_u32 v0, v2 1800; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1801; GFX8-NEXT: BB10_2: 1802; GFX8-NEXT: s_endpgm 1803; 1804; GFX9-LABEL: sub_i32_varying_nouse: 1805; GFX9: ; %bb.0: ; %entry 1806; GFX9-NEXT: v_mov_b32_e32 v1, v0 1807; GFX9-NEXT: 
v_mbcnt_lo_u32_b32 v0, exec_lo, 0 1808; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 1809; GFX9-NEXT: s_not_b64 exec, exec 1810; GFX9-NEXT: v_mov_b32_e32 v1, 0 1811; GFX9-NEXT: s_not_b64 exec, exec 1812; GFX9-NEXT: s_or_saveexec_b64 s[0:1], -1 1813; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 1814; GFX9-NEXT: s_nop 1 1815; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 1816; GFX9-NEXT: s_nop 1 1817; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 1818; GFX9-NEXT: s_nop 1 1819; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 1820; GFX9-NEXT: s_nop 1 1821; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf 1822; GFX9-NEXT: s_nop 1 1823; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf 1824; GFX9-NEXT: v_readlane_b32 s2, v1, 63 1825; GFX9-NEXT: s_mov_b64 exec, s[0:1] 1826; GFX9-NEXT: s_mov_b32 s0, s2 1827; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1828; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 1829; GFX9-NEXT: s_cbranch_execz BB10_2 1830; GFX9-NEXT: ; %bb.1: 1831; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 1832; GFX9-NEXT: v_mov_b32_e32 v2, s0 1833; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1834; GFX9-NEXT: ds_sub_u32 v0, v2 1835; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1836; GFX9-NEXT: BB10_2: 1837; GFX9-NEXT: s_endpgm 1838; 1839; GFX1064-LABEL: sub_i32_varying_nouse: 1840; GFX1064: ; %bb.0: ; %entry 1841; GFX1064-NEXT: v_mov_b32_e32 v1, v0 1842; GFX1064-NEXT: s_not_b64 exec, exec 1843; GFX1064-NEXT: v_mov_b32_e32 v1, 0 1844; GFX1064-NEXT: s_not_b64 exec, exec 1845; GFX1064-NEXT: s_or_saveexec_b64 s[0:1], -1 1846; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 1847; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 1848; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 
row_mask:0xf bank_mask:0xf bound_ctrl:1 1849; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 1850; GFX1064-NEXT: v_mov_b32_e32 v2, v1 1851; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 1852; GFX1064-NEXT: v_add_nc_u32_e32 v1, v1, v2 1853; GFX1064-NEXT: s_mov_b64 exec, s[0:1] 1854; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 1855; GFX1064-NEXT: s_or_saveexec_b64 s[0:1], -1 1856; GFX1064-NEXT: v_readlane_b32 s2, v1, 0 1857; GFX1064-NEXT: v_readlane_b32 s3, v1, 32 1858; GFX1064-NEXT: s_mov_b64 exec, s[0:1] 1859; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0 1860; GFX1064-NEXT: s_add_i32 s0, s2, s3 1861; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1862; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc 1863; GFX1064-NEXT: s_cbranch_execz BB10_2 1864; GFX1064-NEXT: ; %bb.1: 1865; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 1866; GFX1064-NEXT: v_mov_b32_e32 v3, s0 1867; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1868; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 1869; GFX1064-NEXT: ds_sub_u32 v0, v3 1870; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1871; GFX1064-NEXT: buffer_gl0_inv 1872; GFX1064-NEXT: BB10_2: 1873; GFX1064-NEXT: s_endpgm 1874; 1875; GFX1032-LABEL: sub_i32_varying_nouse: 1876; GFX1032: ; %bb.0: ; %entry 1877; GFX1032-NEXT: v_mov_b32_e32 v1, v0 1878; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 1879; GFX1032-NEXT: v_mov_b32_e32 v1, 0 1880; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 1881; GFX1032-NEXT: s_or_saveexec_b32 s0, -1 1882; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 1883; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 1884; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 1885; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 1886; GFX1032-NEXT: v_mov_b32_e32 v2, v1 1887; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, 
-1 1888; GFX1032-NEXT: v_add_nc_u32_e32 v1, v1, v2 1889; GFX1032-NEXT: s_mov_b32 exec_lo, s0 1890; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v3, exec_lo, 0 1891; GFX1032-NEXT: v_mov_b32_e32 v0, v1 1892; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 1893; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo 1894; GFX1032-NEXT: s_cbranch_execz BB10_2 1895; GFX1032-NEXT: ; %bb.1: 1896; GFX1032-NEXT: v_mov_b32_e32 v3, local_var32@abs32@lo 1897; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1898; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 1899; GFX1032-NEXT: ds_sub_u32 v3, v0 1900; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1901; GFX1032-NEXT: buffer_gl0_inv 1902; GFX1032-NEXT: BB10_2: 1903; GFX1032-NEXT: s_endpgm 1904entry: 1905 %lane = call i32 @llvm.amdgcn.workitem.id.x() 1906 %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 %lane acq_rel 1907 ret void 1908} 1909 1910define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) { 1911; 1912; 1913; GFX7LESS-LABEL: sub_i64_constant: 1914; GFX7LESS: ; %bb.0: ; %entry 1915; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec 1916; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1917; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 1918; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 1919; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1920; GFX7LESS-NEXT: ; implicit-def: $vgpr1_vgpr2 1921; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc 1922; GFX7LESS-NEXT: s_cbranch_execz BB11_2 1923; GFX7LESS-NEXT: ; %bb.1: 1924; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 1925; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0 1926; GFX7LESS-NEXT: s_mul_i32 s4, s4, 5 1927; GFX7LESS-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 1928; GFX7LESS-NEXT: v_mov_b32_e32 v1, s4 1929; GFX7LESS-NEXT: s_mov_b32 m0, -1 1930; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1931; GFX7LESS-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] 1932; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1933; GFX7LESS-NEXT: BB11_2: 1934; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] 1935; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1936; GFX7LESS-NEXT: 
v_readfirstlane_b32 s2, v1 1937; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v2 1938; GFX7LESS-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v0 1939; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1940; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 1941; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4 1942; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 1943; GFX7LESS-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc 1944; GFX7LESS-NEXT: s_mov_b32 s2, -1 1945; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1946; GFX7LESS-NEXT: s_endpgm 1947; 1948; GFX8-LABEL: sub_i64_constant: 1949; GFX8: ; %bb.0: ; %entry 1950; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1951; GFX8-NEXT: s_mov_b64 s[4:5], exec 1952; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 1953; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 1954; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1955; GFX8-NEXT: ; implicit-def: $vgpr1_vgpr2 1956; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 1957; GFX8-NEXT: s_cbranch_execz BB11_2 1958; GFX8-NEXT: ; %bb.1: 1959; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 1960; GFX8-NEXT: s_mul_i32 s4, s4, 5 1961; GFX8-NEXT: v_mov_b32_e32 v1, s4 1962; GFX8-NEXT: v_mov_b32_e32 v2, 0 1963; GFX8-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 1964; GFX8-NEXT: s_mov_b32 m0, -1 1965; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1966; GFX8-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] 1967; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1968; GFX8-NEXT: BB11_2: 1969; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 1970; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1971; GFX8-NEXT: v_readfirstlane_b32 s3, v2 1972; GFX8-NEXT: v_readfirstlane_b32 s2, v1 1973; GFX8-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v0 1974; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1975; GFX8-NEXT: v_mov_b32_e32 v2, s3 1976; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s2, v0 1977; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc 1978; GFX8-NEXT: s_mov_b32 s3, 0xf000 1979; GFX8-NEXT: s_mov_b32 s2, -1 1980; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1981; GFX8-NEXT: s_endpgm 1982; 1983; GFX9-LABEL: sub_i64_constant: 1984; GFX9: ; %bb.0: ; %entry 
1985; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1986; GFX9-NEXT: s_mov_b64 s[4:5], exec 1987; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 1988; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 1989; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1990; GFX9-NEXT: ; implicit-def: $vgpr1_vgpr2 1991; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 1992; GFX9-NEXT: s_cbranch_execz BB11_2 1993; GFX9-NEXT: ; %bb.1: 1994; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 1995; GFX9-NEXT: s_mul_i32 s4, s4, 5 1996; GFX9-NEXT: v_mov_b32_e32 v1, s4 1997; GFX9-NEXT: v_mov_b32_e32 v2, 0 1998; GFX9-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 1999; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2000; GFX9-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] 2001; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2002; GFX9-NEXT: BB11_2: 2003; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 2004; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2005; GFX9-NEXT: v_readfirstlane_b32 s3, v2 2006; GFX9-NEXT: v_readfirstlane_b32 s2, v1 2007; GFX9-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v0 2008; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v0 2009; GFX9-NEXT: v_mov_b32_e32 v2, s3 2010; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s2, v0 2011; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc 2012; GFX9-NEXT: s_mov_b32 s3, 0xf000 2013; GFX9-NEXT: s_mov_b32 s2, -1 2014; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2015; GFX9-NEXT: s_endpgm 2016; 2017; GFX1064-LABEL: sub_i64_constant: 2018; GFX1064: ; %bb.0: ; %entry 2019; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2020; GFX1064-NEXT: s_mov_b64 s[4:5], exec 2021; GFX1064-NEXT: ; implicit-def: $vgpr1_vgpr2 2022; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 2023; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s5, v0 2024; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2025; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc 2026; GFX1064-NEXT: s_cbranch_execz BB11_2 2027; GFX1064-NEXT: ; %bb.1: 2028; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 2029; GFX1064-NEXT: v_mov_b32_e32 v2, 0 2030; GFX1064-NEXT: s_mul_i32 s4, s4, 5 2031; GFX1064-NEXT: v_mov_b32_e32 v3, 
local_var64@abs32@lo 2032; GFX1064-NEXT: v_mov_b32_e32 v1, s4 2033; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2034; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 2035; GFX1064-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] 2036; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2037; GFX1064-NEXT: buffer_gl0_inv 2038; GFX1064-NEXT: BB11_2: 2039; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 2040; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] 2041; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 2042; GFX1064-NEXT: v_mul_u32_u24_e32 v1, 5, v0 2043; GFX1064-NEXT: v_readfirstlane_b32 s3, v2 2044; GFX1064-NEXT: v_mul_hi_u32_u24_e32 v2, 5, v0 2045; GFX1064-NEXT: v_sub_co_u32_e64 v0, vcc, s2, v1 2046; GFX1064-NEXT: s_mov_b32 s2, -1 2047; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v2, vcc 2048; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 2049; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2050; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2051; GFX1064-NEXT: s_endpgm 2052; 2053; GFX1032-LABEL: sub_i64_constant: 2054; GFX1032: ; %bb.0: ; %entry 2055; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2056; GFX1032-NEXT: s_mov_b32 s3, exec_lo 2057; GFX1032-NEXT: ; implicit-def: $vgpr1_vgpr2 2058; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s3, 0 2059; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 2060; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo 2061; GFX1032-NEXT: s_cbranch_execz BB11_2 2062; GFX1032-NEXT: ; %bb.1: 2063; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 2064; GFX1032-NEXT: v_mov_b32_e32 v2, 0 2065; GFX1032-NEXT: s_mul_i32 s3, s3, 5 2066; GFX1032-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 2067; GFX1032-NEXT: v_mov_b32_e32 v1, s3 2068; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2069; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 2070; GFX1032-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] 2071; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2072; GFX1032-NEXT: buffer_gl0_inv 2073; GFX1032-NEXT: BB11_2: 2074; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 2075; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 2076; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 
2077; GFX1032-NEXT: v_mul_u32_u24_e32 v1, 5, v0 2078; GFX1032-NEXT: v_readfirstlane_b32 s3, v2 2079; GFX1032-NEXT: v_mul_hi_u32_u24_e32 v2, 5, v0 2080; GFX1032-NEXT: v_sub_co_u32_e64 v0, vcc_lo, s2, v1 2081; GFX1032-NEXT: s_mov_b32 s2, -1 2082; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v2, vcc_lo 2083; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 2084; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2085; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2086; GFX1032-NEXT: s_endpgm 2087entry: 2088 %old = atomicrmw sub i64 addrspace(3)* @local_var64, i64 5 acq_rel 2089 store i64 %old, i64 addrspace(1)* %out 2090 ret void 2091} 2092 2093define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 %subitive) { 2094; 2095; 2096; GFX7LESS-LABEL: sub_i64_uniform: 2097; GFX7LESS: ; %bb.0: ; %entry 2098; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec 2099; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 2100; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 2101; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s7, v0 2102; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2103; GFX7LESS-NEXT: ; implicit-def: $vgpr1_vgpr2 2104; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc 2105; GFX7LESS-NEXT: s_cbranch_execz BB12_2 2106; GFX7LESS-NEXT: ; %bb.1: 2107; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 2108; GFX7LESS-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 2109; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2110; GFX7LESS-NEXT: s_mul_i32 s7, s3, s6 2111; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6 2112; GFX7LESS-NEXT: v_mul_hi_u32 v1, s2, v1 2113; GFX7LESS-NEXT: s_mul_i32 s6, s2, s6 2114; GFX7LESS-NEXT: v_add_i32_e32 v2, vcc, s7, v1 2115; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6 2116; GFX7LESS-NEXT: s_mov_b32 m0, -1 2117; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2118; GFX7LESS-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] 2119; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2120; GFX7LESS-NEXT: BB12_2: 2121; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] 2122; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 2123; GFX7LESS-NEXT: s_mov_b32 s6, -1 2124; 
GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2125; GFX7LESS-NEXT: s_mov_b32 s4, s0 2126; GFX7LESS-NEXT: s_mov_b32 s5, s1 2127; GFX7LESS-NEXT: v_readfirstlane_b32 s0, v1 2128; GFX7LESS-NEXT: v_readfirstlane_b32 s1, v2 2129; GFX7LESS-NEXT: v_mul_lo_u32 v1, s3, v0 2130; GFX7LESS-NEXT: v_mul_hi_u32 v2, s2, v0 2131; GFX7LESS-NEXT: v_mul_lo_u32 v0, s2, v0 2132; GFX7LESS-NEXT: v_add_i32_e32 v1, vcc, v2, v1 2133; GFX7LESS-NEXT: v_mov_b32_e32 v2, s1 2134; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 2135; GFX7LESS-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc 2136; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 2137; GFX7LESS-NEXT: s_endpgm 2138; 2139; GFX8-LABEL: sub_i64_uniform: 2140; GFX8: ; %bb.0: ; %entry 2141; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 2142; GFX8-NEXT: s_mov_b64 s[6:7], exec 2143; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 2144; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 2145; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2146; GFX8-NEXT: ; implicit-def: $vgpr1_vgpr2 2147; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 2148; GFX8-NEXT: s_cbranch_execz BB12_2 2149; GFX8-NEXT: ; %bb.1: 2150; GFX8-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 2151; GFX8-NEXT: v_mov_b32_e32 v1, s6 2152; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2153; GFX8-NEXT: v_mul_hi_u32 v1, s2, v1 2154; GFX8-NEXT: s_mul_i32 s7, s3, s6 2155; GFX8-NEXT: s_mul_i32 s6, s2, s6 2156; GFX8-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 2157; GFX8-NEXT: v_add_u32_e32 v2, vcc, s7, v1 2158; GFX8-NEXT: v_mov_b32_e32 v1, s6 2159; GFX8-NEXT: s_mov_b32 m0, -1 2160; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2161; GFX8-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] 2162; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2163; GFX8-NEXT: BB12_2: 2164; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 2165; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2166; GFX8-NEXT: s_mov_b32 s4, s0 2167; GFX8-NEXT: v_readfirstlane_b32 s0, v1 2168; GFX8-NEXT: v_mul_lo_u32 v1, s3, v0 2169; GFX8-NEXT: v_mul_hi_u32 v3, s2, v0 2170; GFX8-NEXT: v_mul_lo_u32 v0, s2, v0 2171; GFX8-NEXT: s_mov_b32 s5, s1 2172; GFX8-NEXT: 
v_readfirstlane_b32 s1, v2 2173; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 2174; GFX8-NEXT: v_mov_b32_e32 v2, s1 2175; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s0, v0 2176; GFX8-NEXT: s_mov_b32 s7, 0xf000 2177; GFX8-NEXT: s_mov_b32 s6, -1 2178; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc 2179; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 2180; GFX8-NEXT: s_endpgm 2181; 2182; GFX9-LABEL: sub_i64_uniform: 2183; GFX9: ; %bb.0: ; %entry 2184; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 2185; GFX9-NEXT: s_mov_b64 s[6:7], exec 2186; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 2187; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 2188; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2189; GFX9-NEXT: ; implicit-def: $vgpr1_vgpr2 2190; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 2191; GFX9-NEXT: s_cbranch_execz BB12_2 2192; GFX9-NEXT: ; %bb.1: 2193; GFX9-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 2194; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2195; GFX9-NEXT: s_mul_i32 s7, s3, s6 2196; GFX9-NEXT: s_mul_hi_u32 s8, s2, s6 2197; GFX9-NEXT: s_add_i32 s8, s8, s7 2198; GFX9-NEXT: s_mul_i32 s6, s2, s6 2199; GFX9-NEXT: v_mov_b32_e32 v1, s6 2200; GFX9-NEXT: v_mov_b32_e32 v2, s8 2201; GFX9-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 2202; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2203; GFX9-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] 2204; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2205; GFX9-NEXT: BB12_2: 2206; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 2207; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2208; GFX9-NEXT: v_mul_lo_u32 v3, s3, v0 2209; GFX9-NEXT: v_mul_hi_u32 v4, s2, v0 2210; GFX9-NEXT: v_mul_lo_u32 v0, s2, v0 2211; GFX9-NEXT: s_mov_b32 s4, s0 2212; GFX9-NEXT: v_readfirstlane_b32 s0, v1 2213; GFX9-NEXT: s_mov_b32 s5, s1 2214; GFX9-NEXT: v_readfirstlane_b32 s1, v2 2215; GFX9-NEXT: v_add_u32_e32 v1, v4, v3 2216; GFX9-NEXT: v_mov_b32_e32 v2, s1 2217; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s0, v0 2218; GFX9-NEXT: s_mov_b32 s7, 0xf000 2219; GFX9-NEXT: s_mov_b32 s6, -1 2220; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc 2221; GFX9-NEXT: 
buffer_store_dwordx2 v[0:1], off, s[4:7], 0 2222; GFX9-NEXT: s_endpgm 2223; 2224; GFX1064-LABEL: sub_i64_uniform: 2225; GFX1064: ; %bb.0: ; %entry 2226; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 2227; GFX1064-NEXT: s_mov_b64 s[6:7], exec 2228; GFX1064-NEXT: ; implicit-def: $vgpr1_vgpr2 2229; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 2230; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s7, v0 2231; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2232; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 2233; GFX1064-NEXT: s_cbranch_execz BB12_2 2234; GFX1064-NEXT: ; %bb.1: 2235; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 2236; GFX1064-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 2237; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2238; GFX1064-NEXT: s_mul_i32 s7, s3, s6 2239; GFX1064-NEXT: s_mul_hi_u32 s8, s2, s6 2240; GFX1064-NEXT: s_mul_i32 s6, s2, s6 2241; GFX1064-NEXT: s_add_i32 s8, s8, s7 2242; GFX1064-NEXT: v_mov_b32_e32 v1, s6 2243; GFX1064-NEXT: v_mov_b32_e32 v2, s8 2244; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2245; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 2246; GFX1064-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] 2247; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2248; GFX1064-NEXT: buffer_gl0_inv 2249; GFX1064-NEXT: BB12_2: 2250; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 2251; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 2252; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2253; GFX1064-NEXT: v_mul_lo_u32 v3, s3, v0 2254; GFX1064-NEXT: v_mul_hi_u32 v4, s2, v0 2255; GFX1064-NEXT: v_mul_lo_u32 v0, s2, v0 2256; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 2257; GFX1064-NEXT: v_readfirstlane_b32 s4, v2 2258; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 2259; GFX1064-NEXT: v_add_nc_u32_e32 v1, v4, v3 2260; GFX1064-NEXT: v_sub_co_u32_e64 v0, vcc, s2, v0 2261; GFX1064-NEXT: s_mov_b32 s2, -1 2262; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s4, v1, vcc 2263; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2264; GFX1064-NEXT: s_endpgm 2265; 2266; GFX1032-LABEL: sub_i64_uniform: 2267; GFX1032: ; %bb.0: ; %entry 
2268; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 2269; GFX1032-NEXT: s_mov_b32 s5, exec_lo 2270; GFX1032-NEXT: ; implicit-def: $vgpr1_vgpr2 2271; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s5, 0 2272; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 2273; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo 2274; GFX1032-NEXT: s_cbranch_execz BB12_2 2275; GFX1032-NEXT: ; %bb.1: 2276; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s5 2277; GFX1032-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 2278; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2279; GFX1032-NEXT: s_mul_i32 s6, s3, s5 2280; GFX1032-NEXT: s_mul_hi_u32 s7, s2, s5 2281; GFX1032-NEXT: s_mul_i32 s5, s2, s5 2282; GFX1032-NEXT: s_add_i32 s7, s7, s6 2283; GFX1032-NEXT: v_mov_b32_e32 v1, s5 2284; GFX1032-NEXT: v_mov_b32_e32 v2, s7 2285; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2286; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 2287; GFX1032-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] 2288; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2289; GFX1032-NEXT: buffer_gl0_inv 2290; GFX1032-NEXT: BB12_2: 2291; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 2292; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 2293; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2294; GFX1032-NEXT: v_mul_lo_u32 v3, s3, v0 2295; GFX1032-NEXT: v_mul_hi_u32 v4, s2, v0 2296; GFX1032-NEXT: v_mul_lo_u32 v0, s2, v0 2297; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 2298; GFX1032-NEXT: v_readfirstlane_b32 s4, v2 2299; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 2300; GFX1032-NEXT: v_add_nc_u32_e32 v1, v4, v3 2301; GFX1032-NEXT: v_sub_co_u32_e64 v0, vcc_lo, s2, v0 2302; GFX1032-NEXT: s_mov_b32 s2, -1 2303; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s4, v1, vcc_lo 2304; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2305; GFX1032-NEXT: s_endpgm 2306entry: 2307 %old = atomicrmw sub i64 addrspace(3)* @local_var64, i64 %subitive acq_rel 2308 store i64 %old, i64 addrspace(1)* %out 2309 ret void 2310} 2311 2312define amdgpu_kernel void @sub_i64_varying(i64 addrspace(1)* %out) { 2313; 2314; 2315; GFX7LESS-LABEL: 
sub_i64_varying: 2316; GFX7LESS: ; %bb.0: ; %entry 2317; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 2318; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 2319; GFX7LESS-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 2320; GFX7LESS-NEXT: s_mov_b32 m0, -1 2321; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2322; GFX7LESS-NEXT: ds_sub_rtn_u64 v[0:1], v2, v[0:1] 2323; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2324; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 2325; GFX7LESS-NEXT: s_mov_b32 s2, -1 2326; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2327; GFX7LESS-NEXT: s_endpgm 2328; 2329; GFX8-LABEL: sub_i64_varying: 2330; GFX8: ; %bb.0: ; %entry 2331; GFX8-NEXT: v_mov_b32_e32 v1, 0 2332; GFX8-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 2333; GFX8-NEXT: s_mov_b32 m0, -1 2334; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2335; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2336; GFX8-NEXT: ds_sub_rtn_u64 v[0:1], v2, v[0:1] 2337; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2338; GFX8-NEXT: s_mov_b32 s3, 0xf000 2339; GFX8-NEXT: s_mov_b32 s2, -1 2340; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2341; GFX8-NEXT: s_endpgm 2342; 2343; GFX9-LABEL: sub_i64_varying: 2344; GFX9: ; %bb.0: ; %entry 2345; GFX9-NEXT: v_mov_b32_e32 v1, 0 2346; GFX9-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 2347; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2348; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2349; GFX9-NEXT: ds_sub_rtn_u64 v[0:1], v2, v[0:1] 2350; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2351; GFX9-NEXT: s_mov_b32 s3, 0xf000 2352; GFX9-NEXT: s_mov_b32 s2, -1 2353; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2354; GFX9-NEXT: s_endpgm 2355; 2356; GFX10-LABEL: sub_i64_varying: 2357; GFX10: ; %bb.0: ; %entry 2358; GFX10-NEXT: v_mov_b32_e32 v1, 0 2359; GFX10-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 2360; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2361; GFX10-NEXT: s_mov_b32 s3, 0x31016000 2362; GFX10-NEXT: s_mov_b32 s2, -1 2363; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2364; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 2365; 
GFX10-NEXT: ds_sub_rtn_u64 v[0:1], v2, v[0:1] 2366; GFX10-NEXT: s_waitcnt lgkmcnt(0) 2367; GFX10-NEXT: buffer_gl0_inv 2368; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2369; GFX10-NEXT: s_endpgm 2370entry: 2371 %lane = call i32 @llvm.amdgcn.workitem.id.x() 2372 %zext = zext i32 %lane to i64 2373 %old = atomicrmw sub i64 addrspace(3)* @local_var64, i64 %zext acq_rel 2374 store i64 %old, i64 addrspace(1)* %out 2375 ret void 2376} 2377 2378define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) { 2379; 2380; 2381; GFX7LESS-LABEL: and_i32_varying: 2382; GFX7LESS: ; %bb.0: ; %entry 2383; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 2384; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 2385; GFX7LESS-NEXT: s_mov_b32 m0, -1 2386; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2387; GFX7LESS-NEXT: ds_and_rtn_b32 v0, v1, v0 2388; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2389; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 2390; GFX7LESS-NEXT: s_mov_b32 s2, -1 2391; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 2392; GFX7LESS-NEXT: s_endpgm 2393; 2394; GFX8-LABEL: and_i32_varying: 2395; GFX8: ; %bb.0: ; %entry 2396; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2397; GFX8-NEXT: v_mov_b32_e32 v2, v0 2398; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 2399; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 2400; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 2401; GFX8-NEXT: v_mov_b32_e32 v1, -1 2402; GFX8-NEXT: s_mov_b64 exec, s[2:3] 2403; GFX8-NEXT: s_not_b64 exec, exec 2404; GFX8-NEXT: v_mov_b32_e32 v2, -1 2405; GFX8-NEXT: s_not_b64 exec, exec 2406; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 2407; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 2408; GFX8-NEXT: s_nop 1 2409; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 2410; GFX8-NEXT: s_nop 1 2411; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 2412; GFX8-NEXT: s_nop 1 2413; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf 
bank_mask:0xf 2414; GFX8-NEXT: s_nop 1 2415; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 2416; GFX8-NEXT: s_nop 1 2417; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 2418; GFX8-NEXT: v_readlane_b32 s4, v2, 63 2419; GFX8-NEXT: s_nop 0 2420; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 2421; GFX8-NEXT: s_mov_b64 exec, s[2:3] 2422; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2423; GFX8-NEXT: ; implicit-def: $vgpr0 2424; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 2425; GFX8-NEXT: s_cbranch_execz BB14_2 2426; GFX8-NEXT: ; %bb.1: 2427; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 2428; GFX8-NEXT: v_mov_b32_e32 v3, s4 2429; GFX8-NEXT: s_mov_b32 m0, -1 2430; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2431; GFX8-NEXT: ds_and_rtn_b32 v0, v0, v3 2432; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2433; GFX8-NEXT: BB14_2: 2434; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 2435; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2436; GFX8-NEXT: v_readfirstlane_b32 s2, v0 2437; GFX8-NEXT: v_mov_b32_e32 v0, v1 2438; GFX8-NEXT: v_and_b32_e32 v0, s2, v0 2439; GFX8-NEXT: s_mov_b32 s3, 0xf000 2440; GFX8-NEXT: s_mov_b32 s2, -1 2441; GFX8-NEXT: s_nop 0 2442; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 2443; GFX8-NEXT: s_endpgm 2444; 2445; GFX9-LABEL: and_i32_varying: 2446; GFX9: ; %bb.0: ; %entry 2447; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2448; GFX9-NEXT: v_mov_b32_e32 v2, v0 2449; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 2450; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 2451; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 2452; GFX9-NEXT: v_mov_b32_e32 v1, -1 2453; GFX9-NEXT: s_mov_b64 exec, s[2:3] 2454; GFX9-NEXT: s_not_b64 exec, exec 2455; GFX9-NEXT: v_mov_b32_e32 v2, -1 2456; GFX9-NEXT: s_not_b64 exec, exec 2457; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 2458; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 2459; GFX9-NEXT: s_nop 1 2460; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 
2461; GFX9-NEXT: s_nop 1 2462; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 2463; GFX9-NEXT: s_nop 1 2464; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 2465; GFX9-NEXT: s_nop 1 2466; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 2467; GFX9-NEXT: s_nop 1 2468; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 2469; GFX9-NEXT: v_readlane_b32 s4, v2, 63 2470; GFX9-NEXT: s_nop 0 2471; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 2472; GFX9-NEXT: s_mov_b64 exec, s[2:3] 2473; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2474; GFX9-NEXT: ; implicit-def: $vgpr0 2475; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 2476; GFX9-NEXT: s_cbranch_execz BB14_2 2477; GFX9-NEXT: ; %bb.1: 2478; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 2479; GFX9-NEXT: v_mov_b32_e32 v3, s4 2480; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2481; GFX9-NEXT: ds_and_rtn_b32 v0, v0, v3 2482; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2483; GFX9-NEXT: BB14_2: 2484; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 2485; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2486; GFX9-NEXT: v_readfirstlane_b32 s2, v0 2487; GFX9-NEXT: v_mov_b32_e32 v0, v1 2488; GFX9-NEXT: v_and_b32_e32 v0, s2, v0 2489; GFX9-NEXT: s_mov_b32 s3, 0xf000 2490; GFX9-NEXT: s_mov_b32 s2, -1 2491; GFX9-NEXT: s_nop 0 2492; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 2493; GFX9-NEXT: s_endpgm 2494; 2495; GFX1064-LABEL: and_i32_varying: 2496; GFX1064: ; %bb.0: ; %entry 2497; GFX1064-NEXT: v_mov_b32_e32 v1, v0 2498; GFX1064-NEXT: s_not_b64 exec, exec 2499; GFX1064-NEXT: v_mov_b32_e32 v1, -1 2500; GFX1064-NEXT: s_not_b64 exec, exec 2501; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 2502; GFX1064-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 2503; GFX1064-NEXT: v_mov_b32_e32 v3, -1 2504; GFX1064-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 2505; GFX1064-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf 
bank_mask:0xf 2506; GFX1064-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 2507; GFX1064-NEXT: v_mov_b32_e32 v2, v1 2508; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 2509; GFX1064-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 2510; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 2511; GFX1064-NEXT: v_mov_b32_e32 v2, s4 2512; GFX1064-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 2513; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 2514; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 2515; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 2516; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2517; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 2518; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 2519; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 2520; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 2521; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 2522; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 2523; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 2524; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 2525; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 2526; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 2527; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0 2528; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 2529; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 2530; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 2531; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2532; GFX1064-NEXT: s_mov_b32 s2, -1 2533; GFX1064-NEXT: ; implicit-def: $vgpr0 2534; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 2535; GFX1064-NEXT: s_cbranch_execz BB14_2 2536; GFX1064-NEXT: ; %bb.1: 2537; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo 2538; GFX1064-NEXT: v_mov_b32_e32 v4, s7 2539; GFX1064-NEXT: s_mov_b32 s3, s7 2540; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2541; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 2542; GFX1064-NEXT: ds_and_rtn_b32 v0, v7, v4 2543; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2544; GFX1064-NEXT: buffer_gl0_inv 2545; GFX1064-NEXT: BB14_2: 2546; 
GFX1064-NEXT: s_waitcnt_depctr 0xffe3 2547; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 2548; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 2549; GFX1064-NEXT: v_mov_b32_e32 v0, v3 2550; GFX1064-NEXT: v_and_b32_e32 v0, s3, v0 2551; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 2552; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2553; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 2554; GFX1064-NEXT: s_endpgm 2555; 2556; GFX1032-LABEL: and_i32_varying: 2557; GFX1032: ; %bb.0: ; %entry 2558; GFX1032-NEXT: v_mov_b32_e32 v1, v0 2559; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 2560; GFX1032-NEXT: v_mov_b32_e32 v1, -1 2561; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 2562; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 2563; GFX1032-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 2564; GFX1032-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 2565; GFX1032-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 2566; GFX1032-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 2567; GFX1032-NEXT: v_mov_b32_e32 v2, v1 2568; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 2569; GFX1032-NEXT: s_mov_b32 exec_lo, s2 2570; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2571; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 2572; GFX1032-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 2573; GFX1032-NEXT: v_mov_b32_e32 v3, -1 2574; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 2575; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 2576; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 2577; GFX1032-NEXT: s_mov_b32 exec_lo, s2 2578; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 2579; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 2580; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 2581; GFX1032-NEXT: s_mov_b32 exec_lo, s2 2582; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 2583; GFX1032-NEXT: s_mov_b32 s2, -1 2584; GFX1032-NEXT: ; implicit-def: $vgpr0 2585; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 2586; GFX1032-NEXT: 
s_cbranch_execz BB14_2 2587; GFX1032-NEXT: ; %bb.1: 2588; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo 2589; GFX1032-NEXT: v_mov_b32_e32 v4, s4 2590; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2591; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 2592; GFX1032-NEXT: ds_and_rtn_b32 v0, v7, v4 2593; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2594; GFX1032-NEXT: buffer_gl0_inv 2595; GFX1032-NEXT: BB14_2: 2596; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 2597; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 2598; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 2599; GFX1032-NEXT: v_mov_b32_e32 v0, v3 2600; GFX1032-NEXT: v_and_b32_e32 v0, s3, v0 2601; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 2602; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2603; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 2604; GFX1032-NEXT: s_endpgm 2605entry: 2606 %lane = call i32 @llvm.amdgcn.workitem.id.x() 2607 %old = atomicrmw and i32 addrspace(3)* @local_var32, i32 %lane acq_rel 2608 store i32 %old, i32 addrspace(1)* %out 2609 ret void 2610} 2611 2612define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) { 2613; 2614; 2615; GFX7LESS-LABEL: or_i32_varying: 2616; GFX7LESS: ; %bb.0: ; %entry 2617; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 2618; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 2619; GFX7LESS-NEXT: s_mov_b32 m0, -1 2620; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2621; GFX7LESS-NEXT: ds_or_rtn_b32 v0, v1, v0 2622; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2623; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 2624; GFX7LESS-NEXT: s_mov_b32 s2, -1 2625; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 2626; GFX7LESS-NEXT: s_endpgm 2627; 2628; GFX8-LABEL: or_i32_varying: 2629; GFX8: ; %bb.0: ; %entry 2630; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2631; GFX8-NEXT: v_mov_b32_e32 v2, v0 2632; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 2633; GFX8-NEXT: v_mov_b32_e32 v1, 0 2634; GFX8-NEXT: s_mov_b64 exec, s[2:3] 2635; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 2636; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, 
v0 2637; GFX8-NEXT: s_not_b64 exec, exec 2638; GFX8-NEXT: v_mov_b32_e32 v2, 0 2639; GFX8-NEXT: s_not_b64 exec, exec 2640; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 2641; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 2642; GFX8-NEXT: s_nop 1 2643; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 2644; GFX8-NEXT: s_nop 1 2645; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 2646; GFX8-NEXT: s_nop 1 2647; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 2648; GFX8-NEXT: s_nop 1 2649; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 2650; GFX8-NEXT: s_nop 1 2651; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 2652; GFX8-NEXT: v_readlane_b32 s4, v2, 63 2653; GFX8-NEXT: s_nop 0 2654; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 2655; GFX8-NEXT: s_mov_b64 exec, s[2:3] 2656; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2657; GFX8-NEXT: ; implicit-def: $vgpr0 2658; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 2659; GFX8-NEXT: s_cbranch_execz BB15_2 2660; GFX8-NEXT: ; %bb.1: 2661; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 2662; GFX8-NEXT: v_mov_b32_e32 v3, s4 2663; GFX8-NEXT: s_mov_b32 m0, -1 2664; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2665; GFX8-NEXT: ds_or_rtn_b32 v0, v0, v3 2666; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2667; GFX8-NEXT: BB15_2: 2668; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 2669; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2670; GFX8-NEXT: v_readfirstlane_b32 s2, v0 2671; GFX8-NEXT: v_mov_b32_e32 v0, v1 2672; GFX8-NEXT: v_or_b32_e32 v0, s2, v0 2673; GFX8-NEXT: s_mov_b32 s3, 0xf000 2674; GFX8-NEXT: s_mov_b32 s2, -1 2675; GFX8-NEXT: s_nop 0 2676; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 2677; GFX8-NEXT: s_endpgm 2678; 2679; GFX9-LABEL: or_i32_varying: 2680; GFX9: ; %bb.0: ; %entry 2681; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2682; GFX9-NEXT: 
v_mov_b32_e32 v2, v0 2683; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 2684; GFX9-NEXT: v_mov_b32_e32 v1, 0 2685; GFX9-NEXT: s_mov_b64 exec, s[2:3] 2686; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 2687; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 2688; GFX9-NEXT: s_not_b64 exec, exec 2689; GFX9-NEXT: v_mov_b32_e32 v2, 0 2690; GFX9-NEXT: s_not_b64 exec, exec 2691; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 2692; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 2693; GFX9-NEXT: s_nop 1 2694; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 2695; GFX9-NEXT: s_nop 1 2696; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 2697; GFX9-NEXT: s_nop 1 2698; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 2699; GFX9-NEXT: s_nop 1 2700; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 2701; GFX9-NEXT: s_nop 1 2702; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 2703; GFX9-NEXT: v_readlane_b32 s4, v2, 63 2704; GFX9-NEXT: s_nop 0 2705; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 2706; GFX9-NEXT: s_mov_b64 exec, s[2:3] 2707; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2708; GFX9-NEXT: ; implicit-def: $vgpr0 2709; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 2710; GFX9-NEXT: s_cbranch_execz BB15_2 2711; GFX9-NEXT: ; %bb.1: 2712; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 2713; GFX9-NEXT: v_mov_b32_e32 v3, s4 2714; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2715; GFX9-NEXT: ds_or_rtn_b32 v0, v0, v3 2716; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2717; GFX9-NEXT: BB15_2: 2718; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 2719; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2720; GFX9-NEXT: v_readfirstlane_b32 s2, v0 2721; GFX9-NEXT: v_mov_b32_e32 v0, v1 2722; GFX9-NEXT: v_or_b32_e32 v0, s2, v0 2723; GFX9-NEXT: s_mov_b32 s3, 0xf000 2724; GFX9-NEXT: s_mov_b32 s2, -1 2725; GFX9-NEXT: s_nop 0 2726; GFX9-NEXT: 
buffer_store_dword v0, off, s[0:3], 0 2727; GFX9-NEXT: s_endpgm 2728; 2729; GFX1064-LABEL: or_i32_varying: 2730; GFX1064: ; %bb.0: ; %entry 2731; GFX1064-NEXT: v_mov_b32_e32 v1, v0 2732; GFX1064-NEXT: s_not_b64 exec, exec 2733; GFX1064-NEXT: v_mov_b32_e32 v1, 0 2734; GFX1064-NEXT: s_not_b64 exec, exec 2735; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 2736; GFX1064-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 2737; GFX1064-NEXT: v_mov_b32_e32 v3, 0 2738; GFX1064-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 2739; GFX1064-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 2740; GFX1064-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 2741; GFX1064-NEXT: v_mov_b32_e32 v2, v1 2742; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 2743; GFX1064-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 2744; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 2745; GFX1064-NEXT: v_mov_b32_e32 v2, s4 2746; GFX1064-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 2747; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 2748; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 2749; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 2750; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2751; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 2752; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 2753; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 2754; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 2755; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 2756; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 2757; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 2758; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 2759; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 2760; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 2761; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0 2762; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 2763; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 2764; 
GFX1064-NEXT: s_mov_b64 exec, s[4:5] 2765; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2766; GFX1064-NEXT: s_mov_b32 s2, -1 2767; GFX1064-NEXT: ; implicit-def: $vgpr0 2768; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 2769; GFX1064-NEXT: s_cbranch_execz BB15_2 2770; GFX1064-NEXT: ; %bb.1: 2771; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo 2772; GFX1064-NEXT: v_mov_b32_e32 v4, s7 2773; GFX1064-NEXT: s_mov_b32 s3, s7 2774; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2775; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 2776; GFX1064-NEXT: ds_or_rtn_b32 v0, v7, v4 2777; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2778; GFX1064-NEXT: buffer_gl0_inv 2779; GFX1064-NEXT: BB15_2: 2780; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 2781; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 2782; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 2783; GFX1064-NEXT: v_mov_b32_e32 v0, v3 2784; GFX1064-NEXT: v_or_b32_e32 v0, s3, v0 2785; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 2786; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2787; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 2788; GFX1064-NEXT: s_endpgm 2789; 2790; GFX1032-LABEL: or_i32_varying: 2791; GFX1032: ; %bb.0: ; %entry 2792; GFX1032-NEXT: v_mov_b32_e32 v1, v0 2793; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 2794; GFX1032-NEXT: v_mov_b32_e32 v1, 0 2795; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 2796; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 2797; GFX1032-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 2798; GFX1032-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 2799; GFX1032-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 2800; GFX1032-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 2801; GFX1032-NEXT: v_mov_b32_e32 v2, v1 2802; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 2803; GFX1032-NEXT: s_mov_b32 exec_lo, s2 2804; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2805; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 2806; GFX1032-NEXT: 
v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 2807; GFX1032-NEXT: v_mov_b32_e32 v3, 0 2808; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 2809; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 2810; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 2811; GFX1032-NEXT: s_mov_b32 exec_lo, s2 2812; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 2813; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 2814; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 2815; GFX1032-NEXT: s_mov_b32 exec_lo, s2 2816; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 2817; GFX1032-NEXT: s_mov_b32 s2, -1 2818; GFX1032-NEXT: ; implicit-def: $vgpr0 2819; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 2820; GFX1032-NEXT: s_cbranch_execz BB15_2 2821; GFX1032-NEXT: ; %bb.1: 2822; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo 2823; GFX1032-NEXT: v_mov_b32_e32 v4, s4 2824; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2825; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 2826; GFX1032-NEXT: ds_or_rtn_b32 v0, v7, v4 2827; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2828; GFX1032-NEXT: buffer_gl0_inv 2829; GFX1032-NEXT: BB15_2: 2830; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 2831; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 2832; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 2833; GFX1032-NEXT: v_mov_b32_e32 v0, v3 2834; GFX1032-NEXT: v_or_b32_e32 v0, s3, v0 2835; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 2836; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2837; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 2838; GFX1032-NEXT: s_endpgm 2839entry: 2840 %lane = call i32 @llvm.amdgcn.workitem.id.x() 2841 %old = atomicrmw or i32 addrspace(3)* @local_var32, i32 %lane acq_rel 2842 store i32 %old, i32 addrspace(1)* %out 2843 ret void 2844} 2845 2846define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) { 2847; 2848; 2849; GFX7LESS-LABEL: xor_i32_varying: 2850; GFX7LESS: ; %bb.0: ; %entry 2851; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 2852; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 2853; 
GFX7LESS-NEXT: s_mov_b32 m0, -1 2854; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2855; GFX7LESS-NEXT: ds_xor_rtn_b32 v0, v1, v0 2856; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2857; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 2858; GFX7LESS-NEXT: s_mov_b32 s2, -1 2859; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 2860; GFX7LESS-NEXT: s_endpgm 2861; 2862; GFX8-LABEL: xor_i32_varying: 2863; GFX8: ; %bb.0: ; %entry 2864; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2865; GFX8-NEXT: v_mov_b32_e32 v2, v0 2866; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 2867; GFX8-NEXT: v_mov_b32_e32 v1, 0 2868; GFX8-NEXT: s_mov_b64 exec, s[2:3] 2869; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 2870; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 2871; GFX8-NEXT: s_not_b64 exec, exec 2872; GFX8-NEXT: v_mov_b32_e32 v2, 0 2873; GFX8-NEXT: s_not_b64 exec, exec 2874; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 2875; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 2876; GFX8-NEXT: s_nop 1 2877; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 2878; GFX8-NEXT: s_nop 1 2879; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 2880; GFX8-NEXT: s_nop 1 2881; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 2882; GFX8-NEXT: s_nop 1 2883; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 2884; GFX8-NEXT: s_nop 1 2885; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 2886; GFX8-NEXT: v_readlane_b32 s4, v2, 63 2887; GFX8-NEXT: s_nop 0 2888; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 2889; GFX8-NEXT: s_mov_b64 exec, s[2:3] 2890; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2891; GFX8-NEXT: ; implicit-def: $vgpr0 2892; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 2893; GFX8-NEXT: s_cbranch_execz BB16_2 2894; GFX8-NEXT: ; %bb.1: 2895; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 2896; GFX8-NEXT: 
v_mov_b32_e32 v3, s4 2897; GFX8-NEXT: s_mov_b32 m0, -1 2898; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2899; GFX8-NEXT: ds_xor_rtn_b32 v0, v0, v3 2900; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2901; GFX8-NEXT: BB16_2: 2902; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 2903; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2904; GFX8-NEXT: v_readfirstlane_b32 s2, v0 2905; GFX8-NEXT: v_mov_b32_e32 v0, v1 2906; GFX8-NEXT: v_xor_b32_e32 v0, s2, v0 2907; GFX8-NEXT: s_mov_b32 s3, 0xf000 2908; GFX8-NEXT: s_mov_b32 s2, -1 2909; GFX8-NEXT: s_nop 0 2910; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 2911; GFX8-NEXT: s_endpgm 2912; 2913; GFX9-LABEL: xor_i32_varying: 2914; GFX9: ; %bb.0: ; %entry 2915; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2916; GFX9-NEXT: v_mov_b32_e32 v2, v0 2917; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 2918; GFX9-NEXT: v_mov_b32_e32 v1, 0 2919; GFX9-NEXT: s_mov_b64 exec, s[2:3] 2920; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 2921; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 2922; GFX9-NEXT: s_not_b64 exec, exec 2923; GFX9-NEXT: v_mov_b32_e32 v2, 0 2924; GFX9-NEXT: s_not_b64 exec, exec 2925; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 2926; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 2927; GFX9-NEXT: s_nop 1 2928; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 2929; GFX9-NEXT: s_nop 1 2930; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 2931; GFX9-NEXT: s_nop 1 2932; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 2933; GFX9-NEXT: s_nop 1 2934; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 2935; GFX9-NEXT: s_nop 1 2936; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 2937; GFX9-NEXT: v_readlane_b32 s4, v2, 63 2938; GFX9-NEXT: s_nop 0 2939; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 2940; GFX9-NEXT: s_mov_b64 exec, s[2:3] 2941; GFX9-NEXT: 
v_cmp_eq_u32_e32 vcc, 0, v0 2942; GFX9-NEXT: ; implicit-def: $vgpr0 2943; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 2944; GFX9-NEXT: s_cbranch_execz BB16_2 2945; GFX9-NEXT: ; %bb.1: 2946; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 2947; GFX9-NEXT: v_mov_b32_e32 v3, s4 2948; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2949; GFX9-NEXT: ds_xor_rtn_b32 v0, v0, v3 2950; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2951; GFX9-NEXT: BB16_2: 2952; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 2953; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2954; GFX9-NEXT: v_readfirstlane_b32 s2, v0 2955; GFX9-NEXT: v_mov_b32_e32 v0, v1 2956; GFX9-NEXT: v_xor_b32_e32 v0, s2, v0 2957; GFX9-NEXT: s_mov_b32 s3, 0xf000 2958; GFX9-NEXT: s_mov_b32 s2, -1 2959; GFX9-NEXT: s_nop 0 2960; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 2961; GFX9-NEXT: s_endpgm 2962; 2963; GFX1064-LABEL: xor_i32_varying: 2964; GFX1064: ; %bb.0: ; %entry 2965; GFX1064-NEXT: v_mov_b32_e32 v1, v0 2966; GFX1064-NEXT: s_not_b64 exec, exec 2967; GFX1064-NEXT: v_mov_b32_e32 v1, 0 2968; GFX1064-NEXT: s_not_b64 exec, exec 2969; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 2970; GFX1064-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 2971; GFX1064-NEXT: v_mov_b32_e32 v3, 0 2972; GFX1064-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 2973; GFX1064-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 2974; GFX1064-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 2975; GFX1064-NEXT: v_mov_b32_e32 v2, v1 2976; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 2977; GFX1064-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 2978; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 2979; GFX1064-NEXT: v_mov_b32_e32 v2, s4 2980; GFX1064-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 2981; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 2982; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf 
bank_mask:0xf 2983; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 2984; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2985; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 2986; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 2987; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 2988; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 2989; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 2990; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 2991; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 2992; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 2993; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 2994; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 2995; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0 2996; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 2997; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 2998; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 2999; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3000; GFX1064-NEXT: s_mov_b32 s2, -1 3001; GFX1064-NEXT: ; implicit-def: $vgpr0 3002; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 3003; GFX1064-NEXT: s_cbranch_execz BB16_2 3004; GFX1064-NEXT: ; %bb.1: 3005; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo 3006; GFX1064-NEXT: v_mov_b32_e32 v4, s7 3007; GFX1064-NEXT: s_mov_b32 s3, s7 3008; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3009; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 3010; GFX1064-NEXT: ds_xor_rtn_b32 v0, v7, v4 3011; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3012; GFX1064-NEXT: buffer_gl0_inv 3013; GFX1064-NEXT: BB16_2: 3014; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 3015; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 3016; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 3017; GFX1064-NEXT: v_mov_b32_e32 v0, v3 3018; GFX1064-NEXT: v_xor_b32_e32 v0, s3, v0 3019; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 3020; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3021; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 3022; GFX1064-NEXT: s_endpgm 3023; 3024; GFX1032-LABEL: xor_i32_varying: 3025; GFX1032: ; %bb.0: ; %entry 3026; GFX1032-NEXT: v_mov_b32_e32 v1, v0 3027; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 3028; 
GFX1032-NEXT: v_mov_b32_e32 v1, 0 3029; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 3030; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3031; GFX1032-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 3032; GFX1032-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 3033; GFX1032-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 3034; GFX1032-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 3035; GFX1032-NEXT: v_mov_b32_e32 v2, v1 3036; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 3037; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3038; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3039; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3040; GFX1032-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 3041; GFX1032-NEXT: v_mov_b32_e32 v3, 0 3042; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 3043; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 3044; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 3045; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3046; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 3047; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3048; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 3049; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3050; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 3051; GFX1032-NEXT: s_mov_b32 s2, -1 3052; GFX1032-NEXT: ; implicit-def: $vgpr0 3053; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 3054; GFX1032-NEXT: s_cbranch_execz BB16_2 3055; GFX1032-NEXT: ; %bb.1: 3056; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo 3057; GFX1032-NEXT: v_mov_b32_e32 v4, s4 3058; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3059; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 3060; GFX1032-NEXT: ds_xor_rtn_b32 v0, v7, v4 3061; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3062; GFX1032-NEXT: buffer_gl0_inv 3063; GFX1032-NEXT: BB16_2: 3064; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 3065; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 3066; GFX1032-NEXT: 
v_readfirstlane_b32 s3, v0 3067; GFX1032-NEXT: v_mov_b32_e32 v0, v3 3068; GFX1032-NEXT: v_xor_b32_e32 v0, s3, v0 3069; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 3070; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3071; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 3072; GFX1032-NEXT: s_endpgm 3073entry: 3074 %lane = call i32 @llvm.amdgcn.workitem.id.x() 3075 %old = atomicrmw xor i32 addrspace(3)* @local_var32, i32 %lane acq_rel 3076 store i32 %old, i32 addrspace(1)* %out 3077 ret void 3078} 3079 3080define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) { 3081; 3082; 3083; GFX7LESS-LABEL: max_i32_varying: 3084; GFX7LESS: ; %bb.0: ; %entry 3085; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 3086; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 3087; GFX7LESS-NEXT: s_mov_b32 m0, -1 3088; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3089; GFX7LESS-NEXT: ds_max_rtn_i32 v0, v1, v0 3090; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3091; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 3092; GFX7LESS-NEXT: s_mov_b32 s2, -1 3093; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 3094; GFX7LESS-NEXT: s_endpgm 3095; 3096; GFX8-LABEL: max_i32_varying: 3097; GFX8: ; %bb.0: ; %entry 3098; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3099; GFX8-NEXT: v_mov_b32_e32 v2, v0 3100; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3101; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 3102; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 3103; GFX8-NEXT: v_bfrev_b32_e32 v1, 1 3104; GFX8-NEXT: s_mov_b64 exec, s[2:3] 3105; GFX8-NEXT: s_not_b64 exec, exec 3106; GFX8-NEXT: v_mov_b32_e32 v2, v1 3107; GFX8-NEXT: s_not_b64 exec, exec 3108; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 3109; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3110; GFX8-NEXT: s_nop 1 3111; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 3112; GFX8-NEXT: s_nop 1 3113; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 3114; GFX8-NEXT: s_nop 1 3115; GFX8-NEXT: v_max_i32_dpp 
v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 3116; GFX8-NEXT: s_nop 1 3117; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 3118; GFX8-NEXT: s_nop 1 3119; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 3120; GFX8-NEXT: v_readlane_b32 s4, v2, 63 3121; GFX8-NEXT: s_nop 0 3122; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 3123; GFX8-NEXT: s_mov_b64 exec, s[2:3] 3124; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3125; GFX8-NEXT: ; implicit-def: $vgpr0 3126; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 3127; GFX8-NEXT: s_cbranch_execz BB17_2 3128; GFX8-NEXT: ; %bb.1: 3129; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 3130; GFX8-NEXT: v_mov_b32_e32 v3, s4 3131; GFX8-NEXT: s_mov_b32 m0, -1 3132; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3133; GFX8-NEXT: ds_max_rtn_i32 v0, v0, v3 3134; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3135; GFX8-NEXT: BB17_2: 3136; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 3137; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3138; GFX8-NEXT: v_readfirstlane_b32 s2, v0 3139; GFX8-NEXT: v_mov_b32_e32 v0, v1 3140; GFX8-NEXT: v_max_i32_e32 v0, s2, v0 3141; GFX8-NEXT: s_mov_b32 s3, 0xf000 3142; GFX8-NEXT: s_mov_b32 s2, -1 3143; GFX8-NEXT: s_nop 0 3144; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 3145; GFX8-NEXT: s_endpgm 3146; 3147; GFX9-LABEL: max_i32_varying: 3148; GFX9: ; %bb.0: ; %entry 3149; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3150; GFX9-NEXT: v_mov_b32_e32 v2, v0 3151; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3152; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 3153; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 3154; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 3155; GFX9-NEXT: s_mov_b64 exec, s[2:3] 3156; GFX9-NEXT: s_not_b64 exec, exec 3157; GFX9-NEXT: v_mov_b32_e32 v2, v1 3158; GFX9-NEXT: s_not_b64 exec, exec 3159; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 3160; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3161; GFX9-NEXT: s_nop 1 3162; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 
row_shr:2 row_mask:0xf bank_mask:0xf 3163; GFX9-NEXT: s_nop 1 3164; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 3165; GFX9-NEXT: s_nop 1 3166; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 3167; GFX9-NEXT: s_nop 1 3168; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 3169; GFX9-NEXT: s_nop 1 3170; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 3171; GFX9-NEXT: v_readlane_b32 s4, v2, 63 3172; GFX9-NEXT: s_nop 0 3173; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 3174; GFX9-NEXT: s_mov_b64 exec, s[2:3] 3175; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3176; GFX9-NEXT: ; implicit-def: $vgpr0 3177; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 3178; GFX9-NEXT: s_cbranch_execz BB17_2 3179; GFX9-NEXT: ; %bb.1: 3180; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 3181; GFX9-NEXT: v_mov_b32_e32 v3, s4 3182; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3183; GFX9-NEXT: ds_max_rtn_i32 v0, v0, v3 3184; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3185; GFX9-NEXT: BB17_2: 3186; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 3187; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3188; GFX9-NEXT: v_readfirstlane_b32 s2, v0 3189; GFX9-NEXT: v_mov_b32_e32 v0, v1 3190; GFX9-NEXT: v_max_i32_e32 v0, s2, v0 3191; GFX9-NEXT: s_mov_b32 s3, 0xf000 3192; GFX9-NEXT: s_mov_b32 s2, -1 3193; GFX9-NEXT: s_nop 0 3194; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 3195; GFX9-NEXT: s_endpgm 3196; 3197; GFX1064-LABEL: max_i32_varying: 3198; GFX1064: ; %bb.0: ; %entry 3199; GFX1064-NEXT: v_mov_b32_e32 v2, v0 3200; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 3201; GFX1064-NEXT: v_bfrev_b32_e32 v1, 1 3202; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 3203; GFX1064-NEXT: s_not_b64 exec, exec 3204; GFX1064-NEXT: v_mov_b32_e32 v2, v1 3205; GFX1064-NEXT: s_not_b64 exec, exec 3206; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 3207; GFX1064-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3208; GFX1064-NEXT: 
v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 3209; GFX1064-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 3210; GFX1064-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 3211; GFX1064-NEXT: v_mov_b32_e32 v3, v2 3212; GFX1064-NEXT: v_permlanex16_b32 v3, v3, -1, -1 3213; GFX1064-NEXT: v_max_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 3214; GFX1064-NEXT: v_readlane_b32 s4, v2, 31 3215; GFX1064-NEXT: v_mov_b32_e32 v3, s4 3216; GFX1064-NEXT: v_max_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 3217; GFX1064-NEXT: v_readlane_b32 s4, v2, 15 3218; GFX1064-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3219; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 3220; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3221; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 3222; GFX1064-NEXT: v_readlane_b32 s5, v2, 31 3223; GFX1064-NEXT: v_writelane_b32 v1, s4, 16 3224; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 3225; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 3226; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 3227; GFX1064-NEXT: v_readlane_b32 s7, v2, 63 3228; GFX1064-NEXT: v_readlane_b32 s6, v2, 47 3229; GFX1064-NEXT: v_writelane_b32 v1, s5, 32 3230; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 3231; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0 3232; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 3233; GFX1064-NEXT: v_writelane_b32 v1, s6, 48 3234; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 3235; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3236; GFX1064-NEXT: s_mov_b32 s2, -1 3237; GFX1064-NEXT: ; implicit-def: $vgpr0 3238; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 3239; GFX1064-NEXT: s_cbranch_execz BB17_2 3240; GFX1064-NEXT: ; %bb.1: 3241; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo 3242; GFX1064-NEXT: v_mov_b32_e32 v4, s7 3243; GFX1064-NEXT: s_mov_b32 s3, s7 3244; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3245; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 3246; GFX1064-NEXT: 
ds_max_rtn_i32 v0, v7, v4 3247; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3248; GFX1064-NEXT: buffer_gl0_inv 3249; GFX1064-NEXT: BB17_2: 3250; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 3251; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 3252; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 3253; GFX1064-NEXT: v_mov_b32_e32 v0, v1 3254; GFX1064-NEXT: v_max_i32_e32 v0, s3, v0 3255; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 3256; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3257; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 3258; GFX1064-NEXT: s_endpgm 3259; 3260; GFX1032-LABEL: max_i32_varying: 3261; GFX1032: ; %bb.0: ; %entry 3262; GFX1032-NEXT: v_mov_b32_e32 v2, v0 3263; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3264; GFX1032-NEXT: v_bfrev_b32_e32 v1, 1 3265; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3266; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 3267; GFX1032-NEXT: v_mov_b32_e32 v2, v1 3268; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 3269; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3270; GFX1032-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3271; GFX1032-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 3272; GFX1032-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 3273; GFX1032-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 3274; GFX1032-NEXT: v_mov_b32_e32 v3, v2 3275; GFX1032-NEXT: v_permlanex16_b32 v3, v3, -1, -1 3276; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3277; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3278; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3279; GFX1032-NEXT: v_max_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 3280; GFX1032-NEXT: v_readlane_b32 s3, v2, 15 3281; GFX1032-NEXT: v_readlane_b32 s4, v2, 31 3282; GFX1032-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3283; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3284; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 3285; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3286; GFX1032-NEXT: v_writelane_b32 v1, s3, 16 3287; GFX1032-NEXT: 
s_mov_b32 exec_lo, s2 3288; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 3289; GFX1032-NEXT: s_mov_b32 s2, -1 3290; GFX1032-NEXT: ; implicit-def: $vgpr0 3291; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 3292; GFX1032-NEXT: s_cbranch_execz BB17_2 3293; GFX1032-NEXT: ; %bb.1: 3294; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo 3295; GFX1032-NEXT: v_mov_b32_e32 v4, s4 3296; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3297; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 3298; GFX1032-NEXT: ds_max_rtn_i32 v0, v7, v4 3299; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3300; GFX1032-NEXT: buffer_gl0_inv 3301; GFX1032-NEXT: BB17_2: 3302; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 3303; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 3304; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 3305; GFX1032-NEXT: v_mov_b32_e32 v0, v1 3306; GFX1032-NEXT: v_max_i32_e32 v0, s3, v0 3307; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 3308; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3309; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 3310; GFX1032-NEXT: s_endpgm 3311entry: 3312 %lane = call i32 @llvm.amdgcn.workitem.id.x() 3313 %old = atomicrmw max i32 addrspace(3)* @local_var32, i32 %lane acq_rel 3314 store i32 %old, i32 addrspace(1)* %out 3315 ret void 3316} 3317 3318define amdgpu_kernel void @max_i64_constant(i64 addrspace(1)* %out) { 3319; 3320; 3321; GFX7LESS-LABEL: max_i64_constant: 3322; GFX7LESS: ; %bb.0: ; %entry 3323; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 3324; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 3325; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 3326; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3327; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 3328; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc 3329; GFX7LESS-NEXT: s_cbranch_execz BB18_2 3330; GFX7LESS-NEXT: ; %bb.1: 3331; GFX7LESS-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 3332; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5 3333; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 3334; GFX7LESS-NEXT: s_mov_b32 m0, -1 3335; GFX7LESS-NEXT: s_waitcnt 
lgkmcnt(0) 3336; GFX7LESS-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] 3337; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3338; GFX7LESS-NEXT: BB18_2: 3339; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] 3340; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3341; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 3342; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 3343; GFX7LESS-NEXT: v_bfrev_b32_e32 v1, 1 3344; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 3345; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 3346; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc 3347; GFX7LESS-NEXT: v_mov_b32_e32 v2, s5 3348; GFX7LESS-NEXT: v_mov_b32_e32 v3, s4 3349; GFX7LESS-NEXT: v_cmp_gt_i64_e32 vcc, s[4:5], v[0:1] 3350; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 3351; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 3352; GFX7LESS-NEXT: s_mov_b32 s2, -1 3353; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 3354; GFX7LESS-NEXT: s_endpgm 3355; 3356; GFX8-LABEL: max_i64_constant: 3357; GFX8: ; %bb.0: ; %entry 3358; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3359; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3360; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 3361; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3362; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 3363; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 3364; GFX8-NEXT: s_cbranch_execz BB18_2 3365; GFX8-NEXT: ; %bb.1: 3366; GFX8-NEXT: v_mov_b32_e32 v0, 5 3367; GFX8-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 3368; GFX8-NEXT: v_mov_b32_e32 v1, 0 3369; GFX8-NEXT: s_mov_b32 m0, -1 3370; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3371; GFX8-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] 3372; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3373; GFX8-NEXT: BB18_2: 3374; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 3375; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3376; GFX8-NEXT: v_readfirstlane_b32 s2, v0 3377; GFX8-NEXT: v_bfrev_b32_e32 v0, 1 3378; GFX8-NEXT: v_readfirstlane_b32 s3, v1 3379; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc 3380; GFX8-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 3381; GFX8-NEXT: v_cmp_gt_i64_e32 vcc, 
s[2:3], v[0:1] 3382; GFX8-NEXT: v_mov_b32_e32 v2, s3 3383; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 3384; GFX8-NEXT: v_mov_b32_e32 v2, s2 3385; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 3386; GFX8-NEXT: s_mov_b32 s3, 0xf000 3387; GFX8-NEXT: s_mov_b32 s2, -1 3388; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 3389; GFX8-NEXT: s_endpgm 3390; 3391; GFX9-LABEL: max_i64_constant: 3392; GFX9: ; %bb.0: ; %entry 3393; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3394; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3395; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 3396; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3397; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 3398; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 3399; GFX9-NEXT: s_cbranch_execz BB18_2 3400; GFX9-NEXT: ; %bb.1: 3401; GFX9-NEXT: v_mov_b32_e32 v0, 5 3402; GFX9-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 3403; GFX9-NEXT: v_mov_b32_e32 v1, 0 3404; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3405; GFX9-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] 3406; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3407; GFX9-NEXT: BB18_2: 3408; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 3409; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3410; GFX9-NEXT: v_readfirstlane_b32 s2, v0 3411; GFX9-NEXT: v_bfrev_b32_e32 v0, 1 3412; GFX9-NEXT: v_readfirstlane_b32 s3, v1 3413; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc 3414; GFX9-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 3415; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1] 3416; GFX9-NEXT: v_mov_b32_e32 v2, s3 3417; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 3418; GFX9-NEXT: v_mov_b32_e32 v2, s2 3419; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 3420; GFX9-NEXT: s_mov_b32 s3, 0xf000 3421; GFX9-NEXT: s_mov_b32 s2, -1 3422; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 3423; GFX9-NEXT: s_endpgm 3424; 3425; GFX1064-LABEL: max_i64_constant: 3426; GFX1064: ; %bb.0: ; %entry 3427; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3428; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 3429; GFX1064-NEXT: 
v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0 3430; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3431; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 3432; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc 3433; GFX1064-NEXT: s_cbranch_execz BB18_2 3434; GFX1064-NEXT: ; %bb.1: 3435; GFX1064-NEXT: v_mov_b32_e32 v0, 5 3436; GFX1064-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 3437; GFX1064-NEXT: v_mov_b32_e32 v1, 0 3438; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3439; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 3440; GFX1064-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] 3441; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3442; GFX1064-NEXT: buffer_gl0_inv 3443; GFX1064-NEXT: BB18_2: 3444; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 3445; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] 3446; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 3447; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 3448; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc 3449; GFX1064-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 3450; GFX1064-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1] 3451; GFX1064-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc 3452; GFX1064-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc 3453; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 3454; GFX1064-NEXT: s_mov_b32 s2, -1 3455; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3456; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 3457; GFX1064-NEXT: s_endpgm 3458; 3459; GFX1032-LABEL: max_i64_constant: 3460; GFX1032: ; %bb.0: ; %entry 3461; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3462; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 3463; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 3464; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 3465; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo 3466; GFX1032-NEXT: s_cbranch_execz BB18_2 3467; GFX1032-NEXT: ; %bb.1: 3468; GFX1032-NEXT: v_mov_b32_e32 v0, 5 3469; GFX1032-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 3470; GFX1032-NEXT: v_mov_b32_e32 v1, 0 3471; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3472; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 3473; 
GFX1032-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] 3474; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3475; GFX1032-NEXT: buffer_gl0_inv 3476; GFX1032-NEXT: BB18_2: 3477; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 3478; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 3479; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 3480; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 3481; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc_lo 3482; GFX1032-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc_lo 3483; GFX1032-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[2:3], v[0:1] 3484; GFX1032-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo 3485; GFX1032-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo 3486; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 3487; GFX1032-NEXT: s_mov_b32 s2, -1 3488; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3489; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 3490; GFX1032-NEXT: s_endpgm 3491entry: 3492 %old = atomicrmw max i64 addrspace(3)* @local_var64, i64 5 acq_rel 3493 store i64 %old, i64 addrspace(1)* %out 3494 ret void 3495} 3496 3497define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) { 3498; 3499; 3500; GFX7LESS-LABEL: min_i32_varying: 3501; GFX7LESS: ; %bb.0: ; %entry 3502; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 3503; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 3504; GFX7LESS-NEXT: s_mov_b32 m0, -1 3505; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3506; GFX7LESS-NEXT: ds_min_rtn_i32 v0, v1, v0 3507; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3508; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 3509; GFX7LESS-NEXT: s_mov_b32 s2, -1 3510; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 3511; GFX7LESS-NEXT: s_endpgm 3512; 3513; GFX8-LABEL: min_i32_varying: 3514; GFX8: ; %bb.0: ; %entry 3515; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3516; GFX8-NEXT: v_mov_b32_e32 v2, v0 3517; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3518; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 3519; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 3520; GFX8-NEXT: v_bfrev_b32_e32 v1, -2 3521; GFX8-NEXT: s_mov_b64 exec, 
s[2:3] 3522; GFX8-NEXT: s_not_b64 exec, exec 3523; GFX8-NEXT: v_mov_b32_e32 v2, v1 3524; GFX8-NEXT: s_not_b64 exec, exec 3525; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 3526; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3527; GFX8-NEXT: s_nop 1 3528; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 3529; GFX8-NEXT: s_nop 1 3530; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 3531; GFX8-NEXT: s_nop 1 3532; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 3533; GFX8-NEXT: s_nop 1 3534; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 3535; GFX8-NEXT: s_nop 1 3536; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 3537; GFX8-NEXT: v_readlane_b32 s4, v2, 63 3538; GFX8-NEXT: s_nop 0 3539; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 3540; GFX8-NEXT: s_mov_b64 exec, s[2:3] 3541; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3542; GFX8-NEXT: ; implicit-def: $vgpr0 3543; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 3544; GFX8-NEXT: s_cbranch_execz BB19_2 3545; GFX8-NEXT: ; %bb.1: 3546; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 3547; GFX8-NEXT: v_mov_b32_e32 v3, s4 3548; GFX8-NEXT: s_mov_b32 m0, -1 3549; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3550; GFX8-NEXT: ds_min_rtn_i32 v0, v0, v3 3551; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3552; GFX8-NEXT: BB19_2: 3553; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 3554; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3555; GFX8-NEXT: v_readfirstlane_b32 s2, v0 3556; GFX8-NEXT: v_mov_b32_e32 v0, v1 3557; GFX8-NEXT: v_min_i32_e32 v0, s2, v0 3558; GFX8-NEXT: s_mov_b32 s3, 0xf000 3559; GFX8-NEXT: s_mov_b32 s2, -1 3560; GFX8-NEXT: s_nop 0 3561; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 3562; GFX8-NEXT: s_endpgm 3563; 3564; GFX9-LABEL: min_i32_varying: 3565; GFX9: ; %bb.0: ; %entry 3566; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3567; GFX9-NEXT: v_mov_b32_e32 v2, v0 3568; GFX9-NEXT: 
v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3569; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 3570; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 3571; GFX9-NEXT: v_bfrev_b32_e32 v1, -2 3572; GFX9-NEXT: s_mov_b64 exec, s[2:3] 3573; GFX9-NEXT: s_not_b64 exec, exec 3574; GFX9-NEXT: v_mov_b32_e32 v2, v1 3575; GFX9-NEXT: s_not_b64 exec, exec 3576; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 3577; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3578; GFX9-NEXT: s_nop 1 3579; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 3580; GFX9-NEXT: s_nop 1 3581; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 3582; GFX9-NEXT: s_nop 1 3583; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 3584; GFX9-NEXT: s_nop 1 3585; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 3586; GFX9-NEXT: s_nop 1 3587; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 3588; GFX9-NEXT: v_readlane_b32 s4, v2, 63 3589; GFX9-NEXT: s_nop 0 3590; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 3591; GFX9-NEXT: s_mov_b64 exec, s[2:3] 3592; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3593; GFX9-NEXT: ; implicit-def: $vgpr0 3594; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 3595; GFX9-NEXT: s_cbranch_execz BB19_2 3596; GFX9-NEXT: ; %bb.1: 3597; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 3598; GFX9-NEXT: v_mov_b32_e32 v3, s4 3599; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3600; GFX9-NEXT: ds_min_rtn_i32 v0, v0, v3 3601; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3602; GFX9-NEXT: BB19_2: 3603; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 3604; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3605; GFX9-NEXT: v_readfirstlane_b32 s2, v0 3606; GFX9-NEXT: v_mov_b32_e32 v0, v1 3607; GFX9-NEXT: v_min_i32_e32 v0, s2, v0 3608; GFX9-NEXT: s_mov_b32 s3, 0xf000 3609; GFX9-NEXT: s_mov_b32 s2, -1 3610; GFX9-NEXT: s_nop 0 3611; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 3612; GFX9-NEXT: s_endpgm 3613; 3614; 
GFX1064-LABEL: min_i32_varying: 3615; GFX1064: ; %bb.0: ; %entry 3616; GFX1064-NEXT: v_mov_b32_e32 v2, v0 3617; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 3618; GFX1064-NEXT: v_bfrev_b32_e32 v1, -2 3619; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 3620; GFX1064-NEXT: s_not_b64 exec, exec 3621; GFX1064-NEXT: v_mov_b32_e32 v2, v1 3622; GFX1064-NEXT: s_not_b64 exec, exec 3623; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 3624; GFX1064-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3625; GFX1064-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 3626; GFX1064-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 3627; GFX1064-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 3628; GFX1064-NEXT: v_mov_b32_e32 v3, v2 3629; GFX1064-NEXT: v_permlanex16_b32 v3, v3, -1, -1 3630; GFX1064-NEXT: v_min_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 3631; GFX1064-NEXT: v_readlane_b32 s4, v2, 31 3632; GFX1064-NEXT: v_mov_b32_e32 v3, s4 3633; GFX1064-NEXT: v_min_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 3634; GFX1064-NEXT: v_readlane_b32 s4, v2, 15 3635; GFX1064-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3636; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 3637; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3638; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 3639; GFX1064-NEXT: v_readlane_b32 s5, v2, 31 3640; GFX1064-NEXT: v_writelane_b32 v1, s4, 16 3641; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 3642; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 3643; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 3644; GFX1064-NEXT: v_readlane_b32 s7, v2, 63 3645; GFX1064-NEXT: v_readlane_b32 s6, v2, 47 3646; GFX1064-NEXT: v_writelane_b32 v1, s5, 32 3647; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 3648; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0 3649; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 3650; GFX1064-NEXT: v_writelane_b32 v1, s6, 48 3651; GFX1064-NEXT: s_mov_b64 exec, 
s[4:5] 3652; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3653; GFX1064-NEXT: s_mov_b32 s2, -1 3654; GFX1064-NEXT: ; implicit-def: $vgpr0 3655; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 3656; GFX1064-NEXT: s_cbranch_execz BB19_2 3657; GFX1064-NEXT: ; %bb.1: 3658; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo 3659; GFX1064-NEXT: v_mov_b32_e32 v4, s7 3660; GFX1064-NEXT: s_mov_b32 s3, s7 3661; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3662; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 3663; GFX1064-NEXT: ds_min_rtn_i32 v0, v7, v4 3664; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3665; GFX1064-NEXT: buffer_gl0_inv 3666; GFX1064-NEXT: BB19_2: 3667; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 3668; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 3669; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 3670; GFX1064-NEXT: v_mov_b32_e32 v0, v1 3671; GFX1064-NEXT: v_min_i32_e32 v0, s3, v0 3672; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 3673; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3674; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 3675; GFX1064-NEXT: s_endpgm 3676; 3677; GFX1032-LABEL: min_i32_varying: 3678; GFX1032: ; %bb.0: ; %entry 3679; GFX1032-NEXT: v_mov_b32_e32 v2, v0 3680; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3681; GFX1032-NEXT: v_bfrev_b32_e32 v1, -2 3682; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3683; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 3684; GFX1032-NEXT: v_mov_b32_e32 v2, v1 3685; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 3686; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3687; GFX1032-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3688; GFX1032-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 3689; GFX1032-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 3690; GFX1032-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 3691; GFX1032-NEXT: v_mov_b32_e32 v3, v2 3692; GFX1032-NEXT: v_permlanex16_b32 v3, v3, -1, -1 3693; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3694; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3695; 
GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3696; GFX1032-NEXT: v_min_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 3697; GFX1032-NEXT: v_readlane_b32 s3, v2, 15 3698; GFX1032-NEXT: v_readlane_b32 s4, v2, 31 3699; GFX1032-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3700; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3701; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 3702; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3703; GFX1032-NEXT: v_writelane_b32 v1, s3, 16 3704; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3705; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 3706; GFX1032-NEXT: s_mov_b32 s2, -1 3707; GFX1032-NEXT: ; implicit-def: $vgpr0 3708; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 3709; GFX1032-NEXT: s_cbranch_execz BB19_2 3710; GFX1032-NEXT: ; %bb.1: 3711; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo 3712; GFX1032-NEXT: v_mov_b32_e32 v4, s4 3713; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3714; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 3715; GFX1032-NEXT: ds_min_rtn_i32 v0, v7, v4 3716; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3717; GFX1032-NEXT: buffer_gl0_inv 3718; GFX1032-NEXT: BB19_2: 3719; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 3720; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 3721; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 3722; GFX1032-NEXT: v_mov_b32_e32 v0, v1 3723; GFX1032-NEXT: v_min_i32_e32 v0, s3, v0 3724; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 3725; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3726; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 3727; GFX1032-NEXT: s_endpgm 3728entry: 3729 %lane = call i32 @llvm.amdgcn.workitem.id.x() 3730 %old = atomicrmw min i32 addrspace(3)* @local_var32, i32 %lane acq_rel 3731 store i32 %old, i32 addrspace(1)* %out 3732 ret void 3733} 3734 3735define amdgpu_kernel void @min_i64_constant(i64 addrspace(1)* %out) { 3736; 3737; 3738; GFX7LESS-LABEL: min_i64_constant: 3739; GFX7LESS: ; %bb.0: ; %entry 3740; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 3741; GFX7LESS-NEXT: 
v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 3742; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 3743; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3744; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 3745; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc 3746; GFX7LESS-NEXT: s_cbranch_execz BB20_2 3747; GFX7LESS-NEXT: ; %bb.1: 3748; GFX7LESS-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 3749; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5 3750; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 3751; GFX7LESS-NEXT: s_mov_b32 m0, -1 3752; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3753; GFX7LESS-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] 3754; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3755; GFX7LESS-NEXT: BB20_2: 3756; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] 3757; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3758; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 3759; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 3760; GFX7LESS-NEXT: v_bfrev_b32_e32 v1, -2 3761; GFX7LESS-NEXT: s_mov_b32 s2, -1 3762; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 3763; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc 3764; GFX7LESS-NEXT: v_mov_b32_e32 v2, s5 3765; GFX7LESS-NEXT: v_mov_b32_e32 v3, s4 3766; GFX7LESS-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] 3767; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 3768; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 3769; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 3770; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 3771; GFX7LESS-NEXT: s_endpgm 3772; 3773; GFX8-LABEL: min_i64_constant: 3774; GFX8: ; %bb.0: ; %entry 3775; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3776; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3777; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 3778; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3779; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 3780; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 3781; GFX8-NEXT: s_cbranch_execz BB20_2 3782; GFX8-NEXT: ; %bb.1: 3783; GFX8-NEXT: v_mov_b32_e32 v0, 5 3784; GFX8-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 3785; GFX8-NEXT: v_mov_b32_e32 v1, 0 
3786; GFX8-NEXT: s_mov_b32 m0, -1 3787; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3788; GFX8-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] 3789; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3790; GFX8-NEXT: BB20_2: 3791; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 3792; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3793; GFX8-NEXT: v_readfirstlane_b32 s4, v0 3794; GFX8-NEXT: v_bfrev_b32_e32 v0, -2 3795; GFX8-NEXT: v_readfirstlane_b32 s5, v1 3796; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc 3797; GFX8-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 3798; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] 3799; GFX8-NEXT: v_mov_b32_e32 v2, s5 3800; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 3801; GFX8-NEXT: v_mov_b32_e32 v2, s4 3802; GFX8-NEXT: s_mov_b32 s2, -1 3803; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 3804; GFX8-NEXT: s_mov_b32 s3, 0xf000 3805; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 3806; GFX8-NEXT: s_endpgm 3807; 3808; GFX9-LABEL: min_i64_constant: 3809; GFX9: ; %bb.0: ; %entry 3810; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3811; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3812; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 3813; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3814; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 3815; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 3816; GFX9-NEXT: s_cbranch_execz BB20_2 3817; GFX9-NEXT: ; %bb.1: 3818; GFX9-NEXT: v_mov_b32_e32 v0, 5 3819; GFX9-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 3820; GFX9-NEXT: v_mov_b32_e32 v1, 0 3821; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3822; GFX9-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] 3823; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3824; GFX9-NEXT: BB20_2: 3825; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 3826; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3827; GFX9-NEXT: v_readfirstlane_b32 s4, v0 3828; GFX9-NEXT: v_bfrev_b32_e32 v0, -2 3829; GFX9-NEXT: v_readfirstlane_b32 s5, v1 3830; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc 3831; GFX9-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 3832; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] 3833; GFX9-NEXT: v_mov_b32_e32 
v2, s5 3834; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 3835; GFX9-NEXT: v_mov_b32_e32 v2, s4 3836; GFX9-NEXT: s_mov_b32 s2, -1 3837; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 3838; GFX9-NEXT: s_mov_b32 s3, 0xf000 3839; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 3840; GFX9-NEXT: s_endpgm 3841; 3842; GFX1064-LABEL: min_i64_constant: 3843; GFX1064: ; %bb.0: ; %entry 3844; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3845; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 3846; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0 3847; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3848; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 3849; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc 3850; GFX1064-NEXT: s_cbranch_execz BB20_2 3851; GFX1064-NEXT: ; %bb.1: 3852; GFX1064-NEXT: v_mov_b32_e32 v0, 5 3853; GFX1064-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 3854; GFX1064-NEXT: v_mov_b32_e32 v1, 0 3855; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3856; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 3857; GFX1064-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] 3858; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3859; GFX1064-NEXT: buffer_gl0_inv 3860; GFX1064-NEXT: BB20_2: 3861; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 3862; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] 3863; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 3864; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 3865; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc 3866; GFX1064-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 3867; GFX1064-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1] 3868; GFX1064-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc 3869; GFX1064-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc 3870; GFX1064-NEXT: s_mov_b32 s2, -1 3871; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 3872; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3873; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 3874; GFX1064-NEXT: s_endpgm 3875; 3876; GFX1032-LABEL: min_i64_constant: 3877; GFX1032: ; %bb.0: ; %entry 3878; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3879; 
GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 3880; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 3881; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 3882; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo 3883; GFX1032-NEXT: s_cbranch_execz BB20_2 3884; GFX1032-NEXT: ; %bb.1: 3885; GFX1032-NEXT: v_mov_b32_e32 v0, 5 3886; GFX1032-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 3887; GFX1032-NEXT: v_mov_b32_e32 v1, 0 3888; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3889; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 3890; GFX1032-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] 3891; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3892; GFX1032-NEXT: buffer_gl0_inv 3893; GFX1032-NEXT: BB20_2: 3894; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 3895; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 3896; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 3897; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 3898; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc_lo 3899; GFX1032-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc_lo 3900; GFX1032-NEXT: v_cmp_lt_i64_e32 vcc_lo, s[2:3], v[0:1] 3901; GFX1032-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo 3902; GFX1032-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo 3903; GFX1032-NEXT: s_mov_b32 s2, -1 3904; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 3905; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3906; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 3907; GFX1032-NEXT: s_endpgm 3908entry: 3909 %old = atomicrmw min i64 addrspace(3)* @local_var64, i64 5 acq_rel 3910 store i64 %old, i64 addrspace(1)* %out 3911 ret void 3912} 3913 3914define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) { 3915; 3916; 3917; GFX7LESS-LABEL: umax_i32_varying: 3918; GFX7LESS: ; %bb.0: ; %entry 3919; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 3920; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 3921; GFX7LESS-NEXT: s_mov_b32 m0, -1 3922; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3923; GFX7LESS-NEXT: ds_max_rtn_u32 v0, v1, v0 3924; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3925; GFX7LESS-NEXT: s_mov_b32 s3, 
0xf000 3926; GFX7LESS-NEXT: s_mov_b32 s2, -1 3927; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 3928; GFX7LESS-NEXT: s_endpgm 3929; 3930; GFX8-LABEL: umax_i32_varying: 3931; GFX8: ; %bb.0: ; %entry 3932; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3933; GFX8-NEXT: v_mov_b32_e32 v2, v0 3934; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 3935; GFX8-NEXT: v_mov_b32_e32 v1, 0 3936; GFX8-NEXT: s_mov_b64 exec, s[2:3] 3937; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3938; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 3939; GFX8-NEXT: s_not_b64 exec, exec 3940; GFX8-NEXT: v_mov_b32_e32 v2, 0 3941; GFX8-NEXT: s_not_b64 exec, exec 3942; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 3943; GFX8-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 3944; GFX8-NEXT: s_nop 1 3945; GFX8-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 3946; GFX8-NEXT: s_nop 1 3947; GFX8-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 3948; GFX8-NEXT: s_nop 1 3949; GFX8-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 3950; GFX8-NEXT: s_nop 1 3951; GFX8-NEXT: v_max_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 3952; GFX8-NEXT: s_nop 1 3953; GFX8-NEXT: v_max_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 3954; GFX8-NEXT: v_readlane_b32 s4, v2, 63 3955; GFX8-NEXT: s_nop 0 3956; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 3957; GFX8-NEXT: s_mov_b64 exec, s[2:3] 3958; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3959; GFX8-NEXT: ; implicit-def: $vgpr0 3960; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 3961; GFX8-NEXT: s_cbranch_execz BB21_2 3962; GFX8-NEXT: ; %bb.1: 3963; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 3964; GFX8-NEXT: v_mov_b32_e32 v3, s4 3965; GFX8-NEXT: s_mov_b32 m0, -1 3966; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3967; GFX8-NEXT: ds_max_rtn_u32 v0, v0, v3 3968; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3969; GFX8-NEXT: BB21_2: 3970; 
GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 3971; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3972; GFX8-NEXT: v_readfirstlane_b32 s2, v0 3973; GFX8-NEXT: v_mov_b32_e32 v0, v1 3974; GFX8-NEXT: v_max_u32_e32 v0, s2, v0 3975; GFX8-NEXT: s_mov_b32 s3, 0xf000 3976; GFX8-NEXT: s_mov_b32 s2, -1 3977; GFX8-NEXT: s_nop 0 3978; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 3979; GFX8-NEXT: s_endpgm 3980; 3981; GFX9-LABEL: umax_i32_varying: 3982; GFX9: ; %bb.0: ; %entry 3983; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3984; GFX9-NEXT: v_mov_b32_e32 v2, v0 3985; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 3986; GFX9-NEXT: v_mov_b32_e32 v1, 0 3987; GFX9-NEXT: s_mov_b64 exec, s[2:3] 3988; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3989; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 3990; GFX9-NEXT: s_not_b64 exec, exec 3991; GFX9-NEXT: v_mov_b32_e32 v2, 0 3992; GFX9-NEXT: s_not_b64 exec, exec 3993; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 3994; GFX9-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 3995; GFX9-NEXT: s_nop 1 3996; GFX9-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 3997; GFX9-NEXT: s_nop 1 3998; GFX9-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 3999; GFX9-NEXT: s_nop 1 4000; GFX9-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 4001; GFX9-NEXT: s_nop 1 4002; GFX9-NEXT: v_max_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 4003; GFX9-NEXT: s_nop 1 4004; GFX9-NEXT: v_max_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 4005; GFX9-NEXT: v_readlane_b32 s4, v2, 63 4006; GFX9-NEXT: s_nop 0 4007; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 4008; GFX9-NEXT: s_mov_b64 exec, s[2:3] 4009; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4010; GFX9-NEXT: ; implicit-def: $vgpr0 4011; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 4012; GFX9-NEXT: s_cbranch_execz BB21_2 4013; GFX9-NEXT: ; %bb.1: 4014; GFX9-NEXT: v_mov_b32_e32 
v0, local_var32@abs32@lo 4015; GFX9-NEXT: v_mov_b32_e32 v3, s4 4016; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4017; GFX9-NEXT: ds_max_rtn_u32 v0, v0, v3 4018; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4019; GFX9-NEXT: BB21_2: 4020; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 4021; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4022; GFX9-NEXT: v_readfirstlane_b32 s2, v0 4023; GFX9-NEXT: v_mov_b32_e32 v0, v1 4024; GFX9-NEXT: v_max_u32_e32 v0, s2, v0 4025; GFX9-NEXT: s_mov_b32 s3, 0xf000 4026; GFX9-NEXT: s_mov_b32 s2, -1 4027; GFX9-NEXT: s_nop 0 4028; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 4029; GFX9-NEXT: s_endpgm 4030; 4031; GFX1064-LABEL: umax_i32_varying: 4032; GFX1064: ; %bb.0: ; %entry 4033; GFX1064-NEXT: v_mov_b32_e32 v1, v0 4034; GFX1064-NEXT: s_not_b64 exec, exec 4035; GFX1064-NEXT: v_mov_b32_e32 v1, 0 4036; GFX1064-NEXT: s_not_b64 exec, exec 4037; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 4038; GFX1064-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 4039; GFX1064-NEXT: v_mov_b32_e32 v3, 0 4040; GFX1064-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 4041; GFX1064-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 4042; GFX1064-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 4043; GFX1064-NEXT: v_mov_b32_e32 v2, v1 4044; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 4045; GFX1064-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 4046; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 4047; GFX1064-NEXT: v_mov_b32_e32 v2, s4 4048; GFX1064-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 4049; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 4050; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 4051; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 4052; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4053; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 4054; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 4055; 
GFX1064-NEXT: v_writelane_b32 v3, s4, 16 4056; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 4057; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 4058; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 4059; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 4060; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 4061; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 4062; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 4063; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0 4064; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 4065; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 4066; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 4067; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4068; GFX1064-NEXT: s_mov_b32 s2, -1 4069; GFX1064-NEXT: ; implicit-def: $vgpr0 4070; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 4071; GFX1064-NEXT: s_cbranch_execz BB21_2 4072; GFX1064-NEXT: ; %bb.1: 4073; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo 4074; GFX1064-NEXT: v_mov_b32_e32 v4, s7 4075; GFX1064-NEXT: s_mov_b32 s3, s7 4076; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4077; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 4078; GFX1064-NEXT: ds_max_rtn_u32 v0, v7, v4 4079; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 4080; GFX1064-NEXT: buffer_gl0_inv 4081; GFX1064-NEXT: BB21_2: 4082; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 4083; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 4084; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 4085; GFX1064-NEXT: v_mov_b32_e32 v0, v3 4086; GFX1064-NEXT: v_max_u32_e32 v0, s3, v0 4087; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 4088; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 4089; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 4090; GFX1064-NEXT: s_endpgm 4091; 4092; GFX1032-LABEL: umax_i32_varying: 4093; GFX1032: ; %bb.0: ; %entry 4094; GFX1032-NEXT: v_mov_b32_e32 v1, v0 4095; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 4096; GFX1032-NEXT: v_mov_b32_e32 v1, 0 4097; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 4098; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 4099; GFX1032-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 
4100; GFX1032-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 4101; GFX1032-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 4102; GFX1032-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 4103; GFX1032-NEXT: v_mov_b32_e32 v2, v1 4104; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 4105; GFX1032-NEXT: s_mov_b32 exec_lo, s2 4106; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4107; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 4108; GFX1032-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 4109; GFX1032-NEXT: v_mov_b32_e32 v3, 0 4110; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 4111; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 4112; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 4113; GFX1032-NEXT: s_mov_b32 exec_lo, s2 4114; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 4115; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 4116; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 4117; GFX1032-NEXT: s_mov_b32 exec_lo, s2 4118; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 4119; GFX1032-NEXT: s_mov_b32 s2, -1 4120; GFX1032-NEXT: ; implicit-def: $vgpr0 4121; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 4122; GFX1032-NEXT: s_cbranch_execz BB21_2 4123; GFX1032-NEXT: ; %bb.1: 4124; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo 4125; GFX1032-NEXT: v_mov_b32_e32 v4, s4 4126; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4127; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 4128; GFX1032-NEXT: ds_max_rtn_u32 v0, v7, v4 4129; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 4130; GFX1032-NEXT: buffer_gl0_inv 4131; GFX1032-NEXT: BB21_2: 4132; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 4133; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 4134; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 4135; GFX1032-NEXT: v_mov_b32_e32 v0, v3 4136; GFX1032-NEXT: v_max_u32_e32 v0, s3, v0 4137; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 4138; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 4139; GFX1032-NEXT: 
buffer_store_dword v0, off, s[0:3], 0 4140; GFX1032-NEXT: s_endpgm 4141entry: 4142 %lane = call i32 @llvm.amdgcn.workitem.id.x() 4143 %old = atomicrmw umax i32 addrspace(3)* @local_var32, i32 %lane acq_rel 4144 store i32 %old, i32 addrspace(1)* %out 4145 ret void 4146} 4147 4148define amdgpu_kernel void @umax_i64_constant(i64 addrspace(1)* %out) { 4149; 4150; 4151; GFX7LESS-LABEL: umax_i64_constant: 4152; GFX7LESS: ; %bb.0: ; %entry 4153; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 4154; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 4155; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 4156; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4157; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 4158; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc 4159; GFX7LESS-NEXT: s_cbranch_execz BB22_2 4160; GFX7LESS-NEXT: ; %bb.1: 4161; GFX7LESS-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 4162; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5 4163; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 4164; GFX7LESS-NEXT: s_mov_b32 m0, -1 4165; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 4166; GFX7LESS-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] 4167; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 4168; GFX7LESS-NEXT: BB22_2: 4169; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] 4170; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 4171; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 4172; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 4173; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 4174; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 4175; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 4176; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4 4177; GFX7LESS-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[0:1] 4178; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 4179; GFX7LESS-NEXT: v_mov_b32_e32 v1, s5 4180; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc 4181; GFX7LESS-NEXT: s_mov_b32 s2, -1 4182; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4183; GFX7LESS-NEXT: s_endpgm 4184; 4185; GFX8-LABEL: umax_i64_constant: 4186; GFX8: ; %bb.0: ; %entry 4187; GFX8-NEXT: s_load_dwordx2 
s[0:1], s[0:1], 0x24 4188; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4189; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 4190; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4191; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 4192; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 4193; GFX8-NEXT: s_cbranch_execz BB22_2 4194; GFX8-NEXT: ; %bb.1: 4195; GFX8-NEXT: v_mov_b32_e32 v0, 5 4196; GFX8-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 4197; GFX8-NEXT: v_mov_b32_e32 v1, 0 4198; GFX8-NEXT: s_mov_b32 m0, -1 4199; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4200; GFX8-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] 4201; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4202; GFX8-NEXT: BB22_2: 4203; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 4204; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4205; GFX8-NEXT: v_readfirstlane_b32 s2, v0 4206; GFX8-NEXT: v_readfirstlane_b32 s3, v1 4207; GFX8-NEXT: v_mov_b32_e32 v1, 0 4208; GFX8-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 4209; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1] 4210; GFX8-NEXT: v_mov_b32_e32 v1, s3 4211; GFX8-NEXT: v_mov_b32_e32 v2, s2 4212; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 4213; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc 4214; GFX8-NEXT: s_mov_b32 s3, 0xf000 4215; GFX8-NEXT: s_mov_b32 s2, -1 4216; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4217; GFX8-NEXT: s_endpgm 4218; 4219; GFX9-LABEL: umax_i64_constant: 4220; GFX9: ; %bb.0: ; %entry 4221; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4222; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4223; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 4224; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4225; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 4226; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 4227; GFX9-NEXT: s_cbranch_execz BB22_2 4228; GFX9-NEXT: ; %bb.1: 4229; GFX9-NEXT: v_mov_b32_e32 v0, 5 4230; GFX9-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 4231; GFX9-NEXT: v_mov_b32_e32 v1, 0 4232; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4233; GFX9-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] 4234; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4235; GFX9-NEXT: 
BB22_2: 4236; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 4237; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4238; GFX9-NEXT: v_readfirstlane_b32 s2, v0 4239; GFX9-NEXT: v_readfirstlane_b32 s3, v1 4240; GFX9-NEXT: v_mov_b32_e32 v1, 0 4241; GFX9-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 4242; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1] 4243; GFX9-NEXT: v_mov_b32_e32 v1, s3 4244; GFX9-NEXT: v_mov_b32_e32 v2, s2 4245; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 4246; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc 4247; GFX9-NEXT: s_mov_b32 s3, 0xf000 4248; GFX9-NEXT: s_mov_b32 s2, -1 4249; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4250; GFX9-NEXT: s_endpgm 4251; 4252; GFX1064-LABEL: umax_i64_constant: 4253; GFX1064: ; %bb.0: ; %entry 4254; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4255; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 4256; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0 4257; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4258; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 4259; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc 4260; GFX1064-NEXT: s_cbranch_execz BB22_2 4261; GFX1064-NEXT: ; %bb.1: 4262; GFX1064-NEXT: v_mov_b32_e32 v0, 5 4263; GFX1064-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 4264; GFX1064-NEXT: v_mov_b32_e32 v1, 0 4265; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4266; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 4267; GFX1064-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] 4268; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 4269; GFX1064-NEXT: buffer_gl0_inv 4270; GFX1064-NEXT: BB22_2: 4271; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 4272; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] 4273; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 4274; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 4275; GFX1064-NEXT: v_mov_b32_e32 v1, 0 4276; GFX1064-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 4277; GFX1064-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1] 4278; GFX1064-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc 4279; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, s3, vcc 4280; GFX1064-NEXT: s_mov_b32 s3, 
0x31016000 4281; GFX1064-NEXT: s_mov_b32 s2, -1 4282; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 4283; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4284; GFX1064-NEXT: s_endpgm 4285; 4286; GFX1032-LABEL: umax_i64_constant: 4287; GFX1032: ; %bb.0: ; %entry 4288; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4289; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 4290; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 4291; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 4292; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo 4293; GFX1032-NEXT: s_cbranch_execz BB22_2 4294; GFX1032-NEXT: ; %bb.1: 4295; GFX1032-NEXT: v_mov_b32_e32 v0, 5 4296; GFX1032-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 4297; GFX1032-NEXT: v_mov_b32_e32 v1, 0 4298; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4299; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 4300; GFX1032-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] 4301; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 4302; GFX1032-NEXT: buffer_gl0_inv 4303; GFX1032-NEXT: BB22_2: 4304; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 4305; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 4306; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 4307; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 4308; GFX1032-NEXT: v_mov_b32_e32 v1, 0 4309; GFX1032-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc_lo 4310; GFX1032-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[2:3], v[0:1] 4311; GFX1032-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo 4312; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, s3, vcc_lo 4313; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 4314; GFX1032-NEXT: s_mov_b32 s2, -1 4315; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 4316; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4317; GFX1032-NEXT: s_endpgm 4318entry: 4319 %old = atomicrmw umax i64 addrspace(3)* @local_var64, i64 5 acq_rel 4320 store i64 %old, i64 addrspace(1)* %out 4321 ret void 4322} 4323 4324define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) { 4325; 4326; 4327; GFX7LESS-LABEL: umin_i32_varying: 4328; GFX7LESS: ; %bb.0: ; %entry 4329; GFX7LESS-NEXT: 
s_load_dwordx2 s[0:1], s[0:1], 0x9 4330; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 4331; GFX7LESS-NEXT: s_mov_b32 m0, -1 4332; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 4333; GFX7LESS-NEXT: ds_min_rtn_u32 v0, v1, v0 4334; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 4335; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 4336; GFX7LESS-NEXT: s_mov_b32 s2, -1 4337; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 4338; GFX7LESS-NEXT: s_endpgm 4339; 4340; GFX8-LABEL: umin_i32_varying: 4341; GFX8: ; %bb.0: ; %entry 4342; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4343; GFX8-NEXT: v_mov_b32_e32 v2, v0 4344; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4345; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 4346; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 4347; GFX8-NEXT: v_mov_b32_e32 v1, -1 4348; GFX8-NEXT: s_mov_b64 exec, s[2:3] 4349; GFX8-NEXT: s_not_b64 exec, exec 4350; GFX8-NEXT: v_mov_b32_e32 v2, -1 4351; GFX8-NEXT: s_not_b64 exec, exec 4352; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 4353; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 4354; GFX8-NEXT: s_nop 1 4355; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 4356; GFX8-NEXT: s_nop 1 4357; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 4358; GFX8-NEXT: s_nop 1 4359; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 4360; GFX8-NEXT: s_nop 1 4361; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 4362; GFX8-NEXT: s_nop 1 4363; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 4364; GFX8-NEXT: v_readlane_b32 s4, v2, 63 4365; GFX8-NEXT: s_nop 0 4366; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 4367; GFX8-NEXT: s_mov_b64 exec, s[2:3] 4368; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4369; GFX8-NEXT: ; implicit-def: $vgpr0 4370; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 4371; GFX8-NEXT: s_cbranch_execz BB23_2 4372; GFX8-NEXT: ; %bb.1: 4373; GFX8-NEXT: 
v_mov_b32_e32 v0, local_var32@abs32@lo 4374; GFX8-NEXT: v_mov_b32_e32 v3, s4 4375; GFX8-NEXT: s_mov_b32 m0, -1 4376; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4377; GFX8-NEXT: ds_min_rtn_u32 v0, v0, v3 4378; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4379; GFX8-NEXT: BB23_2: 4380; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 4381; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4382; GFX8-NEXT: v_readfirstlane_b32 s2, v0 4383; GFX8-NEXT: v_mov_b32_e32 v0, v1 4384; GFX8-NEXT: v_min_u32_e32 v0, s2, v0 4385; GFX8-NEXT: s_mov_b32 s3, 0xf000 4386; GFX8-NEXT: s_mov_b32 s2, -1 4387; GFX8-NEXT: s_nop 0 4388; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 4389; GFX8-NEXT: s_endpgm 4390; 4391; GFX9-LABEL: umin_i32_varying: 4392; GFX9: ; %bb.0: ; %entry 4393; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4394; GFX9-NEXT: v_mov_b32_e32 v2, v0 4395; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4396; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 4397; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 4398; GFX9-NEXT: v_mov_b32_e32 v1, -1 4399; GFX9-NEXT: s_mov_b64 exec, s[2:3] 4400; GFX9-NEXT: s_not_b64 exec, exec 4401; GFX9-NEXT: v_mov_b32_e32 v2, -1 4402; GFX9-NEXT: s_not_b64 exec, exec 4403; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 4404; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 4405; GFX9-NEXT: s_nop 1 4406; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 4407; GFX9-NEXT: s_nop 1 4408; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 4409; GFX9-NEXT: s_nop 1 4410; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 4411; GFX9-NEXT: s_nop 1 4412; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 4413; GFX9-NEXT: s_nop 1 4414; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 4415; GFX9-NEXT: v_readlane_b32 s4, v2, 63 4416; GFX9-NEXT: s_nop 0 4417; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 4418; GFX9-NEXT: s_mov_b64 exec, s[2:3] 4419; GFX9-NEXT: 
v_cmp_eq_u32_e32 vcc, 0, v0 4420; GFX9-NEXT: ; implicit-def: $vgpr0 4421; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 4422; GFX9-NEXT: s_cbranch_execz BB23_2 4423; GFX9-NEXT: ; %bb.1: 4424; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 4425; GFX9-NEXT: v_mov_b32_e32 v3, s4 4426; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4427; GFX9-NEXT: ds_min_rtn_u32 v0, v0, v3 4428; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4429; GFX9-NEXT: BB23_2: 4430; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 4431; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4432; GFX9-NEXT: v_readfirstlane_b32 s2, v0 4433; GFX9-NEXT: v_mov_b32_e32 v0, v1 4434; GFX9-NEXT: v_min_u32_e32 v0, s2, v0 4435; GFX9-NEXT: s_mov_b32 s3, 0xf000 4436; GFX9-NEXT: s_mov_b32 s2, -1 4437; GFX9-NEXT: s_nop 0 4438; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 4439; GFX9-NEXT: s_endpgm 4440; 4441; GFX1064-LABEL: umin_i32_varying: 4442; GFX1064: ; %bb.0: ; %entry 4443; GFX1064-NEXT: v_mov_b32_e32 v1, v0 4444; GFX1064-NEXT: s_not_b64 exec, exec 4445; GFX1064-NEXT: v_mov_b32_e32 v1, -1 4446; GFX1064-NEXT: s_not_b64 exec, exec 4447; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 4448; GFX1064-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 4449; GFX1064-NEXT: v_mov_b32_e32 v3, -1 4450; GFX1064-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 4451; GFX1064-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 4452; GFX1064-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 4453; GFX1064-NEXT: v_mov_b32_e32 v2, v1 4454; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 4455; GFX1064-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 4456; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 4457; GFX1064-NEXT: v_mov_b32_e32 v2, s4 4458; GFX1064-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 4459; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 4460; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 4461; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 
4462; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4463; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 4464; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 4465; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 4466; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 4467; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 4468; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 4469; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 4470; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 4471; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 4472; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 4473; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0 4474; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 4475; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 4476; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 4477; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4478; GFX1064-NEXT: s_mov_b32 s2, -1 4479; GFX1064-NEXT: ; implicit-def: $vgpr0 4480; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 4481; GFX1064-NEXT: s_cbranch_execz BB23_2 4482; GFX1064-NEXT: ; %bb.1: 4483; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo 4484; GFX1064-NEXT: v_mov_b32_e32 v4, s7 4485; GFX1064-NEXT: s_mov_b32 s3, s7 4486; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4487; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 4488; GFX1064-NEXT: ds_min_rtn_u32 v0, v7, v4 4489; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 4490; GFX1064-NEXT: buffer_gl0_inv 4491; GFX1064-NEXT: BB23_2: 4492; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 4493; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 4494; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 4495; GFX1064-NEXT: v_mov_b32_e32 v0, v3 4496; GFX1064-NEXT: v_min_u32_e32 v0, s3, v0 4497; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 4498; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 4499; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 4500; GFX1064-NEXT: s_endpgm 4501; 4502; GFX1032-LABEL: umin_i32_varying: 4503; GFX1032: ; %bb.0: ; %entry 4504; GFX1032-NEXT: v_mov_b32_e32 v1, v0 4505; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 4506; GFX1032-NEXT: v_mov_b32_e32 v1, -1 4507; GFX1032-NEXT: s_not_b32 
exec_lo, exec_lo 4508; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 4509; GFX1032-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 4510; GFX1032-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 4511; GFX1032-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 4512; GFX1032-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 4513; GFX1032-NEXT: v_mov_b32_e32 v2, v1 4514; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 4515; GFX1032-NEXT: s_mov_b32 exec_lo, s2 4516; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4517; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 4518; GFX1032-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 4519; GFX1032-NEXT: v_mov_b32_e32 v3, -1 4520; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 4521; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 4522; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 4523; GFX1032-NEXT: s_mov_b32 exec_lo, s2 4524; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 4525; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 4526; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 4527; GFX1032-NEXT: s_mov_b32 exec_lo, s2 4528; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 4529; GFX1032-NEXT: s_mov_b32 s2, -1 4530; GFX1032-NEXT: ; implicit-def: $vgpr0 4531; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 4532; GFX1032-NEXT: s_cbranch_execz BB23_2 4533; GFX1032-NEXT: ; %bb.1: 4534; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo 4535; GFX1032-NEXT: v_mov_b32_e32 v4, s4 4536; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4537; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 4538; GFX1032-NEXT: ds_min_rtn_u32 v0, v7, v4 4539; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 4540; GFX1032-NEXT: buffer_gl0_inv 4541; GFX1032-NEXT: BB23_2: 4542; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 4543; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 4544; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 4545; GFX1032-NEXT: v_mov_b32_e32 v0, v3 4546; GFX1032-NEXT: v_min_u32_e32 v0, s3, v0 4547; 
GFX1032-NEXT: s_mov_b32 s3, 0x31016000 4548; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 4549; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 4550; GFX1032-NEXT: s_endpgm 4551entry: 4552 %lane = call i32 @llvm.amdgcn.workitem.id.x() 4553 %old = atomicrmw umin i32 addrspace(3)* @local_var32, i32 %lane acq_rel 4554 store i32 %old, i32 addrspace(1)* %out 4555 ret void 4556} 4557 4558define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) { 4559; Uniform-operand case: atomicrmw umin of the constant i64 5 on @local_var64 (addrspace(3)); the returned old value is stored to %out. 4560; In every check block below, one ds_min_rtn_u64 is issued in the region entered via s_and_saveexec after comparing the v_mbcnt result with 0, and its result is read back with v_readfirstlane_b32. These check lines are autogenerated (see the NOTE on the first line of the file): regenerate them with utils/update_llc_test_checks.py instead of editing by hand. 4561; GFX7LESS-LABEL: umin_i64_constant: 4562; GFX7LESS: ; %bb.0: ; %entry 4563; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 4564; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 4565; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 4566; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4567; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 4568; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc 4569; GFX7LESS-NEXT: s_cbranch_execz BB24_2 4570; GFX7LESS-NEXT: ; %bb.1: 4571; GFX7LESS-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 4572; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5 4573; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 4574; GFX7LESS-NEXT: s_mov_b32 m0, -1 4575; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 4576; GFX7LESS-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1] 4577; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 4578; GFX7LESS-NEXT: BB24_2: 4579; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] 4580; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 4581; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 4582; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 4583; GFX7LESS-NEXT: s_mov_b32 s2, -1 4584; GFX7LESS-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc 4585; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 4586; GFX7LESS-NEXT: v_mov_b32_e32 v2, s5 4587; GFX7LESS-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1] 4588; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 4589; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4 4590; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 4591; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 4592; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4593; GFX7LESS-NEXT: 
s_endpgm 4594; 4595; GFX8-LABEL: umin_i64_constant: 4596; GFX8: ; %bb.0: ; %entry 4597; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4598; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4599; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 4600; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4601; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 4602; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 4603; GFX8-NEXT: s_cbranch_execz BB24_2 4604; GFX8-NEXT: ; %bb.1: 4605; GFX8-NEXT: v_mov_b32_e32 v0, 5 4606; GFX8-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 4607; GFX8-NEXT: v_mov_b32_e32 v1, 0 4608; GFX8-NEXT: s_mov_b32 m0, -1 4609; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4610; GFX8-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1] 4611; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4612; GFX8-NEXT: BB24_2: 4613; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 4614; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4615; GFX8-NEXT: v_readfirstlane_b32 s5, v1 4616; GFX8-NEXT: v_readfirstlane_b32 s4, v0 4617; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc 4618; GFX8-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 4619; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1] 4620; GFX8-NEXT: v_mov_b32_e32 v2, s5 4621; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 4622; GFX8-NEXT: v_mov_b32_e32 v2, s4 4623; GFX8-NEXT: s_mov_b32 s2, -1 4624; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 4625; GFX8-NEXT: s_mov_b32 s3, 0xf000 4626; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4627; GFX8-NEXT: s_endpgm 4628; 4629; GFX9-LABEL: umin_i64_constant: 4630; GFX9: ; %bb.0: ; %entry 4631; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4632; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4633; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 4634; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4635; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 4636; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 4637; GFX9-NEXT: s_cbranch_execz BB24_2 4638; GFX9-NEXT: ; %bb.1: 4639; GFX9-NEXT: v_mov_b32_e32 v0, 5 4640; GFX9-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 4641; GFX9-NEXT: v_mov_b32_e32 v1, 0 4642; 
GFX9-NEXT: s_waitcnt lgkmcnt(0) 4643; GFX9-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1] 4644; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4645; GFX9-NEXT: BB24_2: 4646; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 4647; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4648; GFX9-NEXT: v_readfirstlane_b32 s5, v1 4649; GFX9-NEXT: v_readfirstlane_b32 s4, v0 4650; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc 4651; GFX9-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 4652; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1] 4653; GFX9-NEXT: v_mov_b32_e32 v2, s5 4654; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 4655; GFX9-NEXT: v_mov_b32_e32 v2, s4 4656; GFX9-NEXT: s_mov_b32 s2, -1 4657; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 4658; GFX9-NEXT: s_mov_b32 s3, 0xf000 4659; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4660; GFX9-NEXT: s_endpgm 4661; 4662; GFX1064-LABEL: umin_i64_constant: 4663; GFX1064: ; %bb.0: ; %entry 4664; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4665; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 4666; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0 4667; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4668; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 4669; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc 4670; GFX1064-NEXT: s_cbranch_execz BB24_2 4671; GFX1064-NEXT: ; %bb.1: 4672; GFX1064-NEXT: v_mov_b32_e32 v0, 5 4673; GFX1064-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 4674; GFX1064-NEXT: v_mov_b32_e32 v1, 0 4675; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4676; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 4677; GFX1064-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1] 4678; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 4679; GFX1064-NEXT: buffer_gl0_inv 4680; GFX1064-NEXT: BB24_2: 4681; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 4682; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] 4683; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 4684; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 4685; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc 4686; GFX1064-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 4687; GFX1064-NEXT: 
v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1] 4688; GFX1064-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc 4689; GFX1064-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc 4690; GFX1064-NEXT: s_mov_b32 s2, -1 4691; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 4692; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 4693; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4694; GFX1064-NEXT: s_endpgm 4695; 4696; GFX1032-LABEL: umin_i64_constant: 4697; GFX1032: ; %bb.0: ; %entry 4698; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4699; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 4700; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 4701; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 4702; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo 4703; GFX1032-NEXT: s_cbranch_execz BB24_2 4704; GFX1032-NEXT: ; %bb.1: 4705; GFX1032-NEXT: v_mov_b32_e32 v0, 5 4706; GFX1032-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 4707; GFX1032-NEXT: v_mov_b32_e32 v1, 0 4708; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4709; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 4710; GFX1032-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1] 4711; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 4712; GFX1032-NEXT: buffer_gl0_inv 4713; GFX1032-NEXT: BB24_2: 4714; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 4715; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 4716; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 4717; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 4718; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo 4719; GFX1032-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc_lo 4720; GFX1032-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[2:3], v[0:1] 4721; GFX1032-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo 4722; GFX1032-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo 4723; GFX1032-NEXT: s_mov_b32 s2, -1 4724; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 4725; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 4726; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4727; GFX1032-NEXT: s_endpgm 4728entry: 4729 %old = atomicrmw umin i64 addrspace(3)* @local_var64, i64 5 acq_rel 4730 store i64 %old, i64 addrspace(1)* %out 4731 ret 
void 4732} 4733