; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX7LESS %s
; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s
; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064 %s
; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032 %s

; All five llc invocations above enable -amdgpu-atomic-optimizations and differ
; in their subtarget options. Each one is verified under its own prefix:
;   GFX7LESS  no -mcpu given
;   GFX8      -mcpu=tonga
;   GFX9      -mcpu=gfx900
;   GFX1064   -mcpu=gfx1010 with +wavefrontsize64
;   GFX1032   -mcpu=gfx1010 with +wavefrontsize32
; The assertion lines are machine generated: regenerate them with the script
; named on the first line rather than editing them by hand.

; Supplies the per-work-item value used as the atomic operand in the
; *_varying tests.
declare i32 @llvm.amdgcn.workitem.id.x()

; Address space 3 globals. @local_var32 is the target of every atomicrmw in the
; i32 tests that follow; @local_var64 is not referenced in this part of the
; file (presumably it is used by i64 tests further down - confirm).
@local_var32 = addrspace(3) global i32 undef, align 4
@local_var64 = addrspace(3) global i64 undef, align 8

; Show what the atomic optimization pass will do for local pointers.

; atomicrmw add of the constant 5 to @local_var32; the returned old value is
; stored to %out.
; Expected on every configuration: only the lane whose mbcnt result is 0 issues
; a single ds_add_rtn_u32 whose operand is popcount(exec) * 5 (s_bcnt1 followed
; by s_mul_i32). After the branch rejoins, each lane rebuilds its own result as
; readfirstlane(old) + mbcnt * 5 (v_readfirstlane_b32 + v_mad_u32_u24).
define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out) {
;
;
; GFX7LESS-LABEL: add_i32_constant:
; GFX7LESS:       ; %bb.0: ; %entry
; GFX7LESS-NEXT:    s_mov_b64 s[2:3], exec
; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s3, v0
; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX7LESS-NEXT:    ; implicit-def: $vgpr1
; GFX7LESS-NEXT:    s_and_saveexec_b64 s[4:5], vcc
; GFX7LESS-NEXT:    s_cbranch_execz BB0_2
; GFX7LESS-NEXT:  ; %bb.1:
; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
; GFX7LESS-NEXT:    s_mul_i32 s2, s2, 5
; GFX7LESS-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s2
; GFX7LESS-NEXT:    s_mov_b32 m0, -1
; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT:    ds_add_rtn_u32 v1, v1, v2
; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT:  BB0_2:
; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[4:5]
; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT:    v_readfirstlane_b32 s2, v1
; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
; GFX7LESS-NEXT:    s_mov_b32 s2, -1
; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX7LESS-NEXT:    s_endpgm
;
; GFX8-LABEL: add_i32_constant:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT:    s_mov_b64 s[2:3], exec
; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX8-NEXT:    ; implicit-def: $vgpr1
; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
; GFX8-NEXT:    s_cbranch_execz BB0_2
; GFX8-NEXT:  ; %bb.1:
; GFX8-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
; GFX8-NEXT:    s_mul_i32 s2, s2, 5
; GFX8-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
; GFX8-NEXT:    v_mov_b32_e32 v2, s2
; GFX8-NEXT:    s_mov_b32 m0, -1
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    ds_add_rtn_u32 v1, v1, v2
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:  BB0_2:
; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_readfirstlane_b32 s2, v1
; GFX8-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
; GFX8-NEXT:    s_mov_b32 s3, 0xf000
; GFX8-NEXT:    s_mov_b32 s2, -1
; GFX8-NEXT:    s_nop 1
; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: add_i32_constant:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT:    s_mov_b64 s[2:3], exec
; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT:    ; implicit-def: $vgpr1
; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT:    s_cbranch_execz BB0_2
; GFX9-NEXT:  ; %bb.1:
; GFX9-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
; GFX9-NEXT:    s_mul_i32 s2, s2, 5
; GFX9-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
; GFX9-NEXT:    v_mov_b32_e32 v2, s2
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    ds_add_rtn_u32 v1, v1, v2
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:  BB0_2:
; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_readfirstlane_b32 s2, v1
; GFX9-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
; GFX9-NEXT:    s_mov_b32 s3, 0xf000
; GFX9-NEXT:    s_mov_b32 s2, -1
; GFX9-NEXT:    s_nop 1
; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT:    s_endpgm
;
; GFX1064-LABEL: add_i32_constant:
; GFX1064:       ; %bb.0: ; %entry
; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1064-NEXT:    s_mov_b64 s[2:3], exec
; GFX1064-NEXT:    ; implicit-def: $vgpr1
; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, s3, v0
; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
; GFX1064-NEXT:    s_cbranch_execz BB0_2
; GFX1064-NEXT:  ; %bb.1:
; GFX1064-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
; GFX1064-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
; GFX1064-NEXT:    s_mul_i32 s2, s2, 5
; GFX1064-NEXT:    v_mov_b32_e32 v2, s2
; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1064-NEXT:    ds_add_rtn_u32 v1, v1, v2
; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1064-NEXT:    buffer_gl0_inv
; GFX1064-NEXT:  BB0_2:
; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
; GFX1064-NEXT:    v_readfirstlane_b32 s2, v1
; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
; GFX1064-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
; GFX1064-NEXT:    s_mov_b32 s2, -1
; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX1064-NEXT:    s_endpgm
;
; GFX1032-LABEL: add_i32_constant:
; GFX1032:       ; %bb.0: ; %entry
; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1032-NEXT:    s_mov_b32 s3, exec_lo
; GFX1032-NEXT:    ; implicit-def: $vgpr1
; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s3, 0
; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT:    s_and_saveexec_b32 s2, vcc_lo
; GFX1032-NEXT:    s_cbranch_execz BB0_2
; GFX1032-NEXT:  ; %bb.1:
; GFX1032-NEXT:    s_bcnt1_i32_b32 s3, s3
; GFX1032-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
; GFX1032-NEXT:    s_mul_i32 s3, s3, 5
; GFX1032-NEXT:    v_mov_b32_e32 v2, s3
; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1032-NEXT:    ds_add_rtn_u32 v1, v1, v2
; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1032-NEXT:    buffer_gl0_inv
; GFX1032-NEXT:  BB0_2:
; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
; GFX1032-NEXT:    v_readfirstlane_b32 s2, v1
; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
; GFX1032-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
; GFX1032-NEXT:    s_mov_b32 s2, -1
; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX1032-NEXT:    s_endpgm
entry:
  %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 5 acq_rel
  store i32 %old, i32 addrspace(1)* %out
  ret void
}

; atomicrmw add of the kernel argument %additive to @local_var32; the returned
; old value is stored to %out.
; Expected on every configuration: one lane issues a single ds_add_rtn_u32
; whose operand is %additive * popcount(exec) (s_bcnt1 followed by s_mul_i32
; with the loaded argument). Each lane then adds mbcnt * %additive
; (v_mul_lo_u32) to the value obtained with v_readfirstlane_b32.
define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 %additive) {
;
;
; GFX7LESS-LABEL: add_i32_uniform:
; GFX7LESS:       ; %bb.0: ; %entry
; GFX7LESS-NEXT:    s_mov_b64 s[2:3], exec
; GFX7LESS-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; GFX7LESS-NEXT:    s_load_dword s0, s[0:1], 0xb
; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s3, v0
; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX7LESS-NEXT:    ; implicit-def: $vgpr1
; GFX7LESS-NEXT:    s_and_saveexec_b64 s[6:7], vcc
; GFX7LESS-NEXT:    s_cbranch_execz BB1_2
; GFX7LESS-NEXT:  ; %bb.1:
; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s1, s[2:3]
; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT:    s_mul_i32 s1, s0, s1
; GFX7LESS-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s1
; GFX7LESS-NEXT:    s_mov_b32 m0, -1
; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT:    ds_add_rtn_u32 v1, v1, v2
; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT:  BB1_2:
; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[6:7]
; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT:    v_readfirstlane_b32 s1, v1
; GFX7LESS-NEXT:    v_mul_lo_u32 v0, s0, v0
; GFX7LESS-NEXT:    s_mov_b32 s7, 0xf000
; GFX7LESS-NEXT:    v_add_i32_e32 v0, vcc, s1, v0
; GFX7LESS-NEXT:    s_mov_b32 s6, -1
; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX7LESS-NEXT:    s_endpgm
;
; GFX8-LABEL: add_i32_uniform:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX8-NEXT:    s_load_dword s0, s[0:1], 0x2c
; GFX8-NEXT:    s_mov_b64 s[2:3], exec
; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX8-NEXT:    ; implicit-def: $vgpr1
; GFX8-NEXT:    s_and_saveexec_b64 s[6:7], vcc
; GFX8-NEXT:    s_cbranch_execz BB1_2
; GFX8-NEXT:  ; %bb.1:
; GFX8-NEXT:    s_bcnt1_i32_b64 s1, s[2:3]
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    s_mul_i32 s1, s0, s1
; GFX8-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
; GFX8-NEXT:    v_mov_b32_e32 v2, s1
; GFX8-NEXT:    s_mov_b32 m0, -1
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    ds_add_rtn_u32 v1, v1, v2
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:  BB1_2:
; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mul_lo_u32 v0, s0, v0
; GFX8-NEXT:    v_readfirstlane_b32 s0, v1
; GFX8-NEXT:    s_mov_b32 s7, 0xf000
; GFX8-NEXT:    s_mov_b32 s6, -1
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
; GFX8-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: add_i32_uniform:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x2c
; GFX9-NEXT:    s_mov_b64 s[6:7], exec
; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT:    ; implicit-def: $vgpr1
; GFX9-NEXT:    s_and_saveexec_b64 s[0:1], vcc
; GFX9-NEXT:    s_cbranch_execz BB1_2
; GFX9-NEXT:  ; %bb.1:
; GFX9-NEXT:    s_bcnt1_i32_b64 s3, s[6:7]
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_mul_i32 s3, s2, s3
; GFX9-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
; GFX9-NEXT:    v_mov_b32_e32 v2, s3
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    ds_add_rtn_u32 v1, v1, v2
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:  BB1_2:
; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mul_lo_u32 v0, s2, v0
; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
; GFX9-NEXT:    s_mov_b32 s7, 0xf000
; GFX9-NEXT:    s_mov_b32 s6, -1
; GFX9-NEXT:    v_add_u32_e32 v0, s0, v0
; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT:    s_endpgm
;
; GFX1064-LABEL: add_i32_uniform:
; GFX1064:       ; %bb.0: ; %entry
; GFX1064-NEXT:    s_clause 0x1
; GFX1064-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX1064-NEXT:    s_load_dword s2, s[0:1], 0x2c
; GFX1064-NEXT:    s_mov_b64 s[6:7], exec
; GFX1064-NEXT:    ; implicit-def: $vgpr1
; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s6, 0
; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, s7, v0
; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT:    s_and_saveexec_b64 s[0:1], vcc
; GFX1064-NEXT:    s_cbranch_execz BB1_2
; GFX1064-NEXT:  ; %bb.1:
; GFX1064-NEXT:    s_bcnt1_i32_b64 s3, s[6:7]
; GFX1064-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1064-NEXT:    s_mul_i32 s3, s2, s3
; GFX1064-NEXT:    v_mov_b32_e32 v2, s3
; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1064-NEXT:    ds_add_rtn_u32 v1, v1, v2
; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1064-NEXT:    buffer_gl0_inv
; GFX1064-NEXT:  BB1_2:
; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
; GFX1064-NEXT:    s_or_b64 exec, exec, s[0:1]
; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1064-NEXT:    v_mul_lo_u32 v0, s2, v0
; GFX1064-NEXT:    v_readfirstlane_b32 s0, v1
; GFX1064-NEXT:    s_mov_b32 s7, 0x31016000
; GFX1064-NEXT:    s_mov_b32 s6, -1
; GFX1064-NEXT:    v_add_nc_u32_e32 v0, s0, v0
; GFX1064-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX1064-NEXT:    s_endpgm
;
; GFX1032-LABEL: add_i32_uniform:
; GFX1032:       ; %bb.0: ; %entry
; GFX1032-NEXT:    s_clause 0x1
; GFX1032-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX1032-NEXT:    s_load_dword s2, s[0:1], 0x2c
; GFX1032-NEXT:    s_mov_b32 s3, exec_lo
; GFX1032-NEXT:    ; implicit-def: $vgpr1
; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s3, 0
; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
; GFX1032-NEXT:    s_cbranch_execz BB1_2
; GFX1032-NEXT:  ; %bb.1:
; GFX1032-NEXT:    s_bcnt1_i32_b32 s1, s3
; GFX1032-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1032-NEXT:    s_mul_i32 s1, s2, s1
; GFX1032-NEXT:    v_mov_b32_e32 v2, s1
; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1032-NEXT:    ds_add_rtn_u32 v1, v1, v2
; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1032-NEXT:    buffer_gl0_inv
; GFX1032-NEXT:  BB1_2:
; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s0
; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1032-NEXT:    v_mul_lo_u32 v0, s2, v0
; GFX1032-NEXT:    v_readfirstlane_b32 s0, v1
; GFX1032-NEXT:    s_mov_b32 s7, 0x31016000
; GFX1032-NEXT:    s_mov_b32 s6, -1
; GFX1032-NEXT:    v_add_nc_u32_e32 v0, s0, v0
; GFX1032-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX1032-NEXT:    s_endpgm
entry:
  %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %additive acq_rel
  store i32 %old, i32 addrspace(1)* %out
  ret void
}

; atomicrmw add whose addend is the work-item id returned by
; @llvm.amdgcn.workitem.id.x; the returned old value is stored to %out.
; The GFX7LESS expectations show no rewrite: every lane issues a plain
; ds_add_rtn_u32 with its own v0.
; The GFX8 and GFX9 expectations combine the per-lane values with DPP adds
; (row_shr 1/2/4/8, then row_bcast 15/31), read the wave total from lane 63,
; shift the partial sums by one lane (wave_shr:1), let one lane issue the
; ds_add_rtn_u32 with the total, and finally add each lane's shifted partial
; sum to the value obtained with v_readfirstlane_b32.
; The GFX1064 and GFX1032 expectations follow the same outline but use
; v_permlanex16_b32 plus v_readlane_b32 / v_writelane_b32 in place of
; row_bcast and wave_shr.
define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out) {
;
;
; GFX7LESS-LABEL: add_i32_varying:
; GFX7LESS:       ; %bb.0: ; %entry
; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7LESS-NEXT:    v_mov_b32_e32 v1, local_var32@abs32@lo
; GFX7LESS-NEXT:    s_mov_b32 m0, -1
; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT:    ds_add_rtn_u32 v0, v1, v0
; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT:    s_mov_b32 s2, -1
; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX7LESS-NEXT:    s_endpgm
;
; GFX8-LABEL: add_i32_varying:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT:    v_mov_b32_e32 v2, v0
; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
; GFX8-NEXT:    v_mov_b32_e32 v1, 0
; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX8-NEXT:    s_not_b64 exec, exec
; GFX8-NEXT:    v_mov_b32_e32 v2, 0
; GFX8-NEXT:    s_not_b64 exec, exec
; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX8-NEXT:    s_nop 1
; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX8-NEXT:    s_nop 1
; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX8-NEXT:    s_nop 1
; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX8-NEXT:    s_nop 1
; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
; GFX8-NEXT:    s_nop 1
; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
; GFX8-NEXT:    v_readlane_b32 s4, v2, 63
; GFX8-NEXT:    s_nop 0
; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX8-NEXT:    ; implicit-def: $vgpr0
; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
; GFX8-NEXT:    s_cbranch_execz BB2_2
; GFX8-NEXT:  ; %bb.1:
; GFX8-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
; GFX8-NEXT:    v_mov_b32_e32 v3, s4
; GFX8-NEXT:    s_mov_b32 m0, -1
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    ds_add_rtn_u32 v0, v0, v3
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:  BB2_2:
; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
; GFX8-NEXT:    v_mov_b32_e32 v0, v1
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
; GFX8-NEXT:    s_mov_b32 s3, 0xf000
; GFX8-NEXT:    s_mov_b32 s2, -1
; GFX8-NEXT:    s_nop 0
; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: add_i32_varying:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT:    v_mov_b32_e32 v2, v0
; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
; GFX9-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX9-NEXT:    s_not_b64 exec, exec
; GFX9-NEXT:    v_mov_b32_e32 v2, 0
; GFX9-NEXT:    s_not_b64 exec, exec
; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX9-NEXT:    s_nop 1
; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX9-NEXT:    s_nop 1
; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX9-NEXT:    s_nop 1
; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX9-NEXT:    s_nop 1
; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
; GFX9-NEXT:    s_nop 1
; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
; GFX9-NEXT:    v_readlane_b32 s4, v2, 63
; GFX9-NEXT:    s_nop 0
; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT:    ; implicit-def: $vgpr0
; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT:    s_cbranch_execz BB2_2
; GFX9-NEXT:  ; %bb.1:
; GFX9-NEXT:    v_mov_b32_e32 v0, local_var32@abs32@lo
; GFX9-NEXT:    v_mov_b32_e32 v3, s4
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    ds_add_rtn_u32 v0, v0, v3
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:  BB2_2:
; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
; GFX9-NEXT:    v_mov_b32_e32 v0, v1
; GFX9-NEXT:    v_add_u32_e32 v0, s2, v0
; GFX9-NEXT:    s_mov_b32 s3, 0xf000
; GFX9-NEXT:    s_mov_b32 s2, -1
; GFX9-NEXT:    s_nop 0
; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT:    s_endpgm
;
; GFX1064-LABEL: add_i32_varying:
; GFX1064:       ; %bb.0: ; %entry
; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
; GFX1064-NEXT:    s_not_b64 exec, exec
; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
; GFX1064-NEXT:    s_not_b64 exec, exec
; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1064-NEXT:    v_readlane_b32 s4, v1, 31
; GFX1064-NEXT:    v_mov_b32_e32 v2, s4
; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
; GFX1064-NEXT:    v_readlane_b32 s4, v1, 15
; GFX1064-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
; GFX1064-NEXT:    v_readlane_b32 s5, v1, 31
; GFX1064-NEXT:    v_writelane_b32 v3, s4, 16
; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
; GFX1064-NEXT:    v_readlane_b32 s7, v1, 63
; GFX1064-NEXT:    v_readlane_b32 s6, v1, 47
; GFX1064-NEXT:    v_writelane_b32 v3, s5, 32
; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0
; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
; GFX1064-NEXT:    v_writelane_b32 v3, s6, 48
; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT:    s_mov_b32 s2, -1
; GFX1064-NEXT:    ; implicit-def: $vgpr0
; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
; GFX1064-NEXT:    s_cbranch_execz BB2_2
; GFX1064-NEXT:  ; %bb.1:
; GFX1064-NEXT:    v_mov_b32_e32 v7, local_var32@abs32@lo
; GFX1064-NEXT:    v_mov_b32_e32 v4, s7
; GFX1064-NEXT:    s_mov_b32 s3, s7
; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1064-NEXT:    ds_add_rtn_u32 v0, v7, v4
; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1064-NEXT:    buffer_gl0_inv
; GFX1064-NEXT:  BB2_2:
; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
; GFX1064-NEXT:    v_mov_b32_e32 v0, v3
; GFX1064-NEXT:    v_add_nc_u32_e32 v0, s3, v0
; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX1064-NEXT:    s_endpgm
;
; GFX1032-LABEL: add_i32_varying:
; GFX1032:       ; %bb.0: ; %entry
; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
; GFX1032-NEXT:    v_readlane_b32 s3, v1, 15
; GFX1032-NEXT:    v_readlane_b32 s4, v1, 31
; GFX1032-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
; GFX1032-NEXT:    v_writelane_b32 v3, s3, 16
; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT:    s_mov_b32 s2, -1
; GFX1032-NEXT:    ; implicit-def: $vgpr0
; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
; GFX1032-NEXT:    s_cbranch_execz BB2_2
; GFX1032-NEXT:  ; %bb.1:
; GFX1032-NEXT:    v_mov_b32_e32 v7, local_var32@abs32@lo
; GFX1032-NEXT:    v_mov_b32_e32 v4, s4
; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1032-NEXT:    ds_add_rtn_u32 v0, v7, v4
; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1032-NEXT:    buffer_gl0_inv
; GFX1032-NEXT:  BB2_2:
; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
; GFX1032-NEXT:    v_mov_b32_e32 v0, v3
; GFX1032-NEXT:    v_add_nc_u32_e32 v0, s3, v0
; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX1032-NEXT:    s_endpgm
entry:
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %lane acq_rel
  store i32 %old, i32 addrspace(1)* %out
  ret void
}
577 578define amdgpu_kernel void @add_i32_varying_gfx1032(i32 addrspace(1)* %out) { 579; 580; 581; GFX7LESS-LABEL: add_i32_varying_gfx1032: 582; GFX7LESS: ; %bb.0: ; %entry 583; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 584; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 585; GFX7LESS-NEXT: s_mov_b32 m0, -1 586; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 587; GFX7LESS-NEXT: 
ds_add_rtn_u32 v0, v1, v0 588; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 589; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 590; GFX7LESS-NEXT: s_mov_b32 s2, -1 591; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 592; GFX7LESS-NEXT: s_endpgm 593; 594; GFX8-LABEL: add_i32_varying_gfx1032: 595; GFX8: ; %bb.0: ; %entry 596; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 597; GFX8-NEXT: v_mov_b32_e32 v2, v0 598; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 599; GFX8-NEXT: v_mov_b32_e32 v1, 0 600; GFX8-NEXT: s_mov_b64 exec, s[2:3] 601; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 602; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 603; GFX8-NEXT: s_not_b64 exec, exec 604; GFX8-NEXT: v_mov_b32_e32 v2, 0 605; GFX8-NEXT: s_not_b64 exec, exec 606; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 607; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 608; GFX8-NEXT: s_nop 1 609; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 610; GFX8-NEXT: s_nop 1 611; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 612; GFX8-NEXT: s_nop 1 613; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 614; GFX8-NEXT: s_nop 1 615; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 616; GFX8-NEXT: s_nop 1 617; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 618; GFX8-NEXT: v_readlane_b32 s4, v2, 63 619; GFX8-NEXT: s_nop 0 620; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 621; GFX8-NEXT: s_mov_b64 exec, s[2:3] 622; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 623; GFX8-NEXT: ; implicit-def: $vgpr0 624; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 625; GFX8-NEXT: s_cbranch_execz BB3_2 626; GFX8-NEXT: ; %bb.1: 627; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 628; GFX8-NEXT: v_mov_b32_e32 v3, s4 629; GFX8-NEXT: s_mov_b32 m0, -1 630; GFX8-NEXT: s_waitcnt lgkmcnt(0) 631; GFX8-NEXT: 
ds_add_rtn_u32 v0, v0, v3 632; GFX8-NEXT: s_waitcnt lgkmcnt(0) 633; GFX8-NEXT: BB3_2: 634; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 635; GFX8-NEXT: s_waitcnt lgkmcnt(0) 636; GFX8-NEXT: v_readfirstlane_b32 s2, v0 637; GFX8-NEXT: v_mov_b32_e32 v0, v1 638; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0 639; GFX8-NEXT: s_mov_b32 s3, 0xf000 640; GFX8-NEXT: s_mov_b32 s2, -1 641; GFX8-NEXT: s_nop 0 642; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 643; GFX8-NEXT: s_endpgm 644; 645; GFX9-LABEL: add_i32_varying_gfx1032: 646; GFX9: ; %bb.0: ; %entry 647; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 648; GFX9-NEXT: v_mov_b32_e32 v2, v0 649; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 650; GFX9-NEXT: v_mov_b32_e32 v1, 0 651; GFX9-NEXT: s_mov_b64 exec, s[2:3] 652; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 653; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 654; GFX9-NEXT: s_not_b64 exec, exec 655; GFX9-NEXT: v_mov_b32_e32 v2, 0 656; GFX9-NEXT: s_not_b64 exec, exec 657; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 658; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 659; GFX9-NEXT: s_nop 1 660; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 661; GFX9-NEXT: s_nop 1 662; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 663; GFX9-NEXT: s_nop 1 664; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 665; GFX9-NEXT: s_nop 1 666; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 667; GFX9-NEXT: s_nop 1 668; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 669; GFX9-NEXT: v_readlane_b32 s4, v2, 63 670; GFX9-NEXT: s_nop 0 671; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 672; GFX9-NEXT: s_mov_b64 exec, s[2:3] 673; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 674; GFX9-NEXT: ; implicit-def: $vgpr0 675; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 676; GFX9-NEXT: s_cbranch_execz 
BB3_2 677; GFX9-NEXT: ; %bb.1: 678; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 679; GFX9-NEXT: v_mov_b32_e32 v3, s4 680; GFX9-NEXT: s_waitcnt lgkmcnt(0) 681; GFX9-NEXT: ds_add_rtn_u32 v0, v0, v3 682; GFX9-NEXT: s_waitcnt lgkmcnt(0) 683; GFX9-NEXT: BB3_2: 684; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 685; GFX9-NEXT: s_waitcnt lgkmcnt(0) 686; GFX9-NEXT: v_readfirstlane_b32 s2, v0 687; GFX9-NEXT: v_mov_b32_e32 v0, v1 688; GFX9-NEXT: v_add_u32_e32 v0, s2, v0 689; GFX9-NEXT: s_mov_b32 s3, 0xf000 690; GFX9-NEXT: s_mov_b32 s2, -1 691; GFX9-NEXT: s_nop 0 692; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 693; GFX9-NEXT: s_endpgm 694; 695; GFX1064-LABEL: add_i32_varying_gfx1032: 696; GFX1064: ; %bb.0: ; %entry 697; GFX1064-NEXT: v_mov_b32_e32 v1, v0 698; GFX1064-NEXT: s_not_b64 exec, exec 699; GFX1064-NEXT: v_mov_b32_e32 v1, 0 700; GFX1064-NEXT: s_not_b64 exec, exec 701; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 702; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 703; GFX1064-NEXT: v_mov_b32_e32 v3, 0 704; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 705; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 706; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 707; GFX1064-NEXT: v_mov_b32_e32 v2, v1 708; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 709; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 710; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 711; GFX1064-NEXT: v_mov_b32_e32 v2, s4 712; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 713; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 714; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 715; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 716; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 717; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 718; 
GFX1064-NEXT: v_readlane_b32 s5, v1, 31 719; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 720; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 721; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 722; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 723; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 724; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 725; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 726; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 727; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0 728; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 729; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 730; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 731; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 732; GFX1064-NEXT: s_mov_b32 s2, -1 733; GFX1064-NEXT: ; implicit-def: $vgpr0 734; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 735; GFX1064-NEXT: s_cbranch_execz BB3_2 736; GFX1064-NEXT: ; %bb.1: 737; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo 738; GFX1064-NEXT: v_mov_b32_e32 v4, s7 739; GFX1064-NEXT: s_mov_b32 s3, s7 740; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 741; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 742; GFX1064-NEXT: ds_add_rtn_u32 v0, v7, v4 743; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 744; GFX1064-NEXT: buffer_gl0_inv 745; GFX1064-NEXT: BB3_2: 746; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 747; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 748; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 749; GFX1064-NEXT: v_mov_b32_e32 v0, v3 750; GFX1064-NEXT: v_add_nc_u32_e32 v0, s3, v0 751; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 752; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 753; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 754; GFX1064-NEXT: s_endpgm 755; 756; GFX1032-LABEL: add_i32_varying_gfx1032: 757; GFX1032: ; %bb.0: ; %entry 758; GFX1032-NEXT: v_mov_b32_e32 v1, v0 759; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 760; GFX1032-NEXT: v_mov_b32_e32 v1, 0 761; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 762; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 763; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 
bound_ctrl:1 764; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 765; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 766; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 767; GFX1032-NEXT: v_mov_b32_e32 v2, v1 768; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 769; GFX1032-NEXT: s_mov_b32 exec_lo, s2 770; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 771; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 772; GFX1032-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 773; GFX1032-NEXT: v_mov_b32_e32 v3, 0 774; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 775; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 776; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 777; GFX1032-NEXT: s_mov_b32 exec_lo, s2 778; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 779; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 780; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 781; GFX1032-NEXT: s_mov_b32 exec_lo, s2 782; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 783; GFX1032-NEXT: s_mov_b32 s2, -1 784; GFX1032-NEXT: ; implicit-def: $vgpr0 785; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 786; GFX1032-NEXT: s_cbranch_execz BB3_2 787; GFX1032-NEXT: ; %bb.1: 788; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo 789; GFX1032-NEXT: v_mov_b32_e32 v4, s4 790; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 791; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 792; GFX1032-NEXT: ds_add_rtn_u32 v0, v7, v4 793; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 794; GFX1032-NEXT: buffer_gl0_inv 795; GFX1032-NEXT: BB3_2: 796; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 797; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 798; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 799; GFX1032-NEXT: v_mov_b32_e32 v0, v3 800; GFX1032-NEXT: v_add_nc_u32_e32 v0, s3, v0 801; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 802; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 803; GFX1032-NEXT: buffer_store_dword v0, 
off, s[0:3], 0 804; GFX1032-NEXT: s_endpgm 805entry: 806 %lane = call i32 @llvm.amdgcn.workitem.id.x() 807 %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %lane acq_rel 808 store i32 %old, i32 addrspace(1)* %out 809 ret void 810} 811 812define amdgpu_kernel void @add_i32_varying_gfx1064(i32 addrspace(1)* %out) { 813; 814; 815; GFX7LESS-LABEL: add_i32_varying_gfx1064: 816; GFX7LESS: ; %bb.0: ; %entry 817; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 818; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 819; GFX7LESS-NEXT: s_mov_b32 m0, -1 820; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 821; GFX7LESS-NEXT: ds_add_rtn_u32 v0, v1, v0 822; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 823; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 824; GFX7LESS-NEXT: s_mov_b32 s2, -1 825; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 826; GFX7LESS-NEXT: s_endpgm 827; 828; GFX8-LABEL: add_i32_varying_gfx1064: 829; GFX8: ; %bb.0: ; %entry 830; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 831; GFX8-NEXT: v_mov_b32_e32 v2, v0 832; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 833; GFX8-NEXT: v_mov_b32_e32 v1, 0 834; GFX8-NEXT: s_mov_b64 exec, s[2:3] 835; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 836; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 837; GFX8-NEXT: s_not_b64 exec, exec 838; GFX8-NEXT: v_mov_b32_e32 v2, 0 839; GFX8-NEXT: s_not_b64 exec, exec 840; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 841; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 842; GFX8-NEXT: s_nop 1 843; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 844; GFX8-NEXT: s_nop 1 845; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 846; GFX8-NEXT: s_nop 1 847; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 848; GFX8-NEXT: s_nop 1 849; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 850; GFX8-NEXT: s_nop 1 851; 
GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 852; GFX8-NEXT: v_readlane_b32 s4, v2, 63 853; GFX8-NEXT: s_nop 0 854; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 855; GFX8-NEXT: s_mov_b64 exec, s[2:3] 856; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 857; GFX8-NEXT: ; implicit-def: $vgpr0 858; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 859; GFX8-NEXT: s_cbranch_execz BB4_2 860; GFX8-NEXT: ; %bb.1: 861; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 862; GFX8-NEXT: v_mov_b32_e32 v3, s4 863; GFX8-NEXT: s_mov_b32 m0, -1 864; GFX8-NEXT: s_waitcnt lgkmcnt(0) 865; GFX8-NEXT: ds_add_rtn_u32 v0, v0, v3 866; GFX8-NEXT: s_waitcnt lgkmcnt(0) 867; GFX8-NEXT: BB4_2: 868; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 869; GFX8-NEXT: s_waitcnt lgkmcnt(0) 870; GFX8-NEXT: v_readfirstlane_b32 s2, v0 871; GFX8-NEXT: v_mov_b32_e32 v0, v1 872; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0 873; GFX8-NEXT: s_mov_b32 s3, 0xf000 874; GFX8-NEXT: s_mov_b32 s2, -1 875; GFX8-NEXT: s_nop 0 876; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 877; GFX8-NEXT: s_endpgm 878; 879; GFX9-LABEL: add_i32_varying_gfx1064: 880; GFX9: ; %bb.0: ; %entry 881; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 882; GFX9-NEXT: v_mov_b32_e32 v2, v0 883; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 884; GFX9-NEXT: v_mov_b32_e32 v1, 0 885; GFX9-NEXT: s_mov_b64 exec, s[2:3] 886; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 887; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 888; GFX9-NEXT: s_not_b64 exec, exec 889; GFX9-NEXT: v_mov_b32_e32 v2, 0 890; GFX9-NEXT: s_not_b64 exec, exec 891; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 892; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 893; GFX9-NEXT: s_nop 1 894; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 895; GFX9-NEXT: s_nop 1 896; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 897; GFX9-NEXT: s_nop 1 898; GFX9-NEXT: 
v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 899; GFX9-NEXT: s_nop 1 900; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 901; GFX9-NEXT: s_nop 1 902; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 903; GFX9-NEXT: v_readlane_b32 s4, v2, 63 904; GFX9-NEXT: s_nop 0 905; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 906; GFX9-NEXT: s_mov_b64 exec, s[2:3] 907; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 908; GFX9-NEXT: ; implicit-def: $vgpr0 909; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 910; GFX9-NEXT: s_cbranch_execz BB4_2 911; GFX9-NEXT: ; %bb.1: 912; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 913; GFX9-NEXT: v_mov_b32_e32 v3, s4 914; GFX9-NEXT: s_waitcnt lgkmcnt(0) 915; GFX9-NEXT: ds_add_rtn_u32 v0, v0, v3 916; GFX9-NEXT: s_waitcnt lgkmcnt(0) 917; GFX9-NEXT: BB4_2: 918; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 919; GFX9-NEXT: s_waitcnt lgkmcnt(0) 920; GFX9-NEXT: v_readfirstlane_b32 s2, v0 921; GFX9-NEXT: v_mov_b32_e32 v0, v1 922; GFX9-NEXT: v_add_u32_e32 v0, s2, v0 923; GFX9-NEXT: s_mov_b32 s3, 0xf000 924; GFX9-NEXT: s_mov_b32 s2, -1 925; GFX9-NEXT: s_nop 0 926; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 927; GFX9-NEXT: s_endpgm 928; 929; GFX1064-LABEL: add_i32_varying_gfx1064: 930; GFX1064: ; %bb.0: ; %entry 931; GFX1064-NEXT: v_mov_b32_e32 v1, v0 932; GFX1064-NEXT: s_not_b64 exec, exec 933; GFX1064-NEXT: v_mov_b32_e32 v1, 0 934; GFX1064-NEXT: s_not_b64 exec, exec 935; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 936; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 937; GFX1064-NEXT: v_mov_b32_e32 v3, 0 938; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 939; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 940; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 941; GFX1064-NEXT: 
v_mov_b32_e32 v2, v1 942; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 943; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 944; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 945; GFX1064-NEXT: v_mov_b32_e32 v2, s4 946; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 947; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 948; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 949; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 950; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 951; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 952; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 953; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 954; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 955; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 956; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 957; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 958; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 959; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 960; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 961; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0 962; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 963; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 964; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 965; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 966; GFX1064-NEXT: s_mov_b32 s2, -1 967; GFX1064-NEXT: ; implicit-def: $vgpr0 968; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 969; GFX1064-NEXT: s_cbranch_execz BB4_2 970; GFX1064-NEXT: ; %bb.1: 971; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo 972; GFX1064-NEXT: v_mov_b32_e32 v4, s7 973; GFX1064-NEXT: s_mov_b32 s3, s7 974; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 975; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 976; GFX1064-NEXT: ds_add_rtn_u32 v0, v7, v4 977; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 978; GFX1064-NEXT: buffer_gl0_inv 979; GFX1064-NEXT: BB4_2: 980; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 981; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 982; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 983; GFX1064-NEXT: 
v_mov_b32_e32 v0, v3 984; GFX1064-NEXT: v_add_nc_u32_e32 v0, s3, v0 985; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 986; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 987; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 988; GFX1064-NEXT: s_endpgm 989; 990; GFX1032-LABEL: add_i32_varying_gfx1064: 991; GFX1032: ; %bb.0: ; %entry 992; GFX1032-NEXT: v_mov_b32_e32 v1, v0 993; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 994; GFX1032-NEXT: v_mov_b32_e32 v1, 0 995; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 996; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 997; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 998; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 999; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 1000; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 1001; GFX1032-NEXT: v_mov_b32_e32 v2, v1 1002; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 1003; GFX1032-NEXT: s_mov_b32 exec_lo, s2 1004; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1005; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 1006; GFX1032-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 1007; GFX1032-NEXT: v_mov_b32_e32 v3, 0 1008; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 1009; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 1010; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 1011; GFX1032-NEXT: s_mov_b32 exec_lo, s2 1012; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 1013; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 1014; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 1015; GFX1032-NEXT: s_mov_b32 exec_lo, s2 1016; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 1017; GFX1032-NEXT: s_mov_b32 s2, -1 1018; GFX1032-NEXT: ; implicit-def: $vgpr0 1019; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 1020; GFX1032-NEXT: s_cbranch_execz BB4_2 1021; GFX1032-NEXT: ; %bb.1: 1022; GFX1032-NEXT: v_mov_b32_e32 v7, 
local_var32@abs32@lo 1023; GFX1032-NEXT: v_mov_b32_e32 v4, s4 1024; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1025; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 1026; GFX1032-NEXT: ds_add_rtn_u32 v0, v7, v4 1027; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1028; GFX1032-NEXT: buffer_gl0_inv 1029; GFX1032-NEXT: BB4_2: 1030; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 1031; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 1032; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 1033; GFX1032-NEXT: v_mov_b32_e32 v0, v3 1034; GFX1032-NEXT: v_add_nc_u32_e32 v0, s3, v0 1035; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 1036; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1037; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 1038; GFX1032-NEXT: s_endpgm 1039entry: 1040 %lane = call i32 @llvm.amdgcn.workitem.id.x() 1041 %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %lane acq_rel 1042 store i32 %old, i32 addrspace(1)* %out 1043 ret void 1044} 1045 1046define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) { 1047; Uniform 64-bit add - every lane atomically adds the constant 5 to the addrspace(3) global @local_var64 (acq_rel) and stores the returned old value to %out. 1048; Expected code on all five check prefixes - one elected lane (mbcnt result == 0) issues a single ds_add_rtn_u64 of 5 * bit-count of the saved exec mask, then every lane rebuilds its own result as readfirstlane(old) + 5 * its mbcnt value. The check lines below are autogenerated and byte-exact, so regenerate them with utils/update_llc_test_checks.py instead of editing by hand. 1049; GFX7LESS-LABEL: add_i64_constant: 1050; GFX7LESS: ; %bb.0: ; %entry 1051; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec 1052; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1053; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 1054; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 1055; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1056; GFX7LESS-NEXT: ; implicit-def: $vgpr1_vgpr2 1057; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc 1058; GFX7LESS-NEXT: s_cbranch_execz BB5_2 1059; GFX7LESS-NEXT: ; %bb.1: 1060; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 1061; GFX7LESS-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 1062; GFX7LESS-NEXT: s_mul_i32 s5, s4, 5 1063; GFX7LESS-NEXT: v_mul_hi_u32_u24_e64 v2, s4, 5 1064; GFX7LESS-NEXT: v_mov_b32_e32 v1, s5 1065; GFX7LESS-NEXT: s_mov_b32 m0, -1 1066; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1067; GFX7LESS-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] 1068; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1069; GFX7LESS-NEXT: BB5_2: 1070; GFX7LESS-NEXT: s_or_b64 exec, exec, 
s[2:3] 1071; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1072; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v1 1073; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v2 1074; GFX7LESS-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v0 1075; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1076; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 1077; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4 1078; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s2, v0 1079; GFX7LESS-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc 1080; GFX7LESS-NEXT: s_mov_b32 s2, -1 1081; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1082; GFX7LESS-NEXT: s_endpgm 1083; 1084; GFX8-LABEL: add_i64_constant: 1085; GFX8: ; %bb.0: ; %entry 1086; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1087; GFX8-NEXT: s_mov_b64 s[4:5], exec 1088; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 1089; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 1090; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1091; GFX8-NEXT: ; implicit-def: $vgpr1_vgpr2 1092; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 1093; GFX8-NEXT: s_cbranch_execz BB5_2 1094; GFX8-NEXT: ; %bb.1: 1095; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 1096; GFX8-NEXT: v_mul_hi_u32_u24_e64 v2, s4, 5 1097; GFX8-NEXT: s_mul_i32 s4, s4, 5 1098; GFX8-NEXT: v_mov_b32_e32 v1, s4 1099; GFX8-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 1100; GFX8-NEXT: s_mov_b32 m0, -1 1101; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1102; GFX8-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] 1103; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1104; GFX8-NEXT: BB5_2: 1105; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 1106; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1107; GFX8-NEXT: v_readfirstlane_b32 s2, v1 1108; GFX8-NEXT: v_readfirstlane_b32 s3, v2 1109; GFX8-NEXT: v_mov_b32_e32 v1, s2 1110; GFX8-NEXT: v_mov_b32_e32 v2, s3 1111; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v0, 5, v[1:2] 1112; GFX8-NEXT: s_mov_b32 s3, 0xf000 1113; GFX8-NEXT: s_mov_b32 s2, -1 1114; GFX8-NEXT: s_nop 2 1115; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1116; GFX8-NEXT: s_endpgm 1117; 1118; GFX9-LABEL: add_i64_constant: 1119; GFX9: ; %bb.0: ; 
%entry 1120; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1121; GFX9-NEXT: s_mov_b64 s[4:5], exec 1122; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 1123; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 1124; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1125; GFX9-NEXT: ; implicit-def: $vgpr1_vgpr2 1126; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 1127; GFX9-NEXT: s_cbranch_execz BB5_2 1128; GFX9-NEXT: ; %bb.1: 1129; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 1130; GFX9-NEXT: v_mul_hi_u32_u24_e64 v2, s4, 5 1131; GFX9-NEXT: s_mul_i32 s4, s4, 5 1132; GFX9-NEXT: v_mov_b32_e32 v1, s4 1133; GFX9-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 1134; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1135; GFX9-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] 1136; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1137; GFX9-NEXT: BB5_2: 1138; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 1139; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1140; GFX9-NEXT: v_readfirstlane_b32 s2, v1 1141; GFX9-NEXT: v_readfirstlane_b32 s3, v2 1142; GFX9-NEXT: v_mov_b32_e32 v1, s2 1143; GFX9-NEXT: v_mov_b32_e32 v2, s3 1144; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v0, 5, v[1:2] 1145; GFX9-NEXT: s_mov_b32 s3, 0xf000 1146; GFX9-NEXT: s_mov_b32 s2, -1 1147; GFX9-NEXT: s_nop 2 1148; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1149; GFX9-NEXT: s_endpgm 1150; 1151; GFX1064-LABEL: add_i64_constant: 1152; GFX1064: ; %bb.0: ; %entry 1153; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1154; GFX1064-NEXT: s_mov_b64 s[4:5], exec 1155; GFX1064-NEXT: ; implicit-def: $vgpr1_vgpr2 1156; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 1157; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s5, v0 1158; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1159; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc 1160; GFX1064-NEXT: s_cbranch_execz BB5_2 1161; GFX1064-NEXT: ; %bb.1: 1162; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 1163; GFX1064-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 1164; GFX1064-NEXT: s_mul_i32 s5, s4, 5 1165; GFX1064-NEXT: v_mul_hi_u32_u24_e64 v2, s4, 5 1166; GFX1064-NEXT: v_mov_b32_e32 
v1, s5 1167; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1168; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 1169; GFX1064-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] 1170; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1171; GFX1064-NEXT: buffer_gl0_inv 1172; GFX1064-NEXT: BB5_2: 1173; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 1174; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] 1175; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 1176; GFX1064-NEXT: v_readfirstlane_b32 s3, v2 1177; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v0, 5, s[2:3] 1178; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 1179; GFX1064-NEXT: s_mov_b32 s2, -1 1180; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1181; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1182; GFX1064-NEXT: s_endpgm 1183; 1184; GFX1032-LABEL: add_i64_constant: 1185; GFX1032: ; %bb.0: ; %entry 1186; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1187; GFX1032-NEXT: s_mov_b32 s3, exec_lo 1188; GFX1032-NEXT: ; implicit-def: $vgpr1_vgpr2 1189; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s3, 0 1190; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 1191; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo 1192; GFX1032-NEXT: s_cbranch_execz BB5_2 1193; GFX1032-NEXT: ; %bb.1: 1194; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 1195; GFX1032-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 1196; GFX1032-NEXT: s_mul_i32 s4, s3, 5 1197; GFX1032-NEXT: v_mul_hi_u32_u24_e64 v2, s3, 5 1198; GFX1032-NEXT: v_mov_b32_e32 v1, s4 1199; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1200; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 1201; GFX1032-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] 1202; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1203; GFX1032-NEXT: buffer_gl0_inv 1204; GFX1032-NEXT: BB5_2: 1205; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 1206; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 1207; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 1208; GFX1032-NEXT: v_readfirstlane_b32 s3, v2 1209; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s2, v0, 5, s[2:3] 1210; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 1211; GFX1032-NEXT: s_mov_b32 s2, -1 1212; 
GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1213; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1214; GFX1032-NEXT: s_endpgm 1215entry: 1216 %old = atomicrmw add i64 addrspace(3)* @local_var64, i64 5 acq_rel 1217 store i64 %old, i64 addrspace(1)* %out 1218 ret void 1219} 1220 1221define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 %additive) { 1222; 1223; 1224; GFX7LESS-LABEL: add_i64_uniform: 1225; GFX7LESS: ; %bb.0: ; %entry 1226; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec 1227; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 1228; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 1229; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s7, v0 1230; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1231; GFX7LESS-NEXT: ; implicit-def: $vgpr1_vgpr2 1232; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc 1233; GFX7LESS-NEXT: s_cbranch_execz BB6_2 1234; GFX7LESS-NEXT: ; %bb.1: 1235; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 1236; GFX7LESS-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 1237; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1238; GFX7LESS-NEXT: s_mul_i32 s7, s3, s6 1239; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6 1240; GFX7LESS-NEXT: v_mul_hi_u32 v1, s2, v1 1241; GFX7LESS-NEXT: s_mul_i32 s6, s2, s6 1242; GFX7LESS-NEXT: v_add_i32_e32 v2, vcc, s7, v1 1243; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6 1244; GFX7LESS-NEXT: s_mov_b32 m0, -1 1245; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1246; GFX7LESS-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] 1247; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1248; GFX7LESS-NEXT: BB6_2: 1249; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] 1250; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 1251; GFX7LESS-NEXT: s_mov_b32 s6, -1 1252; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1253; GFX7LESS-NEXT: s_mov_b32 s4, s0 1254; GFX7LESS-NEXT: s_mov_b32 s5, s1 1255; GFX7LESS-NEXT: v_readfirstlane_b32 s0, v1 1256; GFX7LESS-NEXT: v_readfirstlane_b32 s1, v2 1257; GFX7LESS-NEXT: v_mul_lo_u32 v1, s3, v0 1258; GFX7LESS-NEXT: v_mul_hi_u32 v2, s2, v0 1259; GFX7LESS-NEXT: v_mul_lo_u32 v0, s2, v0 1260; 
GFX7LESS-NEXT: v_add_i32_e32 v1, vcc, v2, v1 1261; GFX7LESS-NEXT: v_mov_b32_e32 v2, s1 1262; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s0, v0 1263; GFX7LESS-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc 1264; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1265; GFX7LESS-NEXT: s_endpgm 1266; 1267; GFX8-LABEL: add_i64_uniform: 1268; GFX8: ; %bb.0: ; %entry 1269; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1270; GFX8-NEXT: s_mov_b64 s[6:7], exec 1271; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 1272; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 1273; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1274; GFX8-NEXT: ; implicit-def: $vgpr1_vgpr2 1275; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 1276; GFX8-NEXT: s_cbranch_execz BB6_2 1277; GFX8-NEXT: ; %bb.1: 1278; GFX8-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 1279; GFX8-NEXT: v_mov_b32_e32 v1, s6 1280; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1281; GFX8-NEXT: v_mul_hi_u32 v1, s2, v1 1282; GFX8-NEXT: s_mul_i32 s7, s3, s6 1283; GFX8-NEXT: s_mul_i32 s6, s2, s6 1284; GFX8-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 1285; GFX8-NEXT: v_add_u32_e32 v2, vcc, s7, v1 1286; GFX8-NEXT: v_mov_b32_e32 v1, s6 1287; GFX8-NEXT: s_mov_b32 m0, -1 1288; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1289; GFX8-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] 1290; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1291; GFX8-NEXT: BB6_2: 1292; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 1293; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1294; GFX8-NEXT: s_mov_b32 s4, s0 1295; GFX8-NEXT: v_readfirstlane_b32 s0, v1 1296; GFX8-NEXT: v_mul_lo_u32 v1, s3, v0 1297; GFX8-NEXT: v_mul_hi_u32 v3, s2, v0 1298; GFX8-NEXT: v_mul_lo_u32 v0, s2, v0 1299; GFX8-NEXT: s_mov_b32 s5, s1 1300; GFX8-NEXT: v_readfirstlane_b32 s1, v2 1301; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 1302; GFX8-NEXT: v_mov_b32_e32 v2, s1 1303; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 1304; GFX8-NEXT: s_mov_b32 s7, 0xf000 1305; GFX8-NEXT: s_mov_b32 s6, -1 1306; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc 1307; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1308; 
GFX8-NEXT: s_endpgm 1309; 1310; GFX9-LABEL: add_i64_uniform: 1311; GFX9: ; %bb.0: ; %entry 1312; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1313; GFX9-NEXT: s_mov_b64 s[6:7], exec 1314; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 1315; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 1316; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1317; GFX9-NEXT: ; implicit-def: $vgpr1_vgpr2 1318; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 1319; GFX9-NEXT: s_cbranch_execz BB6_2 1320; GFX9-NEXT: ; %bb.1: 1321; GFX9-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 1322; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1323; GFX9-NEXT: s_mul_i32 s7, s3, s6 1324; GFX9-NEXT: s_mul_hi_u32 s8, s2, s6 1325; GFX9-NEXT: s_add_i32 s8, s8, s7 1326; GFX9-NEXT: s_mul_i32 s6, s2, s6 1327; GFX9-NEXT: v_mov_b32_e32 v1, s6 1328; GFX9-NEXT: v_mov_b32_e32 v2, s8 1329; GFX9-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 1330; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1331; GFX9-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] 1332; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1333; GFX9-NEXT: BB6_2: 1334; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 1335; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1336; GFX9-NEXT: v_mul_lo_u32 v3, s3, v0 1337; GFX9-NEXT: v_mul_hi_u32 v4, s2, v0 1338; GFX9-NEXT: v_mul_lo_u32 v0, s2, v0 1339; GFX9-NEXT: s_mov_b32 s4, s0 1340; GFX9-NEXT: v_readfirstlane_b32 s0, v1 1341; GFX9-NEXT: s_mov_b32 s5, s1 1342; GFX9-NEXT: v_readfirstlane_b32 s1, v2 1343; GFX9-NEXT: v_add_u32_e32 v1, v4, v3 1344; GFX9-NEXT: v_mov_b32_e32 v2, s1 1345; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 1346; GFX9-NEXT: s_mov_b32 s7, 0xf000 1347; GFX9-NEXT: s_mov_b32 s6, -1 1348; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc 1349; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1350; GFX9-NEXT: s_endpgm 1351; 1352; GFX1064-LABEL: add_i64_uniform: 1353; GFX1064: ; %bb.0: ; %entry 1354; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1355; GFX1064-NEXT: s_mov_b64 s[6:7], exec 1356; GFX1064-NEXT: ; implicit-def: $vgpr1_vgpr2 1357; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 1358; 
GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s7, v0 1359; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1360; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 1361; GFX1064-NEXT: s_cbranch_execz BB6_2 1362; GFX1064-NEXT: ; %bb.1: 1363; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 1364; GFX1064-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 1365; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1366; GFX1064-NEXT: s_mul_i32 s7, s3, s6 1367; GFX1064-NEXT: s_mul_hi_u32 s8, s2, s6 1368; GFX1064-NEXT: s_mul_i32 s6, s2, s6 1369; GFX1064-NEXT: s_add_i32 s8, s8, s7 1370; GFX1064-NEXT: v_mov_b32_e32 v1, s6 1371; GFX1064-NEXT: v_mov_b32_e32 v2, s8 1372; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1373; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 1374; GFX1064-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] 1375; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1376; GFX1064-NEXT: buffer_gl0_inv 1377; GFX1064-NEXT: BB6_2: 1378; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 1379; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 1380; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1381; GFX1064-NEXT: v_mul_lo_u32 v3, s3, v0 1382; GFX1064-NEXT: v_mul_hi_u32 v4, s2, v0 1383; GFX1064-NEXT: v_mul_lo_u32 v0, s2, v0 1384; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 1385; GFX1064-NEXT: v_readfirstlane_b32 s4, v2 1386; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 1387; GFX1064-NEXT: v_add_nc_u32_e32 v1, v4, v3 1388; GFX1064-NEXT: v_add_co_u32_e64 v0, vcc, s2, v0 1389; GFX1064-NEXT: s_mov_b32 s2, -1 1390; GFX1064-NEXT: v_add_co_ci_u32_e32 v1, vcc, s4, v1, vcc 1391; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1392; GFX1064-NEXT: s_endpgm 1393; 1394; GFX1032-LABEL: add_i64_uniform: 1395; GFX1032: ; %bb.0: ; %entry 1396; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1397; GFX1032-NEXT: s_mov_b32 s5, exec_lo 1398; GFX1032-NEXT: ; implicit-def: $vgpr1_vgpr2 1399; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s5, 0 1400; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 1401; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo 1402; GFX1032-NEXT: s_cbranch_execz BB6_2 1403; 
GFX1032-NEXT: ; %bb.1: 1404; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s5 1405; GFX1032-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 1406; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1407; GFX1032-NEXT: s_mul_i32 s6, s3, s5 1408; GFX1032-NEXT: s_mul_hi_u32 s7, s2, s5 1409; GFX1032-NEXT: s_mul_i32 s5, s2, s5 1410; GFX1032-NEXT: s_add_i32 s7, s7, s6 1411; GFX1032-NEXT: v_mov_b32_e32 v1, s5 1412; GFX1032-NEXT: v_mov_b32_e32 v2, s7 1413; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1414; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 1415; GFX1032-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] 1416; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1417; GFX1032-NEXT: buffer_gl0_inv 1418; GFX1032-NEXT: BB6_2: 1419; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 1420; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 1421; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1422; GFX1032-NEXT: v_mul_lo_u32 v3, s3, v0 1423; GFX1032-NEXT: v_mul_hi_u32 v4, s2, v0 1424; GFX1032-NEXT: v_mul_lo_u32 v0, s2, v0 1425; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 1426; GFX1032-NEXT: v_readfirstlane_b32 s4, v2 1427; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 1428; GFX1032-NEXT: v_add_nc_u32_e32 v1, v4, v3 1429; GFX1032-NEXT: v_add_co_u32_e64 v0, vcc_lo, s2, v0 1430; GFX1032-NEXT: s_mov_b32 s2, -1 1431; GFX1032-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s4, v1, vcc_lo 1432; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1433; GFX1032-NEXT: s_endpgm 1434entry: 1435 %old = atomicrmw add i64 addrspace(3)* @local_var64, i64 %additive acq_rel 1436 store i64 %old, i64 addrspace(1)* %out 1437 ret void 1438} 1439 1440; Divergent 64-bit add - each lane adds its zero-extended workitem id to the addrspace(3) global @local_var64 (acq_rel) and stores the returned old value to %out. 1441; The autogenerated checks below expect this case to stay unoptimized for every check prefix - a plain per-lane ds_add_rtn_u64 with no v_mbcnt_lo_u32_b32, v_mbcnt_hi_u32_b32 or s_bcnt1_i32_b64. 1442; (This comment replaces three 'GCN-NOT' lines that FileCheck never evaluated, because none of the RUN lines at the top of the file enables a GCN check prefix.) 1443define amdgpu_kernel void @add_i64_varying(i64 addrspace(1)* %out) { 1444; 1445; 1446; GFX7LESS-LABEL: add_i64_varying: 1447; GFX7LESS: ; %bb.0: ; %entry 1448; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1449; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 1450; GFX7LESS-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 1451; GFX7LESS-NEXT: s_mov_b32 m0, -1 1452; 
GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1453; GFX7LESS-NEXT: ds_add_rtn_u64 v[0:1], v2, v[0:1] 1454; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1455; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 1456; GFX7LESS-NEXT: s_mov_b32 s2, -1 1457; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1458; GFX7LESS-NEXT: s_endpgm 1459; 1460; GFX8-LABEL: add_i64_varying: 1461; GFX8: ; %bb.0: ; %entry 1462; GFX8-NEXT: v_mov_b32_e32 v1, 0 1463; GFX8-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 1464; GFX8-NEXT: s_mov_b32 m0, -1 1465; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1466; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1467; GFX8-NEXT: ds_add_rtn_u64 v[0:1], v2, v[0:1] 1468; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1469; GFX8-NEXT: s_mov_b32 s3, 0xf000 1470; GFX8-NEXT: s_mov_b32 s2, -1 1471; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1472; GFX8-NEXT: s_endpgm 1473; 1474; GFX9-LABEL: add_i64_varying: 1475; GFX9: ; %bb.0: ; %entry 1476; GFX9-NEXT: v_mov_b32_e32 v1, 0 1477; GFX9-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 1478; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1479; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1480; GFX9-NEXT: ds_add_rtn_u64 v[0:1], v2, v[0:1] 1481; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1482; GFX9-NEXT: s_mov_b32 s3, 0xf000 1483; GFX9-NEXT: s_mov_b32 s2, -1 1484; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1485; GFX9-NEXT: s_endpgm 1486; 1487; GFX1064-LABEL: add_i64_varying: 1488; GFX1064: ; %bb.0: ; %entry 1489; GFX1064-NEXT: v_mov_b32_e32 v1, 0 1490; GFX1064-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 1491; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1492; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 1493; GFX1064-NEXT: s_mov_b32 s2, -1 1494; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1495; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 1496; GFX1064-NEXT: ds_add_rtn_u64 v[0:1], v2, v[0:1] 1497; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1498; GFX1064-NEXT: buffer_gl0_inv 1499; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1500; GFX1064-NEXT: s_endpgm 1501; 1502; 
GFX1032-LABEL: add_i64_varying: 1503; GFX1032: ; %bb.0: ; %entry 1504; GFX1032-NEXT: v_mov_b32_e32 v1, 0 1505; GFX1032-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 1506; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1507; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 1508; GFX1032-NEXT: s_mov_b32 s2, -1 1509; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1510; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 1511; GFX1032-NEXT: ds_add_rtn_u64 v[0:1], v2, v[0:1] 1512; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1513; GFX1032-NEXT: buffer_gl0_inv 1514; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1515; GFX1032-NEXT: s_endpgm 1516entry: 1517 %lane = call i32 @llvm.amdgcn.workitem.id.x() 1518 %zext = zext i32 %lane to i64 1519 %old = atomicrmw add i64 addrspace(3)* @local_var64, i64 %zext acq_rel 1520 store i64 %old, i64 addrspace(1)* %out 1521 ret void 1522} 1523 1524define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out) { 1525; 1526; 1527; GFX7LESS-LABEL: sub_i32_constant: 1528; GFX7LESS: ; %bb.0: ; %entry 1529; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec 1530; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1531; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 1532; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 1533; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1534; GFX7LESS-NEXT: ; implicit-def: $vgpr1 1535; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc 1536; GFX7LESS-NEXT: s_cbranch_execz BB8_2 1537; GFX7LESS-NEXT: ; %bb.1: 1538; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1539; GFX7LESS-NEXT: s_mul_i32 s2, s2, 5 1540; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 1541; GFX7LESS-NEXT: v_mov_b32_e32 v2, s2 1542; GFX7LESS-NEXT: s_mov_b32 m0, -1 1543; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1544; GFX7LESS-NEXT: ds_sub_rtn_u32 v1, v1, v2 1545; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1546; GFX7LESS-NEXT: BB8_2: 1547; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] 1548; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1549; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v1 1550; GFX7LESS-NEXT: 
v_mul_u32_u24_e32 v0, 5, v0 1551; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 1552; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 1553; GFX7LESS-NEXT: s_mov_b32 s2, -1 1554; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 1555; GFX7LESS-NEXT: s_endpgm 1556; 1557; GFX8-LABEL: sub_i32_constant: 1558; GFX8: ; %bb.0: ; %entry 1559; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1560; GFX8-NEXT: s_mov_b64 s[2:3], exec 1561; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1562; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 1563; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1564; GFX8-NEXT: ; implicit-def: $vgpr1 1565; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 1566; GFX8-NEXT: s_cbranch_execz BB8_2 1567; GFX8-NEXT: ; %bb.1: 1568; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1569; GFX8-NEXT: s_mul_i32 s2, s2, 5 1570; GFX8-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 1571; GFX8-NEXT: v_mov_b32_e32 v2, s2 1572; GFX8-NEXT: s_mov_b32 m0, -1 1573; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1574; GFX8-NEXT: ds_sub_rtn_u32 v1, v1, v2 1575; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1576; GFX8-NEXT: BB8_2: 1577; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 1578; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1579; GFX8-NEXT: v_readfirstlane_b32 s2, v1 1580; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1581; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s2, v0 1582; GFX8-NEXT: s_mov_b32 s3, 0xf000 1583; GFX8-NEXT: s_mov_b32 s2, -1 1584; GFX8-NEXT: s_nop 0 1585; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 1586; GFX8-NEXT: s_endpgm 1587; 1588; GFX9-LABEL: sub_i32_constant: 1589; GFX9: ; %bb.0: ; %entry 1590; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1591; GFX9-NEXT: s_mov_b64 s[2:3], exec 1592; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1593; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 1594; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1595; GFX9-NEXT: ; implicit-def: $vgpr1 1596; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 1597; GFX9-NEXT: s_cbranch_execz BB8_2 1598; GFX9-NEXT: ; %bb.1: 1599; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1600; GFX9-NEXT: s_mul_i32 s2, s2, 5 1601; 
GFX9-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 1602; GFX9-NEXT: v_mov_b32_e32 v2, s2 1603; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1604; GFX9-NEXT: ds_sub_rtn_u32 v1, v1, v2 1605; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1606; GFX9-NEXT: BB8_2: 1607; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 1608; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1609; GFX9-NEXT: v_readfirstlane_b32 s2, v1 1610; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1611; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 1612; GFX9-NEXT: s_mov_b32 s3, 0xf000 1613; GFX9-NEXT: s_mov_b32 s2, -1 1614; GFX9-NEXT: s_nop 0 1615; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 1616; GFX9-NEXT: s_endpgm 1617; 1618; GFX1064-LABEL: sub_i32_constant: 1619; GFX1064: ; %bb.0: ; %entry 1620; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1621; GFX1064-NEXT: s_mov_b64 s[2:3], exec 1622; GFX1064-NEXT: ; implicit-def: $vgpr1 1623; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 1624; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s3, v0 1625; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1626; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 1627; GFX1064-NEXT: s_cbranch_execz BB8_2 1628; GFX1064-NEXT: ; %bb.1: 1629; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] 1630; GFX1064-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 1631; GFX1064-NEXT: s_mul_i32 s2, s2, 5 1632; GFX1064-NEXT: v_mov_b32_e32 v2, s2 1633; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1634; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 1635; GFX1064-NEXT: ds_sub_rtn_u32 v1, v1, v2 1636; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1637; GFX1064-NEXT: buffer_gl0_inv 1638; GFX1064-NEXT: BB8_2: 1639; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 1640; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 1641; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 1642; GFX1064-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1643; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 1644; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s2, v0 1645; GFX1064-NEXT: s_mov_b32 s2, -1 1646; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1647; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 1648; GFX1064-NEXT: s_endpgm 
1649; 1650; GFX1032-LABEL: sub_i32_constant: 1651; GFX1032: ; %bb.0: ; %entry 1652; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1653; GFX1032-NEXT: s_mov_b32 s3, exec_lo 1654; GFX1032-NEXT: ; implicit-def: $vgpr1 1655; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s3, 0 1656; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 1657; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo 1658; GFX1032-NEXT: s_cbranch_execz BB8_2 1659; GFX1032-NEXT: ; %bb.1: 1660; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 1661; GFX1032-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 1662; GFX1032-NEXT: s_mul_i32 s3, s3, 5 1663; GFX1032-NEXT: v_mov_b32_e32 v2, s3 1664; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1665; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 1666; GFX1032-NEXT: ds_sub_rtn_u32 v1, v1, v2 1667; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1668; GFX1032-NEXT: buffer_gl0_inv 1669; GFX1032-NEXT: BB8_2: 1670; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 1671; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 1672; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 1673; GFX1032-NEXT: v_mul_u32_u24_e32 v0, 5, v0 1674; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 1675; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s2, v0 1676; GFX1032-NEXT: s_mov_b32 s2, -1 1677; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1678; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 1679; GFX1032-NEXT: s_endpgm 1680entry: 1681 %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 5 acq_rel 1682 store i32 %old, i32 addrspace(1)* %out 1683 ret void 1684} 1685 1686define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 %subitive) { 1687; 1688; 1689; GFX7LESS-LABEL: sub_i32_uniform: 1690; GFX7LESS: ; %bb.0: ; %entry 1691; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec 1692; GFX7LESS-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 1693; GFX7LESS-NEXT: s_load_dword s0, s[0:1], 0xb 1694; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 1695; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 1696; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1697; GFX7LESS-NEXT: ; implicit-def: $vgpr1 
1698; GFX7LESS-NEXT: s_and_saveexec_b64 s[6:7], vcc 1699; GFX7LESS-NEXT: s_cbranch_execz BB9_2 1700; GFX7LESS-NEXT: ; %bb.1: 1701; GFX7LESS-NEXT: s_bcnt1_i32_b64 s1, s[2:3] 1702; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1703; GFX7LESS-NEXT: s_mul_i32 s1, s0, s1 1704; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 1705; GFX7LESS-NEXT: v_mov_b32_e32 v2, s1 1706; GFX7LESS-NEXT: s_mov_b32 m0, -1 1707; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1708; GFX7LESS-NEXT: ds_sub_rtn_u32 v1, v1, v2 1709; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1710; GFX7LESS-NEXT: BB9_2: 1711; GFX7LESS-NEXT: s_or_b64 exec, exec, s[6:7] 1712; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1713; GFX7LESS-NEXT: v_readfirstlane_b32 s1, v1 1714; GFX7LESS-NEXT: v_mul_lo_u32 v0, s0, v0 1715; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 1716; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s1, v0 1717; GFX7LESS-NEXT: s_mov_b32 s6, -1 1718; GFX7LESS-NEXT: buffer_store_dword v0, off, s[4:7], 0 1719; GFX7LESS-NEXT: s_endpgm 1720; 1721; GFX8-LABEL: sub_i32_uniform: 1722; GFX8: ; %bb.0: ; %entry 1723; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 1724; GFX8-NEXT: s_load_dword s0, s[0:1], 0x2c 1725; GFX8-NEXT: s_mov_b64 s[2:3], exec 1726; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 1727; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 1728; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1729; GFX8-NEXT: ; implicit-def: $vgpr1 1730; GFX8-NEXT: s_and_saveexec_b64 s[6:7], vcc 1731; GFX8-NEXT: s_cbranch_execz BB9_2 1732; GFX8-NEXT: ; %bb.1: 1733; GFX8-NEXT: s_bcnt1_i32_b64 s1, s[2:3] 1734; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1735; GFX8-NEXT: s_mul_i32 s1, s0, s1 1736; GFX8-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 1737; GFX8-NEXT: v_mov_b32_e32 v2, s1 1738; GFX8-NEXT: s_mov_b32 m0, -1 1739; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1740; GFX8-NEXT: ds_sub_rtn_u32 v1, v1, v2 1741; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1742; GFX8-NEXT: BB9_2: 1743; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] 1744; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1745; GFX8-NEXT: v_mul_lo_u32 v0, s0, v0 1746; GFX8-NEXT: 
v_readfirstlane_b32 s0, v1 1747; GFX8-NEXT: s_mov_b32 s7, 0xf000 1748; GFX8-NEXT: s_mov_b32 s6, -1 1749; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s0, v0 1750; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0 1751; GFX8-NEXT: s_endpgm 1752; 1753; GFX9-LABEL: sub_i32_uniform: 1754; GFX9: ; %bb.0: ; %entry 1755; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 1756; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c 1757; GFX9-NEXT: s_mov_b64 s[6:7], exec 1758; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 1759; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 1760; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1761; GFX9-NEXT: ; implicit-def: $vgpr1 1762; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc 1763; GFX9-NEXT: s_cbranch_execz BB9_2 1764; GFX9-NEXT: ; %bb.1: 1765; GFX9-NEXT: s_bcnt1_i32_b64 s3, s[6:7] 1766; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1767; GFX9-NEXT: s_mul_i32 s3, s2, s3 1768; GFX9-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 1769; GFX9-NEXT: v_mov_b32_e32 v2, s3 1770; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1771; GFX9-NEXT: ds_sub_rtn_u32 v1, v1, v2 1772; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1773; GFX9-NEXT: BB9_2: 1774; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] 1775; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1776; GFX9-NEXT: v_mul_lo_u32 v0, s2, v0 1777; GFX9-NEXT: v_readfirstlane_b32 s0, v1 1778; GFX9-NEXT: s_mov_b32 s7, 0xf000 1779; GFX9-NEXT: s_mov_b32 s6, -1 1780; GFX9-NEXT: v_sub_u32_e32 v0, s0, v0 1781; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 1782; GFX9-NEXT: s_endpgm 1783; 1784; GFX1064-LABEL: sub_i32_uniform: 1785; GFX1064: ; %bb.0: ; %entry 1786; GFX1064-NEXT: s_clause 0x1 1787; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 1788; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x2c 1789; GFX1064-NEXT: s_mov_b64 s[6:7], exec 1790; GFX1064-NEXT: ; implicit-def: $vgpr1 1791; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 1792; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s7, v0 1793; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1794; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc 1795; GFX1064-NEXT: s_cbranch_execz BB9_2 
1796; GFX1064-NEXT: ; %bb.1: 1797; GFX1064-NEXT: s_bcnt1_i32_b64 s3, s[6:7] 1798; GFX1064-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 1799; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1800; GFX1064-NEXT: s_mul_i32 s3, s2, s3 1801; GFX1064-NEXT: v_mov_b32_e32 v2, s3 1802; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1803; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 1804; GFX1064-NEXT: ds_sub_rtn_u32 v1, v1, v2 1805; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1806; GFX1064-NEXT: buffer_gl0_inv 1807; GFX1064-NEXT: BB9_2: 1808; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 1809; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] 1810; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1811; GFX1064-NEXT: v_mul_lo_u32 v0, s2, v0 1812; GFX1064-NEXT: v_readfirstlane_b32 s0, v1 1813; GFX1064-NEXT: s_mov_b32 s7, 0x31016000 1814; GFX1064-NEXT: s_mov_b32 s6, -1 1815; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s0, v0 1816; GFX1064-NEXT: buffer_store_dword v0, off, s[4:7], 0 1817; GFX1064-NEXT: s_endpgm 1818; 1819; GFX1032-LABEL: sub_i32_uniform: 1820; GFX1032: ; %bb.0: ; %entry 1821; GFX1032-NEXT: s_clause 0x1 1822; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 1823; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x2c 1824; GFX1032-NEXT: s_mov_b32 s3, exec_lo 1825; GFX1032-NEXT: ; implicit-def: $vgpr1 1826; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s3, 0 1827; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 1828; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo 1829; GFX1032-NEXT: s_cbranch_execz BB9_2 1830; GFX1032-NEXT: ; %bb.1: 1831; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s3 1832; GFX1032-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 1833; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1834; GFX1032-NEXT: s_mul_i32 s1, s2, s1 1835; GFX1032-NEXT: v_mov_b32_e32 v2, s1 1836; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1837; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 1838; GFX1032-NEXT: ds_sub_rtn_u32 v1, v1, v2 1839; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1840; GFX1032-NEXT: buffer_gl0_inv 1841; GFX1032-NEXT: BB9_2: 1842; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 1843; 
GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 1844; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1845; GFX1032-NEXT: v_mul_lo_u32 v0, s2, v0 1846; GFX1032-NEXT: v_readfirstlane_b32 s0, v1 1847; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 1848; GFX1032-NEXT: s_mov_b32 s6, -1 1849; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s0, v0 1850; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0 1851; GFX1032-NEXT: s_endpgm 1852entry: 1853 %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 %subitive acq_rel 1854 store i32 %old, i32 addrspace(1)* %out 1855 ret void 1856} 1857 1858define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) { 1859; 1860; 1861; GFX7LESS-LABEL: sub_i32_varying: 1862; GFX7LESS: ; %bb.0: ; %entry 1863; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1864; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 1865; GFX7LESS-NEXT: s_mov_b32 m0, -1 1866; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1867; GFX7LESS-NEXT: ds_sub_rtn_u32 v0, v1, v0 1868; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 1869; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 1870; GFX7LESS-NEXT: s_mov_b32 s2, -1 1871; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 1872; GFX7LESS-NEXT: s_endpgm 1873; 1874; GFX8-LABEL: sub_i32_varying: 1875; GFX8: ; %bb.0: ; %entry 1876; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1877; GFX8-NEXT: v_mov_b32_e32 v2, v0 1878; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 1879; GFX8-NEXT: v_mov_b32_e32 v1, 0 1880; GFX8-NEXT: s_mov_b64 exec, s[2:3] 1881; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 1882; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 1883; GFX8-NEXT: s_not_b64 exec, exec 1884; GFX8-NEXT: v_mov_b32_e32 v2, 0 1885; GFX8-NEXT: s_not_b64 exec, exec 1886; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 1887; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 1888; GFX8-NEXT: s_nop 1 1889; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 1890; GFX8-NEXT: s_nop 1 1891; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, 
v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 1892; GFX8-NEXT: s_nop 1 1893; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 1894; GFX8-NEXT: s_nop 1 1895; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 1896; GFX8-NEXT: s_nop 1 1897; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 1898; GFX8-NEXT: v_readlane_b32 s4, v2, 63 1899; GFX8-NEXT: s_nop 0 1900; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 1901; GFX8-NEXT: s_mov_b64 exec, s[2:3] 1902; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1903; GFX8-NEXT: ; implicit-def: $vgpr0 1904; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 1905; GFX8-NEXT: s_cbranch_execz BB10_2 1906; GFX8-NEXT: ; %bb.1: 1907; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 1908; GFX8-NEXT: v_mov_b32_e32 v3, s4 1909; GFX8-NEXT: s_mov_b32 m0, -1 1910; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1911; GFX8-NEXT: ds_sub_rtn_u32 v0, v0, v3 1912; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1913; GFX8-NEXT: BB10_2: 1914; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 1915; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1916; GFX8-NEXT: v_readfirstlane_b32 s2, v0 1917; GFX8-NEXT: v_mov_b32_e32 v0, v1 1918; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s2, v0 1919; GFX8-NEXT: s_mov_b32 s3, 0xf000 1920; GFX8-NEXT: s_mov_b32 s2, -1 1921; GFX8-NEXT: s_nop 0 1922; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 1923; GFX8-NEXT: s_endpgm 1924; 1925; GFX9-LABEL: sub_i32_varying: 1926; GFX9: ; %bb.0: ; %entry 1927; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1928; GFX9-NEXT: v_mov_b32_e32 v2, v0 1929; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 1930; GFX9-NEXT: v_mov_b32_e32 v1, 0 1931; GFX9-NEXT: s_mov_b64 exec, s[2:3] 1932; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 1933; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 1934; GFX9-NEXT: s_not_b64 exec, exec 1935; GFX9-NEXT: v_mov_b32_e32 v2, 0 1936; GFX9-NEXT: s_not_b64 exec, exec 1937; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 1938; 
GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 1939; GFX9-NEXT: s_nop 1 1940; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 1941; GFX9-NEXT: s_nop 1 1942; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 1943; GFX9-NEXT: s_nop 1 1944; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 1945; GFX9-NEXT: s_nop 1 1946; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 1947; GFX9-NEXT: s_nop 1 1948; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 1949; GFX9-NEXT: v_readlane_b32 s4, v2, 63 1950; GFX9-NEXT: s_nop 0 1951; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 1952; GFX9-NEXT: s_mov_b64 exec, s[2:3] 1953; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1954; GFX9-NEXT: ; implicit-def: $vgpr0 1955; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 1956; GFX9-NEXT: s_cbranch_execz BB10_2 1957; GFX9-NEXT: ; %bb.1: 1958; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 1959; GFX9-NEXT: v_mov_b32_e32 v3, s4 1960; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1961; GFX9-NEXT: ds_sub_rtn_u32 v0, v0, v3 1962; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1963; GFX9-NEXT: BB10_2: 1964; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 1965; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1966; GFX9-NEXT: v_readfirstlane_b32 s2, v0 1967; GFX9-NEXT: v_mov_b32_e32 v0, v1 1968; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 1969; GFX9-NEXT: s_mov_b32 s3, 0xf000 1970; GFX9-NEXT: s_mov_b32 s2, -1 1971; GFX9-NEXT: s_nop 0 1972; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 1973; GFX9-NEXT: s_endpgm 1974; 1975; GFX1064-LABEL: sub_i32_varying: 1976; GFX1064: ; %bb.0: ; %entry 1977; GFX1064-NEXT: v_mov_b32_e32 v1, v0 1978; GFX1064-NEXT: s_not_b64 exec, exec 1979; GFX1064-NEXT: v_mov_b32_e32 v1, 0 1980; GFX1064-NEXT: s_not_b64 exec, exec 1981; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 1982; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 
row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 1983; GFX1064-NEXT: v_mov_b32_e32 v3, 0 1984; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 1985; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 1986; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 1987; GFX1064-NEXT: v_mov_b32_e32 v2, v1 1988; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 1989; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 1990; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 1991; GFX1064-NEXT: v_mov_b32_e32 v2, s4 1992; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 1993; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 1994; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 1995; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 1996; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1997; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 1998; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 1999; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 2000; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 2001; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 2002; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 2003; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 2004; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 2005; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 2006; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 2007; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0 2008; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 2009; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 2010; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 2011; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2012; GFX1064-NEXT: s_mov_b32 s2, -1 2013; GFX1064-NEXT: ; implicit-def: $vgpr0 2014; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 2015; GFX1064-NEXT: s_cbranch_execz BB10_2 2016; GFX1064-NEXT: ; %bb.1: 2017; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo 2018; GFX1064-NEXT: v_mov_b32_e32 v4, 
s7 2019; GFX1064-NEXT: s_mov_b32 s3, s7 2020; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2021; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 2022; GFX1064-NEXT: ds_sub_rtn_u32 v0, v7, v4 2023; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2024; GFX1064-NEXT: buffer_gl0_inv 2025; GFX1064-NEXT: BB10_2: 2026; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 2027; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 2028; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 2029; GFX1064-NEXT: v_mov_b32_e32 v0, v3 2030; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s3, v0 2031; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 2032; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2033; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 2034; GFX1064-NEXT: s_endpgm 2035; 2036; GFX1032-LABEL: sub_i32_varying: 2037; GFX1032: ; %bb.0: ; %entry 2038; GFX1032-NEXT: v_mov_b32_e32 v1, v0 2039; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 2040; GFX1032-NEXT: v_mov_b32_e32 v1, 0 2041; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 2042; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 2043; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 2044; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 2045; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 2046; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 2047; GFX1032-NEXT: v_mov_b32_e32 v2, v1 2048; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 2049; GFX1032-NEXT: s_mov_b32 exec_lo, s2 2050; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2051; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 2052; GFX1032-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 2053; GFX1032-NEXT: v_mov_b32_e32 v3, 0 2054; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 2055; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 2056; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 2057; GFX1032-NEXT: s_mov_b32 exec_lo, s2 2058; GFX1032-NEXT: 
v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 2059; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 2060; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 2061; GFX1032-NEXT: s_mov_b32 exec_lo, s2 2062; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 2063; GFX1032-NEXT: s_mov_b32 s2, -1 2064; GFX1032-NEXT: ; implicit-def: $vgpr0 2065; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 2066; GFX1032-NEXT: s_cbranch_execz BB10_2 2067; GFX1032-NEXT: ; %bb.1: 2068; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo 2069; GFX1032-NEXT: v_mov_b32_e32 v4, s4 2070; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2071; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 2072; GFX1032-NEXT: ds_sub_rtn_u32 v0, v7, v4 2073; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2074; GFX1032-NEXT: buffer_gl0_inv 2075; GFX1032-NEXT: BB10_2: 2076; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 2077; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 2078; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 2079; GFX1032-NEXT: v_mov_b32_e32 v0, v3 2080; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s3, v0 2081; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 2082; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2083; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 2084; GFX1032-NEXT: s_endpgm 2085entry: 2086 %lane = call i32 @llvm.amdgcn.workitem.id.x() 2087 %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 %lane acq_rel 2088 store i32 %old, i32 addrspace(1)* %out 2089 ret void 2090} 2091 2092define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) { 2093; 2094; 2095; GFX7LESS-LABEL: sub_i64_constant: 2096; GFX7LESS: ; %bb.0: ; %entry 2097; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec 2098; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 2099; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 2100; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 2101; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2102; GFX7LESS-NEXT: ; implicit-def: $vgpr1_vgpr2 2103; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc 2104; GFX7LESS-NEXT: s_cbranch_execz BB11_2 2105; GFX7LESS-NEXT: ; %bb.1: 2106; GFX7LESS-NEXT: 
s_bcnt1_i32_b64 s4, s[4:5] 2107; GFX7LESS-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 2108; GFX7LESS-NEXT: s_mul_i32 s5, s4, 5 2109; GFX7LESS-NEXT: v_mul_hi_u32_u24_e64 v2, s4, 5 2110; GFX7LESS-NEXT: v_mov_b32_e32 v1, s5 2111; GFX7LESS-NEXT: s_mov_b32 m0, -1 2112; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2113; GFX7LESS-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] 2114; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2115; GFX7LESS-NEXT: BB11_2: 2116; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] 2117; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2118; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v1 2119; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v2 2120; GFX7LESS-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v0 2121; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v0 2122; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 2123; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4 2124; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 2125; GFX7LESS-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc 2126; GFX7LESS-NEXT: s_mov_b32 s2, -1 2127; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2128; GFX7LESS-NEXT: s_endpgm 2129; 2130; GFX8-LABEL: sub_i64_constant: 2131; GFX8: ; %bb.0: ; %entry 2132; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2133; GFX8-NEXT: s_mov_b64 s[4:5], exec 2134; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 2135; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 2136; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2137; GFX8-NEXT: ; implicit-def: $vgpr1_vgpr2 2138; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 2139; GFX8-NEXT: s_cbranch_execz BB11_2 2140; GFX8-NEXT: ; %bb.1: 2141; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 2142; GFX8-NEXT: v_mul_hi_u32_u24_e64 v2, s4, 5 2143; GFX8-NEXT: s_mul_i32 s4, s4, 5 2144; GFX8-NEXT: v_mov_b32_e32 v1, s4 2145; GFX8-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 2146; GFX8-NEXT: s_mov_b32 m0, -1 2147; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2148; GFX8-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] 2149; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2150; GFX8-NEXT: BB11_2: 2151; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 2152; GFX8-NEXT: s_waitcnt lgkmcnt(0) 
2153; GFX8-NEXT: v_readfirstlane_b32 s3, v2 2154; GFX8-NEXT: v_readfirstlane_b32 s2, v1 2155; GFX8-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v0 2156; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v0 2157; GFX8-NEXT: v_mov_b32_e32 v2, s3 2158; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s2, v0 2159; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc 2160; GFX8-NEXT: s_mov_b32 s3, 0xf000 2161; GFX8-NEXT: s_mov_b32 s2, -1 2162; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2163; GFX8-NEXT: s_endpgm 2164; 2165; GFX9-LABEL: sub_i64_constant: 2166; GFX9: ; %bb.0: ; %entry 2167; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2168; GFX9-NEXT: s_mov_b64 s[4:5], exec 2169; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 2170; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 2171; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2172; GFX9-NEXT: ; implicit-def: $vgpr1_vgpr2 2173; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 2174; GFX9-NEXT: s_cbranch_execz BB11_2 2175; GFX9-NEXT: ; %bb.1: 2176; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 2177; GFX9-NEXT: v_mul_hi_u32_u24_e64 v2, s4, 5 2178; GFX9-NEXT: s_mul_i32 s4, s4, 5 2179; GFX9-NEXT: v_mov_b32_e32 v1, s4 2180; GFX9-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 2181; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2182; GFX9-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] 2183; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2184; GFX9-NEXT: BB11_2: 2185; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 2186; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2187; GFX9-NEXT: v_readfirstlane_b32 s3, v2 2188; GFX9-NEXT: v_readfirstlane_b32 s2, v1 2189; GFX9-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v0 2190; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v0 2191; GFX9-NEXT: v_mov_b32_e32 v2, s3 2192; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s2, v0 2193; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc 2194; GFX9-NEXT: s_mov_b32 s3, 0xf000 2195; GFX9-NEXT: s_mov_b32 s2, -1 2196; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2197; GFX9-NEXT: s_endpgm 2198; 2199; GFX1064-LABEL: sub_i64_constant: 2200; GFX1064: ; %bb.0: ; %entry 2201; GFX1064-NEXT: s_load_dwordx2 
s[0:1], s[0:1], 0x24 2202; GFX1064-NEXT: s_mov_b64 s[4:5], exec 2203; GFX1064-NEXT: ; implicit-def: $vgpr1_vgpr2 2204; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 2205; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s5, v0 2206; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2207; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc 2208; GFX1064-NEXT: s_cbranch_execz BB11_2 2209; GFX1064-NEXT: ; %bb.1: 2210; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5] 2211; GFX1064-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 2212; GFX1064-NEXT: s_mul_i32 s5, s4, 5 2213; GFX1064-NEXT: v_mul_hi_u32_u24_e64 v2, s4, 5 2214; GFX1064-NEXT: v_mov_b32_e32 v1, s5 2215; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2216; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 2217; GFX1064-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] 2218; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2219; GFX1064-NEXT: buffer_gl0_inv 2220; GFX1064-NEXT: BB11_2: 2221; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 2222; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] 2223; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 2224; GFX1064-NEXT: v_mul_u32_u24_e32 v1, 5, v0 2225; GFX1064-NEXT: v_readfirstlane_b32 s3, v2 2226; GFX1064-NEXT: v_mul_hi_u32_u24_e32 v2, 5, v0 2227; GFX1064-NEXT: v_sub_co_u32_e64 v0, vcc, s2, v1 2228; GFX1064-NEXT: s_mov_b32 s2, -1 2229; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v2, vcc 2230; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 2231; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2232; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2233; GFX1064-NEXT: s_endpgm 2234; 2235; GFX1032-LABEL: sub_i64_constant: 2236; GFX1032: ; %bb.0: ; %entry 2237; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2238; GFX1032-NEXT: s_mov_b32 s3, exec_lo 2239; GFX1032-NEXT: ; implicit-def: $vgpr1_vgpr2 2240; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s3, 0 2241; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 2242; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo 2243; GFX1032-NEXT: s_cbranch_execz BB11_2 2244; GFX1032-NEXT: ; %bb.1: 2245; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 2246; 
GFX1032-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 2247; GFX1032-NEXT: s_mul_i32 s4, s3, 5 2248; GFX1032-NEXT: v_mul_hi_u32_u24_e64 v2, s3, 5 2249; GFX1032-NEXT: v_mov_b32_e32 v1, s4 2250; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2251; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 2252; GFX1032-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] 2253; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2254; GFX1032-NEXT: buffer_gl0_inv 2255; GFX1032-NEXT: BB11_2: 2256; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 2257; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 2258; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 2259; GFX1032-NEXT: v_mul_u32_u24_e32 v1, 5, v0 2260; GFX1032-NEXT: v_readfirstlane_b32 s3, v2 2261; GFX1032-NEXT: v_mul_hi_u32_u24_e32 v2, 5, v0 2262; GFX1032-NEXT: v_sub_co_u32_e64 v0, vcc_lo, s2, v1 2263; GFX1032-NEXT: s_mov_b32 s2, -1 2264; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v2, vcc_lo 2265; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 2266; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2267; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2268; GFX1032-NEXT: s_endpgm 2269entry: 2270 %old = atomicrmw sub i64 addrspace(3)* @local_var64, i64 5 acq_rel 2271 store i64 %old, i64 addrspace(1)* %out 2272 ret void 2273} 2274 2275define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 %subitive) { 2276; 2277; 2278; GFX7LESS-LABEL: sub_i64_uniform: 2279; GFX7LESS: ; %bb.0: ; %entry 2280; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec 2281; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 2282; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 2283; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s7, v0 2284; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2285; GFX7LESS-NEXT: ; implicit-def: $vgpr1_vgpr2 2286; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc 2287; GFX7LESS-NEXT: s_cbranch_execz BB12_2 2288; GFX7LESS-NEXT: ; %bb.1: 2289; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 2290; GFX7LESS-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 2291; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2292; GFX7LESS-NEXT: 
s_mul_i32 s7, s3, s6 2293; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6 2294; GFX7LESS-NEXT: v_mul_hi_u32 v1, s2, v1 2295; GFX7LESS-NEXT: s_mul_i32 s6, s2, s6 2296; GFX7LESS-NEXT: v_add_i32_e32 v2, vcc, s7, v1 2297; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6 2298; GFX7LESS-NEXT: s_mov_b32 m0, -1 2299; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2300; GFX7LESS-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] 2301; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2302; GFX7LESS-NEXT: BB12_2: 2303; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] 2304; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 2305; GFX7LESS-NEXT: s_mov_b32 s6, -1 2306; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2307; GFX7LESS-NEXT: s_mov_b32 s4, s0 2308; GFX7LESS-NEXT: s_mov_b32 s5, s1 2309; GFX7LESS-NEXT: v_readfirstlane_b32 s0, v1 2310; GFX7LESS-NEXT: v_readfirstlane_b32 s1, v2 2311; GFX7LESS-NEXT: v_mul_lo_u32 v1, s3, v0 2312; GFX7LESS-NEXT: v_mul_hi_u32 v2, s2, v0 2313; GFX7LESS-NEXT: v_mul_lo_u32 v0, s2, v0 2314; GFX7LESS-NEXT: v_add_i32_e32 v1, vcc, v2, v1 2315; GFX7LESS-NEXT: v_mov_b32_e32 v2, s1 2316; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 2317; GFX7LESS-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc 2318; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 2319; GFX7LESS-NEXT: s_endpgm 2320; 2321; GFX8-LABEL: sub_i64_uniform: 2322; GFX8: ; %bb.0: ; %entry 2323; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 2324; GFX8-NEXT: s_mov_b64 s[6:7], exec 2325; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 2326; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 2327; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2328; GFX8-NEXT: ; implicit-def: $vgpr1_vgpr2 2329; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 2330; GFX8-NEXT: s_cbranch_execz BB12_2 2331; GFX8-NEXT: ; %bb.1: 2332; GFX8-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 2333; GFX8-NEXT: v_mov_b32_e32 v1, s6 2334; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2335; GFX8-NEXT: v_mul_hi_u32 v1, s2, v1 2336; GFX8-NEXT: s_mul_i32 s7, s3, s6 2337; GFX8-NEXT: s_mul_i32 s6, s2, s6 2338; GFX8-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 2339; GFX8-NEXT: 
v_add_u32_e32 v2, vcc, s7, v1 2340; GFX8-NEXT: v_mov_b32_e32 v1, s6 2341; GFX8-NEXT: s_mov_b32 m0, -1 2342; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2343; GFX8-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] 2344; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2345; GFX8-NEXT: BB12_2: 2346; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 2347; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2348; GFX8-NEXT: s_mov_b32 s4, s0 2349; GFX8-NEXT: v_readfirstlane_b32 s0, v1 2350; GFX8-NEXT: v_mul_lo_u32 v1, s3, v0 2351; GFX8-NEXT: v_mul_hi_u32 v3, s2, v0 2352; GFX8-NEXT: v_mul_lo_u32 v0, s2, v0 2353; GFX8-NEXT: s_mov_b32 s5, s1 2354; GFX8-NEXT: v_readfirstlane_b32 s1, v2 2355; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 2356; GFX8-NEXT: v_mov_b32_e32 v2, s1 2357; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s0, v0 2358; GFX8-NEXT: s_mov_b32 s7, 0xf000 2359; GFX8-NEXT: s_mov_b32 s6, -1 2360; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc 2361; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 2362; GFX8-NEXT: s_endpgm 2363; 2364; GFX9-LABEL: sub_i64_uniform: 2365; GFX9: ; %bb.0: ; %entry 2366; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 2367; GFX9-NEXT: s_mov_b64 s[6:7], exec 2368; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 2369; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 2370; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2371; GFX9-NEXT: ; implicit-def: $vgpr1_vgpr2 2372; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 2373; GFX9-NEXT: s_cbranch_execz BB12_2 2374; GFX9-NEXT: ; %bb.1: 2375; GFX9-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 2376; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2377; GFX9-NEXT: s_mul_i32 s7, s3, s6 2378; GFX9-NEXT: s_mul_hi_u32 s8, s2, s6 2379; GFX9-NEXT: s_add_i32 s8, s8, s7 2380; GFX9-NEXT: s_mul_i32 s6, s2, s6 2381; GFX9-NEXT: v_mov_b32_e32 v1, s6 2382; GFX9-NEXT: v_mov_b32_e32 v2, s8 2383; GFX9-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 2384; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2385; GFX9-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] 2386; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2387; GFX9-NEXT: BB12_2: 2388; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] 2389; 
GFX9-NEXT: s_waitcnt lgkmcnt(0) 2390; GFX9-NEXT: v_mul_lo_u32 v3, s3, v0 2391; GFX9-NEXT: v_mul_hi_u32 v4, s2, v0 2392; GFX9-NEXT: v_mul_lo_u32 v0, s2, v0 2393; GFX9-NEXT: s_mov_b32 s4, s0 2394; GFX9-NEXT: v_readfirstlane_b32 s0, v1 2395; GFX9-NEXT: s_mov_b32 s5, s1 2396; GFX9-NEXT: v_readfirstlane_b32 s1, v2 2397; GFX9-NEXT: v_add_u32_e32 v1, v4, v3 2398; GFX9-NEXT: v_mov_b32_e32 v2, s1 2399; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s0, v0 2400; GFX9-NEXT: s_mov_b32 s7, 0xf000 2401; GFX9-NEXT: s_mov_b32 s6, -1 2402; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc 2403; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 2404; GFX9-NEXT: s_endpgm 2405; 2406; GFX1064-LABEL: sub_i64_uniform: 2407; GFX1064: ; %bb.0: ; %entry 2408; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 2409; GFX1064-NEXT: s_mov_b64 s[6:7], exec 2410; GFX1064-NEXT: ; implicit-def: $vgpr1_vgpr2 2411; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 2412; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s7, v0 2413; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2414; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 2415; GFX1064-NEXT: s_cbranch_execz BB12_2 2416; GFX1064-NEXT: ; %bb.1: 2417; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7] 2418; GFX1064-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 2419; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2420; GFX1064-NEXT: s_mul_i32 s7, s3, s6 2421; GFX1064-NEXT: s_mul_hi_u32 s8, s2, s6 2422; GFX1064-NEXT: s_mul_i32 s6, s2, s6 2423; GFX1064-NEXT: s_add_i32 s8, s8, s7 2424; GFX1064-NEXT: v_mov_b32_e32 v1, s6 2425; GFX1064-NEXT: v_mov_b32_e32 v2, s8 2426; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2427; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 2428; GFX1064-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] 2429; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2430; GFX1064-NEXT: buffer_gl0_inv 2431; GFX1064-NEXT: BB12_2: 2432; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 2433; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 2434; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2435; GFX1064-NEXT: v_mul_lo_u32 v3, s3, v0 2436; 
GFX1064-NEXT: v_mul_hi_u32 v4, s2, v0 2437; GFX1064-NEXT: v_mul_lo_u32 v0, s2, v0 2438; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 2439; GFX1064-NEXT: v_readfirstlane_b32 s4, v2 2440; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 2441; GFX1064-NEXT: v_add_nc_u32_e32 v1, v4, v3 2442; GFX1064-NEXT: v_sub_co_u32_e64 v0, vcc, s2, v0 2443; GFX1064-NEXT: s_mov_b32 s2, -1 2444; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s4, v1, vcc 2445; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2446; GFX1064-NEXT: s_endpgm 2447; 2448; GFX1032-LABEL: sub_i64_uniform: 2449; GFX1032: ; %bb.0: ; %entry 2450; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 2451; GFX1032-NEXT: s_mov_b32 s5, exec_lo 2452; GFX1032-NEXT: ; implicit-def: $vgpr1_vgpr2 2453; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s5, 0 2454; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 2455; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo 2456; GFX1032-NEXT: s_cbranch_execz BB12_2 2457; GFX1032-NEXT: ; %bb.1: 2458; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s5 2459; GFX1032-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo 2460; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2461; GFX1032-NEXT: s_mul_i32 s6, s3, s5 2462; GFX1032-NEXT: s_mul_hi_u32 s7, s2, s5 2463; GFX1032-NEXT: s_mul_i32 s5, s2, s5 2464; GFX1032-NEXT: s_add_i32 s7, s7, s6 2465; GFX1032-NEXT: v_mov_b32_e32 v1, s5 2466; GFX1032-NEXT: v_mov_b32_e32 v2, s7 2467; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2468; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 2469; GFX1032-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] 2470; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2471; GFX1032-NEXT: buffer_gl0_inv 2472; GFX1032-NEXT: BB12_2: 2473; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 2474; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 2475; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2476; GFX1032-NEXT: v_mul_lo_u32 v3, s3, v0 2477; GFX1032-NEXT: v_mul_hi_u32 v4, s2, v0 2478; GFX1032-NEXT: v_mul_lo_u32 v0, s2, v0 2479; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 2480; GFX1032-NEXT: v_readfirstlane_b32 s4, v2 2481; GFX1032-NEXT: s_mov_b32 
s3, 0x31016000 2482; GFX1032-NEXT: v_add_nc_u32_e32 v1, v4, v3 2483; GFX1032-NEXT: v_sub_co_u32_e64 v0, vcc_lo, s2, v0 2484; GFX1032-NEXT: s_mov_b32 s2, -1 2485; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s4, v1, vcc_lo 2486; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2487; GFX1032-NEXT: s_endpgm 2488entry: 2489 %old = atomicrmw sub i64 addrspace(3)* @local_var64, i64 %subitive acq_rel 2490 store i64 %old, i64 addrspace(1)* %out 2491 ret void 2492} 2493; NOTE(review): none of the RUN lines at the top of this file enables a GCN check prefix, so the three GCN-NOT directives below are never evaluated by FileCheck; the per-target checks generated for sub_i64_varying are what actually pin the unoptimized ds_sub_rtn_u64 sequence. 2494; GCN-NOT: v_mbcnt_lo_u32_b32 2495; GCN-NOT: v_mbcnt_hi_u32_b32 2496; GCN-NOT: s_bcnt1_i32_b64 2497define amdgpu_kernel void @sub_i64_varying(i64 addrspace(1)* %out) { 2498; 2499; 2500; GFX7LESS-LABEL: sub_i64_varying: 2501; GFX7LESS: ; %bb.0: ; %entry 2502; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 2503; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 2504; GFX7LESS-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 2505; GFX7LESS-NEXT: s_mov_b32 m0, -1 2506; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2507; GFX7LESS-NEXT: ds_sub_rtn_u64 v[0:1], v2, v[0:1] 2508; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2509; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 2510; GFX7LESS-NEXT: s_mov_b32 s2, -1 2511; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2512; GFX7LESS-NEXT: s_endpgm 2513; 2514; GFX8-LABEL: sub_i64_varying: 2515; GFX8: ; %bb.0: ; %entry 2516; GFX8-NEXT: v_mov_b32_e32 v1, 0 2517; GFX8-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 2518; GFX8-NEXT: s_mov_b32 m0, -1 2519; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2520; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2521; GFX8-NEXT: ds_sub_rtn_u64 v[0:1], v2, v[0:1] 2522; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2523; GFX8-NEXT: s_mov_b32 s3, 0xf000 2524; GFX8-NEXT: s_mov_b32 s2, -1 2525; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2526; GFX8-NEXT: s_endpgm 2527; 2528; GFX9-LABEL: sub_i64_varying: 2529; GFX9: ; %bb.0: ; %entry 2530; GFX9-NEXT: v_mov_b32_e32 v1, 0 2531; GFX9-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 2532; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 
2533; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2534; GFX9-NEXT: ds_sub_rtn_u64 v[0:1], v2, v[0:1] 2535; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2536; GFX9-NEXT: s_mov_b32 s3, 0xf000 2537; GFX9-NEXT: s_mov_b32 s2, -1 2538; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2539; GFX9-NEXT: s_endpgm 2540; 2541; GFX1064-LABEL: sub_i64_varying: 2542; GFX1064: ; %bb.0: ; %entry 2543; GFX1064-NEXT: v_mov_b32_e32 v1, 0 2544; GFX1064-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 2545; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2546; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 2547; GFX1064-NEXT: s_mov_b32 s2, -1 2548; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2549; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 2550; GFX1064-NEXT: ds_sub_rtn_u64 v[0:1], v2, v[0:1] 2551; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2552; GFX1064-NEXT: buffer_gl0_inv 2553; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2554; GFX1064-NEXT: s_endpgm 2555; 2556; GFX1032-LABEL: sub_i64_varying: 2557; GFX1032: ; %bb.0: ; %entry 2558; GFX1032-NEXT: v_mov_b32_e32 v1, 0 2559; GFX1032-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 2560; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2561; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 2562; GFX1032-NEXT: s_mov_b32 s2, -1 2563; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2564; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 2565; GFX1032-NEXT: ds_sub_rtn_u64 v[0:1], v2, v[0:1] 2566; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2567; GFX1032-NEXT: buffer_gl0_inv 2568; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2569; GFX1032-NEXT: s_endpgm 2570entry: 2571 %lane = call i32 @llvm.amdgcn.workitem.id.x() 2572 %zext = zext i32 %lane to i64 2573 %old = atomicrmw sub i64 addrspace(3)* @local_var64, i64 %zext acq_rel 2574 store i64 %old, i64 addrspace(1)* %out 2575 ret void 2576} 2577 2578define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) { 2579; 2580; 2581; GFX7LESS-LABEL: and_i32_varying: 2582; GFX7LESS: ; %bb.0: ; %entry 2583; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], 
s[0:1], 0x9 2584; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 2585; GFX7LESS-NEXT: s_mov_b32 m0, -1 2586; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2587; GFX7LESS-NEXT: ds_and_rtn_b32 v0, v1, v0 2588; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2589; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 2590; GFX7LESS-NEXT: s_mov_b32 s2, -1 2591; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 2592; GFX7LESS-NEXT: s_endpgm 2593; 2594; GFX8-LABEL: and_i32_varying: 2595; GFX8: ; %bb.0: ; %entry 2596; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2597; GFX8-NEXT: v_mov_b32_e32 v2, v0 2598; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 2599; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 2600; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 2601; GFX8-NEXT: v_mov_b32_e32 v1, -1 2602; GFX8-NEXT: s_mov_b64 exec, s[2:3] 2603; GFX8-NEXT: s_not_b64 exec, exec 2604; GFX8-NEXT: v_mov_b32_e32 v2, -1 2605; GFX8-NEXT: s_not_b64 exec, exec 2606; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 2607; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 2608; GFX8-NEXT: s_nop 1 2609; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 2610; GFX8-NEXT: s_nop 1 2611; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 2612; GFX8-NEXT: s_nop 1 2613; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 2614; GFX8-NEXT: s_nop 1 2615; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 2616; GFX8-NEXT: s_nop 1 2617; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 2618; GFX8-NEXT: v_readlane_b32 s4, v2, 63 2619; GFX8-NEXT: s_nop 0 2620; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 2621; GFX8-NEXT: s_mov_b64 exec, s[2:3] 2622; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2623; GFX8-NEXT: ; implicit-def: $vgpr0 2624; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 2625; GFX8-NEXT: s_cbranch_execz BB14_2 2626; GFX8-NEXT: ; %bb.1: 2627; GFX8-NEXT: v_mov_b32_e32 v0, 
local_var32@abs32@lo 2628; GFX8-NEXT: v_mov_b32_e32 v3, s4 2629; GFX8-NEXT: s_mov_b32 m0, -1 2630; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2631; GFX8-NEXT: ds_and_rtn_b32 v0, v0, v3 2632; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2633; GFX8-NEXT: BB14_2: 2634; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 2635; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2636; GFX8-NEXT: v_readfirstlane_b32 s2, v0 2637; GFX8-NEXT: v_mov_b32_e32 v0, v1 2638; GFX8-NEXT: v_and_b32_e32 v0, s2, v0 2639; GFX8-NEXT: s_mov_b32 s3, 0xf000 2640; GFX8-NEXT: s_mov_b32 s2, -1 2641; GFX8-NEXT: s_nop 0 2642; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 2643; GFX8-NEXT: s_endpgm 2644; 2645; GFX9-LABEL: and_i32_varying: 2646; GFX9: ; %bb.0: ; %entry 2647; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2648; GFX9-NEXT: v_mov_b32_e32 v2, v0 2649; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 2650; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 2651; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 2652; GFX9-NEXT: v_mov_b32_e32 v1, -1 2653; GFX9-NEXT: s_mov_b64 exec, s[2:3] 2654; GFX9-NEXT: s_not_b64 exec, exec 2655; GFX9-NEXT: v_mov_b32_e32 v2, -1 2656; GFX9-NEXT: s_not_b64 exec, exec 2657; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 2658; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 2659; GFX9-NEXT: s_nop 1 2660; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 2661; GFX9-NEXT: s_nop 1 2662; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 2663; GFX9-NEXT: s_nop 1 2664; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 2665; GFX9-NEXT: s_nop 1 2666; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 2667; GFX9-NEXT: s_nop 1 2668; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 2669; GFX9-NEXT: v_readlane_b32 s4, v2, 63 2670; GFX9-NEXT: s_nop 0 2671; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 2672; GFX9-NEXT: s_mov_b64 exec, s[2:3] 2673; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 
0, v0 2674; GFX9-NEXT: ; implicit-def: $vgpr0 2675; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 2676; GFX9-NEXT: s_cbranch_execz BB14_2 2677; GFX9-NEXT: ; %bb.1: 2678; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 2679; GFX9-NEXT: v_mov_b32_e32 v3, s4 2680; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2681; GFX9-NEXT: ds_and_rtn_b32 v0, v0, v3 2682; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2683; GFX9-NEXT: BB14_2: 2684; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 2685; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2686; GFX9-NEXT: v_readfirstlane_b32 s2, v0 2687; GFX9-NEXT: v_mov_b32_e32 v0, v1 2688; GFX9-NEXT: v_and_b32_e32 v0, s2, v0 2689; GFX9-NEXT: s_mov_b32 s3, 0xf000 2690; GFX9-NEXT: s_mov_b32 s2, -1 2691; GFX9-NEXT: s_nop 0 2692; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 2693; GFX9-NEXT: s_endpgm 2694; 2695; GFX1064-LABEL: and_i32_varying: 2696; GFX1064: ; %bb.0: ; %entry 2697; GFX1064-NEXT: v_mov_b32_e32 v1, v0 2698; GFX1064-NEXT: s_not_b64 exec, exec 2699; GFX1064-NEXT: v_mov_b32_e32 v1, -1 2700; GFX1064-NEXT: s_not_b64 exec, exec 2701; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 2702; GFX1064-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 2703; GFX1064-NEXT: v_mov_b32_e32 v3, -1 2704; GFX1064-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 2705; GFX1064-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 2706; GFX1064-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 2707; GFX1064-NEXT: v_mov_b32_e32 v2, v1 2708; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 2709; GFX1064-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 2710; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 2711; GFX1064-NEXT: v_mov_b32_e32 v2, s4 2712; GFX1064-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 2713; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 2714; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 2715; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 2716; GFX1064-NEXT: 
s_load_dwordx2 s[0:1], s[0:1], 0x24 2717; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 2718; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 2719; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 2720; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 2721; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 2722; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 2723; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 2724; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 2725; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 2726; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 2727; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0 2728; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 2729; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 2730; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 2731; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2732; GFX1064-NEXT: s_mov_b32 s2, -1 2733; GFX1064-NEXT: ; implicit-def: $vgpr0 2734; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 2735; GFX1064-NEXT: s_cbranch_execz BB14_2 2736; GFX1064-NEXT: ; %bb.1: 2737; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo 2738; GFX1064-NEXT: v_mov_b32_e32 v4, s7 2739; GFX1064-NEXT: s_mov_b32 s3, s7 2740; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2741; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 2742; GFX1064-NEXT: ds_and_rtn_b32 v0, v7, v4 2743; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2744; GFX1064-NEXT: buffer_gl0_inv 2745; GFX1064-NEXT: BB14_2: 2746; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 2747; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 2748; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 2749; GFX1064-NEXT: v_mov_b32_e32 v0, v3 2750; GFX1064-NEXT: v_and_b32_e32 v0, s3, v0 2751; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 2752; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2753; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 2754; GFX1064-NEXT: s_endpgm 2755; 2756; GFX1032-LABEL: and_i32_varying: 2757; GFX1032: ; %bb.0: ; %entry 2758; GFX1032-NEXT: v_mov_b32_e32 v1, v0 2759; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 2760; GFX1032-NEXT: v_mov_b32_e32 v1, -1 2761; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 2762; 
GFX1032-NEXT: s_or_saveexec_b32 s2, -1 2763; GFX1032-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 2764; GFX1032-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 2765; GFX1032-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 2766; GFX1032-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 2767; GFX1032-NEXT: v_mov_b32_e32 v2, v1 2768; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 2769; GFX1032-NEXT: s_mov_b32 exec_lo, s2 2770; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2771; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 2772; GFX1032-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 2773; GFX1032-NEXT: v_mov_b32_e32 v3, -1 2774; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 2775; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 2776; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 2777; GFX1032-NEXT: s_mov_b32 exec_lo, s2 2778; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 2779; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 2780; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 2781; GFX1032-NEXT: s_mov_b32 exec_lo, s2 2782; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 2783; GFX1032-NEXT: s_mov_b32 s2, -1 2784; GFX1032-NEXT: ; implicit-def: $vgpr0 2785; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 2786; GFX1032-NEXT: s_cbranch_execz BB14_2 2787; GFX1032-NEXT: ; %bb.1: 2788; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo 2789; GFX1032-NEXT: v_mov_b32_e32 v4, s4 2790; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2791; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 2792; GFX1032-NEXT: ds_and_rtn_b32 v0, v7, v4 2793; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2794; GFX1032-NEXT: buffer_gl0_inv 2795; GFX1032-NEXT: BB14_2: 2796; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 2797; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 2798; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 2799; GFX1032-NEXT: v_mov_b32_e32 v0, v3 2800; GFX1032-NEXT: v_and_b32_e32 v0, s3, v0 2801; GFX1032-NEXT: s_mov_b32 
s3, 0x31016000 2802; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2803; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 2804; GFX1032-NEXT: s_endpgm 2805entry: 2806 %lane = call i32 @llvm.amdgcn.workitem.id.x() 2807 %old = atomicrmw and i32 addrspace(3)* @local_var32, i32 %lane acq_rel 2808 store i32 %old, i32 addrspace(1)* %out 2809 ret void 2810} 2811 2812define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) { 2813; 2814; 2815; GFX7LESS-LABEL: or_i32_varying: 2816; GFX7LESS: ; %bb.0: ; %entry 2817; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 2818; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 2819; GFX7LESS-NEXT: s_mov_b32 m0, -1 2820; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2821; GFX7LESS-NEXT: ds_or_rtn_b32 v0, v1, v0 2822; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 2823; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 2824; GFX7LESS-NEXT: s_mov_b32 s2, -1 2825; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 2826; GFX7LESS-NEXT: s_endpgm 2827; 2828; GFX8-LABEL: or_i32_varying: 2829; GFX8: ; %bb.0: ; %entry 2830; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2831; GFX8-NEXT: v_mov_b32_e32 v2, v0 2832; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 2833; GFX8-NEXT: v_mov_b32_e32 v1, 0 2834; GFX8-NEXT: s_mov_b64 exec, s[2:3] 2835; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 2836; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 2837; GFX8-NEXT: s_not_b64 exec, exec 2838; GFX8-NEXT: v_mov_b32_e32 v2, 0 2839; GFX8-NEXT: s_not_b64 exec, exec 2840; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 2841; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 2842; GFX8-NEXT: s_nop 1 2843; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 2844; GFX8-NEXT: s_nop 1 2845; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 2846; GFX8-NEXT: s_nop 1 2847; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 2848; GFX8-NEXT: s_nop 1 2849; GFX8-NEXT: v_or_b32_dpp 
v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 2850; GFX8-NEXT: s_nop 1 2851; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 2852; GFX8-NEXT: v_readlane_b32 s4, v2, 63 2853; GFX8-NEXT: s_nop 0 2854; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 2855; GFX8-NEXT: s_mov_b64 exec, s[2:3] 2856; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2857; GFX8-NEXT: ; implicit-def: $vgpr0 2858; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 2859; GFX8-NEXT: s_cbranch_execz BB15_2 2860; GFX8-NEXT: ; %bb.1: 2861; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 2862; GFX8-NEXT: v_mov_b32_e32 v3, s4 2863; GFX8-NEXT: s_mov_b32 m0, -1 2864; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2865; GFX8-NEXT: ds_or_rtn_b32 v0, v0, v3 2866; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2867; GFX8-NEXT: BB15_2: 2868; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 2869; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2870; GFX8-NEXT: v_readfirstlane_b32 s2, v0 2871; GFX8-NEXT: v_mov_b32_e32 v0, v1 2872; GFX8-NEXT: v_or_b32_e32 v0, s2, v0 2873; GFX8-NEXT: s_mov_b32 s3, 0xf000 2874; GFX8-NEXT: s_mov_b32 s2, -1 2875; GFX8-NEXT: s_nop 0 2876; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 2877; GFX8-NEXT: s_endpgm 2878; 2879; GFX9-LABEL: or_i32_varying: 2880; GFX9: ; %bb.0: ; %entry 2881; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2882; GFX9-NEXT: v_mov_b32_e32 v2, v0 2883; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 2884; GFX9-NEXT: v_mov_b32_e32 v1, 0 2885; GFX9-NEXT: s_mov_b64 exec, s[2:3] 2886; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 2887; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 2888; GFX9-NEXT: s_not_b64 exec, exec 2889; GFX9-NEXT: v_mov_b32_e32 v2, 0 2890; GFX9-NEXT: s_not_b64 exec, exec 2891; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 2892; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 2893; GFX9-NEXT: s_nop 1 2894; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 2895; GFX9-NEXT: s_nop 1 2896; GFX9-NEXT: 
v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 2897; GFX9-NEXT: s_nop 1 2898; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 2899; GFX9-NEXT: s_nop 1 2900; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 2901; GFX9-NEXT: s_nop 1 2902; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 2903; GFX9-NEXT: v_readlane_b32 s4, v2, 63 2904; GFX9-NEXT: s_nop 0 2905; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 2906; GFX9-NEXT: s_mov_b64 exec, s[2:3] 2907; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2908; GFX9-NEXT: ; implicit-def: $vgpr0 2909; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 2910; GFX9-NEXT: s_cbranch_execz BB15_2 2911; GFX9-NEXT: ; %bb.1: 2912; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 2913; GFX9-NEXT: v_mov_b32_e32 v3, s4 2914; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2915; GFX9-NEXT: ds_or_rtn_b32 v0, v0, v3 2916; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2917; GFX9-NEXT: BB15_2: 2918; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 2919; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2920; GFX9-NEXT: v_readfirstlane_b32 s2, v0 2921; GFX9-NEXT: v_mov_b32_e32 v0, v1 2922; GFX9-NEXT: v_or_b32_e32 v0, s2, v0 2923; GFX9-NEXT: s_mov_b32 s3, 0xf000 2924; GFX9-NEXT: s_mov_b32 s2, -1 2925; GFX9-NEXT: s_nop 0 2926; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 2927; GFX9-NEXT: s_endpgm 2928; 2929; GFX1064-LABEL: or_i32_varying: 2930; GFX1064: ; %bb.0: ; %entry 2931; GFX1064-NEXT: v_mov_b32_e32 v1, v0 2932; GFX1064-NEXT: s_not_b64 exec, exec 2933; GFX1064-NEXT: v_mov_b32_e32 v1, 0 2934; GFX1064-NEXT: s_not_b64 exec, exec 2935; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 2936; GFX1064-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 2937; GFX1064-NEXT: v_mov_b32_e32 v3, 0 2938; GFX1064-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 2939; GFX1064-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf 
bank_mask:0xf bound_ctrl:1 2940; GFX1064-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 2941; GFX1064-NEXT: v_mov_b32_e32 v2, v1 2942; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 2943; GFX1064-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 2944; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 2945; GFX1064-NEXT: v_mov_b32_e32 v2, s4 2946; GFX1064-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 2947; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 2948; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 2949; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 2950; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2951; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 2952; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 2953; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 2954; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 2955; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 2956; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 2957; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 2958; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 2959; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 2960; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 2961; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0 2962; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 2963; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 2964; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 2965; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2966; GFX1064-NEXT: s_mov_b32 s2, -1 2967; GFX1064-NEXT: ; implicit-def: $vgpr0 2968; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 2969; GFX1064-NEXT: s_cbranch_execz BB15_2 2970; GFX1064-NEXT: ; %bb.1: 2971; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo 2972; GFX1064-NEXT: v_mov_b32_e32 v4, s7 2973; GFX1064-NEXT: s_mov_b32 s3, s7 2974; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2975; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 2976; GFX1064-NEXT: ds_or_rtn_b32 v0, v7, v4 2977; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2978; GFX1064-NEXT: buffer_gl0_inv 2979; 
GFX1064-NEXT: BB15_2: 2980; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 2981; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 2982; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 2983; GFX1064-NEXT: v_mov_b32_e32 v0, v3 2984; GFX1064-NEXT: v_or_b32_e32 v0, s3, v0 2985; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 2986; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2987; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 2988; GFX1064-NEXT: s_endpgm 2989; 2990; GFX1032-LABEL: or_i32_varying: 2991; GFX1032: ; %bb.0: ; %entry 2992; GFX1032-NEXT: v_mov_b32_e32 v1, v0 2993; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 2994; GFX1032-NEXT: v_mov_b32_e32 v1, 0 2995; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 2996; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 2997; GFX1032-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 2998; GFX1032-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 2999; GFX1032-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 3000; GFX1032-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 3001; GFX1032-NEXT: v_mov_b32_e32 v2, v1 3002; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 3003; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3004; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3005; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3006; GFX1032-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 3007; GFX1032-NEXT: v_mov_b32_e32 v3, 0 3008; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 3009; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 3010; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 3011; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3012; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 3013; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3014; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 3015; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3016; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 3017; GFX1032-NEXT: s_mov_b32 s2, -1 3018; GFX1032-NEXT: ; implicit-def: $vgpr0 3019; 
GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 3020; GFX1032-NEXT: s_cbranch_execz BB15_2 3021; GFX1032-NEXT: ; %bb.1: 3022; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo 3023; GFX1032-NEXT: v_mov_b32_e32 v4, s4 3024; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3025; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 3026; GFX1032-NEXT: ds_or_rtn_b32 v0, v7, v4 3027; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3028; GFX1032-NEXT: buffer_gl0_inv 3029; GFX1032-NEXT: BB15_2: 3030; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 3031; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 3032; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 3033; GFX1032-NEXT: v_mov_b32_e32 v0, v3 3034; GFX1032-NEXT: v_or_b32_e32 v0, s3, v0 3035; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 3036; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3037; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 3038; GFX1032-NEXT: s_endpgm 3039entry: 3040 %lane = call i32 @llvm.amdgcn.workitem.id.x() 3041 %old = atomicrmw or i32 addrspace(3)* @local_var32, i32 %lane acq_rel 3042 store i32 %old, i32 addrspace(1)* %out 3043 ret void 3044} 3045 3046define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) { 3047; 3048; 3049; GFX7LESS-LABEL: xor_i32_varying: 3050; GFX7LESS: ; %bb.0: ; %entry 3051; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 3052; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 3053; GFX7LESS-NEXT: s_mov_b32 m0, -1 3054; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3055; GFX7LESS-NEXT: ds_xor_rtn_b32 v0, v1, v0 3056; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3057; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 3058; GFX7LESS-NEXT: s_mov_b32 s2, -1 3059; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 3060; GFX7LESS-NEXT: s_endpgm 3061; 3062; GFX8-LABEL: xor_i32_varying: 3063; GFX8: ; %bb.0: ; %entry 3064; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3065; GFX8-NEXT: v_mov_b32_e32 v2, v0 3066; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 3067; GFX8-NEXT: v_mov_b32_e32 v1, 0 3068; GFX8-NEXT: s_mov_b64 exec, s[2:3] 3069; GFX8-NEXT: v_mbcnt_lo_u32_b32 
v0, exec_lo, 0 3070; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 3071; GFX8-NEXT: s_not_b64 exec, exec 3072; GFX8-NEXT: v_mov_b32_e32 v2, 0 3073; GFX8-NEXT: s_not_b64 exec, exec 3074; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 3075; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 3076; GFX8-NEXT: s_nop 1 3077; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 3078; GFX8-NEXT: s_nop 1 3079; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 3080; GFX8-NEXT: s_nop 1 3081; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 3082; GFX8-NEXT: s_nop 1 3083; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 3084; GFX8-NEXT: s_nop 1 3085; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 3086; GFX8-NEXT: v_readlane_b32 s4, v2, 63 3087; GFX8-NEXT: s_nop 0 3088; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 3089; GFX8-NEXT: s_mov_b64 exec, s[2:3] 3090; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3091; GFX8-NEXT: ; implicit-def: $vgpr0 3092; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 3093; GFX8-NEXT: s_cbranch_execz BB16_2 3094; GFX8-NEXT: ; %bb.1: 3095; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 3096; GFX8-NEXT: v_mov_b32_e32 v3, s4 3097; GFX8-NEXT: s_mov_b32 m0, -1 3098; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3099; GFX8-NEXT: ds_xor_rtn_b32 v0, v0, v3 3100; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3101; GFX8-NEXT: BB16_2: 3102; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 3103; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3104; GFX8-NEXT: v_readfirstlane_b32 s2, v0 3105; GFX8-NEXT: v_mov_b32_e32 v0, v1 3106; GFX8-NEXT: v_xor_b32_e32 v0, s2, v0 3107; GFX8-NEXT: s_mov_b32 s3, 0xf000 3108; GFX8-NEXT: s_mov_b32 s2, -1 3109; GFX8-NEXT: s_nop 0 3110; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 3111; GFX8-NEXT: s_endpgm 3112; 3113; GFX9-LABEL: xor_i32_varying: 3114; GFX9: ; %bb.0: ; %entry 3115; 
GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3116; GFX9-NEXT: v_mov_b32_e32 v2, v0 3117; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 3118; GFX9-NEXT: v_mov_b32_e32 v1, 0 3119; GFX9-NEXT: s_mov_b64 exec, s[2:3] 3120; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3121; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 3122; GFX9-NEXT: s_not_b64 exec, exec 3123; GFX9-NEXT: v_mov_b32_e32 v2, 0 3124; GFX9-NEXT: s_not_b64 exec, exec 3125; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 3126; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 3127; GFX9-NEXT: s_nop 1 3128; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 3129; GFX9-NEXT: s_nop 1 3130; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 3131; GFX9-NEXT: s_nop 1 3132; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 3133; GFX9-NEXT: s_nop 1 3134; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 3135; GFX9-NEXT: s_nop 1 3136; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 3137; GFX9-NEXT: v_readlane_b32 s4, v2, 63 3138; GFX9-NEXT: s_nop 0 3139; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 3140; GFX9-NEXT: s_mov_b64 exec, s[2:3] 3141; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3142; GFX9-NEXT: ; implicit-def: $vgpr0 3143; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 3144; GFX9-NEXT: s_cbranch_execz BB16_2 3145; GFX9-NEXT: ; %bb.1: 3146; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 3147; GFX9-NEXT: v_mov_b32_e32 v3, s4 3148; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3149; GFX9-NEXT: ds_xor_rtn_b32 v0, v0, v3 3150; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3151; GFX9-NEXT: BB16_2: 3152; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 3153; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3154; GFX9-NEXT: v_readfirstlane_b32 s2, v0 3155; GFX9-NEXT: v_mov_b32_e32 v0, v1 3156; GFX9-NEXT: v_xor_b32_e32 v0, s2, v0 3157; GFX9-NEXT: s_mov_b32 s3, 0xf000 3158; 
GFX9-NEXT: s_mov_b32 s2, -1 3159; GFX9-NEXT: s_nop 0 3160; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 3161; GFX9-NEXT: s_endpgm 3162; 3163; GFX1064-LABEL: xor_i32_varying: 3164; GFX1064: ; %bb.0: ; %entry 3165; GFX1064-NEXT: v_mov_b32_e32 v1, v0 3166; GFX1064-NEXT: s_not_b64 exec, exec 3167; GFX1064-NEXT: v_mov_b32_e32 v1, 0 3168; GFX1064-NEXT: s_not_b64 exec, exec 3169; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 3170; GFX1064-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 3171; GFX1064-NEXT: v_mov_b32_e32 v3, 0 3172; GFX1064-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 3173; GFX1064-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 3174; GFX1064-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 3175; GFX1064-NEXT: v_mov_b32_e32 v2, v1 3176; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 3177; GFX1064-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 3178; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 3179; GFX1064-NEXT: v_mov_b32_e32 v2, s4 3180; GFX1064-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 3181; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 3182; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 3183; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 3184; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3185; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 3186; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 3187; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 3188; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 3189; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 3190; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 3191; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 3192; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 3193; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 3194; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 3195; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0 3196; GFX1064-NEXT: 
s_or_saveexec_b64 s[4:5], -1 3197; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 3198; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 3199; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3200; GFX1064-NEXT: s_mov_b32 s2, -1 3201; GFX1064-NEXT: ; implicit-def: $vgpr0 3202; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 3203; GFX1064-NEXT: s_cbranch_execz BB16_2 3204; GFX1064-NEXT: ; %bb.1: 3205; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo 3206; GFX1064-NEXT: v_mov_b32_e32 v4, s7 3207; GFX1064-NEXT: s_mov_b32 s3, s7 3208; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3209; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 3210; GFX1064-NEXT: ds_xor_rtn_b32 v0, v7, v4 3211; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3212; GFX1064-NEXT: buffer_gl0_inv 3213; GFX1064-NEXT: BB16_2: 3214; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 3215; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 3216; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 3217; GFX1064-NEXT: v_mov_b32_e32 v0, v3 3218; GFX1064-NEXT: v_xor_b32_e32 v0, s3, v0 3219; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 3220; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3221; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 3222; GFX1064-NEXT: s_endpgm 3223; 3224; GFX1032-LABEL: xor_i32_varying: 3225; GFX1032: ; %bb.0: ; %entry 3226; GFX1032-NEXT: v_mov_b32_e32 v1, v0 3227; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 3228; GFX1032-NEXT: v_mov_b32_e32 v1, 0 3229; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 3230; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3231; GFX1032-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 3232; GFX1032-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 3233; GFX1032-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 3234; GFX1032-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 3235; GFX1032-NEXT: v_mov_b32_e32 v2, v1 3236; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 3237; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3238; GFX1032-NEXT: s_load_dwordx2 
s[0:1], s[0:1], 0x24 3239; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3240; GFX1032-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 3241; GFX1032-NEXT: v_mov_b32_e32 v3, 0 3242; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 3243; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 3244; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 3245; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3246; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 3247; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3248; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 3249; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3250; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 3251; GFX1032-NEXT: s_mov_b32 s2, -1 3252; GFX1032-NEXT: ; implicit-def: $vgpr0 3253; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 3254; GFX1032-NEXT: s_cbranch_execz BB16_2 3255; GFX1032-NEXT: ; %bb.1: 3256; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo 3257; GFX1032-NEXT: v_mov_b32_e32 v4, s4 3258; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3259; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 3260; GFX1032-NEXT: ds_xor_rtn_b32 v0, v7, v4 3261; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3262; GFX1032-NEXT: buffer_gl0_inv 3263; GFX1032-NEXT: BB16_2: 3264; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 3265; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 3266; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 3267; GFX1032-NEXT: v_mov_b32_e32 v0, v3 3268; GFX1032-NEXT: v_xor_b32_e32 v0, s3, v0 3269; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 3270; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3271; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 3272; GFX1032-NEXT: s_endpgm 3273entry: 3274 %lane = call i32 @llvm.amdgcn.workitem.id.x() 3275 %old = atomicrmw xor i32 addrspace(3)* @local_var32, i32 %lane acq_rel 3276 store i32 %old, i32 addrspace(1)* %out 3277 ret void 3278} 3279 3280define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) { 3281; 3282; 3283; GFX7LESS-LABEL: max_i32_varying: 3284; GFX7LESS: ; %bb.0: ; %entry 3285; GFX7LESS-NEXT: 
s_load_dwordx2 s[0:1], s[0:1], 0x9 3286; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 3287; GFX7LESS-NEXT: s_mov_b32 m0, -1 3288; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3289; GFX7LESS-NEXT: ds_max_rtn_i32 v0, v1, v0 3290; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3291; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 3292; GFX7LESS-NEXT: s_mov_b32 s2, -1 3293; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 3294; GFX7LESS-NEXT: s_endpgm 3295; 3296; GFX8-LABEL: max_i32_varying: 3297; GFX8: ; %bb.0: ; %entry 3298; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3299; GFX8-NEXT: v_mov_b32_e32 v2, v0 3300; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3301; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 3302; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 3303; GFX8-NEXT: v_bfrev_b32_e32 v1, 1 3304; GFX8-NEXT: s_mov_b64 exec, s[2:3] 3305; GFX8-NEXT: s_not_b64 exec, exec 3306; GFX8-NEXT: v_mov_b32_e32 v2, v1 3307; GFX8-NEXT: s_not_b64 exec, exec 3308; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 3309; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3310; GFX8-NEXT: s_nop 1 3311; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 3312; GFX8-NEXT: s_nop 1 3313; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 3314; GFX8-NEXT: s_nop 1 3315; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 3316; GFX8-NEXT: s_nop 1 3317; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 3318; GFX8-NEXT: s_nop 1 3319; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 3320; GFX8-NEXT: v_readlane_b32 s4, v2, 63 3321; GFX8-NEXT: s_nop 0 3322; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 3323; GFX8-NEXT: s_mov_b64 exec, s[2:3] 3324; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3325; GFX8-NEXT: ; implicit-def: $vgpr0 3326; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 3327; GFX8-NEXT: s_cbranch_execz BB17_2 3328; GFX8-NEXT: ; %bb.1: 3329; GFX8-NEXT: 
v_mov_b32_e32 v0, local_var32@abs32@lo 3330; GFX8-NEXT: v_mov_b32_e32 v3, s4 3331; GFX8-NEXT: s_mov_b32 m0, -1 3332; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3333; GFX8-NEXT: ds_max_rtn_i32 v0, v0, v3 3334; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3335; GFX8-NEXT: BB17_2: 3336; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 3337; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3338; GFX8-NEXT: v_readfirstlane_b32 s2, v0 3339; GFX8-NEXT: v_mov_b32_e32 v0, v1 3340; GFX8-NEXT: v_max_i32_e32 v0, s2, v0 3341; GFX8-NEXT: s_mov_b32 s3, 0xf000 3342; GFX8-NEXT: s_mov_b32 s2, -1 3343; GFX8-NEXT: s_nop 0 3344; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 3345; GFX8-NEXT: s_endpgm 3346; 3347; GFX9-LABEL: max_i32_varying: 3348; GFX9: ; %bb.0: ; %entry 3349; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3350; GFX9-NEXT: v_mov_b32_e32 v2, v0 3351; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3352; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 3353; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 3354; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 3355; GFX9-NEXT: s_mov_b64 exec, s[2:3] 3356; GFX9-NEXT: s_not_b64 exec, exec 3357; GFX9-NEXT: v_mov_b32_e32 v2, v1 3358; GFX9-NEXT: s_not_b64 exec, exec 3359; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 3360; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3361; GFX9-NEXT: s_nop 1 3362; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 3363; GFX9-NEXT: s_nop 1 3364; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 3365; GFX9-NEXT: s_nop 1 3366; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 3367; GFX9-NEXT: s_nop 1 3368; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 3369; GFX9-NEXT: s_nop 1 3370; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 3371; GFX9-NEXT: v_readlane_b32 s4, v2, 63 3372; GFX9-NEXT: s_nop 0 3373; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 3374; GFX9-NEXT: s_mov_b64 exec, s[2:3] 3375; GFX9-NEXT: 
v_cmp_eq_u32_e32 vcc, 0, v0 3376; GFX9-NEXT: ; implicit-def: $vgpr0 3377; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 3378; GFX9-NEXT: s_cbranch_execz BB17_2 3379; GFX9-NEXT: ; %bb.1: 3380; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 3381; GFX9-NEXT: v_mov_b32_e32 v3, s4 3382; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3383; GFX9-NEXT: ds_max_rtn_i32 v0, v0, v3 3384; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3385; GFX9-NEXT: BB17_2: 3386; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 3387; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3388; GFX9-NEXT: v_readfirstlane_b32 s2, v0 3389; GFX9-NEXT: v_mov_b32_e32 v0, v1 3390; GFX9-NEXT: v_max_i32_e32 v0, s2, v0 3391; GFX9-NEXT: s_mov_b32 s3, 0xf000 3392; GFX9-NEXT: s_mov_b32 s2, -1 3393; GFX9-NEXT: s_nop 0 3394; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 3395; GFX9-NEXT: s_endpgm 3396; 3397; GFX1064-LABEL: max_i32_varying: 3398; GFX1064: ; %bb.0: ; %entry 3399; GFX1064-NEXT: v_mov_b32_e32 v2, v0 3400; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 3401; GFX1064-NEXT: v_bfrev_b32_e32 v1, 1 3402; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 3403; GFX1064-NEXT: s_not_b64 exec, exec 3404; GFX1064-NEXT: v_mov_b32_e32 v2, v1 3405; GFX1064-NEXT: s_not_b64 exec, exec 3406; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 3407; GFX1064-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3408; GFX1064-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 3409; GFX1064-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 3410; GFX1064-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 3411; GFX1064-NEXT: v_mov_b32_e32 v3, v2 3412; GFX1064-NEXT: v_permlanex16_b32 v3, v3, -1, -1 3413; GFX1064-NEXT: v_max_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 3414; GFX1064-NEXT: v_readlane_b32 s4, v2, 31 3415; GFX1064-NEXT: v_mov_b32_e32 v3, s4 3416; GFX1064-NEXT: v_max_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 3417; GFX1064-NEXT: v_readlane_b32 s4, v2, 15 3418; GFX1064-NEXT: 
v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3419; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 3420; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3421; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 3422; GFX1064-NEXT: v_readlane_b32 s5, v2, 31 3423; GFX1064-NEXT: v_writelane_b32 v1, s4, 16 3424; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 3425; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 3426; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 3427; GFX1064-NEXT: v_readlane_b32 s7, v2, 63 3428; GFX1064-NEXT: v_readlane_b32 s6, v2, 47 3429; GFX1064-NEXT: v_writelane_b32 v1, s5, 32 3430; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 3431; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0 3432; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 3433; GFX1064-NEXT: v_writelane_b32 v1, s6, 48 3434; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 3435; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3436; GFX1064-NEXT: s_mov_b32 s2, -1 3437; GFX1064-NEXT: ; implicit-def: $vgpr0 3438; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 3439; GFX1064-NEXT: s_cbranch_execz BB17_2 3440; GFX1064-NEXT: ; %bb.1: 3441; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo 3442; GFX1064-NEXT: v_mov_b32_e32 v4, s7 3443; GFX1064-NEXT: s_mov_b32 s3, s7 3444; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3445; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 3446; GFX1064-NEXT: ds_max_rtn_i32 v0, v7, v4 3447; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3448; GFX1064-NEXT: buffer_gl0_inv 3449; GFX1064-NEXT: BB17_2: 3450; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 3451; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 3452; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 3453; GFX1064-NEXT: v_mov_b32_e32 v0, v1 3454; GFX1064-NEXT: v_max_i32_e32 v0, s3, v0 3455; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 3456; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3457; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 3458; GFX1064-NEXT: s_endpgm 3459; 3460; GFX1032-LABEL: max_i32_varying: 3461; GFX1032: ; %bb.0: ; %entry 3462; GFX1032-NEXT: v_mov_b32_e32 v2, v0 3463; GFX1032-NEXT: 
s_or_saveexec_b32 s2, -1 3464; GFX1032-NEXT: v_bfrev_b32_e32 v1, 1 3465; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3466; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 3467; GFX1032-NEXT: v_mov_b32_e32 v2, v1 3468; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 3469; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3470; GFX1032-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3471; GFX1032-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 3472; GFX1032-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 3473; GFX1032-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 3474; GFX1032-NEXT: v_mov_b32_e32 v3, v2 3475; GFX1032-NEXT: v_permlanex16_b32 v3, v3, -1, -1 3476; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3477; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3478; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3479; GFX1032-NEXT: v_max_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 3480; GFX1032-NEXT: v_readlane_b32 s3, v2, 15 3481; GFX1032-NEXT: v_readlane_b32 s4, v2, 31 3482; GFX1032-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3483; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3484; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 3485; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3486; GFX1032-NEXT: v_writelane_b32 v1, s3, 16 3487; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3488; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 3489; GFX1032-NEXT: s_mov_b32 s2, -1 3490; GFX1032-NEXT: ; implicit-def: $vgpr0 3491; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 3492; GFX1032-NEXT: s_cbranch_execz BB17_2 3493; GFX1032-NEXT: ; %bb.1: 3494; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo 3495; GFX1032-NEXT: v_mov_b32_e32 v4, s4 3496; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3497; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 3498; GFX1032-NEXT: ds_max_rtn_i32 v0, v7, v4 3499; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3500; GFX1032-NEXT: buffer_gl0_inv 3501; GFX1032-NEXT: BB17_2: 3502; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 3503; 
GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 3504; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 3505; GFX1032-NEXT: v_mov_b32_e32 v0, v1 3506; GFX1032-NEXT: v_max_i32_e32 v0, s3, v0 3507; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 3508; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3509; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 3510; GFX1032-NEXT: s_endpgm 3511entry: 3512 %lane = call i32 @llvm.amdgcn.workitem.id.x() 3513 %old = atomicrmw max i32 addrspace(3)* @local_var32, i32 %lane acq_rel 3514 store i32 %old, i32 addrspace(1)* %out 3515 ret void 3516} 3517 3518define amdgpu_kernel void @max_i64_constant(i64 addrspace(1)* %out) { 3519; 3520; 3521; GFX7LESS-LABEL: max_i64_constant: 3522; GFX7LESS: ; %bb.0: ; %entry 3523; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 3524; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 3525; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 3526; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3527; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 3528; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc 3529; GFX7LESS-NEXT: s_cbranch_execz BB18_2 3530; GFX7LESS-NEXT: ; %bb.1: 3531; GFX7LESS-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 3532; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5 3533; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 3534; GFX7LESS-NEXT: s_mov_b32 m0, -1 3535; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3536; GFX7LESS-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] 3537; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3538; GFX7LESS-NEXT: BB18_2: 3539; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] 3540; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3541; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 3542; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 3543; GFX7LESS-NEXT: v_bfrev_b32_e32 v1, 1 3544; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 3545; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 3546; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc 3547; GFX7LESS-NEXT: v_mov_b32_e32 v2, s5 3548; GFX7LESS-NEXT: v_mov_b32_e32 v3, s4 3549; GFX7LESS-NEXT: v_cmp_gt_i64_e32 vcc, s[4:5], v[0:1] 3550; 
GFX7LESS-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 3551; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 3552; GFX7LESS-NEXT: s_mov_b32 s2, -1 3553; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 3554; GFX7LESS-NEXT: s_endpgm 3555; 3556; GFX8-LABEL: max_i64_constant: 3557; GFX8: ; %bb.0: ; %entry 3558; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3559; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3560; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 3561; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3562; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 3563; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 3564; GFX8-NEXT: s_cbranch_execz BB18_2 3565; GFX8-NEXT: ; %bb.1: 3566; GFX8-NEXT: v_mov_b32_e32 v0, 5 3567; GFX8-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 3568; GFX8-NEXT: v_mov_b32_e32 v1, 0 3569; GFX8-NEXT: s_mov_b32 m0, -1 3570; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3571; GFX8-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] 3572; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3573; GFX8-NEXT: BB18_2: 3574; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 3575; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3576; GFX8-NEXT: v_readfirstlane_b32 s2, v0 3577; GFX8-NEXT: v_bfrev_b32_e32 v0, 1 3578; GFX8-NEXT: v_readfirstlane_b32 s3, v1 3579; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc 3580; GFX8-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 3581; GFX8-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1] 3582; GFX8-NEXT: v_mov_b32_e32 v2, s3 3583; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 3584; GFX8-NEXT: v_mov_b32_e32 v2, s2 3585; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 3586; GFX8-NEXT: s_mov_b32 s3, 0xf000 3587; GFX8-NEXT: s_mov_b32 s2, -1 3588; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 3589; GFX8-NEXT: s_endpgm 3590; 3591; GFX9-LABEL: max_i64_constant: 3592; GFX9: ; %bb.0: ; %entry 3593; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3594; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3595; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 3596; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3597; GFX9-NEXT: ; implicit-def: 
$vgpr0_vgpr1 3598; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 3599; GFX9-NEXT: s_cbranch_execz BB18_2 3600; GFX9-NEXT: ; %bb.1: 3601; GFX9-NEXT: v_mov_b32_e32 v0, 5 3602; GFX9-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 3603; GFX9-NEXT: v_mov_b32_e32 v1, 0 3604; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3605; GFX9-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] 3606; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3607; GFX9-NEXT: BB18_2: 3608; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 3609; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3610; GFX9-NEXT: v_readfirstlane_b32 s2, v0 3611; GFX9-NEXT: v_bfrev_b32_e32 v0, 1 3612; GFX9-NEXT: v_readfirstlane_b32 s3, v1 3613; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc 3614; GFX9-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 3615; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1] 3616; GFX9-NEXT: v_mov_b32_e32 v2, s3 3617; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 3618; GFX9-NEXT: v_mov_b32_e32 v2, s2 3619; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 3620; GFX9-NEXT: s_mov_b32 s3, 0xf000 3621; GFX9-NEXT: s_mov_b32 s2, -1 3622; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 3623; GFX9-NEXT: s_endpgm 3624; 3625; GFX1064-LABEL: max_i64_constant: 3626; GFX1064: ; %bb.0: ; %entry 3627; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3628; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 3629; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0 3630; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3631; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 3632; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc 3633; GFX1064-NEXT: s_cbranch_execz BB18_2 3634; GFX1064-NEXT: ; %bb.1: 3635; GFX1064-NEXT: v_mov_b32_e32 v0, 5 3636; GFX1064-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 3637; GFX1064-NEXT: v_mov_b32_e32 v1, 0 3638; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3639; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 3640; GFX1064-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] 3641; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3642; GFX1064-NEXT: buffer_gl0_inv 3643; GFX1064-NEXT: BB18_2: 3644; GFX1064-NEXT: 
s_waitcnt_depctr 0xffe3 3645; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] 3646; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 3647; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 3648; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc 3649; GFX1064-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 3650; GFX1064-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1] 3651; GFX1064-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc 3652; GFX1064-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc 3653; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 3654; GFX1064-NEXT: s_mov_b32 s2, -1 3655; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3656; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 3657; GFX1064-NEXT: s_endpgm 3658; 3659; GFX1032-LABEL: max_i64_constant: 3660; GFX1032: ; %bb.0: ; %entry 3661; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3662; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 3663; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 3664; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 3665; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo 3666; GFX1032-NEXT: s_cbranch_execz BB18_2 3667; GFX1032-NEXT: ; %bb.1: 3668; GFX1032-NEXT: v_mov_b32_e32 v0, 5 3669; GFX1032-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 3670; GFX1032-NEXT: v_mov_b32_e32 v1, 0 3671; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3672; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 3673; GFX1032-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] 3674; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3675; GFX1032-NEXT: buffer_gl0_inv 3676; GFX1032-NEXT: BB18_2: 3677; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 3678; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 3679; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 3680; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 3681; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc_lo 3682; GFX1032-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc_lo 3683; GFX1032-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[2:3], v[0:1] 3684; GFX1032-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo 3685; GFX1032-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo 3686; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 3687; 
GFX1032-NEXT: s_mov_b32 s2, -1 3688; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3689; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 3690; GFX1032-NEXT: s_endpgm 3691entry: 3692 %old = atomicrmw max i64 addrspace(3)* @local_var64, i64 5 acq_rel 3693 store i64 %old, i64 addrspace(1)* %out 3694 ret void 3695} 3696 3697define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) { 3698; 3699; 3700; GFX7LESS-LABEL: min_i32_varying: 3701; GFX7LESS: ; %bb.0: ; %entry 3702; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 3703; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 3704; GFX7LESS-NEXT: s_mov_b32 m0, -1 3705; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3706; GFX7LESS-NEXT: ds_min_rtn_i32 v0, v1, v0 3707; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3708; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 3709; GFX7LESS-NEXT: s_mov_b32 s2, -1 3710; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 3711; GFX7LESS-NEXT: s_endpgm 3712; 3713; GFX8-LABEL: min_i32_varying: 3714; GFX8: ; %bb.0: ; %entry 3715; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3716; GFX8-NEXT: v_mov_b32_e32 v2, v0 3717; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3718; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 3719; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 3720; GFX8-NEXT: v_bfrev_b32_e32 v1, -2 3721; GFX8-NEXT: s_mov_b64 exec, s[2:3] 3722; GFX8-NEXT: s_not_b64 exec, exec 3723; GFX8-NEXT: v_mov_b32_e32 v2, v1 3724; GFX8-NEXT: s_not_b64 exec, exec 3725; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 3726; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3727; GFX8-NEXT: s_nop 1 3728; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 3729; GFX8-NEXT: s_nop 1 3730; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 3731; GFX8-NEXT: s_nop 1 3732; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 3733; GFX8-NEXT: s_nop 1 3734; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 3735; GFX8-NEXT: 
s_nop 1 3736; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 3737; GFX8-NEXT: v_readlane_b32 s4, v2, 63 3738; GFX8-NEXT: s_nop 0 3739; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 3740; GFX8-NEXT: s_mov_b64 exec, s[2:3] 3741; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3742; GFX8-NEXT: ; implicit-def: $vgpr0 3743; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 3744; GFX8-NEXT: s_cbranch_execz BB19_2 3745; GFX8-NEXT: ; %bb.1: 3746; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 3747; GFX8-NEXT: v_mov_b32_e32 v3, s4 3748; GFX8-NEXT: s_mov_b32 m0, -1 3749; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3750; GFX8-NEXT: ds_min_rtn_i32 v0, v0, v3 3751; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3752; GFX8-NEXT: BB19_2: 3753; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 3754; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3755; GFX8-NEXT: v_readfirstlane_b32 s2, v0 3756; GFX8-NEXT: v_mov_b32_e32 v0, v1 3757; GFX8-NEXT: v_min_i32_e32 v0, s2, v0 3758; GFX8-NEXT: s_mov_b32 s3, 0xf000 3759; GFX8-NEXT: s_mov_b32 s2, -1 3760; GFX8-NEXT: s_nop 0 3761; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 3762; GFX8-NEXT: s_endpgm 3763; 3764; GFX9-LABEL: min_i32_varying: 3765; GFX9: ; %bb.0: ; %entry 3766; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3767; GFX9-NEXT: v_mov_b32_e32 v2, v0 3768; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3769; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 3770; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 3771; GFX9-NEXT: v_bfrev_b32_e32 v1, -2 3772; GFX9-NEXT: s_mov_b64 exec, s[2:3] 3773; GFX9-NEXT: s_not_b64 exec, exec 3774; GFX9-NEXT: v_mov_b32_e32 v2, v1 3775; GFX9-NEXT: s_not_b64 exec, exec 3776; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 3777; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3778; GFX9-NEXT: s_nop 1 3779; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 3780; GFX9-NEXT: s_nop 1 3781; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 3782; GFX9-NEXT: s_nop 1 3783; 
GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 3784; GFX9-NEXT: s_nop 1 3785; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 3786; GFX9-NEXT: s_nop 1 3787; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 3788; GFX9-NEXT: v_readlane_b32 s4, v2, 63 3789; GFX9-NEXT: s_nop 0 3790; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 3791; GFX9-NEXT: s_mov_b64 exec, s[2:3] 3792; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3793; GFX9-NEXT: ; implicit-def: $vgpr0 3794; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 3795; GFX9-NEXT: s_cbranch_execz BB19_2 3796; GFX9-NEXT: ; %bb.1: 3797; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 3798; GFX9-NEXT: v_mov_b32_e32 v3, s4 3799; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3800; GFX9-NEXT: ds_min_rtn_i32 v0, v0, v3 3801; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3802; GFX9-NEXT: BB19_2: 3803; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 3804; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3805; GFX9-NEXT: v_readfirstlane_b32 s2, v0 3806; GFX9-NEXT: v_mov_b32_e32 v0, v1 3807; GFX9-NEXT: v_min_i32_e32 v0, s2, v0 3808; GFX9-NEXT: s_mov_b32 s3, 0xf000 3809; GFX9-NEXT: s_mov_b32 s2, -1 3810; GFX9-NEXT: s_nop 0 3811; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 3812; GFX9-NEXT: s_endpgm 3813; 3814; GFX1064-LABEL: min_i32_varying: 3815; GFX1064: ; %bb.0: ; %entry 3816; GFX1064-NEXT: v_mov_b32_e32 v2, v0 3817; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 3818; GFX1064-NEXT: v_bfrev_b32_e32 v1, -2 3819; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 3820; GFX1064-NEXT: s_not_b64 exec, exec 3821; GFX1064-NEXT: v_mov_b32_e32 v2, v1 3822; GFX1064-NEXT: s_not_b64 exec, exec 3823; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 3824; GFX1064-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3825; GFX1064-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 3826; GFX1064-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 3827; GFX1064-NEXT: v_min_i32_dpp 
v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 3828; GFX1064-NEXT: v_mov_b32_e32 v3, v2 3829; GFX1064-NEXT: v_permlanex16_b32 v3, v3, -1, -1 3830; GFX1064-NEXT: v_min_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 3831; GFX1064-NEXT: v_readlane_b32 s4, v2, 31 3832; GFX1064-NEXT: v_mov_b32_e32 v3, s4 3833; GFX1064-NEXT: v_min_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 3834; GFX1064-NEXT: v_readlane_b32 s4, v2, 15 3835; GFX1064-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3836; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 3837; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3838; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 3839; GFX1064-NEXT: v_readlane_b32 s5, v2, 31 3840; GFX1064-NEXT: v_writelane_b32 v1, s4, 16 3841; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 3842; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 3843; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 3844; GFX1064-NEXT: v_readlane_b32 s7, v2, 63 3845; GFX1064-NEXT: v_readlane_b32 s6, v2, 47 3846; GFX1064-NEXT: v_writelane_b32 v1, s5, 32 3847; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 3848; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0 3849; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 3850; GFX1064-NEXT: v_writelane_b32 v1, s6, 48 3851; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 3852; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3853; GFX1064-NEXT: s_mov_b32 s2, -1 3854; GFX1064-NEXT: ; implicit-def: $vgpr0 3855; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 3856; GFX1064-NEXT: s_cbranch_execz BB19_2 3857; GFX1064-NEXT: ; %bb.1: 3858; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo 3859; GFX1064-NEXT: v_mov_b32_e32 v4, s7 3860; GFX1064-NEXT: s_mov_b32 s3, s7 3861; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3862; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 3863; GFX1064-NEXT: ds_min_rtn_i32 v0, v7, v4 3864; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3865; GFX1064-NEXT: buffer_gl0_inv 3866; GFX1064-NEXT: BB19_2: 3867; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 3868; 
GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 3869; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 3870; GFX1064-NEXT: v_mov_b32_e32 v0, v1 3871; GFX1064-NEXT: v_min_i32_e32 v0, s3, v0 3872; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 3873; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 3874; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 3875; GFX1064-NEXT: s_endpgm 3876; 3877; GFX1032-LABEL: min_i32_varying: 3878; GFX1032: ; %bb.0: ; %entry 3879; GFX1032-NEXT: v_mov_b32_e32 v2, v0 3880; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3881; GFX1032-NEXT: v_bfrev_b32_e32 v1, -2 3882; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3883; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 3884; GFX1032-NEXT: v_mov_b32_e32 v2, v1 3885; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 3886; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3887; GFX1032-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3888; GFX1032-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 3889; GFX1032-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 3890; GFX1032-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 3891; GFX1032-NEXT: v_mov_b32_e32 v3, v2 3892; GFX1032-NEXT: v_permlanex16_b32 v3, v3, -1, -1 3893; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3894; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3895; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3896; GFX1032-NEXT: v_min_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 3897; GFX1032-NEXT: v_readlane_b32 s3, v2, 15 3898; GFX1032-NEXT: v_readlane_b32 s4, v2, 31 3899; GFX1032-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf 3900; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3901; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 3902; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 3903; GFX1032-NEXT: v_writelane_b32 v1, s3, 16 3904; GFX1032-NEXT: s_mov_b32 exec_lo, s2 3905; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 3906; GFX1032-NEXT: s_mov_b32 s2, -1 3907; GFX1032-NEXT: ; implicit-def: $vgpr0 3908; GFX1032-NEXT: 
s_and_saveexec_b32 s3, vcc_lo 3909; GFX1032-NEXT: s_cbranch_execz BB19_2 3910; GFX1032-NEXT: ; %bb.1: 3911; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo 3912; GFX1032-NEXT: v_mov_b32_e32 v4, s4 3913; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3914; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 3915; GFX1032-NEXT: ds_min_rtn_i32 v0, v7, v4 3916; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3917; GFX1032-NEXT: buffer_gl0_inv 3918; GFX1032-NEXT: BB19_2: 3919; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 3920; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 3921; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 3922; GFX1032-NEXT: v_mov_b32_e32 v0, v1 3923; GFX1032-NEXT: v_min_i32_e32 v0, s3, v0 3924; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 3925; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 3926; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 3927; GFX1032-NEXT: s_endpgm 3928entry: 3929 %lane = call i32 @llvm.amdgcn.workitem.id.x() 3930 %old = atomicrmw min i32 addrspace(3)* @local_var32, i32 %lane acq_rel 3931 store i32 %old, i32 addrspace(1)* %out 3932 ret void 3933} 3934 3935define amdgpu_kernel void @min_i64_constant(i64 addrspace(1)* %out) { 3936; 3937; 3938; GFX7LESS-LABEL: min_i64_constant: 3939; GFX7LESS: ; %bb.0: ; %entry 3940; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 3941; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 3942; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 3943; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3944; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 3945; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc 3946; GFX7LESS-NEXT: s_cbranch_execz BB20_2 3947; GFX7LESS-NEXT: ; %bb.1: 3948; GFX7LESS-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 3949; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5 3950; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 3951; GFX7LESS-NEXT: s_mov_b32 m0, -1 3952; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3953; GFX7LESS-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] 3954; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3955; GFX7LESS-NEXT: BB20_2: 3956; GFX7LESS-NEXT: s_or_b64 exec, 
exec, s[2:3] 3957; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 3958; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 3959; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 3960; GFX7LESS-NEXT: v_bfrev_b32_e32 v1, -2 3961; GFX7LESS-NEXT: s_mov_b32 s2, -1 3962; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 3963; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc 3964; GFX7LESS-NEXT: v_mov_b32_e32 v2, s5 3965; GFX7LESS-NEXT: v_mov_b32_e32 v3, s4 3966; GFX7LESS-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] 3967; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 3968; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 3969; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 3970; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 3971; GFX7LESS-NEXT: s_endpgm 3972; 3973; GFX8-LABEL: min_i64_constant: 3974; GFX8: ; %bb.0: ; %entry 3975; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3976; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 3977; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 3978; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 3979; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 3980; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 3981; GFX8-NEXT: s_cbranch_execz BB20_2 3982; GFX8-NEXT: ; %bb.1: 3983; GFX8-NEXT: v_mov_b32_e32 v0, 5 3984; GFX8-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 3985; GFX8-NEXT: v_mov_b32_e32 v1, 0 3986; GFX8-NEXT: s_mov_b32 m0, -1 3987; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3988; GFX8-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] 3989; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3990; GFX8-NEXT: BB20_2: 3991; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 3992; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3993; GFX8-NEXT: v_readfirstlane_b32 s4, v0 3994; GFX8-NEXT: v_bfrev_b32_e32 v0, -2 3995; GFX8-NEXT: v_readfirstlane_b32 s5, v1 3996; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc 3997; GFX8-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 3998; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] 3999; GFX8-NEXT: v_mov_b32_e32 v2, s5 4000; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 4001; GFX8-NEXT: v_mov_b32_e32 v2, s4 4002; GFX8-NEXT: s_mov_b32 
s2, -1 4003; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 4004; GFX8-NEXT: s_mov_b32 s3, 0xf000 4005; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4006; GFX8-NEXT: s_endpgm 4007; 4008; GFX9-LABEL: min_i64_constant: 4009; GFX9: ; %bb.0: ; %entry 4010; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4011; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4012; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 4013; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4014; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 4015; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 4016; GFX9-NEXT: s_cbranch_execz BB20_2 4017; GFX9-NEXT: ; %bb.1: 4018; GFX9-NEXT: v_mov_b32_e32 v0, 5 4019; GFX9-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 4020; GFX9-NEXT: v_mov_b32_e32 v1, 0 4021; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4022; GFX9-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] 4023; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4024; GFX9-NEXT: BB20_2: 4025; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 4026; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4027; GFX9-NEXT: v_readfirstlane_b32 s4, v0 4028; GFX9-NEXT: v_bfrev_b32_e32 v0, -2 4029; GFX9-NEXT: v_readfirstlane_b32 s5, v1 4030; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc 4031; GFX9-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 4032; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] 4033; GFX9-NEXT: v_mov_b32_e32 v2, s5 4034; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 4035; GFX9-NEXT: v_mov_b32_e32 v2, s4 4036; GFX9-NEXT: s_mov_b32 s2, -1 4037; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 4038; GFX9-NEXT: s_mov_b32 s3, 0xf000 4039; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4040; GFX9-NEXT: s_endpgm 4041; 4042; GFX1064-LABEL: min_i64_constant: 4043; GFX1064: ; %bb.0: ; %entry 4044; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4045; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 4046; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0 4047; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4048; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 4049; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc 
4050; GFX1064-NEXT: s_cbranch_execz BB20_2 4051; GFX1064-NEXT: ; %bb.1: 4052; GFX1064-NEXT: v_mov_b32_e32 v0, 5 4053; GFX1064-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 4054; GFX1064-NEXT: v_mov_b32_e32 v1, 0 4055; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4056; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 4057; GFX1064-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] 4058; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 4059; GFX1064-NEXT: buffer_gl0_inv 4060; GFX1064-NEXT: BB20_2: 4061; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 4062; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] 4063; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 4064; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 4065; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc 4066; GFX1064-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 4067; GFX1064-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1] 4068; GFX1064-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc 4069; GFX1064-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc 4070; GFX1064-NEXT: s_mov_b32 s2, -1 4071; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 4072; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 4073; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4074; GFX1064-NEXT: s_endpgm 4075; 4076; GFX1032-LABEL: min_i64_constant: 4077; GFX1032: ; %bb.0: ; %entry 4078; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4079; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 4080; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 4081; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 4082; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo 4083; GFX1032-NEXT: s_cbranch_execz BB20_2 4084; GFX1032-NEXT: ; %bb.1: 4085; GFX1032-NEXT: v_mov_b32_e32 v0, 5 4086; GFX1032-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 4087; GFX1032-NEXT: v_mov_b32_e32 v1, 0 4088; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4089; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 4090; GFX1032-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] 4091; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 4092; GFX1032-NEXT: buffer_gl0_inv 4093; GFX1032-NEXT: BB20_2: 4094; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 
4095; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 4096; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 4097; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 4098; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc_lo 4099; GFX1032-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc_lo 4100; GFX1032-NEXT: v_cmp_lt_i64_e32 vcc_lo, s[2:3], v[0:1] 4101; GFX1032-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo 4102; GFX1032-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo 4103; GFX1032-NEXT: s_mov_b32 s2, -1 4104; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 4105; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 4106; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4107; GFX1032-NEXT: s_endpgm 4108entry: 4109 %old = atomicrmw min i64 addrspace(3)* @local_var64, i64 5 acq_rel 4110 store i64 %old, i64 addrspace(1)* %out 4111 ret void 4112} 4113 4114define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) { 4115; 4116; 4117; GFX7LESS-LABEL: umax_i32_varying: 4118; GFX7LESS: ; %bb.0: ; %entry 4119; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 4120; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 4121; GFX7LESS-NEXT: s_mov_b32 m0, -1 4122; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 4123; GFX7LESS-NEXT: ds_max_rtn_u32 v0, v1, v0 4124; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 4125; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 4126; GFX7LESS-NEXT: s_mov_b32 s2, -1 4127; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 4128; GFX7LESS-NEXT: s_endpgm 4129; 4130; GFX8-LABEL: umax_i32_varying: 4131; GFX8: ; %bb.0: ; %entry 4132; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4133; GFX8-NEXT: v_mov_b32_e32 v2, v0 4134; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 4135; GFX8-NEXT: v_mov_b32_e32 v1, 0 4136; GFX8-NEXT: s_mov_b64 exec, s[2:3] 4137; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4138; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 4139; GFX8-NEXT: s_not_b64 exec, exec 4140; GFX8-NEXT: v_mov_b32_e32 v2, 0 4141; GFX8-NEXT: s_not_b64 exec, exec 4142; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 4143; GFX8-NEXT: v_max_u32_dpp 
v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 4144; GFX8-NEXT: s_nop 1 4145; GFX8-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 4146; GFX8-NEXT: s_nop 1 4147; GFX8-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 4148; GFX8-NEXT: s_nop 1 4149; GFX8-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 4150; GFX8-NEXT: s_nop 1 4151; GFX8-NEXT: v_max_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 4152; GFX8-NEXT: s_nop 1 4153; GFX8-NEXT: v_max_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 4154; GFX8-NEXT: v_readlane_b32 s4, v2, 63 4155; GFX8-NEXT: s_nop 0 4156; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 4157; GFX8-NEXT: s_mov_b64 exec, s[2:3] 4158; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4159; GFX8-NEXT: ; implicit-def: $vgpr0 4160; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 4161; GFX8-NEXT: s_cbranch_execz BB21_2 4162; GFX8-NEXT: ; %bb.1: 4163; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 4164; GFX8-NEXT: v_mov_b32_e32 v3, s4 4165; GFX8-NEXT: s_mov_b32 m0, -1 4166; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4167; GFX8-NEXT: ds_max_rtn_u32 v0, v0, v3 4168; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4169; GFX8-NEXT: BB21_2: 4170; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 4171; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4172; GFX8-NEXT: v_readfirstlane_b32 s2, v0 4173; GFX8-NEXT: v_mov_b32_e32 v0, v1 4174; GFX8-NEXT: v_max_u32_e32 v0, s2, v0 4175; GFX8-NEXT: s_mov_b32 s3, 0xf000 4176; GFX8-NEXT: s_mov_b32 s2, -1 4177; GFX8-NEXT: s_nop 0 4178; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 4179; GFX8-NEXT: s_endpgm 4180; 4181; GFX9-LABEL: umax_i32_varying: 4182; GFX9: ; %bb.0: ; %entry 4183; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4184; GFX9-NEXT: v_mov_b32_e32 v2, v0 4185; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 4186; GFX9-NEXT: v_mov_b32_e32 v1, 0 4187; GFX9-NEXT: s_mov_b64 exec, s[2:3] 4188; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, 
exec_lo, 0 4189; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 4190; GFX9-NEXT: s_not_b64 exec, exec 4191; GFX9-NEXT: v_mov_b32_e32 v2, 0 4192; GFX9-NEXT: s_not_b64 exec, exec 4193; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 4194; GFX9-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 4195; GFX9-NEXT: s_nop 1 4196; GFX9-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 4197; GFX9-NEXT: s_nop 1 4198; GFX9-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 4199; GFX9-NEXT: s_nop 1 4200; GFX9-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 4201; GFX9-NEXT: s_nop 1 4202; GFX9-NEXT: v_max_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 4203; GFX9-NEXT: s_nop 1 4204; GFX9-NEXT: v_max_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 4205; GFX9-NEXT: v_readlane_b32 s4, v2, 63 4206; GFX9-NEXT: s_nop 0 4207; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 4208; GFX9-NEXT: s_mov_b64 exec, s[2:3] 4209; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4210; GFX9-NEXT: ; implicit-def: $vgpr0 4211; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 4212; GFX9-NEXT: s_cbranch_execz BB21_2 4213; GFX9-NEXT: ; %bb.1: 4214; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 4215; GFX9-NEXT: v_mov_b32_e32 v3, s4 4216; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4217; GFX9-NEXT: ds_max_rtn_u32 v0, v0, v3 4218; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4219; GFX9-NEXT: BB21_2: 4220; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 4221; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4222; GFX9-NEXT: v_readfirstlane_b32 s2, v0 4223; GFX9-NEXT: v_mov_b32_e32 v0, v1 4224; GFX9-NEXT: v_max_u32_e32 v0, s2, v0 4225; GFX9-NEXT: s_mov_b32 s3, 0xf000 4226; GFX9-NEXT: s_mov_b32 s2, -1 4227; GFX9-NEXT: s_nop 0 4228; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 4229; GFX9-NEXT: s_endpgm 4230; 4231; GFX1064-LABEL: umax_i32_varying: 4232; GFX1064: ; %bb.0: ; %entry 4233; GFX1064-NEXT: v_mov_b32_e32 
v1, v0 4234; GFX1064-NEXT: s_not_b64 exec, exec 4235; GFX1064-NEXT: v_mov_b32_e32 v1, 0 4236; GFX1064-NEXT: s_not_b64 exec, exec 4237; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 4238; GFX1064-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 4239; GFX1064-NEXT: v_mov_b32_e32 v3, 0 4240; GFX1064-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 4241; GFX1064-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 4242; GFX1064-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 4243; GFX1064-NEXT: v_mov_b32_e32 v2, v1 4244; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 4245; GFX1064-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 4246; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 4247; GFX1064-NEXT: v_mov_b32_e32 v2, s4 4248; GFX1064-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 4249; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 4250; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 4251; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 4252; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4253; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 4254; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 4255; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 4256; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 4257; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 4258; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 4259; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 4260; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 4261; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 4262; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 4263; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0 4264; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 4265; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 4266; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 4267; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4268; GFX1064-NEXT: s_mov_b32 s2, -1 4269; GFX1064-NEXT: ; implicit-def: $vgpr0 4270; 
GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 4271; GFX1064-NEXT: s_cbranch_execz BB21_2 4272; GFX1064-NEXT: ; %bb.1: 4273; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo 4274; GFX1064-NEXT: v_mov_b32_e32 v4, s7 4275; GFX1064-NEXT: s_mov_b32 s3, s7 4276; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4277; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 4278; GFX1064-NEXT: ds_max_rtn_u32 v0, v7, v4 4279; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 4280; GFX1064-NEXT: buffer_gl0_inv 4281; GFX1064-NEXT: BB21_2: 4282; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 4283; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 4284; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 4285; GFX1064-NEXT: v_mov_b32_e32 v0, v3 4286; GFX1064-NEXT: v_max_u32_e32 v0, s3, v0 4287; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 4288; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 4289; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 4290; GFX1064-NEXT: s_endpgm 4291; 4292; GFX1032-LABEL: umax_i32_varying: 4293; GFX1032: ; %bb.0: ; %entry 4294; GFX1032-NEXT: v_mov_b32_e32 v1, v0 4295; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 4296; GFX1032-NEXT: v_mov_b32_e32 v1, 0 4297; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 4298; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 4299; GFX1032-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 4300; GFX1032-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 4301; GFX1032-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 4302; GFX1032-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 4303; GFX1032-NEXT: v_mov_b32_e32 v2, v1 4304; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 4305; GFX1032-NEXT: s_mov_b32 exec_lo, s2 4306; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4307; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 4308; GFX1032-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 4309; GFX1032-NEXT: v_mov_b32_e32 v3, 0 4310; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 4311; 
GFX1032-NEXT: v_readlane_b32 s4, v1, 31 4312; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 4313; GFX1032-NEXT: s_mov_b32 exec_lo, s2 4314; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 4315; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 4316; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 4317; GFX1032-NEXT: s_mov_b32 exec_lo, s2 4318; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 4319; GFX1032-NEXT: s_mov_b32 s2, -1 4320; GFX1032-NEXT: ; implicit-def: $vgpr0 4321; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 4322; GFX1032-NEXT: s_cbranch_execz BB21_2 4323; GFX1032-NEXT: ; %bb.1: 4324; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo 4325; GFX1032-NEXT: v_mov_b32_e32 v4, s4 4326; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4327; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 4328; GFX1032-NEXT: ds_max_rtn_u32 v0, v7, v4 4329; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 4330; GFX1032-NEXT: buffer_gl0_inv 4331; GFX1032-NEXT: BB21_2: 4332; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 4333; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 4334; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 4335; GFX1032-NEXT: v_mov_b32_e32 v0, v3 4336; GFX1032-NEXT: v_max_u32_e32 v0, s3, v0 4337; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 4338; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 4339; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 4340; GFX1032-NEXT: s_endpgm 4341entry: 4342 %lane = call i32 @llvm.amdgcn.workitem.id.x() 4343 %old = atomicrmw umax i32 addrspace(3)* @local_var32, i32 %lane acq_rel 4344 store i32 %old, i32 addrspace(1)* %out 4345 ret void 4346} 4347 4348define amdgpu_kernel void @umax_i64_constant(i64 addrspace(1)* %out) { 4349; 4350; 4351; GFX7LESS-LABEL: umax_i64_constant: 4352; GFX7LESS: ; %bb.0: ; %entry 4353; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 4354; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 4355; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 4356; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4357; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 
4358; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc 4359; GFX7LESS-NEXT: s_cbranch_execz BB22_2 4360; GFX7LESS-NEXT: ; %bb.1: 4361; GFX7LESS-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 4362; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5 4363; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 4364; GFX7LESS-NEXT: s_mov_b32 m0, -1 4365; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 4366; GFX7LESS-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] 4367; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 4368; GFX7LESS-NEXT: BB22_2: 4369; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] 4370; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 4371; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 4372; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 4373; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 4374; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 4375; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 4376; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4 4377; GFX7LESS-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[0:1] 4378; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 4379; GFX7LESS-NEXT: v_mov_b32_e32 v1, s5 4380; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc 4381; GFX7LESS-NEXT: s_mov_b32 s2, -1 4382; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4383; GFX7LESS-NEXT: s_endpgm 4384; 4385; GFX8-LABEL: umax_i64_constant: 4386; GFX8: ; %bb.0: ; %entry 4387; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4388; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4389; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 4390; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4391; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 4392; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 4393; GFX8-NEXT: s_cbranch_execz BB22_2 4394; GFX8-NEXT: ; %bb.1: 4395; GFX8-NEXT: v_mov_b32_e32 v0, 5 4396; GFX8-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 4397; GFX8-NEXT: v_mov_b32_e32 v1, 0 4398; GFX8-NEXT: s_mov_b32 m0, -1 4399; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4400; GFX8-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] 4401; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4402; GFX8-NEXT: BB22_2: 4403; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 4404; GFX8-NEXT: 
s_waitcnt lgkmcnt(0) 4405; GFX8-NEXT: v_readfirstlane_b32 s2, v0 4406; GFX8-NEXT: v_readfirstlane_b32 s3, v1 4407; GFX8-NEXT: v_mov_b32_e32 v1, 0 4408; GFX8-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 4409; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1] 4410; GFX8-NEXT: v_mov_b32_e32 v1, s3 4411; GFX8-NEXT: v_mov_b32_e32 v2, s2 4412; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 4413; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc 4414; GFX8-NEXT: s_mov_b32 s3, 0xf000 4415; GFX8-NEXT: s_mov_b32 s2, -1 4416; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4417; GFX8-NEXT: s_endpgm 4418; 4419; GFX9-LABEL: umax_i64_constant: 4420; GFX9: ; %bb.0: ; %entry 4421; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4422; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4423; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 4424; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4425; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 4426; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 4427; GFX9-NEXT: s_cbranch_execz BB22_2 4428; GFX9-NEXT: ; %bb.1: 4429; GFX9-NEXT: v_mov_b32_e32 v0, 5 4430; GFX9-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 4431; GFX9-NEXT: v_mov_b32_e32 v1, 0 4432; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4433; GFX9-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] 4434; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4435; GFX9-NEXT: BB22_2: 4436; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 4437; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4438; GFX9-NEXT: v_readfirstlane_b32 s2, v0 4439; GFX9-NEXT: v_readfirstlane_b32 s3, v1 4440; GFX9-NEXT: v_mov_b32_e32 v1, 0 4441; GFX9-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 4442; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1] 4443; GFX9-NEXT: v_mov_b32_e32 v1, s3 4444; GFX9-NEXT: v_mov_b32_e32 v2, s2 4445; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 4446; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc 4447; GFX9-NEXT: s_mov_b32 s3, 0xf000 4448; GFX9-NEXT: s_mov_b32 s2, -1 4449; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4450; GFX9-NEXT: s_endpgm 4451; 4452; GFX1064-LABEL: umax_i64_constant: 4453; 
GFX1064: ; %bb.0: ; %entry 4454; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4455; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 4456; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0 4457; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4458; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 4459; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc 4460; GFX1064-NEXT: s_cbranch_execz BB22_2 4461; GFX1064-NEXT: ; %bb.1: 4462; GFX1064-NEXT: v_mov_b32_e32 v0, 5 4463; GFX1064-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 4464; GFX1064-NEXT: v_mov_b32_e32 v1, 0 4465; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4466; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 4467; GFX1064-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] 4468; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 4469; GFX1064-NEXT: buffer_gl0_inv 4470; GFX1064-NEXT: BB22_2: 4471; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 4472; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] 4473; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 4474; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 4475; GFX1064-NEXT: v_mov_b32_e32 v1, 0 4476; GFX1064-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc 4477; GFX1064-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1] 4478; GFX1064-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc 4479; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, s3, vcc 4480; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 4481; GFX1064-NEXT: s_mov_b32 s2, -1 4482; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 4483; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4484; GFX1064-NEXT: s_endpgm 4485; 4486; GFX1032-LABEL: umax_i64_constant: 4487; GFX1032: ; %bb.0: ; %entry 4488; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4489; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 4490; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 4491; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 4492; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo 4493; GFX1032-NEXT: s_cbranch_execz BB22_2 4494; GFX1032-NEXT: ; %bb.1: 4495; GFX1032-NEXT: v_mov_b32_e32 v0, 5 4496; GFX1032-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 4497; 
GFX1032-NEXT: v_mov_b32_e32 v1, 0 4498; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4499; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 4500; GFX1032-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] 4501; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 4502; GFX1032-NEXT: buffer_gl0_inv 4503; GFX1032-NEXT: BB22_2: 4504; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 4505; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 4506; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 4507; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 4508; GFX1032-NEXT: v_mov_b32_e32 v1, 0 4509; GFX1032-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc_lo 4510; GFX1032-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[2:3], v[0:1] 4511; GFX1032-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo 4512; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, s3, vcc_lo 4513; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 4514; GFX1032-NEXT: s_mov_b32 s2, -1 4515; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 4516; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4517; GFX1032-NEXT: s_endpgm 4518entry: 4519 %old = atomicrmw umax i64 addrspace(3)* @local_var64, i64 5 acq_rel 4520 store i64 %old, i64 addrspace(1)* %out 4521 ret void 4522} 4523 4524define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) { 4525; 4526; 4527; GFX7LESS-LABEL: umin_i32_varying: 4528; GFX7LESS: ; %bb.0: ; %entry 4529; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 4530; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo 4531; GFX7LESS-NEXT: s_mov_b32 m0, -1 4532; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 4533; GFX7LESS-NEXT: ds_min_rtn_u32 v0, v1, v0 4534; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 4535; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 4536; GFX7LESS-NEXT: s_mov_b32 s2, -1 4537; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 4538; GFX7LESS-NEXT: s_endpgm 4539; 4540; GFX8-LABEL: umin_i32_varying: 4541; GFX8: ; %bb.0: ; %entry 4542; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4543; GFX8-NEXT: v_mov_b32_e32 v2, v0 4544; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4545; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 4546; 
GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 4547; GFX8-NEXT: v_mov_b32_e32 v1, -1 4548; GFX8-NEXT: s_mov_b64 exec, s[2:3] 4549; GFX8-NEXT: s_not_b64 exec, exec 4550; GFX8-NEXT: v_mov_b32_e32 v2, -1 4551; GFX8-NEXT: s_not_b64 exec, exec 4552; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 4553; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 4554; GFX8-NEXT: s_nop 1 4555; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 4556; GFX8-NEXT: s_nop 1 4557; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 4558; GFX8-NEXT: s_nop 1 4559; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 4560; GFX8-NEXT: s_nop 1 4561; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 4562; GFX8-NEXT: s_nop 1 4563; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 4564; GFX8-NEXT: v_readlane_b32 s4, v2, 63 4565; GFX8-NEXT: s_nop 0 4566; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 4567; GFX8-NEXT: s_mov_b64 exec, s[2:3] 4568; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4569; GFX8-NEXT: ; implicit-def: $vgpr0 4570; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 4571; GFX8-NEXT: s_cbranch_execz BB23_2 4572; GFX8-NEXT: ; %bb.1: 4573; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 4574; GFX8-NEXT: v_mov_b32_e32 v3, s4 4575; GFX8-NEXT: s_mov_b32 m0, -1 4576; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4577; GFX8-NEXT: ds_min_rtn_u32 v0, v0, v3 4578; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4579; GFX8-NEXT: BB23_2: 4580; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 4581; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4582; GFX8-NEXT: v_readfirstlane_b32 s2, v0 4583; GFX8-NEXT: v_mov_b32_e32 v0, v1 4584; GFX8-NEXT: v_min_u32_e32 v0, s2, v0 4585; GFX8-NEXT: s_mov_b32 s3, 0xf000 4586; GFX8-NEXT: s_mov_b32 s2, -1 4587; GFX8-NEXT: s_nop 0 4588; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 4589; GFX8-NEXT: s_endpgm 4590; 4591; GFX9-LABEL: umin_i32_varying: 4592; GFX9: ; %bb.0: ; %entry 
4593; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4594; GFX9-NEXT: v_mov_b32_e32 v2, v0 4595; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4596; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 4597; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 4598; GFX9-NEXT: v_mov_b32_e32 v1, -1 4599; GFX9-NEXT: s_mov_b64 exec, s[2:3] 4600; GFX9-NEXT: s_not_b64 exec, exec 4601; GFX9-NEXT: v_mov_b32_e32 v2, -1 4602; GFX9-NEXT: s_not_b64 exec, exec 4603; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 4604; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf 4605; GFX9-NEXT: s_nop 1 4606; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf 4607; GFX9-NEXT: s_nop 1 4608; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf 4609; GFX9-NEXT: s_nop 1 4610; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf 4611; GFX9-NEXT: s_nop 1 4612; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf 4613; GFX9-NEXT: s_nop 1 4614; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 4615; GFX9-NEXT: v_readlane_b32 s4, v2, 63 4616; GFX9-NEXT: s_nop 0 4617; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 4618; GFX9-NEXT: s_mov_b64 exec, s[2:3] 4619; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4620; GFX9-NEXT: ; implicit-def: $vgpr0 4621; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 4622; GFX9-NEXT: s_cbranch_execz BB23_2 4623; GFX9-NEXT: ; %bb.1: 4624; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo 4625; GFX9-NEXT: v_mov_b32_e32 v3, s4 4626; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4627; GFX9-NEXT: ds_min_rtn_u32 v0, v0, v3 4628; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4629; GFX9-NEXT: BB23_2: 4630; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 4631; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4632; GFX9-NEXT: v_readfirstlane_b32 s2, v0 4633; GFX9-NEXT: v_mov_b32_e32 v0, v1 4634; GFX9-NEXT: v_min_u32_e32 v0, s2, v0 4635; GFX9-NEXT: s_mov_b32 s3, 0xf000 4636; GFX9-NEXT: s_mov_b32 s2, -1 4637; GFX9-NEXT: 
s_nop 0 4638; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 4639; GFX9-NEXT: s_endpgm 4640; 4641; GFX1064-LABEL: umin_i32_varying: 4642; GFX1064: ; %bb.0: ; %entry 4643; GFX1064-NEXT: v_mov_b32_e32 v1, v0 4644; GFX1064-NEXT: s_not_b64 exec, exec 4645; GFX1064-NEXT: v_mov_b32_e32 v1, -1 4646; GFX1064-NEXT: s_not_b64 exec, exec 4647; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 4648; GFX1064-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 4649; GFX1064-NEXT: v_mov_b32_e32 v3, -1 4650; GFX1064-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 4651; GFX1064-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 4652; GFX1064-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 4653; GFX1064-NEXT: v_mov_b32_e32 v2, v1 4654; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 4655; GFX1064-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 4656; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 4657; GFX1064-NEXT: v_mov_b32_e32 v2, s4 4658; GFX1064-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf 4659; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 4660; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 4661; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 4662; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4663; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 4664; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 4665; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 4666; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 4667; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 4668; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 4669; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 4670; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 4671; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 4672; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 4673; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0 4674; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 4675; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 4676; GFX1064-NEXT: s_mov_b64 
exec, s[4:5] 4677; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4678; GFX1064-NEXT: s_mov_b32 s2, -1 4679; GFX1064-NEXT: ; implicit-def: $vgpr0 4680; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc 4681; GFX1064-NEXT: s_cbranch_execz BB23_2 4682; GFX1064-NEXT: ; %bb.1: 4683; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo 4684; GFX1064-NEXT: v_mov_b32_e32 v4, s7 4685; GFX1064-NEXT: s_mov_b32 s3, s7 4686; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4687; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 4688; GFX1064-NEXT: ds_min_rtn_u32 v0, v7, v4 4689; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 4690; GFX1064-NEXT: buffer_gl0_inv 4691; GFX1064-NEXT: BB23_2: 4692; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 4693; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] 4694; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 4695; GFX1064-NEXT: v_mov_b32_e32 v0, v3 4696; GFX1064-NEXT: v_min_u32_e32 v0, s3, v0 4697; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 4698; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 4699; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 4700; GFX1064-NEXT: s_endpgm 4701; 4702; GFX1032-LABEL: umin_i32_varying: 4703; GFX1032: ; %bb.0: ; %entry 4704; GFX1032-NEXT: v_mov_b32_e32 v1, v0 4705; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 4706; GFX1032-NEXT: v_mov_b32_e32 v1, -1 4707; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo 4708; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 4709; GFX1032-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 4710; GFX1032-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 4711; GFX1032-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf 4712; GFX1032-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 4713; GFX1032-NEXT: v_mov_b32_e32 v2, v1 4714; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 4715; GFX1032-NEXT: s_mov_b32 exec_lo, s2 4716; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4717; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 4718; GFX1032-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa 
; NOTE(review): the text below is whitespace-collapsed and carries fused listing
; numbers (4718-4933); it is reproduced unchanged, only this header is added.
;
; The span first finishes @umin_i32_varying: the remaining GFX1032 (the
; +wavefrontsize32 RUN line) checks and the IR body, which performs
; `atomicrmw umin` on @local_var32 with the workitem id as operand (acq_rel)
; and stores the returned old value to %out.
;
; It then holds @umin_i64_constant: `atomicrmw umin` of the constant 5 on
; @local_var64 (acq_rel), with the old value stored to %out. For each check
; prefix (GFX7LESS, GFX8, GFX9, GFX1064, GFX1032) the expected assembly:
;   - enables only the lanes whose v_mbcnt result is 0 (v_cmp_eq_u32 +
;     s_and_saveexec) and issues a single ds_min_rtn_u64 from them,
;   - reads the returned pair back with v_readfirstlane_b32,
;   - selects -1 or 5 per lane with v_cndmask and combines it with the returned
;     value via v_cmp_lt_u64 / v_cndmask before the buffer_store_dwordx2.
;
; The check lines are autogenerated by utils/update_llc_test_checks.py (see the
; NOTE at the top of the file); regenerate them instead of editing by hand.
bank_mask:0xf 4719; GFX1032-NEXT: v_mov_b32_e32 v3, -1 4720; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 4721; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 4722; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf 4723; GFX1032-NEXT: s_mov_b32 exec_lo, s2 4724; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 4725; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 4726; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 4727; GFX1032-NEXT: s_mov_b32 exec_lo, s2 4728; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 4729; GFX1032-NEXT: s_mov_b32 s2, -1 4730; GFX1032-NEXT: ; implicit-def: $vgpr0 4731; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo 4732; GFX1032-NEXT: s_cbranch_execz BB23_2 4733; GFX1032-NEXT: ; %bb.1: 4734; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo 4735; GFX1032-NEXT: v_mov_b32_e32 v4, s4 4736; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4737; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 4738; GFX1032-NEXT: ds_min_rtn_u32 v0, v7, v4 4739; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 4740; GFX1032-NEXT: buffer_gl0_inv 4741; GFX1032-NEXT: BB23_2: 4742; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 4743; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 4744; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 4745; GFX1032-NEXT: v_mov_b32_e32 v0, v3 4746; GFX1032-NEXT: v_min_u32_e32 v0, s3, v0 4747; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 4748; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 4749; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 4750; GFX1032-NEXT: s_endpgm 4751entry: 4752 %lane = call i32 @llvm.amdgcn.workitem.id.x() 4753 %old = atomicrmw umin i32 addrspace(3)* @local_var32, i32 %lane acq_rel 4754 store i32 %old, i32 addrspace(1)* %out 4755 ret void 4756} 4757 4758define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) { 4759; 4760; 4761; GFX7LESS-LABEL: umin_i64_constant: 4762; GFX7LESS: ; %bb.0: ; %entry 4763; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 4764; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 4765; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi,
v0 4766; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4767; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 4768; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc 4769; GFX7LESS-NEXT: s_cbranch_execz BB24_2 4770; GFX7LESS-NEXT: ; %bb.1: 4771; GFX7LESS-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 4772; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5 4773; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 4774; GFX7LESS-NEXT: s_mov_b32 m0, -1 4775; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 4776; GFX7LESS-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1] 4777; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 4778; GFX7LESS-NEXT: BB24_2: 4779; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] 4780; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) 4781; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 4782; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 4783; GFX7LESS-NEXT: s_mov_b32 s2, -1 4784; GFX7LESS-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc 4785; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 4786; GFX7LESS-NEXT: v_mov_b32_e32 v2, s5 4787; GFX7LESS-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1] 4788; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 4789; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4 4790; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 4791; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 4792; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4793; GFX7LESS-NEXT: s_endpgm 4794; 4795; GFX8-LABEL: umin_i64_constant: 4796; GFX8: ; %bb.0: ; %entry 4797; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4798; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4799; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 4800; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4801; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 4802; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc 4803; GFX8-NEXT: s_cbranch_execz BB24_2 4804; GFX8-NEXT: ; %bb.1: 4805; GFX8-NEXT: v_mov_b32_e32 v0, 5 4806; GFX8-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 4807; GFX8-NEXT: v_mov_b32_e32 v1, 0 4808; GFX8-NEXT: s_mov_b32 m0, -1 4809; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4810; GFX8-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1] 4811;
GFX8-NEXT: s_waitcnt lgkmcnt(0) 4812; GFX8-NEXT: BB24_2: 4813; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] 4814; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4815; GFX8-NEXT: v_readfirstlane_b32 s5, v1 4816; GFX8-NEXT: v_readfirstlane_b32 s4, v0 4817; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc 4818; GFX8-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 4819; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1] 4820; GFX8-NEXT: v_mov_b32_e32 v2, s5 4821; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 4822; GFX8-NEXT: v_mov_b32_e32 v2, s4 4823; GFX8-NEXT: s_mov_b32 s2, -1 4824; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 4825; GFX8-NEXT: s_mov_b32 s3, 0xf000 4826; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4827; GFX8-NEXT: s_endpgm 4828; 4829; GFX9-LABEL: umin_i64_constant: 4830; GFX9: ; %bb.0: ; %entry 4831; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4832; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 4833; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 4834; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4835; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 4836; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 4837; GFX9-NEXT: s_cbranch_execz BB24_2 4838; GFX9-NEXT: ; %bb.1: 4839; GFX9-NEXT: v_mov_b32_e32 v0, 5 4840; GFX9-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 4841; GFX9-NEXT: v_mov_b32_e32 v1, 0 4842; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4843; GFX9-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1] 4844; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4845; GFX9-NEXT: BB24_2: 4846; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 4847; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4848; GFX9-NEXT: v_readfirstlane_b32 s5, v1 4849; GFX9-NEXT: v_readfirstlane_b32 s4, v0 4850; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc 4851; GFX9-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 4852; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1] 4853; GFX9-NEXT: v_mov_b32_e32 v2, s5 4854; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 4855; GFX9-NEXT: v_mov_b32_e32 v2, s4 4856; GFX9-NEXT: s_mov_b32 s2, -1 4857; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 4858; GFX9-NEXT: s_mov_b32
s3, 0xf000 4859; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4860; GFX9-NEXT: s_endpgm 4861; 4862; GFX1064-LABEL: umin_i64_constant: 4863; GFX1064: ; %bb.0: ; %entry 4864; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4865; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 4866; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0 4867; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 4868; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 4869; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc 4870; GFX1064-NEXT: s_cbranch_execz BB24_2 4871; GFX1064-NEXT: ; %bb.1: 4872; GFX1064-NEXT: v_mov_b32_e32 v0, 5 4873; GFX1064-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 4874; GFX1064-NEXT: v_mov_b32_e32 v1, 0 4875; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4876; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 4877; GFX1064-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1] 4878; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 4879; GFX1064-NEXT: buffer_gl0_inv 4880; GFX1064-NEXT: BB24_2: 4881; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 4882; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] 4883; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 4884; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 4885; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc 4886; GFX1064-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc 4887; GFX1064-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1] 4888; GFX1064-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc 4889; GFX1064-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc 4890; GFX1064-NEXT: s_mov_b32 s2, -1 4891; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 4892; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 4893; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4894; GFX1064-NEXT: s_endpgm 4895; 4896; GFX1032-LABEL: umin_i64_constant: 4897; GFX1032: ; %bb.0: ; %entry 4898; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4899; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 4900; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 4901; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 4902; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo 4903; GFX1032-NEXT:
s_cbranch_execz BB24_2 4904; GFX1032-NEXT: ; %bb.1: 4905; GFX1032-NEXT: v_mov_b32_e32 v0, 5 4906; GFX1032-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo 4907; GFX1032-NEXT: v_mov_b32_e32 v1, 0 4908; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4909; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 4910; GFX1032-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1] 4911; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 4912; GFX1032-NEXT: buffer_gl0_inv 4913; GFX1032-NEXT: BB24_2: 4914; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 4915; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 4916; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 4917; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 4918; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo 4919; GFX1032-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc_lo 4920; GFX1032-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[2:3], v[0:1] 4921; GFX1032-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo 4922; GFX1032-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo 4923; GFX1032-NEXT: s_mov_b32 s2, -1 4924; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 4925; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 4926; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4927; GFX1032-NEXT: s_endpgm 4928entry: 4929 %old = atomicrmw umin i64 addrspace(3)* @local_var64, i64 5 acq_rel 4930 store i64 %old, i64 addrspace(1)* %out 4931 ret void 4932} 4933