; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-- -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX7 %s
; RUN: llc -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s
; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064 %s
; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032 %s

declare i1 @llvm.amdgcn.wqm.vote(i1)
declare i32 @llvm.amdgcn.raw.buffer.atomic.add(i32, <4 x i32>, i32, i32, i32 immarg)
declare void @llvm.amdgcn.raw.buffer.store.f32(float, <4 x i32>, i32, i32, i32 immarg)

; Show what the atomic optimization pass will do for raw buffers.
;
; Every llc invocation above passes -amdgpu-atomic-optimizations=true and is
; verified under its own check prefix (GFX7, GFX8, GFX9, GFX1064, GFX1032).
; The two gfx1010 invocations differ only in their wavefrontsize32 /
; wavefrontsize64 attributes.
;
; The assertion lines are generated output: regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.

; add_i32_constant: every lane passes the same constant (5) to
; raw.buffer.atomic.add on the %inout descriptor. The atomic sits between two
; wqm.vote(i1 true) calls; the and of both votes guards the store of the
; returned value (bitcast to float) to the %out descriptor.
;
; In the expected output of all five prefixes the buffer_atomic_add is placed
; in a block guarded by v_mbcnt + v_cmp_eq_u32 against 0, its operand is
; s_bcnt1 of the saved exec mask multiplied by 5, and the stored value is
; rebuilt afterwards with v_readfirstlane_b32 followed by v_mad_u32_u24
; (mbcnt * 5 + value read back).
define amdgpu_ps void @add_i32_constant(<4 x i32> inreg %out, <4 x i32> inreg %inout) {
; GFX7-LABEL: add_i32_constant:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b64 s[10:11], exec
; GFX7-NEXT: ; implicit-def: $vgpr0
; GFX7-NEXT: s_and_saveexec_b64 s[8:9], s[10:11]
; GFX7-NEXT: s_cbranch_execz BB0_4
; GFX7-NEXT: ; %bb.1:
; GFX7-NEXT: s_mov_b64 s[12:13], exec
; GFX7-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s12, 0
; GFX7-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s13, v0
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX7-NEXT: ; implicit-def: $vgpr1
; GFX7-NEXT: s_and_saveexec_b64 s[10:11], vcc
; GFX7-NEXT: s_cbranch_execz BB0_3
; GFX7-NEXT: ; %bb.2:
; GFX7-NEXT: s_bcnt1_i32_b64 s12, s[12:13]
; GFX7-NEXT: v_mul_u32_u24_e64 v1, s12, 5
; GFX7-NEXT: buffer_atomic_add v1, off, s[4:7], 0 glc
; GFX7-NEXT: BB0_3:
; GFX7-NEXT: s_or_b64 exec, exec, s[10:11]
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_readfirstlane_b32 s4, v1
; GFX7-NEXT: v_mad_u32_u24 v0, v0, 5, s4
; GFX7-NEXT: BB0_4: ; %Flow
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_wqm_b64 s[4:5], -1
; GFX7-NEXT: s_andn2_b64 vcc, exec, s[4:5]
; GFX7-NEXT: s_cbranch_vccnz BB0_6
; GFX7-NEXT: ; %bb.5: ; %if
; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX7-NEXT: BB0_6: ; %UnifiedReturnBlock
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: add_i32_constant:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_mov_b64 s[10:11], exec
; GFX8-NEXT: ; implicit-def: $vgpr0
; GFX8-NEXT: s_and_saveexec_b64 s[8:9], s[10:11]
; GFX8-NEXT: s_cbranch_execz BB0_4
; GFX8-NEXT: ; %bb.1:
; GFX8-NEXT: s_mov_b64 s[12:13], exec
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s12, 0
; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s13, v0
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX8-NEXT: ; implicit-def: $vgpr1
; GFX8-NEXT: s_and_saveexec_b64 s[10:11], vcc
; GFX8-NEXT: s_cbranch_execz BB0_3
; GFX8-NEXT: ; %bb.2:
; GFX8-NEXT: s_bcnt1_i32_b64 s12, s[12:13]
; GFX8-NEXT: v_mul_u32_u24_e64 v1, s12, 5
; GFX8-NEXT: buffer_atomic_add v1, off, s[4:7], 0 glc
; GFX8-NEXT: BB0_3:
; GFX8-NEXT: s_or_b64 exec, exec, s[10:11]
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s4, v1
; GFX8-NEXT: v_mad_u32_u24 v0, v0, 5, s4
; GFX8-NEXT: BB0_4: ; %Flow
; GFX8-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX8-NEXT: s_wqm_b64 s[4:5], -1
; GFX8-NEXT: s_andn2_b64 vcc, exec, s[4:5]
; GFX8-NEXT: s_cbranch_vccnz BB0_6
; GFX8-NEXT: ; %bb.5: ; %if
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: BB0_6: ; %UnifiedReturnBlock
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: add_i32_constant:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_mov_b64 s[10:11], exec
; GFX9-NEXT: ; implicit-def: $vgpr0
; GFX9-NEXT: s_and_saveexec_b64 s[8:9], s[10:11]
; GFX9-NEXT: s_cbranch_execz BB0_4
; GFX9-NEXT: ; %bb.1:
; GFX9-NEXT: s_mov_b64 s[12:13], exec
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s12, 0
; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s13, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: ; implicit-def: $vgpr1
; GFX9-NEXT: s_and_saveexec_b64 s[10:11], vcc
; GFX9-NEXT: s_cbranch_execz BB0_3
; GFX9-NEXT: ; %bb.2:
; GFX9-NEXT: s_bcnt1_i32_b64 s12, s[12:13]
; GFX9-NEXT: v_mul_u32_u24_e64 v1, s12, 5
; GFX9-NEXT: buffer_atomic_add v1, off, s[4:7], 0 glc
; GFX9-NEXT: BB0_3:
; GFX9-NEXT: s_or_b64 exec, exec, s[10:11]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s4, v1
; GFX9-NEXT: v_mad_u32_u24 v0, v0, 5, s4
; GFX9-NEXT: BB0_4: ; %Flow
; GFX9-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX9-NEXT: s_wqm_b64 s[4:5], -1
; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5]
; GFX9-NEXT: s_cbranch_vccnz BB0_6
; GFX9-NEXT: ; %bb.5: ; %if
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: BB0_6: ; %UnifiedReturnBlock
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: add_i32_constant:
; GFX1064: ; %bb.0: ; %entry
; GFX1064-NEXT: s_mov_b64 s[10:11], exec
; GFX1064-NEXT: ; implicit-def: $vgpr0
; GFX1064-NEXT: s_and_saveexec_b64 s[8:9], s[10:11]
; GFX1064-NEXT: s_cbranch_execz BB0_4
; GFX1064-NEXT: ; %bb.1:
; GFX1064-NEXT: s_mov_b64 s[12:13], exec
; GFX1064-NEXT: ; implicit-def: $vgpr1
; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s12, 0
; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s13, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT: s_and_saveexec_b64 s[30:31], vcc
; GFX1064-NEXT: s_cbranch_execz BB0_3
; GFX1064-NEXT: ; %bb.2:
; GFX1064-NEXT: s_bcnt1_i32_b64 s12, s[12:13]
; GFX1064-NEXT: v_mul_u32_u24_e64 v1, s12, 5
; GFX1064-NEXT: buffer_atomic_add v1, off, s[4:7], 0 glc
; GFX1064-NEXT: BB0_3:
; GFX1064-NEXT: v_nop
; GFX1064-NEXT: s_or_b64 exec, exec, s[30:31]
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_readfirstlane_b32 s4, v1
; GFX1064-NEXT: v_mad_u32_u24 v0, v0, 5, s4
; GFX1064-NEXT: BB0_4: ; %Flow
; GFX1064-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX1064-NEXT: s_wqm_b64 s[4:5], -1
; GFX1064-NEXT: s_andn2_b64 vcc, exec, s[4:5]
; GFX1064-NEXT: s_cbranch_vccnz BB0_6
; GFX1064-NEXT: ; %bb.5: ; %if
; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX1064-NEXT: BB0_6: ; %UnifiedReturnBlock
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: add_i32_constant:
; GFX1032: ; %bb.0: ; %entry
; GFX1032-NEXT: s_mov_b32 s9, exec_lo
; GFX1032-NEXT: ; implicit-def: $vgpr0
; GFX1032-NEXT: ; implicit-def: $vcc_hi
; GFX1032-NEXT: s_and_saveexec_b32 s8, s9
; GFX1032-NEXT: s_cbranch_execz BB0_4
; GFX1032-NEXT: ; %bb.1:
; GFX1032-NEXT: s_mov_b32 s10, exec_lo
; GFX1032-NEXT: ; implicit-def: $vgpr1
; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s10, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: s_and_saveexec_b32 s9, vcc_lo
; GFX1032-NEXT: s_cbranch_execz BB0_3
; GFX1032-NEXT: ; %bb.2:
; GFX1032-NEXT: s_bcnt1_i32_b32 s10, s10
; GFX1032-NEXT: v_mul_u32_u24_e64 v1, s10, 5
; GFX1032-NEXT: buffer_atomic_add v1, off, s[4:7], 0 glc
; GFX1032-NEXT: BB0_3:
; GFX1032-NEXT: v_nop
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s9
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_readfirstlane_b32 s4, v1
; GFX1032-NEXT: v_mad_u32_u24 v0, v0, 5, s4
; GFX1032-NEXT: BB0_4: ; %Flow
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s8
; GFX1032-NEXT: s_wqm_b32 s4, -1
; GFX1032-NEXT: s_andn2_b32 vcc_lo, exec_lo, s4
; GFX1032-NEXT: s_cbranch_vccnz BB0_6
; GFX1032-NEXT: ; %bb.5: ; %if
; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX1032-NEXT: BB0_6: ; %UnifiedReturnBlock
; GFX1032-NEXT: s_endpgm
entry:
  ; First vote, taken before the atomic.
  %cond1 = call i1 @llvm.amdgcn.wqm.vote(i1 true)
  ; The atomic under test: constant addend 5; %old is the call's result.
  %old = call i32 @llvm.amdgcn.raw.buffer.atomic.add(i32 5, <4 x i32> %inout, i32 0, i32 0, i32 0)
  ; Second vote, taken after the atomic; both votes must be true to reach %if.
  %cond2 = call i1 @llvm.amdgcn.wqm.vote(i1 true)
  %cond = and i1 %cond1, %cond2
  br i1 %cond, label %if, label %else
if:
  ; Reinterpret the i32 result as float because the store intrinsic is the f32 variant.
  %bitcast = bitcast i32 %old to float
  call void @llvm.amdgcn.raw.buffer.store.f32(float %bitcast, <4 x i32> %out, i32 0, i32 0, i32 0)
  ret void
else:
  ret void
}

; add_i32_varying: same structure as add_i32_constant, but the addend is the
; non-inreg argument %val instead of a constant.
;
; Expected output, as recorded below:
;   * The GFX7 run issues buffer_atomic_add directly on v0, with no scan and
;     no single-lane guard. NOTE(review): presumably because that target has
;     no DPP instructions - confirm against the pass.
;   * The GFX8 and GFX9 runs perform a v_add_u32_dpp sequence (row_shr 1/2/4/8,
;     then row_bcast 15 and 31), read lane 63 into s12 as the atomic operand,
;     and add the value read back by v_readfirstlane_b32 to the wave_shr:1
;     shifted scan result.
;   * The GFX1064 and GFX1032 runs use v_add_nc_u32_dpp with v_permlanex16_b32
;     and v_readlane_b32 / v_writelane_b32 fix-ups; the atomic operand is read
;     from lane 63 (GFX1064) or lane 31 (GFX1032).
define amdgpu_ps void @add_i32_varying(<4 x i32> inreg %out, <4 x i32> inreg %inout, i32 %val) {
; GFX7-LABEL: add_i32_varying:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_wqm_b64 s[8:9], -1
; GFX7-NEXT: buffer_atomic_add v0, off, s[4:7], 0 glc
; GFX7-NEXT: s_andn2_b64 vcc, exec, s[8:9]
; GFX7-NEXT: s_cbranch_vccnz BB1_2
; GFX7-NEXT: ; %bb.1: ; %if
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX7-NEXT: BB1_2: ; %else
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: add_i32_varying:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_mov_b64 s[10:11], exec
; GFX8-NEXT: ; implicit-def: $vgpr3
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_and_saveexec_b64 s[8:9], s[10:11]
; GFX8-NEXT: s_cbranch_execz BB1_4
; GFX8-NEXT: ; %bb.1:
; GFX8-NEXT: s_mov_b64 s[10:11], exec
; GFX8-NEXT: s_or_saveexec_b64 s[12:13], -1
; GFX8-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NEXT: s_mov_b64 exec, s[12:13]
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s10, 0
; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s11, v0
; GFX8-NEXT: s_not_b64 exec, exec
; GFX8-NEXT: v_mov_b32_e32 v2, 0
; GFX8-NEXT: s_not_b64 exec, exec
; GFX8-NEXT: s_or_saveexec_b64 s[10:11], -1
; GFX8-NEXT: s_nop 0
; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
; GFX8-NEXT: s_nop 1
; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
; GFX8-NEXT: s_nop 1
; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
; GFX8-NEXT: s_nop 1
; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
; GFX8-NEXT: s_nop 1
; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
; GFX8-NEXT: s_nop 1
; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
; GFX8-NEXT: v_readlane_b32 s12, v2, 63
; GFX8-NEXT: s_nop 0
; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX8-NEXT: s_mov_b64 exec, s[10:11]
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX8-NEXT: ; implicit-def: $vgpr0
; GFX8-NEXT: s_and_saveexec_b64 s[10:11], vcc
; GFX8-NEXT: s_cbranch_execz BB1_3
; GFX8-NEXT: ; %bb.2:
; GFX8-NEXT: v_mov_b32_e32 v0, s12
; GFX8-NEXT: buffer_atomic_add v0, off, s[4:7], 0 glc
; GFX8-NEXT: BB1_3:
; GFX8-NEXT: s_or_b64 exec, exec, s[10:11]
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s4, v0
; GFX8-NEXT: v_mov_b32_e32 v0, v1
; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v0
; GFX8-NEXT: BB1_4: ; %Flow
; GFX8-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX8-NEXT: s_wqm_b64 s[4:5], -1
; GFX8-NEXT: s_andn2_b64 vcc, exec, s[4:5]
; GFX8-NEXT: s_cbranch_vccnz BB1_6
; GFX8-NEXT: ; %bb.5: ; %if
; GFX8-NEXT: buffer_store_dword v3, off, s[0:3], 0
; GFX8-NEXT: BB1_6: ; %UnifiedReturnBlock
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: add_i32_varying:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_mov_b64 s[10:11], exec
; GFX9-NEXT: ; implicit-def: $vgpr3
; GFX9-NEXT: v_mov_b32_e32 v2, v0
; GFX9-NEXT: s_and_saveexec_b64 s[8:9], s[10:11]
; GFX9-NEXT: s_cbranch_execz BB1_4
; GFX9-NEXT: ; %bb.1:
; GFX9-NEXT: s_mov_b64 s[10:11], exec
; GFX9-NEXT: s_or_saveexec_b64 s[12:13], -1
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_mov_b64 exec, s[12:13]
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s10, 0
; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s11, v0
; GFX9-NEXT: s_not_b64 exec, exec
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_not_b64 exec, exec
; GFX9-NEXT: s_or_saveexec_b64 s[10:11], -1
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
; GFX9-NEXT: s_nop 1
; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
; GFX9-NEXT: s_nop 1
; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
; GFX9-NEXT: s_nop 1
; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
; GFX9-NEXT: s_nop 1
; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
; GFX9-NEXT: s_nop 1
; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
; GFX9-NEXT: v_readlane_b32 s12, v2, 63
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX9-NEXT: s_mov_b64 exec, s[10:11]
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: ; implicit-def: $vgpr0
; GFX9-NEXT: s_and_saveexec_b64 s[10:11], vcc
; GFX9-NEXT: s_cbranch_execz BB1_3
; GFX9-NEXT: ; %bb.2:
; GFX9-NEXT: v_mov_b32_e32 v0, s12
; GFX9-NEXT: buffer_atomic_add v0, off, s[4:7], 0 glc
; GFX9-NEXT: BB1_3:
; GFX9-NEXT: s_or_b64 exec, exec, s[10:11]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s4, v0
; GFX9-NEXT: v_mov_b32_e32 v0, v1
; GFX9-NEXT: v_add_u32_e32 v3, s4, v0
; GFX9-NEXT: BB1_4: ; %Flow
; GFX9-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX9-NEXT: s_wqm_b64 s[4:5], -1
; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5]
; GFX9-NEXT: s_cbranch_vccnz BB1_6
; GFX9-NEXT: ; %bb.5: ; %if
; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], 0
; GFX9-NEXT: BB1_6: ; %UnifiedReturnBlock
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: add_i32_varying:
; GFX1064: ; %bb.0: ; %entry
; GFX1064-NEXT: s_mov_b64 s[10:11], exec
; GFX1064-NEXT: ; implicit-def: $vgpr4
; GFX1064-NEXT: v_mov_b32_e32 v2, v0
; GFX1064-NEXT: s_and_saveexec_b64 s[8:9], s[10:11]
; GFX1064-NEXT: s_cbranch_execz BB1_4
; GFX1064-NEXT: ; %bb.1:
; GFX1064-NEXT: s_mov_b64 s[10:11], exec
; GFX1064-NEXT: s_or_saveexec_b64 s[12:13], -1
; GFX1064-NEXT: v_mov_b32_e32 v1, 0
; GFX1064-NEXT: s_mov_b64 exec, s[12:13]
; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s10, 0
; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s11, v0
; GFX1064-NEXT: s_not_b64 exec, exec
; GFX1064-NEXT: v_mov_b32_e32 v2, 0
; GFX1064-NEXT: s_not_b64 exec, exec
; GFX1064-NEXT: s_or_saveexec_b64 s[10:11], -1
; GFX1064-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
; GFX1064-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
; GFX1064-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
; GFX1064-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
; GFX1064-NEXT: v_mov_b32_e32 v3, v2
; GFX1064-NEXT: v_permlanex16_b32 v3, v3, -1, -1
; GFX1064-NEXT: v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1064-NEXT: v_readlane_b32 s12, v2, 31
; GFX1064-NEXT: v_mov_b32_e32 v3, s12
; GFX1064-NEXT: v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
; GFX1064-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1064-NEXT: v_readlane_b32 s12, v2, 15
; GFX1064-NEXT: v_readlane_b32 s13, v2, 31
; GFX1064-NEXT: v_writelane_b32 v1, s12, 16
; GFX1064-NEXT: v_readlane_b32 s12, v2, 63
; GFX1064-NEXT: v_writelane_b32 v1, s13, 32
; GFX1064-NEXT: v_readlane_b32 s13, v2, 47
; GFX1064-NEXT: v_writelane_b32 v1, s13, 48
; GFX1064-NEXT: s_mov_b64 exec, s[10:11]
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT: ; implicit-def: $vgpr0
; GFX1064-NEXT: s_and_saveexec_b64 s[30:31], vcc
; GFX1064-NEXT: s_cbranch_execz BB1_3
; GFX1064-NEXT: ; %bb.2:
; GFX1064-NEXT: v_mov_b32_e32 v0, s12
; GFX1064-NEXT: buffer_atomic_add v0, off, s[4:7], 0 glc
; GFX1064-NEXT: BB1_3:
; GFX1064-NEXT: v_nop
; GFX1064-NEXT: s_or_b64 exec, exec, s[30:31]
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_readfirstlane_b32 s4, v0
; GFX1064-NEXT: v_mov_b32_e32 v0, v1
; GFX1064-NEXT: v_add_nc_u32_e32 v4, s4, v0
; GFX1064-NEXT: BB1_4: ; %Flow
; GFX1064-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX1064-NEXT: s_wqm_b64 s[4:5], -1
; GFX1064-NEXT: s_andn2_b64 vcc, exec, s[4:5]
; GFX1064-NEXT: s_cbranch_vccnz BB1_6
; GFX1064-NEXT: ; %bb.5: ; %if
; GFX1064-NEXT: buffer_store_dword v4, off, s[0:3], 0
; GFX1064-NEXT: BB1_6: ; %UnifiedReturnBlock
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: add_i32_varying:
; GFX1032: ; %bb.0: ; %entry
; GFX1032-NEXT: s_mov_b32 s9, exec_lo
; GFX1032-NEXT: ; implicit-def: $vgpr4
; GFX1032-NEXT: ; implicit-def: $vcc_hi
; GFX1032-NEXT: v_mov_b32_e32 v2, v0
; GFX1032-NEXT: s_and_saveexec_b32 s8, s9
; GFX1032-NEXT: s_cbranch_execz BB1_4
; GFX1032-NEXT: ; %bb.1:
; GFX1032-NEXT: s_mov_b32 s9, exec_lo
; GFX1032-NEXT: s_or_saveexec_b32 s10, -1
; GFX1032-NEXT: v_mov_b32_e32 v1, 0
; GFX1032-NEXT: s_mov_b32 exec_lo, s10
; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s9, 0
; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1032-NEXT: v_mov_b32_e32 v2, 0
; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1032-NEXT: s_or_saveexec_b32 s9, -1
; GFX1032-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
; GFX1032-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0
; GFX1032-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:0
; GFX1032-NEXT: v_add_nc_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:0
; GFX1032-NEXT: v_mov_b32_e32 v3, v2
; GFX1032-NEXT: v_permlanex16_b32 v3, v3, -1, -1
; GFX1032-NEXT: v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1032-NEXT: v_readlane_b32 s10, v2, 31
; GFX1032-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT: v_readlane_b32 s11, v2, 15
; GFX1032-NEXT: v_writelane_b32 v1, s11, 16
; GFX1032-NEXT: s_mov_b32 exec_lo, s9
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: ; implicit-def: $vgpr0
; GFX1032-NEXT: s_and_saveexec_b32 s9, vcc_lo
; GFX1032-NEXT: s_cbranch_execz BB1_3
; GFX1032-NEXT: ; %bb.2:
; GFX1032-NEXT: v_mov_b32_e32 v0, s10
; GFX1032-NEXT: buffer_atomic_add v0, off, s[4:7], 0 glc
; GFX1032-NEXT: BB1_3:
; GFX1032-NEXT: v_nop
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s9
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_readfirstlane_b32 s4, v0
; GFX1032-NEXT: v_mov_b32_e32 v0, v1
; GFX1032-NEXT: v_add_nc_u32_e32 v4, s4, v0
; GFX1032-NEXT: BB1_4: ; %Flow
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s8
; GFX1032-NEXT: s_wqm_b32 s4, -1
; GFX1032-NEXT: s_andn2_b32 vcc_lo, exec_lo, s4
; GFX1032-NEXT: s_cbranch_vccnz BB1_6
; GFX1032-NEXT: ; %bb.5: ; %if
; GFX1032-NEXT: buffer_store_dword v4, off, s[0:3], 0
; GFX1032-NEXT: BB1_6: ; %UnifiedReturnBlock
; GFX1032-NEXT: s_endpgm
entry:
  ; First vote, taken before the atomic.
  %cond1 = call i1 @llvm.amdgcn.wqm.vote(i1 true)
  ; The atomic under test: the addend is the function argument %val; %old is the call's result.
  %old = call i32 @llvm.amdgcn.raw.buffer.atomic.add(i32 %val, <4 x i32> %inout, i32 0, i32 0, i32 0)
  ; Second vote, taken after the atomic; both votes must be true to reach %if.
  %cond2 = call i1 @llvm.amdgcn.wqm.vote(i1 true)
  %cond = and i1 %cond1, %cond2
  br i1 %cond, label %if, label %else
if:
  ; Reinterpret the i32 result as float because the store intrinsic is the f32 variant.
  %bitcast = bitcast i32 %old to float
  call void @llvm.amdgcn.raw.buffer.store.f32(float %bitcast, <4 x i32> %out, i32 0, i32 0, i32 0)
  ret void
else:
  ret void
}