; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-- -amdgpu-atomic-optimizations=true -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefixes=GFX7 %s
; RUN: llc -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s
; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064 %s
; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032 %s

declare i1 @llvm.amdgcn.wqm.vote(i1)
declare i32 @llvm.amdgcn.raw.buffer.atomic.add(i32, <4 x i32>, i32, i32, i32 immarg)
declare void @llvm.amdgcn.raw.buffer.store.f32(float, <4 x i32>, i32, i32, i32 immarg)

; Show what the atomic optimization pass will do for raw buffers.

; Uniform addend: every lane adds the constant 5 to the buffer %inout.
; The checks show the optimized shape on every target: the lane whose
; v_mbcnt result is 0 issues one buffer_atomic_add of s_bcnt1(copy of exec) * 5,
; and each lane then rebuilds its own return value from
; v_readfirstlane(old) and mbcnt * 5 with v_mad_u32_u24.
; The store of the result to %out is guarded by the AND of two
; wqm.vote(true) calls placed before and after the atomic.
define amdgpu_ps void @add_i32_constant(<4 x i32> inreg %out, <4 x i32> inreg %inout) {
; GFX7-LABEL: add_i32_constant:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_mov_b64 s[10:11], exec
; GFX7-NEXT:    ; implicit-def: $vgpr0
; GFX7-NEXT:    s_and_saveexec_b64 s[8:9], s[10:11]
; GFX7-NEXT:    s_cbranch_execz BB0_4
; GFX7-NEXT:  ; %bb.1:
; GFX7-NEXT:    s_mov_b64 s[12:13], exec
; GFX7-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s12, 0
; GFX7-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s13, v0
; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX7-NEXT:    ; implicit-def: $vgpr1
; GFX7-NEXT:    s_and_saveexec_b64 s[10:11], vcc
; GFX7-NEXT:    s_cbranch_execz BB0_3
; GFX7-NEXT:  ; %bb.2:
; GFX7-NEXT:    s_bcnt1_i32_b64 s12, s[12:13]
; GFX7-NEXT:    s_mul_i32 s12, s12, 5
; GFX7-NEXT:    v_mov_b32_e32 v1, s12
; GFX7-NEXT:    buffer_atomic_add v1, off, s[4:7], 0 glc
; GFX7-NEXT:  BB0_3:
; GFX7-NEXT:    s_or_b64 exec, exec, s[10:11]
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    v_readfirstlane_b32 s4, v1
; GFX7-NEXT:    v_mad_u32_u24 v0, v0, 5, s4
; GFX7-NEXT:  BB0_4: ; %Flow
; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
; GFX7-NEXT:    s_wqm_b64 s[4:5], -1
; GFX7-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
; GFX7-NEXT:    s_cbranch_vccnz BB0_6
; GFX7-NEXT:  ; %bb.5: ; %if
; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX7-NEXT:  BB0_6: ; %UnifiedReturnBlock
; GFX7-NEXT:    s_endpgm
;
; GFX8-LABEL: add_i32_constant:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_mov_b64 s[10:11], exec
; GFX8-NEXT:    ; implicit-def: $vgpr0
; GFX8-NEXT:    s_and_saveexec_b64 s[8:9], s[10:11]
; GFX8-NEXT:    s_cbranch_execz BB0_4
; GFX8-NEXT:  ; %bb.1:
; GFX8-NEXT:    s_mov_b64 s[12:13], exec
; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s12, 0
; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s13, v0
; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX8-NEXT:    ; implicit-def: $vgpr1
; GFX8-NEXT:    s_and_saveexec_b64 s[10:11], vcc
; GFX8-NEXT:    s_cbranch_execz BB0_3
; GFX8-NEXT:  ; %bb.2:
; GFX8-NEXT:    s_bcnt1_i32_b64 s12, s[12:13]
; GFX8-NEXT:    s_mul_i32 s12, s12, 5
; GFX8-NEXT:    v_mov_b32_e32 v1, s12
; GFX8-NEXT:    buffer_atomic_add v1, off, s[4:7], 0 glc
; GFX8-NEXT:  BB0_3:
; GFX8-NEXT:    s_or_b64 exec, exec, s[10:11]
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_readfirstlane_b32 s4, v1
; GFX8-NEXT:    v_mad_u32_u24 v0, v0, 5, s4
; GFX8-NEXT:  BB0_4: ; %Flow
; GFX8-NEXT:    s_or_b64 exec, exec, s[8:9]
; GFX8-NEXT:    s_wqm_b64 s[4:5], -1
; GFX8-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
; GFX8-NEXT:    s_cbranch_vccnz BB0_6
; GFX8-NEXT:  ; %bb.5: ; %if
; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT:  BB0_6: ; %UnifiedReturnBlock
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: add_i32_constant:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_mov_b64 s[10:11], exec
; GFX9-NEXT:    ; implicit-def: $vgpr0
; GFX9-NEXT:    s_and_saveexec_b64 s[8:9], s[10:11]
; GFX9-NEXT:    s_cbranch_execz BB0_4
; GFX9-NEXT:  ; %bb.1:
; GFX9-NEXT:    s_mov_b64 s[12:13], exec
; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s12, 0
; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s13, v0
; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT:    ; implicit-def: $vgpr1
; GFX9-NEXT:    s_and_saveexec_b64 s[10:11], vcc
; GFX9-NEXT:    s_cbranch_execz BB0_3
; GFX9-NEXT:  ; %bb.2:
; GFX9-NEXT:    s_bcnt1_i32_b64 s12, s[12:13]
; GFX9-NEXT:    s_mul_i32 s12, s12, 5
; GFX9-NEXT:    v_mov_b32_e32 v1, s12
; GFX9-NEXT:    buffer_atomic_add v1, off, s[4:7], 0 glc
; GFX9-NEXT:  BB0_3:
; GFX9-NEXT:    s_or_b64 exec, exec, s[10:11]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_readfirstlane_b32 s4, v1
; GFX9-NEXT:    v_mad_u32_u24 v0, v0, 5, s4
; GFX9-NEXT:  BB0_4: ; %Flow
; GFX9-NEXT:    s_or_b64 exec, exec, s[8:9]
; GFX9-NEXT:    s_wqm_b64 s[4:5], -1
; GFX9-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
; GFX9-NEXT:    s_cbranch_vccnz BB0_6
; GFX9-NEXT:  ; %bb.5: ; %if
; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT:  BB0_6: ; %UnifiedReturnBlock
; GFX9-NEXT:    s_endpgm
;
; GFX1064-LABEL: add_i32_constant:
; GFX1064:       ; %bb.0: ; %entry
; GFX1064-NEXT:    s_mov_b64 s[10:11], exec
; GFX1064-NEXT:    ; implicit-def: $vgpr0
; GFX1064-NEXT:    s_and_saveexec_b64 s[8:9], s[10:11]
; GFX1064-NEXT:    s_cbranch_execz BB0_4
; GFX1064-NEXT:  ; %bb.1:
; GFX1064-NEXT:    s_mov_b64 s[12:13], exec
; GFX1064-NEXT:    ; implicit-def: $vgpr1
; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s12, 0
; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, s13, v0
; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT:    s_and_saveexec_b64 s[28:29], vcc
; GFX1064-NEXT:    s_cbranch_execz BB0_3
; GFX1064-NEXT:  ; %bb.2:
; GFX1064-NEXT:    s_bcnt1_i32_b64 s12, s[12:13]
; GFX1064-NEXT:    s_mul_i32 s12, s12, 5
; GFX1064-NEXT:    v_mov_b32_e32 v1, s12
; GFX1064-NEXT:    buffer_atomic_add v1, off, s[4:7], 0 glc
; GFX1064-NEXT:  BB0_3:
; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
; GFX1064-NEXT:    s_or_b64 exec, exec, s[28:29]
; GFX1064-NEXT:    s_waitcnt vmcnt(0)
; GFX1064-NEXT:    v_readfirstlane_b32 s4, v1
; GFX1064-NEXT:    v_mad_u32_u24 v0, v0, 5, s4
; GFX1064-NEXT:  BB0_4: ; %Flow
; GFX1064-NEXT:    s_or_b64 exec, exec, s[8:9]
; GFX1064-NEXT:    s_wqm_b64 s[4:5], -1
; GFX1064-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
; GFX1064-NEXT:    s_cbranch_vccnz BB0_6
; GFX1064-NEXT:  ; %bb.5: ; %if
; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX1064-NEXT:  BB0_6: ; %UnifiedReturnBlock
; GFX1064-NEXT:    s_endpgm
;
; GFX1032-LABEL: add_i32_constant:
; GFX1032:       ; %bb.0: ; %entry
; GFX1032-NEXT:    s_mov_b32 s9, exec_lo
; GFX1032-NEXT:    ; implicit-def: $vgpr0
; GFX1032-NEXT:    s_and_saveexec_b32 s8, s9
; GFX1032-NEXT:    s_cbranch_execz BB0_4
; GFX1032-NEXT:  ; %bb.1:
; GFX1032-NEXT:    s_mov_b32 s10, exec_lo
; GFX1032-NEXT:    ; implicit-def: $vgpr1
; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s10, 0
; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT:    s_and_saveexec_b32 s9, vcc_lo
; GFX1032-NEXT:    s_cbranch_execz BB0_3
; GFX1032-NEXT:  ; %bb.2:
; GFX1032-NEXT:    s_bcnt1_i32_b32 s10, s10
; GFX1032-NEXT:    s_mul_i32 s10, s10, 5
; GFX1032-NEXT:    v_mov_b32_e32 v1, s10
; GFX1032-NEXT:    buffer_atomic_add v1, off, s[4:7], 0 glc
; GFX1032-NEXT:  BB0_3:
; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s9
; GFX1032-NEXT:    s_waitcnt vmcnt(0)
; GFX1032-NEXT:    v_readfirstlane_b32 s4, v1
; GFX1032-NEXT:    v_mad_u32_u24 v0, v0, 5, s4
; GFX1032-NEXT:  BB0_4: ; %Flow
; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s8
; GFX1032-NEXT:    s_wqm_b32 s4, -1
; GFX1032-NEXT:    s_andn2_b32 vcc_lo, exec_lo, s4
; GFX1032-NEXT:    s_cbranch_vccnz BB0_6
; GFX1032-NEXT:  ; %bb.5: ; %if
; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX1032-NEXT:  BB0_6: ; %UnifiedReturnBlock
; GFX1032-NEXT:    s_endpgm
entry:
  %cond1 = call i1 @llvm.amdgcn.wqm.vote(i1 true)
  %old = call i32 @llvm.amdgcn.raw.buffer.atomic.add(i32 5, <4 x i32> %inout, i32 0, i32 0, i32 0)
  %cond2 = call i1 @llvm.amdgcn.wqm.vote(i1 true)
  %cond = and i1 %cond1, %cond2
  br i1 %cond, label %if, label %else
if:
  %bitcast = bitcast i32 %old to float
  call void @llvm.amdgcn.raw.buffer.store.f32(float %bitcast, <4 x i32> %out, i32 0, i32 0, i32 0)
  ret void
else:
  ret void
}

; Per-lane addend: each lane adds its own %val to the buffer %inout.
; For the tonga, gfx900 and gfx1010 runs the checks show a DPP-based scan
; over the wave (v_add_*_dpp, plus v_permlanex16 / v_readlane / v_writelane
; on gfx1010) feeding a single-lane buffer_atomic_add.
; For the default-cpu run the checks show the atomic issued directly with no
; scan -- presumably because the DPP instructions used by the other targets
; are not available there (TODO confirm).
; As in the constant case, the store to %out is guarded by the AND of two
; wqm.vote(true) calls placed before and after the atomic.
define amdgpu_ps void @add_i32_varying(<4 x i32> inreg %out, <4 x i32> inreg %inout, i32 %val) {
; GFX7-LABEL: add_i32_varying:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_wqm_b64 s[8:9], -1
; GFX7-NEXT:    buffer_atomic_add v0, off, s[4:7], 0 glc
; GFX7-NEXT:    s_andn2_b64 vcc, exec, s[8:9]
; GFX7-NEXT:    s_cbranch_vccnz BB1_2
; GFX7-NEXT:  ; %bb.1: ; %if
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX7-NEXT:  BB1_2: ; %else
; GFX7-NEXT:    s_endpgm
;
; GFX8-LABEL: add_i32_varying:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_mov_b64 s[8:9], exec
; GFX8-NEXT:    s_mov_b64 s[10:11], s[8:9]
; GFX8-NEXT:    v_mov_b32_e32 v2, v0
; GFX8-NEXT:    ; implicit-def: $vgpr0
; GFX8-NEXT:    s_and_saveexec_b64 s[8:9], s[10:11]
; GFX8-NEXT:    s_cbranch_execz BB1_4
; GFX8-NEXT:  ; %bb.1:
; GFX8-NEXT:    s_or_saveexec_b64 s[10:11], -1
; GFX8-NEXT:    v_mov_b32_e32 v1, 0
; GFX8-NEXT:    s_mov_b64 exec, s[10:11]
; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX8-NEXT:    s_not_b64 exec, exec
; GFX8-NEXT:    v_mov_b32_e32 v2, 0
; GFX8-NEXT:    s_not_b64 exec, exec
; GFX8-NEXT:    s_or_saveexec_b64 s[10:11], -1
; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX8-NEXT:    s_nop 1
; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX8-NEXT:    s_nop 1
; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX8-NEXT:    s_nop 1
; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX8-NEXT:    s_nop 1
; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
; GFX8-NEXT:    s_nop 1
; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
; GFX8-NEXT:    v_readlane_b32 s12, v2, 63
; GFX8-NEXT:    s_nop 0
; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX8-NEXT:    s_mov_b64 exec, s[10:11]
; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX8-NEXT:    ; implicit-def: $vgpr0
; GFX8-NEXT:    s_and_saveexec_b64 s[10:11], vcc
; GFX8-NEXT:    s_cbranch_execz BB1_3
; GFX8-NEXT:  ; %bb.2:
; GFX8-NEXT:    v_mov_b32_e32 v0, s12
; GFX8-NEXT:    buffer_atomic_add v0, off, s[4:7], 0 glc
; GFX8-NEXT:  BB1_3:
; GFX8-NEXT:    s_or_b64 exec, exec, s[10:11]
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_readfirstlane_b32 s4, v0
; GFX8-NEXT:    v_mov_b32_e32 v0, v1
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v0
; GFX8-NEXT:  BB1_4: ; %Flow
; GFX8-NEXT:    s_or_b64 exec, exec, s[8:9]
; GFX8-NEXT:    s_wqm_b64 s[4:5], -1
; GFX8-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
; GFX8-NEXT:    s_cbranch_vccnz BB1_6
; GFX8-NEXT:  ; %bb.5: ; %if
; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT:  BB1_6: ; %UnifiedReturnBlock
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: add_i32_varying:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_mov_b64 s[8:9], exec
; GFX9-NEXT:    s_mov_b64 s[10:11], s[8:9]
; GFX9-NEXT:    v_mov_b32_e32 v2, v0
; GFX9-NEXT:    ; implicit-def: $vgpr0
; GFX9-NEXT:    s_and_saveexec_b64 s[8:9], s[10:11]
; GFX9-NEXT:    s_cbranch_execz BB1_4
; GFX9-NEXT:  ; %bb.1:
; GFX9-NEXT:    s_or_saveexec_b64 s[10:11], -1
; GFX9-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-NEXT:    s_mov_b64 exec, s[10:11]
; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX9-NEXT:    s_not_b64 exec, exec
; GFX9-NEXT:    v_mov_b32_e32 v2, 0
; GFX9-NEXT:    s_not_b64 exec, exec
; GFX9-NEXT:    s_or_saveexec_b64 s[10:11], -1
; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX9-NEXT:    s_nop 1
; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX9-NEXT:    s_nop 1
; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX9-NEXT:    s_nop 1
; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX9-NEXT:    s_nop 1
; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
; GFX9-NEXT:    s_nop 1
; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
; GFX9-NEXT:    v_readlane_b32 s12, v2, 63
; GFX9-NEXT:    s_nop 0
; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX9-NEXT:    s_mov_b64 exec, s[10:11]
; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT:    ; implicit-def: $vgpr0
; GFX9-NEXT:    s_and_saveexec_b64 s[10:11], vcc
; GFX9-NEXT:    s_cbranch_execz BB1_3
; GFX9-NEXT:  ; %bb.2:
; GFX9-NEXT:    v_mov_b32_e32 v0, s12
; GFX9-NEXT:    buffer_atomic_add v0, off, s[4:7], 0 glc
; GFX9-NEXT:  BB1_3:
; GFX9-NEXT:    s_or_b64 exec, exec, s[10:11]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_readfirstlane_b32 s4, v0
; GFX9-NEXT:    v_mov_b32_e32 v0, v1
; GFX9-NEXT:    v_add_u32_e32 v0, s4, v0
; GFX9-NEXT:  BB1_4: ; %Flow
; GFX9-NEXT:    s_or_b64 exec, exec, s[8:9]
; GFX9-NEXT:    s_wqm_b64 s[4:5], -1
; GFX9-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
; GFX9-NEXT:    s_cbranch_vccnz BB1_6
; GFX9-NEXT:  ; %bb.5: ; %if
; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT:  BB1_6: ; %UnifiedReturnBlock
; GFX9-NEXT:    s_endpgm
;
; GFX1064-LABEL: add_i32_varying:
; GFX1064:       ; %bb.0: ; %entry
; GFX1064-NEXT:    s_mov_b64 s[8:9], exec
; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
; GFX1064-NEXT:    s_mov_b64 s[10:11], s[8:9]
; GFX1064-NEXT:    ; implicit-def: $vgpr0
; GFX1064-NEXT:    s_and_saveexec_b64 s[8:9], s[10:11]
; GFX1064-NEXT:    s_cbranch_execz BB1_4
; GFX1064-NEXT:  ; %bb.1:
; GFX1064-NEXT:    s_not_b64 exec, exec
; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
; GFX1064-NEXT:    s_not_b64 exec, exec
; GFX1064-NEXT:    s_or_saveexec_b64 s[10:11], -1
; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1064-NEXT:    v_readlane_b32 s12, v1, 31
; GFX1064-NEXT:    v_mov_b32_e32 v2, s12
; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
; GFX1064-NEXT:    v_readlane_b32 s12, v1, 15
; GFX1064-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1064-NEXT:    v_readlane_b32 s13, v1, 31
; GFX1064-NEXT:    v_writelane_b32 v3, s12, 16
; GFX1064-NEXT:    s_mov_b64 exec, s[10:11]
; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
; GFX1064-NEXT:    s_or_saveexec_b64 s[10:11], -1
; GFX1064-NEXT:    v_readlane_b32 s12, v1, 63
; GFX1064-NEXT:    v_readlane_b32 s14, v1, 47
; GFX1064-NEXT:    v_writelane_b32 v3, s13, 32
; GFX1064-NEXT:    s_mov_b64 exec, s[10:11]
; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0
; GFX1064-NEXT:    s_or_saveexec_b64 s[10:11], -1
; GFX1064-NEXT:    v_writelane_b32 v3, s14, 48
; GFX1064-NEXT:    s_mov_b64 exec, s[10:11]
; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT:    ; implicit-def: $vgpr0
; GFX1064-NEXT:    s_and_saveexec_b64 s[28:29], vcc
; GFX1064-NEXT:    s_cbranch_execz BB1_3
; GFX1064-NEXT:  ; %bb.2:
; GFX1064-NEXT:    v_mov_b32_e32 v0, s12
; GFX1064-NEXT:    buffer_atomic_add v0, off, s[4:7], 0 glc
; GFX1064-NEXT:  BB1_3:
; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
; GFX1064-NEXT:    s_or_b64 exec, exec, s[28:29]
; GFX1064-NEXT:    s_waitcnt vmcnt(0)
; GFX1064-NEXT:    v_readfirstlane_b32 s4, v0
; GFX1064-NEXT:    v_mov_b32_e32 v0, v3
; GFX1064-NEXT:    v_add_nc_u32_e32 v0, s4, v0
; GFX1064-NEXT:  BB1_4: ; %Flow
; GFX1064-NEXT:    s_or_b64 exec, exec, s[8:9]
; GFX1064-NEXT:    s_wqm_b64 s[4:5], -1
; GFX1064-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
; GFX1064-NEXT:    s_cbranch_vccnz BB1_6
; GFX1064-NEXT:  ; %bb.5: ; %if
; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX1064-NEXT:  BB1_6: ; %UnifiedReturnBlock
; GFX1064-NEXT:    s_endpgm
;
; GFX1032-LABEL: add_i32_varying:
; GFX1032:       ; %bb.0: ; %entry
; GFX1032-NEXT:    s_mov_b32 s8, exec_lo
; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
; GFX1032-NEXT:    s_mov_b32 s9, s8
; GFX1032-NEXT:    ; implicit-def: $vgpr0
; GFX1032-NEXT:    s_and_saveexec_b32 s8, s9
; GFX1032-NEXT:    s_cbranch_execz BB1_4
; GFX1032-NEXT:  ; %bb.1:
; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
; GFX1032-NEXT:    s_or_saveexec_b32 s9, -1
; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1032-NEXT:    v_readlane_b32 s11, v1, 31
; GFX1032-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT:    v_readlane_b32 s10, v1, 15
; GFX1032-NEXT:    s_mov_b32 exec_lo, s9
; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
; GFX1032-NEXT:    s_or_saveexec_b32 s9, -1
; GFX1032-NEXT:    v_writelane_b32 v3, s10, 16
; GFX1032-NEXT:    s_mov_b32 exec_lo, s9
; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT:    ; implicit-def: $vgpr0
; GFX1032-NEXT:    s_and_saveexec_b32 s9, vcc_lo
; GFX1032-NEXT:    s_cbranch_execz BB1_3
; GFX1032-NEXT:  ; %bb.2:
; GFX1032-NEXT:    v_mov_b32_e32 v0, s11
; GFX1032-NEXT:    s_mov_b32 s10, s11
; GFX1032-NEXT:    buffer_atomic_add v0, off, s[4:7], 0 glc
; GFX1032-NEXT:  BB1_3:
; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s9
; GFX1032-NEXT:    s_waitcnt vmcnt(0)
; GFX1032-NEXT:    v_readfirstlane_b32 s4, v0
; GFX1032-NEXT:    v_mov_b32_e32 v0, v3
; GFX1032-NEXT:    v_add_nc_u32_e32 v0, s4, v0
; GFX1032-NEXT:  BB1_4: ; %Flow
; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s8
; GFX1032-NEXT:    s_wqm_b32 s4, -1
; GFX1032-NEXT:    s_andn2_b32 vcc_lo, exec_lo, s4
; GFX1032-NEXT:    s_cbranch_vccnz BB1_6
; GFX1032-NEXT:  ; %bb.5: ; %if
; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX1032-NEXT:  BB1_6: ; %UnifiedReturnBlock
; GFX1032-NEXT:    s_endpgm
entry:
  %cond1 = call i1 @llvm.amdgcn.wqm.vote(i1 true)
  %old = call i32 @llvm.amdgcn.raw.buffer.atomic.add(i32 %val, <4 x i32> %inout, i32 0, i32 0, i32 0)
  %cond2 = call i1 @llvm.amdgcn.wqm.vote(i1 true)
  %cond = and i1 %cond1, %cond2
  br i1 %cond, label %if, label %else
if:
  %bitcast = bitcast i32 %old to float
  call void @llvm.amdgcn.raw.buffer.store.f32(float %bitcast, <4 x i32> %out, i32 0, i32 0, i32 0)
  ret void
else:
  ret void
}