; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9-W64 %s
; RUN: llc -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs -mattr=+wavefrontsize32,-wavefrontsize64 < %s | FileCheck -check-prefixes=GFX10-W32 %s

; Check that WQM isn't triggered by image load/store intrinsics.
define amdgpu_ps <4 x float> @test1(<8 x i32> inreg %rsrc, i32 %c) {
; GFX9-W64-LABEL: test1:
; GFX9-W64:       ; %bb.0: ; %main_body
; GFX9-W64-NEXT:    v_mov_b32_e32 v4, v0
; GFX9-W64-NEXT:    image_load v[0:3], v0, s[0:7] dmask:0xf unorm
; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
; GFX9-W64-NEXT:    image_store v[0:3], v4, s[0:7] dmask:0xf unorm
; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
; GFX9-W64-NEXT:    ; return to shader part epilog
;
; GFX10-W32-LABEL: test1:
; GFX10-W32:       ; %bb.0: ; %main_body
; GFX10-W32-NEXT:    v_mov_b32_e32 v4, v0
; GFX10-W32-NEXT:    image_load v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm
; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
; GFX10-W32-NEXT:    image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm
; GFX10-W32-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-W32-NEXT:    ; return to shader part epilog
main_body:
  %tex = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32 15, i32 %c, <8 x i32> %rsrc, i32 0, i32 0)
  call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> %tex, i32 15, i32 %c, <8 x i32> %rsrc, i32 0, i32 0)
  ret <4 x float> %tex
}

; Check that WQM is triggered by code calculating inputs to image samples and is disabled as soon as possible
define amdgpu_ps <4 x float> @test2(i32 inreg, i32 inreg, i32 inreg, i32 inreg %m0, <8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, <2 x float> %pos) #6 {
; GFX9-W64-LABEL: test2:
; GFX9-W64:       ; %bb.0: ; %main_body
; GFX9-W64-NEXT:    s_mov_b64 s[0:1], exec
; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
; GFX9-W64-NEXT:    s_mov_b32 m0, s3
; GFX9-W64-NEXT:    s_nop 0
; GFX9-W64-NEXT:    v_interp_p1_f32_e32 v2, v0, attr0.x
; GFX9-W64-NEXT:    v_interp_p1_f32_e32 v3, v0, attr0.y
; GFX9-W64-NEXT:    v_interp_p2_f32_e32 v2, v1, attr0.x
; GFX9-W64-NEXT:    v_interp_p2_f32_e32 v3, v1, attr0.y
; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[0:1]
; GFX9-W64-NEXT:    image_sample v[0:3], v[2:3], s[4:11], s[12:15] dmask:0xf
; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
; GFX9-W64-NEXT:    ; return to shader part epilog
;
; GFX10-W32-LABEL: test2:
; GFX10-W32:       ; %bb.0: ; %main_body
; GFX10-W32-NEXT:    s_mov_b32 s0, exec_lo
; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT:    s_mov_b32 m0, s3
; GFX10-W32-NEXT:    v_interp_p1_f32_e32 v2, v0, attr0.x
; GFX10-W32-NEXT:    v_interp_p1_f32_e32 v3, v0, attr0.y
; GFX10-W32-NEXT:    v_interp_p2_f32_e32 v2, v1, attr0.x
; GFX10-W32-NEXT:    v_interp_p2_f32_e32 v3, v1, attr0.y
; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s0
; GFX10-W32-NEXT:    image_sample v[0:3], v[2:3], s[4:11], s[12:15] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
; GFX10-W32-NEXT:    ; return to shader part epilog
main_body:
  %inst23 = extractelement <2 x float> %pos, i32 0
  %inst24 = extractelement <2 x float> %pos, i32 1
  %inst25 = tail call float @llvm.amdgcn.interp.p1(float %inst23, i32 0, i32 0, i32 %m0)
  %inst26 = tail call float @llvm.amdgcn.interp.p2(float %inst25, float %inst24, i32 0, i32 0, i32 %m0)
  %inst28 = tail call float @llvm.amdgcn.interp.p1(float %inst23, i32 1, i32 0, i32 %m0)
  %inst29 = tail call float @llvm.amdgcn.interp.p2(float %inst28, float %inst24, i32 1, i32 0, i32 %m0)
  %tex = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %inst26, float %inst29, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
  ret <4 x float> %tex
}

; ... but disabled for stores (and, in this simple case, not re-enabled) ...
define amdgpu_ps <4 x float> @test3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float %c) {
; GFX9-W64-LABEL: test3:
; GFX9-W64:       ; %bb.0: ; %main_body
; GFX9-W64-NEXT:    s_mov_b64 s[12:13], exec
; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[12:13]
; GFX9-W64-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf
; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
; GFX9-W64-NEXT:    buffer_store_dwordx4 v[0:3], v0, s[0:3], 0 idxen
; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
; GFX9-W64-NEXT:    ; return to shader part epilog
;
; GFX10-W32-LABEL: test3:
; GFX10-W32:       ; %bb.0: ; %main_body
; GFX10-W32-NEXT:    s_mov_b32 s12, exec_lo
; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s12
; GFX10-W32-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
; GFX10-W32-NEXT:    buffer_store_dwordx4 v[0:3], v0, s[0:3], 0 idxen
; GFX10-W32-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-W32-NEXT:    ; return to shader part epilog
main_body:
  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
  ; The first sampled component (reinterpreted as i32) is passed as the
  ; store's index operand, so the store consumes the sample result.
  %tex.1 = bitcast <4 x float> %tex to <4 x i32>
  %tex.2 = extractelement <4 x i32> %tex.1, i32 0

  call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %tex, <4 x i32> undef, i32 %tex.2, i32 0, i32 0, i32 0)

  ret <4 x float> %tex
}

; ... and disabled for export.
define amdgpu_ps void @test3x(i32 inreg, i32 inreg, i32 inreg, i32 inreg %m0, <8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, <2 x float> %pos) #6 {
; GFX9-W64-LABEL: test3x:
; GFX9-W64:       ; %bb.0: ; %main_body
; GFX9-W64-NEXT:    s_mov_b64 s[0:1], exec
; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
; GFX9-W64-NEXT:    s_mov_b32 m0, s3
; GFX9-W64-NEXT:    s_nop 0
; GFX9-W64-NEXT:    v_interp_p1_f32_e32 v2, v0, attr0.x
; GFX9-W64-NEXT:    v_interp_p1_f32_e32 v3, v0, attr0.y
; GFX9-W64-NEXT:    v_interp_p2_f32_e32 v2, v1, attr0.x
; GFX9-W64-NEXT:    v_interp_p2_f32_e32 v3, v1, attr0.y
; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[0:1]
; GFX9-W64-NEXT:    image_sample v[0:3], v[2:3], s[4:11], s[12:15] dmask:0xf
; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
; GFX9-W64-NEXT:    exp mrt0 v0, v1, v2, v3 done vm
; GFX9-W64-NEXT:    s_endpgm
;
; GFX10-W32-LABEL: test3x:
; GFX10-W32:       ; %bb.0: ; %main_body
; GFX10-W32-NEXT:    s_mov_b32 s0, exec_lo
; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT:    s_mov_b32 m0, s3
; GFX10-W32-NEXT:    v_interp_p1_f32_e32 v2, v0, attr0.x
; GFX10-W32-NEXT:    v_interp_p1_f32_e32 v3, v0, attr0.y
; GFX10-W32-NEXT:    v_interp_p2_f32_e32 v2, v1, attr0.x
; GFX10-W32-NEXT:    v_interp_p2_f32_e32 v3, v1, attr0.y
; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s0
; GFX10-W32-NEXT:    image_sample v[0:3], v[2:3], s[4:11], s[12:15] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
; GFX10-W32-NEXT:    exp mrt0 v0, v1, v2, v3 done vm
; GFX10-W32-NEXT:    s_endpgm
main_body:
  %inst23 = extractelement <2 x float> %pos, i32 0
  %inst24 = extractelement <2 x float> %pos, i32 1
  %inst25 = tail call float @llvm.amdgcn.interp.p1(float %inst23, i32 0, i32 0, i32 %m0)
  %inst26 = tail call float @llvm.amdgcn.interp.p2(float %inst25, float %inst24, i32 0, i32 0, i32 %m0)
  %inst28 = tail call float @llvm.amdgcn.interp.p1(float %inst23, i32 1, i32 0, i32 %m0)
  %inst29 = tail call float @llvm.amdgcn.interp.p2(float %inst28, float %inst24, i32 1, i32 0, i32 %m0)
  %tex = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %inst26, float %inst29, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
  %tex.0 = extractelement <4 x float> %tex, i32 0
  %tex.1 = extractelement <4 x float> %tex, i32 1
  %tex.2 = extractelement <4 x float> %tex, i32 2
  %tex.3 = extractelement <4 x float> %tex, i32 3
  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tex.0, float %tex.1, float %tex.2, float %tex.3, i1 true, i1 true)
  ret void
}

; Check that WQM is re-enabled when required.
define amdgpu_ps <4 x float> @test4(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %c, i32 %d, float %data) {
; GFX9-W64-LABEL: test4:
; GFX9-W64:       ; %bb.0: ; %main_body
; GFX9-W64-NEXT:    s_mov_b64 s[12:13], exec
; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
; GFX9-W64-NEXT:    v_mul_lo_u32 v4, v0, v1
; GFX9-W64-NEXT:    image_sample v0, v4, s[0:7], s[8:11] dmask:0x1
; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[12:13]
; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
; GFX9-W64-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf
; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
; GFX9-W64-NEXT:    buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 idxen
; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
; GFX9-W64-NEXT:    ; return to shader part epilog
;
; GFX10-W32-LABEL: test4:
; GFX10-W32:       ; %bb.0: ; %main_body
; GFX10-W32-NEXT:    s_mov_b32 s12, exec_lo
; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT:    v_mul_lo_u32 v4, v0, v1
; GFX10-W32-NEXT:    image_sample v0, v4, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s12
; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
; GFX10-W32-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
; GFX10-W32-NEXT:    buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 idxen
; GFX10-W32-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-W32-NEXT:    ; return to shader part epilog
main_body:
  %c.1 = mul i32 %c, %d

  call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> undef, <4 x i32> undef, i32 %c.1, i32 0, i32 0, i32 0)
  %c.1.bc = bitcast i32 %c.1 to float
  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.1.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
  %tex0 = extractelement <4 x float> %tex, i32 0
  %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
  ret <4 x float> %dtex
}

; Check that WQM is triggered by the wqm intrinsic.
; WQM was inserting an unnecessary v_mov to self after the v_add. Make sure this
; does not happen - the v_add should write the return reg directly.
define amdgpu_ps float @test5(i32 inreg %idx0, i32 inreg %idx1) {
; GFX9-W64-LABEL: test5:
; GFX9-W64:       ; %bb.0: ; %main_body
; GFX9-W64-NEXT:    s_mov_b64 s[2:3], exec
; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
; GFX9-W64-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-W64-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-W64-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 idxen
; GFX9-W64-NEXT:    s_nop 0
; GFX9-W64-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
; GFX9-W64-NEXT:    v_add_f32_e32 v0, v0, v1
; GFX9-W64-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec killed $exec
; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[2:3]
; GFX9-W64-NEXT:    ; return to shader part epilog
;
; GFX10-W32-LABEL: test5:
; GFX10-W32:       ; %bb.0: ; %main_body
; GFX10-W32-NEXT:    s_mov_b32 s2, exec_lo
; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-W32-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-W32-NEXT:    s_clause 0x1
; GFX10-W32-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 idxen
; GFX10-W32-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
; GFX10-W32-NEXT:    v_add_f32_e32 v0, v0, v1
; GFX10-W32-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec killed $exec
; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s2
; GFX10-W32-NEXT:    ; return to shader part epilog
main_body:
  %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
  %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
  %out = fadd float %src0, %src1
  %out.0 = call float @llvm.amdgcn.wqm.f32(float %out)
  ret float %out.0
}

; Check that the wqm intrinsic works correctly for integers.
define amdgpu_ps float @test6(i32 inreg %idx0, i32 inreg %idx1) {
; GFX9-W64-LABEL: test6:
; GFX9-W64:       ; %bb.0: ; %main_body
; GFX9-W64-NEXT:    s_mov_b64 s[2:3], exec
; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
; GFX9-W64-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-W64-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-W64-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 idxen
; GFX9-W64-NEXT:    s_nop 0
; GFX9-W64-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
; GFX9-W64-NEXT:    v_add_f32_e32 v0, v0, v1
; GFX9-W64-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec killed $exec
; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[2:3]
; GFX9-W64-NEXT:    ; return to shader part epilog
;
; GFX10-W32-LABEL: test6:
; GFX10-W32:       ; %bb.0: ; %main_body
; GFX10-W32-NEXT:    s_mov_b32 s2, exec_lo
; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-W32-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-W32-NEXT:    s_clause 0x1
; GFX10-W32-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 idxen
; GFX10-W32-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
; GFX10-W32-NEXT:    v_add_f32_e32 v0, v0, v1
; GFX10-W32-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec killed $exec
; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s2
; GFX10-W32-NEXT:    ; return to shader part epilog
main_body:
  %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
  %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
  %out = fadd float %src0, %src1
  %out.0 = bitcast float %out to i32
  %out.1 = call i32 @llvm.amdgcn.wqm.i32(i32 %out.0)
  %out.2 = bitcast i32 %out.1 to float
  ret float %out.2
}

; NOTE: llvm.amdgcn.wwm is deprecated, use llvm.amdgcn.strict.wwm instead.

; Check that WWM is triggered by the wwm intrinsic.
define amdgpu_ps float @test_wwm1(i32 inreg %idx0, i32 inreg %idx1) {
; GFX9-W64-LABEL: test_wwm1:
; GFX9-W64:       ; %bb.0: ; %main_body
; GFX9-W64-NEXT:    s_or_saveexec_b64 s[2:3], -1
; GFX9-W64-NEXT:    v_mov_b32_e32 v1, s0
; GFX9-W64-NEXT:    v_mov_b32_e32 v2, s1
; GFX9-W64-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
; GFX9-W64-NEXT:    s_nop 0
; GFX9-W64-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 idxen
; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
; GFX9-W64-NEXT:    v_add_f32_e32 v1, v1, v2
; GFX9-W64-NEXT:    s_mov_b64 exec, s[2:3]
; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v1
; GFX9-W64-NEXT:    ; return to shader part epilog
;
; GFX10-W32-LABEL: test_wwm1:
; GFX10-W32:       ; %bb.0: ; %main_body
; GFX10-W32-NEXT:    s_or_saveexec_b32 s2, -1
; GFX10-W32-NEXT:    v_mov_b32_e32 v1, s0
; GFX10-W32-NEXT:    v_mov_b32_e32 v2, s1
; GFX10-W32-NEXT:    s_clause 0x1
; GFX10-W32-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
; GFX10-W32-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 idxen
; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
; GFX10-W32-NEXT:    v_add_f32_e32 v1, v1, v2
; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s2
; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v1
; GFX10-W32-NEXT:    ; return to shader part epilog
main_body:
  %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
  %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
  %out = fadd float %src0, %src1
  %out.0 = call float @llvm.amdgcn.wwm.f32(float %out)
  ret float %out.0
}

; Same as above, but with an integer type.
define amdgpu_ps float @test_wwm2(i32 inreg %idx0, i32 inreg %idx1) {
; GFX9-W64-LABEL: test_wwm2:
; GFX9-W64:       ; %bb.0: ; %main_body
; GFX9-W64-NEXT:    s_or_saveexec_b64 s[2:3], -1
; GFX9-W64-NEXT:    v_mov_b32_e32 v1, s0
; GFX9-W64-NEXT:    v_mov_b32_e32 v2, s1
; GFX9-W64-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
; GFX9-W64-NEXT:    s_nop 0
; GFX9-W64-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 idxen
; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
; GFX9-W64-NEXT:    v_add_u32_e32 v1, v1, v2
; GFX9-W64-NEXT:    s_mov_b64 exec, s[2:3]
; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v1
; GFX9-W64-NEXT:    ; return to shader part epilog
;
; GFX10-W32-LABEL: test_wwm2:
; GFX10-W32:       ; %bb.0: ; %main_body
; GFX10-W32-NEXT:    s_or_saveexec_b32 s2, -1
; GFX10-W32-NEXT:    v_mov_b32_e32 v1, s0
; GFX10-W32-NEXT:    v_mov_b32_e32 v2, s1
; GFX10-W32-NEXT:    s_clause 0x1
; GFX10-W32-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
; GFX10-W32-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 idxen
; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
; GFX10-W32-NEXT:    v_add_nc_u32_e32 v1, v1, v2
; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s2
; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v1
; GFX10-W32-NEXT:    ; return to shader part epilog
main_body:
  %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
  %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
  %src0.0 = bitcast float %src0 to i32
  %src1.0 = bitcast float %src1 to i32
  %out = add i32 %src0.0, %src1.0
  %out.0 = call i32 @llvm.amdgcn.wwm.i32(i32 %out)
  %out.1 = bitcast i32 %out.0 to float
  ret float %out.1
}

; Check that we don't leave WWM on for computations that don't require WWM,
; since that will lead to clobbering things that aren't supposed to be clobbered
; in cases like this.
; We enforce this by checking that v_add gets emitted in the same block as
; WWM computations.
define amdgpu_ps float @test_wwm3(i32 inreg %idx) {
; GFX9-W64-LABEL: test_wwm3:
; GFX9-W64:       ; %bb.0: ; %main_body
; GFX9-W64-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, 0
; GFX9-W64-NEXT:    v_mbcnt_hi_u32_b32 v0, -1, v0
; GFX9-W64-NEXT:    v_cmp_gt_u32_e32 vcc, 32, v0
; GFX9-W64-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-W64-NEXT:    s_and_saveexec_b64 s[2:3], vcc
; GFX9-W64-NEXT:    s_cbranch_execz .LBB9_2
; GFX9-W64-NEXT:  ; %bb.1: ; %if
; GFX9-W64-NEXT:    s_or_saveexec_b64 s[4:5], -1
; GFX9-W64-NEXT:    v_mov_b32_e32 v1, s0
; GFX9-W64-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
; GFX9-W64-NEXT:    v_add_f32_e32 v2, v1, v1
; GFX9-W64-NEXT:    s_mov_b64 exec, s[4:5]
; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v2
; GFX9-W64-NEXT:    v_add_f32_e32 v0, v1, v0
; GFX9-W64-NEXT:  .LBB9_2: ; %endif
; GFX9-W64-NEXT:    s_or_b64 exec, exec, s[2:3]
; GFX9-W64-NEXT:    ; return to shader part epilog
;
; GFX10-W32-LABEL: test_wwm3:
; GFX10-W32:       ; %bb.0: ; %main_body
; GFX10-W32-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, 0
; GFX10-W32-NEXT:    v_mbcnt_hi_u32_b32 v0, -1, v0
; GFX10-W32-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 32, v0
; GFX10-W32-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-W32-NEXT:    s_and_saveexec_b32 s1, vcc_lo
; GFX10-W32-NEXT:    s_cbranch_execz .LBB9_2
; GFX10-W32-NEXT:  ; %bb.1: ; %if
; GFX10-W32-NEXT:    s_or_saveexec_b32 s2, -1
; GFX10-W32-NEXT:    v_mov_b32_e32 v1, s0
; GFX10-W32-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
; GFX10-W32-NEXT:    v_add_f32_e32 v2, v1, v1
; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s2
; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v2
; GFX10-W32-NEXT:    v_add_f32_e32 v0, v1, v0
; GFX10-W32-NEXT:  .LBB9_2: ; %endif
; GFX10-W32-NEXT:    s_or_b32 exec_lo, exec_lo, s1
; GFX10-W32-NEXT:    ; return to shader part epilog
main_body:
  ; use mbcnt to make sure the branch is divergent
  %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
  %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
  %cc = icmp uge i32 %hi, 32
  br i1 %cc, label %endif, label %if

if:
  %src = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx, i32 0, i32 0, i32 0)
  %out = fadd float %src, %src
  %out.0 = call float @llvm.amdgcn.wwm.f32(float %out)
  %out.1 = fadd float %src, %out.0
  br label %endif

endif:
  %out.2 = phi float [ %out.1, %if ], [ 0.0, %main_body ]
  ret float %out.2
}

; Check that WWM writes aren't coalesced with non-WWM writes, since the WWM
; write could clobber disabled channels in the non-WWM one.
; We enforce this by checking that v_mov gets emitted in the same block as
; WWM computations.
define amdgpu_ps float @test_wwm4(i32 inreg %idx) {
; GFX9-W64-LABEL: test_wwm4:
; GFX9-W64:       ; %bb.0: ; %main_body
; GFX9-W64-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, 0
; GFX9-W64-NEXT:    v_mbcnt_hi_u32_b32 v0, -1, v0
; GFX9-W64-NEXT:    v_cmp_gt_u32_e32 vcc, 32, v0
; GFX9-W64-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-W64-NEXT:    s_and_saveexec_b64 s[2:3], vcc
; GFX9-W64-NEXT:    s_cbranch_execz .LBB10_2
; GFX9-W64-NEXT:  ; %bb.1: ; %if
; GFX9-W64-NEXT:    s_or_saveexec_b64 s[4:5], -1
; GFX9-W64-NEXT:    v_mov_b32_e32 v1, s0
; GFX9-W64-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
; GFX9-W64-NEXT:    v_add_f32_e32 v1, v1, v1
; GFX9-W64-NEXT:    s_mov_b64 exec, s[4:5]
; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v1
; GFX9-W64-NEXT:  .LBB10_2: ; %endif
; GFX9-W64-NEXT:    s_or_b64 exec, exec, s[2:3]
; GFX9-W64-NEXT:    ; return to shader part epilog
;
; GFX10-W32-LABEL: test_wwm4:
; GFX10-W32:       ; %bb.0: ; %main_body
; GFX10-W32-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, 0
; GFX10-W32-NEXT:    v_mbcnt_hi_u32_b32 v0, -1, v0
; GFX10-W32-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 32, v0
; GFX10-W32-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-W32-NEXT:    s_and_saveexec_b32 s1, vcc_lo
; GFX10-W32-NEXT:    s_cbranch_execz .LBB10_2
; GFX10-W32-NEXT:  ; %bb.1: ; %if
; GFX10-W32-NEXT:    s_or_saveexec_b32 s2, -1
; GFX10-W32-NEXT:    v_mov_b32_e32 v1, s0
; GFX10-W32-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
; GFX10-W32-NEXT:    v_add_f32_e32 v1, v1, v1
; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s2
; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v1
; GFX10-W32-NEXT:  .LBB10_2: ; %endif
; GFX10-W32-NEXT:    s_or_b32 exec_lo, exec_lo, s1
; GFX10-W32-NEXT:    ; return to shader part epilog
main_body:
  ; use mbcnt to make sure the branch is divergent
  %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
  %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
  %cc = icmp uge i32 %hi, 32
  br i1 %cc, label %endif, label %if

if:
  %src = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx, i32 0, i32 0, i32 0)
  %out = fadd float %src, %src
  %out.0 = call float @llvm.amdgcn.wwm.f32(float %out)
  br label %endif

endif:
  %out.1 = phi float [ %out.0, %if ], [ 0.0, %main_body ]
  ret float %out.1
}

; Make sure the transition from Exact to WWM then WQM works properly.
define amdgpu_ps float @test_wwm5(i32 inreg %idx0, i32 inreg %idx1) {
; GFX9-W64-LABEL: test_wwm5:
; GFX9-W64:       ; %bb.0: ; %main_body
; GFX9-W64-NEXT:    s_mov_b64 s[2:3], exec
; GFX9-W64-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-W64-NEXT:    buffer_load_dword v2, v0, s[0:3], 0 idxen
; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
; GFX9-W64-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 idxen
; GFX9-W64-NEXT:    s_or_saveexec_b64 s[4:5], -1
; GFX9-W64-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-W64-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
; GFX9-W64-NEXT:    v_add_f32_e32 v1, v1, v1
; GFX9-W64-NEXT:    s_mov_b64 exec, s[4:5]
; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v1
; GFX9-W64-NEXT:    v_add_f32_e32 v0, v0, v0
; GFX9-W64-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec killed $exec
; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[2:3]
; GFX9-W64-NEXT:    ; return to shader part epilog
;
; GFX10-W32-LABEL: test_wwm5:
; GFX10-W32:       ; %bb.0: ; %main_body
; GFX10-W32-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-W32-NEXT:    s_mov_b32 s2, exec_lo
; GFX10-W32-NEXT:    buffer_load_dword v2, v0, s[0:3], 0 idxen
; GFX10-W32-NEXT:    s_or_saveexec_b32 s0, -1
; GFX10-W32-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s0
; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
; GFX10-W32-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 idxen
; GFX10-W32-NEXT:    s_or_saveexec_b32 s0, -1
; GFX10-W32-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
; GFX10-W32-NEXT:    v_add_f32_e32 v1, v1, v1
; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s0
; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v1
; GFX10-W32-NEXT:    v_add_f32_e32 v0, v0, v0
; GFX10-W32-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec killed $exec
; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s2
; GFX10-W32-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-W32-NEXT:    ; return to shader part epilog
main_body:
  %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
  call void @llvm.amdgcn.struct.buffer.store.f32(float %src0, <4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
  %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
  %temp = fadd float %src1, %src1
  %temp.0 = call float @llvm.amdgcn.wwm.f32(float %temp)
  %out = fadd float %temp.0, %temp.0
  %out.0 = call float @llvm.amdgcn.wqm.f32(float %out)
  ret float %out.0
}

; Check that WWM is turned on correctly across basic block boundaries.
; if..then..endif version
define amdgpu_ps float @test_wwm6_then() {
; GFX9-W64-LABEL: test_wwm6_then:
; GFX9-W64:       ; %bb.0: ; %main_body
; GFX9-W64-NEXT:    s_or_saveexec_b64 s[0:1], -1
; GFX9-W64-NEXT:    global_load_dword v1, v[3:4], off glc
; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
; GFX9-W64-NEXT:    s_mov_b64 exec, s[0:1]
; GFX9-W64-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, 0
; GFX9-W64-NEXT:    v_mbcnt_hi_u32_b32 v0, -1, v0
; GFX9-W64-NEXT:    v_cmp_gt_u32_e32 vcc, 32, v0
; GFX9-W64-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-W64-NEXT:    s_and_saveexec_b64 s[0:1], vcc
; GFX9-W64-NEXT:    s_cbranch_execz .LBB12_2
; GFX9-W64-NEXT:  ; %bb.1: ; %if
; GFX9-W64-NEXT:    s_or_saveexec_b64 s[2:3], -1
; GFX9-W64-NEXT:    global_load_dword v2, v[3:4], off glc
; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
; GFX9-W64-NEXT:    v_add_f32_e32 v1, v1, v2
; GFX9-W64-NEXT:    s_mov_b64 exec, s[2:3]
; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v1
; GFX9-W64-NEXT:  .LBB12_2: ; %endif
; GFX9-W64-NEXT:    s_or_b64 exec, exec, s[0:1]
; GFX9-W64-NEXT:    ; return to shader part epilog
;
; GFX10-W32-LABEL: test_wwm6_then:
; GFX10-W32:       ; %bb.0: ; %main_body
; GFX10-W32-NEXT:    s_or_saveexec_b32 s0, -1
; GFX10-W32-NEXT:    global_load_dword v1, v[3:4], off glc dlc
; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s0
; GFX10-W32-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, 0
; GFX10-W32-NEXT:    v_mbcnt_hi_u32_b32 v0, -1, v0
; GFX10-W32-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 32, v0
; GFX10-W32-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-W32-NEXT:    s_and_saveexec_b32 s0, vcc_lo
; GFX10-W32-NEXT:    s_cbranch_execz .LBB12_2
; GFX10-W32-NEXT:  ; %bb.1: ; %if
; GFX10-W32-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-W32-NEXT:    global_load_dword v2, v[3:4], off glc dlc
; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
; GFX10-W32-NEXT:    v_add_f32_e32 v1, v1, v2
; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v1
; GFX10-W32-NEXT:  .LBB12_2: ; %endif
; GFX10-W32-NEXT:    s_or_b32 exec_lo, exec_lo, s0
; GFX10-W32-NEXT:    ; return to shader part epilog
main_body:
  %src0 = load volatile float, float addrspace(1)* undef
  ; use mbcnt to make sure the branch is divergent
  %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
  %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
  %cc = icmp uge i32 %hi, 32
  br i1 %cc, label %endif, label %if

if:
  %src1 = load volatile float, float addrspace(1)* undef
  %out = fadd float %src0, %src1
  %out.0 = call float @llvm.amdgcn.wwm.f32(float %out)
  br label %endif

endif:
  %out.1 = phi float [ %out.0, %if ], [ 0.0, %main_body ]
  ret float %out.1
}

; Check that WWM is turned on correctly across basic block boundaries.
; loop version
define amdgpu_ps float @test_wwm6_loop() {
; GFX9-W64-LABEL: test_wwm6_loop:
; GFX9-W64:       ; %bb.0: ; %main_body
; GFX9-W64-NEXT:    s_or_saveexec_b64 s[0:1], -1
; GFX9-W64-NEXT:    global_load_dword v1, v[3:4], off glc
; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
; GFX9-W64-NEXT:    s_mov_b64 exec, s[0:1]
; GFX9-W64-NEXT:    v_mbcnt_lo_u32_b32 v3, -1, 0
; GFX9-W64-NEXT:    s_mov_b64 s[0:1], 0
; GFX9-W64-NEXT:  .LBB13_1: ; %loop
; GFX9-W64-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX9-W64-NEXT:    s_or_saveexec_b64 s[2:3], -1
; GFX9-W64-NEXT:    global_load_dword v2, v[3:4], off glc
; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
; GFX9-W64-NEXT:    s_mov_b64 exec, s[2:3]
; GFX9-W64-NEXT:    v_add_u32_e32 v3, -1, v3
; GFX9-W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
; GFX9-W64-NEXT:    s_or_saveexec_b64 s[2:3], -1
; GFX9-W64-NEXT:    v_add_f32_e32 v2, v1, v2
; GFX9-W64-NEXT:    s_mov_b64 exec, s[2:3]
; GFX9-W64-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v2
; GFX9-W64-NEXT:    s_andn2_b64 exec, exec, s[0:1]
; GFX9-W64-NEXT:    s_cbranch_execnz .LBB13_1
; GFX9-W64-NEXT:  ; %bb.2: ; %endloop
; GFX9-W64-NEXT:    s_or_b64 exec, exec, s[0:1]
; GFX9-W64-NEXT:    ; return to shader part epilog
;
; GFX10-W32-LABEL: test_wwm6_loop:
; GFX10-W32:       ; %bb.0: ; %main_body
; GFX10-W32-NEXT:    s_or_saveexec_b32 s0, -1
; GFX10-W32-NEXT:    global_load_dword v1, v[3:4], off glc dlc
; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s0
; GFX10-W32-NEXT:    v_mbcnt_lo_u32_b32 v3, -1, 0
; GFX10-W32-NEXT:    s_mov_b32 s0, 0
; GFX10-W32-NEXT:  .LBB13_1: ; %loop
; GFX10-W32-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX10-W32-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-W32-NEXT:    global_load_dword v2, v[3:4], off glc dlc
; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-W32-NEXT:    v_add_nc_u32_e32 v3, -1, v3
; GFX10-W32-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-W32-NEXT:    v_add_f32_e32 v2, v1, v2
; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v3
; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v2
; GFX10-W32-NEXT:    s_or_b32 s0, vcc_lo, s0
; GFX10-W32-NEXT:    s_andn2_b32 exec_lo, exec_lo, s0
; GFX10-W32-NEXT:    s_cbranch_execnz .LBB13_1
; GFX10-W32-NEXT:  ; %bb.2: ; %endloop
; GFX10-W32-NEXT:    s_or_b32 exec_lo, exec_lo, s0
; GFX10-W32-NEXT:    ; return to shader part epilog
main_body:
  %src0 = load volatile float, float addrspace(1)* undef
  ; use mbcnt to make sure the branch is divergent
  %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
  %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
  br label %loop

loop:
  %counter = phi i32 [ %lo, %main_body ], [ %counter.1, %loop ]
  %src1 = load volatile float, float addrspace(1)* undef
  %out = fadd float %src0, %src1
  %out.0 = call float @llvm.amdgcn.wwm.f32(float %out)
  %counter.1 = sub i32 %counter, 1
  %cc = icmp ne i32 %counter.1, 0
  br i1 %cc, label %loop, label %endloop

endloop:
  ret float %out.0
}

; Check that @llvm.amdgcn.set.inactive disables WWM.
define amdgpu_ps void @test_wwm_set_inactive1(i32 inreg %idx) {
; GFX9-W64-LABEL: test_wwm_set_inactive1:
; GFX9-W64:       ; %bb.0: ; %main_body
; GFX9-W64-NEXT:    v_mov_b32_e32 v1, s0
; GFX9-W64-NEXT:    buffer_load_dword v0, v1, s[0:3], 0 idxen
; GFX9-W64-NEXT:    s_not_b64 exec, exec
; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
; GFX9-W64-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-W64-NEXT:    s_not_b64 exec, exec
; GFX9-W64-NEXT:    s_or_saveexec_b64 s[0:1], -1
; GFX9-W64-NEXT:    v_add_u32_e32 v0, v0, v0
; GFX9-W64-NEXT:    s_mov_b64 exec, s[0:1]
; GFX9-W64-NEXT:    v_mov_b32_e32 v2, v0
; GFX9-W64-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 idxen
; GFX9-W64-NEXT:    s_endpgm
;
; GFX10-W32-LABEL: test_wwm_set_inactive1:
; GFX10-W32:       ; %bb.0: ; %main_body
; GFX10-W32-NEXT:    v_mov_b32_e32 v1, s0
; GFX10-W32-NEXT:    buffer_load_dword v0, v1, s[0:3], 0 idxen
; GFX10-W32-NEXT:    s_not_b32 exec_lo, exec_lo
; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
; GFX10-W32-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-W32-NEXT:    s_not_b32 exec_lo, exec_lo
; GFX10-W32-NEXT:    s_or_saveexec_b32 s0, -1
; GFX10-W32-NEXT:    v_add_nc_u32_e32 v0, v0, v0
; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s0
; GFX10-W32-NEXT:    v_mov_b32_e32 v2, v0
; GFX10-W32-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 idxen
; GFX10-W32-NEXT:    s_endpgm
main_body:
  %src = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx, i32 0, i32 0, i32 0)
  %src.0 = bitcast float %src to i32
  %src.1 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %src.0, i32 0)
  %out = add i32 %src.1, %src.1
  %out.0 = call i32 @llvm.amdgcn.wwm.i32(i32 %out)
  %out.1 = bitcast i32 %out.0 to float
  call void @llvm.amdgcn.struct.buffer.store.f32(float %out.1, <4 x i32> undef, i32 %idx, i32 0, i32 0, i32 0)
  ret void
}

; Check that Strict WQM is triggered by the strict_wqm intrinsic.
define amdgpu_ps float @test_strict_wqm1(i32 inreg %idx0, i32 inreg %idx1) {
; GFX9-W64-LABEL: test_strict_wqm1:
; GFX9-W64:       ; %bb.0: ; %main_body
; GFX9-W64-NEXT:    s_mov_b64 s[2:3], exec
; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
; GFX9-W64-NEXT:    v_mov_b32_e32 v1, s0
; GFX9-W64-NEXT:    v_mov_b32_e32 v2, s1
; GFX9-W64-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
; GFX9-W64-NEXT:    s_nop 0
; GFX9-W64-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 idxen
; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
; GFX9-W64-NEXT:    v_add_f32_e32 v1, v1, v2
; GFX9-W64-NEXT:    s_mov_b64 exec, s[2:3]
; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v1
; GFX9-W64-NEXT:    ; return to shader part epilog
;
; GFX10-W32-LABEL: test_strict_wqm1:
; GFX10-W32:       ; %bb.0: ; %main_body
; GFX10-W32-NEXT:    s_mov_b32 s2, exec_lo
; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT:    v_mov_b32_e32 v1, s0
; GFX10-W32-NEXT:    v_mov_b32_e32 v2, s1
; GFX10-W32-NEXT:    s_clause 0x1
; GFX10-W32-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
; GFX10-W32-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 idxen
; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
; GFX10-W32-NEXT:    v_add_f32_e32 v1, v1, v2
; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s2
; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v1
; GFX10-W32-NEXT:    ; return to shader part epilog
main_body:
  %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
  %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
  %out = fadd float %src0, %src1
  %out.0 = call float @llvm.amdgcn.strict.wqm.f32(float %out)
  ret float %out.0
}

; Same as above, but with an integer type.
define amdgpu_ps float @test_strict_wqm2(i32 inreg %idx0, i32 inreg %idx1) {
; GFX9-W64-LABEL: test_strict_wqm2:
; GFX9-W64:       ; %bb.0: ; %main_body
; GFX9-W64-NEXT:    s_mov_b64 s[2:3], exec
; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
; GFX9-W64-NEXT:    v_mov_b32_e32 v1, s0
; GFX9-W64-NEXT:    v_mov_b32_e32 v2, s1
; GFX9-W64-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
; GFX9-W64-NEXT:    s_nop 0
; GFX9-W64-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 idxen
; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
; GFX9-W64-NEXT:    v_add_u32_e32 v1, v1, v2
; GFX9-W64-NEXT:    s_mov_b64 exec, s[2:3]
; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v1
; GFX9-W64-NEXT:    ; return to shader part epilog
;
; GFX10-W32-LABEL: test_strict_wqm2:
; GFX10-W32:       ; %bb.0: ; %main_body
; GFX10-W32-NEXT:    s_mov_b32 s2, exec_lo
; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT:    v_mov_b32_e32 v1, s0
; GFX10-W32-NEXT:    v_mov_b32_e32 v2, s1
; GFX10-W32-NEXT:    s_clause 0x1
; GFX10-W32-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
; GFX10-W32-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 idxen
; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
; GFX10-W32-NEXT:    v_add_nc_u32_e32 v1, v1, v2
; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s2
; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v1
; GFX10-W32-NEXT:    ; return to shader part epilog
main_body:
  %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
  %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
  %src0.0 = bitcast float %src0 to i32
  %src1.0 = bitcast float %src1 to i32
  %out = add i32 %src0.0, %src1.0
  %out.0 = call i32 @llvm.amdgcn.strict.wqm.i32(i32 %out)
  %out.1 = bitcast i32 %out.0 to float
  ret float %out.1
}

; Check that we don't leave Strict WQM on for computations that don't require it,
; since that will lead to clobbering things that aren't supposed to be clobbered
; in cases like this.
; We enforce this by checking that v_add gets emitted in the same block as
; Strict WQM computations.
define amdgpu_ps float @test_strict_wqm3(i32 inreg %idx) {
; GFX9-W64-LABEL: test_strict_wqm3:
; GFX9-W64:       ; %bb.0: ; %main_body
; GFX9-W64-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, 0
; GFX9-W64-NEXT:    v_mbcnt_hi_u32_b32 v0, -1, v0
; GFX9-W64-NEXT:    v_cmp_gt_u32_e32 vcc, 32, v0
; GFX9-W64-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-W64-NEXT:    s_and_saveexec_b64 s[2:3], vcc
; GFX9-W64-NEXT:    s_cbranch_execz .LBB17_2
; GFX9-W64-NEXT:  ; %bb.1: ; %if
; GFX9-W64-NEXT:    s_mov_b64 s[4:5], exec
; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
; GFX9-W64-NEXT:    v_mov_b32_e32 v1, s0
; GFX9-W64-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
; GFX9-W64-NEXT:    v_add_f32_e32 v2, v1, v1
; GFX9-W64-NEXT:    s_mov_b64 exec, s[4:5]
; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v2
; GFX9-W64-NEXT:    v_add_f32_e32 v0, v1, v0
; GFX9-W64-NEXT:  .LBB17_2: ; %endif
; GFX9-W64-NEXT:    s_or_b64 exec, exec, s[2:3]
; GFX9-W64-NEXT:    ; return to shader part epilog
;
; GFX10-W32-LABEL: test_strict_wqm3:
; GFX10-W32:       ; %bb.0: ; %main_body
; GFX10-W32-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, 0
; GFX10-W32-NEXT:    v_mbcnt_hi_u32_b32 v0, -1, v0
; GFX10-W32-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 32, v0
; GFX10-W32-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-W32-NEXT:    s_and_saveexec_b32 s1, vcc_lo
; GFX10-W32-NEXT:    s_cbranch_execz .LBB17_2
; GFX10-W32-NEXT:  ; %bb.1: ; %if
; GFX10-W32-NEXT:    s_mov_b32 s2, exec_lo
; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT:    v_mov_b32_e32 v1, s0
; GFX10-W32-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
; GFX10-W32-NEXT:    v_add_f32_e32 v2, v1, v1
; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s2
; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v2
; GFX10-W32-NEXT:    v_add_f32_e32 v0, v1, v0
; GFX10-W32-NEXT:  .LBB17_2: ; %endif
; GFX10-W32-NEXT:    s_or_b32 exec_lo, exec_lo, s1
; GFX10-W32-NEXT:    ; return to shader part epilog
main_body:
  ; use mbcnt to make sure the branch is divergent
  %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
  %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
  %cc = icmp uge i32 %hi, 32
  br i1 %cc, label %endif, label %if

if:
  %src = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx, i32 0, i32 0, i32 0)
  %out = fadd float %src, %src
  %out.0 = call float @llvm.amdgcn.strict.wqm.f32(float %out)
  %out.1 = fadd float %src, %out.0
  br label %endif

endif:
  %out.2 = phi float [ %out.1, %if ], [ 0.0, %main_body ]
  ret float %out.2
}

; Check that Strict WQM writes aren't coalesced with non-strict writes, since
; the Strict WQM write could clobber disabled channels in the non-strict one.
; We enforce this by checking that v_mov gets emitted in the same block as
; Strict WQM computations.
define amdgpu_ps float @test_strict_wqm4(i32 inreg %idx) {
; GFX9-W64-LABEL: test_strict_wqm4:
; GFX9-W64:       ; %bb.0: ; %main_body
; GFX9-W64-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, 0
; GFX9-W64-NEXT:    v_mbcnt_hi_u32_b32 v0, -1, v0
; GFX9-W64-NEXT:    v_cmp_gt_u32_e32 vcc, 32, v0
; GFX9-W64-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-W64-NEXT:    s_and_saveexec_b64 s[2:3], vcc
; GFX9-W64-NEXT:    s_cbranch_execz .LBB18_2
; GFX9-W64-NEXT:  ; %bb.1: ; %if
; GFX9-W64-NEXT:    s_mov_b64 s[4:5], exec
; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
; GFX9-W64-NEXT:    v_mov_b32_e32 v1, s0
; GFX9-W64-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
; GFX9-W64-NEXT:    v_add_f32_e32 v1, v1, v1
; GFX9-W64-NEXT:    s_mov_b64 exec, s[4:5]
; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v1
; GFX9-W64-NEXT:  .LBB18_2: ; %endif
; GFX9-W64-NEXT:    s_or_b64 exec, exec, s[2:3]
; GFX9-W64-NEXT:    ; return to shader part epilog
;
; GFX10-W32-LABEL: test_strict_wqm4:
; GFX10-W32:       ; %bb.0: ; %main_body
; GFX10-W32-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, 0
; GFX10-W32-NEXT:    v_mbcnt_hi_u32_b32 v0, -1, v0
; GFX10-W32-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 32, v0
; GFX10-W32-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-W32-NEXT:    s_and_saveexec_b32 s1, vcc_lo
; GFX10-W32-NEXT:    s_cbranch_execz .LBB18_2
; GFX10-W32-NEXT:  ; %bb.1: ; %if
; GFX10-W32-NEXT:    s_mov_b32 s2, exec_lo
; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT:    v_mov_b32_e32 v1, s0
; GFX10-W32-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
; GFX10-W32-NEXT:    v_add_f32_e32 v1, v1, v1
; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s2
; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v1
; GFX10-W32-NEXT:  .LBB18_2: ; %endif
; GFX10-W32-NEXT:    s_or_b32 exec_lo, exec_lo, s1
; GFX10-W32-NEXT:    ; return to shader part epilog
main_body:
  ; use mbcnt to make sure the branch is divergent
  %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
  %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
  %cc = icmp uge i32 %hi, 32
  br i1 %cc, label %endif, label %if

if:
  %src = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx, i32 0, i32 0, i32 0)
  %out = fadd float %src, %src
  %out.0 = call float @llvm.amdgcn.strict.wqm.f32(float %out)
  br label %endif

endif:
  %out.1 = phi float [ %out.0, %if ], [ 0.0, %main_body ]
  ret float %out.1
}

; Make sure the transition from Exact to Strict WQM then WQM works properly.
define amdgpu_ps float @test_strict_wqm5(i32 inreg %idx0, i32 inreg %idx1) {
; GFX9-W64-LABEL: test_strict_wqm5:
; GFX9-W64:       ; %bb.0: ; %main_body
; GFX9-W64-NEXT:    s_mov_b64 s[2:3], exec
; GFX9-W64-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-W64-NEXT:    buffer_load_dword v2, v0, s[0:3], 0 idxen
; GFX9-W64-NEXT:    s_mov_b64 s[4:5], exec
; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
; GFX9-W64-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 idxen
; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
; GFX9-W64-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-W64-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
; GFX9-W64-NEXT:    v_add_f32_e32 v1, v1, v1
; GFX9-W64-NEXT:    s_mov_b64 exec, s[4:5]
; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v1
; GFX9-W64-NEXT:    v_add_f32_e32 v0, v0, v0
; GFX9-W64-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec killed $exec
; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[2:3]
; GFX9-W64-NEXT:    ; return to shader part epilog
;
; GFX10-W32-LABEL: test_strict_wqm5:
; GFX10-W32:       ; %bb.0: ; %main_body
; GFX10-W32-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-W32-NEXT:    s_mov_b32 s2, exec_lo
; GFX10-W32-NEXT:    buffer_load_dword v2, v0, s[0:3], 0 idxen
; GFX10-W32-NEXT:    s_mov_b32 s0, exec_lo
; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s0
; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
; GFX10-W32-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 idxen
; GFX10-W32-NEXT:    s_mov_b32 s0, exec_lo
; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
; GFX10-W32-NEXT:    v_add_f32_e32 v1, v1, v1
; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s0
; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v1
; GFX10-W32-NEXT:    v_add_f32_e32 v0, v0, v0
; GFX10-W32-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec killed $exec
; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s2
; GFX10-W32-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-W32-NEXT:    ; return to shader part epilog
main_body:
  %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
  call void @llvm.amdgcn.struct.buffer.store.f32(float %src0, <4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
  %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
  %temp = fadd float %src1, %src1
  %temp.0 = call float @llvm.amdgcn.strict.wqm.f32(float %temp)
  %out = fadd float %temp.0, %temp.0
  %out.0 = call float @llvm.amdgcn.wqm.f32(float %out)
  ret float %out.0
}

; Check that Strict WQM is turned on correctly across basic block boundaries.
; if..then..endif version
define amdgpu_ps float @test_strict_wqm6_then() {
; GFX9-W64-LABEL: test_strict_wqm6_then:
; GFX9-W64:       ; %bb.0: ; %main_body
; GFX9-W64-NEXT:    s_mov_b64 s[0:1], exec
; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
; GFX9-W64-NEXT:    global_load_dword v1, v[3:4], off glc
; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
; GFX9-W64-NEXT:    s_mov_b64 exec, s[0:1]
; GFX9-W64-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, 0
; GFX9-W64-NEXT:    v_mbcnt_hi_u32_b32 v0, -1, v0
; GFX9-W64-NEXT:    v_cmp_gt_u32_e32 vcc, 32, v0
; GFX9-W64-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-W64-NEXT:    s_and_saveexec_b64 s[0:1], vcc
; GFX9-W64-NEXT:    s_cbranch_execz .LBB20_2
; GFX9-W64-NEXT:  ; %bb.1: ; %if
; GFX9-W64-NEXT:    s_mov_b64 s[2:3], exec
; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
; GFX9-W64-NEXT:    global_load_dword v2, v[3:4], off glc
; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
; GFX9-W64-NEXT:    v_add_f32_e32 v1, v1, v2
; GFX9-W64-NEXT:    s_mov_b64 exec, s[2:3]
; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v1
; GFX9-W64-NEXT:  .LBB20_2: ; %endif
; GFX9-W64-NEXT:    s_or_b64 exec, exec, s[0:1]
; GFX9-W64-NEXT:    ; return to shader part epilog
;
; GFX10-W32-LABEL: test_strict_wqm6_then:
; GFX10-W32:       ; %bb.0: ; %main_body
; GFX10-W32-NEXT:    s_mov_b32 s0, exec_lo
; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT:    global_load_dword v1, v[3:4], off glc dlc
; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s0
; GFX10-W32-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, 0
; GFX10-W32-NEXT:    v_mbcnt_hi_u32_b32 v0, -1, v0
; GFX10-W32-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 32, v0
; GFX10-W32-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-W32-NEXT:    s_and_saveexec_b32 s0, vcc_lo
; GFX10-W32-NEXT:    s_cbranch_execz .LBB20_2
; GFX10-W32-NEXT:  ; %bb.1: ; %if
; GFX10-W32-NEXT:    s_mov_b32 s1, exec_lo
; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT:    global_load_dword v2, v[3:4], off glc dlc
; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
; GFX10-W32-NEXT:    v_add_f32_e32 v1, v1, v2
; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v1
; GFX10-W32-NEXT:  .LBB20_2: ; %endif
; GFX10-W32-NEXT:    s_or_b32 exec_lo, exec_lo, s0
; GFX10-W32-NEXT:    ; return to shader part epilog
main_body:
  %src0 = load volatile float, float addrspace(1)* undef
  ; use mbcnt to make sure the branch is divergent
  %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
  %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
  %cc = icmp uge i32 %hi, 32
  br i1 %cc, label %endif, label %if

if:
  %src1 = load volatile float, float addrspace(1)* undef
  %out = fadd float %src0, %src1
  %out.0 = call float @llvm.amdgcn.strict.wqm.f32(float %out)
  br label %endif

endif:
  %out.1 = phi float [ %out.0, %if ], [ 0.0, %main_body ]
  ret float %out.1
}

; Check that Strict WQM is turned on correctly across basic block boundaries.
; loop version
define amdgpu_ps float @test_strict_wqm6_loop() {
; GFX9-W64-LABEL: test_strict_wqm6_loop:
; GFX9-W64:       ; %bb.0: ; %main_body
; GFX9-W64-NEXT:    s_mov_b64 s[0:1], exec
; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
; GFX9-W64-NEXT:    global_load_dword v1, v[3:4], off glc
; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
; GFX9-W64-NEXT:    s_mov_b64 exec, s[0:1]
; GFX9-W64-NEXT:    v_mbcnt_lo_u32_b32 v3, -1, 0
; GFX9-W64-NEXT:    s_mov_b64 s[0:1], 0
; GFX9-W64-NEXT:  .LBB21_1: ; %loop
; GFX9-W64-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX9-W64-NEXT:    s_mov_b64 s[2:3], exec
; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
; GFX9-W64-NEXT:    global_load_dword v2, v[3:4], off glc
; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
; GFX9-W64-NEXT:    s_mov_b64 exec, s[2:3]
; GFX9-W64-NEXT:    v_add_u32_e32 v3, -1, v3
; GFX9-W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
; GFX9-W64-NEXT:    s_mov_b64 s[2:3], exec
; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
; GFX9-W64-NEXT:    v_add_f32_e32 v2, v1, v2
; GFX9-W64-NEXT:    s_mov_b64 exec, s[2:3]
; GFX9-W64-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v2
; GFX9-W64-NEXT:    s_andn2_b64 exec, exec, s[0:1]
; GFX9-W64-NEXT:    s_cbranch_execnz .LBB21_1
; GFX9-W64-NEXT:  ; %bb.2: ; %endloop
; GFX9-W64-NEXT:    s_or_b64 exec, exec, s[0:1]
; GFX9-W64-NEXT:    ; return to shader part epilog
;
; GFX10-W32-LABEL: test_strict_wqm6_loop:
; GFX10-W32:       ; %bb.0: ; %main_body
; GFX10-W32-NEXT:    s_mov_b32 s0, exec_lo
; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT:    global_load_dword v1, v[3:4], off glc dlc
; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s0
; GFX10-W32-NEXT:    v_mbcnt_lo_u32_b32 v3, -1, 0
; GFX10-W32-NEXT:    s_mov_b32 s0, 0
; GFX10-W32-NEXT:  .LBB21_1: ; %loop
; GFX10-W32-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX10-W32-NEXT:    s_mov_b32 s1, exec_lo
; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT:    global_load_dword v2, v[3:4], off glc dlc
; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-W32-NEXT:    v_add_nc_u32_e32 v3, -1, v3
; GFX10-W32-NEXT:    s_mov_b32 s1, exec_lo
; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT:    v_add_f32_e32 v2, v1, v2
; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v3
; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v2
; GFX10-W32-NEXT:    s_or_b32 s0, vcc_lo, s0
; GFX10-W32-NEXT:    s_andn2_b32 exec_lo, exec_lo, s0
; GFX10-W32-NEXT:    s_cbranch_execnz .LBB21_1
; GFX10-W32-NEXT:  ; %bb.2: ; %endloop
; GFX10-W32-NEXT:    s_or_b32 exec_lo, exec_lo, s0
; GFX10-W32-NEXT:    ; return to shader part epilog
main_body:
  %src0 = load volatile float, float addrspace(1)* undef
  ; use mbcnt to make sure the branch is divergent
  ; NOTE(review): %hi is never used; the loop counter below is seeded from %lo,
  ; which is why only v_mbcnt_lo appears in the assertions. Left unchanged so
  ; the IR keeps matching the generated checks.
  %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
  %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
  br label %loop

loop:
  %counter = phi i32 [ %lo, %main_body ], [ %counter.1, %loop ]
  %src1 = load volatile float, float addrspace(1)* undef
  %out = fadd float %src0, %src1
  %out.0 = call float @llvm.amdgcn.strict.wqm.f32(float %out)
  %counter.1 = sub i32 %counter, 1
  %cc = icmp ne i32 %counter.1, 0
  br i1 %cc, label %loop, label %endloop

endloop:
  ret float %out.0
}

; Check that enabling WQM anywhere enables WQM for the set.inactive source.
define amdgpu_ps void @test_set_inactive2(i32 inreg %idx0, i32 inreg %idx1) {
; GFX9-W64-LABEL: test_set_inactive2:
; GFX9-W64:       ; %bb.0: ; %main_body
; GFX9-W64-NEXT:    s_mov_b64 s[2:3], exec
; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
; GFX9-W64-NEXT:    v_mov_b32_e32 v0, s1
; GFX9-W64-NEXT:    v_mov_b32_e32 v2, s0
; GFX9-W64-NEXT:    buffer_load_dword v1, v0, s[0:3], 0 idxen
; GFX9-W64-NEXT:    s_nop 0
; GFX9-W64-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 idxen
; GFX9-W64-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $scc killed $exec
; GFX9-W64-NEXT:    ; kill: def $vgpr2 killed $vgpr2 killed $exec killed $exec
; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[2:3]
; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
; GFX9-W64-NEXT:    v_add_u32_e32 v1, v2, v1
; GFX9-W64-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 idxen
; GFX9-W64-NEXT:    s_endpgm
;
; GFX10-W32-LABEL: test_set_inactive2:
; GFX10-W32:       ; %bb.0: ; %main_body
; GFX10-W32-NEXT:    s_mov_b32 s2, exec_lo
; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT:    v_mov_b32_e32 v0, s1
; GFX10-W32-NEXT:    v_mov_b32_e32 v1, s0
; GFX10-W32-NEXT:    s_clause 0x1
; GFX10-W32-NEXT:    buffer_load_dword v2, v0, s[0:3], 0 idxen
; GFX10-W32-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
; GFX10-W32-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $scc killed $exec
; GFX10-W32-NEXT:    ; kill: def $vgpr1 killed $vgpr1 killed $exec killed $exec
; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s2
; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
; GFX10-W32-NEXT:    v_add_nc_u32_e32 v1, v1, v2
; GFX10-W32-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 idxen
; GFX10-W32-NEXT:    s_endpgm
main_body:
  %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
  %src1.0 = bitcast float %src1 to i32
  %src1.1 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %src1.0, i32 undef)
  %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
  %src0.0 = bitcast float %src0 to i32
  %src0.1 = call i32 @llvm.amdgcn.wqm.i32(i32 %src0.0)
  %out = add i32 %src0.1, %src1.1
  %out.0 = bitcast i32 %out to float
  call void @llvm.amdgcn.struct.buffer.store.f32(float %out.0, <4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
  ret void
}

; Check a case of one branch of an if-else requiring WQM, the other requiring
; exact.
; Note: In this particular case, the save-and-restore could be avoided if the
; analysis understood that the two branches of the if-else are mutually
; exclusive.
define amdgpu_ps float @test_control_flow_0(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %c, i32 %z, float %data) {
; GFX9-W64-LABEL: test_control_flow_0:
; GFX9-W64:       ; %bb.0: ; %main_body
; GFX9-W64-NEXT:    s_mov_b64 s[12:13], exec
; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
; GFX9-W64-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
; GFX9-W64-NEXT:    s_and_saveexec_b64 s[14:15], vcc
; GFX9-W64-NEXT:    s_xor_b64 s[14:15], exec, s[14:15]
; GFX9-W64-NEXT:    s_cbranch_execz .LBB23_2
; GFX9-W64-NEXT:  ; %bb.1: ; %ELSE
; GFX9-W64-NEXT:    s_and_saveexec_b64 s[16:17], s[12:13]
; GFX9-W64-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 idxen
; GFX9-W64-NEXT:    ; implicit-def: $vgpr0
; GFX9-W64-NEXT:    s_mov_b64 exec, s[16:17]
; GFX9-W64-NEXT:  .LBB23_2: ; %Flow
; GFX9-W64-NEXT:    s_or_saveexec_b64 s[14:15], s[14:15]
; GFX9-W64-NEXT:    s_xor_b64 exec, exec, s[14:15]
; GFX9-W64-NEXT:    s_cbranch_execz .LBB23_4
; GFX9-W64-NEXT:  ; %bb.3: ; %IF
; GFX9-W64-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1
; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
; GFX9-W64-NEXT:    image_sample v2, v0, s[0:7], s[8:11] dmask:0x1
; GFX9-W64-NEXT:  .LBB23_4: ; %END
; GFX9-W64-NEXT:    s_or_b64 exec, exec, s[14:15]
; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[12:13]
; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v2
; GFX9-W64-NEXT:    ; return to shader part epilog
;
; GFX10-W32-LABEL: test_control_flow_0:
; GFX10-W32:       ; %bb.0: ; %main_body
; GFX10-W32-NEXT:    s_mov_b32 s12, exec_lo
; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v1
; GFX10-W32-NEXT:    s_and_saveexec_b32 s13, vcc_lo
; GFX10-W32-NEXT:    s_xor_b32 s13, exec_lo, s13
; GFX10-W32-NEXT:    s_cbranch_execz .LBB23_2
; GFX10-W32-NEXT:  ; %bb.1: ; %ELSE
; GFX10-W32-NEXT:    s_and_saveexec_b32 s14, s12
; GFX10-W32-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 idxen
; GFX10-W32-NEXT:    ; implicit-def: $vgpr0
; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s14
; GFX10-W32-NEXT:  .LBB23_2: ; %Flow
; GFX10-W32-NEXT:    s_or_saveexec_b32 s13, s13
; GFX10-W32-NEXT:    s_xor_b32 exec_lo, exec_lo, s13
; GFX10-W32-NEXT:    s_cbranch_execz .LBB23_4
; GFX10-W32-NEXT:  ; %bb.3: ; %IF
; GFX10-W32-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
; GFX10-W32-NEXT:    image_sample v2, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
; GFX10-W32-NEXT:  .LBB23_4: ; %END
; GFX10-W32-NEXT:    s_or_b32 exec_lo, exec_lo, s13
; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s12
; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v2
; GFX10-W32-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-W32-NEXT:    ; return to shader part epilog
main_body:
  %cmp = icmp eq i32 %z, 0
  br i1 %cmp, label %IF, label %ELSE

IF:
  %c.bc = bitcast i32 %c to float
  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
  %tex0 = extractelement <4 x float> %tex, i32 0
  %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
  %data.if = extractelement <4 x float> %dtex, i32 0
  br label %END

ELSE:
  call void @llvm.amdgcn.struct.buffer.store.f32(float %data, <4 x i32> undef, i32 %c, i32 0, i32 0, i32 0)
  br label %END

END:
  %r = phi float [ %data.if, %IF ], [ %data, %ELSE ]
  ret float %r
}

; Reverse branch order compared to the previous test.
define amdgpu_ps float @test_control_flow_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %c, i32 %z, float %data) {
; GFX9-W64-LABEL: test_control_flow_1:
; GFX9-W64:       ; %bb.0: ; %main_body
; GFX9-W64-NEXT:    s_mov_b64 s[12:13], exec
; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
; GFX9-W64-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
; GFX9-W64-NEXT:    s_and_saveexec_b64 s[14:15], vcc
; GFX9-W64-NEXT:    s_xor_b64 s[14:15], exec, s[14:15]
; GFX9-W64-NEXT:    s_cbranch_execz .LBB24_2
; GFX9-W64-NEXT:  ; %bb.1: ; %IF
; GFX9-W64-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1
; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
; GFX9-W64-NEXT:    image_sample v2, v0, s[0:7], s[8:11] dmask:0x1
; GFX9-W64-NEXT:    ; implicit-def: $vgpr0
; GFX9-W64-NEXT:  .LBB24_2: ; %Flow
; GFX9-W64-NEXT:    s_or_saveexec_b64 s[0:1], s[14:15]
; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[12:13]
; GFX9-W64-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
; GFX9-W64-NEXT:    s_xor_b64 exec, exec, s[0:1]
; GFX9-W64-NEXT:    s_cbranch_execz .LBB24_4
; GFX9-W64-NEXT:  ; %bb.3: ; %ELSE
; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
; GFX9-W64-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 idxen
; GFX9-W64-NEXT:  .LBB24_4: ; %END
; GFX9-W64-NEXT:    s_or_b64 exec, exec, s[0:1]
; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v2
; GFX9-W64-NEXT:    ; return to shader part epilog
;
; GFX10-W32-LABEL: test_control_flow_1:
; GFX10-W32:       ; %bb.0: ; %main_body
; GFX10-W32-NEXT:    s_mov_b32 s12, exec_lo
; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v1
; GFX10-W32-NEXT:    s_and_saveexec_b32 s13, vcc_lo
; GFX10-W32-NEXT:    s_xor_b32 s13, exec_lo, s13
; GFX10-W32-NEXT:    s_cbranch_execz .LBB24_2
; GFX10-W32-NEXT:  ; %bb.1: ; %IF
; GFX10-W32-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
; GFX10-W32-NEXT:    image_sample v2, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
; GFX10-W32-NEXT:    ; implicit-def: $vgpr0
; GFX10-W32-NEXT:  .LBB24_2: ; %Flow
; GFX10-W32-NEXT:    s_or_saveexec_b32 s0, s13
; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s12
; GFX10-W32-NEXT:    s_and_b32 s0, exec_lo, s0
; GFX10-W32-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
; GFX10-W32-NEXT:    s_cbranch_execz .LBB24_4
; GFX10-W32-NEXT:  ; %bb.3: ; %ELSE
; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
; GFX10-W32-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 idxen
; GFX10-W32-NEXT:  .LBB24_4: ; %END
; GFX10-W32-NEXT:    s_or_b32 exec_lo, exec_lo, s0
; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v2
; GFX10-W32-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-W32-NEXT:    ; return to shader part epilog
main_body:
  %cmp = icmp eq i32 %z, 0
  br i1 %cmp, label %ELSE, label %IF

IF:
  %c.bc = bitcast i32 %c to float
  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
  %tex0 = extractelement <4 x float> %tex, i32 0
  %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
  %data.if = extractelement <4 x float> %dtex, i32 0
  br label %END

ELSE:
  call void @llvm.amdgcn.struct.buffer.store.f32(float %data, <4 x i32> undef, i32 %c, i32 0, i32 0, i32 0)
  br label %END

END:
  %r = phi float [ %data.if, %IF ], [ %data, %ELSE ]
  ret float %r
}

; Check that branch conditions are properly marked as needing WQM...
define amdgpu_ps <4 x float> @test_control_flow_2(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, <3 x i32> %idx, <2 x float> %data, i32 %coord) {
; GFX9-W64-LABEL: test_control_flow_2:
; GFX9-W64:       ; %bb.0: ; %main_body
; GFX9-W64-NEXT:    s_mov_b64 s[12:13], exec
; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[12:13]
; GFX9-W64-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 idxen
; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
; GFX9-W64-NEXT:    buffer_load_dword v0, v1, s[0:3], 0 idxen
; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[12:13]
; GFX9-W64-NEXT:    buffer_store_dword v4, v2, s[0:3], 0 idxen
; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
; GFX9-W64-NEXT:    s_waitcnt vmcnt(1)
; GFX9-W64-NEXT:    v_cmp_nlt_f32_e32 vcc, 0, v0
; GFX9-W64-NEXT:    ; implicit-def: $vgpr0
; GFX9-W64-NEXT:    s_and_saveexec_b64 s[14:15], vcc
; GFX9-W64-NEXT:    s_xor_b64 s[14:15], exec, s[14:15]
; GFX9-W64-NEXT:  ; %bb.1: ; %ELSE
; GFX9-W64-NEXT:    v_lshlrev_b32_e32 v0, 2, v5
; GFX9-W64-NEXT:    ; implicit-def: $vgpr5
; GFX9-W64-NEXT:  ; %bb.2: ; %Flow
; GFX9-W64-NEXT:    s_or_saveexec_b64 s[14:15], s[14:15]
; GFX9-W64-NEXT:    s_xor_b64 exec, exec, s[14:15]
; GFX9-W64-NEXT:  ; %bb.3: ; %IF
; GFX9-W64-NEXT:    v_mul_lo_u32 v0, v5, 3
; GFX9-W64-NEXT:  ; %bb.4: ; %END
; GFX9-W64-NEXT:    s_or_b64 exec, exec, s[14:15]
; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[12:13]
; GFX9-W64-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf
; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
; GFX9-W64-NEXT:    ; return to shader part epilog
;
; GFX10-W32-LABEL: test_control_flow_2:
; GFX10-W32:       ; %bb.0: ; %main_body
; GFX10-W32-NEXT:    s_mov_b32 s12, exec_lo
; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s12
; GFX10-W32-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 idxen
; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT:    buffer_load_dword v0, v1, s[0:3], 0 idxen
; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
; GFX10-W32-NEXT:    v_cmp_nlt_f32_e32 vcc_lo, 0, v0
; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s12
; GFX10-W32-NEXT:    buffer_store_dword v4, v2, s[0:3], 0 idxen
; GFX10-W32-NEXT:    ; implicit-def: $vgpr0
; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT:    s_and_saveexec_b32 s13, vcc_lo
; GFX10-W32-NEXT:    s_xor_b32 s13, exec_lo, s13
; GFX10-W32-NEXT:  ; %bb.1: ; %ELSE
; GFX10-W32-NEXT:    v_lshlrev_b32_e32 v0, 2, v5
; GFX10-W32-NEXT:    ; implicit-def: $vgpr5
; GFX10-W32-NEXT:  ; %bb.2: ; %Flow
; GFX10-W32-NEXT:    s_or_saveexec_b32 s13, s13
; GFX10-W32-NEXT:    s_xor_b32 exec_lo, exec_lo, s13
; GFX10-W32-NEXT:  ; %bb.3: ; %IF
; GFX10-W32-NEXT:    v_mul_lo_u32 v0, v5, 3
; GFX10-W32-NEXT:  ; %bb.4: ; %END
; GFX10-W32-NEXT:    s_or_b32 exec_lo, exec_lo, s13
; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s12
; GFX10-W32-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
; GFX10-W32-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-W32-NEXT:    ; return to shader part epilog
main_body:
  %idx.1 = extractelement <3 x i32> %idx, i32 0
  %data.1 = extractelement <2 x float> %data, i32 0
  call void @llvm.amdgcn.struct.buffer.store.f32(float %data.1, <4 x i32> undef, i32 %idx.1, i32 0, i32 0, i32 0)

  ; The load that determines the branch (and should therefore be WQM) is
  ; surrounded by stores that require disabled WQM.
1442 %idx.2 = extractelement <3 x i32> %idx, i32 1 1443 %z = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx.2, i32 0, i32 0, i32 0) 1444 1445 %idx.3 = extractelement <3 x i32> %idx, i32 2 1446 %data.3 = extractelement <2 x float> %data, i32 1 1447 call void @llvm.amdgcn.struct.buffer.store.f32(float %data.3, <4 x i32> undef, i32 %idx.3, i32 0, i32 0, i32 0) 1448 1449 %cc = fcmp ogt float %z, 0.0 1450 br i1 %cc, label %IF, label %ELSE 1451 1452IF: 1453 %coord.IF = mul i32 %coord, 3 1454 br label %END 1455 1456ELSE: 1457 %coord.ELSE = mul i32 %coord, 4 1458 br label %END 1459 1460END: 1461 %coord.END = phi i32 [ %coord.IF, %IF ], [ %coord.ELSE, %ELSE ] 1462 %coord.END.bc = bitcast i32 %coord.END to float 1463 %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord.END.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0 1464 ret <4 x float> %tex 1465} 1466 1467; ... but only if they really do need it. 1468define amdgpu_ps float @test_control_flow_3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %coord) { 1469; GFX9-W64-LABEL: test_control_flow_3: 1470; GFX9-W64: ; %bb.0: ; %main_body 1471; GFX9-W64-NEXT: s_mov_b64 s[12:13], exec 1472; GFX9-W64-NEXT: s_wqm_b64 exec, exec 1473; GFX9-W64-NEXT: image_sample v1, v1, s[0:7], s[8:11] dmask:0x1 1474; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13] 1475; GFX9-W64-NEXT: s_waitcnt vmcnt(0) 1476; GFX9-W64-NEXT: image_sample v1, v1, s[0:7], s[8:11] dmask:0x1 1477; GFX9-W64-NEXT: s_waitcnt vmcnt(0) 1478; GFX9-W64-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v1 1479; GFX9-W64-NEXT: buffer_store_dword v1, v0, s[0:3], 0 idxen 1480; GFX9-W64-NEXT: ; implicit-def: $vgpr0 1481; GFX9-W64-NEXT: s_and_saveexec_b64 s[0:1], vcc 1482; GFX9-W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1] 1483; GFX9-W64-NEXT: ; %bb.1: ; %ELSE 1484; GFX9-W64-NEXT: v_mul_f32_e32 v0, 4.0, v1 1485; GFX9-W64-NEXT: ; implicit-def: $vgpr1 1486; GFX9-W64-NEXT: ; %bb.2: ; %Flow 1487; GFX9-W64-NEXT: 
s_or_saveexec_b64 s[0:1], s[0:1] 1488; GFX9-W64-NEXT: s_xor_b64 exec, exec, s[0:1] 1489; GFX9-W64-NEXT: ; %bb.3: ; %IF 1490; GFX9-W64-NEXT: v_mul_f32_e32 v0, 0x40400000, v1 1491; GFX9-W64-NEXT: ; %bb.4: ; %END 1492; GFX9-W64-NEXT: s_or_b64 exec, exec, s[0:1] 1493; GFX9-W64-NEXT: s_waitcnt vmcnt(0) 1494; GFX9-W64-NEXT: ; return to shader part epilog 1495; 1496; GFX10-W32-LABEL: test_control_flow_3: 1497; GFX10-W32: ; %bb.0: ; %main_body 1498; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo 1499; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo 1500; GFX10-W32-NEXT: image_sample v1, v1, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D 1501; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12 1502; GFX10-W32-NEXT: s_waitcnt vmcnt(0) 1503; GFX10-W32-NEXT: image_sample v1, v1, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D 1504; GFX10-W32-NEXT: s_waitcnt vmcnt(0) 1505; GFX10-W32-NEXT: v_cmp_nlt_f32_e32 vcc_lo, 0, v1 1506; GFX10-W32-NEXT: buffer_store_dword v1, v0, s[0:3], 0 idxen 1507; GFX10-W32-NEXT: ; implicit-def: $vgpr0 1508; GFX10-W32-NEXT: s_and_saveexec_b32 s0, vcc_lo 1509; GFX10-W32-NEXT: s_xor_b32 s0, exec_lo, s0 1510; GFX10-W32-NEXT: ; %bb.1: ; %ELSE 1511; GFX10-W32-NEXT: v_mul_f32_e32 v0, 4.0, v1 1512; GFX10-W32-NEXT: ; implicit-def: $vgpr1 1513; GFX10-W32-NEXT: ; %bb.2: ; %Flow 1514; GFX10-W32-NEXT: s_or_saveexec_b32 s0, s0 1515; GFX10-W32-NEXT: s_xor_b32 exec_lo, exec_lo, s0 1516; GFX10-W32-NEXT: ; %bb.3: ; %IF 1517; GFX10-W32-NEXT: v_mul_f32_e32 v0, 0x40400000, v1 1518; GFX10-W32-NEXT: ; %bb.4: ; %END 1519; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 1520; GFX10-W32-NEXT: s_waitcnt_vscnt null, 0x0 1521; GFX10-W32-NEXT: ; return to shader part epilog 1522main_body: 1523 %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0 1524 %tex0 = extractelement <4 x float> %tex, i32 0 1525 %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x 
i32> %sampler, i1 false, i32 0, i32 0) #0 1526 %dtex.1 = extractelement <4 x float> %dtex, i32 0 1527 call void @llvm.amdgcn.struct.buffer.store.f32(float %dtex.1, <4 x i32> undef, i32 %idx, i32 0, i32 0, i32 0) 1528 1529 %cc = fcmp ogt float %dtex.1, 0.0 1530 br i1 %cc, label %IF, label %ELSE 1531 1532IF: 1533 %tex.IF = fmul float %dtex.1, 3.0 1534 br label %END 1535 1536ELSE: 1537 %tex.ELSE = fmul float %dtex.1, 4.0 1538 br label %END 1539 1540END: 1541 %tex.END = phi float [ %tex.IF, %IF ], [ %tex.ELSE, %ELSE ] 1542 ret float %tex.END 1543} 1544 1545; Another test that failed at some point because of terminator handling. 1546define amdgpu_ps <4 x float> @test_control_flow_4(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float %coord, i32 %y, float %z) { 1547; GFX9-W64-LABEL: test_control_flow_4: 1548; GFX9-W64: ; %bb.0: ; %main_body 1549; GFX9-W64-NEXT: s_mov_b64 s[12:13], exec 1550; GFX9-W64-NEXT: s_wqm_b64 exec, exec 1551; GFX9-W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 1552; GFX9-W64-NEXT: s_and_saveexec_b64 s[14:15], vcc 1553; GFX9-W64-NEXT: s_cbranch_execz .LBB27_2 1554; GFX9-W64-NEXT: ; %bb.1: ; %IF 1555; GFX9-W64-NEXT: s_and_saveexec_b64 s[16:17], s[12:13] 1556; GFX9-W64-NEXT: buffer_load_dword v1, off, s[0:3], 0 1557; GFX9-W64-NEXT: v_mov_b32_e32 v2, 1 1558; GFX9-W64-NEXT: s_waitcnt vmcnt(0) 1559; GFX9-W64-NEXT: buffer_store_dword v1, v2, s[0:3], 0 idxen 1560; GFX9-W64-NEXT: s_mov_b64 exec, s[16:17] 1561; GFX9-W64-NEXT: .LBB27_2: ; %END 1562; GFX9-W64-NEXT: s_or_b64 exec, exec, s[14:15] 1563; GFX9-W64-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 1564; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13] 1565; GFX9-W64-NEXT: s_waitcnt vmcnt(0) 1566; GFX9-W64-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf 1567; GFX9-W64-NEXT: s_waitcnt vmcnt(0) 1568; GFX9-W64-NEXT: ; return to shader part epilog 1569; 1570; GFX10-W32-LABEL: test_control_flow_4: 1571; GFX10-W32: ; %bb.0: ; %main_body 1572; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo 1573; GFX10-W32-NEXT: 
s_wqm_b32 exec_lo, exec_lo 1574; GFX10-W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 1575; GFX10-W32-NEXT: s_and_saveexec_b32 s13, vcc_lo 1576; GFX10-W32-NEXT: s_cbranch_execz .LBB27_2 1577; GFX10-W32-NEXT: ; %bb.1: ; %IF 1578; GFX10-W32-NEXT: s_and_saveexec_b32 s14, s12 1579; GFX10-W32-NEXT: buffer_load_dword v1, off, s[0:3], 0 1580; GFX10-W32-NEXT: v_mov_b32_e32 v2, 1 1581; GFX10-W32-NEXT: s_waitcnt vmcnt(0) 1582; GFX10-W32-NEXT: buffer_store_dword v1, v2, s[0:3], 0 idxen 1583; GFX10-W32-NEXT: s_mov_b32 exec_lo, s14 1584; GFX10-W32-NEXT: .LBB27_2: ; %END 1585; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s13 1586; GFX10-W32-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D 1587; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12 1588; GFX10-W32-NEXT: s_waitcnt vmcnt(0) 1589; GFX10-W32-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D 1590; GFX10-W32-NEXT: s_waitcnt vmcnt(0) 1591; GFX10-W32-NEXT: s_waitcnt_vscnt null, 0x0 1592; GFX10-W32-NEXT: ; return to shader part epilog 1593main_body: 1594 %cond = icmp eq i32 %y, 0 1595 br i1 %cond, label %IF, label %END 1596 1597IF: 1598 %data = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> undef, i32 0, i32 0, i32 0) 1599 call void @llvm.amdgcn.struct.buffer.store.f32(float %data, <4 x i32> undef, i32 1, i32 0, i32 0, i32 0) 1600 br label %END 1601 1602END: 1603 %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0 1604 %tex0 = extractelement <4 x float> %tex, i32 0 1605 %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0 1606 ret <4 x float> %dtex 1607} 1608 1609; Kill is performed in WQM mode so that uniform kill behaves correctly ... 
; Expected shape for both RUN configurations: the first sample and the first
; buffer store are issued after exec has been narrowed to the saved mask, WQM
; is re-entered for the kill comparison and the %coord2 sample, and exec is
; narrowed again before the final sample.
define amdgpu_ps <4 x float> @test_kill_0(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <2 x i32> %idx, <2 x float> %data, float %coord, float %coord2, float %z) {
; GFX9-W64-LABEL: test_kill_0:
; GFX9-W64:       ; %bb.0: ; %main_body
; GFX9-W64-NEXT:    s_mov_b64 s[12:13], exec
; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[12:13]
; GFX9-W64-NEXT:    image_sample v[7:10], v4, s[0:7], s[8:11] dmask:0xf
; GFX9-W64-NEXT:    s_nop 0
; GFX9-W64-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 idxen
; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
; GFX9-W64-NEXT:    v_cmp_ngt_f32_e32 vcc, 0, v6
; GFX9-W64-NEXT:    s_andn2_b64 s[12:13], s[12:13], vcc
; GFX9-W64-NEXT:    s_cbranch_scc0 .LBB28_2
; GFX9-W64-NEXT:  ; %bb.1: ; %main_body
; GFX9-W64-NEXT:    s_andn2_b64 exec, exec, vcc
; GFX9-W64-NEXT:    image_sample v0, v5, s[0:7], s[8:11] dmask:0x1
; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[12:13]
; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
; GFX9-W64-NEXT:    image_sample v[11:14], v0, s[0:7], s[8:11] dmask:0xf
; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
; GFX9-W64-NEXT:    v_add_f32_e32 v0, v7, v11
; GFX9-W64-NEXT:    buffer_store_dword v3, v1, s[0:3], 0 idxen
; GFX9-W64-NEXT:    v_add_f32_e32 v1, v8, v12
; GFX9-W64-NEXT:    v_add_f32_e32 v2, v9, v13
; GFX9-W64-NEXT:    v_add_f32_e32 v3, v10, v14
; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
; GFX9-W64-NEXT:    s_branch .LBB28_3
; GFX9-W64-NEXT:  .LBB28_2:
; GFX9-W64-NEXT:    s_mov_b64 exec, 0
; GFX9-W64-NEXT:    exp null off, off, off, off done vm
; GFX9-W64-NEXT:    s_endpgm
; GFX9-W64-NEXT:  .LBB28_3:
;
; GFX10-W32-LABEL: test_kill_0:
; GFX10-W32:       ; %bb.0: ; %main_body
; GFX10-W32-NEXT:    s_mov_b32 s12, exec_lo
; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s12
; GFX10-W32-NEXT:    image_sample v[7:10], v4, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
; GFX10-W32-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 idxen
; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, 0, v6
; GFX10-W32-NEXT:    s_andn2_b32 s12, s12, vcc_lo
; GFX10-W32-NEXT:    s_cbranch_scc0 .LBB28_2
; GFX10-W32-NEXT:  ; %bb.1: ; %main_body
; GFX10-W32-NEXT:    s_andn2_b32 exec_lo, exec_lo, vcc_lo
; GFX10-W32-NEXT:    image_sample v0, v5, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s12
; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
; GFX10-W32-NEXT:    image_sample v[11:14], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
; GFX10-W32-NEXT:    buffer_store_dword v3, v1, s[0:3], 0 idxen
; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
; GFX10-W32-NEXT:    v_add_f32_e32 v4, v8, v12
; GFX10-W32-NEXT:    v_add_f32_e32 v5, v10, v14
; GFX10-W32-NEXT:    v_add_f32_e32 v0, v7, v11
; GFX10-W32-NEXT:    v_add_f32_e32 v2, v9, v13
; GFX10-W32-NEXT:    v_mov_b32_e32 v1, v4
; GFX10-W32-NEXT:    v_mov_b32_e32 v3, v5
; GFX10-W32-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-W32-NEXT:    s_branch .LBB28_3
; GFX10-W32-NEXT:  .LBB28_2:
; GFX10-W32-NEXT:    s_mov_b32 exec_lo, 0
; GFX10-W32-NEXT:    exp null off, off, off, off done vm
; GFX10-W32-NEXT:    s_endpgm
; GFX10-W32-NEXT:  .LBB28_3:
main_body:
  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
  %idx.0 = extractelement <2 x i32> %idx, i32 0
  %data.0 = extractelement <2 x float> %data, i32 0
  call void @llvm.amdgcn.struct.buffer.store.f32(float %data.0, <4 x i32> undef, i32 %idx.0, i32 0, i32 0, i32 0)

  %z.cmp = fcmp olt float %z, 0.0
  call void @llvm.amdgcn.kill(i1 %z.cmp)

  %idx.1 = extractelement <2 x i32> %idx, i32 1
  %data.1 = extractelement <2 x float> %data, i32 1
  call void @llvm.amdgcn.struct.buffer.store.f32(float %data.1, <4 x i32> undef, i32 %idx.1, i32 0, i32 0, i32 0)
  %tex2 = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord2, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
  %tex2.0 = extractelement <4 x float> %tex2, i32 0
  %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex2.0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
  %out = fadd <4 x float> %tex, %dtex

  ret <4 x float> %out
}

; ... but only if WQM is necessary.
; In this variant the buffer store and the kill come after the last sample
; whose input is computed in WQM, so exec is narrowed to the saved mask once
; and WQM is not re-entered before the kill.
define amdgpu_ps <4 x float> @test_kill_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %data, float %coord, float %coord2, float %z) {
; GFX9-W64-LABEL: test_kill_1:
; GFX9-W64:       ; %bb.0: ; %main_body
; GFX9-W64-NEXT:    s_mov_b64 s[12:13], exec
; GFX9-W64-NEXT:    v_mov_b32_e32 v4, v2
; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
; GFX9-W64-NEXT:    v_mov_b32_e32 v5, v0
; GFX9-W64-NEXT:    image_sample v0, v1, s[0:7], s[8:11] dmask:0x1
; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[12:13]
; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
; GFX9-W64-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf
; GFX9-W64-NEXT:    v_cmp_ngt_f32_e32 vcc, 0, v4
; GFX9-W64-NEXT:    s_andn2_b64 s[12:13], s[12:13], vcc
; GFX9-W64-NEXT:    buffer_store_dword v5, off, s[0:3], 0
; GFX9-W64-NEXT:    s_cbranch_scc0 .LBB29_2
; GFX9-W64-NEXT:  ; %bb.1: ; %main_body
; GFX9-W64-NEXT:    s_andn2_b64 exec, exec, vcc
; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
; GFX9-W64-NEXT:    s_branch .LBB29_3
; GFX9-W64-NEXT:  .LBB29_2:
; GFX9-W64-NEXT:    s_mov_b64 exec, 0
; GFX9-W64-NEXT:    exp null off, off, off, off done vm
; GFX9-W64-NEXT:    s_endpgm
; GFX9-W64-NEXT:  .LBB29_3:
;
; GFX10-W32-LABEL: test_kill_1:
; GFX10-W32:       ; %bb.0: ; %main_body
; GFX10-W32-NEXT:    s_mov_b32 s12, exec_lo
; GFX10-W32-NEXT:    v_mov_b32_e32 v4, v2
; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT:    v_mov_b32_e32 v5, v0
; GFX10-W32-NEXT:    image_sample v0, v1, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s12
; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
; GFX10-W32-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
; GFX10-W32-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, 0, v4
; GFX10-W32-NEXT:    buffer_store_dword v5, off, s[0:3], 0
; GFX10-W32-NEXT:    s_andn2_b32 s12, s12, vcc_lo
; GFX10-W32-NEXT:    s_cbranch_scc0 .LBB29_2
; GFX10-W32-NEXT:  ; %bb.1: ; %main_body
; GFX10-W32-NEXT:    s_andn2_b32 exec_lo, exec_lo, vcc_lo
; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
; GFX10-W32-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-W32-NEXT:    s_branch .LBB29_3
; GFX10-W32-NEXT:  .LBB29_2:
; GFX10-W32-NEXT:    s_mov_b32 exec_lo, 0
; GFX10-W32-NEXT:    exp null off, off, off, off done vm
; GFX10-W32-NEXT:    s_endpgm
; GFX10-W32-NEXT:  .LBB29_3:
main_body:
  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
  %tex0 = extractelement <4 x float> %tex, i32 0
  %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0

  call void @llvm.amdgcn.raw.buffer.store.f32(float %data, <4 x i32> undef, i32 0, i32 0, i32 0)

  %z.cmp = fcmp olt float %z, 0.0
  call void @llvm.amdgcn.kill(i1 %z.cmp)

  ret <4 x float> %dtex
}

; Check prolog shaders.
; The fadd is emitted between s_wqm and the s_and that narrows exec back to
; the saved mask, i.e. it is computed in WQM and exec is exact again before
; the return. (Attribute group #5 is defined elsewhere in this file.)
define amdgpu_ps float @test_prolog_1(float %a, float %b) #5 {
; GFX9-W64-LABEL: test_prolog_1:
; GFX9-W64:       ; %bb.0: ; %main_body
; GFX9-W64-NEXT:    s_mov_b64 s[0:1], exec
; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
; GFX9-W64-NEXT:    v_add_f32_e32 v0, v0, v1
; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[0:1]
; GFX9-W64-NEXT:    ; return to shader part epilog
;
; GFX10-W32-LABEL: test_prolog_1:
; GFX10-W32:       ; %bb.0: ; %main_body
; GFX10-W32-NEXT:    s_mov_b32 s0, exec_lo
; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT:    v_add_f32_e32 v0, v0, v1
; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s0
; GFX10-W32-NEXT:    ; return to shader part epilog
main_body:
  %s = fadd float %a, %b
  ret float %s
}

; Loop whose exit condition is branched on through vcc (s_cbranch_vccz).
; The image store in %entry is issued after exec has been narrowed to the
; saved mask; WQM is re-entered before the loop, whose %body contains the
; image sample, and exec is narrowed again in %break.
define amdgpu_ps <4 x float> @test_loop_vcc(<4 x float> %in) nounwind {
; GFX9-W64-LABEL: test_loop_vcc:
; GFX9-W64:       ; %bb.0: ; %entry
; GFX9-W64-NEXT:    s_mov_b64 s[8:9], exec
; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
; GFX9-W64-NEXT:    v_mov_b32_e32 v7, v3
; GFX9-W64-NEXT:    v_mov_b32_e32 v6, v2
; GFX9-W64-NEXT:    v_mov_b32_e32 v5, v1
; GFX9-W64-NEXT:    v_mov_b32_e32 v4, v0
; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[8:9]
; GFX9-W64-NEXT:    s_mov_b32 s0, 0
; GFX9-W64-NEXT:    s_mov_b32 s1, s0
; GFX9-W64-NEXT:    s_mov_b32 s2, s0
; GFX9-W64-NEXT:    s_mov_b32 s3, s0
; GFX9-W64-NEXT:    s_mov_b32 s4, s0
; GFX9-W64-NEXT:    s_mov_b32 s5, s0
; GFX9-W64-NEXT:    s_mov_b32 s6, s0
; GFX9-W64-NEXT:    s_mov_b32 s7, s0
; GFX9-W64-NEXT:    image_store v[4:7], v0, s[0:7] dmask:0xf unorm
; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
; GFX9-W64-NEXT:    v_mov_b32_e32 v8, 0
; GFX9-W64-NEXT:    s_mov_b32 s10, 0x40e00000
; GFX9-W64-NEXT:    s_branch .LBB31_2
; GFX9-W64-NEXT:  .LBB31_1: ; %body
; GFX9-W64-NEXT:    ; in Loop: Header=BB31_2 Depth=1
; GFX9-W64-NEXT:    s_mov_b32 s1, s0
; GFX9-W64-NEXT:    s_mov_b32 s2, s0
; GFX9-W64-NEXT:    s_mov_b32 s3, s0
; GFX9-W64-NEXT:    s_mov_b32 s4, s0
; GFX9-W64-NEXT:    s_mov_b32 s5, s0
; GFX9-W64-NEXT:    s_mov_b32 s6, s0
; GFX9-W64-NEXT:    s_mov_b32 s7, s0
; GFX9-W64-NEXT:    image_sample v[4:7], v0, s[0:7], s[0:3] dmask:0xf
; GFX9-W64-NEXT:    v_add_f32_e32 v8, 2.0, v8
; GFX9-W64-NEXT:    s_mov_b64 s[2:3], 0
; GFX9-W64-NEXT:    s_cbranch_execz .LBB31_4
; GFX9-W64-NEXT:  .LBB31_2: ; %loop
; GFX9-W64-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v4
; GFX9-W64-NEXT:    v_cmp_lt_f32_e32 vcc, s10, v8
; GFX9-W64-NEXT:    v_mov_b32_e32 v1, v5
; GFX9-W64-NEXT:    v_mov_b32_e32 v2, v6
; GFX9-W64-NEXT:    v_mov_b32_e32 v3, v7
; GFX9-W64-NEXT:    s_and_b64 vcc, exec, vcc
; GFX9-W64-NEXT:    s_cbranch_vccz .LBB31_1
; GFX9-W64-NEXT:  ; %bb.3:
; GFX9-W64-NEXT:    s_mov_b64 s[2:3], -1
; GFX9-W64-NEXT:    ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7
; GFX9-W64-NEXT:    ; implicit-def: $vgpr8
; GFX9-W64-NEXT:  .LBB31_4: ; %break
; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[8:9]
; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
; GFX9-W64-NEXT:    ; return to shader part epilog
;
; GFX10-W32-LABEL: test_loop_vcc:
; GFX10-W32:       ; %bb.0: ; %entry
; GFX10-W32-NEXT:    s_mov_b32 s8, exec_lo
; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT:    v_mov_b32_e32 v8, 0
; GFX10-W32-NEXT:    s_mov_b32 s0, 0
; GFX10-W32-NEXT:    s_mov_b32 s1, s0
; GFX10-W32-NEXT:    s_mov_b32 s2, s0
; GFX10-W32-NEXT:    s_mov_b32 s3, s0
; GFX10-W32-NEXT:    s_mov_b32 s4, s0
; GFX10-W32-NEXT:    s_mov_b32 s5, s0
; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s8
; GFX10-W32-NEXT:    s_mov_b32 s6, s0
; GFX10-W32-NEXT:    s_mov_b32 s7, s0
; GFX10-W32-NEXT:    image_store v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm
; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT:    s_branch .LBB31_2
; GFX10-W32-NEXT:    .p2align 6
; GFX10-W32-NEXT:  .LBB31_1: ; %body
; GFX10-W32-NEXT:    ; in Loop: Header=BB31_2 Depth=1
; GFX10-W32-NEXT:    s_mov_b32 s1, s0
; GFX10-W32-NEXT:    s_mov_b32 s2, s0
; GFX10-W32-NEXT:    s_mov_b32 s3, s0
; GFX10-W32-NEXT:    s_mov_b32 s4, s0
; GFX10-W32-NEXT:    s_mov_b32 s5, s0
; GFX10-W32-NEXT:    s_mov_b32 s6, s0
; GFX10-W32-NEXT:    s_mov_b32 s7, s0
; GFX10-W32-NEXT:    v_add_f32_e32 v8, 2.0, v8
; GFX10-W32-NEXT:    image_sample v[0:3], v4, s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_1D
; GFX10-W32-NEXT:    s_mov_b32 s1, 0
; GFX10-W32-NEXT:    s_cbranch_execz .LBB31_4
; GFX10-W32-NEXT:  .LBB31_2: ; %loop
; GFX10-W32-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX10-W32-NEXT:    v_cmp_lt_f32_e32 vcc_lo, 0x40e00000, v8
; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
; GFX10-W32-NEXT:    v_mov_b32_e32 v7, v3
; GFX10-W32-NEXT:    v_mov_b32_e32 v6, v2
; GFX10-W32-NEXT:    v_mov_b32_e32 v5, v1
; GFX10-W32-NEXT:    v_mov_b32_e32 v4, v0
; GFX10-W32-NEXT:    s_and_b32 vcc_lo, exec_lo, vcc_lo
; GFX10-W32-NEXT:    s_cbranch_vccz .LBB31_1
; GFX10-W32-NEXT:  ; %bb.3:
; GFX10-W32-NEXT:    s_mov_b32 s1, -1
; GFX10-W32-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
; GFX10-W32-NEXT:    ; implicit-def: $vgpr8
; GFX10-W32-NEXT:  .LBB31_4: ; %break
; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s8
; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v4
; GFX10-W32-NEXT:    v_mov_b32_e32 v1, v5
; GFX10-W32-NEXT:    v_mov_b32_e32 v2, v6
; GFX10-W32-NEXT:    v_mov_b32_e32 v3, v7
; GFX10-W32-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-W32-NEXT:    ; return to shader part epilog
entry:
  call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> %in, i32 15, i32 undef, <8 x i32> undef, i32 0, i32 0)
  br label %loop

loop:
  %ctr.iv = phi float [ 0.0, %entry ], [ %ctr.next, %body ]
  %c.iv = phi <4 x float> [ %in, %entry ], [ %c.next, %body ]
  %cc = fcmp ogt float %ctr.iv, 7.0
  br i1 %cc, label %break, label %body

body:
  %c.iv0 = extractelement <4 x float> %c.iv, i32 0
  %c.next = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.iv0, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0) #0
  %ctr.next = fadd float %ctr.iv, 2.0
  br label %loop

break:
  ret <4 x float> %c.iv
}

; Only intrinsic stores need exact execution -- other stores do not have
; externally visible effects and may require WQM for correctness.
; Expected shape for both RUN configurations: each buffer-intrinsic store is
; issued after exec has been narrowed to the saved mask, while the volatile
; store to the alloca and the later load from it (both addressed through the
; scratch descriptor in s[8:11]) are issued after s_wqm. Exec is narrowed
; again before the final sample and the dwordx4 store.
define amdgpu_ps void @test_alloca(float %data, i32 %a, i32 %idx) nounwind {
; GFX9-W64-LABEL: test_alloca:
; GFX9-W64:       ; %bb.0: ; %entry
; GFX9-W64-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX9-W64-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX9-W64-NEXT:    s_mov_b32 s10, -1
; GFX9-W64-NEXT:    s_mov_b32 s11, 0xe00000
; GFX9-W64-NEXT:    s_add_u32 s8, s8, s0
; GFX9-W64-NEXT:    s_addc_u32 s9, s9, 0
; GFX9-W64-NEXT:    s_mov_b64 s[0:1], exec
; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[0:1]
; GFX9-W64-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
; GFX9-W64-NEXT:    buffer_store_dword v1, off, s[8:11], 0 offset:4
; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[0:1]
; GFX9-W64-NEXT:    v_mov_b32_e32 v1, 1
; GFX9-W64-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 idxen
; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
; GFX9-W64-NEXT:    v_mov_b32_e32 v0, 4
; GFX9-W64-NEXT:    v_lshl_add_u32 v0, v2, 2, v0
; GFX9-W64-NEXT:    buffer_load_dword v0, v0, s[8:11], 0 offen
; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[0:1]
; GFX9-W64-NEXT:    s_mov_b32 s0, 0
; GFX9-W64-NEXT:    s_mov_b32 s1, s0
; GFX9-W64-NEXT:    s_mov_b32 s2, s0
; GFX9-W64-NEXT:    s_mov_b32 s3, s0
; GFX9-W64-NEXT:    s_mov_b32 s4, s0
; GFX9-W64-NEXT:    s_mov_b32 s5, s0
; GFX9-W64-NEXT:    s_mov_b32 s6, s0
; GFX9-W64-NEXT:    s_mov_b32 s7, s0
; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
; GFX9-W64-NEXT:    image_sample v[0:3], v0, s[0:7], s[0:3] dmask:0xf
; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
; GFX9-W64-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GFX9-W64-NEXT:    s_endpgm
;
; GFX10-W32-LABEL: test_alloca:
; GFX10-W32:       ; %bb.0: ; %entry
; GFX10-W32-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX10-W32-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX10-W32-NEXT:    s_mov_b32 s10, -1
; GFX10-W32-NEXT:    s_mov_b32 s11, 0x31c16000
; GFX10-W32-NEXT:    s_add_u32 s8, s8, s0
; GFX10-W32-NEXT:    s_addc_u32 s9, s9, 0
; GFX10-W32-NEXT:    s_mov_b32 s0, exec_lo
; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT:    v_mov_b32_e32 v3, 1
; GFX10-W32-NEXT:    v_lshl_add_u32 v2, v2, 2, 4
; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s0
; GFX10-W32-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT:    buffer_store_dword v1, off, s[8:11], 0 offset:4
; GFX10-W32-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s0
; GFX10-W32-NEXT:    buffer_store_dword v0, v3, s[0:3], 0 idxen
; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT:    buffer_load_dword v0, v2, s[8:11], 0 offen
; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s0
; GFX10-W32-NEXT:    s_mov_b32 s0, 0
; GFX10-W32-NEXT:    s_mov_b32 s1, s0
; GFX10-W32-NEXT:    s_mov_b32 s2, s0
; GFX10-W32-NEXT:    s_mov_b32 s3, s0
; GFX10-W32-NEXT:    s_mov_b32 s4, s0
; GFX10-W32-NEXT:    s_mov_b32 s5, s0
; GFX10-W32-NEXT:    s_mov_b32 s6, s0
; GFX10-W32-NEXT:    s_mov_b32 s7, s0
; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
; GFX10-W32-NEXT:    image_sample v[0:3], v0, s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_1D
; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
; GFX10-W32-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GFX10-W32-NEXT:    s_endpgm
entry:
  %array = alloca [32 x i32], align 4, addrspace(5)

  call void @llvm.amdgcn.raw.buffer.store.f32(float %data, <4 x i32> undef, i32 0, i32 0, i32 0)

  %s.gep = getelementptr [32 x i32], [32 x i32] addrspace(5)* %array, i32 0, i32 0
  store volatile i32 %a, i32 addrspace(5)* %s.gep, align 4

  call void @llvm.amdgcn.struct.buffer.store.f32(float %data, <4 x i32> undef, i32 1, i32 0, i32 0, i32 0)

  %c.gep = getelementptr [32 x i32], [32 x i32] addrspace(5)* %array, i32 0, i32 %idx
  %c = load i32, i32 addrspace(5)* %c.gep, align 4
  %c.bc = bitcast i32 %c to float
  %t = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0) #0
  call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %t, <4 x i32> undef, i32 0, i32 0, i32 0)

  ret void
}

; Must return to exact at the end of a non-void returning shader,
; otherwise the EXEC mask exported by the epilog will be wrong. This is true
; even if the shader has no kills, because a kill could have happened in a
; previous shader fragment.
2060; CHECK-LABEL: {{^}}test_nonvoid_return: 2061; CHECK: s_mov_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], exec 2062; CHECK: s_wqm_b64 exec, exec 2063; CHECK: s_and_b64 exec, exec, [[LIVE]] 2064; CHECK-NOT: exec 2065define amdgpu_ps <4 x float> @test_nonvoid_return() nounwind { 2066; GFX9-W64-LABEL: test_nonvoid_return: 2067; GFX9-W64: ; %bb.0: 2068; GFX9-W64-NEXT: s_mov_b32 s0, 0 2069; GFX9-W64-NEXT: s_mov_b64 s[8:9], exec 2070; GFX9-W64-NEXT: s_mov_b32 s1, s0 2071; GFX9-W64-NEXT: s_mov_b32 s2, s0 2072; GFX9-W64-NEXT: s_mov_b32 s3, s0 2073; GFX9-W64-NEXT: s_mov_b32 s4, s0 2074; GFX9-W64-NEXT: s_mov_b32 s5, s0 2075; GFX9-W64-NEXT: s_mov_b32 s6, s0 2076; GFX9-W64-NEXT: s_mov_b32 s7, s0 2077; GFX9-W64-NEXT: s_wqm_b64 exec, exec 2078; GFX9-W64-NEXT: image_sample v0, v0, s[0:7], s[0:3] dmask:0x1 2079; GFX9-W64-NEXT: s_and_b64 exec, exec, s[8:9] 2080; GFX9-W64-NEXT: s_waitcnt vmcnt(0) 2081; GFX9-W64-NEXT: image_sample v[0:3], v0, s[0:7], s[0:3] dmask:0xf 2082; GFX9-W64-NEXT: s_waitcnt vmcnt(0) 2083; GFX9-W64-NEXT: ; return to shader part epilog 2084; 2085; GFX10-W32-LABEL: test_nonvoid_return: 2086; GFX10-W32: ; %bb.0: 2087; GFX10-W32-NEXT: s_mov_b32 s0, 0 2088; GFX10-W32-NEXT: s_mov_b32 s8, exec_lo 2089; GFX10-W32-NEXT: s_mov_b32 s1, s0 2090; GFX10-W32-NEXT: s_mov_b32 s2, s0 2091; GFX10-W32-NEXT: s_mov_b32 s3, s0 2092; GFX10-W32-NEXT: s_mov_b32 s4, s0 2093; GFX10-W32-NEXT: s_mov_b32 s5, s0 2094; GFX10-W32-NEXT: s_mov_b32 s6, s0 2095; GFX10-W32-NEXT: s_mov_b32 s7, s0 2096; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo 2097; GFX10-W32-NEXT: image_sample v0, v0, s[0:7], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_1D 2098; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s8 2099; GFX10-W32-NEXT: s_waitcnt vmcnt(0) 2100; GFX10-W32-NEXT: image_sample v[0:3], v0, s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_1D 2101; GFX10-W32-NEXT: s_waitcnt vmcnt(0) 2102; GFX10-W32-NEXT: ; return to shader part epilog 2103 %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float undef, <8 x i32> undef, 
<4 x i32> undef, i1 false, i32 0, i32 0) #0 2104 %tex0 = extractelement <4 x float> %tex, i32 0 2105 %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0) #0 2106 ret <4 x float> %dtex 2107} 2108 2109; CHECK-LABEL: {{^}}test_nonvoid_return_unreachable: 2110; CHECK: s_mov_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], exec 2111; CHECK: s_wqm_b64 exec, exec 2112; CHECK: s_and_b64 exec, exec, [[LIVE]] 2113; CHECK-NOT: exec 2114define amdgpu_ps <4 x float> @test_nonvoid_return_unreachable(i32 inreg %c) nounwind { 2115; GFX9-W64-LABEL: test_nonvoid_return_unreachable: 2116; GFX9-W64: ; %bb.0: ; %entry 2117; GFX9-W64-NEXT: s_mov_b32 s4, 0 2118; GFX9-W64-NEXT: s_mov_b64 s[2:3], exec 2119; GFX9-W64-NEXT: s_mov_b32 s5, s4 2120; GFX9-W64-NEXT: s_mov_b32 s6, s4 2121; GFX9-W64-NEXT: s_mov_b32 s7, s4 2122; GFX9-W64-NEXT: s_mov_b32 s8, s4 2123; GFX9-W64-NEXT: s_mov_b32 s9, s4 2124; GFX9-W64-NEXT: s_mov_b32 s10, s4 2125; GFX9-W64-NEXT: s_mov_b32 s11, s4 2126; GFX9-W64-NEXT: s_wqm_b64 exec, exec 2127; GFX9-W64-NEXT: image_sample v0, v0, s[4:11], s[0:3] dmask:0x1 2128; GFX9-W64-NEXT: s_and_b64 exec, exec, s[2:3] 2129; GFX9-W64-NEXT: s_waitcnt vmcnt(0) 2130; GFX9-W64-NEXT: image_sample v[0:3], v0, s[4:11], s[0:3] dmask:0xf 2131; GFX9-W64-NEXT: s_cmp_lt_i32 s0, 1 2132; GFX9-W64-NEXT: s_cbranch_scc0 .LBB34_2 2133; GFX9-W64-NEXT: ; %bb.1: ; %else 2134; GFX9-W64-NEXT: s_waitcnt vmcnt(0) 2135; GFX9-W64-NEXT: s_branch .LBB34_3 2136; GFX9-W64-NEXT: .LBB34_2: ; %if 2137; GFX9-W64-NEXT: s_waitcnt vmcnt(0) 2138; GFX9-W64-NEXT: global_store_dwordx4 v[0:1], v[0:3], off 2139; GFX9-W64-NEXT: s_waitcnt vmcnt(0) 2140; GFX9-W64-NEXT: .LBB34_3: 2141; 2142; GFX10-W32-LABEL: test_nonvoid_return_unreachable: 2143; GFX10-W32: ; %bb.0: ; %entry 2144; GFX10-W32-NEXT: s_mov_b32 s4, 0 2145; GFX10-W32-NEXT: s_mov_b32 s1, exec_lo 2146; GFX10-W32-NEXT: s_mov_b32 s5, s4 2147; GFX10-W32-NEXT: s_mov_b32 s6, s4 2148; GFX10-W32-NEXT: 
s_mov_b32 s7, s4 2149; GFX10-W32-NEXT: s_mov_b32 s8, s4 2150; GFX10-W32-NEXT: s_mov_b32 s9, s4 2151; GFX10-W32-NEXT: s_mov_b32 s10, s4 2152; GFX10-W32-NEXT: s_mov_b32 s11, s4 2153; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo 2154; GFX10-W32-NEXT: image_sample v0, v0, s[4:11], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_1D 2155; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s1 2156; GFX10-W32-NEXT: s_waitcnt vmcnt(0) 2157; GFX10-W32-NEXT: image_sample v[0:3], v0, s[4:11], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_1D 2158; GFX10-W32-NEXT: s_cmp_lt_i32 s0, 1 2159; GFX10-W32-NEXT: s_cbranch_scc0 .LBB34_2 2160; GFX10-W32-NEXT: ; %bb.1: ; %else 2161; GFX10-W32-NEXT: s_waitcnt vmcnt(0) 2162; GFX10-W32-NEXT: s_branch .LBB34_3 2163; GFX10-W32-NEXT: .LBB34_2: ; %if 2164; GFX10-W32-NEXT: s_waitcnt vmcnt(0) 2165; GFX10-W32-NEXT: global_store_dwordx4 v[0:1], v[0:3], off 2166; GFX10-W32-NEXT: s_waitcnt_vscnt null, 0x0 2167; GFX10-W32-NEXT: .LBB34_3: 2168entry: 2169 %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float undef, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0) #0 2170 %tex0 = extractelement <4 x float> %tex, i32 0 2171 %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0) #0 2172 %cc = icmp sgt i32 %c, 0 2173 br i1 %cc, label %if, label %else 2174 2175if: 2176 store volatile <4 x float> %dtex, <4 x float> addrspace(1)* undef 2177 unreachable 2178 2179else: 2180 ret <4 x float> %dtex 2181} 2182 2183; Test awareness that s_wqm_b64 clobbers SCC. The generated checks for this test expect the s_wqm (s_wqm_b32 under the GFX10-W32 prefix) to be emitted before the s_cmp / s_cbranch_scc pair. NOTE(review): the CHECK-prefixed lines that follow appear to be inactive, because the RUN lines at the top of this file pass only the GFX9-W64 and GFX10-W32 prefixes to FileCheck - confirm, then drop them or fold their intent into the generated checks.
2184; CHECK-LABEL: {{^}}test_scc: 2185; CHECK: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec 2186; CHECK: s_wqm_b64 exec, exec 2187; CHECK: s_cmp_ 2188; CHECK-NEXT: s_cbranch_scc 2189; CHECK: ; %else 2190; CHECK: image_sample 2191; CHECK: ; %if 2192; CHECK: image_sample 2193; CHECK: ; %end 2194; CHECK: s_and_b64 exec, exec, [[ORIG]] 2195define amdgpu_ps <4 x float> @test_scc(i32 inreg %sel, i32 %idx) #1 { 2196; GFX9-W64-LABEL: test_scc: 2197; GFX9-W64: ; %bb.0: ; %main_body 2198; GFX9-W64-NEXT: s_mov_b64 s[2:3], exec 2199; GFX9-W64-NEXT: v_mov_b32_e32 v4, v0 2200; GFX9-W64-NEXT: s_wqm_b64 exec, exec 2201; GFX9-W64-NEXT: s_cmp_lt_i32 s0, 1 2202; GFX9-W64-NEXT: s_cbranch_scc0 .LBB35_2 2203; GFX9-W64-NEXT: ; %bb.1: ; %else 2204; GFX9-W64-NEXT: s_mov_b32 s4, 0 2205; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0 2206; GFX9-W64-NEXT: v_mov_b32_e32 v1, 1 2207; GFX9-W64-NEXT: s_mov_b32 s5, s4 2208; GFX9-W64-NEXT: s_mov_b32 s6, s4 2209; GFX9-W64-NEXT: s_mov_b32 s7, s4 2210; GFX9-W64-NEXT: s_mov_b32 s8, s4 2211; GFX9-W64-NEXT: s_mov_b32 s9, s4 2212; GFX9-W64-NEXT: s_mov_b32 s10, s4 2213; GFX9-W64-NEXT: s_mov_b32 s11, s4 2214; GFX9-W64-NEXT: image_sample v[0:3], v[0:1], s[4:11], s[0:3] dmask:0xf 2215; GFX9-W64-NEXT: s_cbranch_execz .LBB35_3 2216; GFX9-W64-NEXT: s_branch .LBB35_4 2217; GFX9-W64-NEXT: .LBB35_2: 2218; GFX9-W64-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 2219; GFX9-W64-NEXT: .LBB35_3: ; %if 2220; GFX9-W64-NEXT: s_mov_b32 s4, 0 2221; GFX9-W64-NEXT: s_mov_b32 s5, s4 2222; GFX9-W64-NEXT: s_mov_b32 s6, s4 2223; GFX9-W64-NEXT: s_mov_b32 s7, s4 2224; GFX9-W64-NEXT: s_mov_b32 s8, s4 2225; GFX9-W64-NEXT: s_mov_b32 s9, s4 2226; GFX9-W64-NEXT: s_mov_b32 s10, s4 2227; GFX9-W64-NEXT: s_mov_b32 s11, s4 2228; GFX9-W64-NEXT: s_waitcnt vmcnt(0) 2229; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0 2230; GFX9-W64-NEXT: image_sample v[0:3], v0, s[4:11], s[0:3] dmask:0xf 2231; GFX9-W64-NEXT: .LBB35_4: ; %end 2232; GFX9-W64-NEXT: s_and_b64 exec, exec, s[2:3] 2233; GFX9-W64-NEXT: v_mov_b32_e32 v5, 1.0 
2234; GFX9-W64-NEXT: buffer_store_dword v5, v4, s[0:3], 0 idxen 2235; GFX9-W64-NEXT: s_waitcnt vmcnt(0) 2236; GFX9-W64-NEXT: ; return to shader part epilog 2237; 2238; GFX10-W32-LABEL: test_scc: 2239; GFX10-W32: ; %bb.0: ; %main_body 2240; GFX10-W32-NEXT: v_mov_b32_e32 v4, v0 2241; GFX10-W32-NEXT: s_mov_b32 s8, exec_lo 2242; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo 2243; GFX10-W32-NEXT: s_cmp_lt_i32 s0, 1 2244; GFX10-W32-NEXT: s_cbranch_scc0 .LBB35_2 2245; GFX10-W32-NEXT: ; %bb.1: ; %else 2246; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0 2247; GFX10-W32-NEXT: v_mov_b32_e32 v1, 1 2248; GFX10-W32-NEXT: s_mov_b32 s0, 0 2249; GFX10-W32-NEXT: s_mov_b32 s1, s0 2250; GFX10-W32-NEXT: s_mov_b32 s2, s0 2251; GFX10-W32-NEXT: s_mov_b32 s3, s0 2252; GFX10-W32-NEXT: s_mov_b32 s4, s0 2253; GFX10-W32-NEXT: s_mov_b32 s5, s0 2254; GFX10-W32-NEXT: s_mov_b32 s6, s0 2255; GFX10-W32-NEXT: s_mov_b32 s7, s0 2256; GFX10-W32-NEXT: image_sample v[0:3], v[0:1], s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_2D 2257; GFX10-W32-NEXT: s_cbranch_execz .LBB35_3 2258; GFX10-W32-NEXT: s_branch .LBB35_4 2259; GFX10-W32-NEXT: .LBB35_2: 2260; GFX10-W32-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 2261; GFX10-W32-NEXT: .LBB35_3: ; %if 2262; GFX10-W32-NEXT: s_waitcnt vmcnt(0) 2263; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0 2264; GFX10-W32-NEXT: s_mov_b32 s0, 0 2265; GFX10-W32-NEXT: s_mov_b32 s1, s0 2266; GFX10-W32-NEXT: s_mov_b32 s2, s0 2267; GFX10-W32-NEXT: s_mov_b32 s3, s0 2268; GFX10-W32-NEXT: s_mov_b32 s4, s0 2269; GFX10-W32-NEXT: s_mov_b32 s5, s0 2270; GFX10-W32-NEXT: s_mov_b32 s6, s0 2271; GFX10-W32-NEXT: s_mov_b32 s7, s0 2272; GFX10-W32-NEXT: image_sample v[0:3], v0, s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_1D 2273; GFX10-W32-NEXT: .LBB35_4: ; %end 2274; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s8 2275; GFX10-W32-NEXT: v_mov_b32_e32 v5, 1.0 2276; GFX10-W32-NEXT: buffer_store_dword v5, v4, s[0:3], 0 idxen 2277; GFX10-W32-NEXT: s_waitcnt vmcnt(0) 2278; GFX10-W32-NEXT: s_waitcnt_vscnt null, 0x0 2279; 
GFX10-W32-NEXT: ; return to shader part epilog 2280main_body: 2281 %cc = icmp sgt i32 %sel, 0 2282 br i1 %cc, label %if, label %else 2283 2284if: 2285 %r.if = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float 0.0, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0) #0 2286 br label %end 2287 2288else: 2289 %r.else = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float 0.0, float bitcast (i32 1 to float), <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0) #0 2290 br label %end 2291 2292end: 2293 %r = phi <4 x float> [ %r.if, %if ], [ %r.else, %else ] 2294 call void @llvm.amdgcn.struct.buffer.store.f32(float 1.0, <4 x i32> undef, i32 %idx, i32 0, i32 0, i32 0) 2295 ret <4 x float> %r 2296} 2297 2298; Check a case of a block being entirely WQM except for a bit of WWM. 2299; There was a bug where it forgot to enter and leave WWM. 2300define amdgpu_ps float @test_wwm_within_wqm(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %c, i32 %z, float %data) { 2301; GFX9-W64-LABEL: test_wwm_within_wqm: 2302; GFX9-W64: ; %bb.0: ; %main_body 2303; GFX9-W64-NEXT: s_mov_b64 s[12:13], exec 2304; GFX9-W64-NEXT: s_wqm_b64 exec, exec 2305; GFX9-W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 2306; GFX9-W64-NEXT: v_mov_b32_e32 v1, 0 2307; GFX9-W64-NEXT: s_and_saveexec_b64 s[14:15], vcc 2308; GFX9-W64-NEXT: s_cbranch_execz .LBB36_2 2309; GFX9-W64-NEXT: ; %bb.1: ; %IF 2310; GFX9-W64-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 2311; GFX9-W64-NEXT: s_waitcnt vmcnt(0) 2312; GFX9-W64-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 2313; GFX9-W64-NEXT: s_waitcnt vmcnt(0) 2314; GFX9-W64-NEXT: v_cvt_i32_f32_e32 v2, v0 2315; GFX9-W64-NEXT: s_not_b64 exec, exec 2316; GFX9-W64-NEXT: v_mov_b32_e32 v2, 0 2317; GFX9-W64-NEXT: s_not_b64 exec, exec 2318; GFX9-W64-NEXT: s_or_saveexec_b64 s[0:1], -1 2319; GFX9-W64-NEXT: ds_swizzle_b32 v2, v2 offset:swizzle(SWAP,2) 2320; GFX9-W64-NEXT: s_mov_b64 exec, s[0:1] 2321; GFX9-W64-NEXT: s_waitcnt lgkmcnt(0) 
2322; GFX9-W64-NEXT: v_mov_b32_e32 v0, v2 2323; GFX9-W64-NEXT: v_cvt_f32_i32_e32 v1, v0 2324; GFX9-W64-NEXT: .LBB36_2: ; %ENDIF 2325; GFX9-W64-NEXT: s_or_b64 exec, exec, s[14:15] 2326; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13] 2327; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1 2328; GFX9-W64-NEXT: ; return to shader part epilog 2329; 2330; GFX10-W32-LABEL: test_wwm_within_wqm: 2331; GFX10-W32: ; %bb.0: ; %main_body 2332; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo 2333; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo 2334; GFX10-W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 2335; GFX10-W32-NEXT: v_mov_b32_e32 v1, 0 2336; GFX10-W32-NEXT: s_and_saveexec_b32 s13, vcc_lo 2337; GFX10-W32-NEXT: s_cbranch_execz .LBB36_2 2338; GFX10-W32-NEXT: ; %bb.1: ; %IF 2339; GFX10-W32-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D 2340; GFX10-W32-NEXT: s_waitcnt vmcnt(0) 2341; GFX10-W32-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D 2342; GFX10-W32-NEXT: s_waitcnt vmcnt(0) 2343; GFX10-W32-NEXT: v_cvt_i32_f32_e32 v2, v0 2344; GFX10-W32-NEXT: s_not_b32 exec_lo, exec_lo 2345; GFX10-W32-NEXT: v_mov_b32_e32 v2, 0 2346; GFX10-W32-NEXT: s_not_b32 exec_lo, exec_lo 2347; GFX10-W32-NEXT: s_or_saveexec_b32 s0, -1 2348; GFX10-W32-NEXT: ds_swizzle_b32 v2, v2 offset:swizzle(SWAP,2) 2349; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0 2350; GFX10-W32-NEXT: s_waitcnt lgkmcnt(0) 2351; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2 2352; GFX10-W32-NEXT: v_cvt_f32_i32_e32 v1, v0 2353; GFX10-W32-NEXT: .LBB36_2: ; %ENDIF 2354; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s13 2355; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12 2356; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1 2357; GFX10-W32-NEXT: ; return to shader part epilog 2358main_body: 2359 %c.bc = bitcast i32 %c to float 2360 %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0 2361 %tex0 = extractelement <4 x float> %tex, i32 0 2362 %dtex = call <4 x 
float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0 2363 %cmp = icmp eq i32 %z, 0 2364 br i1 %cmp, label %IF, label %ENDIF 2365 2366IF: 2367 %dataf = extractelement <4 x float> %dtex, i32 0 2368 %data1 = fptosi float %dataf to i32 2369 %data2 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %data1, i32 0) 2370 %data3 = call i32 @llvm.amdgcn.ds.swizzle(i32 %data2, i32 2079) 2371 %data4 = call i32 @llvm.amdgcn.wwm.i32(i32 %data3) 2372 %data4f = sitofp i32 %data4 to float 2373 br label %ENDIF 2374 2375ENDIF: 2376 %r = phi float [ %data4f, %IF ], [ 0.0, %main_body ] 2377 ret float %r 2378} 2379 2380; Check that WWM is triggered by the strict_wwm intrinsic. 2381define amdgpu_ps float @test_strict_wwm1(i32 inreg %idx0, i32 inreg %idx1) { 2382; GFX9-W64-LABEL: test_strict_wwm1: 2383; GFX9-W64: ; %bb.0: ; %main_body 2384; GFX9-W64-NEXT: s_or_saveexec_b64 s[2:3], -1 2385; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0 2386; GFX9-W64-NEXT: v_mov_b32_e32 v2, s1 2387; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen 2388; GFX9-W64-NEXT: s_nop 0 2389; GFX9-W64-NEXT: buffer_load_dword v2, v2, s[0:3], 0 idxen 2390; GFX9-W64-NEXT: s_waitcnt vmcnt(0) 2391; GFX9-W64-NEXT: v_add_f32_e32 v1, v1, v2 2392; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3] 2393; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1 2394; GFX9-W64-NEXT: ; return to shader part epilog 2395; 2396; GFX10-W32-LABEL: test_strict_wwm1: 2397; GFX10-W32: ; %bb.0: ; %main_body 2398; GFX10-W32-NEXT: s_or_saveexec_b32 s2, -1 2399; GFX10-W32-NEXT: v_mov_b32_e32 v1, s0 2400; GFX10-W32-NEXT: v_mov_b32_e32 v2, s1 2401; GFX10-W32-NEXT: s_clause 0x1 2402; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen 2403; GFX10-W32-NEXT: buffer_load_dword v2, v2, s[0:3], 0 idxen 2404; GFX10-W32-NEXT: s_waitcnt vmcnt(0) 2405; GFX10-W32-NEXT: v_add_f32_e32 v1, v1, v2 2406; GFX10-W32-NEXT: s_mov_b32 exec_lo, s2 2407; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1 2408; GFX10-W32-NEXT: ; return to 
shader part epilog 2409main_body: 2410 %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0) 2411 %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0) 2412 %out = fadd float %src0, %src1 2413 %out.0 = call float @llvm.amdgcn.strict.wwm.f32(float %out) 2414 ret float %out.0 2415} 2416 2417; Same as above, but with an integer type. 2418define amdgpu_ps float @test_strict_wwm2(i32 inreg %idx0, i32 inreg %idx1) { 2419; GFX9-W64-LABEL: test_strict_wwm2: 2420; GFX9-W64: ; %bb.0: ; %main_body 2421; GFX9-W64-NEXT: s_or_saveexec_b64 s[2:3], -1 2422; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0 2423; GFX9-W64-NEXT: v_mov_b32_e32 v2, s1 2424; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen 2425; GFX9-W64-NEXT: s_nop 0 2426; GFX9-W64-NEXT: buffer_load_dword v2, v2, s[0:3], 0 idxen 2427; GFX9-W64-NEXT: s_waitcnt vmcnt(0) 2428; GFX9-W64-NEXT: v_add_u32_e32 v1, v1, v2 2429; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3] 2430; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1 2431; GFX9-W64-NEXT: ; return to shader part epilog 2432; 2433; GFX10-W32-LABEL: test_strict_wwm2: 2434; GFX10-W32: ; %bb.0: ; %main_body 2435; GFX10-W32-NEXT: s_or_saveexec_b32 s2, -1 2436; GFX10-W32-NEXT: v_mov_b32_e32 v1, s0 2437; GFX10-W32-NEXT: v_mov_b32_e32 v2, s1 2438; GFX10-W32-NEXT: s_clause 0x1 2439; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen 2440; GFX10-W32-NEXT: buffer_load_dword v2, v2, s[0:3], 0 idxen 2441; GFX10-W32-NEXT: s_waitcnt vmcnt(0) 2442; GFX10-W32-NEXT: v_add_nc_u32_e32 v1, v1, v2 2443; GFX10-W32-NEXT: s_mov_b32 exec_lo, s2 2444; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1 2445; GFX10-W32-NEXT: ; return to shader part epilog 2446main_body: 2447 %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0) 2448 %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0) 2449 %src0.0 = bitcast float %src0 to i32 2450 
%src1.0 = bitcast float %src1 to i32 2451 %out = add i32 %src0.0, %src1.0 2452 %out.0 = call i32 @llvm.amdgcn.strict.wwm.i32(i32 %out) 2453 %out.1 = bitcast i32 %out.0 to float 2454 ret float %out.1 2455} 2456 2457; Check that we don't leave WWM on for computations that don't require WWM, 2458; since that will lead to clobbering things that aren't supposed to be clobbered 2459; in cases like this. 2460; We enforce this by checking that v_add gets emitted in the same block as 2461; WWM computations. 2462define amdgpu_ps float @test_strict_wwm3(i32 inreg %idx) { 2463; GFX9-W64-LABEL: test_strict_wwm3: 2464; GFX9-W64: ; %bb.0: ; %main_body 2465; GFX9-W64-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 2466; GFX9-W64-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0 2467; GFX9-W64-NEXT: v_cmp_gt_u32_e32 vcc, 32, v0 2468; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0 2469; GFX9-W64-NEXT: s_and_saveexec_b64 s[2:3], vcc 2470; GFX9-W64-NEXT: s_cbranch_execz .LBB39_2 2471; GFX9-W64-NEXT: ; %bb.1: ; %if 2472; GFX9-W64-NEXT: s_or_saveexec_b64 s[4:5], -1 2473; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0 2474; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen 2475; GFX9-W64-NEXT: s_waitcnt vmcnt(0) 2476; GFX9-W64-NEXT: v_add_f32_e32 v2, v1, v1 2477; GFX9-W64-NEXT: s_mov_b64 exec, s[4:5] 2478; GFX9-W64-NEXT: v_mov_b32_e32 v0, v2 2479; GFX9-W64-NEXT: v_add_f32_e32 v0, v1, v0 2480; GFX9-W64-NEXT: .LBB39_2: ; %endif 2481; GFX9-W64-NEXT: s_or_b64 exec, exec, s[2:3] 2482; GFX9-W64-NEXT: ; return to shader part epilog 2483; 2484; GFX10-W32-LABEL: test_strict_wwm3: 2485; GFX10-W32: ; %bb.0: ; %main_body 2486; GFX10-W32-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 2487; GFX10-W32-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0 2488; GFX10-W32-NEXT: v_cmp_gt_u32_e32 vcc_lo, 32, v0 2489; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0 2490; GFX10-W32-NEXT: s_and_saveexec_b32 s1, vcc_lo 2491; GFX10-W32-NEXT: s_cbranch_execz .LBB39_2 2492; GFX10-W32-NEXT: ; %bb.1: ; %if 2493; GFX10-W32-NEXT: s_or_saveexec_b32 s2, -1 2494; GFX10-W32-NEXT: v_mov_b32_e32 v1, s0 2495; 
GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen 2496; GFX10-W32-NEXT: s_waitcnt vmcnt(0) 2497; GFX10-W32-NEXT: v_add_f32_e32 v2, v1, v1 2498; GFX10-W32-NEXT: s_mov_b32 exec_lo, s2 2499; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2 2500; GFX10-W32-NEXT: v_add_f32_e32 v0, v1, v0 2501; GFX10-W32-NEXT: .LBB39_2: ; %endif 2502; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 2503; GFX10-W32-NEXT: ; return to shader part epilog 2504main_body: 2505 ; use mbcnt to make sure the branch is divergent 2506 %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) 2507 %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo) 2508 %cc = icmp uge i32 %hi, 32 2509 br i1 %cc, label %endif, label %if 2510 2511if: 2512 %src = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx, i32 0, i32 0, i32 0) 2513 %out = fadd float %src, %src 2514 %out.0 = call float @llvm.amdgcn.strict.wwm.f32(float %out) 2515 %out.1 = fadd float %src, %out.0 2516 br label %endif 2517 2518endif: 2519 %out.2 = phi float [ %out.1, %if ], [ 0.0, %main_body ] 2520 ret float %out.2 2521} 2522 2523; Check that WWM writes aren't coalesced with non-WWM writes, since the WWM 2524; write could clobber disabled channels in the non-WWM one. 2525; We enforce this by checking that v_mov gets emitted in the same block as 2526; WWM computations. 
2527define amdgpu_ps float @test_strict_wwm4(i32 inreg %idx) { 2528; GFX9-W64-LABEL: test_strict_wwm4: 2529; GFX9-W64: ; %bb.0: ; %main_body 2530; GFX9-W64-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 2531; GFX9-W64-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0 2532; GFX9-W64-NEXT: v_cmp_gt_u32_e32 vcc, 32, v0 2533; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0 2534; GFX9-W64-NEXT: s_and_saveexec_b64 s[2:3], vcc 2535; GFX9-W64-NEXT: s_cbranch_execz .LBB40_2 2536; GFX9-W64-NEXT: ; %bb.1: ; %if 2537; GFX9-W64-NEXT: s_or_saveexec_b64 s[4:5], -1 2538; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0 2539; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen 2540; GFX9-W64-NEXT: s_waitcnt vmcnt(0) 2541; GFX9-W64-NEXT: v_add_f32_e32 v1, v1, v1 2542; GFX9-W64-NEXT: s_mov_b64 exec, s[4:5] 2543; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1 2544; GFX9-W64-NEXT: .LBB40_2: ; %endif 2545; GFX9-W64-NEXT: s_or_b64 exec, exec, s[2:3] 2546; GFX9-W64-NEXT: ; return to shader part epilog 2547; 2548; GFX10-W32-LABEL: test_strict_wwm4: 2549; GFX10-W32: ; %bb.0: ; %main_body 2550; GFX10-W32-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 2551; GFX10-W32-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0 2552; GFX10-W32-NEXT: v_cmp_gt_u32_e32 vcc_lo, 32, v0 2553; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0 2554; GFX10-W32-NEXT: s_and_saveexec_b32 s1, vcc_lo 2555; GFX10-W32-NEXT: s_cbranch_execz .LBB40_2 2556; GFX10-W32-NEXT: ; %bb.1: ; %if 2557; GFX10-W32-NEXT: s_or_saveexec_b32 s2, -1 2558; GFX10-W32-NEXT: v_mov_b32_e32 v1, s0 2559; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen 2560; GFX10-W32-NEXT: s_waitcnt vmcnt(0) 2561; GFX10-W32-NEXT: v_add_f32_e32 v1, v1, v1 2562; GFX10-W32-NEXT: s_mov_b32 exec_lo, s2 2563; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1 2564; GFX10-W32-NEXT: .LBB40_2: ; %endif 2565; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 2566; GFX10-W32-NEXT: ; return to shader part epilog 2567main_body: 2568 ; use mbcnt to make sure the branch is divergent 2569 %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) 2570 %hi = call i32 
@llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo) 2571 %cc = icmp uge i32 %hi, 32 2572 br i1 %cc, label %endif, label %if 2573 2574if: 2575 %src = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx, i32 0, i32 0, i32 0) 2576 %out = fadd float %src, %src 2577 %out.0 = call float @llvm.amdgcn.strict.wwm.f32(float %out) 2578 br label %endif 2579 2580endif: 2581 %out.1 = phi float [ %out.0, %if ], [ 0.0, %main_body ] 2582 ret float %out.1 2583} 2584 2585; Make sure the transition from Exact to WWM then WQM works properly. 2586define amdgpu_ps float @test_strict_wwm5(i32 inreg %idx0, i32 inreg %idx1) { 2587; GFX9-W64-LABEL: test_strict_wwm5: 2588; GFX9-W64: ; %bb.0: ; %main_body 2589; GFX9-W64-NEXT: s_mov_b64 s[2:3], exec 2590; GFX9-W64-NEXT: v_mov_b32_e32 v0, s0 2591; GFX9-W64-NEXT: buffer_load_dword v2, v0, s[0:3], 0 idxen 2592; GFX9-W64-NEXT: s_waitcnt vmcnt(0) 2593; GFX9-W64-NEXT: buffer_store_dword v2, v0, s[0:3], 0 idxen 2594; GFX9-W64-NEXT: s_or_saveexec_b64 s[4:5], -1 2595; GFX9-W64-NEXT: v_mov_b32_e32 v1, s1 2596; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen 2597; GFX9-W64-NEXT: s_waitcnt vmcnt(0) 2598; GFX9-W64-NEXT: v_add_f32_e32 v1, v1, v1 2599; GFX9-W64-NEXT: s_mov_b64 exec, s[4:5] 2600; GFX9-W64-NEXT: s_wqm_b64 exec, exec 2601; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1 2602; GFX9-W64-NEXT: v_add_f32_e32 v0, v0, v0 2603; GFX9-W64-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec killed $exec 2604; GFX9-W64-NEXT: s_and_b64 exec, exec, s[2:3] 2605; GFX9-W64-NEXT: ; return to shader part epilog 2606; 2607; GFX10-W32-LABEL: test_strict_wwm5: 2608; GFX10-W32: ; %bb.0: ; %main_body 2609; GFX10-W32-NEXT: v_mov_b32_e32 v0, s0 2610; GFX10-W32-NEXT: s_mov_b32 s2, exec_lo 2611; GFX10-W32-NEXT: buffer_load_dword v2, v0, s[0:3], 0 idxen 2612; GFX10-W32-NEXT: s_or_saveexec_b32 s0, -1 2613; GFX10-W32-NEXT: v_mov_b32_e32 v1, s1 2614; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0 2615; GFX10-W32-NEXT: s_waitcnt vmcnt(0) 2616; GFX10-W32-NEXT: buffer_store_dword 
v2, v0, s[0:3], 0 idxen 2617; GFX10-W32-NEXT: s_or_saveexec_b32 s0, -1 2618; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen 2619; GFX10-W32-NEXT: s_waitcnt vmcnt(0) 2620; GFX10-W32-NEXT: v_add_f32_e32 v1, v1, v1 2621; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0 2622; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo 2623; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1 2624; GFX10-W32-NEXT: v_add_f32_e32 v0, v0, v0 2625; GFX10-W32-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec killed $exec 2626; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s2 2627; GFX10-W32-NEXT: s_waitcnt_vscnt null, 0x0 2628; GFX10-W32-NEXT: ; return to shader part epilog 2629main_body: 2630 %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0) 2631 call void @llvm.amdgcn.struct.buffer.store.f32(float %src0, <4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0) 2632 %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0) 2633 %temp = fadd float %src1, %src1 2634 %temp.0 = call float @llvm.amdgcn.strict.wwm.f32(float %temp) 2635 %out = fadd float %temp.0, %temp.0 2636 %out.0 = call float @llvm.amdgcn.wqm.f32(float %out) 2637 ret float %out.0 2638} 2639 2640; Check that WWM is turned on correctly across basic block boundaries. 
2641; if..then..endif version. NOTE(review): the SI-CHECK and VI-CHECK lines that follow appear to be inactive, because the RUN lines at the top of this file pass only the GFX9-W64 and GFX10-W32 prefixes to FileCheck. They also name buffer_load_dword / flat_load_dword while the generated checks for this test expect global_load_dword - presumably stale, confirm and remove. 2642;SI-CHECK: buffer_load_dword 2643;VI-CHECK: flat_load_dword 2644;SI-CHECK: buffer_load_dword 2645;VI-CHECK: flat_load_dword 2646define amdgpu_ps float @test_strict_wwm6_then() { 2647; GFX9-W64-LABEL: test_strict_wwm6_then: 2648; GFX9-W64: ; %bb.0: ; %main_body 2649; GFX9-W64-NEXT: s_or_saveexec_b64 s[0:1], -1 2650; GFX9-W64-NEXT: global_load_dword v1, v[3:4], off glc 2651; GFX9-W64-NEXT: s_waitcnt vmcnt(0) 2652; GFX9-W64-NEXT: s_mov_b64 exec, s[0:1] 2653; GFX9-W64-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 2654; GFX9-W64-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0 2655; GFX9-W64-NEXT: v_cmp_gt_u32_e32 vcc, 32, v0 2656; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0 2657; GFX9-W64-NEXT: s_and_saveexec_b64 s[0:1], vcc 2658; GFX9-W64-NEXT: s_cbranch_execz .LBB42_2 2659; GFX9-W64-NEXT: ; %bb.1: ; %if 2660; GFX9-W64-NEXT: s_or_saveexec_b64 s[2:3], -1 2661; GFX9-W64-NEXT: global_load_dword v2, v[3:4], off glc 2662; GFX9-W64-NEXT: s_waitcnt vmcnt(0) 2663; GFX9-W64-NEXT: v_add_f32_e32 v1, v1, v2 2664; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3] 2665; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1 2666; GFX9-W64-NEXT: .LBB42_2: ; %endif 2667; GFX9-W64-NEXT: s_or_b64 exec, exec, s[0:1] 2668; GFX9-W64-NEXT: ; return to shader part epilog 2669; 2670; GFX10-W32-LABEL: test_strict_wwm6_then: 2671; GFX10-W32: ; %bb.0: ; %main_body 2672; GFX10-W32-NEXT: s_or_saveexec_b32 s0, -1 2673; GFX10-W32-NEXT: global_load_dword v1, v[3:4], off glc dlc 2674; GFX10-W32-NEXT: s_waitcnt vmcnt(0) 2675; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0 2676; GFX10-W32-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 2677; GFX10-W32-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0 2678; GFX10-W32-NEXT: v_cmp_gt_u32_e32 vcc_lo, 32, v0 2679; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0 2680; GFX10-W32-NEXT: s_and_saveexec_b32 s0, vcc_lo 2681; GFX10-W32-NEXT: s_cbranch_execz .LBB42_2 2682; GFX10-W32-NEXT: ; %bb.1: ; %if 2683; GFX10-W32-NEXT: s_or_saveexec_b32 s1, -1 2684; GFX10-W32-NEXT: global_load_dword v2, v[3:4], off glc dlc 2685; GFX10-W32-NEXT: 
s_waitcnt vmcnt(0) 2686; GFX10-W32-NEXT: v_add_f32_e32 v1, v1, v2 2687; GFX10-W32-NEXT: s_mov_b32 exec_lo, s1 2688; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1 2689; GFX10-W32-NEXT: .LBB42_2: ; %endif 2690; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 2691; GFX10-W32-NEXT: ; return to shader part epilog 2692main_body: 2693 %src0 = load volatile float, float addrspace(1)* undef 2694 ; use mbcnt to make sure the branch is divergent 2695 %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) 2696 %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo) 2697 %cc = icmp uge i32 %hi, 32 2698 br i1 %cc, label %endif, label %if 2699 2700if: 2701 %src1 = load volatile float, float addrspace(1)* undef 2702 %out = fadd float %src0, %src1 2703 %out.0 = call float @llvm.amdgcn.strict.wwm.f32(float %out) 2704 br label %endif 2705 2706endif: 2707 %out.1 = phi float [ %out.0, %if ], [ 0.0, %main_body ] 2708 ret float %out.1 2709} 2710 2711; Check that WWM is turned on correctly across basic block boundaries. 2712; loop version 2713define amdgpu_ps float @test_strict_wwm6_loop() { 2714; GFX9-W64-LABEL: test_strict_wwm6_loop: 2715; GFX9-W64: ; %bb.0: ; %main_body 2716; GFX9-W64-NEXT: s_or_saveexec_b64 s[0:1], -1 2717; GFX9-W64-NEXT: global_load_dword v1, v[3:4], off glc 2718; GFX9-W64-NEXT: s_waitcnt vmcnt(0) 2719; GFX9-W64-NEXT: s_mov_b64 exec, s[0:1] 2720; GFX9-W64-NEXT: v_mbcnt_lo_u32_b32 v3, -1, 0 2721; GFX9-W64-NEXT: s_mov_b64 s[0:1], 0 2722; GFX9-W64-NEXT: .LBB43_1: ; %loop 2723; GFX9-W64-NEXT: ; =>This Inner Loop Header: Depth=1 2724; GFX9-W64-NEXT: s_or_saveexec_b64 s[2:3], -1 2725; GFX9-W64-NEXT: global_load_dword v2, v[3:4], off glc 2726; GFX9-W64-NEXT: s_waitcnt vmcnt(0) 2727; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3] 2728; GFX9-W64-NEXT: v_add_u32_e32 v3, -1, v3 2729; GFX9-W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 2730; GFX9-W64-NEXT: s_or_saveexec_b64 s[2:3], -1 2731; GFX9-W64-NEXT: v_add_f32_e32 v2, v1, v2 2732; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3] 2733; GFX9-W64-NEXT: s_or_b64 s[0:1], 
vcc, s[0:1] 2734; GFX9-W64-NEXT: v_mov_b32_e32 v0, v2 2735; GFX9-W64-NEXT: s_andn2_b64 exec, exec, s[0:1] 2736; GFX9-W64-NEXT: s_cbranch_execnz .LBB43_1 2737; GFX9-W64-NEXT: ; %bb.2: ; %endloop 2738; GFX9-W64-NEXT: s_or_b64 exec, exec, s[0:1] 2739; GFX9-W64-NEXT: ; return to shader part epilog 2740; 2741; GFX10-W32-LABEL: test_strict_wwm6_loop: 2742; GFX10-W32: ; %bb.0: ; %main_body 2743; GFX10-W32-NEXT: s_or_saveexec_b32 s0, -1 2744; GFX10-W32-NEXT: global_load_dword v1, v[3:4], off glc dlc 2745; GFX10-W32-NEXT: s_waitcnt vmcnt(0) 2746; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0 2747; GFX10-W32-NEXT: v_mbcnt_lo_u32_b32 v3, -1, 0 2748; GFX10-W32-NEXT: s_mov_b32 s0, 0 2749; GFX10-W32-NEXT: .LBB43_1: ; %loop 2750; GFX10-W32-NEXT: ; =>This Inner Loop Header: Depth=1 2751; GFX10-W32-NEXT: s_or_saveexec_b32 s1, -1 2752; GFX10-W32-NEXT: global_load_dword v2, v[3:4], off glc dlc 2753; GFX10-W32-NEXT: s_waitcnt vmcnt(0) 2754; GFX10-W32-NEXT: s_mov_b32 exec_lo, s1 2755; GFX10-W32-NEXT: v_add_nc_u32_e32 v3, -1, v3 2756; GFX10-W32-NEXT: s_or_saveexec_b32 s1, -1 2757; GFX10-W32-NEXT: v_add_f32_e32 v2, v1, v2 2758; GFX10-W32-NEXT: s_mov_b32 exec_lo, s1 2759; GFX10-W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 2760; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2 2761; GFX10-W32-NEXT: s_or_b32 s0, vcc_lo, s0 2762; GFX10-W32-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 2763; GFX10-W32-NEXT: s_cbranch_execnz .LBB43_1 2764; GFX10-W32-NEXT: ; %bb.2: ; %endloop 2765; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 2766; GFX10-W32-NEXT: ; return to shader part epilog 2767main_body: 2768 %src0 = load volatile float, float addrspace(1)* undef 2769 ; use mbcnt to make sure the branch is divergent 2770 %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) 2771 %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo) 2772 br label %loop 2773 2774loop: 2775 %counter = phi i32 [ %lo, %main_body ], [ %counter.1, %loop ] 2776 %src1 = load volatile float, float addrspace(1)* undef 2777 %out = fadd float %src0, %src1 2778 %out.0 
= call float @llvm.amdgcn.strict.wwm.f32(float %out)
  %counter.1 = sub i32 %counter, 1
  %cc = icmp ne i32 %counter.1, 0
  br i1 %cc, label %loop, label %endloop

endloop:
  ret float %out.0
}

; Check that @llvm.amdgcn.set.inactive disables WWM.
;
; A buffer-loaded value is passed through set.inactive (inactive lanes get 0),
; added to itself, tagged with strict.wwm and stored back. In the expected code
; for both targets the write of 0 sits between a pair of exec inversions
; (s_not), and only the add is enclosed by the s_or_saveexec -1 / exec restore
; pair.
define amdgpu_ps void @test_strict_wwm_set_inactive1(i32 inreg %idx) {
; GFX9-W64-LABEL: test_strict_wwm_set_inactive1:
; GFX9-W64:       ; %bb.0: ; %main_body
; GFX9-W64-NEXT:    v_mov_b32_e32 v1, s0
; GFX9-W64-NEXT:    buffer_load_dword v0, v1, s[0:3], 0 idxen
; GFX9-W64-NEXT:    s_not_b64 exec, exec
; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
; GFX9-W64-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-W64-NEXT:    s_not_b64 exec, exec
; GFX9-W64-NEXT:    s_or_saveexec_b64 s[0:1], -1
; GFX9-W64-NEXT:    v_add_u32_e32 v0, v0, v0
; GFX9-W64-NEXT:    s_mov_b64 exec, s[0:1]
; GFX9-W64-NEXT:    v_mov_b32_e32 v2, v0
; GFX9-W64-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 idxen
; GFX9-W64-NEXT:    s_endpgm
;
; GFX10-W32-LABEL: test_strict_wwm_set_inactive1:
; GFX10-W32:       ; %bb.0: ; %main_body
; GFX10-W32-NEXT:    v_mov_b32_e32 v1, s0
; GFX10-W32-NEXT:    buffer_load_dword v0, v1, s[0:3], 0 idxen
; GFX10-W32-NEXT:    s_not_b32 exec_lo, exec_lo
; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
; GFX10-W32-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-W32-NEXT:    s_not_b32 exec_lo, exec_lo
; GFX10-W32-NEXT:    s_or_saveexec_b32 s0, -1
; GFX10-W32-NEXT:    v_add_nc_u32_e32 v0, v0, v0
; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s0
; GFX10-W32-NEXT:    v_mov_b32_e32 v2, v0
; GFX10-W32-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 idxen
; GFX10-W32-NEXT:    s_endpgm
main_body:
  %src = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx, i32 0, i32 0, i32 0)
  %src.0 = bitcast float %src to i32
  ; Inactive lanes are given the value 0 before the add below.
  %src.1 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %src.0, i32 0)
  %out = add i32 %src.1, %src.1
  %out.0 = call i32 @llvm.amdgcn.strict.wwm.i32(i32 %out)
  %out.1 = bitcast i32 %out.0 to float
  call void @llvm.amdgcn.struct.buffer.store.f32(float %out.1, <4 x i32> undef, i32 %idx, i32 0, i32 0, i32 0)
  ret void
}

; Check a case of a block being entirely WQM except for a bit of WWM.
; There was a bug where it forgot to enter and leave WWM.
;
; main_body issues two chained image samples and branches on %z == 0; the %IF
; block converts the sample to an integer, applies set.inactive, ds.swizzle and
; strict.wwm, and converts back. The %data argument is not used by the body.
; The expected code enters WQM at function entry (s_wqm); inside %IF the
; ds_swizzle is the only instruction between s_or_saveexec -1 and the exec
; restore.
define amdgpu_ps float @test_strict_wwm_within_wqm(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %c, i32 %z, float %data) {
; GFX9-W64-LABEL: test_strict_wwm_within_wqm:
; GFX9-W64:       ; %bb.0: ; %main_body
; GFX9-W64-NEXT:    s_mov_b64 s[12:13], exec
; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
; GFX9-W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
; GFX9-W64-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-W64-NEXT:    s_and_saveexec_b64 s[14:15], vcc
; GFX9-W64-NEXT:    s_cbranch_execz .LBB45_2
; GFX9-W64-NEXT:  ; %bb.1: ; %IF
; GFX9-W64-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1
; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
; GFX9-W64-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1
; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
; GFX9-W64-NEXT:    v_cvt_i32_f32_e32 v2, v0
; GFX9-W64-NEXT:    s_not_b64 exec, exec
; GFX9-W64-NEXT:    v_mov_b32_e32 v2, 0
; GFX9-W64-NEXT:    s_not_b64 exec, exec
; GFX9-W64-NEXT:    s_or_saveexec_b64 s[0:1], -1
; GFX9-W64-NEXT:    ds_swizzle_b32 v2, v2 offset:swizzle(SWAP,2)
; GFX9-W64-NEXT:    s_mov_b64 exec, s[0:1]
; GFX9-W64-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v2
; GFX9-W64-NEXT:    v_cvt_f32_i32_e32 v1, v0
; GFX9-W64-NEXT:  .LBB45_2: ; %ENDIF
; GFX9-W64-NEXT:    s_or_b64 exec, exec, s[14:15]
; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[12:13]
; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v1
; GFX9-W64-NEXT:    ; return to shader part epilog
;
; GFX10-W32-LABEL: test_strict_wwm_within_wqm:
; GFX10-W32:       ; %bb.0: ; %main_body
; GFX10-W32-NEXT:    s_mov_b32 s12, exec_lo
; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
; GFX10-W32-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-W32-NEXT:    s_and_saveexec_b32 s13, vcc_lo
; GFX10-W32-NEXT:    s_cbranch_execz .LBB45_2
; GFX10-W32-NEXT:  ; %bb.1: ; %IF
; GFX10-W32-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
; GFX10-W32-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
; GFX10-W32-NEXT:    v_cvt_i32_f32_e32 v2, v0
; GFX10-W32-NEXT:    s_not_b32 exec_lo, exec_lo
; GFX10-W32-NEXT:    v_mov_b32_e32 v2, 0
; GFX10-W32-NEXT:    s_not_b32 exec_lo, exec_lo
; GFX10-W32-NEXT:    s_or_saveexec_b32 s0, -1
; GFX10-W32-NEXT:    ds_swizzle_b32 v2, v2 offset:swizzle(SWAP,2)
; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s0
; GFX10-W32-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v2
; GFX10-W32-NEXT:    v_cvt_f32_i32_e32 v1, v0
; GFX10-W32-NEXT:  .LBB45_2: ; %ENDIF
; GFX10-W32-NEXT:    s_or_b32 exec_lo, exec_lo, s13
; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s12
; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v1
; GFX10-W32-NEXT:    ; return to shader part epilog
main_body:
  %c.bc = bitcast i32 %c to float
  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
  %tex0 = extractelement <4 x float> %tex, i32 0
  ; The second sample takes the first sample's result as its coordinate.
  %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
  %cmp = icmp eq i32 %z, 0
  br i1 %cmp, label %IF, label %ENDIF

IF:
  %dataf = extractelement <4 x float> %dtex, i32 0
  %data1 = fptosi float %dataf to i32
  %data2 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %data1, i32 0)
  ; Offset 2079 appears as swizzle(SWAP,2) in the expected assembly.
  %data3 = call i32 @llvm.amdgcn.ds.swizzle(i32 %data2, i32 2079)
  %data4 = call i32 @llvm.amdgcn.strict.wwm.i32(i32 %data3)
  %data4f = sitofp i32 %data4 to
float
  br label %ENDIF

ENDIF:
  %r = phi float [ %data4f, %IF ], [ 0.0, %main_body ]
  ret float %r
}

; Check a case of a block being entirely WQM except for a bit of STRICT WQM.
;
; Same IR shape as test_strict_wwm_within_wqm, but without set.inactive and
; with the swizzled value tagged by strict.wqm instead of strict.wwm. The
; %data argument is not used by the body. In the expected code the %IF block
; saves exec, applies s_wqm, runs the samples, the conversion and the
; ds_swizzle, and then restores the saved exec.
define amdgpu_ps float @test_strict_wqm_within_wqm(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %c, i32 %z, float %data) {
; GFX9-W64-LABEL: test_strict_wqm_within_wqm:
; GFX9-W64:       ; %bb.0: ; %main_body
; GFX9-W64-NEXT:    s_mov_b64 s[12:13], exec
; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
; GFX9-W64-NEXT:    s_mov_b64 s[14:15], exec
; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
; GFX9-W64-NEXT:    v_mov_b32_e32 v2, v0
; GFX9-W64-NEXT:    s_mov_b64 exec, s[14:15]
; GFX9-W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
; GFX9-W64-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-W64-NEXT:    s_and_saveexec_b64 s[14:15], vcc
; GFX9-W64-NEXT:    s_cbranch_execz .LBB46_2
; GFX9-W64-NEXT:  ; %bb.1: ; %IF
; GFX9-W64-NEXT:    s_mov_b64 s[16:17], exec
; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
; GFX9-W64-NEXT:    image_sample v2, v2, s[0:7], s[8:11] dmask:0x1
; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
; GFX9-W64-NEXT:    image_sample v2, v2, s[0:7], s[8:11] dmask:0x1
; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
; GFX9-W64-NEXT:    v_cvt_i32_f32_e32 v2, v2
; GFX9-W64-NEXT:    ds_swizzle_b32 v2, v2 offset:swizzle(SWAP,2)
; GFX9-W64-NEXT:    s_mov_b64 exec, s[16:17]
; GFX9-W64-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v2
; GFX9-W64-NEXT:    v_cvt_f32_i32_e32 v0, v0
; GFX9-W64-NEXT:  .LBB46_2: ; %ENDIF
; GFX9-W64-NEXT:    s_or_b64 exec, exec, s[14:15]
; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[12:13]
; GFX9-W64-NEXT:    ; return to shader part epilog
;
; GFX10-W32-LABEL: test_strict_wqm_within_wqm:
; GFX10-W32:       ; %bb.0: ; %main_body
; GFX10-W32-NEXT:    s_mov_b32 s12, exec_lo
; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT:    s_mov_b32 s13, exec_lo
; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT:    v_mov_b32_e32 v2, v0
; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s13
; GFX10-W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
; GFX10-W32-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-W32-NEXT:    s_and_saveexec_b32 s13, vcc_lo
; GFX10-W32-NEXT:    s_cbranch_execz .LBB46_2
; GFX10-W32-NEXT:  ; %bb.1: ; %IF
; GFX10-W32-NEXT:    s_mov_b32 s14, exec_lo
; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT:    image_sample v2, v2, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
; GFX10-W32-NEXT:    image_sample v2, v2, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
; GFX10-W32-NEXT:    v_cvt_i32_f32_e32 v2, v2
; GFX10-W32-NEXT:    ds_swizzle_b32 v2, v2 offset:swizzle(SWAP,2)
; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s14
; GFX10-W32-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v2
; GFX10-W32-NEXT:    v_cvt_f32_i32_e32 v0, v0
; GFX10-W32-NEXT:  .LBB46_2: ; %ENDIF
; GFX10-W32-NEXT:    s_or_b32 exec_lo, exec_lo, s13
; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s12
; GFX10-W32-NEXT:    ; return to shader part epilog
main_body:
  %c.bc = bitcast i32 %c to float
  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
  %tex0 = extractelement <4 x float> %tex, i32 0
  ; The second sample takes the first sample's result as its coordinate.
  %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
  %cmp = icmp eq i32 %z, 0
  br i1 %cmp, label %IF, label %ENDIF

IF:
  %dataf = extractelement <4 x float> %dtex, i32 0
  %data1 = fptosi float %dataf to i32
  ; Offset 2079 appears as swizzle(SWAP,2) in the expected assembly.
  %data2 = call i32 @llvm.amdgcn.ds.swizzle(i32 %data1, i32 2079)
  %data3 = call i32 @llvm.amdgcn.strict.wqm.i32(i32 %data2)
  %data3f = sitofp i32 %data3 to float
  br label %ENDIF

ENDIF:
  %r = phi float [ %data3f,
%IF ], [ 0.0, %main_body ]
  ret float %r
}

;TODO: StrictWQM -> WQM transition could be improved. WQM could use the exec from the previous state instead of calling s_wqm again.
;
; Sequence under test: a strict.wqm value, then a strict.wwm value, then an
; image sample consuming their sum. In the expected code the load feeding the
; strict.wqm value is wrapped in save exec / s_wqm / restore, the load feeding
; strict.wwm is wrapped in s_or_saveexec -1 / restore, and a further s_wqm is
; issued right after the exec restore that follows the strict-WQM add (the
; redundancy the TODO above refers to).
define amdgpu_ps float @test_strict_wqm_strict_wwm_wqm(i32 inreg %idx0, i32 inreg %idx1, <4 x i32> inreg %res, <4 x i32> inreg %res2, float %inp, <8 x i32> inreg %res3) {
; GFX9-W64-LABEL: test_strict_wqm_strict_wwm_wqm:
; GFX9-W64:       ; %bb.0: ; %main_body
; GFX9-W64-NEXT:    s_mov_b64 s[28:29], exec
; GFX9-W64-NEXT:    s_mov_b32 s19, s17
; GFX9-W64-NEXT:    s_mov_b64 s[30:31], exec
; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
; GFX9-W64-NEXT:    s_mov_b32 s23, s5
; GFX9-W64-NEXT:    s_mov_b32 s22, s4
; GFX9-W64-NEXT:    s_mov_b32 s21, s3
; GFX9-W64-NEXT:    s_mov_b32 s20, s2
; GFX9-W64-NEXT:    s_mov_b32 s27, s9
; GFX9-W64-NEXT:    s_mov_b32 s26, s8
; GFX9-W64-NEXT:    s_mov_b32 s25, s7
; GFX9-W64-NEXT:    s_mov_b32 s24, s6
; GFX9-W64-NEXT:    s_mov_b32 s18, s16
; GFX9-W64-NEXT:    s_mov_b32 s17, s15
; GFX9-W64-NEXT:    s_mov_b32 s16, s14
; GFX9-W64-NEXT:    s_mov_b32 s15, s13
; GFX9-W64-NEXT:    s_mov_b32 s14, s12
; GFX9-W64-NEXT:    s_mov_b32 s13, s11
; GFX9-W64-NEXT:    s_mov_b32 s12, s10
; GFX9-W64-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-W64-NEXT:    s_mov_b64 exec, s[30:31]
; GFX9-W64-NEXT:    buffer_store_dword v0, v1, s[20:23], 0 idxen
; GFX9-W64-NEXT:    s_mov_b64 s[2:3], exec
; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
; GFX9-W64-NEXT:    buffer_load_dword v2, v1, s[20:23], 0 idxen
; GFX9-W64-NEXT:    s_mov_b64 exec, s[2:3]
; GFX9-W64-NEXT:    s_or_saveexec_b64 s[2:3], -1
; GFX9-W64-NEXT:    v_mov_b32_e32 v3, s0
; GFX9-W64-NEXT:    buffer_load_dword v3, v3, s[24:27], 0 idxen
; GFX9-W64-NEXT:    s_mov_b64 exec, s[2:3]
; GFX9-W64-NEXT:    s_mov_b64 s[0:1], exec
; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
; GFX9-W64-NEXT:    s_waitcnt vmcnt(1)
; GFX9-W64-NEXT:    v_add_f32_e32 v2, v2, v2
; GFX9-W64-NEXT:    s_mov_b64 exec, s[0:1]
; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v2
; GFX9-W64-NEXT:    v_add_f32_e32 v0, v0, v0
; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
; GFX9-W64-NEXT:    v_mov_b32_e32 v4, v3
; GFX9-W64-NEXT:    v_add_f32_e32 v0, v0, v4
; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[28:29]
; GFX9-W64-NEXT:    image_sample v0, v0, s[12:19], s[20:23] dmask:0x1
; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
; GFX9-W64-NEXT:    buffer_store_dword v0, v1, s[20:23], 0 idxen
; GFX9-W64-NEXT:    buffer_load_dword v0, v1, s[20:23], 0 idxen
; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
; GFX9-W64-NEXT:    ; return to shader part epilog
;
; GFX10-W32-LABEL: test_strict_wqm_strict_wwm_wqm:
; GFX10-W32:       ; %bb.0: ; %main_body
; GFX10-W32-NEXT:    s_mov_b32 s28, exec_lo
; GFX10-W32-NEXT:    s_mov_b32 s19, s17
; GFX10-W32-NEXT:    s_mov_b32 s29, exec_lo
; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT:    s_mov_b32 s23, s5
; GFX10-W32-NEXT:    s_mov_b32 s22, s4
; GFX10-W32-NEXT:    s_mov_b32 s21, s3
; GFX10-W32-NEXT:    s_mov_b32 s20, s2
; GFX10-W32-NEXT:    s_mov_b32 s27, s9
; GFX10-W32-NEXT:    s_mov_b32 s26, s8
; GFX10-W32-NEXT:    s_mov_b32 s25, s7
; GFX10-W32-NEXT:    s_mov_b32 s24, s6
; GFX10-W32-NEXT:    s_mov_b32 s18, s16
; GFX10-W32-NEXT:    s_mov_b32 s17, s15
; GFX10-W32-NEXT:    s_mov_b32 s16, s14
; GFX10-W32-NEXT:    s_mov_b32 s15, s13
; GFX10-W32-NEXT:    s_mov_b32 s14, s12
; GFX10-W32-NEXT:    s_mov_b32 s13, s11
; GFX10-W32-NEXT:    s_mov_b32 s12, s10
; GFX10-W32-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s29
; GFX10-W32-NEXT:    buffer_store_dword v0, v1, s[20:23], 0 idxen
; GFX10-W32-NEXT:    s_mov_b32 s1, exec_lo
; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT:    buffer_load_dword v2, v1, s[20:23], 0 idxen
; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-W32-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-W32-NEXT:    v_mov_b32_e32 v3, s0
; GFX10-W32-NEXT:    buffer_load_dword v3, v3, s[24:27], 0 idxen
; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-W32-NEXT:    s_mov_b32 s0, exec_lo
; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT:    s_waitcnt vmcnt(1)
; GFX10-W32-NEXT:    v_add_f32_e32 v2, v2, v2
; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s0
; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v2
; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
; GFX10-W32-NEXT:    v_mov_b32_e32 v4, v3
; GFX10-W32-NEXT:    v_add_f32_e32 v0, v0, v0
; GFX10-W32-NEXT:    v_add_f32_e32 v0, v0, v4
; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s28
; GFX10-W32-NEXT:    image_sample v0, v0, s[12:19], s[20:23] dmask:0x1 dim:SQ_RSRC_IMG_1D
; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
; GFX10-W32-NEXT:    buffer_store_dword v0, v1, s[20:23], 0 idxen
; GFX10-W32-NEXT:    buffer_load_dword v0, v1, s[20:23], 0 idxen
; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
; GFX10-W32-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-W32-NEXT:    ; return to shader part epilog
main_body:
  call void @llvm.amdgcn.struct.buffer.store.f32(float %inp, <4 x i32> %res, i32 %idx1, i32 0, i32 0, i32 0)
  ; Strict WQM part: reload the stored value, double it, tag with strict.wqm.
  %reload = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %res, i32 %idx1, i32 0, i32 0, i32 0)
  %temp = fadd float %reload, %reload
  %temp2 = call float @llvm.amdgcn.strict.wqm.f32(float %temp)
  %temp3 = fadd float %temp2, %temp2
  ; Strict WWM part: a load from the second resource, tagged with strict.wwm.
  %reload_wwm = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %res2, i32 %idx0, i32 0, i32 0, i32 0)
  %temp4 = call float @llvm.amdgcn.strict.wwm.f32(float %reload_wwm)
  %temp5 = fadd float %temp3, %temp4
  ; WQM part: the image sample consumes the combined value.
  %tex = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 1, float %temp5, <8 x i32> %res3, <4 x i32> %res, i1 false, i32 0, i32 0)
  call void @llvm.amdgcn.struct.buffer.store.f32(float %tex, <4 x i32> %res, i32 %idx1, i32 0, i32 0, i32 0)
  %out = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %res, i32 %idx1, i32 0,
i32 0, i32 0)
  ret float %out
}

; Sequence under test: a strict.wwm value, then a strict.wqm value, then an
; image sample consuming their sum (the strict modes appear in the opposite
; order to test_strict_wqm_strict_wwm_wqm). In the expected code the load and
; add feeding strict.wwm are wrapped in s_or_saveexec -1 / restore pairs, the
; load feeding strict.wqm is wrapped in save exec / s_wqm / restore, and s_wqm
; is applied once more before the adds that feed the image sample.
define amdgpu_ps float @test_strict_wwm_strict_wqm_wqm(i32 inreg %idx0, i32 inreg %idx1, <4 x i32> inreg %res, float %inp, <8 x i32> inreg %res2) {
; GFX9-W64-LABEL: test_strict_wwm_strict_wqm_wqm:
; GFX9-W64:       ; %bb.0: ; %main_body
; GFX9-W64-NEXT:    s_mov_b64 s[20:21], exec
; GFX9-W64-NEXT:    s_mov_b32 s15, s13
; GFX9-W64-NEXT:    s_mov_b64 s[22:23], exec
; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
; GFX9-W64-NEXT:    s_mov_b32 s19, s5
; GFX9-W64-NEXT:    s_mov_b32 s18, s4
; GFX9-W64-NEXT:    s_mov_b32 s17, s3
; GFX9-W64-NEXT:    s_mov_b32 s16, s2
; GFX9-W64-NEXT:    s_mov_b32 s14, s12
; GFX9-W64-NEXT:    s_mov_b32 s13, s11
; GFX9-W64-NEXT:    s_mov_b32 s12, s10
; GFX9-W64-NEXT:    s_mov_b32 s11, s9
; GFX9-W64-NEXT:    s_mov_b32 s10, s8
; GFX9-W64-NEXT:    s_mov_b32 s9, s7
; GFX9-W64-NEXT:    s_mov_b32 s8, s6
; GFX9-W64-NEXT:    v_mov_b32_e32 v1, s0
; GFX9-W64-NEXT:    s_mov_b64 exec, s[22:23]
; GFX9-W64-NEXT:    buffer_store_dword v0, v1, s[16:19], 0 idxen
; GFX9-W64-NEXT:    s_or_saveexec_b64 s[2:3], -1
; GFX9-W64-NEXT:    v_mov_b32_e32 v2, s1
; GFX9-W64-NEXT:    buffer_load_dword v2, v2, s[16:19], 0 idxen
; GFX9-W64-NEXT:    s_mov_b64 exec, s[2:3]
; GFX9-W64-NEXT:    s_mov_b64 s[0:1], exec
; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
; GFX9-W64-NEXT:    buffer_load_dword v3, v1, s[16:19], 0 idxen
; GFX9-W64-NEXT:    s_mov_b64 exec, s[0:1]
; GFX9-W64-NEXT:    s_or_saveexec_b64 s[0:1], -1
; GFX9-W64-NEXT:    s_waitcnt vmcnt(1)
; GFX9-W64-NEXT:    v_add_f32_e32 v2, v2, v2
; GFX9-W64-NEXT:    s_mov_b64 exec, s[0:1]
; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v2
; GFX9-W64-NEXT:    v_add_f32_e32 v0, v0, v0
; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
; GFX9-W64-NEXT:    v_mov_b32_e32 v4, v3
; GFX9-W64-NEXT:    v_add_f32_e32 v0, v0, v4
; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[20:21]
; GFX9-W64-NEXT:    image_sample v0, v0, s[8:15], s[16:19] dmask:0x1
; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
; GFX9-W64-NEXT:    buffer_store_dword v0, v1, s[16:19], 0 idxen
; GFX9-W64-NEXT:    buffer_load_dword v0, v1, s[16:19], 0 idxen
; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
; GFX9-W64-NEXT:    ; return to shader part epilog
;
; GFX10-W32-LABEL: test_strict_wwm_strict_wqm_wqm:
; GFX10-W32:       ; %bb.0: ; %main_body
; GFX10-W32-NEXT:    s_mov_b32 s20, exec_lo
; GFX10-W32-NEXT:    s_mov_b32 s15, s13
; GFX10-W32-NEXT:    s_mov_b32 s21, exec_lo
; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT:    s_mov_b32 s19, s5
; GFX10-W32-NEXT:    s_mov_b32 s18, s4
; GFX10-W32-NEXT:    s_mov_b32 s17, s3
; GFX10-W32-NEXT:    s_mov_b32 s16, s2
; GFX10-W32-NEXT:    s_mov_b32 s14, s12
; GFX10-W32-NEXT:    s_mov_b32 s13, s11
; GFX10-W32-NEXT:    s_mov_b32 s12, s10
; GFX10-W32-NEXT:    s_mov_b32 s11, s9
; GFX10-W32-NEXT:    s_mov_b32 s10, s8
; GFX10-W32-NEXT:    s_mov_b32 s9, s7
; GFX10-W32-NEXT:    s_mov_b32 s8, s6
; GFX10-W32-NEXT:    v_mov_b32_e32 v1, s0
; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s21
; GFX10-W32-NEXT:    s_or_saveexec_b32 s0, -1
; GFX10-W32-NEXT:    v_mov_b32_e32 v2, s1
; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s0
; GFX10-W32-NEXT:    buffer_store_dword v0, v1, s[16:19], 0 idxen
; GFX10-W32-NEXT:    s_or_saveexec_b32 s0, -1
; GFX10-W32-NEXT:    buffer_load_dword v2, v2, s[16:19], 0 idxen
; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s0
; GFX10-W32-NEXT:    s_mov_b32 s0, exec_lo
; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT:    buffer_load_dword v3, v1, s[16:19], 0 idxen
; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s0
; GFX10-W32-NEXT:    s_or_saveexec_b32 s0, -1
; GFX10-W32-NEXT:    s_waitcnt vmcnt(1)
; GFX10-W32-NEXT:    v_add_f32_e32 v2, v2, v2
; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s0
; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v2
; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
; GFX10-W32-NEXT:    v_mov_b32_e32 v4, v3
; GFX10-W32-NEXT:    v_add_f32_e32 v0, v0, v0
; GFX10-W32-NEXT:    v_add_f32_e32 v0, v0, v4
; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s20
; GFX10-W32-NEXT:    image_sample v0, v0, s[8:15], s[16:19] dmask:0x1 dim:SQ_RSRC_IMG_1D
; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
; GFX10-W32-NEXT:    buffer_store_dword v0, v1, s[16:19], 0 idxen
; GFX10-W32-NEXT:    buffer_load_dword v0, v1, s[16:19], 0 idxen
; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
; GFX10-W32-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-W32-NEXT:    ; return to shader part epilog
main_body:
  call void @llvm.amdgcn.struct.buffer.store.f32(float %inp, <4 x i32> %res, i32 %idx0, i32 0, i32 0, i32 0)
  ; Strict WWM part: load at %idx1, double it, tag with strict.wwm.
  %reload = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %res, i32 %idx1, i32 0, i32 0, i32 0)
  %temp = fadd float %reload, %reload
  %temp2 = call float @llvm.amdgcn.strict.wwm.f32(float %temp)
  %temp3 = fadd float %temp2, %temp2
  ; Strict WQM part. Note: despite its name, %reload_wwm feeds strict.wqm here.
  %reload_wwm = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %res, i32 %idx0, i32 0, i32 0, i32 0)
  %temp4 = call float @llvm.amdgcn.strict.wqm.f32(float %reload_wwm)
  %temp5 = fadd float %temp3, %temp4
  ; WQM part: the image sample consumes the combined value.
  %tex = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 1, float %temp5, <8 x i32> %res2, <4 x i32> %res, i1 false, i32 0, i32 0)
  call void @llvm.amdgcn.struct.buffer.store.f32(float %tex, <4 x i32> %res, i32 %idx0, i32 0, i32 0, i32 0)
  %out = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %res, i32 %idx0, i32 0, i32 0, i32 0)
  ret float %out
}

;TODO: WQM -> StrictWQM transition could be improved. StrictWQM could use the exec from the previous state instead of calling s_wqm again.
; Sequence under test: an image sample (WQM), then a strict.wqm value, then a
; second image sample consuming their sum. In the expected code the load
; feeding strict.wqm is wrapped in save exec / s_wqm / restore even though an
; s_wqm was already issued just before it.
define amdgpu_ps float @test_wqm_strict_wqm_wqm(i32 inreg %idx0, i32 inreg %idx1, <4 x i32> inreg %res, float %inp, <8 x i32> inreg %res2) {
; GFX9-W64-LABEL: test_wqm_strict_wqm_wqm:
; GFX9-W64:       ; %bb.0: ; %main_body
; GFX9-W64-NEXT:    s_mov_b64 s[20:21], exec
; GFX9-W64-NEXT:    s_mov_b32 s15, s13
; GFX9-W64-NEXT:    s_mov_b64 s[22:23], exec
; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
; GFX9-W64-NEXT:    s_mov_b32 s19, s5
; GFX9-W64-NEXT:    s_mov_b32 s18, s4
; GFX9-W64-NEXT:    s_mov_b32 s17, s3
; GFX9-W64-NEXT:    s_mov_b32 s16, s2
; GFX9-W64-NEXT:    s_mov_b32 s14, s12
; GFX9-W64-NEXT:    s_mov_b32 s13, s11
; GFX9-W64-NEXT:    s_mov_b32 s12, s10
; GFX9-W64-NEXT:    s_mov_b32 s11, s9
; GFX9-W64-NEXT:    s_mov_b32 s10, s8
; GFX9-W64-NEXT:    s_mov_b32 s9, s7
; GFX9-W64-NEXT:    s_mov_b32 s8, s6
; GFX9-W64-NEXT:    v_mov_b32_e32 v1, s0
; GFX9-W64-NEXT:    s_mov_b64 exec, s[22:23]
; GFX9-W64-NEXT:    buffer_store_dword v0, v1, s[16:19], 0 idxen
; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
; GFX9-W64-NEXT:    v_mov_b32_e32 v0, s1
; GFX9-W64-NEXT:    buffer_load_dword v0, v0, s[16:19], 0 idxen
; GFX9-W64-NEXT:    s_mov_b64 s[0:1], exec
; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
; GFX9-W64-NEXT:    buffer_load_dword v2, v1, s[16:19], 0 idxen
; GFX9-W64-NEXT:    s_mov_b64 exec, s[0:1]
; GFX9-W64-NEXT:    s_waitcnt vmcnt(1)
; GFX9-W64-NEXT:    v_add_f32_e32 v0, v0, v0
; GFX9-W64-NEXT:    image_sample v0, v0, s[8:15], s[16:19] dmask:0x1
; GFX9-W64-NEXT:    s_waitcnt vmcnt(1)
; GFX9-W64-NEXT:    v_mov_b32_e32 v3, v2
; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
; GFX9-W64-NEXT:    v_add_f32_e32 v0, v0, v0
; GFX9-W64-NEXT:    v_add_f32_e32 v0, v0, v3
; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[20:21]
; GFX9-W64-NEXT:    image_sample v0, v0, s[8:15], s[16:19] dmask:0x1
; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
; GFX9-W64-NEXT:    buffer_store_dword v0, v1, s[16:19], 0 idxen
; GFX9-W64-NEXT:    buffer_load_dword v0, v1, s[16:19], 0 idxen
; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
; GFX9-W64-NEXT:    ; return to shader part epilog
;
; GFX10-W32-LABEL: test_wqm_strict_wqm_wqm:
; GFX10-W32:       ; %bb.0: ; %main_body
; GFX10-W32-NEXT:    s_mov_b32 s20, exec_lo
; GFX10-W32-NEXT:    s_mov_b32 s15, s13
; GFX10-W32-NEXT:    s_mov_b32 s21, exec_lo
; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT:    s_mov_b32 s19, s5
; GFX10-W32-NEXT:    s_mov_b32 s18, s4
; GFX10-W32-NEXT:    s_mov_b32 s17, s3
; GFX10-W32-NEXT:    s_mov_b32 s16, s2
; GFX10-W32-NEXT:    s_mov_b32 s14, s12
; GFX10-W32-NEXT:    s_mov_b32 s13, s11
; GFX10-W32-NEXT:    s_mov_b32 s12, s10
; GFX10-W32-NEXT:    s_mov_b32 s11, s9
; GFX10-W32-NEXT:    s_mov_b32 s10, s8
; GFX10-W32-NEXT:    s_mov_b32 s9, s7
; GFX10-W32-NEXT:    s_mov_b32 s8, s6
; GFX10-W32-NEXT:    v_mov_b32_e32 v1, s0
; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s21
; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT:    v_mov_b32_e32 v3, s1
; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s20
; GFX10-W32-NEXT:    buffer_store_dword v0, v1, s[16:19], 0 idxen
; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT:    buffer_load_dword v0, v3, s[16:19], 0 idxen
; GFX10-W32-NEXT:    s_mov_b32 s0, exec_lo
; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT:    buffer_load_dword v2, v1, s[16:19], 0 idxen
; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s0
; GFX10-W32-NEXT:    s_waitcnt vmcnt(1)
; GFX10-W32-NEXT:    v_add_f32_e32 v0, v0, v0
; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
; GFX10-W32-NEXT:    v_mov_b32_e32 v3, v2
; GFX10-W32-NEXT:    image_sample v0, v0, s[8:15], s[16:19] dmask:0x1 dim:SQ_RSRC_IMG_1D
; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
; GFX10-W32-NEXT:    v_add_f32_e32 v0, v0, v0
; GFX10-W32-NEXT:    v_add_f32_e32 v0, v0, v3
; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s20
; GFX10-W32-NEXT:    image_sample v0, v0, s[8:15], s[16:19] dmask:0x1 dim:SQ_RSRC_IMG_1D
; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
; GFX10-W32-NEXT:    buffer_store_dword v0, v1, s[16:19], 0 idxen
; GFX10-W32-NEXT:    buffer_load_dword v0, v1, s[16:19], 0 idxen
; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
; GFX10-W32-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-W32-NEXT:    ; return to shader part epilog
main_body:
  call void @llvm.amdgcn.struct.buffer.store.f32(float %inp, <4 x i32> %res, i32 %idx0, i32 0, i32 0, i32 0)
  ; WQM part: the doubled reload is the coordinate of the first image sample.
  %reload = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %res, i32 %idx1, i32 0, i32 0, i32 0)
  %temp = fadd float %reload, %reload
  %tex = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 1, float %temp, <8 x i32> %res2, <4 x i32> %res, i1 false, i32 0, i32 0)
  %temp2 = fadd float %tex, %tex
  ; Strict WQM part. Note: despite its name, %reload_wwm feeds strict.wqm here.
  %reload_wwm = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %res, i32 %idx0, i32 0, i32 0, i32 0)
  %temp3 = call float @llvm.amdgcn.strict.wqm.f32(float %reload_wwm)
  %temp4 = fadd float %temp2, %temp3
  ; WQM part again: the second image sample consumes the combined value.
  %tex2 = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 1, float %temp4, <8 x i32> %res2, <4 x i32> %res, i1 false, i32 0, i32 0)
  call void @llvm.amdgcn.struct.buffer.store.f32(float %tex2, <4 x i32> %res, i32 %idx0, i32 0, i32 0, i32 0)
  %out = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %res, i32 %idx0, i32 0, i32 0, i32 0)
  ret float %out
}

; Intrinsic declarations.
;
; The attribute group on each declaration describes its memory behaviour:
; #1 for intrinsics that write or have other side effects, #2 for intrinsics
; that only read, #3 for intrinsics that touch no memory. Stores were
; previously tagged #2 (readonly) and loads/samples #3 (readnone); both are
; corrected below. As far as I know LLVM takes the effective attributes of an
; intrinsic from its definition rather than from the declaration, so this is
; expected to be informational only and not to affect the generated code.

; Exports and stores.
declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #1
declare void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float>, i32, i32, <8 x i32>, i32, i32) #1

declare void @llvm.amdgcn.struct.buffer.store.f32(float, <4 x i32>, i32, i32, i32, i32 immarg) #1
declare void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32, i32 immarg) #1
declare void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32 immarg) #1
declare void @llvm.amdgcn.raw.buffer.store.f32(float, <4 x i32>, i32, i32, i32 immarg) #1

; Buffer and image reads.
declare float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32>, i32, i32, i32) #2
declare float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32>, i32, i32, i32, i32) #2

declare <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32, i32, <8 x i32>, i32, i32) #2
declare <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #2
declare <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #2
declare float @llvm.amdgcn.image.sample.1d.f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #2

; Execution-mode markers and miscellaneous helpers.
declare void @llvm.amdgcn.kill(i1) #1
declare float @llvm.amdgcn.wqm.f32(float) #3
declare i32 @llvm.amdgcn.wqm.i32(i32) #3
declare float @llvm.amdgcn.strict.wwm.f32(float) #3
declare i32 @llvm.amdgcn.strict.wwm.i32(i32) #3
declare float @llvm.amdgcn.wwm.f32(float) #3
declare i32 @llvm.amdgcn.wwm.i32(i32) #3
declare float @llvm.amdgcn.strict.wqm.f32(float) #3
declare i32 @llvm.amdgcn.strict.wqm.i32(i32) #3
declare i32 @llvm.amdgcn.set.inactive.i32(i32, i32) #4
declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #3
declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32) #3
declare <2 x half> @llvm.amdgcn.cvt.pkrtz(float, float) #3
declare void @llvm.amdgcn.exp.compr.v2f16(i32, i32, <2 x half>, <2 x half>, i1, i1) #1
declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32) #2
declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) #2
declare i32 @llvm.amdgcn.ds.swizzle(i32, i32)

; Attribute groups.
; NOTE(review): some call sites in this file use #0, but no "attributes #0"
; group is defined in this part of the file - confirm whether that is intended.
attributes #1 = { nounwind }
attributes #2 = { nounwind readonly }
attributes #3 = { nounwind readnone }
attributes #4 = { nounwind readnone convergent }
attributes #5 = { "amdgpu-ps-wqm-outputs" }
attributes #6 = { nounwind "InitialPSInputAddr"="2" }