; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9-W64 %s
; RUN: llc -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs -mattr=+wavefrontsize32,-wavefrontsize64 < %s | FileCheck -check-prefixes=GFX10-W32 %s

; Check that WQM isn't triggered by image load/store intrinsics.
define amdgpu_ps <4 x float> @test1(<8 x i32> inreg %rsrc, i32 %c) {
; GFX9-W64-LABEL: test1:
; GFX9-W64: ; %bb.0: ; %main_body
; GFX9-W64-NEXT: v_mov_b32_e32 v4, v0
; GFX9-W64-NEXT: image_load v[0:3], v0, s[0:7] dmask:0xf unorm
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf unorm
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: ; return to shader part epilog
;
; GFX10-W32-LABEL: test1:
; GFX10-W32: ; %bb.0: ; %main_body
; GFX10-W32-NEXT: v_mov_b32_e32 v4, v0
; GFX10-W32-NEXT: image_load v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm
; GFX10-W32-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-W32-NEXT: ; return to shader part epilog
main_body:
  ; Load a texel and store it straight back to the same coordinate; the
  ; expected output above contains no s_wqm instruction.
  %tex = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32 15, i32 %c, <8 x i32> %rsrc, i32 0, i32 0)
  call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> %tex, i32 15, i32 %c, <8 x i32> %rsrc, i32 0, i32 0)
  ret <4 x float> %tex
}

; Check that WQM is triggered by code calculating inputs to image samples and is disabled as soon as possible
define amdgpu_ps <4 x float> @test2(i32 inreg, i32 inreg, i32 inreg, i32 inreg %m0, <8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, <2 x float> %pos) #6 {
; GFX9-W64-LABEL: test2:
; GFX9-W64: ; %bb.0: ; %main_body
; GFX9-W64-NEXT: s_mov_b64 s[0:1], exec
; GFX9-W64-NEXT: s_wqm_b64 exec, exec
; GFX9-W64-NEXT: s_mov_b32 m0, s3
; GFX9-W64-NEXT: s_nop 0
; GFX9-W64-NEXT: v_interp_p1_f32_e32 v2, v0, attr0.x
; GFX9-W64-NEXT: v_interp_p1_f32_e32 v3, v0, attr0.y
; GFX9-W64-NEXT: v_interp_p2_f32_e32 v2, v1, attr0.x
; GFX9-W64-NEXT: v_interp_p2_f32_e32 v3, v1, attr0.y
; GFX9-W64-NEXT: s_and_b64 exec, exec, s[0:1]
; GFX9-W64-NEXT: image_sample v[0:3], v[2:3], s[4:11], s[12:15] dmask:0xf
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: ; return to shader part epilog
;
; GFX10-W32-LABEL: test2:
; GFX10-W32: ; %bb.0: ; %main_body
; GFX10-W32-NEXT: s_mov_b32 s0, exec_lo
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT: s_mov_b32 m0, s3
; GFX10-W32-NEXT: v_interp_p1_f32_e32 v2, v0, attr0.x
; GFX10-W32-NEXT: v_interp_p1_f32_e32 v3, v0, attr0.y
; GFX10-W32-NEXT: v_interp_p2_f32_e32 v2, v1, attr0.x
; GFX10-W32-NEXT: v_interp_p2_f32_e32 v3, v1, attr0.y
; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s0
; GFX10-W32-NEXT: image_sample v[0:3], v[2:3], s[4:11], s[12:15] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: ; return to shader part epilog
main_body:
  ; Interpolate the two sample coordinates from %pos; these feed the
  ; image sample below.
  %inst23 = extractelement <2 x float> %pos, i32 0
  %inst24 = extractelement <2 x float> %pos, i32 1
  %inst25 = tail call float @llvm.amdgcn.interp.p1(float %inst23, i32 0, i32 0, i32 %m0)
  %inst26 = tail call float @llvm.amdgcn.interp.p2(float %inst25, float %inst24, i32 0, i32 0, i32 %m0)
  %inst28 = tail call float @llvm.amdgcn.interp.p1(float %inst23, i32 1, i32 0, i32 %m0)
  %inst29 = tail call float @llvm.amdgcn.interp.p2(float %inst28, float %inst24, i32 1, i32 0, i32 %m0)
  %tex = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %inst26, float %inst29, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
  ret <4 x float> %tex
}

; ... but disabled for stores (and, in this simple case, not re-enabled) ...
define amdgpu_ps <4 x float> @test3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float %c) {
; GFX9-W64-LABEL: test3:
; GFX9-W64: ; %bb.0: ; %main_body
; GFX9-W64-NEXT: s_mov_b64 s[12:13], exec
; GFX9-W64-NEXT: s_wqm_b64 exec, exec
; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13]
; GFX9-W64-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: buffer_store_dwordx4 v[0:3], v0, s[0:3], 0 idxen
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: ; return to shader part epilog
;
; GFX10-W32-LABEL: test3:
; GFX10-W32: ; %bb.0: ; %main_body
; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12
; GFX10-W32-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: buffer_store_dwordx4 v[0:3], v0, s[0:3], 0 idxen
; GFX10-W32-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-W32-NEXT: ; return to shader part epilog
main_body:
  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
  ; The first sampled component, reinterpreted as i32, is the store index.
  %tex.1 = bitcast <4 x float> %tex to <4 x i32>
  %tex.2 = extractelement <4 x i32> %tex.1, i32 0

  call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %tex, <4 x i32> undef, i32 %tex.2, i32 0, i32 0, i32 0)

  ret <4 x float> %tex
}

; ... and disabled for export.
define amdgpu_ps void @test3x(i32 inreg, i32 inreg, i32 inreg, i32 inreg %m0, <8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, <2 x float> %pos) #6 {
; GFX9-W64-LABEL: test3x:
; GFX9-W64: ; %bb.0: ; %main_body
; GFX9-W64-NEXT: s_mov_b64 s[0:1], exec
; GFX9-W64-NEXT: s_wqm_b64 exec, exec
; GFX9-W64-NEXT: s_mov_b32 m0, s3
; GFX9-W64-NEXT: s_nop 0
; GFX9-W64-NEXT: v_interp_p1_f32_e32 v2, v0, attr0.x
; GFX9-W64-NEXT: v_interp_p1_f32_e32 v3, v0, attr0.y
; GFX9-W64-NEXT: v_interp_p2_f32_e32 v2, v1, attr0.x
; GFX9-W64-NEXT: v_interp_p2_f32_e32 v3, v1, attr0.y
; GFX9-W64-NEXT: s_and_b64 exec, exec, s[0:1]
; GFX9-W64-NEXT: image_sample v[0:3], v[2:3], s[4:11], s[12:15] dmask:0xf
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: exp mrt0 v0, v1, v2, v3 done vm
; GFX9-W64-NEXT: s_endpgm
;
; GFX10-W32-LABEL: test3x:
; GFX10-W32: ; %bb.0: ; %main_body
; GFX10-W32-NEXT: s_mov_b32 s0, exec_lo
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT: s_mov_b32 m0, s3
; GFX10-W32-NEXT: v_interp_p1_f32_e32 v2, v0, attr0.x
; GFX10-W32-NEXT: v_interp_p1_f32_e32 v3, v0, attr0.y
; GFX10-W32-NEXT: v_interp_p2_f32_e32 v2, v1, attr0.x
; GFX10-W32-NEXT: v_interp_p2_f32_e32 v3, v1, attr0.y
; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s0
; GFX10-W32-NEXT: image_sample v[0:3], v[2:3], s[4:11], s[12:15] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: exp mrt0 v0, v1, v2, v3 done vm
; GFX10-W32-NEXT: s_endpgm
main_body:
  %inst23 = extractelement <2 x float> %pos, i32 0
  %inst24 = extractelement <2 x float> %pos, i32 1
  %inst25 = tail call float @llvm.amdgcn.interp.p1(float %inst23, i32 0, i32 0, i32 %m0)
  %inst26 = tail call float @llvm.amdgcn.interp.p2(float %inst25, float %inst24, i32 0, i32 0, i32 %m0)
  %inst28 = tail call float @llvm.amdgcn.interp.p1(float %inst23, i32 1, i32 0, i32 %m0)
  %inst29 = tail call float @llvm.amdgcn.interp.p2(float %inst28, float %inst24, i32 1, i32 0, i32 %m0)
  %tex = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %inst26, float %inst29, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
  ; Export all four sampled components.
  %tex.0 = extractelement <4 x float> %tex, i32 0
  %tex.1 = extractelement <4 x float> %tex, i32 1
  %tex.2 = extractelement <4 x float> %tex, i32 2
  %tex.3 = extractelement <4 x float> %tex, i32 3
  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tex.0, float %tex.1, float %tex.2, float %tex.3, i1 true, i1 true)
  ret void
}

; Check that WQM is re-enabled when required.
define amdgpu_ps <4 x float> @test4(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %c, i32 %d, float %data) {
; GFX9-W64-LABEL: test4:
; GFX9-W64: ; %bb.0: ; %main_body
; GFX9-W64-NEXT: s_mov_b64 s[12:13], exec
; GFX9-W64-NEXT: s_wqm_b64 exec, exec
; GFX9-W64-NEXT: v_mul_lo_u32 v4, v0, v1
; GFX9-W64-NEXT: image_sample v0, v4, s[0:7], s[8:11] dmask:0x1
; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13]
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 idxen
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: ; return to shader part epilog
;
; GFX10-W32-LABEL: test4:
; GFX10-W32: ; %bb.0: ; %main_body
; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT: v_mul_lo_u32 v4, v0, v1
; GFX10-W32-NEXT: image_sample v0, v4, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 idxen
; GFX10-W32-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-W32-NEXT: ; return to shader part epilog
main_body:
  ; %c.1 is used both as the store index and (bitcast) as a sample coordinate.
  %c.1 = mul i32 %c, %d

  call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> undef, <4 x i32> undef, i32 %c.1, i32 0, i32 0, i32 0)
  %c.1.bc = bitcast i32 %c.1 to float
  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.1.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
  ; The first sample's result is the coordinate of the second sample.
  %tex0 = extractelement <4 x float> %tex, i32 0
  %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
  ret <4 x float> %dtex
}

; Check that WQM is triggered by the wqm intrinsic.
; WQM was inserting an unnecessary v_mov to self after the v_add. Make sure this
; does not happen - the v_add should write the return reg directly.
define amdgpu_ps float @test5(i32 inreg %idx0, i32 inreg %idx1) {
; GFX9-W64-LABEL: test5:
; GFX9-W64: ; %bb.0: ; %main_body
; GFX9-W64-NEXT: s_mov_b64 s[2:3], exec
; GFX9-W64-NEXT: s_wqm_b64 exec, exec
; GFX9-W64-NEXT: v_mov_b32_e32 v0, s0
; GFX9-W64-NEXT: v_mov_b32_e32 v1, s1
; GFX9-W64-NEXT: buffer_load_dword v0, v0, s[0:3], 0 idxen
; GFX9-W64-NEXT: s_nop 0
; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: v_add_f32_e32 v0, v0, v1
; GFX9-W64-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec killed $exec
; GFX9-W64-NEXT: s_and_b64 exec, exec, s[2:3]
; GFX9-W64-NEXT: ; return to shader part epilog
;
; GFX10-W32-LABEL: test5:
; GFX10-W32: ; %bb.0: ; %main_body
; GFX10-W32-NEXT: s_mov_b32 s2, exec_lo
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT: v_mov_b32_e32 v0, s0
; GFX10-W32-NEXT: v_mov_b32_e32 v1, s1
; GFX10-W32-NEXT: s_clause 0x1
; GFX10-W32-NEXT: buffer_load_dword v0, v0, s[0:3], 0 idxen
; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: v_add_f32_e32 v0, v0, v1
; GFX10-W32-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec killed $exec
; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s2
; GFX10-W32-NEXT: ; return to shader part epilog
main_body:
  %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
  %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
  %out = fadd float %src0, %src1
  %out.0 = call float @llvm.amdgcn.wqm.f32(float %out)
  ret float %out.0
}

; Check that the wqm intrinsic works correctly for integers.
define amdgpu_ps float @test6(i32 inreg %idx0, i32 inreg %idx1) {
; GFX9-W64-LABEL: test6:
; GFX9-W64: ; %bb.0: ; %main_body
; GFX9-W64-NEXT: s_mov_b64 s[2:3], exec
; GFX9-W64-NEXT: s_wqm_b64 exec, exec
; GFX9-W64-NEXT: v_mov_b32_e32 v0, s0
; GFX9-W64-NEXT: v_mov_b32_e32 v1, s1
; GFX9-W64-NEXT: buffer_load_dword v0, v0, s[0:3], 0 idxen
; GFX9-W64-NEXT: s_nop 0
; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: v_add_f32_e32 v0, v0, v1
; GFX9-W64-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec killed $exec
; GFX9-W64-NEXT: s_and_b64 exec, exec, s[2:3]
; GFX9-W64-NEXT: ; return to shader part epilog
;
; GFX10-W32-LABEL: test6:
; GFX10-W32: ; %bb.0: ; %main_body
; GFX10-W32-NEXT: s_mov_b32 s2, exec_lo
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT: v_mov_b32_e32 v0, s0
; GFX10-W32-NEXT: v_mov_b32_e32 v1, s1
; GFX10-W32-NEXT: s_clause 0x1
; GFX10-W32-NEXT: buffer_load_dword v0, v0, s[0:3], 0 idxen
; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: v_add_f32_e32 v0, v0, v1
; GFX10-W32-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec killed $exec
; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s2
; GFX10-W32-NEXT: ; return to shader part epilog
main_body:
  %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
  %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
  %out = fadd float %src0, %src1
  ; Pass the sum through the i32 overload of the intrinsic and cast it back.
  %out.0 = bitcast float %out to i32
  %out.1 = call i32 @llvm.amdgcn.wqm.i32(i32 %out.0)
  %out.2 = bitcast i32 %out.1 to float
  ret float %out.2
}

; NOTE: llvm.amdgcn.wwm is deprecated, use llvm.amdgcn.strict.wwm instead.

; Check that WWM is triggered by the wwm intrinsic.
define amdgpu_ps float @test_wwm1(i32 inreg %idx0, i32 inreg %idx1) {
; GFX9-W64-LABEL: test_wwm1:
; GFX9-W64: ; %bb.0: ; %main_body
; GFX9-W64-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0
; GFX9-W64-NEXT: v_mov_b32_e32 v2, s1
; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
; GFX9-W64-NEXT: s_nop 0
; GFX9-W64-NEXT: buffer_load_dword v2, v2, s[0:3], 0 idxen
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: v_add_f32_e32 v1, v1, v2
; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3]
; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1
; GFX9-W64-NEXT: ; return to shader part epilog
;
; GFX10-W32-LABEL: test_wwm1:
; GFX10-W32: ; %bb.0: ; %main_body
; GFX10-W32-NEXT: s_or_saveexec_b32 s2, -1
; GFX10-W32-NEXT: v_mov_b32_e32 v1, s0
; GFX10-W32-NEXT: v_mov_b32_e32 v2, s1
; GFX10-W32-NEXT: s_clause 0x1
; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
; GFX10-W32-NEXT: buffer_load_dword v2, v2, s[0:3], 0 idxen
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: v_add_f32_e32 v1, v1, v2
; GFX10-W32-NEXT: s_mov_b32 exec_lo, s2
; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1
; GFX10-W32-NEXT: ; return to shader part epilog
main_body:
  %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
  %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
  %out = fadd float %src0, %src1
  %out.0 = call float @llvm.amdgcn.wwm.f32(float %out)
  ret float %out.0
}

; Same as above, but with an integer type.
define amdgpu_ps float @test_wwm2(i32 inreg %idx0, i32 inreg %idx1) {
; GFX9-W64-LABEL: test_wwm2:
; GFX9-W64: ; %bb.0: ; %main_body
; GFX9-W64-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0
; GFX9-W64-NEXT: v_mov_b32_e32 v2, s1
; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
; GFX9-W64-NEXT: s_nop 0
; GFX9-W64-NEXT: buffer_load_dword v2, v2, s[0:3], 0 idxen
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: v_add_u32_e32 v1, v1, v2
; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3]
; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1
; GFX9-W64-NEXT: ; return to shader part epilog
;
; GFX10-W32-LABEL: test_wwm2:
; GFX10-W32: ; %bb.0: ; %main_body
; GFX10-W32-NEXT: s_or_saveexec_b32 s2, -1
; GFX10-W32-NEXT: v_mov_b32_e32 v1, s0
; GFX10-W32-NEXT: v_mov_b32_e32 v2, s1
; GFX10-W32-NEXT: s_clause 0x1
; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
; GFX10-W32-NEXT: buffer_load_dword v2, v2, s[0:3], 0 idxen
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: v_add_nc_u32_e32 v1, v1, v2
; GFX10-W32-NEXT: s_mov_b32 exec_lo, s2
; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1
; GFX10-W32-NEXT: ; return to shader part epilog
main_body:
  %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
  %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
  ; Reinterpret the loaded floats as i32 so the add is an integer add.
  %src0.0 = bitcast float %src0 to i32
  %src1.0 = bitcast float %src1 to i32
  %out = add i32 %src0.0, %src1.0
  %out.0 = call i32 @llvm.amdgcn.wwm.i32(i32 %out)
  %out.1 = bitcast i32 %out.0 to float
  ret float %out.1
}

; Check that we don't leave WWM on for computations that don't require WWM,
; since that will lead to clobbering things that aren't supposed to be clobbered
; in cases like this.
; We enforce this by checking that v_add gets emitted in the same block as
; WWM computations.
define amdgpu_ps float @test_wwm3(i32 inreg %idx) {
; GFX9-W64-LABEL: test_wwm3:
; GFX9-W64: ; %bb.0: ; %main_body
; GFX9-W64-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
; GFX9-W64-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0
; GFX9-W64-NEXT: v_cmp_gt_u32_e32 vcc, 16, v0
; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0
; GFX9-W64-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-W64-NEXT: s_cbranch_execz .LBB9_2
; GFX9-W64-NEXT: ; %bb.1: ; %if
; GFX9-W64-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0
; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: v_add_f32_e32 v2, v1, v1
; GFX9-W64-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-W64-NEXT: v_mov_b32_e32 v0, v2
; GFX9-W64-NEXT: v_add_f32_e32 v0, v1, v0
; GFX9-W64-NEXT: .LBB9_2: ; %endif
; GFX9-W64-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-W64-NEXT: ; return to shader part epilog
;
; GFX10-W32-LABEL: test_wwm3:
; GFX10-W32: ; %bb.0: ; %main_body
; GFX10-W32-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
; GFX10-W32-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0
; GFX10-W32-NEXT: v_cmp_gt_u32_e32 vcc_lo, 16, v0
; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0
; GFX10-W32-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX10-W32-NEXT: s_cbranch_execz .LBB9_2
; GFX10-W32-NEXT: ; %bb.1: ; %if
; GFX10-W32-NEXT: s_or_saveexec_b32 s2, -1
; GFX10-W32-NEXT: v_mov_b32_e32 v1, s0
; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: v_add_f32_e32 v2, v1, v1
; GFX10-W32-NEXT: s_mov_b32 exec_lo, s2
; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2
; GFX10-W32-NEXT: v_add_f32_e32 v0, v1, v0
; GFX10-W32-NEXT: .LBB9_2: ; %endif
; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX10-W32-NEXT: ; return to shader part epilog
main_body:
  ; use mbcnt to make sure the branch is divergent
  %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
  %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
  %cc = icmp uge i32 %hi, 16
  br i1 %cc, label %endif, label %if

if:
  %src = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx, i32 0, i32 0, i32 0)
  %out = fadd float %src, %src
  %out.0 = call float @llvm.amdgcn.wwm.f32(float %out)
  ; This add consumes the wwm result but is not itself passed to wwm.
  %out.1 = fadd float %src, %out.0
  br label %endif

endif:
  %out.2 = phi float [ %out.1, %if ], [ 0.0, %main_body ]
  ret float %out.2
}

; Check that WWM writes aren't coalesced with non-WWM writes, since the WWM
; write could clobber disabled channels in the non-WWM one.
; We enforce this by checking that v_mov gets emitted in the same block as
; WWM computations.
define amdgpu_ps float @test_wwm4(i32 inreg %idx) {
; GFX9-W64-LABEL: test_wwm4:
; GFX9-W64: ; %bb.0: ; %main_body
; GFX9-W64-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
; GFX9-W64-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0
; GFX9-W64-NEXT: v_cmp_gt_u32_e32 vcc, 16, v0
; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0
; GFX9-W64-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-W64-NEXT: s_cbranch_execz .LBB10_2
; GFX9-W64-NEXT: ; %bb.1: ; %if
; GFX9-W64-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0
; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: v_add_f32_e32 v1, v1, v1
; GFX9-W64-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1
; GFX9-W64-NEXT: .LBB10_2: ; %endif
; GFX9-W64-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-W64-NEXT: ; return to shader part epilog
;
; GFX10-W32-LABEL: test_wwm4:
; GFX10-W32: ; %bb.0: ; %main_body
; GFX10-W32-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
; GFX10-W32-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0
; GFX10-W32-NEXT: v_cmp_gt_u32_e32 vcc_lo, 16, v0
; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0
; GFX10-W32-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX10-W32-NEXT: s_cbranch_execz .LBB10_2
; GFX10-W32-NEXT: ; %bb.1: ; %if
; GFX10-W32-NEXT: s_or_saveexec_b32 s2, -1
; GFX10-W32-NEXT: v_mov_b32_e32 v1, s0
; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: v_add_f32_e32 v1, v1, v1
; GFX10-W32-NEXT: s_mov_b32 exec_lo, s2
; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1
; GFX10-W32-NEXT: .LBB10_2: ; %endif
; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX10-W32-NEXT: ; return to shader part epilog
main_body:
  ; use mbcnt to make sure the branch is divergent
  %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
  %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
  %cc = icmp uge i32 %hi, 16
  br i1 %cc, label %endif, label %if

if:
  %src = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx, i32 0, i32 0, i32 0)
  %out = fadd float %src, %src
  %out.0 = call float @llvm.amdgcn.wwm.f32(float %out)
  br label %endif

endif:
  ; The wwm result from %if merges with the constant 0.0 from %main_body.
  %out.1 = phi float [ %out.0, %if ], [ 0.0, %main_body ]
  ret float %out.1
}

; Make sure the transition from Exact to WWM then WQM works properly.
define amdgpu_ps float @test_wwm5(i32 inreg %idx0, i32 inreg %idx1) {
; GFX9-W64-LABEL: test_wwm5:
; GFX9-W64: ; %bb.0: ; %main_body
; GFX9-W64-NEXT: s_mov_b64 s[2:3], exec
; GFX9-W64-NEXT: v_mov_b32_e32 v0, s0
; GFX9-W64-NEXT: buffer_load_dword v2, v0, s[0:3], 0 idxen
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: buffer_store_dword v2, v0, s[0:3], 0 idxen
; GFX9-W64-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX9-W64-NEXT: v_mov_b32_e32 v1, s1
; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: v_add_f32_e32 v1, v1, v1
; GFX9-W64-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-W64-NEXT: s_wqm_b64 exec, exec
; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1
; GFX9-W64-NEXT: v_add_f32_e32 v0, v0, v0
; GFX9-W64-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec killed $exec
; GFX9-W64-NEXT: s_and_b64 exec, exec, s[2:3]
; GFX9-W64-NEXT: ; return to shader part epilog
;
; GFX10-W32-LABEL: test_wwm5:
; GFX10-W32: ; %bb.0: ; %main_body
; GFX10-W32-NEXT: v_mov_b32_e32 v0, s0
; GFX10-W32-NEXT: s_mov_b32 s2, exec_lo
; GFX10-W32-NEXT: buffer_load_dword v2, v0, s[0:3], 0 idxen
; GFX10-W32-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-W32-NEXT: v_mov_b32_e32 v1, s1
; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: buffer_store_dword v2, v0, s[0:3], 0 idxen
; GFX10-W32-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: v_add_f32_e32 v1, v1, v1
; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1
; GFX10-W32-NEXT: v_add_f32_e32 v0, v0, v0
; GFX10-W32-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec killed $exec
; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s2
; GFX10-W32-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-W32-NEXT: ; return to shader part epilog
main_body:
  ; Plain load/store pair first ...
  %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
  call void @llvm.amdgcn.struct.buffer.store.f32(float %src0, <4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
  ; ... then a value passed to wwm ...
  %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
  %temp = fadd float %src1, %src1
  %temp.0 = call float @llvm.amdgcn.wwm.f32(float %temp)
  ; ... and finally a value passed to wqm.
  %out = fadd float %temp.0, %temp.0
  %out.0 = call float @llvm.amdgcn.wqm.f32(float %out)
  ret float %out.0
}

; Check that WWM is turned on correctly across basic block boundaries.
537; if..then..endif version 538;SI-CHECK: buffer_load_dword 539;VI-CHECK: flat_load_dword 540;SI-CHECK: buffer_load_dword 541;VI-CHECK: flat_load_dword 542define amdgpu_ps float @test_wwm6_then() { 543; GFX9-W64-LABEL: test_wwm6_then: 544; GFX9-W64: ; %bb.0: ; %main_body 545; GFX9-W64-NEXT: s_or_saveexec_b64 s[0:1], -1 546; GFX9-W64-NEXT: global_load_dword v1, v[3:4], off glc 547; GFX9-W64-NEXT: s_waitcnt vmcnt(0) 548; GFX9-W64-NEXT: s_mov_b64 exec, s[0:1] 549; GFX9-W64-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 550; GFX9-W64-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0 551; GFX9-W64-NEXT: v_cmp_gt_u32_e32 vcc, 16, v0 552; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0 553; GFX9-W64-NEXT: s_and_saveexec_b64 s[0:1], vcc 554; GFX9-W64-NEXT: s_cbranch_execz .LBB12_2 555; GFX9-W64-NEXT: ; %bb.1: ; %if 556; GFX9-W64-NEXT: s_or_saveexec_b64 s[2:3], -1 557; GFX9-W64-NEXT: global_load_dword v2, v[3:4], off glc 558; GFX9-W64-NEXT: s_waitcnt vmcnt(0) 559; GFX9-W64-NEXT: v_add_f32_e32 v1, v1, v2 560; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3] 561; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1 562; GFX9-W64-NEXT: .LBB12_2: ; %endif 563; GFX9-W64-NEXT: s_or_b64 exec, exec, s[0:1] 564; GFX9-W64-NEXT: ; return to shader part epilog 565; 566; GFX10-W32-LABEL: test_wwm6_then: 567; GFX10-W32: ; %bb.0: ; %main_body 568; GFX10-W32-NEXT: s_or_saveexec_b32 s0, -1 569; GFX10-W32-NEXT: global_load_dword v1, v[3:4], off glc dlc 570; GFX10-W32-NEXT: s_waitcnt vmcnt(0) 571; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0 572; GFX10-W32-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 573; GFX10-W32-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0 574; GFX10-W32-NEXT: v_cmp_gt_u32_e32 vcc_lo, 16, v0 575; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0 576; GFX10-W32-NEXT: s_and_saveexec_b32 s0, vcc_lo 577; GFX10-W32-NEXT: s_cbranch_execz .LBB12_2 578; GFX10-W32-NEXT: ; %bb.1: ; %if 579; GFX10-W32-NEXT: s_or_saveexec_b32 s1, -1 580; GFX10-W32-NEXT: global_load_dword v2, v[3:4], off glc dlc 581; GFX10-W32-NEXT: s_waitcnt vmcnt(0) 582; GFX10-W32-NEXT: v_add_f32_e32 v1, v1, v2 583; 
GFX10-W32-NEXT: s_mov_b32 exec_lo, s1 584; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1 585; GFX10-W32-NEXT: .LBB12_2: ; %endif 586; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 587; GFX10-W32-NEXT: ; return to shader part epilog 588main_body: 589 %src0 = load volatile float, float addrspace(1)* undef 590 ; use mbcnt to make sure the branch is divergent 591 %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) 592 %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo) 593 %cc = icmp uge i32 %hi, 16 594 br i1 %cc, label %endif, label %if 595 596if: 597 %src1 = load volatile float, float addrspace(1)* undef 598 %out = fadd float %src0, %src1 599 %out.0 = call float @llvm.amdgcn.wwm.f32(float %out) 600 br label %endif 601 602endif: 603 %out.1 = phi float [ %out.0, %if ], [ 0.0, %main_body ] 604 ret float %out.1 605} 606 607; Check that WWM is turned on correctly across basic block boundaries. 608; loop version 609;SI-CHECK: buffer_load_dword 610;VI-CHECK: flat_load_dword 611;SI-CHECK: buffer_load_dword 612;VI-CHECK: flat_load_dword 613define amdgpu_ps float @test_wwm6_loop() { 614; GFX9-W64-LABEL: test_wwm6_loop: 615; GFX9-W64: ; %bb.0: ; %main_body 616; GFX9-W64-NEXT: s_or_saveexec_b64 s[0:1], -1 617; GFX9-W64-NEXT: global_load_dword v1, v[3:4], off glc 618; GFX9-W64-NEXT: s_waitcnt vmcnt(0) 619; GFX9-W64-NEXT: s_mov_b64 exec, s[0:1] 620; GFX9-W64-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 621; GFX9-W64-NEXT: v_mbcnt_hi_u32_b32 v3, -1, v0 622; GFX9-W64-NEXT: s_mov_b64 s[0:1], 0 623; GFX9-W64-NEXT: .LBB13_1: ; %loop 624; GFX9-W64-NEXT: ; =>This Inner Loop Header: Depth=1 625; GFX9-W64-NEXT: s_or_saveexec_b64 s[2:3], -1 626; GFX9-W64-NEXT: global_load_dword v2, v[3:4], off glc 627; GFX9-W64-NEXT: s_waitcnt vmcnt(0) 628; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3] 629; GFX9-W64-NEXT: v_add_u32_e32 v3, -1, v3 630; GFX9-W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 631; GFX9-W64-NEXT: s_or_saveexec_b64 s[2:3], -1 632; GFX9-W64-NEXT: v_add_f32_e32 v2, v1, v2 633; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3] 
634; GFX9-W64-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 635; GFX9-W64-NEXT: v_mov_b32_e32 v0, v2 636; GFX9-W64-NEXT: s_andn2_b64 exec, exec, s[0:1] 637; GFX9-W64-NEXT: s_cbranch_execnz .LBB13_1 638; GFX9-W64-NEXT: ; %bb.2: ; %endloop 639; GFX9-W64-NEXT: s_or_b64 exec, exec, s[0:1] 640; GFX9-W64-NEXT: ; return to shader part epilog 641; 642; GFX10-W32-LABEL: test_wwm6_loop: 643; GFX10-W32: ; %bb.0: ; %main_body 644; GFX10-W32-NEXT: s_or_saveexec_b32 s0, -1 645; GFX10-W32-NEXT: global_load_dword v1, v[3:4], off glc dlc 646; GFX10-W32-NEXT: s_waitcnt vmcnt(0) 647; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0 648; GFX10-W32-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 649; GFX10-W32-NEXT: s_mov_b32 s0, 0 650; GFX10-W32-NEXT: v_mbcnt_hi_u32_b32 v3, -1, v0 651; GFX10-W32-NEXT: .LBB13_1: ; %loop 652; GFX10-W32-NEXT: ; =>This Inner Loop Header: Depth=1 653; GFX10-W32-NEXT: s_or_saveexec_b32 s1, -1 654; GFX10-W32-NEXT: global_load_dword v2, v[3:4], off glc dlc 655; GFX10-W32-NEXT: s_waitcnt vmcnt(0) 656; GFX10-W32-NEXT: s_mov_b32 exec_lo, s1 657; GFX10-W32-NEXT: v_add_nc_u32_e32 v3, -1, v3 658; GFX10-W32-NEXT: s_or_saveexec_b32 s1, -1 659; GFX10-W32-NEXT: v_add_f32_e32 v2, v1, v2 660; GFX10-W32-NEXT: s_mov_b32 exec_lo, s1 661; GFX10-W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 662; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2 663; GFX10-W32-NEXT: s_or_b32 s0, vcc_lo, s0 664; GFX10-W32-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 665; GFX10-W32-NEXT: s_cbranch_execnz .LBB13_1 666; GFX10-W32-NEXT: ; %bb.2: ; %endloop 667; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 668; GFX10-W32-NEXT: ; return to shader part epilog 669main_body: 670 %src0 = load volatile float, float addrspace(1)* undef 671 ; use mbcnt to make sure the branch is divergent 672 %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) 673 %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo) 674 br label %loop 675 676loop: 677 %counter = phi i32 [ %hi, %main_body ], [ %counter.1, %loop ] 678 %src1 = load volatile float, float addrspace(1)* undef 679 %out 
= fadd float %src0, %src1 680 %out.0 = call float @llvm.amdgcn.wwm.f32(float %out) 681 %counter.1 = sub i32 %counter, 1 682 %cc = icmp ne i32 %counter.1, 0 683 br i1 %cc, label %loop, label %endloop 684 685endloop: 686 ret float %out.0 687} 688 689; Check that @llvm.amdgcn.set.inactive disables WWM. 690define amdgpu_ps void @test_wwm_set_inactive1(i32 inreg %idx) { 691; GFX9-W64-LABEL: test_wwm_set_inactive1: 692; GFX9-W64: ; %bb.0: ; %main_body 693; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0 694; GFX9-W64-NEXT: buffer_load_dword v2, v1, s[0:3], 0 idxen 695; GFX9-W64-NEXT: s_waitcnt vmcnt(0) 696; GFX9-W64-NEXT: v_mov_b32_e32 v0, v2 697; GFX9-W64-NEXT: s_not_b64 exec, exec 698; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0 699; GFX9-W64-NEXT: s_not_b64 exec, exec 700; GFX9-W64-NEXT: s_or_saveexec_b64 s[0:1], -1 701; GFX9-W64-NEXT: v_add_u32_e32 v0, v0, v0 702; GFX9-W64-NEXT: s_mov_b64 exec, s[0:1] 703; GFX9-W64-NEXT: v_mov_b32_e32 v2, v0 704; GFX9-W64-NEXT: buffer_store_dword v2, v1, s[0:3], 0 idxen 705; GFX9-W64-NEXT: s_endpgm 706; 707; GFX10-W32-LABEL: test_wwm_set_inactive1: 708; GFX10-W32: ; %bb.0: ; %main_body 709; GFX10-W32-NEXT: v_mov_b32_e32 v1, s0 710; GFX10-W32-NEXT: buffer_load_dword v2, v1, s[0:3], 0 idxen 711; GFX10-W32-NEXT: s_waitcnt vmcnt(0) 712; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2 713; GFX10-W32-NEXT: s_not_b32 exec_lo, exec_lo 714; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0 715; GFX10-W32-NEXT: s_not_b32 exec_lo, exec_lo 716; GFX10-W32-NEXT: s_or_saveexec_b32 s0, -1 717; GFX10-W32-NEXT: v_add_nc_u32_e32 v0, v0, v0 718; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0 719; GFX10-W32-NEXT: v_mov_b32_e32 v2, v0 720; GFX10-W32-NEXT: buffer_store_dword v2, v1, s[0:3], 0 idxen 721; GFX10-W32-NEXT: s_endpgm 722main_body: 723 %src = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx, i32 0, i32 0, i32 0) 724 %src.0 = bitcast float %src to i32 725 %src.1 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %src.0, i32 0) 726 %out = add i32 %src.1, %src.1 727 %out.0 = call i32 
@llvm.amdgcn.wwm.i32(i32 %out) 728 %out.1 = bitcast i32 %out.0 to float 729 call void @llvm.amdgcn.struct.buffer.store.f32(float %out.1, <4 x i32> undef, i32 %idx, i32 0, i32 0, i32 0) 730 ret void 731} 732 733; Check that Strict WQM is triggered by the strict_wqm intrinsic. 734define amdgpu_ps float @test_strict_wqm1(i32 inreg %idx0, i32 inreg %idx1) { 735; GFX9-W64-LABEL: test_strict_wqm1: 736; GFX9-W64: ; %bb.0: ; %main_body 737; GFX9-W64-NEXT: s_mov_b64 s[2:3], exec 738; GFX9-W64-NEXT: s_wqm_b64 exec, exec 739; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0 740; GFX9-W64-NEXT: v_mov_b32_e32 v2, s1 741; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen 742; GFX9-W64-NEXT: s_nop 0 743; GFX9-W64-NEXT: buffer_load_dword v2, v2, s[0:3], 0 idxen 744; GFX9-W64-NEXT: s_waitcnt vmcnt(0) 745; GFX9-W64-NEXT: v_add_f32_e32 v1, v1, v2 746; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3] 747; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1 748; GFX9-W64-NEXT: ; return to shader part epilog 749; 750; GFX10-W32-LABEL: test_strict_wqm1: 751; GFX10-W32: ; %bb.0: ; %main_body 752; GFX10-W32-NEXT: s_mov_b32 s2, exec_lo 753; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo 754; GFX10-W32-NEXT: v_mov_b32_e32 v1, s0 755; GFX10-W32-NEXT: v_mov_b32_e32 v2, s1 756; GFX10-W32-NEXT: s_clause 0x1 757; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen 758; GFX10-W32-NEXT: buffer_load_dword v2, v2, s[0:3], 0 idxen 759; GFX10-W32-NEXT: s_waitcnt vmcnt(0) 760; GFX10-W32-NEXT: v_add_f32_e32 v1, v1, v2 761; GFX10-W32-NEXT: s_mov_b32 exec_lo, s2 762; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1 763; GFX10-W32-NEXT: ; return to shader part epilog 764main_body: 765 %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0) 766 %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0) 767 %out = fadd float %src0, %src1 768 %out.0 = call float @llvm.amdgcn.strict.wqm.f32(float %out) 769 ret float %out.0 770} 771 772; Same as above, but with 
an integer type. 773define amdgpu_ps float @test_strict_wqm2(i32 inreg %idx0, i32 inreg %idx1) { 774; GFX9-W64-LABEL: test_strict_wqm2: 775; GFX9-W64: ; %bb.0: ; %main_body 776; GFX9-W64-NEXT: s_mov_b64 s[2:3], exec 777; GFX9-W64-NEXT: s_wqm_b64 exec, exec 778; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0 779; GFX9-W64-NEXT: v_mov_b32_e32 v2, s1 780; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen 781; GFX9-W64-NEXT: s_nop 0 782; GFX9-W64-NEXT: buffer_load_dword v2, v2, s[0:3], 0 idxen 783; GFX9-W64-NEXT: s_waitcnt vmcnt(0) 784; GFX9-W64-NEXT: v_add_u32_e32 v1, v1, v2 785; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3] 786; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1 787; GFX9-W64-NEXT: ; return to shader part epilog 788; 789; GFX10-W32-LABEL: test_strict_wqm2: 790; GFX10-W32: ; %bb.0: ; %main_body 791; GFX10-W32-NEXT: s_mov_b32 s2, exec_lo 792; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo 793; GFX10-W32-NEXT: v_mov_b32_e32 v1, s0 794; GFX10-W32-NEXT: v_mov_b32_e32 v2, s1 795; GFX10-W32-NEXT: s_clause 0x1 796; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen 797; GFX10-W32-NEXT: buffer_load_dword v2, v2, s[0:3], 0 idxen 798; GFX10-W32-NEXT: s_waitcnt vmcnt(0) 799; GFX10-W32-NEXT: v_add_nc_u32_e32 v1, v1, v2 800; GFX10-W32-NEXT: s_mov_b32 exec_lo, s2 801; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1 802; GFX10-W32-NEXT: ; return to shader part epilog 803main_body: 804 %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0) 805 %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0) 806 %src0.0 = bitcast float %src0 to i32 807 %src1.0 = bitcast float %src1 to i32 808 %out = add i32 %src0.0, %src1.0 809 %out.0 = call i32 @llvm.amdgcn.strict.wqm.i32(i32 %out) 810 %out.1 = bitcast i32 %out.0 to float 811 ret float %out.1 812} 813 814; Check that we don't leave Strict WQM on for computations that don't require it, 815; since that will lead to clobbering things that aren't supposed to be clobbered 
816; in cases like this. 817; We enforce this by checking that v_add gets emitted in the same block as 818; Strict WQM computations. 819define amdgpu_ps float @test_strict_wqm3(i32 inreg %idx) { 820; GFX9-W64-LABEL: test_strict_wqm3: 821; GFX9-W64: ; %bb.0: ; %main_body 822; GFX9-W64-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 823; GFX9-W64-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0 824; GFX9-W64-NEXT: v_cmp_gt_u32_e32 vcc, 16, v0 825; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0 826; GFX9-W64-NEXT: s_and_saveexec_b64 s[2:3], vcc 827; GFX9-W64-NEXT: s_cbranch_execz .LBB17_2 828; GFX9-W64-NEXT: ; %bb.1: ; %if 829; GFX9-W64-NEXT: s_mov_b64 s[4:5], exec 830; GFX9-W64-NEXT: s_wqm_b64 exec, exec 831; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0 832; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen 833; GFX9-W64-NEXT: s_waitcnt vmcnt(0) 834; GFX9-W64-NEXT: v_add_f32_e32 v2, v1, v1 835; GFX9-W64-NEXT: s_mov_b64 exec, s[4:5] 836; GFX9-W64-NEXT: v_mov_b32_e32 v0, v2 837; GFX9-W64-NEXT: v_add_f32_e32 v0, v1, v0 838; GFX9-W64-NEXT: .LBB17_2: ; %endif 839; GFX9-W64-NEXT: s_or_b64 exec, exec, s[2:3] 840; GFX9-W64-NEXT: ; return to shader part epilog 841; 842; GFX10-W32-LABEL: test_strict_wqm3: 843; GFX10-W32: ; %bb.0: ; %main_body 844; GFX10-W32-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 845; GFX10-W32-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0 846; GFX10-W32-NEXT: v_cmp_gt_u32_e32 vcc_lo, 16, v0 847; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0 848; GFX10-W32-NEXT: s_and_saveexec_b32 s1, vcc_lo 849; GFX10-W32-NEXT: s_cbranch_execz .LBB17_2 850; GFX10-W32-NEXT: ; %bb.1: ; %if 851; GFX10-W32-NEXT: s_mov_b32 s2, exec_lo 852; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo 853; GFX10-W32-NEXT: v_mov_b32_e32 v1, s0 854; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen 855; GFX10-W32-NEXT: s_waitcnt vmcnt(0) 856; GFX10-W32-NEXT: v_add_f32_e32 v2, v1, v1 857; GFX10-W32-NEXT: s_mov_b32 exec_lo, s2 858; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2 859; GFX10-W32-NEXT: v_add_f32_e32 v0, v1, v0 860; GFX10-W32-NEXT: .LBB17_2: ; %endif 861; 
GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 862; GFX10-W32-NEXT: ; return to shader part epilog 863main_body: 864 ; use mbcnt to make sure the branch is divergent 865 %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) 866 %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo) 867 %cc = icmp uge i32 %hi, 16 868 br i1 %cc, label %endif, label %if 869 870if: 871 %src = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx, i32 0, i32 0, i32 0) 872 %out = fadd float %src, %src 873 %out.0 = call float @llvm.amdgcn.strict.wqm.f32(float %out) 874 %out.1 = fadd float %src, %out.0 875 br label %endif 876 877endif: 878 %out.2 = phi float [ %out.1, %if ], [ 0.0, %main_body ] 879 ret float %out.2 880} 881 882; Check that Strict WQM writes aren't coalesced with non-strict writes, since 883; the Strict WQM write could clobber disabled channels in the non-strict one. 884; We enforce this by checking that v_mov gets emitted in the same block as 885; Strict WQM computations. 886define amdgpu_ps float @test_strict_wqm4(i32 inreg %idx) { 887; GFX9-W64-LABEL: test_strict_wqm4: 888; GFX9-W64: ; %bb.0: ; %main_body 889; GFX9-W64-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 890; GFX9-W64-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0 891; GFX9-W64-NEXT: v_cmp_gt_u32_e32 vcc, 16, v0 892; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0 893; GFX9-W64-NEXT: s_and_saveexec_b64 s[2:3], vcc 894; GFX9-W64-NEXT: s_cbranch_execz .LBB18_2 895; GFX9-W64-NEXT: ; %bb.1: ; %if 896; GFX9-W64-NEXT: s_mov_b64 s[4:5], exec 897; GFX9-W64-NEXT: s_wqm_b64 exec, exec 898; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0 899; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen 900; GFX9-W64-NEXT: s_waitcnt vmcnt(0) 901; GFX9-W64-NEXT: v_add_f32_e32 v1, v1, v1 902; GFX9-W64-NEXT: s_mov_b64 exec, s[4:5] 903; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1 904; GFX9-W64-NEXT: .LBB18_2: ; %endif 905; GFX9-W64-NEXT: s_or_b64 exec, exec, s[2:3] 906; GFX9-W64-NEXT: ; return to shader part epilog 907; 908; GFX10-W32-LABEL: test_strict_wqm4: 909; GFX10-W32: ; 
%bb.0: ; %main_body 910; GFX10-W32-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 911; GFX10-W32-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0 912; GFX10-W32-NEXT: v_cmp_gt_u32_e32 vcc_lo, 16, v0 913; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0 914; GFX10-W32-NEXT: s_and_saveexec_b32 s1, vcc_lo 915; GFX10-W32-NEXT: s_cbranch_execz .LBB18_2 916; GFX10-W32-NEXT: ; %bb.1: ; %if 917; GFX10-W32-NEXT: s_mov_b32 s2, exec_lo 918; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo 919; GFX10-W32-NEXT: v_mov_b32_e32 v1, s0 920; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen 921; GFX10-W32-NEXT: s_waitcnt vmcnt(0) 922; GFX10-W32-NEXT: v_add_f32_e32 v1, v1, v1 923; GFX10-W32-NEXT: s_mov_b32 exec_lo, s2 924; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1 925; GFX10-W32-NEXT: .LBB18_2: ; %endif 926; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 927; GFX10-W32-NEXT: ; return to shader part epilog 928main_body: 929 ; use mbcnt to make sure the branch is divergent 930 %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) 931 %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo) 932 %cc = icmp uge i32 %hi, 16 933 br i1 %cc, label %endif, label %if 934 935if: 936 %src = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx, i32 0, i32 0, i32 0) 937 %out = fadd float %src, %src 938 %out.0 = call float @llvm.amdgcn.strict.wqm.f32(float %out) 939 br label %endif 940 941endif: 942 %out.1 = phi float [ %out.0, %if ], [ 0.0, %main_body ] 943 ret float %out.1 944} 945 946; Make sure the transition from Exact to Strict WQM then WQM works properly. 
947define amdgpu_ps float @test_strict_wqm5(i32 inreg %idx0, i32 inreg %idx1) { 948; GFX9-W64-LABEL: test_strict_wqm5: 949; GFX9-W64: ; %bb.0: ; %main_body 950; GFX9-W64-NEXT: s_mov_b64 s[2:3], exec 951; GFX9-W64-NEXT: v_mov_b32_e32 v0, s0 952; GFX9-W64-NEXT: buffer_load_dword v2, v0, s[0:3], 0 idxen 953; GFX9-W64-NEXT: s_mov_b64 s[4:5], exec 954; GFX9-W64-NEXT: s_waitcnt vmcnt(0) 955; GFX9-W64-NEXT: buffer_store_dword v2, v0, s[0:3], 0 idxen 956; GFX9-W64-NEXT: s_wqm_b64 exec, exec 957; GFX9-W64-NEXT: v_mov_b32_e32 v1, s1 958; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen 959; GFX9-W64-NEXT: s_waitcnt vmcnt(0) 960; GFX9-W64-NEXT: v_add_f32_e32 v1, v1, v1 961; GFX9-W64-NEXT: s_mov_b64 exec, s[4:5] 962; GFX9-W64-NEXT: s_wqm_b64 exec, exec 963; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1 964; GFX9-W64-NEXT: v_add_f32_e32 v0, v0, v0 965; GFX9-W64-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec killed $exec 966; GFX9-W64-NEXT: s_and_b64 exec, exec, s[2:3] 967; GFX9-W64-NEXT: ; return to shader part epilog 968; 969; GFX10-W32-LABEL: test_strict_wqm5: 970; GFX10-W32: ; %bb.0: ; %main_body 971; GFX10-W32-NEXT: v_mov_b32_e32 v0, s0 972; GFX10-W32-NEXT: s_mov_b32 s2, exec_lo 973; GFX10-W32-NEXT: buffer_load_dword v2, v0, s[0:3], 0 idxen 974; GFX10-W32-NEXT: s_mov_b32 s0, exec_lo 975; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo 976; GFX10-W32-NEXT: v_mov_b32_e32 v1, s1 977; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0 978; GFX10-W32-NEXT: s_waitcnt vmcnt(0) 979; GFX10-W32-NEXT: buffer_store_dword v2, v0, s[0:3], 0 idxen 980; GFX10-W32-NEXT: s_mov_b32 s0, exec_lo 981; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo 982; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen 983; GFX10-W32-NEXT: s_waitcnt vmcnt(0) 984; GFX10-W32-NEXT: v_add_f32_e32 v1, v1, v1 985; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0 986; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo 987; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1 988; GFX10-W32-NEXT: v_add_f32_e32 v0, v0, v0 989; GFX10-W32-NEXT: ; kill: def $vgpr0 
killed $vgpr0 killed $exec killed $exec 990; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s2 991; GFX10-W32-NEXT: s_waitcnt_vscnt null, 0x0 992; GFX10-W32-NEXT: ; return to shader part epilog 993main_body: 994 %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0) 995 call void @llvm.amdgcn.struct.buffer.store.f32(float %src0, <4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0) 996 %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0) 997 %temp = fadd float %src1, %src1 998 %temp.0 = call float @llvm.amdgcn.strict.wqm.f32(float %temp) 999 %out = fadd float %temp.0, %temp.0 1000 %out.0 = call float @llvm.amdgcn.wqm.f32(float %out) 1001 ret float %out.0 1002} 1003 1004; Check that Strict WQM is turned on correctly across basic block boundaries. 1005; if..then..endif version 1006;SI-CHECK: buffer_load_dword 1007;VI-CHECK: flat_load_dword 1008;SI-CHECK: buffer_load_dword 1009;VI-CHECK: flat_load_dword 1010define amdgpu_ps float @test_strict_wqm6_then() { 1011; GFX9-W64-LABEL: test_strict_wqm6_then: 1012; GFX9-W64: ; %bb.0: ; %main_body 1013; GFX9-W64-NEXT: s_mov_b64 s[0:1], exec 1014; GFX9-W64-NEXT: s_wqm_b64 exec, exec 1015; GFX9-W64-NEXT: global_load_dword v1, v[3:4], off glc 1016; GFX9-W64-NEXT: s_waitcnt vmcnt(0) 1017; GFX9-W64-NEXT: s_mov_b64 exec, s[0:1] 1018; GFX9-W64-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 1019; GFX9-W64-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0 1020; GFX9-W64-NEXT: v_cmp_gt_u32_e32 vcc, 16, v0 1021; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0 1022; GFX9-W64-NEXT: s_and_saveexec_b64 s[0:1], vcc 1023; GFX9-W64-NEXT: s_cbranch_execz .LBB20_2 1024; GFX9-W64-NEXT: ; %bb.1: ; %if 1025; GFX9-W64-NEXT: s_mov_b64 s[2:3], exec 1026; GFX9-W64-NEXT: s_wqm_b64 exec, exec 1027; GFX9-W64-NEXT: global_load_dword v2, v[3:4], off glc 1028; GFX9-W64-NEXT: s_waitcnt vmcnt(0) 1029; GFX9-W64-NEXT: v_add_f32_e32 v1, v1, v2 1030; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3] 1031; GFX9-W64-NEXT: 
v_mov_b32_e32 v0, v1 1032; GFX9-W64-NEXT: .LBB20_2: ; %endif 1033; GFX9-W64-NEXT: s_or_b64 exec, exec, s[0:1] 1034; GFX9-W64-NEXT: ; return to shader part epilog 1035; 1036; GFX10-W32-LABEL: test_strict_wqm6_then: 1037; GFX10-W32: ; %bb.0: ; %main_body 1038; GFX10-W32-NEXT: s_mov_b32 s0, exec_lo 1039; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo 1040; GFX10-W32-NEXT: global_load_dword v1, v[3:4], off glc dlc 1041; GFX10-W32-NEXT: s_waitcnt vmcnt(0) 1042; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0 1043; GFX10-W32-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 1044; GFX10-W32-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0 1045; GFX10-W32-NEXT: v_cmp_gt_u32_e32 vcc_lo, 16, v0 1046; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0 1047; GFX10-W32-NEXT: s_and_saveexec_b32 s0, vcc_lo 1048; GFX10-W32-NEXT: s_cbranch_execz .LBB20_2 1049; GFX10-W32-NEXT: ; %bb.1: ; %if 1050; GFX10-W32-NEXT: s_mov_b32 s1, exec_lo 1051; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo 1052; GFX10-W32-NEXT: global_load_dword v2, v[3:4], off glc dlc 1053; GFX10-W32-NEXT: s_waitcnt vmcnt(0) 1054; GFX10-W32-NEXT: v_add_f32_e32 v1, v1, v2 1055; GFX10-W32-NEXT: s_mov_b32 exec_lo, s1 1056; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1 1057; GFX10-W32-NEXT: .LBB20_2: ; %endif 1058; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 1059; GFX10-W32-NEXT: ; return to shader part epilog 1060main_body: 1061 %src0 = load volatile float, float addrspace(1)* undef 1062 ; use mbcnt to make sure the branch is divergent 1063 %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) 1064 %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo) 1065 %cc = icmp uge i32 %hi, 16 1066 br i1 %cc, label %endif, label %if 1067 1068if: 1069 %src1 = load volatile float, float addrspace(1)* undef 1070 %out = fadd float %src0, %src1 1071 %out.0 = call float @llvm.amdgcn.strict.wqm.f32(float %out) 1072 br label %endif 1073 1074endif: 1075 %out.1 = phi float [ %out.0, %if ], [ 0.0, %main_body ] 1076 ret float %out.1 1077} 1078 1079; Check that Strict WQM is turned on correctly across basic block 
boundaries. 1080; loop version 1081;SI-CHECK: buffer_load_dword 1082;VI-CHECK: flat_load_dword 1083;SI-CHECK: buffer_load_dword 1084;VI-CHECK: flat_load_dword 1085define amdgpu_ps float @test_strict_wqm6_loop() { 1086; GFX9-W64-LABEL: test_strict_wqm6_loop: 1087; GFX9-W64: ; %bb.0: ; %main_body 1088; GFX9-W64-NEXT: s_mov_b64 s[0:1], exec 1089; GFX9-W64-NEXT: s_wqm_b64 exec, exec 1090; GFX9-W64-NEXT: global_load_dword v1, v[3:4], off glc 1091; GFX9-W64-NEXT: s_waitcnt vmcnt(0) 1092; GFX9-W64-NEXT: s_mov_b64 exec, s[0:1] 1093; GFX9-W64-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 1094; GFX9-W64-NEXT: v_mbcnt_hi_u32_b32 v3, -1, v0 1095; GFX9-W64-NEXT: s_mov_b64 s[0:1], 0 1096; GFX9-W64-NEXT: .LBB21_1: ; %loop 1097; GFX9-W64-NEXT: ; =>This Inner Loop Header: Depth=1 1098; GFX9-W64-NEXT: s_mov_b64 s[2:3], exec 1099; GFX9-W64-NEXT: s_wqm_b64 exec, exec 1100; GFX9-W64-NEXT: global_load_dword v2, v[3:4], off glc 1101; GFX9-W64-NEXT: s_waitcnt vmcnt(0) 1102; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3] 1103; GFX9-W64-NEXT: v_add_u32_e32 v3, -1, v3 1104; GFX9-W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 1105; GFX9-W64-NEXT: s_mov_b64 s[2:3], exec 1106; GFX9-W64-NEXT: s_wqm_b64 exec, exec 1107; GFX9-W64-NEXT: v_add_f32_e32 v2, v1, v2 1108; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3] 1109; GFX9-W64-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 1110; GFX9-W64-NEXT: v_mov_b32_e32 v0, v2 1111; GFX9-W64-NEXT: s_andn2_b64 exec, exec, s[0:1] 1112; GFX9-W64-NEXT: s_cbranch_execnz .LBB21_1 1113; GFX9-W64-NEXT: ; %bb.2: ; %endloop 1114; GFX9-W64-NEXT: s_or_b64 exec, exec, s[0:1] 1115; GFX9-W64-NEXT: ; return to shader part epilog 1116; 1117; GFX10-W32-LABEL: test_strict_wqm6_loop: 1118; GFX10-W32: ; %bb.0: ; %main_body 1119; GFX10-W32-NEXT: s_mov_b32 s0, exec_lo 1120; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo 1121; GFX10-W32-NEXT: global_load_dword v1, v[3:4], off glc dlc 1122; GFX10-W32-NEXT: s_waitcnt vmcnt(0) 1123; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0 1124; GFX10-W32-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 1125; 
GFX10-W32-NEXT: s_mov_b32 s0, 0 1126; GFX10-W32-NEXT: v_mbcnt_hi_u32_b32 v3, -1, v0 1127; GFX10-W32-NEXT: .LBB21_1: ; %loop 1128; GFX10-W32-NEXT: ; =>This Inner Loop Header: Depth=1 1129; GFX10-W32-NEXT: s_mov_b32 s1, exec_lo 1130; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo 1131; GFX10-W32-NEXT: global_load_dword v2, v[3:4], off glc dlc 1132; GFX10-W32-NEXT: s_waitcnt vmcnt(0) 1133; GFX10-W32-NEXT: s_mov_b32 exec_lo, s1 1134; GFX10-W32-NEXT: v_add_nc_u32_e32 v3, -1, v3 1135; GFX10-W32-NEXT: s_mov_b32 s1, exec_lo 1136; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo 1137; GFX10-W32-NEXT: v_add_f32_e32 v2, v1, v2 1138; GFX10-W32-NEXT: s_mov_b32 exec_lo, s1 1139; GFX10-W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 1140; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2 1141; GFX10-W32-NEXT: s_or_b32 s0, vcc_lo, s0 1142; GFX10-W32-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 1143; GFX10-W32-NEXT: s_cbranch_execnz .LBB21_1 1144; GFX10-W32-NEXT: ; %bb.2: ; %endloop 1145; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 1146; GFX10-W32-NEXT: ; return to shader part epilog 1147main_body: 1148 %src0 = load volatile float, float addrspace(1)* undef 1149 ; use mbcnt to make sure the branch is divergent 1150 %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) 1151 %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo) 1152 br label %loop 1153 1154loop: 1155 %counter = phi i32 [ %hi, %main_body ], [ %counter.1, %loop ] 1156 %src1 = load volatile float, float addrspace(1)* undef 1157 %out = fadd float %src0, %src1 1158 %out.0 = call float @llvm.amdgcn.strict.wqm.f32(float %out) 1159 %counter.1 = sub i32 %counter, 1 1160 %cc = icmp ne i32 %counter.1, 0 1161 br i1 %cc, label %loop, label %endloop 1162 1163endloop: 1164 ret float %out.0 1165} 1166 1167; Check that enabling WQM anywhere enables WQM for the set.inactive source. 
1168define amdgpu_ps void @test_set_inactive2(i32 inreg %idx0, i32 inreg %idx1) { 1169; GFX9-W64-LABEL: test_set_inactive2: 1170; GFX9-W64: ; %bb.0: ; %main_body 1171; GFX9-W64-NEXT: s_mov_b64 s[2:3], exec 1172; GFX9-W64-NEXT: s_wqm_b64 exec, exec 1173; GFX9-W64-NEXT: v_mov_b32_e32 v0, s1 1174; GFX9-W64-NEXT: v_mov_b32_e32 v2, s0 1175; GFX9-W64-NEXT: buffer_load_dword v1, v0, s[0:3], 0 idxen 1176; GFX9-W64-NEXT: s_nop 0 1177; GFX9-W64-NEXT: buffer_load_dword v2, v2, s[0:3], 0 idxen 1178; GFX9-W64-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $scc killed $exec 1179; GFX9-W64-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $exec killed $exec 1180; GFX9-W64-NEXT: s_and_b64 exec, exec, s[2:3] 1181; GFX9-W64-NEXT: s_waitcnt vmcnt(0) 1182; GFX9-W64-NEXT: v_add_u32_e32 v1, v2, v1 1183; GFX9-W64-NEXT: buffer_store_dword v1, v0, s[0:3], 0 idxen 1184; GFX9-W64-NEXT: s_endpgm 1185; 1186; GFX10-W32-LABEL: test_set_inactive2: 1187; GFX10-W32: ; %bb.0: ; %main_body 1188; GFX10-W32-NEXT: s_mov_b32 s2, exec_lo 1189; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo 1190; GFX10-W32-NEXT: v_mov_b32_e32 v0, s1 1191; GFX10-W32-NEXT: v_mov_b32_e32 v1, s0 1192; GFX10-W32-NEXT: s_clause 0x1 1193; GFX10-W32-NEXT: buffer_load_dword v2, v0, s[0:3], 0 idxen 1194; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen 1195; GFX10-W32-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $scc killed $exec 1196; GFX10-W32-NEXT: ; kill: def $vgpr1 killed $vgpr1 killed $exec killed $exec 1197; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s2 1198; GFX10-W32-NEXT: s_waitcnt vmcnt(0) 1199; GFX10-W32-NEXT: v_add_nc_u32_e32 v1, v1, v2 1200; GFX10-W32-NEXT: buffer_store_dword v1, v0, s[0:3], 0 idxen 1201; GFX10-W32-NEXT: s_endpgm 1202main_body: 1203 %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0) 1204 %src1.0 = bitcast float %src1 to i32 1205 %src1.1 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %src1.0, i32 undef) 1206 %src0 = call float 
@llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0) 1207 %src0.0 = bitcast float %src0 to i32 1208 %src0.1 = call i32 @llvm.amdgcn.wqm.i32(i32 %src0.0) 1209 %out = add i32 %src0.1, %src1.1 1210 %out.0 = bitcast i32 %out to float 1211 call void @llvm.amdgcn.struct.buffer.store.f32(float %out.0, <4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0) 1212 ret void 1213} 1214 1215; Check a case of one branch of an if-else requiring WQM, the other requiring 1216; exact. 1217; Note: In this particular case, the save-and-restore could be avoided if the 1218; analysis understood that the two branches of the if-else are mutually 1219; exclusive. 1220define amdgpu_ps float @test_control_flow_0(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %c, i32 %z, float %data) { 1221; GFX9-W64-LABEL: test_control_flow_0: 1222; GFX9-W64: ; %bb.0: ; %main_body 1223; GFX9-W64-NEXT: s_mov_b64 s[12:13], exec 1224; GFX9-W64-NEXT: s_wqm_b64 exec, exec 1225; GFX9-W64-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 1226; GFX9-W64-NEXT: s_and_saveexec_b64 s[14:15], vcc 1227; GFX9-W64-NEXT: s_xor_b64 s[14:15], exec, s[14:15] 1228; GFX9-W64-NEXT: s_cbranch_execz .LBB23_2 1229; GFX9-W64-NEXT: ; %bb.1: ; %ELSE 1230; GFX9-W64-NEXT: s_and_saveexec_b64 s[16:17], s[12:13] 1231; GFX9-W64-NEXT: buffer_store_dword v2, v0, s[0:3], 0 idxen 1232; GFX9-W64-NEXT: ; implicit-def: $vgpr0 1233; GFX9-W64-NEXT: s_mov_b64 exec, s[16:17] 1234; GFX9-W64-NEXT: .LBB23_2: ; %Flow 1235; GFX9-W64-NEXT: s_andn2_saveexec_b64 s[14:15], s[14:15] 1236; GFX9-W64-NEXT: s_cbranch_execz .LBB23_4 1237; GFX9-W64-NEXT: ; %bb.3: ; %IF 1238; GFX9-W64-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 1239; GFX9-W64-NEXT: s_waitcnt vmcnt(0) 1240; GFX9-W64-NEXT: image_sample v2, v0, s[0:7], s[8:11] dmask:0x1 1241; GFX9-W64-NEXT: .LBB23_4: ; %END 1242; GFX9-W64-NEXT: s_or_b64 exec, exec, s[14:15] 1243; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13] 1244; GFX9-W64-NEXT: s_waitcnt vmcnt(0) 1245; GFX9-W64-NEXT: v_mov_b32_e32 
v0, v2 1246; GFX9-W64-NEXT: ; return to shader part epilog 1247; 1248; GFX10-W32-LABEL: test_control_flow_0: 1249; GFX10-W32: ; %bb.0: ; %main_body 1250; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo 1251; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo 1252; GFX10-W32-NEXT: s_mov_b32 s13, exec_lo 1253; GFX10-W32-NEXT: v_cmpx_ne_u32_e32 0, v1 1254; GFX10-W32-NEXT: s_xor_b32 s13, exec_lo, s13 1255; GFX10-W32-NEXT: s_cbranch_execz .LBB23_2 1256; GFX10-W32-NEXT: ; %bb.1: ; %ELSE 1257; GFX10-W32-NEXT: s_and_saveexec_b32 s14, s12 1258; GFX10-W32-NEXT: buffer_store_dword v2, v0, s[0:3], 0 idxen 1259; GFX10-W32-NEXT: ; implicit-def: $vgpr0 1260; GFX10-W32-NEXT: s_mov_b32 exec_lo, s14 1261; GFX10-W32-NEXT: .LBB23_2: ; %Flow 1262; GFX10-W32-NEXT: s_andn2_saveexec_b32 s13, s13 1263; GFX10-W32-NEXT: s_cbranch_execz .LBB23_4 1264; GFX10-W32-NEXT: ; %bb.3: ; %IF 1265; GFX10-W32-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D 1266; GFX10-W32-NEXT: s_waitcnt vmcnt(0) 1267; GFX10-W32-NEXT: image_sample v2, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D 1268; GFX10-W32-NEXT: .LBB23_4: ; %END 1269; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s13 1270; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12 1271; GFX10-W32-NEXT: s_waitcnt vmcnt(0) 1272; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2 1273; GFX10-W32-NEXT: s_waitcnt_vscnt null, 0x0 1274; GFX10-W32-NEXT: ; return to shader part epilog 1275main_body: 1276 %cmp = icmp eq i32 %z, 0 1277 br i1 %cmp, label %IF, label %ELSE 1278 1279IF: 1280 %c.bc = bitcast i32 %c to float 1281 %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0 1282 %tex0 = extractelement <4 x float> %tex, i32 0 1283 %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0 1284 %data.if = extractelement <4 x float> %dtex, i32 0 1285 br label %END 1286 1287ELSE: 1288 call void 
@llvm.amdgcn.struct.buffer.store.f32(float %data, <4 x i32> undef, i32 %c, i32 0, i32 0, i32 0) 1289 br label %END 1290 1291END: 1292 %r = phi float [ %data.if, %IF ], [ %data, %ELSE ] 1293 ret float %r 1294} 1295 1296; Reverse branch order compared to the previous test. 1297define amdgpu_ps float @test_control_flow_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %c, i32 %z, float %data) { 1298; GFX9-W64-LABEL: test_control_flow_1: 1299; GFX9-W64: ; %bb.0: ; %main_body 1300; GFX9-W64-NEXT: s_mov_b64 s[12:13], exec 1301; GFX9-W64-NEXT: s_wqm_b64 exec, exec 1302; GFX9-W64-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 1303; GFX9-W64-NEXT: s_and_saveexec_b64 s[14:15], vcc 1304; GFX9-W64-NEXT: s_xor_b64 s[14:15], exec, s[14:15] 1305; GFX9-W64-NEXT: s_cbranch_execz .LBB24_2 1306; GFX9-W64-NEXT: ; %bb.1: ; %IF 1307; GFX9-W64-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 1308; GFX9-W64-NEXT: s_waitcnt vmcnt(0) 1309; GFX9-W64-NEXT: image_sample v2, v0, s[0:7], s[8:11] dmask:0x1 1310; GFX9-W64-NEXT: ; implicit-def: $vgpr0 1311; GFX9-W64-NEXT: .LBB24_2: ; %Flow 1312; GFX9-W64-NEXT: s_or_saveexec_b64 s[0:1], s[14:15] 1313; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13] 1314; GFX9-W64-NEXT: s_and_b64 s[0:1], exec, s[0:1] 1315; GFX9-W64-NEXT: s_xor_b64 exec, exec, s[0:1] 1316; GFX9-W64-NEXT: s_cbranch_execz .LBB24_4 1317; GFX9-W64-NEXT: ; %bb.3: ; %ELSE 1318; GFX9-W64-NEXT: s_waitcnt vmcnt(0) 1319; GFX9-W64-NEXT: buffer_store_dword v2, v0, s[0:3], 0 idxen 1320; GFX9-W64-NEXT: .LBB24_4: ; %END 1321; GFX9-W64-NEXT: s_or_b64 exec, exec, s[0:1] 1322; GFX9-W64-NEXT: s_waitcnt vmcnt(0) 1323; GFX9-W64-NEXT: v_mov_b32_e32 v0, v2 1324; GFX9-W64-NEXT: ; return to shader part epilog 1325; 1326; GFX10-W32-LABEL: test_control_flow_1: 1327; GFX10-W32: ; %bb.0: ; %main_body 1328; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo 1329; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo 1330; GFX10-W32-NEXT: s_mov_b32 s13, exec_lo 1331; GFX10-W32-NEXT: v_cmpx_ne_u32_e32 0, v1 1332; GFX10-W32-NEXT: s_xor_b32 s13, 
exec_lo, s13 1333; GFX10-W32-NEXT: s_cbranch_execz .LBB24_2 1334; GFX10-W32-NEXT: ; %bb.1: ; %IF 1335; GFX10-W32-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D 1336; GFX10-W32-NEXT: s_waitcnt vmcnt(0) 1337; GFX10-W32-NEXT: image_sample v2, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D 1338; GFX10-W32-NEXT: ; implicit-def: $vgpr0 1339; GFX10-W32-NEXT: .LBB24_2: ; %Flow 1340; GFX10-W32-NEXT: s_or_saveexec_b32 s0, s13 1341; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12 1342; GFX10-W32-NEXT: s_and_b32 s0, exec_lo, s0 1343; GFX10-W32-NEXT: s_xor_b32 exec_lo, exec_lo, s0 1344; GFX10-W32-NEXT: s_cbranch_execz .LBB24_4 1345; GFX10-W32-NEXT: ; %bb.3: ; %ELSE 1346; GFX10-W32-NEXT: s_waitcnt vmcnt(0) 1347; GFX10-W32-NEXT: buffer_store_dword v2, v0, s[0:3], 0 idxen 1348; GFX10-W32-NEXT: .LBB24_4: ; %END 1349; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 1350; GFX10-W32-NEXT: s_waitcnt vmcnt(0) 1351; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2 1352; GFX10-W32-NEXT: s_waitcnt_vscnt null, 0x0 1353; GFX10-W32-NEXT: ; return to shader part epilog 1354main_body: 1355 %cmp = icmp eq i32 %z, 0 1356 br i1 %cmp, label %ELSE, label %IF 1357 1358IF: 1359 %c.bc = bitcast i32 %c to float 1360 %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0 1361 %tex0 = extractelement <4 x float> %tex, i32 0 1362 %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0 1363 %data.if = extractelement <4 x float> %dtex, i32 0 1364 br label %END 1365 1366ELSE: 1367 call void @llvm.amdgcn.struct.buffer.store.f32(float %data, <4 x i32> undef, i32 %c, i32 0, i32 0, i32 0) 1368 br label %END 1369 1370END: 1371 %r = phi float [ %data.if, %IF ], [ %data, %ELSE ] 1372 ret float %r 1373} 1374 1375; Check that branch conditions are properly marked as needing WQM... 
define amdgpu_ps <4 x float> @test_control_flow_2(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, <3 x i32> %idx, <2 x float> %data, i32 %coord) {
; GFX9-W64-LABEL: test_control_flow_2:
; GFX9-W64: ; %bb.0: ; %main_body
; GFX9-W64-NEXT: s_mov_b64 s[12:13], exec
; GFX9-W64-NEXT: s_wqm_b64 exec, exec
; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13]
; GFX9-W64-NEXT: buffer_store_dword v3, v0, s[0:3], 0 idxen
; GFX9-W64-NEXT: s_wqm_b64 exec, exec
; GFX9-W64-NEXT: buffer_load_dword v0, v1, s[0:3], 0 idxen
; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13]
; GFX9-W64-NEXT: buffer_store_dword v4, v2, s[0:3], 0 idxen
; GFX9-W64-NEXT: s_wqm_b64 exec, exec
; GFX9-W64-NEXT: s_waitcnt vmcnt(1)
; GFX9-W64-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v0
; GFX9-W64-NEXT: ; implicit-def: $vgpr0
; GFX9-W64-NEXT: s_and_saveexec_b64 s[14:15], vcc
; GFX9-W64-NEXT: s_xor_b64 s[14:15], exec, s[14:15]
; GFX9-W64-NEXT: ; %bb.1: ; %ELSE
; GFX9-W64-NEXT: v_lshlrev_b32_e32 v0, 2, v5
; GFX9-W64-NEXT: ; implicit-def: $vgpr5
; GFX9-W64-NEXT: ; %bb.2: ; %Flow
; GFX9-W64-NEXT: s_andn2_saveexec_b64 s[14:15], s[14:15]
; GFX9-W64-NEXT: ; %bb.3: ; %IF
; GFX9-W64-NEXT: v_mul_lo_u32 v0, v5, 3
; GFX9-W64-NEXT: ; %bb.4: ; %END
; GFX9-W64-NEXT: s_or_b64 exec, exec, s[14:15]
; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13]
; GFX9-W64-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: ; return to shader part epilog
;
; GFX10-W32-LABEL: test_control_flow_2:
; GFX10-W32: ; %bb.0: ; %main_body
; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12
; GFX10-W32-NEXT: buffer_store_dword v3, v0, s[0:3], 0 idxen
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT: buffer_load_dword v0, v1, s[0:3], 0 idxen
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: v_cmp_nlt_f32_e32 vcc_lo, 0, v0
; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12
; GFX10-W32-NEXT: buffer_store_dword v4, v2, s[0:3], 0 idxen
; GFX10-W32-NEXT: ; implicit-def: $vgpr0
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT: s_and_saveexec_b32 s13, vcc_lo
; GFX10-W32-NEXT: s_xor_b32 s13, exec_lo, s13
; GFX10-W32-NEXT: ; %bb.1: ; %ELSE
; GFX10-W32-NEXT: v_lshlrev_b32_e32 v0, 2, v5
; GFX10-W32-NEXT: ; implicit-def: $vgpr5
; GFX10-W32-NEXT: ; %bb.2: ; %Flow
; GFX10-W32-NEXT: s_andn2_saveexec_b32 s13, s13
; GFX10-W32-NEXT: ; %bb.3: ; %IF
; GFX10-W32-NEXT: v_mul_lo_u32 v0, v5, 3
; GFX10-W32-NEXT: ; %bb.4: ; %END
; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s13
; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12
; GFX10-W32-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-W32-NEXT: ; return to shader part epilog
main_body:
  %idx.1 = extractelement <3 x i32> %idx, i32 0
  %data.1 = extractelement <2 x float> %data, i32 0
  call void @llvm.amdgcn.struct.buffer.store.f32(float %data.1, <4 x i32> undef, i32 %idx.1, i32 0, i32 0, i32 0)

  ; The load that determines the branch (and should therefore be WQM) is
  ; surrounded by stores that require disabled WQM.
  ; In the expected output this appears as an s_wqm immediately before the
  ; buffer_load and an s_and with the saved exec mask immediately before each
  ; buffer_store.
  %idx.2 = extractelement <3 x i32> %idx, i32 1
  %z = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx.2, i32 0, i32 0, i32 0)

  %idx.3 = extractelement <3 x i32> %idx, i32 2
  %data.3 = extractelement <2 x float> %data, i32 1
  call void @llvm.amdgcn.struct.buffer.store.f32(float %data.3, <4 x i32> undef, i32 %idx.3, i32 0, i32 0, i32 0)

  %cc = fcmp ogt float %z, 0.0
  br i1 %cc, label %IF, label %ELSE

IF:
  %coord.IF = mul i32 %coord, 3
  br label %END

ELSE:
  %coord.ELSE = mul i32 %coord, 4
  br label %END

END:
  %coord.END = phi i32 [ %coord.IF, %IF ], [ %coord.ELSE, %ELSE ]
  %coord.END.bc = bitcast i32 %coord.END to float
  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord.END.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
  ret <4 x float> %tex
}

; ... but only if they really do need it.
; Here the branch condition is computed from %dtex.1, the result of the second
; image_sample. In the expected output exec is restored from the saved mask
; before that sample and whole-quad mode is not entered again afterwards.
define amdgpu_ps float @test_control_flow_3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %coord) {
; GFX9-W64-LABEL: test_control_flow_3:
; GFX9-W64: ; %bb.0: ; %main_body
; GFX9-W64-NEXT: s_mov_b64 s[12:13], exec
; GFX9-W64-NEXT: s_wqm_b64 exec, exec
; GFX9-W64-NEXT: image_sample v1, v1, s[0:7], s[8:11] dmask:0x1
; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13]
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: image_sample v1, v1, s[0:7], s[8:11] dmask:0x1
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v1
; GFX9-W64-NEXT: buffer_store_dword v1, v0, s[0:3], 0 idxen
; GFX9-W64-NEXT: ; implicit-def: $vgpr0
; GFX9-W64-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX9-W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX9-W64-NEXT: s_cbranch_execnz .LBB26_3
; GFX9-W64-NEXT: ; %bb.1: ; %Flow
; GFX9-W64-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX9-W64-NEXT: s_cbranch_execnz .LBB26_4
; GFX9-W64-NEXT: .LBB26_2: ; %END
; GFX9-W64-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: s_branch .LBB26_5
; GFX9-W64-NEXT: .LBB26_3: ; %ELSE
; GFX9-W64-NEXT: v_mul_f32_e32 v0, 4.0, v1
; GFX9-W64-NEXT: ; implicit-def: $vgpr1
; GFX9-W64-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX9-W64-NEXT: s_cbranch_execz .LBB26_2
; GFX9-W64-NEXT: .LBB26_4: ; %IF
; GFX9-W64-NEXT: v_mul_f32_e32 v0, 0x40400000, v1
; GFX9-W64-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: s_branch .LBB26_5
; GFX9-W64-NEXT: .LBB26_5:
;
; GFX10-W32-LABEL: test_control_flow_3:
; GFX10-W32: ; %bb.0: ; %main_body
; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT: image_sample v1, v1, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: image_sample v1, v1, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: buffer_store_dword v1, v0, s[0:3], 0 idxen
; GFX10-W32-NEXT: s_mov_b32 s0, exec_lo
; GFX10-W32-NEXT: ; implicit-def: $vgpr0
; GFX10-W32-NEXT: v_cmpx_nlt_f32_e32 0, v1
; GFX10-W32-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX10-W32-NEXT: s_cbranch_execnz .LBB26_3
; GFX10-W32-NEXT: ; %bb.1: ; %Flow
; GFX10-W32-NEXT: s_andn2_saveexec_b32 s0, s0
; GFX10-W32-NEXT: s_cbranch_execnz .LBB26_4
; GFX10-W32-NEXT: .LBB26_2: ; %END
; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX10-W32-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-W32-NEXT: s_branch .LBB26_5
; GFX10-W32-NEXT: .LBB26_3: ; %ELSE
; GFX10-W32-NEXT: v_mul_f32_e32 v0, 4.0, v1
; GFX10-W32-NEXT: ; implicit-def: $vgpr1
; GFX10-W32-NEXT: s_andn2_saveexec_b32 s0, s0
; GFX10-W32-NEXT: s_cbranch_execz .LBB26_2
; GFX10-W32-NEXT: .LBB26_4: ; %IF
; GFX10-W32-NEXT: v_mul_f32_e32 v0, 0x40400000, v1
; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX10-W32-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-W32-NEXT: s_branch .LBB26_5
; GFX10-W32-NEXT: .LBB26_5:
main_body:
  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
  %tex0 = extractelement <4 x float> %tex, i32 0
  %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
  %dtex.1 = extractelement <4 x float> %dtex, i32 0
  call void @llvm.amdgcn.struct.buffer.store.f32(float %dtex.1, <4 x i32> undef, i32 %idx, i32 0, i32 0, i32 0)

  %cc = fcmp ogt float %dtex.1, 0.0
  br i1 %cc, label %IF, label %ELSE

IF:
  %tex.IF = fmul float %dtex.1, 3.0
  br label %END

ELSE:
  %tex.ELSE = fmul float %dtex.1, 4.0
  br label %END

END:
  %tex.END = phi float [ %tex.IF, %IF ], [ %tex.ELSE, %ELSE ]
  ret float %tex.END
}

; Another test that failed at some point because of terminator handling.
; The IF block contains a buffer load and a buffer store. In the expected
; output they are wrapped in an s_and_saveexec against the saved live mask and
; a restoring s_mov, while the code before and after the block stays in
; whole-quad mode until the final image_sample.
define amdgpu_ps <4 x float> @test_control_flow_4(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float %coord, i32 %y, float %z) {
; GFX9-W64-LABEL: test_control_flow_4:
; GFX9-W64: ; %bb.0: ; %main_body
; GFX9-W64-NEXT: s_mov_b64 s[12:13], exec
; GFX9-W64-NEXT: s_wqm_b64 exec, exec
; GFX9-W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
; GFX9-W64-NEXT: s_and_saveexec_b64 s[14:15], vcc
; GFX9-W64-NEXT: s_cbranch_execz .LBB27_2
; GFX9-W64-NEXT: ; %bb.1: ; %IF
; GFX9-W64-NEXT: s_and_saveexec_b64 s[16:17], s[12:13]
; GFX9-W64-NEXT: buffer_load_dword v1, off, s[0:3], 0
; GFX9-W64-NEXT: v_mov_b32_e32 v2, 1
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: buffer_store_dword v1, v2, s[0:3], 0 idxen
; GFX9-W64-NEXT: s_mov_b64 exec, s[16:17]
; GFX9-W64-NEXT: .LBB27_2: ; %END
; GFX9-W64-NEXT: s_or_b64 exec, exec, s[14:15]
; GFX9-W64-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1
; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13]
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: ; return to shader part epilog
;
; GFX10-W32-LABEL: test_control_flow_4:
; GFX10-W32: ; %bb.0: ; %main_body
; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT: s_mov_b32 s13, exec_lo
; GFX10-W32-NEXT: v_cmpx_eq_u32_e32 0, v1
; GFX10-W32-NEXT: s_cbranch_execz .LBB27_2
; GFX10-W32-NEXT: ; %bb.1: ; %IF
; GFX10-W32-NEXT: s_and_saveexec_b32 s14, s12
; GFX10-W32-NEXT: buffer_load_dword v1, off, s[0:3], 0
; GFX10-W32-NEXT: v_mov_b32_e32 v2, 1
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: buffer_store_dword v1, v2, s[0:3], 0 idxen
; GFX10-W32-NEXT: s_mov_b32 exec_lo, s14
; GFX10-W32-NEXT: .LBB27_2: ; %END
; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s13
; GFX10-W32-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-W32-NEXT: ; return to shader part epilog
main_body:
  %cond = icmp eq i32 %y, 0
  br i1 %cond, label %IF, label %END

IF:
  %data = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> undef, i32 0, i32 0, i32 0)
  call void @llvm.amdgcn.struct.buffer.store.f32(float %data, <4 x i32> undef, i32 1, i32 0, i32 0, i32 0)
  br label %END

END:
  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
  %tex0 = extractelement <4 x float> %tex, i32 0
  %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
  ret <4 x float> %dtex
}

; Kill is performed in WQM mode so that uniform kill behaves correctly ...
; The kill condition (%z < 0.0) is evaluated after whole-quad mode has been
; re-entered following the first buffer store. In the expected output both the
; saved live mask and exec are then narrowed with s_andn2 before the remaining
; samples are issued.
define amdgpu_ps <4 x float> @test_kill_0(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <2 x i32> %idx, <2 x float> %data, float %coord, float %coord2, float %z) {
; GFX9-W64-LABEL: test_kill_0:
; GFX9-W64: ; %bb.0: ; %main_body
; GFX9-W64-NEXT: s_mov_b64 s[12:13], exec
; GFX9-W64-NEXT: s_wqm_b64 exec, exec
; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13]
; GFX9-W64-NEXT: image_sample v[7:10], v4, s[0:7], s[8:11] dmask:0xf
; GFX9-W64-NEXT: s_nop 0
; GFX9-W64-NEXT: buffer_store_dword v2, v0, s[0:3], 0 idxen
; GFX9-W64-NEXT: s_wqm_b64 exec, exec
; GFX9-W64-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v6
; GFX9-W64-NEXT: s_andn2_b64 s[12:13], s[12:13], vcc
; GFX9-W64-NEXT: s_cbranch_scc0 .LBB28_2
; GFX9-W64-NEXT: ; %bb.1: ; %main_body
; GFX9-W64-NEXT: s_andn2_b64 exec, exec, vcc
; GFX9-W64-NEXT: image_sample v0, v5, s[0:7], s[8:11] dmask:0x1
; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13]
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: image_sample v[11:14], v0, s[0:7], s[8:11] dmask:0xf
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: v_add_f32_e32 v0, v7, v11
; GFX9-W64-NEXT: buffer_store_dword v3, v1, s[0:3], 0 idxen
; GFX9-W64-NEXT: v_add_f32_e32 v1, v8, v12
; GFX9-W64-NEXT: v_add_f32_e32 v2, v9, v13
; GFX9-W64-NEXT: v_add_f32_e32 v3, v10, v14
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: s_branch .LBB28_3
; GFX9-W64-NEXT: .LBB28_2:
; GFX9-W64-NEXT: s_mov_b64 exec, 0
; GFX9-W64-NEXT: exp null off, off, off, off done vm
; GFX9-W64-NEXT: s_endpgm
; GFX9-W64-NEXT: .LBB28_3:
;
; GFX10-W32-LABEL: test_kill_0:
; GFX10-W32: ; %bb.0: ; %main_body
; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12
; GFX10-W32-NEXT: image_sample v[7:10], v4, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
; GFX10-W32-NEXT: buffer_store_dword v2, v0, s[0:3], 0 idxen
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT: v_cmp_ngt_f32_e32 vcc_lo, 0, v6
; GFX10-W32-NEXT: s_andn2_b32 s12, s12, vcc_lo
; GFX10-W32-NEXT: s_cbranch_scc0 .LBB28_2
; GFX10-W32-NEXT: ; %bb.1: ; %main_body
; GFX10-W32-NEXT: s_andn2_b32 exec_lo, exec_lo, vcc_lo
; GFX10-W32-NEXT: image_sample v0, v5, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: image_sample v[11:14], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
; GFX10-W32-NEXT: buffer_store_dword v3, v1, s[0:3], 0 idxen
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: v_add_f32_e32 v4, v8, v12
; GFX10-W32-NEXT: v_add_f32_e32 v5, v10, v14
; GFX10-W32-NEXT: v_add_f32_e32 v0, v7, v11
; GFX10-W32-NEXT: v_add_f32_e32 v2, v9, v13
; GFX10-W32-NEXT: v_mov_b32_e32 v1, v4
; GFX10-W32-NEXT: v_mov_b32_e32 v3, v5
; GFX10-W32-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-W32-NEXT: s_branch .LBB28_3
; GFX10-W32-NEXT: .LBB28_2:
; GFX10-W32-NEXT: s_mov_b32 exec_lo, 0
; GFX10-W32-NEXT: exp null off, off, off, off done vm
; GFX10-W32-NEXT: s_endpgm
; GFX10-W32-NEXT: .LBB28_3:
main_body:
  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
  %idx.0 = extractelement <2 x i32> %idx, i32 0
  %data.0 = extractelement <2 x float> %data, i32 0
  call void @llvm.amdgcn.struct.buffer.store.f32(float %data.0, <4 x i32> undef, i32 %idx.0, i32 0, i32 0, i32 0)

  %z.cmp = fcmp olt float %z, 0.0
  call void @llvm.amdgcn.kill(i1 %z.cmp)

  %idx.1 = extractelement <2 x i32> %idx, i32 1
  %data.1 = extractelement <2 x float> %data, i32 1
  call void @llvm.amdgcn.struct.buffer.store.f32(float %data.1, <4 x i32> undef, i32 %idx.1, i32 0, i32 0, i32 0)
  %tex2 = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord2, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
  %tex2.0 = extractelement <4 x float> %tex2, i32 0
  %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex2.0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
  %out = fadd <4 x float> %tex, %dtex

  ret <4 x float> %out
}

; ... but only if WQM is necessary.
; Here the kill follows the last image_sample. The expected output restores
; exec from the saved mask before the second sample and does not re-enter
; whole-quad mode for the buffer store or the kill.
define amdgpu_ps <4 x float> @test_kill_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %data, float %coord, float %coord2, float %z) {
; GFX9-W64-LABEL: test_kill_1:
; GFX9-W64: ; %bb.0: ; %main_body
; GFX9-W64-NEXT: s_mov_b64 s[12:13], exec
; GFX9-W64-NEXT: v_mov_b32_e32 v4, v2
; GFX9-W64-NEXT: s_wqm_b64 exec, exec
; GFX9-W64-NEXT: v_mov_b32_e32 v5, v0
; GFX9-W64-NEXT: image_sample v0, v1, s[0:7], s[8:11] dmask:0x1
; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13]
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf
; GFX9-W64-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v4
; GFX9-W64-NEXT: s_andn2_b64 s[12:13], s[12:13], vcc
; GFX9-W64-NEXT: buffer_store_dword v5, off, s[0:3], 0
; GFX9-W64-NEXT: s_cbranch_scc0 .LBB29_2
; GFX9-W64-NEXT: ; %bb.1: ; %main_body
; GFX9-W64-NEXT: s_andn2_b64 exec, exec, vcc
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: s_branch .LBB29_3
; GFX9-W64-NEXT: .LBB29_2:
; GFX9-W64-NEXT: s_mov_b64 exec, 0
; GFX9-W64-NEXT: exp null off, off, off, off done vm
; GFX9-W64-NEXT: s_endpgm
; GFX9-W64-NEXT: .LBB29_3:
;
; GFX10-W32-LABEL: test_kill_1:
; GFX10-W32: ; %bb.0: ; %main_body
; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo
; GFX10-W32-NEXT: v_mov_b32_e32 v4, v2
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT: v_mov_b32_e32 v5, v0
; GFX10-W32-NEXT: image_sample v0, v1, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
; GFX10-W32-NEXT: v_cmp_ngt_f32_e32 vcc_lo, 0, v4
; GFX10-W32-NEXT: buffer_store_dword v5, off, s[0:3], 0
; GFX10-W32-NEXT: s_andn2_b32 s12, s12, vcc_lo
; GFX10-W32-NEXT: s_cbranch_scc0 .LBB29_2
; GFX10-W32-NEXT: ; %bb.1: ; %main_body
; GFX10-W32-NEXT: s_andn2_b32 exec_lo, exec_lo, vcc_lo
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-W32-NEXT: s_branch .LBB29_3
; GFX10-W32-NEXT: .LBB29_2:
; GFX10-W32-NEXT: s_mov_b32 exec_lo, 0
; GFX10-W32-NEXT: exp null off, off, off, off done vm
; GFX10-W32-NEXT: s_endpgm
; GFX10-W32-NEXT: .LBB29_3:
main_body:
  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
  %tex0 = extractelement <4 x float> %tex, i32 0
  %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0

  call void @llvm.amdgcn.raw.buffer.store.f32(float %data, <4 x i32> undef, i32 0, i32 0, i32 0)

  %z.cmp = fcmp olt float %z, 0.0
  call void @llvm.amdgcn.kill(i1 %z.cmp)

  ret <4 x float> %dtex
}

; Check prolog shaders.
; The function contains only an fadd, with no image or store operations. The
; expected output still computes it in whole-quad mode and restores the saved
; exec mask before returning.
; NOTE(review): presumably this is driven by attribute group #5, which is
; defined outside this excerpt -- confirm.
define amdgpu_ps float @test_prolog_1(float %a, float %b) #5 {
; GFX9-W64-LABEL: test_prolog_1:
; GFX9-W64: ; %bb.0: ; %main_body
; GFX9-W64-NEXT: s_mov_b64 s[0:1], exec
; GFX9-W64-NEXT: s_wqm_b64 exec, exec
; GFX9-W64-NEXT: v_add_f32_e32 v0, v0, v1
; GFX9-W64-NEXT: s_and_b64 exec, exec, s[0:1]
; GFX9-W64-NEXT: ; return to shader part epilog
;
; GFX10-W32-LABEL: test_prolog_1:
; GFX10-W32: ; %bb.0: ; %main_body
; GFX10-W32-NEXT: s_mov_b32 s0, exec_lo
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT: v_add_f32_e32 v0, v0, v1
; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s0
; GFX10-W32-NEXT: ; return to shader part epilog
main_body:
  %s = fadd float %a, %b
  ret float %s
}

; Loop whose body samples an image and whose exit test is a float compare
; branched on with s_cbranch_vccz. The expected output drops to the saved live
; mask only for the image_store in the entry block, re-enters whole-quad mode
; for the loop, and restores the live mask in %break before returning.
define amdgpu_ps <4 x float> @test_loop_vcc(<4 x float> %in) nounwind {
; GFX9-W64-LABEL: test_loop_vcc:
; GFX9-W64: ; %bb.0: ; %entry
; GFX9-W64-NEXT: s_mov_b64 s[0:1], exec
; GFX9-W64-NEXT: s_wqm_b64 exec, exec
; GFX9-W64-NEXT: v_mov_b32_e32 v7, v3
; GFX9-W64-NEXT: v_mov_b32_e32 v6, v2
; GFX9-W64-NEXT: v_mov_b32_e32 v5, v1
; GFX9-W64-NEXT: v_mov_b32_e32 v4, v0
; GFX9-W64-NEXT: s_and_b64 exec, exec, s[0:1]
; GFX9-W64-NEXT: image_store v[4:7], v0, s[0:7] dmask:0xf unorm
; GFX9-W64-NEXT: s_wqm_b64 exec, exec
; GFX9-W64-NEXT: v_mov_b32_e32 v8, 0
; GFX9-W64-NEXT: s_mov_b32 s4, 0x40e00000
; GFX9-W64-NEXT: s_branch .LBB31_2
; GFX9-W64-NEXT: .LBB31_1: ; %body
; GFX9-W64-NEXT: ; in Loop: Header=BB31_2 Depth=1
; GFX9-W64-NEXT: image_sample v[4:7], v0, s[0:7], s[0:3] dmask:0xf
; GFX9-W64-NEXT: v_add_f32_e32 v8, 2.0, v8
; GFX9-W64-NEXT: s_cbranch_execz .LBB31_4
; GFX9-W64-NEXT: .LBB31_2: ; %loop
; GFX9-W64-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: v_mov_b32_e32 v0, v4
; GFX9-W64-NEXT: v_cmp_lt_f32_e32 vcc, s4, v8
; GFX9-W64-NEXT: v_mov_b32_e32 v1, v5
; GFX9-W64-NEXT: v_mov_b32_e32 v2, v6
; GFX9-W64-NEXT: v_mov_b32_e32 v3, v7
; GFX9-W64-NEXT: s_cbranch_vccz .LBB31_1
; GFX9-W64-NEXT: ; %bb.3:
; GFX9-W64-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7
; GFX9-W64-NEXT: ; implicit-def: $vgpr8
; GFX9-W64-NEXT: .LBB31_4: ; %break
; GFX9-W64-NEXT: s_and_b64 exec, exec, s[0:1]
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: ; return to shader part epilog
;
; GFX10-W32-LABEL: test_loop_vcc:
; GFX10-W32: ; %bb.0: ; %entry
; GFX10-W32-NEXT: s_mov_b32 s0, exec_lo
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT: v_mov_b32_e32 v8, 0
; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s0
; GFX10-W32-NEXT: image_store v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT: s_branch .LBB31_2
; GFX10-W32-NEXT: .p2align 6
; GFX10-W32-NEXT: .LBB31_1: ; %body
; GFX10-W32-NEXT: ; in Loop: Header=BB31_2 Depth=1
; GFX10-W32-NEXT: image_sample v[0:3], v4, s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_1D
; GFX10-W32-NEXT: v_add_f32_e32 v8, 2.0, v8
; GFX10-W32-NEXT: s_cbranch_execz .LBB31_4
; GFX10-W32-NEXT: .LBB31_2: ; %loop
; GFX10-W32-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-W32-NEXT: v_cmp_lt_f32_e32 vcc_lo, 0x40e00000, v8
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: v_mov_b32_e32 v7, v3
; GFX10-W32-NEXT: v_mov_b32_e32 v6, v2
; GFX10-W32-NEXT: v_mov_b32_e32 v5, v1
; GFX10-W32-NEXT: v_mov_b32_e32 v4, v0
; GFX10-W32-NEXT: s_cbranch_vccz .LBB31_1
; GFX10-W32-NEXT: ; %bb.3:
; GFX10-W32-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
; GFX10-W32-NEXT: ; implicit-def: $vgpr8
; GFX10-W32-NEXT: .LBB31_4: ; %break
; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s0
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: v_mov_b32_e32 v0, v4
; GFX10-W32-NEXT: v_mov_b32_e32 v1, v5
; GFX10-W32-NEXT: v_mov_b32_e32 v2, v6
; GFX10-W32-NEXT: v_mov_b32_e32 v3, v7
; GFX10-W32-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-W32-NEXT: ; return to shader part epilog
entry:
  call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> %in, i32 15, i32 undef, <8 x i32> undef, i32 0, i32 0)
  br label %loop

loop:
  %ctr.iv = phi float [ 0.0, %entry ], [ %ctr.next, %body ]
  %c.iv = phi <4 x float> [ %in, %entry ], [ %c.next, %body ]
  %cc = fcmp ogt float %ctr.iv, 7.0
  br i1 %cc, label %break, label %body

body:
  %c.iv0 = extractelement <4 x float> %c.iv, i32 0
  %c.next = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.iv0, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0) #0
  %ctr.next = fadd float %ctr.iv, 2.0
  br label %loop

break:
  ret <4 x float> %c.iv
}

; Only intrinsic stores need exact execution -- other stores do not have
; externally visible effects and may require WQM for correctness.
1928; CHECK-LABEL: {{^}}test_alloca: 1929; CHECK: s_mov_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], exec 1930; CHECK: s_wqm_b64 exec, exec 1931 1932; CHECK: s_and_b64 exec, exec, [[LIVE]] 1933; CHECK: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 1934; CHECK: s_wqm_b64 exec, exec 1935; CHECK: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:4{{$}} 1936; CHECK: s_and_b64 exec, exec, [[LIVE]] 1937; CHECK: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 idxen 1938; CHECK: s_wqm_b64 exec, exec 1939; CHECK: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen 1940 1941; CHECK: s_and_b64 exec, exec, [[LIVE]] 1942; CHECK: image_sample 1943; CHECK: buffer_store_dwordx4 1944define amdgpu_ps void @test_alloca(float %data, i32 %a, i32 %idx) nounwind { 1945; GFX9-W64-LABEL: test_alloca: 1946; GFX9-W64: ; %bb.0: ; %entry 1947; GFX9-W64-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 1948; GFX9-W64-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 1949; GFX9-W64-NEXT: s_mov_b32 s10, -1 1950; GFX9-W64-NEXT: s_mov_b32 s11, 0xe00000 1951; GFX9-W64-NEXT: s_add_u32 s8, s8, s0 1952; GFX9-W64-NEXT: s_addc_u32 s9, s9, 0 1953; GFX9-W64-NEXT: s_mov_b64 s[0:1], exec 1954; GFX9-W64-NEXT: s_wqm_b64 exec, exec 1955; GFX9-W64-NEXT: s_and_b64 exec, exec, s[0:1] 1956; GFX9-W64-NEXT: buffer_store_dword v0, off, s[0:3], 0 1957; GFX9-W64-NEXT: s_wqm_b64 exec, exec 1958; GFX9-W64-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:4 1959; GFX9-W64-NEXT: s_waitcnt vmcnt(0) 1960; GFX9-W64-NEXT: s_and_b64 exec, exec, s[0:1] 1961; GFX9-W64-NEXT: v_mov_b32_e32 v1, 1 1962; GFX9-W64-NEXT: buffer_store_dword v0, v1, s[0:3], 0 idxen 1963; GFX9-W64-NEXT: s_wqm_b64 exec, exec 1964; GFX9-W64-NEXT: v_mov_b32_e32 v0, 4 1965; GFX9-W64-NEXT: v_lshl_add_u32 v0, v2, 2, v0 1966; GFX9-W64-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen 1967; GFX9-W64-NEXT: s_and_b64 exec, exec, s[0:1] 1968; GFX9-W64-NEXT: s_waitcnt vmcnt(0) 1969; GFX9-W64-NEXT: image_sample 
v[0:3], v0, s[0:7], s[0:3] dmask:0xf 1970; GFX9-W64-NEXT: s_waitcnt vmcnt(0) 1971; GFX9-W64-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 1972; GFX9-W64-NEXT: s_endpgm 1973; 1974; GFX10-W32-LABEL: test_alloca: 1975; GFX10-W32: ; %bb.0: ; %entry 1976; GFX10-W32-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 1977; GFX10-W32-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 1978; GFX10-W32-NEXT: s_mov_b32 s10, -1 1979; GFX10-W32-NEXT: s_mov_b32 s11, 0x31c16000 1980; GFX10-W32-NEXT: s_add_u32 s8, s8, s0 1981; GFX10-W32-NEXT: s_addc_u32 s9, s9, 0 1982; GFX10-W32-NEXT: s_mov_b32 s0, exec_lo 1983; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo 1984; GFX10-W32-NEXT: v_mov_b32_e32 v3, 1 1985; GFX10-W32-NEXT: v_lshl_add_u32 v2, v2, 2, 4 1986; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s0 1987; GFX10-W32-NEXT: buffer_store_dword v0, off, s[0:3], 0 1988; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo 1989; GFX10-W32-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:4 1990; GFX10-W32-NEXT: s_waitcnt_vscnt null, 0x0 1991; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s0 1992; GFX10-W32-NEXT: buffer_store_dword v0, v3, s[0:3], 0 idxen 1993; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo 1994; GFX10-W32-NEXT: buffer_load_dword v0, v2, s[8:11], 0 offen 1995; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s0 1996; GFX10-W32-NEXT: s_waitcnt vmcnt(0) 1997; GFX10-W32-NEXT: image_sample v[0:3], v0, s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_1D 1998; GFX10-W32-NEXT: s_waitcnt vmcnt(0) 1999; GFX10-W32-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 2000; GFX10-W32-NEXT: s_endpgm 2001entry: 2002 %array = alloca [32 x i32], align 4, addrspace(5) 2003 2004 call void @llvm.amdgcn.raw.buffer.store.f32(float %data, <4 x i32> undef, i32 0, i32 0, i32 0) 2005 2006 %s.gep = getelementptr [32 x i32], [32 x i32] addrspace(5)* %array, i32 0, i32 0 2007 store volatile i32 %a, i32 addrspace(5)* %s.gep, align 4 2008 2009 call void @llvm.amdgcn.struct.buffer.store.f32(float %data, <4 x i32> undef, i32 1, i32 0, i32 0, i32 0) 2010 
2011 %c.gep = getelementptr [32 x i32], [32 x i32] addrspace(5)* %array, i32 0, i32 %idx 2012 %c = load i32, i32 addrspace(5)* %c.gep, align 4 2013 %c.bc = bitcast i32 %c to float 2014 %t = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0) #0 2015 call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %t, <4 x i32> undef, i32 0, i32 0, i32 0) 2016 2017 ret void 2018} 2019 2020; Must return to exact at the end of a non-void returning shader, 2021; otherwise the EXEC mask exported by the epilog will be wrong. This is true 2022; even if the shader has no kills, because a kill could have happened in a 2023; previous shader fragment. 2024; CHECK-LABEL: {{^}}test_nonvoid_return: 2025; CHECK: s_mov_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], exec 2026; CHECK: s_wqm_b64 exec, exec 2027; CHECK: s_and_b64 exec, exec, [[LIVE]] 2028; CHECK-NOT: exec 2029define amdgpu_ps <4 x float> @test_nonvoid_return() nounwind { 2030; GFX9-W64-LABEL: test_nonvoid_return: 2031; GFX9-W64: ; %bb.0: 2032; GFX9-W64-NEXT: s_mov_b64 s[0:1], exec 2033; GFX9-W64-NEXT: s_wqm_b64 exec, exec 2034; GFX9-W64-NEXT: image_sample v0, v0, s[0:7], s[0:3] dmask:0x1 2035; GFX9-W64-NEXT: s_and_b64 exec, exec, s[0:1] 2036; GFX9-W64-NEXT: s_waitcnt vmcnt(0) 2037; GFX9-W64-NEXT: image_sample v[0:3], v0, s[0:7], s[0:3] dmask:0xf 2038; GFX9-W64-NEXT: s_waitcnt vmcnt(0) 2039; GFX9-W64-NEXT: ; return to shader part epilog 2040; 2041; GFX10-W32-LABEL: test_nonvoid_return: 2042; GFX10-W32: ; %bb.0: 2043; GFX10-W32-NEXT: s_mov_b32 s0, exec_lo 2044; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo 2045; GFX10-W32-NEXT: image_sample v0, v0, s[0:7], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_1D 2046; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s0 2047; GFX10-W32-NEXT: s_waitcnt vmcnt(0) 2048; GFX10-W32-NEXT: image_sample v[0:3], v0, s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_1D 2049; GFX10-W32-NEXT: s_waitcnt vmcnt(0) 2050; GFX10-W32-NEXT: ; return to shader part 
epilog 2051 %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float undef, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0) #0 2052 %tex0 = extractelement <4 x float> %tex, i32 0 2053 %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0) #0 2054 ret <4 x float> %dtex 2055} 2056 2057; CHECK-LABEL: {{^}}test_nonvoid_return_unreachable: 2058; CHECK: s_mov_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], exec 2059; CHECK: s_wqm_b64 exec, exec 2060; CHECK: s_and_b64 exec, exec, [[LIVE]] 2061; CHECK-NOT: exec 2062define amdgpu_ps <4 x float> @test_nonvoid_return_unreachable(i32 inreg %c) nounwind { 2063; GFX9-W64-LABEL: test_nonvoid_return_unreachable: 2064; GFX9-W64: ; %bb.0: ; %entry 2065; GFX9-W64-NEXT: s_wqm_b64 exec, exec 2066; GFX9-W64-NEXT: image_sample v0, v0, s[0:7], s[0:3] dmask:0x1 2067; GFX9-W64-NEXT: s_and_b64 exec, exec, exec 2068; GFX9-W64-NEXT: s_waitcnt vmcnt(0) 2069; GFX9-W64-NEXT: image_sample v[0:3], v0, s[0:7], s[0:3] dmask:0xf 2070; GFX9-W64-NEXT: s_cmp_lt_i32 s0, 1 2071; GFX9-W64-NEXT: s_cbranch_scc0 .LBB34_2 2072; GFX9-W64-NEXT: ; %bb.1: ; %else 2073; GFX9-W64-NEXT: s_waitcnt vmcnt(0) 2074; GFX9-W64-NEXT: s_branch .LBB34_3 2075; GFX9-W64-NEXT: .LBB34_2: ; %if 2076; GFX9-W64-NEXT: s_waitcnt vmcnt(0) 2077; GFX9-W64-NEXT: global_store_dwordx4 v[0:1], v[0:3], off 2078; GFX9-W64-NEXT: s_waitcnt vmcnt(0) 2079; GFX9-W64-NEXT: .LBB34_3: 2080; 2081; GFX10-W32-LABEL: test_nonvoid_return_unreachable: 2082; GFX10-W32: ; %bb.0: ; %entry 2083; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo 2084; GFX10-W32-NEXT: image_sample v0, v0, s[0:7], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_1D 2085; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, exec_lo 2086; GFX10-W32-NEXT: s_waitcnt vmcnt(0) 2087; GFX10-W32-NEXT: image_sample v[0:3], v0, s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_1D 2088; GFX10-W32-NEXT: s_cmp_lt_i32 s0, 1 2089; GFX10-W32-NEXT: s_cbranch_scc0 .LBB34_2 2090; 
GFX10-W32-NEXT: ; %bb.1: ; %else 2091; GFX10-W32-NEXT: s_waitcnt vmcnt(0) 2092; GFX10-W32-NEXT: s_branch .LBB34_3 2093; GFX10-W32-NEXT: .LBB34_2: ; %if 2094; GFX10-W32-NEXT: s_waitcnt vmcnt(0) 2095; GFX10-W32-NEXT: global_store_dwordx4 v[0:1], v[0:3], off 2096; GFX10-W32-NEXT: s_waitcnt_vscnt null, 0x0 2097; GFX10-W32-NEXT: .LBB34_3: 2098entry: 2099 %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float undef, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0) #0 2100 %tex0 = extractelement <4 x float> %tex, i32 0 2101 %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0) #0 2102 %cc = icmp sgt i32 %c, 0 2103 br i1 %cc, label %if, label %else 2104 2105if: 2106 store volatile <4 x float> %dtex, <4 x float> addrspace(1)* undef 2107 unreachable 2108 2109else: 2110 ret <4 x float> %dtex 2111} 2112 2113; Test awareness that s_wqm_b64 clobbers SCC. 2114; CHECK-LABEL: {{^}}test_scc: 2115; CHECK: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec 2116; CHECK: s_wqm_b64 exec, exec 2117; CHECK: s_cmp_ 2118; CHECK-NEXT: s_cbranch_scc 2119; CHECK: ; %else 2120; CHECK: image_sample 2121; CHECK: ; %if 2122; CHECK: image_sample 2123; CHECK: ; %end 2124; CHECK: s_and_b64 exec, exec, [[ORIG]] 2125define amdgpu_ps <4 x float> @test_scc(i32 inreg %sel, i32 %idx) #1 { 2126; GFX9-W64-LABEL: test_scc: 2127; GFX9-W64: ; %bb.0: ; %main_body 2128; GFX9-W64-NEXT: s_mov_b64 s[2:3], exec 2129; GFX9-W64-NEXT: v_mov_b32_e32 v4, v0 2130; GFX9-W64-NEXT: s_wqm_b64 exec, exec 2131; GFX9-W64-NEXT: s_cmp_lt_i32 s0, 1 2132; GFX9-W64-NEXT: s_cbranch_scc0 .LBB35_2 2133; GFX9-W64-NEXT: ; %bb.1: ; %else 2134; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0 2135; GFX9-W64-NEXT: v_mov_b32_e32 v1, 1 2136; GFX9-W64-NEXT: image_sample v[0:3], v[0:1], s[0:7], s[0:3] dmask:0xf 2137; GFX9-W64-NEXT: s_cbranch_execz .LBB35_3 2138; GFX9-W64-NEXT: s_branch .LBB35_4 2139; GFX9-W64-NEXT: .LBB35_2: 2140; GFX9-W64-NEXT: ; 
implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 2141; GFX9-W64-NEXT: .LBB35_3: ; %if 2142; GFX9-W64-NEXT: s_waitcnt vmcnt(0) 2143; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0 2144; GFX9-W64-NEXT: image_sample v[0:3], v0, s[0:7], s[0:3] dmask:0xf 2145; GFX9-W64-NEXT: .LBB35_4: ; %end 2146; GFX9-W64-NEXT: s_and_b64 exec, exec, s[2:3] 2147; GFX9-W64-NEXT: v_mov_b32_e32 v5, 1.0 2148; GFX9-W64-NEXT: buffer_store_dword v5, v4, s[0:3], 0 idxen 2149; GFX9-W64-NEXT: s_waitcnt vmcnt(0) 2150; GFX9-W64-NEXT: ; return to shader part epilog 2151; 2152; GFX10-W32-LABEL: test_scc: 2153; GFX10-W32: ; %bb.0: ; %main_body 2154; GFX10-W32-NEXT: v_mov_b32_e32 v4, v0 2155; GFX10-W32-NEXT: s_mov_b32 s1, exec_lo 2156; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo 2157; GFX10-W32-NEXT: s_cmp_lt_i32 s0, 1 2158; GFX10-W32-NEXT: s_cbranch_scc0 .LBB35_2 2159; GFX10-W32-NEXT: ; %bb.1: ; %else 2160; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0 2161; GFX10-W32-NEXT: v_mov_b32_e32 v1, 1 2162; GFX10-W32-NEXT: image_sample v[0:3], v[0:1], s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_2D 2163; GFX10-W32-NEXT: s_cbranch_execz .LBB35_3 2164; GFX10-W32-NEXT: s_branch .LBB35_4 2165; GFX10-W32-NEXT: .LBB35_2: 2166; GFX10-W32-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 2167; GFX10-W32-NEXT: .LBB35_3: ; %if 2168; GFX10-W32-NEXT: s_waitcnt vmcnt(0) 2169; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0 2170; GFX10-W32-NEXT: image_sample v[0:3], v0, s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_1D 2171; GFX10-W32-NEXT: .LBB35_4: ; %end 2172; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s1 2173; GFX10-W32-NEXT: v_mov_b32_e32 v5, 1.0 2174; GFX10-W32-NEXT: buffer_store_dword v5, v4, s[0:3], 0 idxen 2175; GFX10-W32-NEXT: s_waitcnt vmcnt(0) 2176; GFX10-W32-NEXT: s_waitcnt_vscnt null, 0x0 2177; GFX10-W32-NEXT: ; return to shader part epilog 2178main_body: 2179 %cc = icmp sgt i32 %sel, 0 2180 br i1 %cc, label %if, label %else 2181 2182if: 2183 %r.if = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float 0.0, <8 x i32> undef, <4 x i32> undef, i1 
false, i32 0, i32 0) #0 2184 br label %end 2185 2186else: 2187 %r.else = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float 0.0, float bitcast (i32 1 to float), <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0) #0 2188 br label %end 2189 2190end: 2191 %r = phi <4 x float> [ %r.if, %if ], [ %r.else, %else ] 2192 call void @llvm.amdgcn.struct.buffer.store.f32(float 1.0, <4 x i32> undef, i32 %idx, i32 0, i32 0, i32 0) 2193 ret <4 x float> %r 2194} 2195 2196; Check a case of a block being entirely WQM except for a bit of WWM. 2197; There was a bug where it forgot to enter and leave WWM. 2198define amdgpu_ps float @test_wwm_within_wqm(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %c, i32 %z, float %data) { 2199; GFX9-W64-LABEL: test_wwm_within_wqm: 2200; GFX9-W64: ; %bb.0: ; %main_body 2201; GFX9-W64-NEXT: s_mov_b64 s[12:13], exec 2202; GFX9-W64-NEXT: s_wqm_b64 exec, exec 2203; GFX9-W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 2204; GFX9-W64-NEXT: v_mov_b32_e32 v1, 0 2205; GFX9-W64-NEXT: s_and_saveexec_b64 s[14:15], vcc 2206; GFX9-W64-NEXT: s_cbranch_execz .LBB36_2 2207; GFX9-W64-NEXT: ; %bb.1: ; %IF 2208; GFX9-W64-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 2209; GFX9-W64-NEXT: s_waitcnt vmcnt(0) 2210; GFX9-W64-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 2211; GFX9-W64-NEXT: s_waitcnt vmcnt(0) 2212; GFX9-W64-NEXT: v_cvt_i32_f32_e32 v0, v0 2213; GFX9-W64-NEXT: v_mov_b32_e32 v2, v0 2214; GFX9-W64-NEXT: s_not_b64 exec, exec 2215; GFX9-W64-NEXT: v_mov_b32_e32 v2, 0 2216; GFX9-W64-NEXT: s_not_b64 exec, exec 2217; GFX9-W64-NEXT: s_or_saveexec_b64 s[0:1], -1 2218; GFX9-W64-NEXT: ds_swizzle_b32 v2, v2 offset:swizzle(SWAP,2) 2219; GFX9-W64-NEXT: s_mov_b64 exec, s[0:1] 2220; GFX9-W64-NEXT: s_waitcnt lgkmcnt(0) 2221; GFX9-W64-NEXT: v_mov_b32_e32 v0, v2 2222; GFX9-W64-NEXT: v_cvt_f32_i32_e32 v1, v0 2223; GFX9-W64-NEXT: .LBB36_2: ; %ENDIF 2224; GFX9-W64-NEXT: s_or_b64 exec, exec, s[14:15] 2225; GFX9-W64-NEXT: s_and_b64 exec, exec, 
s[12:13] 2226; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1 2227; GFX9-W64-NEXT: ; return to shader part epilog 2228; 2229; GFX10-W32-LABEL: test_wwm_within_wqm: 2230; GFX10-W32: ; %bb.0: ; %main_body 2231; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo 2232; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo 2233; GFX10-W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 2234; GFX10-W32-NEXT: v_mov_b32_e32 v1, 0 2235; GFX10-W32-NEXT: s_and_saveexec_b32 s13, vcc_lo 2236; GFX10-W32-NEXT: s_cbranch_execz .LBB36_2 2237; GFX10-W32-NEXT: ; %bb.1: ; %IF 2238; GFX10-W32-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D 2239; GFX10-W32-NEXT: s_waitcnt vmcnt(0) 2240; GFX10-W32-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D 2241; GFX10-W32-NEXT: s_waitcnt vmcnt(0) 2242; GFX10-W32-NEXT: v_cvt_i32_f32_e32 v0, v0 2243; GFX10-W32-NEXT: v_mov_b32_e32 v2, v0 2244; GFX10-W32-NEXT: s_not_b32 exec_lo, exec_lo 2245; GFX10-W32-NEXT: v_mov_b32_e32 v2, 0 2246; GFX10-W32-NEXT: s_not_b32 exec_lo, exec_lo 2247; GFX10-W32-NEXT: s_or_saveexec_b32 s0, -1 2248; GFX10-W32-NEXT: ds_swizzle_b32 v2, v2 offset:swizzle(SWAP,2) 2249; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0 2250; GFX10-W32-NEXT: s_waitcnt lgkmcnt(0) 2251; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2 2252; GFX10-W32-NEXT: v_cvt_f32_i32_e32 v1, v0 2253; GFX10-W32-NEXT: .LBB36_2: ; %ENDIF 2254; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s13 2255; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12 2256; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1 2257; GFX10-W32-NEXT: ; return to shader part epilog 2258main_body: 2259 %cmp = icmp eq i32 %z, 0 2260 br i1 %cmp, label %IF, label %ENDIF 2261 2262IF: 2263 %c.bc = bitcast i32 %c to float 2264 %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0 2265 %tex0 = extractelement <4 x float> %tex, i32 0 2266 %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> 
%sampler, i1 false, i32 0, i32 0) #0 2267 %dataf = extractelement <4 x float> %dtex, i32 0 2268 %data1 = fptosi float %dataf to i32 2269 %data2 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %data1, i32 0) 2270 %data3 = call i32 @llvm.amdgcn.ds.swizzle(i32 %data2, i32 2079) 2271 %data4 = call i32 @llvm.amdgcn.wwm.i32(i32 %data3) 2272 %data4f = sitofp i32 %data4 to float 2273 br label %ENDIF 2274 2275ENDIF: 2276 %r = phi float [ %data4f, %IF ], [ 0.0, %main_body ] 2277 ret float %r 2278} 2279 2280; Check that WWM is triggered by the strict_wwm intrinsic. 2281define amdgpu_ps float @test_strict_wwm1(i32 inreg %idx0, i32 inreg %idx1) { 2282; GFX9-W64-LABEL: test_strict_wwm1: 2283; GFX9-W64: ; %bb.0: ; %main_body 2284; GFX9-W64-NEXT: s_or_saveexec_b64 s[2:3], -1 2285; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0 2286; GFX9-W64-NEXT: v_mov_b32_e32 v2, s1 2287; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen 2288; GFX9-W64-NEXT: s_nop 0 2289; GFX9-W64-NEXT: buffer_load_dword v2, v2, s[0:3], 0 idxen 2290; GFX9-W64-NEXT: s_waitcnt vmcnt(0) 2291; GFX9-W64-NEXT: v_add_f32_e32 v1, v1, v2 2292; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3] 2293; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1 2294; GFX9-W64-NEXT: ; return to shader part epilog 2295; 2296; GFX10-W32-LABEL: test_strict_wwm1: 2297; GFX10-W32: ; %bb.0: ; %main_body 2298; GFX10-W32-NEXT: s_or_saveexec_b32 s2, -1 2299; GFX10-W32-NEXT: v_mov_b32_e32 v1, s0 2300; GFX10-W32-NEXT: v_mov_b32_e32 v2, s1 2301; GFX10-W32-NEXT: s_clause 0x1 2302; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen 2303; GFX10-W32-NEXT: buffer_load_dword v2, v2, s[0:3], 0 idxen 2304; GFX10-W32-NEXT: s_waitcnt vmcnt(0) 2305; GFX10-W32-NEXT: v_add_f32_e32 v1, v1, v2 2306; GFX10-W32-NEXT: s_mov_b32 exec_lo, s2 2307; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1 2308; GFX10-W32-NEXT: ; return to shader part epilog 2309main_body: 2310 %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0) 2311 %src1 = call float 
@llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0) 2312 %out = fadd float %src0, %src1 2313 %out.0 = call float @llvm.amdgcn.strict.wwm.f32(float %out) 2314 ret float %out.0 2315} 2316 2317; Same as above, but with an integer type. 2318define amdgpu_ps float @test_strict_wwm2(i32 inreg %idx0, i32 inreg %idx1) { 2319; GFX9-W64-LABEL: test_strict_wwm2: 2320; GFX9-W64: ; %bb.0: ; %main_body 2321; GFX9-W64-NEXT: s_or_saveexec_b64 s[2:3], -1 2322; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0 2323; GFX9-W64-NEXT: v_mov_b32_e32 v2, s1 2324; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen 2325; GFX9-W64-NEXT: s_nop 0 2326; GFX9-W64-NEXT: buffer_load_dword v2, v2, s[0:3], 0 idxen 2327; GFX9-W64-NEXT: s_waitcnt vmcnt(0) 2328; GFX9-W64-NEXT: v_add_u32_e32 v1, v1, v2 2329; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3] 2330; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1 2331; GFX9-W64-NEXT: ; return to shader part epilog 2332; 2333; GFX10-W32-LABEL: test_strict_wwm2: 2334; GFX10-W32: ; %bb.0: ; %main_body 2335; GFX10-W32-NEXT: s_or_saveexec_b32 s2, -1 2336; GFX10-W32-NEXT: v_mov_b32_e32 v1, s0 2337; GFX10-W32-NEXT: v_mov_b32_e32 v2, s1 2338; GFX10-W32-NEXT: s_clause 0x1 2339; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen 2340; GFX10-W32-NEXT: buffer_load_dword v2, v2, s[0:3], 0 idxen 2341; GFX10-W32-NEXT: s_waitcnt vmcnt(0) 2342; GFX10-W32-NEXT: v_add_nc_u32_e32 v1, v1, v2 2343; GFX10-W32-NEXT: s_mov_b32 exec_lo, s2 2344; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1 2345; GFX10-W32-NEXT: ; return to shader part epilog 2346main_body: 2347 %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0) 2348 %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0) 2349 %src0.0 = bitcast float %src0 to i32 2350 %src1.0 = bitcast float %src1 to i32 2351 %out = add i32 %src0.0, %src1.0 2352 %out.0 = call i32 @llvm.amdgcn.strict.wwm.i32(i32 %out) 2353 %out.1 = bitcast i32 %out.0 
to float 2354 ret float %out.1 2355} 2356 2357; Check that we don't leave WWM on for computations that don't require WWM, 2358; since that will lead to clobbering things that aren't supposed to be clobbered 2359; in cases like this. 2360; We enforce this by checking that v_add gets emitted in the same block as 2361; WWM computations. 2362define amdgpu_ps float @test_strict_wwm3(i32 inreg %idx) { 2363; GFX9-W64-LABEL: test_strict_wwm3: 2364; GFX9-W64: ; %bb.0: ; %main_body 2365; GFX9-W64-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 2366; GFX9-W64-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0 2367; GFX9-W64-NEXT: v_cmp_gt_u32_e32 vcc, 16, v0 2368; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0 2369; GFX9-W64-NEXT: s_and_saveexec_b64 s[2:3], vcc 2370; GFX9-W64-NEXT: s_cbranch_execz .LBB39_2 2371; GFX9-W64-NEXT: ; %bb.1: ; %if 2372; GFX9-W64-NEXT: s_or_saveexec_b64 s[4:5], -1 2373; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0 2374; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen 2375; GFX9-W64-NEXT: s_waitcnt vmcnt(0) 2376; GFX9-W64-NEXT: v_add_f32_e32 v2, v1, v1 2377; GFX9-W64-NEXT: s_mov_b64 exec, s[4:5] 2378; GFX9-W64-NEXT: v_mov_b32_e32 v0, v2 2379; GFX9-W64-NEXT: v_add_f32_e32 v0, v1, v0 2380; GFX9-W64-NEXT: .LBB39_2: ; %endif 2381; GFX9-W64-NEXT: s_or_b64 exec, exec, s[2:3] 2382; GFX9-W64-NEXT: ; return to shader part epilog 2383; 2384; GFX10-W32-LABEL: test_strict_wwm3: 2385; GFX10-W32: ; %bb.0: ; %main_body 2386; GFX10-W32-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 2387; GFX10-W32-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0 2388; GFX10-W32-NEXT: v_cmp_gt_u32_e32 vcc_lo, 16, v0 2389; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0 2390; GFX10-W32-NEXT: s_and_saveexec_b32 s1, vcc_lo 2391; GFX10-W32-NEXT: s_cbranch_execz .LBB39_2 2392; GFX10-W32-NEXT: ; %bb.1: ; %if 2393; GFX10-W32-NEXT: s_or_saveexec_b32 s2, -1 2394; GFX10-W32-NEXT: v_mov_b32_e32 v1, s0 2395; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen 2396; GFX10-W32-NEXT: s_waitcnt vmcnt(0) 2397; GFX10-W32-NEXT: v_add_f32_e32 v2, v1, v1 2398; GFX10-W32-NEXT: 
s_mov_b32 exec_lo, s2 2399; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2 2400; GFX10-W32-NEXT: v_add_f32_e32 v0, v1, v0 2401; GFX10-W32-NEXT: .LBB39_2: ; %endif 2402; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 2403; GFX10-W32-NEXT: ; return to shader part epilog 2404main_body: 2405 ; use mbcnt to make sure the branch is divergent 2406 %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) 2407 %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo) 2408 %cc = icmp uge i32 %hi, 16 2409 br i1 %cc, label %endif, label %if 2410 2411if: 2412 %src = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx, i32 0, i32 0, i32 0) 2413 %out = fadd float %src, %src 2414 %out.0 = call float @llvm.amdgcn.strict.wwm.f32(float %out) 2415 %out.1 = fadd float %src, %out.0 2416 br label %endif 2417 2418endif: 2419 %out.2 = phi float [ %out.1, %if ], [ 0.0, %main_body ] 2420 ret float %out.2 2421} 2422 2423; Check that WWM writes aren't coalesced with non-WWM writes, since the WWM 2424; write could clobber disabled channels in the non-WWM one. 2425; We enforce this by checking that v_mov gets emitted in the same block as 2426; WWM computations. 
2427define amdgpu_ps float @test_strict_wwm4(i32 inreg %idx) { 2428; GFX9-W64-LABEL: test_strict_wwm4: 2429; GFX9-W64: ; %bb.0: ; %main_body 2430; GFX9-W64-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 2431; GFX9-W64-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0 2432; GFX9-W64-NEXT: v_cmp_gt_u32_e32 vcc, 16, v0 2433; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0 2434; GFX9-W64-NEXT: s_and_saveexec_b64 s[2:3], vcc 2435; GFX9-W64-NEXT: s_cbranch_execz .LBB40_2 2436; GFX9-W64-NEXT: ; %bb.1: ; %if 2437; GFX9-W64-NEXT: s_or_saveexec_b64 s[4:5], -1 2438; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0 2439; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen 2440; GFX9-W64-NEXT: s_waitcnt vmcnt(0) 2441; GFX9-W64-NEXT: v_add_f32_e32 v1, v1, v1 2442; GFX9-W64-NEXT: s_mov_b64 exec, s[4:5] 2443; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1 2444; GFX9-W64-NEXT: .LBB40_2: ; %endif 2445; GFX9-W64-NEXT: s_or_b64 exec, exec, s[2:3] 2446; GFX9-W64-NEXT: ; return to shader part epilog 2447; 2448; GFX10-W32-LABEL: test_strict_wwm4: 2449; GFX10-W32: ; %bb.0: ; %main_body 2450; GFX10-W32-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 2451; GFX10-W32-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0 2452; GFX10-W32-NEXT: v_cmp_gt_u32_e32 vcc_lo, 16, v0 2453; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0 2454; GFX10-W32-NEXT: s_and_saveexec_b32 s1, vcc_lo 2455; GFX10-W32-NEXT: s_cbranch_execz .LBB40_2 2456; GFX10-W32-NEXT: ; %bb.1: ; %if 2457; GFX10-W32-NEXT: s_or_saveexec_b32 s2, -1 2458; GFX10-W32-NEXT: v_mov_b32_e32 v1, s0 2459; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen 2460; GFX10-W32-NEXT: s_waitcnt vmcnt(0) 2461; GFX10-W32-NEXT: v_add_f32_e32 v1, v1, v1 2462; GFX10-W32-NEXT: s_mov_b32 exec_lo, s2 2463; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1 2464; GFX10-W32-NEXT: .LBB40_2: ; %endif 2465; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 2466; GFX10-W32-NEXT: ; return to shader part epilog 2467main_body: 2468 ; use mbcnt to make sure the branch is divergent (the checks show it lowered to s_and_saveexec + s_cbranch_execz, with the v_mov of the WWM result placed after exec is restored) 2469 %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) 2470 %hi = call i32 
@llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo) 2471 %cc = icmp uge i32 %hi, 16 2472 br i1 %cc, label %endif, label %if 2473 2474if: 2475 %src = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx, i32 0, i32 0, i32 0) 2476 %out = fadd float %src, %src 2477 %out.0 = call float @llvm.amdgcn.strict.wwm.f32(float %out) 2478 br label %endif 2479 2480endif: 2481 %out.1 = phi float [ %out.0, %if ], [ 0.0, %main_body ] 2482 ret float %out.1 2483} 2484 2485; Make sure the transition from Exact to WWM then WQM works properly. In the IR below the buffer store is the Exact part, %temp feeds strict.wwm (WWM) and %out feeds wqm (WQM). 2486define amdgpu_ps float @test_strict_wwm5(i32 inreg %idx0, i32 inreg %idx1) { 2487; GFX9-W64-LABEL: test_strict_wwm5: 2488; GFX9-W64: ; %bb.0: ; %main_body 2489; GFX9-W64-NEXT: s_mov_b64 s[2:3], exec 2490; GFX9-W64-NEXT: v_mov_b32_e32 v0, s0 2491; GFX9-W64-NEXT: buffer_load_dword v2, v0, s[0:3], 0 idxen 2492; GFX9-W64-NEXT: s_waitcnt vmcnt(0) 2493; GFX9-W64-NEXT: buffer_store_dword v2, v0, s[0:3], 0 idxen 2494; GFX9-W64-NEXT: s_or_saveexec_b64 s[4:5], -1 2495; GFX9-W64-NEXT: v_mov_b32_e32 v1, s1 2496; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen 2497; GFX9-W64-NEXT: s_waitcnt vmcnt(0) 2498; GFX9-W64-NEXT: v_add_f32_e32 v1, v1, v1 2499; GFX9-W64-NEXT: s_mov_b64 exec, s[4:5] 2500; GFX9-W64-NEXT: s_wqm_b64 exec, exec 2501; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1 2502; GFX9-W64-NEXT: v_add_f32_e32 v0, v0, v0 2503; GFX9-W64-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec killed $exec 2504; GFX9-W64-NEXT: s_and_b64 exec, exec, s[2:3] 2505; GFX9-W64-NEXT: ; return to shader part epilog 2506; 2507; GFX10-W32-LABEL: test_strict_wwm5: 2508; GFX10-W32: ; %bb.0: ; %main_body 2509; GFX10-W32-NEXT: v_mov_b32_e32 v0, s0 2510; GFX10-W32-NEXT: s_mov_b32 s2, exec_lo 2511; GFX10-W32-NEXT: buffer_load_dword v2, v0, s[0:3], 0 idxen 2512; GFX10-W32-NEXT: s_or_saveexec_b32 s0, -1 2513; GFX10-W32-NEXT: v_mov_b32_e32 v1, s1 2514; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0 2515; GFX10-W32-NEXT: s_waitcnt vmcnt(0) 2516; GFX10-W32-NEXT: buffer_store_dword 
v2, v0, s[0:3], 0 idxen 2517; GFX10-W32-NEXT: s_or_saveexec_b32 s0, -1 2518; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen 2519; GFX10-W32-NEXT: s_waitcnt vmcnt(0) 2520; GFX10-W32-NEXT: v_add_f32_e32 v1, v1, v1 2521; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0 2522; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo 2523; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1 2524; GFX10-W32-NEXT: v_add_f32_e32 v0, v0, v0 2525; GFX10-W32-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec killed $exec 2526; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s2 2527; GFX10-W32-NEXT: s_waitcnt_vscnt null, 0x0 2528; GFX10-W32-NEXT: ; return to shader part epilog 2529main_body: 2530 %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0) 2531 call void @llvm.amdgcn.struct.buffer.store.f32(float %src0, <4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0) ; Exact: in the checks the store sits outside the s_or_saveexec -1 regions and before s_wqm 2532 %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0) 2533 %temp = fadd float %src1, %src1 2534 %temp.0 = call float @llvm.amdgcn.strict.wwm.f32(float %temp) ; WWM: the load and fadd feeding this appear between s_or_saveexec -1 and the exec restore in the checks 2535 %out = fadd float %temp.0, %temp.0 2536 %out.0 = call float @llvm.amdgcn.wqm.f32(float %out) ; WQM: the checks run the final fadd after s_wqm, then AND exec with the mask saved at the start 2537 ret float %out.0 2538} 2539 2540; Check that WWM is turned on correctly across basic block boundaries. 
2541; if..then..endif version. NOTE: the SI-CHECK/VI-CHECK lines that follow are legacy directives; the RUN lines at the top of this file only enable the GFX9-W64 and GFX10-W32 prefixes, so FileCheck never evaluates them (and the generated code below uses global_load_dword, not buffer or flat loads). 2542;SI-CHECK: buffer_load_dword 2543;VI-CHECK: flat_load_dword 2544;SI-CHECK: buffer_load_dword 2545;VI-CHECK: flat_load_dword 2546define amdgpu_ps float @test_strict_wwm6_then() { 2547; GFX9-W64-LABEL: test_strict_wwm6_then: 2548; GFX9-W64: ; %bb.0: ; %main_body 2549; GFX9-W64-NEXT: s_or_saveexec_b64 s[0:1], -1 2550; GFX9-W64-NEXT: global_load_dword v1, v[3:4], off glc 2551; GFX9-W64-NEXT: s_waitcnt vmcnt(0) 2552; GFX9-W64-NEXT: s_mov_b64 exec, s[0:1] 2553; GFX9-W64-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 2554; GFX9-W64-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0 2555; GFX9-W64-NEXT: v_cmp_gt_u32_e32 vcc, 16, v0 2556; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0 2557; GFX9-W64-NEXT: s_and_saveexec_b64 s[0:1], vcc 2558; GFX9-W64-NEXT: s_cbranch_execz .LBB42_2 2559; GFX9-W64-NEXT: ; %bb.1: ; %if 2560; GFX9-W64-NEXT: s_or_saveexec_b64 s[2:3], -1 2561; GFX9-W64-NEXT: global_load_dword v2, v[3:4], off glc 2562; GFX9-W64-NEXT: s_waitcnt vmcnt(0) 2563; GFX9-W64-NEXT: v_add_f32_e32 v1, v1, v2 2564; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3] 2565; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1 2566; GFX9-W64-NEXT: .LBB42_2: ; %endif 2567; GFX9-W64-NEXT: s_or_b64 exec, exec, s[0:1] 2568; GFX9-W64-NEXT: ; return to shader part epilog 2569; 2570; GFX10-W32-LABEL: test_strict_wwm6_then: 2571; GFX10-W32: ; %bb.0: ; %main_body 2572; GFX10-W32-NEXT: s_or_saveexec_b32 s0, -1 2573; GFX10-W32-NEXT: global_load_dword v1, v[3:4], off glc dlc 2574; GFX10-W32-NEXT: s_waitcnt vmcnt(0) 2575; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0 2576; GFX10-W32-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 2577; GFX10-W32-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0 2578; GFX10-W32-NEXT: v_cmp_gt_u32_e32 vcc_lo, 16, v0 2579; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0 2580; GFX10-W32-NEXT: s_and_saveexec_b32 s0, vcc_lo 2581; GFX10-W32-NEXT: s_cbranch_execz .LBB42_2 2582; GFX10-W32-NEXT: ; %bb.1: ; %if 2583; GFX10-W32-NEXT: s_or_saveexec_b32 s1, -1 2584; GFX10-W32-NEXT: global_load_dword v2, v[3:4], off glc dlc 2585; GFX10-W32-NEXT: 
s_waitcnt vmcnt(0) 2586; GFX10-W32-NEXT: v_add_f32_e32 v1, v1, v2 2587; GFX10-W32-NEXT: s_mov_b32 exec_lo, s1 2588; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1 2589; GFX10-W32-NEXT: .LBB42_2: ; %endif 2590; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 2591; GFX10-W32-NEXT: ; return to shader part epilog 2592main_body: 2593 %src0 = load volatile float, float addrspace(1)* undef 2594 ; use mbcnt to make sure the branch is divergent 2595 %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) 2596 %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo) 2597 %cc = icmp uge i32 %hi, 16 2598 br i1 %cc, label %endif, label %if 2599 2600if: 2601 %src1 = load volatile float, float addrspace(1)* undef 2602 %out = fadd float %src0, %src1 2603 %out.0 = call float @llvm.amdgcn.strict.wwm.f32(float %out) 2604 br label %endif 2605 2606endif: 2607 %out.1 = phi float [ %out.0, %if ], [ 0.0, %main_body ] 2608 ret float %out.1 2609} 2610 2611; Check that WWM is turned on correctly across basic block boundaries. 2612; loop version 2613define amdgpu_ps float @test_strict_wwm6_loop() { 2614; GFX9-W64-LABEL: test_strict_wwm6_loop: 2615; GFX9-W64: ; %bb.0: ; %main_body 2616; GFX9-W64-NEXT: s_or_saveexec_b64 s[0:1], -1 2617; GFX9-W64-NEXT: global_load_dword v1, v[3:4], off glc 2618; GFX9-W64-NEXT: s_waitcnt vmcnt(0) 2619; GFX9-W64-NEXT: s_mov_b64 exec, s[0:1] 2620; GFX9-W64-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 2621; GFX9-W64-NEXT: v_mbcnt_hi_u32_b32 v3, -1, v0 2622; GFX9-W64-NEXT: s_mov_b64 s[0:1], 0 2623; GFX9-W64-NEXT: .LBB43_1: ; %loop 2624; GFX9-W64-NEXT: ; =>This Inner Loop Header: Depth=1 2625; GFX9-W64-NEXT: s_or_saveexec_b64 s[2:3], -1 2626; GFX9-W64-NEXT: global_load_dword v2, v[3:4], off glc 2627; GFX9-W64-NEXT: s_waitcnt vmcnt(0) 2628; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3] 2629; GFX9-W64-NEXT: v_add_u32_e32 v3, -1, v3 2630; GFX9-W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 2631; GFX9-W64-NEXT: s_or_saveexec_b64 s[2:3], -1 2632; GFX9-W64-NEXT: v_add_f32_e32 v2, v1, v2 2633; GFX9-W64-NEXT: s_mov_b64 
exec, s[2:3] 2634; GFX9-W64-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 2635; GFX9-W64-NEXT: v_mov_b32_e32 v0, v2 2636; GFX9-W64-NEXT: s_andn2_b64 exec, exec, s[0:1] 2637; GFX9-W64-NEXT: s_cbranch_execnz .LBB43_1 2638; GFX9-W64-NEXT: ; %bb.2: ; %endloop 2639; GFX9-W64-NEXT: s_or_b64 exec, exec, s[0:1] 2640; GFX9-W64-NEXT: ; return to shader part epilog 2641; 2642; GFX10-W32-LABEL: test_strict_wwm6_loop: 2643; GFX10-W32: ; %bb.0: ; %main_body 2644; GFX10-W32-NEXT: s_or_saveexec_b32 s0, -1 2645; GFX10-W32-NEXT: global_load_dword v1, v[3:4], off glc dlc 2646; GFX10-W32-NEXT: s_waitcnt vmcnt(0) 2647; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0 2648; GFX10-W32-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 2649; GFX10-W32-NEXT: s_mov_b32 s0, 0 2650; GFX10-W32-NEXT: v_mbcnt_hi_u32_b32 v3, -1, v0 2651; GFX10-W32-NEXT: .LBB43_1: ; %loop 2652; GFX10-W32-NEXT: ; =>This Inner Loop Header: Depth=1 2653; GFX10-W32-NEXT: s_or_saveexec_b32 s1, -1 2654; GFX10-W32-NEXT: global_load_dword v2, v[3:4], off glc dlc 2655; GFX10-W32-NEXT: s_waitcnt vmcnt(0) 2656; GFX10-W32-NEXT: s_mov_b32 exec_lo, s1 2657; GFX10-W32-NEXT: v_add_nc_u32_e32 v3, -1, v3 2658; GFX10-W32-NEXT: s_or_saveexec_b32 s1, -1 2659; GFX10-W32-NEXT: v_add_f32_e32 v2, v1, v2 2660; GFX10-W32-NEXT: s_mov_b32 exec_lo, s1 2661; GFX10-W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 2662; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2 2663; GFX10-W32-NEXT: s_or_b32 s0, vcc_lo, s0 2664; GFX10-W32-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 2665; GFX10-W32-NEXT: s_cbranch_execnz .LBB43_1 2666; GFX10-W32-NEXT: ; %bb.2: ; %endloop 2667; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 2668; GFX10-W32-NEXT: ; return to shader part epilog 2669main_body: 2670 %src0 = load volatile float, float addrspace(1)* undef 2671 ; use mbcnt to make sure the branch is divergent 2672 %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) 2673 %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo) 2674 br label %loop 2675 2676loop: 2677 %counter = phi i32 [ %hi, %main_body ], [ %counter.1, %loop ] 2678 
%src1 = load volatile float, float addrspace(1)* undef 2679 %out = fadd float %src0, %src1 2680 %out.0 = call float @llvm.amdgcn.strict.wwm.f32(float %out) 2681 %counter.1 = sub i32 %counter, 1 2682 %cc = icmp ne i32 %counter.1, 0 2683 br i1 %cc, label %loop, label %endloop 2684 2685endloop: 2686 ret float %out.0 2687} 2688 2689; Check that @llvm.amdgcn.set.inactive disables WWM. 2690define amdgpu_ps void @test_strict_wwm_set_inactive1(i32 inreg %idx) { 2691; GFX9-W64-LABEL: test_strict_wwm_set_inactive1: 2692; GFX9-W64: ; %bb.0: ; %main_body 2693; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0 2694; GFX9-W64-NEXT: buffer_load_dword v2, v1, s[0:3], 0 idxen 2695; GFX9-W64-NEXT: s_waitcnt vmcnt(0) 2696; GFX9-W64-NEXT: v_mov_b32_e32 v0, v2 2697; GFX9-W64-NEXT: s_not_b64 exec, exec 2698; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0 2699; GFX9-W64-NEXT: s_not_b64 exec, exec 2700; GFX9-W64-NEXT: s_or_saveexec_b64 s[0:1], -1 2701; GFX9-W64-NEXT: v_add_u32_e32 v0, v0, v0 2702; GFX9-W64-NEXT: s_mov_b64 exec, s[0:1] 2703; GFX9-W64-NEXT: v_mov_b32_e32 v2, v0 2704; GFX9-W64-NEXT: buffer_store_dword v2, v1, s[0:3], 0 idxen 2705; GFX9-W64-NEXT: s_endpgm 2706; 2707; GFX10-W32-LABEL: test_strict_wwm_set_inactive1: 2708; GFX10-W32: ; %bb.0: ; %main_body 2709; GFX10-W32-NEXT: v_mov_b32_e32 v1, s0 2710; GFX10-W32-NEXT: buffer_load_dword v2, v1, s[0:3], 0 idxen 2711; GFX10-W32-NEXT: s_waitcnt vmcnt(0) 2712; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2 2713; GFX10-W32-NEXT: s_not_b32 exec_lo, exec_lo 2714; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0 2715; GFX10-W32-NEXT: s_not_b32 exec_lo, exec_lo 2716; GFX10-W32-NEXT: s_or_saveexec_b32 s0, -1 2717; GFX10-W32-NEXT: v_add_nc_u32_e32 v0, v0, v0 2718; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0 2719; GFX10-W32-NEXT: v_mov_b32_e32 v2, v0 2720; GFX10-W32-NEXT: buffer_store_dword v2, v1, s[0:3], 0 idxen 2721; GFX10-W32-NEXT: s_endpgm 2722main_body: 2723 %src = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx, i32 0, i32 0, i32 0) 2724 %src.0 = bitcast float 
%src to i32 2725 %src.1 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %src.0, i32 0) 2726 %out = add i32 %src.1, %src.1 2727 %out.0 = call i32 @llvm.amdgcn.strict.wwm.i32(i32 %out) 2728 %out.1 = bitcast i32 %out.0 to float 2729 call void @llvm.amdgcn.struct.buffer.store.f32(float %out.1, <4 x i32> undef, i32 %idx, i32 0, i32 0, i32 0) 2730 ret void 2731} 2732 2733; Check a case of a block being entirely WQM except for a bit of WWM. 2734; There was a bug where it forgot to enter and leave WWM. 2735define amdgpu_ps float @test_strict_wwm_within_wqm(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %c, i32 %z, float %data) { 2736; GFX9-W64-LABEL: test_strict_wwm_within_wqm: 2737; GFX9-W64: ; %bb.0: ; %main_body 2738; GFX9-W64-NEXT: s_mov_b64 s[12:13], exec 2739; GFX9-W64-NEXT: s_wqm_b64 exec, exec 2740; GFX9-W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 2741; GFX9-W64-NEXT: v_mov_b32_e32 v1, 0 2742; GFX9-W64-NEXT: s_and_saveexec_b64 s[14:15], vcc 2743; GFX9-W64-NEXT: s_cbranch_execz .LBB45_2 2744; GFX9-W64-NEXT: ; %bb.1: ; %IF 2745; GFX9-W64-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 2746; GFX9-W64-NEXT: s_waitcnt vmcnt(0) 2747; GFX9-W64-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 2748; GFX9-W64-NEXT: s_waitcnt vmcnt(0) 2749; GFX9-W64-NEXT: v_cvt_i32_f32_e32 v0, v0 2750; GFX9-W64-NEXT: v_mov_b32_e32 v2, v0 2751; GFX9-W64-NEXT: s_not_b64 exec, exec 2752; GFX9-W64-NEXT: v_mov_b32_e32 v2, 0 2753; GFX9-W64-NEXT: s_not_b64 exec, exec 2754; GFX9-W64-NEXT: s_or_saveexec_b64 s[0:1], -1 2755; GFX9-W64-NEXT: ds_swizzle_b32 v2, v2 offset:swizzle(SWAP,2) 2756; GFX9-W64-NEXT: s_mov_b64 exec, s[0:1] 2757; GFX9-W64-NEXT: s_waitcnt lgkmcnt(0) 2758; GFX9-W64-NEXT: v_mov_b32_e32 v0, v2 2759; GFX9-W64-NEXT: v_cvt_f32_i32_e32 v1, v0 2760; GFX9-W64-NEXT: .LBB45_2: ; %ENDIF 2761; GFX9-W64-NEXT: s_or_b64 exec, exec, s[14:15] 2762; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13] 2763; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1 2764; GFX9-W64-NEXT: ; return to shader part epilog 2765; 
2766; GFX10-W32-LABEL: test_strict_wwm_within_wqm: 2767; GFX10-W32: ; %bb.0: ; %main_body 2768; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo 2769; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo 2770; GFX10-W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 2771; GFX10-W32-NEXT: v_mov_b32_e32 v1, 0 2772; GFX10-W32-NEXT: s_and_saveexec_b32 s13, vcc_lo 2773; GFX10-W32-NEXT: s_cbranch_execz .LBB45_2 2774; GFX10-W32-NEXT: ; %bb.1: ; %IF 2775; GFX10-W32-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D 2776; GFX10-W32-NEXT: s_waitcnt vmcnt(0) 2777; GFX10-W32-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D 2778; GFX10-W32-NEXT: s_waitcnt vmcnt(0) 2779; GFX10-W32-NEXT: v_cvt_i32_f32_e32 v0, v0 2780; GFX10-W32-NEXT: v_mov_b32_e32 v2, v0 2781; GFX10-W32-NEXT: s_not_b32 exec_lo, exec_lo 2782; GFX10-W32-NEXT: v_mov_b32_e32 v2, 0 2783; GFX10-W32-NEXT: s_not_b32 exec_lo, exec_lo 2784; GFX10-W32-NEXT: s_or_saveexec_b32 s0, -1 2785; GFX10-W32-NEXT: ds_swizzle_b32 v2, v2 offset:swizzle(SWAP,2) 2786; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0 2787; GFX10-W32-NEXT: s_waitcnt lgkmcnt(0) 2788; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2 2789; GFX10-W32-NEXT: v_cvt_f32_i32_e32 v1, v0 2790; GFX10-W32-NEXT: .LBB45_2: ; %ENDIF 2791; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s13 2792; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12 2793; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1 2794; GFX10-W32-NEXT: ; return to shader part epilog 2795main_body: 2796 %cmp = icmp eq i32 %z, 0 2797 br i1 %cmp, label %IF, label %ENDIF 2798 2799IF: 2800 %c.bc = bitcast i32 %c to float 2801 %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0 2802 %tex0 = extractelement <4 x float> %tex, i32 0 2803 %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0 2804 %dataf = extractelement <4 x float> %dtex, i32 0 2805 %data1 
= fptosi float %dataf to i32
  %data2 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %data1, i32 0)
  %data3 = call i32 @llvm.amdgcn.ds.swizzle(i32 %data2, i32 2079)
  %data4 = call i32 @llvm.amdgcn.strict.wwm.i32(i32 %data3)
  %data4f = sitofp i32 %data4 to float
  br label %ENDIF

ENDIF:
  %r = phi float [ %data4f, %IF ], [ 0.0, %main_body ]
  ret float %r
}

; Check a case of a block being entirely WQM except for a bit of STRICT WQM.
; In the IF block the value handed to llvm.amdgcn.strict.wqm is produced by two
; chained image samples, an fptosi and a ds.swizzle.
; NOTE(review): the %data argument is not referenced anywhere in the body.
define amdgpu_ps float @test_strict_wqm_within_wqm(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %c, i32 %z, float %data) {
; GFX9-W64-LABEL: test_strict_wqm_within_wqm:
; GFX9-W64:       ; %bb.0: ; %main_body
; GFX9-W64-NEXT:    s_mov_b64 s[12:13], exec
; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
; GFX9-W64-NEXT:    s_mov_b64 s[14:15], exec
; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
; GFX9-W64-NEXT:    v_mov_b32_e32 v2, v0
; GFX9-W64-NEXT:    s_mov_b64 exec, s[14:15]
; GFX9-W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
; GFX9-W64-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-W64-NEXT:    s_and_saveexec_b64 s[14:15], vcc
; GFX9-W64-NEXT:    s_cbranch_execz .LBB46_2
; GFX9-W64-NEXT:  ; %bb.1: ; %IF
; GFX9-W64-NEXT:    s_mov_b64 s[16:17], exec
; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
; GFX9-W64-NEXT:    image_sample v2, v2, s[0:7], s[8:11] dmask:0x1
; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
; GFX9-W64-NEXT:    image_sample v2, v2, s[0:7], s[8:11] dmask:0x1
; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
; GFX9-W64-NEXT:    v_cvt_i32_f32_e32 v2, v2
; GFX9-W64-NEXT:    ds_swizzle_b32 v2, v2 offset:swizzle(SWAP,2)
; GFX9-W64-NEXT:    s_mov_b64 exec, s[16:17]
; GFX9-W64-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v2
; GFX9-W64-NEXT:    v_cvt_f32_i32_e32 v0, v0
; GFX9-W64-NEXT:  .LBB46_2: ; %ENDIF
; GFX9-W64-NEXT:    s_or_b64 exec, exec, s[14:15]
; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[12:13]
; GFX9-W64-NEXT:    ; return to shader part epilog
;
; GFX10-W32-LABEL: test_strict_wqm_within_wqm:
; GFX10-W32:       ; %bb.0: ; %main_body
; GFX10-W32-NEXT:    s_mov_b32 s12, exec_lo
; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT:    s_mov_b32 s13, exec_lo
; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT:    v_mov_b32_e32 v2, v0
; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s13
; GFX10-W32-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-W32-NEXT:    s_mov_b32 s13, exec_lo
; GFX10-W32-NEXT:    v_cmpx_eq_u32_e32 0, v1
; GFX10-W32-NEXT:    s_cbranch_execz .LBB46_2
; GFX10-W32-NEXT:  ; %bb.1: ; %IF
; GFX10-W32-NEXT:    s_mov_b32 s14, exec_lo
; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT:    image_sample v2, v2, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
; GFX10-W32-NEXT:    image_sample v2, v2, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
; GFX10-W32-NEXT:    v_cvt_i32_f32_e32 v2, v2
; GFX10-W32-NEXT:    ds_swizzle_b32 v2, v2 offset:swizzle(SWAP,2)
; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s14
; GFX10-W32-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v2
; GFX10-W32-NEXT:    v_cvt_f32_i32_e32 v0, v0
; GFX10-W32-NEXT:  .LBB46_2: ; %ENDIF
; GFX10-W32-NEXT:    s_or_b32 exec_lo, exec_lo, s13
; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s12
; GFX10-W32-NEXT:    ; return to shader part epilog
main_body:
  ; Only enter IF when %z is zero; otherwise the function returns 0.0.
  %cmp = icmp eq i32 %z, 0
  br i1 %cmp, label %IF, label %ENDIF

IF:
  %c.bc = bitcast i32 %c to float
  ; Two chained samples: component 0 of the first result is the coordinate of
  ; the second one.
  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
  %tex0 = extractelement <4 x float> %tex, i32 0
  %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
  %dataf = extractelement <4 x float> %dtex, i32 0
  %data1 = fptosi float %dataf to i32
  ; Swizzle pattern 2079 is printed as swizzle(SWAP,2) in the checked assembly.
  %data2 = call i32 @llvm.amdgcn.ds.swizzle(i32 %data1, i32 2079)
  ; The swizzled value is the one marked as strict WQM.
  %data3 = call i32 @llvm.amdgcn.strict.wqm.i32(i32 %data2)
  %data3f = sitofp i32 %data3 to float
  br label %ENDIF

ENDIF:
  ; 0.0 when IF was skipped, otherwise the converted strict.wqm result.
  %r = phi float [ %data3f, %IF ], [ 0.0, %main_body ]
  ret float %r
}

; A single block that contains, in order, a strict WQM value, a strict WWM
; value and an image sample whose coordinate depends on both.
; TODO: StrictWQM -> WQM transition could be improved. WQM could use the exec from the previous state instead of calling s_wqm again.
define amdgpu_ps float @test_strict_wqm_strict_wwm_wqm(i32 inreg %idx0, i32 inreg %idx1, <4 x i32> inreg %res, <4 x i32> inreg %res2, float %inp, <8 x i32> inreg %res3) {
; GFX9-W64-LABEL: test_strict_wqm_strict_wwm_wqm:
; GFX9-W64:       ; %bb.0: ; %main_body
; GFX9-W64-NEXT:    s_mov_b64 s[28:29], exec
; GFX9-W64-NEXT:    s_mov_b32 s19, s17
; GFX9-W64-NEXT:    s_mov_b64 s[30:31], exec
; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
; GFX9-W64-NEXT:    s_mov_b32 s23, s5
; GFX9-W64-NEXT:    s_mov_b32 s22, s4
; GFX9-W64-NEXT:    s_mov_b32 s21, s3
; GFX9-W64-NEXT:    s_mov_b32 s20, s2
; GFX9-W64-NEXT:    s_mov_b32 s27, s9
; GFX9-W64-NEXT:    s_mov_b32 s26, s8
; GFX9-W64-NEXT:    s_mov_b32 s25, s7
; GFX9-W64-NEXT:    s_mov_b32 s24, s6
; GFX9-W64-NEXT:    s_mov_b32 s18, s16
; GFX9-W64-NEXT:    s_mov_b32 s17, s15
; GFX9-W64-NEXT:    s_mov_b32 s16, s14
; GFX9-W64-NEXT:    s_mov_b32 s15, s13
; GFX9-W64-NEXT:    s_mov_b32 s14, s12
; GFX9-W64-NEXT:    s_mov_b32 s13, s11
; GFX9-W64-NEXT:    s_mov_b32 s12, s10
; GFX9-W64-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-W64-NEXT:    s_mov_b64 exec, s[30:31]
; GFX9-W64-NEXT:    buffer_store_dword v0, v1, s[20:23], 0 idxen
; GFX9-W64-NEXT:    s_mov_b64 s[2:3], exec
; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
; GFX9-W64-NEXT:    buffer_load_dword v2, v1, s[20:23], 0 idxen
; GFX9-W64-NEXT:    s_mov_b64 exec, s[2:3]
; GFX9-W64-NEXT:    s_or_saveexec_b64 s[2:3], -1
; GFX9-W64-NEXT:    v_mov_b32_e32 v3, s0
; GFX9-W64-NEXT:    buffer_load_dword v3, v3, s[24:27], 0 idxen
; GFX9-W64-NEXT:    s_mov_b64 exec, s[2:3]
; GFX9-W64-NEXT:    s_mov_b64 s[0:1], exec
; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
; GFX9-W64-NEXT:    s_waitcnt vmcnt(1)
; GFX9-W64-NEXT:    v_add_f32_e32 v2, v2, v2
; GFX9-W64-NEXT:    s_mov_b64 exec, s[0:1]
; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v2
; GFX9-W64-NEXT:    v_add_f32_e32 v0, v0, v0
; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
; GFX9-W64-NEXT:    v_mov_b32_e32 v4, v3
; GFX9-W64-NEXT:    v_add_f32_e32 v0, v0, v4
; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[28:29]
; GFX9-W64-NEXT:    image_sample v0, v0, s[12:19], s[20:23] dmask:0x1
; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
; GFX9-W64-NEXT:    buffer_store_dword v0, v1, s[20:23], 0 idxen
; GFX9-W64-NEXT:    buffer_load_dword v0, v1, s[20:23], 0 idxen
; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
; GFX9-W64-NEXT:    ; return to shader part epilog
;
; GFX10-W32-LABEL: test_strict_wqm_strict_wwm_wqm:
; GFX10-W32:       ; %bb.0: ; %main_body
; GFX10-W32-NEXT:    s_mov_b32 s28, exec_lo
; GFX10-W32-NEXT:    s_mov_b32 s19, s17
; GFX10-W32-NEXT:    s_mov_b32 s29, exec_lo
; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT:    s_mov_b32 s23, s5
; GFX10-W32-NEXT:    s_mov_b32 s22, s4
; GFX10-W32-NEXT:    s_mov_b32 s21, s3
; GFX10-W32-NEXT:    s_mov_b32 s20, s2
; GFX10-W32-NEXT:    s_mov_b32 s27, s9
; GFX10-W32-NEXT:    s_mov_b32 s26, s8
; GFX10-W32-NEXT:    s_mov_b32 s25, s7
; GFX10-W32-NEXT:    s_mov_b32 s24, s6
; GFX10-W32-NEXT:    s_mov_b32 s18, s16
; GFX10-W32-NEXT:    s_mov_b32 s17, s15
; GFX10-W32-NEXT:    s_mov_b32 s16, s14
; GFX10-W32-NEXT:    s_mov_b32 s15, s13
; GFX10-W32-NEXT:    s_mov_b32 s14, s12
; GFX10-W32-NEXT:    s_mov_b32 s13, s11
; GFX10-W32-NEXT:    s_mov_b32 s12, s10
; GFX10-W32-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s29
; GFX10-W32-NEXT:    buffer_store_dword v0, v1, s[20:23], 0 idxen
; GFX10-W32-NEXT:    s_mov_b32 s1, exec_lo
; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT:    buffer_load_dword v2, v1, s[20:23], 0 idxen
; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-W32-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-W32-NEXT:    v_mov_b32_e32 v3, s0
; GFX10-W32-NEXT:    buffer_load_dword v3, v3, s[24:27], 0 idxen
; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-W32-NEXT:    s_mov_b32 s0, exec_lo
; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT:    s_waitcnt vmcnt(1)
; GFX10-W32-NEXT:    v_add_f32_e32 v2, v2, v2
; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s0
; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v2
; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
; GFX10-W32-NEXT:    v_mov_b32_e32 v4, v3
; GFX10-W32-NEXT:    v_add_f32_e32 v0, v0, v0
; GFX10-W32-NEXT:    v_add_f32_e32 v0, v0, v4
; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s28
; GFX10-W32-NEXT:    image_sample v0, v0, s[12:19], s[20:23] dmask:0x1 dim:SQ_RSRC_IMG_1D
; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
; GFX10-W32-NEXT:    buffer_store_dword v0, v1, s[20:23], 0 idxen
; GFX10-W32-NEXT:    buffer_load_dword v0, v1, s[20:23], 0 idxen
; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
; GFX10-W32-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-W32-NEXT:    ; return to shader part epilog
main_body:
  call void @llvm.amdgcn.struct.buffer.store.f32(float %inp, <4 x i32> %res, i32 %idx1, i32 0, i32 0, i32 0)
  ; First strict section: the doubled reload from %res is tagged strict WQM.
  %reload = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %res, i32 %idx1, i32 0, i32 0, i32 0)
  %temp = fadd float %reload, %reload
  %temp2 = call float @llvm.amdgcn.strict.wqm.f32(float %temp)
  %temp3 = fadd float %temp2, %temp2
  ; Second strict section: a load from %res2 is tagged strict WWM.
  %reload_wwm = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %res2, i32 %idx0, i32 0, i32 0, i32 0)
  %temp4 = call float @llvm.amdgcn.strict.wwm.f32(float %reload_wwm)
  ; The sum of both strict results is the coordinate of the image sample.
  %temp5 = fadd float %temp3, %temp4
  %tex = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 1, float %temp5, <8 x i32> %res3, <4 x i32> %res, i1 false, i32 0, i32 0)
  ; Store the sample result and read it back as the return value.
  call void @llvm.amdgcn.struct.buffer.store.f32(float %tex, <4 x i32> %res, i32 %idx1, i32 0, i32 0, i32 0)
  %out = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %res, i32 %idx1, i32 0, i32 0, i32 0)
  ret float %out
}

; Same shape as test_strict_wqm_strict_wwm_wqm with the strict modes swapped:
; a strict WWM value first, then a strict WQM value, then an image sample whose
; coordinate depends on both.
define amdgpu_ps float @test_strict_wwm_strict_wqm_wqm(i32 inreg %idx0, i32 inreg %idx1, <4 x i32> inreg %res, float %inp, <8 x i32> inreg %res2) {
; GFX9-W64-LABEL: test_strict_wwm_strict_wqm_wqm:
; GFX9-W64:       ; %bb.0: ; %main_body
; GFX9-W64-NEXT:    s_mov_b64 s[20:21], exec
; GFX9-W64-NEXT:    s_mov_b32 s15, s13
; GFX9-W64-NEXT:    s_mov_b64 s[22:23], exec
; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
; GFX9-W64-NEXT:    s_mov_b32 s19, s5
; GFX9-W64-NEXT:    s_mov_b32 s18, s4
; GFX9-W64-NEXT:    s_mov_b32 s17, s3
; GFX9-W64-NEXT:    s_mov_b32 s16, s2
; GFX9-W64-NEXT:    s_mov_b32 s14, s12
; GFX9-W64-NEXT:    s_mov_b32 s13, s11
; GFX9-W64-NEXT:    s_mov_b32 s12, s10
; GFX9-W64-NEXT:    s_mov_b32 s11, s9
; GFX9-W64-NEXT:    s_mov_b32 s10, s8
; GFX9-W64-NEXT:    s_mov_b32 s9, s7
; GFX9-W64-NEXT:    s_mov_b32 s8, s6
; GFX9-W64-NEXT:    v_mov_b32_e32 v1, s0
; GFX9-W64-NEXT:    s_mov_b64 exec, s[22:23]
; GFX9-W64-NEXT:    buffer_store_dword v0, v1, s[16:19], 0 idxen
; GFX9-W64-NEXT:    s_or_saveexec_b64 s[2:3], -1
; GFX9-W64-NEXT:    v_mov_b32_e32 v2, s1
; GFX9-W64-NEXT:    buffer_load_dword v2, v2, s[16:19], 0 idxen
; GFX9-W64-NEXT:    s_mov_b64 exec, s[2:3]
; GFX9-W64-NEXT:    s_mov_b64 s[0:1], exec
; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
; GFX9-W64-NEXT:    buffer_load_dword v3, v1, s[16:19], 0 idxen
; GFX9-W64-NEXT:    s_mov_b64 exec, s[0:1]
; GFX9-W64-NEXT:    s_or_saveexec_b64 s[0:1], -1
; GFX9-W64-NEXT:    s_waitcnt vmcnt(1)
; GFX9-W64-NEXT:    v_add_f32_e32 v2, v2, v2
; GFX9-W64-NEXT:    s_mov_b64 exec, s[0:1]
; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v2
; GFX9-W64-NEXT:    v_add_f32_e32 v0, v0, v0
; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
; GFX9-W64-NEXT:    v_mov_b32_e32 v4, v3
; GFX9-W64-NEXT:    v_add_f32_e32 v0, v0, v4
; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[20:21]
; GFX9-W64-NEXT:    image_sample v0, v0, s[8:15], s[16:19] dmask:0x1
; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
; GFX9-W64-NEXT:    buffer_store_dword v0, v1, s[16:19], 0 idxen
; GFX9-W64-NEXT:    buffer_load_dword v0, v1, s[16:19], 0 idxen
; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
; GFX9-W64-NEXT:    ; return to shader part epilog
;
; GFX10-W32-LABEL: test_strict_wwm_strict_wqm_wqm:
; GFX10-W32:       ; %bb.0: ; %main_body
; GFX10-W32-NEXT:    s_mov_b32 s20, exec_lo
; GFX10-W32-NEXT:    s_mov_b32 s15, s13
; GFX10-W32-NEXT:    s_mov_b32 s21, exec_lo
; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT:    s_mov_b32 s19, s5
; GFX10-W32-NEXT:    s_mov_b32 s18, s4
; GFX10-W32-NEXT:    s_mov_b32 s17, s3
; GFX10-W32-NEXT:    s_mov_b32 s16, s2
; GFX10-W32-NEXT:    s_mov_b32 s14, s12
; GFX10-W32-NEXT:    s_mov_b32 s13, s11
; GFX10-W32-NEXT:    s_mov_b32 s12, s10
; GFX10-W32-NEXT:    s_mov_b32 s11, s9
; GFX10-W32-NEXT:    s_mov_b32 s10, s8
; GFX10-W32-NEXT:    s_mov_b32 s9, s7
; GFX10-W32-NEXT:    s_mov_b32 s8, s6
; GFX10-W32-NEXT:    v_mov_b32_e32 v1, s0
; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s21
; GFX10-W32-NEXT:    s_or_saveexec_b32 s0, -1
; GFX10-W32-NEXT:    v_mov_b32_e32 v2, s1
; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s0
; GFX10-W32-NEXT:    buffer_store_dword v0, v1, s[16:19], 0 idxen
; GFX10-W32-NEXT:    s_or_saveexec_b32 s0, -1
; GFX10-W32-NEXT:    buffer_load_dword v2, v2, s[16:19], 0 idxen
; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s0
; GFX10-W32-NEXT:    s_mov_b32 s0, exec_lo
; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT:    buffer_load_dword v3, v1, s[16:19], 0 idxen
; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s0
; GFX10-W32-NEXT:    s_or_saveexec_b32 s0, -1
; GFX10-W32-NEXT:    s_waitcnt vmcnt(1)
; GFX10-W32-NEXT:    v_add_f32_e32 v2, v2, v2
; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s0
; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v2
; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
; GFX10-W32-NEXT:    v_mov_b32_e32 v4, v3
; GFX10-W32-NEXT:    v_add_f32_e32 v0, v0, v0
; GFX10-W32-NEXT:    v_add_f32_e32 v0, v0, v4
; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s20
; GFX10-W32-NEXT:    image_sample v0, v0, s[8:15], s[16:19] dmask:0x1 dim:SQ_RSRC_IMG_1D
; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
; GFX10-W32-NEXT:    buffer_store_dword v0, v1, s[16:19], 0 idxen
; GFX10-W32-NEXT:    buffer_load_dword v0, v1, s[16:19], 0 idxen
; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
; GFX10-W32-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-W32-NEXT:    ; return to shader part epilog
main_body:
  call void @llvm.amdgcn.struct.buffer.store.f32(float %inp, <4 x i32> %res, i32 %idx0, i32 0, i32 0, i32 0)
  ; First strict section: the doubled reload at %idx1 is tagged strict WWM.
  %reload = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %res, i32 %idx1, i32 0, i32 0, i32 0)
  %temp = fadd float %reload, %reload
  %temp2 = call float @llvm.amdgcn.strict.wwm.f32(float %temp)
  %temp3 = fadd float %temp2, %temp2
  ; Second strict section: the reload at %idx0 is tagged strict WQM.
  ; NOTE(review): despite its name, %reload_wwm feeds strict.wqm in this test.
  %reload_wwm = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %res, i32 %idx0, i32 0, i32 0, i32 0)
  %temp4 = call float @llvm.amdgcn.strict.wqm.f32(float %reload_wwm)
  ; The sum of both strict results is the coordinate of the image sample.
  %temp5 = fadd float %temp3, %temp4
  %tex = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 1, float %temp5, <8 x i32> %res2, <4 x i32> %res, i1 false, i32 0, i32 0)
  ; Store the sample result and read it back as the return value.
  call void @llvm.amdgcn.struct.buffer.store.f32(float %tex, <4 x i32> %res, i32 %idx0, i32 0, i32 0, i32 0)
  %out = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %res, i32 %idx0, i32 0, i32 0, i32 0)
  ret float %out
}

; TODO: WQM -> StrictWQM transition could be improved. StrictWQM could use the exec from the previous state instead of calling s_wqm again.
; A single block with an image sample, then a value tagged strict WQM, then a
; second image sample whose coordinate depends on both.
define amdgpu_ps float @test_wqm_strict_wqm_wqm(i32 inreg %idx0, i32 inreg %idx1, <4 x i32> inreg %res, float %inp, <8 x i32> inreg %res2) {
; GFX9-W64-LABEL: test_wqm_strict_wqm_wqm:
; GFX9-W64:       ; %bb.0: ; %main_body
; GFX9-W64-NEXT:    s_mov_b64 s[20:21], exec
; GFX9-W64-NEXT:    s_mov_b32 s15, s13
; GFX9-W64-NEXT:    s_mov_b64 s[22:23], exec
; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
; GFX9-W64-NEXT:    s_mov_b32 s19, s5
; GFX9-W64-NEXT:    s_mov_b32 s18, s4
; GFX9-W64-NEXT:    s_mov_b32 s17, s3
; GFX9-W64-NEXT:    s_mov_b32 s16, s2
; GFX9-W64-NEXT:    s_mov_b32 s14, s12
; GFX9-W64-NEXT:    s_mov_b32 s13, s11
; GFX9-W64-NEXT:    s_mov_b32 s12, s10
; GFX9-W64-NEXT:    s_mov_b32 s11, s9
; GFX9-W64-NEXT:    s_mov_b32 s10, s8
; GFX9-W64-NEXT:    s_mov_b32 s9, s7
; GFX9-W64-NEXT:    s_mov_b32 s8, s6
; GFX9-W64-NEXT:    v_mov_b32_e32 v1, s0
; GFX9-W64-NEXT:    s_mov_b64 exec, s[22:23]
; GFX9-W64-NEXT:    buffer_store_dword v0, v1, s[16:19], 0 idxen
; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
; GFX9-W64-NEXT:    v_mov_b32_e32 v0, s1
; GFX9-W64-NEXT:    buffer_load_dword v0, v0, s[16:19], 0 idxen
; GFX9-W64-NEXT:    s_mov_b64 s[0:1], exec
; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
; GFX9-W64-NEXT:    buffer_load_dword v2, v1, s[16:19], 0 idxen
; GFX9-W64-NEXT:    s_mov_b64 exec, s[0:1]
; GFX9-W64-NEXT:    s_waitcnt vmcnt(1)
; GFX9-W64-NEXT:    v_add_f32_e32 v0, v0, v0
; GFX9-W64-NEXT:    image_sample v0, v0, s[8:15], s[16:19] dmask:0x1
; GFX9-W64-NEXT:    s_waitcnt vmcnt(1)
; GFX9-W64-NEXT:    v_mov_b32_e32 v3, v2
; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
; GFX9-W64-NEXT:    v_add_f32_e32 v0, v0, v0
; GFX9-W64-NEXT:    v_add_f32_e32 v0, v0, v3
; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[20:21]
; GFX9-W64-NEXT:    image_sample v0, v0, s[8:15], s[16:19] dmask:0x1
; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
; GFX9-W64-NEXT:    buffer_store_dword v0, v1, s[16:19], 0 idxen
; GFX9-W64-NEXT:    buffer_load_dword v0, v1, s[16:19], 0 idxen
; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
; GFX9-W64-NEXT:    ; return to shader part epilog
;
; GFX10-W32-LABEL: test_wqm_strict_wqm_wqm:
; GFX10-W32:       ; %bb.0: ; %main_body
; GFX10-W32-NEXT:    s_mov_b32 s20, exec_lo
; GFX10-W32-NEXT:    s_mov_b32 s15, s13
; GFX10-W32-NEXT:    s_mov_b32 s21, exec_lo
; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT:    s_mov_b32 s19, s5
; GFX10-W32-NEXT:    s_mov_b32 s18, s4
; GFX10-W32-NEXT:    s_mov_b32 s17, s3
; GFX10-W32-NEXT:    s_mov_b32 s16, s2
; GFX10-W32-NEXT:    s_mov_b32 s14, s12
; GFX10-W32-NEXT:    s_mov_b32 s13, s11
; GFX10-W32-NEXT:    s_mov_b32 s12, s10
; GFX10-W32-NEXT:    s_mov_b32 s11, s9
; GFX10-W32-NEXT:    s_mov_b32 s10, s8
; GFX10-W32-NEXT:    s_mov_b32 s9, s7
; GFX10-W32-NEXT:    s_mov_b32 s8, s6
; GFX10-W32-NEXT:    v_mov_b32_e32 v1, s0
; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s21
; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT:    v_mov_b32_e32 v3, s1
; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s20
; GFX10-W32-NEXT:    buffer_store_dword v0, v1, s[16:19], 0 idxen
; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT:    buffer_load_dword v0, v3, s[16:19], 0 idxen
; GFX10-W32-NEXT:    s_mov_b32 s0, exec_lo
; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT:    buffer_load_dword v2, v1, s[16:19], 0 idxen
; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s0
; GFX10-W32-NEXT:    s_waitcnt vmcnt(1)
; GFX10-W32-NEXT:    v_add_f32_e32 v0, v0, v0
; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
; GFX10-W32-NEXT:    v_mov_b32_e32 v3, v2
; GFX10-W32-NEXT:    image_sample v0, v0, s[8:15], s[16:19] dmask:0x1 dim:SQ_RSRC_IMG_1D
; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
; GFX10-W32-NEXT:    v_add_f32_e32 v0, v0, v0
; GFX10-W32-NEXT:    v_add_f32_e32 v0, v0, v3
; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s20
; GFX10-W32-NEXT:    image_sample v0, v0, s[8:15], s[16:19] dmask:0x1 dim:SQ_RSRC_IMG_1D
; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
; GFX10-W32-NEXT:    buffer_store_dword v0, v1, s[16:19], 0 idxen
; GFX10-W32-NEXT:    buffer_load_dword v0, v1, s[16:19], 0 idxen
; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
; GFX10-W32-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-W32-NEXT:    ; return to shader part epilog
main_body:
  call void @llvm.amdgcn.struct.buffer.store.f32(float %inp, <4 x i32> %res, i32 %idx0, i32 0, i32 0, i32 0)
  ; First sample: its coordinate is the doubled reload at %idx1.
  %reload = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %res, i32 %idx1, i32 0, i32 0, i32 0)
  %temp = fadd float %reload, %reload
  %tex = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 1, float %temp, <8 x i32> %res2, <4 x i32> %res, i1 false, i32 0, i32 0)
  %temp2 = fadd float %tex, %tex
  ; Strict section: the reload at %idx0 is tagged strict WQM.
  ; NOTE(review): despite its name, %reload_wwm feeds strict.wqm in this test.
  %reload_wwm = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %res, i32 %idx0, i32 0, i32 0, i32 0)
  %temp3 = call float @llvm.amdgcn.strict.wqm.f32(float %reload_wwm)
  ; Second sample: its coordinate combines the first sample and the strict value.
  %temp4 = fadd float %temp2, %temp3
  %tex2 = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 1, float %temp4, <8 x i32> %res2, <4 x i32> %res, i1 false, i32 0, i32 0)
  ; Store the sample result and read it back as the return value.
  call void @llvm.amdgcn.struct.buffer.store.f32(float %tex2, <4 x i32> %res, i32 %idx0, i32 0, i32 0, i32 0)
  %out = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %res, i32 %idx0, i32 0, i32 0, i32 0)
  ret float %out
}

; Check that the correct VCC register is selected. The WQM pass incorrectly used
; VCC (instead of VCC_LO) for vector comparisons in Wave32 mode.
define amdgpu_ps void @test_for_deactivating_lanes_in_wave32(float addrspace(6)* inreg %0) {
; GFX9-W64-LABEL: test_for_deactivating_lanes_in_wave32:
; GFX9-W64:       ; %bb.0: ; %main_body
; GFX9-W64-NEXT:    s_mov_b32 s3, 0x31016fac
; GFX9-W64-NEXT:    s_mov_b32 s2, 32
; GFX9-W64-NEXT:    s_mov_b32 s1, 0x8000
; GFX9-W64-NEXT:    s_buffer_load_dword s0, s[0:3], 0x0
; GFX9-W64-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-W64-NEXT:    v_cmp_le_f32_e64 vcc, s0, 0
; GFX9-W64-NEXT:    s_andn2_b64 s[4:5], exec, vcc
; GFX9-W64-NEXT:    s_cbranch_scc0 .LBB50_1
; GFX9-W64-NEXT:    s_endpgm
; GFX9-W64-NEXT:  .LBB50_1:
; GFX9-W64-NEXT:    s_mov_b64 exec, 0
; GFX9-W64-NEXT:    exp null off, off, off, off done vm
; GFX9-W64-NEXT:    s_endpgm
;
; GFX10-W32-LABEL: test_for_deactivating_lanes_in_wave32:
; GFX10-W32:       ; %bb.0: ; %main_body
; GFX10-W32-NEXT:    s_mov_b32 s3, 0x31016fac
; GFX10-W32-NEXT:    s_mov_b32 s2, 32
; GFX10-W32-NEXT:    s_mov_b32 s1, 0x8000
; GFX10-W32-NEXT:    s_buffer_load_dword s0, s[0:3], 0x0
; GFX10-W32-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-W32-NEXT:    v_cmp_le_f32_e64 vcc_lo, s0, 0
; GFX10-W32-NEXT:    s_andn2_b32 s4, exec_lo, vcc_lo
; GFX10-W32-NEXT:    s_cbranch_scc0 .LBB50_1
; GFX10-W32-NEXT:    s_endpgm
; GFX10-W32-NEXT:  .LBB50_1:
; GFX10-W32-NEXT:    s_mov_b32 exec_lo, 0
; GFX10-W32-NEXT:    exp null off, off, off, off done vm
; GFX10-W32-NEXT:    s_endpgm
main_body:
  ; Build the <4 x i32> descriptor for the scalar buffer load: element 0 is the
  ; incoming pointer, elements 1-3 are the constants 32768 (0x8000), 32 and
  ; 822177708 (0x31016fac) that appear as s_mov_b32 immediates in the checks.
  %1 = ptrtoint float addrspace(6)* %0 to i32
  %2 = insertelement <4 x i32> <i32 poison, i32 32768, i32 32, i32 822177708>, i32 %1, i32 0
  %3 = call nsz arcp float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %2, i32 0, i32 0) #3
  ; The kill condition is "unordered or greater than 0.0". The checked assembly
  ; shows the inverse compare (v_cmp_le_f32) writing vcc on the wave64 target and
  ; vcc_lo on the wave32 target, followed by an and-not against exec.
  %4 = fcmp nsz arcp ugt float %3, 0.000000e+00
  call void @llvm.amdgcn.kill(i1 %4) #1
  ret void
}

; Intrinsic declarations used by the tests in this file.
declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #1
declare void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float>, i32, i32, <8 x i32>, i32, i32) #1

; Buffer store/load intrinsics.
; NOTE(review): the four buffer *store* declarations reference #2, which is
; defined below as { nounwind readonly } - confirm this is intended for
; intrinsics that write memory.
declare void @llvm.amdgcn.struct.buffer.store.f32(float, <4 x i32>, i32, i32, i32, i32 immarg) #2
declare void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32, i32 immarg) #2
declare void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32 immarg) #2
declare void @llvm.amdgcn.raw.buffer.store.f32(float, <4 x i32>, i32, i32, i32 immarg) #2
declare float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32>, i32, i32, i32) #3
declare float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32>, i32, i32, i32, i32) #3

; Image, kill, WQM/WWM (plain and strict), lane, export and interpolation
; intrinsics.
declare <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32, i32, <8 x i32>, i32, i32) #3
declare <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #3
declare <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #3
declare float @llvm.amdgcn.image.sample.1d.f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #3
declare void @llvm.amdgcn.kill(i1) #1
declare float @llvm.amdgcn.wqm.f32(float) #3
declare i32 @llvm.amdgcn.wqm.i32(i32) #3
declare float @llvm.amdgcn.strict.wwm.f32(float) #3
declare i32 @llvm.amdgcn.strict.wwm.i32(i32) #3
declare float @llvm.amdgcn.wwm.f32(float) #3
declare i32 @llvm.amdgcn.wwm.i32(i32) #3
declare float @llvm.amdgcn.strict.wqm.f32(float) #3
declare i32 @llvm.amdgcn.strict.wqm.i32(i32) #3
declare i32 @llvm.amdgcn.set.inactive.i32(i32, i32) #4
declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #3
declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32) #3
declare <2 x half> @llvm.amdgcn.cvt.pkrtz(float, float) #3
declare void @llvm.amdgcn.exp.compr.v2f16(i32, i32, <2 x half>, <2 x half>, i1, i1) #1
declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32) #2
declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) #2
declare i32 @llvm.amdgcn.ds.swizzle(i32, i32)
declare float @llvm.amdgcn.s.buffer.load.f32(<4 x i32>, i32, i32 immarg) #7

; Attribute groups referenced by the declarations, definitions and call sites.
; NOTE(review): some image.sample call sites in this file carry #0, but no
; "attributes #0" group is defined in this list - confirm that is intentional.
attributes #1 = { nounwind }
attributes #2 = { nounwind readonly }
attributes #3 = { nounwind readnone }
attributes #4 = { nounwind readnone convergent }
attributes #5 = { "amdgpu-ps-wqm-outputs" }
attributes #6 = { nounwind "InitialPSInputAddr"="2" }
attributes #7 = { nounwind readnone willreturn }