;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s --check-prefix=CHECK --check-prefix=SI
;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefix=CHECK --check-prefix=VI

; Check that WQM isn't triggered by image load/store intrinsics.
;
;CHECK-LABEL: {{^}}test1:
;CHECK-NOT: s_wqm
define amdgpu_ps <4 x float> @test1(<8 x i32> inreg %rsrc, <4 x i32> %c) {
main_body:
  %tex = call <4 x float> @llvm.amdgcn.image.load.v4f32.v4i32.v8i32(<4 x i32> %c, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0)
  call void @llvm.amdgcn.image.store.v4f32.v4i32.v8i32(<4 x float> %tex, <4 x i32> %c, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0)
  ret <4 x float> %tex
}

; Check that WQM is triggered by image samples and left untouched for loads...
;
;CHECK-LABEL: {{^}}test2:
;CHECK-NEXT: ; %main_body
;CHECK-NEXT: s_wqm_b64 exec, exec
;CHECK-NOT: exec
define amdgpu_ps void @test2(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <4 x i32> %c) {
main_body:
  %c.1 = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %c, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
  %c.2 = bitcast <4 x float> %c.1 to <4 x i32>
  %c.3 = extractelement <4 x i32> %c.2, i32 0
  %gep = getelementptr float, float addrspace(1)* %ptr, i32 %c.3
  %data = load float, float addrspace(1)* %gep

  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %data, float undef, float undef, float undef)

  ret void
}

; ... but disabled for stores (and, in this simple case, not re-enabled).
;
;CHECK-LABEL: {{^}}test3:
;CHECK-NEXT: ; %main_body
;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
;CHECK-NEXT: s_wqm_b64 exec, exec
;CHECK: s_and_b64 exec, exec, [[ORIG]]
;CHECK: image_sample
;CHECK: store
;CHECK-NOT: exec
;CHECK: .size test3
define amdgpu_ps <4 x float> @test3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, <4 x i32> %c) {
main_body:
  %tex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %c, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
  %tex.1 = bitcast <4 x float> %tex to <4 x i32>
  %tex.2 = extractelement <4 x i32> %tex.1, i32 0

  call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %tex, <4 x i32> undef, i32 %tex.2, i32 0, i1 0, i1 0)

  ret <4 x float> %tex
}

; Check that WQM is re-enabled when required.
;
;CHECK-LABEL: {{^}}test4:
;CHECK-NEXT: ; %main_body
;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
;CHECK-NEXT: s_wqm_b64 exec, exec
;CHECK: v_mul_lo_i32 [[MUL:v[0-9]+]], v0, v1
;CHECK: s_and_b64 exec, exec, [[ORIG]]
;CHECK: store
;CHECK: s_wqm_b64 exec, exec
;CHECK: image_sample
;CHECK: image_sample
define amdgpu_ps <4 x float> @test4(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %c, i32 %d, float %data) {
main_body:
  %c.1 = mul i32 %c, %d

  call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> undef, <4 x i32> undef, i32 %c.1, i32 0, i1 0, i1 0)

  %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %c.1, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
  %tex.1 = bitcast <4 x float> %tex to <4 x i32>
  %dtex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %tex.1, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
  ret <4 x float> %dtex
}

; Check a case of one branch of an if-else requiring WQM, the other requiring
; exact.
;
; Note: In this particular case, the save-and-restore could be avoided if the
; analysis understood that the two branches of the if-else are mutually
; exclusive.
;
;CHECK-LABEL: {{^}}test_control_flow_0:
;CHECK-NEXT: ; %main_body
;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
;CHECK-NEXT: s_wqm_b64 exec, exec
;CHECK: %ELSE
;CHECK: s_and_saveexec_b64 [[SAVED:s\[[0-9]+:[0-9]+\]]], [[ORIG]]
;CHECK: store
;CHECK: s_mov_b64 exec, [[SAVED]]
;CHECK: %IF
;CHECK: image_sample
;CHECK: image_sample
define amdgpu_ps float @test_control_flow_0(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %c, i32 %z, float %data) {
main_body:
  %cmp = icmp eq i32 %z, 0
  br i1 %cmp, label %IF, label %ELSE

IF:
  %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %c, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
  %tex.1 = bitcast <4 x float> %tex to <4 x i32>
  %dtex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %tex.1, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
  %data.if = extractelement <4 x float> %dtex, i32 0
  br label %END

ELSE:
  call void @llvm.amdgcn.buffer.store.f32(float %data, <4 x i32> undef, i32 %c, i32 0, i1 0, i1 0)
  br label %END

END:
  %r = phi float [ %data.if, %IF ], [ %data, %ELSE ]
  ret float %r
}

; Reverse branch order compared to the previous test.
;
;CHECK-LABEL: {{^}}test_control_flow_1:
;CHECK-NEXT: ; %main_body
;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
;CHECK-NEXT: s_wqm_b64 exec, exec
;CHECK: %IF
;CHECK: image_sample
;CHECK: image_sample
;CHECK: %Flow
;CHECK-NEXT: s_or_saveexec_b64 [[SAVED:s\[[0-9]+:[0-9]+\]]],
;CHECK-NEXT: s_and_b64 exec, exec, [[ORIG]]
;CHECK-NEXT: s_and_b64 [[SAVED]], exec, [[SAVED]]
;CHECK-NEXT: s_xor_b64 exec, exec, [[SAVED]]
;CHECK-NEXT: mask branch [[END_BB:BB[0-9]+_[0-9]+]]
;CHECK-NEXT: BB{{[0-9]+_[0-9]+}}: ; %ELSE
;CHECK: store_dword
;CHECK: [[END_BB]]: ; %END
;CHECK: s_or_b64 exec, exec,
;CHECK: v_mov_b32_e32 v0
;CHECK: ; return
define amdgpu_ps float @test_control_flow_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %c, i32 %z, float %data) {
main_body:
  %cmp = icmp eq i32 %z, 0
  br i1 %cmp, label %ELSE, label %IF

IF:
  %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %c, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
  %tex.1 = bitcast <4 x float> %tex to <4 x i32>
  %dtex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %tex.1, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
  %data.if = extractelement <4 x float> %dtex, i32 0
  br label %END

ELSE:
  call void @llvm.amdgcn.buffer.store.f32(float %data, <4 x i32> undef, i32 %c, i32 0, i1 0, i1 0)
  br label %END

END:
  %r = phi float [ %data.if, %IF ], [ %data, %ELSE ]
  ret float %r
}

; Check that branch conditions are properly marked as needing WQM...
;
;CHECK-LABEL: {{^}}test_control_flow_2:
;CHECK-NEXT: ; %main_body
;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
;CHECK-NEXT: s_wqm_b64 exec, exec
;CHECK: s_and_b64 exec, exec, [[ORIG]]
;CHECK: store
;CHECK: s_wqm_b64 exec, exec
;CHECK: load
;CHECK: s_and_b64 exec, exec, [[ORIG]]
;CHECK: store
;CHECK: s_wqm_b64 exec, exec
;CHECK: v_cmp
define amdgpu_ps <4 x float> @test_control_flow_2(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, <3 x i32> %idx, <2 x float> %data, i32 %coord) {
main_body:
  %idx.1 = extractelement <3 x i32> %idx, i32 0
  %data.1 = extractelement <2 x float> %data, i32 0
  call void @llvm.amdgcn.buffer.store.f32(float %data.1, <4 x i32> undef, i32 %idx.1, i32 0, i1 0, i1 0)

  ; The load that determines the branch (and should therefore be WQM) is
  ; surrounded by stores that require disabled WQM.
  %idx.2 = extractelement <3 x i32> %idx, i32 1
  %z = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx.2, i32 0, i1 0, i1 0)

  %idx.3 = extractelement <3 x i32> %idx, i32 2
  %data.3 = extractelement <2 x float> %data, i32 1
  call void @llvm.amdgcn.buffer.store.f32(float %data.3, <4 x i32> undef, i32 %idx.3, i32 0, i1 0, i1 0)

  %cc = fcmp ogt float %z, 0.0
  br i1 %cc, label %IF, label %ELSE

IF:
  %coord.IF = mul i32 %coord, 3
  br label %END

ELSE:
  %coord.ELSE = mul i32 %coord, 4
  br label %END

END:
  %coord.END = phi i32 [ %coord.IF, %IF ], [ %coord.ELSE, %ELSE ]
  %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord.END, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
  ret <4 x float> %tex
}

; ... but only if they really do need it.
;
;CHECK-LABEL: {{^}}test_control_flow_3:
;CHECK-NEXT: ; %main_body
;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
;CHECK-NEXT: s_wqm_b64 exec, exec
;CHECK: image_sample
;CHECK: s_and_b64 exec, exec, [[ORIG]]
;CHECK: image_sample
;CHECK: v_cmp
;CHECK: store
define amdgpu_ps float @test_control_flow_3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, i32 %coord) {
main_body:
  %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
  %tex.1 = bitcast <4 x float> %tex to <4 x i32>
  %dtex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %tex.1, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
  %dtex.1 = extractelement <4 x float> %dtex, i32 0

  call void @llvm.amdgcn.buffer.store.f32(float %dtex.1, <4 x i32> undef, i32 %idx, i32 0, i1 0, i1 0)

  %cc = fcmp ogt float %dtex.1, 0.0
  br i1 %cc, label %IF, label %ELSE

IF:
  %tex.IF = fmul float %dtex.1, 3.0
  br label %END

ELSE:
  %tex.ELSE = fmul float %dtex.1, 4.0
  br label %END

END:
  %tex.END = phi float [ %tex.IF, %IF ], [ %tex.ELSE, %ELSE ]
  ret float %tex.END
}

; Another test that failed at some point because of terminator handling.
;
;CHECK-LABEL: {{^}}test_control_flow_4:
;CHECK-NEXT: ; %main_body
;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
;CHECK-NEXT: s_wqm_b64 exec, exec
;CHECK: %IF
;CHECK: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], [[ORIG]]
;CHECK: load
;CHECK: store
;CHECK: s_mov_b64 exec, [[SAVE]]
;CHECK: %END
;CHECK: image_sample
;CHECK: image_sample
define amdgpu_ps <4 x float> @test_control_flow_4(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %coord, i32 %y, float %z) {
main_body:
  %cond = icmp eq i32 %y, 0
  br i1 %cond, label %IF, label %END

IF:
  %data = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 0, i32 0, i1 0, i1 0)
  call void @llvm.amdgcn.buffer.store.f32(float %data, <4 x i32> undef, i32 1, i32 0, i1 0, i1 0)
  br label %END

END:
  %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
  %tex.1 = bitcast <4 x float> %tex to <4 x i32>
  %dtex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %tex.1, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
  ret <4 x float> %dtex
}

; Kill is performed in WQM mode so that uniform kill behaves correctly ...
;
;CHECK-LABEL: {{^}}test_kill_0:
;CHECK-NEXT: ; %main_body
;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
;CHECK-NEXT: s_wqm_b64 exec, exec
;CHECK: s_and_b64 exec, exec, [[ORIG]]
;CHECK: image_sample
;CHECK: buffer_store_dword
;CHECK: s_wqm_b64 exec, exec
;CHECK: v_cmpx_
;CHECK: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], [[ORIG]]
;CHECK: buffer_store_dword
;CHECK: s_mov_b64 exec, [[SAVE]]
;CHECK: image_sample
define amdgpu_ps <4 x float> @test_kill_0(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <2 x i32> %idx, <2 x float> %data, i32 %coord, i32 %coord2, float %z) {
main_body:
  %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)

  %idx.0 = extractelement <2 x i32> %idx, i32 0
  %data.0 = extractelement <2 x float> %data, i32 0
  call void @llvm.amdgcn.buffer.store.f32(float %data.0, <4 x i32> undef, i32 %idx.0, i32 0, i1 0, i1 0)

  call void @llvm.AMDGPU.kill(float %z)

  %idx.1 = extractelement <2 x i32> %idx, i32 1
  %data.1 = extractelement <2 x float> %data, i32 1
  call void @llvm.amdgcn.buffer.store.f32(float %data.1, <4 x i32> undef, i32 %idx.1, i32 0, i1 0, i1 0)

  %tex2 = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord2, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
  %tex2.1 = bitcast <4 x float> %tex2 to <4 x i32>
  %dtex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %tex2.1, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
  %out = fadd <4 x float> %tex, %dtex

  ret <4 x float> %out
}

; ... but only if WQM is necessary.
;
; CHECK-LABEL: {{^}}test_kill_1:
; CHECK-NEXT: ; %main_body
; CHECK: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
; CHECK: s_wqm_b64 exec, exec
; CHECK: image_sample
; CHECK: s_and_b64 exec, exec, [[ORIG]]
; CHECK: image_sample
; CHECK: buffer_store_dword
; CHECK-NOT: wqm
; CHECK: v_cmpx_
define amdgpu_ps <4 x float> @test_kill_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %data, i32 %coord, i32 %coord2, float %z) {
main_body:
  %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
  %tex.1 = bitcast <4 x float> %tex to <4 x i32>
  %dtex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %tex.1, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)

  call void @llvm.amdgcn.buffer.store.f32(float %data, <4 x i32> undef, i32 0, i32 0, i1 0, i1 0)

  call void @llvm.AMDGPU.kill(float %z)

  ret <4 x float> %dtex
}

; Check prolog shaders.
;
; CHECK-LABEL: {{^}}test_prolog_1:
; CHECK: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
; CHECK: s_wqm_b64 exec, exec
; CHECK: v_add_f32_e32 v0,
; CHECK: s_and_b64 exec, exec, [[ORIG]]
define amdgpu_ps float @test_prolog_1(float %a, float %b) #4 {
main_body:
  %s = fadd float %a, %b
  ret float %s
}

; CHECK-LABEL: {{^}}test_loop_vcc:
; CHECK-NEXT: ; %entry
; CHECK-NEXT: s_mov_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], exec
; CHECK: s_wqm_b64 exec, exec
; CHECK: s_and_b64 exec, exec, [[LIVE]]
; CHECK: image_store
; CHECK: s_wqm_b64 exec, exec
; CHECK-DAG: v_mov_b32_e32 [[CTR:v[0-9]+]], 0
; CHECK-DAG: v_mov_b32_e32 [[SEVEN:v[0-9]+]], 0x40e00000

; CHECK: [[LOOPHDR:BB[0-9]+_[0-9]+]]: ; %body
; CHECK: v_add_f32_e32 [[CTR]], 2.0, [[CTR]]
; CHECK: v_cmp_lt_f32_e32 vcc, [[SEVEN]], [[CTR]]
; CHECK: s_cbranch_vccz [[LOOPHDR]]
; CHECK: ; %break

; CHECK: ; return
define amdgpu_ps <4 x float> @test_loop_vcc(<4 x float> %in) nounwind {
entry:
  call void @llvm.amdgcn.image.store.v4f32.v4i32.v8i32(<4 x float> %in, <4 x i32> undef, <8 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0)
  br label %loop

loop:
  %ctr.iv = phi float [ 0.0, %entry ], [ %ctr.next, %body ]
  %c.iv = phi <4 x float> [ %in, %entry ], [ %c.next, %body ]
  %cc = fcmp ogt float %ctr.iv, 7.0
  br i1 %cc, label %break, label %body

body:
  %c.i = bitcast <4 x float> %c.iv to <4 x i32>
  %c.next = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %c.i, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
  %ctr.next = fadd float %ctr.iv, 2.0
  br label %loop

break:
  ret <4 x float> %c.iv
}

; Only intrinsic stores need exact execution -- other stores do not have
; externally visible effects and may require WQM for correctness.
;
; CHECK-LABEL: {{^}}test_alloca:
; CHECK: s_mov_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], exec
; CHECK: s_wqm_b64 exec, exec

; CHECK: s_and_b64 exec, exec, [[LIVE]]
; CHECK: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0
; CHECK: s_wqm_b64 exec, exec
; CHECK: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+$}}
; CHECK: s_and_b64 exec, exec, [[LIVE]]
; CHECK: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 idxen
; CHECK: s_wqm_b64 exec, exec
; CHECK: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen

; CHECK: s_and_b64 exec, exec, [[LIVE]]
; CHECK: image_sample
; CHECK: buffer_store_dwordx4
define amdgpu_ps void @test_alloca(float %data, i32 %a, i32 %idx) nounwind {
entry:
  %array = alloca [32 x i32], align 4

  call void @llvm.amdgcn.buffer.store.f32(float %data, <4 x i32> undef, i32 0, i32 0, i1 0, i1 0)

  %s.gep = getelementptr [32 x i32], [32 x i32]* %array, i32 0, i32 0
  store volatile i32 %a, i32* %s.gep, align 4

  call void @llvm.amdgcn.buffer.store.f32(float %data, <4 x i32> undef, i32 1, i32 0, i1 0, i1 0)

  %c.gep = getelementptr [32 x i32], [32 x i32]* %array, i32 0, i32 %idx
  %c = load i32, i32* %c.gep, align 4

  %t = call <4 x float> @llvm.SI.image.sample.i32(i32 %c, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)

  call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %t, <4 x i32> undef, i32 0, i32 0, i1 0, i1 0)

  ret void
}

; Must return to exact at the end of a non-void returning shader,
; otherwise the EXEC mask exported by the epilog will be wrong. This is true
; even if the shader has no kills, because a kill could have happened in a
; previous shader fragment.
;
; CHECK-LABEL: {{^}}test_nonvoid_return:
; CHECK: s_mov_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], exec
; CHECK: s_wqm_b64 exec, exec
;
; CHECK: s_and_b64 exec, exec, [[LIVE]]
; CHECK-NOT: exec
define amdgpu_ps <4 x float> @test_nonvoid_return() nounwind {
  %tex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
  %tex.i = bitcast <4 x float> %tex to <4 x i32>
  %dtex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %tex.i, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
  ret <4 x float> %dtex
}

; CHECK-LABEL: {{^}}test_nonvoid_return_unreachable:
; CHECK: s_mov_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], exec
; CHECK: s_wqm_b64 exec, exec
;
; CHECK: s_and_b64 exec, exec, [[LIVE]]
; CHECK-NOT: exec
define amdgpu_ps <4 x float> @test_nonvoid_return_unreachable(i32 inreg %c) nounwind {
entry:
  %tex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
  %tex.i = bitcast <4 x float> %tex to <4 x i32>
  %dtex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %tex.i, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)

  %cc = icmp sgt i32 %c, 0
  br i1 %cc, label %if, label %else

if:
  store volatile <4 x float> %dtex, <4 x float> addrspace(1)* undef
  unreachable

else:
  ret <4 x float> %dtex
}

; Test awareness that s_wqm_b64 clobbers SCC.
;
; CHECK-LABEL: {{^}}test_scc:
; CHECK: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
; CHECK: s_wqm_b64 exec, exec
; CHECK: s_cmp_
; CHECK-NEXT: s_cbranch_scc
; CHECK: ; %if
; CHECK: s_and_b64 exec, exec, [[ORIG]]
; CHECK: image_sample
; CHECK: ; %else
; CHECK: s_and_b64 exec, exec, [[ORIG]]
; CHECK: image_sample
; CHECK: ; %end
define amdgpu_ps <4 x float> @test_scc(i32 inreg %sel, i32 %idx) #1 {
main_body:
  %cc = icmp sgt i32 %sel, 0
  br i1 %cc, label %if, label %else

if:
  %r.if = call <4 x float> @llvm.SI.image.sample.i32(i32 0, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
  br label %end

else:
  %r.else = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> <i32 0, i32 1>, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
  br label %end

end:
  %r = phi <4 x float> [ %r.if, %if ], [ %r.else, %else ]

  call void @llvm.amdgcn.buffer.store.f32(float 1.0, <4 x i32> undef, i32 %idx, i32 0, i1 0, i1 0)

  ret <4 x float> %r
}


declare void @llvm.amdgcn.image.store.v4f32.v4i32.v8i32(<4 x float>, <4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #1
declare void @llvm.amdgcn.buffer.store.f32(float, <4 x i32>, i32, i32, i1, i1) #1
declare void @llvm.amdgcn.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i1, i1) #1

declare <4 x float> @llvm.amdgcn.image.load.v4f32.v4i32.v8i32(<4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #2
declare float @llvm.amdgcn.buffer.load.f32(<4 x i32>, i32, i32, i1, i1) #2

declare <4 x float> @llvm.SI.image.sample.i32(i32, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #3
declare <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #3
declare <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #3

declare void @llvm.AMDGPU.kill(float)
declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)

attributes #1 = { nounwind }
attributes #2 = { nounwind readonly }
attributes #3 = { nounwind readnone }
attributes #4 = { "amdgpu-ps-wqm-outputs" }