; RUN: llc -march=amdgcn -verify-machineinstrs -enable-misched -asm-verbose < %s | FileCheck -check-prefix=SI %s

; Codegen checks for divergent (per-workitem) control flow: each function
; branches on a value derived from the workitem id, and the directives verify
; the exec-mask manipulation sequence emitted around those branches.

declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone

; SI-LABEL: {{^}}test_if:
; Make sure the i1 values created by the cfg structurizer pass are
; moved using VALU instructions


; waitcnt should be inserted after exec modification
; SI: v_cmp_lt_i32_e32 vcc, 0,
; SI-NEXT: s_and_saveexec_b64 [[SAVE1:s\[[0-9]+:[0-9]+\]]], vcc
; SI-NEXT: s_xor_b64 [[SAVE2:s\[[0-9]+:[0-9]+\]]], exec, [[SAVE1]]
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: ; mask branch [[FLOW_BB:BB[0-9]+_[0-9]+]]
; SI-NEXT: s_cbranch_execz [[FLOW_BB]]

; SI-NEXT: BB{{[0-9]+}}_1: ; %LeafBlock3
; SI-NOT: s_mov_b64 s[{{[0-9]:[0-9]}}], -1
; SI: v_mov_b32_e32 v{{[0-9]}}, -1
; SI: s_and_saveexec_b64
; SI-NEXT: s_xor_b64
; SI-NEXT: ; mask branch

; v_mov should be after exec modification
; SI: [[FLOW_BB]]:
; SI-NEXT: s_or_saveexec_b64 [[SAVE3:s\[[0-9]+:[0-9]+\]]], [[SAVE2]]
; SI-NEXT: v_mov_b32_e32 v{{[0-9]+}}
; SI-NEXT: s_xor_b64 exec, exec, [[SAVE3]]
; SI-NEXT: ; mask branch
;
; Three-way switch on the workitem id; the default case branches again on
; (tid == 2). Every path stores a distinct constant to %dst[%b].
define void @test_if(i32 %b, i32 addrspace(1)* %src, i32 addrspace(1)* %dst) #1 {
entry:
  %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
  switch i32 %tid, label %default [
    i32 0, label %case0
    i32 1, label %case1
  ]

case0:
  %arrayidx1 = getelementptr i32, i32 addrspace(1)* %dst, i32 %b
  store i32 13, i32 addrspace(1)* %arrayidx1, align 4
  br label %end

case1:
  %arrayidx5 = getelementptr i32, i32 addrspace(1)* %dst, i32 %b
  store i32 17, i32 addrspace(1)* %arrayidx5, align 4
  br label %end

default:
  %cmp8 = icmp eq i32 %tid, 2
  %arrayidx10 = getelementptr i32, i32 addrspace(1)* %dst, i32 %b
  br i1 %cmp8, label %if, label %else

if:
  store i32 19, i32 addrspace(1)* %arrayidx10, align 4
  br label %end

else:
  store i32 21, i32 addrspace(1)* %arrayidx10, align 4
  br label %end

end:
  ret void
}

; The function label is anchored the same way as the other functions in this
; file, and the exit block label is matched by pattern so the test does not
; depend on this function's position in the file.
; SI-LABEL: {{^}}simple_test_v_if:
; SI: v_cmp_ne_u32_e32 vcc, 0, v{{[0-9]+}}
; SI: s_and_saveexec_b64 [[BR_SREG:s\[[0-9]+:[0-9]+\]]], vcc
; SI: s_xor_b64 [[BR_SREG]], exec, [[BR_SREG]]

; SI: BB{{[0-9]+_[0-9]+}}:
; SI: buffer_store_dword
; SI: s_endpgm

; SI: BB{{[0-9]+_[0-9]+}}:
; SI: s_or_b64 exec, exec, [[BR_SREG]]
; SI: s_endpgm
; Single conditional store: workitems with a non-zero id write 999 to
; %dst[%tid]; the others return immediately.
define void @simple_test_v_if(i32 addrspace(1)* %dst, i32 addrspace(1)* %src) #1 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
  %is.0 = icmp ne i32 %tid, 0
  br i1 %is.0, label %store, label %exit

store:
  %gep = getelementptr i32, i32 addrspace(1)* %dst, i32 %tid
  store i32 999, i32 addrspace(1)* %gep
  ret void

exit:
  ret void
}

; SI-LABEL: {{^}}simple_test_v_loop:
; SI: v_cmp_ne_u32_e32 vcc, 0, v{{[0-9]+}}
; SI: s_and_saveexec_b64 [[BR_SREG:s\[[0-9]+:[0-9]+\]]], vcc
; SI: s_xor_b64 [[BR_SREG]], exec, [[BR_SREG]]
; SI: s_cbranch_execz [[LABEL_EXIT:BB[0-9]+_[0-9]+]]

; SI: s_mov_b64 {{s\[[0-9]+:[0-9]+\]}}, 0{{$}}

; SI: [[LABEL_LOOP:BB[0-9]+_[0-9]+]]:
; SI: buffer_load_dword
; SI-DAG: buffer_store_dword
; SI-DAG: s_cmpk_eq_i32 s{{[0-9]+}}, 0x100
; SI: s_cbranch_scc0 [[LABEL_LOOP]]
; SI: [[LABEL_EXIT]]:
; SI: s_endpgm

; Loop entered only by workitems with a non-zero id; %i runs from %tid until
; %i.inc equals %tid + 64, storing the loaded value to %dst[%i] each trip.
define void @simple_test_v_loop(i32 addrspace(1)* %dst, i32 addrspace(1)* %src) #1 {
entry:
  %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
  %is.0 = icmp ne i32 %tid, 0
  %limit = add i32 %tid, 64
  br i1 %is.0, label %loop, label %exit

loop:
  %i = phi i32 [%tid, %entry], [%i.inc, %loop]
  ; NOTE(review): %gep.src is computed but never used; the load below reads
  ; %src directly. Left untouched because changing the load address would
  ; alter the generated code that the directives above match - confirm intent.
  %gep.src = getelementptr i32, i32 addrspace(1)* %src, i32 %i
  %gep.dst = getelementptr i32, i32 addrspace(1)* %dst, i32 %i
  %load = load i32, i32 addrspace(1)* %src
  store i32 %load, i32 addrspace(1)* %gep.dst
  %i.inc = add nsw i32 %i, 1
  %cmp = icmp eq i32 %limit, %i.inc
  br i1 %cmp, label %exit, label %loop

exit:
  ret void
}

; SI-LABEL: {{^}}multi_vcond_loop:

; Load loop limit from buffer
; Branch to exit if uniformly not taken
; SI: ; BB#0:
; SI: buffer_load_dword [[VBOUND:v[0-9]+]]
; SI: v_cmp_lt_i32_e32 vcc
; SI: s_and_saveexec_b64 [[OUTER_CMP_SREG:s\[[0-9]+:[0-9]+\]]], vcc
; SI: s_xor_b64 [[OUTER_CMP_SREG]], exec, [[OUTER_CMP_SREG]]
; SI: s_cbranch_execz [[LABEL_EXIT:BB[0-9]+_[0-9]+]]

; Initialize inner condition to false
; SI: BB{{[0-9]+_[0-9]+}}: ; %bb10.preheader
; SI: s_mov_b64 [[ZERO:s\[[0-9]+:[0-9]+\]]], 0{{$}}
; SI: s_mov_b64 [[COND_STATE:s\[[0-9]+:[0-9]+\]]], [[ZERO]]

; Clear exec bits for workitems that load -1s
; SI: [[LABEL_LOOP:BB[0-9]+_[0-9]+]]:
; SI: buffer_load_dword [[B:v[0-9]+]]
; SI: buffer_load_dword [[A:v[0-9]+]]
; SI-DAG: v_cmp_ne_u32_e64 [[NEG1_CHECK_0:s\[[0-9]+:[0-9]+\]]], -1, [[A]]
; SI-DAG: v_cmp_ne_u32_e32 [[NEG1_CHECK_1:vcc]], -1, [[B]]
; SI: s_and_b64 [[ORNEG1:s\[[0-9]+:[0-9]+\]]], [[NEG1_CHECK_1]], [[NEG1_CHECK_0]]
; SI: s_and_saveexec_b64 [[ORNEG2:s\[[0-9]+:[0-9]+\]]], [[ORNEG1]]
; SI: s_xor_b64 [[ORNEG3:s\[[0-9]+:[0-9]+\]]], exec, [[ORNEG2]]
; SI: s_cbranch_execz [[LABEL_FLOW:BB[0-9]+_[0-9]+]]

; SI: BB{{[0-9]+_[0-9]+}}: ; %bb20
; SI: buffer_store_dword
; SI: v_cmp_ge_i64_e32 [[CMP:s\[[0-9]+:[0-9]+\]|vcc]]
; SI: s_or_b64 [[TMP:s\[[0-9]+:[0-9]+\]]], [[CMP]], [[COND_STATE]]

; SI: [[LABEL_FLOW]]:
; SI-NEXT: ; in Loop: Header=[[LABEL_LOOP]]
; SI-NEXT: s_or_b64 exec, exec, [[ORNEG3]]
; SI-NEXT: s_or_b64 [[COND_STATE]], [[ORNEG3]], [[TMP]]
; SI-NEXT: s_andn2_b64 exec, exec, [[COND_STATE]]
; SI-NEXT: s_cbranch_execnz [[LABEL_LOOP]]

; SI: BB#5
; SI: s_or_b64 exec, exec, [[COND_STATE]]

; SI: [[LABEL_EXIT]]:
; SI-NOT: [[COND_STATE]]
; SI: s_endpgm

; Loop with two exit conditions per workitem: it leaves when either loaded
; value equals -1 (bb10) or when the induction variable reaches the bound
; loaded from %arg3[tid] (bb20). The loop is skipped when that bound is <= 0.
define void @multi_vcond_loop(i32 addrspace(1)* noalias nocapture %arg, i32 addrspace(1)* noalias nocapture readonly %arg1, i32 addrspace(1)* noalias nocapture readonly %arg2, i32 addrspace(1)* noalias nocapture readonly %arg3) #1 {
bb:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %tmp4 = sext i32 %tmp to i64
  %tmp5 = getelementptr inbounds i32, i32 addrspace(1)* %arg3, i64 %tmp4
  %tmp6 = load i32, i32 addrspace(1)* %tmp5, align 4
  %tmp7 = icmp sgt i32 %tmp6, 0
  %tmp8 = sext i32 %tmp6 to i64
  br i1 %tmp7, label %bb10, label %bb26

bb10:                                             ; preds = %bb, %bb20
  %tmp11 = phi i64 [ %tmp23, %bb20 ], [ 0, %bb ]
  %tmp12 = add nsw i64 %tmp11, %tmp4
  %tmp13 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp12
  %tmp14 = load i32, i32 addrspace(1)* %tmp13, align 4
  %tmp15 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp12
  %tmp16 = load i32, i32 addrspace(1)* %tmp15, align 4
  %tmp17 = icmp ne i32 %tmp14, -1
  %tmp18 = icmp ne i32 %tmp16, -1
  %tmp19 = and i1 %tmp17, %tmp18
  br i1 %tmp19, label %bb20, label %bb26

bb20:                                             ; preds = %bb10
  %tmp21 = add nsw i32 %tmp16, %tmp14
  %tmp22 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp12
  store i32 %tmp21, i32 addrspace(1)* %tmp22, align 4
  %tmp23 = add nuw nsw i64 %tmp11, 1
  %tmp24 = icmp slt i64 %tmp23, %tmp8
  br i1 %tmp24, label %bb10, label %bb26

bb26:                                             ; preds = %bb10, %bb20, %bb
  ret void
}

attributes #0 = { nounwind readnone }
attributes #1 = { nounwind }