1; RUN: llc -march=amdgcn -verify-machineinstrs -enable-misched -asm-verbose < %s | FileCheck -check-prefix=SI %s
2
3declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
4
5; SI-LABEL: {{^}}test_if:
6; Make sure the i1 values created by the cfg structurizer pass are
7; moved using VALU instructions
8
9
10; waitcnt should be inserted after exec modification
11; SI: v_cmp_lt_i32_e32 vcc, 0,
12; SI-NEXT: s_and_saveexec_b64 [[SAVE1:s\[[0-9]+:[0-9]+\]]], vcc
13; SI-NEXT: s_xor_b64 [[SAVE2:s\[[0-9]+:[0-9]+\]]], exec, [[SAVE1]]
14; SI-NEXT: s_waitcnt lgkmcnt(0)
15; SI-NEXT: ; mask branch [[FLOW_BB:BB[0-9]+_[0-9]+]]
16; SI-NEXT: s_cbranch_execz [[FLOW_BB]]
17
18; SI-NEXT: BB{{[0-9]+}}_1: ; %LeafBlock3
19; SI-NOT: s_mov_b64 s[{{[0-9]:[0-9]}}], -1
20; SI: v_mov_b32_e32 v{{[0-9]}}, -1
21; SI: s_and_saveexec_b64
22; SI-NEXT: s_xor_b64
23; SI-NEXT: ; mask branch
24
25; v_mov should be after exec modification
26; SI: [[FLOW_BB]]:
27; SI-NEXT: s_or_saveexec_b64 [[SAVE3:s\[[0-9]+:[0-9]+\]]], [[SAVE2]]
28; SI-NEXT: v_mov_b32_e32 v{{[0-9]+}}
29; SI-NEXT: s_xor_b64 exec, exec, [[SAVE3]]
30; SI-NEXT: ; mask branch
31;
; A 3-way switch whose arms all rejoin at %end forces the CFG structurizer
; to materialize i1 control-flow values; the checks above verify those are
; moved with VALU instructions and that exec-mask save/restore is correct.
define void @test_if(i32 %b, i32 addrspace(1)* %src, i32 addrspace(1)* %dst) #1 {
entry:
  %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
  ; Divergent switch on the workitem id: cases 0 and 1, plus a default
  ; that contains a further divergent branch (%if / %else below).
  switch i32 %tid, label %default [
    i32 0, label %case0
    i32 1, label %case1
  ]

case0:
  %arrayidx1 = getelementptr i32, i32 addrspace(1)* %dst, i32 %b
  store i32 13, i32 addrspace(1)* %arrayidx1, align 4
  br label %end

case1:
  %arrayidx5 = getelementptr i32, i32 addrspace(1)* %dst, i32 %b
  store i32 17, i32 addrspace(1)* %arrayidx5, align 4
  br label %end

default:
  ; Nested divergent branch inside the switch default: tid == 2 picks %if.
  %cmp8 = icmp eq i32 %tid, 2
  %arrayidx10 = getelementptr i32, i32 addrspace(1)* %dst, i32 %b
  br i1 %cmp8, label %if, label %else

if:
  store i32 19, i32 addrspace(1)* %arrayidx10, align 4
  br label %end

else:
  store i32 21, i32 addrspace(1)* %arrayidx10, align 4
  br label %end

end:
  ret void
}
66
; SI-LABEL: {{^}}simple_test_v_if:
68; SI: v_cmp_ne_u32_e32 vcc, 0, v{{[0-9]+}}
69; SI: s_and_saveexec_b64 [[BR_SREG:s\[[0-9]+:[0-9]+\]]], vcc
70; SI: s_xor_b64 [[BR_SREG]], exec, [[BR_SREG]]
71
72; SI: BB{{[0-9]+_[0-9]+}}:
73; SI: buffer_store_dword
74; SI: s_endpgm
75
; SI: BB{{[0-9]+}}_2:
77; SI: s_or_b64 exec, exec, [[BR_SREG]]
78; SI: s_endpgm
; Minimal divergent if: the store is executed only by lanes with tid != 0.
; The checks above verify the s_and_saveexec_b64 / s_xor_b64 skeleton and
; that exec is restored (s_or_b64) on the join path before s_endpgm.
define void @simple_test_v_if(i32 addrspace(1)* %dst, i32 addrspace(1)* %src) #1 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
  %is.0 = icmp ne i32 %tid, 0
  br i1 %is.0, label %store, label %exit

store:
  ; Per-lane store; both paths return, so the blocks rejoin only in exec handling.
  %gep = getelementptr i32, i32 addrspace(1)* %dst, i32 %tid
  store i32 999, i32 addrspace(1)* %gep
  ret void

exit:
  ret void
}
92
93; SI-LABEL: {{^}}simple_test_v_loop:
94; SI: v_cmp_ne_u32_e32 vcc, 0, v{{[0-9]+}}
95; SI: s_and_saveexec_b64 [[BR_SREG:s\[[0-9]+:[0-9]+\]]], vcc
96; SI: s_xor_b64 [[BR_SREG]], exec, [[BR_SREG]]
97; SI: s_cbranch_execz [[LABEL_EXIT:BB[0-9]+_[0-9]+]]
98
99; SI: s_mov_b64 {{s\[[0-9]+:[0-9]+\]}}, 0{{$}}
100
101; SI: [[LABEL_LOOP:BB[0-9]+_[0-9]+]]:
102; SI: buffer_load_dword
103; SI-DAG: buffer_store_dword
104; SI-DAG: s_cmpk_eq_i32 s{{[0-9]+}}, 0x100
105; SI: s_cbranch_scc0 [[LABEL_LOOP]]
106; SI: [[LABEL_EXIT]]:
107; SI: s_endpgm
108
; A uniform-trip-count copy loop guarded by a divergent branch: lanes with
; tid != 0 run the loop, others skip straight to %exit. The checks above
; verify the exec-mask guard (s_cbranch_execz) around the loop.
define void @simple_test_v_loop(i32 addrspace(1)* %dst, i32 addrspace(1)* %src) #1 {
entry:
  %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
  %is.0 = icmp ne i32 %tid, 0
  ; Each lane iterates 64 times: from %tid up to %tid + 64.
  %limit = add i32 %tid, 64
  br i1 %is.0, label %loop, label %exit

loop:
  %i = phi i32 [%tid, %entry], [%i.inc, %loop]
  %gep.src = getelementptr i32, i32 addrspace(1)* %src, i32 %i
  %gep.dst = getelementptr i32, i32 addrspace(1)* %dst, i32 %i
  ; NOTE(review): %gep.src is computed but unused; this load reads the %src
  ; base pointer every iteration. Possibly intended to load %gep.src, but
  ; changing it would alter the checked codegen — confirm before touching.
  %load = load i32, i32 addrspace(1)* %src
  store i32 %load, i32 addrspace(1)* %gep.dst
  %i.inc = add nsw i32 %i, 1
  %cmp = icmp eq i32 %limit, %i.inc
  br i1 %cmp, label %exit, label %loop

exit:
  ret void
}
129
130; SI-LABEL: {{^}}multi_vcond_loop:
131
132; Load loop limit from buffer
133; Branch to exit if uniformly not taken
134; SI: ; BB#0:
135; SI: buffer_load_dword [[VBOUND:v[0-9]+]]
136; SI: v_cmp_lt_i32_e32 vcc
137; SI: s_and_saveexec_b64 [[OUTER_CMP_SREG:s\[[0-9]+:[0-9]+\]]], vcc
138; SI: s_xor_b64 [[OUTER_CMP_SREG]], exec, [[OUTER_CMP_SREG]]
139; SI: s_cbranch_execz [[LABEL_EXIT:BB[0-9]+_[0-9]+]]
140
141; Initialize inner condition to false
142; SI: BB{{[0-9]+_[0-9]+}}: ; %bb10.preheader
143; SI: s_mov_b64 [[ZERO:s\[[0-9]+:[0-9]+\]]], 0{{$}}
144; SI: s_mov_b64 [[COND_STATE:s\[[0-9]+:[0-9]+\]]], [[ZERO]]
145
146; Clear exec bits for workitems that load -1s
147; SI: [[LABEL_LOOP:BB[0-9]+_[0-9]+]]:
148; SI: buffer_load_dword [[B:v[0-9]+]]
149; SI: buffer_load_dword [[A:v[0-9]+]]
150; SI-DAG: v_cmp_ne_u32_e64 [[NEG1_CHECK_0:s\[[0-9]+:[0-9]+\]]], -1, [[A]]
151; SI-DAG: v_cmp_ne_u32_e32 [[NEG1_CHECK_1:vcc]], -1, [[B]]
152; SI: s_and_b64 [[ORNEG1:s\[[0-9]+:[0-9]+\]]], [[NEG1_CHECK_1]], [[NEG1_CHECK_0]]
153; SI: s_and_saveexec_b64 [[ORNEG2:s\[[0-9]+:[0-9]+\]]], [[ORNEG1]]
154; SI: s_xor_b64 [[ORNEG3:s\[[0-9]+:[0-9]+\]]], exec, [[ORNEG2]]
155; SI: s_cbranch_execz [[LABEL_FLOW:BB[0-9]+_[0-9]+]]
156
157; SI: BB{{[0-9]+_[0-9]+}}: ; %bb20
158; SI: buffer_store_dword
159; SI: v_cmp_ge_i64_e32 [[CMP:s\[[0-9]+:[0-9]+\]|vcc]]
160; SI: s_or_b64 [[TMP:s\[[0-9]+:[0-9]+\]]], [[CMP]], [[COND_STATE]]
161
162; SI: [[LABEL_FLOW]]:
163; SI-NEXT: ; in Loop: Header=[[LABEL_LOOP]]
164; SI-NEXT: s_or_b64 exec, exec, [[ORNEG3]]
165; SI-NEXT: s_or_b64 [[COND_STATE]], [[ORNEG3]], [[TMP]]
166; SI-NEXT: s_andn2_b64 exec, exec, [[COND_STATE]]
167; SI-NEXT: s_cbranch_execnz [[LABEL_LOOP]]
168
169; SI: BB#5
170; SI: s_or_b64 exec, exec, [[COND_STATE]]
171
172; SI: [[LABEL_EXIT]]:
173; SI-NOT: [[COND_STATE]]
174; SI: s_endpgm
175
; Loop with multiple divergent conditions: an outer guard (%tmp7), a per-
; iteration divergent skip when either input element is -1 (%tmp19), and a
; divergent latch condition (%tmp24). The checks above verify the combined
; exec-mask bookkeeping (COND_STATE accumulation, s_andn2_b64 on exec).
define void @multi_vcond_loop(i32 addrspace(1)* noalias nocapture %arg, i32 addrspace(1)* noalias nocapture readonly %arg1, i32 addrspace(1)* noalias nocapture readonly %arg2, i32 addrspace(1)* noalias nocapture readonly %arg3) #1 {
bb:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %tmp4 = sext i32 %tmp to i64
  ; Per-lane loop bound loaded from %arg3[tid]; loop runs only if it is > 0.
  %tmp5 = getelementptr inbounds i32, i32 addrspace(1)* %arg3, i64 %tmp4
  %tmp6 = load i32, i32 addrspace(1)* %tmp5, align 4
  %tmp7 = icmp sgt i32 %tmp6, 0
  %tmp8 = sext i32 %tmp6 to i64
  br i1 %tmp7, label %bb10, label %bb26

bb10:                                             ; preds = %bb, %bb20
  %tmp11 = phi i64 [ %tmp23, %bb20 ], [ 0, %bb ]
  %tmp12 = add nsw i64 %tmp11, %tmp4
  %tmp13 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp12
  %tmp14 = load i32, i32 addrspace(1)* %tmp13, align 4
  %tmp15 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp12
  %tmp16 = load i32, i32 addrspace(1)* %tmp15, align 4
  ; Lanes whose loaded values are -1 (a sentinel here) leave the loop early;
  ; this is the "clear exec bits" condition checked above.
  %tmp17 = icmp ne i32 %tmp14, -1
  %tmp18 = icmp ne i32 %tmp16, -1
  %tmp19 = and i1 %tmp17, %tmp18
  br i1 %tmp19, label %bb20, label %bb26

bb20:                                             ; preds = %bb10
  ; Store the element-wise sum and advance the induction variable.
  %tmp21 = add nsw i32 %tmp16, %tmp14
  %tmp22 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp12
  store i32 %tmp21, i32 addrspace(1)* %tmp22, align 4
  %tmp23 = add nuw nsw i64 %tmp11, 1
  %tmp24 = icmp slt i64 %tmp23, %tmp8
  br i1 %tmp24, label %bb10, label %bb26

bb26:                                             ; preds = %bb10, %bb20, %bb
  ret void
}
209
210attributes #0 = { nounwind readnone }
211attributes #1 = { nounwind }
212