; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GCN %s

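; Make sure a select with an undef operand folds away to the defined operand;
; no v_cmp or v_cndmask should be emitted.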
; GCN-LABEL: {{^}}select_undef_lhs:
; GCN: s_waitcnt
; GCN-NOT: v_cmp
; GCN-NOT: v_cndmask
; GCN-NEXT: s_setpc_b64
define float @select_undef_lhs(float %val, i1 %cond) {
  %undef = call float @llvm.amdgcn.rcp.f32(float undef)
  %sel = select i1 %cond, float %undef, float %val
  ret float %sel
}

; GCN-LABEL: {{^}}select_undef_rhs:
; GCN: s_waitcnt
; GCN-NOT: v_cmp
; GCN-NOT: v_cndmask
; GCN-NEXT: s_setpc_b64
define float @select_undef_rhs(float %val, i1 %cond) {
  %undef = call float @llvm.amdgcn.rcp.f32(float undef)
  %sel = select i1 %cond, float %val, float %undef
  ret float %sel
}

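; Make sure a select between a constant and undef folds to the constant; only
; the materialized 1.0 should be stored.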
; GCN-LABEL: {{^}}select_undef_n1:
; GCN: v_mov_b32_e32 [[RES:v[0-9]+]], 1.0
; GCN: store_dword {{[^,]+}}, [[RES]]
define void @select_undef_n1(float addrspace(1)* %a, i32 %c) {
  %cc = icmp eq i32 %c, 0
  %sel = select i1 %cc, float 1.000000e+00, float undef
  store float %sel, float addrspace(1)* %a
  ret void
}

; GCN-LABEL: {{^}}select_undef_n2:
; GCN: v_mov_b32_e32 [[RES:v[0-9]+]], 1.0
; GCN: store_dword {{[^,]+}}, [[RES]]
define void @select_undef_n2(float addrspace(1)* %a, i32 %c) {
  %cc = icmp eq i32 %c, 0
  %sel = select i1 %cc, float undef, float 1.000000e+00
  store float %sel, float addrspace(1)* %a
  ret void
}

declare float @llvm.amdgcn.rcp.f32(float)


; Make sure the vector undef isn't lowered into 0s.
; GCN-LABEL: {{^}}undef_v6f32:
; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0
; GCN: s_cbranch_vccnz
define amdgpu_kernel void @undef_v6f32(<6 x float> addrspace(3)* %ptr, i1 %cond) {
entry:
  br label %loop

loop:
  %phi = phi <6 x float> [ undef, %entry ], [ %add, %loop ]
  %load = load volatile <6 x float>, <6 x float> addrspace(3)* undef
  %add = fadd <6 x float> %load, %phi
  br i1 %cond, label %loop, label %ret

ret:
  store volatile <6 x float> %add, <6 x float> addrspace(3)* undef
  ret void
}

; GCN-LABEL: {{^}}undef_v6i32:
; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0
; GCN: s_cbranch_vccnz
define amdgpu_kernel void @undef_v6i32(<6 x i32> addrspace(3)* %ptr, i1 %cond) {
entry:
  br label %loop

loop:
  %phi = phi <6 x i32> [ undef, %entry ], [ %add, %loop ]
  %load = load volatile <6 x i32>, <6 x i32> addrspace(3)* undef
  %add = add <6 x i32> %load, %phi
  br i1 %cond, label %loop, label %ret

ret:
  store volatile <6 x i32> %add, <6 x i32> addrspace(3)* undef
  ret void
}

; Make sure the vector undef isn't lowered into 0s.
; GCN-LABEL: {{^}}undef_v5f32:
; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0
; GCN: s_cbranch_vccnz
define amdgpu_kernel void @undef_v5f32(<5 x float> addrspace(3)* %ptr, i1 %cond) {
entry:
  br label %loop

loop:
  %phi = phi <5 x float> [ undef, %entry ], [ %add, %loop ]
  %load = load volatile <5 x float>, <5 x float> addrspace(3)* undef
  %add = fadd <5 x float> %load, %phi
  br i1 %cond, label %loop, label %ret

ret:
  store volatile <5 x float> %add, <5 x float> addrspace(3)* undef
  ret void
}

; GCN-LABEL: {{^}}undef_v5i32:
; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0
; GCN: s_cbranch_vccnz
define amdgpu_kernel void @undef_v5i32(<5 x i32> addrspace(3)* %ptr, i1 %cond) {
entry:
  br label %loop

loop:
  %phi = phi <5 x i32> [ undef, %entry ], [ %add, %loop ]
  %load = load volatile <5 x i32>, <5 x i32> addrspace(3)* undef
  %add = add <5 x i32> %load, %phi
  br i1 %cond, label %loop, label %ret

ret:
  store volatile <5 x i32> %add, <5 x i32> addrspace(3)* undef
  ret void
}

; Make sure the vector undef isn't lowered into 0s.
; GCN-LABEL: {{^}}undef_v3f64:
; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0
; GCN: s_cbranch_vccnz
define amdgpu_kernel void @undef_v3f64(<3 x double> addrspace(3)* %ptr, i1 %cond) {
entry:
  br label %loop

loop:
  %phi = phi <3 x double> [ undef, %entry ], [ %add, %loop ]
  %load = load volatile <3 x double>, <3 x double> addrspace(3)* %ptr
  %add = fadd <3 x double> %load, %phi
  br i1 %cond, label %loop, label %ret

ret:
  store volatile <3 x double> %add, <3 x double> addrspace(3)* %ptr
  ret void
}

; GCN-LABEL: {{^}}undef_v3i64:
; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0
; GCN: s_cbranch_vccnz
define amdgpu_kernel void @undef_v3i64(<3 x i64> addrspace(3)* %ptr, i1 %cond) {
entry:
  br label %loop

loop:
  %phi = phi <3 x i64> [ undef, %entry ], [ %add, %loop ]
  %load = load volatile <3 x i64>, <3 x i64> addrspace(3)* %ptr
  %add = add <3 x i64> %load, %phi
  br i1 %cond, label %loop, label %ret

ret:
  store volatile <3 x i64> %add, <3 x i64> addrspace(3)* %ptr
  ret void
}

; Make sure the vector undef isn't lowered into 0s.
; GCN-LABEL: {{^}}undef_v4f16:
; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0
; GCN: s_cbranch_vccnz
define amdgpu_kernel void @undef_v4f16(<4 x half> addrspace(3)* %ptr, i1 %cond) {
entry:
  br label %loop

loop:
  %phi = phi <4 x half> [ undef, %entry ], [ %add, %loop ]
  %load = load volatile <4 x half>, <4 x half> addrspace(3)* %ptr
  %add = fadd <4 x half> %load, %phi
  br i1 %cond, label %loop, label %ret

ret:
  store volatile <4 x half> %add, <4 x half> addrspace(3)* %ptr
  ret void
}

; GCN-LABEL: {{^}}undef_v4i16:
; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0
; GCN: s_cbranch_vccnz
define amdgpu_kernel void @undef_v4i16(<4 x i16> addrspace(3)* %ptr, i1 %cond) {
entry:
  br label %loop

loop:
  %phi = phi <4 x i16> [ undef, %entry ], [ %add, %loop ]
  %load = load volatile <4 x i16>, <4 x i16> addrspace(3)* %ptr
  %add = add <4 x i16> %load, %phi
  br i1 %cond, label %loop, label %ret

ret:
  store volatile <4 x i16> %add, <4 x i16> addrspace(3)* %ptr
  ret void
}

; Make sure the vector undef isn't lowered into 0s.
; GCN-LABEL: {{^}}undef_v2f16:
; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0
; GCN: s_cbranch_vccnz
define amdgpu_kernel void @undef_v2f16(<2 x half> addrspace(3)* %ptr, i1 %cond) {
entry:
  br label %loop

loop:
  %phi = phi <2 x half> [ undef, %entry ], [ %add, %loop ]
  %load = load volatile <2 x half>, <2 x half> addrspace(3)* %ptr
  %add = fadd <2 x half> %load, %phi
  br i1 %cond, label %loop, label %ret

ret:
  store volatile <2 x half> %add, <2 x half> addrspace(3)* %ptr
  ret void
}

; GCN-LABEL: {{^}}undef_v2i16:
; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0
; GCN: s_cbranch_vccnz
define amdgpu_kernel void @undef_v2i16(<2 x i16> addrspace(3)* %ptr, i1 %cond) {
entry:
  br label %loop

loop:
  %phi = phi <2 x i16> [ undef, %entry ], [ %add, %loop ]
  %load = load volatile <2 x i16>, <2 x i16> addrspace(3)* %ptr
  %add = add <2 x i16> %load, %phi
  br i1 %cond, label %loop, label %ret

ret:
  store volatile <2 x i16> %add, <2 x i16> addrspace(3)* %ptr
  ret void
}

; We used to expand undef vectors into zero vectors. Optimizations would then
; see that no elements of the vector were used and re-form the undef vector,
; resulting in a combiner loop.
; GCN-LABEL: {{^}}inf_loop_undef_vector:
; GCN: s_waitcnt
; GCN-NEXT: v_mad_u64_u32
; GCN-NEXT: v_mul_lo_u32
; GCN-NEXT: v_mul_lo_u32
; GCN-NEXT: v_add3_u32
; GCN-NEXT: global_store_dwordx2
define void @inf_loop_undef_vector(<6 x float> %arg, float %arg1, i64 %arg2) {
  %i = insertelement <6 x float> %arg, float %arg1, i64 2
  %i3 = bitcast <6 x float> %i to <3 x i64>
  %i4 = extractelement <3 x i64> %i3, i64 0
  %i5 = extractelement <3 x i64> %i3, i64 1
  %i6 = mul i64 %i5, %arg2
  %i7 = add i64 %i6, %i4
  store volatile i64 %i7, i64 addrspace(1)* undef, align 4
  ret void
}
