; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SIVI %s
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SIVI %s
; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX10 %s
; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX10 %s

; Intrinsics used by the kernels in this file: the workitem id for per-lane
; addressing, and fabs at each float width for the source-modifier tests.
declare i32 @llvm.amdgcn.workitem.id.x() #1
declare half @llvm.fabs.f16(half)
declare float @llvm.fabs.f32(float)
declare double @llvm.fabs.f64(double)

; select (%c != 0), NaN, %f where %f is loaded from memory per workitem.
; The checks expect the scalar compare in inverted form (s_cmp_eq) so that the
; constant, printed as -1, is the first source operand of v_cndmask.
; GCN-LABEL: {{^}}v_cnd_nan_nosgpr:
; GCN: s_cmp_eq_u32 s{{[0-9]+}}, 0
; GCN: s_cselect_b64 [[COND:vcc|s\[[0-9]+:[0-9]+\]]], -1, 0
; GCN: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, -1, v{{[0-9]+}}, [[COND]]
; GCN-DAG: v{{[0-9]}}
; All nan values are converted to 0xffffffff
; GCN: s_endpgm
define amdgpu_kernel void @v_cnd_nan_nosgpr(float addrspace(1)* %out, i32 %c, float addrspace(1)* %fptr) #0 {
  %idx = call i32 @llvm.amdgcn.workitem.id.x() #1
  %f.gep = getelementptr float, float addrspace(1)* %fptr, i32 %idx
  %f = load float, float addrspace(1)* %f.gep
  %setcc = icmp ne i32 %c, 0
  %select = select i1 %setcc, float 0xFFFFFFFFE0000000, float %f
  store float %select, float addrspace(1)* %out
  ret void
}


; This requires slightly trickier SGPR operand legalization since the
; single constant bus SGPR usage is the last operand, and it should
; never be moved.
; However, on GFX10 the constant bus limit is two scalar operands rather
; than one.
;
; select (%c != 0), NaN, %f where %f is a kernel argument. The GFX10 checks
; expect it to be used directly as an SGPR operand of v_cndmask_b32_e64.

; GCN-LABEL: {{^}}v_cnd_nan:
; SIVI:  s_cmp_eq_u32 s{{[0-9]+}}, 0
; SIVI:  s_cselect_b64 vcc, -1, 0
; SIVI:  v_cndmask_b32_e32 v{{[0-9]+}}, -1, v{{[0-9]+}}, vcc
; GFX10: s_cmp_eq_u32 s{{[0-9]+}}, 0
; GFX10: s_cselect_b64 [[CC:s\[[0-9:]+\]]],
; GFX10: v_cndmask_b32_e64 v{{[0-9]+}}, -1, s{{[0-9]+}}, [[CC]]
; GCN-DAG: v{{[0-9]}}
; All nan values are converted to 0xffffffff
; GCN: s_endpgm
define amdgpu_kernel void @v_cnd_nan(float addrspace(1)* %out, i32 %c, float %f) #0 {
  %setcc = icmp ne i32 %c, 0
  %select = select i1 %setcc, float 0xFFFFFFFFE0000000, float %f
  store float %select, float addrspace(1)* %out
  ret void
}

; Test different compare and select operand types for optimal code
; shrinking.
; (select (cmp (sgprX, constant)), constant, sgprZ)
;
; Ordered %x != 0 selects 1.0, otherwise %z; both floats are kernel arguments.
; The compare is checked in inverted form (nlg). SI/VI are expected to copy %z
; into a VGPR first, while GFX10 uses the SGPR directly in the e64 encoding.

; GCN-LABEL: {{^}}fcmp_sgprX_k0_select_k1_sgprZ_f32:
; GCN: s_load_{{dwordx2|b64}} s[[[X:[0-9]+]]:[[Z:[0-9]+]]], s[0:1], {{0x4c|0x13}}

; SIVI-DAG:  v_cmp_nlg_f32_e64 [[CC:vcc]], s[[X]], 0
; GFX10-DAG: v_cmp_nlg_f32_e64 [[CC:s\[[0-9:]+\]]], s[[X]], 0
; SIVI-DAG:  v_mov_b32_e32 [[VZ:v[0-9]+]], s[[Z]]
; SIVI:      v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[VZ]], [[CC]]
; GFX10:     v_cndmask_b32_e64 v{{[0-9]+}}, 1.0, s[[Z]], [[CC]]
define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_sgprZ_f32(float addrspace(1)* %out, [8 x i32], float %x, float %z) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %setcc = fcmp one float %x, 0.0
  %select = select i1 %setcc, float 1.0, float %z
  store float %select, float addrspace(1)* %out.gep
  ret void
}

; Ordered %x != 0 selects 1.0, otherwise %x itself: the compared value and the
; non-constant select operand are the same scalar kernel argument.
; GCN-LABEL: {{^}}fcmp_sgprX_k0_select_k1_sgprX_f32:
; GCN: s_load_{{dword|b32}} [[X:s[0-9]+]]
; SIVI-DAG:  v_cmp_nlg_f32_e64 [[CC:vcc]], [[X]], 0
; GFX10-DAG: v_cmp_nlg_f32_e64 [[CC:s\[[0-9:]+\]]], [[X]], 0
; SIVI-DAG:  v_mov_b32_e32 [[VX:v[0-9]+]], [[X]]
; SIVI:      v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[VX]], [[CC]]
; GFX10:     v_cndmask_b32_e64 v{{[0-9]+}}, 1.0, [[X]], [[CC]]
define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_sgprX_f32(float addrspace(1)* %out, float %x) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %setcc = fcmp one float %x, 0.0
  %select = select i1 %setcc, float 1.0, float %x
  store float %select, float addrspace(1)* %out.gep
  ret void
}

; Ordered %x != 0 selects 0.0, otherwise %z: the select constant equals the
; compare constant. Both floats are scalar kernel arguments.
; GCN-LABEL: {{^}}fcmp_sgprX_k0_select_k0_sgprZ_f32:
; GCN-DAG: s_load_{{dwordx2|b64}} s[[[X:[0-9]+]]:[[Z:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, {{0x13|0x4c}}
; SIVI-DAG:  v_cmp_nlg_f32_e64 [[CC:vcc]], s[[X]], 0
; GFX10-DAG: v_cmp_nlg_f32_e64 [[CC:s\[[0-9:]+\]]], s[[X]], 0
; SIVI-DAG:  v_mov_b32_e32 [[VZ:v[0-9]+]], s[[Z]]
; SIVI:      v_cndmask_b32_e32 v{{[0-9]+}}, 0, [[VZ]], [[CC]]
; GFX10:     v_cndmask_b32_e64 v{{[0-9]+}}, 0, s[[Z]], [[CC]]
define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_sgprZ_f32(float addrspace(1)* %out, [8 x i32], float %x, float %z) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %setcc = fcmp one float %x, 0.0
  %select = select i1 %setcc, float 0.0, float %z
  store float %select, float addrspace(1)* %out.gep
  ret void
}

; Ordered %x != 0 selects 0.0, otherwise %x: one scalar argument serves as
; both the compared value and the non-constant select operand.
; GCN-LABEL: {{^}}fcmp_sgprX_k0_select_k0_sgprX_f32:
; GCN: s_load_{{dword|b32}} [[X:s[0-9]+]]
; SIVI-DAG:  v_cmp_nlg_f32_e64 [[CC:vcc]], [[X]], 0
; GFX10-DAG: v_cmp_nlg_f32_e64 [[CC:s\[[0-9:]+\]]], [[X]], 0
; SIVI-DAG:  v_mov_b32_e32 [[VX:v[0-9]+]], [[X]]
; SIVI:      v_cndmask_b32_e32 v{{[0-9]+}}, 0, [[VX]], [[CC]]
; GFX10:     v_cndmask_b32_e64 v{{[0-9]+}}, 0, [[X]], [[CC]]
define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_sgprX_f32(float addrspace(1)* %out, float %x) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %setcc = fcmp one float %x, 0.0
  %select = select i1 %setcc, float 0.0, float %x
  store float %select, float addrspace(1)* %out.gep
  ret void
}

; Ordered %x != 0 selects 0.0, otherwise %z, with %x a scalar kernel argument
; and %z loaded from memory per workitem. One set of checks covers all targets
; because the condition register may be vcc or an SGPR pair.
; GCN-LABEL: {{^}}fcmp_sgprX_k0_select_k0_vgprZ_f32:
; GCN-DAG: s_load_{{dword|b32}} [[X:s[0-9]+]]
; GCN-DAG: {{buffer|flat|global}}_load_{{dword|b32}} [[Z:v[0-9]+]]
; GCN-DAG: v_cmp_nlg_f32_e64 [[COND:vcc|s\[[0-9]+:[0-9]+\]]], [[X]], 0
; GCN: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, 0, [[Z]], [[COND]]
define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_vgprZ_f32(float addrspace(1)* %out, float %x, float addrspace(1)* %z.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %z.gep = getelementptr inbounds float, float addrspace(1)* %z.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %z = load float, float addrspace(1)* %z.gep
  %setcc = fcmp one float %x, 0.0
  %select = select i1 %setcc, float 0.0, float %z
  store float %select, float addrspace(1)* %out.gep
  ret void
}

; Ordered %x != 0 selects 1.0, otherwise %z, with %x a scalar kernel argument
; and %z loaded from memory per workitem.
; GCN-LABEL: {{^}}fcmp_sgprX_k0_select_k1_vgprZ_f32:
; GCN-DAG: {{buffer|flat|global}}_load_{{dword|b32}} [[Z:v[0-9]+]]
; GCN-DAG: s_load_{{dword|b32}} [[X:s[0-9]+]]
; GCN-DAG: v_cmp_nlg_f32_e64 [[COND:vcc|s\[[0-9]+:[0-9]+\]]], [[X]], 0
; GCN: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, 1.0, [[Z]], [[COND]]
define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_vgprZ_f32(float addrspace(1)* %out, float %x, float addrspace(1)* %z.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %z.gep = getelementptr inbounds float, float addrspace(1)* %z.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %z = load float, float addrspace(1)* %z.gep
  %setcc = fcmp one float %x, 0.0
  %select = select i1 %setcc, float 1.0, float %z
  store float %select, float addrspace(1)* %out.gep
  ret void
}

; Ordered %x < 0 selects 1.0, otherwise %z, with %x loaded from memory and %z
; a scalar kernel argument. The compare is checked in inverted form (ngt with
; the constant as the first operand).
; GCN-LABEL: {{^}}fcmp_vgprX_k0_select_k1_sgprZ_f32:
; GCN-DAG: {{buffer|flat|global}}_load_{{dword|b32}} [[X:v[0-9]+]]
; GCN-DAG: s_load_{{dword|b32}} [[Z:s[0-9]+]]
; GCN-DAG: v_cmp_ngt_f32_e32 vcc, 0, [[X]]
; SIVI-DAG: v_mov_b32_e32 [[VZ:v[0-9]+]], [[Z]]
; SIVI:     v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[VZ]], vcc
; GFX10:    v_cndmask_b32_e64 v{{[0-9]+}}, 1.0, [[Z]], vcc
define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_sgprZ_f32(float addrspace(1)* %out, float addrspace(1)* %x.ptr, float %z) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %x.gep = getelementptr inbounds float, float addrspace(1)* %x.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %x = load float, float addrspace(1)* %x.gep
  %setcc = fcmp olt float %x, 0.0
  %select = select i1 %setcc, float 1.0, float %z
  store float %select, float addrspace(1)* %out.gep
  ret void
}

; Unordered-or-less %x < 0 selects 1.0, otherwise %z; both values come from
; volatile loads. The compare is checked in inverted form (le with the
; constant as the first operand).
; GCN-LABEL: {{^}}fcmp_vgprX_k0_select_k1_vgprZ_f32:
; GCN: {{buffer|flat|global}}_load_{{dword|b32}} [[X:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_{{dword|b32}} [[Z:v[0-9]+]]
; GCN: v_cmp_le_f32_e32 vcc, 0, [[X]]
; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[Z]], vcc
define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_vgprZ_f32(float addrspace(1)* %out, float addrspace(1)* %x.ptr, float addrspace(1)* %z.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %x.gep = getelementptr inbounds float, float addrspace(1)* %x.ptr, i64 %tid.ext
  %z.gep = getelementptr inbounds float, float addrspace(1)* %z.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %x = load volatile float, float addrspace(1)* %x.gep
  %z = load volatile float, float addrspace(1)* %z.gep
  %setcc = fcmp ult float %x, 0.0
  %select = select i1 %setcc, float 1.0, float %z
  store float %select, float addrspace(1)* %out.gep
  ret void
}

; Signed %x < 0 selects 2, otherwise %z; both i32 values come from volatile
; loads. The compare is checked in inverted form as -1 < %x.
; GCN-LABEL: {{^}}icmp_vgprX_k0_select_k1_vgprZ_i32:
; GCN: {{buffer|flat|global}}_load_{{dword|b32}} [[X:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_{{dword|b32}} [[Z:v[0-9]+]]
; GCN: v_cmp_lt_i32_e32 vcc, -1, [[X]]
; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 2, [[Z]], vcc
define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %x.ptr, i32 addrspace(1)* %z.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %x.gep = getelementptr inbounds i32, i32 addrspace(1)* %x.ptr, i64 %tid.ext
  %z.gep = getelementptr inbounds i32, i32 addrspace(1)* %z.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
  %x = load volatile i32, i32 addrspace(1)* %x.gep
  %z = load volatile i32, i32 addrspace(1)* %z.gep
  %setcc = icmp slt i32 %x, 0
  %select = select i1 %setcc, i32 2, i32 %z
  store i32 %select, i32 addrspace(1)* %out.gep
  ret void
}

; 64-bit variant: signed %x < 0 selects 2, otherwise %z. The checks expect a
; single inverted 64-bit compare feeding two v_cndmask instructions, with
; constant 0 paired with the high half of %z and constant 2 with the low half.
; GCN-LABEL: {{^}}icmp_vgprX_k0_select_k1_vgprZ_i64:
; GCN: {{buffer|flat|global}}_load_{{dwordx2|b64}} v[[[X_LO:[0-9]+]]:[[X_HI:[0-9]+]]]
; GCN-DAG: {{buffer|flat|global}}_load_{{dwordx2|b64}} v[[[Z_LO:[0-9]+]]:[[Z_HI:[0-9]+]]]
; GCN-DAG: v_cmp_lt_i64_e32 vcc, -1, v[[[X_LO]]:[[X_HI]]]
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v[[Z_HI]], vcc
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 2, v[[Z_LO]], vcc
define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %x.ptr, i64 addrspace(1)* %z.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %x.gep = getelementptr inbounds i64, i64 addrspace(1)* %x.ptr, i64 %tid.ext
  %z.gep = getelementptr inbounds i64, i64 addrspace(1)* %z.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds i64, i64 addrspace(1)* %out, i64 %tid.ext
  %x = load volatile i64, i64 addrspace(1)* %x.gep
  %z = load volatile i64, i64 addrspace(1)* %z.gep
  %setcc = icmp slt i64 %x, 0
  %select = select i1 %setcc, i64 2, i64 %z
  store i64 %select, i64 addrspace(1)* %out.gep
  ret void
}

; Scalar compare driving a <4 x float> select whose constant vector is already
; the false operand. The checks therefore expect the compare uninverted
; (ugt %x, 4.0 written as nge 4.0, %x) and one v_cndmask per element.
; GCN-LABEL: {{^}}fcmp_vgprX_k0_select_vgprZ_k1_v4f32:
; GCN: {{buffer|flat|global}}_load_{{dword|b32}} [[X:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_{{dword|b128}}

; GCN: v_cmp_nge_f32_e32 vcc, 4.0, [[X]]
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, vcc
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}}, vcc
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, -0.5, v{{[0-9]+}}, vcc
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}, vcc
define amdgpu_kernel void @fcmp_vgprX_k0_select_vgprZ_k1_v4f32(<4 x float> addrspace(1)* %out, float addrspace(1)* %x.ptr, <4 x float> addrspace(1)* %z.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %x.gep = getelementptr inbounds float, float addrspace(1)* %x.ptr, i64 %tid.ext
  %z.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %z.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %out, i64 %tid.ext
  %x = load volatile float, float addrspace(1)* %x.gep
  %z = load volatile <4 x float>, <4 x float> addrspace(1)* %z.gep
  %setcc = fcmp ugt float %x, 4.0
  %select = select i1 %setcc, <4 x float> %z, <4 x float> <float 1.0, float 2.0, float -0.5, float 4.0>
  store <4 x float> %select, <4 x float> addrspace(1)* %out.gep
  ret void
}

; Scalar compare driving a <4 x float> select whose constant vector is the
; true operand. The checks expect the compare inverted (ugt %x, 4.0 becomes
; ge 4.0, %x) so each element's constant is the first source of v_cndmask.
; GCN-LABEL: {{^}}fcmp_vgprX_k0_select_k1_vgprZ_v4f32:
; GCN: {{buffer|flat|global}}_load_{{dword|b32}} [[X:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_{{dword|b128}}

; GCN: v_cmp_ge_f32_e32 vcc, 4.0, [[X]]
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, vcc
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}}, vcc
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, -0.5, v{{[0-9]+}}, vcc
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}, vcc
define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_vgprZ_v4f32(<4 x float> addrspace(1)* %out, float addrspace(1)* %x.ptr, <4 x float> addrspace(1)* %z.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %x.gep = getelementptr inbounds float, float addrspace(1)* %x.ptr, i64 %tid.ext
  %z.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %z.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %out, i64 %tid.ext
  %x = load volatile float, float addrspace(1)* %x.gep
  %z = load volatile <4 x float>, <4 x float> addrspace(1)* %z.gep
  %setcc = fcmp ugt float %x, 4.0
  %select = select i1 %setcc, <4 x float> <float 1.0, float 2.0, float -0.5, float 4.0>, <4 x float> %z
  store <4 x float> %select, <4 x float> addrspace(1)* %out.gep
  ret void
}

; This must be swapped as a vector type before the condition has
; multiple uses.
;
; The constant is the left-hand compare operand here (ugt 4.0, %x); the checks
; expect the inverted form le 4.0, %x shared by all four v_cndmask
; instructions.

; GCN-LABEL: {{^}}fcmp_k0_vgprX_select_k1_vgprZ_v4f32:
; GCN: {{buffer|flat|global}}_load_{{dword|b32}} [[X:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_{{dword|b128}}

; GCN: v_cmp_le_f32_e32 vcc, 4.0, [[X]]
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, vcc
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}}, vcc
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, -0.5, v{{[0-9]+}}, vcc
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}, vcc
define amdgpu_kernel void @fcmp_k0_vgprX_select_k1_vgprZ_v4f32(<4 x float> addrspace(1)* %out, float addrspace(1)* %x.ptr, <4 x float> addrspace(1)* %z.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %x.gep = getelementptr inbounds float, float addrspace(1)* %x.ptr, i64 %tid.ext
  %z.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %z.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %out, i64 %tid.ext
  %x = load volatile float, float addrspace(1)* %x.gep
  %z = load volatile <4 x float>, <4 x float> addrspace(1)* %z.gep
  %setcc = fcmp ugt float 4.0, %x
  %select = select i1 %setcc, <4 x float> <float 1.0, float 2.0, float -0.5, float 4.0>, <4 x float> %z
  store <4 x float> %select, <4 x float> addrspace(1)* %out.gep
  ret void
}

; select (%x < 0), true, %z on i1 values. The checks expect the loaded byte to
; be masked to bit 0 and compared against 1, OR-ed with the signed compare,
; then materialized as 0/1 through v_cndmask_b32_e64 before the byte store.
; GCN-LABEL: {{^}}icmp_vgprX_k0_select_k1_vgprZ_i1:
; GCN: load_{{dword|b32}}
; GCN: load_{{ubyte|u8}}
; GCN-DAG: v_cmp_gt_i32_e32 vcc, 0, v
; GCN-DAG: v_and_b32_e32 v{{[0-9]+}}, 1,
; GCN-DAG: v_cmp_eq_u32_e64 s{{\[[0-9]+:[0-9]+\]}}, 1, v
; GCN-DAG: s_or_b64 s{{\[[0-9]+:[0-9]+\]}}, vcc, s{{\[[0-9]+:[0-9]+\]}}
; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1, s
; GCN: store_{{byte|b8}}
define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i1(i1 addrspace(1)* %out, i32 addrspace(1)* %x.ptr, i1 addrspace(1)* %z.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %x.gep = getelementptr inbounds i32, i32 addrspace(1)* %x.ptr, i64 %tid.ext
  %z.gep = getelementptr inbounds i1, i1 addrspace(1)* %z.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds i1, i1 addrspace(1)* %out, i64 %tid.ext
  %x = load volatile i32, i32 addrspace(1)* %x.gep
  %z = load volatile i1, i1 addrspace(1)* %z.gep
  %setcc = icmp slt i32 %x, 0
  %select = select i1 %setcc, i1 true, i1 %z
  store i1 %select, i1 addrspace(1)* %out.gep
  ret void
}

; Different types compared vs. selected
;
; An f32 compare (ult %x, 0.0, checked inverted as le 0, %x) selects between
; the double constant 1.0 and a loaded double. SI/VI are expected to
; materialize 0x3ff00000 in a VGPR, while GFX10 uses it as a literal operand.
; GCN-LABEL: {{^}}fcmp_vgprX_k0_selectf64_k1_vgprZ_f32:
; SIVI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3ff00000
; GCN-DAG: {{buffer|flat|global}}_load_{{dword|b32}} [[X:v[0-9]+]]
; GCN-DAG: {{buffer|flat|global}}_load_{{dwordx2|b64}}

; GCN: v_cmp_le_f32_e32 vcc, 0, [[X]]
; SIVI-DAG:  v_cndmask_b32_e32 v{{[0-9]+}}, [[K]], v{{[0-9]+}}, vcc
; GFX10-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 0x3ff00000, v{{[0-9]+}}, vcc
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}, vcc
define amdgpu_kernel void @fcmp_vgprX_k0_selectf64_k1_vgprZ_f32(double addrspace(1)* %out, float addrspace(1)* %x.ptr, double addrspace(1)* %z.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %x.gep = getelementptr inbounds float, float addrspace(1)* %x.ptr, i64 %tid.ext
  %z.gep = getelementptr inbounds double, double addrspace(1)* %z.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i64 %tid.ext
  %x = load volatile float, float addrspace(1)* %x.gep
  %z = load volatile double, double addrspace(1)* %z.gep
  %setcc = fcmp ult float %x, 0.0
  %select = select i1 %setcc, double 1.0, double %z
  store double %select, double addrspace(1)* %out.gep
  ret void
}

; Different types compared vs. selected
;
; An f32 compare (one %x, 0.0, checked inverted as nlg) selects between the
; i64 constant 3 and a loaded i64, giving one v_cndmask with 3 and one with 0.
; GCN-LABEL: {{^}}fcmp_vgprX_k0_selecti64_k1_vgprZ_f32:
; GCN: {{buffer|flat|global}}_load_{{dword|b32}} [[X:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_{{dwordx2|b64}}

; GCN: v_cmp_nlg_f32_e32 vcc, 0, [[X]]
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 3, v{{[0-9]+}}, vcc
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}, vcc
define amdgpu_kernel void @fcmp_vgprX_k0_selecti64_k1_vgprZ_f32(i64 addrspace(1)* %out, float addrspace(1)* %x.ptr, i64 addrspace(1)* %z.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %x.gep = getelementptr inbounds float, float addrspace(1)* %x.ptr, i64 %tid.ext
  %z.gep = getelementptr inbounds i64, i64 addrspace(1)* %z.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds i64, i64 addrspace(1)* %out, i64 %tid.ext
  %x = load volatile float, float addrspace(1)* %x.gep
  %z = load volatile i64, i64 addrspace(1)* %z.gep
  %setcc = fcmp one float %x, 0.0
  %select = select i1 %setcc, i64 3, i64 %z
  store i64 %select, i64 addrspace(1)* %out.gep
  ret void
}

; Different types compared vs. selected
;
; An unsigned i32 compare (ugt %x, 1, checked inverted as 2 > %x) selects
; between the float constant 4.0 and a loaded float.
; GCN-LABEL: {{^}}icmp_vgprX_k0_selectf32_k1_vgprZ_i32:
; GCN: {{buffer|flat|global}}_load_{{dword|b32}} [[X:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_{{dword|b32}} [[Z:v[0-9]+]]

; GCN: v_cmp_gt_u32_e32 vcc, 2, [[X]]
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 4.0, [[Z]], vcc
define amdgpu_kernel void @icmp_vgprX_k0_selectf32_k1_vgprZ_i32(float addrspace(1)* %out, i32 addrspace(1)* %x.ptr, float addrspace(1)* %z.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %x.gep = getelementptr inbounds i32, i32 addrspace(1)* %x.ptr, i64 %tid.ext
  %z.gep = getelementptr inbounds float, float addrspace(1)* %z.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %x = load volatile i32, i32 addrspace(1)* %x.gep
  %z = load volatile float, float addrspace(1)* %z.gep
  %setcc = icmp ugt i32 %x, 1
  %select = select i1 %setcc, float 4.0, float %z
  store float %select, float addrspace(1)* %out.gep
  ret void
}

; FIXME: Should be able to handle multiple uses
;
; One compare result feeds two selects. The checks currently expect the
; compare left uninverted (ugt 4.0, %x written as nle) with each constant as
; the second source of a v_cndmask_b32_e64. Both results are stored with
; volatile stores to the same address.

; GCN-LABEL: {{^}}fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2:
; GCN: {{buffer|flat|global}}_load_{{dword|b32}} [[X:v[0-9]+]]

; GCN: v_cmp_nle_f32_e32 vcc, 4.0, [[X]]
; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, -1.0, vcc
; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, -2.0, vcc
define amdgpu_kernel void @fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2(float addrspace(1)* %out, float addrspace(1)* %x.ptr, float addrspace(1)* %z.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %x.gep = getelementptr inbounds float, float addrspace(1)* %x.ptr, i64 %tid.ext
  %z.gep = getelementptr inbounds float, float addrspace(1)* %z.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
  %x = load volatile float, float addrspace(1)* %x.gep
  %z = load volatile float, float addrspace(1)* %z.gep
  %setcc = fcmp ugt float 4.0, %x
  %select0 = select i1 %setcc, float -1.0, float %z
  %select1 = select i1 %setcc, float -2.0, float %z
  store volatile float %select0, float addrspace(1)* %out.gep
  store volatile float %select1, float addrspace(1)* %out.gep
  ret void
}

; Source modifiers abs/neg only work for f32
;
; f16 case: select between fabs(%f) and fneg(%f). The check expects an e32
; v_cndmask with plain VGPR operands, i.e. no folded source modifiers.

; GCN-LABEL: {{^}}v_cndmask_abs_neg_f16:
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}},
define amdgpu_kernel void @v_cndmask_abs_neg_f16(half addrspace(1)* %out, i32 %c, half addrspace(1)* %fptr) #0 {
  %idx = call i32 @llvm.amdgcn.workitem.id.x() #1
  %f.gep = getelementptr half, half addrspace(1)* %fptr, i32 %idx
  %f = load half, half addrspace(1)* %f.gep
  %f.abs = call half @llvm.fabs.f16(half %f)
  %f.neg = fneg half %f
  %setcc = icmp ne i32 %c, 0
  %select = select i1 %setcc, half %f.abs, half %f.neg
  store half %select, half addrspace(1)* %out
  ret void
}

; f32 case: select between fabs(%f) and fneg(%f). The check expects both
; operations folded into the v_cndmask_b32_e64 operands as -v and |v|.
; GCN-LABEL: {{^}}v_cndmask_abs_neg_f32:
; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, -v{{[0-9]+}}, |v{{[0-9]+}}|,
define amdgpu_kernel void @v_cndmask_abs_neg_f32(float addrspace(1)* %out, i32 %c, float addrspace(1)* %fptr) #0 {
  %idx = call i32 @llvm.amdgcn.workitem.id.x() #1
  %f.gep = getelementptr float, float addrspace(1)* %fptr, i32 %idx
  %f = load float, float addrspace(1)* %f.gep
  %f.abs = call float @llvm.fabs.f32(float %f)
  %f.neg = fneg float %f
  %setcc = icmp ne i32 %c, 0
  %select = select i1 %setcc, float %f.abs, float %f.neg
  store float %select, float addrspace(1)* %out
  ret void
}

; f64 case: select between fabs(%f) and fneg(%f). The checks expect two e32
; v_cndmask instructions with plain VGPR operands (presumably one per 32-bit
; half of the double), i.e. no folded source modifiers.
; GCN-LABEL: {{^}}v_cndmask_abs_neg_f64:
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}},
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}},
define amdgpu_kernel void @v_cndmask_abs_neg_f64(double addrspace(1)* %out, i32 %c, double addrspace(1)* %fptr) #0 {
  %idx = call i32 @llvm.amdgcn.workitem.id.x() #1
  %f.gep = getelementptr double, double addrspace(1)* %fptr, i32 %idx
  %f = load double, double addrspace(1)* %f.gep
  %f.abs = call double @llvm.fabs.f64(double %f)
  %f.neg = fneg double %f
  %setcc = icmp ne i32 %c, 0
  %select = select i1 %setcc, double %f.abs, double %f.neg
  store double %select, double addrspace(1)* %out
  ret void
}

; Attribute groups: #0 is applied to every kernel definition, #1 to the
; workitem-id intrinsic declaration and its call sites.
attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
