1; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SIVI %s
2; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SIVI %s
3; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX10 %s
4
; Intrinsic declarations used by the kernels in this file: the x work-item id
; query, and fabs for half, float and double operands.
5declare i32 @llvm.amdgcn.workitem.id.x() #1
6declare half @llvm.fabs.f16(half)
7declare float @llvm.fabs.f32(float)
8declare double @llvm.fabs.f64(double)
9
10; GCN-LABEL: {{^}}v_cnd_nan_nosgpr:
11; GCN: s_cmp_eq_u32 s{{[0-9]+}}, 0
12; GCN: s_cselect_b64 [[COND:vcc|s\[[0-9]+:[0-9]+\]]], -1, 0
; The destination register pattern accepts any VGPR number (not only v0-v9),
; so the check does not depend on which register the allocator happens to pick.
13; GCN: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, -1, v{{[0-9]+}}, [[COND]]
14; GCN-DAG: v{{[0-9]}}
15; All nan values are converted to 0xffffffff
16; GCN: s_endpgm
; Kernel under test: loads a float from %fptr indexed by the x work-item id,
; then selects the NaN constant when the kernel argument %c is non-zero and
; the loaded value otherwise. The selected value is stored to %out.
;   %out  - global (addrspace 1) pointer receiving the result
;   %c    - i32 kernel argument tested against zero
;   %fptr - global pointer to the array of floats indexed by work-item id
17define amdgpu_kernel void @v_cnd_nan_nosgpr(float addrspace(1)* %out, i32 %c, float addrspace(1)* %fptr) #0 {
18  %idx = call i32 @llvm.amdgcn.workitem.id.x() #1
19  %f.gep = getelementptr float, float addrspace(1)* %fptr, i32 %idx
20  %f = load float, float addrspace(1)* %f.gep
21  %setcc = icmp ne i32 %c, 0
; The NaN literal below is what the checks above expect to appear as the
; immediate -1 operand of the conditional move.
22  %select = select i1 %setcc, float 0xFFFFFFFFE0000000, float %f
23  store float %select, float addrspace(1)* %out
24  ret void
25}
26
27
28; This requires slightly trickier SGPR operand legalization since the
29; single constant bus SGPR usage is the last operand, and it should
30; never be moved.
31; However, on GFX10 the constant bus limit is two scalar operands, not one.
32
33; GCN-LABEL: {{^}}v_cnd_nan:
34; SIVI:  s_cmp_eq_u32 s{{[0-9]+}}, 0
35; SIVI:  s_cselect_b64 vcc, -1, 0
36; SIVI:  v_cndmask_b32_e32 v{{[0-9]+}}, -1, v{{[0-9]+}}, vcc
37; GFX10: s_cmp_eq_u32 s{{[0-9]+}}, 0
38; GFX10: s_cselect_b64 [[CC:s\[[0-9:]+\]]],
39; GFX10: v_cndmask_b32_e64 v{{[0-9]+}}, -1, s{{[0-9]+}}, [[CC]]
40; GCN-DAG: v{{[0-9]}}
41; All nan values are converted to 0xffffffff
42; GCN: s_endpgm
; Kernel under test: selects the NaN constant when the kernel argument %c is
; non-zero and the kernel argument %f otherwise, then stores the result to
; %out. Unlike v_cnd_nan_nosgpr, the false operand is a kernel argument rather
; than a value loaded from memory.
43define amdgpu_kernel void @v_cnd_nan(float addrspace(1)* %out, i32 %c, float %f) #0 {
44  %setcc = icmp ne i32 %c, 0
45  %select = select i1 %setcc, float 0xFFFFFFFFE0000000, float %f
46  store float %select, float addrspace(1)* %out
47  ret void
48}
49
50; Test different compare and select operand types for optimal code
51; shrinking.
52; (select (cmp (sgprX, constant)), constant, sgprZ)
53
54; GCN-LABEL: {{^}}fcmp_sgprX_k0_select_k1_sgprZ_f32:
55; GCN: s_load_dwordx2 s{{\[}}[[X:[0-9]+]]:[[Z:[0-9]+]]{{\]}}, s[0:1], {{0x4c|0x13}}
56
57; SIVI-DAG:  v_cmp_nlg_f32_e64 [[CC:vcc]], s[[X]], 0
58; GFX10-DAG: v_cmp_nlg_f32_e64 [[CC:s\[[0-9:]+\]]], s[[X]], 0
59; SIVI-DAG:  v_mov_b32_e32 [[VZ:v[0-9]+]], s[[Z]]
60; SIVI:      v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[VZ]], [[CC]]
61; GFX10:     v_cndmask_b32_e64 v{{[0-9]+}}, 1.0, s[[Z]], [[CC]]
; Kernel under test: compares the kernel argument %x against 0.0 with an
; ordered not-equal compare and selects the constant 1.0 when it holds, or the
; kernel argument %z otherwise. The result is stored to %out indexed by the
; x work-item id.
; The unnamed [8 x i32] argument sits between %out and %x; presumably it is
; padding that fixes the argument offsets matched by the load check above
; (TODO confirm).
62define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_sgprZ_f32(float addrspace(1)* %out, [8 x i32], float %x, float %z) #0 {
63  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
64  %tid.ext = sext i32 %tid to i64
65  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
66  %setcc = fcmp one float %x, 0.0
67  %select = select i1 %setcc, float 1.0, float %z
68  store float %select, float addrspace(1)* %out.gep
69  ret void
70}
71
72; GCN-LABEL: {{^}}fcmp_sgprX_k0_select_k1_sgprX_f32:
73; GCN: s_load_dword [[X:s[0-9]+]]
74; SIVI-DAG:  v_cmp_nlg_f32_e64 [[CC:vcc]], [[X]], 0
75; GFX10-DAG: v_cmp_nlg_f32_e64 [[CC:s\[[0-9:]+\]]], [[X]], 0
76; SIVI-DAG:  v_mov_b32_e32 [[VX:v[0-9]+]], [[X]]
77; SIVI:      v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[VX]], [[CC]]
78; GFX10:     v_cndmask_b32_e64 v{{[0-9]+}}, 1.0, [[X]], [[CC]]
; Kernel under test: the kernel argument %x is used both as the compared value
; (ordered not-equal against 0.0) and as the false operand of the select; the
; true operand is the constant 1.0. The result is stored to %out indexed by
; the x work-item id.
79define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_sgprX_f32(float addrspace(1)* %out, float %x) #0 {
80  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
81  %tid.ext = sext i32 %tid to i64
82  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
83  %setcc = fcmp one float %x, 0.0
84  %select = select i1 %setcc, float 1.0, float %x
85  store float %select, float addrspace(1)* %out.gep
86  ret void
87}
88
89; GCN-LABEL: {{^}}fcmp_sgprX_k0_select_k0_sgprZ_f32:
90; GCN-DAG: s_load_dwordx2 s{{\[}}[[X:[0-9]+]]:[[Z:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0x13|0x4c}}
91; SIVI-DAG:  v_cmp_nlg_f32_e64 [[CC:vcc]], s[[X]], 0
92; GFX10-DAG: v_cmp_nlg_f32_e64 [[CC:s\[[0-9:]+\]]], s[[X]], 0
93; SIVI-DAG:  v_mov_b32_e32 [[VZ:v[0-9]+]], s[[Z]]
94; SIVI:      v_cndmask_b32_e32 v{{[0-9]+}}, 0, [[VZ]], [[CC]]
95; GFX10:     v_cndmask_b32_e64 v{{[0-9]+}}, 0, s[[Z]], [[CC]]
; Kernel under test: compares the kernel argument %x against 0.0 (ordered
; not-equal) and selects the constant 0.0 when it holds, or the kernel
; argument %z otherwise. Here the compare constant and the select constant are
; the same value. The result is stored to %out indexed by the x work-item id.
96define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_sgprZ_f32(float addrspace(1)* %out, [8 x i32], float %x, float %z) #0 {
97  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
98  %tid.ext = sext i32 %tid to i64
99  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
100  %setcc = fcmp one float %x, 0.0
101  %select = select i1 %setcc, float 0.0, float %z
102  store float %select, float addrspace(1)* %out.gep
103  ret void
104}
105
106; GCN-LABEL: {{^}}fcmp_sgprX_k0_select_k0_sgprX_f32:
107; GCN: s_load_dword [[X:s[0-9]+]]
108; SIVI-DAG:  v_cmp_nlg_f32_e64 [[CC:vcc]], [[X]], 0
109; GFX10-DAG: v_cmp_nlg_f32_e64 [[CC:s\[[0-9:]+\]]], [[X]], 0
110; SIVI-DAG:  v_mov_b32_e32 [[VX:v[0-9]+]], [[X]]
111; SIVI:      v_cndmask_b32_e32 v{{[0-9]+}}, 0, [[VX]], [[CC]]
112; GFX10:     v_cndmask_b32_e64 v{{[0-9]+}}, 0, [[X]], [[CC]]
; Kernel under test: the kernel argument %x is compared against 0.0 (ordered
; not-equal) and is also the false operand of the select; the true operand is
; the constant 0.0, the same value used in the compare. The result is stored
; to %out indexed by the x work-item id.
113define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_sgprX_f32(float addrspace(1)* %out, float %x) #0 {
114  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
115  %tid.ext = sext i32 %tid to i64
116  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
117  %setcc = fcmp one float %x, 0.0
118  %select = select i1 %setcc, float 0.0, float %x
119  store float %select, float addrspace(1)* %out.gep
120  ret void
121}
122
123; GCN-LABEL: {{^}}fcmp_sgprX_k0_select_k0_vgprZ_f32:
124; GCN-DAG: s_load_dword [[X:s[0-9]+]]
125; GCN-DAG: {{buffer|flat|global}}_load_dword [[Z:v[0-9]+]]
126; GCN-DAG: v_cmp_nlg_f32_e64 [[COND:vcc|s\[[0-9]+:[0-9]+\]]], [[X]], 0
127; GCN: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, 0, [[Z]], [[COND]]
; Kernel under test: compares the kernel argument %x against 0.0 (ordered
; not-equal) and selects the constant 0.0 when it holds, or a float loaded
; from %z.ptr otherwise. Both the load from %z.ptr and the store to %out are
; indexed by the x work-item id.
128define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_vgprZ_f32(float addrspace(1)* %out, float %x, float addrspace(1)* %z.ptr) #0 {
129  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
130  %tid.ext = sext i32 %tid to i64
131  %z.gep = getelementptr inbounds float, float addrspace(1)* %z.ptr, i64 %tid.ext
132  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
133  %z = load float, float addrspace(1)* %z.gep
134  %setcc = fcmp one float %x, 0.0
135  %select = select i1 %setcc, float 0.0, float %z
136  store float %select, float addrspace(1)* %out.gep
137  ret void
138}
139
140; GCN-LABEL: {{^}}fcmp_sgprX_k0_select_k1_vgprZ_f32:
141; GCN-DAG: {{buffer|flat|global}}_load_dword [[Z:v[0-9]+]]
142; GCN-DAG: s_load_dword [[X:s[0-9]+]]
143; GCN-DAG: v_cmp_nlg_f32_e64 [[COND:vcc|s\[[0-9]+:[0-9]+\]]], [[X]], 0
144; GCN: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, 1.0, [[Z]], [[COND]]
; Kernel under test: compares the kernel argument %x against 0.0 (ordered
; not-equal) and selects the constant 1.0 when it holds, or a float loaded
; from %z.ptr otherwise. Both the load from %z.ptr and the store to %out are
; indexed by the x work-item id.
145define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_vgprZ_f32(float addrspace(1)* %out, float %x, float addrspace(1)* %z.ptr) #0 {
146  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
147  %tid.ext = sext i32 %tid to i64
148  %z.gep = getelementptr inbounds float, float addrspace(1)* %z.ptr, i64 %tid.ext
149  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
150  %z = load float, float addrspace(1)* %z.gep
151  %setcc = fcmp one float %x, 0.0
152  %select = select i1 %setcc, float 1.0, float %z
153  store float %select, float addrspace(1)* %out.gep
154  ret void
155}
156
157; GCN-LABEL: {{^}}fcmp_vgprX_k0_select_k1_sgprZ_f32:
158; GCN-DAG: {{buffer|flat|global}}_load_dword [[X:v[0-9]+]]
159; GCN-DAG: s_load_dword [[Z:s[0-9]+]]
160; GCN-DAG: v_cmp_ngt_f32_e32 vcc, 0, [[X]]
161; SIVI-DAG: v_mov_b32_e32 [[VZ:v[0-9]+]], [[Z]]
162; SIVI:     v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[VZ]], vcc
163; GFX10:    v_cndmask_b32_e64 v{{[0-9]+}}, 1.0, [[Z]], vcc
; Kernel under test: loads %x from %x.ptr (indexed by the x work-item id),
; compares it against 0.0 with an ordered less-than compare, and selects the
; constant 1.0 when it holds or the kernel argument %z otherwise. The result
; is stored to %out at the same index.
164define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_sgprZ_f32(float addrspace(1)* %out, float addrspace(1)* %x.ptr, float %z) #0 {
165  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
166  %tid.ext = sext i32 %tid to i64
167  %x.gep = getelementptr inbounds float, float addrspace(1)* %x.ptr, i64 %tid.ext
168  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
169  %x = load float, float addrspace(1)* %x.gep
170  %setcc = fcmp olt float %x, 0.0
171  %select = select i1 %setcc, float 1.0, float %z
172  store float %select, float addrspace(1)* %out.gep
173  ret void
174}
175
176; GCN-LABEL: {{^}}fcmp_vgprX_k0_select_k1_vgprZ_f32:
177; GCN: {{buffer|flat|global}}_load_dword [[X:v[0-9]+]]
178; GCN: {{buffer|flat|global}}_load_dword [[Z:v[0-9]+]]
179; GCN: v_cmp_le_f32_e32 vcc, 0, [[X]]
180; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[Z]], vcc
; Kernel under test: loads %x and %z from memory (both indexed by the x
; work-item id), compares %x against 0.0 with an unordered-or-less-than
; compare, and selects the constant 1.0 when it holds or %z otherwise. The
; result is stored to %out at the same index.
181define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_vgprZ_f32(float addrspace(1)* %out, float addrspace(1)* %x.ptr, float addrspace(1)* %z.ptr) #0 {
182  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
183  %tid.ext = sext i32 %tid to i64
184  %x.gep = getelementptr inbounds float, float addrspace(1)* %x.ptr, i64 %tid.ext
185  %z.gep = getelementptr inbounds float, float addrspace(1)* %z.ptr, i64 %tid.ext
186  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
; Both loads are volatile; presumably this pins the X-then-Z load order that
; the ordered checks above rely on (TODO confirm).
187  %x = load volatile float, float addrspace(1)* %x.gep
188  %z = load volatile float, float addrspace(1)* %z.gep
189  %setcc = fcmp ult float %x, 0.0
190  %select = select i1 %setcc, float 1.0, float %z
191  store float %select, float addrspace(1)* %out.gep
192  ret void
193}
194
195; GCN-LABEL: {{^}}icmp_vgprX_k0_select_k1_vgprZ_i32:
196; GCN: {{buffer|flat|global}}_load_dword [[X:v[0-9]+]]
197; GCN: {{buffer|flat|global}}_load_dword [[Z:v[0-9]+]]
198; GCN: v_cmp_lt_i32_e32 vcc, -1, [[X]]
199; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 2, [[Z]], vcc
; Kernel under test: integer variant. Loads i32 %x and %z from memory (both
; indexed by the x work-item id), tests %x for signed less-than zero, and
; selects the constant 2 when it holds or %z otherwise. The result is stored
; to %out at the same index.
200define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %x.ptr, i32 addrspace(1)* %z.ptr) #0 {
201  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
202  %tid.ext = sext i32 %tid to i64
203  %x.gep = getelementptr inbounds i32, i32 addrspace(1)* %x.ptr, i64 %tid.ext
204  %z.gep = getelementptr inbounds i32, i32 addrspace(1)* %z.ptr, i64 %tid.ext
205  %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
206  %x = load volatile i32, i32 addrspace(1)* %x.gep
207  %z = load volatile i32, i32 addrspace(1)* %z.gep
208  %setcc = icmp slt i32 %x, 0
209  %select = select i1 %setcc, i32 2, i32 %z
210  store i32 %select, i32 addrspace(1)* %out.gep
211  ret void
212}
213
214; GCN-LABEL: {{^}}icmp_vgprX_k0_select_k1_vgprZ_i64:
215; GCN: {{buffer|flat|global}}_load_dwordx2 v{{\[}}[[X_LO:[0-9]+]]:[[X_HI:[0-9]+]]{{\]}}
216; GCN-DAG: {{buffer|flat|global}}_load_dwordx2 v{{\[}}[[Z_LO:[0-9]+]]:[[Z_HI:[0-9]+]]{{\]}}
217; GCN-DAG: v_cmp_lt_i64_e32 vcc, -1, v{{\[}}[[X_LO]]:[[X_HI]]{{\]}}
218; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v[[Z_HI]], vcc
219; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 2, v[[Z_LO]], vcc
; Kernel under test: 64-bit integer variant. Loads i64 %x and %z from memory
; (both indexed by the x work-item id), tests %x for signed less-than zero,
; and selects the constant 2 when it holds or %z otherwise. The checks above
; expect the 64-bit select to appear as two 32-bit conditional moves, one per
; half of %z.
220define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %x.ptr, i64 addrspace(1)* %z.ptr) #0 {
221  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
222  %tid.ext = sext i32 %tid to i64
223  %x.gep = getelementptr inbounds i64, i64 addrspace(1)* %x.ptr, i64 %tid.ext
224  %z.gep = getelementptr inbounds i64, i64 addrspace(1)* %z.ptr, i64 %tid.ext
225  %out.gep = getelementptr inbounds i64, i64 addrspace(1)* %out, i64 %tid.ext
226  %x = load volatile i64, i64 addrspace(1)* %x.gep
227  %z = load volatile i64, i64 addrspace(1)* %z.gep
228  %setcc = icmp slt i64 %x, 0
229  %select = select i1 %setcc, i64 2, i64 %z
230  store i64 %select, i64 addrspace(1)* %out.gep
231  ret void
232}
233
234; GCN-LABEL: {{^}}fcmp_vgprX_k0_select_vgprZ_k1_v4f32:
235; GCN: {{buffer|flat|global}}_load_dword [[X:v[0-9]+]]
236; GCN: {{buffer|flat|global}}_load_dwordx4
237
238; GCN: v_cmp_nge_f32_e32 vcc, 4.0, [[X]]
239; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, vcc
240; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}}, vcc
241; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, -0.5, v{{[0-9]+}}, vcc
242; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}, vcc
; Kernel under test: a single scalar compare drives a <4 x float> select.
; Loads float %x and vector %z (both indexed by the x work-item id), compares
; %x against 4.0 with an unordered-or-greater-than compare, and selects %z
; when it holds or the constant vector <1.0, 2.0, -0.5, 4.0> otherwise. In
; this variant the constant vector is the FALSE operand of the select.
243define amdgpu_kernel void @fcmp_vgprX_k0_select_vgprZ_k1_v4f32(<4 x float> addrspace(1)* %out, float addrspace(1)* %x.ptr, <4 x float> addrspace(1)* %z.ptr) #0 {
244  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
245  %tid.ext = sext i32 %tid to i64
246  %x.gep = getelementptr inbounds float, float addrspace(1)* %x.ptr, i64 %tid.ext
247  %z.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %z.ptr, i64 %tid.ext
248  %out.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %out, i64 %tid.ext
249  %x = load volatile float, float addrspace(1)* %x.gep
250  %z = load volatile <4 x float>, <4 x float> addrspace(1)* %z.gep
251  %setcc = fcmp ugt float %x, 4.0
252  %select = select i1 %setcc, <4 x float> %z, <4 x float> <float 1.0, float 2.0, float -0.5, float 4.0>
253  store <4 x float> %select, <4 x float> addrspace(1)* %out.gep
254  ret void
255}
256
257; GCN-LABEL: {{^}}fcmp_vgprX_k0_select_k1_vgprZ_v4f32:
258; GCN: {{buffer|flat|global}}_load_dword [[X:v[0-9]+]]
259; GCN: {{buffer|flat|global}}_load_dwordx4
260
261; GCN: v_cmp_ge_f32_e32 vcc, 4.0, [[X]]
262; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, vcc
263; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}}, vcc
264; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, -0.5, v{{[0-9]+}}, vcc
265; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}, vcc
; Kernel under test: a single scalar compare drives a <4 x float> select.
; Loads float %x and vector %z (both indexed by the x work-item id), compares
; %x against 4.0 with an unordered-or-greater-than compare, and selects the
; constant vector <1.0, 2.0, -0.5, 4.0> when it holds or %z otherwise. In
; this variant the constant vector is the TRUE operand of the select.
266define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_vgprZ_v4f32(<4 x float> addrspace(1)* %out, float addrspace(1)* %x.ptr, <4 x float> addrspace(1)* %z.ptr) #0 {
267  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
268  %tid.ext = sext i32 %tid to i64
269  %x.gep = getelementptr inbounds float, float addrspace(1)* %x.ptr, i64 %tid.ext
270  %z.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %z.ptr, i64 %tid.ext
271  %out.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %out, i64 %tid.ext
272  %x = load volatile float, float addrspace(1)* %x.gep
273  %z = load volatile <4 x float>, <4 x float> addrspace(1)* %z.gep
274  %setcc = fcmp ugt float %x, 4.0
275  %select = select i1 %setcc, <4 x float> <float 1.0, float 2.0, float -0.5, float 4.0>, <4 x float> %z
276  store <4 x float> %select, <4 x float> addrspace(1)* %out.gep
277  ret void
278}
279
280; This must be swapped as a vector type before the condition has
281; multiple uses.
282
283; GCN-LABEL: {{^}}fcmp_k0_vgprX_select_k1_vgprZ_v4f32:
284; GCN: {{buffer|flat|global}}_load_dword [[X:v[0-9]+]]
285; GCN: {{buffer|flat|global}}_load_dwordx4
286
287; GCN: v_cmp_le_f32_e32 vcc, 4.0, [[X]]
288; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, vcc
289; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}}, vcc
290; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, -0.5, v{{[0-9]+}}, vcc
291; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}, vcc
; Kernel under test: same shape as fcmp_vgprX_k0_select_k1_vgprZ_v4f32, but
; the compare has the constant 4.0 as its FIRST operand and the loaded %x as
; its second. The constant vector <1.0, 2.0, -0.5, 4.0> is selected when the
; unordered-or-greater-than compare holds, %z otherwise; the result is stored
; to %out indexed by the x work-item id.
292define amdgpu_kernel void @fcmp_k0_vgprX_select_k1_vgprZ_v4f32(<4 x float> addrspace(1)* %out, float addrspace(1)* %x.ptr, <4 x float> addrspace(1)* %z.ptr) #0 {
293  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
294  %tid.ext = sext i32 %tid to i64
295  %x.gep = getelementptr inbounds float, float addrspace(1)* %x.ptr, i64 %tid.ext
296  %z.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %z.ptr, i64 %tid.ext
297  %out.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %out, i64 %tid.ext
298  %x = load volatile float, float addrspace(1)* %x.gep
299  %z = load volatile <4 x float>, <4 x float> addrspace(1)* %z.gep
300  %setcc = fcmp ugt float 4.0, %x
301  %select = select i1 %setcc, <4 x float> <float 1.0, float 2.0, float -0.5, float 4.0>, <4 x float> %z
302  store <4 x float> %select, <4 x float> addrspace(1)* %out.gep
303  ret void
304}
305
306; GCN-LABEL: {{^}}icmp_vgprX_k0_select_k1_vgprZ_i1:
307; GCN: load_dword
308; GCN: load_ubyte
309; GCN-DAG: v_cmp_gt_i32_e32 vcc, 0, v
310; GCN-DAG: v_and_b32_e32 v{{[0-9]+}}, 1,
311; GCN-DAG: v_cmp_eq_u32_e64 s{{\[[0-9]+:[0-9]+\]}}, 1, v
312; GCN-DAG: s_or_b64 s{{\[[0-9]+:[0-9]+\]}}, vcc, s{{\[[0-9]+:[0-9]+\]}}
313; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1, s
314; GCN: store_byte
; Kernel under test: the selected type is i1. Loads i32 %x and i1 %z (both
; indexed by the x work-item id), tests %x for signed less-than zero, and
; selects true when it holds or %z otherwise. The checks above expect this to
; appear as an OR of two lane masks followed by a conditional move of 0/1,
; rather than a conditional move with a constant operand.
315define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i1(i1 addrspace(1)* %out, i32 addrspace(1)* %x.ptr, i1 addrspace(1)* %z.ptr) #0 {
316  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
317  %tid.ext = sext i32 %tid to i64
318  %x.gep = getelementptr inbounds i32, i32 addrspace(1)* %x.ptr, i64 %tid.ext
319  %z.gep = getelementptr inbounds i1, i1 addrspace(1)* %z.ptr, i64 %tid.ext
320  %out.gep = getelementptr inbounds i1, i1 addrspace(1)* %out, i64 %tid.ext
321  %x = load volatile i32, i32 addrspace(1)* %x.gep
322  %z = load volatile i1, i1 addrspace(1)* %z.gep
323  %setcc = icmp slt i32 %x, 0
324  %select = select i1 %setcc, i1 true, i1 %z
325  store i1 %select, i1 addrspace(1)* %out.gep
326  ret void
327}
328
329; Different types compared vs. selected
330; GCN-LABEL: {{^}}fcmp_vgprX_k0_selectf64_k1_vgprZ_f32:
331; SIVI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3ff00000
332; GCN-DAG: {{buffer|flat|global}}_load_dword [[X:v[0-9]+]]
333; GCN-DAG: {{buffer|flat|global}}_load_dwordx2
334
335; GCN: v_cmp_le_f32_e32 vcc, 0, [[X]]
336; SIVI-DAG:  v_cndmask_b32_e32 v{{[0-9]+}}, [[K]], v{{[0-9]+}}, vcc
337; GFX10-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 0x3ff00000, v{{[0-9]+}}, vcc
338; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}, vcc
; Kernel under test: the compared type (float) differs from the selected type
; (double). Loads float %x and double %z (both indexed by the x work-item id),
; compares %x against 0.0 with an unordered-or-less-than compare, and selects
; the double constant 1.0 when it holds or %z otherwise. The result is stored
; to %out at the same index.
339define amdgpu_kernel void @fcmp_vgprX_k0_selectf64_k1_vgprZ_f32(double addrspace(1)* %out, float addrspace(1)* %x.ptr, double addrspace(1)* %z.ptr) #0 {
340  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
341  %tid.ext = sext i32 %tid to i64
342  %x.gep = getelementptr inbounds float, float addrspace(1)* %x.ptr, i64 %tid.ext
343  %z.gep = getelementptr inbounds double, double addrspace(1)* %z.ptr, i64 %tid.ext
344  %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i64 %tid.ext
345  %x = load volatile float, float addrspace(1)* %x.gep
346  %z = load volatile double, double addrspace(1)* %z.gep
347  %setcc = fcmp ult float %x, 0.0
348  %select = select i1 %setcc, double 1.0, double %z
349  store double %select, double addrspace(1)* %out.gep
350  ret void
351}
352
353; Different types compared vs. selected
354; GCN-LABEL: {{^}}fcmp_vgprX_k0_selecti64_k1_vgprZ_f32:
355; GCN: {{buffer|flat|global}}_load_dword [[X:v[0-9]+]]
356; GCN: {{buffer|flat|global}}_load_dwordx2
357
358; GCN: v_cmp_nlg_f32_e32 vcc, 0, [[X]]
359; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 3, v{{[0-9]+}}, vcc
360; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}, vcc
; Kernel under test: a float compare drives an i64 select. Loads float %x and
; i64 %z (both indexed by the x work-item id), compares %x against 0.0 with
; an ordered not-equal compare, and selects the constant 3 when it holds or
; %z otherwise. The result is stored to %out at the same index.
361define amdgpu_kernel void @fcmp_vgprX_k0_selecti64_k1_vgprZ_f32(i64 addrspace(1)* %out, float addrspace(1)* %x.ptr, i64 addrspace(1)* %z.ptr) #0 {
362  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
363  %tid.ext = sext i32 %tid to i64
364  %x.gep = getelementptr inbounds float, float addrspace(1)* %x.ptr, i64 %tid.ext
365  %z.gep = getelementptr inbounds i64, i64 addrspace(1)* %z.ptr, i64 %tid.ext
366  %out.gep = getelementptr inbounds i64, i64 addrspace(1)* %out, i64 %tid.ext
367  %x = load volatile float, float addrspace(1)* %x.gep
368  %z = load volatile i64, i64 addrspace(1)* %z.gep
369  %setcc = fcmp one float %x, 0.0
370  %select = select i1 %setcc, i64 3, i64 %z
371  store i64 %select, i64 addrspace(1)* %out.gep
372  ret void
373}
374
375; Different types compared vs. selected
376; GCN-LABEL: {{^}}icmp_vgprX_k0_selectf32_k1_vgprZ_i32:
377; GCN: {{buffer|flat|global}}_load_dword [[X:v[0-9]+]]
378; GCN: {{buffer|flat|global}}_load_dword [[Z:v[0-9]+]]
379
380; GCN: v_cmp_gt_u32_e32 vcc, 2, [[X]]
381; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 4.0, [[Z]], vcc
; Kernel under test: an integer compare drives a float select. Loads i32 %x
; and float %z (both indexed by the x work-item id), tests %x for unsigned
; greater-than 1, and selects the constant 4.0 when it holds or %z otherwise.
; The result is stored to %out at the same index.
382define amdgpu_kernel void @icmp_vgprX_k0_selectf32_k1_vgprZ_i32(float addrspace(1)* %out, i32 addrspace(1)* %x.ptr, float addrspace(1)* %z.ptr) #0 {
383  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
384  %tid.ext = sext i32 %tid to i64
385  %x.gep = getelementptr inbounds i32, i32 addrspace(1)* %x.ptr, i64 %tid.ext
386  %z.gep = getelementptr inbounds float, float addrspace(1)* %z.ptr, i64 %tid.ext
387  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
388  %x = load volatile i32, i32 addrspace(1)* %x.gep
389  %z = load volatile float, float addrspace(1)* %z.gep
390  %setcc = icmp ugt i32 %x, 1
391  %select = select i1 %setcc, float 4.0, float %z
392  store float %select, float addrspace(1)* %out.gep
393  ret void
394}
395
396; FIXME: Should be able to handle multiple uses
397
398; GCN-LABEL: {{^}}fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2:
399; GCN: {{buffer|flat|global}}_load_dword [[X:v[0-9]+]]
400
401; GCN: v_cmp_nle_f32_e32 vcc, 4.0, [[X]]
402; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, -1.0, vcc
403; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, -2.0, vcc
; Kernel under test: one compare result feeds two selects. Loads float %x and
; %z (both indexed by the x work-item id) and compares the constant 4.0
; against %x with an unordered-or-greater-than compare. The first select
; yields -1.0 when the compare holds, the second yields -2.0; both yield %z
; otherwise.
404define amdgpu_kernel void @fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2(float addrspace(1)* %out, float addrspace(1)* %x.ptr, float addrspace(1)* %z.ptr) #0 {
405  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
406  %tid.ext = sext i32 %tid to i64
407  %x.gep = getelementptr inbounds float, float addrspace(1)* %x.ptr, i64 %tid.ext
408  %z.gep = getelementptr inbounds float, float addrspace(1)* %z.ptr, i64 %tid.ext
409  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
410  %x = load volatile float, float addrspace(1)* %x.gep
411  %z = load volatile float, float addrspace(1)* %z.gep
412  %setcc = fcmp ugt float 4.0, %x
413  %select0 = select i1 %setcc, float -1.0, float %z
414  %select1 = select i1 %setcc, float -2.0, float %z
; Both results go to the same address through volatile stores; presumably
; volatile is what keeps the first store (and so both selects) from being
; removed as dead (TODO confirm).
415  store volatile float %select0, float addrspace(1)* %out.gep
416  store volatile float %select1, float addrspace(1)* %out.gep
417  ret void
418}
419
420; Source modifiers abs/neg only work for f32
421
422; GCN-LABEL: {{^}}v_cndmask_abs_neg_f16:
423; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}},
; Kernel under test: half-precision variant. Loads a half from %fptr indexed
; by the x work-item id, computes both its absolute value and its negation,
; and selects the absolute value when the kernel argument %c is non-zero or
; the negation otherwise. The result is stored to %out (not indexed).
; The check above expects a conditional move with plain register operands,
; i.e. without abs/neg source modifiers.
424define amdgpu_kernel void @v_cndmask_abs_neg_f16(half addrspace(1)* %out, i32 %c, half addrspace(1)* %fptr) #0 {
425  %idx = call i32 @llvm.amdgcn.workitem.id.x() #1
426  %f.gep = getelementptr half, half addrspace(1)* %fptr, i32 %idx
427  %f = load half, half addrspace(1)* %f.gep
428  %f.abs = call half @llvm.fabs.f16(half %f)
429  %f.neg = fneg half %f
430  %setcc = icmp ne i32 %c, 0
431  %select = select i1 %setcc, half %f.abs, half %f.neg
432  store half %select, half addrspace(1)* %out
433  ret void
434}
435
436; GCN-LABEL: {{^}}v_cndmask_abs_neg_f32:
437; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, -v{{[0-9]+}}, |v{{[0-9]+}}|,
; Kernel under test: single-precision variant. Loads a float from %fptr
; indexed by the x work-item id, computes both its absolute value and its
; negation, and selects the absolute value when the kernel argument %c is
; non-zero or the negation otherwise. The result is stored to %out (not
; indexed).
; The check above expects the negation and absolute value to appear as source
; modifiers on the operands of a single conditional move.
438define amdgpu_kernel void @v_cndmask_abs_neg_f32(float addrspace(1)* %out, i32 %c, float addrspace(1)* %fptr) #0 {
439  %idx = call i32 @llvm.amdgcn.workitem.id.x() #1
440  %f.gep = getelementptr float, float addrspace(1)* %fptr, i32 %idx
441  %f = load float, float addrspace(1)* %f.gep
442  %f.abs = call float @llvm.fabs.f32(float %f)
443  %f.neg = fneg float %f
444  %setcc = icmp ne i32 %c, 0
445  %select = select i1 %setcc, float %f.abs, float %f.neg
446  store float %select, float addrspace(1)* %out
447  ret void
448}
449
450; GCN-LABEL: {{^}}v_cndmask_abs_neg_f64:
451; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}},
452; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}},
; Kernel under test: double-precision variant. Loads a double from %fptr
; indexed by the x work-item id, computes both its absolute value and its
; negation, and selects the absolute value when the kernel argument %c is
; non-zero or the negation otherwise. The result is stored to %out (not
; indexed).
; The checks above expect two conditional moves with plain register operands,
; i.e. without abs/neg source modifiers.
453define amdgpu_kernel void @v_cndmask_abs_neg_f64(double addrspace(1)* %out, i32 %c, double addrspace(1)* %fptr) #0 {
454  %idx = call i32 @llvm.amdgcn.workitem.id.x() #1
455  %f.gep = getelementptr double, double addrspace(1)* %fptr, i32 %idx
456  %f = load double, double addrspace(1)* %f.gep
457  %f.abs = call double @llvm.fabs.f64(double %f)
458  %f.neg = fneg double %f
459  %setcc = icmp ne i32 %c, 0
460  %select = select i1 %setcc, double %f.abs, double %f.neg
461  store double %select, double addrspace(1)* %out
462  ret void
463}
464
; Attribute groups referenced in this file: #0 is attached to the kernel
; definitions, #1 to the work-item id intrinsic declaration and its call sites.
465attributes #0 = { nounwind }
466attributes #1 = { nounwind readnone }
467