; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI,GFX678 %s
; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX89,GFX678 %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX89,GFX9 %s

; Basic f32 clamp idiom: minnum(maxnum(%a, 0.0), 1.0) on a loaded value.
; The checks expect the pair to be selected as one v_max_f32 of the loaded
; register with itself, carrying the clamp modifier.
; GCN-LABEL: {{^}}v_clamp_f32:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}}
define amdgpu_kernel void @v_clamp_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %a = load float, float addrspace(1)* %gep0
  %max = call float @llvm.maxnum.f32(float %a, float 0.0)
  %med = call float @llvm.minnum.f32(float %max, float 1.0)

  store float %med, float addrspace(1)* %out.gep
  ret void
}

; Clamp of a negated f32 input. The checks expect the fneg to be folded into
; both source operands of the clamping v_max_f32 as a neg modifier.
; GCN-LABEL: {{^}}v_clamp_neg_f32:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: v_max_f32_e64 v{{[0-9]+}}, -[[A]], -[[A]] clamp{{$}}
define amdgpu_kernel void @v_clamp_neg_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %a = load float, float addrspace(1)* %gep0
  %fneg.a = fneg float %a
  %max = call float @llvm.maxnum.f32(float %fneg.a, float 0.0)
  %med = call float @llvm.minnum.f32(float %max, float 1.0)

  store float %med, float addrspace(1)* %out.gep
  ret void
}

; Clamp of -|%a|. The checks expect both the fabs and the fneg to be folded
; into the source modifiers of the clamping v_max_f32.
; GCN-LABEL: {{^}}v_clamp_negabs_f32:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: v_max_f32_e64 v{{[0-9]+}}, -|[[A]]|, -|[[A]]| clamp{{$}}
define amdgpu_kernel void @v_clamp_negabs_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %a = load float, float addrspace(1)* %gep0
  %fabs.a = call float @llvm.fabs.f32(float %a)
  %fneg.fabs.a = fneg float %fabs.a

  %max = call float @llvm.maxnum.f32(float %fneg.fabs.a, float 0.0)
  %med = call float @llvm.minnum.f32(float %max, float 1.0)

  store float %med, float addrspace(1)* %out.gep
  ret void
}

; Lower bound is -0.0 rather than +0.0, applied to the result of an nnan fadd.
; The checks expect no clamp modifier here: a separate v_max_f32 against the
; literal 0x80000000 followed by a v_min_f32 against 1.0.
; GCN-LABEL: {{^}}v_clamp_negzero_f32:
; GCN-DAG: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN-DAG: v_add_f32_e32 [[ADD:v[0-9]+]], 0.5, [[A]]
; GCN: v_max_f32_e32 [[MAX:v[0-9]+]], 0x80000000, [[ADD]]
; GCN: v_min_f32_e32 v{{[0-9]+}}, 1.0, [[MAX]]
define amdgpu_kernel void @v_clamp_negzero_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %a = load float, float addrspace(1)* %gep0
  %add = fadd nnan float %a, 0.5
  %max = call float @llvm.maxnum.f32(float %add, float -0.0)
  %med = call float @llvm.minnum.f32(float %max, float 1.0)

  store float %med, float addrspace(1)* %out.gep
  ret void
}

; FIXME: Weird inconsistency in how -0.0 is treated. Accepted if clamp
; matched through med3, not if directly. Is this correct?

; Same -0.0 lower bound, but applied directly to the loaded value (no nnan
; producer). The checks expect an extra instruction captured as QUIET before
; the max/min pair: v_mul_f32 by 1.0 on the tahiti and fiji runs, v_max_f32 of
; the value with itself on the gfx900 run.
; GCN-LABEL: {{^}}v_clamp_negzero_maybe_snan_f32:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GFX678: v_mul_f32_e32 [[QUIET:v[0-9]+]], 1.0, [[A]]
; GFX9: v_max_f32_e32 [[QUIET:v[0-9]+]], [[A]], [[A]]
; GCN: v_max_f32_e32 [[MAX:v[0-9]+]], 0x80000000, [[QUIET]]
; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], 1.0, [[MAX]]
define amdgpu_kernel void @v_clamp_negzero_maybe_snan_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %a = load float, float addrspace(1)* %gep0
  %max = call float @llvm.maxnum.f32(float %a, float -0.0)
  %med = call float @llvm.minnum.f32(float %max, float 1.0)

  store float %med, float addrspace(1)* %out.gep
  ret void
}

; The maxnum result has a second use (a volatile store), so the max/min pair
; cannot collapse into one clamped instruction. The checks expect a separate
; v_max_f32 and v_min_f32, with both results stored.
; The v_min_f32 must take the v_max_f32 result as its input, because the IR
; computes minnum(%max, 1.0), not minnum(%a, 1.0).
; GCN-LABEL: {{^}}v_clamp_multi_use_max_f32:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GFX678: v_mul_f32_e32 [[QUIET_A:v[0-9]+]], 1.0, [[A]]
; GFX9: v_max_f32_e32 [[QUIET_A:v[0-9]+]], [[A]], [[A]]
; GCN: v_max_f32_e32 [[MAX:v[0-9]+]], 0, [[QUIET_A]]
; GCN: v_min_f32_e32 [[MED:v[0-9]+]], 1.0, [[MAX]]
; GCN-NOT: [[MAX]]
; GCN-NOT: [[MED]]

; SI: buffer_store_dword [[MED]]
; SI: buffer_store_dword [[MAX]]

; GFX89: {{flat|global}}_store_dword v{{.+}}, [[MED]]
; GFX89: {{flat|global}}_store_dword v{{.+}}, [[MAX]]
define amdgpu_kernel void @v_clamp_multi_use_max_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %a = load float, float addrspace(1)* %gep0
  %max = call float @llvm.maxnum.f32(float %a, float 0.0)
  %med = call float @llvm.minnum.f32(float %max, float 1.0)

  store float %med, float addrspace(1)* %out.gep
  store volatile float %max, float addrspace(1)* undef
  ret void
}

; f16 version of the basic clamp idiom. The fiji and gfx900 runs expect a
; clamped v_max_f16; the tahiti run expects the clamp modifier on the
; f16-to-f32 conversion, followed by a conversion back to f16.
; GCN-LABEL: {{^}}v_clamp_f16:
; GCN: {{buffer|flat|global}}_load_ushort [[A:v[0-9]+]]
; GFX89: v_max_f16_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}}

; SI: v_cvt_f32_f16_e64 [[CVT:v[0-9]+]], [[A]] clamp{{$}}
; SI: v_cvt_f16_f32_e32 v{{[0-9]+}}, [[CVT]]
define amdgpu_kernel void @v_clamp_f16(half addrspace(1)* %out, half addrspace(1)* %aptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr half, half addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr half, half addrspace(1)* %out, i32 %tid
  %a = load half, half addrspace(1)* %gep0
  %max = call half @llvm.maxnum.f16(half %a, half 0.0)
  %med = call half @llvm.minnum.f16(half %max, half 1.0)

  store half %med, half addrspace(1)* %out.gep
  ret void
}

; Clamp of a negated f16 input (negation written as fsub -0.0, %a). The
; checks expect the neg modifier on the clamped v_max_f16 for the fiji and
; gfx900 runs, and on the clamped f16-to-f32 conversion for the tahiti run.
; GCN-LABEL: {{^}}v_clamp_neg_f16:
; GCN: {{buffer|flat|global}}_load_ushort [[A:v[0-9]+]]
; GFX89: v_max_f16_e64 v{{[0-9]+}}, -[[A]], -[[A]] clamp{{$}}

; FIXME: Better to fold neg into max
; SI: v_cvt_f32_f16_e64 [[CVT:v[0-9]+]], -[[A]] clamp{{$}}
; SI: v_cvt_f16_f32_e32 v{{[0-9]+}}, [[CVT]]
define amdgpu_kernel void @v_clamp_neg_f16(half addrspace(1)* %out, half addrspace(1)* %aptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr half, half addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr half, half addrspace(1)* %out, i32 %tid
  %a = load half, half addrspace(1)* %gep0
  %fneg.a = fsub half -0.0, %a
  %max = call half @llvm.maxnum.f16(half %fneg.a, half 0.0)
  %med = call half @llvm.minnum.f16(half %max, half 1.0)

  store half %med, half addrspace(1)* %out.gep
  ret void
}

; Clamp of -|%a| for f16. The checks expect the neg and abs modifiers on the
; clamped v_max_f16 for the fiji and gfx900 runs, and on the clamped
; f16-to-f32 conversion for the tahiti run.
; GCN-LABEL: {{^}}v_clamp_negabs_f16:
; GCN: {{buffer|flat|global}}_load_ushort [[A:v[0-9]+]]
; GFX89: v_max_f16_e64 v{{[0-9]+}}, -|[[A]]|, -|[[A]]| clamp{{$}}

; FIXME: Better to fold neg/abs into max

; SI: v_cvt_f32_f16_e64 [[CVT:v[0-9]+]], -|[[A]]| clamp{{$}}
; SI: v_cvt_f16_f32_e32 v{{[0-9]+}}, [[CVT]]
define amdgpu_kernel void @v_clamp_negabs_f16(half addrspace(1)* %out, half addrspace(1)* %aptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr half, half addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr half, half addrspace(1)* %out, i32 %tid
  %a = load half, half addrspace(1)* %gep0
  %fabs.a = call half @llvm.fabs.f16(half %a)
  %fneg.fabs.a = fsub half -0.0, %fabs.a

  %max = call half @llvm.maxnum.f16(half %fneg.fabs.a, half 0.0)
  %med = call half @llvm.minnum.f16(half %max, half 1.0)

  store half %med, half addrspace(1)* %out.gep
  ret void
}

; f64 version of the basic clamp idiom; the checks expect a clamped v_max_f64
; of the loaded register pair with itself.
; FIXME: Do f64 instructions support clamp?
; GCN-LABEL: {{^}}v_clamp_f64:
; GCN: {{buffer|flat|global}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]
; GCN: v_max_f64 v{{\[[0-9]+:[0-9]+\]}}, [[A]], [[A]] clamp{{$}}
define amdgpu_kernel void @v_clamp_f64(double addrspace(1)* %out, double addrspace(1)* %aptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr double, double addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr double, double addrspace(1)* %out, i32 %tid
  %a = load double, double addrspace(1)* %gep0
  %max = call double @llvm.maxnum.f64(double %a, double 0.0)
  %med = call double @llvm.minnum.f64(double %max, double 1.0)

  store double %med, double addrspace(1)* %out.gep
  ret void
}

; Clamp of a negated f64 input (fsub -0.0, %a); the checks expect the neg
; modifier on both sources of the clamped v_max_f64.
; GCN-LABEL: {{^}}v_clamp_neg_f64:
; GCN: {{buffer|flat|global}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]
; GCN: v_max_f64 v{{\[[0-9]+:[0-9]+\]}}, -[[A]], -[[A]] clamp{{$}}
define amdgpu_kernel void @v_clamp_neg_f64(double addrspace(1)* %out, double addrspace(1)* %aptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr double, double addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr double, double addrspace(1)* %out, i32 %tid
  %a = load double, double addrspace(1)* %gep0
  %fneg.a = fsub double -0.0, %a
  %max = call double @llvm.maxnum.f64(double %fneg.a, double 0.0)
  %med = call double @llvm.minnum.f64(double %max, double 1.0)

  store double %med, double addrspace(1)* %out.gep
  ret void
}

; Clamp of -|%a| for f64; the checks expect neg and abs modifiers on both
; sources of the clamped v_max_f64.
; GCN-LABEL: {{^}}v_clamp_negabs_f64:
; GCN: {{buffer|flat|global}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]
; GCN: v_max_f64 v{{\[[0-9]+:[0-9]+\]}}, -|[[A]]|, -|[[A]]| clamp{{$}}
define amdgpu_kernel void @v_clamp_negabs_f64(double addrspace(1)* %out, double addrspace(1)* %aptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr double, double addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr double, double addrspace(1)* %out, i32 %tid
  %a = load double, double addrspace(1)* %gep0
  %fabs.a = call double @llvm.fabs.f64(double %a)
  %fneg.fabs.a = fsub double -0.0, %fabs.a

  %max = call double @llvm.maxnum.f64(double %fneg.fabs.a, double 0.0)
  %med = call double @llvm.minnum.f64(double %max, double 1.0)

  store double %med, double addrspace(1)* %out.gep
  ret void
}

; fmed3 with bounds -0.0 and 1.0. The checks only require that a v_med3_f32
; is emitted after the load; operands are not constrained.
; GCN-LABEL: {{^}}v_clamp_med3_aby_negzero_f32:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: v_med3_f32
define amdgpu_kernel void @v_clamp_med3_aby_negzero_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %a = load float, float addrspace(1)* %gep0
  %med = call float @llvm.amdgcn.fmed3.f32(float -0.0, float 1.0, float %a)
  store float %med, float addrspace(1)* %out.gep
  ret void
}

; fmed3 with operand order (0.0, 1.0, %a); the checks expect it to be
; selected as a clamped v_max_f32 of the loaded value with itself.
; GCN-LABEL: {{^}}v_clamp_med3_aby_f32:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}}
define amdgpu_kernel void @v_clamp_med3_aby_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %a = load float, float addrspace(1)* %gep0
  %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float %a)
  store float %med, float addrspace(1)* %out.gep
  ret void
}

; fmed3 with operand order (1.0, 0.0, %a); the checks expect a clamped
; v_max_f32 of the loaded value with itself.
; GCN-LABEL: {{^}}v_clamp_med3_bay_f32:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}}
define amdgpu_kernel void @v_clamp_med3_bay_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %a = load float, float addrspace(1)* %gep0
  %med = call float @llvm.amdgcn.fmed3.f32(float 1.0, float 0.0, float %a)
  store float %med, float addrspace(1)* %out.gep
  ret void
}

; fmed3 with operand order (%a, 0.0, 1.0); the checks expect a clamped
; v_max_f32 of the loaded value with itself.
; GCN-LABEL: {{^}}v_clamp_med3_yab_f32:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}}
define amdgpu_kernel void @v_clamp_med3_yab_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %a = load float, float addrspace(1)* %gep0
  %med = call float @llvm.amdgcn.fmed3.f32(float %a, float 0.0, float 1.0)
  store float %med, float addrspace(1)* %out.gep
  ret void
}

; fmed3 with operand order (%a, 1.0, 0.0); the checks expect a clamped
; v_max_f32 of the loaded value with itself.
; GCN-LABEL: {{^}}v_clamp_med3_yba_f32:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}}
define amdgpu_kernel void @v_clamp_med3_yba_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %a = load float, float addrspace(1)* %gep0
  %med = call float @llvm.amdgcn.fmed3.f32(float %a, float 1.0, float 0.0)
  store float %med, float addrspace(1)* %out.gep
  ret void
}

; fmed3 with operand order (0.0, %a, 1.0); the checks expect a clamped
; v_max_f32 of the loaded value with itself.
; GCN-LABEL: {{^}}v_clamp_med3_ayb_f32:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}}
define amdgpu_kernel void @v_clamp_med3_ayb_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %a = load float, float addrspace(1)* %gep0
  %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float %a, float 1.0)
  store float %med, float addrspace(1)* %out.gep
  ret void
}

; fmed3 with operand order (1.0, %a, 0.0); the checks expect a clamped
; v_max_f32 of the loaded value with itself.
; GCN-LABEL: {{^}}v_clamp_med3_bya_f32:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}}
define amdgpu_kernel void @v_clamp_med3_bya_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %a = load float, float addrspace(1)* %gep0
  %med = call float @llvm.amdgcn.fmed3.f32(float 1.0, float %a, float 0.0)
  store float %med, float addrspace(1)* %out.gep
  ret void
}

; All-constant fmed3(0.0, 1.0, 4.0); the checks expect the result to be
; materialized directly as the constant 1.0.
; GCN-LABEL: {{^}}v_clamp_constants_to_one_f32:
; GCN: v_mov_b32_e32 v{{[0-9]+}}, 1.0
define amdgpu_kernel void @v_clamp_constants_to_one_f32(float addrspace(1)* %out) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float 4.0)
  store float %med, float addrspace(1)* %out.gep
  ret void
}

; All-constant fmed3(0.0, 1.0, -4.0); the checks expect the result to be
; materialized directly as the constant 0.
; GCN-LABEL: {{^}}v_clamp_constants_to_zero_f32:
; GCN: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
define amdgpu_kernel void @v_clamp_constants_to_zero_f32(float addrspace(1)* %out) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float -4.0)
  store float %med, float addrspace(1)* %out.gep
  ret void
}

; All-constant fmed3(0.0, 1.0, 0.5); the in-range value 0.5 is expected to be
; materialized unchanged.
; GCN-LABEL: {{^}}v_clamp_constant_preserve_f32:
; GCN: v_mov_b32_e32 v{{[0-9]+}}, 0.5
define amdgpu_kernel void @v_clamp_constant_preserve_f32(float addrspace(1)* %out) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float 0.5)
  store float %med, float addrspace(1)* %out.gep
  ret void
}

; All-constant fmed3 whose third operand is the bit pattern 8388607
; (0x7fffff); the checks expect that exact bit pattern to be materialized.
; GCN-LABEL: {{^}}v_clamp_constant_preserve_denorm_f32:
; GCN: v_mov_b32_e32 v{{[0-9]+}}, 0x7fffff{{$}}
define amdgpu_kernel void @v_clamp_constant_preserve_denorm_f32(float addrspace(1)* %out) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float bitcast (i32 8388607 to float))
  store float %med, float addrspace(1)* %out.gep
  ret void
}

; All-constant fmed3 with a quiet NaN third operand (0x7FF8000000000000 in
; the IR double-hex spelling); the checks expect the result constant 0.
; GCN-LABEL: {{^}}v_clamp_constant_qnan_f32:
; GCN: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
define amdgpu_kernel void @v_clamp_constant_qnan_f32(float addrspace(1)* %out) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float 0x7FF8000000000000)
  store float %med, float addrspace(1)* %out.gep
  ret void
}

; All-constant fmed3 whose third operand is the bit pattern 2139095041
; (0x7f800001, a signaling NaN per the test name); the checks expect the
; result constant 0.
; GCN-LABEL: {{^}}v_clamp_constant_snan_f32:
; GCN: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
define amdgpu_kernel void @v_clamp_constant_snan_f32(float addrspace(1)* %out) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float bitcast (i32 2139095041 to float))
  store float %med, float addrspace(1)* %out.gep
  ret void
}

; ---------------------------------------------------------------------
; Test non-default behaviors enabling snans and disabling dx10_clamp
; ---------------------------------------------------------------------

; max/min clamp idiom on an nnan fadd result, compiled with attribute group
; #2 (defined outside this view; presumably the dx10_clamp-disabled
; configuration, per the test name -- confirm). The checks expect a
; v_med3_f32 with constants 0 and 1.0 rather than a clamp modifier.
; GCN-LABEL: {{^}}v_clamp_f32_no_dx10_clamp:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], 0.5, [[A]]
; GCN: v_med3_f32 v{{[0-9]+}}, [[ADD]], 0, 1.0
define amdgpu_kernel void @v_clamp_f32_no_dx10_clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %a = load float, float addrspace(1)* %gep0
  %a.nnan = fadd nnan float %a, 0.5
  %max = call float @llvm.maxnum.f32(float %a.nnan, float 0.0)
  %med = call float @llvm.minnum.f32(float %max, float 1.0)

  store float %med, float addrspace(1)* %out.gep
  ret void
}

; max/min clamp idiom on a plain fadd result, compiled with attribute group
; #3 (defined outside this view; presumably snan handling with dx10_clamp
; enabled, per the test name -- confirm). The checks expect the clamp
; modifier to be folded onto the v_add_f32 itself.
; GCN-LABEL: {{^}}v_clamp_f32_snan_dx10clamp:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: v_add_f32_e64 [[ADD:v[0-9]+]], [[A]], 0.5 clamp{{$}}
define amdgpu_kernel void @v_clamp_f32_snan_dx10clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #3 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %a = load float, float addrspace(1)* %gep0
  %add = fadd float %a, 0.5
  %max = call float @llvm.maxnum.f32(float %add, float 0.0)
  %med = call float @llvm.minnum.f32(float %max, float 1.0)

  store float %med, float addrspace(1)* %out.gep
  ret void
}

; max/min clamp idiom directly on the loaded value, compiled with attribute
; group #4 (defined outside this view; presumably snan handling with
; dx10_clamp disabled, per the test name -- confirm). The checks expect an
; instruction captured as QUIET_A first, then a v_med3_f32 with 0 and 1.0.
; GCN-LABEL: {{^}}v_clamp_f32_snan_no_dx10clamp:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GFX678: v_mul_f32_e32 [[QUIET_A:v[0-9]+]], 1.0, [[A]]
; GFX9: v_max_f32_e32 [[QUIET_A:v[0-9]+]], [[A]], [[A]]
; GCN: v_med3_f32 {{v[0-9]+}}, [[QUIET_A]], 0, 1.0
define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #4 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %a = load float, float addrspace(1)* %gep0
  %max = call float @llvm.maxnum.f32(float %a, float 0.0)
  %med = call float @llvm.minnum.f32(float %max, float 1.0)

  store float %med, float addrspace(1)* %out.gep
  ret void
}

; Same attribute group #4 configuration (defined outside this view), but the
; clamped value comes from an nnan fadd. The checks expect the v_med3_f32 to
; consume the add result directly, with no extra instruction in between.
; GCN-LABEL: {{^}}v_clamp_f32_snan_no_dx10clamp_nnan_src:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], 1.0, [[A]]
; GCN: v_med3_f32 v{{[0-9]+}}, [[ADD]], 0, 1.0
define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp_nnan_src(float addrspace(1)* %out, float addrspace(1)* %aptr) #4 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %a = load float, float addrspace(1)* %gep0
  %add  = fadd nnan float %a, 1.0
  %max = call float @llvm.maxnum.f32(float %add, float 0.0)
  %med = call float @llvm.minnum.f32(float %max, float 1.0)

  store float %med, float addrspace(1)* %out.gep
  ret void
}

; fmed3(0.0, 1.0, %a) under attribute group #2 (defined outside this view);
; the checks still expect a clamped v_max_f32 for this operand order.
; GCN-LABEL: {{^}}v_clamp_med3_aby_f32_no_dx10_clamp:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}}
define amdgpu_kernel void @v_clamp_med3_aby_f32_no_dx10_clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %a = load float, float addrspace(1)* %gep0
  %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float %a)
  store float %med, float addrspace(1)* %out.gep
  ret void
}

; fmed3(1.0, 0.0, %a) under attribute group #2 (defined outside this view);
; the checks still expect a clamped v_max_f32 for this operand order.
; GCN-LABEL: {{^}}v_clamp_med3_bay_f32_no_dx10_clamp:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}}
define amdgpu_kernel void @v_clamp_med3_bay_f32_no_dx10_clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %a = load float, float addrspace(1)* %gep0
  %med = call float @llvm.amdgcn.fmed3.f32(float 1.0, float 0.0, float %a)
  store float %med, float addrspace(1)* %out.gep
  ret void
}

; fmed3(%a, 0.0, 1.0) under attribute group #2 (defined outside this view);
; the checks expect the v_med3_f32 to be kept with the same operand order.
; GCN-LABEL: {{^}}v_clamp_med3_yab_f32_no_dx10_clamp:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], 0, 1.0
define amdgpu_kernel void @v_clamp_med3_yab_f32_no_dx10_clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %a = load float, float addrspace(1)* %gep0
  %med = call float @llvm.amdgcn.fmed3.f32(float %a, float 0.0, float 1.0)
  store float %med, float addrspace(1)* %out.gep
  ret void
}

; fmed3(%a, 1.0, 0.0) under attribute group #2 (defined outside this view);
; the checks expect the v_med3_f32 to be kept with the same operand order.
; GCN-LABEL: {{^}}v_clamp_med3_yba_f32_no_dx10_clamp:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], 1.0, 0
define amdgpu_kernel void @v_clamp_med3_yba_f32_no_dx10_clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %a = load float, float addrspace(1)* %gep0
  %med = call float @llvm.amdgcn.fmed3.f32(float %a, float 1.0, float 0.0)
  store float %med, float addrspace(1)* %out.gep
  ret void
}

; fmed3(0.0, %a, 1.0) under attribute group #2 (defined outside this view);
; the checks expect the v_med3_f32 to be kept with the same operand order.
; GCN-LABEL: {{^}}v_clamp_med3_ayb_f32_no_dx10_clamp:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: v_med3_f32 v{{[0-9]+}}, 0, [[A]], 1.0
define amdgpu_kernel void @v_clamp_med3_ayb_f32_no_dx10_clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %a = load float, float addrspace(1)* %gep0
  %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float %a, float 1.0)
  store float %med, float addrspace(1)* %out.gep
  ret void
}

; fmed3(1.0, %a, 0.0) under attribute group #2 (defined outside this view);
; the checks expect the v_med3_f32 to be kept with the same operand order.
; GCN-LABEL: {{^}}v_clamp_med3_bya_f32_no_dx10_clamp:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: v_med3_f32 v{{[0-9]+}}, 1.0, [[A]], 0
define amdgpu_kernel void @v_clamp_med3_bya_f32_no_dx10_clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %a = load float, float addrspace(1)* %gep0
  %med = call float @llvm.amdgcn.fmed3.f32(float 1.0, float %a, float 0.0)
  store float %med, float addrspace(1)* %out.gep
  ret void
}

; All-constant fmed3 with a quiet NaN third operand under attribute group #2
; (defined outside this view); the checks expect the NaN bit pattern
; 0x7fc00000 to be materialized rather than 0.
; GCN-LABEL: {{^}}v_clamp_constant_qnan_f32_no_dx10_clamp:
; GCN: v_mov_b32_e32 v{{[0-9]+}}, 0x7fc00000
define amdgpu_kernel void @v_clamp_constant_qnan_f32_no_dx10_clamp(float addrspace(1)* %out) #2 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float 0x7FF8000000000000)
  store float %med, float addrspace(1)* %out.gep
  ret void
}

; All-constant fmed3 whose third operand is the bit pattern 2139095041
; (0x7f800001) under attribute group #2 (defined outside this view); the
; checks expect that same bit pattern to be materialized rather than 0.
; GCN-LABEL: {{^}}v_clamp_constant_snan_f32_no_dx10_clamp:
; GCN: v_mov_b32_e32 v{{[0-9]+}}, 0x7f800001
define amdgpu_kernel void @v_clamp_constant_snan_f32_no_dx10_clamp(float addrspace(1)* %out) #2 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float bitcast (i32 2139095041 to float))
  store float %med, float addrspace(1)* %out.gep
  ret void
}

; Packed <2 x half> clamp idiom. Only the gfx900 run checks the arithmetic:
; a clamped v_pk_max_f16 of the loaded register with itself, with no other
; use of that register between the load and the packed max.
; GCN-LABEL: {{^}}v_clamp_v2f16:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GFX9-NOT: [[A]]
; GFX9: v_pk_max_f16 [[CLAMP:v[0-9]+]], [[A]], [[A]] clamp{{$}}
define amdgpu_kernel void @v_clamp_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %aptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
  %a = load <2 x half>, <2 x half> addrspace(1)* %gep0
  %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> zeroinitializer)
  %med = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 1.0, half 1.0>)

  store <2 x half> %med, <2 x half> addrspace(1)* %out.gep
  ret void
}

; Packed clamp where each bound vector has one undef element
; (<undef, 0.0> and <1.0, undef>). The gfx900 checks still expect a single
; clamped v_pk_max_f16.
; GCN-LABEL: {{^}}v_clamp_v2f16_undef_elt:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GFX9-NOT: [[A]]
; GFX9: v_pk_max_f16 [[CLAMP:v[0-9]+]], [[A]], [[A]] clamp{{$}}
define amdgpu_kernel void @v_clamp_v2f16_undef_elt(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %aptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
  %a = load <2 x half>, <2 x half> addrspace(1)* %gep0
  %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> <half undef, half 0.0>)
  %med = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 1.0, half undef>)

  store <2 x half> %med, <2 x half> addrspace(1)* %out.gep
  ret void
}

; Negative test: the lower bound vector is <2.0, 0.0>, so this is not a
; [0, 1] clamp. The gfx900 checks expect a separate v_pk_max_f16 and
; v_pk_min_f16.
; GCN-LABEL: {{^}}v_clamp_v2f16_not_zero:
; GFX9: v_pk_max_f16
; GFX9: v_pk_min_f16
define amdgpu_kernel void @v_clamp_v2f16_not_zero(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %aptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
  %a = load <2 x half>, <2 x half> addrspace(1)* %gep0
  %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> <half 2.0, half 0.0>)
  %med = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 1.0, half 1.0>)

  store <2 x half> %med, <2 x half> addrspace(1)* %out.gep
  ret void
}

; Negative test: the upper bound vector is <0.0, 1.0>, so this is not a
; [0, 1] clamp. The gfx900 checks expect a separate v_pk_max_f16 and
; v_pk_min_f16.
; GCN-LABEL: {{^}}v_clamp_v2f16_not_one:
; GFX9: v_pk_max_f16
; GFX9: v_pk_min_f16
define amdgpu_kernel void @v_clamp_v2f16_not_one(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %aptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
  %a = load <2 x half>, <2 x half> addrspace(1)* %gep0
  %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> <half 0.0, half 0.0>)
  %med = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 0.0, half 1.0>)

  store <2 x half> %med, <2 x half> addrspace(1)* %out.gep
  ret void
}

; Packed clamp of a fully negated vector (fsub <-0.0, -0.0>, %a). The gfx900
; checks expect the negation folded into neg_lo and neg_hi on the clamped
; v_pk_max_f16.
; GCN-LABEL: {{^}}v_clamp_neg_v2f16:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GFX9-NOT: [[A]]
; GFX9: v_pk_max_f16 [[CLAMP:v[0-9]+]], [[A]], [[A]] neg_lo:[1,1] neg_hi:[1,1] clamp{{$}}
define amdgpu_kernel void @v_clamp_neg_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %aptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
  %a = load <2 x half>, <2 x half> addrspace(1)* %gep0
  %fneg.a = fsub <2 x half> <half -0.0, half -0.0>, %a
  %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %fneg.a, <2 x half> zeroinitializer)
  %med = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 1.0, half 1.0>)

  store <2 x half> %med, <2 x half> addrspace(1)* %out.gep
  ret void
}

; Packed clamp of -|%a|. The gfx900 checks expect the fabs as a separate
; v_and_b32 with mask 0x7fff7fff, and the negation folded into neg_lo and
; neg_hi on the clamped v_pk_max_f16.
; GCN-LABEL: {{^}}v_clamp_negabs_v2f16:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GFX9: v_and_b32_e32 [[ABS:v[0-9]+]], 0x7fff7fff, [[A]]
; GFX9: v_pk_max_f16 [[CLAMP:v[0-9]+]], [[ABS]], [[ABS]] neg_lo:[1,1] neg_hi:[1,1] clamp{{$}}
define amdgpu_kernel void @v_clamp_negabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %aptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
  %a = load <2 x half>, <2 x half> addrspace(1)* %gep0
  %fabs.a = call <2 x half> @llvm.fabs.v2f16(<2 x half> %a)
  %fneg.fabs.a = fsub <2 x half> <half -0.0, half -0.0>, %fabs.a

  %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %fneg.fabs.a, <2 x half> zeroinitializer)
  %med = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 1.0, half 1.0>)

  store <2 x half> %med, <2 x half> addrspace(1)* %out.gep
  ret void
}

; Packed-half clamp where only element 0 is negated: the element is
; extracted, negated with a scalar fsub from -0.0, and inserted back into the
; loaded vector before the maxnum/minnum pair.
; The gfx900-only checks expect no other mention of the loaded register
; before a clamped v_pk_max_f16 that carries neg_lo:[1,1] and no neg_hi.
649; GCN-LABEL: {{^}}v_clamp_neglo_v2f16:
650; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
651; GFX9-NOT: [[A]]
652; GFX9: v_pk_max_f16 [[CLAMP:v[0-9]+]], [[A]], [[A]] neg_lo:[1,1] clamp{{$}}
653define amdgpu_kernel void @v_clamp_neglo_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %aptr) #0 {
654  %tid = call i32 @llvm.amdgcn.workitem.id.x() ; index shared by the input and output GEPs
655  %gep0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %tid
656  %out.gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
657  %a = load <2 x half>, <2 x half> addrspace(1)* %gep0
658  %lo = extractelement <2 x half> %a, i32 0
659  %neg.lo = fsub half -0.0, %lo ; negate element 0 only
660  %neg.lo.vec = insertelement <2 x half> %a, half %neg.lo, i32 0 ; element 1 is left as loaded
661  %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %neg.lo.vec, <2 x half> zeroinitializer) ; lower limit 0.0
662  %med = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 1.0, half 1.0>) ; upper limit 1.0
663
664  store <2 x half> %med, <2 x half> addrspace(1)* %out.gep
665  ret void
666}
667
; Packed-half clamp where only element 1 is negated: the element is
; extracted, negated with a scalar fsub from -0.0, and inserted back into the
; loaded vector before the maxnum/minnum pair.
; The gfx900-only checks expect no other mention of the loaded register
; before a clamped v_pk_max_f16 that carries neg_hi:[1,1] and no neg_lo.
668; GCN-LABEL: {{^}}v_clamp_neghi_v2f16:
669; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
670; GFX9-NOT: [[A]]
671; GFX9: v_pk_max_f16 [[CLAMP:v[0-9]+]], [[A]], [[A]] neg_hi:[1,1] clamp{{$}}
672define amdgpu_kernel void @v_clamp_neghi_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %aptr) #0 {
673  %tid = call i32 @llvm.amdgcn.workitem.id.x() ; index shared by the input and output GEPs
674  %gep0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %tid
675  %out.gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
676  %a = load <2 x half>, <2 x half> addrspace(1)* %gep0
677  %hi = extractelement <2 x half> %a, i32 1
678  %neg.hi = fsub half -0.0, %hi ; negate element 1 only
679  %neg.hi.vec = insertelement <2 x half> %a, half %neg.hi, i32 1 ; element 0 is left as loaded
680  %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %neg.hi.vec, <2 x half> zeroinitializer) ; lower limit 0.0
681  %med = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 1.0, half 1.0>) ; upper limit 1.0
682
683  store <2 x half> %med, <2 x half> addrspace(1)* %out.gep
684  ret void
685}
686
; Packed-half clamp applied to a vector whose two elements have been swapped
; by a shufflevector with mask <1, 0>.
; The gfx900-only checks expect no other mention of the loaded register
; before a clamped v_pk_max_f16 that reads it for both sources with
; op_sel:[1,1] op_sel_hi:[0,0], i.e. no separate instruction touching the
; loaded value is expected for the swap.
687; GCN-LABEL: {{^}}v_clamp_v2f16_shuffle:
688; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
689; GFX9-NOT: [[A]]
690; GFX9: v_pk_max_f16 [[CLAMP:v[0-9]+]], [[A]], [[A]] op_sel:[1,1] op_sel_hi:[0,0] clamp{{$}}
691define amdgpu_kernel void @v_clamp_v2f16_shuffle(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %aptr) #0 {
692  %tid = call i32 @llvm.amdgcn.workitem.id.x() ; index shared by the input and output GEPs
693  %gep0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %tid
694  %out.gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
695  %a = load <2 x half>, <2 x half> addrspace(1)* %gep0
696  %shuf = shufflevector <2 x half> %a, <2 x half> undef, <2 x i32> <i32 1, i32 0> ; swap elements 0 and 1
697  %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %shuf, <2 x half> zeroinitializer) ; lower limit 0.0
698  %med = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 1.0, half 1.0>) ; upper limit 1.0
699
700  store <2 x half> %med, <2 x half> addrspace(1)* %out.gep
701  ret void
702}
703
; Packed-half clamp whose limit vectors each contain one undef element:
; the maxnum limit is <0.0, undef> and the minnum limit is <undef, 1.0>.
; The gfx900-only checks expect the same output as a fully specified clamp:
; no other mention of the loaded register before a v_pk_max_f16 of the
; register with itself, carrying only the clamp modifier.
704; GCN-LABEL: {{^}}v_clamp_v2f16_undef_limit_elts0:
705; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
706; GFX9-NOT: [[A]]
707; GFX9: v_pk_max_f16 [[CLAMP:v[0-9]+]], [[A]], [[A]] clamp{{$}}
708define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts0(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %aptr) #0 {
709  %tid = call i32 @llvm.amdgcn.workitem.id.x() ; index shared by the input and output GEPs
710  %gep0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %tid
711  %out.gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
712  %a = load <2 x half>, <2 x half> addrspace(1)* %gep0
713  %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> <half 0.0, half undef>) ; lower limit: element 1 is undef
714  %med = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half undef, half 1.0>) ; upper limit: element 0 is undef
715
716  store <2 x half> %med, <2 x half> addrspace(1)* %out.gep
717  ret void
718}
719
; Packed-half clamp whose limit vectors each contain one undef element, with
; the undef positions mirrored relative to the elts0 variant: the maxnum
; limit is <undef, 0.0> and the minnum limit is <1.0, undef>.
; The gfx900-only checks expect no other mention of the loaded register
; before a v_pk_max_f16 of the register with itself, carrying only the clamp
; modifier.
720; GCN-LABEL: {{^}}v_clamp_v2f16_undef_limit_elts1:
721; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
722; GFX9-NOT: [[A]]
723; GFX9: v_pk_max_f16 [[CLAMP:v[0-9]+]], [[A]], [[A]] clamp{{$}}
724define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts1(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %aptr) #0 {
725  %tid = call i32 @llvm.amdgcn.workitem.id.x() ; index shared by the input and output GEPs
726  %gep0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %tid
727  %out.gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
728  %a = load <2 x half>, <2 x half> addrspace(1)* %gep0
729  %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> <half undef, half 0.0>) ; lower limit: element 0 is undef
730  %med = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 1.0, half undef>) ; upper limit: element 1 is undef
731
732  store <2 x half> %med, <2 x half> addrspace(1)* %out.gep
733  ret void
734}
735
; f32 clamp applied to the maxnum of two different values rather than to a
; single value. Three floats are loaded from fixed indices 0, 1 and 2 of
; %aptr; the two maxnum inputs are l0 + l1 and l0 + l2. The fadds and all
; three maxnum/minnum calls carry the nsz flag.
; The checks apply to every llc invocation: two v_add_f32 results are
; captured, and a single v_max_f32_e64 is expected to take those two
; registers as its sources with the clamp modifier.
; This kernel does not use the workitem id; the result is stored to index 3
; of %out.
736; GCN-LABEL: {{^}}v_clamp_diff_source_f32:
737; GCN: v_add_f32_e32 [[A:v[0-9]+]]
738; GCN: v_add_f32_e32 [[B:v[0-9]+]]
739; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[B]] clamp{{$}}
740define amdgpu_kernel void @v_clamp_diff_source_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0
741{
742  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 0
743  %gep1 = getelementptr float, float addrspace(1)* %aptr, i32 1
744  %gep2 = getelementptr float, float addrspace(1)* %aptr, i32 2
745  %l0 = load float, float addrspace(1)* %gep0
746  %l1 = load float, float addrspace(1)* %gep1
747  %l2 = load float, float addrspace(1)* %gep2
748  %a = fadd nsz float %l0, %l1
749  %b = fadd nsz float %l0, %l2
750  %res = call nsz float @llvm.maxnum.f32(float %a, float %b) ; max of the two distinct sums
751  %max = call nsz float @llvm.maxnum.f32(float %res, float 0.0) ; lower limit 0.0
752  %min = call nsz float @llvm.minnum.f32(float %max, float 1.0) ; upper limit 1.0
753  %out.gep = getelementptr float, float addrspace(1)* %out, i32 3
754  store float %min, float addrspace(1)* %out.gep
755  ret void
756}
757
; Declarations of the intrinsics called by the kernels in this file; all of
; them use attribute group #1. The f64, scalar f16, fabs.f32 and fmed3
; declarations are not called in the nearby v2f16 tests; presumably they
; serve kernels earlier in the file (verify before removing any of them).
758declare i32 @llvm.amdgcn.workitem.id.x() #1
759declare float @llvm.fabs.f32(float) #1
760declare float @llvm.minnum.f32(float, float) #1
761declare float @llvm.maxnum.f32(float, float) #1
762declare float @llvm.amdgcn.fmed3.f32(float, float, float) #1
763declare double @llvm.fabs.f64(double) #1
764declare double @llvm.minnum.f64(double, double) #1
765declare double @llvm.maxnum.f64(double, double) #1
766declare half @llvm.fabs.f16(half) #1
767declare half @llvm.minnum.f16(half, half) #1
768declare half @llvm.maxnum.f16(half, half) #1
769declare <2 x half> @llvm.fabs.v2f16(<2 x half>) #1
770declare <2 x half> @llvm.minnum.v2f16(<2 x half>, <2 x half>) #1
771declare <2 x half> @llvm.maxnum.v2f16(<2 x half>, <2 x half>) #1
772
; Attribute groups referenced by the definitions and declarations above.
;   #0 - nounwind plus the f32 denormal mode "preserve-sign,preserve-sign";
;        used by the kernels.
;   #1 - nounwind readnone; used by the intrinsic declarations.
;   #2, #3, #4 - the #0 settings plus explicit "amdgpu-dx10-clamp" and
;        "no-nans-fp-math" values; #3 is the only one that sets
;        "amdgpu-dx10-clamp" to "true".
; NOTE(review): #2 and #4 are spelled identically here. Their users are
; presumably kernels earlier in the file; confirm whether the duplication is
; intentional before merging or changing either one.
773attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
774attributes #1 = { nounwind readnone }
775attributes #2 = { nounwind "amdgpu-dx10-clamp"="false" "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-nans-fp-math"="false" }
776attributes #3 = { nounwind "amdgpu-dx10-clamp"="true" "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-nans-fp-math"="false" }
777attributes #4 = { nounwind "amdgpu-dx10-clamp"="false" "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-nans-fp-math"="false" }
778