1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX6 %s
3; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8 %s
4; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
5; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s
6
7define amdgpu_kernel void @v_clamp_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
8; GFX6-LABEL: v_clamp_f32:
9; GFX6:       ; %bb.0:
10; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
11; GFX6-NEXT:    s_mov_b32 s7, 0xf000
12; GFX6-NEXT:    s_mov_b32 s6, 0
13; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
14; GFX6-NEXT:    v_mov_b32_e32 v1, 0
15; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
16; GFX6-NEXT:    s_mov_b64 s[4:5], s[2:3]
17; GFX6-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
18; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
19; GFX6-NEXT:    s_waitcnt vmcnt(0)
20; GFX6-NEXT:    v_max_f32_e64 v2, v2, v2 clamp
21; GFX6-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
22; GFX6-NEXT:    s_endpgm
23;
24; GFX8-LABEL: v_clamp_f32:
25; GFX8:       ; %bb.0:
26; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
27; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
28; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
29; GFX8-NEXT:    v_mov_b32_e32 v1, s3
30; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
31; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
32; GFX8-NEXT:    flat_load_dword v3, v[0:1]
33; GFX8-NEXT:    v_mov_b32_e32 v1, s1
34; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
35; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
36; GFX8-NEXT:    s_waitcnt vmcnt(0)
37; GFX8-NEXT:    v_max_f32_e64 v2, v3, v3 clamp
38; GFX8-NEXT:    flat_store_dword v[0:1], v2
39; GFX8-NEXT:    s_endpgm
40;
41; GFX9-LABEL: v_clamp_f32:
42; GFX9:       ; %bb.0:
43; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
44; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
45; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
46; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
47; GFX9-NEXT:    s_waitcnt vmcnt(0)
48; GFX9-NEXT:    v_max_f32_e64 v1, v1, v1 clamp
49; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
50; GFX9-NEXT:    s_endpgm
51;
52; GFX11-LABEL: v_clamp_f32:
53; GFX11:       ; %bb.0:
54; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
55; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
56; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
57; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
58; GFX11-NEXT:    s_waitcnt vmcnt(0)
59; GFX11-NEXT:    v_max_f32_e64 v1, v1, v1 clamp
60; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
61; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
62; GFX11-NEXT:    s_endpgm
63  %tid = call i32 @llvm.amdgcn.workitem.id.x()
64  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
65  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
66  %a = load float, float addrspace(1)* %gep0
67  %max = call float @llvm.maxnum.f32(float %a, float 0.0)
68  %med = call float @llvm.minnum.f32(float %max, float 1.0)
69
70  store float %med, float addrspace(1)* %out.gep
71  ret void
72}
73
74define amdgpu_kernel void @v_clamp_neg_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
75; GFX6-LABEL: v_clamp_neg_f32:
76; GFX6:       ; %bb.0:
77; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
78; GFX6-NEXT:    s_mov_b32 s7, 0xf000
79; GFX6-NEXT:    s_mov_b32 s6, 0
80; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
81; GFX6-NEXT:    v_mov_b32_e32 v1, 0
82; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
83; GFX6-NEXT:    s_mov_b64 s[4:5], s[2:3]
84; GFX6-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
85; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
86; GFX6-NEXT:    s_waitcnt vmcnt(0)
87; GFX6-NEXT:    v_max_f32_e64 v2, -v2, -v2 clamp
88; GFX6-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
89; GFX6-NEXT:    s_endpgm
90;
91; GFX8-LABEL: v_clamp_neg_f32:
92; GFX8:       ; %bb.0:
93; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
94; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
95; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
96; GFX8-NEXT:    v_mov_b32_e32 v1, s3
97; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
98; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
99; GFX8-NEXT:    flat_load_dword v3, v[0:1]
100; GFX8-NEXT:    v_mov_b32_e32 v1, s1
101; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
102; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
103; GFX8-NEXT:    s_waitcnt vmcnt(0)
104; GFX8-NEXT:    v_max_f32_e64 v2, -v3, -v3 clamp
105; GFX8-NEXT:    flat_store_dword v[0:1], v2
106; GFX8-NEXT:    s_endpgm
107;
108; GFX9-LABEL: v_clamp_neg_f32:
109; GFX9:       ; %bb.0:
110; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
111; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
112; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
113; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
114; GFX9-NEXT:    s_waitcnt vmcnt(0)
115; GFX9-NEXT:    v_max_f32_e64 v1, -v1, -v1 clamp
116; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
117; GFX9-NEXT:    s_endpgm
118;
119; GFX11-LABEL: v_clamp_neg_f32:
120; GFX11:       ; %bb.0:
121; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
122; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
123; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
124; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
125; GFX11-NEXT:    s_waitcnt vmcnt(0)
126; GFX11-NEXT:    v_max_f32_e64 v1, -v1, -v1 clamp
127; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
128; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
129; GFX11-NEXT:    s_endpgm
130  %tid = call i32 @llvm.amdgcn.workitem.id.x()
131  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
132  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
133  %a = load float, float addrspace(1)* %gep0
134  %fneg.a = fneg float %a
135  %max = call float @llvm.maxnum.f32(float %fneg.a, float 0.0)
136  %med = call float @llvm.minnum.f32(float %max, float 1.0)
137
138  store float %med, float addrspace(1)* %out.gep
139  ret void
140}
141
142define amdgpu_kernel void @v_clamp_negabs_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
143; GFX6-LABEL: v_clamp_negabs_f32:
144; GFX6:       ; %bb.0:
145; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
146; GFX6-NEXT:    s_mov_b32 s7, 0xf000
147; GFX6-NEXT:    s_mov_b32 s6, 0
148; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
149; GFX6-NEXT:    v_mov_b32_e32 v1, 0
150; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
151; GFX6-NEXT:    s_mov_b64 s[4:5], s[2:3]
152; GFX6-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
153; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
154; GFX6-NEXT:    s_waitcnt vmcnt(0)
155; GFX6-NEXT:    v_max_f32_e64 v2, -|v2|, -|v2| clamp
156; GFX6-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
157; GFX6-NEXT:    s_endpgm
158;
159; GFX8-LABEL: v_clamp_negabs_f32:
160; GFX8:       ; %bb.0:
161; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
162; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
163; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
164; GFX8-NEXT:    v_mov_b32_e32 v1, s3
165; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
166; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
167; GFX8-NEXT:    flat_load_dword v3, v[0:1]
168; GFX8-NEXT:    v_mov_b32_e32 v1, s1
169; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
170; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
171; GFX8-NEXT:    s_waitcnt vmcnt(0)
172; GFX8-NEXT:    v_max_f32_e64 v2, -|v3|, -|v3| clamp
173; GFX8-NEXT:    flat_store_dword v[0:1], v2
174; GFX8-NEXT:    s_endpgm
175;
176; GFX9-LABEL: v_clamp_negabs_f32:
177; GFX9:       ; %bb.0:
178; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
179; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
180; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
181; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
182; GFX9-NEXT:    s_waitcnt vmcnt(0)
183; GFX9-NEXT:    v_max_f32_e64 v1, -|v1|, -|v1| clamp
184; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
185; GFX9-NEXT:    s_endpgm
186;
187; GFX11-LABEL: v_clamp_negabs_f32:
188; GFX11:       ; %bb.0:
189; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
190; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
191; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
192; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
193; GFX11-NEXT:    s_waitcnt vmcnt(0)
194; GFX11-NEXT:    v_max_f32_e64 v1, -|v1|, -|v1| clamp
195; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
196; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
197; GFX11-NEXT:    s_endpgm
198  %tid = call i32 @llvm.amdgcn.workitem.id.x()
199  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
200  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
201  %a = load float, float addrspace(1)* %gep0
202  %fabs.a = call float @llvm.fabs.f32(float %a)
203  %fneg.fabs.a = fneg float %fabs.a
204
205  %max = call float @llvm.maxnum.f32(float %fneg.fabs.a, float 0.0)
206  %med = call float @llvm.minnum.f32(float %max, float 1.0)
207
208  store float %med, float addrspace(1)* %out.gep
209  ret void
210}
211
212define amdgpu_kernel void @v_clamp_negzero_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
213; GFX6-LABEL: v_clamp_negzero_f32:
214; GFX6:       ; %bb.0:
215; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
216; GFX6-NEXT:    s_mov_b32 s7, 0xf000
217; GFX6-NEXT:    s_mov_b32 s6, 0
218; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
219; GFX6-NEXT:    v_mov_b32_e32 v1, 0
220; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
221; GFX6-NEXT:    s_mov_b64 s[4:5], s[2:3]
222; GFX6-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
223; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
224; GFX6-NEXT:    s_waitcnt vmcnt(0)
225; GFX6-NEXT:    v_add_f32_e32 v2, 0.5, v2
226; GFX6-NEXT:    v_max_f32_e32 v2, 0x80000000, v2
227; GFX6-NEXT:    v_min_f32_e32 v2, 1.0, v2
228; GFX6-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
229; GFX6-NEXT:    s_endpgm
230;
231; GFX8-LABEL: v_clamp_negzero_f32:
232; GFX8:       ; %bb.0:
233; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
234; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
235; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
236; GFX8-NEXT:    v_mov_b32_e32 v1, s3
237; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
238; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
239; GFX8-NEXT:    flat_load_dword v3, v[0:1]
240; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
241; GFX8-NEXT:    v_mov_b32_e32 v1, s1
242; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
243; GFX8-NEXT:    s_waitcnt vmcnt(0)
244; GFX8-NEXT:    v_add_f32_e32 v2, 0.5, v3
245; GFX8-NEXT:    v_max_f32_e32 v2, 0x80000000, v2
246; GFX8-NEXT:    v_min_f32_e32 v2, 1.0, v2
247; GFX8-NEXT:    flat_store_dword v[0:1], v2
248; GFX8-NEXT:    s_endpgm
249;
250; GFX9-LABEL: v_clamp_negzero_f32:
251; GFX9:       ; %bb.0:
252; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
253; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
254; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
255; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
256; GFX9-NEXT:    s_waitcnt vmcnt(0)
257; GFX9-NEXT:    v_add_f32_e32 v1, 0.5, v1
258; GFX9-NEXT:    v_max_f32_e32 v1, 0x80000000, v1
259; GFX9-NEXT:    v_min_f32_e32 v1, 1.0, v1
260; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
261; GFX9-NEXT:    s_endpgm
262;
263; GFX11-LABEL: v_clamp_negzero_f32:
264; GFX11:       ; %bb.0:
265; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
266; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
267; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
268; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
269; GFX11-NEXT:    s_waitcnt vmcnt(0)
270; GFX11-NEXT:    v_add_f32_e32 v1, 0.5, v1
271; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
272; GFX11-NEXT:    v_maxmin_f32 v1, v1, 0x80000000, 1.0
273; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
274; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
275; GFX11-NEXT:    s_endpgm
276  %tid = call i32 @llvm.amdgcn.workitem.id.x()
277  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
278  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
279  %a = load float, float addrspace(1)* %gep0
280  %add = fadd nnan float %a, 0.5
281  %max = call float @llvm.maxnum.f32(float %add, float -0.0)
282  %med = call float @llvm.minnum.f32(float %max, float 1.0)
283
284  store float %med, float addrspace(1)* %out.gep
285  ret void
286}
287
288; FIXME: Weird inconsistency in how -0.0 is treated. Accepted if clamp
289; matched through med3, not if directly. Is this correct?
290define amdgpu_kernel void @v_clamp_negzero_maybe_snan_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
291; GFX6-LABEL: v_clamp_negzero_maybe_snan_f32:
292; GFX6:       ; %bb.0:
293; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
294; GFX6-NEXT:    s_mov_b32 s7, 0xf000
295; GFX6-NEXT:    s_mov_b32 s6, 0
296; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
297; GFX6-NEXT:    v_mov_b32_e32 v1, 0
298; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
299; GFX6-NEXT:    s_mov_b64 s[4:5], s[2:3]
300; GFX6-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
301; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
302; GFX6-NEXT:    s_waitcnt vmcnt(0)
303; GFX6-NEXT:    v_mul_f32_e32 v2, 1.0, v2
304; GFX6-NEXT:    v_max_f32_e32 v2, 0x80000000, v2
305; GFX6-NEXT:    v_min_f32_e32 v2, 1.0, v2
306; GFX6-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
307; GFX6-NEXT:    s_endpgm
308;
309; GFX8-LABEL: v_clamp_negzero_maybe_snan_f32:
310; GFX8:       ; %bb.0:
311; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
312; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
313; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
314; GFX8-NEXT:    v_mov_b32_e32 v1, s3
315; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
316; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
317; GFX8-NEXT:    flat_load_dword v3, v[0:1]
318; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
319; GFX8-NEXT:    v_mov_b32_e32 v1, s1
320; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
321; GFX8-NEXT:    s_waitcnt vmcnt(0)
322; GFX8-NEXT:    v_mul_f32_e32 v2, 1.0, v3
323; GFX8-NEXT:    v_max_f32_e32 v2, 0x80000000, v2
324; GFX8-NEXT:    v_min_f32_e32 v2, 1.0, v2
325; GFX8-NEXT:    flat_store_dword v[0:1], v2
326; GFX8-NEXT:    s_endpgm
327;
328; GFX9-LABEL: v_clamp_negzero_maybe_snan_f32:
329; GFX9:       ; %bb.0:
330; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
331; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
332; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
333; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
334; GFX9-NEXT:    s_waitcnt vmcnt(0)
335; GFX9-NEXT:    v_max_f32_e32 v1, v1, v1
336; GFX9-NEXT:    v_max_f32_e32 v1, 0x80000000, v1
337; GFX9-NEXT:    v_min_f32_e32 v1, 1.0, v1
338; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
339; GFX9-NEXT:    s_endpgm
340;
341; GFX11-LABEL: v_clamp_negzero_maybe_snan_f32:
342; GFX11:       ; %bb.0:
343; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
344; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
345; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
346; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
347; GFX11-NEXT:    s_waitcnt vmcnt(0)
348; GFX11-NEXT:    v_max_f32_e32 v1, v1, v1
349; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
350; GFX11-NEXT:    v_maxmin_f32 v1, v1, 0x80000000, 1.0
351; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
352; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
353; GFX11-NEXT:    s_endpgm
354  %tid = call i32 @llvm.amdgcn.workitem.id.x()
355  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
356  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
357  %a = load float, float addrspace(1)* %gep0
358  %max = call float @llvm.maxnum.f32(float %a, float -0.0)
359  %med = call float @llvm.minnum.f32(float %max, float 1.0)
360
361  store float %med, float addrspace(1)* %out.gep
362  ret void
363}
364
365define amdgpu_kernel void @v_clamp_multi_use_max_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
366; GFX6-LABEL: v_clamp_multi_use_max_f32:
367; GFX6:       ; %bb.0:
368; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
369; GFX6-NEXT:    s_mov_b32 s6, 0
370; GFX6-NEXT:    s_mov_b32 s7, 0xf000
371; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
372; GFX6-NEXT:    v_mov_b32_e32 v1, 0
373; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
374; GFX6-NEXT:    s_mov_b64 s[4:5], s[2:3]
375; GFX6-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
376; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
377; GFX6-NEXT:    s_mov_b32 s6, -1
378; GFX6-NEXT:    s_waitcnt vmcnt(0)
379; GFX6-NEXT:    v_mul_f32_e32 v2, 1.0, v2
380; GFX6-NEXT:    v_max_f32_e32 v2, 0, v2
381; GFX6-NEXT:    v_min_f32_e32 v3, 1.0, v2
382; GFX6-NEXT:    buffer_store_dword v3, v[0:1], s[0:3], 0 addr64
383; GFX6-NEXT:    buffer_store_dword v2, off, s[4:7], 0
384; GFX6-NEXT:    s_waitcnt vmcnt(0)
385; GFX6-NEXT:    s_endpgm
386;
387; GFX8-LABEL: v_clamp_multi_use_max_f32:
388; GFX8:       ; %bb.0:
389; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
390; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
391; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
392; GFX8-NEXT:    v_mov_b32_e32 v1, s3
393; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
394; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
395; GFX8-NEXT:    flat_load_dword v3, v[0:1]
396; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
397; GFX8-NEXT:    v_mov_b32_e32 v1, s1
398; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
399; GFX8-NEXT:    s_waitcnt vmcnt(0)
400; GFX8-NEXT:    v_mul_f32_e32 v2, 1.0, v3
401; GFX8-NEXT:    v_max_f32_e32 v2, 0, v2
402; GFX8-NEXT:    v_min_f32_e32 v3, 1.0, v2
403; GFX8-NEXT:    flat_store_dword v[0:1], v3
404; GFX8-NEXT:    flat_store_dword v[0:1], v2
405; GFX8-NEXT:    s_waitcnt vmcnt(0)
406; GFX8-NEXT:    s_endpgm
407;
408; GFX9-LABEL: v_clamp_multi_use_max_f32:
409; GFX9:       ; %bb.0:
410; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
411; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
412; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
413; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
414; GFX9-NEXT:    s_waitcnt vmcnt(0)
415; GFX9-NEXT:    v_max_f32_e32 v1, v1, v1
416; GFX9-NEXT:    v_max_f32_e32 v1, 0, v1
417; GFX9-NEXT:    v_min_f32_e32 v2, 1.0, v1
418; GFX9-NEXT:    global_store_dword v0, v2, s[0:1]
419; GFX9-NEXT:    global_store_dword v[0:1], v1, off
420; GFX9-NEXT:    s_waitcnt vmcnt(0)
421; GFX9-NEXT:    s_endpgm
422;
423; GFX11-LABEL: v_clamp_multi_use_max_f32:
424; GFX11:       ; %bb.0:
425; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
426; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
427; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
428; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
429; GFX11-NEXT:    s_waitcnt vmcnt(0)
430; GFX11-NEXT:    v_max_f32_e32 v1, v1, v1
431; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
432; GFX11-NEXT:    v_max_f32_e32 v1, 0, v1
433; GFX11-NEXT:    v_min_f32_e32 v2, 1.0, v1
434; GFX11-NEXT:    global_store_b32 v0, v2, s[0:1]
435; GFX11-NEXT:    global_store_b32 v[0:1], v1, off dlc
436; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
437; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
438; GFX11-NEXT:    s_endpgm
439  %tid = call i32 @llvm.amdgcn.workitem.id.x()
440  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
441  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
442  %a = load float, float addrspace(1)* %gep0
443  %max = call float @llvm.maxnum.f32(float %a, float 0.0)
444  %med = call float @llvm.minnum.f32(float %max, float 1.0)
445
446  store float %med, float addrspace(1)* %out.gep
447  store volatile float %max, float addrspace(1)* undef
448  ret void
449}
450
451define amdgpu_kernel void @v_clamp_f16(half addrspace(1)* %out, half addrspace(1)* %aptr) #0 {
452; GFX6-LABEL: v_clamp_f16:
453; GFX6:       ; %bb.0:
454; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
455; GFX6-NEXT:    s_mov_b32 s7, 0xf000
456; GFX6-NEXT:    s_mov_b32 s6, 0
457; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
458; GFX6-NEXT:    v_mov_b32_e32 v1, 0
459; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
460; GFX6-NEXT:    s_mov_b64 s[4:5], s[2:3]
461; GFX6-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64
462; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
463; GFX6-NEXT:    s_waitcnt vmcnt(0)
464; GFX6-NEXT:    v_cvt_f32_f16_e64 v2, v2 clamp
465; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
466; GFX6-NEXT:    buffer_store_short v2, v[0:1], s[0:3], 0 addr64
467; GFX6-NEXT:    s_endpgm
468;
469; GFX8-LABEL: v_clamp_f16:
470; GFX8:       ; %bb.0:
471; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
472; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 1, v0
473; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
474; GFX8-NEXT:    v_mov_b32_e32 v1, s3
475; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
476; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
477; GFX8-NEXT:    flat_load_ushort v3, v[0:1]
478; GFX8-NEXT:    v_mov_b32_e32 v1, s1
479; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
480; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
481; GFX8-NEXT:    s_waitcnt vmcnt(0)
482; GFX8-NEXT:    v_max_f16_e64 v2, v3, v3 clamp
483; GFX8-NEXT:    flat_store_short v[0:1], v2
484; GFX8-NEXT:    s_endpgm
485;
486; GFX9-LABEL: v_clamp_f16:
487; GFX9:       ; %bb.0:
488; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
489; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
490; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
491; GFX9-NEXT:    global_load_ushort v1, v0, s[2:3]
492; GFX9-NEXT:    s_waitcnt vmcnt(0)
493; GFX9-NEXT:    v_max_f16_e64 v1, v1, v1 clamp
494; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
495; GFX9-NEXT:    s_endpgm
496;
497; GFX11-LABEL: v_clamp_f16:
498; GFX11:       ; %bb.0:
499; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
500; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
501; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
502; GFX11-NEXT:    global_load_u16 v1, v0, s[2:3]
503; GFX11-NEXT:    s_waitcnt vmcnt(0)
504; GFX11-NEXT:    v_max_f16_e64 v1, v1, v1 clamp
505; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
506; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
507; GFX11-NEXT:    s_endpgm
508  %tid = call i32 @llvm.amdgcn.workitem.id.x()
509  %gep0 = getelementptr half, half addrspace(1)* %aptr, i32 %tid
510  %out.gep = getelementptr half, half addrspace(1)* %out, i32 %tid
511  %a = load half, half addrspace(1)* %gep0
512  %max = call half @llvm.maxnum.f16(half %a, half 0.0)
513  %med = call half @llvm.minnum.f16(half %max, half 1.0)
514
515  store half %med, half addrspace(1)* %out.gep
516  ret void
517}
518
519define amdgpu_kernel void @v_clamp_neg_f16(half addrspace(1)* %out, half addrspace(1)* %aptr) #0 {
520; GFX6-LABEL: v_clamp_neg_f16:
521; GFX6:       ; %bb.0:
522; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
523; GFX6-NEXT:    s_mov_b32 s7, 0xf000
524; GFX6-NEXT:    s_mov_b32 s6, 0
525; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
526; GFX6-NEXT:    v_mov_b32_e32 v1, 0
527; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
528; GFX6-NEXT:    s_mov_b64 s[4:5], s[2:3]
529; GFX6-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64
530; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
531; GFX6-NEXT:    s_waitcnt vmcnt(0)
532; GFX6-NEXT:    v_cvt_f32_f16_e64 v2, -v2 clamp
533; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
534; GFX6-NEXT:    buffer_store_short v2, v[0:1], s[0:3], 0 addr64
535; GFX6-NEXT:    s_endpgm
536;
537; GFX8-LABEL: v_clamp_neg_f16:
538; GFX8:       ; %bb.0:
539; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
540; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 1, v0
541; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
542; GFX8-NEXT:    v_mov_b32_e32 v1, s3
543; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
544; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
545; GFX8-NEXT:    flat_load_ushort v3, v[0:1]
546; GFX8-NEXT:    v_mov_b32_e32 v1, s1
547; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
548; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
549; GFX8-NEXT:    s_waitcnt vmcnt(0)
550; GFX8-NEXT:    v_max_f16_e64 v2, -v3, -v3 clamp
551; GFX8-NEXT:    flat_store_short v[0:1], v2
552; GFX8-NEXT:    s_endpgm
553;
554; GFX9-LABEL: v_clamp_neg_f16:
555; GFX9:       ; %bb.0:
556; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
557; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
558; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
559; GFX9-NEXT:    global_load_ushort v1, v0, s[2:3]
560; GFX9-NEXT:    s_waitcnt vmcnt(0)
561; GFX9-NEXT:    v_max_f16_e64 v1, -v1, -v1 clamp
562; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
563; GFX9-NEXT:    s_endpgm
564;
565; GFX11-LABEL: v_clamp_neg_f16:
566; GFX11:       ; %bb.0:
567; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
568; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
569; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
570; GFX11-NEXT:    global_load_u16 v1, v0, s[2:3]
571; GFX11-NEXT:    s_waitcnt vmcnt(0)
572; GFX11-NEXT:    v_max_f16_e64 v1, -v1, -v1 clamp
573; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
574; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
575; GFX11-NEXT:    s_endpgm
576  %tid = call i32 @llvm.amdgcn.workitem.id.x()
577  %gep0 = getelementptr half, half addrspace(1)* %aptr, i32 %tid
578  %out.gep = getelementptr half, half addrspace(1)* %out, i32 %tid
579  %a = load half, half addrspace(1)* %gep0
580  %fneg.a = fsub half -0.0, %a
581  %max = call half @llvm.maxnum.f16(half %fneg.a, half 0.0)
582  %med = call half @llvm.minnum.f16(half %max, half 1.0)
583
584  store half %med, half addrspace(1)* %out.gep
585  ret void
586}
587
588define amdgpu_kernel void @v_clamp_negabs_f16(half addrspace(1)* %out, half addrspace(1)* %aptr) #0 {
589; GFX6-LABEL: v_clamp_negabs_f16:
590; GFX6:       ; %bb.0:
591; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
592; GFX6-NEXT:    s_mov_b32 s7, 0xf000
593; GFX6-NEXT:    s_mov_b32 s6, 0
594; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
595; GFX6-NEXT:    v_mov_b32_e32 v1, 0
596; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
597; GFX6-NEXT:    s_mov_b64 s[4:5], s[2:3]
598; GFX6-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64
599; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
600; GFX6-NEXT:    s_waitcnt vmcnt(0)
601; GFX6-NEXT:    v_cvt_f32_f16_e64 v2, -|v2| clamp
602; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
603; GFX6-NEXT:    buffer_store_short v2, v[0:1], s[0:3], 0 addr64
604; GFX6-NEXT:    s_endpgm
605;
606; GFX8-LABEL: v_clamp_negabs_f16:
607; GFX8:       ; %bb.0:
608; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
609; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 1, v0
610; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
611; GFX8-NEXT:    v_mov_b32_e32 v1, s3
612; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
613; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
614; GFX8-NEXT:    flat_load_ushort v3, v[0:1]
615; GFX8-NEXT:    v_mov_b32_e32 v1, s1
616; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
617; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
618; GFX8-NEXT:    s_waitcnt vmcnt(0)
619; GFX8-NEXT:    v_max_f16_e64 v2, -|v3|, -|v3| clamp
620; GFX8-NEXT:    flat_store_short v[0:1], v2
621; GFX8-NEXT:    s_endpgm
622;
623; GFX9-LABEL: v_clamp_negabs_f16:
624; GFX9:       ; %bb.0:
625; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
626; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
627; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
628; GFX9-NEXT:    global_load_ushort v1, v0, s[2:3]
629; GFX9-NEXT:    s_waitcnt vmcnt(0)
630; GFX9-NEXT:    v_max_f16_e64 v1, -|v1|, -|v1| clamp
631; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
632; GFX9-NEXT:    s_endpgm
633;
634; GFX11-LABEL: v_clamp_negabs_f16:
635; GFX11:       ; %bb.0:
636; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
637; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
638; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
639; GFX11-NEXT:    global_load_u16 v1, v0, s[2:3]
640; GFX11-NEXT:    s_waitcnt vmcnt(0)
641; GFX11-NEXT:    v_max_f16_e64 v1, -|v1|, -|v1| clamp
642; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
643; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
644; GFX11-NEXT:    s_endpgm
645  %tid = call i32 @llvm.amdgcn.workitem.id.x()
646  %gep0 = getelementptr half, half addrspace(1)* %aptr, i32 %tid
647  %out.gep = getelementptr half, half addrspace(1)* %out, i32 %tid
648  %a = load half, half addrspace(1)* %gep0
649  %fabs.a = call half @llvm.fabs.f16(half %a)
650  %fneg.fabs.a = fsub half -0.0, %fabs.a
651
652  %max = call half @llvm.maxnum.f16(half %fneg.fabs.a, half 0.0)
653  %med = call half @llvm.minnum.f16(half %max, half 1.0)
654
655  store half %med, half addrspace(1)* %out.gep
656  ret void
657}
658
659define amdgpu_kernel void @v_clamp_f64(double addrspace(1)* %out, double addrspace(1)* %aptr) #0 {
660; GFX6-LABEL: v_clamp_f64:
661; GFX6:       ; %bb.0:
662; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
663; GFX6-NEXT:    s_mov_b32 s7, 0xf000
664; GFX6-NEXT:    s_mov_b32 s6, 0
665; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
666; GFX6-NEXT:    v_mov_b32_e32 v1, 0
667; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
668; GFX6-NEXT:    s_mov_b64 s[4:5], s[2:3]
669; GFX6-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
670; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
671; GFX6-NEXT:    s_waitcnt vmcnt(0)
672; GFX6-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3] clamp
673; GFX6-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
674; GFX6-NEXT:    s_endpgm
675;
676; GFX8-LABEL: v_clamp_f64:
677; GFX8:       ; %bb.0:
678; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
679; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
680; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
681; GFX8-NEXT:    v_mov_b32_e32 v1, s3
682; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
683; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
684; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
685; GFX8-NEXT:    v_mov_b32_e32 v3, s1
686; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
687; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
688; GFX8-NEXT:    s_waitcnt vmcnt(0)
689; GFX8-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1] clamp
690; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
691; GFX8-NEXT:    s_endpgm
692;
693; GFX9-LABEL: v_clamp_f64:
694; GFX9:       ; %bb.0:
695; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
696; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
697; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
698; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
699; GFX9-NEXT:    s_waitcnt vmcnt(0)
700; GFX9-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1] clamp
701; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
702; GFX9-NEXT:    s_endpgm
703;
704; GFX11-LABEL: v_clamp_f64:
705; GFX11:       ; %bb.0:
706; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
707; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
708; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
709; GFX11-NEXT:    global_load_b64 v[0:1], v2, s[2:3]
710; GFX11-NEXT:    s_waitcnt vmcnt(0)
711; GFX11-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1] clamp
712; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
713; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
714; GFX11-NEXT:    s_endpgm
715  %tid = call i32 @llvm.amdgcn.workitem.id.x()
716  %gep0 = getelementptr double, double addrspace(1)* %aptr, i32 %tid
717  %out.gep = getelementptr double, double addrspace(1)* %out, i32 %tid
718  %a = load double, double addrspace(1)* %gep0
719  %max = call double @llvm.maxnum.f64(double %a, double 0.0)
720  %med = call double @llvm.minnum.f64(double %max, double 1.0)
721
722  store double %med, double addrspace(1)* %out.gep
723  ret void
724}
725
726define amdgpu_kernel void @v_clamp_neg_f64(double addrspace(1)* %out, double addrspace(1)* %aptr) #0 {
727; GFX6-LABEL: v_clamp_neg_f64:
728; GFX6:       ; %bb.0:
729; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
730; GFX6-NEXT:    s_mov_b32 s7, 0xf000
731; GFX6-NEXT:    s_mov_b32 s6, 0
732; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
733; GFX6-NEXT:    v_mov_b32_e32 v1, 0
734; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
735; GFX6-NEXT:    s_mov_b64 s[4:5], s[2:3]
736; GFX6-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
737; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
738; GFX6-NEXT:    s_waitcnt vmcnt(0)
739; GFX6-NEXT:    v_max_f64 v[2:3], -v[2:3], -v[2:3] clamp
740; GFX6-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
741; GFX6-NEXT:    s_endpgm
742;
743; GFX8-LABEL: v_clamp_neg_f64:
744; GFX8:       ; %bb.0:
745; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
746; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
747; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
748; GFX8-NEXT:    v_mov_b32_e32 v1, s3
749; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
750; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
751; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
752; GFX8-NEXT:    v_mov_b32_e32 v3, s1
753; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
754; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
755; GFX8-NEXT:    s_waitcnt vmcnt(0)
756; GFX8-NEXT:    v_max_f64 v[0:1], -v[0:1], -v[0:1] clamp
757; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
758; GFX8-NEXT:    s_endpgm
759;
760; GFX9-LABEL: v_clamp_neg_f64:
761; GFX9:       ; %bb.0:
762; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
763; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
764; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
765; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
766; GFX9-NEXT:    s_waitcnt vmcnt(0)
767; GFX9-NEXT:    v_max_f64 v[0:1], -v[0:1], -v[0:1] clamp
768; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
769; GFX9-NEXT:    s_endpgm
770;
771; GFX11-LABEL: v_clamp_neg_f64:
772; GFX11:       ; %bb.0:
773; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
774; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
775; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
776; GFX11-NEXT:    global_load_b64 v[0:1], v2, s[2:3]
777; GFX11-NEXT:    s_waitcnt vmcnt(0)
778; GFX11-NEXT:    v_max_f64 v[0:1], -v[0:1], -v[0:1] clamp
779; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
780; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
781; GFX11-NEXT:    s_endpgm
782  %tid = call i32 @llvm.amdgcn.workitem.id.x()
783  %gep0 = getelementptr double, double addrspace(1)* %aptr, i32 %tid
784  %out.gep = getelementptr double, double addrspace(1)* %out, i32 %tid
785  %a = load double, double addrspace(1)* %gep0
786  %fneg.a = fsub double -0.0, %a
787  %max = call double @llvm.maxnum.f64(double %fneg.a, double 0.0)
788  %med = call double @llvm.minnum.f64(double %max, double 1.0)
789
790  store double %med, double addrspace(1)* %out.gep
791  ret void
792}
793
794define amdgpu_kernel void @v_clamp_negabs_f64(double addrspace(1)* %out, double addrspace(1)* %aptr) #0 {
795; GFX6-LABEL: v_clamp_negabs_f64:
796; GFX6:       ; %bb.0:
797; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
798; GFX6-NEXT:    s_mov_b32 s7, 0xf000
799; GFX6-NEXT:    s_mov_b32 s6, 0
800; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
801; GFX6-NEXT:    v_mov_b32_e32 v1, 0
802; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
803; GFX6-NEXT:    s_mov_b64 s[4:5], s[2:3]
804; GFX6-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
805; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
806; GFX6-NEXT:    s_waitcnt vmcnt(0)
807; GFX6-NEXT:    v_max_f64 v[2:3], -|v[2:3]|, -|v[2:3]| clamp
808; GFX6-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
809; GFX6-NEXT:    s_endpgm
810;
811; GFX8-LABEL: v_clamp_negabs_f64:
812; GFX8:       ; %bb.0:
813; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
814; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
815; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
816; GFX8-NEXT:    v_mov_b32_e32 v1, s3
817; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
818; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
819; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
820; GFX8-NEXT:    v_mov_b32_e32 v3, s1
821; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
822; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
823; GFX8-NEXT:    s_waitcnt vmcnt(0)
824; GFX8-NEXT:    v_max_f64 v[0:1], -|v[0:1]|, -|v[0:1]| clamp
825; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
826; GFX8-NEXT:    s_endpgm
827;
828; GFX9-LABEL: v_clamp_negabs_f64:
829; GFX9:       ; %bb.0:
830; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
831; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
832; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
833; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
834; GFX9-NEXT:    s_waitcnt vmcnt(0)
835; GFX9-NEXT:    v_max_f64 v[0:1], -|v[0:1]|, -|v[0:1]| clamp
836; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
837; GFX9-NEXT:    s_endpgm
838;
839; GFX11-LABEL: v_clamp_negabs_f64:
840; GFX11:       ; %bb.0:
841; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
842; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
843; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
844; GFX11-NEXT:    global_load_b64 v[0:1], v2, s[2:3]
845; GFX11-NEXT:    s_waitcnt vmcnt(0)
846; GFX11-NEXT:    v_max_f64 v[0:1], -|v[0:1]|, -|v[0:1]| clamp
847; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
848; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
849; GFX11-NEXT:    s_endpgm
850  %tid = call i32 @llvm.amdgcn.workitem.id.x()
851  %gep0 = getelementptr double, double addrspace(1)* %aptr, i32 %tid
852  %out.gep = getelementptr double, double addrspace(1)* %out, i32 %tid
853  %a = load double, double addrspace(1)* %gep0
854  %fabs.a = call double @llvm.fabs.f64(double %a)
855  %fneg.fabs.a = fsub double -0.0, %fabs.a
856
857  %max = call double @llvm.maxnum.f64(double %fneg.fabs.a, double 0.0)
858  %med = call double @llvm.minnum.f64(double %max, double 1.0)
859
860  store double %med, double addrspace(1)* %out.gep
861  ret void
862}
863
864define amdgpu_kernel void @v_clamp_med3_aby_negzero_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
865; GFX6-LABEL: v_clamp_med3_aby_negzero_f32:
866; GFX6:       ; %bb.0:
867; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
868; GFX6-NEXT:    s_mov_b32 s7, 0xf000
869; GFX6-NEXT:    s_mov_b32 s6, 0
870; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
871; GFX6-NEXT:    v_mov_b32_e32 v1, 0
872; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
873; GFX6-NEXT:    s_mov_b64 s[4:5], s[2:3]
874; GFX6-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
875; GFX6-NEXT:    s_brev_b32 s4, 1
876; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
877; GFX6-NEXT:    s_waitcnt vmcnt(0)
878; GFX6-NEXT:    v_med3_f32 v2, s4, 1.0, v2
879; GFX6-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
880; GFX6-NEXT:    s_endpgm
881;
882; GFX8-LABEL: v_clamp_med3_aby_negzero_f32:
883; GFX8:       ; %bb.0:
884; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
885; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
886; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
887; GFX8-NEXT:    v_mov_b32_e32 v1, s3
888; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
889; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
890; GFX8-NEXT:    flat_load_dword v3, v[0:1]
891; GFX8-NEXT:    v_mov_b32_e32 v1, s1
892; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
893; GFX8-NEXT:    s_brev_b32 s0, 1
894; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
895; GFX8-NEXT:    s_waitcnt vmcnt(0)
896; GFX8-NEXT:    v_med3_f32 v2, s0, 1.0, v3
897; GFX8-NEXT:    flat_store_dword v[0:1], v2
898; GFX8-NEXT:    s_endpgm
899;
900; GFX9-LABEL: v_clamp_med3_aby_negzero_f32:
901; GFX9:       ; %bb.0:
902; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
903; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
904; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
905; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
906; GFX9-NEXT:    s_brev_b32 s2, 1
907; GFX9-NEXT:    s_waitcnt vmcnt(0)
908; GFX9-NEXT:    v_med3_f32 v1, s2, 1.0, v1
909; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
910; GFX9-NEXT:    s_endpgm
911;
912; GFX11-LABEL: v_clamp_med3_aby_negzero_f32:
913; GFX11:       ; %bb.0:
914; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
915; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
916; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
917; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
918; GFX11-NEXT:    s_waitcnt vmcnt(0)
919; GFX11-NEXT:    v_med3_f32 v1, 0x80000000, 1.0, v1
920; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
921; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
922; GFX11-NEXT:    s_endpgm
923  %tid = call i32 @llvm.amdgcn.workitem.id.x()
924  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
925  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
926  %a = load float, float addrspace(1)* %gep0
927  %med = call float @llvm.amdgcn.fmed3.f32(float -0.0, float 1.0, float %a)
928  store float %med, float addrspace(1)* %out.gep
929  ret void
930}
931
932define amdgpu_kernel void @v_clamp_med3_aby_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
933; GFX6-LABEL: v_clamp_med3_aby_f32:
934; GFX6:       ; %bb.0:
935; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
936; GFX6-NEXT:    s_mov_b32 s7, 0xf000
937; GFX6-NEXT:    s_mov_b32 s6, 0
938; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
939; GFX6-NEXT:    v_mov_b32_e32 v1, 0
940; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
941; GFX6-NEXT:    s_mov_b64 s[4:5], s[2:3]
942; GFX6-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
943; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
944; GFX6-NEXT:    s_waitcnt vmcnt(0)
945; GFX6-NEXT:    v_max_f32_e64 v2, v2, v2 clamp
946; GFX6-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
947; GFX6-NEXT:    s_endpgm
948;
949; GFX8-LABEL: v_clamp_med3_aby_f32:
950; GFX8:       ; %bb.0:
951; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
952; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
953; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
954; GFX8-NEXT:    v_mov_b32_e32 v1, s3
955; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
956; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
957; GFX8-NEXT:    flat_load_dword v3, v[0:1]
958; GFX8-NEXT:    v_mov_b32_e32 v1, s1
959; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
960; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
961; GFX8-NEXT:    s_waitcnt vmcnt(0)
962; GFX8-NEXT:    v_max_f32_e64 v2, v3, v3 clamp
963; GFX8-NEXT:    flat_store_dword v[0:1], v2
964; GFX8-NEXT:    s_endpgm
965;
966; GFX9-LABEL: v_clamp_med3_aby_f32:
967; GFX9:       ; %bb.0:
968; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
969; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
970; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
971; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
972; GFX9-NEXT:    s_waitcnt vmcnt(0)
973; GFX9-NEXT:    v_max_f32_e64 v1, v1, v1 clamp
974; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
975; GFX9-NEXT:    s_endpgm
976;
977; GFX11-LABEL: v_clamp_med3_aby_f32:
978; GFX11:       ; %bb.0:
979; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
980; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
981; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
982; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
983; GFX11-NEXT:    s_waitcnt vmcnt(0)
984; GFX11-NEXT:    v_max_f32_e64 v1, v1, v1 clamp
985; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
986; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
987; GFX11-NEXT:    s_endpgm
988  %tid = call i32 @llvm.amdgcn.workitem.id.x()
989  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
990  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
991  %a = load float, float addrspace(1)* %gep0
992  %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float %a)
993  store float %med, float addrspace(1)* %out.gep
994  ret void
995}
996
997define amdgpu_kernel void @v_clamp_med3_bay_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
998; GFX6-LABEL: v_clamp_med3_bay_f32:
999; GFX6:       ; %bb.0:
1000; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
1001; GFX6-NEXT:    s_mov_b32 s7, 0xf000
1002; GFX6-NEXT:    s_mov_b32 s6, 0
1003; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1004; GFX6-NEXT:    v_mov_b32_e32 v1, 0
1005; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1006; GFX6-NEXT:    s_mov_b64 s[4:5], s[2:3]
1007; GFX6-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
1008; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
1009; GFX6-NEXT:    s_waitcnt vmcnt(0)
1010; GFX6-NEXT:    v_max_f32_e64 v2, v2, v2 clamp
1011; GFX6-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
1012; GFX6-NEXT:    s_endpgm
1013;
1014; GFX8-LABEL: v_clamp_med3_bay_f32:
1015; GFX8:       ; %bb.0:
1016; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1017; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
1018; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1019; GFX8-NEXT:    v_mov_b32_e32 v1, s3
1020; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
1021; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1022; GFX8-NEXT:    flat_load_dword v3, v[0:1]
1023; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1024; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
1025; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1026; GFX8-NEXT:    s_waitcnt vmcnt(0)
1027; GFX8-NEXT:    v_max_f32_e64 v2, v3, v3 clamp
1028; GFX8-NEXT:    flat_store_dword v[0:1], v2
1029; GFX8-NEXT:    s_endpgm
1030;
1031; GFX9-LABEL: v_clamp_med3_bay_f32:
1032; GFX9:       ; %bb.0:
1033; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1034; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1035; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1036; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
1037; GFX9-NEXT:    s_waitcnt vmcnt(0)
1038; GFX9-NEXT:    v_max_f32_e64 v1, v1, v1 clamp
1039; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
1040; GFX9-NEXT:    s_endpgm
1041;
1042; GFX11-LABEL: v_clamp_med3_bay_f32:
1043; GFX11:       ; %bb.0:
1044; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
1045; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1046; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1047; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
1048; GFX11-NEXT:    s_waitcnt vmcnt(0)
1049; GFX11-NEXT:    v_max_f32_e64 v1, v1, v1 clamp
1050; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
1051; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1052; GFX11-NEXT:    s_endpgm
1053  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1054  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
1055  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
1056  %a = load float, float addrspace(1)* %gep0
1057  %med = call float @llvm.amdgcn.fmed3.f32(float 1.0, float 0.0, float %a)
1058  store float %med, float addrspace(1)* %out.gep
1059  ret void
1060}
1061
1062define amdgpu_kernel void @v_clamp_med3_yab_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
1063; GFX6-LABEL: v_clamp_med3_yab_f32:
1064; GFX6:       ; %bb.0:
1065; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
1066; GFX6-NEXT:    s_mov_b32 s7, 0xf000
1067; GFX6-NEXT:    s_mov_b32 s6, 0
1068; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1069; GFX6-NEXT:    v_mov_b32_e32 v1, 0
1070; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1071; GFX6-NEXT:    s_mov_b64 s[4:5], s[2:3]
1072; GFX6-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
1073; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
1074; GFX6-NEXT:    s_waitcnt vmcnt(0)
1075; GFX6-NEXT:    v_max_f32_e64 v2, v2, v2 clamp
1076; GFX6-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
1077; GFX6-NEXT:    s_endpgm
1078;
1079; GFX8-LABEL: v_clamp_med3_yab_f32:
1080; GFX8:       ; %bb.0:
1081; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1082; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
1083; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1084; GFX8-NEXT:    v_mov_b32_e32 v1, s3
1085; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
1086; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1087; GFX8-NEXT:    flat_load_dword v3, v[0:1]
1088; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1089; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
1090; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1091; GFX8-NEXT:    s_waitcnt vmcnt(0)
1092; GFX8-NEXT:    v_max_f32_e64 v2, v3, v3 clamp
1093; GFX8-NEXT:    flat_store_dword v[0:1], v2
1094; GFX8-NEXT:    s_endpgm
1095;
1096; GFX9-LABEL: v_clamp_med3_yab_f32:
1097; GFX9:       ; %bb.0:
1098; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1099; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1100; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1101; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
1102; GFX9-NEXT:    s_waitcnt vmcnt(0)
1103; GFX9-NEXT:    v_max_f32_e64 v1, v1, v1 clamp
1104; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
1105; GFX9-NEXT:    s_endpgm
1106;
1107; GFX11-LABEL: v_clamp_med3_yab_f32:
1108; GFX11:       ; %bb.0:
1109; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
1110; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1111; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1112; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
1113; GFX11-NEXT:    s_waitcnt vmcnt(0)
1114; GFX11-NEXT:    v_max_f32_e64 v1, v1, v1 clamp
1115; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
1116; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1117; GFX11-NEXT:    s_endpgm
1118  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1119  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
1120  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
1121  %a = load float, float addrspace(1)* %gep0
1122  %med = call float @llvm.amdgcn.fmed3.f32(float %a, float 0.0, float 1.0)
1123  store float %med, float addrspace(1)* %out.gep
1124  ret void
1125}
1126
1127define amdgpu_kernel void @v_clamp_med3_yba_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
1128; GFX6-LABEL: v_clamp_med3_yba_f32:
1129; GFX6:       ; %bb.0:
1130; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
1131; GFX6-NEXT:    s_mov_b32 s7, 0xf000
1132; GFX6-NEXT:    s_mov_b32 s6, 0
1133; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1134; GFX6-NEXT:    v_mov_b32_e32 v1, 0
1135; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1136; GFX6-NEXT:    s_mov_b64 s[4:5], s[2:3]
1137; GFX6-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
1138; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
1139; GFX6-NEXT:    s_waitcnt vmcnt(0)
1140; GFX6-NEXT:    v_max_f32_e64 v2, v2, v2 clamp
1141; GFX6-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
1142; GFX6-NEXT:    s_endpgm
1143;
1144; GFX8-LABEL: v_clamp_med3_yba_f32:
1145; GFX8:       ; %bb.0:
1146; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1147; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
1148; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1149; GFX8-NEXT:    v_mov_b32_e32 v1, s3
1150; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
1151; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1152; GFX8-NEXT:    flat_load_dword v3, v[0:1]
1153; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1154; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
1155; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1156; GFX8-NEXT:    s_waitcnt vmcnt(0)
1157; GFX8-NEXT:    v_max_f32_e64 v2, v3, v3 clamp
1158; GFX8-NEXT:    flat_store_dword v[0:1], v2
1159; GFX8-NEXT:    s_endpgm
1160;
1161; GFX9-LABEL: v_clamp_med3_yba_f32:
1162; GFX9:       ; %bb.0:
1163; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1164; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1165; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1166; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
1167; GFX9-NEXT:    s_waitcnt vmcnt(0)
1168; GFX9-NEXT:    v_max_f32_e64 v1, v1, v1 clamp
1169; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
1170; GFX9-NEXT:    s_endpgm
1171;
1172; GFX11-LABEL: v_clamp_med3_yba_f32:
1173; GFX11:       ; %bb.0:
1174; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
1175; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1176; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1177; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
1178; GFX11-NEXT:    s_waitcnt vmcnt(0)
1179; GFX11-NEXT:    v_max_f32_e64 v1, v1, v1 clamp
1180; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
1181; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1182; GFX11-NEXT:    s_endpgm
1183  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1184  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
1185  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
1186  %a = load float, float addrspace(1)* %gep0
1187  %med = call float @llvm.amdgcn.fmed3.f32(float %a, float 1.0, float 0.0)
1188  store float %med, float addrspace(1)* %out.gep
1189  ret void
1190}
1191
1192define amdgpu_kernel void @v_clamp_med3_ayb_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
1193; GFX6-LABEL: v_clamp_med3_ayb_f32:
1194; GFX6:       ; %bb.0:
1195; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
1196; GFX6-NEXT:    s_mov_b32 s7, 0xf000
1197; GFX6-NEXT:    s_mov_b32 s6, 0
1198; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1199; GFX6-NEXT:    v_mov_b32_e32 v1, 0
1200; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1201; GFX6-NEXT:    s_mov_b64 s[4:5], s[2:3]
1202; GFX6-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
1203; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
1204; GFX6-NEXT:    s_waitcnt vmcnt(0)
1205; GFX6-NEXT:    v_max_f32_e64 v2, v2, v2 clamp
1206; GFX6-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
1207; GFX6-NEXT:    s_endpgm
1208;
1209; GFX8-LABEL: v_clamp_med3_ayb_f32:
1210; GFX8:       ; %bb.0:
1211; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1212; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
1213; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1214; GFX8-NEXT:    v_mov_b32_e32 v1, s3
1215; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
1216; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1217; GFX8-NEXT:    flat_load_dword v3, v[0:1]
1218; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1219; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
1220; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1221; GFX8-NEXT:    s_waitcnt vmcnt(0)
1222; GFX8-NEXT:    v_max_f32_e64 v2, v3, v3 clamp
1223; GFX8-NEXT:    flat_store_dword v[0:1], v2
1224; GFX8-NEXT:    s_endpgm
1225;
1226; GFX9-LABEL: v_clamp_med3_ayb_f32:
1227; GFX9:       ; %bb.0:
1228; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1229; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1230; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1231; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
1232; GFX9-NEXT:    s_waitcnt vmcnt(0)
1233; GFX9-NEXT:    v_max_f32_e64 v1, v1, v1 clamp
1234; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
1235; GFX9-NEXT:    s_endpgm
1236;
1237; GFX11-LABEL: v_clamp_med3_ayb_f32:
1238; GFX11:       ; %bb.0:
1239; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
1240; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1241; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1242; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
1243; GFX11-NEXT:    s_waitcnt vmcnt(0)
1244; GFX11-NEXT:    v_max_f32_e64 v1, v1, v1 clamp
1245; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
1246; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1247; GFX11-NEXT:    s_endpgm
1248  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1249  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
1250  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
1251  %a = load float, float addrspace(1)* %gep0
1252  %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float %a, float 1.0)
1253  store float %med, float addrspace(1)* %out.gep
1254  ret void
1255}
1256
1257define amdgpu_kernel void @v_clamp_med3_bya_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
1258; GFX6-LABEL: v_clamp_med3_bya_f32:
1259; GFX6:       ; %bb.0:
1260; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
1261; GFX6-NEXT:    s_mov_b32 s7, 0xf000
1262; GFX6-NEXT:    s_mov_b32 s6, 0
1263; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1264; GFX6-NEXT:    v_mov_b32_e32 v1, 0
1265; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1266; GFX6-NEXT:    s_mov_b64 s[4:5], s[2:3]
1267; GFX6-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
1268; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
1269; GFX6-NEXT:    s_waitcnt vmcnt(0)
1270; GFX6-NEXT:    v_max_f32_e64 v2, v2, v2 clamp
1271; GFX6-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
1272; GFX6-NEXT:    s_endpgm
1273;
1274; GFX8-LABEL: v_clamp_med3_bya_f32:
1275; GFX8:       ; %bb.0:
1276; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1277; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
1278; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1279; GFX8-NEXT:    v_mov_b32_e32 v1, s3
1280; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
1281; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1282; GFX8-NEXT:    flat_load_dword v3, v[0:1]
1283; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1284; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
1285; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1286; GFX8-NEXT:    s_waitcnt vmcnt(0)
1287; GFX8-NEXT:    v_max_f32_e64 v2, v3, v3 clamp
1288; GFX8-NEXT:    flat_store_dword v[0:1], v2
1289; GFX8-NEXT:    s_endpgm
1290;
1291; GFX9-LABEL: v_clamp_med3_bya_f32:
1292; GFX9:       ; %bb.0:
1293; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1294; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1295; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1296; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
1297; GFX9-NEXT:    s_waitcnt vmcnt(0)
1298; GFX9-NEXT:    v_max_f32_e64 v1, v1, v1 clamp
1299; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
1300; GFX9-NEXT:    s_endpgm
1301;
1302; GFX11-LABEL: v_clamp_med3_bya_f32:
1303; GFX11:       ; %bb.0:
1304; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
1305; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1306; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1307; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
1308; GFX11-NEXT:    s_waitcnt vmcnt(0)
1309; GFX11-NEXT:    v_max_f32_e64 v1, v1, v1 clamp
1310; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
1311; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1312; GFX11-NEXT:    s_endpgm
1313  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1314  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
1315  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
1316  %a = load float, float addrspace(1)* %gep0
1317  %med = call float @llvm.amdgcn.fmed3.f32(float 1.0, float %a, float 0.0)
1318  store float %med, float addrspace(1)* %out.gep
1319  ret void
1320}
1321
1322define amdgpu_kernel void @v_clamp_constants_to_one_f32(float addrspace(1)* %out) #0 {
1323; GFX6-LABEL: v_clamp_constants_to_one_f32:
1324; GFX6:       ; %bb.0:
1325; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1326; GFX6-NEXT:    s_mov_b32 s3, 0xf000
1327; GFX6-NEXT:    s_mov_b32 s2, 0
1328; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1329; GFX6-NEXT:    v_mov_b32_e32 v1, 0
1330; GFX6-NEXT:    v_mov_b32_e32 v2, 1.0
1331; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1332; GFX6-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
1333; GFX6-NEXT:    s_endpgm
1334;
1335; GFX8-LABEL: v_clamp_constants_to_one_f32:
1336; GFX8:       ; %bb.0:
1337; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1338; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1339; GFX8-NEXT:    v_mov_b32_e32 v2, 1.0
1340; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1341; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1342; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
1343; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1344; GFX8-NEXT:    flat_store_dword v[0:1], v2
1345; GFX8-NEXT:    s_endpgm
1346;
1347; GFX9-LABEL: v_clamp_constants_to_one_f32:
1348; GFX9:       ; %bb.0:
1349; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1350; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1351; GFX9-NEXT:    v_mov_b32_e32 v1, 1.0
1352; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1353; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
1354; GFX9-NEXT:    s_endpgm
1355;
1356; GFX11-LABEL: v_clamp_constants_to_one_f32:
1357; GFX11:       ; %bb.0:
1358; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
1359; GFX11-NEXT:    v_dual_mov_b32 v1, 1.0 :: v_dual_lshlrev_b32 v0, 2, v0
1360; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1361; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
1362; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1363; GFX11-NEXT:    s_endpgm
1364  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1365  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
1366  %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float 4.0)
1367  store float %med, float addrspace(1)* %out.gep
1368  ret void
1369}
1370
1371define amdgpu_kernel void @v_clamp_constants_to_zero_f32(float addrspace(1)* %out) #0 {
1372; GFX6-LABEL: v_clamp_constants_to_zero_f32:
1373; GFX6:       ; %bb.0:
1374; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1375; GFX6-NEXT:    s_mov_b32 s3, 0xf000
1376; GFX6-NEXT:    s_mov_b32 s2, 0
1377; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1378; GFX6-NEXT:    v_mov_b32_e32 v1, 0
1379; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1380; GFX6-NEXT:    buffer_store_dword v1, v[0:1], s[0:3], 0 addr64
1381; GFX6-NEXT:    s_endpgm
1382;
1383; GFX8-LABEL: v_clamp_constants_to_zero_f32:
1384; GFX8:       ; %bb.0:
1385; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1386; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1387; GFX8-NEXT:    v_mov_b32_e32 v2, 0
1388; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1389; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1390; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
1391; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1392; GFX8-NEXT:    flat_store_dword v[0:1], v2
1393; GFX8-NEXT:    s_endpgm
1394;
1395; GFX9-LABEL: v_clamp_constants_to_zero_f32:
1396; GFX9:       ; %bb.0:
1397; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1398; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1399; GFX9-NEXT:    v_mov_b32_e32 v1, 0
1400; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1401; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
1402; GFX9-NEXT:    s_endpgm
1403;
1404; GFX11-LABEL: v_clamp_constants_to_zero_f32:
1405; GFX11:       ; %bb.0:
1406; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
1407; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
1408; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1409; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
1410; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1411; GFX11-NEXT:    s_endpgm
1412  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1413  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
1414  %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float -4.0)
1415  store float %med, float addrspace(1)* %out.gep
1416  ret void
1417}
1418
1419define amdgpu_kernel void @v_clamp_constant_preserve_f32(float addrspace(1)* %out) #0 {
1420; GFX6-LABEL: v_clamp_constant_preserve_f32:
1421; GFX6:       ; %bb.0:
1422; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1423; GFX6-NEXT:    s_mov_b32 s3, 0xf000
1424; GFX6-NEXT:    s_mov_b32 s2, 0
1425; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1426; GFX6-NEXT:    v_mov_b32_e32 v1, 0
1427; GFX6-NEXT:    v_mov_b32_e32 v2, 0.5
1428; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1429; GFX6-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
1430; GFX6-NEXT:    s_endpgm
1431;
1432; GFX8-LABEL: v_clamp_constant_preserve_f32:
1433; GFX8:       ; %bb.0:
1434; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1435; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1436; GFX8-NEXT:    v_mov_b32_e32 v2, 0.5
1437; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1438; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1439; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
1440; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1441; GFX8-NEXT:    flat_store_dword v[0:1], v2
1442; GFX8-NEXT:    s_endpgm
1443;
1444; GFX9-LABEL: v_clamp_constant_preserve_f32:
1445; GFX9:       ; %bb.0:
1446; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1447; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1448; GFX9-NEXT:    v_mov_b32_e32 v1, 0.5
1449; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1450; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
1451; GFX9-NEXT:    s_endpgm
1452;
1453; GFX11-LABEL: v_clamp_constant_preserve_f32:
1454; GFX11:       ; %bb.0:
1455; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
1456; GFX11-NEXT:    v_dual_mov_b32 v1, 0.5 :: v_dual_lshlrev_b32 v0, 2, v0
1457; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1458; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
1459; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1460; GFX11-NEXT:    s_endpgm
1461  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1462  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
1463  %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float 0.5)
1464  store float %med, float addrspace(1)* %out.gep
1465  ret void
1466}
1467
1468define amdgpu_kernel void @v_clamp_constant_preserve_denorm_f32(float addrspace(1)* %out) #0 {
1469; GFX6-LABEL: v_clamp_constant_preserve_denorm_f32:
1470; GFX6:       ; %bb.0:
1471; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1472; GFX6-NEXT:    s_mov_b32 s3, 0xf000
1473; GFX6-NEXT:    s_mov_b32 s2, 0
1474; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1475; GFX6-NEXT:    v_mov_b32_e32 v1, 0
1476; GFX6-NEXT:    v_mov_b32_e32 v2, 0x7fffff
1477; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1478; GFX6-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
1479; GFX6-NEXT:    s_endpgm
1480;
1481; GFX8-LABEL: v_clamp_constant_preserve_denorm_f32:
1482; GFX8:       ; %bb.0:
1483; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1484; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1485; GFX8-NEXT:    v_mov_b32_e32 v2, 0x7fffff
1486; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1487; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1488; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
1489; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1490; GFX8-NEXT:    flat_store_dword v[0:1], v2
1491; GFX8-NEXT:    s_endpgm
1492;
1493; GFX9-LABEL: v_clamp_constant_preserve_denorm_f32:
1494; GFX9:       ; %bb.0:
1495; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1496; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1497; GFX9-NEXT:    v_mov_b32_e32 v1, 0x7fffff
1498; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1499; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
1500; GFX9-NEXT:    s_endpgm
1501;
1502; GFX11-LABEL: v_clamp_constant_preserve_denorm_f32:
1503; GFX11:       ; %bb.0:
1504; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
1505; GFX11-NEXT:    v_dual_mov_b32 v1, 0x7fffff :: v_dual_lshlrev_b32 v0, 2, v0
1506; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1507; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
1508; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1509; GFX11-NEXT:    s_endpgm
1510  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1511  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
1512  %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float bitcast (i32 8388607 to float))
1513  store float %med, float addrspace(1)* %out.gep
1514  ret void
1515}
1516
1517define amdgpu_kernel void @v_clamp_constant_qnan_f32(float addrspace(1)* %out) #0 {
1518; GFX6-LABEL: v_clamp_constant_qnan_f32:
1519; GFX6:       ; %bb.0:
1520; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1521; GFX6-NEXT:    s_mov_b32 s3, 0xf000
1522; GFX6-NEXT:    s_mov_b32 s2, 0
1523; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1524; GFX6-NEXT:    v_mov_b32_e32 v1, 0
1525; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1526; GFX6-NEXT:    buffer_store_dword v1, v[0:1], s[0:3], 0 addr64
1527; GFX6-NEXT:    s_endpgm
1528;
1529; GFX8-LABEL: v_clamp_constant_qnan_f32:
1530; GFX8:       ; %bb.0:
1531; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1532; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1533; GFX8-NEXT:    v_mov_b32_e32 v2, 0
1534; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1535; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1536; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
1537; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1538; GFX8-NEXT:    flat_store_dword v[0:1], v2
1539; GFX8-NEXT:    s_endpgm
1540;
1541; GFX9-LABEL: v_clamp_constant_qnan_f32:
1542; GFX9:       ; %bb.0:
1543; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1544; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1545; GFX9-NEXT:    v_mov_b32_e32 v1, 0
1546; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1547; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
1548; GFX9-NEXT:    s_endpgm
1549;
1550; GFX11-LABEL: v_clamp_constant_qnan_f32:
1551; GFX11:       ; %bb.0:
1552; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
1553; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
1554; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1555; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
1556; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1557; GFX11-NEXT:    s_endpgm
1558  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1559  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
1560  %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float 0x7FF8000000000000)
1561  store float %med, float addrspace(1)* %out.gep
1562  ret void
1563}
1564
1565define amdgpu_kernel void @v_clamp_constant_snan_f32(float addrspace(1)* %out) #0 {
1566; GFX6-LABEL: v_clamp_constant_snan_f32:
1567; GFX6:       ; %bb.0:
1568; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1569; GFX6-NEXT:    s_mov_b32 s3, 0xf000
1570; GFX6-NEXT:    s_mov_b32 s2, 0
1571; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1572; GFX6-NEXT:    v_mov_b32_e32 v1, 0
1573; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1574; GFX6-NEXT:    buffer_store_dword v1, v[0:1], s[0:3], 0 addr64
1575; GFX6-NEXT:    s_endpgm
1576;
1577; GFX8-LABEL: v_clamp_constant_snan_f32:
1578; GFX8:       ; %bb.0:
1579; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1580; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1581; GFX8-NEXT:    v_mov_b32_e32 v2, 0
1582; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1583; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1584; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
1585; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1586; GFX8-NEXT:    flat_store_dword v[0:1], v2
1587; GFX8-NEXT:    s_endpgm
1588;
1589; GFX9-LABEL: v_clamp_constant_snan_f32:
1590; GFX9:       ; %bb.0:
1591; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1592; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1593; GFX9-NEXT:    v_mov_b32_e32 v1, 0
1594; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1595; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
1596; GFX9-NEXT:    s_endpgm
1597;
1598; GFX11-LABEL: v_clamp_constant_snan_f32:
1599; GFX11:       ; %bb.0:
1600; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
1601; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
1602; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1603; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
1604; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1605; GFX11-NEXT:    s_endpgm
1606  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1607  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
1608  %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float bitcast (i32 2139095041 to float))
1609  store float %med, float addrspace(1)* %out.gep
1610  ret void
1611}
1612
1613; ---------------------------------------------------------------------
1614; Test non-default behaviors enabling snans and disabling dx10_clamp
1615; ---------------------------------------------------------------------
1616
1617define amdgpu_kernel void @v_clamp_f32_no_dx10_clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 {
1618; GFX6-LABEL: v_clamp_f32_no_dx10_clamp:
1619; GFX6:       ; %bb.0:
1620; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
1621; GFX6-NEXT:    s_mov_b32 s7, 0xf000
1622; GFX6-NEXT:    s_mov_b32 s6, 0
1623; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1624; GFX6-NEXT:    v_mov_b32_e32 v1, 0
1625; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1626; GFX6-NEXT:    s_mov_b64 s[4:5], s[2:3]
1627; GFX6-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
1628; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
1629; GFX6-NEXT:    s_waitcnt vmcnt(0)
1630; GFX6-NEXT:    v_add_f32_e32 v2, 0.5, v2
1631; GFX6-NEXT:    v_med3_f32 v2, v2, 0, 1.0
1632; GFX6-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
1633; GFX6-NEXT:    s_endpgm
1634;
1635; GFX8-LABEL: v_clamp_f32_no_dx10_clamp:
1636; GFX8:       ; %bb.0:
1637; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1638; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
1639; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1640; GFX8-NEXT:    v_mov_b32_e32 v1, s3
1641; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
1642; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1643; GFX8-NEXT:    flat_load_dword v3, v[0:1]
1644; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1645; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
1646; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1647; GFX8-NEXT:    s_waitcnt vmcnt(0)
1648; GFX8-NEXT:    v_add_f32_e32 v2, 0.5, v3
1649; GFX8-NEXT:    v_med3_f32 v2, v2, 0, 1.0
1650; GFX8-NEXT:    flat_store_dword v[0:1], v2
1651; GFX8-NEXT:    s_endpgm
1652;
1653; GFX9-LABEL: v_clamp_f32_no_dx10_clamp:
1654; GFX9:       ; %bb.0:
1655; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1656; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1657; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1658; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
1659; GFX9-NEXT:    s_waitcnt vmcnt(0)
1660; GFX9-NEXT:    v_add_f32_e32 v1, 0.5, v1
1661; GFX9-NEXT:    v_med3_f32 v1, v1, 0, 1.0
1662; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
1663; GFX9-NEXT:    s_endpgm
1664;
1665; GFX11-LABEL: v_clamp_f32_no_dx10_clamp:
1666; GFX11:       ; %bb.0:
1667; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
1668; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1669; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1670; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
1671; GFX11-NEXT:    s_waitcnt vmcnt(0)
1672; GFX11-NEXT:    v_add_f32_e32 v1, 0.5, v1
1673; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1674; GFX11-NEXT:    v_med3_f32 v1, v1, 0, 1.0
1675; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
1676; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1677; GFX11-NEXT:    s_endpgm
1678  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1679  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
1680  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
1681  %a = load float, float addrspace(1)* %gep0
1682  %a.nnan = fadd nnan float %a, 0.5
1683  %max = call float @llvm.maxnum.f32(float %a.nnan, float 0.0)
1684  %med = call float @llvm.minnum.f32(float %max, float 1.0)
1685
1686  store float %med, float addrspace(1)* %out.gep
1687  ret void
1688}
1689
1690define amdgpu_kernel void @v_clamp_f32_snan_dx10clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #3 {
1691; GFX6-LABEL: v_clamp_f32_snan_dx10clamp:
1692; GFX6:       ; %bb.0:
1693; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
1694; GFX6-NEXT:    s_mov_b32 s7, 0xf000
1695; GFX6-NEXT:    s_mov_b32 s6, 0
1696; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1697; GFX6-NEXT:    v_mov_b32_e32 v1, 0
1698; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1699; GFX6-NEXT:    s_mov_b64 s[4:5], s[2:3]
1700; GFX6-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
1701; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
1702; GFX6-NEXT:    s_waitcnt vmcnt(0)
1703; GFX6-NEXT:    v_add_f32_e64 v2, v2, 0.5 clamp
1704; GFX6-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
1705; GFX6-NEXT:    s_endpgm
1706;
1707; GFX8-LABEL: v_clamp_f32_snan_dx10clamp:
1708; GFX8:       ; %bb.0:
1709; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1710; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
1711; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1712; GFX8-NEXT:    v_mov_b32_e32 v1, s3
1713; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
1714; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1715; GFX8-NEXT:    flat_load_dword v3, v[0:1]
1716; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1717; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
1718; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1719; GFX8-NEXT:    s_waitcnt vmcnt(0)
1720; GFX8-NEXT:    v_add_f32_e64 v2, v3, 0.5 clamp
1721; GFX8-NEXT:    flat_store_dword v[0:1], v2
1722; GFX8-NEXT:    s_endpgm
1723;
1724; GFX9-LABEL: v_clamp_f32_snan_dx10clamp:
1725; GFX9:       ; %bb.0:
1726; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1727; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1728; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1729; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
1730; GFX9-NEXT:    s_waitcnt vmcnt(0)
1731; GFX9-NEXT:    v_add_f32_e64 v1, v1, 0.5 clamp
1732; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
1733; GFX9-NEXT:    s_endpgm
1734;
1735; GFX11-LABEL: v_clamp_f32_snan_dx10clamp:
1736; GFX11:       ; %bb.0:
1737; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
1738; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1739; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1740; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
1741; GFX11-NEXT:    s_waitcnt vmcnt(0)
1742; GFX11-NEXT:    v_add_f32_e64 v1, v1, 0.5 clamp
1743; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
1744; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1745; GFX11-NEXT:    s_endpgm
1746  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1747  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
1748  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
1749  %a = load float, float addrspace(1)* %gep0
1750  %add = fadd float %a, 0.5
1751  %max = call float @llvm.maxnum.f32(float %add, float 0.0)
1752  %med = call float @llvm.minnum.f32(float %max, float 1.0)
1753
1754  store float %med, float addrspace(1)* %out.gep
1755  ret void
1756}
1757
1758define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #4 {
1759; GFX6-LABEL: v_clamp_f32_snan_no_dx10clamp:
1760; GFX6:       ; %bb.0:
1761; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
1762; GFX6-NEXT:    s_mov_b32 s7, 0xf000
1763; GFX6-NEXT:    s_mov_b32 s6, 0
1764; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1765; GFX6-NEXT:    v_mov_b32_e32 v1, 0
1766; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1767; GFX6-NEXT:    s_mov_b64 s[4:5], s[2:3]
1768; GFX6-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
1769; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
1770; GFX6-NEXT:    s_waitcnt vmcnt(0)
1771; GFX6-NEXT:    v_mul_f32_e32 v2, 1.0, v2
1772; GFX6-NEXT:    v_med3_f32 v2, v2, 0, 1.0
1773; GFX6-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
1774; GFX6-NEXT:    s_endpgm
1775;
1776; GFX8-LABEL: v_clamp_f32_snan_no_dx10clamp:
1777; GFX8:       ; %bb.0:
1778; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1779; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
1780; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1781; GFX8-NEXT:    v_mov_b32_e32 v1, s3
1782; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
1783; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1784; GFX8-NEXT:    flat_load_dword v3, v[0:1]
1785; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1786; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
1787; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1788; GFX8-NEXT:    s_waitcnt vmcnt(0)
1789; GFX8-NEXT:    v_mul_f32_e32 v2, 1.0, v3
1790; GFX8-NEXT:    v_med3_f32 v2, v2, 0, 1.0
1791; GFX8-NEXT:    flat_store_dword v[0:1], v2
1792; GFX8-NEXT:    s_endpgm
1793;
1794; GFX9-LABEL: v_clamp_f32_snan_no_dx10clamp:
1795; GFX9:       ; %bb.0:
1796; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1797; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1798; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1799; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
1800; GFX9-NEXT:    s_waitcnt vmcnt(0)
1801; GFX9-NEXT:    v_max_f32_e32 v1, v1, v1
1802; GFX9-NEXT:    v_med3_f32 v1, v1, 0, 1.0
1803; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
1804; GFX9-NEXT:    s_endpgm
1805;
1806; GFX11-LABEL: v_clamp_f32_snan_no_dx10clamp:
1807; GFX11:       ; %bb.0:
1808; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
1809; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1810; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1811; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
1812; GFX11-NEXT:    s_waitcnt vmcnt(0)
1813; GFX11-NEXT:    v_max_f32_e32 v1, v1, v1
1814; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1815; GFX11-NEXT:    v_med3_f32 v1, v1, 0, 1.0
1816; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
1817; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1818; GFX11-NEXT:    s_endpgm
1819  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1820  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
1821  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
1822  %a = load float, float addrspace(1)* %gep0
1823  %max = call float @llvm.maxnum.f32(float %a, float 0.0)
1824  %med = call float @llvm.minnum.f32(float %max, float 1.0)
1825
1826  store float %med, float addrspace(1)* %out.gep
1827  ret void
1828}
1829
1830define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp_nnan_src(float addrspace(1)* %out, float addrspace(1)* %aptr) #4 {
1831; GFX6-LABEL: v_clamp_f32_snan_no_dx10clamp_nnan_src:
1832; GFX6:       ; %bb.0:
1833; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
1834; GFX6-NEXT:    s_mov_b32 s7, 0xf000
1835; GFX6-NEXT:    s_mov_b32 s6, 0
1836; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1837; GFX6-NEXT:    v_mov_b32_e32 v1, 0
1838; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1839; GFX6-NEXT:    s_mov_b64 s[4:5], s[2:3]
1840; GFX6-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
1841; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
1842; GFX6-NEXT:    s_waitcnt vmcnt(0)
1843; GFX6-NEXT:    v_add_f32_e32 v2, 1.0, v2
1844; GFX6-NEXT:    v_med3_f32 v2, v2, 0, 1.0
1845; GFX6-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
1846; GFX6-NEXT:    s_endpgm
1847;
1848; GFX8-LABEL: v_clamp_f32_snan_no_dx10clamp_nnan_src:
1849; GFX8:       ; %bb.0:
1850; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1851; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
1852; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1853; GFX8-NEXT:    v_mov_b32_e32 v1, s3
1854; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
1855; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1856; GFX8-NEXT:    flat_load_dword v3, v[0:1]
1857; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1858; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
1859; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1860; GFX8-NEXT:    s_waitcnt vmcnt(0)
1861; GFX8-NEXT:    v_add_f32_e32 v2, 1.0, v3
1862; GFX8-NEXT:    v_med3_f32 v2, v2, 0, 1.0
1863; GFX8-NEXT:    flat_store_dword v[0:1], v2
1864; GFX8-NEXT:    s_endpgm
1865;
1866; GFX9-LABEL: v_clamp_f32_snan_no_dx10clamp_nnan_src:
1867; GFX9:       ; %bb.0:
1868; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1869; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1870; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1871; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
1872; GFX9-NEXT:    s_waitcnt vmcnt(0)
1873; GFX9-NEXT:    v_add_f32_e32 v1, 1.0, v1
1874; GFX9-NEXT:    v_med3_f32 v1, v1, 0, 1.0
1875; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
1876; GFX9-NEXT:    s_endpgm
1877;
1878; GFX11-LABEL: v_clamp_f32_snan_no_dx10clamp_nnan_src:
1879; GFX11:       ; %bb.0:
1880; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
1881; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1882; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1883; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
1884; GFX11-NEXT:    s_waitcnt vmcnt(0)
1885; GFX11-NEXT:    v_add_f32_e32 v1, 1.0, v1
1886; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1887; GFX11-NEXT:    v_med3_f32 v1, v1, 0, 1.0
1888; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
1889; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1890; GFX11-NEXT:    s_endpgm
1891  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1892  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
1893  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
1894  %a = load float, float addrspace(1)* %gep0
1895  %add  = fadd nnan float %a, 1.0
1896  %max = call float @llvm.maxnum.f32(float %add, float 0.0)
1897  %med = call float @llvm.minnum.f32(float %max, float 1.0)
1898
1899  store float %med, float addrspace(1)* %out.gep
1900  ret void
1901}
1902
1903define amdgpu_kernel void @v_clamp_med3_aby_f32_no_dx10_clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 {
1904; GFX6-LABEL: v_clamp_med3_aby_f32_no_dx10_clamp:
1905; GFX6:       ; %bb.0:
1906; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
1907; GFX6-NEXT:    s_mov_b32 s7, 0xf000
1908; GFX6-NEXT:    s_mov_b32 s6, 0
1909; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1910; GFX6-NEXT:    v_mov_b32_e32 v1, 0
1911; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1912; GFX6-NEXT:    s_mov_b64 s[4:5], s[2:3]
1913; GFX6-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
1914; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
1915; GFX6-NEXT:    s_waitcnt vmcnt(0)
1916; GFX6-NEXT:    v_max_f32_e64 v2, v2, v2 clamp
1917; GFX6-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
1918; GFX6-NEXT:    s_endpgm
1919;
1920; GFX8-LABEL: v_clamp_med3_aby_f32_no_dx10_clamp:
1921; GFX8:       ; %bb.0:
1922; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1923; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
1924; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1925; GFX8-NEXT:    v_mov_b32_e32 v1, s3
1926; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
1927; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1928; GFX8-NEXT:    flat_load_dword v3, v[0:1]
1929; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1930; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
1931; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1932; GFX8-NEXT:    s_waitcnt vmcnt(0)
1933; GFX8-NEXT:    v_max_f32_e64 v2, v3, v3 clamp
1934; GFX8-NEXT:    flat_store_dword v[0:1], v2
1935; GFX8-NEXT:    s_endpgm
1936;
1937; GFX9-LABEL: v_clamp_med3_aby_f32_no_dx10_clamp:
1938; GFX9:       ; %bb.0:
1939; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1940; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1941; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1942; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
1943; GFX9-NEXT:    s_waitcnt vmcnt(0)
1944; GFX9-NEXT:    v_max_f32_e64 v1, v1, v1 clamp
1945; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
1946; GFX9-NEXT:    s_endpgm
1947;
1948; GFX11-LABEL: v_clamp_med3_aby_f32_no_dx10_clamp:
1949; GFX11:       ; %bb.0:
1950; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
1951; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1952; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1953; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
1954; GFX11-NEXT:    s_waitcnt vmcnt(0)
1955; GFX11-NEXT:    v_max_f32_e64 v1, v1, v1 clamp
1956; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
1957; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1958; GFX11-NEXT:    s_endpgm
1959  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1960  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
1961  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
1962  %a = load float, float addrspace(1)* %gep0
1963  %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float %a)
1964  store float %med, float addrspace(1)* %out.gep
1965  ret void
1966}
1967
1968define amdgpu_kernel void @v_clamp_med3_bay_f32_no_dx10_clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 {
1969; GFX6-LABEL: v_clamp_med3_bay_f32_no_dx10_clamp:
1970; GFX6:       ; %bb.0:
1971; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
1972; GFX6-NEXT:    s_mov_b32 s7, 0xf000
1973; GFX6-NEXT:    s_mov_b32 s6, 0
1974; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1975; GFX6-NEXT:    v_mov_b32_e32 v1, 0
1976; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1977; GFX6-NEXT:    s_mov_b64 s[4:5], s[2:3]
1978; GFX6-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
1979; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
1980; GFX6-NEXT:    s_waitcnt vmcnt(0)
1981; GFX6-NEXT:    v_max_f32_e64 v2, v2, v2 clamp
1982; GFX6-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
1983; GFX6-NEXT:    s_endpgm
1984;
1985; GFX8-LABEL: v_clamp_med3_bay_f32_no_dx10_clamp:
1986; GFX8:       ; %bb.0:
1987; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1988; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
1989; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1990; GFX8-NEXT:    v_mov_b32_e32 v1, s3
1991; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
1992; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1993; GFX8-NEXT:    flat_load_dword v3, v[0:1]
1994; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1995; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
1996; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1997; GFX8-NEXT:    s_waitcnt vmcnt(0)
1998; GFX8-NEXT:    v_max_f32_e64 v2, v3, v3 clamp
1999; GFX8-NEXT:    flat_store_dword v[0:1], v2
2000; GFX8-NEXT:    s_endpgm
2001;
2002; GFX9-LABEL: v_clamp_med3_bay_f32_no_dx10_clamp:
2003; GFX9:       ; %bb.0:
2004; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
2005; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2006; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2007; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
2008; GFX9-NEXT:    s_waitcnt vmcnt(0)
2009; GFX9-NEXT:    v_max_f32_e64 v1, v1, v1 clamp
2010; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
2011; GFX9-NEXT:    s_endpgm
2012;
2013; GFX11-LABEL: v_clamp_med3_bay_f32_no_dx10_clamp:
2014; GFX11:       ; %bb.0:
2015; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
2016; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2017; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2018; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
2019; GFX11-NEXT:    s_waitcnt vmcnt(0)
2020; GFX11-NEXT:    v_max_f32_e64 v1, v1, v1 clamp
2021; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
2022; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2023; GFX11-NEXT:    s_endpgm
2024  %tid = call i32 @llvm.amdgcn.workitem.id.x()
2025  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
2026  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
2027  %a = load float, float addrspace(1)* %gep0
2028  %med = call float @llvm.amdgcn.fmed3.f32(float 1.0, float 0.0, float %a)
2029  store float %med, float addrspace(1)* %out.gep
2030  ret void
2031}
2032
2033define amdgpu_kernel void @v_clamp_med3_yab_f32_no_dx10_clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 {
2034; GFX6-LABEL: v_clamp_med3_yab_f32_no_dx10_clamp:
2035; GFX6:       ; %bb.0:
2036; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
2037; GFX6-NEXT:    s_mov_b32 s7, 0xf000
2038; GFX6-NEXT:    s_mov_b32 s6, 0
2039; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2040; GFX6-NEXT:    v_mov_b32_e32 v1, 0
2041; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2042; GFX6-NEXT:    s_mov_b64 s[4:5], s[2:3]
2043; GFX6-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
2044; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
2045; GFX6-NEXT:    s_waitcnt vmcnt(0)
2046; GFX6-NEXT:    v_med3_f32 v2, v2, 0, 1.0
2047; GFX6-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
2048; GFX6-NEXT:    s_endpgm
2049;
2050; GFX8-LABEL: v_clamp_med3_yab_f32_no_dx10_clamp:
2051; GFX8:       ; %bb.0:
2052; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
2053; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
2054; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2055; GFX8-NEXT:    v_mov_b32_e32 v1, s3
2056; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
2057; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2058; GFX8-NEXT:    flat_load_dword v3, v[0:1]
2059; GFX8-NEXT:    v_mov_b32_e32 v1, s1
2060; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
2061; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2062; GFX8-NEXT:    s_waitcnt vmcnt(0)
2063; GFX8-NEXT:    v_med3_f32 v2, v3, 0, 1.0
2064; GFX8-NEXT:    flat_store_dword v[0:1], v2
2065; GFX8-NEXT:    s_endpgm
2066;
2067; GFX9-LABEL: v_clamp_med3_yab_f32_no_dx10_clamp:
2068; GFX9:       ; %bb.0:
2069; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
2070; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2071; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2072; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
2073; GFX9-NEXT:    s_waitcnt vmcnt(0)
2074; GFX9-NEXT:    v_med3_f32 v1, v1, 0, 1.0
2075; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
2076; GFX9-NEXT:    s_endpgm
2077;
2078; GFX11-LABEL: v_clamp_med3_yab_f32_no_dx10_clamp:
2079; GFX11:       ; %bb.0:
2080; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
2081; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2082; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2083; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
2084; GFX11-NEXT:    s_waitcnt vmcnt(0)
2085; GFX11-NEXT:    v_med3_f32 v1, v1, 0, 1.0
2086; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
2087; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2088; GFX11-NEXT:    s_endpgm
2089  %tid = call i32 @llvm.amdgcn.workitem.id.x()
2090  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
2091  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
2092  %a = load float, float addrspace(1)* %gep0
2093  %med = call float @llvm.amdgcn.fmed3.f32(float %a, float 0.0, float 1.0)
2094  store float %med, float addrspace(1)* %out.gep
2095  ret void
2096}
2097
2098define amdgpu_kernel void @v_clamp_med3_yba_f32_no_dx10_clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 {
2099; GFX6-LABEL: v_clamp_med3_yba_f32_no_dx10_clamp:
2100; GFX6:       ; %bb.0:
2101; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
2102; GFX6-NEXT:    s_mov_b32 s7, 0xf000
2103; GFX6-NEXT:    s_mov_b32 s6, 0
2104; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2105; GFX6-NEXT:    v_mov_b32_e32 v1, 0
2106; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2107; GFX6-NEXT:    s_mov_b64 s[4:5], s[2:3]
2108; GFX6-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
2109; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
2110; GFX6-NEXT:    s_waitcnt vmcnt(0)
2111; GFX6-NEXT:    v_med3_f32 v2, v2, 1.0, 0
2112; GFX6-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
2113; GFX6-NEXT:    s_endpgm
2114;
2115; GFX8-LABEL: v_clamp_med3_yba_f32_no_dx10_clamp:
2116; GFX8:       ; %bb.0:
2117; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
2118; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
2119; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2120; GFX8-NEXT:    v_mov_b32_e32 v1, s3
2121; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
2122; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2123; GFX8-NEXT:    flat_load_dword v3, v[0:1]
2124; GFX8-NEXT:    v_mov_b32_e32 v1, s1
2125; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
2126; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2127; GFX8-NEXT:    s_waitcnt vmcnt(0)
2128; GFX8-NEXT:    v_med3_f32 v2, v3, 1.0, 0
2129; GFX8-NEXT:    flat_store_dword v[0:1], v2
2130; GFX8-NEXT:    s_endpgm
2131;
2132; GFX9-LABEL: v_clamp_med3_yba_f32_no_dx10_clamp:
2133; GFX9:       ; %bb.0:
2134; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
2135; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2136; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2137; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
2138; GFX9-NEXT:    s_waitcnt vmcnt(0)
2139; GFX9-NEXT:    v_med3_f32 v1, v1, 1.0, 0
2140; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
2141; GFX9-NEXT:    s_endpgm
2142;
2143; GFX11-LABEL: v_clamp_med3_yba_f32_no_dx10_clamp:
2144; GFX11:       ; %bb.0:
2145; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
2146; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2147; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2148; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
2149; GFX11-NEXT:    s_waitcnt vmcnt(0)
2150; GFX11-NEXT:    v_med3_f32 v1, v1, 1.0, 0
2151; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
2152; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2153; GFX11-NEXT:    s_endpgm
2154  %tid = call i32 @llvm.amdgcn.workitem.id.x()
2155  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
2156  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
2157  %a = load float, float addrspace(1)* %gep0
2158  %med = call float @llvm.amdgcn.fmed3.f32(float %a, float 1.0, float 0.0)
2159  store float %med, float addrspace(1)* %out.gep
2160  ret void
2161}
2162
2163define amdgpu_kernel void @v_clamp_med3_ayb_f32_no_dx10_clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 {
2164; GFX6-LABEL: v_clamp_med3_ayb_f32_no_dx10_clamp:
2165; GFX6:       ; %bb.0:
2166; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
2167; GFX6-NEXT:    s_mov_b32 s7, 0xf000
2168; GFX6-NEXT:    s_mov_b32 s6, 0
2169; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2170; GFX6-NEXT:    v_mov_b32_e32 v1, 0
2171; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2172; GFX6-NEXT:    s_mov_b64 s[4:5], s[2:3]
2173; GFX6-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
2174; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
2175; GFX6-NEXT:    s_waitcnt vmcnt(0)
2176; GFX6-NEXT:    v_med3_f32 v2, 0, v2, 1.0
2177; GFX6-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
2178; GFX6-NEXT:    s_endpgm
2179;
2180; GFX8-LABEL: v_clamp_med3_ayb_f32_no_dx10_clamp:
2181; GFX8:       ; %bb.0:
2182; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
2183; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
2184; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2185; GFX8-NEXT:    v_mov_b32_e32 v1, s3
2186; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
2187; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2188; GFX8-NEXT:    flat_load_dword v3, v[0:1]
2189; GFX8-NEXT:    v_mov_b32_e32 v1, s1
2190; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
2191; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2192; GFX8-NEXT:    s_waitcnt vmcnt(0)
2193; GFX8-NEXT:    v_med3_f32 v2, 0, v3, 1.0
2194; GFX8-NEXT:    flat_store_dword v[0:1], v2
2195; GFX8-NEXT:    s_endpgm
2196;
2197; GFX9-LABEL: v_clamp_med3_ayb_f32_no_dx10_clamp:
2198; GFX9:       ; %bb.0:
2199; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
2200; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2201; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2202; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
2203; GFX9-NEXT:    s_waitcnt vmcnt(0)
2204; GFX9-NEXT:    v_med3_f32 v1, 0, v1, 1.0
2205; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
2206; GFX9-NEXT:    s_endpgm
2207;
2208; GFX11-LABEL: v_clamp_med3_ayb_f32_no_dx10_clamp:
2209; GFX11:       ; %bb.0:
2210; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
2211; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2212; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2213; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
2214; GFX11-NEXT:    s_waitcnt vmcnt(0)
2215; GFX11-NEXT:    v_med3_f32 v1, 0, v1, 1.0
2216; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
2217; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2218; GFX11-NEXT:    s_endpgm
2219  %tid = call i32 @llvm.amdgcn.workitem.id.x()
2220  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
2221  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
2222  %a = load float, float addrspace(1)* %gep0
2223  %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float %a, float 1.0)
2224  store float %med, float addrspace(1)* %out.gep
2225  ret void
2226}
2227
2228define amdgpu_kernel void @v_clamp_med3_bya_f32_no_dx10_clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 {
2229; GFX6-LABEL: v_clamp_med3_bya_f32_no_dx10_clamp:
2230; GFX6:       ; %bb.0:
2231; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
2232; GFX6-NEXT:    s_mov_b32 s7, 0xf000
2233; GFX6-NEXT:    s_mov_b32 s6, 0
2234; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2235; GFX6-NEXT:    v_mov_b32_e32 v1, 0
2236; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2237; GFX6-NEXT:    s_mov_b64 s[4:5], s[2:3]
2238; GFX6-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
2239; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
2240; GFX6-NEXT:    s_waitcnt vmcnt(0)
2241; GFX6-NEXT:    v_med3_f32 v2, 1.0, v2, 0
2242; GFX6-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
2243; GFX6-NEXT:    s_endpgm
2244;
2245; GFX8-LABEL: v_clamp_med3_bya_f32_no_dx10_clamp:
2246; GFX8:       ; %bb.0:
2247; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
2248; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
2249; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2250; GFX8-NEXT:    v_mov_b32_e32 v1, s3
2251; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
2252; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2253; GFX8-NEXT:    flat_load_dword v3, v[0:1]
2254; GFX8-NEXT:    v_mov_b32_e32 v1, s1
2255; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
2256; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2257; GFX8-NEXT:    s_waitcnt vmcnt(0)
2258; GFX8-NEXT:    v_med3_f32 v2, 1.0, v3, 0
2259; GFX8-NEXT:    flat_store_dword v[0:1], v2
2260; GFX8-NEXT:    s_endpgm
2261;
2262; GFX9-LABEL: v_clamp_med3_bya_f32_no_dx10_clamp:
2263; GFX9:       ; %bb.0:
2264; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
2265; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2266; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2267; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
2268; GFX9-NEXT:    s_waitcnt vmcnt(0)
2269; GFX9-NEXT:    v_med3_f32 v1, 1.0, v1, 0
2270; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
2271; GFX9-NEXT:    s_endpgm
2272;
2273; GFX11-LABEL: v_clamp_med3_bya_f32_no_dx10_clamp:
2274; GFX11:       ; %bb.0:
2275; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
2276; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2277; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2278; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
2279; GFX11-NEXT:    s_waitcnt vmcnt(0)
2280; GFX11-NEXT:    v_med3_f32 v1, 1.0, v1, 0
2281; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
2282; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2283; GFX11-NEXT:    s_endpgm
2284  %tid = call i32 @llvm.amdgcn.workitem.id.x()
2285  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
2286  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
2287  %a = load float, float addrspace(1)* %gep0
2288  %med = call float @llvm.amdgcn.fmed3.f32(float 1.0, float %a, float 0.0)
2289  store float %med, float addrspace(1)* %out.gep
2290  ret void
2291}
2292
2293define amdgpu_kernel void @v_clamp_constant_qnan_f32_no_dx10_clamp(float addrspace(1)* %out) #2 {
2294; GFX6-LABEL: v_clamp_constant_qnan_f32_no_dx10_clamp:
2295; GFX6:       ; %bb.0:
2296; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
2297; GFX6-NEXT:    s_mov_b32 s3, 0xf000
2298; GFX6-NEXT:    s_mov_b32 s2, 0
2299; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2300; GFX6-NEXT:    v_mov_b32_e32 v1, 0
2301; GFX6-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
2302; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2303; GFX6-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
2304; GFX6-NEXT:    s_endpgm
2305;
2306; GFX8-LABEL: v_clamp_constant_qnan_f32_no_dx10_clamp:
2307; GFX8:       ; %bb.0:
2308; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2309; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2310; GFX8-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
2311; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2312; GFX8-NEXT:    v_mov_b32_e32 v1, s1
2313; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
2314; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2315; GFX8-NEXT:    flat_store_dword v[0:1], v2
2316; GFX8-NEXT:    s_endpgm
2317;
2318; GFX9-LABEL: v_clamp_constant_qnan_f32_no_dx10_clamp:
2319; GFX9:       ; %bb.0:
2320; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2321; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2322; GFX9-NEXT:    v_mov_b32_e32 v1, 0x7fc00000
2323; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2324; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
2325; GFX9-NEXT:    s_endpgm
2326;
2327; GFX11-LABEL: v_clamp_constant_qnan_f32_no_dx10_clamp:
2328; GFX11:       ; %bb.0:
2329; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
2330; GFX11-NEXT:    v_dual_mov_b32 v1, 0x7fc00000 :: v_dual_lshlrev_b32 v0, 2, v0
2331; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2332; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
2333; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2334; GFX11-NEXT:    s_endpgm
2335  %tid = call i32 @llvm.amdgcn.workitem.id.x()
2336  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
2337  %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float 0x7FF8000000000000)
2338  store float %med, float addrspace(1)* %out.gep
2339  ret void
2340}
2341
2342define amdgpu_kernel void @v_clamp_constant_snan_f32_no_dx10_clamp(float addrspace(1)* %out) #2 {
2343; GFX6-LABEL: v_clamp_constant_snan_f32_no_dx10_clamp:
2344; GFX6:       ; %bb.0:
2345; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
2346; GFX6-NEXT:    s_mov_b32 s3, 0xf000
2347; GFX6-NEXT:    s_mov_b32 s2, 0
2348; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2349; GFX6-NEXT:    v_mov_b32_e32 v1, 0
2350; GFX6-NEXT:    v_mov_b32_e32 v2, 0x7f800001
2351; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2352; GFX6-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
2353; GFX6-NEXT:    s_endpgm
2354;
2355; GFX8-LABEL: v_clamp_constant_snan_f32_no_dx10_clamp:
2356; GFX8:       ; %bb.0:
2357; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2358; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2359; GFX8-NEXT:    v_mov_b32_e32 v2, 0x7f800001
2360; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2361; GFX8-NEXT:    v_mov_b32_e32 v1, s1
2362; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
2363; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2364; GFX8-NEXT:    flat_store_dword v[0:1], v2
2365; GFX8-NEXT:    s_endpgm
2366;
2367; GFX9-LABEL: v_clamp_constant_snan_f32_no_dx10_clamp:
2368; GFX9:       ; %bb.0:
2369; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2370; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2371; GFX9-NEXT:    v_mov_b32_e32 v1, 0x7f800001
2372; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2373; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
2374; GFX9-NEXT:    s_endpgm
2375;
2376; GFX11-LABEL: v_clamp_constant_snan_f32_no_dx10_clamp:
2377; GFX11:       ; %bb.0:
2378; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
2379; GFX11-NEXT:    v_dual_mov_b32 v1, 0x7f800001 :: v_dual_lshlrev_b32 v0, 2, v0
2380; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2381; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
2382; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2383; GFX11-NEXT:    s_endpgm
2384  %tid = call i32 @llvm.amdgcn.workitem.id.x()
2385  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
2386  %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float bitcast (i32 2139095041 to float))
2387  store float %med, float addrspace(1)* %out.gep
2388  ret void
2389}
2390
2391define amdgpu_kernel void @v_clamp_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %aptr) #0 {
2392; GFX6-LABEL: v_clamp_v2f16:
2393; GFX6:       ; %bb.0:
2394; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
2395; GFX6-NEXT:    s_mov_b32 s7, 0xf000
2396; GFX6-NEXT:    s_mov_b32 s6, 0
2397; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2398; GFX6-NEXT:    v_mov_b32_e32 v1, 0
2399; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2400; GFX6-NEXT:    s_mov_b64 s[4:5], s[2:3]
2401; GFX6-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
2402; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
2403; GFX6-NEXT:    s_waitcnt vmcnt(0)
2404; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
2405; GFX6-NEXT:    v_cvt_f32_f16_e64 v3, v3 clamp
2406; GFX6-NEXT:    v_cvt_f32_f16_e64 v2, v2 clamp
2407; GFX6-NEXT:    v_cvt_f16_f32_e32 v3, v3
2408; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
2409; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
2410; GFX6-NEXT:    v_or_b32_e32 v2, v2, v3
2411; GFX6-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
2412; GFX6-NEXT:    s_endpgm
2413;
2414; GFX8-LABEL: v_clamp_v2f16:
2415; GFX8:       ; %bb.0:
2416; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
2417; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
2418; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2419; GFX8-NEXT:    v_mov_b32_e32 v1, s3
2420; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
2421; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2422; GFX8-NEXT:    flat_load_dword v3, v[0:1]
2423; GFX8-NEXT:    v_mov_b32_e32 v1, s1
2424; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
2425; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2426; GFX8-NEXT:    s_waitcnt vmcnt(0)
2427; GFX8-NEXT:    v_max_f16_sdwa v2, v3, v3 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
2428; GFX8-NEXT:    v_max_f16_e64 v3, v3, v3 clamp
2429; GFX8-NEXT:    v_or_b32_e32 v2, v3, v2
2430; GFX8-NEXT:    flat_store_dword v[0:1], v2
2431; GFX8-NEXT:    s_endpgm
2432;
2433; GFX9-LABEL: v_clamp_v2f16:
2434; GFX9:       ; %bb.0:
2435; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
2436; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2437; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2438; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
2439; GFX9-NEXT:    s_waitcnt vmcnt(0)
2440; GFX9-NEXT:    v_pk_max_f16 v1, v1, v1 clamp
2441; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
2442; GFX9-NEXT:    s_endpgm
2443;
2444; GFX11-LABEL: v_clamp_v2f16:
2445; GFX11:       ; %bb.0:
2446; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
2447; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2448; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2449; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
2450; GFX11-NEXT:    s_waitcnt vmcnt(0)
2451; GFX11-NEXT:    v_pk_max_f16 v1, v1, v1 clamp
2452; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
2453; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2454; GFX11-NEXT:    s_endpgm
2455  %tid = call i32 @llvm.amdgcn.workitem.id.x()
2456  %gep0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %tid
2457  %out.gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
2458  %a = load <2 x half>, <2 x half> addrspace(1)* %gep0
2459  %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> zeroinitializer)
2460  %med = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 1.0, half 1.0>)
2461
2462  store <2 x half> %med, <2 x half> addrspace(1)* %out.gep
2463  ret void
2464}
2465
2466define amdgpu_kernel void @v_clamp_v2f16_undef_elt(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %aptr) #0 {
2467; GFX6-LABEL: v_clamp_v2f16_undef_elt:
2468; GFX6:       ; %bb.0:
2469; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
2470; GFX6-NEXT:    s_mov_b32 s7, 0xf000
2471; GFX6-NEXT:    s_mov_b32 s6, 0
2472; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2473; GFX6-NEXT:    v_mov_b32_e32 v1, 0
2474; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2475; GFX6-NEXT:    s_mov_b64 s[4:5], s[2:3]
2476; GFX6-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
2477; GFX6-NEXT:    s_mov_b32 s2, 0x7fc00000
2478; GFX6-NEXT:    s_waitcnt vmcnt(0)
2479; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v2
2480; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
2481; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v2
2482; GFX6-NEXT:    v_mul_f32_e32 v3, 1.0, v3
2483; GFX6-NEXT:    v_max_f32_e32 v3, 0x7fc00000, v3
2484; GFX6-NEXT:    v_mul_f32_e32 v2, 1.0, v2
2485; GFX6-NEXT:    v_med3_f32 v2, v2, 0, s2
2486; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
2487; GFX6-NEXT:    v_min_f32_e32 v3, 1.0, v3
2488; GFX6-NEXT:    v_cvt_f16_f32_e32 v3, v3
2489; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
2490; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
2491; GFX6-NEXT:    v_or_b32_e32 v2, v3, v2
2492; GFX6-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
2493; GFX6-NEXT:    s_endpgm
2494;
2495; GFX8-LABEL: v_clamp_v2f16_undef_elt:
2496; GFX8:       ; %bb.0:
2497; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
2498; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
2499; GFX8-NEXT:    v_mov_b32_e32 v4, 0x7e00
2500; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2501; GFX8-NEXT:    v_mov_b32_e32 v1, s3
2502; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
2503; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2504; GFX8-NEXT:    flat_load_dword v3, v[0:1]
2505; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
2506; GFX8-NEXT:    v_mov_b32_e32 v1, s1
2507; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2508; GFX8-NEXT:    s_waitcnt vmcnt(0)
2509; GFX8-NEXT:    v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
2510; GFX8-NEXT:    v_max_f16_e32 v3, v3, v3
2511; GFX8-NEXT:    v_max_f16_e32 v2, 0, v2
2512; GFX8-NEXT:    v_max_f16_e32 v3, 0x7e00, v3
2513; GFX8-NEXT:    v_min_f16_e32 v3, 1.0, v3
2514; GFX8-NEXT:    v_min_f16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2515; GFX8-NEXT:    v_or_b32_e32 v2, v3, v2
2516; GFX8-NEXT:    flat_store_dword v[0:1], v2
2517; GFX8-NEXT:    s_endpgm
2518;
2519; GFX9-LABEL: v_clamp_v2f16_undef_elt:
2520; GFX9:       ; %bb.0:
2521; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
2522; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2523; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2524; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
2525; GFX9-NEXT:    s_waitcnt vmcnt(0)
2526; GFX9-NEXT:    v_pk_max_f16 v1, v1, v1 clamp
2527; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
2528; GFX9-NEXT:    s_endpgm
2529;
2530; GFX11-LABEL: v_clamp_v2f16_undef_elt:
2531; GFX11:       ; %bb.0:
2532; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
2533; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2534; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2535; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
2536; GFX11-NEXT:    s_waitcnt vmcnt(0)
2537; GFX11-NEXT:    v_pk_max_f16 v1, v1, v1 clamp
2538; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
2539; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2540; GFX11-NEXT:    s_endpgm
2541  %tid = call i32 @llvm.amdgcn.workitem.id.x()
2542  %gep0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %tid
2543  %out.gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
2544  %a = load <2 x half>, <2 x half> addrspace(1)* %gep0
2545  %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> <half undef, half 0.0>)
2546  %med = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 1.0, half undef>)
2547
2548  store <2 x half> %med, <2 x half> addrspace(1)* %out.gep
2549  ret void
2550}
2551
2552define amdgpu_kernel void @v_clamp_v2f16_not_zero(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %aptr) #0 {
2553; GFX6-LABEL: v_clamp_v2f16_not_zero:
2554; GFX6:       ; %bb.0:
2555; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
2556; GFX6-NEXT:    s_mov_b32 s7, 0xf000
2557; GFX6-NEXT:    s_mov_b32 s6, 0
2558; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2559; GFX6-NEXT:    v_mov_b32_e32 v1, 0
2560; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2561; GFX6-NEXT:    s_mov_b64 s[4:5], s[2:3]
2562; GFX6-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
2563; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
2564; GFX6-NEXT:    s_waitcnt vmcnt(0)
2565; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v2
2566; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
2567; GFX6-NEXT:    v_cvt_f32_f16_e64 v2, v2 clamp
2568; GFX6-NEXT:    v_mul_f32_e32 v3, 1.0, v3
2569; GFX6-NEXT:    v_max_f32_e32 v3, 2.0, v3
2570; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
2571; GFX6-NEXT:    v_min_f32_e32 v3, 1.0, v3
2572; GFX6-NEXT:    v_cvt_f16_f32_e32 v3, v3
2573; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
2574; GFX6-NEXT:    v_or_b32_e32 v2, v3, v2
2575; GFX6-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
2576; GFX6-NEXT:    s_endpgm
2577;
2578; GFX8-LABEL: v_clamp_v2f16_not_zero:
2579; GFX8:       ; %bb.0:
2580; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
2581; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
2582; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2583; GFX8-NEXT:    v_mov_b32_e32 v1, s3
2584; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
2585; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2586; GFX8-NEXT:    flat_load_dword v3, v[0:1]
2587; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
2588; GFX8-NEXT:    v_mov_b32_e32 v1, s1
2589; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2590; GFX8-NEXT:    s_waitcnt vmcnt(0)
2591; GFX8-NEXT:    v_max_f16_e32 v2, v3, v3
2592; GFX8-NEXT:    v_max_f16_e32 v2, 2.0, v2
2593; GFX8-NEXT:    v_max_f16_sdwa v3, v3, v3 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
2594; GFX8-NEXT:    v_min_f16_e32 v2, 1.0, v2
2595; GFX8-NEXT:    v_or_b32_e32 v2, v2, v3
2596; GFX8-NEXT:    flat_store_dword v[0:1], v2
2597; GFX8-NEXT:    s_endpgm
2598;
2599; GFX9-LABEL: v_clamp_v2f16_not_zero:
2600; GFX9:       ; %bb.0:
2601; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
2602; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2603; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2604; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
2605; GFX9-NEXT:    s_waitcnt vmcnt(0)
2606; GFX9-NEXT:    v_pk_max_f16 v1, v1, v1
2607; GFX9-NEXT:    v_pk_max_f16 v1, v1, 2.0
2608; GFX9-NEXT:    v_pk_min_f16 v1, v1, 1.0 op_sel_hi:[1,0]
2609; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
2610; GFX9-NEXT:    s_endpgm
2611;
2612; GFX11-LABEL: v_clamp_v2f16_not_zero:
2613; GFX11:       ; %bb.0:
2614; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
2615; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2616; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2617; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
2618; GFX11-NEXT:    s_waitcnt vmcnt(0)
2619; GFX11-NEXT:    v_pk_max_f16 v1, v1, v1
2620; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2621; GFX11-NEXT:    v_pk_max_f16 v1, v1, 2.0
2622; GFX11-NEXT:    v_pk_min_f16 v1, v1, 1.0 op_sel_hi:[1,0]
2623; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
2624; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2625; GFX11-NEXT:    s_endpgm
2626  %tid = call i32 @llvm.amdgcn.workitem.id.x()
2627  %gep0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %tid
2628  %out.gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
2629  %a = load <2 x half>, <2 x half> addrspace(1)* %gep0
2630  %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> <half 2.0, half 0.0>)
2631  %med = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 1.0, half 1.0>)
2632
2633  store <2 x half> %med, <2 x half> addrspace(1)* %out.gep
2634  ret void
2635}
2636
2637define amdgpu_kernel void @v_clamp_v2f16_not_one(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %aptr) #0 {
2638; GFX6-LABEL: v_clamp_v2f16_not_one:
2639; GFX6:       ; %bb.0:
2640; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
2641; GFX6-NEXT:    s_mov_b32 s7, 0xf000
2642; GFX6-NEXT:    s_mov_b32 s6, 0
2643; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2644; GFX6-NEXT:    v_mov_b32_e32 v1, 0
2645; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2646; GFX6-NEXT:    s_mov_b64 s[4:5], s[2:3]
2647; GFX6-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
2648; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
2649; GFX6-NEXT:    s_waitcnt vmcnt(0)
2650; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
2651; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v2
2652; GFX6-NEXT:    v_cvt_f32_f16_e64 v3, v3 clamp
2653; GFX6-NEXT:    v_mul_f32_e32 v2, 1.0, v2
2654; GFX6-NEXT:    v_cvt_f16_f32_e32 v3, v3
2655; GFX6-NEXT:    v_med3_f32 v2, v2, 0, 0
2656; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
2657; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
2658; GFX6-NEXT:    v_or_b32_e32 v2, v2, v3
2659; GFX6-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
2660; GFX6-NEXT:    s_endpgm
2661;
2662; GFX8-LABEL: v_clamp_v2f16_not_one:
2663; GFX8:       ; %bb.0:
2664; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
2665; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
2666; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2667; GFX8-NEXT:    v_mov_b32_e32 v1, s3
2668; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
2669; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2670; GFX8-NEXT:    flat_load_dword v3, v[0:1]
2671; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
2672; GFX8-NEXT:    v_mov_b32_e32 v1, s1
2673; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2674; GFX8-NEXT:    s_waitcnt vmcnt(0)
2675; GFX8-NEXT:    v_max_f16_e32 v2, v3, v3
2676; GFX8-NEXT:    v_max_f16_e32 v2, 0, v2
2677; GFX8-NEXT:    v_max_f16_sdwa v3, v3, v3 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
2678; GFX8-NEXT:    v_min_f16_e32 v2, 0, v2
2679; GFX8-NEXT:    v_or_b32_e32 v2, v2, v3
2680; GFX8-NEXT:    flat_store_dword v[0:1], v2
2681; GFX8-NEXT:    s_endpgm
2682;
2683; GFX9-LABEL: v_clamp_v2f16_not_one:
2684; GFX9:       ; %bb.0:
2685; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
2686; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2687; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2688; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
2689; GFX9-NEXT:    s_waitcnt vmcnt(0)
2690; GFX9-NEXT:    v_pk_max_f16 v1, v1, v1
2691; GFX9-NEXT:    v_pk_max_f16 v1, v1, 0
2692; GFX9-NEXT:    v_pk_min_f16 v1, v1, 1.0 op_sel:[0,1] op_sel_hi:[1,0]
2693; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
2694; GFX9-NEXT:    s_endpgm
2695;
2696; GFX11-LABEL: v_clamp_v2f16_not_one:
2697; GFX11:       ; %bb.0:
2698; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
2699; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2700; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2701; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
2702; GFX11-NEXT:    s_waitcnt vmcnt(0)
2703; GFX11-NEXT:    v_pk_max_f16 v1, v1, v1
2704; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2705; GFX11-NEXT:    v_pk_max_f16 v1, v1, 0
2706; GFX11-NEXT:    v_pk_min_f16 v1, v1, 1.0 op_sel:[0,1] op_sel_hi:[1,0]
2707; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
2708; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2709; GFX11-NEXT:    s_endpgm
2710  %tid = call i32 @llvm.amdgcn.workitem.id.x()
2711  %gep0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %tid
2712  %out.gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
2713  %a = load <2 x half>, <2 x half> addrspace(1)* %gep0
2714  %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> <half 0.0, half 0.0>)
2715  %med = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 0.0, half 1.0>)
2716
2717  store <2 x half> %med, <2 x half> addrspace(1)* %out.gep
2718  ret void
2719}
2720
2721define amdgpu_kernel void @v_clamp_neg_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %aptr) #0 {
2722; GFX6-LABEL: v_clamp_neg_v2f16:
2723; GFX6:       ; %bb.0:
2724; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
2725; GFX6-NEXT:    s_mov_b32 s7, 0xf000
2726; GFX6-NEXT:    s_mov_b32 s6, 0
2727; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2728; GFX6-NEXT:    v_mov_b32_e32 v1, 0
2729; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2730; GFX6-NEXT:    s_mov_b64 s[4:5], s[2:3]
2731; GFX6-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
2732; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
2733; GFX6-NEXT:    s_waitcnt vmcnt(0)
2734; GFX6-NEXT:    v_xor_b32_e32 v2, 0x80008000, v2
2735; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
2736; GFX6-NEXT:    v_cvt_f32_f16_e64 v3, v3 clamp
2737; GFX6-NEXT:    v_cvt_f32_f16_e64 v2, v2 clamp
2738; GFX6-NEXT:    v_cvt_f16_f32_e32 v3, v3
2739; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
2740; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
2741; GFX6-NEXT:    v_or_b32_e32 v2, v2, v3
2742; GFX6-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
2743; GFX6-NEXT:    s_endpgm
2744;
2745; GFX8-LABEL: v_clamp_neg_v2f16:
2746; GFX8:       ; %bb.0:
2747; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
2748; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
2749; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2750; GFX8-NEXT:    v_mov_b32_e32 v1, s3
2751; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
2752; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2753; GFX8-NEXT:    flat_load_dword v3, v[0:1]
2754; GFX8-NEXT:    v_mov_b32_e32 v1, s1
2755; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
2756; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2757; GFX8-NEXT:    s_waitcnt vmcnt(0)
2758; GFX8-NEXT:    v_max_f16_sdwa v2, -v3, -v3 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
2759; GFX8-NEXT:    v_max_f16_e64 v3, -v3, -v3 clamp
2760; GFX8-NEXT:    v_or_b32_e32 v2, v3, v2
2761; GFX8-NEXT:    flat_store_dword v[0:1], v2
2762; GFX8-NEXT:    s_endpgm
2763;
2764; GFX9-LABEL: v_clamp_neg_v2f16:
2765; GFX9:       ; %bb.0:
2766; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
2767; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2768; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2769; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
2770; GFX9-NEXT:    s_waitcnt vmcnt(0)
2771; GFX9-NEXT:    v_pk_max_f16 v1, v1, v1 neg_lo:[1,1] neg_hi:[1,1] clamp
2772; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
2773; GFX9-NEXT:    s_endpgm
2774;
2775; GFX11-LABEL: v_clamp_neg_v2f16:
2776; GFX11:       ; %bb.0:
2777; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
2778; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2779; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2780; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
2781; GFX11-NEXT:    s_waitcnt vmcnt(0)
2782; GFX11-NEXT:    v_pk_max_f16 v1, v1, v1 neg_lo:[1,1] neg_hi:[1,1] clamp
2783; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
2784; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2785; GFX11-NEXT:    s_endpgm
2786  %tid = call i32 @llvm.amdgcn.workitem.id.x()
2787  %gep0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %tid
2788  %out.gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
2789  %a = load <2 x half>, <2 x half> addrspace(1)* %gep0
2790  %fneg.a = fsub <2 x half> <half -0.0, half -0.0>, %a
2791  %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %fneg.a, <2 x half> zeroinitializer)
2792  %med = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 1.0, half 1.0>)
2793
2794  store <2 x half> %med, <2 x half> addrspace(1)* %out.gep
2795  ret void
2796}
2797
2798define amdgpu_kernel void @v_clamp_negabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %aptr) #0 {
2799; GFX6-LABEL: v_clamp_negabs_v2f16:
2800; GFX6:       ; %bb.0:
2801; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
2802; GFX6-NEXT:    s_mov_b32 s7, 0xf000
2803; GFX6-NEXT:    s_mov_b32 s6, 0
2804; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2805; GFX6-NEXT:    v_mov_b32_e32 v1, 0
2806; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2807; GFX6-NEXT:    s_mov_b64 s[4:5], s[2:3]
2808; GFX6-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
2809; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
2810; GFX6-NEXT:    s_waitcnt vmcnt(0)
2811; GFX6-NEXT:    v_or_b32_e32 v2, 0x80008000, v2
2812; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
2813; GFX6-NEXT:    v_cvt_f32_f16_e64 v3, v3 clamp
2814; GFX6-NEXT:    v_cvt_f32_f16_e64 v2, v2 clamp
2815; GFX6-NEXT:    v_cvt_f16_f32_e32 v3, v3
2816; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
2817; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
2818; GFX6-NEXT:    v_or_b32_e32 v2, v2, v3
2819; GFX6-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
2820; GFX6-NEXT:    s_endpgm
2821;
2822; GFX8-LABEL: v_clamp_negabs_v2f16:
2823; GFX8:       ; %bb.0:
2824; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
2825; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
2826; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2827; GFX8-NEXT:    v_mov_b32_e32 v1, s3
2828; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
2829; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2830; GFX8-NEXT:    flat_load_dword v3, v[0:1]
2831; GFX8-NEXT:    v_mov_b32_e32 v1, s1
2832; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
2833; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2834; GFX8-NEXT:    s_waitcnt vmcnt(0)
2835; GFX8-NEXT:    v_max_f16_sdwa v2, -|v3|, -|v3| clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
2836; GFX8-NEXT:    v_max_f16_e64 v3, -|v3|, -|v3| clamp
2837; GFX8-NEXT:    v_or_b32_e32 v2, v3, v2
2838; GFX8-NEXT:    flat_store_dword v[0:1], v2
2839; GFX8-NEXT:    s_endpgm
2840;
2841; GFX9-LABEL: v_clamp_negabs_v2f16:
2842; GFX9:       ; %bb.0:
2843; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
2844; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2845; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2846; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
2847; GFX9-NEXT:    s_waitcnt vmcnt(0)
2848; GFX9-NEXT:    v_and_b32_e32 v1, 0x7fff7fff, v1
2849; GFX9-NEXT:    v_pk_max_f16 v1, v1, v1 neg_lo:[1,1] neg_hi:[1,1] clamp
2850; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
2851; GFX9-NEXT:    s_endpgm
2852;
2853; GFX11-LABEL: v_clamp_negabs_v2f16:
2854; GFX11:       ; %bb.0:
2855; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
2856; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2857; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2858; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
2859; GFX11-NEXT:    s_waitcnt vmcnt(0)
2860; GFX11-NEXT:    v_and_b32_e32 v1, 0x7fff7fff, v1
2861; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2862; GFX11-NEXT:    v_pk_max_f16 v1, v1, v1 neg_lo:[1,1] neg_hi:[1,1] clamp
2863; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
2864; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2865; GFX11-NEXT:    s_endpgm
2866  %tid = call i32 @llvm.amdgcn.workitem.id.x()
2867  %gep0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %tid
2868  %out.gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
2869  %a = load <2 x half>, <2 x half> addrspace(1)* %gep0
2870  %fabs.a = call <2 x half> @llvm.fabs.v2f16(<2 x half> %a)
2871  %fneg.fabs.a = fsub <2 x half> <half -0.0, half -0.0>, %fabs.a
2872
2873  %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %fneg.fabs.a, <2 x half> zeroinitializer)
2874  %med = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 1.0, half 1.0>)
2875
2876  store <2 x half> %med, <2 x half> addrspace(1)* %out.gep
2877  ret void
2878}
2879
2880define amdgpu_kernel void @v_clamp_neglo_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %aptr) #0 {
2881; GFX6-LABEL: v_clamp_neglo_v2f16:
2882; GFX6:       ; %bb.0:
2883; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
2884; GFX6-NEXT:    s_mov_b32 s7, 0xf000
2885; GFX6-NEXT:    s_mov_b32 s6, 0
2886; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2887; GFX6-NEXT:    v_mov_b32_e32 v1, 0
2888; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2889; GFX6-NEXT:    s_mov_b64 s[4:5], s[2:3]
2890; GFX6-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
2891; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
2892; GFX6-NEXT:    s_waitcnt vmcnt(0)
2893; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
2894; GFX6-NEXT:    v_cvt_f32_f16_e64 v3, v3 clamp
2895; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v2
2896; GFX6-NEXT:    v_cvt_f32_f16_e64 v2, -v2 clamp
2897; GFX6-NEXT:    v_cvt_f16_f32_e32 v3, v3
2898; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
2899; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
2900; GFX6-NEXT:    v_or_b32_e32 v2, v2, v3
2901; GFX6-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
2902; GFX6-NEXT:    s_endpgm
2903;
2904; GFX8-LABEL: v_clamp_neglo_v2f16:
2905; GFX8:       ; %bb.0:
2906; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
2907; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
2908; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2909; GFX8-NEXT:    v_mov_b32_e32 v1, s3
2910; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
2911; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2912; GFX8-NEXT:    flat_load_dword v3, v[0:1]
2913; GFX8-NEXT:    v_mov_b32_e32 v1, s1
2914; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
2915; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2916; GFX8-NEXT:    s_waitcnt vmcnt(0)
2917; GFX8-NEXT:    v_max_f16_sdwa v2, v3, v3 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
2918; GFX8-NEXT:    v_max_f16_e64 v3, -v3, -v3 clamp
2919; GFX8-NEXT:    v_or_b32_e32 v2, v3, v2
2920; GFX8-NEXT:    flat_store_dword v[0:1], v2
2921; GFX8-NEXT:    s_endpgm
2922;
2923; GFX9-LABEL: v_clamp_neglo_v2f16:
2924; GFX9:       ; %bb.0:
2925; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
2926; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2927; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2928; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
2929; GFX9-NEXT:    s_waitcnt vmcnt(0)
2930; GFX9-NEXT:    v_pk_max_f16 v1, v1, v1 neg_lo:[1,1] clamp
2931; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
2932; GFX9-NEXT:    s_endpgm
2933;
2934; GFX11-LABEL: v_clamp_neglo_v2f16:
2935; GFX11:       ; %bb.0:
2936; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
2937; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2938; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2939; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
2940; GFX11-NEXT:    s_waitcnt vmcnt(0)
2941; GFX11-NEXT:    v_pk_max_f16 v1, v1, v1 neg_lo:[1,1] clamp
2942; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
2943; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2944; GFX11-NEXT:    s_endpgm
2945  %tid = call i32 @llvm.amdgcn.workitem.id.x()
2946  %gep0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %tid
2947  %out.gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
2948  %a = load <2 x half>, <2 x half> addrspace(1)* %gep0
2949  %lo = extractelement <2 x half> %a, i32 0
2950  %neg.lo = fsub half -0.0, %lo
2951  %neg.lo.vec = insertelement <2 x half> %a, half %neg.lo, i32 0
2952  %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %neg.lo.vec, <2 x half> zeroinitializer)
2953  %med = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 1.0, half 1.0>)
2954
2955  store <2 x half> %med, <2 x half> addrspace(1)* %out.gep
2956  ret void
2957}
2958
2959define amdgpu_kernel void @v_clamp_neghi_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %aptr) #0 {
2960; GFX6-LABEL: v_clamp_neghi_v2f16:
2961; GFX6:       ; %bb.0:
2962; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
2963; GFX6-NEXT:    s_mov_b32 s7, 0xf000
2964; GFX6-NEXT:    s_mov_b32 s6, 0
2965; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2966; GFX6-NEXT:    v_mov_b32_e32 v1, 0
2967; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2968; GFX6-NEXT:    s_mov_b64 s[4:5], s[2:3]
2969; GFX6-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
2970; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
2971; GFX6-NEXT:    s_waitcnt vmcnt(0)
2972; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
2973; GFX6-NEXT:    v_cvt_f32_f16_e64 v3, -v3 clamp
2974; GFX6-NEXT:    v_cvt_f32_f16_e64 v2, v2 clamp
2975; GFX6-NEXT:    v_cvt_f16_f32_e32 v3, v3
2976; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
2977; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
2978; GFX6-NEXT:    v_or_b32_e32 v2, v2, v3
2979; GFX6-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
2980; GFX6-NEXT:    s_endpgm
2981;
2982; GFX8-LABEL: v_clamp_neghi_v2f16:
2983; GFX8:       ; %bb.0:
2984; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
2985; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
2986; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2987; GFX8-NEXT:    v_mov_b32_e32 v1, s3
2988; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
2989; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2990; GFX8-NEXT:    flat_load_dword v3, v[0:1]
2991; GFX8-NEXT:    v_mov_b32_e32 v1, s1
2992; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
2993; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2994; GFX8-NEXT:    s_waitcnt vmcnt(0)
2995; GFX8-NEXT:    v_max_f16_sdwa v2, -v3, -v3 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
2996; GFX8-NEXT:    v_max_f16_e64 v3, v3, v3 clamp
2997; GFX8-NEXT:    v_or_b32_e32 v2, v3, v2
2998; GFX8-NEXT:    flat_store_dword v[0:1], v2
2999; GFX8-NEXT:    s_endpgm
3000;
3001; GFX9-LABEL: v_clamp_neghi_v2f16:
3002; GFX9:       ; %bb.0:
3003; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
3004; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3005; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3006; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
3007; GFX9-NEXT:    s_waitcnt vmcnt(0)
3008; GFX9-NEXT:    v_pk_max_f16 v1, v1, v1 neg_hi:[1,1] clamp
3009; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
3010; GFX9-NEXT:    s_endpgm
3011;
3012; GFX11-LABEL: v_clamp_neghi_v2f16:
3013; GFX11:       ; %bb.0:
3014; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
3015; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3016; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
3017; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
3018; GFX11-NEXT:    s_waitcnt vmcnt(0)
3019; GFX11-NEXT:    v_pk_max_f16 v1, v1, v1 neg_hi:[1,1] clamp
3020; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
3021; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
3022; GFX11-NEXT:    s_endpgm
3023  %tid = call i32 @llvm.amdgcn.workitem.id.x()
3024  %gep0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %tid
3025  %out.gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
3026  %a = load <2 x half>, <2 x half> addrspace(1)* %gep0
3027  %hi = extractelement <2 x half> %a, i32 1
3028  %neg.hi = fsub half -0.0, %hi
3029  %neg.hi.vec = insertelement <2 x half> %a, half %neg.hi, i32 1
3030  %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %neg.hi.vec, <2 x half> zeroinitializer)
3031  %med = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 1.0, half 1.0>)
3032
3033  store <2 x half> %med, <2 x half> addrspace(1)* %out.gep
3034  ret void
3035}
3036
3037define amdgpu_kernel void @v_clamp_v2f16_shuffle(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %aptr) #0 {
3038; GFX6-LABEL: v_clamp_v2f16_shuffle:
3039; GFX6:       ; %bb.0:
3040; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
3041; GFX6-NEXT:    s_mov_b32 s7, 0xf000
3042; GFX6-NEXT:    s_mov_b32 s6, 0
3043; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3044; GFX6-NEXT:    v_mov_b32_e32 v1, 0
3045; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
3046; GFX6-NEXT:    s_mov_b64 s[4:5], s[2:3]
3047; GFX6-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
3048; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
3049; GFX6-NEXT:    s_waitcnt vmcnt(0)
3050; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
3051; GFX6-NEXT:    v_cvt_f32_f16_e64 v2, v2 clamp
3052; GFX6-NEXT:    v_cvt_f32_f16_e64 v3, v3 clamp
3053; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
3054; GFX6-NEXT:    v_cvt_f16_f32_e32 v3, v3
3055; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
3056; GFX6-NEXT:    v_or_b32_e32 v2, v3, v2
3057; GFX6-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
3058; GFX6-NEXT:    s_endpgm
3059;
3060; GFX8-LABEL: v_clamp_v2f16_shuffle:
3061; GFX8:       ; %bb.0:
3062; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
3063; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
3064; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3065; GFX8-NEXT:    v_mov_b32_e32 v1, s3
3066; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
3067; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
3068; GFX8-NEXT:    flat_load_dword v3, v[0:1]
3069; GFX8-NEXT:    v_mov_b32_e32 v1, s1
3070; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
3071; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
3072; GFX8-NEXT:    s_waitcnt vmcnt(0)
3073; GFX8-NEXT:    v_max_f16_sdwa v2, v3, v3 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
3074; GFX8-NEXT:    v_max_f16_sdwa v3, v3, v3 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
3075; GFX8-NEXT:    v_or_b32_e32 v2, v3, v2
3076; GFX8-NEXT:    flat_store_dword v[0:1], v2
3077; GFX8-NEXT:    s_endpgm
3078;
3079; GFX9-LABEL: v_clamp_v2f16_shuffle:
3080; GFX9:       ; %bb.0:
3081; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
3082; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3083; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3084; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
3085; GFX9-NEXT:    s_waitcnt vmcnt(0)
3086; GFX9-NEXT:    v_pk_max_f16 v1, v1, v1 op_sel:[1,1] op_sel_hi:[0,0] clamp
3087; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
3088; GFX9-NEXT:    s_endpgm
3089;
3090; GFX11-LABEL: v_clamp_v2f16_shuffle:
3091; GFX11:       ; %bb.0:
3092; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
3093; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3094; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
3095; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
3096; GFX11-NEXT:    s_waitcnt vmcnt(0)
3097; GFX11-NEXT:    v_pk_max_f16 v1, v1, v1 op_sel:[1,1] op_sel_hi:[0,0] clamp
3098; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
3099; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
3100; GFX11-NEXT:    s_endpgm
3101  %tid = call i32 @llvm.amdgcn.workitem.id.x()
3102  %gep0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %tid
3103  %out.gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
3104  %a = load <2 x half>, <2 x half> addrspace(1)* %gep0
3105  %shuf = shufflevector <2 x half> %a, <2 x half> undef, <2 x i32> <i32 1, i32 0>
3106  %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %shuf, <2 x half> zeroinitializer)
3107  %med = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 1.0, half 1.0>)
3108
3109  store <2 x half> %med, <2 x half> addrspace(1)* %out.gep
3110  ret void
3111}
3112
3113define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts0(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %aptr) #0 {
3114; GFX6-LABEL: v_clamp_v2f16_undef_limit_elts0:
3115; GFX6:       ; %bb.0:
3116; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
3117; GFX6-NEXT:    s_mov_b32 s7, 0xf000
3118; GFX6-NEXT:    s_mov_b32 s6, 0
3119; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3120; GFX6-NEXT:    v_mov_b32_e32 v1, 0
3121; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
3122; GFX6-NEXT:    s_mov_b64 s[4:5], s[2:3]
3123; GFX6-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
3124; GFX6-NEXT:    s_mov_b32 s2, 0x7fc00000
3125; GFX6-NEXT:    s_waitcnt vmcnt(0)
3126; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
3127; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v3
3128; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v2
3129; GFX6-NEXT:    v_mul_f32_e32 v3, 1.0, v3
3130; GFX6-NEXT:    v_mul_f32_e32 v2, 1.0, v2
3131; GFX6-NEXT:    v_med3_f32 v3, v3, s2, 1.0
3132; GFX6-NEXT:    v_cvt_f16_f32_e32 v3, v3
3133; GFX6-NEXT:    v_med3_f32 v2, v2, 0, s2
3134; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
3135; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
3136; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
3137; GFX6-NEXT:    v_or_b32_e32 v2, v2, v3
3138; GFX6-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
3139; GFX6-NEXT:    s_endpgm
3140;
3141; GFX8-LABEL: v_clamp_v2f16_undef_limit_elts0:
3142; GFX8:       ; %bb.0:
3143; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
3144; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
3145; GFX8-NEXT:    v_mov_b32_e32 v4, 0x3c00
3146; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3147; GFX8-NEXT:    v_mov_b32_e32 v1, s3
3148; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
3149; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
3150; GFX8-NEXT:    flat_load_dword v3, v[0:1]
3151; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
3152; GFX8-NEXT:    v_mov_b32_e32 v1, s1
3153; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
3154; GFX8-NEXT:    s_waitcnt vmcnt(0)
3155; GFX8-NEXT:    v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
3156; GFX8-NEXT:    v_max_f16_e32 v3, v3, v3
3157; GFX8-NEXT:    v_max_f16_e32 v2, 0x7e00, v2
3158; GFX8-NEXT:    v_max_f16_e32 v3, 0, v3
3159; GFX8-NEXT:    v_min_f16_e32 v3, 0x7e00, v3
3160; GFX8-NEXT:    v_min_f16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
3161; GFX8-NEXT:    v_or_b32_e32 v2, v3, v2
3162; GFX8-NEXT:    flat_store_dword v[0:1], v2
3163; GFX8-NEXT:    s_endpgm
3164;
3165; GFX9-LABEL: v_clamp_v2f16_undef_limit_elts0:
3166; GFX9:       ; %bb.0:
3167; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
3168; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3169; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3170; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
3171; GFX9-NEXT:    s_waitcnt vmcnt(0)
3172; GFX9-NEXT:    v_pk_max_f16 v1, v1, v1 clamp
3173; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
3174; GFX9-NEXT:    s_endpgm
3175;
3176; GFX11-LABEL: v_clamp_v2f16_undef_limit_elts0:
3177; GFX11:       ; %bb.0:
3178; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
3179; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3180; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
3181; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
3182; GFX11-NEXT:    s_waitcnt vmcnt(0)
3183; GFX11-NEXT:    v_pk_max_f16 v1, v1, v1 clamp
3184; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
3185; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
3186; GFX11-NEXT:    s_endpgm
3187  %tid = call i32 @llvm.amdgcn.workitem.id.x()
3188  %gep0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %tid
3189  %out.gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
3190  %a = load <2 x half>, <2 x half> addrspace(1)* %gep0
3191  %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> <half 0.0, half undef>)
3192  %med = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half undef, half 1.0>)
3193
3194  store <2 x half> %med, <2 x half> addrspace(1)* %out.gep
3195  ret void
3196}
3197
3198define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts1(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %aptr) #0 {
3199; GFX6-LABEL: v_clamp_v2f16_undef_limit_elts1:
3200; GFX6:       ; %bb.0:
3201; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
3202; GFX6-NEXT:    s_mov_b32 s7, 0xf000
3203; GFX6-NEXT:    s_mov_b32 s6, 0
3204; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3205; GFX6-NEXT:    v_mov_b32_e32 v1, 0
3206; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
3207; GFX6-NEXT:    s_mov_b64 s[4:5], s[2:3]
3208; GFX6-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
3209; GFX6-NEXT:    s_mov_b32 s2, 0x7fc00000
3210; GFX6-NEXT:    s_waitcnt vmcnt(0)
3211; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v2
3212; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
3213; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v2
3214; GFX6-NEXT:    v_mul_f32_e32 v3, 1.0, v3
3215; GFX6-NEXT:    v_max_f32_e32 v3, 0x7fc00000, v3
3216; GFX6-NEXT:    v_mul_f32_e32 v2, 1.0, v2
3217; GFX6-NEXT:    v_med3_f32 v2, v2, 0, s2
3218; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
3219; GFX6-NEXT:    v_min_f32_e32 v3, 1.0, v3
3220; GFX6-NEXT:    v_cvt_f16_f32_e32 v3, v3
3221; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
3222; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
3223; GFX6-NEXT:    v_or_b32_e32 v2, v3, v2
3224; GFX6-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
3225; GFX6-NEXT:    s_endpgm
3226;
3227; GFX8-LABEL: v_clamp_v2f16_undef_limit_elts1:
3228; GFX8:       ; %bb.0:
3229; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
3230; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
3231; GFX8-NEXT:    v_mov_b32_e32 v4, 0x7e00
3232; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3233; GFX8-NEXT:    v_mov_b32_e32 v1, s3
3234; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
3235; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
3236; GFX8-NEXT:    flat_load_dword v3, v[0:1]
3237; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
3238; GFX8-NEXT:    v_mov_b32_e32 v1, s1
3239; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
3240; GFX8-NEXT:    s_waitcnt vmcnt(0)
3241; GFX8-NEXT:    v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
3242; GFX8-NEXT:    v_max_f16_e32 v3, v3, v3
3243; GFX8-NEXT:    v_max_f16_e32 v2, 0, v2
3244; GFX8-NEXT:    v_max_f16_e32 v3, 0x7e00, v3
3245; GFX8-NEXT:    v_min_f16_e32 v3, 1.0, v3
3246; GFX8-NEXT:    v_min_f16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
3247; GFX8-NEXT:    v_or_b32_e32 v2, v3, v2
3248; GFX8-NEXT:    flat_store_dword v[0:1], v2
3249; GFX8-NEXT:    s_endpgm
3250;
3251; GFX9-LABEL: v_clamp_v2f16_undef_limit_elts1:
3252; GFX9:       ; %bb.0:
3253; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
3254; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3255; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3256; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
3257; GFX9-NEXT:    s_waitcnt vmcnt(0)
3258; GFX9-NEXT:    v_pk_max_f16 v1, v1, v1 clamp
3259; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
3260; GFX9-NEXT:    s_endpgm
3261;
3262; GFX11-LABEL: v_clamp_v2f16_undef_limit_elts1:
3263; GFX11:       ; %bb.0:
3264; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
3265; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3266; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
3267; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
3268; GFX11-NEXT:    s_waitcnt vmcnt(0)
3269; GFX11-NEXT:    v_pk_max_f16 v1, v1, v1 clamp
3270; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
3271; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
3272; GFX11-NEXT:    s_endpgm
3273  %tid = call i32 @llvm.amdgcn.workitem.id.x()
3274  %gep0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %tid
3275  %out.gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
3276  %a = load <2 x half>, <2 x half> addrspace(1)* %gep0
3277  %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> <half undef, half 0.0>)
3278  %med = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 1.0, half undef>)
3279
3280  store <2 x half> %med, <2 x half> addrspace(1)* %out.gep
3281  ret void
3282}
3283
3284define amdgpu_kernel void @v_clamp_diff_source_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0
3285; GFX6-LABEL: v_clamp_diff_source_f32:
3286; GFX6:       ; %bb.0:
3287; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
3288; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
3289; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
3290; GFX6-NEXT:    s_load_dword s2, s[2:3], 0x2
3291; GFX6-NEXT:    s_mov_b32 s3, 0xf000
3292; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
3293; GFX6-NEXT:    v_mov_b32_e32 v0, s5
3294; GFX6-NEXT:    v_mov_b32_e32 v1, s2
3295; GFX6-NEXT:    v_add_f32_e32 v0, s4, v0
3296; GFX6-NEXT:    v_add_f32_e32 v1, s4, v1
3297; GFX6-NEXT:    v_max_f32_e64 v0, v0, v1 clamp
3298; GFX6-NEXT:    s_mov_b32 s2, -1
3299; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:12
3300; GFX6-NEXT:    s_endpgm
3301;
3302; GFX8-LABEL: v_clamp_diff_source_f32:
3303; GFX8:       ; %bb.0:
3304; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
3305; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3306; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
3307; GFX8-NEXT:    s_load_dword s2, s[2:3], 0x8
3308; GFX8-NEXT:    s_add_u32 s0, s0, 12
3309; GFX8-NEXT:    s_addc_u32 s1, s1, 0
3310; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3311; GFX8-NEXT:    v_mov_b32_e32 v0, s5
3312; GFX8-NEXT:    v_mov_b32_e32 v1, s2
3313; GFX8-NEXT:    v_add_f32_e32 v0, s4, v0
3314; GFX8-NEXT:    v_add_f32_e32 v1, s4, v1
3315; GFX8-NEXT:    v_max_f32_e64 v2, v0, v1 clamp
3316; GFX8-NEXT:    v_mov_b32_e32 v0, s0
3317; GFX8-NEXT:    v_mov_b32_e32 v1, s1
3318; GFX8-NEXT:    flat_store_dword v[0:1], v2
3319; GFX8-NEXT:    s_endpgm
3320;
3321; GFX9-LABEL: v_clamp_diff_source_f32:
3322; GFX9:       ; %bb.0:
3323; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
3324; GFX9-NEXT:    v_mov_b32_e32 v0, 0
3325; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3326; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
3327; GFX9-NEXT:    s_load_dword s6, s[2:3], 0x8
3328; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3329; GFX9-NEXT:    v_mov_b32_e32 v1, s5
3330; GFX9-NEXT:    v_mov_b32_e32 v2, s6
3331; GFX9-NEXT:    v_add_f32_e32 v1, s4, v1
3332; GFX9-NEXT:    v_add_f32_e32 v2, s4, v2
3333; GFX9-NEXT:    v_max_f32_e64 v1, v1, v2 clamp
3334; GFX9-NEXT:    global_store_dword v0, v1, s[0:1] offset:12
3335; GFX9-NEXT:    s_endpgm
3336;
3337; GFX11-LABEL: v_clamp_diff_source_f32:
3338; GFX11:       ; %bb.0:
3339; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
3340; GFX11-NEXT:    v_mov_b32_e32 v2, 0
3341; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
3342; GFX11-NEXT:    s_clause 0x1
3343; GFX11-NEXT:    s_load_b64 s[4:5], s[2:3], 0x0
3344; GFX11-NEXT:    s_load_b32 s2, s[2:3], 0x8
3345; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
3346; GFX11-NEXT:    v_add_f32_e64 v0, s4, s5
3347; GFX11-NEXT:    v_add_f32_e64 v1, s4, s2
3348; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
3349; GFX11-NEXT:    v_max_f32_e64 v0, v0, v1 clamp
3350; GFX11-NEXT:    global_store_b32 v2, v0, s[0:1] offset:12
3351; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
3352; GFX11-NEXT:    s_endpgm
3353{
3354  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 0
3355  %gep1 = getelementptr float, float addrspace(1)* %aptr, i32 1
3356  %gep2 = getelementptr float, float addrspace(1)* %aptr, i32 2
3357  %l0 = load float, float addrspace(1)* %gep0
3358  %l1 = load float, float addrspace(1)* %gep1
3359  %l2 = load float, float addrspace(1)* %gep2
3360  %a = fadd nsz float %l0, %l1
3361  %b = fadd nsz float %l0, %l2
3362  %res = call nsz float @llvm.maxnum.f32(float %a, float %b)
3363  %max = call nsz float @llvm.maxnum.f32(float %res, float 0.0)
3364  %min = call nsz float @llvm.minnum.f32(float %max, float 1.0)
3365  %out.gep = getelementptr float, float addrspace(1)* %out, i32 3
3366  store float %min, float addrspace(1)* %out.gep
3367  ret void
3368}
3369
3370declare i32 @llvm.amdgcn.workitem.id.x() #1
3371declare float @llvm.fabs.f32(float) #1
3372declare float @llvm.minnum.f32(float, float) #1
3373declare float @llvm.maxnum.f32(float, float) #1
3374declare float @llvm.amdgcn.fmed3.f32(float, float, float) #1
3375declare double @llvm.fabs.f64(double) #1
3376declare double @llvm.minnum.f64(double, double) #1
3377declare double @llvm.maxnum.f64(double, double) #1
3378declare half @llvm.fabs.f16(half) #1
3379declare half @llvm.minnum.f16(half, half) #1
3380declare half @llvm.maxnum.f16(half, half) #1
3381declare <2 x half> @llvm.fabs.v2f16(<2 x half>) #1
3382declare <2 x half> @llvm.minnum.v2f16(<2 x half>, <2 x half>) #1
3383declare <2 x half> @llvm.maxnum.v2f16(<2 x half>, <2 x half>) #1
3384
3385attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
3386attributes #1 = { nounwind readnone }
3387attributes #2 = { nounwind "amdgpu-dx10-clamp"="false" "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-nans-fp-math"="false" }
3388attributes #3 = { nounwind "amdgpu-dx10-clamp"="true" "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-nans-fp-math"="false" }
3389attributes #4 = { nounwind "amdgpu-dx10-clamp"="false" "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-nans-fp-math"="false" }
3390