; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck --check-prefixes=SI %s
; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck --check-prefixes=VI %s

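; These tests exercise folding a multiply by 0.5, 2.0, or 4.0 into the VOP3
; output-modifier (omod) field, which scales an instruction's result by div:2,
; mul:2, or mul:4 for free. As a summary of the folding rules these checks
; exercise: omod is ignored by the hardware when the IEEE mode bit is set, it
; does not handle signed zeros correctly, and it does not preserve denormal
; results, so the fold requires the IEEE bit to be clear, nsz, and flushed
; denormals.
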
; IEEE bit enabled for compute kernel, so omod shouldn't be used.
define amdgpu_kernel void @v_omod_div2_f32_enable_ieee_signed_zeros(float addrspace(1)* %out, float addrspace(1)* %aptr) #4 {
; SI-LABEL: v_omod_div2_f32_enable_ieee_signed_zeros:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
; SI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_add_f32_e32 v2, 1.0, v2
; SI-NEXT:    v_mul_f32_e32 v2, 0.5, v2
; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_omod_div2_f32_enable_ieee_signed_zeros:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v3, v[0:1]
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_add_f32_e32 v2, 1.0, v3
; VI-NEXT:    v_mul_f32_e32 v2, 0.5, v2
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %a = load float, float addrspace(1)* %gep0
  %add = fadd float %a, 1.0
  %div2 = fmul float %add, 0.5
  store float %div2, float addrspace(1)* %out.gep
  ret void
}

; IEEE bit enabled for compute kernel, so omod shouldn't be used.
define amdgpu_kernel void @v_omod_div2_f64_enable_ieee_signed_zeros(double addrspace(1)* %out, double addrspace(1)* %aptr) #4 {
; SI-LABEL: v_omod_div2_f64_enable_ieee_signed_zeros:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
; SI-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_add_f64 v[2:3], v[2:3], 1.0
; SI-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
; SI-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_omod_div2_f64_enable_ieee_signed_zeros:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_add_f64 v[0:1], v[0:1], 1.0
; VI-NEXT:    v_mul_f64 v[0:1], v[0:1], 0.5
; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr double, double addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr double, double addrspace(1)* %out, i32 %tid
  %a = load double, double addrspace(1)* %gep0
  %add = fadd double %a, 1.0
  %div2 = fmul double %add, 0.5
  store double %div2, double addrspace(1)* %out.gep
  ret void
}

; IEEE bit enabled for compute kernel, so omod shouldn't be used even though nsz is allowed.
define amdgpu_kernel void @v_omod_div2_f32_enable_ieee_nsz(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
; SI-LABEL: v_omod_div2_f32_enable_ieee_nsz:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
; SI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_add_f32_e32 v2, 1.0, v2
; SI-NEXT:    v_mul_f32_e32 v2, 0.5, v2
; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_omod_div2_f32_enable_ieee_nsz:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v3, v[0:1]
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_add_f32_e32 v2, 1.0, v3
; VI-NEXT:    v_mul_f32_e32 v2, 0.5, v2
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %a = load float, float addrspace(1)* %gep0
  %add = fadd float %a, 1.0
  %div2 = fmul float %add, 0.5
  store float %div2, float addrspace(1)* %out.gep
  ret void
}

; IEEE bit enabled for compute kernel, so omod shouldn't be used even though nsz is allowed.
define amdgpu_kernel void @v_omod_div2_f64_enable_ieee_nsz(double addrspace(1)* %out, double addrspace(1)* %aptr) #5 {
; SI-LABEL: v_omod_div2_f64_enable_ieee_nsz:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
; SI-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_add_f64 v[2:3], v[2:3], 1.0
; SI-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
; SI-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_omod_div2_f64_enable_ieee_nsz:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_add_f64 v[0:1], v[0:1], 1.0
; VI-NEXT:    v_mul_f64 v[0:1], v[0:1], 0.5
; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr double, double addrspace(1)* %aptr, i32 %tid
  %out.gep = getelementptr double, double addrspace(1)* %out, i32 %tid
  %a = load double, double addrspace(1)* %gep0
  %add = fadd nsz double %a, 1.0
  %div2 = fmul nsz double %add, 0.5
  store double %div2, double addrspace(1)* %out.gep
  ret void
}

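; amdgpu_ps functions run with the IEEE mode bit clear by default, unlike
; compute kernels, so the tests below are gated only by the signed-zero and
; denormal constraints.
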
; Even without the IEEE bit, omod can't be used when signed zeros are significant.
define amdgpu_ps void @v_omod_div2_f32_signed_zeros(float %a) #4 {
; SI-LABEL: v_omod_div2_f32_signed_zeros:
; SI:       ; %bb.0:
; SI-NEXT:    v_add_f32_e32 v0, 1.0, v0
; SI-NEXT:    v_mul_f32_e32 v0, 0.5, v0
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_omod_div2_f32_signed_zeros:
; VI:       ; %bb.0:
; VI-NEXT:    v_add_f32_e32 v0, 1.0, v0
; VI-NEXT:    v_mul_f32_e32 v0, 0.5, v0
; VI-NEXT:    flat_store_dword v[0:1], v0
; VI-NEXT:    s_endpgm
  %add = fadd float %a, 1.0
  %div2 = fmul float %add, 0.5
  store float %div2, float addrspace(1)* undef
  ret void
}

; Even without the IEEE bit, omod can't be used when signed zeros are significant.
define amdgpu_ps void @v_omod_div2_f64_signed_zeros(double %a) #4 {
; SI-LABEL: v_omod_div2_f64_signed_zeros:
; SI:       ; %bb.0:
; SI-NEXT:    v_add_f64 v[0:1], v[0:1], 1.0
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    v_mul_f64 v[0:1], v[0:1], 0.5
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_omod_div2_f64_signed_zeros:
; VI:       ; %bb.0:
; VI-NEXT:    v_add_f64 v[0:1], v[0:1], 1.0
; VI-NEXT:    v_mul_f64 v[0:1], v[0:1], 0.5
; VI-NEXT:    flat_store_dwordx2 v[0:1], v[0:1]
; VI-NEXT:    s_endpgm
  %add = fadd double %a, 1.0
  %div2 = fmul double %add, 0.5
  store double %div2, double addrspace(1)* undef
  ret void
}

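; With nsz from attribute #0 and f32 denormals flushed, the fmul by 0.5 folds
; into the add as a div:2 output modifier; the mul2/mul4 variants below fold
; the same way.
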
define amdgpu_ps void @v_omod_div2_f32(float %a) #0 {
; SI-LABEL: v_omod_div2_f32:
; SI:       ; %bb.0:
; SI-NEXT:    v_add_f32_e64 v0, v0, 1.0 div:2
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_omod_div2_f32:
; VI:       ; %bb.0:
; VI-NEXT:    v_add_f32_e64 v0, v0, 1.0 div:2
; VI-NEXT:    flat_store_dword v[0:1], v0
; VI-NEXT:    s_endpgm
  %add = fadd float %a, 1.0
  %div2 = fmul float %add, 0.5
  store float %div2, float addrspace(1)* undef
  ret void
}

define amdgpu_ps void @v_omod_div2_f64(double %a) #5 {
; SI-LABEL: v_omod_div2_f64:
; SI:       ; %bb.0:
; SI-NEXT:    v_add_f64 v[0:1], v[0:1], 1.0 div:2
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_omod_div2_f64:
; VI:       ; %bb.0:
; VI-NEXT:    v_add_f64 v[0:1], v[0:1], 1.0 div:2
; VI-NEXT:    flat_store_dwordx2 v[0:1], v[0:1]
; VI-NEXT:    s_endpgm
  %add = fadd nsz double %a, 1.0
  %div2 = fmul nsz double %add, 0.5
  store double %div2, double addrspace(1)* undef
  ret void
}

define amdgpu_ps void @v_omod_mul2_f32(float %a) #0 {
; SI-LABEL: v_omod_mul2_f32:
; SI:       ; %bb.0:
; SI-NEXT:    v_add_f32_e64 v0, v0, 1.0 mul:2
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_omod_mul2_f32:
; VI:       ; %bb.0:
; VI-NEXT:    v_add_f32_e64 v0, v0, 1.0 mul:2
; VI-NEXT:    flat_store_dword v[0:1], v0
; VI-NEXT:    s_endpgm
  %add = fadd float %a, 1.0
  %div2 = fmul float %add, 2.0
  store float %div2, float addrspace(1)* undef
  ret void
}

define amdgpu_ps void @v_omod_mul2_f64(double %a) #5 {
; SI-LABEL: v_omod_mul2_f64:
; SI:       ; %bb.0:
; SI-NEXT:    v_add_f64 v[0:1], v[0:1], 1.0 mul:2
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_omod_mul2_f64:
; VI:       ; %bb.0:
; VI-NEXT:    v_add_f64 v[0:1], v[0:1], 1.0 mul:2
; VI-NEXT:    flat_store_dwordx2 v[0:1], v[0:1]
; VI-NEXT:    s_endpgm
  %add = fadd nsz double %a, 1.0
  %div2 = fmul nsz double %add, 2.0
  store double %div2, double addrspace(1)* undef
  ret void
}

define amdgpu_ps void @v_omod_mul4_f32(float %a) #0 {
; SI-LABEL: v_omod_mul4_f32:
; SI:       ; %bb.0:
; SI-NEXT:    v_add_f32_e64 v0, v0, 1.0 mul:4
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_omod_mul4_f32:
; VI:       ; %bb.0:
; VI-NEXT:    v_add_f32_e64 v0, v0, 1.0 mul:4
; VI-NEXT:    flat_store_dword v[0:1], v0
; VI-NEXT:    s_endpgm
  %add = fadd float %a, 1.0
  %div2 = fmul float %add, 4.0
  store float %div2, float addrspace(1)* undef
  ret void
}

define amdgpu_ps void @v_omod_mul4_f64(double %a) #5 {
; SI-LABEL: v_omod_mul4_f64:
; SI:       ; %bb.0:
; SI-NEXT:    v_add_f64 v[0:1], v[0:1], 1.0 mul:4
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_omod_mul4_f64:
; VI:       ; %bb.0:
; VI-NEXT:    v_add_f64 v[0:1], v[0:1], 1.0 mul:4
; VI-NEXT:    flat_store_dwordx2 v[0:1], v[0:1]
; VI-NEXT:    s_endpgm
  %add = fadd nsz double %a, 1.0
  %div2 = fmul nsz double %add, 4.0
  store double %div2, double addrspace(1)* undef
  ret void
}

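; No fold when the intermediate add has a second use (here a volatile store);
; the unscaled value must still be available.
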
define amdgpu_ps void @v_omod_mul4_multi_use_f32(float %a) #0 {
; SI-LABEL: v_omod_mul4_multi_use_f32:
; SI:       ; %bb.0:
; SI-NEXT:    v_add_f32_e32 v0, 1.0, v0
; SI-NEXT:    v_mul_f32_e32 v1, 4.0, v0
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    buffer_store_dword v1, off, s[0:3], 0
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_omod_mul4_multi_use_f32:
; VI:       ; %bb.0:
; VI-NEXT:    v_add_f32_e32 v0, 1.0, v0
; VI-NEXT:    v_mul_f32_e32 v1, 4.0, v0
; VI-NEXT:    flat_store_dword v[0:1], v1
; VI-NEXT:    flat_store_dword v[0:1], v0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    s_endpgm
  %add = fadd float %a, 1.0
  %div2 = fmul float %add, 4.0
  store float %div2, float addrspace(1)* undef
  store volatile float %add, float addrspace(1)* undef
  ret void
}

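; A llvm.dbg.value use of the intermediate result is not a real use and does
; not block the fold.
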
define amdgpu_ps void @v_omod_mul4_dbg_use_f32(float %a) #0 {
; SI-LABEL: v_omod_mul4_dbg_use_f32:
; SI:       ; %bb.0:
; SI-NEXT:    v_add_f32_e64 v0, v0, 1.0 mul:4
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_omod_mul4_dbg_use_f32:
; VI:       ; %bb.0:
; VI-NEXT:    v_add_f32_e64 v0, v0, 1.0 mul:4
; VI-NEXT:    flat_store_dword v[0:1], v0
; VI-NEXT:    s_endpgm
  %add = fadd float %a, 1.0
  call void @llvm.dbg.value(metadata float %add, i64 0, metadata !4, metadata !9), !dbg !10
  %div2 = fmul float %add, 4.0
  store float %div2, float addrspace(1)* undef
  ret void
}

; Clamp is applied after omod, so folding both into the instruction is OK.
define amdgpu_ps void @v_clamp_omod_div2_f32(float %a) #0 {
; SI-LABEL: v_clamp_omod_div2_f32:
; SI:       ; %bb.0:
; SI-NEXT:    v_add_f32_e64 v0, v0, 1.0 clamp div:2
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_clamp_omod_div2_f32:
; VI:       ; %bb.0:
; VI-NEXT:    v_add_f32_e64 v0, v0, 1.0 clamp div:2
; VI-NEXT:    flat_store_dword v[0:1], v0
; VI-NEXT:    s_endpgm
  %add = fadd float %a, 1.0
  %div2 = fmul float %add, 0.5

  %max = call float @llvm.maxnum.f32(float %div2, float 0.0)
  %clamp = call float @llvm.minnum.f32(float %max, float 1.0)
  store float %clamp, float addrspace(1)* undef
  ret void
}

; Cannot fold omod into clamp.
define amdgpu_ps void @v_omod_div2_clamp_f32(float %a) #0 {
; SI-LABEL: v_omod_div2_clamp_f32:
; SI:       ; %bb.0:
; SI-NEXT:    v_add_f32_e64 v0, v0, 1.0 clamp
; SI-NEXT:    v_mul_f32_e32 v0, 0.5, v0
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_omod_div2_clamp_f32:
; VI:       ; %bb.0:
; VI-NEXT:    v_add_f32_e64 v0, v0, 1.0 clamp
; VI-NEXT:    v_mul_f32_e32 v0, 0.5, v0
; VI-NEXT:    flat_store_dword v[0:1], v0
; VI-NEXT:    s_endpgm
  %add = fadd float %a, 1.0
  %max = call float @llvm.maxnum.f32(float %add, float 0.0)
  %clamp = call float @llvm.minnum.f32(float %max, float 1.0)
  %div2 = fmul float %clamp, 0.5
  store float %div2, float addrspace(1)* undef
  ret void
}

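; The multiply reads |%add| through an fabs source modifier, which would be
; lost if the multiply were folded into the add's output modifier.
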
define amdgpu_ps void @v_omod_div2_abs_src_f32(float %a) #0 {
; SI-LABEL: v_omod_div2_abs_src_f32:
; SI:       ; %bb.0:
; SI-NEXT:    v_add_f32_e32 v0, 1.0, v0
; SI-NEXT:    v_mul_f32_e64 v0, |v0|, 0.5
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_omod_div2_abs_src_f32:
; VI:       ; %bb.0:
; VI-NEXT:    v_add_f32_e32 v0, 1.0, v0
; VI-NEXT:    v_mul_f32_e64 v0, |v0|, 0.5
; VI-NEXT:    flat_store_dword v[0:1], v0
; VI-NEXT:    s_endpgm
  %add = fadd float %a, 1.0
  %abs.add = call float @llvm.fabs.f32(float %add)
  %div2 = fmul float %abs.add, 0.5
  store float %div2, float addrspace(1)* undef
  ret void
}

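; fadd x, x is the multiply-by-2 form. In the first test below the clamp folds
; directly into the add, so no output modifier is involved.
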
define amdgpu_ps void @v_omod_add_self_clamp_f32(float %a) #0 {
; SI-LABEL: v_omod_add_self_clamp_f32:
; SI:       ; %bb.0:
; SI-NEXT:    v_add_f32_e64 v0, v0, v0 clamp
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_omod_add_self_clamp_f32:
; VI:       ; %bb.0:
; VI-NEXT:    v_add_f32_e64 v0, v0, v0 clamp
; VI-NEXT:    flat_store_dword v[0:1], v0
; VI-NEXT:    s_endpgm
  %add = fadd float %a, %a
  %max = call float @llvm.maxnum.f32(float %add, float 0.0)
  %clamp = call float @llvm.minnum.f32(float %max, float 1.0)
  store float %clamp, float addrspace(1)* undef
  ret void
}

define amdgpu_ps void @v_omod_add_clamp_self_f32(float %a) #0 {
; SI-LABEL: v_omod_add_clamp_self_f32:
; SI:       ; %bb.0:
; SI-NEXT:    v_max_f32_e64 v0, v0, v0 clamp
; SI-NEXT:    v_add_f32_e32 v0, v0, v0
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_omod_add_clamp_self_f32:
; VI:       ; %bb.0:
; VI-NEXT:    v_max_f32_e64 v0, v0, v0 clamp
; VI-NEXT:    v_add_f32_e32 v0, v0, v0
; VI-NEXT:    flat_store_dword v[0:1], v0
; VI-NEXT:    s_endpgm
  %max = call float @llvm.maxnum.f32(float %a, float 0.0)
  %clamp = call float @llvm.minnum.f32(float %max, float 1.0)
  %add = fadd float %clamp, %clamp
  store float %add, float addrspace(1)* undef
  ret void
}

define amdgpu_ps void @v_omod_add_abs_self_f32(float %a) #0 {
; SI-LABEL: v_omod_add_abs_self_f32:
; SI:       ; %bb.0:
; SI-NEXT:    v_add_f32_e32 v0, 1.0, v0
; SI-NEXT:    v_add_f32_e64 v0, |v0|, |v0|
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_omod_add_abs_self_f32:
; VI:       ; %bb.0:
; VI-NEXT:    v_add_f32_e32 v0, 1.0, v0
; VI-NEXT:    v_add_f32_e64 v0, |v0|, |v0|
; VI-NEXT:    flat_store_dword v[0:1], v0
; VI-NEXT:    s_endpgm
  %x = fadd float %a, 1.0
  %abs.x = call float @llvm.fabs.f32(float %x)
  %add = fadd float %abs.x, %abs.x
  store float %add, float addrspace(1)* undef
  ret void
}

define amdgpu_ps void @v_omod_add_abs_x_x_f32(float %a) #0 {
; SI-LABEL: v_omod_add_abs_x_x_f32:
; SI:       ; %bb.0:
; SI-NEXT:    v_add_f32_e32 v0, 1.0, v0
; SI-NEXT:    v_add_f32_e64 v0, |v0|, v0
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_omod_add_abs_x_x_f32:
; VI:       ; %bb.0:
; VI-NEXT:    v_add_f32_e32 v0, 1.0, v0
; VI-NEXT:    v_add_f32_e64 v0, |v0|, v0
; VI-NEXT:    flat_store_dword v[0:1], v0
; VI-NEXT:    s_endpgm
  %x = fadd float %a, 1.0
  %abs.x = call float @llvm.fabs.f32(float %x)
  %add = fadd float %abs.x, %x
  store float %add, float addrspace(1)* undef
  ret void
}

define amdgpu_ps void @v_omod_add_x_abs_x_f32(float %a) #0 {
; SI-LABEL: v_omod_add_x_abs_x_f32:
; SI:       ; %bb.0:
; SI-NEXT:    v_add_f32_e32 v0, 1.0, v0
; SI-NEXT:    v_add_f32_e64 v0, v0, |v0|
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_omod_add_x_abs_x_f32:
; VI:       ; %bb.0:
; VI-NEXT:    v_add_f32_e32 v0, 1.0, v0
; VI-NEXT:    v_add_f32_e64 v0, v0, |v0|
; VI-NEXT:    flat_store_dword v[0:1], v0
; VI-NEXT:    s_endpgm
  %x = fadd float %a, 1.0
  %abs.x = call float @llvm.fabs.f32(float %x)
  %add = fadd float %x, %abs.x
  store float %add, float addrspace(1)* undef
  ret void
}

; Don't fold one omod into another omod.
define amdgpu_ps void @v_omod_div2_omod_div2_f32(float %a) #0 {
; SI-LABEL: v_omod_div2_omod_div2_f32:
; SI:       ; %bb.0:
; SI-NEXT:    v_add_f32_e64 v0, v0, 1.0 div:2
; SI-NEXT:    v_mul_f32_e32 v0, 0.5, v0
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_omod_div2_omod_div2_f32:
; VI:       ; %bb.0:
; VI-NEXT:    v_add_f32_e64 v0, v0, 1.0 div:2
; VI-NEXT:    v_mul_f32_e32 v0, 0.5, v0
; VI-NEXT:    flat_store_dword v[0:1], v0
; VI-NEXT:    s_endpgm
  %add = fadd float %a, 1.0
  %div2.0 = fmul float %add, 0.5
  %div2.1 = fmul float %div2.0, 0.5
  store float %div2.1, float addrspace(1)* undef
  ret void
}

; Don't fold omod if denorms enabled.
define amdgpu_ps void @v_omod_div2_f32_denormals(float %a) #2 {
; SI-LABEL: v_omod_div2_f32_denormals:
; SI:       ; %bb.0:
; SI-NEXT:    v_add_f32_e32 v0, 1.0, v0
; SI-NEXT:    v_mul_f32_e32 v0, 0.5, v0
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_omod_div2_f32_denormals:
; VI:       ; %bb.0:
; VI-NEXT:    v_add_f32_e32 v0, 1.0, v0
; VI-NEXT:    v_mul_f32_e32 v0, 0.5, v0
; VI-NEXT:    flat_store_dword v[0:1], v0
; VI-NEXT:    s_endpgm
  %add = fadd float %a, 1.0
  %div2 = fmul float %add, 0.5
  store float %div2, float addrspace(1)* undef
  ret void
}

; Don't fold omod if denorms enabled.
define amdgpu_ps void @v_omod_div2_f64_denormals(double %a) #6 {
; SI-LABEL: v_omod_div2_f64_denormals:
; SI:       ; %bb.0:
; SI-NEXT:    v_add_f64 v[0:1], v[0:1], 1.0
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    v_mul_f64 v[0:1], v[0:1], 0.5
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_omod_div2_f64_denormals:
; VI:       ; %bb.0:
; VI-NEXT:    v_add_f64 v[0:1], v[0:1], 1.0
; VI-NEXT:    v_mul_f64 v[0:1], v[0:1], 0.5
; VI-NEXT:    flat_store_dwordx2 v[0:1], v[0:1]
; VI-NEXT:    s_endpgm
  %add = fadd double %a, 1.0
  %div2 = fmul double %add, 0.5
  store double %div2, double addrspace(1)* undef
  ret void
}

; Don't fold omod if denorms enabled for add form.
define amdgpu_ps void @v_omod_mul2_f32_denormals(float %a) #2 {
; SI-LABEL: v_omod_mul2_f32_denormals:
; SI:       ; %bb.0:
; SI-NEXT:    v_add_f32_e32 v0, 1.0, v0
; SI-NEXT:    v_add_f32_e32 v0, v0, v0
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_omod_mul2_f32_denormals:
; VI:       ; %bb.0:
; VI-NEXT:    v_add_f32_e32 v0, 1.0, v0
; VI-NEXT:    v_add_f32_e32 v0, v0, v0
; VI-NEXT:    flat_store_dword v[0:1], v0
; VI-NEXT:    s_endpgm
  %add = fadd float %a, 1.0
  %mul2 = fadd float %add, %add
  store float %mul2, float addrspace(1)* undef
  ret void
}

; Don't fold omod if denorms enabled for add form.
define amdgpu_ps void @v_omod_mul2_f64_denormals(double %a) #2 {
; SI-LABEL: v_omod_mul2_f64_denormals:
; SI:       ; %bb.0:
; SI-NEXT:    v_add_f64 v[0:1], v[0:1], 1.0
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    v_add_f64 v[0:1], v[0:1], v[0:1]
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_omod_mul2_f64_denormals:
; VI:       ; %bb.0:
; VI-NEXT:    v_add_f64 v[0:1], v[0:1], 1.0
; VI-NEXT:    v_add_f64 v[0:1], v[0:1], v[0:1]
; VI-NEXT:    flat_store_dwordx2 v[0:1], v[0:1]
; VI-NEXT:    s_endpgm
  %add = fadd double %a, 1.0
  %mul2 = fadd double %add, %add
  store double %mul2, double addrspace(1)* undef
  ret void
}

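; SI has no 16-bit instructions, so half is promoted and the fold applies to
; the resulting f32 operations. VI operates on f16 directly, where denormals
; are enabled by default here, so it must keep the separate multiply.
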
; Don't fold omod if denorms enabled.
define amdgpu_ps void @v_omod_div2_f16_denormals(half %a) #0 {
; SI-LABEL: v_omod_div2_f16_denormals:
; SI:       ; %bb.0:
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    v_add_f32_e64 v0, v0, 1.0 div:2
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_omod_div2_f16_denormals:
; VI:       ; %bb.0:
; VI-NEXT:    v_add_f16_e32 v0, 1.0, v0
; VI-NEXT:    v_mul_f16_e32 v0, 0.5, v0
; VI-NEXT:    flat_store_short v[0:1], v0
; VI-NEXT:    s_endpgm
  %add = fadd half %a, 1.0
  %div2 = fmul half %add, 0.5
  store half %div2, half addrspace(1)* undef
  ret void
}

; Don't fold omod if denorms enabled for add form.
define amdgpu_ps void @v_omod_mul2_f16_denormals(half %a) #0 {
; SI-LABEL: v_omod_mul2_f16_denormals:
; SI:       ; %bb.0:
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    v_add_f32_e64 v0, v0, 1.0 mul:2
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_omod_mul2_f16_denormals:
; VI:       ; %bb.0:
; VI-NEXT:    v_add_f16_e32 v0, 1.0, v0
; VI-NEXT:    v_add_f16_e32 v0, v0, v0
; VI-NEXT:    flat_store_short v[0:1], v0
; VI-NEXT:    s_endpgm
  %add = fadd half %a, 1.0
  %mul2 = fadd half %add, %add
  store half %mul2, half addrspace(1)* undef
  ret void
}

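; Attribute #3 flushes denormals for all types, so VI can fold the multiply
; into a div:2 output modifier on the f16 add.
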
define amdgpu_ps void @v_omod_div2_f16_no_denormals(half %a) #3 {
; SI-LABEL: v_omod_div2_f16_no_denormals:
; SI:       ; %bb.0:
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    v_add_f32_e64 v0, v0, 1.0 div:2
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_omod_div2_f16_no_denormals:
; VI:       ; %bb.0:
; VI-NEXT:    v_add_f16_e64 v0, v0, 1.0 div:2
; VI-NEXT:    flat_store_short v[0:1], v0
; VI-NEXT:    s_endpgm
  %add = fadd half %a, 1.0
  %div2 = fmul half %add, 0.5
  store half %div2, half addrspace(1)* undef
  ret void
}

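; The mul+add would normally match v_mac_f32, which (as the test name hints)
; cannot take the output modifier; folding the omod forces selection of
; v_mad_f32 with mul:2.
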
define amdgpu_ps void @v_omod_mac_to_mad(float %b, float %a) #0 {
; SI-LABEL: v_omod_mac_to_mad:
; SI:       ; %bb.0:
; SI-NEXT:    v_mad_f32 v1, v1, v1, v0 mul:2
; SI-NEXT:    v_mul_f32_e32 v0, v1, v0
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_omod_mac_to_mad:
; VI:       ; %bb.0:
; VI-NEXT:    v_mad_f32 v1, v1, v1, v0 mul:2
; VI-NEXT:    v_mul_f32_e32 v0, v1, v0
; VI-NEXT:    flat_store_dword v[0:1], v0
; VI-NEXT:    s_endpgm
  %mul = fmul float %a, %a
  %add = fadd float %mul, %b
  %mad = fmul float %add, 2.0
  %res = fmul float %mad, %b
  store float %res, float addrspace(1)* undef
  ret void
}

declare i32 @llvm.amdgcn.workitem.id.x() #1
declare float @llvm.fabs.f32(float) #1
declare float @llvm.floor.f32(float) #1
declare float @llvm.minnum.f32(float, float) #1
declare float @llvm.maxnum.f32(float, float) #1
declare float @llvm.amdgcn.fmed3.f32(float, float, float) #1
declare double @llvm.fabs.f64(double) #1
declare double @llvm.minnum.f64(double, double) #1
declare double @llvm.maxnum.f64(double, double) #1
declare half @llvm.fabs.f16(half) #1
declare half @llvm.minnum.f16(half, half) #1
declare half @llvm.maxnum.f16(half, half) #1
declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #1

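; Attribute summary: #0 flushes f32 denormals with nsz; #1 marks the intrinsic
; declarations readnone; #2 enables f32 denormals and #6 enables them for all
; types; #3 flushes denormals for all types with nsz; #4 makes signed zeros
; significant; #5 flushes denormals for all types but leaves nsz to the IR
; fast-math flags.
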
attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-signed-zeros-fp-math"="true" }
attributes #1 = { nounwind readnone }
attributes #2 = { nounwind "denormal-fp-math-f32"="ieee,ieee" "no-signed-zeros-fp-math"="true" }
attributes #3 = { nounwind "denormal-fp-math"="preserve-sign,preserve-sign" "no-signed-zeros-fp-math"="true" }
attributes #4 = { nounwind "no-signed-zeros-fp-math"="false" }
attributes #5 = { nounwind "denormal-fp-math"="preserve-sign,preserve-sign" }
attributes #6 = { nounwind "denormal-fp-math"="ieee,ieee" "no-signed-zeros-fp-math"="true" }

!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!2, !3}

!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, isOptimized: true, runtimeVersion: 0, emissionKind: NoDebug)
!1 = !DIFile(filename: "/tmp/foo.cl", directory: "/dev/null")
!2 = !{i32 2, !"Dwarf Version", i32 4}
!3 = !{i32 2, !"Debug Info Version", i32 3}
!4 = !DILocalVariable(name: "add", arg: 1, scope: !5, file: !1, line: 1)
!5 = distinct !DISubprogram(name: "foo", scope: !1, file: !1, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 2, flags: DIFlagPrototyped, isOptimized: true, unit: !0)
!6 = !DISubroutineType(types: !7)
!7 = !{null, !8}
!8 = !DIBasicType(name: "float", size: 32, align: 32)
!9 = !DIExpression()
!10 = !DILocation(line: 1, column: 42, scope: !5)