1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN:  llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -march=amdgcn -mattr=+mad-mac-f32-insts -verify-machineinstrs  < %s | FileCheck --check-prefix=SI %s
3; RUN:  llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck --check-prefix=CI %s
4; RUN:  llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefix=VI %s
5; RUN:  llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9 %s
6; RUN:  llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s
7
; frem_f16 - frem on two half values with no fast-math flags.
; The operands are loaded from %in1 and from %in2 advanced by 4 elements
; (the checks below show this as 8 bytes); the result is stored to %out.
; Per the autogenerated checks, the SI and CI prefixes extend both operands
; to f32 and expect the v_div_scale / v_rcp / v_fma chain / v_div_fmas /
; v_div_fixup division sequence, then v_trunc_f32 and v_fma_f32, then a
; conversion back to f16. The VI, GFX9 and GFX10 prefixes expect v_rcp_f32
; and v_mul_f32, a conversion to f16, then v_div_fixup_f16, v_trunc_f16 and
; v_fma_f16.
8define amdgpu_kernel void @frem_f16(half addrspace(1)* %out, half addrspace(1)* %in1,
9; SI-LABEL: frem_f16:
10; SI:       ; %bb.0:
11; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
12; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
13; SI-NEXT:    s_mov_b32 s11, 0xf000
14; SI-NEXT:    s_mov_b32 s10, -1
15; SI-NEXT:    s_waitcnt lgkmcnt(0)
16; SI-NEXT:    s_mov_b32 s8, s4
17; SI-NEXT:    s_mov_b32 s9, s5
18; SI-NEXT:    s_mov_b32 s4, s6
19; SI-NEXT:    s_mov_b32 s5, s7
20; SI-NEXT:    s_mov_b32 s6, s10
21; SI-NEXT:    s_mov_b32 s7, s11
22; SI-NEXT:    s_mov_b32 s2, s10
23; SI-NEXT:    s_mov_b32 s3, s11
24; SI-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
25; SI-NEXT:    s_waitcnt vmcnt(0)
26; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
27; SI-NEXT:    buffer_load_ushort v1, off, s[0:3], 0 offset:8
28; SI-NEXT:    s_waitcnt vmcnt(0)
29; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
30; SI-NEXT:    v_div_scale_f32 v2, vcc, v0, v1, v0
31; SI-NEXT:    v_div_scale_f32 v3, s[0:1], v1, v1, v0
32; SI-NEXT:    v_rcp_f32_e32 v4, v3
33; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
34; SI-NEXT:    v_fma_f32 v5, -v3, v4, 1.0
35; SI-NEXT:    v_fma_f32 v4, v5, v4, v4
36; SI-NEXT:    v_mul_f32_e32 v5, v2, v4
37; SI-NEXT:    v_fma_f32 v6, -v3, v5, v2
38; SI-NEXT:    v_fma_f32 v5, v6, v4, v5
39; SI-NEXT:    v_fma_f32 v2, -v3, v5, v2
40; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
41; SI-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
42; SI-NEXT:    v_div_fixup_f32 v2, v2, v1, v0
43; SI-NEXT:    v_trunc_f32_e32 v2, v2
44; SI-NEXT:    v_fma_f32 v0, -v2, v1, v0
45; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
46; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
47; SI-NEXT:    buffer_store_short v0, off, s[8:11], 0
48; SI-NEXT:    s_endpgm
49;
50; CI-LABEL: frem_f16:
51; CI:       ; %bb.0:
52; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
53; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
54; CI-NEXT:    s_mov_b32 s11, 0xf000
55; CI-NEXT:    s_mov_b32 s10, -1
56; CI-NEXT:    s_mov_b32 s2, s10
57; CI-NEXT:    s_waitcnt lgkmcnt(0)
58; CI-NEXT:    s_mov_b32 s8, s4
59; CI-NEXT:    s_mov_b32 s9, s5
60; CI-NEXT:    s_mov_b32 s4, s6
61; CI-NEXT:    s_mov_b32 s5, s7
62; CI-NEXT:    s_mov_b32 s6, s10
63; CI-NEXT:    s_mov_b32 s7, s11
64; CI-NEXT:    s_mov_b32 s3, s11
65; CI-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
66; CI-NEXT:    buffer_load_ushort v1, off, s[0:3], 0 offset:8
67; CI-NEXT:    s_waitcnt vmcnt(1)
68; CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
69; CI-NEXT:    s_waitcnt vmcnt(0)
70; CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
71; CI-NEXT:    v_div_scale_f32 v3, s[0:1], v1, v1, v0
72; CI-NEXT:    v_div_scale_f32 v2, vcc, v0, v1, v0
73; CI-NEXT:    v_rcp_f32_e32 v4, v3
74; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
75; CI-NEXT:    v_fma_f32 v5, -v3, v4, 1.0
76; CI-NEXT:    v_fma_f32 v4, v5, v4, v4
77; CI-NEXT:    v_mul_f32_e32 v5, v2, v4
78; CI-NEXT:    v_fma_f32 v6, -v3, v5, v2
79; CI-NEXT:    v_fma_f32 v5, v6, v4, v5
80; CI-NEXT:    v_fma_f32 v2, -v3, v5, v2
81; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
82; CI-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
83; CI-NEXT:    v_div_fixup_f32 v2, v2, v1, v0
84; CI-NEXT:    v_trunc_f32_e32 v2, v2
85; CI-NEXT:    v_fma_f32 v0, -v2, v1, v0
86; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
87; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
88; CI-NEXT:    buffer_store_short v0, off, s[8:11], 0
89; CI-NEXT:    s_endpgm
90;
91; VI-LABEL: frem_f16:
92; VI:       ; %bb.0:
93; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
94; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
95; VI-NEXT:    s_waitcnt lgkmcnt(0)
96; VI-NEXT:    v_mov_b32_e32 v2, s6
97; VI-NEXT:    s_add_u32 s0, s0, 8
98; VI-NEXT:    v_mov_b32_e32 v3, s7
99; VI-NEXT:    s_addc_u32 s1, s1, 0
100; VI-NEXT:    flat_load_ushort v4, v[2:3]
101; VI-NEXT:    v_mov_b32_e32 v3, s1
102; VI-NEXT:    v_mov_b32_e32 v2, s0
103; VI-NEXT:    flat_load_ushort v2, v[2:3]
104; VI-NEXT:    v_mov_b32_e32 v0, s4
105; VI-NEXT:    v_mov_b32_e32 v1, s5
106; VI-NEXT:    s_waitcnt vmcnt(1)
107; VI-NEXT:    v_cvt_f32_f16_e32 v3, v4
108; VI-NEXT:    s_waitcnt vmcnt(0)
109; VI-NEXT:    v_cvt_f32_f16_e32 v5, v2
110; VI-NEXT:    v_rcp_f32_e32 v5, v5
111; VI-NEXT:    v_mul_f32_e32 v3, v3, v5
112; VI-NEXT:    v_cvt_f16_f32_e32 v3, v3
113; VI-NEXT:    v_div_fixup_f16 v3, v3, v2, v4
114; VI-NEXT:    v_trunc_f16_e32 v3, v3
115; VI-NEXT:    v_fma_f16 v2, -v3, v2, v4
116; VI-NEXT:    flat_store_short v[0:1], v2
117; VI-NEXT:    s_endpgm
118;
119; GFX9-LABEL: frem_f16:
120; GFX9:       ; %bb.0:
121; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
122; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
123; GFX9-NEXT:    v_mov_b32_e32 v0, 0
124; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
125; GFX9-NEXT:    global_load_ushort v1, v0, s[6:7]
126; GFX9-NEXT:    global_load_ushort v2, v0, s[2:3] offset:8
127; GFX9-NEXT:    s_waitcnt vmcnt(1)
128; GFX9-NEXT:    v_cvt_f32_f16_e32 v3, v1
129; GFX9-NEXT:    s_waitcnt vmcnt(0)
130; GFX9-NEXT:    v_cvt_f32_f16_e32 v4, v2
131; GFX9-NEXT:    v_rcp_f32_e32 v4, v4
132; GFX9-NEXT:    v_mul_f32_e32 v3, v3, v4
133; GFX9-NEXT:    v_cvt_f16_f32_e32 v3, v3
134; GFX9-NEXT:    v_div_fixup_f16 v3, v3, v2, v1
135; GFX9-NEXT:    v_trunc_f16_e32 v3, v3
136; GFX9-NEXT:    v_fma_f16 v1, -v3, v2, v1
137; GFX9-NEXT:    global_store_short v0, v1, s[4:5]
138; GFX9-NEXT:    s_endpgm
139;
140; GFX10-LABEL: frem_f16:
141; GFX10:       ; %bb.0:
142; GFX10-NEXT:    s_clause 0x1
143; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
144; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
145; GFX10-NEXT:    v_mov_b32_e32 v0, 0
146; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
147; GFX10-NEXT:    s_clause 0x1
148; GFX10-NEXT:    global_load_ushort v1, v0, s[6:7]
149; GFX10-NEXT:    global_load_ushort v2, v0, s[2:3] offset:8
150; GFX10-NEXT:    s_waitcnt vmcnt(1)
151; GFX10-NEXT:    v_cvt_f32_f16_e32 v3, v1
152; GFX10-NEXT:    s_waitcnt vmcnt(0)
153; GFX10-NEXT:    v_cvt_f32_f16_e32 v4, v2
154; GFX10-NEXT:    v_rcp_f32_e32 v4, v4
155; GFX10-NEXT:    v_mul_f32_e32 v3, v3, v4
156; GFX10-NEXT:    v_cvt_f16_f32_e32 v3, v3
157; GFX10-NEXT:    v_div_fixup_f16 v3, v3, v2, v1
158; GFX10-NEXT:    v_trunc_f16_e32 v3, v3
159; GFX10-NEXT:    v_fma_f16 v1, -v3, v2, v1
160; GFX10-NEXT:    global_store_short v0, v1, s[4:5]
161; GFX10-NEXT:    s_endpgm
162                      half addrspace(1)* %in2) #0 {
  ; The second operand is element 4 of %in2, not element 0.
163   %gep2 = getelementptr half, half addrspace(1)* %in2, i32 4
164   %r0 = load half, half addrspace(1)* %in1, align 4
165   %r1 = load half, half addrspace(1)* %gep2, align 4
  ; Plain frem with no fast-math flags.
166   %r2 = frem half %r0, %r1
167   store half %r2, half addrspace(1)* %out, align 4
168   ret void
169}
170
; fast_frem_f16 - frem on two half values where the instruction carries the
; `fast` flag. The operands are loaded from %in1 and from %in2 advanced by 4
; elements (8 bytes in the checks below); the result is stored to %out.
; Per the autogenerated checks, no v_div_scale / v_div_fixup sequence is
; expected for any prefix. The SI and CI prefixes expect a conversion to f32,
; v_rcp_f32, v_mul_f32, v_trunc_f32, v_fma_f32 and a conversion back to f16.
; The VI, GFX9 and GFX10 prefixes expect v_rcp_f16, v_mul_f16, v_trunc_f16
; and v_fma_f16.
171define amdgpu_kernel void @fast_frem_f16(half addrspace(1)* %out, half addrspace(1)* %in1,
172; SI-LABEL: fast_frem_f16:
173; SI:       ; %bb.0:
174; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
175; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
176; SI-NEXT:    s_mov_b32 s11, 0xf000
177; SI-NEXT:    s_mov_b32 s10, -1
178; SI-NEXT:    s_waitcnt lgkmcnt(0)
179; SI-NEXT:    s_mov_b32 s8, s4
180; SI-NEXT:    s_mov_b32 s9, s5
181; SI-NEXT:    s_mov_b32 s4, s6
182; SI-NEXT:    s_mov_b32 s5, s7
183; SI-NEXT:    s_mov_b32 s6, s10
184; SI-NEXT:    s_mov_b32 s7, s11
185; SI-NEXT:    s_mov_b32 s2, s10
186; SI-NEXT:    s_mov_b32 s3, s11
187; SI-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
188; SI-NEXT:    s_waitcnt vmcnt(0)
189; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
190; SI-NEXT:    buffer_load_ushort v1, off, s[0:3], 0 offset:8
191; SI-NEXT:    s_waitcnt vmcnt(0)
192; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
193; SI-NEXT:    v_rcp_f32_e32 v2, v1
194; SI-NEXT:    v_mul_f32_e32 v2, v0, v2
195; SI-NEXT:    v_trunc_f32_e32 v2, v2
196; SI-NEXT:    v_fma_f32 v0, -v2, v1, v0
197; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
198; SI-NEXT:    buffer_store_short v0, off, s[8:11], 0
199; SI-NEXT:    s_endpgm
200;
201; CI-LABEL: fast_frem_f16:
202; CI:       ; %bb.0:
203; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
204; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
205; CI-NEXT:    s_mov_b32 s11, 0xf000
206; CI-NEXT:    s_mov_b32 s10, -1
207; CI-NEXT:    s_mov_b32 s2, s10
208; CI-NEXT:    s_mov_b32 s3, s11
209; CI-NEXT:    s_waitcnt lgkmcnt(0)
210; CI-NEXT:    buffer_load_ushort v1, off, s[0:3], 0 offset:8
211; CI-NEXT:    s_mov_b32 s8, s4
212; CI-NEXT:    s_mov_b32 s9, s5
213; CI-NEXT:    s_mov_b32 s4, s6
214; CI-NEXT:    s_mov_b32 s5, s7
215; CI-NEXT:    s_mov_b32 s6, s10
216; CI-NEXT:    s_mov_b32 s7, s11
217; CI-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
218; CI-NEXT:    s_waitcnt vmcnt(1)
219; CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
220; CI-NEXT:    v_rcp_f32_e32 v2, v1
221; CI-NEXT:    s_waitcnt vmcnt(0)
222; CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
223; CI-NEXT:    v_mul_f32_e32 v2, v0, v2
224; CI-NEXT:    v_trunc_f32_e32 v2, v2
225; CI-NEXT:    v_fma_f32 v0, -v2, v1, v0
226; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
227; CI-NEXT:    buffer_store_short v0, off, s[8:11], 0
228; CI-NEXT:    s_endpgm
229;
230; VI-LABEL: fast_frem_f16:
231; VI:       ; %bb.0:
232; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
233; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
234; VI-NEXT:    s_waitcnt lgkmcnt(0)
235; VI-NEXT:    v_mov_b32_e32 v2, s6
236; VI-NEXT:    s_add_u32 s0, s0, 8
237; VI-NEXT:    v_mov_b32_e32 v3, s7
238; VI-NEXT:    s_addc_u32 s1, s1, 0
239; VI-NEXT:    flat_load_ushort v4, v[2:3]
240; VI-NEXT:    v_mov_b32_e32 v3, s1
241; VI-NEXT:    v_mov_b32_e32 v2, s0
242; VI-NEXT:    flat_load_ushort v2, v[2:3]
243; VI-NEXT:    v_mov_b32_e32 v0, s4
244; VI-NEXT:    v_mov_b32_e32 v1, s5
245; VI-NEXT:    s_waitcnt vmcnt(0)
246; VI-NEXT:    v_rcp_f16_e32 v3, v2
247; VI-NEXT:    v_mul_f16_e32 v3, v4, v3
248; VI-NEXT:    v_trunc_f16_e32 v3, v3
249; VI-NEXT:    v_fma_f16 v2, -v3, v2, v4
250; VI-NEXT:    flat_store_short v[0:1], v2
251; VI-NEXT:    s_endpgm
252;
253; GFX9-LABEL: fast_frem_f16:
254; GFX9:       ; %bb.0:
255; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
256; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
257; GFX9-NEXT:    v_mov_b32_e32 v0, 0
258; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
259; GFX9-NEXT:    global_load_ushort v1, v0, s[6:7]
260; GFX9-NEXT:    global_load_ushort v2, v0, s[2:3] offset:8
261; GFX9-NEXT:    s_waitcnt vmcnt(0)
262; GFX9-NEXT:    v_rcp_f16_e32 v3, v2
263; GFX9-NEXT:    v_mul_f16_e32 v3, v1, v3
264; GFX9-NEXT:    v_trunc_f16_e32 v3, v3
265; GFX9-NEXT:    v_fma_f16 v1, -v3, v2, v1
266; GFX9-NEXT:    global_store_short v0, v1, s[4:5]
267; GFX9-NEXT:    s_endpgm
268;
269; GFX10-LABEL: fast_frem_f16:
270; GFX10:       ; %bb.0:
271; GFX10-NEXT:    s_clause 0x1
272; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
273; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
274; GFX10-NEXT:    v_mov_b32_e32 v0, 0
275; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
276; GFX10-NEXT:    s_clause 0x1
277; GFX10-NEXT:    global_load_ushort v1, v0, s[6:7]
278; GFX10-NEXT:    global_load_ushort v2, v0, s[2:3] offset:8
279; GFX10-NEXT:    s_waitcnt vmcnt(0)
280; GFX10-NEXT:    v_rcp_f16_e32 v3, v2
281; GFX10-NEXT:    v_mul_f16_e32 v3, v1, v3
282; GFX10-NEXT:    v_trunc_f16_e32 v3, v3
283; GFX10-NEXT:    v_fma_f16 v1, -v3, v2, v1
284; GFX10-NEXT:    global_store_short v0, v1, s[4:5]
285; GFX10-NEXT:    s_endpgm
286                      half addrspace(1)* %in2) #0 {
  ; The second operand is element 4 of %in2, not element 0.
287   %gep2 = getelementptr half, half addrspace(1)* %in2, i32 4
288   %r0 = load half, half addrspace(1)* %in1, align 4
289   %r1 = load half, half addrspace(1)* %gep2, align 4
  ; The `fast` flag on the instruction is what this test exercises.
290   %r2 = frem fast half %r0, %r1
291   store half %r2, half addrspace(1)* %out, align 4
292   ret void
293}
294
; unsafe_frem_f16 - frem on two half values where the instruction carries the
; `afn` flag and the function uses attribute group #1. The operands are loaded
; from %in1 and from %in2 advanced by 4 elements (8 bytes in the checks
; below); the result is stored to %out.
; NOTE(review): attribute group #1 is defined outside the visible part of the
; file; presumably it enables unsafe FP math given the function name - confirm.
; Per the autogenerated checks, no v_div_scale / v_div_fixup sequence is
; expected for any prefix. The SI and CI prefixes expect a conversion to f32,
; v_rcp_f32, v_mul_f32, v_trunc_f32, v_fma_f32 and a conversion back to f16.
; The VI, GFX9 and GFX10 prefixes expect v_rcp_f16, v_mul_f16, v_trunc_f16
; and v_fma_f16.
295define amdgpu_kernel void @unsafe_frem_f16(half addrspace(1)* %out, half addrspace(1)* %in1,
296; SI-LABEL: unsafe_frem_f16:
297; SI:       ; %bb.0:
298; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
299; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
300; SI-NEXT:    s_mov_b32 s11, 0xf000
301; SI-NEXT:    s_mov_b32 s10, -1
302; SI-NEXT:    s_waitcnt lgkmcnt(0)
303; SI-NEXT:    s_mov_b32 s8, s4
304; SI-NEXT:    s_mov_b32 s9, s5
305; SI-NEXT:    s_mov_b32 s4, s6
306; SI-NEXT:    s_mov_b32 s5, s7
307; SI-NEXT:    s_mov_b32 s6, s10
308; SI-NEXT:    s_mov_b32 s7, s11
309; SI-NEXT:    s_mov_b32 s2, s10
310; SI-NEXT:    s_mov_b32 s3, s11
311; SI-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
312; SI-NEXT:    s_waitcnt vmcnt(0)
313; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
314; SI-NEXT:    buffer_load_ushort v1, off, s[0:3], 0 offset:8
315; SI-NEXT:    s_waitcnt vmcnt(0)
316; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
317; SI-NEXT:    v_rcp_f32_e32 v2, v1
318; SI-NEXT:    v_mul_f32_e32 v2, v0, v2
319; SI-NEXT:    v_trunc_f32_e32 v2, v2
320; SI-NEXT:    v_fma_f32 v0, -v2, v1, v0
321; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
322; SI-NEXT:    buffer_store_short v0, off, s[8:11], 0
323; SI-NEXT:    s_endpgm
324;
325; CI-LABEL: unsafe_frem_f16:
326; CI:       ; %bb.0:
327; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
328; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
329; CI-NEXT:    s_mov_b32 s11, 0xf000
330; CI-NEXT:    s_mov_b32 s10, -1
331; CI-NEXT:    s_mov_b32 s2, s10
332; CI-NEXT:    s_mov_b32 s3, s11
333; CI-NEXT:    s_waitcnt lgkmcnt(0)
334; CI-NEXT:    buffer_load_ushort v1, off, s[0:3], 0 offset:8
335; CI-NEXT:    s_mov_b32 s8, s4
336; CI-NEXT:    s_mov_b32 s9, s5
337; CI-NEXT:    s_mov_b32 s4, s6
338; CI-NEXT:    s_mov_b32 s5, s7
339; CI-NEXT:    s_mov_b32 s6, s10
340; CI-NEXT:    s_mov_b32 s7, s11
341; CI-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
342; CI-NEXT:    s_waitcnt vmcnt(1)
343; CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
344; CI-NEXT:    v_rcp_f32_e32 v2, v1
345; CI-NEXT:    s_waitcnt vmcnt(0)
346; CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
347; CI-NEXT:    v_mul_f32_e32 v2, v0, v2
348; CI-NEXT:    v_trunc_f32_e32 v2, v2
349; CI-NEXT:    v_fma_f32 v0, -v2, v1, v0
350; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
351; CI-NEXT:    buffer_store_short v0, off, s[8:11], 0
352; CI-NEXT:    s_endpgm
353;
354; VI-LABEL: unsafe_frem_f16:
355; VI:       ; %bb.0:
356; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
357; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
358; VI-NEXT:    s_waitcnt lgkmcnt(0)
359; VI-NEXT:    v_mov_b32_e32 v2, s6
360; VI-NEXT:    s_add_u32 s0, s0, 8
361; VI-NEXT:    v_mov_b32_e32 v3, s7
362; VI-NEXT:    s_addc_u32 s1, s1, 0
363; VI-NEXT:    flat_load_ushort v4, v[2:3]
364; VI-NEXT:    v_mov_b32_e32 v3, s1
365; VI-NEXT:    v_mov_b32_e32 v2, s0
366; VI-NEXT:    flat_load_ushort v2, v[2:3]
367; VI-NEXT:    v_mov_b32_e32 v0, s4
368; VI-NEXT:    v_mov_b32_e32 v1, s5
369; VI-NEXT:    s_waitcnt vmcnt(0)
370; VI-NEXT:    v_rcp_f16_e32 v3, v2
371; VI-NEXT:    v_mul_f16_e32 v3, v4, v3
372; VI-NEXT:    v_trunc_f16_e32 v3, v3
373; VI-NEXT:    v_fma_f16 v2, -v3, v2, v4
374; VI-NEXT:    flat_store_short v[0:1], v2
375; VI-NEXT:    s_endpgm
376;
377; GFX9-LABEL: unsafe_frem_f16:
378; GFX9:       ; %bb.0:
379; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
380; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
381; GFX9-NEXT:    v_mov_b32_e32 v0, 0
382; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
383; GFX9-NEXT:    global_load_ushort v1, v0, s[6:7]
384; GFX9-NEXT:    global_load_ushort v2, v0, s[2:3] offset:8
385; GFX9-NEXT:    s_waitcnt vmcnt(0)
386; GFX9-NEXT:    v_rcp_f16_e32 v3, v2
387; GFX9-NEXT:    v_mul_f16_e32 v3, v1, v3
388; GFX9-NEXT:    v_trunc_f16_e32 v3, v3
389; GFX9-NEXT:    v_fma_f16 v1, -v3, v2, v1
390; GFX9-NEXT:    global_store_short v0, v1, s[4:5]
391; GFX9-NEXT:    s_endpgm
392;
393; GFX10-LABEL: unsafe_frem_f16:
394; GFX10:       ; %bb.0:
395; GFX10-NEXT:    s_clause 0x1
396; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
397; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
398; GFX10-NEXT:    v_mov_b32_e32 v0, 0
399; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
400; GFX10-NEXT:    s_clause 0x1
401; GFX10-NEXT:    global_load_ushort v1, v0, s[6:7]
402; GFX10-NEXT:    global_load_ushort v2, v0, s[2:3] offset:8
403; GFX10-NEXT:    s_waitcnt vmcnt(0)
404; GFX10-NEXT:    v_rcp_f16_e32 v3, v2
405; GFX10-NEXT:    v_mul_f16_e32 v3, v1, v3
406; GFX10-NEXT:    v_trunc_f16_e32 v3, v3
407; GFX10-NEXT:    v_fma_f16 v1, -v3, v2, v1
408; GFX10-NEXT:    global_store_short v0, v1, s[4:5]
409; GFX10-NEXT:    s_endpgm
410                             half addrspace(1)* %in2) #1 {
  ; The second operand is element 4 of %in2, not element 0.
411   %gep2 = getelementptr half, half addrspace(1)* %in2, i32 4
412   %r0 = load half, half addrspace(1)* %in1, align 4
413   %r1 = load half, half addrspace(1)* %gep2, align 4
  ; Only `afn` is set on the instruction itself.
414   %r2 = frem afn half %r0, %r1
415   store half %r2, half addrspace(1)* %out, align 4
416   ret void
417}
418
; frem_f32 - frem on two float values with no fast-math flags.
; The operands are loaded from %in1 and from %in2 advanced by 4 elements
; (16 bytes in the checks below); the result is stored to %out.
; Per the autogenerated checks, every prefix expects the v_div_scale_f32 /
; v_rcp_f32 / v_fma chain / v_div_fmas_f32 / v_div_fixup_f32 division
; sequence followed by v_trunc_f32 and v_fma_f32. The SI, CI, VI and GFX9
; checks bracket the fma chain with s_setreg_imm32_b32 on HW_REG_MODE, while
; the GFX10 checks use s_denorm_mode and v_fmac_f32 in that position.
419define amdgpu_kernel void @frem_f32(float addrspace(1)* %out, float addrspace(1)* %in1,
420; SI-LABEL: frem_f32:
421; SI:       ; %bb.0:
422; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
423; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
424; SI-NEXT:    s_mov_b32 s11, 0xf000
425; SI-NEXT:    s_mov_b32 s10, -1
426; SI-NEXT:    s_waitcnt lgkmcnt(0)
427; SI-NEXT:    s_mov_b32 s8, s4
428; SI-NEXT:    s_mov_b32 s9, s5
429; SI-NEXT:    s_mov_b32 s4, s6
430; SI-NEXT:    s_mov_b32 s5, s7
431; SI-NEXT:    s_mov_b32 s6, s10
432; SI-NEXT:    s_mov_b32 s7, s11
433; SI-NEXT:    s_mov_b32 s2, s10
434; SI-NEXT:    s_mov_b32 s3, s11
435; SI-NEXT:    buffer_load_dword v0, off, s[4:7], 0
436; SI-NEXT:    buffer_load_dword v1, off, s[0:3], 0 offset:16
437; SI-NEXT:    s_waitcnt vmcnt(0)
438; SI-NEXT:    v_div_scale_f32 v2, vcc, v0, v1, v0
439; SI-NEXT:    v_div_scale_f32 v3, s[0:1], v1, v1, v0
440; SI-NEXT:    v_rcp_f32_e32 v4, v3
441; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
442; SI-NEXT:    v_fma_f32 v5, -v3, v4, 1.0
443; SI-NEXT:    v_fma_f32 v4, v5, v4, v4
444; SI-NEXT:    v_mul_f32_e32 v5, v2, v4
445; SI-NEXT:    v_fma_f32 v6, -v3, v5, v2
446; SI-NEXT:    v_fma_f32 v5, v6, v4, v5
447; SI-NEXT:    v_fma_f32 v2, -v3, v5, v2
448; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
449; SI-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
450; SI-NEXT:    v_div_fixup_f32 v2, v2, v1, v0
451; SI-NEXT:    v_trunc_f32_e32 v2, v2
452; SI-NEXT:    v_fma_f32 v0, -v2, v1, v0
453; SI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
454; SI-NEXT:    s_endpgm
455;
456; CI-LABEL: frem_f32:
457; CI:       ; %bb.0:
458; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
459; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
460; CI-NEXT:    s_mov_b32 s11, 0xf000
461; CI-NEXT:    s_mov_b32 s10, -1
462; CI-NEXT:    s_mov_b32 s2, s10
463; CI-NEXT:    s_waitcnt lgkmcnt(0)
464; CI-NEXT:    s_mov_b32 s8, s4
465; CI-NEXT:    s_mov_b32 s9, s5
466; CI-NEXT:    s_mov_b32 s4, s6
467; CI-NEXT:    s_mov_b32 s5, s7
468; CI-NEXT:    s_mov_b32 s6, s10
469; CI-NEXT:    s_mov_b32 s7, s11
470; CI-NEXT:    s_mov_b32 s3, s11
471; CI-NEXT:    buffer_load_dword v0, off, s[4:7], 0
472; CI-NEXT:    buffer_load_dword v1, off, s[0:3], 0 offset:16
473; CI-NEXT:    s_waitcnt vmcnt(0)
474; CI-NEXT:    v_div_scale_f32 v3, s[0:1], v1, v1, v0
475; CI-NEXT:    v_div_scale_f32 v2, vcc, v0, v1, v0
476; CI-NEXT:    v_rcp_f32_e32 v4, v3
477; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
478; CI-NEXT:    v_fma_f32 v5, -v3, v4, 1.0
479; CI-NEXT:    v_fma_f32 v4, v5, v4, v4
480; CI-NEXT:    v_mul_f32_e32 v5, v2, v4
481; CI-NEXT:    v_fma_f32 v6, -v3, v5, v2
482; CI-NEXT:    v_fma_f32 v5, v6, v4, v5
483; CI-NEXT:    v_fma_f32 v2, -v3, v5, v2
484; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
485; CI-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
486; CI-NEXT:    v_div_fixup_f32 v2, v2, v1, v0
487; CI-NEXT:    v_trunc_f32_e32 v2, v2
488; CI-NEXT:    v_fma_f32 v0, -v2, v1, v0
489; CI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
490; CI-NEXT:    s_endpgm
491;
492; VI-LABEL: frem_f32:
493; VI:       ; %bb.0:
494; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
495; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
496; VI-NEXT:    s_waitcnt lgkmcnt(0)
497; VI-NEXT:    v_mov_b32_e32 v2, s6
498; VI-NEXT:    s_add_u32 s0, s0, 16
499; VI-NEXT:    v_mov_b32_e32 v3, s7
500; VI-NEXT:    s_addc_u32 s1, s1, 0
501; VI-NEXT:    flat_load_dword v4, v[2:3]
502; VI-NEXT:    v_mov_b32_e32 v3, s1
503; VI-NEXT:    v_mov_b32_e32 v2, s0
504; VI-NEXT:    flat_load_dword v2, v[2:3]
505; VI-NEXT:    v_mov_b32_e32 v0, s4
506; VI-NEXT:    v_mov_b32_e32 v1, s5
507; VI-NEXT:    s_waitcnt vmcnt(0)
508; VI-NEXT:    v_div_scale_f32 v5, s[0:1], v2, v2, v4
509; VI-NEXT:    v_div_scale_f32 v3, vcc, v4, v2, v4
510; VI-NEXT:    v_rcp_f32_e32 v6, v5
511; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
512; VI-NEXT:    v_fma_f32 v7, -v5, v6, 1.0
513; VI-NEXT:    v_fma_f32 v6, v7, v6, v6
514; VI-NEXT:    v_mul_f32_e32 v7, v3, v6
515; VI-NEXT:    v_fma_f32 v8, -v5, v7, v3
516; VI-NEXT:    v_fma_f32 v7, v8, v6, v7
517; VI-NEXT:    v_fma_f32 v3, -v5, v7, v3
518; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
519; VI-NEXT:    v_div_fmas_f32 v3, v3, v6, v7
520; VI-NEXT:    v_div_fixup_f32 v3, v3, v2, v4
521; VI-NEXT:    v_trunc_f32_e32 v3, v3
522; VI-NEXT:    v_fma_f32 v2, -v3, v2, v4
523; VI-NEXT:    flat_store_dword v[0:1], v2
524; VI-NEXT:    s_endpgm
525;
526; GFX9-LABEL: frem_f32:
527; GFX9:       ; %bb.0:
528; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
529; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
530; GFX9-NEXT:    v_mov_b32_e32 v0, 0
531; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
532; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
533; GFX9-NEXT:    global_load_dword v2, v0, s[2:3] offset:16
534; GFX9-NEXT:    s_waitcnt vmcnt(0)
535; GFX9-NEXT:    v_div_scale_f32 v4, s[0:1], v2, v2, v1
536; GFX9-NEXT:    v_div_scale_f32 v3, vcc, v1, v2, v1
537; GFX9-NEXT:    v_rcp_f32_e32 v5, v4
538; GFX9-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
539; GFX9-NEXT:    v_fma_f32 v6, -v4, v5, 1.0
540; GFX9-NEXT:    v_fma_f32 v5, v6, v5, v5
541; GFX9-NEXT:    v_mul_f32_e32 v6, v3, v5
542; GFX9-NEXT:    v_fma_f32 v7, -v4, v6, v3
543; GFX9-NEXT:    v_fma_f32 v6, v7, v5, v6
544; GFX9-NEXT:    v_fma_f32 v3, -v4, v6, v3
545; GFX9-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
546; GFX9-NEXT:    v_div_fmas_f32 v3, v3, v5, v6
547; GFX9-NEXT:    v_div_fixup_f32 v3, v3, v2, v1
548; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
549; GFX9-NEXT:    v_fma_f32 v1, -v3, v2, v1
550; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
551; GFX9-NEXT:    s_endpgm
552;
553; GFX10-LABEL: frem_f32:
554; GFX10:       ; %bb.0:
555; GFX10-NEXT:    s_clause 0x1
556; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
557; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
558; GFX10-NEXT:    v_mov_b32_e32 v0, 0
559; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
560; GFX10-NEXT:    s_clause 0x1
561; GFX10-NEXT:    global_load_dword v1, v0, s[6:7]
562; GFX10-NEXT:    global_load_dword v2, v0, s[2:3] offset:16
563; GFX10-NEXT:    s_waitcnt vmcnt(0)
564; GFX10-NEXT:    v_div_scale_f32 v4, s0, v2, v2, v1
565; GFX10-NEXT:    v_div_scale_f32 v3, vcc_lo, v1, v2, v1
566; GFX10-NEXT:    v_rcp_f32_e32 v5, v4
567; GFX10-NEXT:    s_denorm_mode 15
568; GFX10-NEXT:    v_fma_f32 v6, -v4, v5, 1.0
569; GFX10-NEXT:    v_fmac_f32_e32 v5, v6, v5
570; GFX10-NEXT:    v_mul_f32_e32 v6, v3, v5
571; GFX10-NEXT:    v_fma_f32 v7, -v4, v6, v3
572; GFX10-NEXT:    v_fmac_f32_e32 v6, v7, v5
573; GFX10-NEXT:    v_fma_f32 v3, -v4, v6, v3
574; GFX10-NEXT:    s_denorm_mode 12
575; GFX10-NEXT:    v_div_fmas_f32 v3, v3, v5, v6
576; GFX10-NEXT:    v_div_fixup_f32 v3, v3, v2, v1
577; GFX10-NEXT:    v_trunc_f32_e32 v3, v3
578; GFX10-NEXT:    v_fma_f32 v1, -v3, v2, v1
579; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
580; GFX10-NEXT:    s_endpgm
581                      float addrspace(1)* %in2) #0 {
  ; The second operand is element 4 of %in2, not element 0.
582   %gep2 = getelementptr float, float addrspace(1)* %in2, i32 4
583   %r0 = load float, float addrspace(1)* %in1, align 4
584   %r1 = load float, float addrspace(1)* %gep2, align 4
  ; Plain frem with no fast-math flags.
585   %r2 = frem float %r0, %r1
586   store float %r2, float addrspace(1)* %out, align 4
587   ret void
588}
589
; fast_frem_f32 - frem on two float values where the instruction carries the
; `fast` flag. The operands are loaded from %in1 and from %in2 advanced by 4
; elements (16 bytes in the checks below); the result is stored to %out.
; Per the autogenerated checks, every prefix expects only v_rcp_f32,
; v_mul_f32, v_trunc_f32 and v_fma_f32 between the loads and the store, with
; no v_div_scale / v_div_fmas / v_div_fixup sequence.
590define amdgpu_kernel void @fast_frem_f32(float addrspace(1)* %out, float addrspace(1)* %in1,
591; SI-LABEL: fast_frem_f32:
592; SI:       ; %bb.0:
593; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
594; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
595; SI-NEXT:    s_mov_b32 s11, 0xf000
596; SI-NEXT:    s_mov_b32 s10, -1
597; SI-NEXT:    s_waitcnt lgkmcnt(0)
598; SI-NEXT:    s_mov_b32 s8, s4
599; SI-NEXT:    s_mov_b32 s9, s5
600; SI-NEXT:    s_mov_b32 s4, s6
601; SI-NEXT:    s_mov_b32 s5, s7
602; SI-NEXT:    s_mov_b32 s6, s10
603; SI-NEXT:    s_mov_b32 s7, s11
604; SI-NEXT:    s_mov_b32 s2, s10
605; SI-NEXT:    s_mov_b32 s3, s11
606; SI-NEXT:    buffer_load_dword v0, off, s[4:7], 0
607; SI-NEXT:    buffer_load_dword v1, off, s[0:3], 0 offset:16
608; SI-NEXT:    s_waitcnt vmcnt(0)
609; SI-NEXT:    v_rcp_f32_e32 v2, v1
610; SI-NEXT:    v_mul_f32_e32 v2, v0, v2
611; SI-NEXT:    v_trunc_f32_e32 v2, v2
612; SI-NEXT:    v_fma_f32 v0, -v2, v1, v0
613; SI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
614; SI-NEXT:    s_endpgm
615;
616; CI-LABEL: fast_frem_f32:
617; CI:       ; %bb.0:
618; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
619; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
620; CI-NEXT:    s_mov_b32 s11, 0xf000
621; CI-NEXT:    s_mov_b32 s10, -1
622; CI-NEXT:    s_mov_b32 s2, s10
623; CI-NEXT:    s_waitcnt lgkmcnt(0)
624; CI-NEXT:    s_mov_b32 s8, s4
625; CI-NEXT:    s_mov_b32 s9, s5
626; CI-NEXT:    s_mov_b32 s4, s6
627; CI-NEXT:    s_mov_b32 s5, s7
628; CI-NEXT:    s_mov_b32 s6, s10
629; CI-NEXT:    s_mov_b32 s7, s11
630; CI-NEXT:    s_mov_b32 s3, s11
631; CI-NEXT:    buffer_load_dword v0, off, s[4:7], 0
632; CI-NEXT:    buffer_load_dword v1, off, s[0:3], 0 offset:16
633; CI-NEXT:    s_waitcnt vmcnt(0)
634; CI-NEXT:    v_rcp_f32_e32 v2, v1
635; CI-NEXT:    v_mul_f32_e32 v2, v0, v2
636; CI-NEXT:    v_trunc_f32_e32 v2, v2
637; CI-NEXT:    v_fma_f32 v0, -v2, v1, v0
638; CI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
639; CI-NEXT:    s_endpgm
640;
641; VI-LABEL: fast_frem_f32:
642; VI:       ; %bb.0:
643; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
644; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
645; VI-NEXT:    s_waitcnt lgkmcnt(0)
646; VI-NEXT:    v_mov_b32_e32 v2, s6
647; VI-NEXT:    s_add_u32 s0, s0, 16
648; VI-NEXT:    v_mov_b32_e32 v3, s7
649; VI-NEXT:    s_addc_u32 s1, s1, 0
650; VI-NEXT:    flat_load_dword v4, v[2:3]
651; VI-NEXT:    v_mov_b32_e32 v3, s1
652; VI-NEXT:    v_mov_b32_e32 v2, s0
653; VI-NEXT:    flat_load_dword v2, v[2:3]
654; VI-NEXT:    v_mov_b32_e32 v0, s4
655; VI-NEXT:    v_mov_b32_e32 v1, s5
656; VI-NEXT:    s_waitcnt vmcnt(0)
657; VI-NEXT:    v_rcp_f32_e32 v3, v2
658; VI-NEXT:    v_mul_f32_e32 v3, v4, v3
659; VI-NEXT:    v_trunc_f32_e32 v3, v3
660; VI-NEXT:    v_fma_f32 v2, -v3, v2, v4
661; VI-NEXT:    flat_store_dword v[0:1], v2
662; VI-NEXT:    s_endpgm
663;
664; GFX9-LABEL: fast_frem_f32:
665; GFX9:       ; %bb.0:
666; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
667; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
668; GFX9-NEXT:    v_mov_b32_e32 v0, 0
669; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
670; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
671; GFX9-NEXT:    global_load_dword v2, v0, s[2:3] offset:16
672; GFX9-NEXT:    s_waitcnt vmcnt(0)
673; GFX9-NEXT:    v_rcp_f32_e32 v3, v2
674; GFX9-NEXT:    v_mul_f32_e32 v3, v1, v3
675; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
676; GFX9-NEXT:    v_fma_f32 v1, -v3, v2, v1
677; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
678; GFX9-NEXT:    s_endpgm
679;
680; GFX10-LABEL: fast_frem_f32:
681; GFX10:       ; %bb.0:
682; GFX10-NEXT:    s_clause 0x1
683; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
684; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
685; GFX10-NEXT:    v_mov_b32_e32 v0, 0
686; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
687; GFX10-NEXT:    s_clause 0x1
688; GFX10-NEXT:    global_load_dword v1, v0, s[6:7]
689; GFX10-NEXT:    global_load_dword v2, v0, s[2:3] offset:16
690; GFX10-NEXT:    s_waitcnt vmcnt(0)
691; GFX10-NEXT:    v_rcp_f32_e32 v3, v2
692; GFX10-NEXT:    v_mul_f32_e32 v3, v1, v3
693; GFX10-NEXT:    v_trunc_f32_e32 v3, v3
694; GFX10-NEXT:    v_fma_f32 v1, -v3, v2, v1
695; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
696; GFX10-NEXT:    s_endpgm
697                      float addrspace(1)* %in2) #0 {
  ; The second operand is element 4 of %in2, not element 0.
698   %gep2 = getelementptr float, float addrspace(1)* %in2, i32 4
699   %r0 = load float, float addrspace(1)* %in1, align 4
700   %r1 = load float, float addrspace(1)* %gep2, align 4
  ; The `fast` flag on the instruction is what this test exercises.
701   %r2 = frem fast float %r0, %r1
702   store float %r2, float addrspace(1)* %out, align 4
703   ret void
704}
705
; unsafe_frem_f32 - frem on two float values where the instruction carries
; the `afn` flag and the function uses attribute group #1. The operands are
; loaded from %in1 and from %in2 advanced by 4 elements (16 bytes in the
; checks below); the result is stored to %out.
; NOTE(review): attribute group #1 is defined outside the visible part of the
; file; presumably it enables unsafe FP math given the function name - confirm.
; Per the autogenerated checks, every prefix expects only v_rcp_f32,
; v_mul_f32, v_trunc_f32 and v_fma_f32 between the loads and the store, with
; no v_div_scale / v_div_fmas / v_div_fixup sequence.
706define amdgpu_kernel void @unsafe_frem_f32(float addrspace(1)* %out, float addrspace(1)* %in1,
707; SI-LABEL: unsafe_frem_f32:
708; SI:       ; %bb.0:
709; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
710; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
711; SI-NEXT:    s_mov_b32 s11, 0xf000
712; SI-NEXT:    s_mov_b32 s10, -1
713; SI-NEXT:    s_waitcnt lgkmcnt(0)
714; SI-NEXT:    s_mov_b32 s8, s4
715; SI-NEXT:    s_mov_b32 s9, s5
716; SI-NEXT:    s_mov_b32 s4, s6
717; SI-NEXT:    s_mov_b32 s5, s7
718; SI-NEXT:    s_mov_b32 s6, s10
719; SI-NEXT:    s_mov_b32 s7, s11
720; SI-NEXT:    s_mov_b32 s2, s10
721; SI-NEXT:    s_mov_b32 s3, s11
722; SI-NEXT:    buffer_load_dword v0, off, s[4:7], 0
723; SI-NEXT:    buffer_load_dword v1, off, s[0:3], 0 offset:16
724; SI-NEXT:    s_waitcnt vmcnt(0)
725; SI-NEXT:    v_rcp_f32_e32 v2, v1
726; SI-NEXT:    v_mul_f32_e32 v2, v0, v2
727; SI-NEXT:    v_trunc_f32_e32 v2, v2
728; SI-NEXT:    v_fma_f32 v0, -v2, v1, v0
729; SI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
730; SI-NEXT:    s_endpgm
731;
732; CI-LABEL: unsafe_frem_f32:
733; CI:       ; %bb.0:
734; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
735; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
736; CI-NEXT:    s_mov_b32 s11, 0xf000
737; CI-NEXT:    s_mov_b32 s10, -1
738; CI-NEXT:    s_mov_b32 s2, s10
739; CI-NEXT:    s_waitcnt lgkmcnt(0)
740; CI-NEXT:    s_mov_b32 s8, s4
741; CI-NEXT:    s_mov_b32 s9, s5
742; CI-NEXT:    s_mov_b32 s4, s6
743; CI-NEXT:    s_mov_b32 s5, s7
744; CI-NEXT:    s_mov_b32 s6, s10
745; CI-NEXT:    s_mov_b32 s7, s11
746; CI-NEXT:    s_mov_b32 s3, s11
747; CI-NEXT:    buffer_load_dword v0, off, s[4:7], 0
748; CI-NEXT:    buffer_load_dword v1, off, s[0:3], 0 offset:16
749; CI-NEXT:    s_waitcnt vmcnt(0)
750; CI-NEXT:    v_rcp_f32_e32 v2, v1
751; CI-NEXT:    v_mul_f32_e32 v2, v0, v2
752; CI-NEXT:    v_trunc_f32_e32 v2, v2
753; CI-NEXT:    v_fma_f32 v0, -v2, v1, v0
754; CI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
755; CI-NEXT:    s_endpgm
756;
757; VI-LABEL: unsafe_frem_f32:
758; VI:       ; %bb.0:
759; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
760; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
761; VI-NEXT:    s_waitcnt lgkmcnt(0)
762; VI-NEXT:    v_mov_b32_e32 v2, s6
763; VI-NEXT:    s_add_u32 s0, s0, 16
764; VI-NEXT:    v_mov_b32_e32 v3, s7
765; VI-NEXT:    s_addc_u32 s1, s1, 0
766; VI-NEXT:    flat_load_dword v4, v[2:3]
767; VI-NEXT:    v_mov_b32_e32 v3, s1
768; VI-NEXT:    v_mov_b32_e32 v2, s0
769; VI-NEXT:    flat_load_dword v2, v[2:3]
770; VI-NEXT:    v_mov_b32_e32 v0, s4
771; VI-NEXT:    v_mov_b32_e32 v1, s5
772; VI-NEXT:    s_waitcnt vmcnt(0)
773; VI-NEXT:    v_rcp_f32_e32 v3, v2
774; VI-NEXT:    v_mul_f32_e32 v3, v4, v3
775; VI-NEXT:    v_trunc_f32_e32 v3, v3
776; VI-NEXT:    v_fma_f32 v2, -v3, v2, v4
777; VI-NEXT:    flat_store_dword v[0:1], v2
778; VI-NEXT:    s_endpgm
779;
780; GFX9-LABEL: unsafe_frem_f32:
781; GFX9:       ; %bb.0:
782; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
783; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
784; GFX9-NEXT:    v_mov_b32_e32 v0, 0
785; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
786; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
787; GFX9-NEXT:    global_load_dword v2, v0, s[2:3] offset:16
788; GFX9-NEXT:    s_waitcnt vmcnt(0)
789; GFX9-NEXT:    v_rcp_f32_e32 v3, v2
790; GFX9-NEXT:    v_mul_f32_e32 v3, v1, v3
791; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
792; GFX9-NEXT:    v_fma_f32 v1, -v3, v2, v1
793; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
794; GFX9-NEXT:    s_endpgm
795;
796; GFX10-LABEL: unsafe_frem_f32:
797; GFX10:       ; %bb.0:
798; GFX10-NEXT:    s_clause 0x1
799; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
800; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
801; GFX10-NEXT:    v_mov_b32_e32 v0, 0
802; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
803; GFX10-NEXT:    s_clause 0x1
804; GFX10-NEXT:    global_load_dword v1, v0, s[6:7]
805; GFX10-NEXT:    global_load_dword v2, v0, s[2:3] offset:16
806; GFX10-NEXT:    s_waitcnt vmcnt(0)
807; GFX10-NEXT:    v_rcp_f32_e32 v3, v2
808; GFX10-NEXT:    v_mul_f32_e32 v3, v1, v3
809; GFX10-NEXT:    v_trunc_f32_e32 v3, v3
810; GFX10-NEXT:    v_fma_f32 v1, -v3, v2, v1
811; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
812; GFX10-NEXT:    s_endpgm
813                             float addrspace(1)* %in2) #1 {
  ; The second operand is element 4 of %in2, not element 0.
814   %gep2 = getelementptr float, float addrspace(1)* %in2, i32 4
815   %r0 = load float, float addrspace(1)* %in1, align 4
816   %r1 = load float, float addrspace(1)* %gep2, align 4
  ; Only `afn` is set on the instruction itself.
817   %r2 = frem afn float %r0, %r1
818   store float %r2, float addrspace(1)* %out, align 4
819   ret void
820}
821
; frem_f64 - loads one double from %in1 and one from %in2, computes their
; remainder with a plain `frem` (no fast-math flags) and stores it to %out.
;
; The assertion lines sit between the two halves of the `define` signature;
; the parameter list resumes with %in2 after the last assertion group. There
; is one group of expectations per FileCheck prefix used in this file.
;
; What the expectations below pin down:
;  - every group contains a v_div_scale_f64 / v_rcp_f64_e32 / v_fma_f64 /
;    v_div_fmas_f64 / v_div_fixup_f64 sequence, then a truncation step, then a
;    final v_fma_f64 whose first operand is negated;
;  - the CI, VI, GFX9 and GFX10 groups expect v_trunc_f64_e32 for the
;    truncation step, while the SI group expects a v_bfe_u32 / v_lshr_b64 /
;    v_not / v_and / v_cndmask sequence in that position (presumably an integer
;    expansion of the truncation - TODO confirm against the backend);
;  - the SI group additionally expects two v_cmp_eq_u32 and an s_xor_b64 into
;    vcc ahead of v_div_fmas_f64;
;  - the GFX10 group has no `s_nop 1` before v_div_fmas_f64, the others do.
822define amdgpu_kernel void @frem_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
823; SI-LABEL: frem_f64:
824; SI:       ; %bb.0:
825; SI-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x9
826; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
827; SI-NEXT:    s_mov_b32 s7, 0xf000
828; SI-NEXT:    s_mov_b32 s6, -1
829; SI-NEXT:    s_waitcnt lgkmcnt(0)
830; SI-NEXT:    s_mov_b32 s4, s8
831; SI-NEXT:    s_mov_b32 s5, s9
832; SI-NEXT:    s_mov_b32 s8, s10
833; SI-NEXT:    s_mov_b32 s9, s11
834; SI-NEXT:    s_mov_b32 s10, s6
835; SI-NEXT:    s_mov_b32 s11, s7
836; SI-NEXT:    s_mov_b32 s2, s6
837; SI-NEXT:    s_mov_b32 s3, s7
838; SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
839; SI-NEXT:    buffer_load_dwordx2 v[2:3], off, s[0:3], 0
840; SI-NEXT:    s_waitcnt vmcnt(0)
841; SI-NEXT:    v_div_scale_f64 v[4:5], s[0:1], v[2:3], v[2:3], v[0:1]
842; SI-NEXT:    v_rcp_f64_e32 v[6:7], v[4:5]
843; SI-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
844; SI-NEXT:    v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
845; SI-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
846; SI-NEXT:    v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
847; SI-NEXT:    v_div_scale_f64 v[8:9], s[0:1], v[0:1], v[2:3], v[0:1]
848; SI-NEXT:    v_mul_f64 v[10:11], v[8:9], v[6:7]
849; SI-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[10:11], v[8:9]
850; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
851; SI-NEXT:    v_cmp_eq_u32_e64 s[0:1], v1, v9
852; SI-NEXT:    s_xor_b64 vcc, s[0:1], vcc
853; SI-NEXT:    s_nop 1
854; SI-NEXT:    v_div_fmas_f64 v[4:5], v[12:13], v[6:7], v[10:11]
855; SI-NEXT:    v_div_fixup_f64 v[4:5], v[4:5], v[2:3], v[0:1]
856; SI-NEXT:    v_bfe_u32 v6, v5, 20, 11
857; SI-NEXT:    v_add_i32_e32 v8, vcc, 0xfffffc01, v6
858; SI-NEXT:    s_mov_b32 s1, 0xfffff
859; SI-NEXT:    s_mov_b32 s0, s6
860; SI-NEXT:    v_lshr_b64 v[6:7], s[0:1], v8
861; SI-NEXT:    v_not_b32_e32 v6, v6
862; SI-NEXT:    v_and_b32_e32 v6, v4, v6
863; SI-NEXT:    v_not_b32_e32 v7, v7
864; SI-NEXT:    v_and_b32_e32 v7, v5, v7
865; SI-NEXT:    v_and_b32_e32 v9, 0x80000000, v5
866; SI-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v8
867; SI-NEXT:    v_cndmask_b32_e32 v7, v7, v9, vcc
868; SI-NEXT:    v_cmp_lt_i32_e64 s[0:1], 51, v8
869; SI-NEXT:    v_cndmask_b32_e64 v5, v7, v5, s[0:1]
870; SI-NEXT:    v_cndmask_b32_e64 v6, v6, 0, vcc
871; SI-NEXT:    v_cndmask_b32_e64 v4, v6, v4, s[0:1]
872; SI-NEXT:    v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
873; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
874; SI-NEXT:    s_endpgm
875;
876; CI-LABEL: frem_f64:
877; CI:       ; %bb.0:
878; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
879; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
880; CI-NEXT:    s_mov_b32 s11, 0xf000
881; CI-NEXT:    s_mov_b32 s10, -1
882; CI-NEXT:    s_mov_b32 s2, s10
883; CI-NEXT:    s_waitcnt lgkmcnt(0)
884; CI-NEXT:    s_mov_b32 s8, s4
885; CI-NEXT:    s_mov_b32 s9, s5
886; CI-NEXT:    s_mov_b32 s4, s6
887; CI-NEXT:    s_mov_b32 s5, s7
888; CI-NEXT:    s_mov_b32 s6, s10
889; CI-NEXT:    s_mov_b32 s7, s11
890; CI-NEXT:    s_mov_b32 s3, s11
891; CI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
892; CI-NEXT:    buffer_load_dwordx2 v[2:3], off, s[0:3], 0
893; CI-NEXT:    s_waitcnt vmcnt(0)
894; CI-NEXT:    v_div_scale_f64 v[4:5], s[0:1], v[2:3], v[2:3], v[0:1]
895; CI-NEXT:    v_rcp_f64_e32 v[6:7], v[4:5]
896; CI-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
897; CI-NEXT:    v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
898; CI-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
899; CI-NEXT:    v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
900; CI-NEXT:    v_div_scale_f64 v[8:9], vcc, v[0:1], v[2:3], v[0:1]
901; CI-NEXT:    v_mul_f64 v[10:11], v[8:9], v[6:7]
902; CI-NEXT:    v_fma_f64 v[4:5], -v[4:5], v[10:11], v[8:9]
903; CI-NEXT:    s_nop 1
904; CI-NEXT:    v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[10:11]
905; CI-NEXT:    v_div_fixup_f64 v[4:5], v[4:5], v[2:3], v[0:1]
906; CI-NEXT:    v_trunc_f64_e32 v[4:5], v[4:5]
907; CI-NEXT:    v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
908; CI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
909; CI-NEXT:    s_endpgm
910;
911; VI-LABEL: frem_f64:
912; VI:       ; %bb.0:
913; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
914; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
915; VI-NEXT:    s_waitcnt lgkmcnt(0)
916; VI-NEXT:    v_mov_b32_e32 v2, s6
917; VI-NEXT:    v_mov_b32_e32 v3, s7
918; VI-NEXT:    v_mov_b32_e32 v4, s0
919; VI-NEXT:    v_mov_b32_e32 v5, s1
920; VI-NEXT:    flat_load_dwordx2 v[2:3], v[2:3]
921; VI-NEXT:    flat_load_dwordx2 v[4:5], v[4:5]
922; VI-NEXT:    v_mov_b32_e32 v0, s4
923; VI-NEXT:    v_mov_b32_e32 v1, s5
924; VI-NEXT:    s_waitcnt vmcnt(0)
925; VI-NEXT:    v_div_scale_f64 v[6:7], s[0:1], v[4:5], v[4:5], v[2:3]
926; VI-NEXT:    v_rcp_f64_e32 v[8:9], v[6:7]
927; VI-NEXT:    v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
928; VI-NEXT:    v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
929; VI-NEXT:    v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
930; VI-NEXT:    v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
931; VI-NEXT:    v_div_scale_f64 v[10:11], vcc, v[2:3], v[4:5], v[2:3]
932; VI-NEXT:    v_mul_f64 v[12:13], v[10:11], v[8:9]
933; VI-NEXT:    v_fma_f64 v[6:7], -v[6:7], v[12:13], v[10:11]
934; VI-NEXT:    s_nop 1
935; VI-NEXT:    v_div_fmas_f64 v[6:7], v[6:7], v[8:9], v[12:13]
936; VI-NEXT:    v_div_fixup_f64 v[6:7], v[6:7], v[4:5], v[2:3]
937; VI-NEXT:    v_trunc_f64_e32 v[6:7], v[6:7]
938; VI-NEXT:    v_fma_f64 v[2:3], -v[6:7], v[4:5], v[2:3]
939; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
940; VI-NEXT:    s_endpgm
941;
942; GFX9-LABEL: frem_f64:
943; GFX9:       ; %bb.0:
944; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
945; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
946; GFX9-NEXT:    v_mov_b32_e32 v12, 0
947; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
948; GFX9-NEXT:    global_load_dwordx2 v[0:1], v12, s[6:7]
949; GFX9-NEXT:    global_load_dwordx2 v[2:3], v12, s[2:3]
950; GFX9-NEXT:    s_waitcnt vmcnt(0)
951; GFX9-NEXT:    v_div_scale_f64 v[4:5], s[0:1], v[2:3], v[2:3], v[0:1]
952; GFX9-NEXT:    v_rcp_f64_e32 v[6:7], v[4:5]
953; GFX9-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
954; GFX9-NEXT:    v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
955; GFX9-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
956; GFX9-NEXT:    v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
957; GFX9-NEXT:    v_div_scale_f64 v[8:9], vcc, v[0:1], v[2:3], v[0:1]
958; GFX9-NEXT:    v_mul_f64 v[10:11], v[8:9], v[6:7]
959; GFX9-NEXT:    v_fma_f64 v[4:5], -v[4:5], v[10:11], v[8:9]
960; GFX9-NEXT:    s_nop 1
961; GFX9-NEXT:    v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[10:11]
962; GFX9-NEXT:    v_div_fixup_f64 v[4:5], v[4:5], v[2:3], v[0:1]
963; GFX9-NEXT:    v_trunc_f64_e32 v[4:5], v[4:5]
964; GFX9-NEXT:    v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
965; GFX9-NEXT:    global_store_dwordx2 v12, v[0:1], s[4:5]
966; GFX9-NEXT:    s_endpgm
967;
968; GFX10-LABEL: frem_f64:
969; GFX10:       ; %bb.0:
970; GFX10-NEXT:    s_clause 0x1
971; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
972; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
973; GFX10-NEXT:    v_mov_b32_e32 v12, 0
974; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
975; GFX10-NEXT:    s_clause 0x1
976; GFX10-NEXT:    global_load_dwordx2 v[0:1], v12, s[6:7]
977; GFX10-NEXT:    global_load_dwordx2 v[2:3], v12, s[2:3]
978; GFX10-NEXT:    s_waitcnt vmcnt(0)
979; GFX10-NEXT:    v_div_scale_f64 v[4:5], s0, v[2:3], v[2:3], v[0:1]
980; GFX10-NEXT:    v_rcp_f64_e32 v[6:7], v[4:5]
981; GFX10-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
982; GFX10-NEXT:    v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
983; GFX10-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
984; GFX10-NEXT:    v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
985; GFX10-NEXT:    v_div_scale_f64 v[8:9], vcc_lo, v[0:1], v[2:3], v[0:1]
986; GFX10-NEXT:    v_mul_f64 v[10:11], v[8:9], v[6:7]
987; GFX10-NEXT:    v_fma_f64 v[4:5], -v[4:5], v[10:11], v[8:9]
988; GFX10-NEXT:    v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[10:11]
989; GFX10-NEXT:    v_div_fixup_f64 v[4:5], v[4:5], v[2:3], v[0:1]
990; GFX10-NEXT:    v_trunc_f64_e32 v[4:5], v[4:5]
991; GFX10-NEXT:    v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
992; GFX10-NEXT:    global_store_dwordx2 v12, v[0:1], s[4:5]
993; GFX10-NEXT:    s_endpgm
994                      double addrspace(1)* %in2) #0 {
  ; Both operands are read with 8-byte alignment from addrspace(1) pointers.
995   %r0 = load double, double addrspace(1)* %in1, align 8
996   %r1 = load double, double addrspace(1)* %in2, align 8
  ; Operation under test; it carries no fast-math flags.
997   %r2 = frem double %r0, %r1
998   store double %r2, double addrspace(1)* %out, align 8
999   ret void
1000}
1001
; fast_frem_f64 - loads one double from %in1 and one from %in2, computes their
; remainder with `frem fast` and stores it to %out.
;
; The assertion lines sit between the two halves of the `define` signature;
; the parameter list resumes with %in2 after the last assertion group. There
; is one group of expectations per FileCheck prefix used in this file.
;
; What the expectations below pin down:
;  - no group contains v_div_scale_f64, v_div_fmas_f64 or v_div_fixup_f64;
;    each starts the arithmetic with v_rcp_f64_e32 on the value loaded from
;    %in2's buffer, followed by four v_fma_f64, one v_mul_f64 and two more
;    v_fma_f64 before the truncation step;
;  - the CI, VI, GFX9 and GFX10 groups expect v_trunc_f64_e32 for the
;    truncation step, while the SI group expects a v_bfe_u32 / v_lshr_b64 /
;    v_not / v_and / v_cndmask sequence in that position (presumably an integer
;    expansion of the truncation - TODO confirm against the backend);
;  - every group ends with a v_fma_f64 whose first operand is negated,
;    followed by the store and s_endpgm.
1002define amdgpu_kernel void @fast_frem_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
1003; SI-LABEL: fast_frem_f64:
1004; SI:       ; %bb.0:
1005; SI-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x9
1006; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
1007; SI-NEXT:    s_mov_b32 s7, 0xf000
1008; SI-NEXT:    s_mov_b32 s6, -1
1009; SI-NEXT:    s_waitcnt lgkmcnt(0)
1010; SI-NEXT:    s_mov_b32 s4, s8
1011; SI-NEXT:    s_mov_b32 s5, s9
1012; SI-NEXT:    s_mov_b32 s8, s10
1013; SI-NEXT:    s_mov_b32 s9, s11
1014; SI-NEXT:    s_mov_b32 s10, s6
1015; SI-NEXT:    s_mov_b32 s11, s7
1016; SI-NEXT:    s_mov_b32 s2, s6
1017; SI-NEXT:    s_mov_b32 s3, s7
1018; SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
1019; SI-NEXT:    buffer_load_dwordx2 v[2:3], off, s[0:3], 0
1020; SI-NEXT:    s_waitcnt vmcnt(0)
1021; SI-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
1022; SI-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1023; SI-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
1024; SI-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1025; SI-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
1026; SI-NEXT:    v_mul_f64 v[6:7], v[0:1], v[4:5]
1027; SI-NEXT:    v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1]
1028; SI-NEXT:    v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
1029; SI-NEXT:    v_bfe_u32 v6, v5, 20, 11
1030; SI-NEXT:    v_add_i32_e32 v8, vcc, 0xfffffc01, v6
1031; SI-NEXT:    s_mov_b32 s1, 0xfffff
1032; SI-NEXT:    s_mov_b32 s0, s6
1033; SI-NEXT:    v_lshr_b64 v[6:7], s[0:1], v8
1034; SI-NEXT:    v_not_b32_e32 v6, v6
1035; SI-NEXT:    v_and_b32_e32 v6, v4, v6
1036; SI-NEXT:    v_not_b32_e32 v7, v7
1037; SI-NEXT:    v_and_b32_e32 v7, v5, v7
1038; SI-NEXT:    v_and_b32_e32 v9, 0x80000000, v5
1039; SI-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v8
1040; SI-NEXT:    v_cndmask_b32_e32 v7, v7, v9, vcc
1041; SI-NEXT:    v_cmp_lt_i32_e64 s[0:1], 51, v8
1042; SI-NEXT:    v_cndmask_b32_e64 v5, v7, v5, s[0:1]
1043; SI-NEXT:    v_cndmask_b32_e64 v6, v6, 0, vcc
1044; SI-NEXT:    v_cndmask_b32_e64 v4, v6, v4, s[0:1]
1045; SI-NEXT:    v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
1046; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1047; SI-NEXT:    s_endpgm
1048;
1049; CI-LABEL: fast_frem_f64:
1050; CI:       ; %bb.0:
1051; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
1052; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
1053; CI-NEXT:    s_mov_b32 s11, 0xf000
1054; CI-NEXT:    s_mov_b32 s10, -1
1055; CI-NEXT:    s_mov_b32 s2, s10
1056; CI-NEXT:    s_waitcnt lgkmcnt(0)
1057; CI-NEXT:    s_mov_b32 s8, s4
1058; CI-NEXT:    s_mov_b32 s9, s5
1059; CI-NEXT:    s_mov_b32 s4, s6
1060; CI-NEXT:    s_mov_b32 s5, s7
1061; CI-NEXT:    s_mov_b32 s6, s10
1062; CI-NEXT:    s_mov_b32 s7, s11
1063; CI-NEXT:    s_mov_b32 s3, s11
1064; CI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
1065; CI-NEXT:    buffer_load_dwordx2 v[2:3], off, s[0:3], 0
1066; CI-NEXT:    s_waitcnt vmcnt(0)
1067; CI-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
1068; CI-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1069; CI-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
1070; CI-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1071; CI-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
1072; CI-NEXT:    v_mul_f64 v[6:7], v[0:1], v[4:5]
1073; CI-NEXT:    v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1]
1074; CI-NEXT:    v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
1075; CI-NEXT:    v_trunc_f64_e32 v[4:5], v[4:5]
1076; CI-NEXT:    v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
1077; CI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
1078; CI-NEXT:    s_endpgm
1079;
1080; VI-LABEL: fast_frem_f64:
1081; VI:       ; %bb.0:
1082; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1083; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1084; VI-NEXT:    s_waitcnt lgkmcnt(0)
1085; VI-NEXT:    v_mov_b32_e32 v2, s6
1086; VI-NEXT:    v_mov_b32_e32 v3, s7
1087; VI-NEXT:    v_mov_b32_e32 v4, s0
1088; VI-NEXT:    v_mov_b32_e32 v5, s1
1089; VI-NEXT:    flat_load_dwordx2 v[2:3], v[2:3]
1090; VI-NEXT:    flat_load_dwordx2 v[4:5], v[4:5]
1091; VI-NEXT:    v_mov_b32_e32 v0, s4
1092; VI-NEXT:    v_mov_b32_e32 v1, s5
1093; VI-NEXT:    s_waitcnt vmcnt(0)
1094; VI-NEXT:    v_rcp_f64_e32 v[6:7], v[4:5]
1095; VI-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
1096; VI-NEXT:    v_fma_f64 v[6:7], v[8:9], v[6:7], v[6:7]
1097; VI-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
1098; VI-NEXT:    v_fma_f64 v[6:7], v[8:9], v[6:7], v[6:7]
1099; VI-NEXT:    v_mul_f64 v[8:9], v[2:3], v[6:7]
1100; VI-NEXT:    v_fma_f64 v[10:11], -v[4:5], v[8:9], v[2:3]
1101; VI-NEXT:    v_fma_f64 v[6:7], v[10:11], v[6:7], v[8:9]
1102; VI-NEXT:    v_trunc_f64_e32 v[6:7], v[6:7]
1103; VI-NEXT:    v_fma_f64 v[2:3], -v[6:7], v[4:5], v[2:3]
1104; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
1105; VI-NEXT:    s_endpgm
1106;
1107; GFX9-LABEL: fast_frem_f64:
1108; GFX9:       ; %bb.0:
1109; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1110; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
1111; GFX9-NEXT:    v_mov_b32_e32 v10, 0
1112; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1113; GFX9-NEXT:    global_load_dwordx2 v[0:1], v10, s[6:7]
1114; GFX9-NEXT:    global_load_dwordx2 v[2:3], v10, s[2:3]
1115; GFX9-NEXT:    s_waitcnt vmcnt(0)
1116; GFX9-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
1117; GFX9-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1118; GFX9-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
1119; GFX9-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1120; GFX9-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
1121; GFX9-NEXT:    v_mul_f64 v[6:7], v[0:1], v[4:5]
1122; GFX9-NEXT:    v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1]
1123; GFX9-NEXT:    v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
1124; GFX9-NEXT:    v_trunc_f64_e32 v[4:5], v[4:5]
1125; GFX9-NEXT:    v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
1126; GFX9-NEXT:    global_store_dwordx2 v10, v[0:1], s[4:5]
1127; GFX9-NEXT:    s_endpgm
1128;
1129; GFX10-LABEL: fast_frem_f64:
1130; GFX10:       ; %bb.0:
1131; GFX10-NEXT:    s_clause 0x1
1132; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1133; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
1134; GFX10-NEXT:    v_mov_b32_e32 v10, 0
1135; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1136; GFX10-NEXT:    s_clause 0x1
1137; GFX10-NEXT:    global_load_dwordx2 v[0:1], v10, s[6:7]
1138; GFX10-NEXT:    global_load_dwordx2 v[2:3], v10, s[2:3]
1139; GFX10-NEXT:    s_waitcnt vmcnt(0)
1140; GFX10-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
1141; GFX10-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1142; GFX10-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
1143; GFX10-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1144; GFX10-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
1145; GFX10-NEXT:    v_mul_f64 v[6:7], v[0:1], v[4:5]
1146; GFX10-NEXT:    v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1]
1147; GFX10-NEXT:    v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
1148; GFX10-NEXT:    v_trunc_f64_e32 v[4:5], v[4:5]
1149; GFX10-NEXT:    v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
1150; GFX10-NEXT:    global_store_dwordx2 v10, v[0:1], s[4:5]
1151; GFX10-NEXT:    s_endpgm
1152                      double addrspace(1)* %in2) #0 {
  ; Both operands are read with 8-byte alignment from addrspace(1) pointers.
1153   %r0 = load double, double addrspace(1)* %in1, align 8
1154   %r1 = load double, double addrspace(1)* %in2, align 8
  ; Operation under test; the `fast` flag is set on the instruction itself.
1155   %r2 = frem fast double %r0, %r1
1156   store double %r2, double addrspace(1)* %out, align 8
1157   ret void
1158}
1159
; unsafe_frem_f64 - loads one double from %in1 and one from %in2, computes
; their remainder with `frem afn` and stores it to %out. The function uses
; attribute group #1; that group's definition is outside this part of the
; file, so its contents are not described here.
;
; The assertion lines sit between the two halves of the `define` signature;
; the parameter list resumes with %in2 after the last assertion group. There
; is one group of expectations per FileCheck prefix used in this file.
;
; What the expectations below pin down:
;  - no group contains v_div_scale_f64, v_div_fmas_f64 or v_div_fixup_f64;
;    each starts the arithmetic with v_rcp_f64_e32 on the value loaded from
;    %in2's buffer, followed by four v_fma_f64, one v_mul_f64 and two more
;    v_fma_f64 before the truncation step;
;  - the CI, VI, GFX9 and GFX10 groups expect v_trunc_f64_e32 for the
;    truncation step, while the SI group expects a v_bfe_u32 / v_lshr_b64 /
;    v_not / v_and / v_cndmask sequence in that position (presumably an integer
;    expansion of the truncation - TODO confirm against the backend);
;  - every group ends with a v_fma_f64 whose first operand is negated,
;    followed by the store and s_endpgm.
1160define amdgpu_kernel void @unsafe_frem_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
1161; SI-LABEL: unsafe_frem_f64:
1162; SI:       ; %bb.0:
1163; SI-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x9
1164; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
1165; SI-NEXT:    s_mov_b32 s7, 0xf000
1166; SI-NEXT:    s_mov_b32 s6, -1
1167; SI-NEXT:    s_waitcnt lgkmcnt(0)
1168; SI-NEXT:    s_mov_b32 s4, s8
1169; SI-NEXT:    s_mov_b32 s5, s9
1170; SI-NEXT:    s_mov_b32 s8, s10
1171; SI-NEXT:    s_mov_b32 s9, s11
1172; SI-NEXT:    s_mov_b32 s10, s6
1173; SI-NEXT:    s_mov_b32 s11, s7
1174; SI-NEXT:    s_mov_b32 s2, s6
1175; SI-NEXT:    s_mov_b32 s3, s7
1176; SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
1177; SI-NEXT:    buffer_load_dwordx2 v[2:3], off, s[0:3], 0
1178; SI-NEXT:    s_waitcnt vmcnt(0)
1179; SI-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
1180; SI-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1181; SI-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
1182; SI-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1183; SI-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
1184; SI-NEXT:    v_mul_f64 v[6:7], v[0:1], v[4:5]
1185; SI-NEXT:    v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1]
1186; SI-NEXT:    v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
1187; SI-NEXT:    v_bfe_u32 v6, v5, 20, 11
1188; SI-NEXT:    v_add_i32_e32 v8, vcc, 0xfffffc01, v6
1189; SI-NEXT:    s_mov_b32 s1, 0xfffff
1190; SI-NEXT:    s_mov_b32 s0, s6
1191; SI-NEXT:    v_lshr_b64 v[6:7], s[0:1], v8
1192; SI-NEXT:    v_not_b32_e32 v6, v6
1193; SI-NEXT:    v_and_b32_e32 v6, v4, v6
1194; SI-NEXT:    v_not_b32_e32 v7, v7
1195; SI-NEXT:    v_and_b32_e32 v7, v5, v7
1196; SI-NEXT:    v_and_b32_e32 v9, 0x80000000, v5
1197; SI-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v8
1198; SI-NEXT:    v_cndmask_b32_e32 v7, v7, v9, vcc
1199; SI-NEXT:    v_cmp_lt_i32_e64 s[0:1], 51, v8
1200; SI-NEXT:    v_cndmask_b32_e64 v5, v7, v5, s[0:1]
1201; SI-NEXT:    v_cndmask_b32_e64 v6, v6, 0, vcc
1202; SI-NEXT:    v_cndmask_b32_e64 v4, v6, v4, s[0:1]
1203; SI-NEXT:    v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
1204; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1205; SI-NEXT:    s_endpgm
1206;
1207; CI-LABEL: unsafe_frem_f64:
1208; CI:       ; %bb.0:
1209; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
1210; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
1211; CI-NEXT:    s_mov_b32 s11, 0xf000
1212; CI-NEXT:    s_mov_b32 s10, -1
1213; CI-NEXT:    s_mov_b32 s2, s10
1214; CI-NEXT:    s_waitcnt lgkmcnt(0)
1215; CI-NEXT:    s_mov_b32 s8, s4
1216; CI-NEXT:    s_mov_b32 s9, s5
1217; CI-NEXT:    s_mov_b32 s4, s6
1218; CI-NEXT:    s_mov_b32 s5, s7
1219; CI-NEXT:    s_mov_b32 s6, s10
1220; CI-NEXT:    s_mov_b32 s7, s11
1221; CI-NEXT:    s_mov_b32 s3, s11
1222; CI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
1223; CI-NEXT:    buffer_load_dwordx2 v[2:3], off, s[0:3], 0
1224; CI-NEXT:    s_waitcnt vmcnt(0)
1225; CI-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
1226; CI-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1227; CI-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
1228; CI-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1229; CI-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
1230; CI-NEXT:    v_mul_f64 v[6:7], v[0:1], v[4:5]
1231; CI-NEXT:    v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1]
1232; CI-NEXT:    v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
1233; CI-NEXT:    v_trunc_f64_e32 v[4:5], v[4:5]
1234; CI-NEXT:    v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
1235; CI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
1236; CI-NEXT:    s_endpgm
1237;
1238; VI-LABEL: unsafe_frem_f64:
1239; VI:       ; %bb.0:
1240; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1241; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1242; VI-NEXT:    s_waitcnt lgkmcnt(0)
1243; VI-NEXT:    v_mov_b32_e32 v2, s6
1244; VI-NEXT:    v_mov_b32_e32 v3, s7
1245; VI-NEXT:    v_mov_b32_e32 v4, s0
1246; VI-NEXT:    v_mov_b32_e32 v5, s1
1247; VI-NEXT:    flat_load_dwordx2 v[2:3], v[2:3]
1248; VI-NEXT:    flat_load_dwordx2 v[4:5], v[4:5]
1249; VI-NEXT:    v_mov_b32_e32 v0, s4
1250; VI-NEXT:    v_mov_b32_e32 v1, s5
1251; VI-NEXT:    s_waitcnt vmcnt(0)
1252; VI-NEXT:    v_rcp_f64_e32 v[6:7], v[4:5]
1253; VI-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
1254; VI-NEXT:    v_fma_f64 v[6:7], v[8:9], v[6:7], v[6:7]
1255; VI-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
1256; VI-NEXT:    v_fma_f64 v[6:7], v[8:9], v[6:7], v[6:7]
1257; VI-NEXT:    v_mul_f64 v[8:9], v[2:3], v[6:7]
1258; VI-NEXT:    v_fma_f64 v[10:11], -v[4:5], v[8:9], v[2:3]
1259; VI-NEXT:    v_fma_f64 v[6:7], v[10:11], v[6:7], v[8:9]
1260; VI-NEXT:    v_trunc_f64_e32 v[6:7], v[6:7]
1261; VI-NEXT:    v_fma_f64 v[2:3], -v[6:7], v[4:5], v[2:3]
1262; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
1263; VI-NEXT:    s_endpgm
1264;
1265; GFX9-LABEL: unsafe_frem_f64:
1266; GFX9:       ; %bb.0:
1267; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1268; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
1269; GFX9-NEXT:    v_mov_b32_e32 v10, 0
1270; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1271; GFX9-NEXT:    global_load_dwordx2 v[0:1], v10, s[6:7]
1272; GFX9-NEXT:    global_load_dwordx2 v[2:3], v10, s[2:3]
1273; GFX9-NEXT:    s_waitcnt vmcnt(0)
1274; GFX9-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
1275; GFX9-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1276; GFX9-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
1277; GFX9-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1278; GFX9-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
1279; GFX9-NEXT:    v_mul_f64 v[6:7], v[0:1], v[4:5]
1280; GFX9-NEXT:    v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1]
1281; GFX9-NEXT:    v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
1282; GFX9-NEXT:    v_trunc_f64_e32 v[4:5], v[4:5]
1283; GFX9-NEXT:    v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
1284; GFX9-NEXT:    global_store_dwordx2 v10, v[0:1], s[4:5]
1285; GFX9-NEXT:    s_endpgm
1286;
1287; GFX10-LABEL: unsafe_frem_f64:
1288; GFX10:       ; %bb.0:
1289; GFX10-NEXT:    s_clause 0x1
1290; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1291; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
1292; GFX10-NEXT:    v_mov_b32_e32 v10, 0
1293; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1294; GFX10-NEXT:    s_clause 0x1
1295; GFX10-NEXT:    global_load_dwordx2 v[0:1], v10, s[6:7]
1296; GFX10-NEXT:    global_load_dwordx2 v[2:3], v10, s[2:3]
1297; GFX10-NEXT:    s_waitcnt vmcnt(0)
1298; GFX10-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
1299; GFX10-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1300; GFX10-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
1301; GFX10-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1302; GFX10-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
1303; GFX10-NEXT:    v_mul_f64 v[6:7], v[0:1], v[4:5]
1304; GFX10-NEXT:    v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1]
1305; GFX10-NEXT:    v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
1306; GFX10-NEXT:    v_trunc_f64_e32 v[4:5], v[4:5]
1307; GFX10-NEXT:    v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
1308; GFX10-NEXT:    global_store_dwordx2 v10, v[0:1], s[4:5]
1309; GFX10-NEXT:    s_endpgm
1310                             double addrspace(1)* %in2) #1 {
  ; Both operands are read with 8-byte alignment from addrspace(1) pointers.
1311   %r0 = load double, double addrspace(1)* %in1, align 8
1312   %r1 = load double, double addrspace(1)* %in2, align 8
  ; Operation under test; only the `afn` flag is set on the instruction.
1313   %r2 = frem afn double %r0, %r1
1314   store double %r2, double addrspace(1)* %out, align 8
1315   ret void
1316}
1317
1318define amdgpu_kernel void @frem_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in1,
1319; SI-LABEL: frem_v2f16:
1320; SI:       ; %bb.0:
1321; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
1322; SI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xd
1323; SI-NEXT:    s_mov_b32 s3, 0xf000
1324; SI-NEXT:    s_mov_b32 s2, -1
1325; SI-NEXT:    s_waitcnt lgkmcnt(0)
1326; SI-NEXT:    s_mov_b32 s0, s4
1327; SI-NEXT:    s_mov_b32 s1, s5
1328; SI-NEXT:    s_mov_b32 s4, s6
1329; SI-NEXT:    s_mov_b32 s5, s7
1330; SI-NEXT:    s_mov_b32 s6, s2
1331; SI-NEXT:    s_mov_b32 s7, s3
1332; SI-NEXT:    s_mov_b32 s10, s2
1333; SI-NEXT:    s_mov_b32 s11, s3
1334; SI-NEXT:    buffer_load_dword v0, off, s[4:7], 0
1335; SI-NEXT:    s_waitcnt vmcnt(0)
1336; SI-NEXT:    v_cvt_f32_f16_e32 v1, v0
1337; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
1338; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
1339; SI-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:16
1340; SI-NEXT:    s_waitcnt vmcnt(0)
1341; SI-NEXT:    v_cvt_f32_f16_e32 v3, v2
1342; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
1343; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
1344; SI-NEXT:    v_div_scale_f32 v4, vcc, v0, v2, v0
1345; SI-NEXT:    v_div_scale_f32 v5, s[4:5], v2, v2, v0
1346; SI-NEXT:    v_rcp_f32_e32 v6, v5
1347; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
1348; SI-NEXT:    v_fma_f32 v7, -v5, v6, 1.0
1349; SI-NEXT:    v_fma_f32 v6, v7, v6, v6
1350; SI-NEXT:    v_mul_f32_e32 v7, v4, v6
1351; SI-NEXT:    v_fma_f32 v8, -v5, v7, v4
1352; SI-NEXT:    v_fma_f32 v7, v8, v6, v7
1353; SI-NEXT:    v_fma_f32 v4, -v5, v7, v4
1354; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
1355; SI-NEXT:    v_div_fmas_f32 v4, v4, v6, v7
1356; SI-NEXT:    v_div_fixup_f32 v4, v4, v2, v0
1357; SI-NEXT:    v_trunc_f32_e32 v4, v4
1358; SI-NEXT:    v_fma_f32 v0, -v4, v2, v0
1359; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
1360; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
1361; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
1362; SI-NEXT:    v_div_scale_f32 v2, vcc, v1, v3, v1
1363; SI-NEXT:    v_div_scale_f32 v4, s[4:5], v3, v3, v1
1364; SI-NEXT:    v_rcp_f32_e32 v5, v4
1365; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
1366; SI-NEXT:    v_fma_f32 v6, -v4, v5, 1.0
1367; SI-NEXT:    v_fma_f32 v5, v6, v5, v5
1368; SI-NEXT:    v_mul_f32_e32 v6, v2, v5
1369; SI-NEXT:    v_fma_f32 v7, -v4, v6, v2
1370; SI-NEXT:    v_fma_f32 v6, v7, v5, v6
1371; SI-NEXT:    v_fma_f32 v2, -v4, v6, v2
1372; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
1373; SI-NEXT:    v_div_fmas_f32 v2, v2, v5, v6
1374; SI-NEXT:    v_div_fixup_f32 v2, v2, v3, v1
1375; SI-NEXT:    v_trunc_f32_e32 v2, v2
1376; SI-NEXT:    v_fma_f32 v1, -v2, v3, v1
1377; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
1378; SI-NEXT:    v_or_b32_e32 v0, v1, v0
1379; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1380; SI-NEXT:    s_endpgm
1381;
1382; CI-LABEL: frem_v2f16:
1383; CI:       ; %bb.0:
1384; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
1385; CI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xd
1386; CI-NEXT:    s_mov_b32 s3, 0xf000
1387; CI-NEXT:    s_mov_b32 s2, -1
1388; CI-NEXT:    s_mov_b32 s10, s2
1389; CI-NEXT:    s_waitcnt lgkmcnt(0)
1390; CI-NEXT:    s_mov_b32 s0, s4
1391; CI-NEXT:    s_mov_b32 s1, s5
1392; CI-NEXT:    s_mov_b32 s4, s6
1393; CI-NEXT:    s_mov_b32 s5, s7
1394; CI-NEXT:    s_mov_b32 s6, s2
1395; CI-NEXT:    s_mov_b32 s7, s3
1396; CI-NEXT:    s_mov_b32 s11, s3
1397; CI-NEXT:    buffer_load_dword v0, off, s[4:7], 0
1398; CI-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:16
1399; CI-NEXT:    s_waitcnt vmcnt(1)
1400; CI-NEXT:    v_cvt_f32_f16_e32 v1, v0
1401; CI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
1402; CI-NEXT:    s_waitcnt vmcnt(0)
1403; CI-NEXT:    v_cvt_f32_f16_e32 v3, v2
1404; CI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
1405; CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
1406; CI-NEXT:    v_cvt_f32_f16_e32 v2, v2
1407; CI-NEXT:    v_div_scale_f32 v5, s[4:5], v2, v2, v0
1408; CI-NEXT:    v_div_scale_f32 v4, vcc, v0, v2, v0
1409; CI-NEXT:    v_rcp_f32_e32 v6, v5
1410; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
1411; CI-NEXT:    v_fma_f32 v7, -v5, v6, 1.0
1412; CI-NEXT:    v_fma_f32 v6, v7, v6, v6
1413; CI-NEXT:    v_mul_f32_e32 v7, v4, v6
1414; CI-NEXT:    v_fma_f32 v8, -v5, v7, v4
1415; CI-NEXT:    v_fma_f32 v7, v8, v6, v7
1416; CI-NEXT:    v_fma_f32 v4, -v5, v7, v4
1417; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
1418; CI-NEXT:    v_div_fmas_f32 v4, v4, v6, v7
1419; CI-NEXT:    v_div_fixup_f32 v4, v4, v2, v0
1420; CI-NEXT:    v_trunc_f32_e32 v4, v4
1421; CI-NEXT:    v_fma_f32 v0, -v4, v2, v0
1422; CI-NEXT:    v_div_scale_f32 v4, s[4:5], v3, v3, v1
1423; CI-NEXT:    v_div_scale_f32 v2, vcc, v1, v3, v1
1424; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
1425; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
1426; CI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
1427; CI-NEXT:    v_rcp_f32_e32 v5, v4
1428; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
1429; CI-NEXT:    v_fma_f32 v6, -v4, v5, 1.0
1430; CI-NEXT:    v_fma_f32 v5, v6, v5, v5
1431; CI-NEXT:    v_mul_f32_e32 v6, v2, v5
1432; CI-NEXT:    v_fma_f32 v7, -v4, v6, v2
1433; CI-NEXT:    v_fma_f32 v6, v7, v5, v6
1434; CI-NEXT:    v_fma_f32 v2, -v4, v6, v2
1435; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
1436; CI-NEXT:    v_div_fmas_f32 v2, v2, v5, v6
1437; CI-NEXT:    v_div_fixup_f32 v2, v2, v3, v1
1438; CI-NEXT:    v_trunc_f32_e32 v2, v2
1439; CI-NEXT:    v_fma_f32 v1, -v2, v3, v1
1440; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
1441; CI-NEXT:    v_or_b32_e32 v0, v1, v0
1442; CI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1443; CI-NEXT:    s_endpgm
1444;
1445; VI-LABEL: frem_v2f16:
1446; VI:       ; %bb.0:
1447; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1448; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1449; VI-NEXT:    s_waitcnt lgkmcnt(0)
1450; VI-NEXT:    v_mov_b32_e32 v2, s6
1451; VI-NEXT:    s_add_u32 s0, s0, 16
1452; VI-NEXT:    v_mov_b32_e32 v3, s7
1453; VI-NEXT:    s_addc_u32 s1, s1, 0
1454; VI-NEXT:    flat_load_dword v4, v[2:3]
1455; VI-NEXT:    v_mov_b32_e32 v3, s1
1456; VI-NEXT:    v_mov_b32_e32 v2, s0
1457; VI-NEXT:    flat_load_dword v2, v[2:3]
1458; VI-NEXT:    v_mov_b32_e32 v0, s4
1459; VI-NEXT:    v_mov_b32_e32 v1, s5
1460; VI-NEXT:    s_waitcnt vmcnt(1)
1461; VI-NEXT:    v_lshrrev_b32_e32 v3, 16, v4
1462; VI-NEXT:    v_cvt_f32_f16_e32 v5, v3
1463; VI-NEXT:    s_waitcnt vmcnt(0)
1464; VI-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
1465; VI-NEXT:    v_cvt_f32_f16_e32 v7, v6
1466; VI-NEXT:    v_rcp_f32_e32 v7, v7
1467; VI-NEXT:    v_mul_f32_e32 v5, v5, v7
1468; VI-NEXT:    v_cvt_f16_f32_e32 v5, v5
1469; VI-NEXT:    v_div_fixup_f16 v5, v5, v6, v3
1470; VI-NEXT:    v_trunc_f16_e32 v5, v5
1471; VI-NEXT:    v_fma_f16 v3, -v5, v6, v3
1472; VI-NEXT:    v_cvt_f32_f16_e32 v6, v2
1473; VI-NEXT:    v_cvt_f32_f16_e32 v5, v4
1474; VI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
1475; VI-NEXT:    v_rcp_f32_e32 v6, v6
1476; VI-NEXT:    v_mul_f32_e32 v5, v5, v6
1477; VI-NEXT:    v_cvt_f16_f32_e32 v5, v5
1478; VI-NEXT:    v_div_fixup_f16 v5, v5, v2, v4
1479; VI-NEXT:    v_trunc_f16_e32 v5, v5
1480; VI-NEXT:    v_fma_f16 v2, -v5, v2, v4
1481; VI-NEXT:    v_or_b32_e32 v2, v2, v3
1482; VI-NEXT:    flat_store_dword v[0:1], v2
1483; VI-NEXT:    s_endpgm
1484;
1485; GFX9-LABEL: frem_v2f16:
1486; GFX9:       ; %bb.0:
1487; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1488; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
1489; GFX9-NEXT:    v_mov_b32_e32 v0, 0
1490; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1491; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
1492; GFX9-NEXT:    global_load_dword v2, v0, s[2:3] offset:16
1493; GFX9-NEXT:    s_waitcnt vmcnt(1)
1494; GFX9-NEXT:    v_cvt_f32_f16_e32 v3, v1
1495; GFX9-NEXT:    s_waitcnt vmcnt(0)
1496; GFX9-NEXT:    v_cvt_f32_f16_e32 v4, v2
1497; GFX9-NEXT:    v_rcp_f32_e32 v4, v4
1498; GFX9-NEXT:    v_mul_f32_e32 v3, v3, v4
1499; GFX9-NEXT:    v_cvt_f16_f32_e32 v3, v3
1500; GFX9-NEXT:    v_div_fixup_f16 v3, v3, v2, v1
1501; GFX9-NEXT:    v_trunc_f16_e32 v3, v3
1502; GFX9-NEXT:    v_fma_f16 v3, -v3, v2, v1
1503; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
1504; GFX9-NEXT:    v_cvt_f32_f16_e32 v5, v2
1505; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
1506; GFX9-NEXT:    v_cvt_f32_f16_e32 v4, v1
1507; GFX9-NEXT:    v_rcp_f32_e32 v5, v5
1508; GFX9-NEXT:    v_mul_f32_e32 v4, v4, v5
1509; GFX9-NEXT:    v_cvt_f16_f32_e32 v4, v4
1510; GFX9-NEXT:    v_div_fixup_f16 v4, v4, v2, v1
1511; GFX9-NEXT:    v_trunc_f16_e32 v4, v4
1512; GFX9-NEXT:    v_fma_f16 v1, -v4, v2, v1
1513; GFX9-NEXT:    v_pack_b32_f16 v1, v3, v1
1514; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
1515; GFX9-NEXT:    s_endpgm
1516;
1517; GFX10-LABEL: frem_v2f16:
1518; GFX10:       ; %bb.0:
1519; GFX10-NEXT:    s_clause 0x1
1520; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1521; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
1522; GFX10-NEXT:    v_mov_b32_e32 v0, 0
1523; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1524; GFX10-NEXT:    s_clause 0x1
1525; GFX10-NEXT:    global_load_dword v1, v0, s[6:7]
1526; GFX10-NEXT:    global_load_dword v2, v0, s[2:3] offset:16
1527; GFX10-NEXT:    s_waitcnt vmcnt(1)
1528; GFX10-NEXT:    v_cvt_f32_f16_e32 v3, v1
1529; GFX10-NEXT:    s_waitcnt vmcnt(0)
1530; GFX10-NEXT:    v_cvt_f32_f16_e32 v4, v2
1531; GFX10-NEXT:    v_rcp_f32_e32 v4, v4
1532; GFX10-NEXT:    v_mul_f32_e32 v3, v3, v4
1533; GFX10-NEXT:    v_cvt_f16_f32_e32 v3, v3
1534; GFX10-NEXT:    v_div_fixup_f16 v3, v3, v2, v1
1535; GFX10-NEXT:    v_trunc_f16_e32 v3, v3
1536; GFX10-NEXT:    v_fma_f16 v3, -v3, v2, v1
1537; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
1538; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
1539; GFX10-NEXT:    v_cvt_f32_f16_e32 v5, v2
1540; GFX10-NEXT:    v_cvt_f32_f16_e32 v4, v1
1541; GFX10-NEXT:    v_rcp_f32_e32 v5, v5
1542; GFX10-NEXT:    v_mul_f32_e32 v4, v4, v5
1543; GFX10-NEXT:    v_cvt_f16_f32_e32 v4, v4
1544; GFX10-NEXT:    v_div_fixup_f16 v4, v4, v2, v1
1545; GFX10-NEXT:    v_trunc_f16_e32 v4, v4
1546; GFX10-NEXT:    v_fma_f16 v1, -v4, v2, v1
1547; GFX10-NEXT:    v_pack_b32_f16 v1, v3, v1
1548; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
1549; GFX10-NEXT:    s_endpgm
1550                        <2 x half> addrspace(1)* %in2) #0 {
1551   %gep2 = getelementptr <2 x half>, <2 x half> addrspace(1)* %in2, i32 4
1552   %r0 = load <2 x half>, <2 x half> addrspace(1)* %in1, align 8
1553   %r1 = load <2 x half>, <2 x half> addrspace(1)* %gep2, align 8
1554   %r2 = frem <2 x half> %r0, %r1
1555   store <2 x half> %r2, <2 x half> addrspace(1)* %out, align 8
1556   ret void
1557}
1558
; Checks code generation for frem on <4 x half>.
; The kernel loads one vector from %in1 and one from %in2 advanced by four
; <4 x half> elements (a 32-byte offset in the expected loads), computes the
; remainder and stores it to %out.
; As recorded in the assertions below, every lane is expected to be expanded
; to x - trunc(x / y) * y. The SI and CI runs widen each half to f32 and use
; the v_div_scale / v_rcp / v_fma / v_div_fmas / v_div_fixup division
; sequence before converting back with v_cvt_f16_f32; the VI, GFX9 and GFX10
; runs use v_rcp_f32 and v_mul_f32 followed by v_div_fixup_f16, v_trunc_f16
; and v_fma_f16.
1559define amdgpu_kernel void @frem_v4f16(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in1,
1560; SI-LABEL: frem_v4f16:
1561; SI:       ; %bb.0:
1562; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
1563; SI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xd
1564; SI-NEXT:    s_mov_b32 s3, 0xf000
1565; SI-NEXT:    s_mov_b32 s2, -1
1566; SI-NEXT:    s_waitcnt lgkmcnt(0)
1567; SI-NEXT:    s_mov_b32 s0, s4
1568; SI-NEXT:    s_mov_b32 s1, s5
1569; SI-NEXT:    s_mov_b32 s4, s6
1570; SI-NEXT:    s_mov_b32 s5, s7
1571; SI-NEXT:    s_mov_b32 s6, s2
1572; SI-NEXT:    s_mov_b32 s7, s3
1573; SI-NEXT:    s_mov_b32 s10, s2
1574; SI-NEXT:    s_mov_b32 s11, s3
1575; SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
1576; SI-NEXT:    s_waitcnt vmcnt(0)
1577; SI-NEXT:    v_cvt_f32_f16_e32 v2, v0
1578; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
1579; SI-NEXT:    v_cvt_f32_f16_e32 v3, v0
1580; SI-NEXT:    v_cvt_f32_f16_e32 v4, v1
1581; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v1
1582; SI-NEXT:    v_cvt_f32_f16_e32 v5, v0
1583; SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0 offset:32
1584; SI-NEXT:    s_waitcnt vmcnt(0)
1585; SI-NEXT:    v_cvt_f32_f16_e32 v6, v0
1586; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
1587; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
1588; SI-NEXT:    v_cvt_f32_f16_e32 v7, v1
1589; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
1590; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
1591; SI-NEXT:    v_div_scale_f32 v8, vcc, v5, v1, v5
1592; SI-NEXT:    v_div_scale_f32 v9, s[4:5], v1, v1, v5
1593; SI-NEXT:    v_rcp_f32_e32 v10, v9
1594; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
1595; SI-NEXT:    v_fma_f32 v11, -v9, v10, 1.0
1596; SI-NEXT:    v_fma_f32 v10, v11, v10, v10
1597; SI-NEXT:    v_mul_f32_e32 v11, v8, v10
1598; SI-NEXT:    v_fma_f32 v12, -v9, v11, v8
1599; SI-NEXT:    v_fma_f32 v11, v12, v10, v11
1600; SI-NEXT:    v_fma_f32 v8, -v9, v11, v8
1601; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
1602; SI-NEXT:    v_div_fmas_f32 v8, v8, v10, v11
1603; SI-NEXT:    v_div_fixup_f32 v8, v8, v1, v5
1604; SI-NEXT:    v_trunc_f32_e32 v8, v8
1605; SI-NEXT:    v_fma_f32 v1, -v8, v1, v5
1606; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
1607; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
1608; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
1609; SI-NEXT:    v_div_scale_f32 v5, vcc, v4, v7, v4
1610; SI-NEXT:    v_div_scale_f32 v8, s[4:5], v7, v7, v4
1611; SI-NEXT:    v_rcp_f32_e32 v9, v8
1612; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
1613; SI-NEXT:    v_fma_f32 v10, -v8, v9, 1.0
1614; SI-NEXT:    v_fma_f32 v9, v10, v9, v9
1615; SI-NEXT:    v_mul_f32_e32 v10, v5, v9
1616; SI-NEXT:    v_fma_f32 v11, -v8, v10, v5
1617; SI-NEXT:    v_fma_f32 v10, v11, v9, v10
1618; SI-NEXT:    v_fma_f32 v5, -v8, v10, v5
1619; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
1620; SI-NEXT:    v_div_fmas_f32 v5, v5, v9, v10
1621; SI-NEXT:    v_div_fixup_f32 v5, v5, v7, v4
1622; SI-NEXT:    v_trunc_f32_e32 v5, v5
1623; SI-NEXT:    v_fma_f32 v4, -v5, v7, v4
1624; SI-NEXT:    v_cvt_f16_f32_e32 v4, v4
1625; SI-NEXT:    v_or_b32_e32 v1, v4, v1
1626; SI-NEXT:    v_div_scale_f32 v4, vcc, v3, v0, v3
1627; SI-NEXT:    v_div_scale_f32 v5, s[4:5], v0, v0, v3
1628; SI-NEXT:    v_rcp_f32_e32 v7, v5
1629; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
1630; SI-NEXT:    v_fma_f32 v8, -v5, v7, 1.0
1631; SI-NEXT:    v_fma_f32 v7, v8, v7, v7
1632; SI-NEXT:    v_mul_f32_e32 v8, v4, v7
1633; SI-NEXT:    v_fma_f32 v9, -v5, v8, v4
1634; SI-NEXT:    v_fma_f32 v8, v9, v7, v8
1635; SI-NEXT:    v_fma_f32 v4, -v5, v8, v4
1636; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
1637; SI-NEXT:    v_div_fmas_f32 v4, v4, v7, v8
1638; SI-NEXT:    v_div_fixup_f32 v4, v4, v0, v3
1639; SI-NEXT:    v_trunc_f32_e32 v4, v4
1640; SI-NEXT:    v_fma_f32 v0, -v4, v0, v3
1641; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
1642; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
1643; SI-NEXT:    v_div_scale_f32 v3, vcc, v2, v6, v2
1644; SI-NEXT:    v_div_scale_f32 v4, s[4:5], v6, v6, v2
1645; SI-NEXT:    v_rcp_f32_e32 v5, v4
1646; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
1647; SI-NEXT:    v_fma_f32 v7, -v4, v5, 1.0
1648; SI-NEXT:    v_fma_f32 v5, v7, v5, v5
1649; SI-NEXT:    v_mul_f32_e32 v7, v3, v5
1650; SI-NEXT:    v_fma_f32 v8, -v4, v7, v3
1651; SI-NEXT:    v_fma_f32 v7, v8, v5, v7
1652; SI-NEXT:    v_fma_f32 v3, -v4, v7, v3
1653; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
1654; SI-NEXT:    v_div_fmas_f32 v3, v3, v5, v7
1655; SI-NEXT:    v_div_fixup_f32 v3, v3, v6, v2
1656; SI-NEXT:    v_trunc_f32_e32 v3, v3
1657; SI-NEXT:    v_fma_f32 v2, -v3, v6, v2
1658; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
1659; SI-NEXT:    v_or_b32_e32 v0, v2, v0
1660; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1661; SI-NEXT:    s_endpgm
1662;
1663; CI-LABEL: frem_v4f16:
1664; CI:       ; %bb.0:
1665; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
1666; CI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xd
1667; CI-NEXT:    s_mov_b32 s3, 0xf000
1668; CI-NEXT:    s_mov_b32 s2, -1
1669; CI-NEXT:    s_mov_b32 s10, s2
1670; CI-NEXT:    s_waitcnt lgkmcnt(0)
1671; CI-NEXT:    s_mov_b32 s0, s4
1672; CI-NEXT:    s_mov_b32 s1, s5
1673; CI-NEXT:    s_mov_b32 s4, s6
1674; CI-NEXT:    s_mov_b32 s5, s7
1675; CI-NEXT:    s_mov_b32 s6, s2
1676; CI-NEXT:    s_mov_b32 s7, s3
1677; CI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
1678; CI-NEXT:    s_mov_b32 s11, s3
1679; CI-NEXT:    s_waitcnt vmcnt(0)
1680; CI-NEXT:    v_cvt_f32_f16_e32 v2, v0
1681; CI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
1682; CI-NEXT:    v_cvt_f32_f16_e32 v3, v0
1683; CI-NEXT:    v_lshrrev_b32_e32 v0, 16, v1
1684; CI-NEXT:    v_cvt_f32_f16_e32 v4, v1
1685; CI-NEXT:    v_cvt_f32_f16_e32 v5, v0
1686; CI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0 offset:32
1687; CI-NEXT:    s_waitcnt vmcnt(0)
1688; CI-NEXT:    v_cvt_f32_f16_e32 v7, v1
1689; CI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
1690; CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
1691; CI-NEXT:    v_cvt_f32_f16_e32 v6, v0
1692; CI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
1693; CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
1694; CI-NEXT:    v_div_scale_f32 v9, s[4:5], v1, v1, v5
1695; CI-NEXT:    v_div_scale_f32 v8, vcc, v5, v1, v5
1696; CI-NEXT:    v_rcp_f32_e32 v10, v9
1697; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
1698; CI-NEXT:    v_fma_f32 v11, -v9, v10, 1.0
1699; CI-NEXT:    v_fma_f32 v10, v11, v10, v10
1700; CI-NEXT:    v_mul_f32_e32 v11, v8, v10
1701; CI-NEXT:    v_fma_f32 v12, -v9, v11, v8
1702; CI-NEXT:    v_fma_f32 v11, v12, v10, v11
1703; CI-NEXT:    v_fma_f32 v8, -v9, v11, v8
1704; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
1705; CI-NEXT:    v_div_fmas_f32 v8, v8, v10, v11
1706; CI-NEXT:    v_div_fixup_f32 v8, v8, v1, v5
1707; CI-NEXT:    v_trunc_f32_e32 v8, v8
1708; CI-NEXT:    v_fma_f32 v1, -v8, v1, v5
1709; CI-NEXT:    v_div_scale_f32 v8, s[4:5], v7, v7, v4
1710; CI-NEXT:    v_div_scale_f32 v5, vcc, v4, v7, v4
1711; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
1712; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
1713; CI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
1714; CI-NEXT:    v_rcp_f32_e32 v9, v8
1715; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
1716; CI-NEXT:    v_fma_f32 v10, -v8, v9, 1.0
1717; CI-NEXT:    v_fma_f32 v9, v10, v9, v9
1718; CI-NEXT:    v_mul_f32_e32 v10, v5, v9
1719; CI-NEXT:    v_fma_f32 v11, -v8, v10, v5
1720; CI-NEXT:    v_fma_f32 v10, v11, v9, v10
1721; CI-NEXT:    v_fma_f32 v5, -v8, v10, v5
1722; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
1723; CI-NEXT:    v_div_fmas_f32 v5, v5, v9, v10
1724; CI-NEXT:    v_div_fixup_f32 v5, v5, v7, v4
1725; CI-NEXT:    v_trunc_f32_e32 v5, v5
1726; CI-NEXT:    v_fma_f32 v4, -v5, v7, v4
1727; CI-NEXT:    v_div_scale_f32 v5, s[4:5], v0, v0, v3
1728; CI-NEXT:    v_cvt_f16_f32_e32 v4, v4
1729; CI-NEXT:    v_or_b32_e32 v1, v4, v1
1730; CI-NEXT:    v_div_scale_f32 v4, vcc, v3, v0, v3
1731; CI-NEXT:    v_rcp_f32_e32 v7, v5
1732; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
1733; CI-NEXT:    v_fma_f32 v8, -v5, v7, 1.0
1734; CI-NEXT:    v_fma_f32 v7, v8, v7, v7
1735; CI-NEXT:    v_mul_f32_e32 v8, v4, v7
1736; CI-NEXT:    v_fma_f32 v9, -v5, v8, v4
1737; CI-NEXT:    v_fma_f32 v8, v9, v7, v8
1738; CI-NEXT:    v_fma_f32 v4, -v5, v8, v4
1739; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
1740; CI-NEXT:    v_div_fmas_f32 v4, v4, v7, v8
1741; CI-NEXT:    v_div_fixup_f32 v4, v4, v0, v3
1742; CI-NEXT:    v_trunc_f32_e32 v4, v4
1743; CI-NEXT:    v_fma_f32 v0, -v4, v0, v3
1744; CI-NEXT:    v_div_scale_f32 v4, s[4:5], v6, v6, v2
1745; CI-NEXT:    v_div_scale_f32 v3, vcc, v2, v6, v2
1746; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
1747; CI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
1748; CI-NEXT:    v_rcp_f32_e32 v5, v4
1749; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
1750; CI-NEXT:    v_fma_f32 v7, -v4, v5, 1.0
1751; CI-NEXT:    v_fma_f32 v5, v7, v5, v5
1752; CI-NEXT:    v_mul_f32_e32 v7, v3, v5
1753; CI-NEXT:    v_fma_f32 v8, -v4, v7, v3
1754; CI-NEXT:    v_fma_f32 v7, v8, v5, v7
1755; CI-NEXT:    v_fma_f32 v3, -v4, v7, v3
1756; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
1757; CI-NEXT:    v_div_fmas_f32 v3, v3, v5, v7
1758; CI-NEXT:    v_div_fixup_f32 v3, v3, v6, v2
1759; CI-NEXT:    v_trunc_f32_e32 v3, v3
1760; CI-NEXT:    v_fma_f32 v2, -v3, v6, v2
1761; CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
1762; CI-NEXT:    v_or_b32_e32 v0, v2, v0
1763; CI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1764; CI-NEXT:    s_endpgm
1765;
1766; VI-LABEL: frem_v4f16:
1767; VI:       ; %bb.0:
1768; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1769; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1770; VI-NEXT:    s_waitcnt lgkmcnt(0)
1771; VI-NEXT:    v_mov_b32_e32 v2, s6
1772; VI-NEXT:    s_add_u32 s0, s0, 32
1773; VI-NEXT:    s_addc_u32 s1, s1, 0
1774; VI-NEXT:    v_mov_b32_e32 v5, s1
1775; VI-NEXT:    v_mov_b32_e32 v4, s0
1776; VI-NEXT:    flat_load_dwordx2 v[4:5], v[4:5]
1777; VI-NEXT:    v_mov_b32_e32 v3, s7
1778; VI-NEXT:    flat_load_dwordx2 v[2:3], v[2:3]
1779; VI-NEXT:    v_mov_b32_e32 v0, s4
1780; VI-NEXT:    v_mov_b32_e32 v1, s5
1781; VI-NEXT:    s_waitcnt vmcnt(1)
1782; VI-NEXT:    v_lshrrev_b32_e32 v8, 16, v5
1783; VI-NEXT:    v_cvt_f32_f16_e32 v9, v8
1784; VI-NEXT:    s_waitcnt vmcnt(0)
1785; VI-NEXT:    v_lshrrev_b32_e32 v6, 16, v3
1786; VI-NEXT:    v_cvt_f32_f16_e32 v7, v6
1787; VI-NEXT:    v_rcp_f32_e32 v9, v9
1788; VI-NEXT:    v_mul_f32_e32 v7, v7, v9
1789; VI-NEXT:    v_cvt_f16_f32_e32 v7, v7
1790; VI-NEXT:    v_div_fixup_f16 v7, v7, v8, v6
1791; VI-NEXT:    v_trunc_f16_e32 v7, v7
1792; VI-NEXT:    v_fma_f16 v6, -v7, v8, v6
1793; VI-NEXT:    v_cvt_f32_f16_e32 v8, v5
1794; VI-NEXT:    v_cvt_f32_f16_e32 v7, v3
1795; VI-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
1796; VI-NEXT:    v_rcp_f32_e32 v8, v8
1797; VI-NEXT:    v_mul_f32_e32 v7, v7, v8
1798; VI-NEXT:    v_cvt_f16_f32_e32 v7, v7
1799; VI-NEXT:    v_div_fixup_f16 v7, v7, v5, v3
1800; VI-NEXT:    v_trunc_f16_e32 v7, v7
1801; VI-NEXT:    v_fma_f16 v3, -v7, v5, v3
1802; VI-NEXT:    v_lshrrev_b32_e32 v7, 16, v4
1803; VI-NEXT:    v_cvt_f32_f16_e32 v8, v7
1804; VI-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
1805; VI-NEXT:    v_or_b32_e32 v3, v3, v6
1806; VI-NEXT:    v_cvt_f32_f16_e32 v6, v5
1807; VI-NEXT:    v_rcp_f32_e32 v8, v8
1808; VI-NEXT:    v_mul_f32_e32 v6, v6, v8
1809; VI-NEXT:    v_cvt_f16_f32_e32 v6, v6
1810; VI-NEXT:    v_div_fixup_f16 v6, v6, v7, v5
1811; VI-NEXT:    v_trunc_f16_e32 v6, v6
1812; VI-NEXT:    v_fma_f16 v5, -v6, v7, v5
1813; VI-NEXT:    v_cvt_f32_f16_e32 v7, v4
1814; VI-NEXT:    v_cvt_f32_f16_e32 v6, v2
1815; VI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
1816; VI-NEXT:    v_rcp_f32_e32 v7, v7
1817; VI-NEXT:    v_mul_f32_e32 v6, v6, v7
1818; VI-NEXT:    v_cvt_f16_f32_e32 v6, v6
1819; VI-NEXT:    v_div_fixup_f16 v6, v6, v4, v2
1820; VI-NEXT:    v_trunc_f16_e32 v6, v6
1821; VI-NEXT:    v_fma_f16 v2, -v6, v4, v2
1822; VI-NEXT:    v_or_b32_e32 v2, v2, v5
1823; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
1824; VI-NEXT:    s_endpgm
1825;
1826; GFX9-LABEL: frem_v4f16:
1827; GFX9:       ; %bb.0:
1828; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1829; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
1830; GFX9-NEXT:    v_mov_b32_e32 v4, 0
1831; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1832; GFX9-NEXT:    global_load_dwordx2 v[0:1], v4, s[6:7]
1833; GFX9-NEXT:    global_load_dwordx2 v[2:3], v4, s[2:3] offset:32
1834; GFX9-NEXT:    s_waitcnt vmcnt(1)
1835; GFX9-NEXT:    v_cvt_f32_f16_e32 v5, v1
1836; GFX9-NEXT:    s_waitcnt vmcnt(0)
1837; GFX9-NEXT:    v_cvt_f32_f16_e32 v6, v3
1838; GFX9-NEXT:    v_rcp_f32_e32 v6, v6
1839; GFX9-NEXT:    v_mul_f32_e32 v5, v5, v6
1840; GFX9-NEXT:    v_cvt_f16_f32_e32 v5, v5
1841; GFX9-NEXT:    v_div_fixup_f16 v5, v5, v3, v1
1842; GFX9-NEXT:    v_trunc_f16_e32 v5, v5
1843; GFX9-NEXT:    v_fma_f16 v5, -v5, v3, v1
1844; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
1845; GFX9-NEXT:    v_cvt_f32_f16_e32 v7, v3
1846; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
1847; GFX9-NEXT:    v_cvt_f32_f16_e32 v6, v1
1848; GFX9-NEXT:    v_rcp_f32_e32 v7, v7
1849; GFX9-NEXT:    v_mul_f32_e32 v6, v6, v7
1850; GFX9-NEXT:    v_cvt_f16_f32_e32 v6, v6
1851; GFX9-NEXT:    v_div_fixup_f16 v6, v6, v3, v1
1852; GFX9-NEXT:    v_trunc_f16_e32 v6, v6
1853; GFX9-NEXT:    v_fma_f16 v1, -v6, v3, v1
1854; GFX9-NEXT:    v_pack_b32_f16 v1, v5, v1
1855; GFX9-NEXT:    v_cvt_f32_f16_e32 v5, v2
1856; GFX9-NEXT:    v_cvt_f32_f16_e32 v3, v0
1857; GFX9-NEXT:    v_rcp_f32_e32 v5, v5
1858; GFX9-NEXT:    v_mul_f32_e32 v3, v3, v5
1859; GFX9-NEXT:    v_cvt_f16_f32_e32 v3, v3
1860; GFX9-NEXT:    v_div_fixup_f16 v3, v3, v2, v0
1861; GFX9-NEXT:    v_trunc_f16_e32 v3, v3
1862; GFX9-NEXT:    v_fma_f16 v3, -v3, v2, v0
1863; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
1864; GFX9-NEXT:    v_cvt_f32_f16_e32 v6, v2
1865; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
1866; GFX9-NEXT:    v_cvt_f32_f16_e32 v5, v0
1867; GFX9-NEXT:    v_rcp_f32_e32 v6, v6
1868; GFX9-NEXT:    v_mul_f32_e32 v5, v5, v6
1869; GFX9-NEXT:    v_cvt_f16_f32_e32 v5, v5
1870; GFX9-NEXT:    v_div_fixup_f16 v5, v5, v2, v0
1871; GFX9-NEXT:    v_trunc_f16_e32 v5, v5
1872; GFX9-NEXT:    v_fma_f16 v0, -v5, v2, v0
1873; GFX9-NEXT:    v_pack_b32_f16 v0, v3, v0
1874; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[4:5]
1875; GFX9-NEXT:    s_endpgm
1876;
1877; GFX10-LABEL: frem_v4f16:
1878; GFX10:       ; %bb.0:
1879; GFX10-NEXT:    s_clause 0x1
1880; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1881; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
1882; GFX10-NEXT:    v_mov_b32_e32 v4, 0
1883; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1884; GFX10-NEXT:    s_clause 0x1
1885; GFX10-NEXT:    global_load_dwordx2 v[0:1], v4, s[6:7]
1886; GFX10-NEXT:    global_load_dwordx2 v[2:3], v4, s[2:3] offset:32
1887; GFX10-NEXT:    s_waitcnt vmcnt(1)
1888; GFX10-NEXT:    v_cvt_f32_f16_e32 v5, v1
1889; GFX10-NEXT:    s_waitcnt vmcnt(0)
1890; GFX10-NEXT:    v_cvt_f32_f16_e32 v6, v3
1891; GFX10-NEXT:    v_rcp_f32_e32 v6, v6
1892; GFX10-NEXT:    v_mul_f32_e32 v5, v5, v6
1893; GFX10-NEXT:    v_cvt_f16_f32_e32 v5, v5
1894; GFX10-NEXT:    v_div_fixup_f16 v5, v5, v3, v1
1895; GFX10-NEXT:    v_trunc_f16_e32 v5, v5
1896; GFX10-NEXT:    v_fma_f16 v5, -v5, v3, v1
1897; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
1898; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
1899; GFX10-NEXT:    v_cvt_f32_f16_e32 v7, v3
1900; GFX10-NEXT:    v_cvt_f32_f16_e32 v6, v1
1901; GFX10-NEXT:    v_rcp_f32_e32 v7, v7
1902; GFX10-NEXT:    v_mul_f32_e32 v6, v6, v7
1903; GFX10-NEXT:    v_cvt_f16_f32_e32 v6, v6
1904; GFX10-NEXT:    v_div_fixup_f16 v6, v6, v3, v1
1905; GFX10-NEXT:    v_trunc_f16_e32 v6, v6
1906; GFX10-NEXT:    v_fma_f16 v1, -v6, v3, v1
1907; GFX10-NEXT:    v_cvt_f32_f16_e32 v3, v0
1908; GFX10-NEXT:    v_pack_b32_f16 v1, v5, v1
1909; GFX10-NEXT:    v_cvt_f32_f16_e32 v5, v2
1910; GFX10-NEXT:    v_rcp_f32_e32 v5, v5
1911; GFX10-NEXT:    v_mul_f32_e32 v3, v3, v5
1912; GFX10-NEXT:    v_cvt_f16_f32_e32 v3, v3
1913; GFX10-NEXT:    v_div_fixup_f16 v3, v3, v2, v0
1914; GFX10-NEXT:    v_trunc_f16_e32 v3, v3
1915; GFX10-NEXT:    v_fma_f16 v3, -v3, v2, v0
1916; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
1917; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
1918; GFX10-NEXT:    v_cvt_f32_f16_e32 v6, v2
1919; GFX10-NEXT:    v_cvt_f32_f16_e32 v5, v0
1920; GFX10-NEXT:    v_rcp_f32_e32 v6, v6
1921; GFX10-NEXT:    v_mul_f32_e32 v5, v5, v6
1922; GFX10-NEXT:    v_cvt_f16_f32_e32 v5, v5
1923; GFX10-NEXT:    v_div_fixup_f16 v5, v5, v2, v0
1924; GFX10-NEXT:    v_trunc_f16_e32 v5, v5
1925; GFX10-NEXT:    v_fma_f16 v0, -v5, v2, v0
1926; GFX10-NEXT:    v_pack_b32_f16 v0, v3, v0
1927; GFX10-NEXT:    global_store_dwordx2 v4, v[0:1], s[4:5]
1928; GFX10-NEXT:    s_endpgm
1929                        <4 x half> addrspace(1)* %in2) #0 {
  ; The divisor is read four <4 x half> elements past %in2, so the two loads
  ; use different base offsets (offset 0 and offset 32 in the expected output).
1930   %gep2 = getelementptr <4 x half>, <4 x half> addrspace(1)* %in2, i32 4
1931   %r0 = load <4 x half>, <4 x half> addrspace(1)* %in1, align 16
1932   %r1 = load <4 x half>, <4 x half> addrspace(1)* %gep2, align 16
  ; Operation under test: %r0 is the dividend, %r1 the divisor.
1933   %r2 = frem <4 x half> %r0, %r1
1934   store <4 x half> %r2, <4 x half> addrspace(1)* %out, align 16
1935   ret void
1936}
1937
; Checks code generation for frem on <2 x float>.
; The kernel loads one vector from %in1 and one from %in2 advanced by four
; <2 x float> elements (a 32-byte offset in the expected loads), computes the
; remainder and stores it to %out.
; As recorded in the assertions below, every run line expects each lane to be
; expanded to x - trunc(x / y) * y, with the quotient produced by the
; v_div_scale / v_rcp / v_fma / v_div_fmas / v_div_fixup sequence. The fma
; refinement steps are bracketed by s_setreg_imm32_b32 writes to HW_REG_MODE
; on the older targets and by s_denorm_mode in the GFX10 output, which also
; uses v_fmac_f32 for two of the fma steps.
1938define amdgpu_kernel void @frem_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in1,
1939; SI-LABEL: frem_v2f32:
1940; SI:       ; %bb.0:
1941; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
1942; SI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xd
1943; SI-NEXT:    s_mov_b32 s3, 0xf000
1944; SI-NEXT:    s_mov_b32 s2, -1
1945; SI-NEXT:    s_waitcnt lgkmcnt(0)
1946; SI-NEXT:    s_mov_b32 s0, s4
1947; SI-NEXT:    s_mov_b32 s1, s5
1948; SI-NEXT:    s_mov_b32 s4, s6
1949; SI-NEXT:    s_mov_b32 s5, s7
1950; SI-NEXT:    s_mov_b32 s6, s2
1951; SI-NEXT:    s_mov_b32 s7, s3
1952; SI-NEXT:    s_mov_b32 s10, s2
1953; SI-NEXT:    s_mov_b32 s11, s3
1954; SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
1955; SI-NEXT:    buffer_load_dwordx2 v[2:3], off, s[8:11], 0 offset:32
1956; SI-NEXT:    s_waitcnt vmcnt(0)
1957; SI-NEXT:    v_div_scale_f32 v4, vcc, v1, v3, v1
1958; SI-NEXT:    v_div_scale_f32 v5, s[4:5], v3, v3, v1
1959; SI-NEXT:    v_rcp_f32_e32 v6, v5
1960; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
1961; SI-NEXT:    v_fma_f32 v7, -v5, v6, 1.0
1962; SI-NEXT:    v_fma_f32 v6, v7, v6, v6
1963; SI-NEXT:    v_mul_f32_e32 v7, v4, v6
1964; SI-NEXT:    v_fma_f32 v8, -v5, v7, v4
1965; SI-NEXT:    v_fma_f32 v7, v8, v6, v7
1966; SI-NEXT:    v_fma_f32 v4, -v5, v7, v4
1967; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
1968; SI-NEXT:    v_div_fmas_f32 v4, v4, v6, v7
1969; SI-NEXT:    v_div_fixup_f32 v4, v4, v3, v1
1970; SI-NEXT:    v_trunc_f32_e32 v4, v4
1971; SI-NEXT:    v_fma_f32 v1, -v4, v3, v1
1972; SI-NEXT:    v_div_scale_f32 v3, vcc, v0, v2, v0
1973; SI-NEXT:    v_div_scale_f32 v4, s[4:5], v2, v2, v0
1974; SI-NEXT:    v_rcp_f32_e32 v5, v4
1975; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
1976; SI-NEXT:    v_fma_f32 v6, -v4, v5, 1.0
1977; SI-NEXT:    v_fma_f32 v5, v6, v5, v5
1978; SI-NEXT:    v_mul_f32_e32 v6, v3, v5
1979; SI-NEXT:    v_fma_f32 v7, -v4, v6, v3
1980; SI-NEXT:    v_fma_f32 v6, v7, v5, v6
1981; SI-NEXT:    v_fma_f32 v3, -v4, v6, v3
1982; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
1983; SI-NEXT:    v_div_fmas_f32 v3, v3, v5, v6
1984; SI-NEXT:    v_div_fixup_f32 v3, v3, v2, v0
1985; SI-NEXT:    v_trunc_f32_e32 v3, v3
1986; SI-NEXT:    v_fma_f32 v0, -v3, v2, v0
1987; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1988; SI-NEXT:    s_endpgm
1989;
1990; CI-LABEL: frem_v2f32:
1991; CI:       ; %bb.0:
1992; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
1993; CI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xd
1994; CI-NEXT:    s_mov_b32 s3, 0xf000
1995; CI-NEXT:    s_mov_b32 s2, -1
1996; CI-NEXT:    s_mov_b32 s10, s2
1997; CI-NEXT:    s_waitcnt lgkmcnt(0)
1998; CI-NEXT:    s_mov_b32 s0, s4
1999; CI-NEXT:    s_mov_b32 s1, s5
2000; CI-NEXT:    s_mov_b32 s4, s6
2001; CI-NEXT:    s_mov_b32 s5, s7
2002; CI-NEXT:    s_mov_b32 s6, s2
2003; CI-NEXT:    s_mov_b32 s7, s3
2004; CI-NEXT:    s_mov_b32 s11, s3
2005; CI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
2006; CI-NEXT:    buffer_load_dwordx2 v[2:3], off, s[8:11], 0 offset:32
2007; CI-NEXT:    s_waitcnt vmcnt(0)
2008; CI-NEXT:    v_div_scale_f32 v5, s[4:5], v3, v3, v1
2009; CI-NEXT:    v_div_scale_f32 v4, vcc, v1, v3, v1
2010; CI-NEXT:    v_rcp_f32_e32 v6, v5
2011; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
2012; CI-NEXT:    v_fma_f32 v7, -v5, v6, 1.0
2013; CI-NEXT:    v_fma_f32 v6, v7, v6, v6
2014; CI-NEXT:    v_mul_f32_e32 v7, v4, v6
2015; CI-NEXT:    v_fma_f32 v8, -v5, v7, v4
2016; CI-NEXT:    v_fma_f32 v7, v8, v6, v7
2017; CI-NEXT:    v_fma_f32 v4, -v5, v7, v4
2018; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
2019; CI-NEXT:    v_div_fmas_f32 v4, v4, v6, v7
2020; CI-NEXT:    v_div_fixup_f32 v4, v4, v3, v1
2021; CI-NEXT:    v_trunc_f32_e32 v4, v4
2022; CI-NEXT:    v_fma_f32 v1, -v4, v3, v1
2023; CI-NEXT:    v_div_scale_f32 v4, s[4:5], v2, v2, v0
2024; CI-NEXT:    v_div_scale_f32 v3, vcc, v0, v2, v0
2025; CI-NEXT:    v_rcp_f32_e32 v5, v4
2026; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
2027; CI-NEXT:    v_fma_f32 v6, -v4, v5, 1.0
2028; CI-NEXT:    v_fma_f32 v5, v6, v5, v5
2029; CI-NEXT:    v_mul_f32_e32 v6, v3, v5
2030; CI-NEXT:    v_fma_f32 v7, -v4, v6, v3
2031; CI-NEXT:    v_fma_f32 v6, v7, v5, v6
2032; CI-NEXT:    v_fma_f32 v3, -v4, v6, v3
2033; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
2034; CI-NEXT:    v_div_fmas_f32 v3, v3, v5, v6
2035; CI-NEXT:    v_div_fixup_f32 v3, v3, v2, v0
2036; CI-NEXT:    v_trunc_f32_e32 v3, v3
2037; CI-NEXT:    v_fma_f32 v0, -v3, v2, v0
2038; CI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2039; CI-NEXT:    s_endpgm
2040;
2041; VI-LABEL: frem_v2f32:
2042; VI:       ; %bb.0:
2043; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2044; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
2045; VI-NEXT:    s_waitcnt lgkmcnt(0)
2046; VI-NEXT:    v_mov_b32_e32 v2, s6
2047; VI-NEXT:    s_add_u32 s0, s0, 32
2048; VI-NEXT:    s_addc_u32 s1, s1, 0
2049; VI-NEXT:    v_mov_b32_e32 v5, s1
2050; VI-NEXT:    v_mov_b32_e32 v3, s7
2051; VI-NEXT:    v_mov_b32_e32 v4, s0
2052; VI-NEXT:    flat_load_dwordx2 v[2:3], v[2:3]
2053; VI-NEXT:    flat_load_dwordx2 v[4:5], v[4:5]
2054; VI-NEXT:    v_mov_b32_e32 v0, s4
2055; VI-NEXT:    v_mov_b32_e32 v1, s5
2056; VI-NEXT:    s_waitcnt vmcnt(0)
2057; VI-NEXT:    v_div_scale_f32 v7, s[0:1], v5, v5, v3
2058; VI-NEXT:    v_div_scale_f32 v6, vcc, v3, v5, v3
2059; VI-NEXT:    v_rcp_f32_e32 v8, v7
2060; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
2061; VI-NEXT:    v_fma_f32 v9, -v7, v8, 1.0
2062; VI-NEXT:    v_fma_f32 v8, v9, v8, v8
2063; VI-NEXT:    v_mul_f32_e32 v9, v6, v8
2064; VI-NEXT:    v_fma_f32 v10, -v7, v9, v6
2065; VI-NEXT:    v_fma_f32 v9, v10, v8, v9
2066; VI-NEXT:    v_fma_f32 v6, -v7, v9, v6
2067; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
2068; VI-NEXT:    v_div_fmas_f32 v6, v6, v8, v9
2069; VI-NEXT:    v_div_fixup_f32 v6, v6, v5, v3
2070; VI-NEXT:    v_trunc_f32_e32 v6, v6
2071; VI-NEXT:    v_fma_f32 v3, -v6, v5, v3
2072; VI-NEXT:    v_div_scale_f32 v6, s[0:1], v4, v4, v2
2073; VI-NEXT:    v_div_scale_f32 v5, vcc, v2, v4, v2
2074; VI-NEXT:    v_rcp_f32_e32 v7, v6
2075; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
2076; VI-NEXT:    v_fma_f32 v8, -v6, v7, 1.0
2077; VI-NEXT:    v_fma_f32 v7, v8, v7, v7
2078; VI-NEXT:    v_mul_f32_e32 v8, v5, v7
2079; VI-NEXT:    v_fma_f32 v9, -v6, v8, v5
2080; VI-NEXT:    v_fma_f32 v8, v9, v7, v8
2081; VI-NEXT:    v_fma_f32 v5, -v6, v8, v5
2082; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
2083; VI-NEXT:    v_div_fmas_f32 v5, v5, v7, v8
2084; VI-NEXT:    v_div_fixup_f32 v5, v5, v4, v2
2085; VI-NEXT:    v_trunc_f32_e32 v5, v5
2086; VI-NEXT:    v_fma_f32 v2, -v5, v4, v2
2087; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
2088; VI-NEXT:    s_endpgm
2089;
2090; GFX9-LABEL: frem_v2f32:
2091; GFX9:       ; %bb.0:
2092; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2093; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
2094; GFX9-NEXT:    v_mov_b32_e32 v4, 0
2095; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2096; GFX9-NEXT:    global_load_dwordx2 v[0:1], v4, s[6:7]
2097; GFX9-NEXT:    global_load_dwordx2 v[2:3], v4, s[2:3] offset:32
2098; GFX9-NEXT:    s_waitcnt vmcnt(0)
2099; GFX9-NEXT:    v_div_scale_f32 v6, s[0:1], v3, v3, v1
2100; GFX9-NEXT:    v_div_scale_f32 v5, vcc, v1, v3, v1
2101; GFX9-NEXT:    v_rcp_f32_e32 v7, v6
2102; GFX9-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
2103; GFX9-NEXT:    v_fma_f32 v8, -v6, v7, 1.0
2104; GFX9-NEXT:    v_fma_f32 v7, v8, v7, v7
2105; GFX9-NEXT:    v_mul_f32_e32 v8, v5, v7
2106; GFX9-NEXT:    v_fma_f32 v9, -v6, v8, v5
2107; GFX9-NEXT:    v_fma_f32 v8, v9, v7, v8
2108; GFX9-NEXT:    v_fma_f32 v5, -v6, v8, v5
2109; GFX9-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
2110; GFX9-NEXT:    v_div_fmas_f32 v5, v5, v7, v8
2111; GFX9-NEXT:    v_div_fixup_f32 v5, v5, v3, v1
2112; GFX9-NEXT:    v_trunc_f32_e32 v5, v5
2113; GFX9-NEXT:    v_fma_f32 v1, -v5, v3, v1
2114; GFX9-NEXT:    v_div_scale_f32 v5, s[0:1], v2, v2, v0
2115; GFX9-NEXT:    v_div_scale_f32 v3, vcc, v0, v2, v0
2116; GFX9-NEXT:    v_rcp_f32_e32 v6, v5
2117; GFX9-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
2118; GFX9-NEXT:    v_fma_f32 v7, -v5, v6, 1.0
2119; GFX9-NEXT:    v_fma_f32 v6, v7, v6, v6
2120; GFX9-NEXT:    v_mul_f32_e32 v7, v3, v6
2121; GFX9-NEXT:    v_fma_f32 v8, -v5, v7, v3
2122; GFX9-NEXT:    v_fma_f32 v7, v8, v6, v7
2123; GFX9-NEXT:    v_fma_f32 v3, -v5, v7, v3
2124; GFX9-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
2125; GFX9-NEXT:    v_div_fmas_f32 v3, v3, v6, v7
2126; GFX9-NEXT:    v_div_fixup_f32 v3, v3, v2, v0
2127; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
2128; GFX9-NEXT:    v_fma_f32 v0, -v3, v2, v0
2129; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[4:5]
2130; GFX9-NEXT:    s_endpgm
2131;
2132; GFX10-LABEL: frem_v2f32:
2133; GFX10:       ; %bb.0:
2134; GFX10-NEXT:    s_clause 0x1
2135; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2136; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
2137; GFX10-NEXT:    v_mov_b32_e32 v4, 0
2138; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
2139; GFX10-NEXT:    s_clause 0x1
2140; GFX10-NEXT:    global_load_dwordx2 v[0:1], v4, s[6:7]
2141; GFX10-NEXT:    global_load_dwordx2 v[2:3], v4, s[2:3] offset:32
2142; GFX10-NEXT:    s_waitcnt vmcnt(0)
2143; GFX10-NEXT:    v_div_scale_f32 v6, s0, v3, v3, v1
2144; GFX10-NEXT:    v_div_scale_f32 v5, vcc_lo, v1, v3, v1
2145; GFX10-NEXT:    v_rcp_f32_e32 v7, v6
2146; GFX10-NEXT:    s_denorm_mode 15
2147; GFX10-NEXT:    v_fma_f32 v8, -v6, v7, 1.0
2148; GFX10-NEXT:    v_fmac_f32_e32 v7, v8, v7
2149; GFX10-NEXT:    v_mul_f32_e32 v8, v5, v7
2150; GFX10-NEXT:    v_fma_f32 v9, -v6, v8, v5
2151; GFX10-NEXT:    v_fmac_f32_e32 v8, v9, v7
2152; GFX10-NEXT:    v_fma_f32 v5, -v6, v8, v5
2153; GFX10-NEXT:    s_denorm_mode 12
2154; GFX10-NEXT:    v_div_fmas_f32 v5, v5, v7, v8
2155; GFX10-NEXT:    v_div_fixup_f32 v5, v5, v3, v1
2156; GFX10-NEXT:    v_trunc_f32_e32 v5, v5
2157; GFX10-NEXT:    v_fma_f32 v1, -v5, v3, v1
2158; GFX10-NEXT:    v_div_scale_f32 v5, s0, v2, v2, v0
2159; GFX10-NEXT:    v_div_scale_f32 v3, vcc_lo, v0, v2, v0
2160; GFX10-NEXT:    v_rcp_f32_e32 v6, v5
2161; GFX10-NEXT:    s_denorm_mode 15
2162; GFX10-NEXT:    v_fma_f32 v7, -v5, v6, 1.0
2163; GFX10-NEXT:    v_fmac_f32_e32 v6, v7, v6
2164; GFX10-NEXT:    v_mul_f32_e32 v7, v3, v6
2165; GFX10-NEXT:    v_fma_f32 v8, -v5, v7, v3
2166; GFX10-NEXT:    v_fmac_f32_e32 v7, v8, v6
2167; GFX10-NEXT:    v_fma_f32 v3, -v5, v7, v3
2168; GFX10-NEXT:    s_denorm_mode 12
2169; GFX10-NEXT:    v_div_fmas_f32 v3, v3, v6, v7
2170; GFX10-NEXT:    v_div_fixup_f32 v3, v3, v2, v0
2171; GFX10-NEXT:    v_trunc_f32_e32 v3, v3
2172; GFX10-NEXT:    v_fma_f32 v0, -v3, v2, v0
2173; GFX10-NEXT:    global_store_dwordx2 v4, v[0:1], s[4:5]
2174; GFX10-NEXT:    s_endpgm
2175                        <2 x float> addrspace(1)* %in2) #0 {
  ; The divisor is read four <2 x float> elements past %in2, so the two loads
  ; use different base offsets (offset 0 and offset 32 in the expected output).
2176   %gep2 = getelementptr <2 x float>, <2 x float> addrspace(1)* %in2, i32 4
2177   %r0 = load <2 x float>, <2 x float> addrspace(1)* %in1, align 8
2178   %r1 = load <2 x float>, <2 x float> addrspace(1)* %gep2, align 8
  ; Operation under test: %r0 is the dividend, %r1 the divisor.
2179   %r2 = frem <2 x float> %r0, %r1
2180   store <2 x float> %r2, <2 x float> addrspace(1)* %out, align 8
2181   ret void
2182}
2183
; frem on <4 x float>. Loads the dividend vector from %in1 and the divisor
; vector from element index 4 of %in2 (a 64-byte displacement in the checks),
; takes the element-wise remainder and stores it to %out.
; The checks for every target show the frem expanded once per element into a
; division sequence (v_div_scale, v_rcp, v_fma or v_fmac, v_div_fmas,
; v_div_fixup), then v_trunc, then a final v_fma of the form
; x - trunc(x / y) * y. The mode register writes (s_setreg_imm32_b32 or
; s_denorm_mode) that bracket the refinement steps presumably toggle f32
; denormal handling for the duration of the division.
define amdgpu_kernel void @frem_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in1,
; SI-LABEL: frem_v4f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xd
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s0, s4
; SI-NEXT:    s_mov_b32 s1, s5
; SI-NEXT:    s_mov_b32 s4, s6
; SI-NEXT:    s_mov_b32 s5, s7
; SI-NEXT:    s_mov_b32 s6, s2
; SI-NEXT:    s_mov_b32 s7, s3
; SI-NEXT:    s_mov_b32 s10, s2
; SI-NEXT:    s_mov_b32 s11, s3
; SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
; SI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:64
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_div_scale_f32 v8, vcc, v3, v7, v3
; SI-NEXT:    v_div_scale_f32 v9, s[4:5], v7, v7, v3
; SI-NEXT:    v_rcp_f32_e32 v10, v9
; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; SI-NEXT:    v_fma_f32 v11, -v9, v10, 1.0
; SI-NEXT:    v_fma_f32 v10, v11, v10, v10
; SI-NEXT:    v_mul_f32_e32 v11, v8, v10
; SI-NEXT:    v_fma_f32 v12, -v9, v11, v8
; SI-NEXT:    v_fma_f32 v11, v12, v10, v11
; SI-NEXT:    v_fma_f32 v8, -v9, v11, v8
; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; SI-NEXT:    v_div_fmas_f32 v8, v8, v10, v11
; SI-NEXT:    v_div_fixup_f32 v8, v8, v7, v3
; SI-NEXT:    v_trunc_f32_e32 v8, v8
; SI-NEXT:    v_fma_f32 v3, -v8, v7, v3
; SI-NEXT:    v_div_scale_f32 v7, vcc, v2, v6, v2
; SI-NEXT:    v_div_scale_f32 v8, s[4:5], v6, v6, v2
; SI-NEXT:    v_rcp_f32_e32 v9, v8
; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; SI-NEXT:    v_fma_f32 v10, -v8, v9, 1.0
; SI-NEXT:    v_fma_f32 v9, v10, v9, v9
; SI-NEXT:    v_mul_f32_e32 v10, v7, v9
; SI-NEXT:    v_fma_f32 v11, -v8, v10, v7
; SI-NEXT:    v_fma_f32 v10, v11, v9, v10
; SI-NEXT:    v_fma_f32 v7, -v8, v10, v7
; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; SI-NEXT:    v_div_fmas_f32 v7, v7, v9, v10
; SI-NEXT:    v_div_fixup_f32 v7, v7, v6, v2
; SI-NEXT:    v_trunc_f32_e32 v7, v7
; SI-NEXT:    v_fma_f32 v2, -v7, v6, v2
; SI-NEXT:    v_div_scale_f32 v6, vcc, v1, v5, v1
; SI-NEXT:    v_div_scale_f32 v7, s[4:5], v5, v5, v1
; SI-NEXT:    v_rcp_f32_e32 v8, v7
; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; SI-NEXT:    v_fma_f32 v9, -v7, v8, 1.0
; SI-NEXT:    v_fma_f32 v8, v9, v8, v8
; SI-NEXT:    v_mul_f32_e32 v9, v6, v8
; SI-NEXT:    v_fma_f32 v10, -v7, v9, v6
; SI-NEXT:    v_fma_f32 v9, v10, v8, v9
; SI-NEXT:    v_fma_f32 v6, -v7, v9, v6
; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; SI-NEXT:    v_div_fmas_f32 v6, v6, v8, v9
; SI-NEXT:    v_div_fixup_f32 v6, v6, v5, v1
; SI-NEXT:    v_trunc_f32_e32 v6, v6
; SI-NEXT:    v_fma_f32 v1, -v6, v5, v1
; SI-NEXT:    v_div_scale_f32 v5, vcc, v0, v4, v0
; SI-NEXT:    v_div_scale_f32 v6, s[4:5], v4, v4, v0
; SI-NEXT:    v_rcp_f32_e32 v7, v6
; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; SI-NEXT:    v_fma_f32 v8, -v6, v7, 1.0
; SI-NEXT:    v_fma_f32 v7, v8, v7, v7
; SI-NEXT:    v_mul_f32_e32 v8, v5, v7
; SI-NEXT:    v_fma_f32 v9, -v6, v8, v5
; SI-NEXT:    v_fma_f32 v8, v9, v7, v8
; SI-NEXT:    v_fma_f32 v5, -v6, v8, v5
; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; SI-NEXT:    v_div_fmas_f32 v5, v5, v7, v8
; SI-NEXT:    v_div_fixup_f32 v5, v5, v4, v0
; SI-NEXT:    v_trunc_f32_e32 v5, v5
; SI-NEXT:    v_fma_f32 v0, -v5, v4, v0
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; CI-LABEL: frem_v4f32:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; CI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xd
; CI-NEXT:    s_mov_b32 s3, 0xf000
; CI-NEXT:    s_mov_b32 s2, -1
; CI-NEXT:    s_mov_b32 s10, s2
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    s_mov_b32 s0, s4
; CI-NEXT:    s_mov_b32 s1, s5
; CI-NEXT:    s_mov_b32 s4, s6
; CI-NEXT:    s_mov_b32 s5, s7
; CI-NEXT:    s_mov_b32 s6, s2
; CI-NEXT:    s_mov_b32 s7, s3
; CI-NEXT:    s_mov_b32 s11, s3
; CI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
; CI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:64
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    v_div_scale_f32 v9, s[4:5], v7, v7, v3
; CI-NEXT:    v_div_scale_f32 v8, vcc, v3, v7, v3
; CI-NEXT:    v_rcp_f32_e32 v10, v9
; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; CI-NEXT:    v_fma_f32 v11, -v9, v10, 1.0
; CI-NEXT:    v_fma_f32 v10, v11, v10, v10
; CI-NEXT:    v_mul_f32_e32 v11, v8, v10
; CI-NEXT:    v_fma_f32 v12, -v9, v11, v8
; CI-NEXT:    v_fma_f32 v11, v12, v10, v11
; CI-NEXT:    v_fma_f32 v8, -v9, v11, v8
; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; CI-NEXT:    v_div_fmas_f32 v8, v8, v10, v11
; CI-NEXT:    v_div_fixup_f32 v8, v8, v7, v3
; CI-NEXT:    v_trunc_f32_e32 v8, v8
; CI-NEXT:    v_fma_f32 v3, -v8, v7, v3
; CI-NEXT:    v_div_scale_f32 v8, s[4:5], v6, v6, v2
; CI-NEXT:    v_div_scale_f32 v7, vcc, v2, v6, v2
; CI-NEXT:    v_rcp_f32_e32 v9, v8
; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; CI-NEXT:    v_fma_f32 v10, -v8, v9, 1.0
; CI-NEXT:    v_fma_f32 v9, v10, v9, v9
; CI-NEXT:    v_mul_f32_e32 v10, v7, v9
; CI-NEXT:    v_fma_f32 v11, -v8, v10, v7
; CI-NEXT:    v_fma_f32 v10, v11, v9, v10
; CI-NEXT:    v_fma_f32 v7, -v8, v10, v7
; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; CI-NEXT:    v_div_fmas_f32 v7, v7, v9, v10
; CI-NEXT:    v_div_fixup_f32 v7, v7, v6, v2
; CI-NEXT:    v_trunc_f32_e32 v7, v7
; CI-NEXT:    v_fma_f32 v2, -v7, v6, v2
; CI-NEXT:    v_div_scale_f32 v7, s[4:5], v5, v5, v1
; CI-NEXT:    v_div_scale_f32 v6, vcc, v1, v5, v1
; CI-NEXT:    v_rcp_f32_e32 v8, v7
; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; CI-NEXT:    v_fma_f32 v9, -v7, v8, 1.0
; CI-NEXT:    v_fma_f32 v8, v9, v8, v8
; CI-NEXT:    v_mul_f32_e32 v9, v6, v8
; CI-NEXT:    v_fma_f32 v10, -v7, v9, v6
; CI-NEXT:    v_fma_f32 v9, v10, v8, v9
; CI-NEXT:    v_fma_f32 v6, -v7, v9, v6
; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; CI-NEXT:    v_div_fmas_f32 v6, v6, v8, v9
; CI-NEXT:    v_div_fixup_f32 v6, v6, v5, v1
; CI-NEXT:    v_trunc_f32_e32 v6, v6
; CI-NEXT:    v_fma_f32 v1, -v6, v5, v1
; CI-NEXT:    v_div_scale_f32 v6, s[4:5], v4, v4, v0
; CI-NEXT:    v_div_scale_f32 v5, vcc, v0, v4, v0
; CI-NEXT:    v_rcp_f32_e32 v7, v6
; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; CI-NEXT:    v_fma_f32 v8, -v6, v7, 1.0
; CI-NEXT:    v_fma_f32 v7, v8, v7, v7
; CI-NEXT:    v_mul_f32_e32 v8, v5, v7
; CI-NEXT:    v_fma_f32 v9, -v6, v8, v5
; CI-NEXT:    v_fma_f32 v8, v9, v7, v8
; CI-NEXT:    v_fma_f32 v5, -v6, v8, v5
; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; CI-NEXT:    v_div_fmas_f32 v5, v5, v7, v8
; CI-NEXT:    v_div_fixup_f32 v5, v5, v4, v0
; CI-NEXT:    v_trunc_f32_e32 v5, v5
; CI-NEXT:    v_fma_f32 v0, -v5, v4, v0
; CI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; CI-NEXT:    s_endpgm
;
; VI-LABEL: frem_v4f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s6
; VI-NEXT:    s_add_u32 s0, s0, 64
; VI-NEXT:    s_addc_u32 s1, s1, 0
; VI-NEXT:    v_mov_b32_e32 v5, s1
; VI-NEXT:    v_mov_b32_e32 v1, s7
; VI-NEXT:    v_mov_b32_e32 v4, s0
; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
; VI-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
; VI-NEXT:    v_mov_b32_e32 v8, s4
; VI-NEXT:    v_mov_b32_e32 v9, s5
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_div_scale_f32 v11, s[0:1], v7, v7, v3
; VI-NEXT:    v_div_scale_f32 v10, vcc, v3, v7, v3
; VI-NEXT:    v_rcp_f32_e32 v12, v11
; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; VI-NEXT:    v_fma_f32 v13, -v11, v12, 1.0
; VI-NEXT:    v_fma_f32 v12, v13, v12, v12
; VI-NEXT:    v_mul_f32_e32 v13, v10, v12
; VI-NEXT:    v_fma_f32 v14, -v11, v13, v10
; VI-NEXT:    v_fma_f32 v13, v14, v12, v13
; VI-NEXT:    v_fma_f32 v10, -v11, v13, v10
; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; VI-NEXT:    v_div_fmas_f32 v10, v10, v12, v13
; VI-NEXT:    v_div_fixup_f32 v10, v10, v7, v3
; VI-NEXT:    v_trunc_f32_e32 v10, v10
; VI-NEXT:    v_fma_f32 v3, -v10, v7, v3
; VI-NEXT:    v_div_scale_f32 v10, s[0:1], v6, v6, v2
; VI-NEXT:    v_div_scale_f32 v7, vcc, v2, v6, v2
; VI-NEXT:    v_rcp_f32_e32 v11, v10
; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; VI-NEXT:    v_fma_f32 v12, -v10, v11, 1.0
; VI-NEXT:    v_fma_f32 v11, v12, v11, v11
; VI-NEXT:    v_mul_f32_e32 v12, v7, v11
; VI-NEXT:    v_fma_f32 v13, -v10, v12, v7
; VI-NEXT:    v_fma_f32 v12, v13, v11, v12
; VI-NEXT:    v_fma_f32 v7, -v10, v12, v7
; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; VI-NEXT:    v_div_fmas_f32 v7, v7, v11, v12
; VI-NEXT:    v_div_fixup_f32 v7, v7, v6, v2
; VI-NEXT:    v_trunc_f32_e32 v7, v7
; VI-NEXT:    v_fma_f32 v2, -v7, v6, v2
; VI-NEXT:    v_div_scale_f32 v7, s[0:1], v5, v5, v1
; VI-NEXT:    v_div_scale_f32 v6, vcc, v1, v5, v1
; VI-NEXT:    v_rcp_f32_e32 v10, v7
; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; VI-NEXT:    v_fma_f32 v11, -v7, v10, 1.0
; VI-NEXT:    v_fma_f32 v10, v11, v10, v10
; VI-NEXT:    v_mul_f32_e32 v11, v6, v10
; VI-NEXT:    v_fma_f32 v12, -v7, v11, v6
; VI-NEXT:    v_fma_f32 v11, v12, v10, v11
; VI-NEXT:    v_fma_f32 v6, -v7, v11, v6
; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; VI-NEXT:    v_div_fmas_f32 v6, v6, v10, v11
; VI-NEXT:    v_div_fixup_f32 v6, v6, v5, v1
; VI-NEXT:    v_trunc_f32_e32 v6, v6
; VI-NEXT:    v_fma_f32 v1, -v6, v5, v1
; VI-NEXT:    v_div_scale_f32 v6, s[0:1], v4, v4, v0
; VI-NEXT:    v_div_scale_f32 v5, vcc, v0, v4, v0
; VI-NEXT:    v_rcp_f32_e32 v7, v6
; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; VI-NEXT:    v_fma_f32 v10, -v6, v7, 1.0
; VI-NEXT:    v_fma_f32 v7, v10, v7, v7
; VI-NEXT:    v_mul_f32_e32 v10, v5, v7
; VI-NEXT:    v_fma_f32 v11, -v6, v10, v5
; VI-NEXT:    v_fma_f32 v10, v11, v7, v10
; VI-NEXT:    v_fma_f32 v5, -v6, v10, v5
; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; VI-NEXT:    v_div_fmas_f32 v5, v5, v7, v10
; VI-NEXT:    v_div_fixup_f32 v5, v5, v4, v0
; VI-NEXT:    v_trunc_f32_e32 v5, v5
; VI-NEXT:    v_fma_f32 v0, -v5, v4, v0
; VI-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: frem_v4f32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX9-NEXT:    v_mov_b32_e32 v8, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx4 v[0:3], v8, s[6:7]
; GFX9-NEXT:    global_load_dwordx4 v[4:7], v8, s[2:3] offset:64
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_div_scale_f32 v10, s[0:1], v7, v7, v3
; GFX9-NEXT:    v_div_scale_f32 v9, vcc, v3, v7, v3
; GFX9-NEXT:    v_rcp_f32_e32 v11, v10
; GFX9-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; GFX9-NEXT:    v_fma_f32 v12, -v10, v11, 1.0
; GFX9-NEXT:    v_fma_f32 v11, v12, v11, v11
; GFX9-NEXT:    v_mul_f32_e32 v12, v9, v11
; GFX9-NEXT:    v_fma_f32 v13, -v10, v12, v9
; GFX9-NEXT:    v_fma_f32 v12, v13, v11, v12
; GFX9-NEXT:    v_fma_f32 v9, -v10, v12, v9
; GFX9-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; GFX9-NEXT:    v_div_fmas_f32 v9, v9, v11, v12
; GFX9-NEXT:    v_div_fixup_f32 v9, v9, v7, v3
; GFX9-NEXT:    v_trunc_f32_e32 v9, v9
; GFX9-NEXT:    v_fma_f32 v3, -v9, v7, v3
; GFX9-NEXT:    v_div_scale_f32 v9, s[0:1], v6, v6, v2
; GFX9-NEXT:    v_div_scale_f32 v7, vcc, v2, v6, v2
; GFX9-NEXT:    v_rcp_f32_e32 v10, v9
; GFX9-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; GFX9-NEXT:    v_fma_f32 v11, -v9, v10, 1.0
; GFX9-NEXT:    v_fma_f32 v10, v11, v10, v10
; GFX9-NEXT:    v_mul_f32_e32 v11, v7, v10
; GFX9-NEXT:    v_fma_f32 v12, -v9, v11, v7
; GFX9-NEXT:    v_fma_f32 v11, v12, v10, v11
; GFX9-NEXT:    v_fma_f32 v7, -v9, v11, v7
; GFX9-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; GFX9-NEXT:    v_div_fmas_f32 v7, v7, v10, v11
; GFX9-NEXT:    v_div_fixup_f32 v7, v7, v6, v2
; GFX9-NEXT:    v_trunc_f32_e32 v7, v7
; GFX9-NEXT:    v_fma_f32 v2, -v7, v6, v2
; GFX9-NEXT:    v_div_scale_f32 v7, s[0:1], v5, v5, v1
; GFX9-NEXT:    v_div_scale_f32 v6, vcc, v1, v5, v1
; GFX9-NEXT:    v_rcp_f32_e32 v9, v7
; GFX9-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; GFX9-NEXT:    v_fma_f32 v10, -v7, v9, 1.0
; GFX9-NEXT:    v_fma_f32 v9, v10, v9, v9
; GFX9-NEXT:    v_mul_f32_e32 v10, v6, v9
; GFX9-NEXT:    v_fma_f32 v11, -v7, v10, v6
; GFX9-NEXT:    v_fma_f32 v10, v11, v9, v10
; GFX9-NEXT:    v_fma_f32 v6, -v7, v10, v6
; GFX9-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; GFX9-NEXT:    v_div_fmas_f32 v6, v6, v9, v10
; GFX9-NEXT:    v_div_fixup_f32 v6, v6, v5, v1
; GFX9-NEXT:    v_trunc_f32_e32 v6, v6
; GFX9-NEXT:    v_fma_f32 v1, -v6, v5, v1
; GFX9-NEXT:    v_div_scale_f32 v6, s[0:1], v4, v4, v0
; GFX9-NEXT:    v_div_scale_f32 v5, vcc, v0, v4, v0
; GFX9-NEXT:    v_rcp_f32_e32 v7, v6
; GFX9-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; GFX9-NEXT:    v_fma_f32 v9, -v6, v7, 1.0
; GFX9-NEXT:    v_fma_f32 v7, v9, v7, v7
; GFX9-NEXT:    v_mul_f32_e32 v9, v5, v7
; GFX9-NEXT:    v_fma_f32 v10, -v6, v9, v5
; GFX9-NEXT:    v_fma_f32 v9, v10, v7, v9
; GFX9-NEXT:    v_fma_f32 v5, -v6, v9, v5
; GFX9-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; GFX9-NEXT:    v_div_fmas_f32 v5, v5, v7, v9
; GFX9-NEXT:    v_div_fixup_f32 v5, v5, v4, v0
; GFX9-NEXT:    v_trunc_f32_e32 v5, v5
; GFX9-NEXT:    v_fma_f32 v0, -v5, v4, v0
; GFX9-NEXT:    global_store_dwordx4 v8, v[0:3], s[4:5]
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: frem_v4f32:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-NEXT:    v_mov_b32_e32 v8, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    global_load_dwordx4 v[0:3], v8, s[6:7]
; GFX10-NEXT:    global_load_dwordx4 v[4:7], v8, s[2:3] offset:64
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_div_scale_f32 v10, s0, v7, v7, v3
; GFX10-NEXT:    v_div_scale_f32 v9, vcc_lo, v3, v7, v3
; GFX10-NEXT:    v_rcp_f32_e32 v11, v10
; GFX10-NEXT:    s_denorm_mode 15
; GFX10-NEXT:    v_fma_f32 v12, -v10, v11, 1.0
; GFX10-NEXT:    v_fmac_f32_e32 v11, v12, v11
; GFX10-NEXT:    v_mul_f32_e32 v12, v9, v11
; GFX10-NEXT:    v_fma_f32 v13, -v10, v12, v9
; GFX10-NEXT:    v_fmac_f32_e32 v12, v13, v11
; GFX10-NEXT:    v_fma_f32 v9, -v10, v12, v9
; GFX10-NEXT:    s_denorm_mode 12
; GFX10-NEXT:    v_div_fmas_f32 v9, v9, v11, v12
; GFX10-NEXT:    v_div_fixup_f32 v9, v9, v7, v3
; GFX10-NEXT:    v_trunc_f32_e32 v9, v9
; GFX10-NEXT:    v_fma_f32 v3, -v9, v7, v3
; GFX10-NEXT:    v_div_scale_f32 v9, s0, v6, v6, v2
; GFX10-NEXT:    v_div_scale_f32 v7, vcc_lo, v2, v6, v2
; GFX10-NEXT:    v_rcp_f32_e32 v10, v9
; GFX10-NEXT:    s_denorm_mode 15
; GFX10-NEXT:    v_fma_f32 v11, -v9, v10, 1.0
; GFX10-NEXT:    v_fmac_f32_e32 v10, v11, v10
; GFX10-NEXT:    v_mul_f32_e32 v11, v7, v10
; GFX10-NEXT:    v_fma_f32 v12, -v9, v11, v7
; GFX10-NEXT:    v_fmac_f32_e32 v11, v12, v10
; GFX10-NEXT:    v_fma_f32 v7, -v9, v11, v7
; GFX10-NEXT:    s_denorm_mode 12
; GFX10-NEXT:    v_div_fmas_f32 v7, v7, v10, v11
; GFX10-NEXT:    v_div_fixup_f32 v7, v7, v6, v2
; GFX10-NEXT:    v_trunc_f32_e32 v7, v7
; GFX10-NEXT:    v_fma_f32 v2, -v7, v6, v2
; GFX10-NEXT:    v_div_scale_f32 v7, s0, v5, v5, v1
; GFX10-NEXT:    v_div_scale_f32 v6, vcc_lo, v1, v5, v1
; GFX10-NEXT:    v_rcp_f32_e32 v9, v7
; GFX10-NEXT:    s_denorm_mode 15
; GFX10-NEXT:    v_fma_f32 v10, -v7, v9, 1.0
; GFX10-NEXT:    v_fmac_f32_e32 v9, v10, v9
; GFX10-NEXT:    v_mul_f32_e32 v10, v6, v9
; GFX10-NEXT:    v_fma_f32 v11, -v7, v10, v6
; GFX10-NEXT:    v_fmac_f32_e32 v10, v11, v9
; GFX10-NEXT:    v_fma_f32 v6, -v7, v10, v6
; GFX10-NEXT:    s_denorm_mode 12
; GFX10-NEXT:    v_div_fmas_f32 v6, v6, v9, v10
; GFX10-NEXT:    v_div_fixup_f32 v6, v6, v5, v1
; GFX10-NEXT:    v_trunc_f32_e32 v6, v6
; GFX10-NEXT:    v_fma_f32 v1, -v6, v5, v1
; GFX10-NEXT:    v_div_scale_f32 v6, s0, v4, v4, v0
; GFX10-NEXT:    v_div_scale_f32 v5, vcc_lo, v0, v4, v0
; GFX10-NEXT:    v_rcp_f32_e32 v7, v6
; GFX10-NEXT:    s_denorm_mode 15
; GFX10-NEXT:    v_fma_f32 v9, -v6, v7, 1.0
; GFX10-NEXT:    v_fmac_f32_e32 v7, v9, v7
; GFX10-NEXT:    v_mul_f32_e32 v9, v5, v7
; GFX10-NEXT:    v_fma_f32 v10, -v6, v9, v5
; GFX10-NEXT:    v_fmac_f32_e32 v9, v10, v7
; GFX10-NEXT:    v_fma_f32 v5, -v6, v9, v5
; GFX10-NEXT:    s_denorm_mode 12
; GFX10-NEXT:    v_div_fmas_f32 v5, v5, v7, v9
; GFX10-NEXT:    v_div_fixup_f32 v5, v5, v4, v0
; GFX10-NEXT:    v_trunc_f32_e32 v5, v5
; GFX10-NEXT:    v_fma_f32 v0, -v5, v4, v0
; GFX10-NEXT:    global_store_dwordx4 v8, v[0:3], s[4:5]
; GFX10-NEXT:    s_endpgm
                        <4 x float> addrspace(1)* %in2) #0 {
   %gep2 = getelementptr <4 x float>, <4 x float> addrspace(1)* %in2, i32 4
   %r0 = load <4 x float>, <4 x float> addrspace(1)* %in1, align 16
   %r1 = load <4 x float>, <4 x float> addrspace(1)* %gep2, align 16
   %r2 = frem <4 x float> %r0, %r1
   store <4 x float> %r2, <4 x float> addrspace(1)* %out, align 16
   ret void
}
2579
; frem on <2 x double>. Loads the dividend vector from %in1 and the divisor
; vector from element index 4 of %in2 (a 64-byte displacement in the checks),
; takes the element-wise remainder and stores it to %out.
; The checks show a per-element f64 division sequence (v_div_scale_f64,
; v_rcp_f64, v_fma_f64, v_div_fmas_f64, v_div_fixup_f64), a truncate, and a
; final v_fma_f64. The SI checks contain no v_trunc_f64 and instead have an
; integer sequence (v_bfe_u32, v_lshr_b64, v_not, v_and, v_cndmask) in its
; place; they also rebuild vcc with v_cmp_eq_u32 and s_xor_b64 before
; v_div_fmas_f64. The other targets use v_trunc_f64 and take vcc straight
; from the second v_div_scale_f64.
define amdgpu_kernel void @frem_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %in1,
; SI-LABEL: frem_v2f64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s4, s8
; SI-NEXT:    s_mov_b32 s5, s9
; SI-NEXT:    s_mov_b32 s8, s10
; SI-NEXT:    s_mov_b32 s9, s11
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_mov_b32 s2, s6
; SI-NEXT:    s_mov_b32 s3, s7
; SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; SI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:64
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_div_scale_f64 v[8:9], s[0:1], v[6:7], v[6:7], v[2:3]
; SI-NEXT:    v_rcp_f64_e32 v[10:11], v[8:9]
; SI-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0
; SI-NEXT:    v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
; SI-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0
; SI-NEXT:    v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
; SI-NEXT:    v_div_scale_f64 v[12:13], s[0:1], v[2:3], v[6:7], v[2:3]
; SI-NEXT:    v_mul_f64 v[14:15], v[12:13], v[10:11]
; SI-NEXT:    v_fma_f64 v[16:17], -v[8:9], v[14:15], v[12:13]
; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v9
; SI-NEXT:    v_cmp_eq_u32_e64 s[0:1], v3, v13
; SI-NEXT:    s_xor_b64 vcc, s[0:1], vcc
; SI-NEXT:    s_nop 1
; SI-NEXT:    v_div_fmas_f64 v[8:9], v[16:17], v[10:11], v[14:15]
; SI-NEXT:    v_div_fixup_f64 v[8:9], v[8:9], v[6:7], v[2:3]
; SI-NEXT:    v_bfe_u32 v10, v9, 20, 11
; SI-NEXT:    v_add_i32_e32 v12, vcc, 0xfffffc01, v10
; SI-NEXT:    s_mov_b32 s3, 0xfffff
; SI-NEXT:    v_lshr_b64 v[10:11], s[2:3], v12
; SI-NEXT:    v_not_b32_e32 v10, v10
; SI-NEXT:    v_and_b32_e32 v10, v8, v10
; SI-NEXT:    v_not_b32_e32 v11, v11
; SI-NEXT:    v_and_b32_e32 v11, v9, v11
; SI-NEXT:    v_and_b32_e32 v13, 0x80000000, v9
; SI-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v12
; SI-NEXT:    v_cndmask_b32_e32 v11, v11, v13, vcc
; SI-NEXT:    v_cmp_lt_i32_e64 s[0:1], 51, v12
; SI-NEXT:    v_cndmask_b32_e64 v9, v11, v9, s[0:1]
; SI-NEXT:    v_cndmask_b32_e64 v10, v10, 0, vcc
; SI-NEXT:    v_cndmask_b32_e64 v8, v10, v8, s[0:1]
; SI-NEXT:    v_fma_f64 v[2:3], -v[8:9], v[6:7], v[2:3]
; SI-NEXT:    v_div_scale_f64 v[6:7], s[0:1], v[4:5], v[4:5], v[0:1]
; SI-NEXT:    v_rcp_f64_e32 v[8:9], v[6:7]
; SI-NEXT:    v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
; SI-NEXT:    v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
; SI-NEXT:    v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
; SI-NEXT:    v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
; SI-NEXT:    v_div_scale_f64 v[10:11], s[0:1], v[0:1], v[4:5], v[0:1]
; SI-NEXT:    v_mul_f64 v[12:13], v[10:11], v[8:9]
; SI-NEXT:    v_fma_f64 v[14:15], -v[6:7], v[12:13], v[10:11]
; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v7
; SI-NEXT:    v_cmp_eq_u32_e64 s[0:1], v1, v11
; SI-NEXT:    s_xor_b64 vcc, s[0:1], vcc
; SI-NEXT:    s_nop 1
; SI-NEXT:    v_div_fmas_f64 v[6:7], v[14:15], v[8:9], v[12:13]
; SI-NEXT:    v_div_fixup_f64 v[6:7], v[6:7], v[4:5], v[0:1]
; SI-NEXT:    v_bfe_u32 v8, v7, 20, 11
; SI-NEXT:    v_add_i32_e32 v10, vcc, 0xfffffc01, v8
; SI-NEXT:    v_lshr_b64 v[8:9], s[2:3], v10
; SI-NEXT:    v_not_b32_e32 v8, v8
; SI-NEXT:    v_and_b32_e32 v8, v6, v8
; SI-NEXT:    v_not_b32_e32 v9, v9
; SI-NEXT:    v_and_b32_e32 v9, v7, v9
; SI-NEXT:    v_and_b32_e32 v11, 0x80000000, v7
; SI-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v10
; SI-NEXT:    v_cndmask_b32_e32 v9, v9, v11, vcc
; SI-NEXT:    v_cmp_lt_i32_e64 s[0:1], 51, v10
; SI-NEXT:    v_cndmask_b32_e64 v7, v9, v7, s[0:1]
; SI-NEXT:    v_cndmask_b32_e64 v8, v8, 0, vcc
; SI-NEXT:    v_cndmask_b32_e64 v6, v8, v6, s[0:1]
; SI-NEXT:    v_fma_f64 v[0:1], -v[6:7], v[4:5], v[0:1]
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; CI-LABEL: frem_v2f64:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; CI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xd
; CI-NEXT:    s_mov_b32 s3, 0xf000
; CI-NEXT:    s_mov_b32 s2, -1
; CI-NEXT:    s_mov_b32 s10, s2
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    s_mov_b32 s0, s4
; CI-NEXT:    s_mov_b32 s1, s5
; CI-NEXT:    s_mov_b32 s4, s6
; CI-NEXT:    s_mov_b32 s5, s7
; CI-NEXT:    s_mov_b32 s6, s2
; CI-NEXT:    s_mov_b32 s7, s3
; CI-NEXT:    s_mov_b32 s11, s3
; CI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
; CI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:64
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    v_div_scale_f64 v[8:9], s[4:5], v[6:7], v[6:7], v[2:3]
; CI-NEXT:    v_rcp_f64_e32 v[10:11], v[8:9]
; CI-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0
; CI-NEXT:    v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
; CI-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0
; CI-NEXT:    v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
; CI-NEXT:    v_div_scale_f64 v[12:13], vcc, v[2:3], v[6:7], v[2:3]
; CI-NEXT:    v_mul_f64 v[14:15], v[12:13], v[10:11]
; CI-NEXT:    v_fma_f64 v[8:9], -v[8:9], v[14:15], v[12:13]
; CI-NEXT:    s_nop 1
; CI-NEXT:    v_div_fmas_f64 v[8:9], v[8:9], v[10:11], v[14:15]
; CI-NEXT:    v_div_fixup_f64 v[8:9], v[8:9], v[6:7], v[2:3]
; CI-NEXT:    v_trunc_f64_e32 v[8:9], v[8:9]
; CI-NEXT:    v_fma_f64 v[2:3], -v[8:9], v[6:7], v[2:3]
; CI-NEXT:    v_div_scale_f64 v[6:7], s[4:5], v[4:5], v[4:5], v[0:1]
; CI-NEXT:    v_rcp_f64_e32 v[8:9], v[6:7]
; CI-NEXT:    v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
; CI-NEXT:    v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
; CI-NEXT:    v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
; CI-NEXT:    v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
; CI-NEXT:    v_div_scale_f64 v[10:11], vcc, v[0:1], v[4:5], v[0:1]
; CI-NEXT:    v_mul_f64 v[12:13], v[10:11], v[8:9]
; CI-NEXT:    v_fma_f64 v[6:7], -v[6:7], v[12:13], v[10:11]
; CI-NEXT:    s_nop 1
; CI-NEXT:    v_div_fmas_f64 v[6:7], v[6:7], v[8:9], v[12:13]
; CI-NEXT:    v_div_fixup_f64 v[6:7], v[6:7], v[4:5], v[0:1]
; CI-NEXT:    v_trunc_f64_e32 v[6:7], v[6:7]
; CI-NEXT:    v_fma_f64 v[0:1], -v[6:7], v[4:5], v[0:1]
; CI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; CI-NEXT:    s_endpgm
;
; VI-LABEL: frem_v2f64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s6
; VI-NEXT:    s_add_u32 s0, s0, 64
; VI-NEXT:    s_addc_u32 s1, s1, 0
; VI-NEXT:    v_mov_b32_e32 v5, s1
; VI-NEXT:    v_mov_b32_e32 v1, s7
; VI-NEXT:    v_mov_b32_e32 v4, s0
; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
; VI-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
; VI-NEXT:    v_mov_b32_e32 v8, s4
; VI-NEXT:    v_mov_b32_e32 v9, s5
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_div_scale_f64 v[10:11], s[0:1], v[6:7], v[6:7], v[2:3]
; VI-NEXT:    v_rcp_f64_e32 v[12:13], v[10:11]
; VI-NEXT:    v_fma_f64 v[14:15], -v[10:11], v[12:13], 1.0
; VI-NEXT:    v_fma_f64 v[12:13], v[12:13], v[14:15], v[12:13]
; VI-NEXT:    v_fma_f64 v[14:15], -v[10:11], v[12:13], 1.0
; VI-NEXT:    v_fma_f64 v[12:13], v[12:13], v[14:15], v[12:13]
; VI-NEXT:    v_div_scale_f64 v[14:15], vcc, v[2:3], v[6:7], v[2:3]
; VI-NEXT:    v_mul_f64 v[16:17], v[14:15], v[12:13]
; VI-NEXT:    v_fma_f64 v[10:11], -v[10:11], v[16:17], v[14:15]
; VI-NEXT:    s_nop 1
; VI-NEXT:    v_div_fmas_f64 v[10:11], v[10:11], v[12:13], v[16:17]
; VI-NEXT:    v_div_fixup_f64 v[10:11], v[10:11], v[6:7], v[2:3]
; VI-NEXT:    v_trunc_f64_e32 v[10:11], v[10:11]
; VI-NEXT:    v_fma_f64 v[2:3], -v[10:11], v[6:7], v[2:3]
; VI-NEXT:    v_div_scale_f64 v[6:7], s[0:1], v[4:5], v[4:5], v[0:1]
; VI-NEXT:    v_rcp_f64_e32 v[10:11], v[6:7]
; VI-NEXT:    v_fma_f64 v[12:13], -v[6:7], v[10:11], 1.0
; VI-NEXT:    v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
; VI-NEXT:    v_fma_f64 v[12:13], -v[6:7], v[10:11], 1.0
; VI-NEXT:    v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
; VI-NEXT:    v_div_scale_f64 v[12:13], vcc, v[0:1], v[4:5], v[0:1]
; VI-NEXT:    v_mul_f64 v[14:15], v[12:13], v[10:11]
; VI-NEXT:    v_fma_f64 v[6:7], -v[6:7], v[14:15], v[12:13]
; VI-NEXT:    s_nop 1
; VI-NEXT:    v_div_fmas_f64 v[6:7], v[6:7], v[10:11], v[14:15]
; VI-NEXT:    v_div_fixup_f64 v[6:7], v[6:7], v[4:5], v[0:1]
; VI-NEXT:    v_trunc_f64_e32 v[6:7], v[6:7]
; VI-NEXT:    v_fma_f64 v[0:1], -v[6:7], v[4:5], v[0:1]
; VI-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: frem_v2f64:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX9-NEXT:    v_mov_b32_e32 v16, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx4 v[0:3], v16, s[6:7]
; GFX9-NEXT:    global_load_dwordx4 v[4:7], v16, s[2:3] offset:64
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_div_scale_f64 v[8:9], s[0:1], v[6:7], v[6:7], v[2:3]
; GFX9-NEXT:    v_rcp_f64_e32 v[10:11], v[8:9]
; GFX9-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0
; GFX9-NEXT:    v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
; GFX9-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0
; GFX9-NEXT:    v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
; GFX9-NEXT:    v_div_scale_f64 v[12:13], vcc, v[2:3], v[6:7], v[2:3]
; GFX9-NEXT:    v_mul_f64 v[14:15], v[12:13], v[10:11]
; GFX9-NEXT:    v_fma_f64 v[8:9], -v[8:9], v[14:15], v[12:13]
; GFX9-NEXT:    s_nop 1
; GFX9-NEXT:    v_div_fmas_f64 v[8:9], v[8:9], v[10:11], v[14:15]
; GFX9-NEXT:    v_div_fixup_f64 v[8:9], v[8:9], v[6:7], v[2:3]
; GFX9-NEXT:    v_trunc_f64_e32 v[8:9], v[8:9]
; GFX9-NEXT:    v_fma_f64 v[2:3], -v[8:9], v[6:7], v[2:3]
; GFX9-NEXT:    v_div_scale_f64 v[6:7], s[0:1], v[4:5], v[4:5], v[0:1]
; GFX9-NEXT:    v_rcp_f64_e32 v[8:9], v[6:7]
; GFX9-NEXT:    v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
; GFX9-NEXT:    v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
; GFX9-NEXT:    v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
; GFX9-NEXT:    v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
; GFX9-NEXT:    v_div_scale_f64 v[10:11], vcc, v[0:1], v[4:5], v[0:1]
; GFX9-NEXT:    v_mul_f64 v[12:13], v[10:11], v[8:9]
; GFX9-NEXT:    v_fma_f64 v[6:7], -v[6:7], v[12:13], v[10:11]
; GFX9-NEXT:    s_nop 1
; GFX9-NEXT:    v_div_fmas_f64 v[6:7], v[6:7], v[8:9], v[12:13]
; GFX9-NEXT:    v_div_fixup_f64 v[6:7], v[6:7], v[4:5], v[0:1]
; GFX9-NEXT:    v_trunc_f64_e32 v[6:7], v[6:7]
; GFX9-NEXT:    v_fma_f64 v[0:1], -v[6:7], v[4:5], v[0:1]
; GFX9-NEXT:    global_store_dwordx4 v16, v[0:3], s[4:5]
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: frem_v2f64:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-NEXT:    v_mov_b32_e32 v16, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    global_load_dwordx4 v[0:3], v16, s[6:7]
; GFX10-NEXT:    global_load_dwordx4 v[4:7], v16, s[2:3] offset:64
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_div_scale_f64 v[8:9], s0, v[6:7], v[6:7], v[2:3]
; GFX10-NEXT:    v_rcp_f64_e32 v[10:11], v[8:9]
; GFX10-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0
; GFX10-NEXT:    v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
; GFX10-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0
; GFX10-NEXT:    v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
; GFX10-NEXT:    v_div_scale_f64 v[12:13], vcc_lo, v[2:3], v[6:7], v[2:3]
; GFX10-NEXT:    v_mul_f64 v[14:15], v[12:13], v[10:11]
; GFX10-NEXT:    v_fma_f64 v[8:9], -v[8:9], v[14:15], v[12:13]
; GFX10-NEXT:    v_div_fmas_f64 v[8:9], v[8:9], v[10:11], v[14:15]
; GFX10-NEXT:    v_div_fixup_f64 v[8:9], v[8:9], v[6:7], v[2:3]
; GFX10-NEXT:    v_trunc_f64_e32 v[8:9], v[8:9]
; GFX10-NEXT:    v_fma_f64 v[2:3], -v[8:9], v[6:7], v[2:3]
; GFX10-NEXT:    v_div_scale_f64 v[6:7], s0, v[4:5], v[4:5], v[0:1]
; GFX10-NEXT:    v_rcp_f64_e32 v[8:9], v[6:7]
; GFX10-NEXT:    v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
; GFX10-NEXT:    v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
; GFX10-NEXT:    v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
; GFX10-NEXT:    v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
; GFX10-NEXT:    v_div_scale_f64 v[10:11], vcc_lo, v[0:1], v[4:5], v[0:1]
; GFX10-NEXT:    v_mul_f64 v[12:13], v[10:11], v[8:9]
; GFX10-NEXT:    v_fma_f64 v[6:7], -v[6:7], v[12:13], v[10:11]
; GFX10-NEXT:    v_div_fmas_f64 v[6:7], v[6:7], v[8:9], v[12:13]
; GFX10-NEXT:    v_div_fixup_f64 v[6:7], v[6:7], v[4:5], v[0:1]
; GFX10-NEXT:    v_trunc_f64_e32 v[6:7], v[6:7]
; GFX10-NEXT:    v_fma_f64 v[0:1], -v[6:7], v[4:5], v[0:1]
; GFX10-NEXT:    global_store_dwordx4 v16, v[0:3], s[4:5]
; GFX10-NEXT:    s_endpgm
                        <2 x double> addrspace(1)* %in2) #0 {
   %gep2 = getelementptr <2 x double>, <2 x double> addrspace(1)* %in2, i32 4
   %r0 = load <2 x double>, <2 x double> addrspace(1)* %in1, align 16
   %r1 = load <2 x double>, <2 x double> addrspace(1)* %gep2, align 16
   %r2 = frem <2 x double> %r0, %r1
   store <2 x double> %r2, <2 x double> addrspace(1)* %out, align 16
   ret void
}
2846
; Attribute groups. The two groups differ only in the value of
; "unsafe-fp-math" (false for #0, true for #1). Both are nounwind and set
; "denormal-fp-math-f32" to preserve-sign for both of its fields.
attributes #0 = { nounwind "unsafe-fp-math"="false" "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
attributes #1 = { nounwind "unsafe-fp-math"="true" "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
2849