1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN:  llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -march=amdgcn -mattr=+mad-mac-f32-insts -verify-machineinstrs  < %s | FileCheck --check-prefix=SI %s
3; RUN:  llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck --check-prefix=CI %s
4; RUN:  llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefix=VI %s
5; RUN:  llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9 %s
6; RUN:  llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s
7; RUN:  llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX11 %s
8
; frem_f16: remainder of two half values with no fast-math flags on the frem.
; Reads one half from %in1 and one from element 4 of %in2, and stores the
; remainder to %out. Element 4 of a half array is 8 bytes in, which shows up in
; the assertions below either as `offset:8` on the load or as an `s_add_u32 ..., 8`.
; Expected lowering, as pinned by the assertions:
;   - SI and CI runs convert both operands to f32 and use the v_div_scale_f32 /
;     v_rcp_f32 / v_fma_f32 / v_div_fmas_f32 / v_div_fixup_f32 sequence, then
;     v_trunc_f32 and a final v_fma_f32 before converting back to f16.
;   - VI, GFX9, GFX10 and GFX11 runs use v_rcp_f32 + v_mul_f32, convert back to
;     f16, then v_div_fixup_f16, v_trunc_f16 and v_fma_f16.
; The assertions are autogenerated (see the NOTE at the top of the file); regenerate
; them with the update script rather than editing them by hand.
; Attribute group #0 is defined outside the part of the file visible here.
9define amdgpu_kernel void @frem_f16(half addrspace(1)* %out, half addrspace(1)* %in1,
10; SI-LABEL: frem_f16:
11; SI:       ; %bb.0:
12; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
13; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
14; SI-NEXT:    s_mov_b32 s11, 0xf000
15; SI-NEXT:    s_mov_b32 s10, -1
16; SI-NEXT:    s_waitcnt lgkmcnt(0)
17; SI-NEXT:    s_mov_b32 s8, s4
18; SI-NEXT:    s_mov_b32 s9, s5
19; SI-NEXT:    s_mov_b32 s4, s6
20; SI-NEXT:    s_mov_b32 s5, s7
21; SI-NEXT:    s_mov_b32 s6, s10
22; SI-NEXT:    s_mov_b32 s7, s11
23; SI-NEXT:    s_mov_b32 s2, s10
24; SI-NEXT:    s_mov_b32 s3, s11
25; SI-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
26; SI-NEXT:    s_waitcnt vmcnt(0)
27; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
28; SI-NEXT:    buffer_load_ushort v1, off, s[0:3], 0 offset:8
29; SI-NEXT:    s_waitcnt vmcnt(0)
30; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
31; SI-NEXT:    v_div_scale_f32 v2, vcc, v0, v1, v0
32; SI-NEXT:    v_div_scale_f32 v3, s[0:1], v1, v1, v0
33; SI-NEXT:    v_rcp_f32_e32 v4, v3
34; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
35; SI-NEXT:    v_fma_f32 v5, -v3, v4, 1.0
36; SI-NEXT:    v_fma_f32 v4, v5, v4, v4
37; SI-NEXT:    v_mul_f32_e32 v5, v2, v4
38; SI-NEXT:    v_fma_f32 v6, -v3, v5, v2
39; SI-NEXT:    v_fma_f32 v5, v6, v4, v5
40; SI-NEXT:    v_fma_f32 v2, -v3, v5, v2
41; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
42; SI-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
43; SI-NEXT:    v_div_fixup_f32 v2, v2, v1, v0
44; SI-NEXT:    v_trunc_f32_e32 v2, v2
45; SI-NEXT:    v_fma_f32 v0, -v2, v1, v0
46; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
47; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
48; SI-NEXT:    buffer_store_short v0, off, s[8:11], 0
49; SI-NEXT:    s_endpgm
50;
51; CI-LABEL: frem_f16:
52; CI:       ; %bb.0:
53; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
54; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
55; CI-NEXT:    s_mov_b32 s11, 0xf000
56; CI-NEXT:    s_mov_b32 s10, -1
57; CI-NEXT:    s_mov_b32 s2, s10
58; CI-NEXT:    s_waitcnt lgkmcnt(0)
59; CI-NEXT:    s_mov_b32 s8, s4
60; CI-NEXT:    s_mov_b32 s9, s5
61; CI-NEXT:    s_mov_b32 s4, s6
62; CI-NEXT:    s_mov_b32 s5, s7
63; CI-NEXT:    s_mov_b32 s6, s10
64; CI-NEXT:    s_mov_b32 s7, s11
65; CI-NEXT:    s_mov_b32 s3, s11
66; CI-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
67; CI-NEXT:    buffer_load_ushort v1, off, s[0:3], 0 offset:8
68; CI-NEXT:    s_waitcnt vmcnt(1)
69; CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
70; CI-NEXT:    s_waitcnt vmcnt(0)
71; CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
72; CI-NEXT:    v_div_scale_f32 v3, s[0:1], v1, v1, v0
73; CI-NEXT:    v_div_scale_f32 v2, vcc, v0, v1, v0
74; CI-NEXT:    v_rcp_f32_e32 v4, v3
75; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
76; CI-NEXT:    v_fma_f32 v5, -v3, v4, 1.0
77; CI-NEXT:    v_fma_f32 v4, v5, v4, v4
78; CI-NEXT:    v_mul_f32_e32 v5, v2, v4
79; CI-NEXT:    v_fma_f32 v6, -v3, v5, v2
80; CI-NEXT:    v_fma_f32 v5, v6, v4, v5
81; CI-NEXT:    v_fma_f32 v2, -v3, v5, v2
82; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
83; CI-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
84; CI-NEXT:    v_div_fixup_f32 v2, v2, v1, v0
85; CI-NEXT:    v_trunc_f32_e32 v2, v2
86; CI-NEXT:    v_fma_f32 v0, -v2, v1, v0
87; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
88; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
89; CI-NEXT:    buffer_store_short v0, off, s[8:11], 0
90; CI-NEXT:    s_endpgm
91;
92; VI-LABEL: frem_f16:
93; VI:       ; %bb.0:
94; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
95; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
96; VI-NEXT:    s_waitcnt lgkmcnt(0)
97; VI-NEXT:    v_mov_b32_e32 v2, s6
98; VI-NEXT:    s_add_u32 s0, s0, 8
99; VI-NEXT:    v_mov_b32_e32 v3, s7
100; VI-NEXT:    s_addc_u32 s1, s1, 0
101; VI-NEXT:    flat_load_ushort v4, v[2:3]
102; VI-NEXT:    v_mov_b32_e32 v3, s1
103; VI-NEXT:    v_mov_b32_e32 v2, s0
104; VI-NEXT:    flat_load_ushort v2, v[2:3]
105; VI-NEXT:    v_mov_b32_e32 v0, s4
106; VI-NEXT:    v_mov_b32_e32 v1, s5
107; VI-NEXT:    s_waitcnt vmcnt(1)
108; VI-NEXT:    v_cvt_f32_f16_e32 v3, v4
109; VI-NEXT:    s_waitcnt vmcnt(0)
110; VI-NEXT:    v_cvt_f32_f16_e32 v5, v2
111; VI-NEXT:    v_rcp_f32_e32 v5, v5
112; VI-NEXT:    v_mul_f32_e32 v3, v3, v5
113; VI-NEXT:    v_cvt_f16_f32_e32 v3, v3
114; VI-NEXT:    v_div_fixup_f16 v3, v3, v2, v4
115; VI-NEXT:    v_trunc_f16_e32 v3, v3
116; VI-NEXT:    v_fma_f16 v2, -v3, v2, v4
117; VI-NEXT:    flat_store_short v[0:1], v2
118; VI-NEXT:    s_endpgm
119;
120; GFX9-LABEL: frem_f16:
121; GFX9:       ; %bb.0:
122; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
123; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
124; GFX9-NEXT:    v_mov_b32_e32 v0, 0
125; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
126; GFX9-NEXT:    global_load_ushort v1, v0, s[6:7]
127; GFX9-NEXT:    global_load_ushort v2, v0, s[2:3] offset:8
128; GFX9-NEXT:    s_waitcnt vmcnt(1)
129; GFX9-NEXT:    v_cvt_f32_f16_e32 v3, v1
130; GFX9-NEXT:    s_waitcnt vmcnt(0)
131; GFX9-NEXT:    v_cvt_f32_f16_e32 v4, v2
132; GFX9-NEXT:    v_rcp_f32_e32 v4, v4
133; GFX9-NEXT:    v_mul_f32_e32 v3, v3, v4
134; GFX9-NEXT:    v_cvt_f16_f32_e32 v3, v3
135; GFX9-NEXT:    v_div_fixup_f16 v3, v3, v2, v1
136; GFX9-NEXT:    v_trunc_f16_e32 v3, v3
137; GFX9-NEXT:    v_fma_f16 v1, -v3, v2, v1
138; GFX9-NEXT:    global_store_short v0, v1, s[4:5]
139; GFX9-NEXT:    s_endpgm
140;
141; GFX10-LABEL: frem_f16:
142; GFX10:       ; %bb.0:
143; GFX10-NEXT:    s_clause 0x1
144; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
145; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
146; GFX10-NEXT:    v_mov_b32_e32 v0, 0
147; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
148; GFX10-NEXT:    s_clause 0x1
149; GFX10-NEXT:    global_load_ushort v1, v0, s[6:7]
150; GFX10-NEXT:    global_load_ushort v2, v0, s[2:3] offset:8
151; GFX10-NEXT:    s_waitcnt vmcnt(1)
152; GFX10-NEXT:    v_cvt_f32_f16_e32 v3, v1
153; GFX10-NEXT:    s_waitcnt vmcnt(0)
154; GFX10-NEXT:    v_cvt_f32_f16_e32 v4, v2
155; GFX10-NEXT:    v_rcp_f32_e32 v4, v4
156; GFX10-NEXT:    v_mul_f32_e32 v3, v3, v4
157; GFX10-NEXT:    v_cvt_f16_f32_e32 v3, v3
158; GFX10-NEXT:    v_div_fixup_f16 v3, v3, v2, v1
159; GFX10-NEXT:    v_trunc_f16_e32 v3, v3
160; GFX10-NEXT:    v_fma_f16 v1, -v3, v2, v1
161; GFX10-NEXT:    global_store_short v0, v1, s[4:5]
162; GFX10-NEXT:    s_endpgm
163;
164; GFX11-LABEL: frem_f16:
165; GFX11:       ; %bb.0:
166; GFX11-NEXT:    s_clause 0x1
167; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
168; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
169; GFX11-NEXT:    v_mov_b32_e32 v0, 0
170; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
171; GFX11-NEXT:    s_clause 0x1
172; GFX11-NEXT:    global_load_u16 v1, v0, s[6:7]
173; GFX11-NEXT:    global_load_u16 v2, v0, s[0:1] offset:8
174; GFX11-NEXT:    s_waitcnt vmcnt(1)
175; GFX11-NEXT:    v_cvt_f32_f16_e32 v3, v1
176; GFX11-NEXT:    s_waitcnt vmcnt(0)
177; GFX11-NEXT:    v_cvt_f32_f16_e32 v4, v2
178; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
179; GFX11-NEXT:    v_rcp_f32_e32 v4, v4
180; GFX11-NEXT:    s_waitcnt_depctr 0xfff
181; GFX11-NEXT:    v_mul_f32_e32 v3, v3, v4
182; GFX11-NEXT:    v_cvt_f16_f32_e32 v3, v3
183; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
184; GFX11-NEXT:    v_div_fixup_f16 v3, v3, v2, v1
185; GFX11-NEXT:    v_trunc_f16_e32 v3, v3
186; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
187; GFX11-NEXT:    v_fma_f16 v1, -v3, v2, v1
188; GFX11-NEXT:    global_store_b16 v0, v1, s[4:5]
189; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
190; GFX11-NEXT:    s_endpgm
191                      half addrspace(1)* %in2) #0 {
  ; The divisor is read from element 4 of %in2, not from its base address.
192   %gep2 = getelementptr half, half addrspace(1)* %in2, i32 4
193   %r0 = load half, half addrspace(1)* %in1, align 4
194   %r1 = load half, half addrspace(1)* %gep2, align 4
  ; Plain frem: no fast-math flags, so the full-precision expansion is expected.
195   %r2 = frem half %r0, %r1
196   store half %r2, half addrspace(1)* %out, align 4
197   ret void
198}
199
; fast_frem_f16: remainder of two half values with the `fast` flag on the frem.
; Reads one half from %in1 and one from element 4 of %in2 (8 bytes in; seen in the
; assertions as `offset:8` or an `s_add_u32 ..., 8`), and stores the remainder to %out.
; Expected lowering, as pinned by the assertions: a reciprocal, a multiply, a trunc
; and an fma of the negated truncated quotient, with no v_div_scale / v_div_fmas /
; v_div_fixup instructions. The SI and CI runs do this in f32 (converting in and
; out of f16); the VI, GFX9, GFX10 and GFX11 runs use the f16 instructions directly.
; The assertions are autogenerated (see the NOTE at the top of the file); regenerate
; them with the update script rather than editing them by hand.
; Attribute group #0 is defined outside the part of the file visible here.
200define amdgpu_kernel void @fast_frem_f16(half addrspace(1)* %out, half addrspace(1)* %in1,
201; SI-LABEL: fast_frem_f16:
202; SI:       ; %bb.0:
203; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
204; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
205; SI-NEXT:    s_mov_b32 s11, 0xf000
206; SI-NEXT:    s_mov_b32 s10, -1
207; SI-NEXT:    s_waitcnt lgkmcnt(0)
208; SI-NEXT:    s_mov_b32 s8, s4
209; SI-NEXT:    s_mov_b32 s9, s5
210; SI-NEXT:    s_mov_b32 s4, s6
211; SI-NEXT:    s_mov_b32 s5, s7
212; SI-NEXT:    s_mov_b32 s6, s10
213; SI-NEXT:    s_mov_b32 s7, s11
214; SI-NEXT:    s_mov_b32 s2, s10
215; SI-NEXT:    s_mov_b32 s3, s11
216; SI-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
217; SI-NEXT:    s_waitcnt vmcnt(0)
218; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
219; SI-NEXT:    buffer_load_ushort v1, off, s[0:3], 0 offset:8
220; SI-NEXT:    s_waitcnt vmcnt(0)
221; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
222; SI-NEXT:    v_rcp_f32_e32 v2, v1
223; SI-NEXT:    v_mul_f32_e32 v2, v0, v2
224; SI-NEXT:    v_trunc_f32_e32 v2, v2
225; SI-NEXT:    v_fma_f32 v0, -v2, v1, v0
226; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
227; SI-NEXT:    buffer_store_short v0, off, s[8:11], 0
228; SI-NEXT:    s_endpgm
229;
230; CI-LABEL: fast_frem_f16:
231; CI:       ; %bb.0:
232; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
233; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
234; CI-NEXT:    s_mov_b32 s11, 0xf000
235; CI-NEXT:    s_mov_b32 s10, -1
236; CI-NEXT:    s_mov_b32 s2, s10
237; CI-NEXT:    s_mov_b32 s3, s11
238; CI-NEXT:    s_waitcnt lgkmcnt(0)
239; CI-NEXT:    buffer_load_ushort v1, off, s[0:3], 0 offset:8
240; CI-NEXT:    s_mov_b32 s8, s4
241; CI-NEXT:    s_mov_b32 s9, s5
242; CI-NEXT:    s_mov_b32 s4, s6
243; CI-NEXT:    s_mov_b32 s5, s7
244; CI-NEXT:    s_mov_b32 s6, s10
245; CI-NEXT:    s_mov_b32 s7, s11
246; CI-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
247; CI-NEXT:    s_waitcnt vmcnt(1)
248; CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
249; CI-NEXT:    v_rcp_f32_e32 v2, v1
250; CI-NEXT:    s_waitcnt vmcnt(0)
251; CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
252; CI-NEXT:    v_mul_f32_e32 v2, v0, v2
253; CI-NEXT:    v_trunc_f32_e32 v2, v2
254; CI-NEXT:    v_fma_f32 v0, -v2, v1, v0
255; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
256; CI-NEXT:    buffer_store_short v0, off, s[8:11], 0
257; CI-NEXT:    s_endpgm
258;
259; VI-LABEL: fast_frem_f16:
260; VI:       ; %bb.0:
261; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
262; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
263; VI-NEXT:    s_waitcnt lgkmcnt(0)
264; VI-NEXT:    v_mov_b32_e32 v2, s6
265; VI-NEXT:    s_add_u32 s0, s0, 8
266; VI-NEXT:    v_mov_b32_e32 v3, s7
267; VI-NEXT:    s_addc_u32 s1, s1, 0
268; VI-NEXT:    flat_load_ushort v4, v[2:3]
269; VI-NEXT:    v_mov_b32_e32 v3, s1
270; VI-NEXT:    v_mov_b32_e32 v2, s0
271; VI-NEXT:    flat_load_ushort v2, v[2:3]
272; VI-NEXT:    v_mov_b32_e32 v0, s4
273; VI-NEXT:    v_mov_b32_e32 v1, s5
274; VI-NEXT:    s_waitcnt vmcnt(0)
275; VI-NEXT:    v_rcp_f16_e32 v3, v2
276; VI-NEXT:    v_mul_f16_e32 v3, v4, v3
277; VI-NEXT:    v_trunc_f16_e32 v3, v3
278; VI-NEXT:    v_fma_f16 v2, -v3, v2, v4
279; VI-NEXT:    flat_store_short v[0:1], v2
280; VI-NEXT:    s_endpgm
281;
282; GFX9-LABEL: fast_frem_f16:
283; GFX9:       ; %bb.0:
284; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
285; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
286; GFX9-NEXT:    v_mov_b32_e32 v0, 0
287; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
288; GFX9-NEXT:    global_load_ushort v1, v0, s[6:7]
289; GFX9-NEXT:    global_load_ushort v2, v0, s[2:3] offset:8
290; GFX9-NEXT:    s_waitcnt vmcnt(0)
291; GFX9-NEXT:    v_rcp_f16_e32 v3, v2
292; GFX9-NEXT:    v_mul_f16_e32 v3, v1, v3
293; GFX9-NEXT:    v_trunc_f16_e32 v3, v3
294; GFX9-NEXT:    v_fma_f16 v1, -v3, v2, v1
295; GFX9-NEXT:    global_store_short v0, v1, s[4:5]
296; GFX9-NEXT:    s_endpgm
297;
298; GFX10-LABEL: fast_frem_f16:
299; GFX10:       ; %bb.0:
300; GFX10-NEXT:    s_clause 0x1
301; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
302; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
303; GFX10-NEXT:    v_mov_b32_e32 v0, 0
304; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
305; GFX10-NEXT:    s_clause 0x1
306; GFX10-NEXT:    global_load_ushort v1, v0, s[6:7]
307; GFX10-NEXT:    global_load_ushort v2, v0, s[2:3] offset:8
308; GFX10-NEXT:    s_waitcnt vmcnt(0)
309; GFX10-NEXT:    v_rcp_f16_e32 v3, v2
310; GFX10-NEXT:    v_mul_f16_e32 v3, v1, v3
311; GFX10-NEXT:    v_trunc_f16_e32 v3, v3
312; GFX10-NEXT:    v_fma_f16 v1, -v3, v2, v1
313; GFX10-NEXT:    global_store_short v0, v1, s[4:5]
314; GFX10-NEXT:    s_endpgm
315;
316; GFX11-LABEL: fast_frem_f16:
317; GFX11:       ; %bb.0:
318; GFX11-NEXT:    s_clause 0x1
319; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
320; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
321; GFX11-NEXT:    v_mov_b32_e32 v0, 0
322; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
323; GFX11-NEXT:    s_clause 0x1
324; GFX11-NEXT:    global_load_u16 v1, v0, s[6:7]
325; GFX11-NEXT:    global_load_u16 v2, v0, s[0:1] offset:8
326; GFX11-NEXT:    s_waitcnt vmcnt(0)
327; GFX11-NEXT:    v_rcp_f16_e32 v3, v2
328; GFX11-NEXT:    s_waitcnt_depctr 0xfff
329; GFX11-NEXT:    v_mul_f16_e32 v3, v1, v3
330; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
331; GFX11-NEXT:    v_trunc_f16_e32 v3, v3
332; GFX11-NEXT:    v_fma_f16 v1, -v3, v2, v1
333; GFX11-NEXT:    global_store_b16 v0, v1, s[4:5]
334; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
335; GFX11-NEXT:    s_endpgm
336                      half addrspace(1)* %in2) #0 {
  ; The divisor is read from element 4 of %in2, not from its base address.
337   %gep2 = getelementptr half, half addrspace(1)* %in2, i32 4
338   %r0 = load half, half addrspace(1)* %in1, align 4
339   %r1 = load half, half addrspace(1)* %gep2, align 4
  ; `fast` on the instruction itself is what distinguishes this test from frem_f16.
340   %r2 = frem fast half %r0, %r1
341   store half %r2, half addrspace(1)* %out, align 4
342   ret void
343}
344
; unsafe_frem_f16: remainder of two half values with only the `afn` flag on the
; frem, in a function that uses attribute group #1 rather than #0.
; Reads one half from %in1 and one from element 4 of %in2 (8 bytes in; seen in the
; assertions as `offset:8` or an `s_add_u32 ..., 8`), and stores the remainder to %out.
; Expected lowering, as pinned by the assertions: a reciprocal, a multiply, a trunc
; and an fma of the negated truncated quotient, with no v_div_scale / v_div_fmas /
; v_div_fixup instructions. The SI and CI runs do this in f32 (converting in and
; out of f16); the VI, GFX9, GFX10 and GFX11 runs use the f16 instructions directly.
; The assertions are autogenerated (see the NOTE at the top of the file); regenerate
; them with the update script rather than editing them by hand.
; NOTE(review): attribute group #1 is defined outside the part of the file visible
; here; presumably it enables an unsafe-fp-math style attribute - confirm there.
345define amdgpu_kernel void @unsafe_frem_f16(half addrspace(1)* %out, half addrspace(1)* %in1,
346; SI-LABEL: unsafe_frem_f16:
347; SI:       ; %bb.0:
348; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
349; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
350; SI-NEXT:    s_mov_b32 s11, 0xf000
351; SI-NEXT:    s_mov_b32 s10, -1
352; SI-NEXT:    s_waitcnt lgkmcnt(0)
353; SI-NEXT:    s_mov_b32 s8, s4
354; SI-NEXT:    s_mov_b32 s9, s5
355; SI-NEXT:    s_mov_b32 s4, s6
356; SI-NEXT:    s_mov_b32 s5, s7
357; SI-NEXT:    s_mov_b32 s6, s10
358; SI-NEXT:    s_mov_b32 s7, s11
359; SI-NEXT:    s_mov_b32 s2, s10
360; SI-NEXT:    s_mov_b32 s3, s11
361; SI-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
362; SI-NEXT:    s_waitcnt vmcnt(0)
363; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
364; SI-NEXT:    buffer_load_ushort v1, off, s[0:3], 0 offset:8
365; SI-NEXT:    s_waitcnt vmcnt(0)
366; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
367; SI-NEXT:    v_rcp_f32_e32 v2, v1
368; SI-NEXT:    v_mul_f32_e32 v2, v0, v2
369; SI-NEXT:    v_trunc_f32_e32 v2, v2
370; SI-NEXT:    v_fma_f32 v0, -v2, v1, v0
371; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
372; SI-NEXT:    buffer_store_short v0, off, s[8:11], 0
373; SI-NEXT:    s_endpgm
374;
375; CI-LABEL: unsafe_frem_f16:
376; CI:       ; %bb.0:
377; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
378; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
379; CI-NEXT:    s_mov_b32 s11, 0xf000
380; CI-NEXT:    s_mov_b32 s10, -1
381; CI-NEXT:    s_mov_b32 s2, s10
382; CI-NEXT:    s_mov_b32 s3, s11
383; CI-NEXT:    s_waitcnt lgkmcnt(0)
384; CI-NEXT:    buffer_load_ushort v1, off, s[0:3], 0 offset:8
385; CI-NEXT:    s_mov_b32 s8, s4
386; CI-NEXT:    s_mov_b32 s9, s5
387; CI-NEXT:    s_mov_b32 s4, s6
388; CI-NEXT:    s_mov_b32 s5, s7
389; CI-NEXT:    s_mov_b32 s6, s10
390; CI-NEXT:    s_mov_b32 s7, s11
391; CI-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
392; CI-NEXT:    s_waitcnt vmcnt(1)
393; CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
394; CI-NEXT:    v_rcp_f32_e32 v2, v1
395; CI-NEXT:    s_waitcnt vmcnt(0)
396; CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
397; CI-NEXT:    v_mul_f32_e32 v2, v0, v2
398; CI-NEXT:    v_trunc_f32_e32 v2, v2
399; CI-NEXT:    v_fma_f32 v0, -v2, v1, v0
400; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
401; CI-NEXT:    buffer_store_short v0, off, s[8:11], 0
402; CI-NEXT:    s_endpgm
403;
404; VI-LABEL: unsafe_frem_f16:
405; VI:       ; %bb.0:
406; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
407; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
408; VI-NEXT:    s_waitcnt lgkmcnt(0)
409; VI-NEXT:    v_mov_b32_e32 v2, s6
410; VI-NEXT:    s_add_u32 s0, s0, 8
411; VI-NEXT:    v_mov_b32_e32 v3, s7
412; VI-NEXT:    s_addc_u32 s1, s1, 0
413; VI-NEXT:    flat_load_ushort v4, v[2:3]
414; VI-NEXT:    v_mov_b32_e32 v3, s1
415; VI-NEXT:    v_mov_b32_e32 v2, s0
416; VI-NEXT:    flat_load_ushort v2, v[2:3]
417; VI-NEXT:    v_mov_b32_e32 v0, s4
418; VI-NEXT:    v_mov_b32_e32 v1, s5
419; VI-NEXT:    s_waitcnt vmcnt(0)
420; VI-NEXT:    v_rcp_f16_e32 v3, v2
421; VI-NEXT:    v_mul_f16_e32 v3, v4, v3
422; VI-NEXT:    v_trunc_f16_e32 v3, v3
423; VI-NEXT:    v_fma_f16 v2, -v3, v2, v4
424; VI-NEXT:    flat_store_short v[0:1], v2
425; VI-NEXT:    s_endpgm
426;
427; GFX9-LABEL: unsafe_frem_f16:
428; GFX9:       ; %bb.0:
429; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
430; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
431; GFX9-NEXT:    v_mov_b32_e32 v0, 0
432; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
433; GFX9-NEXT:    global_load_ushort v1, v0, s[6:7]
434; GFX9-NEXT:    global_load_ushort v2, v0, s[2:3] offset:8
435; GFX9-NEXT:    s_waitcnt vmcnt(0)
436; GFX9-NEXT:    v_rcp_f16_e32 v3, v2
437; GFX9-NEXT:    v_mul_f16_e32 v3, v1, v3
438; GFX9-NEXT:    v_trunc_f16_e32 v3, v3
439; GFX9-NEXT:    v_fma_f16 v1, -v3, v2, v1
440; GFX9-NEXT:    global_store_short v0, v1, s[4:5]
441; GFX9-NEXT:    s_endpgm
442;
443; GFX10-LABEL: unsafe_frem_f16:
444; GFX10:       ; %bb.0:
445; GFX10-NEXT:    s_clause 0x1
446; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
447; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
448; GFX10-NEXT:    v_mov_b32_e32 v0, 0
449; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
450; GFX10-NEXT:    s_clause 0x1
451; GFX10-NEXT:    global_load_ushort v1, v0, s[6:7]
452; GFX10-NEXT:    global_load_ushort v2, v0, s[2:3] offset:8
453; GFX10-NEXT:    s_waitcnt vmcnt(0)
454; GFX10-NEXT:    v_rcp_f16_e32 v3, v2
455; GFX10-NEXT:    v_mul_f16_e32 v3, v1, v3
456; GFX10-NEXT:    v_trunc_f16_e32 v3, v3
457; GFX10-NEXT:    v_fma_f16 v1, -v3, v2, v1
458; GFX10-NEXT:    global_store_short v0, v1, s[4:5]
459; GFX10-NEXT:    s_endpgm
460;
461; GFX11-LABEL: unsafe_frem_f16:
462; GFX11:       ; %bb.0:
463; GFX11-NEXT:    s_clause 0x1
464; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
465; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
466; GFX11-NEXT:    v_mov_b32_e32 v0, 0
467; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
468; GFX11-NEXT:    s_clause 0x1
469; GFX11-NEXT:    global_load_u16 v1, v0, s[6:7]
470; GFX11-NEXT:    global_load_u16 v2, v0, s[0:1] offset:8
471; GFX11-NEXT:    s_waitcnt vmcnt(0)
472; GFX11-NEXT:    v_rcp_f16_e32 v3, v2
473; GFX11-NEXT:    s_waitcnt_depctr 0xfff
474; GFX11-NEXT:    v_mul_f16_e32 v3, v1, v3
475; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
476; GFX11-NEXT:    v_trunc_f16_e32 v3, v3
477; GFX11-NEXT:    v_fma_f16 v1, -v3, v2, v1
478; GFX11-NEXT:    global_store_b16 v0, v1, s[4:5]
479; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
480; GFX11-NEXT:    s_endpgm
481                             half addrspace(1)* %in2) #1 {
  ; The divisor is read from element 4 of %in2, not from its base address.
482   %gep2 = getelementptr half, half addrspace(1)* %in2, i32 4
483   %r0 = load half, half addrspace(1)* %in1, align 4
484   %r1 = load half, half addrspace(1)* %gep2, align 4
  ; Only `afn` (approximate functions) is set here, not the full `fast` set.
485   %r2 = frem afn half %r0, %r1
486   store half %r2, half addrspace(1)* %out, align 4
487   ret void
488}
489
; frem_f32: remainder of two float values with no fast-math flags on the frem.
; Reads one float from %in1 and one from element 4 of %in2, and stores the
; remainder to %out. Element 4 of a float array is 16 bytes in, which shows up in
; the assertions below either as `offset:16` on the load or as an `s_add_u32 ..., 16`.
; Expected lowering, as pinned by the assertions, has the same shape on every run:
; two v_div_scale_f32, v_rcp_f32, a chain of fma/mul refinement steps, then
; v_div_fmas_f32, v_div_fixup_f32, v_trunc_f32 and a final v_fma_f32 of the negated
; truncated quotient. The refinement chain is bracketed by a pair of
; s_setreg_imm32_b32 writes to HW_REG_MODE on the SI, CI, VI and GFX9 runs, and by a
; pair of s_denorm_mode instructions on the GFX10 and GFX11 runs.
; The assertions are autogenerated (see the NOTE at the top of the file); regenerate
; them with the update script rather than editing them by hand.
; Attribute group #0 is defined outside the part of the file visible here.
490define amdgpu_kernel void @frem_f32(float addrspace(1)* %out, float addrspace(1)* %in1,
491; SI-LABEL: frem_f32:
492; SI:       ; %bb.0:
493; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
494; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
495; SI-NEXT:    s_mov_b32 s11, 0xf000
496; SI-NEXT:    s_mov_b32 s10, -1
497; SI-NEXT:    s_waitcnt lgkmcnt(0)
498; SI-NEXT:    s_mov_b32 s8, s4
499; SI-NEXT:    s_mov_b32 s9, s5
500; SI-NEXT:    s_mov_b32 s4, s6
501; SI-NEXT:    s_mov_b32 s5, s7
502; SI-NEXT:    s_mov_b32 s6, s10
503; SI-NEXT:    s_mov_b32 s7, s11
504; SI-NEXT:    s_mov_b32 s2, s10
505; SI-NEXT:    s_mov_b32 s3, s11
506; SI-NEXT:    buffer_load_dword v0, off, s[4:7], 0
507; SI-NEXT:    buffer_load_dword v1, off, s[0:3], 0 offset:16
508; SI-NEXT:    s_waitcnt vmcnt(0)
509; SI-NEXT:    v_div_scale_f32 v2, vcc, v0, v1, v0
510; SI-NEXT:    v_div_scale_f32 v3, s[0:1], v1, v1, v0
511; SI-NEXT:    v_rcp_f32_e32 v4, v3
512; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
513; SI-NEXT:    v_fma_f32 v5, -v3, v4, 1.0
514; SI-NEXT:    v_fma_f32 v4, v5, v4, v4
515; SI-NEXT:    v_mul_f32_e32 v5, v2, v4
516; SI-NEXT:    v_fma_f32 v6, -v3, v5, v2
517; SI-NEXT:    v_fma_f32 v5, v6, v4, v5
518; SI-NEXT:    v_fma_f32 v2, -v3, v5, v2
519; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
520; SI-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
521; SI-NEXT:    v_div_fixup_f32 v2, v2, v1, v0
522; SI-NEXT:    v_trunc_f32_e32 v2, v2
523; SI-NEXT:    v_fma_f32 v0, -v2, v1, v0
524; SI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
525; SI-NEXT:    s_endpgm
526;
527; CI-LABEL: frem_f32:
528; CI:       ; %bb.0:
529; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
530; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
531; CI-NEXT:    s_mov_b32 s11, 0xf000
532; CI-NEXT:    s_mov_b32 s10, -1
533; CI-NEXT:    s_mov_b32 s2, s10
534; CI-NEXT:    s_waitcnt lgkmcnt(0)
535; CI-NEXT:    s_mov_b32 s8, s4
536; CI-NEXT:    s_mov_b32 s9, s5
537; CI-NEXT:    s_mov_b32 s4, s6
538; CI-NEXT:    s_mov_b32 s5, s7
539; CI-NEXT:    s_mov_b32 s6, s10
540; CI-NEXT:    s_mov_b32 s7, s11
541; CI-NEXT:    s_mov_b32 s3, s11
542; CI-NEXT:    buffer_load_dword v0, off, s[4:7], 0
543; CI-NEXT:    buffer_load_dword v1, off, s[0:3], 0 offset:16
544; CI-NEXT:    s_waitcnt vmcnt(0)
545; CI-NEXT:    v_div_scale_f32 v3, s[0:1], v1, v1, v0
546; CI-NEXT:    v_div_scale_f32 v2, vcc, v0, v1, v0
547; CI-NEXT:    v_rcp_f32_e32 v4, v3
548; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
549; CI-NEXT:    v_fma_f32 v5, -v3, v4, 1.0
550; CI-NEXT:    v_fma_f32 v4, v5, v4, v4
551; CI-NEXT:    v_mul_f32_e32 v5, v2, v4
552; CI-NEXT:    v_fma_f32 v6, -v3, v5, v2
553; CI-NEXT:    v_fma_f32 v5, v6, v4, v5
554; CI-NEXT:    v_fma_f32 v2, -v3, v5, v2
555; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
556; CI-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
557; CI-NEXT:    v_div_fixup_f32 v2, v2, v1, v0
558; CI-NEXT:    v_trunc_f32_e32 v2, v2
559; CI-NEXT:    v_fma_f32 v0, -v2, v1, v0
560; CI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
561; CI-NEXT:    s_endpgm
562;
563; VI-LABEL: frem_f32:
564; VI:       ; %bb.0:
565; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
566; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
567; VI-NEXT:    s_waitcnt lgkmcnt(0)
568; VI-NEXT:    v_mov_b32_e32 v2, s6
569; VI-NEXT:    s_add_u32 s0, s0, 16
570; VI-NEXT:    v_mov_b32_e32 v3, s7
571; VI-NEXT:    s_addc_u32 s1, s1, 0
572; VI-NEXT:    flat_load_dword v4, v[2:3]
573; VI-NEXT:    v_mov_b32_e32 v3, s1
574; VI-NEXT:    v_mov_b32_e32 v2, s0
575; VI-NEXT:    flat_load_dword v2, v[2:3]
576; VI-NEXT:    v_mov_b32_e32 v0, s4
577; VI-NEXT:    v_mov_b32_e32 v1, s5
578; VI-NEXT:    s_waitcnt vmcnt(0)
579; VI-NEXT:    v_div_scale_f32 v5, s[0:1], v2, v2, v4
580; VI-NEXT:    v_div_scale_f32 v3, vcc, v4, v2, v4
581; VI-NEXT:    v_rcp_f32_e32 v6, v5
582; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
583; VI-NEXT:    v_fma_f32 v7, -v5, v6, 1.0
584; VI-NEXT:    v_fma_f32 v6, v7, v6, v6
585; VI-NEXT:    v_mul_f32_e32 v7, v3, v6
586; VI-NEXT:    v_fma_f32 v8, -v5, v7, v3
587; VI-NEXT:    v_fma_f32 v7, v8, v6, v7
588; VI-NEXT:    v_fma_f32 v3, -v5, v7, v3
589; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
590; VI-NEXT:    v_div_fmas_f32 v3, v3, v6, v7
591; VI-NEXT:    v_div_fixup_f32 v3, v3, v2, v4
592; VI-NEXT:    v_trunc_f32_e32 v3, v3
593; VI-NEXT:    v_fma_f32 v2, -v3, v2, v4
594; VI-NEXT:    flat_store_dword v[0:1], v2
595; VI-NEXT:    s_endpgm
596;
597; GFX9-LABEL: frem_f32:
598; GFX9:       ; %bb.0:
599; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
600; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
601; GFX9-NEXT:    v_mov_b32_e32 v0, 0
602; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
603; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
604; GFX9-NEXT:    global_load_dword v2, v0, s[2:3] offset:16
605; GFX9-NEXT:    s_waitcnt vmcnt(0)
606; GFX9-NEXT:    v_div_scale_f32 v4, s[0:1], v2, v2, v1
607; GFX9-NEXT:    v_div_scale_f32 v3, vcc, v1, v2, v1
608; GFX9-NEXT:    v_rcp_f32_e32 v5, v4
609; GFX9-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
610; GFX9-NEXT:    v_fma_f32 v6, -v4, v5, 1.0
611; GFX9-NEXT:    v_fma_f32 v5, v6, v5, v5
612; GFX9-NEXT:    v_mul_f32_e32 v6, v3, v5
613; GFX9-NEXT:    v_fma_f32 v7, -v4, v6, v3
614; GFX9-NEXT:    v_fma_f32 v6, v7, v5, v6
615; GFX9-NEXT:    v_fma_f32 v3, -v4, v6, v3
616; GFX9-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
617; GFX9-NEXT:    v_div_fmas_f32 v3, v3, v5, v6
618; GFX9-NEXT:    v_div_fixup_f32 v3, v3, v2, v1
619; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
620; GFX9-NEXT:    v_fma_f32 v1, -v3, v2, v1
621; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
622; GFX9-NEXT:    s_endpgm
623;
624; GFX10-LABEL: frem_f32:
625; GFX10:       ; %bb.0:
626; GFX10-NEXT:    s_clause 0x1
627; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
628; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
629; GFX10-NEXT:    v_mov_b32_e32 v0, 0
630; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
631; GFX10-NEXT:    s_clause 0x1
632; GFX10-NEXT:    global_load_dword v1, v0, s[6:7]
633; GFX10-NEXT:    global_load_dword v2, v0, s[2:3] offset:16
634; GFX10-NEXT:    s_waitcnt vmcnt(0)
635; GFX10-NEXT:    v_div_scale_f32 v4, s0, v2, v2, v1
636; GFX10-NEXT:    v_div_scale_f32 v3, vcc_lo, v1, v2, v1
637; GFX10-NEXT:    v_rcp_f32_e32 v5, v4
638; GFX10-NEXT:    s_denorm_mode 15
639; GFX10-NEXT:    v_fma_f32 v6, -v4, v5, 1.0
640; GFX10-NEXT:    v_fmac_f32_e32 v5, v6, v5
641; GFX10-NEXT:    v_mul_f32_e32 v6, v3, v5
642; GFX10-NEXT:    v_fma_f32 v7, -v4, v6, v3
643; GFX10-NEXT:    v_fmac_f32_e32 v6, v7, v5
644; GFX10-NEXT:    v_fma_f32 v3, -v4, v6, v3
645; GFX10-NEXT:    s_denorm_mode 12
646; GFX10-NEXT:    v_div_fmas_f32 v3, v3, v5, v6
647; GFX10-NEXT:    v_div_fixup_f32 v3, v3, v2, v1
648; GFX10-NEXT:    v_trunc_f32_e32 v3, v3
649; GFX10-NEXT:    v_fma_f32 v1, -v3, v2, v1
650; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
651; GFX10-NEXT:    s_endpgm
652;
653; GFX11-LABEL: frem_f32:
654; GFX11:       ; %bb.0:
655; GFX11-NEXT:    s_clause 0x1
656; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
657; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
658; GFX11-NEXT:    v_mov_b32_e32 v0, 0
659; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
660; GFX11-NEXT:    s_clause 0x1
661; GFX11-NEXT:    global_load_b32 v1, v0, s[6:7]
662; GFX11-NEXT:    global_load_b32 v2, v0, s[0:1] offset:16
663; GFX11-NEXT:    s_waitcnt vmcnt(0)
664; GFX11-NEXT:    v_div_scale_f32 v4, null, v2, v2, v1
665; GFX11-NEXT:    v_div_scale_f32 v3, vcc_lo, v1, v2, v1
666; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
667; GFX11-NEXT:    v_rcp_f32_e32 v5, v4
668; GFX11-NEXT:    s_denorm_mode 15
669; GFX11-NEXT:    s_waitcnt_depctr 0xfff
670; GFX11-NEXT:    v_fma_f32 v6, -v4, v5, 1.0
671; GFX11-NEXT:    v_fmac_f32_e32 v5, v6, v5
672; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
673; GFX11-NEXT:    v_mul_f32_e32 v6, v3, v5
674; GFX11-NEXT:    v_fma_f32 v7, -v4, v6, v3
675; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
676; GFX11-NEXT:    v_fmac_f32_e32 v6, v7, v5
677; GFX11-NEXT:    v_fma_f32 v3, -v4, v6, v3
678; GFX11-NEXT:    s_denorm_mode 12
679; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
680; GFX11-NEXT:    v_div_fmas_f32 v3, v3, v5, v6
681; GFX11-NEXT:    v_div_fixup_f32 v3, v3, v2, v1
682; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
683; GFX11-NEXT:    v_trunc_f32_e32 v3, v3
684; GFX11-NEXT:    v_fma_f32 v1, -v3, v2, v1
685; GFX11-NEXT:    global_store_b32 v0, v1, s[4:5]
686; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
687; GFX11-NEXT:    s_endpgm
688                      float addrspace(1)* %in2) #0 {
  ; The divisor is read from element 4 of %in2, not from its base address.
689   %gep2 = getelementptr float, float addrspace(1)* %in2, i32 4
690   %r0 = load float, float addrspace(1)* %in1, align 4
691   %r1 = load float, float addrspace(1)* %gep2, align 4
  ; Plain frem: no fast-math flags, so the full-precision expansion is expected.
692   %r2 = frem float %r0, %r1
693   store float %r2, float addrspace(1)* %out, align 4
694   ret void
695}
696
697define amdgpu_kernel void @fast_frem_f32(float addrspace(1)* %out, float addrspace(1)* %in1,
698; SI-LABEL: fast_frem_f32:
699; SI:       ; %bb.0:
700; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
701; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
702; SI-NEXT:    s_mov_b32 s11, 0xf000
703; SI-NEXT:    s_mov_b32 s10, -1
704; SI-NEXT:    s_waitcnt lgkmcnt(0)
705; SI-NEXT:    s_mov_b32 s8, s4
706; SI-NEXT:    s_mov_b32 s9, s5
707; SI-NEXT:    s_mov_b32 s4, s6
708; SI-NEXT:    s_mov_b32 s5, s7
709; SI-NEXT:    s_mov_b32 s6, s10
710; SI-NEXT:    s_mov_b32 s7, s11
711; SI-NEXT:    s_mov_b32 s2, s10
712; SI-NEXT:    s_mov_b32 s3, s11
713; SI-NEXT:    buffer_load_dword v0, off, s[4:7], 0
714; SI-NEXT:    buffer_load_dword v1, off, s[0:3], 0 offset:16
715; SI-NEXT:    s_waitcnt vmcnt(0)
716; SI-NEXT:    v_rcp_f32_e32 v2, v1
717; SI-NEXT:    v_mul_f32_e32 v2, v0, v2
718; SI-NEXT:    v_trunc_f32_e32 v2, v2
719; SI-NEXT:    v_fma_f32 v0, -v2, v1, v0
720; SI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
721; SI-NEXT:    s_endpgm
722;
723; CI-LABEL: fast_frem_f32:
724; CI:       ; %bb.0:
725; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
726; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
727; CI-NEXT:    s_mov_b32 s11, 0xf000
728; CI-NEXT:    s_mov_b32 s10, -1
729; CI-NEXT:    s_mov_b32 s2, s10
730; CI-NEXT:    s_waitcnt lgkmcnt(0)
731; CI-NEXT:    s_mov_b32 s8, s4
732; CI-NEXT:    s_mov_b32 s9, s5
733; CI-NEXT:    s_mov_b32 s4, s6
734; CI-NEXT:    s_mov_b32 s5, s7
735; CI-NEXT:    s_mov_b32 s6, s10
736; CI-NEXT:    s_mov_b32 s7, s11
737; CI-NEXT:    s_mov_b32 s3, s11
738; CI-NEXT:    buffer_load_dword v0, off, s[4:7], 0
739; CI-NEXT:    buffer_load_dword v1, off, s[0:3], 0 offset:16
740; CI-NEXT:    s_waitcnt vmcnt(0)
741; CI-NEXT:    v_rcp_f32_e32 v2, v1
742; CI-NEXT:    v_mul_f32_e32 v2, v0, v2
743; CI-NEXT:    v_trunc_f32_e32 v2, v2
744; CI-NEXT:    v_fma_f32 v0, -v2, v1, v0
745; CI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
746; CI-NEXT:    s_endpgm
747;
748; VI-LABEL: fast_frem_f32:
749; VI:       ; %bb.0:
750; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
751; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
752; VI-NEXT:    s_waitcnt lgkmcnt(0)
753; VI-NEXT:    v_mov_b32_e32 v2, s6
754; VI-NEXT:    s_add_u32 s0, s0, 16
755; VI-NEXT:    v_mov_b32_e32 v3, s7
756; VI-NEXT:    s_addc_u32 s1, s1, 0
757; VI-NEXT:    flat_load_dword v4, v[2:3]
758; VI-NEXT:    v_mov_b32_e32 v3, s1
759; VI-NEXT:    v_mov_b32_e32 v2, s0
760; VI-NEXT:    flat_load_dword v2, v[2:3]
761; VI-NEXT:    v_mov_b32_e32 v0, s4
762; VI-NEXT:    v_mov_b32_e32 v1, s5
763; VI-NEXT:    s_waitcnt vmcnt(0)
764; VI-NEXT:    v_rcp_f32_e32 v3, v2
765; VI-NEXT:    v_mul_f32_e32 v3, v4, v3
766; VI-NEXT:    v_trunc_f32_e32 v3, v3
767; VI-NEXT:    v_fma_f32 v2, -v3, v2, v4
768; VI-NEXT:    flat_store_dword v[0:1], v2
769; VI-NEXT:    s_endpgm
770;
771; GFX9-LABEL: fast_frem_f32:
772; GFX9:       ; %bb.0:
773; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
774; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
775; GFX9-NEXT:    v_mov_b32_e32 v0, 0
776; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
777; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
778; GFX9-NEXT:    global_load_dword v2, v0, s[2:3] offset:16
779; GFX9-NEXT:    s_waitcnt vmcnt(0)
780; GFX9-NEXT:    v_rcp_f32_e32 v3, v2
781; GFX9-NEXT:    v_mul_f32_e32 v3, v1, v3
782; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
783; GFX9-NEXT:    v_fma_f32 v1, -v3, v2, v1
784; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
785; GFX9-NEXT:    s_endpgm
786;
787; GFX10-LABEL: fast_frem_f32:
788; GFX10:       ; %bb.0:
789; GFX10-NEXT:    s_clause 0x1
790; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
791; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
792; GFX10-NEXT:    v_mov_b32_e32 v0, 0
793; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
794; GFX10-NEXT:    s_clause 0x1
795; GFX10-NEXT:    global_load_dword v1, v0, s[6:7]
796; GFX10-NEXT:    global_load_dword v2, v0, s[2:3] offset:16
797; GFX10-NEXT:    s_waitcnt vmcnt(0)
798; GFX10-NEXT:    v_rcp_f32_e32 v3, v2
799; GFX10-NEXT:    v_mul_f32_e32 v3, v1, v3
800; GFX10-NEXT:    v_trunc_f32_e32 v3, v3
801; GFX10-NEXT:    v_fma_f32 v1, -v3, v2, v1
802; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
803; GFX10-NEXT:    s_endpgm
804;
805; GFX11-LABEL: fast_frem_f32:
806; GFX11:       ; %bb.0:
807; GFX11-NEXT:    s_clause 0x1
808; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
809; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
810; GFX11-NEXT:    v_mov_b32_e32 v0, 0
811; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
812; GFX11-NEXT:    s_clause 0x1
813; GFX11-NEXT:    global_load_b32 v1, v0, s[6:7]
814; GFX11-NEXT:    global_load_b32 v2, v0, s[0:1] offset:16
815; GFX11-NEXT:    s_waitcnt vmcnt(0)
816; GFX11-NEXT:    v_rcp_f32_e32 v3, v2
817; GFX11-NEXT:    s_waitcnt_depctr 0xfff
818; GFX11-NEXT:    v_mul_f32_e32 v3, v1, v3
819; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
820; GFX11-NEXT:    v_trunc_f32_e32 v3, v3
821; GFX11-NEXT:    v_fma_f32 v1, -v3, v2, v1
822; GFX11-NEXT:    global_store_b32 v0, v1, s[4:5]
823; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
824; GFX11-NEXT:    s_endpgm
825                      float addrspace(1)* %in2) #0 {
826   %gep2 = getelementptr float, float addrspace(1)* %in2, i32 4
827   %r0 = load float, float addrspace(1)* %in1, align 4
828   %r1 = load float, float addrspace(1)* %gep2, align 4
829   %r2 = frem fast float %r0, %r1
830   store float %r2, float addrspace(1)* %out, align 4
831   ret void
832}
833
; unsafe_frem_f32: codegen test for `frem afn float`.
;
; The kernel loads the dividend from %in1 and the divisor from element 4 of
; %in2 (visible as a 16-byte offset in the expected address computation) and
; stores the remainder to %out. It uses attribute group #1, whose definition
; is outside this part of the file.
;
; For every RUN configuration the expected sequence is v_rcp_f32, v_mul_f32,
; v_trunc_f32, v_fma_f32, i.e. x - trunc(x * rcp(y)) * y; none of the check
; blocks contain v_div_scale / v_div_fmas / v_div_fixup instructions.
;
; The assertions below are autogenerated (see the NOTE on the first line of
; the file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
834define amdgpu_kernel void @unsafe_frem_f32(float addrspace(1)* %out, float addrspace(1)* %in1,
835; SI-LABEL: unsafe_frem_f32:
836; SI:       ; %bb.0:
837; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
838; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
839; SI-NEXT:    s_mov_b32 s11, 0xf000
840; SI-NEXT:    s_mov_b32 s10, -1
841; SI-NEXT:    s_waitcnt lgkmcnt(0)
842; SI-NEXT:    s_mov_b32 s8, s4
843; SI-NEXT:    s_mov_b32 s9, s5
844; SI-NEXT:    s_mov_b32 s4, s6
845; SI-NEXT:    s_mov_b32 s5, s7
846; SI-NEXT:    s_mov_b32 s6, s10
847; SI-NEXT:    s_mov_b32 s7, s11
848; SI-NEXT:    s_mov_b32 s2, s10
849; SI-NEXT:    s_mov_b32 s3, s11
850; SI-NEXT:    buffer_load_dword v0, off, s[4:7], 0
851; SI-NEXT:    buffer_load_dword v1, off, s[0:3], 0 offset:16
852; SI-NEXT:    s_waitcnt vmcnt(0)
853; SI-NEXT:    v_rcp_f32_e32 v2, v1
854; SI-NEXT:    v_mul_f32_e32 v2, v0, v2
855; SI-NEXT:    v_trunc_f32_e32 v2, v2
856; SI-NEXT:    v_fma_f32 v0, -v2, v1, v0
857; SI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
858; SI-NEXT:    s_endpgm
859;
860; CI-LABEL: unsafe_frem_f32:
861; CI:       ; %bb.0:
862; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
863; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
864; CI-NEXT:    s_mov_b32 s11, 0xf000
865; CI-NEXT:    s_mov_b32 s10, -1
866; CI-NEXT:    s_mov_b32 s2, s10
867; CI-NEXT:    s_waitcnt lgkmcnt(0)
868; CI-NEXT:    s_mov_b32 s8, s4
869; CI-NEXT:    s_mov_b32 s9, s5
870; CI-NEXT:    s_mov_b32 s4, s6
871; CI-NEXT:    s_mov_b32 s5, s7
872; CI-NEXT:    s_mov_b32 s6, s10
873; CI-NEXT:    s_mov_b32 s7, s11
874; CI-NEXT:    s_mov_b32 s3, s11
875; CI-NEXT:    buffer_load_dword v0, off, s[4:7], 0
876; CI-NEXT:    buffer_load_dword v1, off, s[0:3], 0 offset:16
877; CI-NEXT:    s_waitcnt vmcnt(0)
878; CI-NEXT:    v_rcp_f32_e32 v2, v1
879; CI-NEXT:    v_mul_f32_e32 v2, v0, v2
880; CI-NEXT:    v_trunc_f32_e32 v2, v2
881; CI-NEXT:    v_fma_f32 v0, -v2, v1, v0
882; CI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
883; CI-NEXT:    s_endpgm
884;
885; VI-LABEL: unsafe_frem_f32:
886; VI:       ; %bb.0:
887; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
888; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
889; VI-NEXT:    s_waitcnt lgkmcnt(0)
890; VI-NEXT:    v_mov_b32_e32 v2, s6
891; VI-NEXT:    s_add_u32 s0, s0, 16
892; VI-NEXT:    v_mov_b32_e32 v3, s7
893; VI-NEXT:    s_addc_u32 s1, s1, 0
894; VI-NEXT:    flat_load_dword v4, v[2:3]
895; VI-NEXT:    v_mov_b32_e32 v3, s1
896; VI-NEXT:    v_mov_b32_e32 v2, s0
897; VI-NEXT:    flat_load_dword v2, v[2:3]
898; VI-NEXT:    v_mov_b32_e32 v0, s4
899; VI-NEXT:    v_mov_b32_e32 v1, s5
900; VI-NEXT:    s_waitcnt vmcnt(0)
901; VI-NEXT:    v_rcp_f32_e32 v3, v2
902; VI-NEXT:    v_mul_f32_e32 v3, v4, v3
903; VI-NEXT:    v_trunc_f32_e32 v3, v3
904; VI-NEXT:    v_fma_f32 v2, -v3, v2, v4
905; VI-NEXT:    flat_store_dword v[0:1], v2
906; VI-NEXT:    s_endpgm
907;
908; GFX9-LABEL: unsafe_frem_f32:
909; GFX9:       ; %bb.0:
910; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
911; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
912; GFX9-NEXT:    v_mov_b32_e32 v0, 0
913; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
914; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
915; GFX9-NEXT:    global_load_dword v2, v0, s[2:3] offset:16
916; GFX9-NEXT:    s_waitcnt vmcnt(0)
917; GFX9-NEXT:    v_rcp_f32_e32 v3, v2
918; GFX9-NEXT:    v_mul_f32_e32 v3, v1, v3
919; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
920; GFX9-NEXT:    v_fma_f32 v1, -v3, v2, v1
921; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
922; GFX9-NEXT:    s_endpgm
923;
924; GFX10-LABEL: unsafe_frem_f32:
925; GFX10:       ; %bb.0:
926; GFX10-NEXT:    s_clause 0x1
927; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
928; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
929; GFX10-NEXT:    v_mov_b32_e32 v0, 0
930; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
931; GFX10-NEXT:    s_clause 0x1
932; GFX10-NEXT:    global_load_dword v1, v0, s[6:7]
933; GFX10-NEXT:    global_load_dword v2, v0, s[2:3] offset:16
934; GFX10-NEXT:    s_waitcnt vmcnt(0)
935; GFX10-NEXT:    v_rcp_f32_e32 v3, v2
936; GFX10-NEXT:    v_mul_f32_e32 v3, v1, v3
937; GFX10-NEXT:    v_trunc_f32_e32 v3, v3
938; GFX10-NEXT:    v_fma_f32 v1, -v3, v2, v1
939; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
940; GFX10-NEXT:    s_endpgm
941;
942; GFX11-LABEL: unsafe_frem_f32:
943; GFX11:       ; %bb.0:
944; GFX11-NEXT:    s_clause 0x1
945; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
946; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
947; GFX11-NEXT:    v_mov_b32_e32 v0, 0
948; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
949; GFX11-NEXT:    s_clause 0x1
950; GFX11-NEXT:    global_load_b32 v1, v0, s[6:7]
951; GFX11-NEXT:    global_load_b32 v2, v0, s[0:1] offset:16
952; GFX11-NEXT:    s_waitcnt vmcnt(0)
953; GFX11-NEXT:    v_rcp_f32_e32 v3, v2
954; GFX11-NEXT:    s_waitcnt_depctr 0xfff
955; GFX11-NEXT:    v_mul_f32_e32 v3, v1, v3
956; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
957; GFX11-NEXT:    v_trunc_f32_e32 v3, v3
958; GFX11-NEXT:    v_fma_f32 v1, -v3, v2, v1
959; GFX11-NEXT:    global_store_b32 v0, v1, s[4:5]
960; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
961; GFX11-NEXT:    s_endpgm
962                             float addrspace(1)* %in2) #1 {
  ; The divisor is read from %in2[4]; the dividend is read from %in1[0].
963   %gep2 = getelementptr float, float addrspace(1)* %in2, i32 4
964   %r0 = load float, float addrspace(1)* %in1, align 4
965   %r1 = load float, float addrspace(1)* %gep2, align 4
  ; Only the `afn` fast-math flag is set on the operation under test.
966   %r2 = frem afn float %r0, %r1
967   store float %r2, float addrspace(1)* %out, align 4
968   ret void
969}
970
; frem_f64: codegen test for a plain `frem double` (no fast-math flags).
;
; The kernel loads the dividend from %in1 and the divisor from %in2 (both at
; offset 0) and stores the remainder to %out. It uses attribute group #0,
; whose definition is outside this part of the file.
;
; Every check block expects the long division sequence: two v_div_scale_f64,
; v_rcp_f64 with v_fma_f64 refinement steps, v_div_fmas_f64 and
; v_div_fixup_f64. The quotient is then truncated and a final v_fma_f64
; computes x - trunc(q) * y.
;
; The checks for the first RUN line (no -mcpu) differ from the others in two
; ways visible below:
;   * they contain no v_trunc_f64; an integer sequence (v_bfe_u32,
;     v_lshr_b64, v_not/v_and, v_cndmask) appears in its place -- presumably
;     the expansion of the f64 truncation for that target;
;   * vcc for v_div_fmas_f64 is built with two v_cmp_eq_u32 plus s_xor_b64
;     instead of being written by the second v_div_scale_f64.
;
; The assertions below are autogenerated (see the NOTE on the first line of
; the file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
971define amdgpu_kernel void @frem_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
972; SI-LABEL: frem_f64:
973; SI:       ; %bb.0:
974; SI-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x9
975; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
976; SI-NEXT:    s_mov_b32 s7, 0xf000
977; SI-NEXT:    s_mov_b32 s6, -1
978; SI-NEXT:    s_waitcnt lgkmcnt(0)
979; SI-NEXT:    s_mov_b32 s4, s8
980; SI-NEXT:    s_mov_b32 s5, s9
981; SI-NEXT:    s_mov_b32 s8, s10
982; SI-NEXT:    s_mov_b32 s9, s11
983; SI-NEXT:    s_mov_b32 s10, s6
984; SI-NEXT:    s_mov_b32 s11, s7
985; SI-NEXT:    s_mov_b32 s2, s6
986; SI-NEXT:    s_mov_b32 s3, s7
987; SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
988; SI-NEXT:    buffer_load_dwordx2 v[2:3], off, s[0:3], 0
989; SI-NEXT:    s_waitcnt vmcnt(0)
990; SI-NEXT:    v_div_scale_f64 v[4:5], s[0:1], v[2:3], v[2:3], v[0:1]
991; SI-NEXT:    v_rcp_f64_e32 v[6:7], v[4:5]
992; SI-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
993; SI-NEXT:    v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
994; SI-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
995; SI-NEXT:    v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
996; SI-NEXT:    v_div_scale_f64 v[8:9], s[0:1], v[0:1], v[2:3], v[0:1]
997; SI-NEXT:    v_mul_f64 v[10:11], v[8:9], v[6:7]
998; SI-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[10:11], v[8:9]
999; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
1000; SI-NEXT:    v_cmp_eq_u32_e64 s[0:1], v1, v9
1001; SI-NEXT:    s_xor_b64 vcc, s[0:1], vcc
1002; SI-NEXT:    s_nop 1
1003; SI-NEXT:    v_div_fmas_f64 v[4:5], v[12:13], v[6:7], v[10:11]
1004; SI-NEXT:    v_div_fixup_f64 v[4:5], v[4:5], v[2:3], v[0:1]
1005; SI-NEXT:    v_bfe_u32 v6, v5, 20, 11
1006; SI-NEXT:    v_add_i32_e32 v8, vcc, 0xfffffc01, v6
1007; SI-NEXT:    s_mov_b32 s1, 0xfffff
1008; SI-NEXT:    s_mov_b32 s0, s6
1009; SI-NEXT:    v_lshr_b64 v[6:7], s[0:1], v8
1010; SI-NEXT:    v_not_b32_e32 v6, v6
1011; SI-NEXT:    v_and_b32_e32 v6, v4, v6
1012; SI-NEXT:    v_not_b32_e32 v7, v7
1013; SI-NEXT:    v_and_b32_e32 v7, v5, v7
1014; SI-NEXT:    v_and_b32_e32 v9, 0x80000000, v5
1015; SI-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v8
1016; SI-NEXT:    v_cndmask_b32_e32 v7, v7, v9, vcc
1017; SI-NEXT:    v_cmp_lt_i32_e64 s[0:1], 51, v8
1018; SI-NEXT:    v_cndmask_b32_e64 v5, v7, v5, s[0:1]
1019; SI-NEXT:    v_cndmask_b32_e64 v6, v6, 0, vcc
1020; SI-NEXT:    v_cndmask_b32_e64 v4, v6, v4, s[0:1]
1021; SI-NEXT:    v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
1022; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1023; SI-NEXT:    s_endpgm
1024;
1025; CI-LABEL: frem_f64:
1026; CI:       ; %bb.0:
1027; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
1028; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
1029; CI-NEXT:    s_mov_b32 s11, 0xf000
1030; CI-NEXT:    s_mov_b32 s10, -1
1031; CI-NEXT:    s_mov_b32 s2, s10
1032; CI-NEXT:    s_waitcnt lgkmcnt(0)
1033; CI-NEXT:    s_mov_b32 s8, s4
1034; CI-NEXT:    s_mov_b32 s9, s5
1035; CI-NEXT:    s_mov_b32 s4, s6
1036; CI-NEXT:    s_mov_b32 s5, s7
1037; CI-NEXT:    s_mov_b32 s6, s10
1038; CI-NEXT:    s_mov_b32 s7, s11
1039; CI-NEXT:    s_mov_b32 s3, s11
1040; CI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
1041; CI-NEXT:    buffer_load_dwordx2 v[2:3], off, s[0:3], 0
1042; CI-NEXT:    s_waitcnt vmcnt(0)
1043; CI-NEXT:    v_div_scale_f64 v[4:5], s[0:1], v[2:3], v[2:3], v[0:1]
1044; CI-NEXT:    v_rcp_f64_e32 v[6:7], v[4:5]
1045; CI-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
1046; CI-NEXT:    v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
1047; CI-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
1048; CI-NEXT:    v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
1049; CI-NEXT:    v_div_scale_f64 v[8:9], vcc, v[0:1], v[2:3], v[0:1]
1050; CI-NEXT:    v_mul_f64 v[10:11], v[8:9], v[6:7]
1051; CI-NEXT:    v_fma_f64 v[4:5], -v[4:5], v[10:11], v[8:9]
1052; CI-NEXT:    s_nop 1
1053; CI-NEXT:    v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[10:11]
1054; CI-NEXT:    v_div_fixup_f64 v[4:5], v[4:5], v[2:3], v[0:1]
1055; CI-NEXT:    v_trunc_f64_e32 v[4:5], v[4:5]
1056; CI-NEXT:    v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
1057; CI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
1058; CI-NEXT:    s_endpgm
1059;
1060; VI-LABEL: frem_f64:
1061; VI:       ; %bb.0:
1062; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1063; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1064; VI-NEXT:    s_waitcnt lgkmcnt(0)
1065; VI-NEXT:    v_mov_b32_e32 v2, s6
1066; VI-NEXT:    v_mov_b32_e32 v3, s7
1067; VI-NEXT:    v_mov_b32_e32 v4, s0
1068; VI-NEXT:    v_mov_b32_e32 v5, s1
1069; VI-NEXT:    flat_load_dwordx2 v[2:3], v[2:3]
1070; VI-NEXT:    flat_load_dwordx2 v[4:5], v[4:5]
1071; VI-NEXT:    v_mov_b32_e32 v0, s4
1072; VI-NEXT:    v_mov_b32_e32 v1, s5
1073; VI-NEXT:    s_waitcnt vmcnt(0)
1074; VI-NEXT:    v_div_scale_f64 v[6:7], s[0:1], v[4:5], v[4:5], v[2:3]
1075; VI-NEXT:    v_rcp_f64_e32 v[8:9], v[6:7]
1076; VI-NEXT:    v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
1077; VI-NEXT:    v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
1078; VI-NEXT:    v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
1079; VI-NEXT:    v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
1080; VI-NEXT:    v_div_scale_f64 v[10:11], vcc, v[2:3], v[4:5], v[2:3]
1081; VI-NEXT:    v_mul_f64 v[12:13], v[10:11], v[8:9]
1082; VI-NEXT:    v_fma_f64 v[6:7], -v[6:7], v[12:13], v[10:11]
1083; VI-NEXT:    s_nop 1
1084; VI-NEXT:    v_div_fmas_f64 v[6:7], v[6:7], v[8:9], v[12:13]
1085; VI-NEXT:    v_div_fixup_f64 v[6:7], v[6:7], v[4:5], v[2:3]
1086; VI-NEXT:    v_trunc_f64_e32 v[6:7], v[6:7]
1087; VI-NEXT:    v_fma_f64 v[2:3], -v[6:7], v[4:5], v[2:3]
1088; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
1089; VI-NEXT:    s_endpgm
1090;
1091; GFX9-LABEL: frem_f64:
1092; GFX9:       ; %bb.0:
1093; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1094; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
1095; GFX9-NEXT:    v_mov_b32_e32 v12, 0
1096; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1097; GFX9-NEXT:    global_load_dwordx2 v[0:1], v12, s[6:7]
1098; GFX9-NEXT:    global_load_dwordx2 v[2:3], v12, s[2:3]
1099; GFX9-NEXT:    s_waitcnt vmcnt(0)
1100; GFX9-NEXT:    v_div_scale_f64 v[4:5], s[0:1], v[2:3], v[2:3], v[0:1]
1101; GFX9-NEXT:    v_rcp_f64_e32 v[6:7], v[4:5]
1102; GFX9-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
1103; GFX9-NEXT:    v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
1104; GFX9-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
1105; GFX9-NEXT:    v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
1106; GFX9-NEXT:    v_div_scale_f64 v[8:9], vcc, v[0:1], v[2:3], v[0:1]
1107; GFX9-NEXT:    v_mul_f64 v[10:11], v[8:9], v[6:7]
1108; GFX9-NEXT:    v_fma_f64 v[4:5], -v[4:5], v[10:11], v[8:9]
1109; GFX9-NEXT:    s_nop 1
1110; GFX9-NEXT:    v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[10:11]
1111; GFX9-NEXT:    v_div_fixup_f64 v[4:5], v[4:5], v[2:3], v[0:1]
1112; GFX9-NEXT:    v_trunc_f64_e32 v[4:5], v[4:5]
1113; GFX9-NEXT:    v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
1114; GFX9-NEXT:    global_store_dwordx2 v12, v[0:1], s[4:5]
1115; GFX9-NEXT:    s_endpgm
1116;
1117; GFX10-LABEL: frem_f64:
1118; GFX10:       ; %bb.0:
1119; GFX10-NEXT:    s_clause 0x1
1120; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1121; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
1122; GFX10-NEXT:    v_mov_b32_e32 v12, 0
1123; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1124; GFX10-NEXT:    s_clause 0x1
1125; GFX10-NEXT:    global_load_dwordx2 v[0:1], v12, s[6:7]
1126; GFX10-NEXT:    global_load_dwordx2 v[2:3], v12, s[2:3]
1127; GFX10-NEXT:    s_waitcnt vmcnt(0)
1128; GFX10-NEXT:    v_div_scale_f64 v[4:5], s0, v[2:3], v[2:3], v[0:1]
1129; GFX10-NEXT:    v_rcp_f64_e32 v[6:7], v[4:5]
1130; GFX10-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
1131; GFX10-NEXT:    v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
1132; GFX10-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
1133; GFX10-NEXT:    v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
1134; GFX10-NEXT:    v_div_scale_f64 v[8:9], vcc_lo, v[0:1], v[2:3], v[0:1]
1135; GFX10-NEXT:    v_mul_f64 v[10:11], v[8:9], v[6:7]
1136; GFX10-NEXT:    v_fma_f64 v[4:5], -v[4:5], v[10:11], v[8:9]
1137; GFX10-NEXT:    v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[10:11]
1138; GFX10-NEXT:    v_div_fixup_f64 v[4:5], v[4:5], v[2:3], v[0:1]
1139; GFX10-NEXT:    v_trunc_f64_e32 v[4:5], v[4:5]
1140; GFX10-NEXT:    v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
1141; GFX10-NEXT:    global_store_dwordx2 v12, v[0:1], s[4:5]
1142; GFX10-NEXT:    s_endpgm
1143;
1144; GFX11-LABEL: frem_f64:
1145; GFX11:       ; %bb.0:
1146; GFX11-NEXT:    s_clause 0x1
1147; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
1148; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
1149; GFX11-NEXT:    v_mov_b32_e32 v12, 0
1150; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1151; GFX11-NEXT:    s_clause 0x1
1152; GFX11-NEXT:    global_load_b64 v[0:1], v12, s[6:7]
1153; GFX11-NEXT:    global_load_b64 v[2:3], v12, s[0:1]
1154; GFX11-NEXT:    s_waitcnt vmcnt(0)
1155; GFX11-NEXT:    v_div_scale_f64 v[4:5], null, v[2:3], v[2:3], v[0:1]
1156; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
1157; GFX11-NEXT:    v_rcp_f64_e32 v[6:7], v[4:5]
1158; GFX11-NEXT:    s_waitcnt_depctr 0xfff
1159; GFX11-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
1160; GFX11-NEXT:    v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
1161; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1162; GFX11-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
1163; GFX11-NEXT:    v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
1164; GFX11-NEXT:    v_div_scale_f64 v[8:9], vcc_lo, v[0:1], v[2:3], v[0:1]
1165; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1166; GFX11-NEXT:    v_mul_f64 v[10:11], v[8:9], v[6:7]
1167; GFX11-NEXT:    v_fma_f64 v[4:5], -v[4:5], v[10:11], v[8:9]
1168; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1169; GFX11-NEXT:    v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[10:11]
1170; GFX11-NEXT:    v_div_fixup_f64 v[4:5], v[4:5], v[2:3], v[0:1]
1171; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1172; GFX11-NEXT:    v_trunc_f64_e32 v[4:5], v[4:5]
1173; GFX11-NEXT:    v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
1174; GFX11-NEXT:    global_store_b64 v12, v[0:1], s[4:5]
1175; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1176; GFX11-NEXT:    s_endpgm
1177                      double addrspace(1)* %in2) #0 {
  ; Both operands are loaded directly from the argument pointers.
1178   %r0 = load double, double addrspace(1)* %in1, align 8
1179   %r1 = load double, double addrspace(1)* %in2, align 8
  ; Operation under test: frem with no fast-math flags.
1180   %r2 = frem double %r0, %r1
1181   store double %r2, double addrspace(1)* %out, align 8
1182   ret void
1183}
1184
; fast_frem_f64: codegen test for `frem fast double`.
;
; The kernel loads the dividend from %in1 and the divisor from %in2 (both at
; offset 0) and stores the remainder to %out. It uses attribute group #0,
; whose definition is outside this part of the file.
;
; Every check block expects a reciprocal-based quotient: v_rcp_f64 of the
; divisor, two pairs of v_fma_f64 refinement steps, v_mul_f64 by the
; dividend and one more v_fma_f64 correction pair. None of the check blocks
; contain v_div_scale_f64, v_div_fmas_f64 or v_div_fixup_f64. The quotient is
; then truncated and a final v_fma_f64 computes x - trunc(q) * y.
;
; The checks for the first RUN line (no -mcpu) contain no v_trunc_f64; an
; integer sequence (v_bfe_u32, v_lshr_b64, v_not/v_and, v_cndmask) appears in
; its place -- presumably the expansion of the f64 truncation for that target.
;
; The assertions below are autogenerated (see the NOTE on the first line of
; the file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
1185define amdgpu_kernel void @fast_frem_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
1186; SI-LABEL: fast_frem_f64:
1187; SI:       ; %bb.0:
1188; SI-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x9
1189; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
1190; SI-NEXT:    s_mov_b32 s7, 0xf000
1191; SI-NEXT:    s_mov_b32 s6, -1
1192; SI-NEXT:    s_waitcnt lgkmcnt(0)
1193; SI-NEXT:    s_mov_b32 s4, s8
1194; SI-NEXT:    s_mov_b32 s5, s9
1195; SI-NEXT:    s_mov_b32 s8, s10
1196; SI-NEXT:    s_mov_b32 s9, s11
1197; SI-NEXT:    s_mov_b32 s10, s6
1198; SI-NEXT:    s_mov_b32 s11, s7
1199; SI-NEXT:    s_mov_b32 s2, s6
1200; SI-NEXT:    s_mov_b32 s3, s7
1201; SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
1202; SI-NEXT:    buffer_load_dwordx2 v[2:3], off, s[0:3], 0
1203; SI-NEXT:    s_waitcnt vmcnt(0)
1204; SI-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
1205; SI-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1206; SI-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
1207; SI-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1208; SI-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
1209; SI-NEXT:    v_mul_f64 v[6:7], v[0:1], v[4:5]
1210; SI-NEXT:    v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1]
1211; SI-NEXT:    v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
1212; SI-NEXT:    v_bfe_u32 v6, v5, 20, 11
1213; SI-NEXT:    v_add_i32_e32 v8, vcc, 0xfffffc01, v6
1214; SI-NEXT:    s_mov_b32 s1, 0xfffff
1215; SI-NEXT:    s_mov_b32 s0, s6
1216; SI-NEXT:    v_lshr_b64 v[6:7], s[0:1], v8
1217; SI-NEXT:    v_not_b32_e32 v6, v6
1218; SI-NEXT:    v_and_b32_e32 v6, v4, v6
1219; SI-NEXT:    v_not_b32_e32 v7, v7
1220; SI-NEXT:    v_and_b32_e32 v7, v5, v7
1221; SI-NEXT:    v_and_b32_e32 v9, 0x80000000, v5
1222; SI-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v8
1223; SI-NEXT:    v_cndmask_b32_e32 v7, v7, v9, vcc
1224; SI-NEXT:    v_cmp_lt_i32_e64 s[0:1], 51, v8
1225; SI-NEXT:    v_cndmask_b32_e64 v5, v7, v5, s[0:1]
1226; SI-NEXT:    v_cndmask_b32_e64 v6, v6, 0, vcc
1227; SI-NEXT:    v_cndmask_b32_e64 v4, v6, v4, s[0:1]
1228; SI-NEXT:    v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
1229; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1230; SI-NEXT:    s_endpgm
1231;
1232; CI-LABEL: fast_frem_f64:
1233; CI:       ; %bb.0:
1234; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
1235; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
1236; CI-NEXT:    s_mov_b32 s11, 0xf000
1237; CI-NEXT:    s_mov_b32 s10, -1
1238; CI-NEXT:    s_mov_b32 s2, s10
1239; CI-NEXT:    s_waitcnt lgkmcnt(0)
1240; CI-NEXT:    s_mov_b32 s8, s4
1241; CI-NEXT:    s_mov_b32 s9, s5
1242; CI-NEXT:    s_mov_b32 s4, s6
1243; CI-NEXT:    s_mov_b32 s5, s7
1244; CI-NEXT:    s_mov_b32 s6, s10
1245; CI-NEXT:    s_mov_b32 s7, s11
1246; CI-NEXT:    s_mov_b32 s3, s11
1247; CI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
1248; CI-NEXT:    buffer_load_dwordx2 v[2:3], off, s[0:3], 0
1249; CI-NEXT:    s_waitcnt vmcnt(0)
1250; CI-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
1251; CI-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1252; CI-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
1253; CI-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1254; CI-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
1255; CI-NEXT:    v_mul_f64 v[6:7], v[0:1], v[4:5]
1256; CI-NEXT:    v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1]
1257; CI-NEXT:    v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
1258; CI-NEXT:    v_trunc_f64_e32 v[4:5], v[4:5]
1259; CI-NEXT:    v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
1260; CI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
1261; CI-NEXT:    s_endpgm
1262;
1263; VI-LABEL: fast_frem_f64:
1264; VI:       ; %bb.0:
1265; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1266; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1267; VI-NEXT:    s_waitcnt lgkmcnt(0)
1268; VI-NEXT:    v_mov_b32_e32 v2, s6
1269; VI-NEXT:    v_mov_b32_e32 v3, s7
1270; VI-NEXT:    v_mov_b32_e32 v4, s0
1271; VI-NEXT:    v_mov_b32_e32 v5, s1
1272; VI-NEXT:    flat_load_dwordx2 v[2:3], v[2:3]
1273; VI-NEXT:    flat_load_dwordx2 v[4:5], v[4:5]
1274; VI-NEXT:    v_mov_b32_e32 v0, s4
1275; VI-NEXT:    v_mov_b32_e32 v1, s5
1276; VI-NEXT:    s_waitcnt vmcnt(0)
1277; VI-NEXT:    v_rcp_f64_e32 v[6:7], v[4:5]
1278; VI-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
1279; VI-NEXT:    v_fma_f64 v[6:7], v[8:9], v[6:7], v[6:7]
1280; VI-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
1281; VI-NEXT:    v_fma_f64 v[6:7], v[8:9], v[6:7], v[6:7]
1282; VI-NEXT:    v_mul_f64 v[8:9], v[2:3], v[6:7]
1283; VI-NEXT:    v_fma_f64 v[10:11], -v[4:5], v[8:9], v[2:3]
1284; VI-NEXT:    v_fma_f64 v[6:7], v[10:11], v[6:7], v[8:9]
1285; VI-NEXT:    v_trunc_f64_e32 v[6:7], v[6:7]
1286; VI-NEXT:    v_fma_f64 v[2:3], -v[6:7], v[4:5], v[2:3]
1287; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
1288; VI-NEXT:    s_endpgm
1289;
1290; GFX9-LABEL: fast_frem_f64:
1291; GFX9:       ; %bb.0:
1292; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1293; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
1294; GFX9-NEXT:    v_mov_b32_e32 v10, 0
1295; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1296; GFX9-NEXT:    global_load_dwordx2 v[0:1], v10, s[6:7]
1297; GFX9-NEXT:    global_load_dwordx2 v[2:3], v10, s[2:3]
1298; GFX9-NEXT:    s_waitcnt vmcnt(0)
1299; GFX9-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
1300; GFX9-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1301; GFX9-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
1302; GFX9-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1303; GFX9-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
1304; GFX9-NEXT:    v_mul_f64 v[6:7], v[0:1], v[4:5]
1305; GFX9-NEXT:    v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1]
1306; GFX9-NEXT:    v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
1307; GFX9-NEXT:    v_trunc_f64_e32 v[4:5], v[4:5]
1308; GFX9-NEXT:    v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
1309; GFX9-NEXT:    global_store_dwordx2 v10, v[0:1], s[4:5]
1310; GFX9-NEXT:    s_endpgm
1311;
1312; GFX10-LABEL: fast_frem_f64:
1313; GFX10:       ; %bb.0:
1314; GFX10-NEXT:    s_clause 0x1
1315; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1316; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
1317; GFX10-NEXT:    v_mov_b32_e32 v10, 0
1318; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1319; GFX10-NEXT:    s_clause 0x1
1320; GFX10-NEXT:    global_load_dwordx2 v[0:1], v10, s[6:7]
1321; GFX10-NEXT:    global_load_dwordx2 v[2:3], v10, s[2:3]
1322; GFX10-NEXT:    s_waitcnt vmcnt(0)
1323; GFX10-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
1324; GFX10-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1325; GFX10-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
1326; GFX10-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1327; GFX10-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
1328; GFX10-NEXT:    v_mul_f64 v[6:7], v[0:1], v[4:5]
1329; GFX10-NEXT:    v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1]
1330; GFX10-NEXT:    v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
1331; GFX10-NEXT:    v_trunc_f64_e32 v[4:5], v[4:5]
1332; GFX10-NEXT:    v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
1333; GFX10-NEXT:    global_store_dwordx2 v10, v[0:1], s[4:5]
1334; GFX10-NEXT:    s_endpgm
1335;
1336; GFX11-LABEL: fast_frem_f64:
1337; GFX11:       ; %bb.0:
1338; GFX11-NEXT:    s_clause 0x1
1339; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
1340; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
1341; GFX11-NEXT:    v_mov_b32_e32 v10, 0
1342; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1343; GFX11-NEXT:    s_clause 0x1
1344; GFX11-NEXT:    global_load_b64 v[0:1], v10, s[6:7]
1345; GFX11-NEXT:    global_load_b64 v[2:3], v10, s[0:1]
1346; GFX11-NEXT:    s_waitcnt vmcnt(0)
1347; GFX11-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
1348; GFX11-NEXT:    s_waitcnt_depctr 0xfff
1349; GFX11-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1350; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1351; GFX11-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
1352; GFX11-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1353; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1354; GFX11-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
1355; GFX11-NEXT:    v_mul_f64 v[6:7], v[0:1], v[4:5]
1356; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1357; GFX11-NEXT:    v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1]
1358; GFX11-NEXT:    v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
1359; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1360; GFX11-NEXT:    v_trunc_f64_e32 v[4:5], v[4:5]
1361; GFX11-NEXT:    v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
1362; GFX11-NEXT:    global_store_b64 v10, v[0:1], s[4:5]
1363; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1364; GFX11-NEXT:    s_endpgm
1365                      double addrspace(1)* %in2) #0 {
  ; Both operands are loaded directly from the argument pointers.
1366   %r0 = load double, double addrspace(1)* %in1, align 8
1367   %r1 = load double, double addrspace(1)* %in2, align 8
  ; Operation under test: frem carrying the `fast` flag.
1368   %r2 = frem fast double %r0, %r1
1369   store double %r2, double addrspace(1)* %out, align 8
1370   ret void
1371}
1372
1373define amdgpu_kernel void @unsafe_frem_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
1374; SI-LABEL: unsafe_frem_f64:
1375; SI:       ; %bb.0:
1376; SI-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x9
1377; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
1378; SI-NEXT:    s_mov_b32 s7, 0xf000
1379; SI-NEXT:    s_mov_b32 s6, -1
1380; SI-NEXT:    s_waitcnt lgkmcnt(0)
1381; SI-NEXT:    s_mov_b32 s4, s8
1382; SI-NEXT:    s_mov_b32 s5, s9
1383; SI-NEXT:    s_mov_b32 s8, s10
1384; SI-NEXT:    s_mov_b32 s9, s11
1385; SI-NEXT:    s_mov_b32 s10, s6
1386; SI-NEXT:    s_mov_b32 s11, s7
1387; SI-NEXT:    s_mov_b32 s2, s6
1388; SI-NEXT:    s_mov_b32 s3, s7
1389; SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
1390; SI-NEXT:    buffer_load_dwordx2 v[2:3], off, s[0:3], 0
1391; SI-NEXT:    s_waitcnt vmcnt(0)
1392; SI-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
1393; SI-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1394; SI-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
1395; SI-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1396; SI-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
1397; SI-NEXT:    v_mul_f64 v[6:7], v[0:1], v[4:5]
1398; SI-NEXT:    v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1]
1399; SI-NEXT:    v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
1400; SI-NEXT:    v_bfe_u32 v6, v5, 20, 11
1401; SI-NEXT:    v_add_i32_e32 v8, vcc, 0xfffffc01, v6
1402; SI-NEXT:    s_mov_b32 s1, 0xfffff
1403; SI-NEXT:    s_mov_b32 s0, s6
1404; SI-NEXT:    v_lshr_b64 v[6:7], s[0:1], v8
1405; SI-NEXT:    v_not_b32_e32 v6, v6
1406; SI-NEXT:    v_and_b32_e32 v6, v4, v6
1407; SI-NEXT:    v_not_b32_e32 v7, v7
1408; SI-NEXT:    v_and_b32_e32 v7, v5, v7
1409; SI-NEXT:    v_and_b32_e32 v9, 0x80000000, v5
1410; SI-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v8
1411; SI-NEXT:    v_cndmask_b32_e32 v7, v7, v9, vcc
1412; SI-NEXT:    v_cmp_lt_i32_e64 s[0:1], 51, v8
1413; SI-NEXT:    v_cndmask_b32_e64 v5, v7, v5, s[0:1]
1414; SI-NEXT:    v_cndmask_b32_e64 v6, v6, 0, vcc
1415; SI-NEXT:    v_cndmask_b32_e64 v4, v6, v4, s[0:1]
1416; SI-NEXT:    v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
1417; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1418; SI-NEXT:    s_endpgm
1419;
1420; CI-LABEL: unsafe_frem_f64:
1421; CI:       ; %bb.0:
1422; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
1423; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
1424; CI-NEXT:    s_mov_b32 s11, 0xf000
1425; CI-NEXT:    s_mov_b32 s10, -1
1426; CI-NEXT:    s_mov_b32 s2, s10
1427; CI-NEXT:    s_waitcnt lgkmcnt(0)
1428; CI-NEXT:    s_mov_b32 s8, s4
1429; CI-NEXT:    s_mov_b32 s9, s5
1430; CI-NEXT:    s_mov_b32 s4, s6
1431; CI-NEXT:    s_mov_b32 s5, s7
1432; CI-NEXT:    s_mov_b32 s6, s10
1433; CI-NEXT:    s_mov_b32 s7, s11
1434; CI-NEXT:    s_mov_b32 s3, s11
1435; CI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
1436; CI-NEXT:    buffer_load_dwordx2 v[2:3], off, s[0:3], 0
1437; CI-NEXT:    s_waitcnt vmcnt(0)
1438; CI-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
1439; CI-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1440; CI-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
1441; CI-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1442; CI-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
1443; CI-NEXT:    v_mul_f64 v[6:7], v[0:1], v[4:5]
1444; CI-NEXT:    v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1]
1445; CI-NEXT:    v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
1446; CI-NEXT:    v_trunc_f64_e32 v[4:5], v[4:5]
1447; CI-NEXT:    v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
1448; CI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
1449; CI-NEXT:    s_endpgm
1450;
1451; VI-LABEL: unsafe_frem_f64:
1452; VI:       ; %bb.0:
1453; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1454; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1455; VI-NEXT:    s_waitcnt lgkmcnt(0)
1456; VI-NEXT:    v_mov_b32_e32 v2, s6
1457; VI-NEXT:    v_mov_b32_e32 v3, s7
1458; VI-NEXT:    v_mov_b32_e32 v4, s0
1459; VI-NEXT:    v_mov_b32_e32 v5, s1
1460; VI-NEXT:    flat_load_dwordx2 v[2:3], v[2:3]
1461; VI-NEXT:    flat_load_dwordx2 v[4:5], v[4:5]
1462; VI-NEXT:    v_mov_b32_e32 v0, s4
1463; VI-NEXT:    v_mov_b32_e32 v1, s5
1464; VI-NEXT:    s_waitcnt vmcnt(0)
1465; VI-NEXT:    v_rcp_f64_e32 v[6:7], v[4:5]
1466; VI-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
1467; VI-NEXT:    v_fma_f64 v[6:7], v[8:9], v[6:7], v[6:7]
1468; VI-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
1469; VI-NEXT:    v_fma_f64 v[6:7], v[8:9], v[6:7], v[6:7]
1470; VI-NEXT:    v_mul_f64 v[8:9], v[2:3], v[6:7]
1471; VI-NEXT:    v_fma_f64 v[10:11], -v[4:5], v[8:9], v[2:3]
1472; VI-NEXT:    v_fma_f64 v[6:7], v[10:11], v[6:7], v[8:9]
1473; VI-NEXT:    v_trunc_f64_e32 v[6:7], v[6:7]
1474; VI-NEXT:    v_fma_f64 v[2:3], -v[6:7], v[4:5], v[2:3]
1475; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
1476; VI-NEXT:    s_endpgm
1477;
1478; GFX9-LABEL: unsafe_frem_f64:
1479; GFX9:       ; %bb.0:
1480; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1481; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
1482; GFX9-NEXT:    v_mov_b32_e32 v10, 0
1483; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1484; GFX9-NEXT:    global_load_dwordx2 v[0:1], v10, s[6:7]
1485; GFX9-NEXT:    global_load_dwordx2 v[2:3], v10, s[2:3]
1486; GFX9-NEXT:    s_waitcnt vmcnt(0)
1487; GFX9-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
1488; GFX9-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1489; GFX9-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
1490; GFX9-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1491; GFX9-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
1492; GFX9-NEXT:    v_mul_f64 v[6:7], v[0:1], v[4:5]
1493; GFX9-NEXT:    v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1]
1494; GFX9-NEXT:    v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
1495; GFX9-NEXT:    v_trunc_f64_e32 v[4:5], v[4:5]
1496; GFX9-NEXT:    v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
1497; GFX9-NEXT:    global_store_dwordx2 v10, v[0:1], s[4:5]
1498; GFX9-NEXT:    s_endpgm
1499;
1500; GFX10-LABEL: unsafe_frem_f64:
1501; GFX10:       ; %bb.0:
1502; GFX10-NEXT:    s_clause 0x1
1503; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1504; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
1505; GFX10-NEXT:    v_mov_b32_e32 v10, 0
1506; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1507; GFX10-NEXT:    s_clause 0x1
1508; GFX10-NEXT:    global_load_dwordx2 v[0:1], v10, s[6:7]
1509; GFX10-NEXT:    global_load_dwordx2 v[2:3], v10, s[2:3]
1510; GFX10-NEXT:    s_waitcnt vmcnt(0)
1511; GFX10-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
1512; GFX10-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1513; GFX10-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
1514; GFX10-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1515; GFX10-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
1516; GFX10-NEXT:    v_mul_f64 v[6:7], v[0:1], v[4:5]
1517; GFX10-NEXT:    v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1]
1518; GFX10-NEXT:    v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
1519; GFX10-NEXT:    v_trunc_f64_e32 v[4:5], v[4:5]
1520; GFX10-NEXT:    v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
1521; GFX10-NEXT:    global_store_dwordx2 v10, v[0:1], s[4:5]
1522; GFX10-NEXT:    s_endpgm
1523;
1524; GFX11-LABEL: unsafe_frem_f64:
1525; GFX11:       ; %bb.0:
1526; GFX11-NEXT:    s_clause 0x1
1527; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
1528; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
1529; GFX11-NEXT:    v_mov_b32_e32 v10, 0
1530; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1531; GFX11-NEXT:    s_clause 0x1
1532; GFX11-NEXT:    global_load_b64 v[0:1], v10, s[6:7]
1533; GFX11-NEXT:    global_load_b64 v[2:3], v10, s[0:1]
1534; GFX11-NEXT:    s_waitcnt vmcnt(0)
1535; GFX11-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
1536; GFX11-NEXT:    s_waitcnt_depctr 0xfff
1537; GFX11-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1538; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1539; GFX11-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
1540; GFX11-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1541; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1542; GFX11-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
1543; GFX11-NEXT:    v_mul_f64 v[6:7], v[0:1], v[4:5]
1544; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1545; GFX11-NEXT:    v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1]
1546; GFX11-NEXT:    v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
1547; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1548; GFX11-NEXT:    v_trunc_f64_e32 v[4:5], v[4:5]
1549; GFX11-NEXT:    v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
1550; GFX11-NEXT:    global_store_b64 v10, v[0:1], s[4:5]
1551; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1552; GFX11-NEXT:    s_endpgm
1553                             double addrspace(1)* %in2) #1 {
1554   %r0 = load double, double addrspace(1)* %in1, align 8
1555   %r1 = load double, double addrspace(1)* %in2, align 8
1556   %r2 = frem afn double %r0, %r1
1557   store double %r2, double addrspace(1)* %out, align 8
1558   ret void
1559}
1560
; Test kernel @frem_v2f16 - lane-wise remainder of two <2 x half> vectors.
;
; Loads one <2 x half> from %in1 and one from %in2 advanced by four elements
; (16 bytes, visible as "offset:16" / "s_add_u32 ..., 16" in the expected
; output), applies frem with no fast-math flags, and stores the result to %out.
;
; The assertions are autogenerated (see the NOTE on the first line of this
; file); regenerate them with the update script rather than editing by hand.
;
; Shape of the expected code per run line:
;   * no -mcpu and bonaire (the SI and CI prefixes) - each lane is widened to
;     f32 and divided with v_div_scale / v_rcp / fma refinement / v_div_fmas /
;     v_div_fixup; v_trunc + v_fma then form the remainder, which is narrowed
;     back to f16 and the lanes are merged with shift + or.
;   * tonga (the VI prefix) - v_rcp_f32 + v_mul_f32 quotient estimate, then
;     v_div_fixup_f16, v_trunc_f16 and v_fma_f16; lanes merged with shift + or.
;   * gfx900, gfx1010, gfx1100 - same f16 sequence as tonga, lanes merged with
;     v_pack_b32_f16.
; NOTE(review): the s_setreg_imm32_b32 writes to HW_REG_MODE in the f32 path
; presumably toggle the FP denormal mode around the refinement - confirm
; against the ISA documentation.
1561define amdgpu_kernel void @frem_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in1,
1562; SI-LABEL: frem_v2f16:
1563; SI:       ; %bb.0:
1564; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
1565; SI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xd
1566; SI-NEXT:    s_mov_b32 s3, 0xf000
1567; SI-NEXT:    s_mov_b32 s2, -1
1568; SI-NEXT:    s_waitcnt lgkmcnt(0)
1569; SI-NEXT:    s_mov_b32 s0, s4
1570; SI-NEXT:    s_mov_b32 s1, s5
1571; SI-NEXT:    s_mov_b32 s4, s6
1572; SI-NEXT:    s_mov_b32 s5, s7
1573; SI-NEXT:    s_mov_b32 s6, s2
1574; SI-NEXT:    s_mov_b32 s7, s3
1575; SI-NEXT:    s_mov_b32 s10, s2
1576; SI-NEXT:    s_mov_b32 s11, s3
1577; SI-NEXT:    buffer_load_dword v0, off, s[4:7], 0
1578; SI-NEXT:    s_waitcnt vmcnt(0)
1579; SI-NEXT:    v_cvt_f32_f16_e32 v1, v0
1580; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
1581; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
1582; SI-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:16
1583; SI-NEXT:    s_waitcnt vmcnt(0)
1584; SI-NEXT:    v_cvt_f32_f16_e32 v3, v2
1585; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
1586; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
1587; SI-NEXT:    v_div_scale_f32 v4, vcc, v0, v2, v0
1588; SI-NEXT:    v_div_scale_f32 v5, s[4:5], v2, v2, v0
1589; SI-NEXT:    v_rcp_f32_e32 v6, v5
1590; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
1591; SI-NEXT:    v_fma_f32 v7, -v5, v6, 1.0
1592; SI-NEXT:    v_fma_f32 v6, v7, v6, v6
1593; SI-NEXT:    v_mul_f32_e32 v7, v4, v6
1594; SI-NEXT:    v_fma_f32 v8, -v5, v7, v4
1595; SI-NEXT:    v_fma_f32 v7, v8, v6, v7
1596; SI-NEXT:    v_fma_f32 v4, -v5, v7, v4
1597; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
1598; SI-NEXT:    v_div_fmas_f32 v4, v4, v6, v7
1599; SI-NEXT:    v_div_fixup_f32 v4, v4, v2, v0
1600; SI-NEXT:    v_trunc_f32_e32 v4, v4
1601; SI-NEXT:    v_fma_f32 v0, -v4, v2, v0
1602; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
1603; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
1604; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
1605; SI-NEXT:    v_div_scale_f32 v2, vcc, v1, v3, v1
1606; SI-NEXT:    v_div_scale_f32 v4, s[4:5], v3, v3, v1
1607; SI-NEXT:    v_rcp_f32_e32 v5, v4
1608; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
1609; SI-NEXT:    v_fma_f32 v6, -v4, v5, 1.0
1610; SI-NEXT:    v_fma_f32 v5, v6, v5, v5
1611; SI-NEXT:    v_mul_f32_e32 v6, v2, v5
1612; SI-NEXT:    v_fma_f32 v7, -v4, v6, v2
1613; SI-NEXT:    v_fma_f32 v6, v7, v5, v6
1614; SI-NEXT:    v_fma_f32 v2, -v4, v6, v2
1615; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
1616; SI-NEXT:    v_div_fmas_f32 v2, v2, v5, v6
1617; SI-NEXT:    v_div_fixup_f32 v2, v2, v3, v1
1618; SI-NEXT:    v_trunc_f32_e32 v2, v2
1619; SI-NEXT:    v_fma_f32 v1, -v2, v3, v1
1620; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
1621; SI-NEXT:    v_or_b32_e32 v0, v1, v0
1622; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1623; SI-NEXT:    s_endpgm
1624;
1625; CI-LABEL: frem_v2f16:
1626; CI:       ; %bb.0:
1627; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
1628; CI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xd
1629; CI-NEXT:    s_mov_b32 s3, 0xf000
1630; CI-NEXT:    s_mov_b32 s2, -1
1631; CI-NEXT:    s_mov_b32 s10, s2
1632; CI-NEXT:    s_waitcnt lgkmcnt(0)
1633; CI-NEXT:    s_mov_b32 s0, s4
1634; CI-NEXT:    s_mov_b32 s1, s5
1635; CI-NEXT:    s_mov_b32 s4, s6
1636; CI-NEXT:    s_mov_b32 s5, s7
1637; CI-NEXT:    s_mov_b32 s6, s2
1638; CI-NEXT:    s_mov_b32 s7, s3
1639; CI-NEXT:    s_mov_b32 s11, s3
1640; CI-NEXT:    buffer_load_dword v0, off, s[4:7], 0
1641; CI-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:16
1642; CI-NEXT:    s_waitcnt vmcnt(1)
1643; CI-NEXT:    v_cvt_f32_f16_e32 v1, v0
1644; CI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
1645; CI-NEXT:    s_waitcnt vmcnt(0)
1646; CI-NEXT:    v_cvt_f32_f16_e32 v3, v2
1647; CI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
1648; CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
1649; CI-NEXT:    v_cvt_f32_f16_e32 v2, v2
1650; CI-NEXT:    v_div_scale_f32 v5, s[4:5], v2, v2, v0
1651; CI-NEXT:    v_div_scale_f32 v4, vcc, v0, v2, v0
1652; CI-NEXT:    v_rcp_f32_e32 v6, v5
1653; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
1654; CI-NEXT:    v_fma_f32 v7, -v5, v6, 1.0
1655; CI-NEXT:    v_fma_f32 v6, v7, v6, v6
1656; CI-NEXT:    v_mul_f32_e32 v7, v4, v6
1657; CI-NEXT:    v_fma_f32 v8, -v5, v7, v4
1658; CI-NEXT:    v_fma_f32 v7, v8, v6, v7
1659; CI-NEXT:    v_fma_f32 v4, -v5, v7, v4
1660; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
1661; CI-NEXT:    v_div_fmas_f32 v4, v4, v6, v7
1662; CI-NEXT:    v_div_fixup_f32 v4, v4, v2, v0
1663; CI-NEXT:    v_trunc_f32_e32 v4, v4
1664; CI-NEXT:    v_fma_f32 v0, -v4, v2, v0
1665; CI-NEXT:    v_div_scale_f32 v4, s[4:5], v3, v3, v1
1666; CI-NEXT:    v_div_scale_f32 v2, vcc, v1, v3, v1
1667; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
1668; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
1669; CI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
1670; CI-NEXT:    v_rcp_f32_e32 v5, v4
1671; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
1672; CI-NEXT:    v_fma_f32 v6, -v4, v5, 1.0
1673; CI-NEXT:    v_fma_f32 v5, v6, v5, v5
1674; CI-NEXT:    v_mul_f32_e32 v6, v2, v5
1675; CI-NEXT:    v_fma_f32 v7, -v4, v6, v2
1676; CI-NEXT:    v_fma_f32 v6, v7, v5, v6
1677; CI-NEXT:    v_fma_f32 v2, -v4, v6, v2
1678; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
1679; CI-NEXT:    v_div_fmas_f32 v2, v2, v5, v6
1680; CI-NEXT:    v_div_fixup_f32 v2, v2, v3, v1
1681; CI-NEXT:    v_trunc_f32_e32 v2, v2
1682; CI-NEXT:    v_fma_f32 v1, -v2, v3, v1
1683; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
1684; CI-NEXT:    v_or_b32_e32 v0, v1, v0
1685; CI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1686; CI-NEXT:    s_endpgm
1687;
1688; VI-LABEL: frem_v2f16:
1689; VI:       ; %bb.0:
1690; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1691; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1692; VI-NEXT:    s_waitcnt lgkmcnt(0)
1693; VI-NEXT:    v_mov_b32_e32 v2, s6
1694; VI-NEXT:    s_add_u32 s0, s0, 16
1695; VI-NEXT:    v_mov_b32_e32 v3, s7
1696; VI-NEXT:    s_addc_u32 s1, s1, 0
1697; VI-NEXT:    flat_load_dword v4, v[2:3]
1698; VI-NEXT:    v_mov_b32_e32 v3, s1
1699; VI-NEXT:    v_mov_b32_e32 v2, s0
1700; VI-NEXT:    flat_load_dword v2, v[2:3]
1701; VI-NEXT:    v_mov_b32_e32 v0, s4
1702; VI-NEXT:    v_mov_b32_e32 v1, s5
1703; VI-NEXT:    s_waitcnt vmcnt(1)
1704; VI-NEXT:    v_lshrrev_b32_e32 v3, 16, v4
1705; VI-NEXT:    v_cvt_f32_f16_e32 v5, v3
1706; VI-NEXT:    s_waitcnt vmcnt(0)
1707; VI-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
1708; VI-NEXT:    v_cvt_f32_f16_e32 v7, v6
1709; VI-NEXT:    v_rcp_f32_e32 v7, v7
1710; VI-NEXT:    v_mul_f32_e32 v5, v5, v7
1711; VI-NEXT:    v_cvt_f16_f32_e32 v5, v5
1712; VI-NEXT:    v_div_fixup_f16 v5, v5, v6, v3
1713; VI-NEXT:    v_trunc_f16_e32 v5, v5
1714; VI-NEXT:    v_fma_f16 v3, -v5, v6, v3
1715; VI-NEXT:    v_cvt_f32_f16_e32 v6, v2
1716; VI-NEXT:    v_cvt_f32_f16_e32 v5, v4
1717; VI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
1718; VI-NEXT:    v_rcp_f32_e32 v6, v6
1719; VI-NEXT:    v_mul_f32_e32 v5, v5, v6
1720; VI-NEXT:    v_cvt_f16_f32_e32 v5, v5
1721; VI-NEXT:    v_div_fixup_f16 v5, v5, v2, v4
1722; VI-NEXT:    v_trunc_f16_e32 v5, v5
1723; VI-NEXT:    v_fma_f16 v2, -v5, v2, v4
1724; VI-NEXT:    v_or_b32_e32 v2, v2, v3
1725; VI-NEXT:    flat_store_dword v[0:1], v2
1726; VI-NEXT:    s_endpgm
1727;
1728; GFX9-LABEL: frem_v2f16:
1729; GFX9:       ; %bb.0:
1730; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1731; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
1732; GFX9-NEXT:    v_mov_b32_e32 v0, 0
1733; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1734; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
1735; GFX9-NEXT:    global_load_dword v2, v0, s[2:3] offset:16
1736; GFX9-NEXT:    s_waitcnt vmcnt(1)
1737; GFX9-NEXT:    v_cvt_f32_f16_e32 v3, v1
1738; GFX9-NEXT:    s_waitcnt vmcnt(0)
1739; GFX9-NEXT:    v_cvt_f32_f16_e32 v4, v2
1740; GFX9-NEXT:    v_rcp_f32_e32 v4, v4
1741; GFX9-NEXT:    v_mul_f32_e32 v3, v3, v4
1742; GFX9-NEXT:    v_cvt_f16_f32_e32 v3, v3
1743; GFX9-NEXT:    v_div_fixup_f16 v3, v3, v2, v1
1744; GFX9-NEXT:    v_trunc_f16_e32 v3, v3
1745; GFX9-NEXT:    v_fma_f16 v3, -v3, v2, v1
1746; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
1747; GFX9-NEXT:    v_cvt_f32_f16_e32 v5, v2
1748; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
1749; GFX9-NEXT:    v_cvt_f32_f16_e32 v4, v1
1750; GFX9-NEXT:    v_rcp_f32_e32 v5, v5
1751; GFX9-NEXT:    v_mul_f32_e32 v4, v4, v5
1752; GFX9-NEXT:    v_cvt_f16_f32_e32 v4, v4
1753; GFX9-NEXT:    v_div_fixup_f16 v4, v4, v2, v1
1754; GFX9-NEXT:    v_trunc_f16_e32 v4, v4
1755; GFX9-NEXT:    v_fma_f16 v1, -v4, v2, v1
1756; GFX9-NEXT:    v_pack_b32_f16 v1, v3, v1
1757; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
1758; GFX9-NEXT:    s_endpgm
1759;
1760; GFX10-LABEL: frem_v2f16:
1761; GFX10:       ; %bb.0:
1762; GFX10-NEXT:    s_clause 0x1
1763; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1764; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
1765; GFX10-NEXT:    v_mov_b32_e32 v0, 0
1766; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1767; GFX10-NEXT:    s_clause 0x1
1768; GFX10-NEXT:    global_load_dword v1, v0, s[6:7]
1769; GFX10-NEXT:    global_load_dword v2, v0, s[2:3] offset:16
1770; GFX10-NEXT:    s_waitcnt vmcnt(1)
1771; GFX10-NEXT:    v_cvt_f32_f16_e32 v3, v1
1772; GFX10-NEXT:    s_waitcnt vmcnt(0)
1773; GFX10-NEXT:    v_cvt_f32_f16_e32 v4, v2
1774; GFX10-NEXT:    v_rcp_f32_e32 v4, v4
1775; GFX10-NEXT:    v_mul_f32_e32 v3, v3, v4
1776; GFX10-NEXT:    v_cvt_f16_f32_e32 v3, v3
1777; GFX10-NEXT:    v_div_fixup_f16 v3, v3, v2, v1
1778; GFX10-NEXT:    v_trunc_f16_e32 v3, v3
1779; GFX10-NEXT:    v_fma_f16 v3, -v3, v2, v1
1780; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
1781; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
1782; GFX10-NEXT:    v_cvt_f32_f16_e32 v5, v2
1783; GFX10-NEXT:    v_cvt_f32_f16_e32 v4, v1
1784; GFX10-NEXT:    v_rcp_f32_e32 v5, v5
1785; GFX10-NEXT:    v_mul_f32_e32 v4, v4, v5
1786; GFX10-NEXT:    v_cvt_f16_f32_e32 v4, v4
1787; GFX10-NEXT:    v_div_fixup_f16 v4, v4, v2, v1
1788; GFX10-NEXT:    v_trunc_f16_e32 v4, v4
1789; GFX10-NEXT:    v_fma_f16 v1, -v4, v2, v1
1790; GFX10-NEXT:    v_pack_b32_f16 v1, v3, v1
1791; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
1792; GFX10-NEXT:    s_endpgm
1793;
1794; GFX11-LABEL: frem_v2f16:
1795; GFX11:       ; %bb.0:
1796; GFX11-NEXT:    s_clause 0x1
1797; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
1798; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
1799; GFX11-NEXT:    v_mov_b32_e32 v0, 0
1800; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1801; GFX11-NEXT:    s_clause 0x1
1802; GFX11-NEXT:    global_load_b32 v1, v0, s[6:7]
1803; GFX11-NEXT:    global_load_b32 v2, v0, s[0:1] offset:16
1804; GFX11-NEXT:    s_waitcnt vmcnt(1)
1805; GFX11-NEXT:    v_cvt_f32_f16_e32 v3, v1
1806; GFX11-NEXT:    s_waitcnt vmcnt(0)
1807; GFX11-NEXT:    v_cvt_f32_f16_e32 v4, v2
1808; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
1809; GFX11-NEXT:    v_rcp_f32_e32 v4, v4
1810; GFX11-NEXT:    s_waitcnt_depctr 0xfff
1811; GFX11-NEXT:    v_mul_f32_e32 v3, v3, v4
1812; GFX11-NEXT:    v_cvt_f16_f32_e32 v3, v3
1813; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1814; GFX11-NEXT:    v_div_fixup_f16 v3, v3, v2, v1
1815; GFX11-NEXT:    v_trunc_f16_e32 v3, v3
1816; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
1817; GFX11-NEXT:    v_fma_f16 v3, -v3, v2, v1
1818; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
1819; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
1820; GFX11-NEXT:    v_cvt_f32_f16_e32 v5, v2
1821; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
1822; GFX11-NEXT:    v_cvt_f32_f16_e32 v4, v1
1823; GFX11-NEXT:    v_rcp_f32_e32 v5, v5
1824; GFX11-NEXT:    s_waitcnt_depctr 0xfff
1825; GFX11-NEXT:    v_mul_f32_e32 v4, v4, v5
1826; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1827; GFX11-NEXT:    v_cvt_f16_f32_e32 v4, v4
1828; GFX11-NEXT:    v_div_fixup_f16 v4, v4, v2, v1
1829; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1830; GFX11-NEXT:    v_trunc_f16_e32 v4, v4
1831; GFX11-NEXT:    v_fma_f16 v1, -v4, v2, v1
1832; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1833; GFX11-NEXT:    v_pack_b32_f16 v1, v3, v1
1834; GFX11-NEXT:    global_store_b32 v0, v1, s[4:5]
1835; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1836; GFX11-NEXT:    s_endpgm
1837                        <2 x half> addrspace(1)* %in2) #0 {
  ; Advance %in2 by four <2 x half> elements (4 * 4 bytes = 16 bytes) so the
  ; second load carries a non-zero address offset.
1838   %gep2 = getelementptr <2 x half>, <2 x half> addrspace(1)* %in2, i32 4
  ; Dividend comes from %in1, divisor from the offset %in2 pointer.
1839   %r0 = load <2 x half>, <2 x half> addrspace(1)* %in1, align 8
1840   %r1 = load <2 x half>, <2 x half> addrspace(1)* %gep2, align 8
  ; Plain frem with no fast-math flags.
1841   %r2 = frem <2 x half> %r0, %r1
1842   store <2 x half> %r2, <2 x half> addrspace(1)* %out, align 8
1843   ret void
1844}
1845
1846define amdgpu_kernel void @frem_v4f16(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in1,
1847; SI-LABEL: frem_v4f16:
1848; SI:       ; %bb.0:
1849; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
1850; SI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xd
1851; SI-NEXT:    s_mov_b32 s3, 0xf000
1852; SI-NEXT:    s_mov_b32 s2, -1
1853; SI-NEXT:    s_waitcnt lgkmcnt(0)
1854; SI-NEXT:    s_mov_b32 s0, s4
1855; SI-NEXT:    s_mov_b32 s1, s5
1856; SI-NEXT:    s_mov_b32 s4, s6
1857; SI-NEXT:    s_mov_b32 s5, s7
1858; SI-NEXT:    s_mov_b32 s6, s2
1859; SI-NEXT:    s_mov_b32 s7, s3
1860; SI-NEXT:    s_mov_b32 s10, s2
1861; SI-NEXT:    s_mov_b32 s11, s3
1862; SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
1863; SI-NEXT:    s_waitcnt vmcnt(0)
1864; SI-NEXT:    v_cvt_f32_f16_e32 v2, v0
1865; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
1866; SI-NEXT:    v_cvt_f32_f16_e32 v3, v0
1867; SI-NEXT:    v_cvt_f32_f16_e32 v4, v1
1868; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v1
1869; SI-NEXT:    v_cvt_f32_f16_e32 v5, v0
1870; SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0 offset:32
1871; SI-NEXT:    s_waitcnt vmcnt(0)
1872; SI-NEXT:    v_cvt_f32_f16_e32 v6, v0
1873; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
1874; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
1875; SI-NEXT:    v_cvt_f32_f16_e32 v7, v1
1876; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
1877; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
1878; SI-NEXT:    v_div_scale_f32 v8, vcc, v5, v1, v5
1879; SI-NEXT:    v_div_scale_f32 v9, s[4:5], v1, v1, v5
1880; SI-NEXT:    v_rcp_f32_e32 v10, v9
1881; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
1882; SI-NEXT:    v_fma_f32 v11, -v9, v10, 1.0
1883; SI-NEXT:    v_fma_f32 v10, v11, v10, v10
1884; SI-NEXT:    v_mul_f32_e32 v11, v8, v10
1885; SI-NEXT:    v_fma_f32 v12, -v9, v11, v8
1886; SI-NEXT:    v_fma_f32 v11, v12, v10, v11
1887; SI-NEXT:    v_fma_f32 v8, -v9, v11, v8
1888; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
1889; SI-NEXT:    v_div_fmas_f32 v8, v8, v10, v11
1890; SI-NEXT:    v_div_fixup_f32 v8, v8, v1, v5
1891; SI-NEXT:    v_trunc_f32_e32 v8, v8
1892; SI-NEXT:    v_fma_f32 v1, -v8, v1, v5
1893; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
1894; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
1895; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
1896; SI-NEXT:    v_div_scale_f32 v5, vcc, v4, v7, v4
1897; SI-NEXT:    v_div_scale_f32 v8, s[4:5], v7, v7, v4
1898; SI-NEXT:    v_rcp_f32_e32 v9, v8
1899; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
1900; SI-NEXT:    v_fma_f32 v10, -v8, v9, 1.0
1901; SI-NEXT:    v_fma_f32 v9, v10, v9, v9
1902; SI-NEXT:    v_mul_f32_e32 v10, v5, v9
1903; SI-NEXT:    v_fma_f32 v11, -v8, v10, v5
1904; SI-NEXT:    v_fma_f32 v10, v11, v9, v10
1905; SI-NEXT:    v_fma_f32 v5, -v8, v10, v5
1906; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
1907; SI-NEXT:    v_div_fmas_f32 v5, v5, v9, v10
1908; SI-NEXT:    v_div_fixup_f32 v5, v5, v7, v4
1909; SI-NEXT:    v_trunc_f32_e32 v5, v5
1910; SI-NEXT:    v_fma_f32 v4, -v5, v7, v4
1911; SI-NEXT:    v_cvt_f16_f32_e32 v4, v4
1912; SI-NEXT:    v_or_b32_e32 v1, v4, v1
1913; SI-NEXT:    v_div_scale_f32 v4, vcc, v3, v0, v3
1914; SI-NEXT:    v_div_scale_f32 v5, s[4:5], v0, v0, v3
1915; SI-NEXT:    v_rcp_f32_e32 v7, v5
1916; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
1917; SI-NEXT:    v_fma_f32 v8, -v5, v7, 1.0
1918; SI-NEXT:    v_fma_f32 v7, v8, v7, v7
1919; SI-NEXT:    v_mul_f32_e32 v8, v4, v7
1920; SI-NEXT:    v_fma_f32 v9, -v5, v8, v4
1921; SI-NEXT:    v_fma_f32 v8, v9, v7, v8
1922; SI-NEXT:    v_fma_f32 v4, -v5, v8, v4
1923; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
1924; SI-NEXT:    v_div_fmas_f32 v4, v4, v7, v8
1925; SI-NEXT:    v_div_fixup_f32 v4, v4, v0, v3
1926; SI-NEXT:    v_trunc_f32_e32 v4, v4
1927; SI-NEXT:    v_fma_f32 v0, -v4, v0, v3
1928; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
1929; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
1930; SI-NEXT:    v_div_scale_f32 v3, vcc, v2, v6, v2
1931; SI-NEXT:    v_div_scale_f32 v4, s[4:5], v6, v6, v2
1932; SI-NEXT:    v_rcp_f32_e32 v5, v4
1933; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
1934; SI-NEXT:    v_fma_f32 v7, -v4, v5, 1.0
1935; SI-NEXT:    v_fma_f32 v5, v7, v5, v5
1936; SI-NEXT:    v_mul_f32_e32 v7, v3, v5
1937; SI-NEXT:    v_fma_f32 v8, -v4, v7, v3
1938; SI-NEXT:    v_fma_f32 v7, v8, v5, v7
1939; SI-NEXT:    v_fma_f32 v3, -v4, v7, v3
1940; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
1941; SI-NEXT:    v_div_fmas_f32 v3, v3, v5, v7
1942; SI-NEXT:    v_div_fixup_f32 v3, v3, v6, v2
1943; SI-NEXT:    v_trunc_f32_e32 v3, v3
1944; SI-NEXT:    v_fma_f32 v2, -v3, v6, v2
1945; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
1946; SI-NEXT:    v_or_b32_e32 v0, v2, v0
1947; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1948; SI-NEXT:    s_endpgm
1949;
1950; CI-LABEL: frem_v4f16:
1951; CI:       ; %bb.0:
1952; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
1953; CI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xd
1954; CI-NEXT:    s_mov_b32 s3, 0xf000
1955; CI-NEXT:    s_mov_b32 s2, -1
1956; CI-NEXT:    s_mov_b32 s10, s2
1957; CI-NEXT:    s_waitcnt lgkmcnt(0)
1958; CI-NEXT:    s_mov_b32 s0, s4
1959; CI-NEXT:    s_mov_b32 s1, s5
1960; CI-NEXT:    s_mov_b32 s4, s6
1961; CI-NEXT:    s_mov_b32 s5, s7
1962; CI-NEXT:    s_mov_b32 s6, s2
1963; CI-NEXT:    s_mov_b32 s7, s3
1964; CI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
1965; CI-NEXT:    s_mov_b32 s11, s3
1966; CI-NEXT:    s_waitcnt vmcnt(0)
1967; CI-NEXT:    v_cvt_f32_f16_e32 v2, v0
1968; CI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
1969; CI-NEXT:    v_cvt_f32_f16_e32 v3, v0
1970; CI-NEXT:    v_lshrrev_b32_e32 v0, 16, v1
1971; CI-NEXT:    v_cvt_f32_f16_e32 v4, v1
1972; CI-NEXT:    v_cvt_f32_f16_e32 v5, v0
1973; CI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0 offset:32
1974; CI-NEXT:    s_waitcnt vmcnt(0)
1975; CI-NEXT:    v_cvt_f32_f16_e32 v7, v1
1976; CI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
1977; CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
1978; CI-NEXT:    v_cvt_f32_f16_e32 v6, v0
1979; CI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
1980; CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
1981; CI-NEXT:    v_div_scale_f32 v9, s[4:5], v1, v1, v5
1982; CI-NEXT:    v_div_scale_f32 v8, vcc, v5, v1, v5
1983; CI-NEXT:    v_rcp_f32_e32 v10, v9
1984; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
1985; CI-NEXT:    v_fma_f32 v11, -v9, v10, 1.0
1986; CI-NEXT:    v_fma_f32 v10, v11, v10, v10
1987; CI-NEXT:    v_mul_f32_e32 v11, v8, v10
1988; CI-NEXT:    v_fma_f32 v12, -v9, v11, v8
1989; CI-NEXT:    v_fma_f32 v11, v12, v10, v11
1990; CI-NEXT:    v_fma_f32 v8, -v9, v11, v8
1991; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
1992; CI-NEXT:    v_div_fmas_f32 v8, v8, v10, v11
1993; CI-NEXT:    v_div_fixup_f32 v8, v8, v1, v5
1994; CI-NEXT:    v_trunc_f32_e32 v8, v8
1995; CI-NEXT:    v_fma_f32 v1, -v8, v1, v5
1996; CI-NEXT:    v_div_scale_f32 v8, s[4:5], v7, v7, v4
1997; CI-NEXT:    v_div_scale_f32 v5, vcc, v4, v7, v4
1998; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
1999; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
2000; CI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
2001; CI-NEXT:    v_rcp_f32_e32 v9, v8
2002; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
2003; CI-NEXT:    v_fma_f32 v10, -v8, v9, 1.0
2004; CI-NEXT:    v_fma_f32 v9, v10, v9, v9
2005; CI-NEXT:    v_mul_f32_e32 v10, v5, v9
2006; CI-NEXT:    v_fma_f32 v11, -v8, v10, v5
2007; CI-NEXT:    v_fma_f32 v10, v11, v9, v10
2008; CI-NEXT:    v_fma_f32 v5, -v8, v10, v5
2009; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
2010; CI-NEXT:    v_div_fmas_f32 v5, v5, v9, v10
2011; CI-NEXT:    v_div_fixup_f32 v5, v5, v7, v4
2012; CI-NEXT:    v_trunc_f32_e32 v5, v5
2013; CI-NEXT:    v_fma_f32 v4, -v5, v7, v4
2014; CI-NEXT:    v_div_scale_f32 v5, s[4:5], v0, v0, v3
2015; CI-NEXT:    v_cvt_f16_f32_e32 v4, v4
2016; CI-NEXT:    v_or_b32_e32 v1, v4, v1
2017; CI-NEXT:    v_div_scale_f32 v4, vcc, v3, v0, v3
2018; CI-NEXT:    v_rcp_f32_e32 v7, v5
2019; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
2020; CI-NEXT:    v_fma_f32 v8, -v5, v7, 1.0
2021; CI-NEXT:    v_fma_f32 v7, v8, v7, v7
2022; CI-NEXT:    v_mul_f32_e32 v8, v4, v7
2023; CI-NEXT:    v_fma_f32 v9, -v5, v8, v4
2024; CI-NEXT:    v_fma_f32 v8, v9, v7, v8
2025; CI-NEXT:    v_fma_f32 v4, -v5, v8, v4
2026; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
2027; CI-NEXT:    v_div_fmas_f32 v4, v4, v7, v8
2028; CI-NEXT:    v_div_fixup_f32 v4, v4, v0, v3
2029; CI-NEXT:    v_trunc_f32_e32 v4, v4
2030; CI-NEXT:    v_fma_f32 v0, -v4, v0, v3
2031; CI-NEXT:    v_div_scale_f32 v4, s[4:5], v6, v6, v2
2032; CI-NEXT:    v_div_scale_f32 v3, vcc, v2, v6, v2
2033; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
2034; CI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
2035; CI-NEXT:    v_rcp_f32_e32 v5, v4
2036; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
2037; CI-NEXT:    v_fma_f32 v7, -v4, v5, 1.0
2038; CI-NEXT:    v_fma_f32 v5, v7, v5, v5
2039; CI-NEXT:    v_mul_f32_e32 v7, v3, v5
2040; CI-NEXT:    v_fma_f32 v8, -v4, v7, v3
2041; CI-NEXT:    v_fma_f32 v7, v8, v5, v7
2042; CI-NEXT:    v_fma_f32 v3, -v4, v7, v3
2043; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
2044; CI-NEXT:    v_div_fmas_f32 v3, v3, v5, v7
2045; CI-NEXT:    v_div_fixup_f32 v3, v3, v6, v2
2046; CI-NEXT:    v_trunc_f32_e32 v3, v3
2047; CI-NEXT:    v_fma_f32 v2, -v3, v6, v2
2048; CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
2049; CI-NEXT:    v_or_b32_e32 v0, v2, v0
2050; CI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2051; CI-NEXT:    s_endpgm
2052;
2053; VI-LABEL: frem_v4f16:
2054; VI:       ; %bb.0:
2055; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2056; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
2057; VI-NEXT:    s_waitcnt lgkmcnt(0)
2058; VI-NEXT:    v_mov_b32_e32 v2, s6
2059; VI-NEXT:    s_add_u32 s0, s0, 32
2060; VI-NEXT:    s_addc_u32 s1, s1, 0
2061; VI-NEXT:    v_mov_b32_e32 v5, s1
2062; VI-NEXT:    v_mov_b32_e32 v4, s0
2063; VI-NEXT:    flat_load_dwordx2 v[4:5], v[4:5]
2064; VI-NEXT:    v_mov_b32_e32 v3, s7
2065; VI-NEXT:    flat_load_dwordx2 v[2:3], v[2:3]
2066; VI-NEXT:    v_mov_b32_e32 v0, s4
2067; VI-NEXT:    v_mov_b32_e32 v1, s5
2068; VI-NEXT:    s_waitcnt vmcnt(1)
2069; VI-NEXT:    v_lshrrev_b32_e32 v8, 16, v5
2070; VI-NEXT:    v_cvt_f32_f16_e32 v9, v8
2071; VI-NEXT:    s_waitcnt vmcnt(0)
2072; VI-NEXT:    v_lshrrev_b32_e32 v6, 16, v3
2073; VI-NEXT:    v_cvt_f32_f16_e32 v7, v6
2074; VI-NEXT:    v_rcp_f32_e32 v9, v9
2075; VI-NEXT:    v_mul_f32_e32 v7, v7, v9
2076; VI-NEXT:    v_cvt_f16_f32_e32 v7, v7
2077; VI-NEXT:    v_div_fixup_f16 v7, v7, v8, v6
2078; VI-NEXT:    v_trunc_f16_e32 v7, v7
2079; VI-NEXT:    v_fma_f16 v6, -v7, v8, v6
2080; VI-NEXT:    v_cvt_f32_f16_e32 v8, v5
2081; VI-NEXT:    v_cvt_f32_f16_e32 v7, v3
2082; VI-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
2083; VI-NEXT:    v_rcp_f32_e32 v8, v8
2084; VI-NEXT:    v_mul_f32_e32 v7, v7, v8
2085; VI-NEXT:    v_cvt_f16_f32_e32 v7, v7
2086; VI-NEXT:    v_div_fixup_f16 v7, v7, v5, v3
2087; VI-NEXT:    v_trunc_f16_e32 v7, v7
2088; VI-NEXT:    v_fma_f16 v3, -v7, v5, v3
2089; VI-NEXT:    v_lshrrev_b32_e32 v7, 16, v4
2090; VI-NEXT:    v_cvt_f32_f16_e32 v8, v7
2091; VI-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
2092; VI-NEXT:    v_or_b32_e32 v3, v3, v6
2093; VI-NEXT:    v_cvt_f32_f16_e32 v6, v5
2094; VI-NEXT:    v_rcp_f32_e32 v8, v8
2095; VI-NEXT:    v_mul_f32_e32 v6, v6, v8
2096; VI-NEXT:    v_cvt_f16_f32_e32 v6, v6
2097; VI-NEXT:    v_div_fixup_f16 v6, v6, v7, v5
2098; VI-NEXT:    v_trunc_f16_e32 v6, v6
2099; VI-NEXT:    v_fma_f16 v5, -v6, v7, v5
2100; VI-NEXT:    v_cvt_f32_f16_e32 v7, v4
2101; VI-NEXT:    v_cvt_f32_f16_e32 v6, v2
2102; VI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
2103; VI-NEXT:    v_rcp_f32_e32 v7, v7
2104; VI-NEXT:    v_mul_f32_e32 v6, v6, v7
2105; VI-NEXT:    v_cvt_f16_f32_e32 v6, v6
2106; VI-NEXT:    v_div_fixup_f16 v6, v6, v4, v2
2107; VI-NEXT:    v_trunc_f16_e32 v6, v6
2108; VI-NEXT:    v_fma_f16 v2, -v6, v4, v2
2109; VI-NEXT:    v_or_b32_e32 v2, v2, v5
2110; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
2111; VI-NEXT:    s_endpgm
2112;
2113; GFX9-LABEL: frem_v4f16:
2114; GFX9:       ; %bb.0:
2115; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2116; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
2117; GFX9-NEXT:    v_mov_b32_e32 v4, 0
2118; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2119; GFX9-NEXT:    global_load_dwordx2 v[0:1], v4, s[6:7]
2120; GFX9-NEXT:    global_load_dwordx2 v[2:3], v4, s[2:3] offset:32
2121; GFX9-NEXT:    s_waitcnt vmcnt(1)
2122; GFX9-NEXT:    v_cvt_f32_f16_e32 v5, v1
2123; GFX9-NEXT:    s_waitcnt vmcnt(0)
2124; GFX9-NEXT:    v_cvt_f32_f16_e32 v6, v3
2125; GFX9-NEXT:    v_rcp_f32_e32 v6, v6
2126; GFX9-NEXT:    v_mul_f32_e32 v5, v5, v6
2127; GFX9-NEXT:    v_cvt_f16_f32_e32 v5, v5
2128; GFX9-NEXT:    v_div_fixup_f16 v5, v5, v3, v1
2129; GFX9-NEXT:    v_trunc_f16_e32 v5, v5
2130; GFX9-NEXT:    v_fma_f16 v5, -v5, v3, v1
2131; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
2132; GFX9-NEXT:    v_cvt_f32_f16_e32 v7, v3
2133; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
2134; GFX9-NEXT:    v_cvt_f32_f16_e32 v6, v1
2135; GFX9-NEXT:    v_rcp_f32_e32 v7, v7
2136; GFX9-NEXT:    v_mul_f32_e32 v6, v6, v7
2137; GFX9-NEXT:    v_cvt_f16_f32_e32 v6, v6
2138; GFX9-NEXT:    v_div_fixup_f16 v6, v6, v3, v1
2139; GFX9-NEXT:    v_trunc_f16_e32 v6, v6
2140; GFX9-NEXT:    v_fma_f16 v1, -v6, v3, v1
2141; GFX9-NEXT:    v_pack_b32_f16 v1, v5, v1
2142; GFX9-NEXT:    v_cvt_f32_f16_e32 v5, v2
2143; GFX9-NEXT:    v_cvt_f32_f16_e32 v3, v0
2144; GFX9-NEXT:    v_rcp_f32_e32 v5, v5
2145; GFX9-NEXT:    v_mul_f32_e32 v3, v3, v5
2146; GFX9-NEXT:    v_cvt_f16_f32_e32 v3, v3
2147; GFX9-NEXT:    v_div_fixup_f16 v3, v3, v2, v0
2148; GFX9-NEXT:    v_trunc_f16_e32 v3, v3
2149; GFX9-NEXT:    v_fma_f16 v3, -v3, v2, v0
2150; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
2151; GFX9-NEXT:    v_cvt_f32_f16_e32 v6, v2
2152; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
2153; GFX9-NEXT:    v_cvt_f32_f16_e32 v5, v0
2154; GFX9-NEXT:    v_rcp_f32_e32 v6, v6
2155; GFX9-NEXT:    v_mul_f32_e32 v5, v5, v6
2156; GFX9-NEXT:    v_cvt_f16_f32_e32 v5, v5
2157; GFX9-NEXT:    v_div_fixup_f16 v5, v5, v2, v0
2158; GFX9-NEXT:    v_trunc_f16_e32 v5, v5
2159; GFX9-NEXT:    v_fma_f16 v0, -v5, v2, v0
2160; GFX9-NEXT:    v_pack_b32_f16 v0, v3, v0
2161; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[4:5]
2162; GFX9-NEXT:    s_endpgm
2163;
2164; GFX10-LABEL: frem_v4f16:
2165; GFX10:       ; %bb.0:
2166; GFX10-NEXT:    s_clause 0x1
2167; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2168; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
2169; GFX10-NEXT:    v_mov_b32_e32 v4, 0
2170; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
2171; GFX10-NEXT:    s_clause 0x1
2172; GFX10-NEXT:    global_load_dwordx2 v[0:1], v4, s[6:7]
2173; GFX10-NEXT:    global_load_dwordx2 v[2:3], v4, s[2:3] offset:32
2174; GFX10-NEXT:    s_waitcnt vmcnt(1)
2175; GFX10-NEXT:    v_cvt_f32_f16_e32 v5, v1
2176; GFX10-NEXT:    s_waitcnt vmcnt(0)
2177; GFX10-NEXT:    v_cvt_f32_f16_e32 v6, v3
2178; GFX10-NEXT:    v_rcp_f32_e32 v6, v6
2179; GFX10-NEXT:    v_mul_f32_e32 v5, v5, v6
2180; GFX10-NEXT:    v_cvt_f16_f32_e32 v5, v5
2181; GFX10-NEXT:    v_div_fixup_f16 v5, v5, v3, v1
2182; GFX10-NEXT:    v_trunc_f16_e32 v5, v5
2183; GFX10-NEXT:    v_fma_f16 v5, -v5, v3, v1
2184; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
2185; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
2186; GFX10-NEXT:    v_cvt_f32_f16_e32 v7, v3
2187; GFX10-NEXT:    v_cvt_f32_f16_e32 v6, v1
2188; GFX10-NEXT:    v_rcp_f32_e32 v7, v7
2189; GFX10-NEXT:    v_mul_f32_e32 v6, v6, v7
2190; GFX10-NEXT:    v_cvt_f16_f32_e32 v6, v6
2191; GFX10-NEXT:    v_div_fixup_f16 v6, v6, v3, v1
2192; GFX10-NEXT:    v_trunc_f16_e32 v6, v6
2193; GFX10-NEXT:    v_fma_f16 v1, -v6, v3, v1
2194; GFX10-NEXT:    v_cvt_f32_f16_e32 v3, v0
2195; GFX10-NEXT:    v_pack_b32_f16 v1, v5, v1
2196; GFX10-NEXT:    v_cvt_f32_f16_e32 v5, v2
2197; GFX10-NEXT:    v_rcp_f32_e32 v5, v5
2198; GFX10-NEXT:    v_mul_f32_e32 v3, v3, v5
2199; GFX10-NEXT:    v_cvt_f16_f32_e32 v3, v3
2200; GFX10-NEXT:    v_div_fixup_f16 v3, v3, v2, v0
2201; GFX10-NEXT:    v_trunc_f16_e32 v3, v3
2202; GFX10-NEXT:    v_fma_f16 v3, -v3, v2, v0
2203; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
2204; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
2205; GFX10-NEXT:    v_cvt_f32_f16_e32 v6, v2
2206; GFX10-NEXT:    v_cvt_f32_f16_e32 v5, v0
2207; GFX10-NEXT:    v_rcp_f32_e32 v6, v6
2208; GFX10-NEXT:    v_mul_f32_e32 v5, v5, v6
2209; GFX10-NEXT:    v_cvt_f16_f32_e32 v5, v5
2210; GFX10-NEXT:    v_div_fixup_f16 v5, v5, v2, v0
2211; GFX10-NEXT:    v_trunc_f16_e32 v5, v5
2212; GFX10-NEXT:    v_fma_f16 v0, -v5, v2, v0
2213; GFX10-NEXT:    v_pack_b32_f16 v0, v3, v0
2214; GFX10-NEXT:    global_store_dwordx2 v4, v[0:1], s[4:5]
2215; GFX10-NEXT:    s_endpgm
2216;
2217; GFX11-LABEL: frem_v4f16:
2218; GFX11:       ; %bb.0:
2219; GFX11-NEXT:    s_clause 0x1
2220; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
2221; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
2222; GFX11-NEXT:    v_mov_b32_e32 v4, 0
2223; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2224; GFX11-NEXT:    s_clause 0x1
2225; GFX11-NEXT:    global_load_b64 v[0:1], v4, s[6:7]
2226; GFX11-NEXT:    global_load_b64 v[2:3], v4, s[0:1] offset:32
2227; GFX11-NEXT:    s_waitcnt vmcnt(1)
2228; GFX11-NEXT:    v_cvt_f32_f16_e32 v5, v1
2229; GFX11-NEXT:    s_waitcnt vmcnt(0)
2230; GFX11-NEXT:    v_cvt_f32_f16_e32 v6, v3
2231; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
2232; GFX11-NEXT:    v_rcp_f32_e32 v6, v6
2233; GFX11-NEXT:    s_waitcnt_depctr 0xfff
2234; GFX11-NEXT:    v_mul_f32_e32 v5, v5, v6
2235; GFX11-NEXT:    v_cvt_f16_f32_e32 v5, v5
2236; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2237; GFX11-NEXT:    v_div_fixup_f16 v5, v5, v3, v1
2238; GFX11-NEXT:    v_trunc_f16_e32 v5, v5
2239; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
2240; GFX11-NEXT:    v_fma_f16 v5, -v5, v3, v1
2241; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
2242; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
2243; GFX11-NEXT:    v_cvt_f32_f16_e32 v7, v3
2244; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
2245; GFX11-NEXT:    v_cvt_f32_f16_e32 v6, v1
2246; GFX11-NEXT:    v_rcp_f32_e32 v7, v7
2247; GFX11-NEXT:    s_waitcnt_depctr 0xfff
2248; GFX11-NEXT:    v_mul_f32_e32 v6, v6, v7
2249; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2250; GFX11-NEXT:    v_cvt_f16_f32_e32 v6, v6
2251; GFX11-NEXT:    v_div_fixup_f16 v6, v6, v3, v1
2252; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2253; GFX11-NEXT:    v_trunc_f16_e32 v6, v6
2254; GFX11-NEXT:    v_fma_f16 v1, -v6, v3, v1
2255; GFX11-NEXT:    v_cvt_f32_f16_e32 v3, v0
2256; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
2257; GFX11-NEXT:    v_pack_b32_f16 v1, v5, v1
2258; GFX11-NEXT:    v_cvt_f32_f16_e32 v5, v2
2259; GFX11-NEXT:    v_rcp_f32_e32 v5, v5
2260; GFX11-NEXT:    s_waitcnt_depctr 0xfff
2261; GFX11-NEXT:    v_mul_f32_e32 v3, v3, v5
2262; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2263; GFX11-NEXT:    v_cvt_f16_f32_e32 v3, v3
2264; GFX11-NEXT:    v_div_fixup_f16 v3, v3, v2, v0
2265; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2266; GFX11-NEXT:    v_trunc_f16_e32 v3, v3
2267; GFX11-NEXT:    v_fma_f16 v3, -v3, v2, v0
2268; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
2269; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
2270; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
2271; GFX11-NEXT:    v_cvt_f32_f16_e32 v6, v2
2272; GFX11-NEXT:    v_cvt_f32_f16_e32 v5, v0
2273; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
2274; GFX11-NEXT:    v_rcp_f32_e32 v6, v6
2275; GFX11-NEXT:    s_waitcnt_depctr 0xfff
2276; GFX11-NEXT:    v_mul_f32_e32 v5, v5, v6
2277; GFX11-NEXT:    v_cvt_f16_f32_e32 v5, v5
2278; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2279; GFX11-NEXT:    v_div_fixup_f16 v5, v5, v2, v0
2280; GFX11-NEXT:    v_trunc_f16_e32 v5, v5
2281; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2282; GFX11-NEXT:    v_fma_f16 v0, -v5, v2, v0
2283; GFX11-NEXT:    v_pack_b32_f16 v0, v3, v0
2284; GFX11-NEXT:    global_store_b64 v4, v[0:1], s[4:5]
2285; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2286; GFX11-NEXT:    s_endpgm
2287                        <4 x half> addrspace(1)* %in2) #0 {
2288   %gep2 = getelementptr <4 x half>, <4 x half> addrspace(1)* %in2, i32 4
2289   %r0 = load <4 x half>, <4 x half> addrspace(1)* %in1, align 16
2290   %r1 = load <4 x half>, <4 x half> addrspace(1)* %gep2, align 16
2291   %r2 = frem <4 x half> %r0, %r1
2292   store <4 x half> %r2, <4 x half> addrspace(1)* %out, align 16
2293   ret void
2294}
2295
; frem_v2f32 - frem on <2 x float> operands that are loaded from global memory.
; The dividend is read from %in1 and the divisor from element index 4 of %in2,
; which shows up as a 32-byte offset in the expected output below. The result
; is stored to %out.
; In the expected output every lane is expanded into an f32 division sequence
; (v_div_scale, v_rcp, v_fma refinement, v_div_fmas, v_div_fixup) followed by
; v_trunc and a final v_fma that forms x - trunc(x / y) * y. The refinement
; FMAs are bracketed by a pair of MODE register writes (s_setreg_imm32_b32 or
; s_denorm_mode, depending on the target).
; The assertions are autogenerated (see the NOTE on the first line of this
; file) - regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
2296define amdgpu_kernel void @frem_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in1,
2297; SI-LABEL: frem_v2f32:
2298; SI:       ; %bb.0:
2299; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
2300; SI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xd
2301; SI-NEXT:    s_mov_b32 s3, 0xf000
2302; SI-NEXT:    s_mov_b32 s2, -1
2303; SI-NEXT:    s_waitcnt lgkmcnt(0)
2304; SI-NEXT:    s_mov_b32 s0, s4
2305; SI-NEXT:    s_mov_b32 s1, s5
2306; SI-NEXT:    s_mov_b32 s4, s6
2307; SI-NEXT:    s_mov_b32 s5, s7
2308; SI-NEXT:    s_mov_b32 s6, s2
2309; SI-NEXT:    s_mov_b32 s7, s3
2310; SI-NEXT:    s_mov_b32 s10, s2
2311; SI-NEXT:    s_mov_b32 s11, s3
2312; SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
2313; SI-NEXT:    buffer_load_dwordx2 v[2:3], off, s[8:11], 0 offset:32
2314; SI-NEXT:    s_waitcnt vmcnt(0)
2315; SI-NEXT:    v_div_scale_f32 v4, vcc, v1, v3, v1
2316; SI-NEXT:    v_div_scale_f32 v5, s[4:5], v3, v3, v1
2317; SI-NEXT:    v_rcp_f32_e32 v6, v5
2318; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
2319; SI-NEXT:    v_fma_f32 v7, -v5, v6, 1.0
2320; SI-NEXT:    v_fma_f32 v6, v7, v6, v6
2321; SI-NEXT:    v_mul_f32_e32 v7, v4, v6
2322; SI-NEXT:    v_fma_f32 v8, -v5, v7, v4
2323; SI-NEXT:    v_fma_f32 v7, v8, v6, v7
2324; SI-NEXT:    v_fma_f32 v4, -v5, v7, v4
2325; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
2326; SI-NEXT:    v_div_fmas_f32 v4, v4, v6, v7
2327; SI-NEXT:    v_div_fixup_f32 v4, v4, v3, v1
2328; SI-NEXT:    v_trunc_f32_e32 v4, v4
2329; SI-NEXT:    v_fma_f32 v1, -v4, v3, v1
2330; SI-NEXT:    v_div_scale_f32 v3, vcc, v0, v2, v0
2331; SI-NEXT:    v_div_scale_f32 v4, s[4:5], v2, v2, v0
2332; SI-NEXT:    v_rcp_f32_e32 v5, v4
2333; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
2334; SI-NEXT:    v_fma_f32 v6, -v4, v5, 1.0
2335; SI-NEXT:    v_fma_f32 v5, v6, v5, v5
2336; SI-NEXT:    v_mul_f32_e32 v6, v3, v5
2337; SI-NEXT:    v_fma_f32 v7, -v4, v6, v3
2338; SI-NEXT:    v_fma_f32 v6, v7, v5, v6
2339; SI-NEXT:    v_fma_f32 v3, -v4, v6, v3
2340; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
2341; SI-NEXT:    v_div_fmas_f32 v3, v3, v5, v6
2342; SI-NEXT:    v_div_fixup_f32 v3, v3, v2, v0
2343; SI-NEXT:    v_trunc_f32_e32 v3, v3
2344; SI-NEXT:    v_fma_f32 v0, -v3, v2, v0
2345; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2346; SI-NEXT:    s_endpgm
2347;
2348; CI-LABEL: frem_v2f32:
2349; CI:       ; %bb.0:
2350; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
2351; CI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xd
2352; CI-NEXT:    s_mov_b32 s3, 0xf000
2353; CI-NEXT:    s_mov_b32 s2, -1
2354; CI-NEXT:    s_mov_b32 s10, s2
2355; CI-NEXT:    s_waitcnt lgkmcnt(0)
2356; CI-NEXT:    s_mov_b32 s0, s4
2357; CI-NEXT:    s_mov_b32 s1, s5
2358; CI-NEXT:    s_mov_b32 s4, s6
2359; CI-NEXT:    s_mov_b32 s5, s7
2360; CI-NEXT:    s_mov_b32 s6, s2
2361; CI-NEXT:    s_mov_b32 s7, s3
2362; CI-NEXT:    s_mov_b32 s11, s3
2363; CI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
2364; CI-NEXT:    buffer_load_dwordx2 v[2:3], off, s[8:11], 0 offset:32
2365; CI-NEXT:    s_waitcnt vmcnt(0)
2366; CI-NEXT:    v_div_scale_f32 v5, s[4:5], v3, v3, v1
2367; CI-NEXT:    v_div_scale_f32 v4, vcc, v1, v3, v1
2368; CI-NEXT:    v_rcp_f32_e32 v6, v5
2369; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
2370; CI-NEXT:    v_fma_f32 v7, -v5, v6, 1.0
2371; CI-NEXT:    v_fma_f32 v6, v7, v6, v6
2372; CI-NEXT:    v_mul_f32_e32 v7, v4, v6
2373; CI-NEXT:    v_fma_f32 v8, -v5, v7, v4
2374; CI-NEXT:    v_fma_f32 v7, v8, v6, v7
2375; CI-NEXT:    v_fma_f32 v4, -v5, v7, v4
2376; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
2377; CI-NEXT:    v_div_fmas_f32 v4, v4, v6, v7
2378; CI-NEXT:    v_div_fixup_f32 v4, v4, v3, v1
2379; CI-NEXT:    v_trunc_f32_e32 v4, v4
2380; CI-NEXT:    v_fma_f32 v1, -v4, v3, v1
2381; CI-NEXT:    v_div_scale_f32 v4, s[4:5], v2, v2, v0
2382; CI-NEXT:    v_div_scale_f32 v3, vcc, v0, v2, v0
2383; CI-NEXT:    v_rcp_f32_e32 v5, v4
2384; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
2385; CI-NEXT:    v_fma_f32 v6, -v4, v5, 1.0
2386; CI-NEXT:    v_fma_f32 v5, v6, v5, v5
2387; CI-NEXT:    v_mul_f32_e32 v6, v3, v5
2388; CI-NEXT:    v_fma_f32 v7, -v4, v6, v3
2389; CI-NEXT:    v_fma_f32 v6, v7, v5, v6
2390; CI-NEXT:    v_fma_f32 v3, -v4, v6, v3
2391; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
2392; CI-NEXT:    v_div_fmas_f32 v3, v3, v5, v6
2393; CI-NEXT:    v_div_fixup_f32 v3, v3, v2, v0
2394; CI-NEXT:    v_trunc_f32_e32 v3, v3
2395; CI-NEXT:    v_fma_f32 v0, -v3, v2, v0
2396; CI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2397; CI-NEXT:    s_endpgm
2398;
2399; VI-LABEL: frem_v2f32:
2400; VI:       ; %bb.0:
2401; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2402; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
2403; VI-NEXT:    s_waitcnt lgkmcnt(0)
2404; VI-NEXT:    v_mov_b32_e32 v2, s6
2405; VI-NEXT:    s_add_u32 s0, s0, 32
2406; VI-NEXT:    s_addc_u32 s1, s1, 0
2407; VI-NEXT:    v_mov_b32_e32 v5, s1
2408; VI-NEXT:    v_mov_b32_e32 v3, s7
2409; VI-NEXT:    v_mov_b32_e32 v4, s0
2410; VI-NEXT:    flat_load_dwordx2 v[2:3], v[2:3]
2411; VI-NEXT:    flat_load_dwordx2 v[4:5], v[4:5]
2412; VI-NEXT:    v_mov_b32_e32 v0, s4
2413; VI-NEXT:    v_mov_b32_e32 v1, s5
2414; VI-NEXT:    s_waitcnt vmcnt(0)
2415; VI-NEXT:    v_div_scale_f32 v7, s[0:1], v5, v5, v3
2416; VI-NEXT:    v_div_scale_f32 v6, vcc, v3, v5, v3
2417; VI-NEXT:    v_rcp_f32_e32 v8, v7
2418; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
2419; VI-NEXT:    v_fma_f32 v9, -v7, v8, 1.0
2420; VI-NEXT:    v_fma_f32 v8, v9, v8, v8
2421; VI-NEXT:    v_mul_f32_e32 v9, v6, v8
2422; VI-NEXT:    v_fma_f32 v10, -v7, v9, v6
2423; VI-NEXT:    v_fma_f32 v9, v10, v8, v9
2424; VI-NEXT:    v_fma_f32 v6, -v7, v9, v6
2425; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
2426; VI-NEXT:    v_div_fmas_f32 v6, v6, v8, v9
2427; VI-NEXT:    v_div_fixup_f32 v6, v6, v5, v3
2428; VI-NEXT:    v_trunc_f32_e32 v6, v6
2429; VI-NEXT:    v_fma_f32 v3, -v6, v5, v3
2430; VI-NEXT:    v_div_scale_f32 v6, s[0:1], v4, v4, v2
2431; VI-NEXT:    v_div_scale_f32 v5, vcc, v2, v4, v2
2432; VI-NEXT:    v_rcp_f32_e32 v7, v6
2433; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
2434; VI-NEXT:    v_fma_f32 v8, -v6, v7, 1.0
2435; VI-NEXT:    v_fma_f32 v7, v8, v7, v7
2436; VI-NEXT:    v_mul_f32_e32 v8, v5, v7
2437; VI-NEXT:    v_fma_f32 v9, -v6, v8, v5
2438; VI-NEXT:    v_fma_f32 v8, v9, v7, v8
2439; VI-NEXT:    v_fma_f32 v5, -v6, v8, v5
2440; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
2441; VI-NEXT:    v_div_fmas_f32 v5, v5, v7, v8
2442; VI-NEXT:    v_div_fixup_f32 v5, v5, v4, v2
2443; VI-NEXT:    v_trunc_f32_e32 v5, v5
2444; VI-NEXT:    v_fma_f32 v2, -v5, v4, v2
2445; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
2446; VI-NEXT:    s_endpgm
2447;
2448; GFX9-LABEL: frem_v2f32:
2449; GFX9:       ; %bb.0:
2450; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2451; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
2452; GFX9-NEXT:    v_mov_b32_e32 v4, 0
2453; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2454; GFX9-NEXT:    global_load_dwordx2 v[0:1], v4, s[6:7]
2455; GFX9-NEXT:    global_load_dwordx2 v[2:3], v4, s[2:3] offset:32
2456; GFX9-NEXT:    s_waitcnt vmcnt(0)
2457; GFX9-NEXT:    v_div_scale_f32 v6, s[0:1], v3, v3, v1
2458; GFX9-NEXT:    v_div_scale_f32 v5, vcc, v1, v3, v1
2459; GFX9-NEXT:    v_rcp_f32_e32 v7, v6
2460; GFX9-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
2461; GFX9-NEXT:    v_fma_f32 v8, -v6, v7, 1.0
2462; GFX9-NEXT:    v_fma_f32 v7, v8, v7, v7
2463; GFX9-NEXT:    v_mul_f32_e32 v8, v5, v7
2464; GFX9-NEXT:    v_fma_f32 v9, -v6, v8, v5
2465; GFX9-NEXT:    v_fma_f32 v8, v9, v7, v8
2466; GFX9-NEXT:    v_fma_f32 v5, -v6, v8, v5
2467; GFX9-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
2468; GFX9-NEXT:    v_div_fmas_f32 v5, v5, v7, v8
2469; GFX9-NEXT:    v_div_fixup_f32 v5, v5, v3, v1
2470; GFX9-NEXT:    v_trunc_f32_e32 v5, v5
2471; GFX9-NEXT:    v_fma_f32 v1, -v5, v3, v1
2472; GFX9-NEXT:    v_div_scale_f32 v5, s[0:1], v2, v2, v0
2473; GFX9-NEXT:    v_div_scale_f32 v3, vcc, v0, v2, v0
2474; GFX9-NEXT:    v_rcp_f32_e32 v6, v5
2475; GFX9-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
2476; GFX9-NEXT:    v_fma_f32 v7, -v5, v6, 1.0
2477; GFX9-NEXT:    v_fma_f32 v6, v7, v6, v6
2478; GFX9-NEXT:    v_mul_f32_e32 v7, v3, v6
2479; GFX9-NEXT:    v_fma_f32 v8, -v5, v7, v3
2480; GFX9-NEXT:    v_fma_f32 v7, v8, v6, v7
2481; GFX9-NEXT:    v_fma_f32 v3, -v5, v7, v3
2482; GFX9-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
2483; GFX9-NEXT:    v_div_fmas_f32 v3, v3, v6, v7
2484; GFX9-NEXT:    v_div_fixup_f32 v3, v3, v2, v0
2485; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
2486; GFX9-NEXT:    v_fma_f32 v0, -v3, v2, v0
2487; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[4:5]
2488; GFX9-NEXT:    s_endpgm
2489;
2490; GFX10-LABEL: frem_v2f32:
2491; GFX10:       ; %bb.0:
2492; GFX10-NEXT:    s_clause 0x1
2493; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2494; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
2495; GFX10-NEXT:    v_mov_b32_e32 v4, 0
2496; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
2497; GFX10-NEXT:    s_clause 0x1
2498; GFX10-NEXT:    global_load_dwordx2 v[0:1], v4, s[6:7]
2499; GFX10-NEXT:    global_load_dwordx2 v[2:3], v4, s[2:3] offset:32
2500; GFX10-NEXT:    s_waitcnt vmcnt(0)
2501; GFX10-NEXT:    v_div_scale_f32 v6, s0, v3, v3, v1
2502; GFX10-NEXT:    v_div_scale_f32 v5, vcc_lo, v1, v3, v1
2503; GFX10-NEXT:    v_rcp_f32_e32 v7, v6
2504; GFX10-NEXT:    s_denorm_mode 15
2505; GFX10-NEXT:    v_fma_f32 v8, -v6, v7, 1.0
2506; GFX10-NEXT:    v_fmac_f32_e32 v7, v8, v7
2507; GFX10-NEXT:    v_mul_f32_e32 v8, v5, v7
2508; GFX10-NEXT:    v_fma_f32 v9, -v6, v8, v5
2509; GFX10-NEXT:    v_fmac_f32_e32 v8, v9, v7
2510; GFX10-NEXT:    v_fma_f32 v5, -v6, v8, v5
2511; GFX10-NEXT:    s_denorm_mode 12
2512; GFX10-NEXT:    v_div_fmas_f32 v5, v5, v7, v8
2513; GFX10-NEXT:    v_div_fixup_f32 v5, v5, v3, v1
2514; GFX10-NEXT:    v_trunc_f32_e32 v5, v5
2515; GFX10-NEXT:    v_fma_f32 v1, -v5, v3, v1
2516; GFX10-NEXT:    v_div_scale_f32 v5, s0, v2, v2, v0
2517; GFX10-NEXT:    v_div_scale_f32 v3, vcc_lo, v0, v2, v0
2518; GFX10-NEXT:    v_rcp_f32_e32 v6, v5
2519; GFX10-NEXT:    s_denorm_mode 15
2520; GFX10-NEXT:    v_fma_f32 v7, -v5, v6, 1.0
2521; GFX10-NEXT:    v_fmac_f32_e32 v6, v7, v6
2522; GFX10-NEXT:    v_mul_f32_e32 v7, v3, v6
2523; GFX10-NEXT:    v_fma_f32 v8, -v5, v7, v3
2524; GFX10-NEXT:    v_fmac_f32_e32 v7, v8, v6
2525; GFX10-NEXT:    v_fma_f32 v3, -v5, v7, v3
2526; GFX10-NEXT:    s_denorm_mode 12
2527; GFX10-NEXT:    v_div_fmas_f32 v3, v3, v6, v7
2528; GFX10-NEXT:    v_div_fixup_f32 v3, v3, v2, v0
2529; GFX10-NEXT:    v_trunc_f32_e32 v3, v3
2530; GFX10-NEXT:    v_fma_f32 v0, -v3, v2, v0
2531; GFX10-NEXT:    global_store_dwordx2 v4, v[0:1], s[4:5]
2532; GFX10-NEXT:    s_endpgm
2533;
2534; GFX11-LABEL: frem_v2f32:
2535; GFX11:       ; %bb.0:
2536; GFX11-NEXT:    s_clause 0x1
2537; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
2538; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
2539; GFX11-NEXT:    v_mov_b32_e32 v4, 0
2540; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2541; GFX11-NEXT:    s_clause 0x1
2542; GFX11-NEXT:    global_load_b64 v[0:1], v4, s[6:7]
2543; GFX11-NEXT:    global_load_b64 v[2:3], v4, s[0:1] offset:32
2544; GFX11-NEXT:    s_waitcnt vmcnt(0)
2545; GFX11-NEXT:    v_div_scale_f32 v6, null, v3, v3, v1
2546; GFX11-NEXT:    v_div_scale_f32 v5, vcc_lo, v1, v3, v1
2547; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
2548; GFX11-NEXT:    v_rcp_f32_e32 v7, v6
2549; GFX11-NEXT:    s_denorm_mode 15
2550; GFX11-NEXT:    s_waitcnt_depctr 0xfff
2551; GFX11-NEXT:    v_fma_f32 v8, -v6, v7, 1.0
2552; GFX11-NEXT:    v_fmac_f32_e32 v7, v8, v7
2553; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2554; GFX11-NEXT:    v_mul_f32_e32 v8, v5, v7
2555; GFX11-NEXT:    v_fma_f32 v9, -v6, v8, v5
2556; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2557; GFX11-NEXT:    v_fmac_f32_e32 v8, v9, v7
2558; GFX11-NEXT:    v_fma_f32 v5, -v6, v8, v5
2559; GFX11-NEXT:    s_denorm_mode 12
2560; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2561; GFX11-NEXT:    v_div_fmas_f32 v5, v5, v7, v8
2562; GFX11-NEXT:    v_div_fixup_f32 v5, v5, v3, v1
2563; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2564; GFX11-NEXT:    v_trunc_f32_e32 v5, v5
2565; GFX11-NEXT:    v_fma_f32 v1, -v5, v3, v1
2566; GFX11-NEXT:    v_div_scale_f32 v5, null, v2, v2, v0
2567; GFX11-NEXT:    v_div_scale_f32 v3, vcc_lo, v0, v2, v0
2568; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
2569; GFX11-NEXT:    v_rcp_f32_e32 v6, v5
2570; GFX11-NEXT:    s_denorm_mode 15
2571; GFX11-NEXT:    s_waitcnt_depctr 0xfff
2572; GFX11-NEXT:    v_fma_f32 v7, -v5, v6, 1.0
2573; GFX11-NEXT:    v_fmac_f32_e32 v6, v7, v6
2574; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2575; GFX11-NEXT:    v_mul_f32_e32 v7, v3, v6
2576; GFX11-NEXT:    v_fma_f32 v8, -v5, v7, v3
2577; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2578; GFX11-NEXT:    v_fmac_f32_e32 v7, v8, v6
2579; GFX11-NEXT:    v_fma_f32 v3, -v5, v7, v3
2580; GFX11-NEXT:    s_denorm_mode 12
2581; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2582; GFX11-NEXT:    v_div_fmas_f32 v3, v3, v6, v7
2583; GFX11-NEXT:    v_div_fixup_f32 v3, v3, v2, v0
2584; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2585; GFX11-NEXT:    v_trunc_f32_e32 v3, v3
2586; GFX11-NEXT:    v_fma_f32 v0, -v3, v2, v0
2587; GFX11-NEXT:    global_store_b64 v4, v[0:1], s[4:5]
2588; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2589; GFX11-NEXT:    s_endpgm
2590                        <2 x float> addrspace(1)* %in2) #0 {
  ; Attribute group #0 is defined outside this part of the file.
  ; Divisor address - element index 4 of %in2, counted in <2 x float> units.
2591   %gep2 = getelementptr <2 x float>, <2 x float> addrspace(1)* %in2, i32 4
  ; %r0 is the dividend vector and %r1 the divisor vector of the frem below.
2592   %r0 = load <2 x float>, <2 x float> addrspace(1)* %in1, align 8
2593   %r1 = load <2 x float>, <2 x float> addrspace(1)* %gep2, align 8
  ; The operation under test - a two-lane floating-point remainder.
2594   %r2 = frem <2 x float> %r0, %r1
2595   store <2 x float> %r2, <2 x float> addrspace(1)* %out, align 8
2596   ret void
2597}
2598
2599define amdgpu_kernel void @frem_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in1,
2600; SI-LABEL: frem_v4f32:
2601; SI:       ; %bb.0:
2602; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
2603; SI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xd
2604; SI-NEXT:    s_mov_b32 s3, 0xf000
2605; SI-NEXT:    s_mov_b32 s2, -1
2606; SI-NEXT:    s_waitcnt lgkmcnt(0)
2607; SI-NEXT:    s_mov_b32 s0, s4
2608; SI-NEXT:    s_mov_b32 s1, s5
2609; SI-NEXT:    s_mov_b32 s4, s6
2610; SI-NEXT:    s_mov_b32 s5, s7
2611; SI-NEXT:    s_mov_b32 s6, s2
2612; SI-NEXT:    s_mov_b32 s7, s3
2613; SI-NEXT:    s_mov_b32 s10, s2
2614; SI-NEXT:    s_mov_b32 s11, s3
2615; SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
2616; SI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:64
2617; SI-NEXT:    s_waitcnt vmcnt(0)
2618; SI-NEXT:    v_div_scale_f32 v8, vcc, v3, v7, v3
2619; SI-NEXT:    v_div_scale_f32 v9, s[4:5], v7, v7, v3
2620; SI-NEXT:    v_rcp_f32_e32 v10, v9
2621; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
2622; SI-NEXT:    v_fma_f32 v11, -v9, v10, 1.0
2623; SI-NEXT:    v_fma_f32 v10, v11, v10, v10
2624; SI-NEXT:    v_mul_f32_e32 v11, v8, v10
2625; SI-NEXT:    v_fma_f32 v12, -v9, v11, v8
2626; SI-NEXT:    v_fma_f32 v11, v12, v10, v11
2627; SI-NEXT:    v_fma_f32 v8, -v9, v11, v8
2628; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
2629; SI-NEXT:    v_div_fmas_f32 v8, v8, v10, v11
2630; SI-NEXT:    v_div_fixup_f32 v8, v8, v7, v3
2631; SI-NEXT:    v_trunc_f32_e32 v8, v8
2632; SI-NEXT:    v_fma_f32 v3, -v8, v7, v3
2633; SI-NEXT:    v_div_scale_f32 v7, vcc, v2, v6, v2
2634; SI-NEXT:    v_div_scale_f32 v8, s[4:5], v6, v6, v2
2635; SI-NEXT:    v_rcp_f32_e32 v9, v8
2636; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
2637; SI-NEXT:    v_fma_f32 v10, -v8, v9, 1.0
2638; SI-NEXT:    v_fma_f32 v9, v10, v9, v9
2639; SI-NEXT:    v_mul_f32_e32 v10, v7, v9
2640; SI-NEXT:    v_fma_f32 v11, -v8, v10, v7
2641; SI-NEXT:    v_fma_f32 v10, v11, v9, v10
2642; SI-NEXT:    v_fma_f32 v7, -v8, v10, v7
2643; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
2644; SI-NEXT:    v_div_fmas_f32 v7, v7, v9, v10
2645; SI-NEXT:    v_div_fixup_f32 v7, v7, v6, v2
2646; SI-NEXT:    v_trunc_f32_e32 v7, v7
2647; SI-NEXT:    v_fma_f32 v2, -v7, v6, v2
2648; SI-NEXT:    v_div_scale_f32 v6, vcc, v1, v5, v1
2649; SI-NEXT:    v_div_scale_f32 v7, s[4:5], v5, v5, v1
2650; SI-NEXT:    v_rcp_f32_e32 v8, v7
2651; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
2652; SI-NEXT:    v_fma_f32 v9, -v7, v8, 1.0
2653; SI-NEXT:    v_fma_f32 v8, v9, v8, v8
2654; SI-NEXT:    v_mul_f32_e32 v9, v6, v8
2655; SI-NEXT:    v_fma_f32 v10, -v7, v9, v6
2656; SI-NEXT:    v_fma_f32 v9, v10, v8, v9
2657; SI-NEXT:    v_fma_f32 v6, -v7, v9, v6
2658; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
2659; SI-NEXT:    v_div_fmas_f32 v6, v6, v8, v9
2660; SI-NEXT:    v_div_fixup_f32 v6, v6, v5, v1
2661; SI-NEXT:    v_trunc_f32_e32 v6, v6
2662; SI-NEXT:    v_fma_f32 v1, -v6, v5, v1
2663; SI-NEXT:    v_div_scale_f32 v5, vcc, v0, v4, v0
2664; SI-NEXT:    v_div_scale_f32 v6, s[4:5], v4, v4, v0
2665; SI-NEXT:    v_rcp_f32_e32 v7, v6
2666; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
2667; SI-NEXT:    v_fma_f32 v8, -v6, v7, 1.0
2668; SI-NEXT:    v_fma_f32 v7, v8, v7, v7
2669; SI-NEXT:    v_mul_f32_e32 v8, v5, v7
2670; SI-NEXT:    v_fma_f32 v9, -v6, v8, v5
2671; SI-NEXT:    v_fma_f32 v8, v9, v7, v8
2672; SI-NEXT:    v_fma_f32 v5, -v6, v8, v5
2673; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
2674; SI-NEXT:    v_div_fmas_f32 v5, v5, v7, v8
2675; SI-NEXT:    v_div_fixup_f32 v5, v5, v4, v0
2676; SI-NEXT:    v_trunc_f32_e32 v5, v5
2677; SI-NEXT:    v_fma_f32 v0, -v5, v4, v0
2678; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
2679; SI-NEXT:    s_endpgm
2680;
2681; CI-LABEL: frem_v4f32:
2682; CI:       ; %bb.0:
2683; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
2684; CI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xd
2685; CI-NEXT:    s_mov_b32 s3, 0xf000
2686; CI-NEXT:    s_mov_b32 s2, -1
2687; CI-NEXT:    s_mov_b32 s10, s2
2688; CI-NEXT:    s_waitcnt lgkmcnt(0)
2689; CI-NEXT:    s_mov_b32 s0, s4
2690; CI-NEXT:    s_mov_b32 s1, s5
2691; CI-NEXT:    s_mov_b32 s4, s6
2692; CI-NEXT:    s_mov_b32 s5, s7
2693; CI-NEXT:    s_mov_b32 s6, s2
2694; CI-NEXT:    s_mov_b32 s7, s3
2695; CI-NEXT:    s_mov_b32 s11, s3
2696; CI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
2697; CI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:64
2698; CI-NEXT:    s_waitcnt vmcnt(0)
2699; CI-NEXT:    v_div_scale_f32 v9, s[4:5], v7, v7, v3
2700; CI-NEXT:    v_div_scale_f32 v8, vcc, v3, v7, v3
2701; CI-NEXT:    v_rcp_f32_e32 v10, v9
2702; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
2703; CI-NEXT:    v_fma_f32 v11, -v9, v10, 1.0
2704; CI-NEXT:    v_fma_f32 v10, v11, v10, v10
2705; CI-NEXT:    v_mul_f32_e32 v11, v8, v10
2706; CI-NEXT:    v_fma_f32 v12, -v9, v11, v8
2707; CI-NEXT:    v_fma_f32 v11, v12, v10, v11
2708; CI-NEXT:    v_fma_f32 v8, -v9, v11, v8
2709; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
2710; CI-NEXT:    v_div_fmas_f32 v8, v8, v10, v11
2711; CI-NEXT:    v_div_fixup_f32 v8, v8, v7, v3
2712; CI-NEXT:    v_trunc_f32_e32 v8, v8
2713; CI-NEXT:    v_fma_f32 v3, -v8, v7, v3
2714; CI-NEXT:    v_div_scale_f32 v8, s[4:5], v6, v6, v2
2715; CI-NEXT:    v_div_scale_f32 v7, vcc, v2, v6, v2
2716; CI-NEXT:    v_rcp_f32_e32 v9, v8
2717; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
2718; CI-NEXT:    v_fma_f32 v10, -v8, v9, 1.0
2719; CI-NEXT:    v_fma_f32 v9, v10, v9, v9
2720; CI-NEXT:    v_mul_f32_e32 v10, v7, v9
2721; CI-NEXT:    v_fma_f32 v11, -v8, v10, v7
2722; CI-NEXT:    v_fma_f32 v10, v11, v9, v10
2723; CI-NEXT:    v_fma_f32 v7, -v8, v10, v7
2724; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
2725; CI-NEXT:    v_div_fmas_f32 v7, v7, v9, v10
2726; CI-NEXT:    v_div_fixup_f32 v7, v7, v6, v2
2727; CI-NEXT:    v_trunc_f32_e32 v7, v7
2728; CI-NEXT:    v_fma_f32 v2, -v7, v6, v2
2729; CI-NEXT:    v_div_scale_f32 v7, s[4:5], v5, v5, v1
2730; CI-NEXT:    v_div_scale_f32 v6, vcc, v1, v5, v1
2731; CI-NEXT:    v_rcp_f32_e32 v8, v7
2732; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
2733; CI-NEXT:    v_fma_f32 v9, -v7, v8, 1.0
2734; CI-NEXT:    v_fma_f32 v8, v9, v8, v8
2735; CI-NEXT:    v_mul_f32_e32 v9, v6, v8
2736; CI-NEXT:    v_fma_f32 v10, -v7, v9, v6
2737; CI-NEXT:    v_fma_f32 v9, v10, v8, v9
2738; CI-NEXT:    v_fma_f32 v6, -v7, v9, v6
2739; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
2740; CI-NEXT:    v_div_fmas_f32 v6, v6, v8, v9
2741; CI-NEXT:    v_div_fixup_f32 v6, v6, v5, v1
2742; CI-NEXT:    v_trunc_f32_e32 v6, v6
2743; CI-NEXT:    v_fma_f32 v1, -v6, v5, v1
2744; CI-NEXT:    v_div_scale_f32 v6, s[4:5], v4, v4, v0
2745; CI-NEXT:    v_div_scale_f32 v5, vcc, v0, v4, v0
2746; CI-NEXT:    v_rcp_f32_e32 v7, v6
2747; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
2748; CI-NEXT:    v_fma_f32 v8, -v6, v7, 1.0
2749; CI-NEXT:    v_fma_f32 v7, v8, v7, v7
2750; CI-NEXT:    v_mul_f32_e32 v8, v5, v7
2751; CI-NEXT:    v_fma_f32 v9, -v6, v8, v5
2752; CI-NEXT:    v_fma_f32 v8, v9, v7, v8
2753; CI-NEXT:    v_fma_f32 v5, -v6, v8, v5
2754; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
2755; CI-NEXT:    v_div_fmas_f32 v5, v5, v7, v8
2756; CI-NEXT:    v_div_fixup_f32 v5, v5, v4, v0
2757; CI-NEXT:    v_trunc_f32_e32 v5, v5
2758; CI-NEXT:    v_fma_f32 v0, -v5, v4, v0
2759; CI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
2760; CI-NEXT:    s_endpgm
2761;
2762; VI-LABEL: frem_v4f32:
2763; VI:       ; %bb.0:
2764; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2765; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
2766; VI-NEXT:    s_waitcnt lgkmcnt(0)
2767; VI-NEXT:    v_mov_b32_e32 v0, s6
2768; VI-NEXT:    s_add_u32 s0, s0, 64
2769; VI-NEXT:    s_addc_u32 s1, s1, 0
2770; VI-NEXT:    v_mov_b32_e32 v5, s1
2771; VI-NEXT:    v_mov_b32_e32 v1, s7
2772; VI-NEXT:    v_mov_b32_e32 v4, s0
2773; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
2774; VI-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
2775; VI-NEXT:    v_mov_b32_e32 v8, s4
2776; VI-NEXT:    v_mov_b32_e32 v9, s5
2777; VI-NEXT:    s_waitcnt vmcnt(0)
2778; VI-NEXT:    v_div_scale_f32 v11, s[0:1], v7, v7, v3
2779; VI-NEXT:    v_div_scale_f32 v10, vcc, v3, v7, v3
2780; VI-NEXT:    v_rcp_f32_e32 v12, v11
2781; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
2782; VI-NEXT:    v_fma_f32 v13, -v11, v12, 1.0
2783; VI-NEXT:    v_fma_f32 v12, v13, v12, v12
2784; VI-NEXT:    v_mul_f32_e32 v13, v10, v12
2785; VI-NEXT:    v_fma_f32 v14, -v11, v13, v10
2786; VI-NEXT:    v_fma_f32 v13, v14, v12, v13
2787; VI-NEXT:    v_fma_f32 v10, -v11, v13, v10
2788; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
2789; VI-NEXT:    v_div_fmas_f32 v10, v10, v12, v13
2790; VI-NEXT:    v_div_fixup_f32 v10, v10, v7, v3
2791; VI-NEXT:    v_trunc_f32_e32 v10, v10
2792; VI-NEXT:    v_fma_f32 v3, -v10, v7, v3
2793; VI-NEXT:    v_div_scale_f32 v10, s[0:1], v6, v6, v2
2794; VI-NEXT:    v_div_scale_f32 v7, vcc, v2, v6, v2
2795; VI-NEXT:    v_rcp_f32_e32 v11, v10
2796; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
2797; VI-NEXT:    v_fma_f32 v12, -v10, v11, 1.0
2798; VI-NEXT:    v_fma_f32 v11, v12, v11, v11
2799; VI-NEXT:    v_mul_f32_e32 v12, v7, v11
2800; VI-NEXT:    v_fma_f32 v13, -v10, v12, v7
2801; VI-NEXT:    v_fma_f32 v12, v13, v11, v12
2802; VI-NEXT:    v_fma_f32 v7, -v10, v12, v7
2803; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
2804; VI-NEXT:    v_div_fmas_f32 v7, v7, v11, v12
2805; VI-NEXT:    v_div_fixup_f32 v7, v7, v6, v2
2806; VI-NEXT:    v_trunc_f32_e32 v7, v7
2807; VI-NEXT:    v_fma_f32 v2, -v7, v6, v2
2808; VI-NEXT:    v_div_scale_f32 v7, s[0:1], v5, v5, v1
2809; VI-NEXT:    v_div_scale_f32 v6, vcc, v1, v5, v1
2810; VI-NEXT:    v_rcp_f32_e32 v10, v7
2811; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
2812; VI-NEXT:    v_fma_f32 v11, -v7, v10, 1.0
2813; VI-NEXT:    v_fma_f32 v10, v11, v10, v10
2814; VI-NEXT:    v_mul_f32_e32 v11, v6, v10
2815; VI-NEXT:    v_fma_f32 v12, -v7, v11, v6
2816; VI-NEXT:    v_fma_f32 v11, v12, v10, v11
2817; VI-NEXT:    v_fma_f32 v6, -v7, v11, v6
2818; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
2819; VI-NEXT:    v_div_fmas_f32 v6, v6, v10, v11
2820; VI-NEXT:    v_div_fixup_f32 v6, v6, v5, v1
2821; VI-NEXT:    v_trunc_f32_e32 v6, v6
2822; VI-NEXT:    v_fma_f32 v1, -v6, v5, v1
2823; VI-NEXT:    v_div_scale_f32 v6, s[0:1], v4, v4, v0
2824; VI-NEXT:    v_div_scale_f32 v5, vcc, v0, v4, v0
2825; VI-NEXT:    v_rcp_f32_e32 v7, v6
2826; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
2827; VI-NEXT:    v_fma_f32 v10, -v6, v7, 1.0
2828; VI-NEXT:    v_fma_f32 v7, v10, v7, v7
2829; VI-NEXT:    v_mul_f32_e32 v10, v5, v7
2830; VI-NEXT:    v_fma_f32 v11, -v6, v10, v5
2831; VI-NEXT:    v_fma_f32 v10, v11, v7, v10
2832; VI-NEXT:    v_fma_f32 v5, -v6, v10, v5
2833; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
2834; VI-NEXT:    v_div_fmas_f32 v5, v5, v7, v10
2835; VI-NEXT:    v_div_fixup_f32 v5, v5, v4, v0
2836; VI-NEXT:    v_trunc_f32_e32 v5, v5
2837; VI-NEXT:    v_fma_f32 v0, -v5, v4, v0
2838; VI-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
2839; VI-NEXT:    s_endpgm
2840;
2841; GFX9-LABEL: frem_v4f32:
2842; GFX9:       ; %bb.0:
2843; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2844; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
2845; GFX9-NEXT:    v_mov_b32_e32 v8, 0
2846; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2847; GFX9-NEXT:    global_load_dwordx4 v[0:3], v8, s[6:7]
2848; GFX9-NEXT:    global_load_dwordx4 v[4:7], v8, s[2:3] offset:64
2849; GFX9-NEXT:    s_waitcnt vmcnt(0)
2850; GFX9-NEXT:    v_div_scale_f32 v10, s[0:1], v7, v7, v3
2851; GFX9-NEXT:    v_div_scale_f32 v9, vcc, v3, v7, v3
2852; GFX9-NEXT:    v_rcp_f32_e32 v11, v10
2853; GFX9-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
2854; GFX9-NEXT:    v_fma_f32 v12, -v10, v11, 1.0
2855; GFX9-NEXT:    v_fma_f32 v11, v12, v11, v11
2856; GFX9-NEXT:    v_mul_f32_e32 v12, v9, v11
2857; GFX9-NEXT:    v_fma_f32 v13, -v10, v12, v9
2858; GFX9-NEXT:    v_fma_f32 v12, v13, v11, v12
2859; GFX9-NEXT:    v_fma_f32 v9, -v10, v12, v9
2860; GFX9-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
2861; GFX9-NEXT:    v_div_fmas_f32 v9, v9, v11, v12
2862; GFX9-NEXT:    v_div_fixup_f32 v9, v9, v7, v3
2863; GFX9-NEXT:    v_trunc_f32_e32 v9, v9
2864; GFX9-NEXT:    v_fma_f32 v3, -v9, v7, v3
2865; GFX9-NEXT:    v_div_scale_f32 v9, s[0:1], v6, v6, v2
2866; GFX9-NEXT:    v_div_scale_f32 v7, vcc, v2, v6, v2
2867; GFX9-NEXT:    v_rcp_f32_e32 v10, v9
2868; GFX9-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
2869; GFX9-NEXT:    v_fma_f32 v11, -v9, v10, 1.0
2870; GFX9-NEXT:    v_fma_f32 v10, v11, v10, v10
2871; GFX9-NEXT:    v_mul_f32_e32 v11, v7, v10
2872; GFX9-NEXT:    v_fma_f32 v12, -v9, v11, v7
2873; GFX9-NEXT:    v_fma_f32 v11, v12, v10, v11
2874; GFX9-NEXT:    v_fma_f32 v7, -v9, v11, v7
2875; GFX9-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
2876; GFX9-NEXT:    v_div_fmas_f32 v7, v7, v10, v11
2877; GFX9-NEXT:    v_div_fixup_f32 v7, v7, v6, v2
2878; GFX9-NEXT:    v_trunc_f32_e32 v7, v7
2879; GFX9-NEXT:    v_fma_f32 v2, -v7, v6, v2
2880; GFX9-NEXT:    v_div_scale_f32 v7, s[0:1], v5, v5, v1
2881; GFX9-NEXT:    v_div_scale_f32 v6, vcc, v1, v5, v1
2882; GFX9-NEXT:    v_rcp_f32_e32 v9, v7
2883; GFX9-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
2884; GFX9-NEXT:    v_fma_f32 v10, -v7, v9, 1.0
2885; GFX9-NEXT:    v_fma_f32 v9, v10, v9, v9
2886; GFX9-NEXT:    v_mul_f32_e32 v10, v6, v9
2887; GFX9-NEXT:    v_fma_f32 v11, -v7, v10, v6
2888; GFX9-NEXT:    v_fma_f32 v10, v11, v9, v10
2889; GFX9-NEXT:    v_fma_f32 v6, -v7, v10, v6
2890; GFX9-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
2891; GFX9-NEXT:    v_div_fmas_f32 v6, v6, v9, v10
2892; GFX9-NEXT:    v_div_fixup_f32 v6, v6, v5, v1
2893; GFX9-NEXT:    v_trunc_f32_e32 v6, v6
2894; GFX9-NEXT:    v_fma_f32 v1, -v6, v5, v1
2895; GFX9-NEXT:    v_div_scale_f32 v6, s[0:1], v4, v4, v0
2896; GFX9-NEXT:    v_div_scale_f32 v5, vcc, v0, v4, v0
2897; GFX9-NEXT:    v_rcp_f32_e32 v7, v6
2898; GFX9-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
2899; GFX9-NEXT:    v_fma_f32 v9, -v6, v7, 1.0
2900; GFX9-NEXT:    v_fma_f32 v7, v9, v7, v7
2901; GFX9-NEXT:    v_mul_f32_e32 v9, v5, v7
2902; GFX9-NEXT:    v_fma_f32 v10, -v6, v9, v5
2903; GFX9-NEXT:    v_fma_f32 v9, v10, v7, v9
2904; GFX9-NEXT:    v_fma_f32 v5, -v6, v9, v5
2905; GFX9-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
2906; GFX9-NEXT:    v_div_fmas_f32 v5, v5, v7, v9
2907; GFX9-NEXT:    v_div_fixup_f32 v5, v5, v4, v0
2908; GFX9-NEXT:    v_trunc_f32_e32 v5, v5
2909; GFX9-NEXT:    v_fma_f32 v0, -v5, v4, v0
2910; GFX9-NEXT:    global_store_dwordx4 v8, v[0:3], s[4:5]
2911; GFX9-NEXT:    s_endpgm
2912;
2913; GFX10-LABEL: frem_v4f32:
2914; GFX10:       ; %bb.0:
2915; GFX10-NEXT:    s_clause 0x1
2916; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2917; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
2918; GFX10-NEXT:    v_mov_b32_e32 v8, 0
2919; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
2920; GFX10-NEXT:    s_clause 0x1
2921; GFX10-NEXT:    global_load_dwordx4 v[0:3], v8, s[6:7]
2922; GFX10-NEXT:    global_load_dwordx4 v[4:7], v8, s[2:3] offset:64
2923; GFX10-NEXT:    s_waitcnt vmcnt(0)
2924; GFX10-NEXT:    v_div_scale_f32 v10, s0, v7, v7, v3
2925; GFX10-NEXT:    v_div_scale_f32 v9, vcc_lo, v3, v7, v3
2926; GFX10-NEXT:    v_rcp_f32_e32 v11, v10
2927; GFX10-NEXT:    s_denorm_mode 15
2928; GFX10-NEXT:    v_fma_f32 v12, -v10, v11, 1.0
2929; GFX10-NEXT:    v_fmac_f32_e32 v11, v12, v11
2930; GFX10-NEXT:    v_mul_f32_e32 v12, v9, v11
2931; GFX10-NEXT:    v_fma_f32 v13, -v10, v12, v9
2932; GFX10-NEXT:    v_fmac_f32_e32 v12, v13, v11
2933; GFX10-NEXT:    v_fma_f32 v9, -v10, v12, v9
2934; GFX10-NEXT:    s_denorm_mode 12
2935; GFX10-NEXT:    v_div_fmas_f32 v9, v9, v11, v12
2936; GFX10-NEXT:    v_div_fixup_f32 v9, v9, v7, v3
2937; GFX10-NEXT:    v_trunc_f32_e32 v9, v9
2938; GFX10-NEXT:    v_fma_f32 v3, -v9, v7, v3
2939; GFX10-NEXT:    v_div_scale_f32 v9, s0, v6, v6, v2
2940; GFX10-NEXT:    v_div_scale_f32 v7, vcc_lo, v2, v6, v2
2941; GFX10-NEXT:    v_rcp_f32_e32 v10, v9
2942; GFX10-NEXT:    s_denorm_mode 15
2943; GFX10-NEXT:    v_fma_f32 v11, -v9, v10, 1.0
2944; GFX10-NEXT:    v_fmac_f32_e32 v10, v11, v10
2945; GFX10-NEXT:    v_mul_f32_e32 v11, v7, v10
2946; GFX10-NEXT:    v_fma_f32 v12, -v9, v11, v7
2947; GFX10-NEXT:    v_fmac_f32_e32 v11, v12, v10
2948; GFX10-NEXT:    v_fma_f32 v7, -v9, v11, v7
2949; GFX10-NEXT:    s_denorm_mode 12
2950; GFX10-NEXT:    v_div_fmas_f32 v7, v7, v10, v11
2951; GFX10-NEXT:    v_div_fixup_f32 v7, v7, v6, v2
2952; GFX10-NEXT:    v_trunc_f32_e32 v7, v7
2953; GFX10-NEXT:    v_fma_f32 v2, -v7, v6, v2
2954; GFX10-NEXT:    v_div_scale_f32 v7, s0, v5, v5, v1
2955; GFX10-NEXT:    v_div_scale_f32 v6, vcc_lo, v1, v5, v1
2956; GFX10-NEXT:    v_rcp_f32_e32 v9, v7
2957; GFX10-NEXT:    s_denorm_mode 15
2958; GFX10-NEXT:    v_fma_f32 v10, -v7, v9, 1.0
2959; GFX10-NEXT:    v_fmac_f32_e32 v9, v10, v9
2960; GFX10-NEXT:    v_mul_f32_e32 v10, v6, v9
2961; GFX10-NEXT:    v_fma_f32 v11, -v7, v10, v6
2962; GFX10-NEXT:    v_fmac_f32_e32 v10, v11, v9
2963; GFX10-NEXT:    v_fma_f32 v6, -v7, v10, v6
2964; GFX10-NEXT:    s_denorm_mode 12
2965; GFX10-NEXT:    v_div_fmas_f32 v6, v6, v9, v10
2966; GFX10-NEXT:    v_div_fixup_f32 v6, v6, v5, v1
2967; GFX10-NEXT:    v_trunc_f32_e32 v6, v6
2968; GFX10-NEXT:    v_fma_f32 v1, -v6, v5, v1
2969; GFX10-NEXT:    v_div_scale_f32 v6, s0, v4, v4, v0
2970; GFX10-NEXT:    v_div_scale_f32 v5, vcc_lo, v0, v4, v0
2971; GFX10-NEXT:    v_rcp_f32_e32 v7, v6
2972; GFX10-NEXT:    s_denorm_mode 15
2973; GFX10-NEXT:    v_fma_f32 v9, -v6, v7, 1.0
2974; GFX10-NEXT:    v_fmac_f32_e32 v7, v9, v7
2975; GFX10-NEXT:    v_mul_f32_e32 v9, v5, v7
2976; GFX10-NEXT:    v_fma_f32 v10, -v6, v9, v5
2977; GFX10-NEXT:    v_fmac_f32_e32 v9, v10, v7
2978; GFX10-NEXT:    v_fma_f32 v5, -v6, v9, v5
2979; GFX10-NEXT:    s_denorm_mode 12
2980; GFX10-NEXT:    v_div_fmas_f32 v5, v5, v7, v9
2981; GFX10-NEXT:    v_div_fixup_f32 v5, v5, v4, v0
2982; GFX10-NEXT:    v_trunc_f32_e32 v5, v5
2983; GFX10-NEXT:    v_fma_f32 v0, -v5, v4, v0
2984; GFX10-NEXT:    global_store_dwordx4 v8, v[0:3], s[4:5]
2985; GFX10-NEXT:    s_endpgm
2986;
2987; GFX11-LABEL: frem_v4f32:
2988; GFX11:       ; %bb.0:
2989; GFX11-NEXT:    s_clause 0x1
2990; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
2991; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
2992; GFX11-NEXT:    v_mov_b32_e32 v8, 0
2993; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2994; GFX11-NEXT:    s_clause 0x1
2995; GFX11-NEXT:    global_load_b128 v[0:3], v8, s[6:7]
2996; GFX11-NEXT:    global_load_b128 v[4:7], v8, s[0:1] offset:64
2997; GFX11-NEXT:    s_waitcnt vmcnt(0)
2998; GFX11-NEXT:    v_div_scale_f32 v10, null, v7, v7, v3
2999; GFX11-NEXT:    v_div_scale_f32 v9, vcc_lo, v3, v7, v3
3000; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
3001; GFX11-NEXT:    v_rcp_f32_e32 v11, v10
3002; GFX11-NEXT:    s_denorm_mode 15
3003; GFX11-NEXT:    s_waitcnt_depctr 0xfff
3004; GFX11-NEXT:    v_fma_f32 v12, -v10, v11, 1.0
3005; GFX11-NEXT:    v_fmac_f32_e32 v11, v12, v11
3006; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3007; GFX11-NEXT:    v_mul_f32_e32 v12, v9, v11
3008; GFX11-NEXT:    v_fma_f32 v13, -v10, v12, v9
3009; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3010; GFX11-NEXT:    v_fmac_f32_e32 v12, v13, v11
3011; GFX11-NEXT:    v_fma_f32 v9, -v10, v12, v9
3012; GFX11-NEXT:    s_denorm_mode 12
3013; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3014; GFX11-NEXT:    v_div_fmas_f32 v9, v9, v11, v12
3015; GFX11-NEXT:    v_div_fixup_f32 v9, v9, v7, v3
3016; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3017; GFX11-NEXT:    v_trunc_f32_e32 v9, v9
3018; GFX11-NEXT:    v_fma_f32 v3, -v9, v7, v3
3019; GFX11-NEXT:    v_div_scale_f32 v9, null, v6, v6, v2
3020; GFX11-NEXT:    v_div_scale_f32 v7, vcc_lo, v2, v6, v2
3021; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
3022; GFX11-NEXT:    v_rcp_f32_e32 v10, v9
3023; GFX11-NEXT:    s_denorm_mode 15
3024; GFX11-NEXT:    s_waitcnt_depctr 0xfff
3025; GFX11-NEXT:    v_fma_f32 v11, -v9, v10, 1.0
3026; GFX11-NEXT:    v_fmac_f32_e32 v10, v11, v10
3027; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3028; GFX11-NEXT:    v_mul_f32_e32 v11, v7, v10
3029; GFX11-NEXT:    v_fma_f32 v12, -v9, v11, v7
3030; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3031; GFX11-NEXT:    v_fmac_f32_e32 v11, v12, v10
3032; GFX11-NEXT:    v_fma_f32 v7, -v9, v11, v7
3033; GFX11-NEXT:    s_denorm_mode 12
3034; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3035; GFX11-NEXT:    v_div_fmas_f32 v7, v7, v10, v11
3036; GFX11-NEXT:    v_div_fixup_f32 v7, v7, v6, v2
3037; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3038; GFX11-NEXT:    v_trunc_f32_e32 v7, v7
3039; GFX11-NEXT:    v_fma_f32 v2, -v7, v6, v2
3040; GFX11-NEXT:    v_div_scale_f32 v7, null, v5, v5, v1
3041; GFX11-NEXT:    v_div_scale_f32 v6, vcc_lo, v1, v5, v1
3042; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
3043; GFX11-NEXT:    v_rcp_f32_e32 v9, v7
3044; GFX11-NEXT:    s_denorm_mode 15
3045; GFX11-NEXT:    s_waitcnt_depctr 0xfff
3046; GFX11-NEXT:    v_fma_f32 v10, -v7, v9, 1.0
3047; GFX11-NEXT:    v_fmac_f32_e32 v9, v10, v9
3048; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3049; GFX11-NEXT:    v_mul_f32_e32 v10, v6, v9
3050; GFX11-NEXT:    v_fma_f32 v11, -v7, v10, v6
3051; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3052; GFX11-NEXT:    v_fmac_f32_e32 v10, v11, v9
3053; GFX11-NEXT:    v_fma_f32 v6, -v7, v10, v6
3054; GFX11-NEXT:    s_denorm_mode 12
3055; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3056; GFX11-NEXT:    v_div_fmas_f32 v6, v6, v9, v10
3057; GFX11-NEXT:    v_div_fixup_f32 v6, v6, v5, v1
3058; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3059; GFX11-NEXT:    v_trunc_f32_e32 v6, v6
3060; GFX11-NEXT:    v_fma_f32 v1, -v6, v5, v1
3061; GFX11-NEXT:    v_div_scale_f32 v6, null, v4, v4, v0
3062; GFX11-NEXT:    v_div_scale_f32 v5, vcc_lo, v0, v4, v0
3063; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
3064; GFX11-NEXT:    v_rcp_f32_e32 v7, v6
3065; GFX11-NEXT:    s_denorm_mode 15
3066; GFX11-NEXT:    s_waitcnt_depctr 0xfff
3067; GFX11-NEXT:    v_fma_f32 v9, -v6, v7, 1.0
3068; GFX11-NEXT:    v_fmac_f32_e32 v7, v9, v7
3069; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3070; GFX11-NEXT:    v_mul_f32_e32 v9, v5, v7
3071; GFX11-NEXT:    v_fma_f32 v10, -v6, v9, v5
3072; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3073; GFX11-NEXT:    v_fmac_f32_e32 v9, v10, v7
3074; GFX11-NEXT:    v_fma_f32 v5, -v6, v9, v5
3075; GFX11-NEXT:    s_denorm_mode 12
3076; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3077; GFX11-NEXT:    v_div_fmas_f32 v5, v5, v7, v9
3078; GFX11-NEXT:    v_div_fixup_f32 v5, v5, v4, v0
3079; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3080; GFX11-NEXT:    v_trunc_f32_e32 v5, v5
3081; GFX11-NEXT:    v_fma_f32 v0, -v5, v4, v0
3082; GFX11-NEXT:    global_store_b128 v8, v[0:3], s[4:5]
3083; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
3084; GFX11-NEXT:    s_endpgm
3085                        <4 x float> addrspace(1)* %in2) #0 {
3086   %gep2 = getelementptr <4 x float>, <4 x float> addrspace(1)* %in2, i32 4
3087   %r0 = load <4 x float>, <4 x float> addrspace(1)* %in1, align 16
3088   %r1 = load <4 x float>, <4 x float> addrspace(1)* %gep2, align 16
3089   %r2 = frem <4 x float> %r0, %r1
3090   store <4 x float> %r2, <4 x float> addrspace(1)* %out, align 16
3091   ret void
3092}
3093
; frem_v2f64: element-wise floating-point remainder on <2 x double>.
; Loads one vector from %in1 and one from %in2 advanced by 4 vectors
; (visible as the 64-byte offset in the checked loads), computes frem, and
; stores the result to %out.
; In the checked output each lane is expanded as a division sequence
; (v_div_scale / v_rcp / v_fma / v_div_fmas / v_div_fixup), a truncation of
; the quotient, and a final v_fma_f64 with the negated truncated quotient.
; The SI checks contain no v_trunc_f64 and instead show an integer
; bfe/lshr/not/and/cndmask sequence in that position; the CI and later checks
; use v_trunc_f64_e32.
; The CHECK lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with the script rather than editing them by hand.
3094define amdgpu_kernel void @frem_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %in1,
3095; SI-LABEL: frem_v2f64:
3096; SI:       ; %bb.0:
3097; SI-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x9
3098; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
3099; SI-NEXT:    s_mov_b32 s7, 0xf000
3100; SI-NEXT:    s_mov_b32 s6, -1
3101; SI-NEXT:    s_waitcnt lgkmcnt(0)
3102; SI-NEXT:    s_mov_b32 s4, s8
3103; SI-NEXT:    s_mov_b32 s5, s9
3104; SI-NEXT:    s_mov_b32 s8, s10
3105; SI-NEXT:    s_mov_b32 s9, s11
3106; SI-NEXT:    s_mov_b32 s10, s6
3107; SI-NEXT:    s_mov_b32 s11, s7
3108; SI-NEXT:    s_mov_b32 s2, s6
3109; SI-NEXT:    s_mov_b32 s3, s7
3110; SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
3111; SI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:64
3112; SI-NEXT:    s_waitcnt vmcnt(0)
3113; SI-NEXT:    v_div_scale_f64 v[8:9], s[0:1], v[6:7], v[6:7], v[2:3]
3114; SI-NEXT:    v_rcp_f64_e32 v[10:11], v[8:9]
3115; SI-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0
3116; SI-NEXT:    v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
3117; SI-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0
3118; SI-NEXT:    v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
3119; SI-NEXT:    v_div_scale_f64 v[12:13], s[0:1], v[2:3], v[6:7], v[2:3]
3120; SI-NEXT:    v_mul_f64 v[14:15], v[12:13], v[10:11]
3121; SI-NEXT:    v_fma_f64 v[16:17], -v[8:9], v[14:15], v[12:13]
3122; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v9
3123; SI-NEXT:    v_cmp_eq_u32_e64 s[0:1], v3, v13
3124; SI-NEXT:    s_xor_b64 vcc, s[0:1], vcc
3125; SI-NEXT:    s_nop 1
3126; SI-NEXT:    v_div_fmas_f64 v[8:9], v[16:17], v[10:11], v[14:15]
3127; SI-NEXT:    v_div_fixup_f64 v[8:9], v[8:9], v[6:7], v[2:3]
3128; SI-NEXT:    v_bfe_u32 v10, v9, 20, 11
3129; SI-NEXT:    v_add_i32_e32 v12, vcc, 0xfffffc01, v10
3130; SI-NEXT:    s_mov_b32 s3, 0xfffff
3131; SI-NEXT:    v_lshr_b64 v[10:11], s[2:3], v12
3132; SI-NEXT:    v_not_b32_e32 v10, v10
3133; SI-NEXT:    v_and_b32_e32 v10, v8, v10
3134; SI-NEXT:    v_not_b32_e32 v11, v11
3135; SI-NEXT:    v_and_b32_e32 v11, v9, v11
3136; SI-NEXT:    v_and_b32_e32 v13, 0x80000000, v9
3137; SI-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v12
3138; SI-NEXT:    v_cndmask_b32_e32 v11, v11, v13, vcc
3139; SI-NEXT:    v_cmp_lt_i32_e64 s[0:1], 51, v12
3140; SI-NEXT:    v_cndmask_b32_e64 v9, v11, v9, s[0:1]
3141; SI-NEXT:    v_cndmask_b32_e64 v10, v10, 0, vcc
3142; SI-NEXT:    v_cndmask_b32_e64 v8, v10, v8, s[0:1]
3143; SI-NEXT:    v_fma_f64 v[2:3], -v[8:9], v[6:7], v[2:3]
3144; SI-NEXT:    v_div_scale_f64 v[6:7], s[0:1], v[4:5], v[4:5], v[0:1]
3145; SI-NEXT:    v_rcp_f64_e32 v[8:9], v[6:7]
3146; SI-NEXT:    v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
3147; SI-NEXT:    v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
3148; SI-NEXT:    v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
3149; SI-NEXT:    v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
3150; SI-NEXT:    v_div_scale_f64 v[10:11], s[0:1], v[0:1], v[4:5], v[0:1]
3151; SI-NEXT:    v_mul_f64 v[12:13], v[10:11], v[8:9]
3152; SI-NEXT:    v_fma_f64 v[14:15], -v[6:7], v[12:13], v[10:11]
3153; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v7
3154; SI-NEXT:    v_cmp_eq_u32_e64 s[0:1], v1, v11
3155; SI-NEXT:    s_xor_b64 vcc, s[0:1], vcc
3156; SI-NEXT:    s_nop 1
3157; SI-NEXT:    v_div_fmas_f64 v[6:7], v[14:15], v[8:9], v[12:13]
3158; SI-NEXT:    v_div_fixup_f64 v[6:7], v[6:7], v[4:5], v[0:1]
3159; SI-NEXT:    v_bfe_u32 v8, v7, 20, 11
3160; SI-NEXT:    v_add_i32_e32 v10, vcc, 0xfffffc01, v8
3161; SI-NEXT:    v_lshr_b64 v[8:9], s[2:3], v10
3162; SI-NEXT:    v_not_b32_e32 v8, v8
3163; SI-NEXT:    v_and_b32_e32 v8, v6, v8
3164; SI-NEXT:    v_not_b32_e32 v9, v9
3165; SI-NEXT:    v_and_b32_e32 v9, v7, v9
3166; SI-NEXT:    v_and_b32_e32 v11, 0x80000000, v7
3167; SI-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v10
3168; SI-NEXT:    v_cndmask_b32_e32 v9, v9, v11, vcc
3169; SI-NEXT:    v_cmp_lt_i32_e64 s[0:1], 51, v10
3170; SI-NEXT:    v_cndmask_b32_e64 v7, v9, v7, s[0:1]
3171; SI-NEXT:    v_cndmask_b32_e64 v8, v8, 0, vcc
3172; SI-NEXT:    v_cndmask_b32_e64 v6, v8, v6, s[0:1]
3173; SI-NEXT:    v_fma_f64 v[0:1], -v[6:7], v[4:5], v[0:1]
3174; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
3175; SI-NEXT:    s_endpgm
3176;
3177; CI-LABEL: frem_v2f64:
3178; CI:       ; %bb.0:
3179; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
3180; CI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xd
3181; CI-NEXT:    s_mov_b32 s3, 0xf000
3182; CI-NEXT:    s_mov_b32 s2, -1
3183; CI-NEXT:    s_mov_b32 s10, s2
3184; CI-NEXT:    s_waitcnt lgkmcnt(0)
3185; CI-NEXT:    s_mov_b32 s0, s4
3186; CI-NEXT:    s_mov_b32 s1, s5
3187; CI-NEXT:    s_mov_b32 s4, s6
3188; CI-NEXT:    s_mov_b32 s5, s7
3189; CI-NEXT:    s_mov_b32 s6, s2
3190; CI-NEXT:    s_mov_b32 s7, s3
3191; CI-NEXT:    s_mov_b32 s11, s3
3192; CI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
3193; CI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:64
3194; CI-NEXT:    s_waitcnt vmcnt(0)
3195; CI-NEXT:    v_div_scale_f64 v[8:9], s[4:5], v[6:7], v[6:7], v[2:3]
3196; CI-NEXT:    v_rcp_f64_e32 v[10:11], v[8:9]
3197; CI-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0
3198; CI-NEXT:    v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
3199; CI-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0
3200; CI-NEXT:    v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
3201; CI-NEXT:    v_div_scale_f64 v[12:13], vcc, v[2:3], v[6:7], v[2:3]
3202; CI-NEXT:    v_mul_f64 v[14:15], v[12:13], v[10:11]
3203; CI-NEXT:    v_fma_f64 v[8:9], -v[8:9], v[14:15], v[12:13]
3204; CI-NEXT:    s_nop 1
3205; CI-NEXT:    v_div_fmas_f64 v[8:9], v[8:9], v[10:11], v[14:15]
3206; CI-NEXT:    v_div_fixup_f64 v[8:9], v[8:9], v[6:7], v[2:3]
3207; CI-NEXT:    v_trunc_f64_e32 v[8:9], v[8:9]
3208; CI-NEXT:    v_fma_f64 v[2:3], -v[8:9], v[6:7], v[2:3]
3209; CI-NEXT:    v_div_scale_f64 v[6:7], s[4:5], v[4:5], v[4:5], v[0:1]
3210; CI-NEXT:    v_rcp_f64_e32 v[8:9], v[6:7]
3211; CI-NEXT:    v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
3212; CI-NEXT:    v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
3213; CI-NEXT:    v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
3214; CI-NEXT:    v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
3215; CI-NEXT:    v_div_scale_f64 v[10:11], vcc, v[0:1], v[4:5], v[0:1]
3216; CI-NEXT:    v_mul_f64 v[12:13], v[10:11], v[8:9]
3217; CI-NEXT:    v_fma_f64 v[6:7], -v[6:7], v[12:13], v[10:11]
3218; CI-NEXT:    s_nop 1
3219; CI-NEXT:    v_div_fmas_f64 v[6:7], v[6:7], v[8:9], v[12:13]
3220; CI-NEXT:    v_div_fixup_f64 v[6:7], v[6:7], v[4:5], v[0:1]
3221; CI-NEXT:    v_trunc_f64_e32 v[6:7], v[6:7]
3222; CI-NEXT:    v_fma_f64 v[0:1], -v[6:7], v[4:5], v[0:1]
3223; CI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
3224; CI-NEXT:    s_endpgm
3225;
3226; VI-LABEL: frem_v2f64:
3227; VI:       ; %bb.0:
3228; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
3229; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
3230; VI-NEXT:    s_waitcnt lgkmcnt(0)
3231; VI-NEXT:    v_mov_b32_e32 v0, s6
3232; VI-NEXT:    s_add_u32 s0, s0, 64
3233; VI-NEXT:    s_addc_u32 s1, s1, 0
3234; VI-NEXT:    v_mov_b32_e32 v5, s1
3235; VI-NEXT:    v_mov_b32_e32 v1, s7
3236; VI-NEXT:    v_mov_b32_e32 v4, s0
3237; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
3238; VI-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
3239; VI-NEXT:    v_mov_b32_e32 v8, s4
3240; VI-NEXT:    v_mov_b32_e32 v9, s5
3241; VI-NEXT:    s_waitcnt vmcnt(0)
3242; VI-NEXT:    v_div_scale_f64 v[10:11], s[0:1], v[6:7], v[6:7], v[2:3]
3243; VI-NEXT:    v_rcp_f64_e32 v[12:13], v[10:11]
3244; VI-NEXT:    v_fma_f64 v[14:15], -v[10:11], v[12:13], 1.0
3245; VI-NEXT:    v_fma_f64 v[12:13], v[12:13], v[14:15], v[12:13]
3246; VI-NEXT:    v_fma_f64 v[14:15], -v[10:11], v[12:13], 1.0
3247; VI-NEXT:    v_fma_f64 v[12:13], v[12:13], v[14:15], v[12:13]
3248; VI-NEXT:    v_div_scale_f64 v[14:15], vcc, v[2:3], v[6:7], v[2:3]
3249; VI-NEXT:    v_mul_f64 v[16:17], v[14:15], v[12:13]
3250; VI-NEXT:    v_fma_f64 v[10:11], -v[10:11], v[16:17], v[14:15]
3251; VI-NEXT:    s_nop 1
3252; VI-NEXT:    v_div_fmas_f64 v[10:11], v[10:11], v[12:13], v[16:17]
3253; VI-NEXT:    v_div_fixup_f64 v[10:11], v[10:11], v[6:7], v[2:3]
3254; VI-NEXT:    v_trunc_f64_e32 v[10:11], v[10:11]
3255; VI-NEXT:    v_fma_f64 v[2:3], -v[10:11], v[6:7], v[2:3]
3256; VI-NEXT:    v_div_scale_f64 v[6:7], s[0:1], v[4:5], v[4:5], v[0:1]
3257; VI-NEXT:    v_rcp_f64_e32 v[10:11], v[6:7]
3258; VI-NEXT:    v_fma_f64 v[12:13], -v[6:7], v[10:11], 1.0
3259; VI-NEXT:    v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
3260; VI-NEXT:    v_fma_f64 v[12:13], -v[6:7], v[10:11], 1.0
3261; VI-NEXT:    v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
3262; VI-NEXT:    v_div_scale_f64 v[12:13], vcc, v[0:1], v[4:5], v[0:1]
3263; VI-NEXT:    v_mul_f64 v[14:15], v[12:13], v[10:11]
3264; VI-NEXT:    v_fma_f64 v[6:7], -v[6:7], v[14:15], v[12:13]
3265; VI-NEXT:    s_nop 1
3266; VI-NEXT:    v_div_fmas_f64 v[6:7], v[6:7], v[10:11], v[14:15]
3267; VI-NEXT:    v_div_fixup_f64 v[6:7], v[6:7], v[4:5], v[0:1]
3268; VI-NEXT:    v_trunc_f64_e32 v[6:7], v[6:7]
3269; VI-NEXT:    v_fma_f64 v[0:1], -v[6:7], v[4:5], v[0:1]
3270; VI-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
3271; VI-NEXT:    s_endpgm
3272;
3273; GFX9-LABEL: frem_v2f64:
3274; GFX9:       ; %bb.0:
3275; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
3276; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
3277; GFX9-NEXT:    v_mov_b32_e32 v16, 0
3278; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3279; GFX9-NEXT:    global_load_dwordx4 v[0:3], v16, s[6:7]
3280; GFX9-NEXT:    global_load_dwordx4 v[4:7], v16, s[2:3] offset:64
3281; GFX9-NEXT:    s_waitcnt vmcnt(0)
3282; GFX9-NEXT:    v_div_scale_f64 v[8:9], s[0:1], v[6:7], v[6:7], v[2:3]
3283; GFX9-NEXT:    v_rcp_f64_e32 v[10:11], v[8:9]
3284; GFX9-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0
3285; GFX9-NEXT:    v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
3286; GFX9-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0
3287; GFX9-NEXT:    v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
3288; GFX9-NEXT:    v_div_scale_f64 v[12:13], vcc, v[2:3], v[6:7], v[2:3]
3289; GFX9-NEXT:    v_mul_f64 v[14:15], v[12:13], v[10:11]
3290; GFX9-NEXT:    v_fma_f64 v[8:9], -v[8:9], v[14:15], v[12:13]
3291; GFX9-NEXT:    s_nop 1
3292; GFX9-NEXT:    v_div_fmas_f64 v[8:9], v[8:9], v[10:11], v[14:15]
3293; GFX9-NEXT:    v_div_fixup_f64 v[8:9], v[8:9], v[6:7], v[2:3]
3294; GFX9-NEXT:    v_trunc_f64_e32 v[8:9], v[8:9]
3295; GFX9-NEXT:    v_fma_f64 v[2:3], -v[8:9], v[6:7], v[2:3]
3296; GFX9-NEXT:    v_div_scale_f64 v[6:7], s[0:1], v[4:5], v[4:5], v[0:1]
3297; GFX9-NEXT:    v_rcp_f64_e32 v[8:9], v[6:7]
3298; GFX9-NEXT:    v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
3299; GFX9-NEXT:    v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
3300; GFX9-NEXT:    v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
3301; GFX9-NEXT:    v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
3302; GFX9-NEXT:    v_div_scale_f64 v[10:11], vcc, v[0:1], v[4:5], v[0:1]
3303; GFX9-NEXT:    v_mul_f64 v[12:13], v[10:11], v[8:9]
3304; GFX9-NEXT:    v_fma_f64 v[6:7], -v[6:7], v[12:13], v[10:11]
3305; GFX9-NEXT:    s_nop 1
3306; GFX9-NEXT:    v_div_fmas_f64 v[6:7], v[6:7], v[8:9], v[12:13]
3307; GFX9-NEXT:    v_div_fixup_f64 v[6:7], v[6:7], v[4:5], v[0:1]
3308; GFX9-NEXT:    v_trunc_f64_e32 v[6:7], v[6:7]
3309; GFX9-NEXT:    v_fma_f64 v[0:1], -v[6:7], v[4:5], v[0:1]
3310; GFX9-NEXT:    global_store_dwordx4 v16, v[0:3], s[4:5]
3311; GFX9-NEXT:    s_endpgm
3312;
3313; GFX10-LABEL: frem_v2f64:
3314; GFX10:       ; %bb.0:
3315; GFX10-NEXT:    s_clause 0x1
3316; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
3317; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
3318; GFX10-NEXT:    v_mov_b32_e32 v16, 0
3319; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
3320; GFX10-NEXT:    s_clause 0x1
3321; GFX10-NEXT:    global_load_dwordx4 v[0:3], v16, s[6:7]
3322; GFX10-NEXT:    global_load_dwordx4 v[4:7], v16, s[2:3] offset:64
3323; GFX10-NEXT:    s_waitcnt vmcnt(0)
3324; GFX10-NEXT:    v_div_scale_f64 v[8:9], s0, v[6:7], v[6:7], v[2:3]
3325; GFX10-NEXT:    v_rcp_f64_e32 v[10:11], v[8:9]
3326; GFX10-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0
3327; GFX10-NEXT:    v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
3328; GFX10-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0
3329; GFX10-NEXT:    v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
3330; GFX10-NEXT:    v_div_scale_f64 v[12:13], vcc_lo, v[2:3], v[6:7], v[2:3]
3331; GFX10-NEXT:    v_mul_f64 v[14:15], v[12:13], v[10:11]
3332; GFX10-NEXT:    v_fma_f64 v[8:9], -v[8:9], v[14:15], v[12:13]
3333; GFX10-NEXT:    v_div_fmas_f64 v[8:9], v[8:9], v[10:11], v[14:15]
3334; GFX10-NEXT:    v_div_fixup_f64 v[8:9], v[8:9], v[6:7], v[2:3]
3335; GFX10-NEXT:    v_trunc_f64_e32 v[8:9], v[8:9]
3336; GFX10-NEXT:    v_fma_f64 v[2:3], -v[8:9], v[6:7], v[2:3]
3337; GFX10-NEXT:    v_div_scale_f64 v[6:7], s0, v[4:5], v[4:5], v[0:1]
3338; GFX10-NEXT:    v_rcp_f64_e32 v[8:9], v[6:7]
3339; GFX10-NEXT:    v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
3340; GFX10-NEXT:    v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
3341; GFX10-NEXT:    v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
3342; GFX10-NEXT:    v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
3343; GFX10-NEXT:    v_div_scale_f64 v[10:11], vcc_lo, v[0:1], v[4:5], v[0:1]
3344; GFX10-NEXT:    v_mul_f64 v[12:13], v[10:11], v[8:9]
3345; GFX10-NEXT:    v_fma_f64 v[6:7], -v[6:7], v[12:13], v[10:11]
3346; GFX10-NEXT:    v_div_fmas_f64 v[6:7], v[6:7], v[8:9], v[12:13]
3347; GFX10-NEXT:    v_div_fixup_f64 v[6:7], v[6:7], v[4:5], v[0:1]
3348; GFX10-NEXT:    v_trunc_f64_e32 v[6:7], v[6:7]
3349; GFX10-NEXT:    v_fma_f64 v[0:1], -v[6:7], v[4:5], v[0:1]
3350; GFX10-NEXT:    global_store_dwordx4 v16, v[0:3], s[4:5]
3351; GFX10-NEXT:    s_endpgm
3352;
3353; GFX11-LABEL: frem_v2f64:
3354; GFX11:       ; %bb.0:
3355; GFX11-NEXT:    s_clause 0x1
3356; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
3357; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
3358; GFX11-NEXT:    v_mov_b32_e32 v16, 0
3359; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
3360; GFX11-NEXT:    s_clause 0x1
3361; GFX11-NEXT:    global_load_b128 v[0:3], v16, s[6:7]
3362; GFX11-NEXT:    global_load_b128 v[4:7], v16, s[0:1] offset:64
3363; GFX11-NEXT:    s_waitcnt vmcnt(0)
3364; GFX11-NEXT:    v_div_scale_f64 v[8:9], null, v[6:7], v[6:7], v[2:3]
3365; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
3366; GFX11-NEXT:    v_rcp_f64_e32 v[10:11], v[8:9]
3367; GFX11-NEXT:    s_waitcnt_depctr 0xfff
3368; GFX11-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0
3369; GFX11-NEXT:    v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
3370; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3371; GFX11-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0
3372; GFX11-NEXT:    v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
3373; GFX11-NEXT:    v_div_scale_f64 v[12:13], vcc_lo, v[2:3], v[6:7], v[2:3]
3374; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3375; GFX11-NEXT:    v_mul_f64 v[14:15], v[12:13], v[10:11]
3376; GFX11-NEXT:    v_fma_f64 v[8:9], -v[8:9], v[14:15], v[12:13]
3377; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3378; GFX11-NEXT:    v_div_fmas_f64 v[8:9], v[8:9], v[10:11], v[14:15]
3379; GFX11-NEXT:    v_div_fixup_f64 v[8:9], v[8:9], v[6:7], v[2:3]
3380; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3381; GFX11-NEXT:    v_trunc_f64_e32 v[8:9], v[8:9]
3382; GFX11-NEXT:    v_fma_f64 v[2:3], -v[8:9], v[6:7], v[2:3]
3383; GFX11-NEXT:    v_div_scale_f64 v[6:7], null, v[4:5], v[4:5], v[0:1]
3384; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
3385; GFX11-NEXT:    v_rcp_f64_e32 v[8:9], v[6:7]
3386; GFX11-NEXT:    s_waitcnt_depctr 0xfff
3387; GFX11-NEXT:    v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
3388; GFX11-NEXT:    v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
3389; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3390; GFX11-NEXT:    v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
3391; GFX11-NEXT:    v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
3392; GFX11-NEXT:    v_div_scale_f64 v[10:11], vcc_lo, v[0:1], v[4:5], v[0:1]
3393; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3394; GFX11-NEXT:    v_mul_f64 v[12:13], v[10:11], v[8:9]
3395; GFX11-NEXT:    v_fma_f64 v[6:7], -v[6:7], v[12:13], v[10:11]
3396; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3397; GFX11-NEXT:    v_div_fmas_f64 v[6:7], v[6:7], v[8:9], v[12:13]
3398; GFX11-NEXT:    v_div_fixup_f64 v[6:7], v[6:7], v[4:5], v[0:1]
3399; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3400; GFX11-NEXT:    v_trunc_f64_e32 v[6:7], v[6:7]
3401; GFX11-NEXT:    v_fma_f64 v[0:1], -v[6:7], v[4:5], v[0:1]
3402; GFX11-NEXT:    global_store_b128 v16, v[0:3], s[4:5]
3403; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
3404; GFX11-NEXT:    s_endpgm
3405                        <2 x double> addrspace(1)* %in2) #0 {
; Index 4 on a <2 x double> pointer advances %in2 by four 16-byte vectors,
; which is the 64-byte offset seen in the checked second load.
3406   %gep2 = getelementptr <2 x double>, <2 x double> addrspace(1)* %in2, i32 4
3407   %r0 = load <2 x double>, <2 x double> addrspace(1)* %in1, align 16
3408   %r1 = load <2 x double>, <2 x double> addrspace(1)* %gep2, align 16
; The operation under test: vector frem of the two loaded values.
3409   %r2 = frem <2 x double> %r0, %r1
3410   store <2 x double> %r2, <2 x double> addrspace(1)* %out, align 16
3411   ret void
3412}
3413
; Function attribute groups. Both are nounwind and set
; "denormal-fp-math-f32" to "preserve-sign,preserve-sign"; they differ only in
; "unsafe-fp-math": "false" for #0 and "true" for #1.
; NOTE(review): #1 is presumably referenced by kernels elsewhere in this file
; that test the unsafe-math lowering -- confirm before removing.
3414attributes #0 = { nounwind "unsafe-fp-math"="false" "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
3415attributes #1 = { nounwind "unsafe-fp-math"="true" "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
3416