1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN:  llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -march=amdgcn -mattr=+mad-mac-f32-insts -verify-machineinstrs  < %s | FileCheck --check-prefix=SI %s
3; RUN:  llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck --check-prefix=CI %s
4; RUN:  llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefix=VI %s
5
; Test: `frem half` with no fast-math flags, on addrspace(1) pointers.
; Loads %in1 and element 4 of %in2, then stores the remainder to %out.
; Per the checks below, the SI and CI runs convert both operands to f32 and
; use the div_scale / rcp / fma / div_fmas / div_fixup division sequence,
; while the VI run uses v_rcp_f32 followed by v_div_fixup_f16. Every run then
; truncates the quotient and forms the result with an fma of the negated
; quotient, the divisor and the dividend.
; NOTE(review): attribute group #0 is defined outside the lines shown here.
6define amdgpu_kernel void @frem_f16(half addrspace(1)* %out, half addrspace(1)* %in1,
7; SI-LABEL: frem_f16:
8; SI:       ; %bb.0:
9; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
10; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
11; SI-NEXT:    s_mov_b32 s11, 0xf000
12; SI-NEXT:    s_mov_b32 s10, -1
13; SI-NEXT:    s_waitcnt lgkmcnt(0)
14; SI-NEXT:    s_mov_b32 s8, s4
15; SI-NEXT:    s_mov_b32 s9, s5
16; SI-NEXT:    s_mov_b32 s4, s6
17; SI-NEXT:    s_mov_b32 s5, s7
18; SI-NEXT:    s_mov_b32 s6, s10
19; SI-NEXT:    s_mov_b32 s7, s11
20; SI-NEXT:    s_mov_b32 s2, s10
21; SI-NEXT:    s_mov_b32 s3, s11
22; SI-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
23; SI-NEXT:    s_waitcnt vmcnt(0)
24; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
25; SI-NEXT:    buffer_load_ushort v1, off, s[0:3], 0 offset:8
26; SI-NEXT:    s_waitcnt vmcnt(0)
27; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
28; SI-NEXT:    v_div_scale_f32 v2, vcc, v0, v1, v0
29; SI-NEXT:    v_div_scale_f32 v3, s[0:1], v1, v1, v0
30; SI-NEXT:    v_rcp_f32_e32 v4, v3
31; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
32; SI-NEXT:    v_fma_f32 v5, -v3, v4, 1.0
33; SI-NEXT:    v_fma_f32 v4, v5, v4, v4
34; SI-NEXT:    v_mul_f32_e32 v5, v2, v4
35; SI-NEXT:    v_fma_f32 v6, -v3, v5, v2
36; SI-NEXT:    v_fma_f32 v5, v6, v4, v5
37; SI-NEXT:    v_fma_f32 v2, -v3, v5, v2
38; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
39; SI-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
40; SI-NEXT:    v_div_fixup_f32 v2, v2, v1, v0
41; SI-NEXT:    v_trunc_f32_e32 v2, v2
42; SI-NEXT:    v_fma_f32 v0, -v2, v1, v0
43; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
44; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
45; SI-NEXT:    buffer_store_short v0, off, s[8:11], 0
46; SI-NEXT:    s_endpgm
47;
48; CI-LABEL: frem_f16:
49; CI:       ; %bb.0:
50; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
51; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
52; CI-NEXT:    s_mov_b32 s11, 0xf000
53; CI-NEXT:    s_mov_b32 s10, -1
54; CI-NEXT:    s_mov_b32 s2, s10
55; CI-NEXT:    s_waitcnt lgkmcnt(0)
56; CI-NEXT:    s_mov_b32 s8, s4
57; CI-NEXT:    s_mov_b32 s9, s5
58; CI-NEXT:    s_mov_b32 s4, s6
59; CI-NEXT:    s_mov_b32 s5, s7
60; CI-NEXT:    s_mov_b32 s3, s11
61; CI-NEXT:    s_mov_b32 s6, s10
62; CI-NEXT:    s_mov_b32 s7, s11
63; CI-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
64; CI-NEXT:    buffer_load_ushort v1, off, s[0:3], 0 offset:8
65; CI-NEXT:    s_waitcnt vmcnt(1)
66; CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
67; CI-NEXT:    s_waitcnt vmcnt(0)
68; CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
69; CI-NEXT:    v_div_scale_f32 v3, s[0:1], v1, v1, v0
70; CI-NEXT:    v_div_scale_f32 v2, vcc, v0, v1, v0
71; CI-NEXT:    v_rcp_f32_e32 v4, v3
72; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
73; CI-NEXT:    v_fma_f32 v5, -v3, v4, 1.0
74; CI-NEXT:    v_fma_f32 v4, v5, v4, v4
75; CI-NEXT:    v_mul_f32_e32 v5, v2, v4
76; CI-NEXT:    v_fma_f32 v6, -v3, v5, v2
77; CI-NEXT:    v_fma_f32 v5, v6, v4, v5
78; CI-NEXT:    v_fma_f32 v2, -v3, v5, v2
79; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
80; CI-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
81; CI-NEXT:    v_div_fixup_f32 v2, v2, v1, v0
82; CI-NEXT:    v_trunc_f32_e32 v2, v2
83; CI-NEXT:    v_fma_f32 v0, -v2, v1, v0
84; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
85; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
86; CI-NEXT:    buffer_store_short v0, off, s[8:11], 0
87; CI-NEXT:    s_endpgm
88;
89; VI-LABEL: frem_f16:
90; VI:       ; %bb.0:
91; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
92; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
93; VI-NEXT:    s_waitcnt lgkmcnt(0)
94; VI-NEXT:    v_mov_b32_e32 v2, s6
95; VI-NEXT:    s_add_u32 s0, s0, 8
96; VI-NEXT:    v_mov_b32_e32 v3, s7
97; VI-NEXT:    s_addc_u32 s1, s1, 0
98; VI-NEXT:    flat_load_ushort v4, v[2:3]
99; VI-NEXT:    v_mov_b32_e32 v3, s1
100; VI-NEXT:    v_mov_b32_e32 v2, s0
101; VI-NEXT:    flat_load_ushort v2, v[2:3]
102; VI-NEXT:    v_mov_b32_e32 v0, s4
103; VI-NEXT:    v_mov_b32_e32 v1, s5
104; VI-NEXT:    s_waitcnt vmcnt(1)
105; VI-NEXT:    v_cvt_f32_f16_e32 v3, v4
106; VI-NEXT:    s_waitcnt vmcnt(0)
107; VI-NEXT:    v_cvt_f32_f16_e32 v5, v2
108; VI-NEXT:    v_rcp_f32_e32 v5, v5
109; VI-NEXT:    v_mul_f32_e32 v3, v3, v5
110; VI-NEXT:    v_cvt_f16_f32_e32 v3, v3
111; VI-NEXT:    v_div_fixup_f16 v3, v3, v2, v4
112; VI-NEXT:    v_trunc_f16_e32 v3, v3
113; VI-NEXT:    v_fma_f16 v2, -v3, v2, v4
114; VI-NEXT:    flat_store_short v[0:1], v2
115; VI-NEXT:    s_endpgm
116                      half addrspace(1)* %in2) #0 {
; The divisor is read from element index 4 of %in2; the checks above show
; this as a byte offset of 8 on the second load.
117   %gep2 = getelementptr half, half addrspace(1)* %in2, i32 4
118   %r0 = load half, half addrspace(1)* %in1, align 4
119   %r1 = load half, half addrspace(1)* %gep2, align 4
; Operation under test: plain frem, no fast-math flags.
120   %r2 = frem half %r0, %r1
121   store half %r2, half addrspace(1)* %out, align 4
122   ret void
123}
124
; Test: `frem fast half` on addrspace(1) pointers.
; Loads %in1 and element 4 of %in2, then stores the remainder to %out.
; Per the checks below, no div_scale / div_fmas / div_fixup sequence appears:
; the SI and CI runs convert to f32 and divide with v_rcp_f32 plus
; v_mul_f32, and the VI run stays in f16 with v_rcp_f16 plus v_mul_f16.
; Every run then truncates the quotient and finishes with an fma.
; NOTE(review): attribute group #0 is defined outside the lines shown here.
125define amdgpu_kernel void @fast_frem_f16(half addrspace(1)* %out, half addrspace(1)* %in1,
126; SI-LABEL: fast_frem_f16:
127; SI:       ; %bb.0:
128; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
129; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
130; SI-NEXT:    s_mov_b32 s11, 0xf000
131; SI-NEXT:    s_mov_b32 s10, -1
132; SI-NEXT:    s_waitcnt lgkmcnt(0)
133; SI-NEXT:    s_mov_b32 s8, s4
134; SI-NEXT:    s_mov_b32 s9, s5
135; SI-NEXT:    s_mov_b32 s4, s6
136; SI-NEXT:    s_mov_b32 s5, s7
137; SI-NEXT:    s_mov_b32 s6, s10
138; SI-NEXT:    s_mov_b32 s7, s11
139; SI-NEXT:    s_mov_b32 s2, s10
140; SI-NEXT:    s_mov_b32 s3, s11
141; SI-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
142; SI-NEXT:    s_waitcnt vmcnt(0)
143; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
144; SI-NEXT:    buffer_load_ushort v1, off, s[0:3], 0 offset:8
145; SI-NEXT:    s_waitcnt vmcnt(0)
146; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
147; SI-NEXT:    v_rcp_f32_e32 v2, v1
148; SI-NEXT:    v_mul_f32_e32 v2, v0, v2
149; SI-NEXT:    v_trunc_f32_e32 v2, v2
150; SI-NEXT:    v_fma_f32 v0, -v2, v1, v0
151; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
152; SI-NEXT:    buffer_store_short v0, off, s[8:11], 0
153; SI-NEXT:    s_endpgm
154;
155; CI-LABEL: fast_frem_f16:
156; CI:       ; %bb.0:
157; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
158; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
159; CI-NEXT:    s_mov_b32 s11, 0xf000
160; CI-NEXT:    s_mov_b32 s10, -1
161; CI-NEXT:    s_mov_b32 s2, s10
162; CI-NEXT:    s_mov_b32 s3, s11
163; CI-NEXT:    s_waitcnt lgkmcnt(0)
164; CI-NEXT:    buffer_load_ushort v1, off, s[0:3], 0 offset:8
165; CI-NEXT:    s_mov_b32 s8, s4
166; CI-NEXT:    s_mov_b32 s9, s5
167; CI-NEXT:    s_mov_b32 s4, s6
168; CI-NEXT:    s_mov_b32 s5, s7
169; CI-NEXT:    s_mov_b32 s6, s10
170; CI-NEXT:    s_mov_b32 s7, s11
171; CI-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
172; CI-NEXT:    s_waitcnt vmcnt(1)
173; CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
174; CI-NEXT:    v_rcp_f32_e32 v2, v1
175; CI-NEXT:    s_waitcnt vmcnt(0)
176; CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
177; CI-NEXT:    v_mul_f32_e32 v2, v0, v2
178; CI-NEXT:    v_trunc_f32_e32 v2, v2
179; CI-NEXT:    v_fma_f32 v0, -v2, v1, v0
180; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
181; CI-NEXT:    buffer_store_short v0, off, s[8:11], 0
182; CI-NEXT:    s_endpgm
183;
184; VI-LABEL: fast_frem_f16:
185; VI:       ; %bb.0:
186; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
187; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
188; VI-NEXT:    s_waitcnt lgkmcnt(0)
189; VI-NEXT:    v_mov_b32_e32 v2, s6
190; VI-NEXT:    s_add_u32 s0, s0, 8
191; VI-NEXT:    v_mov_b32_e32 v3, s7
192; VI-NEXT:    s_addc_u32 s1, s1, 0
193; VI-NEXT:    flat_load_ushort v4, v[2:3]
194; VI-NEXT:    v_mov_b32_e32 v3, s1
195; VI-NEXT:    v_mov_b32_e32 v2, s0
196; VI-NEXT:    flat_load_ushort v2, v[2:3]
197; VI-NEXT:    v_mov_b32_e32 v0, s4
198; VI-NEXT:    v_mov_b32_e32 v1, s5
199; VI-NEXT:    s_waitcnt vmcnt(0)
200; VI-NEXT:    v_rcp_f16_e32 v3, v2
201; VI-NEXT:    v_mul_f16_e32 v3, v4, v3
202; VI-NEXT:    v_trunc_f16_e32 v3, v3
203; VI-NEXT:    v_fma_f16 v2, -v3, v2, v4
204; VI-NEXT:    flat_store_short v[0:1], v2
205; VI-NEXT:    s_endpgm
206                      half addrspace(1)* %in2) #0 {
; The divisor is read from element index 4 of %in2; the checks above show
; this as a byte offset of 8 on that load.
207   %gep2 = getelementptr half, half addrspace(1)* %in2, i32 4
208   %r0 = load half, half addrspace(1)* %in1, align 4
209   %r1 = load half, half addrspace(1)* %gep2, align 4
; Operation under test: frem carrying the `fast` flag.
210   %r2 = frem fast half %r0, %r1
211   store half %r2, half addrspace(1)* %out, align 4
212   ret void
213}
214
; Test: `frem afn half` on addrspace(1) pointers, in a function that uses
; attribute group #1 instead of #0.
; Loads %in1 and element 4 of %in2, then stores the remainder to %out.
; Per the checks below, the division is done with a reciprocal and a multiply
; (f32 on the SI and CI runs, f16 on the VI run), followed by a truncate and
; an fma; no div_scale / div_fmas / div_fixup sequence appears.
; NOTE(review): attribute group #1 is defined outside the lines shown here;
; presumably it enables unsafe FP math, as the function name suggests.
215define amdgpu_kernel void @unsafe_frem_f16(half addrspace(1)* %out, half addrspace(1)* %in1,
216; SI-LABEL: unsafe_frem_f16:
217; SI:       ; %bb.0:
218; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
219; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
220; SI-NEXT:    s_mov_b32 s11, 0xf000
221; SI-NEXT:    s_mov_b32 s10, -1
222; SI-NEXT:    s_waitcnt lgkmcnt(0)
223; SI-NEXT:    s_mov_b32 s8, s4
224; SI-NEXT:    s_mov_b32 s9, s5
225; SI-NEXT:    s_mov_b32 s4, s6
226; SI-NEXT:    s_mov_b32 s5, s7
227; SI-NEXT:    s_mov_b32 s6, s10
228; SI-NEXT:    s_mov_b32 s7, s11
229; SI-NEXT:    s_mov_b32 s2, s10
230; SI-NEXT:    s_mov_b32 s3, s11
231; SI-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
232; SI-NEXT:    s_waitcnt vmcnt(0)
233; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
234; SI-NEXT:    buffer_load_ushort v1, off, s[0:3], 0 offset:8
235; SI-NEXT:    s_waitcnt vmcnt(0)
236; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
237; SI-NEXT:    v_rcp_f32_e32 v2, v1
238; SI-NEXT:    v_mul_f32_e32 v2, v0, v2
239; SI-NEXT:    v_trunc_f32_e32 v2, v2
240; SI-NEXT:    v_fma_f32 v0, -v2, v1, v0
241; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
242; SI-NEXT:    buffer_store_short v0, off, s[8:11], 0
243; SI-NEXT:    s_endpgm
244;
245; CI-LABEL: unsafe_frem_f16:
246; CI:       ; %bb.0:
247; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
248; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
249; CI-NEXT:    s_mov_b32 s11, 0xf000
250; CI-NEXT:    s_mov_b32 s10, -1
251; CI-NEXT:    s_mov_b32 s2, s10
252; CI-NEXT:    s_mov_b32 s3, s11
253; CI-NEXT:    s_waitcnt lgkmcnt(0)
254; CI-NEXT:    buffer_load_ushort v1, off, s[0:3], 0 offset:8
255; CI-NEXT:    s_mov_b32 s8, s4
256; CI-NEXT:    s_mov_b32 s9, s5
257; CI-NEXT:    s_mov_b32 s4, s6
258; CI-NEXT:    s_mov_b32 s5, s7
259; CI-NEXT:    s_mov_b32 s6, s10
260; CI-NEXT:    s_mov_b32 s7, s11
261; CI-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
262; CI-NEXT:    s_waitcnt vmcnt(1)
263; CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
264; CI-NEXT:    v_rcp_f32_e32 v2, v1
265; CI-NEXT:    s_waitcnt vmcnt(0)
266; CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
267; CI-NEXT:    v_mul_f32_e32 v2, v0, v2
268; CI-NEXT:    v_trunc_f32_e32 v2, v2
269; CI-NEXT:    v_fma_f32 v0, -v2, v1, v0
270; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
271; CI-NEXT:    buffer_store_short v0, off, s[8:11], 0
272; CI-NEXT:    s_endpgm
273;
274; VI-LABEL: unsafe_frem_f16:
275; VI:       ; %bb.0:
276; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
277; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
278; VI-NEXT:    s_waitcnt lgkmcnt(0)
279; VI-NEXT:    v_mov_b32_e32 v2, s6
280; VI-NEXT:    s_add_u32 s0, s0, 8
281; VI-NEXT:    v_mov_b32_e32 v3, s7
282; VI-NEXT:    s_addc_u32 s1, s1, 0
283; VI-NEXT:    flat_load_ushort v4, v[2:3]
284; VI-NEXT:    v_mov_b32_e32 v3, s1
285; VI-NEXT:    v_mov_b32_e32 v2, s0
286; VI-NEXT:    flat_load_ushort v2, v[2:3]
287; VI-NEXT:    v_mov_b32_e32 v0, s4
288; VI-NEXT:    v_mov_b32_e32 v1, s5
289; VI-NEXT:    s_waitcnt vmcnt(0)
290; VI-NEXT:    v_rcp_f16_e32 v3, v2
291; VI-NEXT:    v_mul_f16_e32 v3, v4, v3
292; VI-NEXT:    v_trunc_f16_e32 v3, v3
293; VI-NEXT:    v_fma_f16 v2, -v3, v2, v4
294; VI-NEXT:    flat_store_short v[0:1], v2
295; VI-NEXT:    s_endpgm
296                             half addrspace(1)* %in2) #1 {
; The divisor is read from element index 4 of %in2; the checks above show
; this as a byte offset of 8 on that load.
297   %gep2 = getelementptr half, half addrspace(1)* %in2, i32 4
298   %r0 = load half, half addrspace(1)* %in1, align 4
299   %r1 = load half, half addrspace(1)* %gep2, align 4
; Operation under test: frem carrying only the `afn` flag.
300   %r2 = frem afn half %r0, %r1
301   store half %r2, half addrspace(1)* %out, align 4
302   ret void
303}
304
; Test: `frem float` with no fast-math flags, on addrspace(1) pointers.
; Loads %in1 and element 4 of %in2, then stores the remainder to %out.
; Per the checks below, all three runs divide with the v_div_scale_f32 /
; v_rcp_f32 / v_fma_f32 / v_div_fmas_f32 / v_div_fixup_f32 sequence, with an
; s_setreg_imm32_b32 on HW_REG_MODE before and after the fma chain, then
; truncate the quotient and finish with an fma.
; NOTE(review): attribute group #0 is defined outside the lines shown here.
305define amdgpu_kernel void @frem_f32(float addrspace(1)* %out, float addrspace(1)* %in1,
306; SI-LABEL: frem_f32:
307; SI:       ; %bb.0:
308; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
309; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
310; SI-NEXT:    s_mov_b32 s11, 0xf000
311; SI-NEXT:    s_mov_b32 s10, -1
312; SI-NEXT:    s_waitcnt lgkmcnt(0)
313; SI-NEXT:    s_mov_b32 s8, s4
314; SI-NEXT:    s_mov_b32 s9, s5
315; SI-NEXT:    s_mov_b32 s4, s6
316; SI-NEXT:    s_mov_b32 s5, s7
317; SI-NEXT:    s_mov_b32 s6, s10
318; SI-NEXT:    s_mov_b32 s7, s11
319; SI-NEXT:    s_mov_b32 s2, s10
320; SI-NEXT:    s_mov_b32 s3, s11
321; SI-NEXT:    buffer_load_dword v0, off, s[4:7], 0
322; SI-NEXT:    buffer_load_dword v1, off, s[0:3], 0 offset:16
323; SI-NEXT:    s_waitcnt vmcnt(0)
324; SI-NEXT:    v_div_scale_f32 v2, vcc, v0, v1, v0
325; SI-NEXT:    v_div_scale_f32 v3, s[0:1], v1, v1, v0
326; SI-NEXT:    v_rcp_f32_e32 v4, v3
327; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
328; SI-NEXT:    v_fma_f32 v5, -v3, v4, 1.0
329; SI-NEXT:    v_fma_f32 v4, v5, v4, v4
330; SI-NEXT:    v_mul_f32_e32 v5, v2, v4
331; SI-NEXT:    v_fma_f32 v6, -v3, v5, v2
332; SI-NEXT:    v_fma_f32 v5, v6, v4, v5
333; SI-NEXT:    v_fma_f32 v2, -v3, v5, v2
334; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
335; SI-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
336; SI-NEXT:    v_div_fixup_f32 v2, v2, v1, v0
337; SI-NEXT:    v_trunc_f32_e32 v2, v2
338; SI-NEXT:    v_fma_f32 v0, -v2, v1, v0
339; SI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
340; SI-NEXT:    s_endpgm
341;
342; CI-LABEL: frem_f32:
343; CI:       ; %bb.0:
344; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
345; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
346; CI-NEXT:    s_mov_b32 s11, 0xf000
347; CI-NEXT:    s_mov_b32 s10, -1
348; CI-NEXT:    s_mov_b32 s2, s10
349; CI-NEXT:    s_waitcnt lgkmcnt(0)
350; CI-NEXT:    s_mov_b32 s8, s4
351; CI-NEXT:    s_mov_b32 s9, s5
352; CI-NEXT:    s_mov_b32 s4, s6
353; CI-NEXT:    s_mov_b32 s5, s7
354; CI-NEXT:    s_mov_b32 s6, s10
355; CI-NEXT:    s_mov_b32 s7, s11
356; CI-NEXT:    s_mov_b32 s3, s11
357; CI-NEXT:    buffer_load_dword v0, off, s[4:7], 0
358; CI-NEXT:    buffer_load_dword v1, off, s[0:3], 0 offset:16
359; CI-NEXT:    s_waitcnt vmcnt(0)
360; CI-NEXT:    v_div_scale_f32 v3, s[0:1], v1, v1, v0
361; CI-NEXT:    v_div_scale_f32 v2, vcc, v0, v1, v0
362; CI-NEXT:    v_rcp_f32_e32 v4, v3
363; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
364; CI-NEXT:    v_fma_f32 v5, -v3, v4, 1.0
365; CI-NEXT:    v_fma_f32 v4, v5, v4, v4
366; CI-NEXT:    v_mul_f32_e32 v5, v2, v4
367; CI-NEXT:    v_fma_f32 v6, -v3, v5, v2
368; CI-NEXT:    v_fma_f32 v5, v6, v4, v5
369; CI-NEXT:    v_fma_f32 v2, -v3, v5, v2
370; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
371; CI-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
372; CI-NEXT:    v_div_fixup_f32 v2, v2, v1, v0
373; CI-NEXT:    v_trunc_f32_e32 v2, v2
374; CI-NEXT:    v_fma_f32 v0, -v2, v1, v0
375; CI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
376; CI-NEXT:    s_endpgm
377;
378; VI-LABEL: frem_f32:
379; VI:       ; %bb.0:
380; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
381; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
382; VI-NEXT:    s_waitcnt lgkmcnt(0)
383; VI-NEXT:    v_mov_b32_e32 v2, s6
384; VI-NEXT:    s_add_u32 s0, s0, 16
385; VI-NEXT:    v_mov_b32_e32 v3, s7
386; VI-NEXT:    s_addc_u32 s1, s1, 0
387; VI-NEXT:    flat_load_dword v4, v[2:3]
388; VI-NEXT:    v_mov_b32_e32 v3, s1
389; VI-NEXT:    v_mov_b32_e32 v2, s0
390; VI-NEXT:    flat_load_dword v2, v[2:3]
391; VI-NEXT:    v_mov_b32_e32 v0, s4
392; VI-NEXT:    v_mov_b32_e32 v1, s5
393; VI-NEXT:    s_waitcnt vmcnt(0)
394; VI-NEXT:    v_div_scale_f32 v5, s[0:1], v2, v2, v4
395; VI-NEXT:    v_div_scale_f32 v3, vcc, v4, v2, v4
396; VI-NEXT:    v_rcp_f32_e32 v6, v5
397; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
398; VI-NEXT:    v_fma_f32 v7, -v5, v6, 1.0
399; VI-NEXT:    v_fma_f32 v6, v7, v6, v6
400; VI-NEXT:    v_mul_f32_e32 v7, v3, v6
401; VI-NEXT:    v_fma_f32 v8, -v5, v7, v3
402; VI-NEXT:    v_fma_f32 v7, v8, v6, v7
403; VI-NEXT:    v_fma_f32 v3, -v5, v7, v3
404; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
405; VI-NEXT:    v_div_fmas_f32 v3, v3, v6, v7
406; VI-NEXT:    v_div_fixup_f32 v3, v3, v2, v4
407; VI-NEXT:    v_trunc_f32_e32 v3, v3
408; VI-NEXT:    v_fma_f32 v2, -v3, v2, v4
409; VI-NEXT:    flat_store_dword v[0:1], v2
410; VI-NEXT:    s_endpgm
411                      float addrspace(1)* %in2) #0 {
; The divisor is read from element index 4 of %in2; the checks above show
; this as a byte offset of 16 on the second load.
412   %gep2 = getelementptr float, float addrspace(1)* %in2, i32 4
413   %r0 = load float, float addrspace(1)* %in1, align 4
414   %r1 = load float, float addrspace(1)* %gep2, align 4
; Operation under test: plain frem, no fast-math flags.
415   %r2 = frem float %r0, %r1
416   store float %r2, float addrspace(1)* %out, align 4
417   ret void
418}
419
; Test: `frem fast float` on addrspace(1) pointers.
; Loads %in1 and element 4 of %in2, then stores the remainder to %out.
; Per the checks below, all three runs use the same four-instruction core:
; v_rcp_f32, v_mul_f32, v_trunc_f32, v_fma_f32. No div_scale / div_fmas /
; div_fixup sequence and no s_setreg_imm32_b32 appear.
; NOTE(review): attribute group #0 is defined outside the lines shown here.
420define amdgpu_kernel void @fast_frem_f32(float addrspace(1)* %out, float addrspace(1)* %in1,
421; SI-LABEL: fast_frem_f32:
422; SI:       ; %bb.0:
423; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
424; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
425; SI-NEXT:    s_mov_b32 s11, 0xf000
426; SI-NEXT:    s_mov_b32 s10, -1
427; SI-NEXT:    s_waitcnt lgkmcnt(0)
428; SI-NEXT:    s_mov_b32 s8, s4
429; SI-NEXT:    s_mov_b32 s9, s5
430; SI-NEXT:    s_mov_b32 s4, s6
431; SI-NEXT:    s_mov_b32 s5, s7
432; SI-NEXT:    s_mov_b32 s6, s10
433; SI-NEXT:    s_mov_b32 s7, s11
434; SI-NEXT:    s_mov_b32 s2, s10
435; SI-NEXT:    s_mov_b32 s3, s11
436; SI-NEXT:    buffer_load_dword v0, off, s[4:7], 0
437; SI-NEXT:    buffer_load_dword v1, off, s[0:3], 0 offset:16
438; SI-NEXT:    s_waitcnt vmcnt(0)
439; SI-NEXT:    v_rcp_f32_e32 v2, v1
440; SI-NEXT:    v_mul_f32_e32 v2, v0, v2
441; SI-NEXT:    v_trunc_f32_e32 v2, v2
442; SI-NEXT:    v_fma_f32 v0, -v2, v1, v0
443; SI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
444; SI-NEXT:    s_endpgm
445;
446; CI-LABEL: fast_frem_f32:
447; CI:       ; %bb.0:
448; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
449; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
450; CI-NEXT:    s_mov_b32 s11, 0xf000
451; CI-NEXT:    s_mov_b32 s10, -1
452; CI-NEXT:    s_mov_b32 s2, s10
453; CI-NEXT:    s_waitcnt lgkmcnt(0)
454; CI-NEXT:    s_mov_b32 s8, s4
455; CI-NEXT:    s_mov_b32 s9, s5
456; CI-NEXT:    s_mov_b32 s4, s6
457; CI-NEXT:    s_mov_b32 s5, s7
458; CI-NEXT:    s_mov_b32 s6, s10
459; CI-NEXT:    s_mov_b32 s7, s11
460; CI-NEXT:    s_mov_b32 s3, s11
461; CI-NEXT:    buffer_load_dword v0, off, s[4:7], 0
462; CI-NEXT:    buffer_load_dword v1, off, s[0:3], 0 offset:16
463; CI-NEXT:    s_waitcnt vmcnt(0)
464; CI-NEXT:    v_rcp_f32_e32 v2, v1
465; CI-NEXT:    v_mul_f32_e32 v2, v0, v2
466; CI-NEXT:    v_trunc_f32_e32 v2, v2
467; CI-NEXT:    v_fma_f32 v0, -v2, v1, v0
468; CI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
469; CI-NEXT:    s_endpgm
470;
471; VI-LABEL: fast_frem_f32:
472; VI:       ; %bb.0:
473; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
474; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
475; VI-NEXT:    s_waitcnt lgkmcnt(0)
476; VI-NEXT:    v_mov_b32_e32 v2, s6
477; VI-NEXT:    s_add_u32 s0, s0, 16
478; VI-NEXT:    v_mov_b32_e32 v3, s7
479; VI-NEXT:    s_addc_u32 s1, s1, 0
480; VI-NEXT:    flat_load_dword v4, v[2:3]
481; VI-NEXT:    v_mov_b32_e32 v3, s1
482; VI-NEXT:    v_mov_b32_e32 v2, s0
483; VI-NEXT:    flat_load_dword v2, v[2:3]
484; VI-NEXT:    v_mov_b32_e32 v0, s4
485; VI-NEXT:    v_mov_b32_e32 v1, s5
486; VI-NEXT:    s_waitcnt vmcnt(0)
487; VI-NEXT:    v_rcp_f32_e32 v3, v2
488; VI-NEXT:    v_mul_f32_e32 v3, v4, v3
489; VI-NEXT:    v_trunc_f32_e32 v3, v3
490; VI-NEXT:    v_fma_f32 v2, -v3, v2, v4
491; VI-NEXT:    flat_store_dword v[0:1], v2
492; VI-NEXT:    s_endpgm
493                      float addrspace(1)* %in2) #0 {
; The divisor is read from element index 4 of %in2; the checks above show
; this as a byte offset of 16 on the second load.
494   %gep2 = getelementptr float, float addrspace(1)* %in2, i32 4
495   %r0 = load float, float addrspace(1)* %in1, align 4
496   %r1 = load float, float addrspace(1)* %gep2, align 4
; Operation under test: frem carrying the `fast` flag.
497   %r2 = frem fast float %r0, %r1
498   store float %r2, float addrspace(1)* %out, align 4
499   ret void
500}
501
; Test: `frem afn float` on addrspace(1) pointers, in a function that uses
; attribute group #1 instead of #0.
; Loads %in1 and element 4 of %in2, then stores the remainder to %out.
; Per the checks below, all three runs use v_rcp_f32, v_mul_f32, v_trunc_f32
; and v_fma_f32; no div_scale / div_fmas / div_fixup sequence appears.
; NOTE(review): attribute group #1 is defined outside the lines shown here;
; presumably it enables unsafe FP math, as the function name suggests.
502define amdgpu_kernel void @unsafe_frem_f32(float addrspace(1)* %out, float addrspace(1)* %in1,
503; SI-LABEL: unsafe_frem_f32:
504; SI:       ; %bb.0:
505; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
506; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
507; SI-NEXT:    s_mov_b32 s11, 0xf000
508; SI-NEXT:    s_mov_b32 s10, -1
509; SI-NEXT:    s_waitcnt lgkmcnt(0)
510; SI-NEXT:    s_mov_b32 s8, s4
511; SI-NEXT:    s_mov_b32 s9, s5
512; SI-NEXT:    s_mov_b32 s4, s6
513; SI-NEXT:    s_mov_b32 s5, s7
514; SI-NEXT:    s_mov_b32 s6, s10
515; SI-NEXT:    s_mov_b32 s7, s11
516; SI-NEXT:    s_mov_b32 s2, s10
517; SI-NEXT:    s_mov_b32 s3, s11
518; SI-NEXT:    buffer_load_dword v0, off, s[4:7], 0
519; SI-NEXT:    buffer_load_dword v1, off, s[0:3], 0 offset:16
520; SI-NEXT:    s_waitcnt vmcnt(0)
521; SI-NEXT:    v_rcp_f32_e32 v2, v1
522; SI-NEXT:    v_mul_f32_e32 v2, v0, v2
523; SI-NEXT:    v_trunc_f32_e32 v2, v2
524; SI-NEXT:    v_fma_f32 v0, -v2, v1, v0
525; SI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
526; SI-NEXT:    s_endpgm
527;
528; CI-LABEL: unsafe_frem_f32:
529; CI:       ; %bb.0:
530; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
531; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
532; CI-NEXT:    s_mov_b32 s11, 0xf000
533; CI-NEXT:    s_mov_b32 s10, -1
534; CI-NEXT:    s_mov_b32 s2, s10
535; CI-NEXT:    s_waitcnt lgkmcnt(0)
536; CI-NEXT:    s_mov_b32 s8, s4
537; CI-NEXT:    s_mov_b32 s9, s5
538; CI-NEXT:    s_mov_b32 s4, s6
539; CI-NEXT:    s_mov_b32 s5, s7
540; CI-NEXT:    s_mov_b32 s6, s10
541; CI-NEXT:    s_mov_b32 s7, s11
542; CI-NEXT:    s_mov_b32 s3, s11
543; CI-NEXT:    buffer_load_dword v0, off, s[4:7], 0
544; CI-NEXT:    buffer_load_dword v1, off, s[0:3], 0 offset:16
545; CI-NEXT:    s_waitcnt vmcnt(0)
546; CI-NEXT:    v_rcp_f32_e32 v2, v1
547; CI-NEXT:    v_mul_f32_e32 v2, v0, v2
548; CI-NEXT:    v_trunc_f32_e32 v2, v2
549; CI-NEXT:    v_fma_f32 v0, -v2, v1, v0
550; CI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
551; CI-NEXT:    s_endpgm
552;
553; VI-LABEL: unsafe_frem_f32:
554; VI:       ; %bb.0:
555; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
556; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
557; VI-NEXT:    s_waitcnt lgkmcnt(0)
558; VI-NEXT:    v_mov_b32_e32 v2, s6
559; VI-NEXT:    s_add_u32 s0, s0, 16
560; VI-NEXT:    v_mov_b32_e32 v3, s7
561; VI-NEXT:    s_addc_u32 s1, s1, 0
562; VI-NEXT:    flat_load_dword v4, v[2:3]
563; VI-NEXT:    v_mov_b32_e32 v3, s1
564; VI-NEXT:    v_mov_b32_e32 v2, s0
565; VI-NEXT:    flat_load_dword v2, v[2:3]
566; VI-NEXT:    v_mov_b32_e32 v0, s4
567; VI-NEXT:    v_mov_b32_e32 v1, s5
568; VI-NEXT:    s_waitcnt vmcnt(0)
569; VI-NEXT:    v_rcp_f32_e32 v3, v2
570; VI-NEXT:    v_mul_f32_e32 v3, v4, v3
571; VI-NEXT:    v_trunc_f32_e32 v3, v3
572; VI-NEXT:    v_fma_f32 v2, -v3, v2, v4
573; VI-NEXT:    flat_store_dword v[0:1], v2
574; VI-NEXT:    s_endpgm
575                             float addrspace(1)* %in2) #1 {
; The divisor is read from element index 4 of %in2; the checks above show
; this as a byte offset of 16 on the second load.
576   %gep2 = getelementptr float, float addrspace(1)* %in2, i32 4
577   %r0 = load float, float addrspace(1)* %in1, align 4
578   %r1 = load float, float addrspace(1)* %gep2, align 4
; Operation under test: frem carrying only the `afn` flag.
579   %r2 = frem afn float %r0, %r1
580   store float %r2, float addrspace(1)* %out, align 4
581   ret void
582}
583
; Test: `frem double` with no fast-math flags, on addrspace(1) pointers.
; Loads %in1 and %in2 directly (no getelementptr) and stores the remainder
; to %out.
; Per the checks below, all three runs divide with v_div_scale_f64,
; v_rcp_f64, a chain of v_fma_f64, v_div_fmas_f64 and v_div_fixup_f64.
; The SI run differs from the other two in two places:
;   - it builds vcc from two v_cmp_eq_u32 results combined with s_xor_b64
;     before v_div_fmas_f64, where the CI and VI runs write vcc directly
;     from the second v_div_scale_f64;
;   - it has no v_trunc_f64; a v_bfe_u32 / v_lshr_b64 / v_and_b32 /
;     v_cndmask_b32 sequence sits in that position instead (presumably the
;     truncation expanded into integer operations).
; NOTE(review): attribute group #0 is defined outside the lines shown here.
584define amdgpu_kernel void @frem_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
585; SI-LABEL: frem_f64:
586; SI:       ; %bb.0:
587; SI-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x9
588; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
589; SI-NEXT:    s_mov_b32 s7, 0xf000
590; SI-NEXT:    s_mov_b32 s6, -1
591; SI-NEXT:    s_waitcnt lgkmcnt(0)
592; SI-NEXT:    s_mov_b32 s4, s8
593; SI-NEXT:    s_mov_b32 s5, s9
594; SI-NEXT:    s_mov_b32 s8, s10
595; SI-NEXT:    s_mov_b32 s9, s11
596; SI-NEXT:    s_mov_b32 s10, s6
597; SI-NEXT:    s_mov_b32 s11, s7
598; SI-NEXT:    s_mov_b32 s2, s6
599; SI-NEXT:    s_mov_b32 s3, s7
600; SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
601; SI-NEXT:    buffer_load_dwordx2 v[2:3], off, s[0:3], 0
602; SI-NEXT:    s_waitcnt vmcnt(0)
603; SI-NEXT:    v_div_scale_f64 v[4:5], s[0:1], v[2:3], v[2:3], v[0:1]
604; SI-NEXT:    v_rcp_f64_e32 v[6:7], v[4:5]
605; SI-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
606; SI-NEXT:    v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
607; SI-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
608; SI-NEXT:    v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
609; SI-NEXT:    v_div_scale_f64 v[8:9], s[0:1], v[0:1], v[2:3], v[0:1]
610; SI-NEXT:    v_mul_f64 v[10:11], v[8:9], v[6:7]
611; SI-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[10:11], v[8:9]
612; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
613; SI-NEXT:    v_cmp_eq_u32_e64 s[0:1], v1, v9
614; SI-NEXT:    s_xor_b64 vcc, s[0:1], vcc
615; SI-NEXT:    s_nop 1
616; SI-NEXT:    v_div_fmas_f64 v[4:5], v[12:13], v[6:7], v[10:11]
617; SI-NEXT:    v_div_fixup_f64 v[4:5], v[4:5], v[2:3], v[0:1]
618; SI-NEXT:    v_bfe_u32 v6, v5, 20, 11
619; SI-NEXT:    v_add_i32_e32 v8, vcc, 0xfffffc01, v6
620; SI-NEXT:    s_mov_b32 s1, 0xfffff
621; SI-NEXT:    s_mov_b32 s0, s6
622; SI-NEXT:    v_lshr_b64 v[6:7], s[0:1], v8
623; SI-NEXT:    v_not_b32_e32 v6, v6
624; SI-NEXT:    v_and_b32_e32 v6, v4, v6
625; SI-NEXT:    v_not_b32_e32 v7, v7
626; SI-NEXT:    v_and_b32_e32 v7, v5, v7
627; SI-NEXT:    v_and_b32_e32 v9, 0x80000000, v5
628; SI-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v8
629; SI-NEXT:    v_cndmask_b32_e32 v7, v7, v9, vcc
630; SI-NEXT:    v_cmp_lt_i32_e64 s[0:1], 51, v8
631; SI-NEXT:    v_cndmask_b32_e64 v5, v7, v5, s[0:1]
632; SI-NEXT:    v_cndmask_b32_e64 v6, v6, 0, vcc
633; SI-NEXT:    v_cndmask_b32_e64 v4, v6, v4, s[0:1]
634; SI-NEXT:    v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
635; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
636; SI-NEXT:    s_endpgm
637;
638; CI-LABEL: frem_f64:
639; CI:       ; %bb.0:
640; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
641; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
642; CI-NEXT:    s_mov_b32 s11, 0xf000
643; CI-NEXT:    s_mov_b32 s10, -1
644; CI-NEXT:    s_mov_b32 s2, s10
645; CI-NEXT:    s_waitcnt lgkmcnt(0)
646; CI-NEXT:    s_mov_b32 s8, s4
647; CI-NEXT:    s_mov_b32 s9, s5
648; CI-NEXT:    s_mov_b32 s4, s6
649; CI-NEXT:    s_mov_b32 s5, s7
650; CI-NEXT:    s_mov_b32 s6, s10
651; CI-NEXT:    s_mov_b32 s7, s11
652; CI-NEXT:    s_mov_b32 s3, s11
653; CI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
654; CI-NEXT:    buffer_load_dwordx2 v[2:3], off, s[0:3], 0
655; CI-NEXT:    s_waitcnt vmcnt(0)
656; CI-NEXT:    v_div_scale_f64 v[4:5], s[0:1], v[2:3], v[2:3], v[0:1]
657; CI-NEXT:    v_rcp_f64_e32 v[6:7], v[4:5]
658; CI-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
659; CI-NEXT:    v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
660; CI-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
661; CI-NEXT:    v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
662; CI-NEXT:    v_div_scale_f64 v[8:9], vcc, v[0:1], v[2:3], v[0:1]
663; CI-NEXT:    v_mul_f64 v[10:11], v[8:9], v[6:7]
664; CI-NEXT:    v_fma_f64 v[4:5], -v[4:5], v[10:11], v[8:9]
665; CI-NEXT:    s_nop 1
666; CI-NEXT:    v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[10:11]
667; CI-NEXT:    v_div_fixup_f64 v[4:5], v[4:5], v[2:3], v[0:1]
668; CI-NEXT:    v_trunc_f64_e32 v[4:5], v[4:5]
669; CI-NEXT:    v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
670; CI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
671; CI-NEXT:    s_endpgm
672;
673; VI-LABEL: frem_f64:
674; VI:       ; %bb.0:
675; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
676; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
677; VI-NEXT:    s_waitcnt lgkmcnt(0)
678; VI-NEXT:    v_mov_b32_e32 v2, s6
679; VI-NEXT:    v_mov_b32_e32 v3, s7
680; VI-NEXT:    v_mov_b32_e32 v4, s0
681; VI-NEXT:    v_mov_b32_e32 v5, s1
682; VI-NEXT:    flat_load_dwordx2 v[2:3], v[2:3]
683; VI-NEXT:    flat_load_dwordx2 v[4:5], v[4:5]
684; VI-NEXT:    v_mov_b32_e32 v0, s4
685; VI-NEXT:    v_mov_b32_e32 v1, s5
686; VI-NEXT:    s_waitcnt vmcnt(0)
687; VI-NEXT:    v_div_scale_f64 v[6:7], s[0:1], v[4:5], v[4:5], v[2:3]
688; VI-NEXT:    v_rcp_f64_e32 v[8:9], v[6:7]
689; VI-NEXT:    v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
690; VI-NEXT:    v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
691; VI-NEXT:    v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
692; VI-NEXT:    v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
693; VI-NEXT:    v_div_scale_f64 v[10:11], vcc, v[2:3], v[4:5], v[2:3]
694; VI-NEXT:    v_mul_f64 v[12:13], v[10:11], v[8:9]
695; VI-NEXT:    v_fma_f64 v[6:7], -v[6:7], v[12:13], v[10:11]
696; VI-NEXT:    s_nop 1
697; VI-NEXT:    v_div_fmas_f64 v[6:7], v[6:7], v[8:9], v[12:13]
698; VI-NEXT:    v_div_fixup_f64 v[6:7], v[6:7], v[4:5], v[2:3]
699; VI-NEXT:    v_trunc_f64_e32 v[6:7], v[6:7]
700; VI-NEXT:    v_fma_f64 v[2:3], -v[6:7], v[4:5], v[2:3]
701; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
702; VI-NEXT:    s_endpgm
703                      double addrspace(1)* %in2) #0 {
; Unlike the f16 and f32 tests in this file, both operands are loaded from
; the pointer arguments themselves, with 8-byte alignment.
704   %r0 = load double, double addrspace(1)* %in1, align 8
705   %r1 = load double, double addrspace(1)* %in2, align 8
; Operation under test: plain frem, no fast-math flags.
706   %r2 = frem double %r0, %r1
707   store double %r2, double addrspace(1)* %out, align 8
708   ret void
709}
710
711define amdgpu_kernel void @fast_frem_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
712; SI-LABEL: fast_frem_f64:
713; SI:       ; %bb.0:
714; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
715; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
716; SI-NEXT:    s_mov_b32 s11, 0xf000
717; SI-NEXT:    s_mov_b32 s10, -1
718; SI-NEXT:    s_waitcnt lgkmcnt(0)
719; SI-NEXT:    s_mov_b32 s8, s4
720; SI-NEXT:    s_mov_b32 s9, s5
721; SI-NEXT:    s_mov_b32 s4, s6
722; SI-NEXT:    s_mov_b32 s5, s7
723; SI-NEXT:    s_mov_b32 s6, s10
724; SI-NEXT:    s_mov_b32 s7, s11
725; SI-NEXT:    s_mov_b32 s2, s10
726; SI-NEXT:    s_mov_b32 s3, s11
727; SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
728; SI-NEXT:    buffer_load_dwordx2 v[2:3], off, s[0:3], 0
729; SI-NEXT:    s_waitcnt vmcnt(0)
730; SI-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
731; SI-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
732; SI-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
733; SI-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
734; SI-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
735; SI-NEXT:    v_mul_f64 v[6:7], v[0:1], v[4:5]
736; SI-NEXT:    v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1]
737; SI-NEXT:    v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
738; SI-NEXT:    v_bfe_u32 v6, v5, 20, 11
739; SI-NEXT:    v_add_i32_e32 v8, vcc, 0xfffffc01, v6
740; SI-NEXT:    s_mov_b32 s1, 0xfffff
741; SI-NEXT:    s_mov_b32 s0, s10
742; SI-NEXT:    v_lshr_b64 v[6:7], s[0:1], v8
743; SI-NEXT:    v_not_b32_e32 v6, v6
744; SI-NEXT:    v_and_b32_e32 v6, v4, v6
745; SI-NEXT:    v_not_b32_e32 v7, v7
746; SI-NEXT:    v_and_b32_e32 v7, v5, v7
747; SI-NEXT:    v_and_b32_e32 v9, 0x80000000, v5
748; SI-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v8
749; SI-NEXT:    v_cndmask_b32_e32 v7, v7, v9, vcc
750; SI-NEXT:    v_cmp_lt_i32_e64 s[0:1], 51, v8
751; SI-NEXT:    v_cndmask_b32_e64 v5, v7, v5, s[0:1]
752; SI-NEXT:    v_cndmask_b32_e64 v6, v6, 0, vcc
753; SI-NEXT:    v_cndmask_b32_e64 v4, v6, v4, s[0:1]
754; SI-NEXT:    v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
755; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
756; SI-NEXT:    s_endpgm
757;
758; CI-LABEL: fast_frem_f64:
759; CI:       ; %bb.0:
760; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
761; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
762; CI-NEXT:    s_mov_b32 s11, 0xf000
763; CI-NEXT:    s_mov_b32 s10, -1
764; CI-NEXT:    s_mov_b32 s2, s10
765; CI-NEXT:    s_waitcnt lgkmcnt(0)
766; CI-NEXT:    s_mov_b32 s8, s4
767; CI-NEXT:    s_mov_b32 s9, s5
768; CI-NEXT:    s_mov_b32 s4, s6
769; CI-NEXT:    s_mov_b32 s5, s7
770; CI-NEXT:    s_mov_b32 s6, s10
771; CI-NEXT:    s_mov_b32 s7, s11
772; CI-NEXT:    s_mov_b32 s3, s11
773; CI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
774; CI-NEXT:    buffer_load_dwordx2 v[2:3], off, s[0:3], 0
775; CI-NEXT:    s_waitcnt vmcnt(0)
776; CI-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
777; CI-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
778; CI-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
779; CI-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
780; CI-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
781; CI-NEXT:    v_mul_f64 v[6:7], v[0:1], v[4:5]
782; CI-NEXT:    v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1]
783; CI-NEXT:    v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
784; CI-NEXT:    v_trunc_f64_e32 v[4:5], v[4:5]
785; CI-NEXT:    v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
786; CI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
787; CI-NEXT:    s_endpgm
788;
789; VI-LABEL: fast_frem_f64:
790; VI:       ; %bb.0:
791; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
792; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
793; VI-NEXT:    s_waitcnt lgkmcnt(0)
794; VI-NEXT:    v_mov_b32_e32 v2, s6
795; VI-NEXT:    v_mov_b32_e32 v3, s7
796; VI-NEXT:    v_mov_b32_e32 v4, s0
797; VI-NEXT:    v_mov_b32_e32 v5, s1
798; VI-NEXT:    flat_load_dwordx2 v[2:3], v[2:3]
799; VI-NEXT:    flat_load_dwordx2 v[4:5], v[4:5]
800; VI-NEXT:    v_mov_b32_e32 v0, s4
801; VI-NEXT:    v_mov_b32_e32 v1, s5
802; VI-NEXT:    s_waitcnt vmcnt(0)
803; VI-NEXT:    v_rcp_f64_e32 v[6:7], v[4:5]
804; VI-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
805; VI-NEXT:    v_fma_f64 v[6:7], v[8:9], v[6:7], v[6:7]
806; VI-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
807; VI-NEXT:    v_fma_f64 v[6:7], v[8:9], v[6:7], v[6:7]
808; VI-NEXT:    v_mul_f64 v[8:9], v[2:3], v[6:7]
809; VI-NEXT:    v_fma_f64 v[10:11], -v[4:5], v[8:9], v[2:3]
810; VI-NEXT:    v_fma_f64 v[6:7], v[10:11], v[6:7], v[8:9]
811; VI-NEXT:    v_trunc_f64_e32 v[6:7], v[6:7]
812; VI-NEXT:    v_fma_f64 v[2:3], -v[6:7], v[4:5], v[2:3]
813; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
814; VI-NEXT:    s_endpgm
815                      double addrspace(1)* %in2) #0 {
816   %r0 = load double, double addrspace(1)* %in1, align 8
817   %r1 = load double, double addrspace(1)* %in2, align 8
818   %r2 = frem fast double %r0, %r1
819   store double %r2, double addrspace(1)* %out, align 8
820   ret void
821}
822
; unsafe_frem_f64: loads one double from each of %in1 and %in2, takes their
; remainder with `frem afn`, and stores the result to %out.
;
; The kernel uses attribute group #1, which is defined outside this part of
; the file (NOTE(review): presumably the unsafe-fp-math attribute set - confirm
; against the attribute definitions at the end of the file).
;
; What the check lines below pin down for each check prefix:
;   - all three prefixes expect v_rcp_f64 followed by a chain of v_fma_f64 and
;     one v_mul_f64 to form the quotient, and a final v_fma_f64 that combines
;     the negated truncated quotient with the %in2 value and the %in1 value;
;   - the CI and VI prefixes expect a single v_trunc_f64 before that final fma;
;   - the SI prefix expects an integer sequence (v_bfe_u32 ... v_cndmask_b32)
;     in that position instead of v_trunc_f64.
;
; The check lines are autogenerated by utils/update_llc_test_checks.py (see the
; NOTE at the top of the file); regenerate them rather than editing by hand.
823define amdgpu_kernel void @unsafe_frem_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
824; SI-LABEL: unsafe_frem_f64:
825; SI:       ; %bb.0:
826; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
827; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
828; SI-NEXT:    s_mov_b32 s11, 0xf000
829; SI-NEXT:    s_mov_b32 s10, -1
830; SI-NEXT:    s_waitcnt lgkmcnt(0)
831; SI-NEXT:    s_mov_b32 s8, s4
832; SI-NEXT:    s_mov_b32 s9, s5
833; SI-NEXT:    s_mov_b32 s4, s6
834; SI-NEXT:    s_mov_b32 s5, s7
835; SI-NEXT:    s_mov_b32 s6, s10
836; SI-NEXT:    s_mov_b32 s7, s11
837; SI-NEXT:    s_mov_b32 s2, s10
838; SI-NEXT:    s_mov_b32 s3, s11
839; SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
840; SI-NEXT:    buffer_load_dwordx2 v[2:3], off, s[0:3], 0
841; SI-NEXT:    s_waitcnt vmcnt(0)
842; SI-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
843; SI-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
844; SI-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
845; SI-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
846; SI-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
847; SI-NEXT:    v_mul_f64 v[6:7], v[0:1], v[4:5]
848; SI-NEXT:    v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1]
849; SI-NEXT:    v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
850; SI-NEXT:    v_bfe_u32 v6, v5, 20, 11
851; SI-NEXT:    v_add_i32_e32 v8, vcc, 0xfffffc01, v6
852; SI-NEXT:    s_mov_b32 s1, 0xfffff
853; SI-NEXT:    s_mov_b32 s0, s10
854; SI-NEXT:    v_lshr_b64 v[6:7], s[0:1], v8
855; SI-NEXT:    v_not_b32_e32 v6, v6
856; SI-NEXT:    v_and_b32_e32 v6, v4, v6
857; SI-NEXT:    v_not_b32_e32 v7, v7
858; SI-NEXT:    v_and_b32_e32 v7, v5, v7
859; SI-NEXT:    v_and_b32_e32 v9, 0x80000000, v5
860; SI-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v8
861; SI-NEXT:    v_cndmask_b32_e32 v7, v7, v9, vcc
862; SI-NEXT:    v_cmp_lt_i32_e64 s[0:1], 51, v8
863; SI-NEXT:    v_cndmask_b32_e64 v5, v7, v5, s[0:1]
864; SI-NEXT:    v_cndmask_b32_e64 v6, v6, 0, vcc
865; SI-NEXT:    v_cndmask_b32_e64 v4, v6, v4, s[0:1]
866; SI-NEXT:    v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
867; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
868; SI-NEXT:    s_endpgm
869;
870; CI-LABEL: unsafe_frem_f64:
871; CI:       ; %bb.0:
872; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
873; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
874; CI-NEXT:    s_mov_b32 s11, 0xf000
875; CI-NEXT:    s_mov_b32 s10, -1
876; CI-NEXT:    s_mov_b32 s2, s10
877; CI-NEXT:    s_waitcnt lgkmcnt(0)
878; CI-NEXT:    s_mov_b32 s8, s4
879; CI-NEXT:    s_mov_b32 s9, s5
880; CI-NEXT:    s_mov_b32 s4, s6
881; CI-NEXT:    s_mov_b32 s5, s7
882; CI-NEXT:    s_mov_b32 s6, s10
883; CI-NEXT:    s_mov_b32 s7, s11
884; CI-NEXT:    s_mov_b32 s3, s11
885; CI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
886; CI-NEXT:    buffer_load_dwordx2 v[2:3], off, s[0:3], 0
887; CI-NEXT:    s_waitcnt vmcnt(0)
888; CI-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
889; CI-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
890; CI-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
891; CI-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
892; CI-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
893; CI-NEXT:    v_mul_f64 v[6:7], v[0:1], v[4:5]
894; CI-NEXT:    v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1]
895; CI-NEXT:    v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
896; CI-NEXT:    v_trunc_f64_e32 v[4:5], v[4:5]
897; CI-NEXT:    v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
898; CI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
899; CI-NEXT:    s_endpgm
900;
901; VI-LABEL: unsafe_frem_f64:
902; VI:       ; %bb.0:
903; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
904; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
905; VI-NEXT:    s_waitcnt lgkmcnt(0)
906; VI-NEXT:    v_mov_b32_e32 v2, s6
907; VI-NEXT:    v_mov_b32_e32 v3, s7
908; VI-NEXT:    v_mov_b32_e32 v4, s0
909; VI-NEXT:    v_mov_b32_e32 v5, s1
910; VI-NEXT:    flat_load_dwordx2 v[2:3], v[2:3]
911; VI-NEXT:    flat_load_dwordx2 v[4:5], v[4:5]
912; VI-NEXT:    v_mov_b32_e32 v0, s4
913; VI-NEXT:    v_mov_b32_e32 v1, s5
914; VI-NEXT:    s_waitcnt vmcnt(0)
915; VI-NEXT:    v_rcp_f64_e32 v[6:7], v[4:5]
916; VI-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
917; VI-NEXT:    v_fma_f64 v[6:7], v[8:9], v[6:7], v[6:7]
918; VI-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
919; VI-NEXT:    v_fma_f64 v[6:7], v[8:9], v[6:7], v[6:7]
920; VI-NEXT:    v_mul_f64 v[8:9], v[2:3], v[6:7]
921; VI-NEXT:    v_fma_f64 v[10:11], -v[4:5], v[8:9], v[2:3]
922; VI-NEXT:    v_fma_f64 v[6:7], v[10:11], v[6:7], v[8:9]
923; VI-NEXT:    v_trunc_f64_e32 v[6:7], v[6:7]
924; VI-NEXT:    v_fma_f64 v[2:3], -v[6:7], v[4:5], v[2:3]
925; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
926; VI-NEXT:    s_endpgm
927                             double addrspace(1)* %in2) #1 {
928   %r0 = load double, double addrspace(1)* %in1, align 8
929   %r1 = load double, double addrspace(1)* %in2, align 8
   ; Only the `afn` fast-math flag is set on this frem.
930   %r2 = frem afn double %r0, %r1
931   store double %r2, double addrspace(1)* %out, align 8
932   ret void
933}
934
; frem_v2f16: remainder of two <2 x half> vectors with a plain `frem` (no
; fast-math flags). The first operand is loaded from %in1; the second is
; loaded from %in2 advanced by four vector elements, which the checks show as
; a 16-byte displacement (an `offset:16` on the buffer load for the SI and CI
; prefixes, an s_add_u32 of 16 on the address for the VI prefix).
;
; What the check lines below pin down for each check prefix:
;   - SI and CI prefixes: each half lane is widened with v_cvt_f32_f16, divided
;     with the v_div_scale_f32 / v_rcp_f32 / v_fma_f32 / v_div_fmas_f32 /
;     v_div_fixup_f32 sequence, passed through v_trunc_f32 and a final
;     v_fma_f32, and narrowed with v_cvt_f16_f32. The two lane results are
;     packed with a 16-bit left shift and v_or_b32. The s_setreg writes to
;     HW_REG_MODE around each fma chain are part of the expected output
;     (NOTE(review): presumably FP mode switches - confirm against the ISA docs).
;   - VI prefix: v_rcp_f32 and v_mul_f32 form the quotient estimate in f32; it
;     is narrowed with v_cvt_f16_f32 and then v_div_fixup_f16, v_trunc_f16 and
;     v_fma_f16 operate on f16 values. No s_setreg is expected here.
;
; The check lines are autogenerated by utils/update_llc_test_checks.py (see the
; NOTE at the top of the file); regenerate them rather than editing by hand.
935define amdgpu_kernel void @frem_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in1,
936; SI-LABEL: frem_v2f16:
937; SI:       ; %bb.0:
938; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
939; SI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xd
940; SI-NEXT:    s_mov_b32 s3, 0xf000
941; SI-NEXT:    s_mov_b32 s2, -1
942; SI-NEXT:    s_waitcnt lgkmcnt(0)
943; SI-NEXT:    s_mov_b32 s0, s4
944; SI-NEXT:    s_mov_b32 s1, s5
945; SI-NEXT:    s_mov_b32 s4, s6
946; SI-NEXT:    s_mov_b32 s5, s7
947; SI-NEXT:    s_mov_b32 s6, s2
948; SI-NEXT:    s_mov_b32 s7, s3
949; SI-NEXT:    s_mov_b32 s10, s2
950; SI-NEXT:    s_mov_b32 s11, s3
951; SI-NEXT:    buffer_load_dword v0, off, s[4:7], 0
952; SI-NEXT:    s_waitcnt vmcnt(0)
953; SI-NEXT:    v_cvt_f32_f16_e32 v1, v0
954; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
955; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
956; SI-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:16
957; SI-NEXT:    s_waitcnt vmcnt(0)
958; SI-NEXT:    v_cvt_f32_f16_e32 v3, v2
959; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
960; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
961; SI-NEXT:    v_div_scale_f32 v4, vcc, v0, v2, v0
962; SI-NEXT:    v_div_scale_f32 v5, s[4:5], v2, v2, v0
963; SI-NEXT:    v_rcp_f32_e32 v6, v5
964; SI-NEXT:    s_mov_b32 s6, 3
965; SI-NEXT:    s_mov_b32 s7, 0
966; SI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6
967; SI-NEXT:    v_fma_f32 v7, -v5, v6, 1.0
968; SI-NEXT:    v_fma_f32 v6, v7, v6, v6
969; SI-NEXT:    v_mul_f32_e32 v7, v4, v6
970; SI-NEXT:    v_fma_f32 v8, -v5, v7, v4
971; SI-NEXT:    v_fma_f32 v7, v8, v6, v7
972; SI-NEXT:    v_fma_f32 v4, -v5, v7, v4
973; SI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7
974; SI-NEXT:    v_div_fmas_f32 v4, v4, v6, v7
975; SI-NEXT:    v_div_fixup_f32 v4, v4, v2, v0
976; SI-NEXT:    v_trunc_f32_e32 v4, v4
977; SI-NEXT:    v_fma_f32 v0, -v4, v2, v0
978; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
979; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
980; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
981; SI-NEXT:    v_div_scale_f32 v2, vcc, v1, v3, v1
982; SI-NEXT:    v_div_scale_f32 v4, s[4:5], v3, v3, v1
983; SI-NEXT:    v_rcp_f32_e32 v5, v4
984; SI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6
985; SI-NEXT:    v_fma_f32 v6, -v4, v5, 1.0
986; SI-NEXT:    v_fma_f32 v5, v6, v5, v5
987; SI-NEXT:    v_mul_f32_e32 v6, v2, v5
988; SI-NEXT:    v_fma_f32 v7, -v4, v6, v2
989; SI-NEXT:    v_fma_f32 v6, v7, v5, v6
990; SI-NEXT:    v_fma_f32 v2, -v4, v6, v2
991; SI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7
992; SI-NEXT:    v_div_fmas_f32 v2, v2, v5, v6
993; SI-NEXT:    v_div_fixup_f32 v2, v2, v3, v1
994; SI-NEXT:    v_trunc_f32_e32 v2, v2
995; SI-NEXT:    v_fma_f32 v1, -v2, v3, v1
996; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
997; SI-NEXT:    v_or_b32_e32 v0, v1, v0
998; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
999; SI-NEXT:    s_endpgm
1000;
1001; CI-LABEL: frem_v2f16:
1002; CI:       ; %bb.0:
1003; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
1004; CI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xd
1005; CI-NEXT:    s_mov_b32 s3, 0xf000
1006; CI-NEXT:    s_mov_b32 s2, -1
1007; CI-NEXT:    s_mov_b32 s10, s2
1008; CI-NEXT:    s_waitcnt lgkmcnt(0)
1009; CI-NEXT:    s_mov_b32 s0, s4
1010; CI-NEXT:    s_mov_b32 s1, s5
1011; CI-NEXT:    s_mov_b32 s4, s6
1012; CI-NEXT:    s_mov_b32 s5, s7
1013; CI-NEXT:    s_mov_b32 s11, s3
1014; CI-NEXT:    s_mov_b32 s6, s2
1015; CI-NEXT:    s_mov_b32 s7, s3
1016; CI-NEXT:    buffer_load_dword v0, off, s[4:7], 0
1017; CI-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:16
1018; CI-NEXT:    s_mov_b32 s6, 3
1019; CI-NEXT:    s_mov_b32 s7, 0
1020; CI-NEXT:    s_waitcnt vmcnt(1)
1021; CI-NEXT:    v_cvt_f32_f16_e32 v1, v0
1022; CI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
1023; CI-NEXT:    s_waitcnt vmcnt(0)
1024; CI-NEXT:    v_cvt_f32_f16_e32 v3, v2
1025; CI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
1026; CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
1027; CI-NEXT:    v_cvt_f32_f16_e32 v2, v2
1028; CI-NEXT:    v_div_scale_f32 v5, s[4:5], v2, v2, v0
1029; CI-NEXT:    v_div_scale_f32 v4, vcc, v0, v2, v0
1030; CI-NEXT:    v_rcp_f32_e32 v6, v5
1031; CI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6
1032; CI-NEXT:    v_fma_f32 v7, -v5, v6, 1.0
1033; CI-NEXT:    v_fma_f32 v6, v7, v6, v6
1034; CI-NEXT:    v_mul_f32_e32 v7, v4, v6
1035; CI-NEXT:    v_fma_f32 v8, -v5, v7, v4
1036; CI-NEXT:    v_fma_f32 v7, v8, v6, v7
1037; CI-NEXT:    v_fma_f32 v4, -v5, v7, v4
1038; CI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7
1039; CI-NEXT:    v_div_fmas_f32 v4, v4, v6, v7
1040; CI-NEXT:    v_div_fixup_f32 v4, v4, v2, v0
1041; CI-NEXT:    v_trunc_f32_e32 v4, v4
1042; CI-NEXT:    v_fma_f32 v0, -v4, v2, v0
1043; CI-NEXT:    v_div_scale_f32 v4, s[4:5], v3, v3, v1
1044; CI-NEXT:    v_div_scale_f32 v2, vcc, v1, v3, v1
1045; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
1046; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
1047; CI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
1048; CI-NEXT:    v_rcp_f32_e32 v5, v4
1049; CI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6
1050; CI-NEXT:    v_fma_f32 v6, -v4, v5, 1.0
1051; CI-NEXT:    v_fma_f32 v5, v6, v5, v5
1052; CI-NEXT:    v_mul_f32_e32 v6, v2, v5
1053; CI-NEXT:    v_fma_f32 v7, -v4, v6, v2
1054; CI-NEXT:    v_fma_f32 v6, v7, v5, v6
1055; CI-NEXT:    v_fma_f32 v2, -v4, v6, v2
1056; CI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7
1057; CI-NEXT:    v_div_fmas_f32 v2, v2, v5, v6
1058; CI-NEXT:    v_div_fixup_f32 v2, v2, v3, v1
1059; CI-NEXT:    v_trunc_f32_e32 v2, v2
1060; CI-NEXT:    v_fma_f32 v1, -v2, v3, v1
1061; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
1062; CI-NEXT:    v_or_b32_e32 v0, v1, v0
1063; CI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1064; CI-NEXT:    s_endpgm
1065;
1066; VI-LABEL: frem_v2f16:
1067; VI:       ; %bb.0:
1068; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1069; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1070; VI-NEXT:    s_waitcnt lgkmcnt(0)
1071; VI-NEXT:    v_mov_b32_e32 v2, s6
1072; VI-NEXT:    s_add_u32 s0, s0, 16
1073; VI-NEXT:    v_mov_b32_e32 v3, s7
1074; VI-NEXT:    s_addc_u32 s1, s1, 0
1075; VI-NEXT:    flat_load_dword v4, v[2:3]
1076; VI-NEXT:    v_mov_b32_e32 v3, s1
1077; VI-NEXT:    v_mov_b32_e32 v2, s0
1078; VI-NEXT:    flat_load_dword v2, v[2:3]
1079; VI-NEXT:    v_mov_b32_e32 v0, s4
1080; VI-NEXT:    v_mov_b32_e32 v1, s5
1081; VI-NEXT:    s_waitcnt vmcnt(1)
1082; VI-NEXT:    v_lshrrev_b32_e32 v3, 16, v4
1083; VI-NEXT:    v_cvt_f32_f16_e32 v5, v3
1084; VI-NEXT:    s_waitcnt vmcnt(0)
1085; VI-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
1086; VI-NEXT:    v_cvt_f32_f16_e32 v7, v6
1087; VI-NEXT:    v_rcp_f32_e32 v7, v7
1088; VI-NEXT:    v_mul_f32_e32 v5, v5, v7
1089; VI-NEXT:    v_cvt_f16_f32_e32 v5, v5
1090; VI-NEXT:    v_div_fixup_f16 v5, v5, v6, v3
1091; VI-NEXT:    v_trunc_f16_e32 v5, v5
1092; VI-NEXT:    v_fma_f16 v3, -v5, v6, v3
1093; VI-NEXT:    v_cvt_f32_f16_e32 v6, v2
1094; VI-NEXT:    v_cvt_f32_f16_e32 v5, v4
1095; VI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
1096; VI-NEXT:    v_rcp_f32_e32 v6, v6
1097; VI-NEXT:    v_mul_f32_e32 v5, v5, v6
1098; VI-NEXT:    v_cvt_f16_f32_e32 v5, v5
1099; VI-NEXT:    v_div_fixup_f16 v5, v5, v2, v4
1100; VI-NEXT:    v_trunc_f16_e32 v5, v5
1101; VI-NEXT:    v_fma_f16 v2, -v5, v2, v4
1102; VI-NEXT:    v_or_b32_e32 v2, v2, v3
1103; VI-NEXT:    flat_store_dword v[0:1], v2
1104; VI-NEXT:    s_endpgm
1105                        <2 x half> addrspace(1)* %in2) #0 {
   ; Step %in2 forward by 4 elements of <2 x half> before loading the divisor.
1106   %gep2 = getelementptr <2 x half>, <2 x half> addrspace(1)* %in2, i32 4
1107   %r0 = load <2 x half>, <2 x half> addrspace(1)* %in1, align 8
1108   %r1 = load <2 x half>, <2 x half> addrspace(1)* %gep2, align 8
1109   %r2 = frem <2 x half> %r0, %r1
1110   store <2 x half> %r2, <2 x half> addrspace(1)* %out, align 8
1111   ret void
1112}
1113
; frem_v4f16: remainder of two <4 x half> vectors with a plain `frem` (no
; fast-math flags). The first operand is loaded from %in1; the second is
; loaded from %in2 advanced by four vector elements, which the checks show as
; a 32-byte displacement (an `offset:32` on the buffer load for the SI and CI
; prefixes, an s_add_u32 of 32 on the address for the VI prefix).
;
; What the check lines below pin down for each check prefix:
;   - SI and CI prefixes: the same per-lane f32 sequence is expected four
;     times: v_cvt_f32_f16 on the inputs, then v_div_scale_f32 / v_rcp_f32 /
;     v_fma_f32 / v_div_fmas_f32 / v_div_fixup_f32, v_trunc_f32, a final
;     v_fma_f32, and v_cvt_f16_f32. Lane pairs are packed with a 16-bit left
;     shift and v_or_b32, and the result is written with one dwordx2 store.
;     The s_setreg writes to HW_REG_MODE around each fma chain are part of the
;     expected output (NOTE(review): presumably FP mode switches - confirm
;     against the ISA docs).
;   - VI prefix: four times, v_rcp_f32 and v_mul_f32 form the quotient estimate
;     in f32; it is narrowed with v_cvt_f16_f32 and then v_div_fixup_f16,
;     v_trunc_f16 and v_fma_f16 operate on f16 values. No s_setreg is expected.
;
; The check lines are autogenerated by utils/update_llc_test_checks.py (see the
; NOTE at the top of the file); regenerate them rather than editing by hand.
1114define amdgpu_kernel void @frem_v4f16(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in1,
1115; SI-LABEL: frem_v4f16:
1116; SI:       ; %bb.0:
1117; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
1118; SI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xd
1119; SI-NEXT:    s_mov_b32 s3, 0xf000
1120; SI-NEXT:    s_mov_b32 s2, -1
1121; SI-NEXT:    s_waitcnt lgkmcnt(0)
1122; SI-NEXT:    s_mov_b32 s0, s4
1123; SI-NEXT:    s_mov_b32 s1, s5
1124; SI-NEXT:    s_mov_b32 s4, s6
1125; SI-NEXT:    s_mov_b32 s5, s7
1126; SI-NEXT:    s_mov_b32 s6, s2
1127; SI-NEXT:    s_mov_b32 s7, s3
1128; SI-NEXT:    s_mov_b32 s10, s2
1129; SI-NEXT:    s_mov_b32 s11, s3
1130; SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
1131; SI-NEXT:    s_waitcnt vmcnt(0)
1132; SI-NEXT:    v_cvt_f32_f16_e32 v2, v0
1133; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
1134; SI-NEXT:    v_cvt_f32_f16_e32 v3, v0
1135; SI-NEXT:    v_cvt_f32_f16_e32 v4, v1
1136; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v1
1137; SI-NEXT:    v_cvt_f32_f16_e32 v5, v0
1138; SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0 offset:32
1139; SI-NEXT:    s_waitcnt vmcnt(0)
1140; SI-NEXT:    v_cvt_f32_f16_e32 v6, v0
1141; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
1142; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
1143; SI-NEXT:    v_cvt_f32_f16_e32 v7, v1
1144; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
1145; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
1146; SI-NEXT:    v_div_scale_f32 v8, vcc, v5, v1, v5
1147; SI-NEXT:    v_div_scale_f32 v9, s[4:5], v1, v1, v5
1148; SI-NEXT:    v_rcp_f32_e32 v10, v9
1149; SI-NEXT:    s_mov_b32 s6, 3
1150; SI-NEXT:    s_mov_b32 s7, 0
1151; SI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6
1152; SI-NEXT:    v_fma_f32 v11, -v9, v10, 1.0
1153; SI-NEXT:    v_fma_f32 v10, v11, v10, v10
1154; SI-NEXT:    v_mul_f32_e32 v11, v8, v10
1155; SI-NEXT:    v_fma_f32 v12, -v9, v11, v8
1156; SI-NEXT:    v_fma_f32 v11, v12, v10, v11
1157; SI-NEXT:    v_fma_f32 v8, -v9, v11, v8
1158; SI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7
1159; SI-NEXT:    v_div_fmas_f32 v8, v8, v10, v11
1160; SI-NEXT:    v_div_fixup_f32 v8, v8, v1, v5
1161; SI-NEXT:    v_trunc_f32_e32 v8, v8
1162; SI-NEXT:    v_fma_f32 v1, -v8, v1, v5
1163; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
1164; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
1165; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
1166; SI-NEXT:    v_div_scale_f32 v5, vcc, v4, v7, v4
1167; SI-NEXT:    v_div_scale_f32 v8, s[4:5], v7, v7, v4
1168; SI-NEXT:    v_rcp_f32_e32 v9, v8
1169; SI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6
1170; SI-NEXT:    v_fma_f32 v10, -v8, v9, 1.0
1171; SI-NEXT:    v_fma_f32 v9, v10, v9, v9
1172; SI-NEXT:    v_mul_f32_e32 v10, v5, v9
1173; SI-NEXT:    v_fma_f32 v11, -v8, v10, v5
1174; SI-NEXT:    v_fma_f32 v10, v11, v9, v10
1175; SI-NEXT:    v_fma_f32 v5, -v8, v10, v5
1176; SI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7
1177; SI-NEXT:    v_div_fmas_f32 v5, v5, v9, v10
1178; SI-NEXT:    v_div_fixup_f32 v5, v5, v7, v4
1179; SI-NEXT:    v_trunc_f32_e32 v5, v5
1180; SI-NEXT:    v_fma_f32 v4, -v5, v7, v4
1181; SI-NEXT:    v_cvt_f16_f32_e32 v4, v4
1182; SI-NEXT:    v_or_b32_e32 v1, v4, v1
1183; SI-NEXT:    v_div_scale_f32 v4, vcc, v3, v0, v3
1184; SI-NEXT:    v_div_scale_f32 v5, s[4:5], v0, v0, v3
1185; SI-NEXT:    v_rcp_f32_e32 v7, v5
1186; SI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6
1187; SI-NEXT:    v_fma_f32 v8, -v5, v7, 1.0
1188; SI-NEXT:    v_fma_f32 v7, v8, v7, v7
1189; SI-NEXT:    v_mul_f32_e32 v8, v4, v7
1190; SI-NEXT:    v_fma_f32 v9, -v5, v8, v4
1191; SI-NEXT:    v_fma_f32 v8, v9, v7, v8
1192; SI-NEXT:    v_fma_f32 v4, -v5, v8, v4
1193; SI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7
1194; SI-NEXT:    v_div_fmas_f32 v4, v4, v7, v8
1195; SI-NEXT:    v_div_fixup_f32 v4, v4, v0, v3
1196; SI-NEXT:    v_trunc_f32_e32 v4, v4
1197; SI-NEXT:    v_fma_f32 v0, -v4, v0, v3
1198; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
1199; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
1200; SI-NEXT:    v_div_scale_f32 v3, vcc, v2, v6, v2
1201; SI-NEXT:    v_div_scale_f32 v4, s[4:5], v6, v6, v2
1202; SI-NEXT:    v_rcp_f32_e32 v5, v4
1203; SI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6
1204; SI-NEXT:    v_fma_f32 v7, -v4, v5, 1.0
1205; SI-NEXT:    v_fma_f32 v5, v7, v5, v5
1206; SI-NEXT:    v_mul_f32_e32 v7, v3, v5
1207; SI-NEXT:    v_fma_f32 v8, -v4, v7, v3
1208; SI-NEXT:    v_fma_f32 v7, v8, v5, v7
1209; SI-NEXT:    v_fma_f32 v3, -v4, v7, v3
1210; SI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7
1211; SI-NEXT:    v_div_fmas_f32 v3, v3, v5, v7
1212; SI-NEXT:    v_div_fixup_f32 v3, v3, v6, v2
1213; SI-NEXT:    v_trunc_f32_e32 v3, v3
1214; SI-NEXT:    v_fma_f32 v2, -v3, v6, v2
1215; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
1216; SI-NEXT:    v_or_b32_e32 v0, v2, v0
1217; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1218; SI-NEXT:    s_endpgm
1219;
1220; CI-LABEL: frem_v4f16:
1221; CI:       ; %bb.0:
1222; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
1223; CI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xd
1224; CI-NEXT:    s_mov_b32 s3, 0xf000
1225; CI-NEXT:    s_mov_b32 s2, -1
1226; CI-NEXT:    s_mov_b32 s10, s2
1227; CI-NEXT:    s_waitcnt lgkmcnt(0)
1228; CI-NEXT:    s_mov_b32 s0, s4
1229; CI-NEXT:    s_mov_b32 s1, s5
1230; CI-NEXT:    s_mov_b32 s4, s6
1231; CI-NEXT:    s_mov_b32 s5, s7
1232; CI-NEXT:    s_mov_b32 s6, s2
1233; CI-NEXT:    s_mov_b32 s7, s3
1234; CI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
1235; CI-NEXT:    s_mov_b32 s11, s3
1236; CI-NEXT:    s_mov_b32 s6, 3
1237; CI-NEXT:    s_mov_b32 s7, 0
1238; CI-NEXT:    s_waitcnt vmcnt(0)
1239; CI-NEXT:    v_cvt_f32_f16_e32 v2, v0
1240; CI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
1241; CI-NEXT:    v_cvt_f32_f16_e32 v3, v0
1242; CI-NEXT:    v_lshrrev_b32_e32 v0, 16, v1
1243; CI-NEXT:    v_cvt_f32_f16_e32 v4, v1
1244; CI-NEXT:    v_cvt_f32_f16_e32 v5, v0
1245; CI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0 offset:32
1246; CI-NEXT:    s_waitcnt vmcnt(0)
1247; CI-NEXT:    v_cvt_f32_f16_e32 v7, v1
1248; CI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
1249; CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
1250; CI-NEXT:    v_cvt_f32_f16_e32 v6, v0
1251; CI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
1252; CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
1253; CI-NEXT:    v_div_scale_f32 v9, s[4:5], v1, v1, v5
1254; CI-NEXT:    v_div_scale_f32 v8, vcc, v5, v1, v5
1255; CI-NEXT:    v_rcp_f32_e32 v10, v9
1256; CI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6
1257; CI-NEXT:    v_fma_f32 v11, -v9, v10, 1.0
1258; CI-NEXT:    v_fma_f32 v10, v11, v10, v10
1259; CI-NEXT:    v_mul_f32_e32 v11, v8, v10
1260; CI-NEXT:    v_fma_f32 v12, -v9, v11, v8
1261; CI-NEXT:    v_fma_f32 v11, v12, v10, v11
1262; CI-NEXT:    v_fma_f32 v8, -v9, v11, v8
1263; CI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7
1264; CI-NEXT:    v_div_fmas_f32 v8, v8, v10, v11
1265; CI-NEXT:    v_div_fixup_f32 v8, v8, v1, v5
1266; CI-NEXT:    v_trunc_f32_e32 v8, v8
1267; CI-NEXT:    v_fma_f32 v1, -v8, v1, v5
1268; CI-NEXT:    v_div_scale_f32 v8, s[4:5], v7, v7, v4
1269; CI-NEXT:    v_div_scale_f32 v5, vcc, v4, v7, v4
1270; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
1271; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
1272; CI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
1273; CI-NEXT:    v_rcp_f32_e32 v9, v8
1274; CI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6
1275; CI-NEXT:    v_fma_f32 v10, -v8, v9, 1.0
1276; CI-NEXT:    v_fma_f32 v9, v10, v9, v9
1277; CI-NEXT:    v_mul_f32_e32 v10, v5, v9
1278; CI-NEXT:    v_fma_f32 v11, -v8, v10, v5
1279; CI-NEXT:    v_fma_f32 v10, v11, v9, v10
1280; CI-NEXT:    v_fma_f32 v5, -v8, v10, v5
1281; CI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7
1282; CI-NEXT:    v_div_fmas_f32 v5, v5, v9, v10
1283; CI-NEXT:    v_div_fixup_f32 v5, v5, v7, v4
1284; CI-NEXT:    v_trunc_f32_e32 v5, v5
1285; CI-NEXT:    v_fma_f32 v4, -v5, v7, v4
1286; CI-NEXT:    v_div_scale_f32 v5, s[4:5], v0, v0, v3
1287; CI-NEXT:    v_cvt_f16_f32_e32 v4, v4
1288; CI-NEXT:    v_or_b32_e32 v1, v4, v1
1289; CI-NEXT:    v_div_scale_f32 v4, vcc, v3, v0, v3
1290; CI-NEXT:    v_rcp_f32_e32 v7, v5
1291; CI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6
1292; CI-NEXT:    v_fma_f32 v8, -v5, v7, 1.0
1293; CI-NEXT:    v_fma_f32 v7, v8, v7, v7
1294; CI-NEXT:    v_mul_f32_e32 v8, v4, v7
1295; CI-NEXT:    v_fma_f32 v9, -v5, v8, v4
1296; CI-NEXT:    v_fma_f32 v8, v9, v7, v8
1297; CI-NEXT:    v_fma_f32 v4, -v5, v8, v4
1298; CI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7
1299; CI-NEXT:    v_div_fmas_f32 v4, v4, v7, v8
1300; CI-NEXT:    v_div_fixup_f32 v4, v4, v0, v3
1301; CI-NEXT:    v_trunc_f32_e32 v4, v4
1302; CI-NEXT:    v_fma_f32 v0, -v4, v0, v3
1303; CI-NEXT:    v_div_scale_f32 v4, s[4:5], v6, v6, v2
1304; CI-NEXT:    v_div_scale_f32 v3, vcc, v2, v6, v2
1305; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
1306; CI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
1307; CI-NEXT:    v_rcp_f32_e32 v5, v4
1308; CI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6
1309; CI-NEXT:    v_fma_f32 v7, -v4, v5, 1.0
1310; CI-NEXT:    v_fma_f32 v5, v7, v5, v5
1311; CI-NEXT:    v_mul_f32_e32 v7, v3, v5
1312; CI-NEXT:    v_fma_f32 v8, -v4, v7, v3
1313; CI-NEXT:    v_fma_f32 v7, v8, v5, v7
1314; CI-NEXT:    v_fma_f32 v3, -v4, v7, v3
1315; CI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7
1316; CI-NEXT:    v_div_fmas_f32 v3, v3, v5, v7
1317; CI-NEXT:    v_div_fixup_f32 v3, v3, v6, v2
1318; CI-NEXT:    v_trunc_f32_e32 v3, v3
1319; CI-NEXT:    v_fma_f32 v2, -v3, v6, v2
1320; CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
1321; CI-NEXT:    v_or_b32_e32 v0, v2, v0
1322; CI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1323; CI-NEXT:    s_endpgm
1324;
1325; VI-LABEL: frem_v4f16:
1326; VI:       ; %bb.0:
1327; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1328; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1329; VI-NEXT:    s_waitcnt lgkmcnt(0)
1330; VI-NEXT:    v_mov_b32_e32 v2, s6
1331; VI-NEXT:    s_add_u32 s0, s0, 32
1332; VI-NEXT:    s_addc_u32 s1, s1, 0
1333; VI-NEXT:    v_mov_b32_e32 v5, s1
1334; VI-NEXT:    v_mov_b32_e32 v4, s0
1335; VI-NEXT:    flat_load_dwordx2 v[4:5], v[4:5]
1336; VI-NEXT:    v_mov_b32_e32 v3, s7
1337; VI-NEXT:    flat_load_dwordx2 v[2:3], v[2:3]
1338; VI-NEXT:    v_mov_b32_e32 v0, s4
1339; VI-NEXT:    v_mov_b32_e32 v1, s5
1340; VI-NEXT:    s_waitcnt vmcnt(1)
1341; VI-NEXT:    v_lshrrev_b32_e32 v8, 16, v5
1342; VI-NEXT:    v_cvt_f32_f16_e32 v9, v8
1343; VI-NEXT:    s_waitcnt vmcnt(0)
1344; VI-NEXT:    v_lshrrev_b32_e32 v6, 16, v3
1345; VI-NEXT:    v_cvt_f32_f16_e32 v7, v6
1346; VI-NEXT:    v_rcp_f32_e32 v9, v9
1347; VI-NEXT:    v_mul_f32_e32 v7, v7, v9
1348; VI-NEXT:    v_cvt_f16_f32_e32 v7, v7
1349; VI-NEXT:    v_div_fixup_f16 v7, v7, v8, v6
1350; VI-NEXT:    v_trunc_f16_e32 v7, v7
1351; VI-NEXT:    v_fma_f16 v6, -v7, v8, v6
1352; VI-NEXT:    v_cvt_f32_f16_e32 v8, v5
1353; VI-NEXT:    v_cvt_f32_f16_e32 v7, v3
1354; VI-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
1355; VI-NEXT:    v_rcp_f32_e32 v8, v8
1356; VI-NEXT:    v_mul_f32_e32 v7, v7, v8
1357; VI-NEXT:    v_cvt_f16_f32_e32 v7, v7
1358; VI-NEXT:    v_div_fixup_f16 v7, v7, v5, v3
1359; VI-NEXT:    v_trunc_f16_e32 v7, v7
1360; VI-NEXT:    v_fma_f16 v3, -v7, v5, v3
1361; VI-NEXT:    v_lshrrev_b32_e32 v7, 16, v4
1362; VI-NEXT:    v_cvt_f32_f16_e32 v8, v7
1363; VI-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
1364; VI-NEXT:    v_or_b32_e32 v3, v3, v6
1365; VI-NEXT:    v_cvt_f32_f16_e32 v6, v5
1366; VI-NEXT:    v_rcp_f32_e32 v8, v8
1367; VI-NEXT:    v_mul_f32_e32 v6, v6, v8
1368; VI-NEXT:    v_cvt_f16_f32_e32 v6, v6
1369; VI-NEXT:    v_div_fixup_f16 v6, v6, v7, v5
1370; VI-NEXT:    v_trunc_f16_e32 v6, v6
1371; VI-NEXT:    v_fma_f16 v5, -v6, v7, v5
1372; VI-NEXT:    v_cvt_f32_f16_e32 v7, v4
1373; VI-NEXT:    v_cvt_f32_f16_e32 v6, v2
1374; VI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
1375; VI-NEXT:    v_rcp_f32_e32 v7, v7
1376; VI-NEXT:    v_mul_f32_e32 v6, v6, v7
1377; VI-NEXT:    v_cvt_f16_f32_e32 v6, v6
1378; VI-NEXT:    v_div_fixup_f16 v6, v6, v4, v2
1379; VI-NEXT:    v_trunc_f16_e32 v6, v6
1380; VI-NEXT:    v_fma_f16 v2, -v6, v4, v2
1381; VI-NEXT:    v_or_b32_e32 v2, v2, v5
1382; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
1383; VI-NEXT:    s_endpgm
1384                        <4 x half> addrspace(1)* %in2) #0 {
   ; Step %in2 forward by 4 elements of <4 x half> before loading the divisor.
1385   %gep2 = getelementptr <4 x half>, <4 x half> addrspace(1)* %in2, i32 4
1386   %r0 = load <4 x half>, <4 x half> addrspace(1)* %in1, align 16
1387   %r1 = load <4 x half>, <4 x half> addrspace(1)* %gep2, align 16
1388   %r2 = frem <4 x half> %r0, %r1
1389   store <4 x half> %r2, <4 x half> addrspace(1)* %out, align 16
1390   ret void
1391}
1392
1393define amdgpu_kernel void @frem_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in1,
1394; SI-LABEL: frem_v2f32:
1395; SI:       ; %bb.0:
1396; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
1397; SI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xd
1398; SI-NEXT:    s_mov_b32 s3, 0xf000
1399; SI-NEXT:    s_mov_b32 s2, -1
1400; SI-NEXT:    s_waitcnt lgkmcnt(0)
1401; SI-NEXT:    s_mov_b32 s0, s4
1402; SI-NEXT:    s_mov_b32 s1, s5
1403; SI-NEXT:    s_mov_b32 s4, s6
1404; SI-NEXT:    s_mov_b32 s5, s7
1405; SI-NEXT:    s_mov_b32 s6, s2
1406; SI-NEXT:    s_mov_b32 s7, s3
1407; SI-NEXT:    s_mov_b32 s10, s2
1408; SI-NEXT:    s_mov_b32 s11, s3
1409; SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
1410; SI-NEXT:    buffer_load_dwordx2 v[2:3], off, s[8:11], 0 offset:32
1411; SI-NEXT:    s_waitcnt vmcnt(0)
1412; SI-NEXT:    v_div_scale_f32 v4, vcc, v1, v3, v1
1413; SI-NEXT:    v_div_scale_f32 v5, s[4:5], v3, v3, v1
1414; SI-NEXT:    v_rcp_f32_e32 v6, v5
1415; SI-NEXT:    s_mov_b32 s6, 3
1416; SI-NEXT:    s_mov_b32 s7, 0
1417; SI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6
1418; SI-NEXT:    v_fma_f32 v7, -v5, v6, 1.0
1419; SI-NEXT:    v_fma_f32 v6, v7, v6, v6
1420; SI-NEXT:    v_mul_f32_e32 v7, v4, v6
1421; SI-NEXT:    v_fma_f32 v8, -v5, v7, v4
1422; SI-NEXT:    v_fma_f32 v7, v8, v6, v7
1423; SI-NEXT:    v_fma_f32 v4, -v5, v7, v4
1424; SI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7
1425; SI-NEXT:    v_div_fmas_f32 v4, v4, v6, v7
1426; SI-NEXT:    v_div_fixup_f32 v4, v4, v3, v1
1427; SI-NEXT:    v_trunc_f32_e32 v4, v4
1428; SI-NEXT:    v_fma_f32 v1, -v4, v3, v1
1429; SI-NEXT:    v_div_scale_f32 v3, vcc, v0, v2, v0
1430; SI-NEXT:    v_div_scale_f32 v4, s[4:5], v2, v2, v0
1431; SI-NEXT:    v_rcp_f32_e32 v5, v4
1432; SI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6
1433; SI-NEXT:    v_fma_f32 v6, -v4, v5, 1.0
1434; SI-NEXT:    v_fma_f32 v5, v6, v5, v5
1435; SI-NEXT:    v_mul_f32_e32 v6, v3, v5
1436; SI-NEXT:    v_fma_f32 v7, -v4, v6, v3
1437; SI-NEXT:    v_fma_f32 v6, v7, v5, v6
1438; SI-NEXT:    v_fma_f32 v3, -v4, v6, v3
1439; SI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7
1440; SI-NEXT:    v_div_fmas_f32 v3, v3, v5, v6
1441; SI-NEXT:    v_div_fixup_f32 v3, v3, v2, v0
1442; SI-NEXT:    v_trunc_f32_e32 v3, v3
1443; SI-NEXT:    v_fma_f32 v0, -v3, v2, v0
1444; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1445; SI-NEXT:    s_endpgm
1446;
1447; CI-LABEL: frem_v2f32:
1448; CI:       ; %bb.0:
1449; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
1450; CI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xd
1451; CI-NEXT:    s_mov_b32 s3, 0xf000
1452; CI-NEXT:    s_mov_b32 s2, -1
1453; CI-NEXT:    s_mov_b32 s10, s2
1454; CI-NEXT:    s_waitcnt lgkmcnt(0)
1455; CI-NEXT:    s_mov_b32 s0, s4
1456; CI-NEXT:    s_mov_b32 s1, s5
1457; CI-NEXT:    s_mov_b32 s4, s6
1458; CI-NEXT:    s_mov_b32 s5, s7
1459; CI-NEXT:    s_mov_b32 s6, s2
1460; CI-NEXT:    s_mov_b32 s7, s3
1461; CI-NEXT:    s_mov_b32 s11, s3
1462; CI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
1463; CI-NEXT:    buffer_load_dwordx2 v[2:3], off, s[8:11], 0 offset:32
1464; CI-NEXT:    s_mov_b32 s6, 3
1465; CI-NEXT:    s_mov_b32 s7, 0
1466; CI-NEXT:    s_waitcnt vmcnt(0)
1467; CI-NEXT:    v_div_scale_f32 v5, s[4:5], v3, v3, v1
1468; CI-NEXT:    v_div_scale_f32 v4, vcc, v1, v3, v1
1469; CI-NEXT:    v_rcp_f32_e32 v6, v5
1470; CI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6
1471; CI-NEXT:    v_fma_f32 v7, -v5, v6, 1.0
1472; CI-NEXT:    v_fma_f32 v6, v7, v6, v6
1473; CI-NEXT:    v_mul_f32_e32 v7, v4, v6
1474; CI-NEXT:    v_fma_f32 v8, -v5, v7, v4
1475; CI-NEXT:    v_fma_f32 v7, v8, v6, v7
1476; CI-NEXT:    v_fma_f32 v4, -v5, v7, v4
1477; CI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7
1478; CI-NEXT:    v_div_fmas_f32 v4, v4, v6, v7
1479; CI-NEXT:    v_div_fixup_f32 v4, v4, v3, v1
1480; CI-NEXT:    v_trunc_f32_e32 v4, v4
1481; CI-NEXT:    v_fma_f32 v1, -v4, v3, v1
1482; CI-NEXT:    v_div_scale_f32 v4, s[4:5], v2, v2, v0
1483; CI-NEXT:    v_div_scale_f32 v3, vcc, v0, v2, v0
1484; CI-NEXT:    v_rcp_f32_e32 v5, v4
1485; CI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6
1486; CI-NEXT:    v_fma_f32 v6, -v4, v5, 1.0
1487; CI-NEXT:    v_fma_f32 v5, v6, v5, v5
1488; CI-NEXT:    v_mul_f32_e32 v6, v3, v5
1489; CI-NEXT:    v_fma_f32 v7, -v4, v6, v3
1490; CI-NEXT:    v_fma_f32 v6, v7, v5, v6
1491; CI-NEXT:    v_fma_f32 v3, -v4, v6, v3
1492; CI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7
1493; CI-NEXT:    v_div_fmas_f32 v3, v3, v5, v6
1494; CI-NEXT:    v_div_fixup_f32 v3, v3, v2, v0
1495; CI-NEXT:    v_trunc_f32_e32 v3, v3
1496; CI-NEXT:    v_fma_f32 v0, -v3, v2, v0
1497; CI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1498; CI-NEXT:    s_endpgm
1499;
1500; VI-LABEL: frem_v2f32:
1501; VI:       ; %bb.0:
1502; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1503; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1504; VI-NEXT:    s_mov_b32 s2, 3
1505; VI-NEXT:    s_mov_b32 s3, 0
1506; VI-NEXT:    s_waitcnt lgkmcnt(0)
1507; VI-NEXT:    v_mov_b32_e32 v2, s6
1508; VI-NEXT:    s_add_u32 s0, s0, 32
1509; VI-NEXT:    s_addc_u32 s1, s1, 0
1510; VI-NEXT:    v_mov_b32_e32 v5, s1
1511; VI-NEXT:    v_mov_b32_e32 v3, s7
1512; VI-NEXT:    v_mov_b32_e32 v4, s0
1513; VI-NEXT:    flat_load_dwordx2 v[2:3], v[2:3]
1514; VI-NEXT:    flat_load_dwordx2 v[4:5], v[4:5]
1515; VI-NEXT:    v_mov_b32_e32 v0, s4
1516; VI-NEXT:    v_mov_b32_e32 v1, s5
1517; VI-NEXT:    s_waitcnt vmcnt(0)
1518; VI-NEXT:    v_div_scale_f32 v7, s[0:1], v5, v5, v3
1519; VI-NEXT:    v_div_scale_f32 v6, vcc, v3, v5, v3
1520; VI-NEXT:    v_rcp_f32_e32 v8, v7
1521; VI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s2
1522; VI-NEXT:    v_fma_f32 v9, -v7, v8, 1.0
1523; VI-NEXT:    v_fma_f32 v8, v9, v8, v8
1524; VI-NEXT:    v_mul_f32_e32 v9, v6, v8
1525; VI-NEXT:    v_fma_f32 v10, -v7, v9, v6
1526; VI-NEXT:    v_fma_f32 v9, v10, v8, v9
1527; VI-NEXT:    v_fma_f32 v6, -v7, v9, v6
1528; VI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s3
1529; VI-NEXT:    v_div_fmas_f32 v6, v6, v8, v9
1530; VI-NEXT:    v_div_fixup_f32 v6, v6, v5, v3
1531; VI-NEXT:    v_trunc_f32_e32 v6, v6
1532; VI-NEXT:    v_fma_f32 v3, -v6, v5, v3
1533; VI-NEXT:    v_div_scale_f32 v6, s[0:1], v4, v4, v2
1534; VI-NEXT:    v_div_scale_f32 v5, vcc, v2, v4, v2
1535; VI-NEXT:    v_rcp_f32_e32 v7, v6
1536; VI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s2
1537; VI-NEXT:    v_fma_f32 v8, -v6, v7, 1.0
1538; VI-NEXT:    v_fma_f32 v7, v8, v7, v7
1539; VI-NEXT:    v_mul_f32_e32 v8, v5, v7
1540; VI-NEXT:    v_fma_f32 v9, -v6, v8, v5
1541; VI-NEXT:    v_fma_f32 v8, v9, v7, v8
1542; VI-NEXT:    v_fma_f32 v5, -v6, v8, v5
1543; VI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s3
1544; VI-NEXT:    v_div_fmas_f32 v5, v5, v7, v8
1545; VI-NEXT:    v_div_fixup_f32 v5, v5, v4, v2
1546; VI-NEXT:    v_trunc_f32_e32 v5, v5
1547; VI-NEXT:    v_fma_f32 v2, -v5, v4, v2
1548; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
1549; VI-NEXT:    s_endpgm
1550                        <2 x float> addrspace(1)* %in2) #0 {
1551   %gep2 = getelementptr <2 x float>, <2 x float> addrspace(1)* %in2, i32 4
1552   %r0 = load <2 x float>, <2 x float> addrspace(1)* %in1, align 8
1553   %r1 = load <2 x float>, <2 x float> addrspace(1)* %gep2, align 8
1554   %r2 = frem <2 x float> %r0, %r1
1555   store <2 x float> %r2, <2 x float> addrspace(1)* %out, align 8
1556   ret void
1557}
1558
; frem on <4 x float>: loads the dividend vector from %in1 and the divisor
; vector from element 4 of %in2 (a 64-byte offset in the expected output),
; computes the remainder and stores it to %out.
; The expected code for all three targets handles one lane at a time, highest
; lane first: a v_div_scale / v_rcp / v_fma / v_div_fmas / v_div_fixup
; division, v_trunc of the quotient, and a final v_fma that forms
; x - trunc(x / y) * y.
; Each FMA refinement chain is bracketed by s_setreg_b32 writes of 3 and then 0
; to hwreg(HW_REG_MODE, 4, 2).
; NOTE(review): presumably this turns f32 denormal support on for the
; refinement steps and restores the flushing mode afterwards - confirm.
1559define amdgpu_kernel void @frem_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in1,
1560; SI-LABEL: frem_v4f32:
1561; SI:       ; %bb.0:
1562; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
1563; SI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xd
1564; SI-NEXT:    s_mov_b32 s3, 0xf000
1565; SI-NEXT:    s_mov_b32 s2, -1
1566; SI-NEXT:    s_waitcnt lgkmcnt(0)
1567; SI-NEXT:    s_mov_b32 s0, s4
1568; SI-NEXT:    s_mov_b32 s1, s5
1569; SI-NEXT:    s_mov_b32 s4, s6
1570; SI-NEXT:    s_mov_b32 s5, s7
1571; SI-NEXT:    s_mov_b32 s6, s2
1572; SI-NEXT:    s_mov_b32 s7, s3
1573; SI-NEXT:    s_mov_b32 s10, s2
1574; SI-NEXT:    s_mov_b32 s11, s3
1575; SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
1576; SI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:64
1577; SI-NEXT:    s_waitcnt vmcnt(0)
1578; SI-NEXT:    v_div_scale_f32 v8, vcc, v3, v7, v3
1579; SI-NEXT:    v_div_scale_f32 v9, s[4:5], v7, v7, v3
1580; SI-NEXT:    v_rcp_f32_e32 v10, v9
1581; SI-NEXT:    s_mov_b32 s6, 3
1582; SI-NEXT:    s_mov_b32 s7, 0
1583; SI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6
1584; SI-NEXT:    v_fma_f32 v11, -v9, v10, 1.0
1585; SI-NEXT:    v_fma_f32 v10, v11, v10, v10
1586; SI-NEXT:    v_mul_f32_e32 v11, v8, v10
1587; SI-NEXT:    v_fma_f32 v12, -v9, v11, v8
1588; SI-NEXT:    v_fma_f32 v11, v12, v10, v11
1589; SI-NEXT:    v_fma_f32 v8, -v9, v11, v8
1590; SI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7
1591; SI-NEXT:    v_div_fmas_f32 v8, v8, v10, v11
1592; SI-NEXT:    v_div_fixup_f32 v8, v8, v7, v3
1593; SI-NEXT:    v_trunc_f32_e32 v8, v8
1594; SI-NEXT:    v_fma_f32 v3, -v8, v7, v3
1595; SI-NEXT:    v_div_scale_f32 v7, vcc, v2, v6, v2
1596; SI-NEXT:    v_div_scale_f32 v8, s[4:5], v6, v6, v2
1597; SI-NEXT:    v_rcp_f32_e32 v9, v8
1598; SI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6
1599; SI-NEXT:    v_fma_f32 v10, -v8, v9, 1.0
1600; SI-NEXT:    v_fma_f32 v9, v10, v9, v9
1601; SI-NEXT:    v_mul_f32_e32 v10, v7, v9
1602; SI-NEXT:    v_fma_f32 v11, -v8, v10, v7
1603; SI-NEXT:    v_fma_f32 v10, v11, v9, v10
1604; SI-NEXT:    v_fma_f32 v7, -v8, v10, v7
1605; SI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7
1606; SI-NEXT:    v_div_fmas_f32 v7, v7, v9, v10
1607; SI-NEXT:    v_div_fixup_f32 v7, v7, v6, v2
1608; SI-NEXT:    v_trunc_f32_e32 v7, v7
1609; SI-NEXT:    v_fma_f32 v2, -v7, v6, v2
1610; SI-NEXT:    v_div_scale_f32 v6, vcc, v1, v5, v1
1611; SI-NEXT:    v_div_scale_f32 v7, s[4:5], v5, v5, v1
1612; SI-NEXT:    v_rcp_f32_e32 v8, v7
1613; SI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6
1614; SI-NEXT:    v_fma_f32 v9, -v7, v8, 1.0
1615; SI-NEXT:    v_fma_f32 v8, v9, v8, v8
1616; SI-NEXT:    v_mul_f32_e32 v9, v6, v8
1617; SI-NEXT:    v_fma_f32 v10, -v7, v9, v6
1618; SI-NEXT:    v_fma_f32 v9, v10, v8, v9
1619; SI-NEXT:    v_fma_f32 v6, -v7, v9, v6
1620; SI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7
1621; SI-NEXT:    v_div_fmas_f32 v6, v6, v8, v9
1622; SI-NEXT:    v_div_fixup_f32 v6, v6, v5, v1
1623; SI-NEXT:    v_trunc_f32_e32 v6, v6
1624; SI-NEXT:    v_fma_f32 v1, -v6, v5, v1
1625; SI-NEXT:    v_div_scale_f32 v5, vcc, v0, v4, v0
1626; SI-NEXT:    v_div_scale_f32 v6, s[4:5], v4, v4, v0
1627; SI-NEXT:    v_rcp_f32_e32 v7, v6
1628; SI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6
1629; SI-NEXT:    v_fma_f32 v8, -v6, v7, 1.0
1630; SI-NEXT:    v_fma_f32 v7, v8, v7, v7
1631; SI-NEXT:    v_mul_f32_e32 v8, v5, v7
1632; SI-NEXT:    v_fma_f32 v9, -v6, v8, v5
1633; SI-NEXT:    v_fma_f32 v8, v9, v7, v8
1634; SI-NEXT:    v_fma_f32 v5, -v6, v8, v5
1635; SI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7
1636; SI-NEXT:    v_div_fmas_f32 v5, v5, v7, v8
1637; SI-NEXT:    v_div_fixup_f32 v5, v5, v4, v0
1638; SI-NEXT:    v_trunc_f32_e32 v5, v5
1639; SI-NEXT:    v_fma_f32 v0, -v5, v4, v0
1640; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
1641; SI-NEXT:    s_endpgm
1642;
1643; CI-LABEL: frem_v4f32:
1644; CI:       ; %bb.0:
1645; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
1646; CI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xd
1647; CI-NEXT:    s_mov_b32 s3, 0xf000
1648; CI-NEXT:    s_mov_b32 s2, -1
1649; CI-NEXT:    s_mov_b32 s10, s2
1650; CI-NEXT:    s_waitcnt lgkmcnt(0)
1651; CI-NEXT:    s_mov_b32 s0, s4
1652; CI-NEXT:    s_mov_b32 s1, s5
1653; CI-NEXT:    s_mov_b32 s4, s6
1654; CI-NEXT:    s_mov_b32 s5, s7
1655; CI-NEXT:    s_mov_b32 s6, s2
1656; CI-NEXT:    s_mov_b32 s7, s3
1657; CI-NEXT:    s_mov_b32 s11, s3
1658; CI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
1659; CI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:64
1660; CI-NEXT:    s_mov_b32 s6, 3
1661; CI-NEXT:    s_mov_b32 s7, 0
1662; CI-NEXT:    s_waitcnt vmcnt(0)
1663; CI-NEXT:    v_div_scale_f32 v9, s[4:5], v7, v7, v3
1664; CI-NEXT:    v_div_scale_f32 v8, vcc, v3, v7, v3
1665; CI-NEXT:    v_rcp_f32_e32 v10, v9
1666; CI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6
1667; CI-NEXT:    v_fma_f32 v11, -v9, v10, 1.0
1668; CI-NEXT:    v_fma_f32 v10, v11, v10, v10
1669; CI-NEXT:    v_mul_f32_e32 v11, v8, v10
1670; CI-NEXT:    v_fma_f32 v12, -v9, v11, v8
1671; CI-NEXT:    v_fma_f32 v11, v12, v10, v11
1672; CI-NEXT:    v_fma_f32 v8, -v9, v11, v8
1673; CI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7
1674; CI-NEXT:    v_div_fmas_f32 v8, v8, v10, v11
1675; CI-NEXT:    v_div_fixup_f32 v8, v8, v7, v3
1676; CI-NEXT:    v_trunc_f32_e32 v8, v8
1677; CI-NEXT:    v_fma_f32 v3, -v8, v7, v3
1678; CI-NEXT:    v_div_scale_f32 v8, s[4:5], v6, v6, v2
1679; CI-NEXT:    v_div_scale_f32 v7, vcc, v2, v6, v2
1680; CI-NEXT:    v_rcp_f32_e32 v9, v8
1681; CI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6
1682; CI-NEXT:    v_fma_f32 v10, -v8, v9, 1.0
1683; CI-NEXT:    v_fma_f32 v9, v10, v9, v9
1684; CI-NEXT:    v_mul_f32_e32 v10, v7, v9
1685; CI-NEXT:    v_fma_f32 v11, -v8, v10, v7
1686; CI-NEXT:    v_fma_f32 v10, v11, v9, v10
1687; CI-NEXT:    v_fma_f32 v7, -v8, v10, v7
1688; CI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7
1689; CI-NEXT:    v_div_fmas_f32 v7, v7, v9, v10
1690; CI-NEXT:    v_div_fixup_f32 v7, v7, v6, v2
1691; CI-NEXT:    v_trunc_f32_e32 v7, v7
1692; CI-NEXT:    v_fma_f32 v2, -v7, v6, v2
1693; CI-NEXT:    v_div_scale_f32 v7, s[4:5], v5, v5, v1
1694; CI-NEXT:    v_div_scale_f32 v6, vcc, v1, v5, v1
1695; CI-NEXT:    v_rcp_f32_e32 v8, v7
1696; CI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6
1697; CI-NEXT:    v_fma_f32 v9, -v7, v8, 1.0
1698; CI-NEXT:    v_fma_f32 v8, v9, v8, v8
1699; CI-NEXT:    v_mul_f32_e32 v9, v6, v8
1700; CI-NEXT:    v_fma_f32 v10, -v7, v9, v6
1701; CI-NEXT:    v_fma_f32 v9, v10, v8, v9
1702; CI-NEXT:    v_fma_f32 v6, -v7, v9, v6
1703; CI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7
1704; CI-NEXT:    v_div_fmas_f32 v6, v6, v8, v9
1705; CI-NEXT:    v_div_fixup_f32 v6, v6, v5, v1
1706; CI-NEXT:    v_trunc_f32_e32 v6, v6
1707; CI-NEXT:    v_fma_f32 v1, -v6, v5, v1
1708; CI-NEXT:    v_div_scale_f32 v6, s[4:5], v4, v4, v0
1709; CI-NEXT:    v_div_scale_f32 v5, vcc, v0, v4, v0
1710; CI-NEXT:    v_rcp_f32_e32 v7, v6
1711; CI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6
1712; CI-NEXT:    v_fma_f32 v8, -v6, v7, 1.0
1713; CI-NEXT:    v_fma_f32 v7, v8, v7, v7
1714; CI-NEXT:    v_mul_f32_e32 v8, v5, v7
1715; CI-NEXT:    v_fma_f32 v9, -v6, v8, v5
1716; CI-NEXT:    v_fma_f32 v8, v9, v7, v8
1717; CI-NEXT:    v_fma_f32 v5, -v6, v8, v5
1718; CI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7
1719; CI-NEXT:    v_div_fmas_f32 v5, v5, v7, v8
1720; CI-NEXT:    v_div_fixup_f32 v5, v5, v4, v0
1721; CI-NEXT:    v_trunc_f32_e32 v5, v5
1722; CI-NEXT:    v_fma_f32 v0, -v5, v4, v0
1723; CI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
1724; CI-NEXT:    s_endpgm
1725;
1726; VI-LABEL: frem_v4f32:
1727; VI:       ; %bb.0:
1728; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1729; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1730; VI-NEXT:    s_mov_b32 s2, 3
1731; VI-NEXT:    s_mov_b32 s3, 0
1732; VI-NEXT:    s_waitcnt lgkmcnt(0)
1733; VI-NEXT:    v_mov_b32_e32 v0, s6
1734; VI-NEXT:    s_add_u32 s0, s0, 64
1735; VI-NEXT:    s_addc_u32 s1, s1, 0
1736; VI-NEXT:    v_mov_b32_e32 v5, s1
1737; VI-NEXT:    v_mov_b32_e32 v1, s7
1738; VI-NEXT:    v_mov_b32_e32 v4, s0
1739; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
1740; VI-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
1741; VI-NEXT:    v_mov_b32_e32 v8, s4
1742; VI-NEXT:    v_mov_b32_e32 v9, s5
1743; VI-NEXT:    s_waitcnt vmcnt(0)
1744; VI-NEXT:    v_div_scale_f32 v11, s[0:1], v7, v7, v3
1745; VI-NEXT:    v_div_scale_f32 v10, vcc, v3, v7, v3
1746; VI-NEXT:    v_rcp_f32_e32 v12, v11
1747; VI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s2
1748; VI-NEXT:    v_fma_f32 v13, -v11, v12, 1.0
1749; VI-NEXT:    v_fma_f32 v12, v13, v12, v12
1750; VI-NEXT:    v_mul_f32_e32 v13, v10, v12
1751; VI-NEXT:    v_fma_f32 v14, -v11, v13, v10
1752; VI-NEXT:    v_fma_f32 v13, v14, v12, v13
1753; VI-NEXT:    v_fma_f32 v10, -v11, v13, v10
1754; VI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s3
1755; VI-NEXT:    v_div_fmas_f32 v10, v10, v12, v13
1756; VI-NEXT:    v_div_fixup_f32 v10, v10, v7, v3
1757; VI-NEXT:    v_trunc_f32_e32 v10, v10
1758; VI-NEXT:    v_fma_f32 v3, -v10, v7, v3
1759; VI-NEXT:    v_div_scale_f32 v10, s[0:1], v6, v6, v2
1760; VI-NEXT:    v_div_scale_f32 v7, vcc, v2, v6, v2
1761; VI-NEXT:    v_rcp_f32_e32 v11, v10
1762; VI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s2
1763; VI-NEXT:    v_fma_f32 v12, -v10, v11, 1.0
1764; VI-NEXT:    v_fma_f32 v11, v12, v11, v11
1765; VI-NEXT:    v_mul_f32_e32 v12, v7, v11
1766; VI-NEXT:    v_fma_f32 v13, -v10, v12, v7
1767; VI-NEXT:    v_fma_f32 v12, v13, v11, v12
1768; VI-NEXT:    v_fma_f32 v7, -v10, v12, v7
1769; VI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s3
1770; VI-NEXT:    v_div_fmas_f32 v7, v7, v11, v12
1771; VI-NEXT:    v_div_fixup_f32 v7, v7, v6, v2
1772; VI-NEXT:    v_trunc_f32_e32 v7, v7
1773; VI-NEXT:    v_fma_f32 v2, -v7, v6, v2
1774; VI-NEXT:    v_div_scale_f32 v7, s[0:1], v5, v5, v1
1775; VI-NEXT:    v_div_scale_f32 v6, vcc, v1, v5, v1
1776; VI-NEXT:    v_rcp_f32_e32 v10, v7
1777; VI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s2
1778; VI-NEXT:    v_fma_f32 v11, -v7, v10, 1.0
1779; VI-NEXT:    v_fma_f32 v10, v11, v10, v10
1780; VI-NEXT:    v_mul_f32_e32 v11, v6, v10
1781; VI-NEXT:    v_fma_f32 v12, -v7, v11, v6
1782; VI-NEXT:    v_fma_f32 v11, v12, v10, v11
1783; VI-NEXT:    v_fma_f32 v6, -v7, v11, v6
1784; VI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s3
1785; VI-NEXT:    v_div_fmas_f32 v6, v6, v10, v11
1786; VI-NEXT:    v_div_fixup_f32 v6, v6, v5, v1
1787; VI-NEXT:    v_trunc_f32_e32 v6, v6
1788; VI-NEXT:    v_fma_f32 v1, -v6, v5, v1
1789; VI-NEXT:    v_div_scale_f32 v6, s[0:1], v4, v4, v0
1790; VI-NEXT:    v_div_scale_f32 v5, vcc, v0, v4, v0
1791; VI-NEXT:    v_rcp_f32_e32 v7, v6
1792; VI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s2
1793; VI-NEXT:    v_fma_f32 v10, -v6, v7, 1.0
1794; VI-NEXT:    v_fma_f32 v7, v10, v7, v7
1795; VI-NEXT:    v_mul_f32_e32 v10, v5, v7
1796; VI-NEXT:    v_fma_f32 v11, -v6, v10, v5
1797; VI-NEXT:    v_fma_f32 v10, v11, v7, v10
1798; VI-NEXT:    v_fma_f32 v5, -v6, v10, v5
1799; VI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s3
1800; VI-NEXT:    v_div_fmas_f32 v5, v5, v7, v10
1801; VI-NEXT:    v_div_fixup_f32 v5, v5, v4, v0
1802; VI-NEXT:    v_trunc_f32_e32 v5, v5
1803; VI-NEXT:    v_fma_f32 v0, -v5, v4, v0
1804; VI-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
1805; VI-NEXT:    s_endpgm
1806                        <4 x float> addrspace(1)* %in2) #0 {
1807   %gep2 = getelementptr <4 x float>, <4 x float> addrspace(1)* %in2, i32 4
1808   %r0 = load <4 x float>, <4 x float> addrspace(1)* %in1, align 16
1809   %r1 = load <4 x float>, <4 x float> addrspace(1)* %gep2, align 16
1810   %r2 = frem <4 x float> %r0, %r1
1811   store <4 x float> %r2, <4 x float> addrspace(1)* %out, align 16
1812   ret void
1813}
1814
; frem on <2 x double>: loads the dividend vector from %in1 and the divisor
; vector from element 4 of %in2 (a 64-byte offset in the expected output),
; computes the remainder and stores it to %out.
; Each lane is expected to expand to an f64 division (v_div_scale_f64,
; v_rcp_f64 followed by two refinement FMA pairs, v_div_fmas_f64,
; v_div_fixup_f64), a truncation of the quotient, and a final v_fma_f64 that
; forms x - trunc(x / y) * y. The high lane (v[2:3] / v[6:7]) comes first.
; The three targets differ in two visible ways:
;  * The CI and VI checks truncate with v_trunc_f64_e32, while the SI checks
;    expect an integer sequence in its place (v_bfe_u32 of the exponent bits,
;    a v_lshr_b64 mask, v_not/v_and and v_cndmask selects).
;  * The SI checks rebuild vcc for v_div_fmas_f64 from two v_cmp_eq_u32 and an
;    s_xor_b64; the CI and VI checks take vcc directly from the second
;    v_div_scale_f64.
; No s_setreg writes to the MODE register are expected for this f64 test.
1815define amdgpu_kernel void @frem_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %in1,
1816; SI-LABEL: frem_v2f64:
1817; SI:       ; %bb.0:
1818; SI-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x9
1819; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
1820; SI-NEXT:    s_mov_b32 s7, 0xf000
1821; SI-NEXT:    s_mov_b32 s6, -1
1822; SI-NEXT:    s_waitcnt lgkmcnt(0)
1823; SI-NEXT:    s_mov_b32 s4, s8
1824; SI-NEXT:    s_mov_b32 s5, s9
1825; SI-NEXT:    s_mov_b32 s8, s10
1826; SI-NEXT:    s_mov_b32 s9, s11
1827; SI-NEXT:    s_mov_b32 s10, s6
1828; SI-NEXT:    s_mov_b32 s11, s7
1829; SI-NEXT:    s_mov_b32 s2, s6
1830; SI-NEXT:    s_mov_b32 s3, s7
1831; SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
1832; SI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:64
1833; SI-NEXT:    s_waitcnt vmcnt(0)
1834; SI-NEXT:    v_div_scale_f64 v[8:9], s[0:1], v[6:7], v[6:7], v[2:3]
1835; SI-NEXT:    v_rcp_f64_e32 v[10:11], v[8:9]
1836; SI-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0
1837; SI-NEXT:    v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
1838; SI-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0
1839; SI-NEXT:    v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
1840; SI-NEXT:    v_div_scale_f64 v[12:13], s[0:1], v[2:3], v[6:7], v[2:3]
1841; SI-NEXT:    v_mul_f64 v[14:15], v[12:13], v[10:11]
1842; SI-NEXT:    v_fma_f64 v[16:17], -v[8:9], v[14:15], v[12:13]
1843; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v9
1844; SI-NEXT:    v_cmp_eq_u32_e64 s[0:1], v3, v13
1845; SI-NEXT:    s_xor_b64 vcc, s[0:1], vcc
1846; SI-NEXT:    s_nop 1
1847; SI-NEXT:    v_div_fmas_f64 v[8:9], v[16:17], v[10:11], v[14:15]
1848; SI-NEXT:    v_div_fixup_f64 v[8:9], v[8:9], v[6:7], v[2:3]
1849; SI-NEXT:    v_bfe_u32 v10, v9, 20, 11
1850; SI-NEXT:    s_movk_i32 s8, 0xfc01
1851; SI-NEXT:    v_add_i32_e32 v12, vcc, s8, v10
1852; SI-NEXT:    s_mov_b32 s3, 0xfffff
1853; SI-NEXT:    v_lshr_b64 v[10:11], s[2:3], v12
1854; SI-NEXT:    v_not_b32_e32 v10, v10
1855; SI-NEXT:    v_and_b32_e32 v10, v8, v10
1856; SI-NEXT:    v_not_b32_e32 v11, v11
1857; SI-NEXT:    v_and_b32_e32 v11, v9, v11
1858; SI-NEXT:    s_brev_b32 s9, 1
1859; SI-NEXT:    v_and_b32_e32 v13, s9, v9
1860; SI-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v12
1861; SI-NEXT:    v_cndmask_b32_e32 v11, v11, v13, vcc
1862; SI-NEXT:    v_cmp_lt_i32_e64 s[0:1], 51, v12
1863; SI-NEXT:    v_cndmask_b32_e64 v9, v11, v9, s[0:1]
1864; SI-NEXT:    v_cndmask_b32_e64 v10, v10, 0, vcc
1865; SI-NEXT:    v_cndmask_b32_e64 v8, v10, v8, s[0:1]
1866; SI-NEXT:    v_fma_f64 v[2:3], -v[8:9], v[6:7], v[2:3]
1867; SI-NEXT:    v_div_scale_f64 v[6:7], s[0:1], v[4:5], v[4:5], v[0:1]
1868; SI-NEXT:    v_rcp_f64_e32 v[8:9], v[6:7]
1869; SI-NEXT:    v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
1870; SI-NEXT:    v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
1871; SI-NEXT:    v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
1872; SI-NEXT:    v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
1873; SI-NEXT:    v_div_scale_f64 v[10:11], s[0:1], v[0:1], v[4:5], v[0:1]
1874; SI-NEXT:    v_mul_f64 v[12:13], v[10:11], v[8:9]
1875; SI-NEXT:    v_fma_f64 v[14:15], -v[6:7], v[12:13], v[10:11]
1876; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v7
1877; SI-NEXT:    v_cmp_eq_u32_e64 s[0:1], v1, v11
1878; SI-NEXT:    s_xor_b64 vcc, s[0:1], vcc
1879; SI-NEXT:    s_nop 1
1880; SI-NEXT:    v_div_fmas_f64 v[6:7], v[14:15], v[8:9], v[12:13]
1881; SI-NEXT:    v_div_fixup_f64 v[6:7], v[6:7], v[4:5], v[0:1]
1882; SI-NEXT:    v_bfe_u32 v8, v7, 20, 11
1883; SI-NEXT:    v_add_i32_e32 v10, vcc, s8, v8
1884; SI-NEXT:    v_lshr_b64 v[8:9], s[2:3], v10
1885; SI-NEXT:    v_not_b32_e32 v8, v8
1886; SI-NEXT:    v_and_b32_e32 v8, v6, v8
1887; SI-NEXT:    v_not_b32_e32 v9, v9
1888; SI-NEXT:    v_and_b32_e32 v9, v7, v9
1889; SI-NEXT:    v_and_b32_e32 v11, s9, v7
1890; SI-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v10
1891; SI-NEXT:    v_cndmask_b32_e32 v9, v9, v11, vcc
1892; SI-NEXT:    v_cmp_lt_i32_e64 s[0:1], 51, v10
1893; SI-NEXT:    v_cndmask_b32_e64 v7, v9, v7, s[0:1]
1894; SI-NEXT:    v_cndmask_b32_e64 v8, v8, 0, vcc
1895; SI-NEXT:    v_cndmask_b32_e64 v6, v8, v6, s[0:1]
1896; SI-NEXT:    v_fma_f64 v[0:1], -v[6:7], v[4:5], v[0:1]
1897; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
1898; SI-NEXT:    s_endpgm
1899;
1900; CI-LABEL: frem_v2f64:
1901; CI:       ; %bb.0:
1902; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
1903; CI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xd
1904; CI-NEXT:    s_mov_b32 s3, 0xf000
1905; CI-NEXT:    s_mov_b32 s2, -1
1906; CI-NEXT:    s_mov_b32 s10, s2
1907; CI-NEXT:    s_waitcnt lgkmcnt(0)
1908; CI-NEXT:    s_mov_b32 s0, s4
1909; CI-NEXT:    s_mov_b32 s1, s5
1910; CI-NEXT:    s_mov_b32 s4, s6
1911; CI-NEXT:    s_mov_b32 s5, s7
1912; CI-NEXT:    s_mov_b32 s6, s2
1913; CI-NEXT:    s_mov_b32 s7, s3
1914; CI-NEXT:    s_mov_b32 s11, s3
1915; CI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
1916; CI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:64
1917; CI-NEXT:    s_waitcnt vmcnt(0)
1918; CI-NEXT:    v_div_scale_f64 v[8:9], s[4:5], v[6:7], v[6:7], v[2:3]
1919; CI-NEXT:    v_rcp_f64_e32 v[10:11], v[8:9]
1920; CI-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0
1921; CI-NEXT:    v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
1922; CI-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0
1923; CI-NEXT:    v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
1924; CI-NEXT:    v_div_scale_f64 v[12:13], vcc, v[2:3], v[6:7], v[2:3]
1925; CI-NEXT:    v_mul_f64 v[14:15], v[12:13], v[10:11]
1926; CI-NEXT:    v_fma_f64 v[8:9], -v[8:9], v[14:15], v[12:13]
1927; CI-NEXT:    s_nop 1
1928; CI-NEXT:    v_div_fmas_f64 v[8:9], v[8:9], v[10:11], v[14:15]
1929; CI-NEXT:    v_div_fixup_f64 v[8:9], v[8:9], v[6:7], v[2:3]
1930; CI-NEXT:    v_trunc_f64_e32 v[8:9], v[8:9]
1931; CI-NEXT:    v_fma_f64 v[2:3], -v[8:9], v[6:7], v[2:3]
1932; CI-NEXT:    v_div_scale_f64 v[6:7], s[4:5], v[4:5], v[4:5], v[0:1]
1933; CI-NEXT:    v_rcp_f64_e32 v[8:9], v[6:7]
1934; CI-NEXT:    v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
1935; CI-NEXT:    v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
1936; CI-NEXT:    v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
1937; CI-NEXT:    v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
1938; CI-NEXT:    v_div_scale_f64 v[10:11], vcc, v[0:1], v[4:5], v[0:1]
1939; CI-NEXT:    v_mul_f64 v[12:13], v[10:11], v[8:9]
1940; CI-NEXT:    v_fma_f64 v[6:7], -v[6:7], v[12:13], v[10:11]
1941; CI-NEXT:    s_nop 1
1942; CI-NEXT:    v_div_fmas_f64 v[6:7], v[6:7], v[8:9], v[12:13]
1943; CI-NEXT:    v_div_fixup_f64 v[6:7], v[6:7], v[4:5], v[0:1]
1944; CI-NEXT:    v_trunc_f64_e32 v[6:7], v[6:7]
1945; CI-NEXT:    v_fma_f64 v[0:1], -v[6:7], v[4:5], v[0:1]
1946; CI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
1947; CI-NEXT:    s_endpgm
1948;
1949; VI-LABEL: frem_v2f64:
1950; VI:       ; %bb.0:
1951; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1952; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1953; VI-NEXT:    s_waitcnt lgkmcnt(0)
1954; VI-NEXT:    v_mov_b32_e32 v0, s6
1955; VI-NEXT:    s_add_u32 s0, s0, 64
1956; VI-NEXT:    s_addc_u32 s1, s1, 0
1957; VI-NEXT:    v_mov_b32_e32 v5, s1
1958; VI-NEXT:    v_mov_b32_e32 v1, s7
1959; VI-NEXT:    v_mov_b32_e32 v4, s0
1960; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
1961; VI-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
1962; VI-NEXT:    v_mov_b32_e32 v8, s4
1963; VI-NEXT:    v_mov_b32_e32 v9, s5
1964; VI-NEXT:    s_waitcnt vmcnt(0)
1965; VI-NEXT:    v_div_scale_f64 v[10:11], s[0:1], v[6:7], v[6:7], v[2:3]
1966; VI-NEXT:    v_rcp_f64_e32 v[12:13], v[10:11]
1967; VI-NEXT:    v_fma_f64 v[14:15], -v[10:11], v[12:13], 1.0
1968; VI-NEXT:    v_fma_f64 v[12:13], v[12:13], v[14:15], v[12:13]
1969; VI-NEXT:    v_fma_f64 v[14:15], -v[10:11], v[12:13], 1.0
1970; VI-NEXT:    v_fma_f64 v[12:13], v[12:13], v[14:15], v[12:13]
1971; VI-NEXT:    v_div_scale_f64 v[14:15], vcc, v[2:3], v[6:7], v[2:3]
1972; VI-NEXT:    v_mul_f64 v[16:17], v[14:15], v[12:13]
1973; VI-NEXT:    v_fma_f64 v[10:11], -v[10:11], v[16:17], v[14:15]
1974; VI-NEXT:    s_nop 1
1975; VI-NEXT:    v_div_fmas_f64 v[10:11], v[10:11], v[12:13], v[16:17]
1976; VI-NEXT:    v_div_fixup_f64 v[10:11], v[10:11], v[6:7], v[2:3]
1977; VI-NEXT:    v_trunc_f64_e32 v[10:11], v[10:11]
1978; VI-NEXT:    v_fma_f64 v[2:3], -v[10:11], v[6:7], v[2:3]
1979; VI-NEXT:    v_div_scale_f64 v[6:7], s[0:1], v[4:5], v[4:5], v[0:1]
1980; VI-NEXT:    v_rcp_f64_e32 v[10:11], v[6:7]
1981; VI-NEXT:    v_fma_f64 v[12:13], -v[6:7], v[10:11], 1.0
1982; VI-NEXT:    v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
1983; VI-NEXT:    v_fma_f64 v[12:13], -v[6:7], v[10:11], 1.0
1984; VI-NEXT:    v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
1985; VI-NEXT:    v_div_scale_f64 v[12:13], vcc, v[0:1], v[4:5], v[0:1]
1986; VI-NEXT:    v_mul_f64 v[14:15], v[12:13], v[10:11]
1987; VI-NEXT:    v_fma_f64 v[6:7], -v[6:7], v[14:15], v[12:13]
1988; VI-NEXT:    s_nop 1
1989; VI-NEXT:    v_div_fmas_f64 v[6:7], v[6:7], v[10:11], v[14:15]
1990; VI-NEXT:    v_div_fixup_f64 v[6:7], v[6:7], v[4:5], v[0:1]
1991; VI-NEXT:    v_trunc_f64_e32 v[6:7], v[6:7]
1992; VI-NEXT:    v_fma_f64 v[0:1], -v[6:7], v[4:5], v[0:1]
1993; VI-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
1994; VI-NEXT:    s_endpgm
1995                        <2 x double> addrspace(1)* %in2) #0 {
1996   %gep2 = getelementptr <2 x double>, <2 x double> addrspace(1)* %in2, i32 4
1997   %r0 = load <2 x double>, <2 x double> addrspace(1)* %in1, align 16
1998   %r1 = load <2 x double>, <2 x double> addrspace(1)* %gep2, align 16
1999   %r2 = frem <2 x double> %r0, %r1
2000   store <2 x double> %r2, <2 x double> addrspace(1)* %out, align 16
2001   ret void
2002}
2003
; Function attribute groups. Both are nounwind and set "denormal-fp-math-f32"
; to preserve-sign for input and output; they differ only in "unsafe-fp-math"
; (#0 = false, #1 = true). #0 is the group the vector frem kernels above use.
2004attributes #0 = { nounwind "unsafe-fp-math"="false" "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
2005attributes #1 = { nounwind "unsafe-fp-math"="true" "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
2006