1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN:  llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -march=amdgcn -mattr=+mad-mac-f32-insts -verify-machineinstrs  < %s | FileCheck --check-prefix=SI %s
3; RUN:  llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck --check-prefix=CI %s
4; RUN:  llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefix=VI %s
5; RUN:  llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9 %s
6; RUN:  llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s
7
; frem_f16: kernel test for a plain `frem` on half (no fast-math flags).
; The IR loads %in1[0] and %in2[4], takes the remainder and stores it to %out.
; Per the autogenerated checks below, the SI and CI runs convert both operands
; to f32 and expect the v_div_scale / v_rcp / v_div_fmas / v_div_fixup sequence
; followed by v_trunc_f32 and v_fma_f32, then convert back to f16. The VI, GFX9
; and GFX10 runs expect v_rcp_f32 + v_div_fixup_f16, then v_trunc_f16 and
; v_fma_f16. The check lines are script-generated (see the note on the first
; line of the file); regenerate them instead of editing them by hand.
8define amdgpu_kernel void @frem_f16(half addrspace(1)* %out, half addrspace(1)* %in1,
9; SI-LABEL: frem_f16:
10; SI:       ; %bb.0:
11; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
12; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
13; SI-NEXT:    s_mov_b32 s11, 0xf000
14; SI-NEXT:    s_mov_b32 s10, -1
15; SI-NEXT:    s_waitcnt lgkmcnt(0)
16; SI-NEXT:    s_mov_b32 s8, s4
17; SI-NEXT:    s_mov_b32 s9, s5
18; SI-NEXT:    s_mov_b32 s4, s6
19; SI-NEXT:    s_mov_b32 s5, s7
20; SI-NEXT:    s_mov_b32 s6, s10
21; SI-NEXT:    s_mov_b32 s7, s11
22; SI-NEXT:    s_mov_b32 s2, s10
23; SI-NEXT:    s_mov_b32 s3, s11
24; SI-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
25; SI-NEXT:    s_waitcnt vmcnt(0)
26; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
27; SI-NEXT:    buffer_load_ushort v1, off, s[0:3], 0 offset:8
28; SI-NEXT:    s_waitcnt vmcnt(0)
29; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
30; SI-NEXT:    v_div_scale_f32 v2, vcc, v0, v1, v0
31; SI-NEXT:    v_div_scale_f32 v3, s[0:1], v1, v1, v0
32; SI-NEXT:    v_rcp_f32_e32 v4, v3
33; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
34; SI-NEXT:    v_fma_f32 v5, -v3, v4, 1.0
35; SI-NEXT:    v_fma_f32 v4, v5, v4, v4
36; SI-NEXT:    v_mul_f32_e32 v5, v2, v4
37; SI-NEXT:    v_fma_f32 v6, -v3, v5, v2
38; SI-NEXT:    v_fma_f32 v5, v6, v4, v5
39; SI-NEXT:    v_fma_f32 v2, -v3, v5, v2
40; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
41; SI-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
42; SI-NEXT:    v_div_fixup_f32 v2, v2, v1, v0
43; SI-NEXT:    v_trunc_f32_e32 v2, v2
44; SI-NEXT:    v_fma_f32 v0, -v2, v1, v0
45; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
46; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
47; SI-NEXT:    buffer_store_short v0, off, s[8:11], 0
48; SI-NEXT:    s_endpgm
49;
50; CI-LABEL: frem_f16:
51; CI:       ; %bb.0:
52; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
53; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
54; CI-NEXT:    s_mov_b32 s11, 0xf000
55; CI-NEXT:    s_mov_b32 s10, -1
56; CI-NEXT:    s_mov_b32 s2, s10
57; CI-NEXT:    s_waitcnt lgkmcnt(0)
58; CI-NEXT:    s_mov_b32 s8, s4
59; CI-NEXT:    s_mov_b32 s9, s5
60; CI-NEXT:    s_mov_b32 s4, s6
61; CI-NEXT:    s_mov_b32 s5, s7
62; CI-NEXT:    s_mov_b32 s6, s10
63; CI-NEXT:    s_mov_b32 s7, s11
64; CI-NEXT:    s_mov_b32 s3, s11
65; CI-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
66; CI-NEXT:    buffer_load_ushort v1, off, s[0:3], 0 offset:8
67; CI-NEXT:    s_waitcnt vmcnt(1)
68; CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
69; CI-NEXT:    s_waitcnt vmcnt(0)
70; CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
71; CI-NEXT:    v_div_scale_f32 v3, s[0:1], v1, v1, v0
72; CI-NEXT:    v_div_scale_f32 v2, vcc, v0, v1, v0
73; CI-NEXT:    v_rcp_f32_e32 v4, v3
74; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
75; CI-NEXT:    v_fma_f32 v5, -v3, v4, 1.0
76; CI-NEXT:    v_fma_f32 v4, v5, v4, v4
77; CI-NEXT:    v_mul_f32_e32 v5, v2, v4
78; CI-NEXT:    v_fma_f32 v6, -v3, v5, v2
79; CI-NEXT:    v_fma_f32 v5, v6, v4, v5
80; CI-NEXT:    v_fma_f32 v2, -v3, v5, v2
81; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
82; CI-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
83; CI-NEXT:    v_div_fixup_f32 v2, v2, v1, v0
84; CI-NEXT:    v_trunc_f32_e32 v2, v2
85; CI-NEXT:    v_fma_f32 v0, -v2, v1, v0
86; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
87; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
88; CI-NEXT:    buffer_store_short v0, off, s[8:11], 0
89; CI-NEXT:    s_endpgm
90;
91; VI-LABEL: frem_f16:
92; VI:       ; %bb.0:
93; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
94; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
95; VI-NEXT:    s_waitcnt lgkmcnt(0)
96; VI-NEXT:    v_mov_b32_e32 v2, s6
97; VI-NEXT:    s_add_u32 s0, s0, 8
98; VI-NEXT:    v_mov_b32_e32 v3, s7
99; VI-NEXT:    s_addc_u32 s1, s1, 0
100; VI-NEXT:    flat_load_ushort v4, v[2:3]
101; VI-NEXT:    v_mov_b32_e32 v3, s1
102; VI-NEXT:    v_mov_b32_e32 v2, s0
103; VI-NEXT:    flat_load_ushort v2, v[2:3]
104; VI-NEXT:    v_mov_b32_e32 v0, s4
105; VI-NEXT:    v_mov_b32_e32 v1, s5
106; VI-NEXT:    s_waitcnt vmcnt(1)
107; VI-NEXT:    v_cvt_f32_f16_e32 v3, v4
108; VI-NEXT:    s_waitcnt vmcnt(0)
109; VI-NEXT:    v_cvt_f32_f16_e32 v5, v2
110; VI-NEXT:    v_rcp_f32_e32 v5, v5
111; VI-NEXT:    v_mul_f32_e32 v3, v3, v5
112; VI-NEXT:    v_cvt_f16_f32_e32 v3, v3
113; VI-NEXT:    v_div_fixup_f16 v3, v3, v2, v4
114; VI-NEXT:    v_trunc_f16_e32 v3, v3
115; VI-NEXT:    v_fma_f16 v2, -v3, v2, v4
116; VI-NEXT:    flat_store_short v[0:1], v2
117; VI-NEXT:    s_endpgm
118;
119; GFX9-LABEL: frem_f16:
120; GFX9:       ; %bb.0:
121; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
122; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
123; GFX9-NEXT:    v_mov_b32_e32 v0, 0
124; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
125; GFX9-NEXT:    global_load_ushort v1, v0, s[6:7]
126; GFX9-NEXT:    global_load_ushort v2, v0, s[2:3] offset:8
127; GFX9-NEXT:    s_waitcnt vmcnt(1)
128; GFX9-NEXT:    v_cvt_f32_f16_e32 v3, v1
129; GFX9-NEXT:    s_waitcnt vmcnt(0)
130; GFX9-NEXT:    v_cvt_f32_f16_e32 v4, v2
131; GFX9-NEXT:    v_rcp_f32_e32 v4, v4
132; GFX9-NEXT:    v_mul_f32_e32 v3, v3, v4
133; GFX9-NEXT:    v_cvt_f16_f32_e32 v3, v3
134; GFX9-NEXT:    v_div_fixup_f16 v3, v3, v2, v1
135; GFX9-NEXT:    v_trunc_f16_e32 v3, v3
136; GFX9-NEXT:    v_fma_f16 v1, -v3, v2, v1
137; GFX9-NEXT:    global_store_short v0, v1, s[4:5]
138; GFX9-NEXT:    s_endpgm
139;
140; GFX10-LABEL: frem_f16:
141; GFX10:       ; %bb.0:
142; GFX10-NEXT:    s_clause 0x1
143; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
144; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
145; GFX10-NEXT:    v_mov_b32_e32 v0, 0
146; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
147; GFX10-NEXT:    s_clause 0x1
148; GFX10-NEXT:    global_load_ushort v1, v0, s[6:7]
149; GFX10-NEXT:    global_load_ushort v2, v0, s[2:3] offset:8
150; GFX10-NEXT:    s_waitcnt vmcnt(1)
151; GFX10-NEXT:    v_cvt_f32_f16_e32 v3, v1
152; GFX10-NEXT:    s_waitcnt vmcnt(0)
153; GFX10-NEXT:    v_cvt_f32_f16_e32 v4, v2
154; GFX10-NEXT:    v_rcp_f32_e32 v4, v4
155; GFX10-NEXT:    v_mul_f32_e32 v3, v3, v4
156; GFX10-NEXT:    v_cvt_f16_f32_e32 v3, v3
157; GFX10-NEXT:    v_div_fixup_f16 v3, v3, v2, v1
158; GFX10-NEXT:    v_trunc_f16_e32 v3, v3
159; GFX10-NEXT:    v_fma_f16 v1, -v3, v2, v1
160; GFX10-NEXT:    global_store_short v0, v1, s[4:5]
161; GFX10-NEXT:    s_endpgm
162                      half addrspace(1)* %in2) #0 {
; Attribute group #0 is defined outside the visible part of this file.
; The divisor is read from element 4 of %in2, which shows up in the checks
; above as a byte offset of 8 (offset:8, or an s_add_u32 by 8 on VI).
163   %gep2 = getelementptr half, half addrspace(1)* %in2, i32 4
164   %r0 = load half, half addrspace(1)* %in1, align 4
165   %r1 = load half, half addrspace(1)* %gep2, align 4
; Operation under test: frem with no fast-math flags.
166   %r2 = frem half %r0, %r1
167   store half %r2, half addrspace(1)* %out, align 4
168   ret void
169}
170
; fast_frem_f16: kernel test for `frem fast` on half.
; The IR loads %in1[0] and %in2[4], takes the remainder and stores it to %out.
; Per the autogenerated checks below, no div_scale / div_fmas / div_fixup
; sequence is expected. The SI and CI runs convert to f32 and expect
; v_rcp_f32, v_mul_f32, v_trunc_f32 and v_fma_f32 before converting back; the
; VI, GFX9 and GFX10 runs expect v_rcp_f16, v_mul_f16, v_trunc_f16 and
; v_fma_f16. The check lines are script-generated; regenerate them instead of
; editing them by hand.
171define amdgpu_kernel void @fast_frem_f16(half addrspace(1)* %out, half addrspace(1)* %in1,
172; SI-LABEL: fast_frem_f16:
173; SI:       ; %bb.0:
174; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
175; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
176; SI-NEXT:    s_mov_b32 s11, 0xf000
177; SI-NEXT:    s_mov_b32 s10, -1
178; SI-NEXT:    s_waitcnt lgkmcnt(0)
179; SI-NEXT:    s_mov_b32 s8, s4
180; SI-NEXT:    s_mov_b32 s9, s5
181; SI-NEXT:    s_mov_b32 s4, s6
182; SI-NEXT:    s_mov_b32 s5, s7
183; SI-NEXT:    s_mov_b32 s6, s10
184; SI-NEXT:    s_mov_b32 s7, s11
185; SI-NEXT:    s_mov_b32 s2, s10
186; SI-NEXT:    s_mov_b32 s3, s11
187; SI-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
188; SI-NEXT:    s_waitcnt vmcnt(0)
189; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
190; SI-NEXT:    buffer_load_ushort v1, off, s[0:3], 0 offset:8
191; SI-NEXT:    s_waitcnt vmcnt(0)
192; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
193; SI-NEXT:    v_rcp_f32_e32 v2, v1
194; SI-NEXT:    v_mul_f32_e32 v2, v0, v2
195; SI-NEXT:    v_trunc_f32_e32 v2, v2
196; SI-NEXT:    v_fma_f32 v0, -v2, v1, v0
197; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
198; SI-NEXT:    buffer_store_short v0, off, s[8:11], 0
199; SI-NEXT:    s_endpgm
200;
201; CI-LABEL: fast_frem_f16:
202; CI:       ; %bb.0:
203; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
204; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
205; CI-NEXT:    s_mov_b32 s11, 0xf000
206; CI-NEXT:    s_mov_b32 s10, -1
207; CI-NEXT:    s_mov_b32 s2, s10
208; CI-NEXT:    s_mov_b32 s3, s11
209; CI-NEXT:    s_waitcnt lgkmcnt(0)
210; CI-NEXT:    buffer_load_ushort v1, off, s[0:3], 0 offset:8
211; CI-NEXT:    s_mov_b32 s8, s4
212; CI-NEXT:    s_mov_b32 s9, s5
213; CI-NEXT:    s_mov_b32 s4, s6
214; CI-NEXT:    s_mov_b32 s5, s7
215; CI-NEXT:    s_mov_b32 s6, s10
216; CI-NEXT:    s_mov_b32 s7, s11
217; CI-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
218; CI-NEXT:    s_waitcnt vmcnt(1)
219; CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
220; CI-NEXT:    v_rcp_f32_e32 v2, v1
221; CI-NEXT:    s_waitcnt vmcnt(0)
222; CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
223; CI-NEXT:    v_mul_f32_e32 v2, v0, v2
224; CI-NEXT:    v_trunc_f32_e32 v2, v2
225; CI-NEXT:    v_fma_f32 v0, -v2, v1, v0
226; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
227; CI-NEXT:    buffer_store_short v0, off, s[8:11], 0
228; CI-NEXT:    s_endpgm
229;
230; VI-LABEL: fast_frem_f16:
231; VI:       ; %bb.0:
232; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
233; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
234; VI-NEXT:    s_waitcnt lgkmcnt(0)
235; VI-NEXT:    v_mov_b32_e32 v2, s6
236; VI-NEXT:    s_add_u32 s0, s0, 8
237; VI-NEXT:    v_mov_b32_e32 v3, s7
238; VI-NEXT:    s_addc_u32 s1, s1, 0
239; VI-NEXT:    flat_load_ushort v4, v[2:3]
240; VI-NEXT:    v_mov_b32_e32 v3, s1
241; VI-NEXT:    v_mov_b32_e32 v2, s0
242; VI-NEXT:    flat_load_ushort v2, v[2:3]
243; VI-NEXT:    v_mov_b32_e32 v0, s4
244; VI-NEXT:    v_mov_b32_e32 v1, s5
245; VI-NEXT:    s_waitcnt vmcnt(0)
246; VI-NEXT:    v_rcp_f16_e32 v3, v2
247; VI-NEXT:    v_mul_f16_e32 v3, v4, v3
248; VI-NEXT:    v_trunc_f16_e32 v3, v3
249; VI-NEXT:    v_fma_f16 v2, -v3, v2, v4
250; VI-NEXT:    flat_store_short v[0:1], v2
251; VI-NEXT:    s_endpgm
252;
253; GFX9-LABEL: fast_frem_f16:
254; GFX9:       ; %bb.0:
255; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
256; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
257; GFX9-NEXT:    v_mov_b32_e32 v0, 0
258; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
259; GFX9-NEXT:    global_load_ushort v1, v0, s[6:7]
260; GFX9-NEXT:    global_load_ushort v2, v0, s[2:3] offset:8
261; GFX9-NEXT:    s_waitcnt vmcnt(0)
262; GFX9-NEXT:    v_rcp_f16_e32 v3, v2
263; GFX9-NEXT:    v_mul_f16_e32 v3, v1, v3
264; GFX9-NEXT:    v_trunc_f16_e32 v3, v3
265; GFX9-NEXT:    v_fma_f16 v1, -v3, v2, v1
266; GFX9-NEXT:    global_store_short v0, v1, s[4:5]
267; GFX9-NEXT:    s_endpgm
268;
269; GFX10-LABEL: fast_frem_f16:
270; GFX10:       ; %bb.0:
271; GFX10-NEXT:    s_clause 0x1
272; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
273; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
274; GFX10-NEXT:    v_mov_b32_e32 v0, 0
275; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
276; GFX10-NEXT:    s_clause 0x1
277; GFX10-NEXT:    global_load_ushort v1, v0, s[6:7]
278; GFX10-NEXT:    global_load_ushort v2, v0, s[2:3] offset:8
279; GFX10-NEXT:    s_waitcnt vmcnt(0)
280; GFX10-NEXT:    v_rcp_f16_e32 v3, v2
281; GFX10-NEXT:    v_mul_f16_e32 v3, v1, v3
282; GFX10-NEXT:    v_trunc_f16_e32 v3, v3
283; GFX10-NEXT:    v_fma_f16 v1, -v3, v2, v1
284; GFX10-NEXT:    global_store_short v0, v1, s[4:5]
285; GFX10-NEXT:    s_endpgm
286                      half addrspace(1)* %in2) #0 {
; Attribute group #0 is defined outside the visible part of this file.
; The divisor is read from element 4 of %in2, which shows up in the checks
; above as a byte offset of 8 (offset:8, or an s_add_u32 by 8 on VI).
287   %gep2 = getelementptr half, half addrspace(1)* %in2, i32 4
288   %r0 = load half, half addrspace(1)* %in1, align 4
289   %r1 = load half, half addrspace(1)* %gep2, align 4
; Operation under test: frem carrying the `fast` flag.
290   %r2 = frem fast half %r0, %r1
291   store half %r2, half addrspace(1)* %out, align 4
292   ret void
293}
294
; unsafe_frem_f16: kernel test for `frem afn` on half, using attribute group #1.
; The IR loads %in1[0] and %in2[4], takes the remainder and stores it to %out.
; Per the autogenerated checks below, the expected lowering is the short
; reciprocal form: rcp, mul, trunc, fma (in f32 with conversions on the SI and
; CI runs, directly in f16 on the VI, GFX9 and GFX10 runs), with no div_scale /
; div_fmas / div_fixup sequence. The check lines are script-generated;
; regenerate them instead of editing them by hand.
295define amdgpu_kernel void @unsafe_frem_f16(half addrspace(1)* %out, half addrspace(1)* %in1,
296; SI-LABEL: unsafe_frem_f16:
297; SI:       ; %bb.0:
298; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
299; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
300; SI-NEXT:    s_mov_b32 s11, 0xf000
301; SI-NEXT:    s_mov_b32 s10, -1
302; SI-NEXT:    s_waitcnt lgkmcnt(0)
303; SI-NEXT:    s_mov_b32 s8, s4
304; SI-NEXT:    s_mov_b32 s9, s5
305; SI-NEXT:    s_mov_b32 s4, s6
306; SI-NEXT:    s_mov_b32 s5, s7
307; SI-NEXT:    s_mov_b32 s6, s10
308; SI-NEXT:    s_mov_b32 s7, s11
309; SI-NEXT:    s_mov_b32 s2, s10
310; SI-NEXT:    s_mov_b32 s3, s11
311; SI-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
312; SI-NEXT:    s_waitcnt vmcnt(0)
313; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
314; SI-NEXT:    buffer_load_ushort v1, off, s[0:3], 0 offset:8
315; SI-NEXT:    s_waitcnt vmcnt(0)
316; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
317; SI-NEXT:    v_rcp_f32_e32 v2, v1
318; SI-NEXT:    v_mul_f32_e32 v2, v0, v2
319; SI-NEXT:    v_trunc_f32_e32 v2, v2
320; SI-NEXT:    v_fma_f32 v0, -v2, v1, v0
321; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
322; SI-NEXT:    buffer_store_short v0, off, s[8:11], 0
323; SI-NEXT:    s_endpgm
324;
325; CI-LABEL: unsafe_frem_f16:
326; CI:       ; %bb.0:
327; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
328; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
329; CI-NEXT:    s_mov_b32 s11, 0xf000
330; CI-NEXT:    s_mov_b32 s10, -1
331; CI-NEXT:    s_mov_b32 s2, s10
332; CI-NEXT:    s_mov_b32 s3, s11
333; CI-NEXT:    s_waitcnt lgkmcnt(0)
334; CI-NEXT:    buffer_load_ushort v1, off, s[0:3], 0 offset:8
335; CI-NEXT:    s_mov_b32 s8, s4
336; CI-NEXT:    s_mov_b32 s9, s5
337; CI-NEXT:    s_mov_b32 s4, s6
338; CI-NEXT:    s_mov_b32 s5, s7
339; CI-NEXT:    s_mov_b32 s6, s10
340; CI-NEXT:    s_mov_b32 s7, s11
341; CI-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
342; CI-NEXT:    s_waitcnt vmcnt(1)
343; CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
344; CI-NEXT:    v_rcp_f32_e32 v2, v1
345; CI-NEXT:    s_waitcnt vmcnt(0)
346; CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
347; CI-NEXT:    v_mul_f32_e32 v2, v0, v2
348; CI-NEXT:    v_trunc_f32_e32 v2, v2
349; CI-NEXT:    v_fma_f32 v0, -v2, v1, v0
350; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
351; CI-NEXT:    buffer_store_short v0, off, s[8:11], 0
352; CI-NEXT:    s_endpgm
353;
354; VI-LABEL: unsafe_frem_f16:
355; VI:       ; %bb.0:
356; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
357; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
358; VI-NEXT:    s_waitcnt lgkmcnt(0)
359; VI-NEXT:    v_mov_b32_e32 v2, s6
360; VI-NEXT:    s_add_u32 s0, s0, 8
361; VI-NEXT:    v_mov_b32_e32 v3, s7
362; VI-NEXT:    s_addc_u32 s1, s1, 0
363; VI-NEXT:    flat_load_ushort v4, v[2:3]
364; VI-NEXT:    v_mov_b32_e32 v3, s1
365; VI-NEXT:    v_mov_b32_e32 v2, s0
366; VI-NEXT:    flat_load_ushort v2, v[2:3]
367; VI-NEXT:    v_mov_b32_e32 v0, s4
368; VI-NEXT:    v_mov_b32_e32 v1, s5
369; VI-NEXT:    s_waitcnt vmcnt(0)
370; VI-NEXT:    v_rcp_f16_e32 v3, v2
371; VI-NEXT:    v_mul_f16_e32 v3, v4, v3
372; VI-NEXT:    v_trunc_f16_e32 v3, v3
373; VI-NEXT:    v_fma_f16 v2, -v3, v2, v4
374; VI-NEXT:    flat_store_short v[0:1], v2
375; VI-NEXT:    s_endpgm
376;
377; GFX9-LABEL: unsafe_frem_f16:
378; GFX9:       ; %bb.0:
379; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
380; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
381; GFX9-NEXT:    v_mov_b32_e32 v0, 0
382; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
383; GFX9-NEXT:    global_load_ushort v1, v0, s[6:7]
384; GFX9-NEXT:    global_load_ushort v2, v0, s[2:3] offset:8
385; GFX9-NEXT:    s_waitcnt vmcnt(0)
386; GFX9-NEXT:    v_rcp_f16_e32 v3, v2
387; GFX9-NEXT:    v_mul_f16_e32 v3, v1, v3
388; GFX9-NEXT:    v_trunc_f16_e32 v3, v3
389; GFX9-NEXT:    v_fma_f16 v1, -v3, v2, v1
390; GFX9-NEXT:    global_store_short v0, v1, s[4:5]
391; GFX9-NEXT:    s_endpgm
392;
393; GFX10-LABEL: unsafe_frem_f16:
394; GFX10:       ; %bb.0:
395; GFX10-NEXT:    s_clause 0x1
396; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
397; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
398; GFX10-NEXT:    v_mov_b32_e32 v0, 0
399; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
400; GFX10-NEXT:    s_clause 0x1
401; GFX10-NEXT:    global_load_ushort v1, v0, s[6:7]
402; GFX10-NEXT:    global_load_ushort v2, v0, s[2:3] offset:8
403; GFX10-NEXT:    s_waitcnt vmcnt(0)
404; GFX10-NEXT:    v_rcp_f16_e32 v3, v2
405; GFX10-NEXT:    v_mul_f16_e32 v3, v1, v3
406; GFX10-NEXT:    v_trunc_f16_e32 v3, v3
407; GFX10-NEXT:    v_fma_f16 v1, -v3, v2, v1
408; GFX10-NEXT:    global_store_short v0, v1, s[4:5]
409; GFX10-NEXT:    s_endpgm
410                             half addrspace(1)* %in2) #1 {
; Attribute group #1 is defined outside the visible part of this file;
; presumably it enables unsafe FP math, as the function name suggests -- TODO confirm.
; The divisor is read from element 4 of %in2, which shows up in the checks
; above as a byte offset of 8 (offset:8, or an s_add_u32 by 8 on VI).
411   %gep2 = getelementptr half, half addrspace(1)* %in2, i32 4
412   %r0 = load half, half addrspace(1)* %in1, align 4
413   %r1 = load half, half addrspace(1)* %gep2, align 4
; Operation under test: frem carrying only the `afn` flag.
414   %r2 = frem afn half %r0, %r1
415   store half %r2, half addrspace(1)* %out, align 4
416   ret void
417}
418
; frem_f32: kernel test for a plain `frem` on float (no fast-math flags).
; The IR loads %in1[0] and %in2[4], takes the remainder and stores it to %out.
; Per the autogenerated checks below, every run expects the full division
; sequence (two v_div_scale_f32, v_rcp_f32, an fma refinement chain,
; v_div_fmas_f32, v_div_fixup_f32) followed by v_trunc_f32 and a final
; v_fma_f32. The refinement chain is bracketed by s_setreg_imm32_b32 writes to
; HW_REG_MODE on the SI, CI, VI and GFX9 runs and by s_denorm_mode on the GFX10
; run. The check lines are script-generated; regenerate them instead of editing
; them by hand.
419define amdgpu_kernel void @frem_f32(float addrspace(1)* %out, float addrspace(1)* %in1,
420; SI-LABEL: frem_f32:
421; SI:       ; %bb.0:
422; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
423; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
424; SI-NEXT:    s_mov_b32 s11, 0xf000
425; SI-NEXT:    s_mov_b32 s10, -1
426; SI-NEXT:    s_waitcnt lgkmcnt(0)
427; SI-NEXT:    s_mov_b32 s8, s4
428; SI-NEXT:    s_mov_b32 s9, s5
429; SI-NEXT:    s_mov_b32 s4, s6
430; SI-NEXT:    s_mov_b32 s5, s7
431; SI-NEXT:    s_mov_b32 s6, s10
432; SI-NEXT:    s_mov_b32 s7, s11
433; SI-NEXT:    s_mov_b32 s2, s10
434; SI-NEXT:    s_mov_b32 s3, s11
435; SI-NEXT:    buffer_load_dword v0, off, s[4:7], 0
436; SI-NEXT:    buffer_load_dword v1, off, s[0:3], 0 offset:16
437; SI-NEXT:    s_waitcnt vmcnt(0)
438; SI-NEXT:    v_div_scale_f32 v2, vcc, v0, v1, v0
439; SI-NEXT:    v_div_scale_f32 v3, s[0:1], v1, v1, v0
440; SI-NEXT:    v_rcp_f32_e32 v4, v3
441; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
442; SI-NEXT:    v_fma_f32 v5, -v3, v4, 1.0
443; SI-NEXT:    v_fma_f32 v4, v5, v4, v4
444; SI-NEXT:    v_mul_f32_e32 v5, v2, v4
445; SI-NEXT:    v_fma_f32 v6, -v3, v5, v2
446; SI-NEXT:    v_fma_f32 v5, v6, v4, v5
447; SI-NEXT:    v_fma_f32 v2, -v3, v5, v2
448; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
449; SI-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
450; SI-NEXT:    v_div_fixup_f32 v2, v2, v1, v0
451; SI-NEXT:    v_trunc_f32_e32 v2, v2
452; SI-NEXT:    v_fma_f32 v0, -v2, v1, v0
453; SI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
454; SI-NEXT:    s_endpgm
455;
456; CI-LABEL: frem_f32:
457; CI:       ; %bb.0:
458; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
459; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
460; CI-NEXT:    s_mov_b32 s11, 0xf000
461; CI-NEXT:    s_mov_b32 s10, -1
462; CI-NEXT:    s_mov_b32 s2, s10
463; CI-NEXT:    s_waitcnt lgkmcnt(0)
464; CI-NEXT:    s_mov_b32 s8, s4
465; CI-NEXT:    s_mov_b32 s9, s5
466; CI-NEXT:    s_mov_b32 s4, s6
467; CI-NEXT:    s_mov_b32 s5, s7
468; CI-NEXT:    s_mov_b32 s6, s10
469; CI-NEXT:    s_mov_b32 s7, s11
470; CI-NEXT:    s_mov_b32 s3, s11
471; CI-NEXT:    buffer_load_dword v0, off, s[4:7], 0
472; CI-NEXT:    buffer_load_dword v1, off, s[0:3], 0 offset:16
473; CI-NEXT:    s_waitcnt vmcnt(0)
474; CI-NEXT:    v_div_scale_f32 v3, s[0:1], v1, v1, v0
475; CI-NEXT:    v_div_scale_f32 v2, vcc, v0, v1, v0
476; CI-NEXT:    v_rcp_f32_e32 v4, v3
477; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
478; CI-NEXT:    v_fma_f32 v5, -v3, v4, 1.0
479; CI-NEXT:    v_fma_f32 v4, v5, v4, v4
480; CI-NEXT:    v_mul_f32_e32 v5, v2, v4
481; CI-NEXT:    v_fma_f32 v6, -v3, v5, v2
482; CI-NEXT:    v_fma_f32 v5, v6, v4, v5
483; CI-NEXT:    v_fma_f32 v2, -v3, v5, v2
484; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
485; CI-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
486; CI-NEXT:    v_div_fixup_f32 v2, v2, v1, v0
487; CI-NEXT:    v_trunc_f32_e32 v2, v2
488; CI-NEXT:    v_fma_f32 v0, -v2, v1, v0
489; CI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
490; CI-NEXT:    s_endpgm
491;
492; VI-LABEL: frem_f32:
493; VI:       ; %bb.0:
494; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
495; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
496; VI-NEXT:    s_waitcnt lgkmcnt(0)
497; VI-NEXT:    v_mov_b32_e32 v2, s6
498; VI-NEXT:    s_add_u32 s0, s0, 16
499; VI-NEXT:    v_mov_b32_e32 v3, s7
500; VI-NEXT:    s_addc_u32 s1, s1, 0
501; VI-NEXT:    flat_load_dword v4, v[2:3]
502; VI-NEXT:    v_mov_b32_e32 v3, s1
503; VI-NEXT:    v_mov_b32_e32 v2, s0
504; VI-NEXT:    flat_load_dword v2, v[2:3]
505; VI-NEXT:    v_mov_b32_e32 v0, s4
506; VI-NEXT:    v_mov_b32_e32 v1, s5
507; VI-NEXT:    s_waitcnt vmcnt(0)
508; VI-NEXT:    v_div_scale_f32 v5, s[0:1], v2, v2, v4
509; VI-NEXT:    v_div_scale_f32 v3, vcc, v4, v2, v4
510; VI-NEXT:    v_rcp_f32_e32 v6, v5
511; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
512; VI-NEXT:    v_fma_f32 v7, -v5, v6, 1.0
513; VI-NEXT:    v_fma_f32 v6, v7, v6, v6
514; VI-NEXT:    v_mul_f32_e32 v7, v3, v6
515; VI-NEXT:    v_fma_f32 v8, -v5, v7, v3
516; VI-NEXT:    v_fma_f32 v7, v8, v6, v7
517; VI-NEXT:    v_fma_f32 v3, -v5, v7, v3
518; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
519; VI-NEXT:    v_div_fmas_f32 v3, v3, v6, v7
520; VI-NEXT:    v_div_fixup_f32 v3, v3, v2, v4
521; VI-NEXT:    v_trunc_f32_e32 v3, v3
522; VI-NEXT:    v_fma_f32 v2, -v3, v2, v4
523; VI-NEXT:    flat_store_dword v[0:1], v2
524; VI-NEXT:    s_endpgm
525;
526; GFX9-LABEL: frem_f32:
527; GFX9:       ; %bb.0:
528; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
529; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
530; GFX9-NEXT:    v_mov_b32_e32 v0, 0
531; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
532; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
533; GFX9-NEXT:    global_load_dword v2, v0, s[2:3] offset:16
534; GFX9-NEXT:    s_waitcnt vmcnt(0)
535; GFX9-NEXT:    v_div_scale_f32 v4, s[0:1], v2, v2, v1
536; GFX9-NEXT:    v_div_scale_f32 v3, vcc, v1, v2, v1
537; GFX9-NEXT:    v_rcp_f32_e32 v5, v4
538; GFX9-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
539; GFX9-NEXT:    v_fma_f32 v6, -v4, v5, 1.0
540; GFX9-NEXT:    v_fma_f32 v5, v6, v5, v5
541; GFX9-NEXT:    v_mul_f32_e32 v6, v3, v5
542; GFX9-NEXT:    v_fma_f32 v7, -v4, v6, v3
543; GFX9-NEXT:    v_fma_f32 v6, v7, v5, v6
544; GFX9-NEXT:    v_fma_f32 v3, -v4, v6, v3
545; GFX9-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
546; GFX9-NEXT:    v_div_fmas_f32 v3, v3, v5, v6
547; GFX9-NEXT:    v_div_fixup_f32 v3, v3, v2, v1
548; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
549; GFX9-NEXT:    v_fma_f32 v1, -v3, v2, v1
550; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
551; GFX9-NEXT:    s_endpgm
552;
553; GFX10-LABEL: frem_f32:
554; GFX10:       ; %bb.0:
555; GFX10-NEXT:    s_clause 0x1
556; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
557; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
558; GFX10-NEXT:    v_mov_b32_e32 v0, 0
559; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
560; GFX10-NEXT:    s_clause 0x1
561; GFX10-NEXT:    global_load_dword v1, v0, s[6:7]
562; GFX10-NEXT:    global_load_dword v2, v0, s[2:3] offset:16
563; GFX10-NEXT:    s_waitcnt vmcnt(0)
564; GFX10-NEXT:    v_div_scale_f32 v4, s0, v2, v2, v1
565; GFX10-NEXT:    v_div_scale_f32 v3, vcc_lo, v1, v2, v1
566; GFX10-NEXT:    v_rcp_f32_e32 v5, v4
567; GFX10-NEXT:    s_denorm_mode 15
568; GFX10-NEXT:    v_fma_f32 v6, -v4, v5, 1.0
569; GFX10-NEXT:    v_fmac_f32_e32 v5, v6, v5
570; GFX10-NEXT:    v_mul_f32_e32 v6, v3, v5
571; GFX10-NEXT:    v_fma_f32 v7, -v4, v6, v3
572; GFX10-NEXT:    v_fmac_f32_e32 v6, v7, v5
573; GFX10-NEXT:    v_fma_f32 v3, -v4, v6, v3
574; GFX10-NEXT:    s_denorm_mode 12
575; GFX10-NEXT:    v_div_fmas_f32 v3, v3, v5, v6
576; GFX10-NEXT:    v_div_fixup_f32 v3, v3, v2, v1
577; GFX10-NEXT:    v_trunc_f32_e32 v3, v3
578; GFX10-NEXT:    v_fma_f32 v1, -v3, v2, v1
579; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
580; GFX10-NEXT:    s_endpgm
581                      float addrspace(1)* %in2) #0 {
; Attribute group #0 is defined outside the visible part of this file.
; The divisor is read from element 4 of %in2, which shows up in the checks
; above as a byte offset of 16 (offset:16, or an s_add_u32 by 16 on VI).
582   %gep2 = getelementptr float, float addrspace(1)* %in2, i32 4
583   %r0 = load float, float addrspace(1)* %in1, align 4
584   %r1 = load float, float addrspace(1)* %gep2, align 4
; Operation under test: frem with no fast-math flags.
585   %r2 = frem float %r0, %r1
586   store float %r2, float addrspace(1)* %out, align 4
587   ret void
588}
589
; fast_frem_f32: kernel test for `frem fast` on float.
; The IR loads %in1[0] and %in2[4], takes the remainder and stores it to %out.
; Per the autogenerated checks below, every run expects the short form
; v_rcp_f32, v_mul_f32, v_trunc_f32, v_fma_f32, with no div_scale / div_fmas /
; div_fixup sequence and no mode-register writes. The check lines are
; script-generated; regenerate them instead of editing them by hand.
590define amdgpu_kernel void @fast_frem_f32(float addrspace(1)* %out, float addrspace(1)* %in1,
591; SI-LABEL: fast_frem_f32:
592; SI:       ; %bb.0:
593; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
594; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
595; SI-NEXT:    s_mov_b32 s11, 0xf000
596; SI-NEXT:    s_mov_b32 s10, -1
597; SI-NEXT:    s_waitcnt lgkmcnt(0)
598; SI-NEXT:    s_mov_b32 s8, s4
599; SI-NEXT:    s_mov_b32 s9, s5
600; SI-NEXT:    s_mov_b32 s4, s6
601; SI-NEXT:    s_mov_b32 s5, s7
602; SI-NEXT:    s_mov_b32 s6, s10
603; SI-NEXT:    s_mov_b32 s7, s11
604; SI-NEXT:    s_mov_b32 s2, s10
605; SI-NEXT:    s_mov_b32 s3, s11
606; SI-NEXT:    buffer_load_dword v0, off, s[4:7], 0
607; SI-NEXT:    buffer_load_dword v1, off, s[0:3], 0 offset:16
608; SI-NEXT:    s_waitcnt vmcnt(0)
609; SI-NEXT:    v_rcp_f32_e32 v2, v1
610; SI-NEXT:    v_mul_f32_e32 v2, v0, v2
611; SI-NEXT:    v_trunc_f32_e32 v2, v2
612; SI-NEXT:    v_fma_f32 v0, -v2, v1, v0
613; SI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
614; SI-NEXT:    s_endpgm
615;
616; CI-LABEL: fast_frem_f32:
617; CI:       ; %bb.0:
618; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
619; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
620; CI-NEXT:    s_mov_b32 s11, 0xf000
621; CI-NEXT:    s_mov_b32 s10, -1
622; CI-NEXT:    s_mov_b32 s2, s10
623; CI-NEXT:    s_waitcnt lgkmcnt(0)
624; CI-NEXT:    s_mov_b32 s8, s4
625; CI-NEXT:    s_mov_b32 s9, s5
626; CI-NEXT:    s_mov_b32 s4, s6
627; CI-NEXT:    s_mov_b32 s5, s7
628; CI-NEXT:    s_mov_b32 s6, s10
629; CI-NEXT:    s_mov_b32 s7, s11
630; CI-NEXT:    s_mov_b32 s3, s11
631; CI-NEXT:    buffer_load_dword v0, off, s[4:7], 0
632; CI-NEXT:    buffer_load_dword v1, off, s[0:3], 0 offset:16
633; CI-NEXT:    s_waitcnt vmcnt(0)
634; CI-NEXT:    v_rcp_f32_e32 v2, v1
635; CI-NEXT:    v_mul_f32_e32 v2, v0, v2
636; CI-NEXT:    v_trunc_f32_e32 v2, v2
637; CI-NEXT:    v_fma_f32 v0, -v2, v1, v0
638; CI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
639; CI-NEXT:    s_endpgm
640;
641; VI-LABEL: fast_frem_f32:
642; VI:       ; %bb.0:
643; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
644; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
645; VI-NEXT:    s_waitcnt lgkmcnt(0)
646; VI-NEXT:    v_mov_b32_e32 v2, s6
647; VI-NEXT:    s_add_u32 s0, s0, 16
648; VI-NEXT:    v_mov_b32_e32 v3, s7
649; VI-NEXT:    s_addc_u32 s1, s1, 0
650; VI-NEXT:    flat_load_dword v4, v[2:3]
651; VI-NEXT:    v_mov_b32_e32 v3, s1
652; VI-NEXT:    v_mov_b32_e32 v2, s0
653; VI-NEXT:    flat_load_dword v2, v[2:3]
654; VI-NEXT:    v_mov_b32_e32 v0, s4
655; VI-NEXT:    v_mov_b32_e32 v1, s5
656; VI-NEXT:    s_waitcnt vmcnt(0)
657; VI-NEXT:    v_rcp_f32_e32 v3, v2
658; VI-NEXT:    v_mul_f32_e32 v3, v4, v3
659; VI-NEXT:    v_trunc_f32_e32 v3, v3
660; VI-NEXT:    v_fma_f32 v2, -v3, v2, v4
661; VI-NEXT:    flat_store_dword v[0:1], v2
662; VI-NEXT:    s_endpgm
663;
664; GFX9-LABEL: fast_frem_f32:
665; GFX9:       ; %bb.0:
666; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
667; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
668; GFX9-NEXT:    v_mov_b32_e32 v0, 0
669; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
670; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
671; GFX9-NEXT:    global_load_dword v2, v0, s[2:3] offset:16
672; GFX9-NEXT:    s_waitcnt vmcnt(0)
673; GFX9-NEXT:    v_rcp_f32_e32 v3, v2
674; GFX9-NEXT:    v_mul_f32_e32 v3, v1, v3
675; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
676; GFX9-NEXT:    v_fma_f32 v1, -v3, v2, v1
677; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
678; GFX9-NEXT:    s_endpgm
679;
680; GFX10-LABEL: fast_frem_f32:
681; GFX10:       ; %bb.0:
682; GFX10-NEXT:    s_clause 0x1
683; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
684; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
685; GFX10-NEXT:    v_mov_b32_e32 v0, 0
686; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
687; GFX10-NEXT:    s_clause 0x1
688; GFX10-NEXT:    global_load_dword v1, v0, s[6:7]
689; GFX10-NEXT:    global_load_dword v2, v0, s[2:3] offset:16
690; GFX10-NEXT:    s_waitcnt vmcnt(0)
691; GFX10-NEXT:    v_rcp_f32_e32 v3, v2
692; GFX10-NEXT:    v_mul_f32_e32 v3, v1, v3
693; GFX10-NEXT:    v_trunc_f32_e32 v3, v3
694; GFX10-NEXT:    v_fma_f32 v1, -v3, v2, v1
695; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
696; GFX10-NEXT:    s_endpgm
697                      float addrspace(1)* %in2) #0 {
; Attribute group #0 is defined outside the visible part of this file.
; The divisor is read from element 4 of %in2, which shows up in the checks
; above as a byte offset of 16 (offset:16, or an s_add_u32 by 16 on VI).
698   %gep2 = getelementptr float, float addrspace(1)* %in2, i32 4
699   %r0 = load float, float addrspace(1)* %in1, align 4
700   %r1 = load float, float addrspace(1)* %gep2, align 4
; Operation under test: frem carrying the `fast` flag.
701   %r2 = frem fast float %r0, %r1
702   store float %r2, float addrspace(1)* %out, align 4
703   ret void
704}
705
; unsafe_frem_f32: kernel test for `frem afn` on float, using attribute group #1.
; The IR loads %in1[0] and %in2[4], takes the remainder and stores it to %out.
; Per the autogenerated checks below, every run expects the short form
; v_rcp_f32, v_mul_f32, v_trunc_f32, v_fma_f32, with no div_scale / div_fmas /
; div_fixup sequence and no mode-register writes. The check lines are
; script-generated; regenerate them instead of editing them by hand.
706define amdgpu_kernel void @unsafe_frem_f32(float addrspace(1)* %out, float addrspace(1)* %in1,
707; SI-LABEL: unsafe_frem_f32:
708; SI:       ; %bb.0:
709; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
710; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
711; SI-NEXT:    s_mov_b32 s11, 0xf000
712; SI-NEXT:    s_mov_b32 s10, -1
713; SI-NEXT:    s_waitcnt lgkmcnt(0)
714; SI-NEXT:    s_mov_b32 s8, s4
715; SI-NEXT:    s_mov_b32 s9, s5
716; SI-NEXT:    s_mov_b32 s4, s6
717; SI-NEXT:    s_mov_b32 s5, s7
718; SI-NEXT:    s_mov_b32 s6, s10
719; SI-NEXT:    s_mov_b32 s7, s11
720; SI-NEXT:    s_mov_b32 s2, s10
721; SI-NEXT:    s_mov_b32 s3, s11
722; SI-NEXT:    buffer_load_dword v0, off, s[4:7], 0
723; SI-NEXT:    buffer_load_dword v1, off, s[0:3], 0 offset:16
724; SI-NEXT:    s_waitcnt vmcnt(0)
725; SI-NEXT:    v_rcp_f32_e32 v2, v1
726; SI-NEXT:    v_mul_f32_e32 v2, v0, v2
727; SI-NEXT:    v_trunc_f32_e32 v2, v2
728; SI-NEXT:    v_fma_f32 v0, -v2, v1, v0
729; SI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
730; SI-NEXT:    s_endpgm
731;
732; CI-LABEL: unsafe_frem_f32:
733; CI:       ; %bb.0:
734; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
735; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
736; CI-NEXT:    s_mov_b32 s11, 0xf000
737; CI-NEXT:    s_mov_b32 s10, -1
738; CI-NEXT:    s_mov_b32 s2, s10
739; CI-NEXT:    s_waitcnt lgkmcnt(0)
740; CI-NEXT:    s_mov_b32 s8, s4
741; CI-NEXT:    s_mov_b32 s9, s5
742; CI-NEXT:    s_mov_b32 s4, s6
743; CI-NEXT:    s_mov_b32 s5, s7
744; CI-NEXT:    s_mov_b32 s6, s10
745; CI-NEXT:    s_mov_b32 s7, s11
746; CI-NEXT:    s_mov_b32 s3, s11
747; CI-NEXT:    buffer_load_dword v0, off, s[4:7], 0
748; CI-NEXT:    buffer_load_dword v1, off, s[0:3], 0 offset:16
749; CI-NEXT:    s_waitcnt vmcnt(0)
750; CI-NEXT:    v_rcp_f32_e32 v2, v1
751; CI-NEXT:    v_mul_f32_e32 v2, v0, v2
752; CI-NEXT:    v_trunc_f32_e32 v2, v2
753; CI-NEXT:    v_fma_f32 v0, -v2, v1, v0
754; CI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
755; CI-NEXT:    s_endpgm
756;
757; VI-LABEL: unsafe_frem_f32:
758; VI:       ; %bb.0:
759; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
760; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
761; VI-NEXT:    s_waitcnt lgkmcnt(0)
762; VI-NEXT:    v_mov_b32_e32 v2, s6
763; VI-NEXT:    s_add_u32 s0, s0, 16
764; VI-NEXT:    v_mov_b32_e32 v3, s7
765; VI-NEXT:    s_addc_u32 s1, s1, 0
766; VI-NEXT:    flat_load_dword v4, v[2:3]
767; VI-NEXT:    v_mov_b32_e32 v3, s1
768; VI-NEXT:    v_mov_b32_e32 v2, s0
769; VI-NEXT:    flat_load_dword v2, v[2:3]
770; VI-NEXT:    v_mov_b32_e32 v0, s4
771; VI-NEXT:    v_mov_b32_e32 v1, s5
772; VI-NEXT:    s_waitcnt vmcnt(0)
773; VI-NEXT:    v_rcp_f32_e32 v3, v2
774; VI-NEXT:    v_mul_f32_e32 v3, v4, v3
775; VI-NEXT:    v_trunc_f32_e32 v3, v3
776; VI-NEXT:    v_fma_f32 v2, -v3, v2, v4
777; VI-NEXT:    flat_store_dword v[0:1], v2
778; VI-NEXT:    s_endpgm
779;
780; GFX9-LABEL: unsafe_frem_f32:
781; GFX9:       ; %bb.0:
782; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
783; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
784; GFX9-NEXT:    v_mov_b32_e32 v0, 0
785; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
786; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
787; GFX9-NEXT:    global_load_dword v2, v0, s[2:3] offset:16
788; GFX9-NEXT:    s_waitcnt vmcnt(0)
789; GFX9-NEXT:    v_rcp_f32_e32 v3, v2
790; GFX9-NEXT:    v_mul_f32_e32 v3, v1, v3
791; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
792; GFX9-NEXT:    v_fma_f32 v1, -v3, v2, v1
793; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
794; GFX9-NEXT:    s_endpgm
795;
796; GFX10-LABEL: unsafe_frem_f32:
797; GFX10:       ; %bb.0:
798; GFX10-NEXT:    s_clause 0x1
799; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
800; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
801; GFX10-NEXT:    v_mov_b32_e32 v0, 0
802; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
803; GFX10-NEXT:    s_clause 0x1
804; GFX10-NEXT:    global_load_dword v1, v0, s[6:7]
805; GFX10-NEXT:    global_load_dword v2, v0, s[2:3] offset:16
806; GFX10-NEXT:    s_waitcnt vmcnt(0)
807; GFX10-NEXT:    v_rcp_f32_e32 v3, v2
808; GFX10-NEXT:    v_mul_f32_e32 v3, v1, v3
809; GFX10-NEXT:    v_trunc_f32_e32 v3, v3
810; GFX10-NEXT:    v_fma_f32 v1, -v3, v2, v1
811; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
812; GFX10-NEXT:    s_endpgm
813                             float addrspace(1)* %in2) #1 {
; Attribute group #1 is defined outside the visible part of this file;
; presumably it enables unsafe FP math, as the function name suggests -- TODO confirm.
; The divisor is read from element 4 of %in2, which shows up in the checks
; above as a byte offset of 16 (offset:16, or an s_add_u32 by 16 on VI).
814   %gep2 = getelementptr float, float addrspace(1)* %in2, i32 4
815   %r0 = load float, float addrspace(1)* %in1, align 4
816   %r1 = load float, float addrspace(1)* %gep2, align 4
; Operation under test: frem carrying only the `afn` flag.
817   %r2 = frem afn float %r0, %r1
818   store float %r2, float addrspace(1)* %out, align 4
819   ret void
820}
821
; Test kernel for a plain (no fast-math flags) double-precision frem.
; It loads one double from %in1 and one from %in2 (both align 8), computes
; `frem double`, and stores the result to %out (align 8).
;
; What the autogenerated checks below expect, per check prefix:
;  - every target: a full division sequence (v_div_scale_f64, v_rcp_f64,
;    v_fma_f64 refinement steps, v_div_fmas_f64, v_div_fixup_f64) producing the
;    quotient, then a final v_fma_f64 of the negated truncated quotient, the
;    divisor and the dividend.
;  - CI, VI, GFX9 and GFX10 expect v_trunc_f64 between those two steps.
;  - the SI checks contain no v_trunc_f64; they expect an integer sequence
;    (v_bfe_u32, v_lshr_b64, v_not_b32/v_and_b32, v_cndmask_b32) in that
;    position -- presumably the expanded truncation; confirm against the backend.
;  - the SI checks also set vcc from v_cmp_eq_u32 + s_xor_b64 before
;    v_div_fmas_f64, while the other targets take vcc (vcc_lo on GFX10) directly
;    from the second v_div_scale_f64.
; Attribute group #0 is defined outside the visible part of this file.
822define amdgpu_kernel void @frem_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
823; SI-LABEL: frem_f64:
824; SI:       ; %bb.0:
825; SI-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x9
826; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
827; SI-NEXT:    s_mov_b32 s7, 0xf000
828; SI-NEXT:    s_mov_b32 s6, -1
829; SI-NEXT:    s_waitcnt lgkmcnt(0)
830; SI-NEXT:    s_mov_b32 s4, s8
831; SI-NEXT:    s_mov_b32 s5, s9
832; SI-NEXT:    s_mov_b32 s8, s10
833; SI-NEXT:    s_mov_b32 s9, s11
834; SI-NEXT:    s_mov_b32 s10, s6
835; SI-NEXT:    s_mov_b32 s11, s7
836; SI-NEXT:    s_mov_b32 s2, s6
837; SI-NEXT:    s_mov_b32 s3, s7
838; SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
839; SI-NEXT:    buffer_load_dwordx2 v[2:3], off, s[0:3], 0
840; SI-NEXT:    s_waitcnt vmcnt(0)
841; SI-NEXT:    v_div_scale_f64 v[4:5], s[0:1], v[2:3], v[2:3], v[0:1]
842; SI-NEXT:    v_rcp_f64_e32 v[6:7], v[4:5]
843; SI-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
844; SI-NEXT:    v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
845; SI-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
846; SI-NEXT:    v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
847; SI-NEXT:    v_div_scale_f64 v[8:9], s[0:1], v[0:1], v[2:3], v[0:1]
848; SI-NEXT:    v_mul_f64 v[10:11], v[8:9], v[6:7]
849; SI-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[10:11], v[8:9]
850; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
851; SI-NEXT:    v_cmp_eq_u32_e64 s[0:1], v1, v9
852; SI-NEXT:    s_xor_b64 vcc, s[0:1], vcc
853; SI-NEXT:    s_nop 1
854; SI-NEXT:    v_div_fmas_f64 v[4:5], v[12:13], v[6:7], v[10:11]
855; SI-NEXT:    v_div_fixup_f64 v[4:5], v[4:5], v[2:3], v[0:1]
856; SI-NEXT:    v_bfe_u32 v6, v5, 20, 11
857; SI-NEXT:    v_add_i32_e32 v8, vcc, 0xfffffc01, v6
858; SI-NEXT:    s_mov_b32 s1, 0xfffff
859; SI-NEXT:    s_mov_b32 s0, s6
860; SI-NEXT:    v_lshr_b64 v[6:7], s[0:1], v8
861; SI-NEXT:    v_not_b32_e32 v6, v6
862; SI-NEXT:    v_and_b32_e32 v6, v4, v6
863; SI-NEXT:    v_not_b32_e32 v7, v7
864; SI-NEXT:    v_and_b32_e32 v7, v5, v7
865; SI-NEXT:    v_and_b32_e32 v9, 0x80000000, v5
866; SI-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v8
867; SI-NEXT:    v_cndmask_b32_e32 v7, v7, v9, vcc
868; SI-NEXT:    v_cmp_lt_i32_e64 s[0:1], 51, v8
869; SI-NEXT:    v_cndmask_b32_e64 v5, v7, v5, s[0:1]
870; SI-NEXT:    v_cndmask_b32_e64 v6, v6, 0, vcc
871; SI-NEXT:    v_cndmask_b32_e64 v4, v6, v4, s[0:1]
872; SI-NEXT:    v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
873; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
874; SI-NEXT:    s_endpgm
875;
876; CI-LABEL: frem_f64:
877; CI:       ; %bb.0:
878; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
879; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
880; CI-NEXT:    s_mov_b32 s11, 0xf000
881; CI-NEXT:    s_mov_b32 s10, -1
882; CI-NEXT:    s_mov_b32 s2, s10
883; CI-NEXT:    s_waitcnt lgkmcnt(0)
884; CI-NEXT:    s_mov_b32 s8, s4
885; CI-NEXT:    s_mov_b32 s9, s5
886; CI-NEXT:    s_mov_b32 s4, s6
887; CI-NEXT:    s_mov_b32 s5, s7
888; CI-NEXT:    s_mov_b32 s6, s10
889; CI-NEXT:    s_mov_b32 s7, s11
890; CI-NEXT:    s_mov_b32 s3, s11
891; CI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
892; CI-NEXT:    buffer_load_dwordx2 v[2:3], off, s[0:3], 0
893; CI-NEXT:    s_waitcnt vmcnt(0)
894; CI-NEXT:    v_div_scale_f64 v[4:5], s[0:1], v[2:3], v[2:3], v[0:1]
895; CI-NEXT:    v_rcp_f64_e32 v[6:7], v[4:5]
896; CI-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
897; CI-NEXT:    v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
898; CI-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
899; CI-NEXT:    v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
900; CI-NEXT:    v_div_scale_f64 v[8:9], vcc, v[0:1], v[2:3], v[0:1]
901; CI-NEXT:    v_mul_f64 v[10:11], v[8:9], v[6:7]
902; CI-NEXT:    v_fma_f64 v[4:5], -v[4:5], v[10:11], v[8:9]
903; CI-NEXT:    s_nop 1
904; CI-NEXT:    v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[10:11]
905; CI-NEXT:    v_div_fixup_f64 v[4:5], v[4:5], v[2:3], v[0:1]
906; CI-NEXT:    v_trunc_f64_e32 v[4:5], v[4:5]
907; CI-NEXT:    v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
908; CI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
909; CI-NEXT:    s_endpgm
910;
911; VI-LABEL: frem_f64:
912; VI:       ; %bb.0:
913; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
914; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
915; VI-NEXT:    s_waitcnt lgkmcnt(0)
916; VI-NEXT:    v_mov_b32_e32 v2, s6
917; VI-NEXT:    v_mov_b32_e32 v3, s7
918; VI-NEXT:    v_mov_b32_e32 v4, s0
919; VI-NEXT:    v_mov_b32_e32 v5, s1
920; VI-NEXT:    flat_load_dwordx2 v[2:3], v[2:3]
921; VI-NEXT:    flat_load_dwordx2 v[4:5], v[4:5]
922; VI-NEXT:    v_mov_b32_e32 v0, s4
923; VI-NEXT:    v_mov_b32_e32 v1, s5
924; VI-NEXT:    s_waitcnt vmcnt(0)
925; VI-NEXT:    v_div_scale_f64 v[6:7], s[0:1], v[4:5], v[4:5], v[2:3]
926; VI-NEXT:    v_rcp_f64_e32 v[8:9], v[6:7]
927; VI-NEXT:    v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
928; VI-NEXT:    v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
929; VI-NEXT:    v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
930; VI-NEXT:    v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
931; VI-NEXT:    v_div_scale_f64 v[10:11], vcc, v[2:3], v[4:5], v[2:3]
932; VI-NEXT:    v_mul_f64 v[12:13], v[10:11], v[8:9]
933; VI-NEXT:    v_fma_f64 v[6:7], -v[6:7], v[12:13], v[10:11]
934; VI-NEXT:    s_nop 1
935; VI-NEXT:    v_div_fmas_f64 v[6:7], v[6:7], v[8:9], v[12:13]
936; VI-NEXT:    v_div_fixup_f64 v[6:7], v[6:7], v[4:5], v[2:3]
937; VI-NEXT:    v_trunc_f64_e32 v[6:7], v[6:7]
938; VI-NEXT:    v_fma_f64 v[2:3], -v[6:7], v[4:5], v[2:3]
939; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
940; VI-NEXT:    s_endpgm
941;
942; GFX9-LABEL: frem_f64:
943; GFX9:       ; %bb.0:
944; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
945; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
946; GFX9-NEXT:    v_mov_b32_e32 v12, 0
947; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
948; GFX9-NEXT:    global_load_dwordx2 v[0:1], v12, s[6:7]
949; GFX9-NEXT:    global_load_dwordx2 v[2:3], v12, s[2:3]
950; GFX9-NEXT:    s_waitcnt vmcnt(0)
951; GFX9-NEXT:    v_div_scale_f64 v[4:5], s[0:1], v[2:3], v[2:3], v[0:1]
952; GFX9-NEXT:    v_rcp_f64_e32 v[6:7], v[4:5]
953; GFX9-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
954; GFX9-NEXT:    v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
955; GFX9-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
956; GFX9-NEXT:    v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
957; GFX9-NEXT:    v_div_scale_f64 v[8:9], vcc, v[0:1], v[2:3], v[0:1]
958; GFX9-NEXT:    v_mul_f64 v[10:11], v[8:9], v[6:7]
959; GFX9-NEXT:    v_fma_f64 v[4:5], -v[4:5], v[10:11], v[8:9]
960; GFX9-NEXT:    s_nop 1
961; GFX9-NEXT:    v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[10:11]
962; GFX9-NEXT:    v_div_fixup_f64 v[4:5], v[4:5], v[2:3], v[0:1]
963; GFX9-NEXT:    v_trunc_f64_e32 v[4:5], v[4:5]
964; GFX9-NEXT:    v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
965; GFX9-NEXT:    global_store_dwordx2 v12, v[0:1], s[4:5]
966; GFX9-NEXT:    s_endpgm
967;
968; GFX10-LABEL: frem_f64:
969; GFX10:       ; %bb.0:
970; GFX10-NEXT:    s_clause 0x1
971; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
972; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
973; GFX10-NEXT:    v_mov_b32_e32 v12, 0
974; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
975; GFX10-NEXT:    s_clause 0x1
976; GFX10-NEXT:    global_load_dwordx2 v[0:1], v12, s[6:7]
977; GFX10-NEXT:    global_load_dwordx2 v[2:3], v12, s[2:3]
978; GFX10-NEXT:    s_waitcnt vmcnt(0)
979; GFX10-NEXT:    v_div_scale_f64 v[4:5], s0, v[2:3], v[2:3], v[0:1]
980; GFX10-NEXT:    v_rcp_f64_e32 v[6:7], v[4:5]
981; GFX10-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
982; GFX10-NEXT:    v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
983; GFX10-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
984; GFX10-NEXT:    v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
985; GFX10-NEXT:    v_div_scale_f64 v[8:9], vcc_lo, v[0:1], v[2:3], v[0:1]
986; GFX10-NEXT:    v_mul_f64 v[10:11], v[8:9], v[6:7]
987; GFX10-NEXT:    v_fma_f64 v[4:5], -v[4:5], v[10:11], v[8:9]
988; GFX10-NEXT:    v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[10:11]
989; GFX10-NEXT:    v_div_fixup_f64 v[4:5], v[4:5], v[2:3], v[0:1]
990; GFX10-NEXT:    v_trunc_f64_e32 v[4:5], v[4:5]
991; GFX10-NEXT:    v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
992; GFX10-NEXT:    global_store_dwordx2 v12, v[0:1], s[4:5]
993; GFX10-NEXT:    s_endpgm
994                      double addrspace(1)* %in2) #0 {
995   %r0 = load double, double addrspace(1)* %in1, align 8
996   %r1 = load double, double addrspace(1)* %in2, align 8
997   %r2 = frem double %r0, %r1
998   store double %r2, double addrspace(1)* %out, align 8
999   ret void
1000}
1001
; Test kernel for a double-precision frem carrying the `fast` flag.
; It loads one double from %in1 and one from %in2 (both align 8), computes
; `frem fast double`, and stores the result to %out (align 8).
;
; What the autogenerated checks below expect, per check prefix:
;  - no target expects v_div_scale_f64, v_div_fmas_f64 or v_div_fixup_f64.
;    The quotient is built from v_rcp_f64 of the divisor, two refinement steps
;    (each a pair of v_fma_f64), a v_mul_f64 with the dividend, and one more
;    v_fma_f64 pair applied to that product.
;  - CI, VI, GFX9 and GFX10 then expect v_trunc_f64 on the quotient; the SI
;    checks expect an integer sequence (v_bfe_u32, v_lshr_b64,
;    v_not_b32/v_and_b32, v_cndmask_b32) instead -- presumably the expanded
;    truncation; confirm against the backend.
;  - every target ends with a v_fma_f64 of the negated truncated quotient, the
;    divisor and the dividend, followed by the store.
; Attribute group #0 is defined outside the visible part of this file.
1002define amdgpu_kernel void @fast_frem_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
1003; SI-LABEL: fast_frem_f64:
1004; SI:       ; %bb.0:
1005; SI-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x9
1006; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
1007; SI-NEXT:    s_mov_b32 s7, 0xf000
1008; SI-NEXT:    s_mov_b32 s6, -1
1009; SI-NEXT:    s_waitcnt lgkmcnt(0)
1010; SI-NEXT:    s_mov_b32 s4, s8
1011; SI-NEXT:    s_mov_b32 s5, s9
1012; SI-NEXT:    s_mov_b32 s8, s10
1013; SI-NEXT:    s_mov_b32 s9, s11
1014; SI-NEXT:    s_mov_b32 s10, s6
1015; SI-NEXT:    s_mov_b32 s11, s7
1016; SI-NEXT:    s_mov_b32 s2, s6
1017; SI-NEXT:    s_mov_b32 s3, s7
1018; SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
1019; SI-NEXT:    buffer_load_dwordx2 v[2:3], off, s[0:3], 0
1020; SI-NEXT:    s_waitcnt vmcnt(0)
1021; SI-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
1022; SI-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1023; SI-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
1024; SI-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1025; SI-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
1026; SI-NEXT:    v_mul_f64 v[6:7], v[0:1], v[4:5]
1027; SI-NEXT:    v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1]
1028; SI-NEXT:    v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
1029; SI-NEXT:    v_bfe_u32 v6, v5, 20, 11
1030; SI-NEXT:    v_add_i32_e32 v8, vcc, 0xfffffc01, v6
1031; SI-NEXT:    s_mov_b32 s1, 0xfffff
1032; SI-NEXT:    s_mov_b32 s0, s6
1033; SI-NEXT:    v_lshr_b64 v[6:7], s[0:1], v8
1034; SI-NEXT:    v_not_b32_e32 v6, v6
1035; SI-NEXT:    v_and_b32_e32 v6, v4, v6
1036; SI-NEXT:    v_not_b32_e32 v7, v7
1037; SI-NEXT:    v_and_b32_e32 v7, v5, v7
1038; SI-NEXT:    v_and_b32_e32 v9, 0x80000000, v5
1039; SI-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v8
1040; SI-NEXT:    v_cndmask_b32_e32 v7, v7, v9, vcc
1041; SI-NEXT:    v_cmp_lt_i32_e64 s[0:1], 51, v8
1042; SI-NEXT:    v_cndmask_b32_e64 v5, v7, v5, s[0:1]
1043; SI-NEXT:    v_cndmask_b32_e64 v6, v6, 0, vcc
1044; SI-NEXT:    v_cndmask_b32_e64 v4, v6, v4, s[0:1]
1045; SI-NEXT:    v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
1046; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1047; SI-NEXT:    s_endpgm
1048;
1049; CI-LABEL: fast_frem_f64:
1050; CI:       ; %bb.0:
1051; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
1052; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
1053; CI-NEXT:    s_mov_b32 s11, 0xf000
1054; CI-NEXT:    s_mov_b32 s10, -1
1055; CI-NEXT:    s_mov_b32 s2, s10
1056; CI-NEXT:    s_waitcnt lgkmcnt(0)
1057; CI-NEXT:    s_mov_b32 s8, s4
1058; CI-NEXT:    s_mov_b32 s9, s5
1059; CI-NEXT:    s_mov_b32 s4, s6
1060; CI-NEXT:    s_mov_b32 s5, s7
1061; CI-NEXT:    s_mov_b32 s6, s10
1062; CI-NEXT:    s_mov_b32 s7, s11
1063; CI-NEXT:    s_mov_b32 s3, s11
1064; CI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
1065; CI-NEXT:    buffer_load_dwordx2 v[2:3], off, s[0:3], 0
1066; CI-NEXT:    s_waitcnt vmcnt(0)
1067; CI-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
1068; CI-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1069; CI-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
1070; CI-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1071; CI-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
1072; CI-NEXT:    v_mul_f64 v[6:7], v[0:1], v[4:5]
1073; CI-NEXT:    v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1]
1074; CI-NEXT:    v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
1075; CI-NEXT:    v_trunc_f64_e32 v[4:5], v[4:5]
1076; CI-NEXT:    v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
1077; CI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
1078; CI-NEXT:    s_endpgm
1079;
1080; VI-LABEL: fast_frem_f64:
1081; VI:       ; %bb.0:
1082; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1083; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1084; VI-NEXT:    s_waitcnt lgkmcnt(0)
1085; VI-NEXT:    v_mov_b32_e32 v2, s6
1086; VI-NEXT:    v_mov_b32_e32 v3, s7
1087; VI-NEXT:    v_mov_b32_e32 v4, s0
1088; VI-NEXT:    v_mov_b32_e32 v5, s1
1089; VI-NEXT:    flat_load_dwordx2 v[2:3], v[2:3]
1090; VI-NEXT:    flat_load_dwordx2 v[4:5], v[4:5]
1091; VI-NEXT:    v_mov_b32_e32 v0, s4
1092; VI-NEXT:    v_mov_b32_e32 v1, s5
1093; VI-NEXT:    s_waitcnt vmcnt(0)
1094; VI-NEXT:    v_rcp_f64_e32 v[6:7], v[4:5]
1095; VI-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
1096; VI-NEXT:    v_fma_f64 v[6:7], v[8:9], v[6:7], v[6:7]
1097; VI-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
1098; VI-NEXT:    v_fma_f64 v[6:7], v[8:9], v[6:7], v[6:7]
1099; VI-NEXT:    v_mul_f64 v[8:9], v[2:3], v[6:7]
1100; VI-NEXT:    v_fma_f64 v[10:11], -v[4:5], v[8:9], v[2:3]
1101; VI-NEXT:    v_fma_f64 v[6:7], v[10:11], v[6:7], v[8:9]
1102; VI-NEXT:    v_trunc_f64_e32 v[6:7], v[6:7]
1103; VI-NEXT:    v_fma_f64 v[2:3], -v[6:7], v[4:5], v[2:3]
1104; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
1105; VI-NEXT:    s_endpgm
1106;
1107; GFX9-LABEL: fast_frem_f64:
1108; GFX9:       ; %bb.0:
1109; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1110; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
1111; GFX9-NEXT:    v_mov_b32_e32 v10, 0
1112; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1113; GFX9-NEXT:    global_load_dwordx2 v[0:1], v10, s[6:7]
1114; GFX9-NEXT:    global_load_dwordx2 v[2:3], v10, s[2:3]
1115; GFX9-NEXT:    s_waitcnt vmcnt(0)
1116; GFX9-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
1117; GFX9-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1118; GFX9-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
1119; GFX9-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1120; GFX9-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
1121; GFX9-NEXT:    v_mul_f64 v[6:7], v[0:1], v[4:5]
1122; GFX9-NEXT:    v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1]
1123; GFX9-NEXT:    v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
1124; GFX9-NEXT:    v_trunc_f64_e32 v[4:5], v[4:5]
1125; GFX9-NEXT:    v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
1126; GFX9-NEXT:    global_store_dwordx2 v10, v[0:1], s[4:5]
1127; GFX9-NEXT:    s_endpgm
1128;
1129; GFX10-LABEL: fast_frem_f64:
1130; GFX10:       ; %bb.0:
1131; GFX10-NEXT:    s_clause 0x1
1132; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1133; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
1134; GFX10-NEXT:    v_mov_b32_e32 v10, 0
1135; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1136; GFX10-NEXT:    s_clause 0x1
1137; GFX10-NEXT:    global_load_dwordx2 v[0:1], v10, s[6:7]
1138; GFX10-NEXT:    global_load_dwordx2 v[2:3], v10, s[2:3]
1139; GFX10-NEXT:    s_waitcnt vmcnt(0)
1140; GFX10-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
1141; GFX10-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1142; GFX10-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
1143; GFX10-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1144; GFX10-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
1145; GFX10-NEXT:    v_mul_f64 v[6:7], v[0:1], v[4:5]
1146; GFX10-NEXT:    v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1]
1147; GFX10-NEXT:    v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
1148; GFX10-NEXT:    v_trunc_f64_e32 v[4:5], v[4:5]
1149; GFX10-NEXT:    v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
1150; GFX10-NEXT:    global_store_dwordx2 v10, v[0:1], s[4:5]
1151; GFX10-NEXT:    s_endpgm
1152                      double addrspace(1)* %in2) #0 {
1153   %r0 = load double, double addrspace(1)* %in1, align 8
1154   %r1 = load double, double addrspace(1)* %in2, align 8
1155   %r2 = frem fast double %r0, %r1
1156   store double %r2, double addrspace(1)* %out, align 8
1157   ret void
1158}
1159
; Test kernel for a double-precision frem carrying only the `afn` flag.
; It loads one double from %in1 and one from %in2 (both align 8), computes
; `frem afn double`, and stores the result to %out (align 8).
;
; What the autogenerated checks below expect, per check prefix:
;  - no target expects v_div_scale_f64, v_div_fmas_f64 or v_div_fixup_f64.
;    The quotient is built from v_rcp_f64 of the divisor, two refinement steps
;    (each a pair of v_fma_f64), a v_mul_f64 with the dividend, and one more
;    v_fma_f64 pair applied to that product.
;  - CI, VI, GFX9 and GFX10 then expect v_trunc_f64 on the quotient; the SI
;    checks expect an integer sequence (v_bfe_u32, v_lshr_b64,
;    v_not_b32/v_and_b32, v_cndmask_b32) instead -- presumably the expanded
;    truncation; confirm against the backend.
;  - every target ends with a v_fma_f64 of the negated truncated quotient, the
;    divisor and the dividend, followed by the store.
; This kernel uses attribute group #1; its definition is outside the visible
; part of this file, so what it enables is not documented here.
1160define amdgpu_kernel void @unsafe_frem_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
1161; SI-LABEL: unsafe_frem_f64:
1162; SI:       ; %bb.0:
1163; SI-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x9
1164; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
1165; SI-NEXT:    s_mov_b32 s7, 0xf000
1166; SI-NEXT:    s_mov_b32 s6, -1
1167; SI-NEXT:    s_waitcnt lgkmcnt(0)
1168; SI-NEXT:    s_mov_b32 s4, s8
1169; SI-NEXT:    s_mov_b32 s5, s9
1170; SI-NEXT:    s_mov_b32 s8, s10
1171; SI-NEXT:    s_mov_b32 s9, s11
1172; SI-NEXT:    s_mov_b32 s10, s6
1173; SI-NEXT:    s_mov_b32 s11, s7
1174; SI-NEXT:    s_mov_b32 s2, s6
1175; SI-NEXT:    s_mov_b32 s3, s7
1176; SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
1177; SI-NEXT:    buffer_load_dwordx2 v[2:3], off, s[0:3], 0
1178; SI-NEXT:    s_waitcnt vmcnt(0)
1179; SI-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
1180; SI-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1181; SI-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
1182; SI-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1183; SI-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
1184; SI-NEXT:    v_mul_f64 v[6:7], v[0:1], v[4:5]
1185; SI-NEXT:    v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1]
1186; SI-NEXT:    v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
1187; SI-NEXT:    v_bfe_u32 v6, v5, 20, 11
1188; SI-NEXT:    v_add_i32_e32 v8, vcc, 0xfffffc01, v6
1189; SI-NEXT:    s_mov_b32 s1, 0xfffff
1190; SI-NEXT:    s_mov_b32 s0, s6
1191; SI-NEXT:    v_lshr_b64 v[6:7], s[0:1], v8
1192; SI-NEXT:    v_not_b32_e32 v6, v6
1193; SI-NEXT:    v_and_b32_e32 v6, v4, v6
1194; SI-NEXT:    v_not_b32_e32 v7, v7
1195; SI-NEXT:    v_and_b32_e32 v7, v5, v7
1196; SI-NEXT:    v_and_b32_e32 v9, 0x80000000, v5
1197; SI-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v8
1198; SI-NEXT:    v_cndmask_b32_e32 v7, v7, v9, vcc
1199; SI-NEXT:    v_cmp_lt_i32_e64 s[0:1], 51, v8
1200; SI-NEXT:    v_cndmask_b32_e64 v5, v7, v5, s[0:1]
1201; SI-NEXT:    v_cndmask_b32_e64 v6, v6, 0, vcc
1202; SI-NEXT:    v_cndmask_b32_e64 v4, v6, v4, s[0:1]
1203; SI-NEXT:    v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
1204; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1205; SI-NEXT:    s_endpgm
1206;
1207; CI-LABEL: unsafe_frem_f64:
1208; CI:       ; %bb.0:
1209; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
1210; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
1211; CI-NEXT:    s_mov_b32 s11, 0xf000
1212; CI-NEXT:    s_mov_b32 s10, -1
1213; CI-NEXT:    s_mov_b32 s2, s10
1214; CI-NEXT:    s_waitcnt lgkmcnt(0)
1215; CI-NEXT:    s_mov_b32 s8, s4
1216; CI-NEXT:    s_mov_b32 s9, s5
1217; CI-NEXT:    s_mov_b32 s4, s6
1218; CI-NEXT:    s_mov_b32 s5, s7
1219; CI-NEXT:    s_mov_b32 s6, s10
1220; CI-NEXT:    s_mov_b32 s7, s11
1221; CI-NEXT:    s_mov_b32 s3, s11
1222; CI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
1223; CI-NEXT:    buffer_load_dwordx2 v[2:3], off, s[0:3], 0
1224; CI-NEXT:    s_waitcnt vmcnt(0)
1225; CI-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
1226; CI-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1227; CI-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
1228; CI-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1229; CI-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
1230; CI-NEXT:    v_mul_f64 v[6:7], v[0:1], v[4:5]
1231; CI-NEXT:    v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1]
1232; CI-NEXT:    v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
1233; CI-NEXT:    v_trunc_f64_e32 v[4:5], v[4:5]
1234; CI-NEXT:    v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
1235; CI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
1236; CI-NEXT:    s_endpgm
1237;
1238; VI-LABEL: unsafe_frem_f64:
1239; VI:       ; %bb.0:
1240; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1241; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1242; VI-NEXT:    s_waitcnt lgkmcnt(0)
1243; VI-NEXT:    v_mov_b32_e32 v2, s6
1244; VI-NEXT:    v_mov_b32_e32 v3, s7
1245; VI-NEXT:    v_mov_b32_e32 v4, s0
1246; VI-NEXT:    v_mov_b32_e32 v5, s1
1247; VI-NEXT:    flat_load_dwordx2 v[2:3], v[2:3]
1248; VI-NEXT:    flat_load_dwordx2 v[4:5], v[4:5]
1249; VI-NEXT:    v_mov_b32_e32 v0, s4
1250; VI-NEXT:    v_mov_b32_e32 v1, s5
1251; VI-NEXT:    s_waitcnt vmcnt(0)
1252; VI-NEXT:    v_rcp_f64_e32 v[6:7], v[4:5]
1253; VI-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
1254; VI-NEXT:    v_fma_f64 v[6:7], v[8:9], v[6:7], v[6:7]
1255; VI-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
1256; VI-NEXT:    v_fma_f64 v[6:7], v[8:9], v[6:7], v[6:7]
1257; VI-NEXT:    v_mul_f64 v[8:9], v[2:3], v[6:7]
1258; VI-NEXT:    v_fma_f64 v[10:11], -v[4:5], v[8:9], v[2:3]
1259; VI-NEXT:    v_fma_f64 v[6:7], v[10:11], v[6:7], v[8:9]
1260; VI-NEXT:    v_trunc_f64_e32 v[6:7], v[6:7]
1261; VI-NEXT:    v_fma_f64 v[2:3], -v[6:7], v[4:5], v[2:3]
1262; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
1263; VI-NEXT:    s_endpgm
1264;
1265; GFX9-LABEL: unsafe_frem_f64:
1266; GFX9:       ; %bb.0:
1267; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1268; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
1269; GFX9-NEXT:    v_mov_b32_e32 v10, 0
1270; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1271; GFX9-NEXT:    global_load_dwordx2 v[0:1], v10, s[6:7]
1272; GFX9-NEXT:    global_load_dwordx2 v[2:3], v10, s[2:3]
1273; GFX9-NEXT:    s_waitcnt vmcnt(0)
1274; GFX9-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
1275; GFX9-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1276; GFX9-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
1277; GFX9-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1278; GFX9-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
1279; GFX9-NEXT:    v_mul_f64 v[6:7], v[0:1], v[4:5]
1280; GFX9-NEXT:    v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1]
1281; GFX9-NEXT:    v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
1282; GFX9-NEXT:    v_trunc_f64_e32 v[4:5], v[4:5]
1283; GFX9-NEXT:    v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
1284; GFX9-NEXT:    global_store_dwordx2 v10, v[0:1], s[4:5]
1285; GFX9-NEXT:    s_endpgm
1286;
1287; GFX10-LABEL: unsafe_frem_f64:
1288; GFX10:       ; %bb.0:
1289; GFX10-NEXT:    s_clause 0x1
1290; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1291; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
1292; GFX10-NEXT:    v_mov_b32_e32 v10, 0
1293; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1294; GFX10-NEXT:    s_clause 0x1
1295; GFX10-NEXT:    global_load_dwordx2 v[0:1], v10, s[6:7]
1296; GFX10-NEXT:    global_load_dwordx2 v[2:3], v10, s[2:3]
1297; GFX10-NEXT:    s_waitcnt vmcnt(0)
1298; GFX10-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
1299; GFX10-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1300; GFX10-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
1301; GFX10-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1302; GFX10-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
1303; GFX10-NEXT:    v_mul_f64 v[6:7], v[0:1], v[4:5]
1304; GFX10-NEXT:    v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1]
1305; GFX10-NEXT:    v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
1306; GFX10-NEXT:    v_trunc_f64_e32 v[4:5], v[4:5]
1307; GFX10-NEXT:    v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
1308; GFX10-NEXT:    global_store_dwordx2 v10, v[0:1], s[4:5]
1309; GFX10-NEXT:    s_endpgm
1310                             double addrspace(1)* %in2) #1 {
1311   %r0 = load double, double addrspace(1)* %in1, align 8
1312   %r1 = load double, double addrspace(1)* %in2, align 8
1313   %r2 = frem afn double %r0, %r1
1314   store double %r2, double addrspace(1)* %out, align 8
1315   ret void
1316}
1317
1318define amdgpu_kernel void @frem_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in1,
1319; SI-LABEL: frem_v2f16:
1320; SI:       ; %bb.0:
1321; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
1322; SI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xd
1323; SI-NEXT:    s_mov_b32 s3, 0xf000
1324; SI-NEXT:    s_mov_b32 s2, -1
1325; SI-NEXT:    s_waitcnt lgkmcnt(0)
1326; SI-NEXT:    s_mov_b32 s0, s4
1327; SI-NEXT:    s_mov_b32 s1, s5
1328; SI-NEXT:    s_mov_b32 s4, s6
1329; SI-NEXT:    s_mov_b32 s5, s7
1330; SI-NEXT:    s_mov_b32 s6, s2
1331; SI-NEXT:    s_mov_b32 s7, s3
1332; SI-NEXT:    s_mov_b32 s10, s2
1333; SI-NEXT:    s_mov_b32 s11, s3
1334; SI-NEXT:    buffer_load_dword v0, off, s[4:7], 0
1335; SI-NEXT:    s_waitcnt vmcnt(0)
1336; SI-NEXT:    v_cvt_f32_f16_e32 v1, v0
1337; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
1338; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
1339; SI-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:16
1340; SI-NEXT:    s_waitcnt vmcnt(0)
1341; SI-NEXT:    v_cvt_f32_f16_e32 v3, v2
1342; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
1343; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
1344; SI-NEXT:    v_div_scale_f32 v4, vcc, v0, v2, v0
1345; SI-NEXT:    v_div_scale_f32 v5, s[4:5], v2, v2, v0
1346; SI-NEXT:    v_rcp_f32_e32 v6, v5
1347; SI-NEXT:    s_mov_b32 s6, 3
1348; SI-NEXT:    s_mov_b32 s7, 0
1349; SI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6
1350; SI-NEXT:    v_fma_f32 v7, -v5, v6, 1.0
1351; SI-NEXT:    v_fma_f32 v6, v7, v6, v6
1352; SI-NEXT:    v_mul_f32_e32 v7, v4, v6
1353; SI-NEXT:    v_fma_f32 v8, -v5, v7, v4
1354; SI-NEXT:    v_fma_f32 v7, v8, v6, v7
1355; SI-NEXT:    v_fma_f32 v4, -v5, v7, v4
1356; SI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7
1357; SI-NEXT:    v_div_fmas_f32 v4, v4, v6, v7
1358; SI-NEXT:    v_div_fixup_f32 v4, v4, v2, v0
1359; SI-NEXT:    v_trunc_f32_e32 v4, v4
1360; SI-NEXT:    v_fma_f32 v0, -v4, v2, v0
1361; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
1362; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
1363; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
1364; SI-NEXT:    v_div_scale_f32 v2, vcc, v1, v3, v1
1365; SI-NEXT:    v_div_scale_f32 v4, s[4:5], v3, v3, v1
1366; SI-NEXT:    v_rcp_f32_e32 v5, v4
1367; SI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6
1368; SI-NEXT:    v_fma_f32 v6, -v4, v5, 1.0
1369; SI-NEXT:    v_fma_f32 v5, v6, v5, v5
1370; SI-NEXT:    v_mul_f32_e32 v6, v2, v5
1371; SI-NEXT:    v_fma_f32 v7, -v4, v6, v2
1372; SI-NEXT:    v_fma_f32 v6, v7, v5, v6
1373; SI-NEXT:    v_fma_f32 v2, -v4, v6, v2
1374; SI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7
1375; SI-NEXT:    v_div_fmas_f32 v2, v2, v5, v6
1376; SI-NEXT:    v_div_fixup_f32 v2, v2, v3, v1
1377; SI-NEXT:    v_trunc_f32_e32 v2, v2
1378; SI-NEXT:    v_fma_f32 v1, -v2, v3, v1
1379; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
1380; SI-NEXT:    v_or_b32_e32 v0, v1, v0
1381; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1382; SI-NEXT:    s_endpgm
1383;
1384; CI-LABEL: frem_v2f16:
1385; CI:       ; %bb.0:
1386; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
1387; CI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xd
1388; CI-NEXT:    s_mov_b32 s3, 0xf000
1389; CI-NEXT:    s_mov_b32 s2, -1
1390; CI-NEXT:    s_mov_b32 s10, s2
1391; CI-NEXT:    s_waitcnt lgkmcnt(0)
1392; CI-NEXT:    s_mov_b32 s0, s4
1393; CI-NEXT:    s_mov_b32 s1, s5
1394; CI-NEXT:    s_mov_b32 s4, s6
1395; CI-NEXT:    s_mov_b32 s5, s7
1396; CI-NEXT:    s_mov_b32 s6, s2
1397; CI-NEXT:    s_mov_b32 s7, s3
1398; CI-NEXT:    s_mov_b32 s11, s3
1399; CI-NEXT:    buffer_load_dword v0, off, s[4:7], 0
1400; CI-NEXT:    buffer_load_dword v2, off, s[8:11], 0 offset:16
1401; CI-NEXT:    s_mov_b32 s6, 3
1402; CI-NEXT:    s_mov_b32 s7, 0
1403; CI-NEXT:    s_waitcnt vmcnt(1)
1404; CI-NEXT:    v_cvt_f32_f16_e32 v1, v0
1405; CI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
1406; CI-NEXT:    s_waitcnt vmcnt(0)
1407; CI-NEXT:    v_cvt_f32_f16_e32 v3, v2
1408; CI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
1409; CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
1410; CI-NEXT:    v_cvt_f32_f16_e32 v2, v2
1411; CI-NEXT:    v_div_scale_f32 v5, s[4:5], v2, v2, v0
1412; CI-NEXT:    v_div_scale_f32 v4, vcc, v0, v2, v0
1413; CI-NEXT:    v_rcp_f32_e32 v6, v5
1414; CI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6
1415; CI-NEXT:    v_fma_f32 v7, -v5, v6, 1.0
1416; CI-NEXT:    v_fma_f32 v6, v7, v6, v6
1417; CI-NEXT:    v_mul_f32_e32 v7, v4, v6
1418; CI-NEXT:    v_fma_f32 v8, -v5, v7, v4
1419; CI-NEXT:    v_fma_f32 v7, v8, v6, v7
1420; CI-NEXT:    v_fma_f32 v4, -v5, v7, v4
1421; CI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7
1422; CI-NEXT:    v_div_fmas_f32 v4, v4, v6, v7
1423; CI-NEXT:    v_div_fixup_f32 v4, v4, v2, v0
1424; CI-NEXT:    v_trunc_f32_e32 v4, v4
1425; CI-NEXT:    v_fma_f32 v0, -v4, v2, v0
1426; CI-NEXT:    v_div_scale_f32 v4, s[4:5], v3, v3, v1
1427; CI-NEXT:    v_div_scale_f32 v2, vcc, v1, v3, v1
1428; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
1429; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
1430; CI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
1431; CI-NEXT:    v_rcp_f32_e32 v5, v4
1432; CI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6
1433; CI-NEXT:    v_fma_f32 v6, -v4, v5, 1.0
1434; CI-NEXT:    v_fma_f32 v5, v6, v5, v5
1435; CI-NEXT:    v_mul_f32_e32 v6, v2, v5
1436; CI-NEXT:    v_fma_f32 v7, -v4, v6, v2
1437; CI-NEXT:    v_fma_f32 v6, v7, v5, v6
1438; CI-NEXT:    v_fma_f32 v2, -v4, v6, v2
1439; CI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7
1440; CI-NEXT:    v_div_fmas_f32 v2, v2, v5, v6
1441; CI-NEXT:    v_div_fixup_f32 v2, v2, v3, v1
1442; CI-NEXT:    v_trunc_f32_e32 v2, v2
1443; CI-NEXT:    v_fma_f32 v1, -v2, v3, v1
1444; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
1445; CI-NEXT:    v_or_b32_e32 v0, v1, v0
1446; CI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1447; CI-NEXT:    s_endpgm
1448;
1449; VI-LABEL: frem_v2f16:
1450; VI:       ; %bb.0:
1451; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1452; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1453; VI-NEXT:    s_waitcnt lgkmcnt(0)
1454; VI-NEXT:    v_mov_b32_e32 v2, s6
1455; VI-NEXT:    s_add_u32 s0, s0, 16
1456; VI-NEXT:    v_mov_b32_e32 v3, s7
1457; VI-NEXT:    s_addc_u32 s1, s1, 0
1458; VI-NEXT:    flat_load_dword v4, v[2:3]
1459; VI-NEXT:    v_mov_b32_e32 v3, s1
1460; VI-NEXT:    v_mov_b32_e32 v2, s0
1461; VI-NEXT:    flat_load_dword v2, v[2:3]
1462; VI-NEXT:    v_mov_b32_e32 v0, s4
1463; VI-NEXT:    v_mov_b32_e32 v1, s5
1464; VI-NEXT:    s_waitcnt vmcnt(1)
1465; VI-NEXT:    v_lshrrev_b32_e32 v3, 16, v4
1466; VI-NEXT:    v_cvt_f32_f16_e32 v5, v3
1467; VI-NEXT:    s_waitcnt vmcnt(0)
1468; VI-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
1469; VI-NEXT:    v_cvt_f32_f16_e32 v7, v6
1470; VI-NEXT:    v_rcp_f32_e32 v7, v7
1471; VI-NEXT:    v_mul_f32_e32 v5, v5, v7
1472; VI-NEXT:    v_cvt_f16_f32_e32 v5, v5
1473; VI-NEXT:    v_div_fixup_f16 v5, v5, v6, v3
1474; VI-NEXT:    v_trunc_f16_e32 v5, v5
1475; VI-NEXT:    v_fma_f16 v3, -v5, v6, v3
1476; VI-NEXT:    v_cvt_f32_f16_e32 v6, v2
1477; VI-NEXT:    v_cvt_f32_f16_e32 v5, v4
1478; VI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
1479; VI-NEXT:    v_rcp_f32_e32 v6, v6
1480; VI-NEXT:    v_mul_f32_e32 v5, v5, v6
1481; VI-NEXT:    v_cvt_f16_f32_e32 v5, v5
1482; VI-NEXT:    v_div_fixup_f16 v5, v5, v2, v4
1483; VI-NEXT:    v_trunc_f16_e32 v5, v5
1484; VI-NEXT:    v_fma_f16 v2, -v5, v2, v4
1485; VI-NEXT:    v_or_b32_e32 v2, v2, v3
1486; VI-NEXT:    flat_store_dword v[0:1], v2
1487; VI-NEXT:    s_endpgm
1488;
1489; GFX9-LABEL: frem_v2f16:
1490; GFX9:       ; %bb.0:
1491; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1492; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
1493; GFX9-NEXT:    v_mov_b32_e32 v0, 0
1494; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1495; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
1496; GFX9-NEXT:    global_load_dword v2, v0, s[2:3] offset:16
1497; GFX9-NEXT:    s_waitcnt vmcnt(1)
1498; GFX9-NEXT:    v_cvt_f32_f16_e32 v3, v1
1499; GFX9-NEXT:    s_waitcnt vmcnt(0)
1500; GFX9-NEXT:    v_cvt_f32_f16_e32 v4, v2
1501; GFX9-NEXT:    v_rcp_f32_e32 v4, v4
1502; GFX9-NEXT:    v_mul_f32_e32 v3, v3, v4
1503; GFX9-NEXT:    v_cvt_f16_f32_e32 v3, v3
1504; GFX9-NEXT:    v_div_fixup_f16 v3, v3, v2, v1
1505; GFX9-NEXT:    v_trunc_f16_e32 v3, v3
1506; GFX9-NEXT:    v_fma_f16 v3, -v3, v2, v1
1507; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
1508; GFX9-NEXT:    v_cvt_f32_f16_e32 v5, v2
1509; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
1510; GFX9-NEXT:    v_cvt_f32_f16_e32 v4, v1
1511; GFX9-NEXT:    v_rcp_f32_e32 v5, v5
1512; GFX9-NEXT:    v_mul_f32_e32 v4, v4, v5
1513; GFX9-NEXT:    v_cvt_f16_f32_e32 v4, v4
1514; GFX9-NEXT:    v_div_fixup_f16 v4, v4, v2, v1
1515; GFX9-NEXT:    v_trunc_f16_e32 v4, v4
1516; GFX9-NEXT:    v_fma_f16 v1, -v4, v2, v1
1517; GFX9-NEXT:    v_pack_b32_f16 v1, v3, v1
1518; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
1519; GFX9-NEXT:    s_endpgm
1520;
1521; GFX10-LABEL: frem_v2f16:
1522; GFX10:       ; %bb.0:
1523; GFX10-NEXT:    s_clause 0x1
1524; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1525; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
1526; GFX10-NEXT:    v_mov_b32_e32 v0, 0
1527; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1528; GFX10-NEXT:    s_clause 0x1
1529; GFX10-NEXT:    global_load_dword v1, v0, s[6:7]
1530; GFX10-NEXT:    global_load_dword v2, v0, s[2:3] offset:16
1531; GFX10-NEXT:    s_waitcnt vmcnt(1)
1532; GFX10-NEXT:    v_cvt_f32_f16_e32 v3, v1
1533; GFX10-NEXT:    s_waitcnt vmcnt(0)
1534; GFX10-NEXT:    v_cvt_f32_f16_e32 v4, v2
1535; GFX10-NEXT:    v_rcp_f32_e32 v4, v4
1536; GFX10-NEXT:    v_mul_f32_e32 v3, v3, v4
1537; GFX10-NEXT:    v_cvt_f16_f32_e32 v3, v3
1538; GFX10-NEXT:    v_div_fixup_f16 v3, v3, v2, v1
1539; GFX10-NEXT:    v_trunc_f16_e32 v3, v3
1540; GFX10-NEXT:    v_fma_f16 v3, -v3, v2, v1
1541; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
1542; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
1543; GFX10-NEXT:    v_cvt_f32_f16_e32 v5, v2
1544; GFX10-NEXT:    v_cvt_f32_f16_e32 v4, v1
1545; GFX10-NEXT:    v_rcp_f32_e32 v5, v5
1546; GFX10-NEXT:    v_mul_f32_e32 v4, v4, v5
1547; GFX10-NEXT:    v_cvt_f16_f32_e32 v4, v4
1548; GFX10-NEXT:    v_div_fixup_f16 v4, v4, v2, v1
1549; GFX10-NEXT:    v_trunc_f16_e32 v4, v4
1550; GFX10-NEXT:    v_fma_f16 v1, -v4, v2, v1
1551; GFX10-NEXT:    v_pack_b32_f16 v1, v3, v1
1552; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
1553; GFX10-NEXT:    s_endpgm
1554                        <2 x half> addrspace(1)* %in2) #0 {
1555   %gep2 = getelementptr <2 x half>, <2 x half> addrspace(1)* %in2, i32 4
1556   %r0 = load <2 x half>, <2 x half> addrspace(1)* %in1, align 8
1557   %r1 = load <2 x half>, <2 x half> addrspace(1)* %gep2, align 8
1558   %r2 = frem <2 x half> %r0, %r1
1559   store <2 x half> %r2, <2 x half> addrspace(1)* %out, align 8
1560   ret void
1561}
1562
; frem on <4 x half>: loads the dividend from %in1 and the divisor from four
; <4 x half> elements past %in2 (the 32-byte offset seen in the loads below),
; then stores the remainder vector to %out.
; Per the expected output, the SI and CI runs widen each lane to f32 and use the
; v_div_scale/v_rcp/v_fma/v_div_fmas/v_div_fixup sequence followed by v_trunc and
; v_fma, converting back to f16 and packing pairs with shift+or. The VI, GFX9 and
; GFX10 runs use v_rcp_f32 + v_mul_f32, then v_div_fixup_f16, v_trunc_f16 and
; v_fma_f16 per lane (packed with v_pack_b32_f16 on GFX9 and GFX10).
; The check lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of editing by hand.
1563define amdgpu_kernel void @frem_v4f16(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in1,
1564; SI-LABEL: frem_v4f16:
1565; SI:       ; %bb.0:
1566; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
1567; SI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xd
1568; SI-NEXT:    s_mov_b32 s3, 0xf000
1569; SI-NEXT:    s_mov_b32 s2, -1
1570; SI-NEXT:    s_waitcnt lgkmcnt(0)
1571; SI-NEXT:    s_mov_b32 s0, s4
1572; SI-NEXT:    s_mov_b32 s1, s5
1573; SI-NEXT:    s_mov_b32 s4, s6
1574; SI-NEXT:    s_mov_b32 s5, s7
1575; SI-NEXT:    s_mov_b32 s6, s2
1576; SI-NEXT:    s_mov_b32 s7, s3
1577; SI-NEXT:    s_mov_b32 s10, s2
1578; SI-NEXT:    s_mov_b32 s11, s3
1579; SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
1580; SI-NEXT:    s_waitcnt vmcnt(0)
1581; SI-NEXT:    v_cvt_f32_f16_e32 v2, v0
1582; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
1583; SI-NEXT:    v_cvt_f32_f16_e32 v3, v0
1584; SI-NEXT:    v_cvt_f32_f16_e32 v4, v1
1585; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v1
1586; SI-NEXT:    v_cvt_f32_f16_e32 v5, v0
1587; SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0 offset:32
1588; SI-NEXT:    s_waitcnt vmcnt(0)
1589; SI-NEXT:    v_cvt_f32_f16_e32 v6, v0
1590; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
1591; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
1592; SI-NEXT:    v_cvt_f32_f16_e32 v7, v1
1593; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
1594; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
1595; SI-NEXT:    v_div_scale_f32 v8, vcc, v5, v1, v5
1596; SI-NEXT:    v_div_scale_f32 v9, s[4:5], v1, v1, v5
1597; SI-NEXT:    v_rcp_f32_e32 v10, v9
1598; SI-NEXT:    s_mov_b32 s6, 3
1599; SI-NEXT:    s_mov_b32 s7, 0
1600; SI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6
1601; SI-NEXT:    v_fma_f32 v11, -v9, v10, 1.0
1602; SI-NEXT:    v_fma_f32 v10, v11, v10, v10
1603; SI-NEXT:    v_mul_f32_e32 v11, v8, v10
1604; SI-NEXT:    v_fma_f32 v12, -v9, v11, v8
1605; SI-NEXT:    v_fma_f32 v11, v12, v10, v11
1606; SI-NEXT:    v_fma_f32 v8, -v9, v11, v8
1607; SI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7
1608; SI-NEXT:    v_div_fmas_f32 v8, v8, v10, v11
1609; SI-NEXT:    v_div_fixup_f32 v8, v8, v1, v5
1610; SI-NEXT:    v_trunc_f32_e32 v8, v8
1611; SI-NEXT:    v_fma_f32 v1, -v8, v1, v5
1612; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
1613; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
1614; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
1615; SI-NEXT:    v_div_scale_f32 v5, vcc, v4, v7, v4
1616; SI-NEXT:    v_div_scale_f32 v8, s[4:5], v7, v7, v4
1617; SI-NEXT:    v_rcp_f32_e32 v9, v8
1618; SI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6
1619; SI-NEXT:    v_fma_f32 v10, -v8, v9, 1.0
1620; SI-NEXT:    v_fma_f32 v9, v10, v9, v9
1621; SI-NEXT:    v_mul_f32_e32 v10, v5, v9
1622; SI-NEXT:    v_fma_f32 v11, -v8, v10, v5
1623; SI-NEXT:    v_fma_f32 v10, v11, v9, v10
1624; SI-NEXT:    v_fma_f32 v5, -v8, v10, v5
1625; SI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7
1626; SI-NEXT:    v_div_fmas_f32 v5, v5, v9, v10
1627; SI-NEXT:    v_div_fixup_f32 v5, v5, v7, v4
1628; SI-NEXT:    v_trunc_f32_e32 v5, v5
1629; SI-NEXT:    v_fma_f32 v4, -v5, v7, v4
1630; SI-NEXT:    v_cvt_f16_f32_e32 v4, v4
1631; SI-NEXT:    v_or_b32_e32 v1, v4, v1
1632; SI-NEXT:    v_div_scale_f32 v4, vcc, v3, v0, v3
1633; SI-NEXT:    v_div_scale_f32 v5, s[4:5], v0, v0, v3
1634; SI-NEXT:    v_rcp_f32_e32 v7, v5
1635; SI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6
1636; SI-NEXT:    v_fma_f32 v8, -v5, v7, 1.0
1637; SI-NEXT:    v_fma_f32 v7, v8, v7, v7
1638; SI-NEXT:    v_mul_f32_e32 v8, v4, v7
1639; SI-NEXT:    v_fma_f32 v9, -v5, v8, v4
1640; SI-NEXT:    v_fma_f32 v8, v9, v7, v8
1641; SI-NEXT:    v_fma_f32 v4, -v5, v8, v4
1642; SI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7
1643; SI-NEXT:    v_div_fmas_f32 v4, v4, v7, v8
1644; SI-NEXT:    v_div_fixup_f32 v4, v4, v0, v3
1645; SI-NEXT:    v_trunc_f32_e32 v4, v4
1646; SI-NEXT:    v_fma_f32 v0, -v4, v0, v3
1647; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
1648; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
1649; SI-NEXT:    v_div_scale_f32 v3, vcc, v2, v6, v2
1650; SI-NEXT:    v_div_scale_f32 v4, s[4:5], v6, v6, v2
1651; SI-NEXT:    v_rcp_f32_e32 v5, v4
1652; SI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6
1653; SI-NEXT:    v_fma_f32 v7, -v4, v5, 1.0
1654; SI-NEXT:    v_fma_f32 v5, v7, v5, v5
1655; SI-NEXT:    v_mul_f32_e32 v7, v3, v5
1656; SI-NEXT:    v_fma_f32 v8, -v4, v7, v3
1657; SI-NEXT:    v_fma_f32 v7, v8, v5, v7
1658; SI-NEXT:    v_fma_f32 v3, -v4, v7, v3
1659; SI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7
1660; SI-NEXT:    v_div_fmas_f32 v3, v3, v5, v7
1661; SI-NEXT:    v_div_fixup_f32 v3, v3, v6, v2
1662; SI-NEXT:    v_trunc_f32_e32 v3, v3
1663; SI-NEXT:    v_fma_f32 v2, -v3, v6, v2
1664; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
1665; SI-NEXT:    v_or_b32_e32 v0, v2, v0
1666; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1667; SI-NEXT:    s_endpgm
1668;
1669; CI-LABEL: frem_v4f16:
1670; CI:       ; %bb.0:
1671; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
1672; CI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xd
1673; CI-NEXT:    s_mov_b32 s3, 0xf000
1674; CI-NEXT:    s_mov_b32 s2, -1
1675; CI-NEXT:    s_mov_b32 s10, s2
1676; CI-NEXT:    s_waitcnt lgkmcnt(0)
1677; CI-NEXT:    s_mov_b32 s0, s4
1678; CI-NEXT:    s_mov_b32 s1, s5
1679; CI-NEXT:    s_mov_b32 s4, s6
1680; CI-NEXT:    s_mov_b32 s5, s7
1681; CI-NEXT:    s_mov_b32 s6, s2
1682; CI-NEXT:    s_mov_b32 s7, s3
1683; CI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
1684; CI-NEXT:    s_mov_b32 s11, s3
1685; CI-NEXT:    s_mov_b32 s6, 3
1686; CI-NEXT:    s_mov_b32 s7, 0
1687; CI-NEXT:    s_waitcnt vmcnt(0)
1688; CI-NEXT:    v_cvt_f32_f16_e32 v2, v0
1689; CI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
1690; CI-NEXT:    v_cvt_f32_f16_e32 v3, v0
1691; CI-NEXT:    v_lshrrev_b32_e32 v0, 16, v1
1692; CI-NEXT:    v_cvt_f32_f16_e32 v4, v1
1693; CI-NEXT:    v_cvt_f32_f16_e32 v5, v0
1694; CI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0 offset:32
1695; CI-NEXT:    s_waitcnt vmcnt(0)
1696; CI-NEXT:    v_cvt_f32_f16_e32 v7, v1
1697; CI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
1698; CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
1699; CI-NEXT:    v_cvt_f32_f16_e32 v6, v0
1700; CI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
1701; CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
1702; CI-NEXT:    v_div_scale_f32 v9, s[4:5], v1, v1, v5
1703; CI-NEXT:    v_div_scale_f32 v8, vcc, v5, v1, v5
1704; CI-NEXT:    v_rcp_f32_e32 v10, v9
1705; CI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6
1706; CI-NEXT:    v_fma_f32 v11, -v9, v10, 1.0
1707; CI-NEXT:    v_fma_f32 v10, v11, v10, v10
1708; CI-NEXT:    v_mul_f32_e32 v11, v8, v10
1709; CI-NEXT:    v_fma_f32 v12, -v9, v11, v8
1710; CI-NEXT:    v_fma_f32 v11, v12, v10, v11
1711; CI-NEXT:    v_fma_f32 v8, -v9, v11, v8
1712; CI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7
1713; CI-NEXT:    v_div_fmas_f32 v8, v8, v10, v11
1714; CI-NEXT:    v_div_fixup_f32 v8, v8, v1, v5
1715; CI-NEXT:    v_trunc_f32_e32 v8, v8
1716; CI-NEXT:    v_fma_f32 v1, -v8, v1, v5
1717; CI-NEXT:    v_div_scale_f32 v8, s[4:5], v7, v7, v4
1718; CI-NEXT:    v_div_scale_f32 v5, vcc, v4, v7, v4
1719; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
1720; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
1721; CI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
1722; CI-NEXT:    v_rcp_f32_e32 v9, v8
1723; CI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6
1724; CI-NEXT:    v_fma_f32 v10, -v8, v9, 1.0
1725; CI-NEXT:    v_fma_f32 v9, v10, v9, v9
1726; CI-NEXT:    v_mul_f32_e32 v10, v5, v9
1727; CI-NEXT:    v_fma_f32 v11, -v8, v10, v5
1728; CI-NEXT:    v_fma_f32 v10, v11, v9, v10
1729; CI-NEXT:    v_fma_f32 v5, -v8, v10, v5
1730; CI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7
1731; CI-NEXT:    v_div_fmas_f32 v5, v5, v9, v10
1732; CI-NEXT:    v_div_fixup_f32 v5, v5, v7, v4
1733; CI-NEXT:    v_trunc_f32_e32 v5, v5
1734; CI-NEXT:    v_fma_f32 v4, -v5, v7, v4
1735; CI-NEXT:    v_div_scale_f32 v5, s[4:5], v0, v0, v3
1736; CI-NEXT:    v_cvt_f16_f32_e32 v4, v4
1737; CI-NEXT:    v_or_b32_e32 v1, v4, v1
1738; CI-NEXT:    v_div_scale_f32 v4, vcc, v3, v0, v3
1739; CI-NEXT:    v_rcp_f32_e32 v7, v5
1740; CI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6
1741; CI-NEXT:    v_fma_f32 v8, -v5, v7, 1.0
1742; CI-NEXT:    v_fma_f32 v7, v8, v7, v7
1743; CI-NEXT:    v_mul_f32_e32 v8, v4, v7
1744; CI-NEXT:    v_fma_f32 v9, -v5, v8, v4
1745; CI-NEXT:    v_fma_f32 v8, v9, v7, v8
1746; CI-NEXT:    v_fma_f32 v4, -v5, v8, v4
1747; CI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7
1748; CI-NEXT:    v_div_fmas_f32 v4, v4, v7, v8
1749; CI-NEXT:    v_div_fixup_f32 v4, v4, v0, v3
1750; CI-NEXT:    v_trunc_f32_e32 v4, v4
1751; CI-NEXT:    v_fma_f32 v0, -v4, v0, v3
1752; CI-NEXT:    v_div_scale_f32 v4, s[4:5], v6, v6, v2
1753; CI-NEXT:    v_div_scale_f32 v3, vcc, v2, v6, v2
1754; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
1755; CI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
1756; CI-NEXT:    v_rcp_f32_e32 v5, v4
1757; CI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6
1758; CI-NEXT:    v_fma_f32 v7, -v4, v5, 1.0
1759; CI-NEXT:    v_fma_f32 v5, v7, v5, v5
1760; CI-NEXT:    v_mul_f32_e32 v7, v3, v5
1761; CI-NEXT:    v_fma_f32 v8, -v4, v7, v3
1762; CI-NEXT:    v_fma_f32 v7, v8, v5, v7
1763; CI-NEXT:    v_fma_f32 v3, -v4, v7, v3
1764; CI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7
1765; CI-NEXT:    v_div_fmas_f32 v3, v3, v5, v7
1766; CI-NEXT:    v_div_fixup_f32 v3, v3, v6, v2
1767; CI-NEXT:    v_trunc_f32_e32 v3, v3
1768; CI-NEXT:    v_fma_f32 v2, -v3, v6, v2
1769; CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
1770; CI-NEXT:    v_or_b32_e32 v0, v2, v0
1771; CI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1772; CI-NEXT:    s_endpgm
1773;
1774; VI-LABEL: frem_v4f16:
1775; VI:       ; %bb.0:
1776; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1777; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1778; VI-NEXT:    s_waitcnt lgkmcnt(0)
1779; VI-NEXT:    v_mov_b32_e32 v2, s6
1780; VI-NEXT:    s_add_u32 s0, s0, 32
1781; VI-NEXT:    s_addc_u32 s1, s1, 0
1782; VI-NEXT:    v_mov_b32_e32 v5, s1
1783; VI-NEXT:    v_mov_b32_e32 v4, s0
1784; VI-NEXT:    flat_load_dwordx2 v[4:5], v[4:5]
1785; VI-NEXT:    v_mov_b32_e32 v3, s7
1786; VI-NEXT:    flat_load_dwordx2 v[2:3], v[2:3]
1787; VI-NEXT:    v_mov_b32_e32 v0, s4
1788; VI-NEXT:    v_mov_b32_e32 v1, s5
1789; VI-NEXT:    s_waitcnt vmcnt(1)
1790; VI-NEXT:    v_lshrrev_b32_e32 v8, 16, v5
1791; VI-NEXT:    v_cvt_f32_f16_e32 v9, v8
1792; VI-NEXT:    s_waitcnt vmcnt(0)
1793; VI-NEXT:    v_lshrrev_b32_e32 v6, 16, v3
1794; VI-NEXT:    v_cvt_f32_f16_e32 v7, v6
1795; VI-NEXT:    v_rcp_f32_e32 v9, v9
1796; VI-NEXT:    v_mul_f32_e32 v7, v7, v9
1797; VI-NEXT:    v_cvt_f16_f32_e32 v7, v7
1798; VI-NEXT:    v_div_fixup_f16 v7, v7, v8, v6
1799; VI-NEXT:    v_trunc_f16_e32 v7, v7
1800; VI-NEXT:    v_fma_f16 v6, -v7, v8, v6
1801; VI-NEXT:    v_cvt_f32_f16_e32 v8, v5
1802; VI-NEXT:    v_cvt_f32_f16_e32 v7, v3
1803; VI-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
1804; VI-NEXT:    v_rcp_f32_e32 v8, v8
1805; VI-NEXT:    v_mul_f32_e32 v7, v7, v8
1806; VI-NEXT:    v_cvt_f16_f32_e32 v7, v7
1807; VI-NEXT:    v_div_fixup_f16 v7, v7, v5, v3
1808; VI-NEXT:    v_trunc_f16_e32 v7, v7
1809; VI-NEXT:    v_fma_f16 v3, -v7, v5, v3
1810; VI-NEXT:    v_lshrrev_b32_e32 v7, 16, v4
1811; VI-NEXT:    v_cvt_f32_f16_e32 v8, v7
1812; VI-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
1813; VI-NEXT:    v_or_b32_e32 v3, v3, v6
1814; VI-NEXT:    v_cvt_f32_f16_e32 v6, v5
1815; VI-NEXT:    v_rcp_f32_e32 v8, v8
1816; VI-NEXT:    v_mul_f32_e32 v6, v6, v8
1817; VI-NEXT:    v_cvt_f16_f32_e32 v6, v6
1818; VI-NEXT:    v_div_fixup_f16 v6, v6, v7, v5
1819; VI-NEXT:    v_trunc_f16_e32 v6, v6
1820; VI-NEXT:    v_fma_f16 v5, -v6, v7, v5
1821; VI-NEXT:    v_cvt_f32_f16_e32 v7, v4
1822; VI-NEXT:    v_cvt_f32_f16_e32 v6, v2
1823; VI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
1824; VI-NEXT:    v_rcp_f32_e32 v7, v7
1825; VI-NEXT:    v_mul_f32_e32 v6, v6, v7
1826; VI-NEXT:    v_cvt_f16_f32_e32 v6, v6
1827; VI-NEXT:    v_div_fixup_f16 v6, v6, v4, v2
1828; VI-NEXT:    v_trunc_f16_e32 v6, v6
1829; VI-NEXT:    v_fma_f16 v2, -v6, v4, v2
1830; VI-NEXT:    v_or_b32_e32 v2, v2, v5
1831; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
1832; VI-NEXT:    s_endpgm
1833;
1834; GFX9-LABEL: frem_v4f16:
1835; GFX9:       ; %bb.0:
1836; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1837; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
1838; GFX9-NEXT:    v_mov_b32_e32 v4, 0
1839; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1840; GFX9-NEXT:    global_load_dwordx2 v[0:1], v4, s[6:7]
1841; GFX9-NEXT:    global_load_dwordx2 v[2:3], v4, s[2:3] offset:32
1842; GFX9-NEXT:    s_waitcnt vmcnt(1)
1843; GFX9-NEXT:    v_cvt_f32_f16_e32 v5, v1
1844; GFX9-NEXT:    s_waitcnt vmcnt(0)
1845; GFX9-NEXT:    v_cvt_f32_f16_e32 v6, v3
1846; GFX9-NEXT:    v_rcp_f32_e32 v6, v6
1847; GFX9-NEXT:    v_mul_f32_e32 v5, v5, v6
1848; GFX9-NEXT:    v_cvt_f16_f32_e32 v5, v5
1849; GFX9-NEXT:    v_div_fixup_f16 v5, v5, v3, v1
1850; GFX9-NEXT:    v_trunc_f16_e32 v5, v5
1851; GFX9-NEXT:    v_fma_f16 v5, -v5, v3, v1
1852; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
1853; GFX9-NEXT:    v_cvt_f32_f16_e32 v7, v3
1854; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
1855; GFX9-NEXT:    v_cvt_f32_f16_e32 v6, v1
1856; GFX9-NEXT:    v_rcp_f32_e32 v7, v7
1857; GFX9-NEXT:    v_mul_f32_e32 v6, v6, v7
1858; GFX9-NEXT:    v_cvt_f16_f32_e32 v6, v6
1859; GFX9-NEXT:    v_div_fixup_f16 v6, v6, v3, v1
1860; GFX9-NEXT:    v_trunc_f16_e32 v6, v6
1861; GFX9-NEXT:    v_fma_f16 v1, -v6, v3, v1
1862; GFX9-NEXT:    v_pack_b32_f16 v1, v5, v1
1863; GFX9-NEXT:    v_cvt_f32_f16_e32 v5, v2
1864; GFX9-NEXT:    v_cvt_f32_f16_e32 v3, v0
1865; GFX9-NEXT:    v_rcp_f32_e32 v5, v5
1866; GFX9-NEXT:    v_mul_f32_e32 v3, v3, v5
1867; GFX9-NEXT:    v_cvt_f16_f32_e32 v3, v3
1868; GFX9-NEXT:    v_div_fixup_f16 v3, v3, v2, v0
1869; GFX9-NEXT:    v_trunc_f16_e32 v3, v3
1870; GFX9-NEXT:    v_fma_f16 v3, -v3, v2, v0
1871; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
1872; GFX9-NEXT:    v_cvt_f32_f16_e32 v6, v2
1873; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
1874; GFX9-NEXT:    v_cvt_f32_f16_e32 v5, v0
1875; GFX9-NEXT:    v_rcp_f32_e32 v6, v6
1876; GFX9-NEXT:    v_mul_f32_e32 v5, v5, v6
1877; GFX9-NEXT:    v_cvt_f16_f32_e32 v5, v5
1878; GFX9-NEXT:    v_div_fixup_f16 v5, v5, v2, v0
1879; GFX9-NEXT:    v_trunc_f16_e32 v5, v5
1880; GFX9-NEXT:    v_fma_f16 v0, -v5, v2, v0
1881; GFX9-NEXT:    v_pack_b32_f16 v0, v3, v0
1882; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[4:5]
1883; GFX9-NEXT:    s_endpgm
1884;
1885; GFX10-LABEL: frem_v4f16:
1886; GFX10:       ; %bb.0:
1887; GFX10-NEXT:    s_clause 0x1
1888; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1889; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
1890; GFX10-NEXT:    v_mov_b32_e32 v4, 0
1891; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1892; GFX10-NEXT:    s_clause 0x1
1893; GFX10-NEXT:    global_load_dwordx2 v[0:1], v4, s[6:7]
1894; GFX10-NEXT:    global_load_dwordx2 v[2:3], v4, s[2:3] offset:32
1895; GFX10-NEXT:    s_waitcnt vmcnt(1)
1896; GFX10-NEXT:    v_cvt_f32_f16_e32 v5, v1
1897; GFX10-NEXT:    s_waitcnt vmcnt(0)
1898; GFX10-NEXT:    v_cvt_f32_f16_e32 v6, v3
1899; GFX10-NEXT:    v_rcp_f32_e32 v6, v6
1900; GFX10-NEXT:    v_mul_f32_e32 v5, v5, v6
1901; GFX10-NEXT:    v_cvt_f16_f32_e32 v5, v5
1902; GFX10-NEXT:    v_div_fixup_f16 v5, v5, v3, v1
1903; GFX10-NEXT:    v_trunc_f16_e32 v5, v5
1904; GFX10-NEXT:    v_fma_f16 v5, -v5, v3, v1
1905; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
1906; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
1907; GFX10-NEXT:    v_cvt_f32_f16_e32 v7, v3
1908; GFX10-NEXT:    v_cvt_f32_f16_e32 v6, v1
1909; GFX10-NEXT:    v_rcp_f32_e32 v7, v7
1910; GFX10-NEXT:    v_mul_f32_e32 v6, v6, v7
1911; GFX10-NEXT:    v_cvt_f16_f32_e32 v6, v6
1912; GFX10-NEXT:    v_div_fixup_f16 v6, v6, v3, v1
1913; GFX10-NEXT:    v_trunc_f16_e32 v6, v6
1914; GFX10-NEXT:    v_fma_f16 v1, -v6, v3, v1
1915; GFX10-NEXT:    v_cvt_f32_f16_e32 v3, v0
1916; GFX10-NEXT:    v_pack_b32_f16 v1, v5, v1
1917; GFX10-NEXT:    v_cvt_f32_f16_e32 v5, v2
1918; GFX10-NEXT:    v_rcp_f32_e32 v5, v5
1919; GFX10-NEXT:    v_mul_f32_e32 v3, v3, v5
1920; GFX10-NEXT:    v_cvt_f16_f32_e32 v3, v3
1921; GFX10-NEXT:    v_div_fixup_f16 v3, v3, v2, v0
1922; GFX10-NEXT:    v_trunc_f16_e32 v3, v3
1923; GFX10-NEXT:    v_fma_f16 v3, -v3, v2, v0
1924; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
1925; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
1926; GFX10-NEXT:    v_cvt_f32_f16_e32 v6, v2
1927; GFX10-NEXT:    v_cvt_f32_f16_e32 v5, v0
1928; GFX10-NEXT:    v_rcp_f32_e32 v6, v6
1929; GFX10-NEXT:    v_mul_f32_e32 v5, v5, v6
1930; GFX10-NEXT:    v_cvt_f16_f32_e32 v5, v5
1931; GFX10-NEXT:    v_div_fixup_f16 v5, v5, v2, v0
1932; GFX10-NEXT:    v_trunc_f16_e32 v5, v5
1933; GFX10-NEXT:    v_fma_f16 v0, -v5, v2, v0
1934; GFX10-NEXT:    v_pack_b32_f16 v0, v3, v0
1935; GFX10-NEXT:    global_store_dwordx2 v4, v[0:1], s[4:5]
1936; GFX10-NEXT:    s_endpgm
1937                        <4 x half> addrspace(1)* %in2) #0 {
; Divisor address: four <4 x half> elements past %in2.
1938   %gep2 = getelementptr <4 x half>, <4 x half> addrspace(1)* %in2, i32 4
1939   %r0 = load <4 x half>, <4 x half> addrspace(1)* %in1, align 16
1940   %r1 = load <4 x half>, <4 x half> addrspace(1)* %gep2, align 16
; Remainder of %r0 by %r1, taken on the whole vector in one instruction.
1941   %r2 = frem <4 x half> %r0, %r1
1942   store <4 x half> %r2, <4 x half> addrspace(1)* %out, align 16
1943   ret void
1944}
1945
; frem on <2 x float>: loads the dividend from %in1 and the divisor from four
; <2 x float> elements past %in2 (the 32-byte offset seen in the loads below),
; then stores the remainder vector to %out.
; Per the expected output, every run expands each lane as a v_div_scale/v_rcp/
; v_fma/v_div_fmas/v_div_fixup division followed by v_trunc_f32 and a final
; v_fma_f32 computing -trunc(x/y) * y + x. The fma refinement steps are
; bracketed by MODE register writes: s_setreg_b32 on the SI, CI, VI and GFX9
; runs, s_denorm_mode on the GFX10 run, which also uses v_fmac_f32 for two of
; the refinement steps.
; The check lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of editing by hand.
1946define amdgpu_kernel void @frem_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in1,
1947; SI-LABEL: frem_v2f32:
1948; SI:       ; %bb.0:
1949; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
1950; SI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xd
1951; SI-NEXT:    s_mov_b32 s3, 0xf000
1952; SI-NEXT:    s_mov_b32 s2, -1
1953; SI-NEXT:    s_waitcnt lgkmcnt(0)
1954; SI-NEXT:    s_mov_b32 s0, s4
1955; SI-NEXT:    s_mov_b32 s1, s5
1956; SI-NEXT:    s_mov_b32 s4, s6
1957; SI-NEXT:    s_mov_b32 s5, s7
1958; SI-NEXT:    s_mov_b32 s6, s2
1959; SI-NEXT:    s_mov_b32 s7, s3
1960; SI-NEXT:    s_mov_b32 s10, s2
1961; SI-NEXT:    s_mov_b32 s11, s3
1962; SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
1963; SI-NEXT:    buffer_load_dwordx2 v[2:3], off, s[8:11], 0 offset:32
1964; SI-NEXT:    s_waitcnt vmcnt(0)
1965; SI-NEXT:    v_div_scale_f32 v4, vcc, v1, v3, v1
1966; SI-NEXT:    v_div_scale_f32 v5, s[4:5], v3, v3, v1
1967; SI-NEXT:    v_rcp_f32_e32 v6, v5
1968; SI-NEXT:    s_mov_b32 s6, 3
1969; SI-NEXT:    s_mov_b32 s7, 0
1970; SI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6
1971; SI-NEXT:    v_fma_f32 v7, -v5, v6, 1.0
1972; SI-NEXT:    v_fma_f32 v6, v7, v6, v6
1973; SI-NEXT:    v_mul_f32_e32 v7, v4, v6
1974; SI-NEXT:    v_fma_f32 v8, -v5, v7, v4
1975; SI-NEXT:    v_fma_f32 v7, v8, v6, v7
1976; SI-NEXT:    v_fma_f32 v4, -v5, v7, v4
1977; SI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7
1978; SI-NEXT:    v_div_fmas_f32 v4, v4, v6, v7
1979; SI-NEXT:    v_div_fixup_f32 v4, v4, v3, v1
1980; SI-NEXT:    v_trunc_f32_e32 v4, v4
1981; SI-NEXT:    v_fma_f32 v1, -v4, v3, v1
1982; SI-NEXT:    v_div_scale_f32 v3, vcc, v0, v2, v0
1983; SI-NEXT:    v_div_scale_f32 v4, s[4:5], v2, v2, v0
1984; SI-NEXT:    v_rcp_f32_e32 v5, v4
1985; SI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6
1986; SI-NEXT:    v_fma_f32 v6, -v4, v5, 1.0
1987; SI-NEXT:    v_fma_f32 v5, v6, v5, v5
1988; SI-NEXT:    v_mul_f32_e32 v6, v3, v5
1989; SI-NEXT:    v_fma_f32 v7, -v4, v6, v3
1990; SI-NEXT:    v_fma_f32 v6, v7, v5, v6
1991; SI-NEXT:    v_fma_f32 v3, -v4, v6, v3
1992; SI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7
1993; SI-NEXT:    v_div_fmas_f32 v3, v3, v5, v6
1994; SI-NEXT:    v_div_fixup_f32 v3, v3, v2, v0
1995; SI-NEXT:    v_trunc_f32_e32 v3, v3
1996; SI-NEXT:    v_fma_f32 v0, -v3, v2, v0
1997; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1998; SI-NEXT:    s_endpgm
1999;
2000; CI-LABEL: frem_v2f32:
2001; CI:       ; %bb.0:
2002; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
2003; CI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xd
2004; CI-NEXT:    s_mov_b32 s3, 0xf000
2005; CI-NEXT:    s_mov_b32 s2, -1
2006; CI-NEXT:    s_mov_b32 s10, s2
2007; CI-NEXT:    s_waitcnt lgkmcnt(0)
2008; CI-NEXT:    s_mov_b32 s0, s4
2009; CI-NEXT:    s_mov_b32 s1, s5
2010; CI-NEXT:    s_mov_b32 s4, s6
2011; CI-NEXT:    s_mov_b32 s5, s7
2012; CI-NEXT:    s_mov_b32 s6, s2
2013; CI-NEXT:    s_mov_b32 s7, s3
2014; CI-NEXT:    s_mov_b32 s11, s3
2015; CI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
2016; CI-NEXT:    buffer_load_dwordx2 v[2:3], off, s[8:11], 0 offset:32
2017; CI-NEXT:    s_mov_b32 s6, 3
2018; CI-NEXT:    s_mov_b32 s7, 0
2019; CI-NEXT:    s_waitcnt vmcnt(0)
2020; CI-NEXT:    v_div_scale_f32 v5, s[4:5], v3, v3, v1
2021; CI-NEXT:    v_div_scale_f32 v4, vcc, v1, v3, v1
2022; CI-NEXT:    v_rcp_f32_e32 v6, v5
2023; CI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6
2024; CI-NEXT:    v_fma_f32 v7, -v5, v6, 1.0
2025; CI-NEXT:    v_fma_f32 v6, v7, v6, v6
2026; CI-NEXT:    v_mul_f32_e32 v7, v4, v6
2027; CI-NEXT:    v_fma_f32 v8, -v5, v7, v4
2028; CI-NEXT:    v_fma_f32 v7, v8, v6, v7
2029; CI-NEXT:    v_fma_f32 v4, -v5, v7, v4
2030; CI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7
2031; CI-NEXT:    v_div_fmas_f32 v4, v4, v6, v7
2032; CI-NEXT:    v_div_fixup_f32 v4, v4, v3, v1
2033; CI-NEXT:    v_trunc_f32_e32 v4, v4
2034; CI-NEXT:    v_fma_f32 v1, -v4, v3, v1
2035; CI-NEXT:    v_div_scale_f32 v4, s[4:5], v2, v2, v0
2036; CI-NEXT:    v_div_scale_f32 v3, vcc, v0, v2, v0
2037; CI-NEXT:    v_rcp_f32_e32 v5, v4
2038; CI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6
2039; CI-NEXT:    v_fma_f32 v6, -v4, v5, 1.0
2040; CI-NEXT:    v_fma_f32 v5, v6, v5, v5
2041; CI-NEXT:    v_mul_f32_e32 v6, v3, v5
2042; CI-NEXT:    v_fma_f32 v7, -v4, v6, v3
2043; CI-NEXT:    v_fma_f32 v6, v7, v5, v6
2044; CI-NEXT:    v_fma_f32 v3, -v4, v6, v3
2045; CI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7
2046; CI-NEXT:    v_div_fmas_f32 v3, v3, v5, v6
2047; CI-NEXT:    v_div_fixup_f32 v3, v3, v2, v0
2048; CI-NEXT:    v_trunc_f32_e32 v3, v3
2049; CI-NEXT:    v_fma_f32 v0, -v3, v2, v0
2050; CI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2051; CI-NEXT:    s_endpgm
2052;
2053; VI-LABEL: frem_v2f32:
2054; VI:       ; %bb.0:
2055; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2056; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
2057; VI-NEXT:    s_mov_b32 s2, 3
2058; VI-NEXT:    s_mov_b32 s3, 0
2059; VI-NEXT:    s_waitcnt lgkmcnt(0)
2060; VI-NEXT:    v_mov_b32_e32 v2, s6
2061; VI-NEXT:    s_add_u32 s0, s0, 32
2062; VI-NEXT:    s_addc_u32 s1, s1, 0
2063; VI-NEXT:    v_mov_b32_e32 v5, s1
2064; VI-NEXT:    v_mov_b32_e32 v3, s7
2065; VI-NEXT:    v_mov_b32_e32 v4, s0
2066; VI-NEXT:    flat_load_dwordx2 v[2:3], v[2:3]
2067; VI-NEXT:    flat_load_dwordx2 v[4:5], v[4:5]
2068; VI-NEXT:    v_mov_b32_e32 v0, s4
2069; VI-NEXT:    v_mov_b32_e32 v1, s5
2070; VI-NEXT:    s_waitcnt vmcnt(0)
2071; VI-NEXT:    v_div_scale_f32 v7, s[0:1], v5, v5, v3
2072; VI-NEXT:    v_div_scale_f32 v6, vcc, v3, v5, v3
2073; VI-NEXT:    v_rcp_f32_e32 v8, v7
2074; VI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s2
2075; VI-NEXT:    v_fma_f32 v9, -v7, v8, 1.0
2076; VI-NEXT:    v_fma_f32 v8, v9, v8, v8
2077; VI-NEXT:    v_mul_f32_e32 v9, v6, v8
2078; VI-NEXT:    v_fma_f32 v10, -v7, v9, v6
2079; VI-NEXT:    v_fma_f32 v9, v10, v8, v9
2080; VI-NEXT:    v_fma_f32 v6, -v7, v9, v6
2081; VI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s3
2082; VI-NEXT:    v_div_fmas_f32 v6, v6, v8, v9
2083; VI-NEXT:    v_div_fixup_f32 v6, v6, v5, v3
2084; VI-NEXT:    v_trunc_f32_e32 v6, v6
2085; VI-NEXT:    v_fma_f32 v3, -v6, v5, v3
2086; VI-NEXT:    v_div_scale_f32 v6, s[0:1], v4, v4, v2
2087; VI-NEXT:    v_div_scale_f32 v5, vcc, v2, v4, v2
2088; VI-NEXT:    v_rcp_f32_e32 v7, v6
2089; VI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s2
2090; VI-NEXT:    v_fma_f32 v8, -v6, v7, 1.0
2091; VI-NEXT:    v_fma_f32 v7, v8, v7, v7
2092; VI-NEXT:    v_mul_f32_e32 v8, v5, v7
2093; VI-NEXT:    v_fma_f32 v9, -v6, v8, v5
2094; VI-NEXT:    v_fma_f32 v8, v9, v7, v8
2095; VI-NEXT:    v_fma_f32 v5, -v6, v8, v5
2096; VI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s3
2097; VI-NEXT:    v_div_fmas_f32 v5, v5, v7, v8
2098; VI-NEXT:    v_div_fixup_f32 v5, v5, v4, v2
2099; VI-NEXT:    v_trunc_f32_e32 v5, v5
2100; VI-NEXT:    v_fma_f32 v2, -v5, v4, v2
2101; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
2102; VI-NEXT:    s_endpgm
2103;
2104; GFX9-LABEL: frem_v2f32:
2105; GFX9:       ; %bb.0:
2106; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2107; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
2108; GFX9-NEXT:    v_mov_b32_e32 v4, 0
2109; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2110; GFX9-NEXT:    global_load_dwordx2 v[0:1], v4, s[6:7]
2111; GFX9-NEXT:    global_load_dwordx2 v[2:3], v4, s[2:3] offset:32
2112; GFX9-NEXT:    s_mov_b32 s2, 3
2113; GFX9-NEXT:    s_mov_b32 s3, 0
2114; GFX9-NEXT:    s_waitcnt vmcnt(0)
2115; GFX9-NEXT:    v_div_scale_f32 v6, s[0:1], v3, v3, v1
2116; GFX9-NEXT:    v_div_scale_f32 v5, vcc, v1, v3, v1
2117; GFX9-NEXT:    v_rcp_f32_e32 v7, v6
2118; GFX9-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s2
2119; GFX9-NEXT:    v_fma_f32 v8, -v6, v7, 1.0
2120; GFX9-NEXT:    v_fma_f32 v7, v8, v7, v7
2121; GFX9-NEXT:    v_mul_f32_e32 v8, v5, v7
2122; GFX9-NEXT:    v_fma_f32 v9, -v6, v8, v5
2123; GFX9-NEXT:    v_fma_f32 v8, v9, v7, v8
2124; GFX9-NEXT:    v_fma_f32 v5, -v6, v8, v5
2125; GFX9-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s3
2126; GFX9-NEXT:    v_div_fmas_f32 v5, v5, v7, v8
2127; GFX9-NEXT:    v_div_fixup_f32 v5, v5, v3, v1
2128; GFX9-NEXT:    v_trunc_f32_e32 v5, v5
2129; GFX9-NEXT:    v_fma_f32 v1, -v5, v3, v1
2130; GFX9-NEXT:    v_div_scale_f32 v5, s[0:1], v2, v2, v0
2131; GFX9-NEXT:    v_div_scale_f32 v3, vcc, v0, v2, v0
2132; GFX9-NEXT:    v_rcp_f32_e32 v6, v5
2133; GFX9-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s2
2134; GFX9-NEXT:    v_fma_f32 v7, -v5, v6, 1.0
2135; GFX9-NEXT:    v_fma_f32 v6, v7, v6, v6
2136; GFX9-NEXT:    v_mul_f32_e32 v7, v3, v6
2137; GFX9-NEXT:    v_fma_f32 v8, -v5, v7, v3
2138; GFX9-NEXT:    v_fma_f32 v7, v8, v6, v7
2139; GFX9-NEXT:    v_fma_f32 v3, -v5, v7, v3
2140; GFX9-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s3
2141; GFX9-NEXT:    v_div_fmas_f32 v3, v3, v6, v7
2142; GFX9-NEXT:    v_div_fixup_f32 v3, v3, v2, v0
2143; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
2144; GFX9-NEXT:    v_fma_f32 v0, -v3, v2, v0
2145; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[4:5]
2146; GFX9-NEXT:    s_endpgm
2147;
2148; GFX10-LABEL: frem_v2f32:
2149; GFX10:       ; %bb.0:
2150; GFX10-NEXT:    s_clause 0x1
2151; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2152; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
2153; GFX10-NEXT:    v_mov_b32_e32 v4, 0
2154; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
2155; GFX10-NEXT:    s_clause 0x1
2156; GFX10-NEXT:    global_load_dwordx2 v[0:1], v4, s[6:7]
2157; GFX10-NEXT:    global_load_dwordx2 v[2:3], v4, s[2:3] offset:32
2158; GFX10-NEXT:    s_waitcnt vmcnt(0)
2159; GFX10-NEXT:    v_div_scale_f32 v6, s0, v3, v3, v1
2160; GFX10-NEXT:    v_div_scale_f32 v5, vcc_lo, v1, v3, v1
2161; GFX10-NEXT:    v_rcp_f32_e32 v7, v6
2162; GFX10-NEXT:    s_denorm_mode 15
2163; GFX10-NEXT:    v_fma_f32 v8, -v6, v7, 1.0
2164; GFX10-NEXT:    v_fmac_f32_e32 v7, v8, v7
2165; GFX10-NEXT:    v_mul_f32_e32 v8, v5, v7
2166; GFX10-NEXT:    v_fma_f32 v9, -v6, v8, v5
2167; GFX10-NEXT:    v_fmac_f32_e32 v8, v9, v7
2168; GFX10-NEXT:    v_fma_f32 v5, -v6, v8, v5
2169; GFX10-NEXT:    s_denorm_mode 12
2170; GFX10-NEXT:    v_div_fmas_f32 v5, v5, v7, v8
2171; GFX10-NEXT:    v_div_fixup_f32 v5, v5, v3, v1
2172; GFX10-NEXT:    v_trunc_f32_e32 v5, v5
2173; GFX10-NEXT:    v_fma_f32 v1, -v5, v3, v1
2174; GFX10-NEXT:    v_div_scale_f32 v5, s0, v2, v2, v0
2175; GFX10-NEXT:    v_div_scale_f32 v3, vcc_lo, v0, v2, v0
2176; GFX10-NEXT:    v_rcp_f32_e32 v6, v5
2177; GFX10-NEXT:    s_denorm_mode 15
2178; GFX10-NEXT:    v_fma_f32 v7, -v5, v6, 1.0
2179; GFX10-NEXT:    v_fmac_f32_e32 v6, v7, v6
2180; GFX10-NEXT:    v_mul_f32_e32 v7, v3, v6
2181; GFX10-NEXT:    v_fma_f32 v8, -v5, v7, v3
2182; GFX10-NEXT:    v_fmac_f32_e32 v7, v8, v6
2183; GFX10-NEXT:    v_fma_f32 v3, -v5, v7, v3
2184; GFX10-NEXT:    s_denorm_mode 12
2185; GFX10-NEXT:    v_div_fmas_f32 v3, v3, v6, v7
2186; GFX10-NEXT:    v_div_fixup_f32 v3, v3, v2, v0
2187; GFX10-NEXT:    v_trunc_f32_e32 v3, v3
2188; GFX10-NEXT:    v_fma_f32 v0, -v3, v2, v0
2189; GFX10-NEXT:    global_store_dwordx2 v4, v[0:1], s[4:5]
2190; GFX10-NEXT:    s_endpgm
2191                        <2 x float> addrspace(1)* %in2) #0 {
; Divisor address: four <2 x float> elements past %in2.
2192   %gep2 = getelementptr <2 x float>, <2 x float> addrspace(1)* %in2, i32 4
2193   %r0 = load <2 x float>, <2 x float> addrspace(1)* %in1, align 8
2194   %r1 = load <2 x float>, <2 x float> addrspace(1)* %gep2, align 8
; Remainder of %r0 by %r1, taken on the whole vector in one instruction.
2195   %r2 = frem <2 x float> %r0, %r1
2196   store <2 x float> %r2, <2 x float> addrspace(1)* %out, align 8
2197   ret void
2198}
2199
; frem on <4 x float>. The kernel loads one vector from %in1 and one from
; %in2 advanced by four vector elements (64 bytes, visible as "offset:64" or an
; explicit add of 64 in the expected output), computes the element-wise
; remainder and stores the result to %out.
;
; For every run configuration the expected code handles the lanes one at a
; time, from lane 3 down to lane 0. Each lane is a full f32 division
; (div_scale, rcp, fma refinement, div_fmas, div_fixup) followed by v_trunc and
; a final fma that forms x - trunc(x / y) * y.
;
; The fma refinement of each lane is bracketed by mode-register writes
; (s_setreg_b32 on HW_REG_MODE, or s_denorm_mode on the gfx1010 run).
; NOTE(review): presumably this temporarily enables f32 denormals, since
; attribute group #0 requests preserve-sign flushing - confirm against the
; backend's division lowering.
2200define amdgpu_kernel void @frem_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in1,
2201; SI-LABEL: frem_v4f32:
2202; SI:       ; %bb.0:
2203; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
2204; SI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xd
2205; SI-NEXT:    s_mov_b32 s3, 0xf000
2206; SI-NEXT:    s_mov_b32 s2, -1
2207; SI-NEXT:    s_waitcnt lgkmcnt(0)
2208; SI-NEXT:    s_mov_b32 s0, s4
2209; SI-NEXT:    s_mov_b32 s1, s5
2210; SI-NEXT:    s_mov_b32 s4, s6
2211; SI-NEXT:    s_mov_b32 s5, s7
2212; SI-NEXT:    s_mov_b32 s6, s2
2213; SI-NEXT:    s_mov_b32 s7, s3
2214; SI-NEXT:    s_mov_b32 s10, s2
2215; SI-NEXT:    s_mov_b32 s11, s3
2216; SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
2217; SI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:64
2218; SI-NEXT:    s_waitcnt vmcnt(0)
2219; SI-NEXT:    v_div_scale_f32 v8, vcc, v3, v7, v3
2220; SI-NEXT:    v_div_scale_f32 v9, s[4:5], v7, v7, v3
2221; SI-NEXT:    v_rcp_f32_e32 v10, v9
2222; SI-NEXT:    s_mov_b32 s6, 3
2223; SI-NEXT:    s_mov_b32 s7, 0
2224; SI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6
2225; SI-NEXT:    v_fma_f32 v11, -v9, v10, 1.0
2226; SI-NEXT:    v_fma_f32 v10, v11, v10, v10
2227; SI-NEXT:    v_mul_f32_e32 v11, v8, v10
2228; SI-NEXT:    v_fma_f32 v12, -v9, v11, v8
2229; SI-NEXT:    v_fma_f32 v11, v12, v10, v11
2230; SI-NEXT:    v_fma_f32 v8, -v9, v11, v8
2231; SI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7
2232; SI-NEXT:    v_div_fmas_f32 v8, v8, v10, v11
2233; SI-NEXT:    v_div_fixup_f32 v8, v8, v7, v3
2234; SI-NEXT:    v_trunc_f32_e32 v8, v8
2235; SI-NEXT:    v_fma_f32 v3, -v8, v7, v3
2236; SI-NEXT:    v_div_scale_f32 v7, vcc, v2, v6, v2
2237; SI-NEXT:    v_div_scale_f32 v8, s[4:5], v6, v6, v2
2238; SI-NEXT:    v_rcp_f32_e32 v9, v8
2239; SI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6
2240; SI-NEXT:    v_fma_f32 v10, -v8, v9, 1.0
2241; SI-NEXT:    v_fma_f32 v9, v10, v9, v9
2242; SI-NEXT:    v_mul_f32_e32 v10, v7, v9
2243; SI-NEXT:    v_fma_f32 v11, -v8, v10, v7
2244; SI-NEXT:    v_fma_f32 v10, v11, v9, v10
2245; SI-NEXT:    v_fma_f32 v7, -v8, v10, v7
2246; SI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7
2247; SI-NEXT:    v_div_fmas_f32 v7, v7, v9, v10
2248; SI-NEXT:    v_div_fixup_f32 v7, v7, v6, v2
2249; SI-NEXT:    v_trunc_f32_e32 v7, v7
2250; SI-NEXT:    v_fma_f32 v2, -v7, v6, v2
2251; SI-NEXT:    v_div_scale_f32 v6, vcc, v1, v5, v1
2252; SI-NEXT:    v_div_scale_f32 v7, s[4:5], v5, v5, v1
2253; SI-NEXT:    v_rcp_f32_e32 v8, v7
2254; SI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6
2255; SI-NEXT:    v_fma_f32 v9, -v7, v8, 1.0
2256; SI-NEXT:    v_fma_f32 v8, v9, v8, v8
2257; SI-NEXT:    v_mul_f32_e32 v9, v6, v8
2258; SI-NEXT:    v_fma_f32 v10, -v7, v9, v6
2259; SI-NEXT:    v_fma_f32 v9, v10, v8, v9
2260; SI-NEXT:    v_fma_f32 v6, -v7, v9, v6
2261; SI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7
2262; SI-NEXT:    v_div_fmas_f32 v6, v6, v8, v9
2263; SI-NEXT:    v_div_fixup_f32 v6, v6, v5, v1
2264; SI-NEXT:    v_trunc_f32_e32 v6, v6
2265; SI-NEXT:    v_fma_f32 v1, -v6, v5, v1
2266; SI-NEXT:    v_div_scale_f32 v5, vcc, v0, v4, v0
2267; SI-NEXT:    v_div_scale_f32 v6, s[4:5], v4, v4, v0
2268; SI-NEXT:    v_rcp_f32_e32 v7, v6
2269; SI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6
2270; SI-NEXT:    v_fma_f32 v8, -v6, v7, 1.0
2271; SI-NEXT:    v_fma_f32 v7, v8, v7, v7
2272; SI-NEXT:    v_mul_f32_e32 v8, v5, v7
2273; SI-NEXT:    v_fma_f32 v9, -v6, v8, v5
2274; SI-NEXT:    v_fma_f32 v8, v9, v7, v8
2275; SI-NEXT:    v_fma_f32 v5, -v6, v8, v5
2276; SI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7
2277; SI-NEXT:    v_div_fmas_f32 v5, v5, v7, v8
2278; SI-NEXT:    v_div_fixup_f32 v5, v5, v4, v0
2279; SI-NEXT:    v_trunc_f32_e32 v5, v5
2280; SI-NEXT:    v_fma_f32 v0, -v5, v4, v0
2281; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
2282; SI-NEXT:    s_endpgm
2283;
2284; CI-LABEL: frem_v4f32:
2285; CI:       ; %bb.0:
2286; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
2287; CI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xd
2288; CI-NEXT:    s_mov_b32 s3, 0xf000
2289; CI-NEXT:    s_mov_b32 s2, -1
2290; CI-NEXT:    s_mov_b32 s10, s2
2291; CI-NEXT:    s_waitcnt lgkmcnt(0)
2292; CI-NEXT:    s_mov_b32 s0, s4
2293; CI-NEXT:    s_mov_b32 s1, s5
2294; CI-NEXT:    s_mov_b32 s4, s6
2295; CI-NEXT:    s_mov_b32 s5, s7
2296; CI-NEXT:    s_mov_b32 s6, s2
2297; CI-NEXT:    s_mov_b32 s7, s3
2298; CI-NEXT:    s_mov_b32 s11, s3
2299; CI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
2300; CI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:64
2301; CI-NEXT:    s_mov_b32 s6, 3
2302; CI-NEXT:    s_mov_b32 s7, 0
2303; CI-NEXT:    s_waitcnt vmcnt(0)
2304; CI-NEXT:    v_div_scale_f32 v9, s[4:5], v7, v7, v3
2305; CI-NEXT:    v_div_scale_f32 v8, vcc, v3, v7, v3
2306; CI-NEXT:    v_rcp_f32_e32 v10, v9
2307; CI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6
2308; CI-NEXT:    v_fma_f32 v11, -v9, v10, 1.0
2309; CI-NEXT:    v_fma_f32 v10, v11, v10, v10
2310; CI-NEXT:    v_mul_f32_e32 v11, v8, v10
2311; CI-NEXT:    v_fma_f32 v12, -v9, v11, v8
2312; CI-NEXT:    v_fma_f32 v11, v12, v10, v11
2313; CI-NEXT:    v_fma_f32 v8, -v9, v11, v8
2314; CI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7
2315; CI-NEXT:    v_div_fmas_f32 v8, v8, v10, v11
2316; CI-NEXT:    v_div_fixup_f32 v8, v8, v7, v3
2317; CI-NEXT:    v_trunc_f32_e32 v8, v8
2318; CI-NEXT:    v_fma_f32 v3, -v8, v7, v3
2319; CI-NEXT:    v_div_scale_f32 v8, s[4:5], v6, v6, v2
2320; CI-NEXT:    v_div_scale_f32 v7, vcc, v2, v6, v2
2321; CI-NEXT:    v_rcp_f32_e32 v9, v8
2322; CI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6
2323; CI-NEXT:    v_fma_f32 v10, -v8, v9, 1.0
2324; CI-NEXT:    v_fma_f32 v9, v10, v9, v9
2325; CI-NEXT:    v_mul_f32_e32 v10, v7, v9
2326; CI-NEXT:    v_fma_f32 v11, -v8, v10, v7
2327; CI-NEXT:    v_fma_f32 v10, v11, v9, v10
2328; CI-NEXT:    v_fma_f32 v7, -v8, v10, v7
2329; CI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7
2330; CI-NEXT:    v_div_fmas_f32 v7, v7, v9, v10
2331; CI-NEXT:    v_div_fixup_f32 v7, v7, v6, v2
2332; CI-NEXT:    v_trunc_f32_e32 v7, v7
2333; CI-NEXT:    v_fma_f32 v2, -v7, v6, v2
2334; CI-NEXT:    v_div_scale_f32 v7, s[4:5], v5, v5, v1
2335; CI-NEXT:    v_div_scale_f32 v6, vcc, v1, v5, v1
2336; CI-NEXT:    v_rcp_f32_e32 v8, v7
2337; CI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6
2338; CI-NEXT:    v_fma_f32 v9, -v7, v8, 1.0
2339; CI-NEXT:    v_fma_f32 v8, v9, v8, v8
2340; CI-NEXT:    v_mul_f32_e32 v9, v6, v8
2341; CI-NEXT:    v_fma_f32 v10, -v7, v9, v6
2342; CI-NEXT:    v_fma_f32 v9, v10, v8, v9
2343; CI-NEXT:    v_fma_f32 v6, -v7, v9, v6
2344; CI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7
2345; CI-NEXT:    v_div_fmas_f32 v6, v6, v8, v9
2346; CI-NEXT:    v_div_fixup_f32 v6, v6, v5, v1
2347; CI-NEXT:    v_trunc_f32_e32 v6, v6
2348; CI-NEXT:    v_fma_f32 v1, -v6, v5, v1
2349; CI-NEXT:    v_div_scale_f32 v6, s[4:5], v4, v4, v0
2350; CI-NEXT:    v_div_scale_f32 v5, vcc, v0, v4, v0
2351; CI-NEXT:    v_rcp_f32_e32 v7, v6
2352; CI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6
2353; CI-NEXT:    v_fma_f32 v8, -v6, v7, 1.0
2354; CI-NEXT:    v_fma_f32 v7, v8, v7, v7
2355; CI-NEXT:    v_mul_f32_e32 v8, v5, v7
2356; CI-NEXT:    v_fma_f32 v9, -v6, v8, v5
2357; CI-NEXT:    v_fma_f32 v8, v9, v7, v8
2358; CI-NEXT:    v_fma_f32 v5, -v6, v8, v5
2359; CI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7
2360; CI-NEXT:    v_div_fmas_f32 v5, v5, v7, v8
2361; CI-NEXT:    v_div_fixup_f32 v5, v5, v4, v0
2362; CI-NEXT:    v_trunc_f32_e32 v5, v5
2363; CI-NEXT:    v_fma_f32 v0, -v5, v4, v0
2364; CI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
2365; CI-NEXT:    s_endpgm
2366;
2367; VI-LABEL: frem_v4f32:
2368; VI:       ; %bb.0:
2369; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2370; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
2371; VI-NEXT:    s_mov_b32 s2, 3
2372; VI-NEXT:    s_mov_b32 s3, 0
2373; VI-NEXT:    s_waitcnt lgkmcnt(0)
2374; VI-NEXT:    v_mov_b32_e32 v0, s6
2375; VI-NEXT:    s_add_u32 s0, s0, 64
2376; VI-NEXT:    s_addc_u32 s1, s1, 0
2377; VI-NEXT:    v_mov_b32_e32 v5, s1
2378; VI-NEXT:    v_mov_b32_e32 v1, s7
2379; VI-NEXT:    v_mov_b32_e32 v4, s0
2380; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
2381; VI-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
2382; VI-NEXT:    v_mov_b32_e32 v8, s4
2383; VI-NEXT:    v_mov_b32_e32 v9, s5
2384; VI-NEXT:    s_waitcnt vmcnt(0)
2385; VI-NEXT:    v_div_scale_f32 v11, s[0:1], v7, v7, v3
2386; VI-NEXT:    v_div_scale_f32 v10, vcc, v3, v7, v3
2387; VI-NEXT:    v_rcp_f32_e32 v12, v11
2388; VI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s2
2389; VI-NEXT:    v_fma_f32 v13, -v11, v12, 1.0
2390; VI-NEXT:    v_fma_f32 v12, v13, v12, v12
2391; VI-NEXT:    v_mul_f32_e32 v13, v10, v12
2392; VI-NEXT:    v_fma_f32 v14, -v11, v13, v10
2393; VI-NEXT:    v_fma_f32 v13, v14, v12, v13
2394; VI-NEXT:    v_fma_f32 v10, -v11, v13, v10
2395; VI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s3
2396; VI-NEXT:    v_div_fmas_f32 v10, v10, v12, v13
2397; VI-NEXT:    v_div_fixup_f32 v10, v10, v7, v3
2398; VI-NEXT:    v_trunc_f32_e32 v10, v10
2399; VI-NEXT:    v_fma_f32 v3, -v10, v7, v3
2400; VI-NEXT:    v_div_scale_f32 v10, s[0:1], v6, v6, v2
2401; VI-NEXT:    v_div_scale_f32 v7, vcc, v2, v6, v2
2402; VI-NEXT:    v_rcp_f32_e32 v11, v10
2403; VI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s2
2404; VI-NEXT:    v_fma_f32 v12, -v10, v11, 1.0
2405; VI-NEXT:    v_fma_f32 v11, v12, v11, v11
2406; VI-NEXT:    v_mul_f32_e32 v12, v7, v11
2407; VI-NEXT:    v_fma_f32 v13, -v10, v12, v7
2408; VI-NEXT:    v_fma_f32 v12, v13, v11, v12
2409; VI-NEXT:    v_fma_f32 v7, -v10, v12, v7
2410; VI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s3
2411; VI-NEXT:    v_div_fmas_f32 v7, v7, v11, v12
2412; VI-NEXT:    v_div_fixup_f32 v7, v7, v6, v2
2413; VI-NEXT:    v_trunc_f32_e32 v7, v7
2414; VI-NEXT:    v_fma_f32 v2, -v7, v6, v2
2415; VI-NEXT:    v_div_scale_f32 v7, s[0:1], v5, v5, v1
2416; VI-NEXT:    v_div_scale_f32 v6, vcc, v1, v5, v1
2417; VI-NEXT:    v_rcp_f32_e32 v10, v7
2418; VI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s2
2419; VI-NEXT:    v_fma_f32 v11, -v7, v10, 1.0
2420; VI-NEXT:    v_fma_f32 v10, v11, v10, v10
2421; VI-NEXT:    v_mul_f32_e32 v11, v6, v10
2422; VI-NEXT:    v_fma_f32 v12, -v7, v11, v6
2423; VI-NEXT:    v_fma_f32 v11, v12, v10, v11
2424; VI-NEXT:    v_fma_f32 v6, -v7, v11, v6
2425; VI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s3
2426; VI-NEXT:    v_div_fmas_f32 v6, v6, v10, v11
2427; VI-NEXT:    v_div_fixup_f32 v6, v6, v5, v1
2428; VI-NEXT:    v_trunc_f32_e32 v6, v6
2429; VI-NEXT:    v_fma_f32 v1, -v6, v5, v1
2430; VI-NEXT:    v_div_scale_f32 v6, s[0:1], v4, v4, v0
2431; VI-NEXT:    v_div_scale_f32 v5, vcc, v0, v4, v0
2432; VI-NEXT:    v_rcp_f32_e32 v7, v6
2433; VI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s2
2434; VI-NEXT:    v_fma_f32 v10, -v6, v7, 1.0
2435; VI-NEXT:    v_fma_f32 v7, v10, v7, v7
2436; VI-NEXT:    v_mul_f32_e32 v10, v5, v7
2437; VI-NEXT:    v_fma_f32 v11, -v6, v10, v5
2438; VI-NEXT:    v_fma_f32 v10, v11, v7, v10
2439; VI-NEXT:    v_fma_f32 v5, -v6, v10, v5
2440; VI-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s3
2441; VI-NEXT:    v_div_fmas_f32 v5, v5, v7, v10
2442; VI-NEXT:    v_div_fixup_f32 v5, v5, v4, v0
2443; VI-NEXT:    v_trunc_f32_e32 v5, v5
2444; VI-NEXT:    v_fma_f32 v0, -v5, v4, v0
2445; VI-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
2446; VI-NEXT:    s_endpgm
2447;
2448; GFX9-LABEL: frem_v4f32:
2449; GFX9:       ; %bb.0:
2450; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2451; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
2452; GFX9-NEXT:    v_mov_b32_e32 v8, 0
2453; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2454; GFX9-NEXT:    global_load_dwordx4 v[0:3], v8, s[6:7]
2455; GFX9-NEXT:    global_load_dwordx4 v[4:7], v8, s[2:3] offset:64
2456; GFX9-NEXT:    s_mov_b32 s2, 3
2457; GFX9-NEXT:    s_mov_b32 s3, 0
2458; GFX9-NEXT:    s_waitcnt vmcnt(0)
2459; GFX9-NEXT:    v_div_scale_f32 v10, s[0:1], v7, v7, v3
2460; GFX9-NEXT:    v_div_scale_f32 v9, vcc, v3, v7, v3
2461; GFX9-NEXT:    v_rcp_f32_e32 v11, v10
2462; GFX9-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s2
2463; GFX9-NEXT:    v_fma_f32 v12, -v10, v11, 1.0
2464; GFX9-NEXT:    v_fma_f32 v11, v12, v11, v11
2465; GFX9-NEXT:    v_mul_f32_e32 v12, v9, v11
2466; GFX9-NEXT:    v_fma_f32 v13, -v10, v12, v9
2467; GFX9-NEXT:    v_fma_f32 v12, v13, v11, v12
2468; GFX9-NEXT:    v_fma_f32 v9, -v10, v12, v9
2469; GFX9-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s3
2470; GFX9-NEXT:    v_div_fmas_f32 v9, v9, v11, v12
2471; GFX9-NEXT:    v_div_fixup_f32 v9, v9, v7, v3
2472; GFX9-NEXT:    v_trunc_f32_e32 v9, v9
2473; GFX9-NEXT:    v_fma_f32 v3, -v9, v7, v3
2474; GFX9-NEXT:    v_div_scale_f32 v9, s[0:1], v6, v6, v2
2475; GFX9-NEXT:    v_div_scale_f32 v7, vcc, v2, v6, v2
2476; GFX9-NEXT:    v_rcp_f32_e32 v10, v9
2477; GFX9-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s2
2478; GFX9-NEXT:    v_fma_f32 v11, -v9, v10, 1.0
2479; GFX9-NEXT:    v_fma_f32 v10, v11, v10, v10
2480; GFX9-NEXT:    v_mul_f32_e32 v11, v7, v10
2481; GFX9-NEXT:    v_fma_f32 v12, -v9, v11, v7
2482; GFX9-NEXT:    v_fma_f32 v11, v12, v10, v11
2483; GFX9-NEXT:    v_fma_f32 v7, -v9, v11, v7
2484; GFX9-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s3
2485; GFX9-NEXT:    v_div_fmas_f32 v7, v7, v10, v11
2486; GFX9-NEXT:    v_div_fixup_f32 v7, v7, v6, v2
2487; GFX9-NEXT:    v_trunc_f32_e32 v7, v7
2488; GFX9-NEXT:    v_fma_f32 v2, -v7, v6, v2
2489; GFX9-NEXT:    v_div_scale_f32 v7, s[0:1], v5, v5, v1
2490; GFX9-NEXT:    v_div_scale_f32 v6, vcc, v1, v5, v1
2491; GFX9-NEXT:    v_rcp_f32_e32 v9, v7
2492; GFX9-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s2
2493; GFX9-NEXT:    v_fma_f32 v10, -v7, v9, 1.0
2494; GFX9-NEXT:    v_fma_f32 v9, v10, v9, v9
2495; GFX9-NEXT:    v_mul_f32_e32 v10, v6, v9
2496; GFX9-NEXT:    v_fma_f32 v11, -v7, v10, v6
2497; GFX9-NEXT:    v_fma_f32 v10, v11, v9, v10
2498; GFX9-NEXT:    v_fma_f32 v6, -v7, v10, v6
2499; GFX9-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s3
2500; GFX9-NEXT:    v_div_fmas_f32 v6, v6, v9, v10
2501; GFX9-NEXT:    v_div_fixup_f32 v6, v6, v5, v1
2502; GFX9-NEXT:    v_trunc_f32_e32 v6, v6
2503; GFX9-NEXT:    v_fma_f32 v1, -v6, v5, v1
2504; GFX9-NEXT:    v_div_scale_f32 v6, s[0:1], v4, v4, v0
2505; GFX9-NEXT:    v_div_scale_f32 v5, vcc, v0, v4, v0
2506; GFX9-NEXT:    v_rcp_f32_e32 v7, v6
2507; GFX9-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s2
2508; GFX9-NEXT:    v_fma_f32 v9, -v6, v7, 1.0
2509; GFX9-NEXT:    v_fma_f32 v7, v9, v7, v7
2510; GFX9-NEXT:    v_mul_f32_e32 v9, v5, v7
2511; GFX9-NEXT:    v_fma_f32 v10, -v6, v9, v5
2512; GFX9-NEXT:    v_fma_f32 v9, v10, v7, v9
2513; GFX9-NEXT:    v_fma_f32 v5, -v6, v9, v5
2514; GFX9-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s3
2515; GFX9-NEXT:    v_div_fmas_f32 v5, v5, v7, v9
2516; GFX9-NEXT:    v_div_fixup_f32 v5, v5, v4, v0
2517; GFX9-NEXT:    v_trunc_f32_e32 v5, v5
2518; GFX9-NEXT:    v_fma_f32 v0, -v5, v4, v0
2519; GFX9-NEXT:    global_store_dwordx4 v8, v[0:3], s[4:5]
2520; GFX9-NEXT:    s_endpgm
2521;
2522; GFX10-LABEL: frem_v4f32:
2523; GFX10:       ; %bb.0:
2524; GFX10-NEXT:    s_clause 0x1
2525; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2526; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
2527; GFX10-NEXT:    v_mov_b32_e32 v8, 0
2528; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
2529; GFX10-NEXT:    s_clause 0x1
2530; GFX10-NEXT:    global_load_dwordx4 v[0:3], v8, s[6:7]
2531; GFX10-NEXT:    global_load_dwordx4 v[4:7], v8, s[2:3] offset:64
2532; GFX10-NEXT:    s_waitcnt vmcnt(0)
2533; GFX10-NEXT:    v_div_scale_f32 v10, s0, v7, v7, v3
2534; GFX10-NEXT:    v_div_scale_f32 v9, vcc_lo, v3, v7, v3
2535; GFX10-NEXT:    v_rcp_f32_e32 v11, v10
2536; GFX10-NEXT:    s_denorm_mode 15
2537; GFX10-NEXT:    v_fma_f32 v12, -v10, v11, 1.0
2538; GFX10-NEXT:    v_fmac_f32_e32 v11, v12, v11
2539; GFX10-NEXT:    v_mul_f32_e32 v12, v9, v11
2540; GFX10-NEXT:    v_fma_f32 v13, -v10, v12, v9
2541; GFX10-NEXT:    v_fmac_f32_e32 v12, v13, v11
2542; GFX10-NEXT:    v_fma_f32 v9, -v10, v12, v9
2543; GFX10-NEXT:    s_denorm_mode 12
2544; GFX10-NEXT:    v_div_fmas_f32 v9, v9, v11, v12
2545; GFX10-NEXT:    v_div_fixup_f32 v9, v9, v7, v3
2546; GFX10-NEXT:    v_trunc_f32_e32 v9, v9
2547; GFX10-NEXT:    v_fma_f32 v3, -v9, v7, v3
2548; GFX10-NEXT:    v_div_scale_f32 v9, s0, v6, v6, v2
2549; GFX10-NEXT:    v_div_scale_f32 v7, vcc_lo, v2, v6, v2
2550; GFX10-NEXT:    v_rcp_f32_e32 v10, v9
2551; GFX10-NEXT:    s_denorm_mode 15
2552; GFX10-NEXT:    v_fma_f32 v11, -v9, v10, 1.0
2553; GFX10-NEXT:    v_fmac_f32_e32 v10, v11, v10
2554; GFX10-NEXT:    v_mul_f32_e32 v11, v7, v10
2555; GFX10-NEXT:    v_fma_f32 v12, -v9, v11, v7
2556; GFX10-NEXT:    v_fmac_f32_e32 v11, v12, v10
2557; GFX10-NEXT:    v_fma_f32 v7, -v9, v11, v7
2558; GFX10-NEXT:    s_denorm_mode 12
2559; GFX10-NEXT:    v_div_fmas_f32 v7, v7, v10, v11
2560; GFX10-NEXT:    v_div_fixup_f32 v7, v7, v6, v2
2561; GFX10-NEXT:    v_trunc_f32_e32 v7, v7
2562; GFX10-NEXT:    v_fma_f32 v2, -v7, v6, v2
2563; GFX10-NEXT:    v_div_scale_f32 v7, s0, v5, v5, v1
2564; GFX10-NEXT:    v_div_scale_f32 v6, vcc_lo, v1, v5, v1
2565; GFX10-NEXT:    v_rcp_f32_e32 v9, v7
2566; GFX10-NEXT:    s_denorm_mode 15
2567; GFX10-NEXT:    v_fma_f32 v10, -v7, v9, 1.0
2568; GFX10-NEXT:    v_fmac_f32_e32 v9, v10, v9
2569; GFX10-NEXT:    v_mul_f32_e32 v10, v6, v9
2570; GFX10-NEXT:    v_fma_f32 v11, -v7, v10, v6
2571; GFX10-NEXT:    v_fmac_f32_e32 v10, v11, v9
2572; GFX10-NEXT:    v_fma_f32 v6, -v7, v10, v6
2573; GFX10-NEXT:    s_denorm_mode 12
2574; GFX10-NEXT:    v_div_fmas_f32 v6, v6, v9, v10
2575; GFX10-NEXT:    v_div_fixup_f32 v6, v6, v5, v1
2576; GFX10-NEXT:    v_trunc_f32_e32 v6, v6
2577; GFX10-NEXT:    v_fma_f32 v1, -v6, v5, v1
2578; GFX10-NEXT:    v_div_scale_f32 v6, s0, v4, v4, v0
2579; GFX10-NEXT:    v_div_scale_f32 v5, vcc_lo, v0, v4, v0
2580; GFX10-NEXT:    v_rcp_f32_e32 v7, v6
2581; GFX10-NEXT:    s_denorm_mode 15
2582; GFX10-NEXT:    v_fma_f32 v9, -v6, v7, 1.0
2583; GFX10-NEXT:    v_fmac_f32_e32 v7, v9, v7
2584; GFX10-NEXT:    v_mul_f32_e32 v9, v5, v7
2585; GFX10-NEXT:    v_fma_f32 v10, -v6, v9, v5
2586; GFX10-NEXT:    v_fmac_f32_e32 v9, v10, v7
2587; GFX10-NEXT:    v_fma_f32 v5, -v6, v9, v5
2588; GFX10-NEXT:    s_denorm_mode 12
2589; GFX10-NEXT:    v_div_fmas_f32 v5, v5, v7, v9
2590; GFX10-NEXT:    v_div_fixup_f32 v5, v5, v4, v0
2591; GFX10-NEXT:    v_trunc_f32_e32 v5, v5
2592; GFX10-NEXT:    v_fma_f32 v0, -v5, v4, v0
2593; GFX10-NEXT:    global_store_dwordx4 v8, v[0:3], s[4:5]
2594; GFX10-NEXT:    s_endpgm
2595                        <4 x float> addrspace(1)* %in2) #0 {
  ; Index 4 is counted in <4 x float> elements (16 bytes each), so the second
  ; operand is read 64 bytes past %in2.
2596   %gep2 = getelementptr <4 x float>, <4 x float> addrspace(1)* %in2, i32 4
2597   %r0 = load <4 x float>, <4 x float> addrspace(1)* %in1, align 16
2598   %r1 = load <4 x float>, <4 x float> addrspace(1)* %gep2, align 16
  ; The operation under test: element-wise remainder of %r0 by %r1.
2599   %r2 = frem <4 x float> %r0, %r1
2600   store <4 x float> %r2, <4 x float> addrspace(1)* %out, align 16
2601   ret void
2602}
2603
; frem on <2 x double>. The kernel loads one vector from %in1 and one from
; %in2 advanced by four vector elements (64 bytes, visible as "offset:64" or an
; explicit add of 64 in the expected output), computes the element-wise
; remainder and stores the result to %out.
;
; The expected code handles lane 1 first, then lane 0. Each lane is an f64
; division (div_scale, rcp, two fma refinement steps, div_fmas, div_fixup)
; followed by a truncation and a final fma forming x - trunc(x / y) * y.
;
; The expectations for the default-target run (first RUN line) differ from
; the others in two visible ways:
;   * there is no v_trunc_f64; truncation is spelled out with integer ops
;     (v_bfe_u32 of the exponent field, a shifted 0xfffff mask, v_cndmask);
;   * VCC for v_div_fmas_f64 is rebuilt with two v_cmp_eq_u32 and an
;     s_xor_b64 rather than taken from the second v_div_scale_f64.
; NOTE(review): both look like target-specific lowerings/workarounds in the
; backend - confirm there before relying on this description.
2604define amdgpu_kernel void @frem_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %in1,
2605; SI-LABEL: frem_v2f64:
2606; SI:       ; %bb.0:
2607; SI-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x9
2608; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
2609; SI-NEXT:    s_mov_b32 s7, 0xf000
2610; SI-NEXT:    s_mov_b32 s6, -1
2611; SI-NEXT:    s_waitcnt lgkmcnt(0)
2612; SI-NEXT:    s_mov_b32 s4, s8
2613; SI-NEXT:    s_mov_b32 s5, s9
2614; SI-NEXT:    s_mov_b32 s8, s10
2615; SI-NEXT:    s_mov_b32 s9, s11
2616; SI-NEXT:    s_mov_b32 s10, s6
2617; SI-NEXT:    s_mov_b32 s11, s7
2618; SI-NEXT:    s_mov_b32 s2, s6
2619; SI-NEXT:    s_mov_b32 s3, s7
2620; SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
2621; SI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:64
2622; SI-NEXT:    s_waitcnt vmcnt(0)
2623; SI-NEXT:    v_div_scale_f64 v[8:9], s[0:1], v[6:7], v[6:7], v[2:3]
2624; SI-NEXT:    v_rcp_f64_e32 v[10:11], v[8:9]
2625; SI-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0
2626; SI-NEXT:    v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
2627; SI-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0
2628; SI-NEXT:    v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
2629; SI-NEXT:    v_div_scale_f64 v[12:13], s[0:1], v[2:3], v[6:7], v[2:3]
2630; SI-NEXT:    v_mul_f64 v[14:15], v[12:13], v[10:11]
2631; SI-NEXT:    v_fma_f64 v[16:17], -v[8:9], v[14:15], v[12:13]
2632; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v9
2633; SI-NEXT:    v_cmp_eq_u32_e64 s[0:1], v3, v13
2634; SI-NEXT:    s_xor_b64 vcc, s[0:1], vcc
2635; SI-NEXT:    s_nop 1
2636; SI-NEXT:    v_div_fmas_f64 v[8:9], v[16:17], v[10:11], v[14:15]
2637; SI-NEXT:    v_div_fixup_f64 v[8:9], v[8:9], v[6:7], v[2:3]
2638; SI-NEXT:    v_bfe_u32 v10, v9, 20, 11
2639; SI-NEXT:    s_movk_i32 s8, 0xfc01
2640; SI-NEXT:    v_add_i32_e32 v12, vcc, s8, v10
2641; SI-NEXT:    s_mov_b32 s3, 0xfffff
2642; SI-NEXT:    v_lshr_b64 v[10:11], s[2:3], v12
2643; SI-NEXT:    v_not_b32_e32 v10, v10
2644; SI-NEXT:    v_and_b32_e32 v10, v8, v10
2645; SI-NEXT:    v_not_b32_e32 v11, v11
2646; SI-NEXT:    v_and_b32_e32 v11, v9, v11
2647; SI-NEXT:    s_brev_b32 s9, 1
2648; SI-NEXT:    v_and_b32_e32 v13, s9, v9
2649; SI-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v12
2650; SI-NEXT:    v_cndmask_b32_e32 v11, v11, v13, vcc
2651; SI-NEXT:    v_cmp_lt_i32_e64 s[0:1], 51, v12
2652; SI-NEXT:    v_cndmask_b32_e64 v9, v11, v9, s[0:1]
2653; SI-NEXT:    v_cndmask_b32_e64 v10, v10, 0, vcc
2654; SI-NEXT:    v_cndmask_b32_e64 v8, v10, v8, s[0:1]
2655; SI-NEXT:    v_fma_f64 v[2:3], -v[8:9], v[6:7], v[2:3]
2656; SI-NEXT:    v_div_scale_f64 v[6:7], s[0:1], v[4:5], v[4:5], v[0:1]
2657; SI-NEXT:    v_rcp_f64_e32 v[8:9], v[6:7]
2658; SI-NEXT:    v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
2659; SI-NEXT:    v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
2660; SI-NEXT:    v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
2661; SI-NEXT:    v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
2662; SI-NEXT:    v_div_scale_f64 v[10:11], s[0:1], v[0:1], v[4:5], v[0:1]
2663; SI-NEXT:    v_mul_f64 v[12:13], v[10:11], v[8:9]
2664; SI-NEXT:    v_fma_f64 v[14:15], -v[6:7], v[12:13], v[10:11]
2665; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v7
2666; SI-NEXT:    v_cmp_eq_u32_e64 s[0:1], v1, v11
2667; SI-NEXT:    s_xor_b64 vcc, s[0:1], vcc
2668; SI-NEXT:    s_nop 1
2669; SI-NEXT:    v_div_fmas_f64 v[6:7], v[14:15], v[8:9], v[12:13]
2670; SI-NEXT:    v_div_fixup_f64 v[6:7], v[6:7], v[4:5], v[0:1]
2671; SI-NEXT:    v_bfe_u32 v8, v7, 20, 11
2672; SI-NEXT:    v_add_i32_e32 v10, vcc, s8, v8
2673; SI-NEXT:    v_lshr_b64 v[8:9], s[2:3], v10
2674; SI-NEXT:    v_not_b32_e32 v8, v8
2675; SI-NEXT:    v_and_b32_e32 v8, v6, v8
2676; SI-NEXT:    v_not_b32_e32 v9, v9
2677; SI-NEXT:    v_and_b32_e32 v9, v7, v9
2678; SI-NEXT:    v_and_b32_e32 v11, s9, v7
2679; SI-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v10
2680; SI-NEXT:    v_cndmask_b32_e32 v9, v9, v11, vcc
2681; SI-NEXT:    v_cmp_lt_i32_e64 s[0:1], 51, v10
2682; SI-NEXT:    v_cndmask_b32_e64 v7, v9, v7, s[0:1]
2683; SI-NEXT:    v_cndmask_b32_e64 v8, v8, 0, vcc
2684; SI-NEXT:    v_cndmask_b32_e64 v6, v8, v6, s[0:1]
2685; SI-NEXT:    v_fma_f64 v[0:1], -v[6:7], v[4:5], v[0:1]
2686; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
2687; SI-NEXT:    s_endpgm
2688;
2689; CI-LABEL: frem_v2f64:
2690; CI:       ; %bb.0:
2691; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
2692; CI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xd
2693; CI-NEXT:    s_mov_b32 s3, 0xf000
2694; CI-NEXT:    s_mov_b32 s2, -1
2695; CI-NEXT:    s_mov_b32 s10, s2
2696; CI-NEXT:    s_waitcnt lgkmcnt(0)
2697; CI-NEXT:    s_mov_b32 s0, s4
2698; CI-NEXT:    s_mov_b32 s1, s5
2699; CI-NEXT:    s_mov_b32 s4, s6
2700; CI-NEXT:    s_mov_b32 s5, s7
2701; CI-NEXT:    s_mov_b32 s6, s2
2702; CI-NEXT:    s_mov_b32 s7, s3
2703; CI-NEXT:    s_mov_b32 s11, s3
2704; CI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
2705; CI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:64
2706; CI-NEXT:    s_waitcnt vmcnt(0)
2707; CI-NEXT:    v_div_scale_f64 v[8:9], s[4:5], v[6:7], v[6:7], v[2:3]
2708; CI-NEXT:    v_rcp_f64_e32 v[10:11], v[8:9]
2709; CI-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0
2710; CI-NEXT:    v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
2711; CI-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0
2712; CI-NEXT:    v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
2713; CI-NEXT:    v_div_scale_f64 v[12:13], vcc, v[2:3], v[6:7], v[2:3]
2714; CI-NEXT:    v_mul_f64 v[14:15], v[12:13], v[10:11]
2715; CI-NEXT:    v_fma_f64 v[8:9], -v[8:9], v[14:15], v[12:13]
2716; CI-NEXT:    s_nop 1
2717; CI-NEXT:    v_div_fmas_f64 v[8:9], v[8:9], v[10:11], v[14:15]
2718; CI-NEXT:    v_div_fixup_f64 v[8:9], v[8:9], v[6:7], v[2:3]
2719; CI-NEXT:    v_trunc_f64_e32 v[8:9], v[8:9]
2720; CI-NEXT:    v_fma_f64 v[2:3], -v[8:9], v[6:7], v[2:3]
2721; CI-NEXT:    v_div_scale_f64 v[6:7], s[4:5], v[4:5], v[4:5], v[0:1]
2722; CI-NEXT:    v_rcp_f64_e32 v[8:9], v[6:7]
2723; CI-NEXT:    v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
2724; CI-NEXT:    v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
2725; CI-NEXT:    v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
2726; CI-NEXT:    v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
2727; CI-NEXT:    v_div_scale_f64 v[10:11], vcc, v[0:1], v[4:5], v[0:1]
2728; CI-NEXT:    v_mul_f64 v[12:13], v[10:11], v[8:9]
2729; CI-NEXT:    v_fma_f64 v[6:7], -v[6:7], v[12:13], v[10:11]
2730; CI-NEXT:    s_nop 1
2731; CI-NEXT:    v_div_fmas_f64 v[6:7], v[6:7], v[8:9], v[12:13]
2732; CI-NEXT:    v_div_fixup_f64 v[6:7], v[6:7], v[4:5], v[0:1]
2733; CI-NEXT:    v_trunc_f64_e32 v[6:7], v[6:7]
2734; CI-NEXT:    v_fma_f64 v[0:1], -v[6:7], v[4:5], v[0:1]
2735; CI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
2736; CI-NEXT:    s_endpgm
2737;
2738; VI-LABEL: frem_v2f64:
2739; VI:       ; %bb.0:
2740; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2741; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
2742; VI-NEXT:    s_waitcnt lgkmcnt(0)
2743; VI-NEXT:    v_mov_b32_e32 v0, s6
2744; VI-NEXT:    s_add_u32 s0, s0, 64
2745; VI-NEXT:    s_addc_u32 s1, s1, 0
2746; VI-NEXT:    v_mov_b32_e32 v5, s1
2747; VI-NEXT:    v_mov_b32_e32 v1, s7
2748; VI-NEXT:    v_mov_b32_e32 v4, s0
2749; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
2750; VI-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
2751; VI-NEXT:    v_mov_b32_e32 v8, s4
2752; VI-NEXT:    v_mov_b32_e32 v9, s5
2753; VI-NEXT:    s_waitcnt vmcnt(0)
2754; VI-NEXT:    v_div_scale_f64 v[10:11], s[0:1], v[6:7], v[6:7], v[2:3]
2755; VI-NEXT:    v_rcp_f64_e32 v[12:13], v[10:11]
2756; VI-NEXT:    v_fma_f64 v[14:15], -v[10:11], v[12:13], 1.0
2757; VI-NEXT:    v_fma_f64 v[12:13], v[12:13], v[14:15], v[12:13]
2758; VI-NEXT:    v_fma_f64 v[14:15], -v[10:11], v[12:13], 1.0
2759; VI-NEXT:    v_fma_f64 v[12:13], v[12:13], v[14:15], v[12:13]
2760; VI-NEXT:    v_div_scale_f64 v[14:15], vcc, v[2:3], v[6:7], v[2:3]
2761; VI-NEXT:    v_mul_f64 v[16:17], v[14:15], v[12:13]
2762; VI-NEXT:    v_fma_f64 v[10:11], -v[10:11], v[16:17], v[14:15]
2763; VI-NEXT:    s_nop 1
2764; VI-NEXT:    v_div_fmas_f64 v[10:11], v[10:11], v[12:13], v[16:17]
2765; VI-NEXT:    v_div_fixup_f64 v[10:11], v[10:11], v[6:7], v[2:3]
2766; VI-NEXT:    v_trunc_f64_e32 v[10:11], v[10:11]
2767; VI-NEXT:    v_fma_f64 v[2:3], -v[10:11], v[6:7], v[2:3]
2768; VI-NEXT:    v_div_scale_f64 v[6:7], s[0:1], v[4:5], v[4:5], v[0:1]
2769; VI-NEXT:    v_rcp_f64_e32 v[10:11], v[6:7]
2770; VI-NEXT:    v_fma_f64 v[12:13], -v[6:7], v[10:11], 1.0
2771; VI-NEXT:    v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
2772; VI-NEXT:    v_fma_f64 v[12:13], -v[6:7], v[10:11], 1.0
2773; VI-NEXT:    v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
2774; VI-NEXT:    v_div_scale_f64 v[12:13], vcc, v[0:1], v[4:5], v[0:1]
2775; VI-NEXT:    v_mul_f64 v[14:15], v[12:13], v[10:11]
2776; VI-NEXT:    v_fma_f64 v[6:7], -v[6:7], v[14:15], v[12:13]
2777; VI-NEXT:    s_nop 1
2778; VI-NEXT:    v_div_fmas_f64 v[6:7], v[6:7], v[10:11], v[14:15]
2779; VI-NEXT:    v_div_fixup_f64 v[6:7], v[6:7], v[4:5], v[0:1]
2780; VI-NEXT:    v_trunc_f64_e32 v[6:7], v[6:7]
2781; VI-NEXT:    v_fma_f64 v[0:1], -v[6:7], v[4:5], v[0:1]
2782; VI-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
2783; VI-NEXT:    s_endpgm
2784;
2785; GFX9-LABEL: frem_v2f64:
2786; GFX9:       ; %bb.0:
2787; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2788; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
2789; GFX9-NEXT:    v_mov_b32_e32 v16, 0
2790; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2791; GFX9-NEXT:    global_load_dwordx4 v[0:3], v16, s[6:7]
2792; GFX9-NEXT:    global_load_dwordx4 v[4:7], v16, s[2:3] offset:64
2793; GFX9-NEXT:    s_waitcnt vmcnt(0)
2794; GFX9-NEXT:    v_div_scale_f64 v[8:9], s[0:1], v[6:7], v[6:7], v[2:3]
2795; GFX9-NEXT:    v_rcp_f64_e32 v[10:11], v[8:9]
2796; GFX9-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0
2797; GFX9-NEXT:    v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
2798; GFX9-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0
2799; GFX9-NEXT:    v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
2800; GFX9-NEXT:    v_div_scale_f64 v[12:13], vcc, v[2:3], v[6:7], v[2:3]
2801; GFX9-NEXT:    v_mul_f64 v[14:15], v[12:13], v[10:11]
2802; GFX9-NEXT:    v_fma_f64 v[8:9], -v[8:9], v[14:15], v[12:13]
2803; GFX9-NEXT:    s_nop 1
2804; GFX9-NEXT:    v_div_fmas_f64 v[8:9], v[8:9], v[10:11], v[14:15]
2805; GFX9-NEXT:    v_div_fixup_f64 v[8:9], v[8:9], v[6:7], v[2:3]
2806; GFX9-NEXT:    v_trunc_f64_e32 v[8:9], v[8:9]
2807; GFX9-NEXT:    v_fma_f64 v[2:3], -v[8:9], v[6:7], v[2:3]
2808; GFX9-NEXT:    v_div_scale_f64 v[6:7], s[0:1], v[4:5], v[4:5], v[0:1]
2809; GFX9-NEXT:    v_rcp_f64_e32 v[8:9], v[6:7]
2810; GFX9-NEXT:    v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
2811; GFX9-NEXT:    v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
2812; GFX9-NEXT:    v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
2813; GFX9-NEXT:    v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
2814; GFX9-NEXT:    v_div_scale_f64 v[10:11], vcc, v[0:1], v[4:5], v[0:1]
2815; GFX9-NEXT:    v_mul_f64 v[12:13], v[10:11], v[8:9]
2816; GFX9-NEXT:    v_fma_f64 v[6:7], -v[6:7], v[12:13], v[10:11]
2817; GFX9-NEXT:    s_nop 1
2818; GFX9-NEXT:    v_div_fmas_f64 v[6:7], v[6:7], v[8:9], v[12:13]
2819; GFX9-NEXT:    v_div_fixup_f64 v[6:7], v[6:7], v[4:5], v[0:1]
2820; GFX9-NEXT:    v_trunc_f64_e32 v[6:7], v[6:7]
2821; GFX9-NEXT:    v_fma_f64 v[0:1], -v[6:7], v[4:5], v[0:1]
2822; GFX9-NEXT:    global_store_dwordx4 v16, v[0:3], s[4:5]
2823; GFX9-NEXT:    s_endpgm
2824;
2825; GFX10-LABEL: frem_v2f64:
2826; GFX10:       ; %bb.0:
2827; GFX10-NEXT:    s_clause 0x1
2828; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2829; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
2830; GFX10-NEXT:    v_mov_b32_e32 v16, 0
2831; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
2832; GFX10-NEXT:    s_clause 0x1
2833; GFX10-NEXT:    global_load_dwordx4 v[0:3], v16, s[6:7]
2834; GFX10-NEXT:    global_load_dwordx4 v[4:7], v16, s[2:3] offset:64
2835; GFX10-NEXT:    s_waitcnt vmcnt(0)
2836; GFX10-NEXT:    v_div_scale_f64 v[8:9], s0, v[6:7], v[6:7], v[2:3]
2837; GFX10-NEXT:    v_rcp_f64_e32 v[10:11], v[8:9]
2838; GFX10-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0
2839; GFX10-NEXT:    v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
2840; GFX10-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0
2841; GFX10-NEXT:    v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
2842; GFX10-NEXT:    v_div_scale_f64 v[12:13], vcc_lo, v[2:3], v[6:7], v[2:3]
2843; GFX10-NEXT:    v_mul_f64 v[14:15], v[12:13], v[10:11]
2844; GFX10-NEXT:    v_fma_f64 v[8:9], -v[8:9], v[14:15], v[12:13]
2845; GFX10-NEXT:    v_div_fmas_f64 v[8:9], v[8:9], v[10:11], v[14:15]
2846; GFX10-NEXT:    v_div_fixup_f64 v[8:9], v[8:9], v[6:7], v[2:3]
2847; GFX10-NEXT:    v_trunc_f64_e32 v[8:9], v[8:9]
2848; GFX10-NEXT:    v_fma_f64 v[2:3], -v[8:9], v[6:7], v[2:3]
2849; GFX10-NEXT:    v_div_scale_f64 v[6:7], s0, v[4:5], v[4:5], v[0:1]
2850; GFX10-NEXT:    v_rcp_f64_e32 v[8:9], v[6:7]
2851; GFX10-NEXT:    v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
2852; GFX10-NEXT:    v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
2853; GFX10-NEXT:    v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
2854; GFX10-NEXT:    v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
2855; GFX10-NEXT:    v_div_scale_f64 v[10:11], vcc_lo, v[0:1], v[4:5], v[0:1]
2856; GFX10-NEXT:    v_mul_f64 v[12:13], v[10:11], v[8:9]
2857; GFX10-NEXT:    v_fma_f64 v[6:7], -v[6:7], v[12:13], v[10:11]
2858; GFX10-NEXT:    v_div_fmas_f64 v[6:7], v[6:7], v[8:9], v[12:13]
2859; GFX10-NEXT:    v_div_fixup_f64 v[6:7], v[6:7], v[4:5], v[0:1]
2860; GFX10-NEXT:    v_trunc_f64_e32 v[6:7], v[6:7]
2861; GFX10-NEXT:    v_fma_f64 v[0:1], -v[6:7], v[4:5], v[0:1]
2862; GFX10-NEXT:    global_store_dwordx4 v16, v[0:3], s[4:5]
2863; GFX10-NEXT:    s_endpgm
2864                        <2 x double> addrspace(1)* %in2) #0 {
  ; Index 4 is counted in <2 x double> elements (16 bytes each), so the second
  ; operand is read 64 bytes past %in2.
2865   %gep2 = getelementptr <2 x double>, <2 x double> addrspace(1)* %in2, i32 4
2866   %r0 = load <2 x double>, <2 x double> addrspace(1)* %in1, align 16
2867   %r1 = load <2 x double>, <2 x double> addrspace(1)* %gep2, align 16
  ; The operation under test: element-wise remainder of %r0 by %r1.
2868   %r2 = frem <2 x double> %r0, %r1
2869   store <2 x double> %r2, <2 x double> addrspace(1)* %out, align 16
2870   ret void
2871}
2872
; Attribute groups shared by the kernels in this file. Both request nounwind
; and the "preserve-sign,preserve-sign" f32 denormal mode; they differ only in
; "unsafe-fp-math" (#0 = false, #1 = true). The frem_v4f32 and frem_v2f64
; kernels use #0.
; NOTE(review): #1 is presumably used by the unsafe-math kernel variants
; elsewhere in the file - verify before removing or changing it.
2873attributes #0 = { nounwind "unsafe-fp-math"="false" "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
2874attributes #1 = { nounwind "unsafe-fp-math"="true" "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
2875