; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=SI %s
; RUN: llc -march=amdgcn -mcpu=hawaii < %s | FileCheck -check-prefix=CI %s

; Scalar f64 round. SI has no v_trunc_f64, so truncation is expanded with
; exponent extraction (s_bfe_u32) and mantissa masking; CI uses
; v_trunc_f64_e32 directly. Both then add +/-1.0 (sign copied from the input
; via v_bfi_b32) when |x - trunc(x)| >= 0.5.
define amdgpu_kernel void @round_f64(double addrspace(1)* %out, double %x) #0 {
; SI-LABEL: round_f64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s10, -1
; SI-NEXT:    s_mov_b32 s1, 0xfffff
; SI-NEXT:    v_mov_b32_e32 v4, 0x3ff00000
; SI-NEXT:    s_mov_b32 s11, 0xf000
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_bfe_u32 s0, s7, 0xb0014
; SI-NEXT:    s_mov_b32 s8, s4
; SI-NEXT:    s_add_i32 s4, s0, 0xfffffc01
; SI-NEXT:    s_mov_b32 s0, s10
; SI-NEXT:    s_lshr_b64 s[0:1], s[0:1], s4
; SI-NEXT:    s_andn2_b64 s[2:3], s[6:7], s[0:1]
; SI-NEXT:    s_and_b32 s0, s7, 0x80000000
; SI-NEXT:    s_cmp_lt_i32 s4, 0
; SI-NEXT:    v_mov_b32_e32 v0, s3
; SI-NEXT:    v_mov_b32_e32 v1, s0
; SI-NEXT:    s_cselect_b64 vcc, -1, 0
; SI-NEXT:    s_cmp_gt_i32 s4, 51
; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
; SI-NEXT:    v_mov_b32_e32 v1, s7
; SI-NEXT:    s_cselect_b64 s[0:1], -1, 0
; SI-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[0:1]
; SI-NEXT:    v_mov_b32_e32 v0, s2
; SI-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
; SI-NEXT:    v_mov_b32_e32 v2, s6
; SI-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
; SI-NEXT:    v_add_f64 v[2:3], s[6:7], -v[0:1]
; SI-NEXT:    s_brev_b32 s0, -2
; SI-NEXT:    v_mov_b32_e32 v5, s7
; SI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[2:3]|, 0.5
; SI-NEXT:    v_bfi_b32 v4, s0, v4, v5
; SI-NEXT:    v_cndmask_b32_e32 v3, 0, v4, vcc
; SI-NEXT:    v_mov_b32_e32 v2, 0
; SI-NEXT:    v_add_f64 v[0:1], v[0:1], v[2:3]
; SI-NEXT:    s_mov_b32 s9, s5
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
; SI-NEXT:    s_endpgm
;
; CI-LABEL: round_f64:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; CI-NEXT:    s_brev_b32 s5, -2
; CI-NEXT:    v_mov_b32_e32 v4, 0x3ff00000
; CI-NEXT:    s_mov_b32 s7, 0xf000
; CI-NEXT:    s_mov_b32 s6, -1
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    v_trunc_f64_e32 v[0:1], s[2:3]
; CI-NEXT:    v_mov_b32_e32 v5, s3
; CI-NEXT:    v_add_f64 v[2:3], s[2:3], -v[0:1]
; CI-NEXT:    v_bfi_b32 v4, s5, v4, v5
; CI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[2:3]|, 0.5
; CI-NEXT:    v_mov_b32_e32 v2, 0
; CI-NEXT:    v_cndmask_b32_e32 v3, 0, v4, vcc
; CI-NEXT:    v_add_f64 v[0:1], v[0:1], v[2:3]
; CI-NEXT:    s_mov_b32 s4, s0
; CI-NEXT:    s_mov_b32 s5, s1
; CI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; CI-NEXT:    s_endpgm
  %result = call double @llvm.round.f64(double %x) #1
  store double %result, double addrspace(1)* %out
  ret void
}
; Per-lane f64 round: the operand is loaded from memory at an address
; indexed by the workitem id, so the round expansion runs entirely in
; VALU instructions (v_bfe_u32/v_lshr_b64 on SI, v_trunc_f64 on CI).
define amdgpu_kernel void @v_round_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
; SI-LABEL: v_round_f64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
; SI-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_movk_i32 s4, 0xfc01
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_mov_b32 s3, 0xfffff
; SI-NEXT:    s_brev_b32 s5, -2
; SI-NEXT:    v_mov_b32_e32 v8, 0x3ff00000
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_bfe_u32 v4, v3, 20, 11
; SI-NEXT:    v_add_i32_e32 v6, vcc, s4, v4
; SI-NEXT:    v_lshr_b64 v[4:5], s[2:3], v6
; SI-NEXT:    v_and_b32_e32 v7, 0x80000000, v3
; SI-NEXT:    v_not_b32_e32 v5, v5
; SI-NEXT:    v_not_b32_e32 v4, v4
; SI-NEXT:    v_and_b32_e32 v5, v3, v5
; SI-NEXT:    v_and_b32_e32 v4, v2, v4
; SI-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v6
; SI-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc
; SI-NEXT:    v_cndmask_b32_e64 v4, v4, 0, vcc
; SI-NEXT:    v_cmp_lt_i32_e32 vcc, 51, v6
; SI-NEXT:    v_cndmask_b32_e32 v5, v5, v3, vcc
; SI-NEXT:    v_cndmask_b32_e32 v4, v4, v2, vcc
; SI-NEXT:    v_add_f64 v[6:7], v[2:3], -v[4:5]
; SI-NEXT:    v_bfi_b32 v2, s5, v8, v3
; SI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[6:7]|, 0.5
; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
; SI-NEXT:    v_cndmask_b32_e32 v3, 0, v2, vcc
; SI-NEXT:    v_mov_b32_e32 v2, 0
; SI-NEXT:    v_add_f64 v[2:3], v[4:5], v[2:3]
; SI-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_endpgm
;
; CI-LABEL: v_round_f64:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; CI-NEXT:    s_mov_b32 s7, 0xf000
; CI-NEXT:    s_mov_b32 s6, 0
; CI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; CI-NEXT:    v_mov_b32_e32 v1, 0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    s_mov_b64 s[4:5], s[2:3]
; CI-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
; CI-NEXT:    s_brev_b32 s2, -2
; CI-NEXT:    v_mov_b32_e32 v8, 0x3ff00000
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    v_trunc_f64_e32 v[4:5], v[2:3]
; CI-NEXT:    v_add_f64 v[6:7], v[2:3], -v[4:5]
; CI-NEXT:    v_bfi_b32 v2, s2, v8, v3
; CI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[6:7]|, 0.5
; CI-NEXT:    s_mov_b64 s[2:3], s[6:7]
; CI-NEXT:    v_cndmask_b32_e32 v3, 0, v2, vcc
; CI-NEXT:    v_mov_b32_e32 v2, 0
; CI-NEXT:    v_add_f64 v[2:3], v[4:5], v[2:3]
; CI-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
; CI-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %gep = getelementptr double, double addrspace(1)* %in, i32 %tid
  %out.gep = getelementptr double, double addrspace(1)* %out, i32 %tid
  %x = load double, double addrspace(1)* %gep
  %result = call double @llvm.round.f64(double %x) #1
  store double %result, double addrspace(1)* %out.gep
  ret void
}
; <2 x double> round: checks that the scalar expansion is duplicated per
; element and that shared constants (0x7fffffff mask, 1.0 high word) are
; materialized once and reused across both elements.
define amdgpu_kernel void @round_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) #0 {
; SI-LABEL: round_v2f64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0xd
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_movk_i32 s7, 0xfc01
; SI-NEXT:    s_mov_b32 s3, 0xfffff
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_bfe_u32 s0, s11, 0xb0014
; SI-NEXT:    s_add_i32 s14, s0, s7
; SI-NEXT:    s_mov_b32 s2, s6
; SI-NEXT:    s_lshr_b64 s[0:1], s[2:3], s14
; SI-NEXT:    s_brev_b32 s15, 1
; SI-NEXT:    s_andn2_b64 s[12:13], s[10:11], s[0:1]
; SI-NEXT:    s_and_b32 s0, s11, s15
; SI-NEXT:    s_cmp_lt_i32 s14, 0
; SI-NEXT:    v_mov_b32_e32 v0, s13
; SI-NEXT:    v_mov_b32_e32 v1, s0
; SI-NEXT:    s_cselect_b64 vcc, -1, 0
; SI-NEXT:    s_cmp_gt_i32 s14, 51
; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
; SI-NEXT:    v_mov_b32_e32 v1, s11
; SI-NEXT:    s_cselect_b64 s[0:1], -1, 0
; SI-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[0:1]
; SI-NEXT:    v_mov_b32_e32 v0, s12
; SI-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
; SI-NEXT:    v_mov_b32_e32 v2, s10
; SI-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
; SI-NEXT:    v_add_f64 v[2:3], s[10:11], -v[0:1]
; SI-NEXT:    s_bfe_u32 s0, s9, 0xb0014
; SI-NEXT:    s_add_i32 s7, s0, s7
; SI-NEXT:    s_brev_b32 s10, -2
; SI-NEXT:    v_mov_b32_e32 v6, 0x3ff00000
; SI-NEXT:    v_mov_b32_e32 v4, s11
; SI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[2:3]|, 0.5
; SI-NEXT:    s_lshr_b64 s[0:1], s[2:3], s7
; SI-NEXT:    v_bfi_b32 v4, s10, v6, v4
; SI-NEXT:    s_andn2_b64 s[2:3], s[8:9], s[0:1]
; SI-NEXT:    s_and_b32 s0, s9, s15
; SI-NEXT:    v_cndmask_b32_e32 v3, 0, v4, vcc
; SI-NEXT:    v_mov_b32_e32 v2, 0
; SI-NEXT:    s_cmp_lt_i32 s7, 0
; SI-NEXT:    v_add_f64 v[2:3], v[0:1], v[2:3]
; SI-NEXT:    v_mov_b32_e32 v0, s3
; SI-NEXT:    v_mov_b32_e32 v1, s0
; SI-NEXT:    s_cselect_b64 vcc, -1, 0
; SI-NEXT:    s_cmp_gt_i32 s7, 51
; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
; SI-NEXT:    v_mov_b32_e32 v1, s9
; SI-NEXT:    s_cselect_b64 s[0:1], -1, 0
; SI-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[0:1]
; SI-NEXT:    v_mov_b32_e32 v0, s2
; SI-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
; SI-NEXT:    v_mov_b32_e32 v4, s8
; SI-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[0:1]
; SI-NEXT:    v_add_f64 v[4:5], s[8:9], -v[0:1]
; SI-NEXT:    v_mov_b32_e32 v7, s9
; SI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5
; SI-NEXT:    v_bfi_b32 v6, s10, v6, v7
; SI-NEXT:    v_cndmask_b32_e32 v5, 0, v6, vcc
; SI-NEXT:    v_mov_b32_e32 v4, 0
; SI-NEXT:    v_add_f64 v[0:1], v[0:1], v[4:5]
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; CI-LABEL: round_v2f64:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xd
; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; CI-NEXT:    s_brev_b32 s2, -2
; CI-NEXT:    v_mov_b32_e32 v6, 0x3ff00000
; CI-NEXT:    s_mov_b32 s3, 0xf000
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    v_trunc_f64_e32 v[0:1], s[6:7]
; CI-NEXT:    v_mov_b32_e32 v4, s7
; CI-NEXT:    v_add_f64 v[2:3], s[6:7], -v[0:1]
; CI-NEXT:    v_bfi_b32 v4, s2, v6, v4
; CI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[2:3]|, 0.5
; CI-NEXT:    v_mov_b32_e32 v2, 0
; CI-NEXT:    v_cndmask_b32_e32 v3, 0, v4, vcc
; CI-NEXT:    v_trunc_f64_e32 v[4:5], s[4:5]
; CI-NEXT:    v_add_f64 v[2:3], v[0:1], v[2:3]
; CI-NEXT:    v_add_f64 v[0:1], s[4:5], -v[4:5]
; CI-NEXT:    v_mov_b32_e32 v7, s5
; CI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[0:1]|, 0.5
; CI-NEXT:    v_bfi_b32 v6, s2, v6, v7
; CI-NEXT:    v_cndmask_b32_e32 v1, 0, v6, vcc
; CI-NEXT:    v_mov_b32_e32 v0, 0
; CI-NEXT:    v_add_f64 v[0:1], v[4:5], v[0:1]
; CI-NEXT:    s_mov_b32 s2, -1
; CI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; CI-NEXT:    s_endpgm
  %result = call <2 x double> @llvm.round.v2f64(<2 x double> %in) #1
  store <2 x double> %result, <2 x double> addrspace(1)* %out
  ret void
}
; <4 x double> round: same per-element expansion, stored as two dwordx4
; writes (offset:16 for the high pair).
define amdgpu_kernel void @round_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) #0 {
; SI-LABEL: round_v4f64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x11
; SI-NEXT:    s_mov_b32 s14, -1
; SI-NEXT:    s_movk_i32 s18, 0xfc01
; SI-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xfffff
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_bfe_u32 s0, s7, 0xb0014
; SI-NEXT:    s_add_i32 s19, s0, s18
; SI-NEXT:    s_mov_b32 s2, s14
; SI-NEXT:    s_lshr_b64 s[0:1], s[2:3], s19
; SI-NEXT:    s_brev_b32 s20, 1
; SI-NEXT:    s_andn2_b64 s[16:17], s[6:7], s[0:1]
; SI-NEXT:    s_and_b32 s0, s7, s20
; SI-NEXT:    s_cmp_lt_i32 s19, 0
; SI-NEXT:    v_mov_b32_e32 v0, s17
; SI-NEXT:    v_mov_b32_e32 v1, s0
; SI-NEXT:    s_cselect_b64 vcc, -1, 0
; SI-NEXT:    s_cmp_gt_i32 s19, 51
; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
; SI-NEXT:    v_mov_b32_e32 v1, s7
; SI-NEXT:    s_cselect_b64 s[0:1], -1, 0
; SI-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[0:1]
; SI-NEXT:    v_mov_b32_e32 v0, s16
; SI-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
; SI-NEXT:    v_mov_b32_e32 v2, s6
; SI-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
; SI-NEXT:    v_add_f64 v[2:3], s[6:7], -v[0:1]
; SI-NEXT:    s_bfe_u32 s0, s5, 0xb0014
; SI-NEXT:    s_add_i32 s17, s0, s18
; SI-NEXT:    s_brev_b32 s16, -2
; SI-NEXT:    v_mov_b32_e32 v12, 0x3ff00000
; SI-NEXT:    v_mov_b32_e32 v4, s7
; SI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[2:3]|, 0.5
; SI-NEXT:    s_lshr_b64 s[0:1], s[2:3], s17
; SI-NEXT:    v_bfi_b32 v4, s16, v12, v4
; SI-NEXT:    s_andn2_b64 s[6:7], s[4:5], s[0:1]
; SI-NEXT:    s_and_b32 s0, s5, s20
; SI-NEXT:    v_cndmask_b32_e32 v3, 0, v4, vcc
; SI-NEXT:    v_mov_b32_e32 v2, 0
; SI-NEXT:    s_cmp_lt_i32 s17, 0
; SI-NEXT:    v_add_f64 v[2:3], v[0:1], v[2:3]
; SI-NEXT:    v_mov_b32_e32 v0, s7
; SI-NEXT:    v_mov_b32_e32 v1, s0
; SI-NEXT:    s_cselect_b64 vcc, -1, 0
; SI-NEXT:    s_cmp_gt_i32 s17, 51
; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
; SI-NEXT:    v_mov_b32_e32 v1, s5
; SI-NEXT:    s_cselect_b64 s[0:1], -1, 0
; SI-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[0:1]
; SI-NEXT:    v_mov_b32_e32 v0, s6
; SI-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
; SI-NEXT:    v_mov_b32_e32 v4, s4
; SI-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[0:1]
; SI-NEXT:    s_bfe_u32 s0, s11, 0xb0014
; SI-NEXT:    v_add_f64 v[4:5], s[4:5], -v[0:1]
; SI-NEXT:    s_add_i32 s6, s0, s18
; SI-NEXT:    s_lshr_b64 s[0:1], s[2:3], s6
; SI-NEXT:    v_mov_b32_e32 v6, s5
; SI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5
; SI-NEXT:    s_andn2_b64 s[4:5], s[10:11], s[0:1]
; SI-NEXT:    s_and_b32 s0, s11, s20
; SI-NEXT:    v_bfi_b32 v6, s16, v12, v6
; SI-NEXT:    s_cmp_lt_i32 s6, 0
; SI-NEXT:    v_cndmask_b32_e32 v9, 0, v6, vcc
; SI-NEXT:    v_mov_b32_e32 v4, s5
; SI-NEXT:    v_mov_b32_e32 v5, s0
; SI-NEXT:    s_cselect_b64 vcc, -1, 0
; SI-NEXT:    s_cmp_gt_i32 s6, 51
; SI-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
; SI-NEXT:    v_mov_b32_e32 v5, s11
; SI-NEXT:    s_cselect_b64 s[0:1], -1, 0
; SI-NEXT:    v_cndmask_b32_e64 v5, v4, v5, s[0:1]
; SI-NEXT:    v_mov_b32_e32 v4, s4
; SI-NEXT:    v_cndmask_b32_e64 v4, v4, 0, vcc
; SI-NEXT:    v_mov_b32_e32 v6, s10
; SI-NEXT:    v_cndmask_b32_e64 v4, v4, v6, s[0:1]
; SI-NEXT:    v_add_f64 v[6:7], s[10:11], -v[4:5]
; SI-NEXT:    s_bfe_u32 s0, s9, 0xb0014
; SI-NEXT:    s_add_i32 s4, s0, s18
; SI-NEXT:    v_mov_b32_e32 v10, s11
; SI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[6:7]|, 0.5
; SI-NEXT:    s_lshr_b64 s[0:1], s[2:3], s4
; SI-NEXT:    v_bfi_b32 v10, s16, v12, v10
; SI-NEXT:    s_andn2_b64 s[2:3], s[8:9], s[0:1]
; SI-NEXT:    s_and_b32 s0, s9, s20
; SI-NEXT:    v_cndmask_b32_e32 v7, 0, v10, vcc
; SI-NEXT:    v_mov_b32_e32 v6, 0
; SI-NEXT:    s_cmp_lt_i32 s4, 0
; SI-NEXT:    v_add_f64 v[6:7], v[4:5], v[6:7]
; SI-NEXT:    v_mov_b32_e32 v4, s3
; SI-NEXT:    v_mov_b32_e32 v5, s0
; SI-NEXT:    s_cselect_b64 vcc, -1, 0
; SI-NEXT:    s_cmp_gt_i32 s4, 51
; SI-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
; SI-NEXT:    v_mov_b32_e32 v5, s9
; SI-NEXT:    s_cselect_b64 s[0:1], -1, 0
; SI-NEXT:    v_cndmask_b32_e64 v5, v4, v5, s[0:1]
; SI-NEXT:    v_mov_b32_e32 v4, s2
; SI-NEXT:    v_cndmask_b32_e64 v4, v4, 0, vcc
; SI-NEXT:    v_mov_b32_e32 v10, s8
; SI-NEXT:    v_cndmask_b32_e64 v4, v4, v10, s[0:1]
; SI-NEXT:    v_add_f64 v[10:11], s[8:9], -v[4:5]
; SI-NEXT:    v_mov_b32_e32 v13, s9
; SI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[10:11]|, 0.5
; SI-NEXT:    v_bfi_b32 v12, s16, v12, v13
; SI-NEXT:    v_cndmask_b32_e32 v11, 0, v12, vcc
; SI-NEXT:    v_mov_b32_e32 v10, 0
; SI-NEXT:    v_mov_b32_e32 v8, 0
; SI-NEXT:    v_add_f64 v[4:5], v[4:5], v[10:11]
; SI-NEXT:    s_mov_b32 s15, 0xf000
; SI-NEXT:    v_add_f64 v[0:1], v[0:1], v[8:9]
; SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[12:15], 0 offset:16
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[12:15], 0
; SI-NEXT:    s_endpgm
;
; CI-LABEL: round_v4f64:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x11
; CI-NEXT:    s_brev_b32 s12, -2
; CI-NEXT:    v_mov_b32_e32 v12, 0x3ff00000
; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; CI-NEXT:    s_mov_b32 s3, 0xf000
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    v_trunc_f64_e32 v[0:1], s[6:7]
; CI-NEXT:    v_mov_b32_e32 v4, s7
; CI-NEXT:    v_add_f64 v[2:3], s[6:7], -v[0:1]
; CI-NEXT:    v_bfi_b32 v4, s12, v12, v4
; CI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[2:3]|, 0.5
; CI-NEXT:    v_trunc_f64_e32 v[8:9], s[4:5]
; CI-NEXT:    v_cndmask_b32_e32 v3, 0, v4, vcc
; CI-NEXT:    v_mov_b32_e32 v2, 0
; CI-NEXT:    v_add_f64 v[2:3], v[0:1], v[2:3]
; CI-NEXT:    v_add_f64 v[0:1], s[4:5], -v[8:9]
; CI-NEXT:    v_mov_b32_e32 v4, s5
; CI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[0:1]|, 0.5
; CI-NEXT:    v_bfi_b32 v4, s12, v12, v4
; CI-NEXT:    v_cndmask_b32_e32 v1, 0, v4, vcc
; CI-NEXT:    v_trunc_f64_e32 v[4:5], s[10:11]
; CI-NEXT:    v_mov_b32_e32 v10, s11
; CI-NEXT:    v_add_f64 v[6:7], s[10:11], -v[4:5]
; CI-NEXT:    v_bfi_b32 v10, s12, v12, v10
; CI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[6:7]|, 0.5
; CI-NEXT:    v_mov_b32_e32 v6, 0
; CI-NEXT:    v_cndmask_b32_e32 v7, 0, v10, vcc
; CI-NEXT:    v_trunc_f64_e32 v[10:11], s[8:9]
; CI-NEXT:    v_add_f64 v[6:7], v[4:5], v[6:7]
; CI-NEXT:    v_add_f64 v[4:5], s[8:9], -v[10:11]
; CI-NEXT:    v_mov_b32_e32 v13, s9
; CI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5
; CI-NEXT:    v_bfi_b32 v12, s12, v12, v13
; CI-NEXT:    v_cndmask_b32_e32 v5, 0, v12, vcc
; CI-NEXT:    v_mov_b32_e32 v4, 0
; CI-NEXT:    v_mov_b32_e32 v0, 0
; CI-NEXT:    v_add_f64 v[4:5], v[10:11], v[4:5]
; CI-NEXT:    s_mov_b32 s2, -1
; CI-NEXT:    v_add_f64 v[0:1], v[8:9], v[0:1]
; CI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; CI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; CI-NEXT:    s_endpgm
  %result = call <4 x double> @llvm.round.v4f64(<4 x double> %in) #1
  store <4 x double> %result, <4 x double> addrspace(1)* %out
  ret void
}
; <8 x double> round: stresses register pressure of the expansion; result
; is written as four dwordx4 stores (offsets 48/32/16/0).
define amdgpu_kernel void @round_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %in) #0 {
; SI-LABEL: round_v8f64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx16 s[4:19], s[0:1], 0x19
; SI-NEXT:    s_mov_b32 s22, -1
; SI-NEXT:    s_movk_i32 s28, 0xfc01
; SI-NEXT:    s_mov_b32 s21, 0xfffff
; SI-NEXT:    s_mov_b32 s20, s22
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_bfe_u32 s2, s7, 0xb0014
; SI-NEXT:    s_add_i32 s23, s2, s28
; SI-NEXT:    s_lshr_b64 s[2:3], s[20:21], s23
; SI-NEXT:    s_brev_b32 s29, 1
; SI-NEXT:    s_andn2_b64 s[24:25], s[6:7], s[2:3]
; SI-NEXT:    s_and_b32 s2, s7, s29
; SI-NEXT:    s_cmp_lt_i32 s23, 0
; SI-NEXT:    v_mov_b32_e32 v0, s25
; SI-NEXT:    v_mov_b32_e32 v1, s2
; SI-NEXT:    s_cselect_b64 vcc, -1, 0
; SI-NEXT:    s_cmp_gt_i32 s23, 51
; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
; SI-NEXT:    v_mov_b32_e32 v1, s7
; SI-NEXT:    s_cselect_b64 s[2:3], -1, 0
; SI-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[2:3]
; SI-NEXT:    v_mov_b32_e32 v0, s24
; SI-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
; SI-NEXT:    v_mov_b32_e32 v2, s6
; SI-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[2:3]
; SI-NEXT:    v_add_f64 v[2:3], s[6:7], -v[0:1]
; SI-NEXT:    s_bfe_u32 s2, s5, 0xb0014
; SI-NEXT:    s_add_i32 s24, s2, s28
; SI-NEXT:    s_brev_b32 s23, -2
; SI-NEXT:    v_mov_b32_e32 v8, 0x3ff00000
; SI-NEXT:    v_mov_b32_e32 v4, s7
; SI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[2:3]|, 0.5
; SI-NEXT:    s_lshr_b64 s[2:3], s[20:21], s24
; SI-NEXT:    v_bfi_b32 v4, s23, v8, v4
; SI-NEXT:    s_andn2_b64 s[6:7], s[4:5], s[2:3]
; SI-NEXT:    s_and_b32 s2, s5, s29
; SI-NEXT:    v_cndmask_b32_e32 v3, 0, v4, vcc
; SI-NEXT:    v_mov_b32_e32 v2, 0
; SI-NEXT:    s_cmp_lt_i32 s24, 0
; SI-NEXT:    v_add_f64 v[2:3], v[0:1], v[2:3]
; SI-NEXT:    v_mov_b32_e32 v0, s7
; SI-NEXT:    v_mov_b32_e32 v1, s2
; SI-NEXT:    s_cselect_b64 vcc, -1, 0
; SI-NEXT:    s_cmp_gt_i32 s24, 51
; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
; SI-NEXT:    v_mov_b32_e32 v1, s5
; SI-NEXT:    s_cselect_b64 s[2:3], -1, 0
; SI-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[2:3]
; SI-NEXT:    v_mov_b32_e32 v0, s6
; SI-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
; SI-NEXT:    v_mov_b32_e32 v4, s4
; SI-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[2:3]
; SI-NEXT:    v_add_f64 v[4:5], s[4:5], -v[0:1]
; SI-NEXT:    s_bfe_u32 s2, s11, 0xb0014
; SI-NEXT:    s_add_i32 s6, s2, s28
; SI-NEXT:    v_mov_b32_e32 v6, s5
; SI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5
; SI-NEXT:    s_lshr_b64 s[2:3], s[20:21], s6
; SI-NEXT:    v_bfi_b32 v6, s23, v8, v6
; SI-NEXT:    s_andn2_b64 s[4:5], s[10:11], s[2:3]
; SI-NEXT:    s_and_b32 s2, s11, s29
; SI-NEXT:    v_cndmask_b32_e32 v5, 0, v6, vcc
; SI-NEXT:    v_mov_b32_e32 v4, 0
; SI-NEXT:    s_cmp_lt_i32 s6, 0
; SI-NEXT:    v_add_f64 v[0:1], v[0:1], v[4:5]
; SI-NEXT:    v_mov_b32_e32 v4, s5
; SI-NEXT:    v_mov_b32_e32 v5, s2
; SI-NEXT:    s_cselect_b64 vcc, -1, 0
; SI-NEXT:    s_cmp_gt_i32 s6, 51
; SI-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
; SI-NEXT:    v_mov_b32_e32 v5, s11
; SI-NEXT:    s_cselect_b64 s[2:3], -1, 0
; SI-NEXT:    v_cndmask_b32_e64 v5, v4, v5, s[2:3]
; SI-NEXT:    v_mov_b32_e32 v4, s4
; SI-NEXT:    v_cndmask_b32_e64 v4, v4, 0, vcc
; SI-NEXT:    v_mov_b32_e32 v6, s10
; SI-NEXT:    v_cndmask_b32_e64 v4, v4, v6, s[2:3]
; SI-NEXT:    v_add_f64 v[6:7], s[10:11], -v[4:5]
; SI-NEXT:    s_bfe_u32 s2, s9, 0xb0014
; SI-NEXT:    s_add_i32 s6, s2, s28
; SI-NEXT:    v_mov_b32_e32 v9, s11
; SI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[6:7]|, 0.5
; SI-NEXT:    s_lshr_b64 s[2:3], s[20:21], s6
; SI-NEXT:    v_bfi_b32 v9, s23, v8, v9
; SI-NEXT:    s_andn2_b64 s[4:5], s[8:9], s[2:3]
; SI-NEXT:    s_and_b32 s2, s9, s29
; SI-NEXT:    v_cndmask_b32_e32 v7, 0, v9, vcc
; SI-NEXT:    v_mov_b32_e32 v6, 0
; SI-NEXT:    s_cmp_lt_i32 s6, 0
; SI-NEXT:    v_add_f64 v[6:7], v[4:5], v[6:7]
; SI-NEXT:    v_mov_b32_e32 v4, s5
; SI-NEXT:    v_mov_b32_e32 v5, s2
; SI-NEXT:    s_cselect_b64 vcc, -1, 0
; SI-NEXT:    s_cmp_gt_i32 s6, 51
; SI-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
; SI-NEXT:    v_mov_b32_e32 v5, s9
; SI-NEXT:    s_cselect_b64 s[2:3], -1, 0
; SI-NEXT:    v_cndmask_b32_e64 v5, v4, v5, s[2:3]
; SI-NEXT:    v_mov_b32_e32 v4, s4
; SI-NEXT:    v_cndmask_b32_e64 v4, v4, 0, vcc
; SI-NEXT:    v_mov_b32_e32 v9, s8
; SI-NEXT:    v_cndmask_b32_e64 v4, v4, v9, s[2:3]
; SI-NEXT:    s_bfe_u32 s2, s15, 0xb0014
; SI-NEXT:    v_add_f64 v[9:10], s[8:9], -v[4:5]
; SI-NEXT:    s_add_i32 s4, s2, s28
; SI-NEXT:    s_lshr_b64 s[2:3], s[20:21], s4
; SI-NEXT:    v_mov_b32_e32 v11, s9
; SI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[9:10]|, 0.5
; SI-NEXT:    s_andn2_b64 s[24:25], s[14:15], s[2:3]
; SI-NEXT:    s_and_b32 s2, s15, s29
; SI-NEXT:    v_bfi_b32 v11, s23, v8, v11
; SI-NEXT:    s_cmp_lt_i32 s4, 0
; SI-NEXT:    v_cndmask_b32_e32 v10, 0, v11, vcc
; SI-NEXT:    v_mov_b32_e32 v9, 0
; SI-NEXT:    s_cselect_b64 vcc, -1, 0
; SI-NEXT:    s_cmp_gt_i32 s4, 51
; SI-NEXT:    v_add_f64 v[4:5], v[4:5], v[9:10]
; SI-NEXT:    v_mov_b32_e32 v10, s2
; SI-NEXT:    s_cselect_b64 s[2:3], -1, 0
; SI-NEXT:    s_bfe_u32 s4, s13, 0xb0014
; SI-NEXT:    s_add_i32 s6, s4, s28
; SI-NEXT:    s_lshr_b64 s[4:5], s[20:21], s6
; SI-NEXT:    s_andn2_b64 s[26:27], s[12:13], s[4:5]
; SI-NEXT:    s_and_b32 s4, s13, s29
; SI-NEXT:    v_mov_b32_e32 v9, s25
; SI-NEXT:    s_cmp_lt_i32 s6, 0
; SI-NEXT:    v_cndmask_b32_e32 v15, v9, v10, vcc
; SI-NEXT:    v_mov_b32_e32 v10, s4
; SI-NEXT:    s_cselect_b64 s[4:5], -1, 0
; SI-NEXT:    s_cmp_gt_i32 s6, 51
; SI-NEXT:    s_cselect_b64 s[6:7], -1, 0
; SI-NEXT:    s_bfe_u32 s8, s19, 0xb0014
; SI-NEXT:    s_add_i32 s25, s8, s28
; SI-NEXT:    s_lshr_b64 s[8:9], s[20:21], s25
; SI-NEXT:    s_andn2_b64 s[10:11], s[18:19], s[8:9]
; SI-NEXT:    s_and_b32 s8, s19, s29
; SI-NEXT:    v_mov_b32_e32 v9, s27
; SI-NEXT:    s_cmp_lt_i32 s25, 0
; SI-NEXT:    v_cndmask_b32_e64 v17, v9, v10, s[4:5]
; SI-NEXT:    v_mov_b32_e32 v9, s11
; SI-NEXT:    v_mov_b32_e32 v10, s8
; SI-NEXT:    s_cselect_b64 s[8:9], -1, 0
; SI-NEXT:    s_cmp_gt_i32 s25, 51
; SI-NEXT:    v_cndmask_b32_e64 v9, v9, v10, s[8:9]
; SI-NEXT:    v_mov_b32_e32 v10, s19
; SI-NEXT:    v_mov_b32_e32 v11, s10
; SI-NEXT:    s_cselect_b64 s[10:11], -1, 0
; SI-NEXT:    v_cndmask_b32_e64 v10, v9, v10, s[10:11]
; SI-NEXT:    v_cndmask_b32_e64 v9, v11, 0, s[8:9]
; SI-NEXT:    v_mov_b32_e32 v11, s18
; SI-NEXT:    s_bfe_u32 s8, s17, 0xb0014
; SI-NEXT:    v_cndmask_b32_e64 v9, v9, v11, s[10:11]
; SI-NEXT:    s_add_i32 s10, s8, s28
; SI-NEXT:    s_lshr_b64 s[8:9], s[20:21], s10
; SI-NEXT:    s_andn2_b64 s[20:21], s[16:17], s[8:9]
; SI-NEXT:    s_and_b32 s8, s17, s29
; SI-NEXT:    s_cmp_lt_i32 s10, 0
; SI-NEXT:    v_mov_b32_e32 v11, s21
; SI-NEXT:    v_mov_b32_e32 v12, s8
; SI-NEXT:    s_cselect_b64 s[8:9], -1, 0
; SI-NEXT:    s_cmp_gt_i32 s10, 51
; SI-NEXT:    v_cndmask_b32_e64 v11, v11, v12, s[8:9]
; SI-NEXT:    v_mov_b32_e32 v12, s17
; SI-NEXT:    s_cselect_b64 s[10:11], -1, 0
; SI-NEXT:    v_cndmask_b32_e64 v14, v11, v12, s[10:11]
; SI-NEXT:    v_mov_b32_e32 v11, s20
; SI-NEXT:    v_cndmask_b32_e64 v11, v11, 0, s[8:9]
; SI-NEXT:    v_mov_b32_e32 v12, s16
; SI-NEXT:    v_cndmask_b32_e64 v13, v11, v12, s[10:11]
; SI-NEXT:    v_add_f64 v[11:12], s[16:17], -v[13:14]
; SI-NEXT:    v_mov_b32_e32 v19, s17
; SI-NEXT:    v_cmp_ge_f64_e64 s[8:9], |v[11:12]|, 0.5
; SI-NEXT:    v_mov_b32_e32 v11, s19
; SI-NEXT:    v_bfi_b32 v20, s23, v8, v11
; SI-NEXT:    v_add_f64 v[11:12], s[18:19], -v[9:10]
; SI-NEXT:    v_bfi_b32 v19, s23, v8, v19
; SI-NEXT:    v_cmp_ge_f64_e64 s[10:11], |v[11:12]|, 0.5
; SI-NEXT:    v_mov_b32_e32 v11, 0
; SI-NEXT:    v_cndmask_b32_e64 v12, 0, v20, s[10:11]
; SI-NEXT:    v_add_f64 v[11:12], v[9:10], v[11:12]
; SI-NEXT:    v_cndmask_b32_e64 v10, 0, v19, s[8:9]
; SI-NEXT:    v_mov_b32_e32 v9, 0
; SI-NEXT:    v_mov_b32_e32 v16, s15
; SI-NEXT:    v_add_f64 v[9:10], v[13:14], v[9:10]
; SI-NEXT:    v_mov_b32_e32 v13, s24
; SI-NEXT:    v_cndmask_b32_e64 v14, v15, v16, s[2:3]
; SI-NEXT:    v_cndmask_b32_e64 v13, v13, 0, vcc
; SI-NEXT:    v_mov_b32_e32 v15, s14
; SI-NEXT:    v_cndmask_b32_e64 v13, v13, v15, s[2:3]
; SI-NEXT:    v_mov_b32_e32 v15, s15
; SI-NEXT:    v_bfi_b32 v19, s23, v8, v15
; SI-NEXT:    v_mov_b32_e32 v15, s26
; SI-NEXT:    v_mov_b32_e32 v18, s13
; SI-NEXT:    v_cndmask_b32_e64 v15, v15, 0, s[4:5]
; SI-NEXT:    v_mov_b32_e32 v16, s12
; SI-NEXT:    v_cndmask_b32_e64 v18, v17, v18, s[6:7]
; SI-NEXT:    v_cndmask_b32_e64 v17, v15, v16, s[6:7]
; SI-NEXT:    v_mov_b32_e32 v15, s13
; SI-NEXT:    v_bfi_b32 v8, s23, v8, v15
; SI-NEXT:    v_add_f64 v[15:16], s[12:13], -v[17:18]
; SI-NEXT:    s_load_dwordx2 s[20:21], s[0:1], 0x9
; SI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[15:16]|, 0.5
; SI-NEXT:    v_add_f64 v[15:16], s[14:15], -v[13:14]
; SI-NEXT:    s_mov_b32 s23, 0xf000
; SI-NEXT:    v_cmp_ge_f64_e64 s[0:1], |v[15:16]|, 0.5
; SI-NEXT:    v_mov_b32_e32 v15, 0
; SI-NEXT:    v_cndmask_b32_e64 v16, 0, v19, s[0:1]
; SI-NEXT:    v_add_f64 v[15:16], v[13:14], v[15:16]
; SI-NEXT:    v_cndmask_b32_e32 v14, 0, v8, vcc
; SI-NEXT:    v_mov_b32_e32 v13, 0
; SI-NEXT:    v_add_f64 v[13:14], v[17:18], v[13:14]
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_store_dwordx4 v[9:12], off, s[20:23], 0 offset:48
; SI-NEXT:    buffer_store_dwordx4 v[13:16], off, s[20:23], 0 offset:32
; SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[20:23], 0 offset:16
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[20:23], 0
; SI-NEXT:    s_endpgm
;
; CI-LABEL: round_v8f64:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx16 s[4:19], s[0:1], 0x19
; CI-NEXT:    s_brev_b32 s2, -2
; CI-NEXT:    v_mov_b32_e32 v16, 0x3ff00000
; CI-NEXT:    s_load_dwordx2 s[20:21], s[0:1], 0x9
; CI-NEXT:    s_mov_b32 s23, 0xf000
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    v_trunc_f64_e32 v[0:1], s[6:7]
; CI-NEXT:    v_mov_b32_e32 v4, s7
; CI-NEXT:    v_add_f64 v[2:3], s[6:7], -v[0:1]
; CI-NEXT:    v_bfi_b32 v4, s2, v16, v4
; CI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[2:3]|, 0.5
; CI-NEXT:    v_mov_b32_e32 v2, 0
; CI-NEXT:    v_cndmask_b32_e32 v3, 0, v4, vcc
; CI-NEXT:    v_trunc_f64_e32 v[4:5], s[4:5]
; CI-NEXT:    v_add_f64 v[2:3], v[0:1], v[2:3]
; CI-NEXT:    v_add_f64 v[0:1], s[4:5], -v[4:5]
; CI-NEXT:    v_mov_b32_e32 v6, s5
; CI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[0:1]|, 0.5
; CI-NEXT:    v_bfi_b32 v6, s2, v16, v6
; CI-NEXT:    v_cndmask_b32_e32 v1, 0, v6, vcc
; CI-NEXT:    v_trunc_f64_e32 v[6:7], s[10:11]
; CI-NEXT:    v_mov_b32_e32 v0, 0
; CI-NEXT:    v_add_f64 v[0:1], v[4:5], v[0:1]
; CI-NEXT:    v_add_f64 v[4:5], s[10:11], -v[6:7]
; CI-NEXT:    v_mov_b32_e32 v8, s11
; CI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5
; CI-NEXT:    v_bfi_b32 v8, s2, v16, v8
; CI-NEXT:    v_cndmask_b32_e32 v5, 0, v8, vcc
; CI-NEXT:    v_trunc_f64_e32 v[8:9], s[8:9]
; CI-NEXT:    v_mov_b32_e32 v4, 0
; CI-NEXT:    v_add_f64 v[6:7], v[6:7], v[4:5]
; CI-NEXT:    v_add_f64 v[4:5], s[8:9], -v[8:9]
; CI-NEXT:    v_mov_b32_e32 v10, s9
; CI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5
; CI-NEXT:    v_bfi_b32 v10, s2, v16, v10
; CI-NEXT:    v_cndmask_b32_e32 v5, 0, v10, vcc
; CI-NEXT:    v_mov_b32_e32 v4, 0
; CI-NEXT:    v_add_f64 v[4:5], v[8:9], v[4:5]
; CI-NEXT:    v_mov_b32_e32 v8, s15
; CI-NEXT:    v_bfi_b32 v18, s2, v16, v8
; CI-NEXT:    v_trunc_f64_e32 v[8:9], s[16:17]
; CI-NEXT:    v_trunc_f64_e32 v[10:11], s[18:19]
; CI-NEXT:    v_add_f64 v[14:15], s[16:17], -v[8:9]
; CI-NEXT:    v_mov_b32_e32 v19, s19
; CI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[14:15]|, 0.5
; CI-NEXT:    v_add_f64 v[14:15], s[18:19], -v[10:11]
; CI-NEXT:    v_mov_b32_e32 v17, s17
; CI-NEXT:    v_cmp_ge_f64_e64 s[0:1], |v[14:15]|, 0.5
; CI-NEXT:    v_bfi_b32 v19, s2, v16, v19
; CI-NEXT:    v_trunc_f64_e32 v[12:13], s[12:13]
; CI-NEXT:    v_bfi_b32 v17, s2, v16, v17
; CI-NEXT:    v_cndmask_b32_e64 v15, 0, v19, s[0:1]
; CI-NEXT:    v_mov_b32_e32 v14, 0
; CI-NEXT:    v_add_f64 v[10:11], v[10:11], v[14:15]
; CI-NEXT:    v_cndmask_b32_e32 v15, 0, v17, vcc
; CI-NEXT:    v_mov_b32_e32 v14, 0
; CI-NEXT:    v_mov_b32_e32 v17, s13
; CI-NEXT:    v_add_f64 v[8:9], v[8:9], v[14:15]
; CI-NEXT:    v_add_f64 v[14:15], s[12:13], -v[12:13]
; CI-NEXT:    v_bfi_b32 v19, s2, v16, v17
; CI-NEXT:    v_trunc_f64_e32 v[16:17], s[14:15]
; CI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[14:15]|, 0.5
; CI-NEXT:    v_add_f64 v[14:15], s[14:15], -v[16:17]
; CI-NEXT:    s_mov_b32 s22, -1
; CI-NEXT:    v_cmp_ge_f64_e64 s[0:1], |v[14:15]|, 0.5
; CI-NEXT:    v_mov_b32_e32 v14, 0
; CI-NEXT:    v_cndmask_b32_e64 v15, 0, v18, s[0:1]
; CI-NEXT:    v_add_f64 v[14:15], v[16:17], v[14:15]
; CI-NEXT:    v_cndmask_b32_e32 v17, 0, v19, vcc
; CI-NEXT:    v_mov_b32_e32 v16, 0
; CI-NEXT:    v_add_f64 v[12:13], v[12:13], v[16:17]
; CI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[20:23], 0 offset:48
; CI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[20:23], 0 offset:32
; CI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[20:23], 0 offset:16
; CI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[20:23], 0
; CI-NEXT:    s_endpgm
  %result = call <8 x double> @llvm.round.v8f64(<8 x double> %in) #1
  store <8 x double> %result, <8 x double> addrspace(1)* %out
  ret void
}

declare i32 @llvm.amdgcn.workitem.id.x() #1

declare double @llvm.round.f64(double) #1
declare <2 x double> @llvm.round.v2f64(<2 x double>) #1
declare <4 x double> @llvm.round.v4f64(<4 x double>) #1
declare <8 x double> @llvm.round.v8f64(<8 x double>) #1

attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
