1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -march=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=SI %s
3; RUN: llc -march=amdgcn -mcpu=hawaii < %s | FileCheck -check-prefix=CI %s
4
; Scalar f64 round-half-away-from-zero.  SI (tahiti) has no v_trunc_f64, so
; trunc(x) is synthesized from the exponent field -- s_bfe_u32 extracts the
; 11 exponent bits at offset 20 (0xb0014), 0xfffffc01 (-1023) removes the
; bias, and s_lshr_b64/s_andn2_b64 mask off the fraction bits.  CI (hawaii)
; emits v_trunc_f64 directly.  Both then add copysign(1.0, x), built with
; v_bfi_b32 from 0x3ff00000 and x's sign under the 0x7fffffff mask
; (s_brev_b32 of -2), whenever |x - trunc(x)| >= 0.5.
define amdgpu_kernel void @round_f64(double addrspace(1)* %out, double %x) #0 {
; SI-LABEL: round_f64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s10, -1
; SI-NEXT:    s_mov_b32 s1, 0xfffff
; SI-NEXT:    v_mov_b32_e32 v4, 0x3ff00000
; SI-NEXT:    s_mov_b32 s11, 0xf000
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_bfe_u32 s0, s7, 0xb0014
; SI-NEXT:    s_mov_b32 s8, s4
; SI-NEXT:    s_add_i32 s4, s0, 0xfffffc01
; SI-NEXT:    s_mov_b32 s0, s10
; SI-NEXT:    s_lshr_b64 s[0:1], s[0:1], s4
; SI-NEXT:    s_andn2_b64 s[2:3], s[6:7], s[0:1]
; SI-NEXT:    s_and_b32 s0, s7, 0x80000000
; SI-NEXT:    s_cmp_lt_i32 s4, 0
; SI-NEXT:    v_mov_b32_e32 v0, s3
; SI-NEXT:    v_mov_b32_e32 v1, s0
; SI-NEXT:    s_cselect_b64 vcc, -1, 0
; SI-NEXT:    s_cmp_gt_i32 s4, 51
; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
; SI-NEXT:    v_mov_b32_e32 v1, s7
; SI-NEXT:    s_cselect_b64 s[0:1], -1, 0
; SI-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[0:1]
; SI-NEXT:    v_mov_b32_e32 v0, s2
; SI-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
; SI-NEXT:    v_mov_b32_e32 v2, s6
; SI-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
; SI-NEXT:    v_add_f64 v[2:3], s[6:7], -v[0:1]
; SI-NEXT:    s_brev_b32 s0, -2
; SI-NEXT:    v_mov_b32_e32 v5, s7
; SI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[2:3]|, 0.5
; SI-NEXT:    v_bfi_b32 v4, s0, v4, v5
; SI-NEXT:    v_cndmask_b32_e32 v3, 0, v4, vcc
; SI-NEXT:    v_mov_b32_e32 v2, 0
; SI-NEXT:    v_add_f64 v[0:1], v[0:1], v[2:3]
; SI-NEXT:    s_mov_b32 s9, s5
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
; SI-NEXT:    s_endpgm
;
; CI-LABEL: round_f64:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; CI-NEXT:    s_brev_b32 s5, -2
; CI-NEXT:    v_mov_b32_e32 v4, 0x3ff00000
; CI-NEXT:    s_mov_b32 s7, 0xf000
; CI-NEXT:    s_mov_b32 s6, -1
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    v_trunc_f64_e32 v[0:1], s[2:3]
; CI-NEXT:    v_mov_b32_e32 v5, s3
; CI-NEXT:    v_add_f64 v[2:3], s[2:3], -v[0:1]
; CI-NEXT:    v_bfi_b32 v4, s5, v4, v5
; CI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[2:3]|, 0.5
; CI-NEXT:    v_mov_b32_e32 v2, 0
; CI-NEXT:    v_cndmask_b32_e32 v3, 0, v4, vcc
; CI-NEXT:    v_add_f64 v[0:1], v[0:1], v[2:3]
; CI-NEXT:    s_mov_b32 s4, s0
; CI-NEXT:    s_mov_b32 s5, s1
; CI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; CI-NEXT:    s_endpgm
  %result = call double @llvm.round.f64(double %x) #1  ; round, ties away from zero
  store double %result, double addrspace(1)* %out
  ret void
}
70
; Per-lane f64 round on a value loaded from memory: the element address is
; in + workitem.id.x * 8 (v_lshlrev_b32 by 3), accessed with buffer
; load/store in addr64 mode.  Because the input is in VGPRs, SI builds the
; trunc mask with VALU ops (v_bfe_u32 of the exponent bits, v_lshr_b64,
; v_not/v_and) instead of the SALU sequence used in round_f64; CI still
; lowers through v_trunc_f64.
define amdgpu_kernel void @v_round_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
; SI-LABEL: v_round_f64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
; SI-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_movk_i32 s4, 0xfc01
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_mov_b32 s3, 0xfffff
; SI-NEXT:    s_brev_b32 s5, -2
; SI-NEXT:    v_mov_b32_e32 v8, 0x3ff00000
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_bfe_u32 v4, v3, 20, 11
; SI-NEXT:    v_add_i32_e32 v6, vcc, s4, v4
; SI-NEXT:    v_lshr_b64 v[4:5], s[2:3], v6
; SI-NEXT:    v_and_b32_e32 v7, 0x80000000, v3
; SI-NEXT:    v_not_b32_e32 v5, v5
; SI-NEXT:    v_not_b32_e32 v4, v4
; SI-NEXT:    v_and_b32_e32 v5, v3, v5
; SI-NEXT:    v_and_b32_e32 v4, v2, v4
; SI-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v6
; SI-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc
; SI-NEXT:    v_cndmask_b32_e64 v4, v4, 0, vcc
; SI-NEXT:    v_cmp_lt_i32_e32 vcc, 51, v6
; SI-NEXT:    v_cndmask_b32_e32 v5, v5, v3, vcc
; SI-NEXT:    v_cndmask_b32_e32 v4, v4, v2, vcc
; SI-NEXT:    v_add_f64 v[6:7], v[2:3], -v[4:5]
; SI-NEXT:    v_bfi_b32 v2, s5, v8, v3
; SI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[6:7]|, 0.5
; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
; SI-NEXT:    v_cndmask_b32_e32 v3, 0, v2, vcc
; SI-NEXT:    v_mov_b32_e32 v2, 0
; SI-NEXT:    v_add_f64 v[2:3], v[4:5], v[2:3]
; SI-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_endpgm
;
; CI-LABEL: v_round_f64:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; CI-NEXT:    s_mov_b32 s7, 0xf000
; CI-NEXT:    s_mov_b32 s6, 0
; CI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; CI-NEXT:    v_mov_b32_e32 v1, 0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    s_mov_b64 s[4:5], s[2:3]
; CI-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
; CI-NEXT:    s_brev_b32 s2, -2
; CI-NEXT:    v_mov_b32_e32 v8, 0x3ff00000
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    v_trunc_f64_e32 v[4:5], v[2:3]
; CI-NEXT:    v_add_f64 v[6:7], v[2:3], -v[4:5]
; CI-NEXT:    v_bfi_b32 v2, s2, v8, v3
; CI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[6:7]|, 0.5
; CI-NEXT:    s_mov_b64 s[2:3], s[6:7]
; CI-NEXT:    v_cndmask_b32_e32 v3, 0, v2, vcc
; CI-NEXT:    v_mov_b32_e32 v2, 0
; CI-NEXT:    v_add_f64 v[2:3], v[4:5], v[2:3]
; CI-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
; CI-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1      ; lane index
  %gep = getelementptr double, double addrspace(1)* %in, i32 %tid    ; &in[tid]
  %out.gep = getelementptr double, double addrspace(1)* %out, i32 %tid  ; &out[tid]
  %x = load double, double addrspace(1)* %gep
  %result = call double @llvm.round.f64(double %x) #1  ; round, ties away from zero
  store double %result, double addrspace(1)* %out.gep
  ret void
}
143
; <2 x double> round is scalarized: the SI exponent-mask / CI v_trunc_f64
; pattern from round_f64 is repeated once per element, and both rounded
; results are written with a single buffer_store_dwordx4.
define amdgpu_kernel void @round_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) #0 {
; SI-LABEL: round_v2f64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0xd
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s3, 0xfffff
; SI-NEXT:    s_mov_b32 s2, s6
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_bfe_u32 s0, s11, 0xb0014
; SI-NEXT:    s_add_i32 s7, s0, 0xfffffc01
; SI-NEXT:    s_lshr_b64 s[0:1], s[2:3], s7
; SI-NEXT:    s_andn2_b64 s[12:13], s[10:11], s[0:1]
; SI-NEXT:    s_and_b32 s0, s11, 0x80000000
; SI-NEXT:    s_cmp_lt_i32 s7, 0
; SI-NEXT:    v_mov_b32_e32 v0, s13
; SI-NEXT:    v_mov_b32_e32 v1, s0
; SI-NEXT:    s_cselect_b64 vcc, -1, 0
; SI-NEXT:    s_cmp_gt_i32 s7, 51
; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
; SI-NEXT:    v_mov_b32_e32 v1, s11
; SI-NEXT:    s_cselect_b64 s[0:1], -1, 0
; SI-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[0:1]
; SI-NEXT:    v_mov_b32_e32 v0, s12
; SI-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
; SI-NEXT:    v_mov_b32_e32 v2, s10
; SI-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
; SI-NEXT:    v_add_f64 v[2:3], s[10:11], -v[0:1]
; SI-NEXT:    s_bfe_u32 s0, s9, 0xb0014
; SI-NEXT:    s_add_i32 s10, s0, 0xfffffc01
; SI-NEXT:    s_brev_b32 s7, -2
; SI-NEXT:    v_mov_b32_e32 v6, 0x3ff00000
; SI-NEXT:    v_mov_b32_e32 v4, s11
; SI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[2:3]|, 0.5
; SI-NEXT:    s_lshr_b64 s[0:1], s[2:3], s10
; SI-NEXT:    v_bfi_b32 v4, s7, v6, v4
; SI-NEXT:    s_andn2_b64 s[2:3], s[8:9], s[0:1]
; SI-NEXT:    s_and_b32 s0, s9, 0x80000000
; SI-NEXT:    v_cndmask_b32_e32 v3, 0, v4, vcc
; SI-NEXT:    v_mov_b32_e32 v2, 0
; SI-NEXT:    s_cmp_lt_i32 s10, 0
; SI-NEXT:    v_add_f64 v[2:3], v[0:1], v[2:3]
; SI-NEXT:    v_mov_b32_e32 v0, s3
; SI-NEXT:    v_mov_b32_e32 v1, s0
; SI-NEXT:    s_cselect_b64 vcc, -1, 0
; SI-NEXT:    s_cmp_gt_i32 s10, 51
; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
; SI-NEXT:    v_mov_b32_e32 v1, s9
; SI-NEXT:    s_cselect_b64 s[0:1], -1, 0
; SI-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[0:1]
; SI-NEXT:    v_mov_b32_e32 v0, s2
; SI-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
; SI-NEXT:    v_mov_b32_e32 v4, s8
; SI-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[0:1]
; SI-NEXT:    v_add_f64 v[4:5], s[8:9], -v[0:1]
; SI-NEXT:    v_mov_b32_e32 v7, s9
; SI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5
; SI-NEXT:    v_bfi_b32 v6, s7, v6, v7
; SI-NEXT:    v_cndmask_b32_e32 v5, 0, v6, vcc
; SI-NEXT:    v_mov_b32_e32 v4, 0
; SI-NEXT:    v_add_f64 v[0:1], v[0:1], v[4:5]
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; CI-LABEL: round_v2f64:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xd
; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; CI-NEXT:    s_brev_b32 s2, -2
; CI-NEXT:    v_mov_b32_e32 v6, 0x3ff00000
; CI-NEXT:    s_mov_b32 s3, 0xf000
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    v_trunc_f64_e32 v[0:1], s[6:7]
; CI-NEXT:    v_mov_b32_e32 v4, s7
; CI-NEXT:    v_add_f64 v[2:3], s[6:7], -v[0:1]
; CI-NEXT:    v_bfi_b32 v4, s2, v6, v4
; CI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[2:3]|, 0.5
; CI-NEXT:    v_mov_b32_e32 v2, 0
; CI-NEXT:    v_cndmask_b32_e32 v3, 0, v4, vcc
; CI-NEXT:    v_trunc_f64_e32 v[4:5], s[4:5]
; CI-NEXT:    v_add_f64 v[2:3], v[0:1], v[2:3]
; CI-NEXT:    v_add_f64 v[0:1], s[4:5], -v[4:5]
; CI-NEXT:    v_mov_b32_e32 v7, s5
; CI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[0:1]|, 0.5
; CI-NEXT:    v_bfi_b32 v6, s2, v6, v7
; CI-NEXT:    v_cndmask_b32_e32 v1, 0, v6, vcc
; CI-NEXT:    v_mov_b32_e32 v0, 0
; CI-NEXT:    v_add_f64 v[0:1], v[4:5], v[0:1]
; CI-NEXT:    s_mov_b32 s2, -1
; CI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; CI-NEXT:    s_endpgm
  %result = call <2 x double> @llvm.round.v2f64(<2 x double> %in) #1  ; elementwise round
  store <2 x double> %result, <2 x double> addrspace(1)* %out
  ret void
}
240
; <4 x double> round, fully scalarized into four copies of the round_f64
; pattern (SI exponent masking, CI v_trunc_f64); the four results are
; written as two buffer_store_dwordx4 ops (offset 16 and 0).
define amdgpu_kernel void @round_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) #0 {
; SI-LABEL: round_v4f64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x11
; SI-NEXT:    s_mov_b32 s14, -1
; SI-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xfffff
; SI-NEXT:    s_mov_b32 s2, s14
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_bfe_u32 s0, s7, 0xb0014
; SI-NEXT:    s_add_i32 s18, s0, 0xfffffc01
; SI-NEXT:    s_lshr_b64 s[0:1], s[2:3], s18
; SI-NEXT:    s_andn2_b64 s[16:17], s[6:7], s[0:1]
; SI-NEXT:    s_and_b32 s0, s7, 0x80000000
; SI-NEXT:    s_cmp_lt_i32 s18, 0
; SI-NEXT:    v_mov_b32_e32 v0, s17
; SI-NEXT:    v_mov_b32_e32 v1, s0
; SI-NEXT:    s_cselect_b64 vcc, -1, 0
; SI-NEXT:    s_cmp_gt_i32 s18, 51
; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
; SI-NEXT:    v_mov_b32_e32 v1, s7
; SI-NEXT:    s_cselect_b64 s[0:1], -1, 0
; SI-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[0:1]
; SI-NEXT:    v_mov_b32_e32 v0, s16
; SI-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
; SI-NEXT:    v_mov_b32_e32 v2, s6
; SI-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
; SI-NEXT:    v_add_f64 v[2:3], s[6:7], -v[0:1]
; SI-NEXT:    s_bfe_u32 s0, s5, 0xb0014
; SI-NEXT:    s_add_i32 s17, s0, 0xfffffc01
; SI-NEXT:    s_brev_b32 s16, -2
; SI-NEXT:    v_mov_b32_e32 v12, 0x3ff00000
; SI-NEXT:    v_mov_b32_e32 v4, s7
; SI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[2:3]|, 0.5
; SI-NEXT:    s_lshr_b64 s[0:1], s[2:3], s17
; SI-NEXT:    v_bfi_b32 v4, s16, v12, v4
; SI-NEXT:    s_andn2_b64 s[6:7], s[4:5], s[0:1]
; SI-NEXT:    s_and_b32 s0, s5, 0x80000000
; SI-NEXT:    v_cndmask_b32_e32 v3, 0, v4, vcc
; SI-NEXT:    v_mov_b32_e32 v2, 0
; SI-NEXT:    s_cmp_lt_i32 s17, 0
; SI-NEXT:    v_add_f64 v[2:3], v[0:1], v[2:3]
; SI-NEXT:    v_mov_b32_e32 v0, s7
; SI-NEXT:    v_mov_b32_e32 v1, s0
; SI-NEXT:    s_cselect_b64 vcc, -1, 0
; SI-NEXT:    s_cmp_gt_i32 s17, 51
; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
; SI-NEXT:    v_mov_b32_e32 v1, s5
; SI-NEXT:    s_cselect_b64 s[0:1], -1, 0
; SI-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[0:1]
; SI-NEXT:    v_mov_b32_e32 v0, s6
; SI-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
; SI-NEXT:    v_mov_b32_e32 v4, s4
; SI-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[0:1]
; SI-NEXT:    s_bfe_u32 s0, s11, 0xb0014
; SI-NEXT:    v_add_f64 v[4:5], s[4:5], -v[0:1]
; SI-NEXT:    s_add_i32 s6, s0, 0xfffffc01
; SI-NEXT:    s_lshr_b64 s[0:1], s[2:3], s6
; SI-NEXT:    v_mov_b32_e32 v6, s5
; SI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5
; SI-NEXT:    s_andn2_b64 s[4:5], s[10:11], s[0:1]
; SI-NEXT:    s_and_b32 s0, s11, 0x80000000
; SI-NEXT:    v_bfi_b32 v6, s16, v12, v6
; SI-NEXT:    s_cmp_lt_i32 s6, 0
; SI-NEXT:    v_cndmask_b32_e32 v9, 0, v6, vcc
; SI-NEXT:    v_mov_b32_e32 v4, s5
; SI-NEXT:    v_mov_b32_e32 v5, s0
; SI-NEXT:    s_cselect_b64 vcc, -1, 0
; SI-NEXT:    s_cmp_gt_i32 s6, 51
; SI-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
; SI-NEXT:    v_mov_b32_e32 v5, s11
; SI-NEXT:    s_cselect_b64 s[0:1], -1, 0
; SI-NEXT:    v_cndmask_b32_e64 v5, v4, v5, s[0:1]
; SI-NEXT:    v_mov_b32_e32 v4, s4
; SI-NEXT:    v_cndmask_b32_e64 v4, v4, 0, vcc
; SI-NEXT:    v_mov_b32_e32 v6, s10
; SI-NEXT:    v_cndmask_b32_e64 v4, v4, v6, s[0:1]
; SI-NEXT:    v_add_f64 v[6:7], s[10:11], -v[4:5]
; SI-NEXT:    s_bfe_u32 s0, s9, 0xb0014
; SI-NEXT:    s_add_i32 s4, s0, 0xfffffc01
; SI-NEXT:    v_mov_b32_e32 v10, s11
; SI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[6:7]|, 0.5
; SI-NEXT:    s_lshr_b64 s[0:1], s[2:3], s4
; SI-NEXT:    v_bfi_b32 v10, s16, v12, v10
; SI-NEXT:    s_andn2_b64 s[2:3], s[8:9], s[0:1]
; SI-NEXT:    s_and_b32 s0, s9, 0x80000000
; SI-NEXT:    v_cndmask_b32_e32 v7, 0, v10, vcc
; SI-NEXT:    v_mov_b32_e32 v6, 0
; SI-NEXT:    s_cmp_lt_i32 s4, 0
; SI-NEXT:    v_add_f64 v[6:7], v[4:5], v[6:7]
; SI-NEXT:    v_mov_b32_e32 v4, s3
; SI-NEXT:    v_mov_b32_e32 v5, s0
; SI-NEXT:    s_cselect_b64 vcc, -1, 0
; SI-NEXT:    s_cmp_gt_i32 s4, 51
; SI-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
; SI-NEXT:    v_mov_b32_e32 v5, s9
; SI-NEXT:    s_cselect_b64 s[0:1], -1, 0
; SI-NEXT:    v_cndmask_b32_e64 v5, v4, v5, s[0:1]
; SI-NEXT:    v_mov_b32_e32 v4, s2
; SI-NEXT:    v_cndmask_b32_e64 v4, v4, 0, vcc
; SI-NEXT:    v_mov_b32_e32 v10, s8
; SI-NEXT:    v_cndmask_b32_e64 v4, v4, v10, s[0:1]
; SI-NEXT:    v_add_f64 v[10:11], s[8:9], -v[4:5]
; SI-NEXT:    v_mov_b32_e32 v13, s9
; SI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[10:11]|, 0.5
; SI-NEXT:    v_bfi_b32 v12, s16, v12, v13
; SI-NEXT:    v_cndmask_b32_e32 v11, 0, v12, vcc
; SI-NEXT:    v_mov_b32_e32 v10, 0
; SI-NEXT:    v_mov_b32_e32 v8, 0
; SI-NEXT:    v_add_f64 v[4:5], v[4:5], v[10:11]
; SI-NEXT:    s_mov_b32 s15, 0xf000
; SI-NEXT:    v_add_f64 v[0:1], v[0:1], v[8:9]
; SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[12:15], 0 offset:16
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[12:15], 0
; SI-NEXT:    s_endpgm
;
; CI-LABEL: round_v4f64:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x11
; CI-NEXT:    s_brev_b32 s12, -2
; CI-NEXT:    v_mov_b32_e32 v12, 0x3ff00000
; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; CI-NEXT:    s_mov_b32 s3, 0xf000
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    v_trunc_f64_e32 v[0:1], s[6:7]
; CI-NEXT:    v_mov_b32_e32 v4, s7
; CI-NEXT:    v_add_f64 v[2:3], s[6:7], -v[0:1]
; CI-NEXT:    v_bfi_b32 v4, s12, v12, v4
; CI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[2:3]|, 0.5
; CI-NEXT:    v_trunc_f64_e32 v[8:9], s[4:5]
; CI-NEXT:    v_cndmask_b32_e32 v3, 0, v4, vcc
; CI-NEXT:    v_mov_b32_e32 v2, 0
; CI-NEXT:    v_add_f64 v[2:3], v[0:1], v[2:3]
; CI-NEXT:    v_add_f64 v[0:1], s[4:5], -v[8:9]
; CI-NEXT:    v_mov_b32_e32 v4, s5
; CI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[0:1]|, 0.5
; CI-NEXT:    v_bfi_b32 v4, s12, v12, v4
; CI-NEXT:    v_cndmask_b32_e32 v1, 0, v4, vcc
; CI-NEXT:    v_trunc_f64_e32 v[4:5], s[10:11]
; CI-NEXT:    v_mov_b32_e32 v10, s11
; CI-NEXT:    v_add_f64 v[6:7], s[10:11], -v[4:5]
; CI-NEXT:    v_bfi_b32 v10, s12, v12, v10
; CI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[6:7]|, 0.5
; CI-NEXT:    v_mov_b32_e32 v6, 0
; CI-NEXT:    v_cndmask_b32_e32 v7, 0, v10, vcc
; CI-NEXT:    v_trunc_f64_e32 v[10:11], s[8:9]
; CI-NEXT:    v_add_f64 v[6:7], v[4:5], v[6:7]
; CI-NEXT:    v_add_f64 v[4:5], s[8:9], -v[10:11]
; CI-NEXT:    v_mov_b32_e32 v13, s9
; CI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5
; CI-NEXT:    v_bfi_b32 v12, s12, v12, v13
; CI-NEXT:    v_cndmask_b32_e32 v5, 0, v12, vcc
; CI-NEXT:    v_mov_b32_e32 v4, 0
; CI-NEXT:    v_mov_b32_e32 v0, 0
; CI-NEXT:    v_add_f64 v[4:5], v[10:11], v[4:5]
; CI-NEXT:    s_mov_b32 s2, -1
; CI-NEXT:    v_add_f64 v[0:1], v[8:9], v[0:1]
; CI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; CI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; CI-NEXT:    s_endpgm
  %result = call <4 x double> @llvm.round.v4f64(<4 x double> %in) #1  ; elementwise round
  store <4 x double> %result, <4 x double> addrspace(1)* %out
  ret void
}
405
; <8 x double> round, scalarized into eight copies of the round_f64 pattern.
; This stresses register pressure/scheduling: the SI path interleaves all
; eight exponent-mask sequences before the final adds, and the results go
; out as four buffer_store_dwordx4 ops (offsets 48/32/16/0).
define amdgpu_kernel void @round_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %in) #0 {
; SI-LABEL: round_v8f64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx16 s[4:19], s[0:1], 0x19
; SI-NEXT:    s_mov_b32 s22, -1
; SI-NEXT:    s_mov_b32 s21, 0xfffff
; SI-NEXT:    s_mov_b32 s20, s22
; SI-NEXT:    v_mov_b32_e32 v8, 0x3ff00000
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_bfe_u32 s2, s7, 0xb0014
; SI-NEXT:    s_add_i32 s26, s2, 0xfffffc01
; SI-NEXT:    s_lshr_b64 s[2:3], s[20:21], s26
; SI-NEXT:    s_and_b32 s23, s7, 0x80000000
; SI-NEXT:    s_andn2_b64 s[24:25], s[6:7], s[2:3]
; SI-NEXT:    s_cmp_lt_i32 s26, 0
; SI-NEXT:    v_mov_b32_e32 v0, s25
; SI-NEXT:    v_mov_b32_e32 v1, s23
; SI-NEXT:    s_cselect_b64 vcc, -1, 0
; SI-NEXT:    s_cmp_gt_i32 s26, 51
; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
; SI-NEXT:    v_mov_b32_e32 v1, s7
; SI-NEXT:    s_cselect_b64 s[2:3], -1, 0
; SI-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[2:3]
; SI-NEXT:    v_mov_b32_e32 v0, s24
; SI-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
; SI-NEXT:    v_mov_b32_e32 v2, s6
; SI-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[2:3]
; SI-NEXT:    v_add_f64 v[2:3], s[6:7], -v[0:1]
; SI-NEXT:    s_bfe_u32 s2, s5, 0xb0014
; SI-NEXT:    s_add_i32 s24, s2, 0xfffffc01
; SI-NEXT:    s_brev_b32 s23, -2
; SI-NEXT:    v_mov_b32_e32 v4, s7
; SI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[2:3]|, 0.5
; SI-NEXT:    s_lshr_b64 s[2:3], s[20:21], s24
; SI-NEXT:    v_bfi_b32 v4, s23, v8, v4
; SI-NEXT:    s_andn2_b64 s[6:7], s[4:5], s[2:3]
; SI-NEXT:    s_and_b32 s2, s5, 0x80000000
; SI-NEXT:    v_cndmask_b32_e32 v3, 0, v4, vcc
; SI-NEXT:    v_mov_b32_e32 v2, 0
; SI-NEXT:    s_cmp_lt_i32 s24, 0
; SI-NEXT:    v_add_f64 v[2:3], v[0:1], v[2:3]
; SI-NEXT:    v_mov_b32_e32 v0, s7
; SI-NEXT:    v_mov_b32_e32 v1, s2
; SI-NEXT:    s_cselect_b64 vcc, -1, 0
; SI-NEXT:    s_cmp_gt_i32 s24, 51
; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
; SI-NEXT:    v_mov_b32_e32 v1, s5
; SI-NEXT:    s_cselect_b64 s[2:3], -1, 0
; SI-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[2:3]
; SI-NEXT:    v_mov_b32_e32 v0, s6
; SI-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
; SI-NEXT:    v_mov_b32_e32 v4, s4
; SI-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[2:3]
; SI-NEXT:    v_add_f64 v[4:5], s[4:5], -v[0:1]
; SI-NEXT:    s_bfe_u32 s2, s11, 0xb0014
; SI-NEXT:    s_add_i32 s6, s2, 0xfffffc01
; SI-NEXT:    v_mov_b32_e32 v6, s5
; SI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5
; SI-NEXT:    s_lshr_b64 s[2:3], s[20:21], s6
; SI-NEXT:    v_bfi_b32 v6, s23, v8, v6
; SI-NEXT:    s_andn2_b64 s[4:5], s[10:11], s[2:3]
; SI-NEXT:    s_and_b32 s2, s11, 0x80000000
; SI-NEXT:    v_cndmask_b32_e32 v5, 0, v6, vcc
; SI-NEXT:    v_mov_b32_e32 v4, 0
; SI-NEXT:    s_cmp_lt_i32 s6, 0
; SI-NEXT:    v_add_f64 v[0:1], v[0:1], v[4:5]
; SI-NEXT:    v_mov_b32_e32 v4, s5
; SI-NEXT:    v_mov_b32_e32 v5, s2
; SI-NEXT:    s_cselect_b64 vcc, -1, 0
; SI-NEXT:    s_cmp_gt_i32 s6, 51
; SI-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
; SI-NEXT:    v_mov_b32_e32 v5, s11
; SI-NEXT:    s_cselect_b64 s[2:3], -1, 0
; SI-NEXT:    v_cndmask_b32_e64 v5, v4, v5, s[2:3]
; SI-NEXT:    v_mov_b32_e32 v4, s4
; SI-NEXT:    v_cndmask_b32_e64 v4, v4, 0, vcc
; SI-NEXT:    v_mov_b32_e32 v6, s10
; SI-NEXT:    v_cndmask_b32_e64 v4, v4, v6, s[2:3]
; SI-NEXT:    v_add_f64 v[6:7], s[10:11], -v[4:5]
; SI-NEXT:    s_bfe_u32 s2, s9, 0xb0014
; SI-NEXT:    s_add_i32 s6, s2, 0xfffffc01
; SI-NEXT:    v_mov_b32_e32 v9, s11
; SI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[6:7]|, 0.5
; SI-NEXT:    s_lshr_b64 s[2:3], s[20:21], s6
; SI-NEXT:    v_bfi_b32 v9, s23, v8, v9
; SI-NEXT:    s_andn2_b64 s[4:5], s[8:9], s[2:3]
; SI-NEXT:    s_and_b32 s2, s9, 0x80000000
; SI-NEXT:    v_cndmask_b32_e32 v7, 0, v9, vcc
; SI-NEXT:    v_mov_b32_e32 v6, 0
; SI-NEXT:    s_cmp_lt_i32 s6, 0
; SI-NEXT:    v_add_f64 v[6:7], v[4:5], v[6:7]
; SI-NEXT:    v_mov_b32_e32 v4, s5
; SI-NEXT:    v_mov_b32_e32 v5, s2
; SI-NEXT:    s_cselect_b64 vcc, -1, 0
; SI-NEXT:    s_cmp_gt_i32 s6, 51
; SI-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
; SI-NEXT:    v_mov_b32_e32 v5, s9
; SI-NEXT:    s_cselect_b64 s[2:3], -1, 0
; SI-NEXT:    v_cndmask_b32_e64 v5, v4, v5, s[2:3]
; SI-NEXT:    v_mov_b32_e32 v4, s4
; SI-NEXT:    v_cndmask_b32_e64 v4, v4, 0, vcc
; SI-NEXT:    v_mov_b32_e32 v9, s8
; SI-NEXT:    v_cndmask_b32_e64 v4, v4, v9, s[2:3]
; SI-NEXT:    s_bfe_u32 s2, s15, 0xb0014
; SI-NEXT:    v_add_f64 v[9:10], s[8:9], -v[4:5]
; SI-NEXT:    s_add_i32 s4, s2, 0xfffffc01
; SI-NEXT:    s_lshr_b64 s[2:3], s[20:21], s4
; SI-NEXT:    v_mov_b32_e32 v11, s9
; SI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[9:10]|, 0.5
; SI-NEXT:    s_andn2_b64 s[24:25], s[14:15], s[2:3]
; SI-NEXT:    s_and_b32 s2, s15, 0x80000000
; SI-NEXT:    v_bfi_b32 v11, s23, v8, v11
; SI-NEXT:    s_cmp_lt_i32 s4, 0
; SI-NEXT:    v_cndmask_b32_e32 v10, 0, v11, vcc
; SI-NEXT:    v_mov_b32_e32 v9, 0
; SI-NEXT:    s_cselect_b64 vcc, -1, 0
; SI-NEXT:    s_cmp_gt_i32 s4, 51
; SI-NEXT:    v_add_f64 v[4:5], v[4:5], v[9:10]
; SI-NEXT:    v_mov_b32_e32 v10, s2
; SI-NEXT:    s_cselect_b64 s[2:3], -1, 0
; SI-NEXT:    s_bfe_u32 s4, s13, 0xb0014
; SI-NEXT:    s_add_i32 s6, s4, 0xfffffc01
; SI-NEXT:    s_lshr_b64 s[4:5], s[20:21], s6
; SI-NEXT:    s_andn2_b64 s[26:27], s[12:13], s[4:5]
; SI-NEXT:    s_and_b32 s4, s13, 0x80000000
; SI-NEXT:    v_mov_b32_e32 v9, s25
; SI-NEXT:    s_cmp_lt_i32 s6, 0
; SI-NEXT:    v_cndmask_b32_e32 v15, v9, v10, vcc
; SI-NEXT:    v_mov_b32_e32 v10, s4
; SI-NEXT:    s_cselect_b64 s[4:5], -1, 0
; SI-NEXT:    s_cmp_gt_i32 s6, 51
; SI-NEXT:    s_cselect_b64 s[6:7], -1, 0
; SI-NEXT:    s_bfe_u32 s8, s19, 0xb0014
; SI-NEXT:    s_add_i32 s10, s8, 0xfffffc01
; SI-NEXT:    s_lshr_b64 s[8:9], s[20:21], s10
; SI-NEXT:    s_andn2_b64 s[28:29], s[18:19], s[8:9]
; SI-NEXT:    s_and_b32 s8, s19, 0x80000000
; SI-NEXT:    v_mov_b32_e32 v9, s27
; SI-NEXT:    s_cmp_lt_i32 s10, 0
; SI-NEXT:    v_cndmask_b32_e64 v17, v9, v10, s[4:5]
; SI-NEXT:    v_mov_b32_e32 v9, s29
; SI-NEXT:    v_mov_b32_e32 v10, s8
; SI-NEXT:    s_cselect_b64 s[8:9], -1, 0
; SI-NEXT:    s_cmp_gt_i32 s10, 51
; SI-NEXT:    v_cndmask_b32_e64 v9, v9, v10, s[8:9]
; SI-NEXT:    v_mov_b32_e32 v10, s19
; SI-NEXT:    s_cselect_b64 s[10:11], -1, 0
; SI-NEXT:    v_cndmask_b32_e64 v10, v9, v10, s[10:11]
; SI-NEXT:    v_mov_b32_e32 v9, s28
; SI-NEXT:    v_cndmask_b32_e64 v9, v9, 0, s[8:9]
; SI-NEXT:    v_mov_b32_e32 v11, s18
; SI-NEXT:    s_bfe_u32 s8, s17, 0xb0014
; SI-NEXT:    v_cndmask_b32_e64 v9, v9, v11, s[10:11]
; SI-NEXT:    s_add_i32 s10, s8, 0xfffffc01
; SI-NEXT:    s_lshr_b64 s[8:9], s[20:21], s10
; SI-NEXT:    s_andn2_b64 s[20:21], s[16:17], s[8:9]
; SI-NEXT:    s_and_b32 s8, s17, 0x80000000
; SI-NEXT:    s_cmp_lt_i32 s10, 0
; SI-NEXT:    v_mov_b32_e32 v11, s21
; SI-NEXT:    v_mov_b32_e32 v12, s8
; SI-NEXT:    s_cselect_b64 s[8:9], -1, 0
; SI-NEXT:    s_cmp_gt_i32 s10, 51
; SI-NEXT:    v_cndmask_b32_e64 v11, v11, v12, s[8:9]
; SI-NEXT:    v_mov_b32_e32 v12, s17
; SI-NEXT:    s_cselect_b64 s[10:11], -1, 0
; SI-NEXT:    v_cndmask_b32_e64 v14, v11, v12, s[10:11]
; SI-NEXT:    v_mov_b32_e32 v11, s20
; SI-NEXT:    v_cndmask_b32_e64 v11, v11, 0, s[8:9]
; SI-NEXT:    v_mov_b32_e32 v12, s16
; SI-NEXT:    v_cndmask_b32_e64 v13, v11, v12, s[10:11]
; SI-NEXT:    v_add_f64 v[11:12], s[16:17], -v[13:14]
; SI-NEXT:    v_mov_b32_e32 v19, s17
; SI-NEXT:    v_cmp_ge_f64_e64 s[8:9], |v[11:12]|, 0.5
; SI-NEXT:    v_mov_b32_e32 v11, s19
; SI-NEXT:    v_bfi_b32 v20, s23, v8, v11
; SI-NEXT:    v_add_f64 v[11:12], s[18:19], -v[9:10]
; SI-NEXT:    v_bfi_b32 v19, s23, v8, v19
; SI-NEXT:    v_cmp_ge_f64_e64 s[10:11], |v[11:12]|, 0.5
; SI-NEXT:    v_mov_b32_e32 v11, 0
; SI-NEXT:    v_cndmask_b32_e64 v12, 0, v20, s[10:11]
; SI-NEXT:    v_add_f64 v[11:12], v[9:10], v[11:12]
; SI-NEXT:    v_cndmask_b32_e64 v10, 0, v19, s[8:9]
; SI-NEXT:    v_mov_b32_e32 v9, 0
; SI-NEXT:    v_mov_b32_e32 v16, s15
; SI-NEXT:    v_add_f64 v[9:10], v[13:14], v[9:10]
; SI-NEXT:    v_mov_b32_e32 v13, s24
; SI-NEXT:    v_cndmask_b32_e64 v14, v15, v16, s[2:3]
; SI-NEXT:    v_cndmask_b32_e64 v13, v13, 0, vcc
; SI-NEXT:    v_mov_b32_e32 v15, s14
; SI-NEXT:    v_cndmask_b32_e64 v13, v13, v15, s[2:3]
; SI-NEXT:    v_mov_b32_e32 v15, s15
; SI-NEXT:    v_bfi_b32 v19, s23, v8, v15
; SI-NEXT:    v_mov_b32_e32 v15, s26
; SI-NEXT:    v_mov_b32_e32 v18, s13
; SI-NEXT:    v_cndmask_b32_e64 v15, v15, 0, s[4:5]
; SI-NEXT:    v_mov_b32_e32 v16, s12
; SI-NEXT:    v_cndmask_b32_e64 v18, v17, v18, s[6:7]
; SI-NEXT:    v_cndmask_b32_e64 v17, v15, v16, s[6:7]
; SI-NEXT:    v_mov_b32_e32 v15, s13
; SI-NEXT:    v_bfi_b32 v8, s23, v8, v15
; SI-NEXT:    v_add_f64 v[15:16], s[12:13], -v[17:18]
; SI-NEXT:    s_load_dwordx2 s[20:21], s[0:1], 0x9
; SI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[15:16]|, 0.5
; SI-NEXT:    v_add_f64 v[15:16], s[14:15], -v[13:14]
; SI-NEXT:    s_mov_b32 s23, 0xf000
; SI-NEXT:    v_cmp_ge_f64_e64 s[0:1], |v[15:16]|, 0.5
; SI-NEXT:    v_mov_b32_e32 v15, 0
; SI-NEXT:    v_cndmask_b32_e64 v16, 0, v19, s[0:1]
; SI-NEXT:    v_add_f64 v[15:16], v[13:14], v[15:16]
; SI-NEXT:    v_cndmask_b32_e32 v14, 0, v8, vcc
; SI-NEXT:    v_mov_b32_e32 v13, 0
; SI-NEXT:    v_add_f64 v[13:14], v[17:18], v[13:14]
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_store_dwordx4 v[9:12], off, s[20:23], 0 offset:48
; SI-NEXT:    buffer_store_dwordx4 v[13:16], off, s[20:23], 0 offset:32
; SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[20:23], 0 offset:16
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[20:23], 0
; SI-NEXT:    s_endpgm
;
; CI-LABEL: round_v8f64:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx16 s[4:19], s[0:1], 0x19
; CI-NEXT:    s_brev_b32 s2, -2
; CI-NEXT:    v_mov_b32_e32 v16, 0x3ff00000
; CI-NEXT:    s_load_dwordx2 s[20:21], s[0:1], 0x9
; CI-NEXT:    s_mov_b32 s23, 0xf000
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    v_trunc_f64_e32 v[0:1], s[6:7]
; CI-NEXT:    v_mov_b32_e32 v4, s7
; CI-NEXT:    v_add_f64 v[2:3], s[6:7], -v[0:1]
; CI-NEXT:    v_bfi_b32 v4, s2, v16, v4
; CI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[2:3]|, 0.5
; CI-NEXT:    v_mov_b32_e32 v2, 0
; CI-NEXT:    v_cndmask_b32_e32 v3, 0, v4, vcc
; CI-NEXT:    v_trunc_f64_e32 v[4:5], s[4:5]
; CI-NEXT:    v_add_f64 v[2:3], v[0:1], v[2:3]
; CI-NEXT:    v_add_f64 v[0:1], s[4:5], -v[4:5]
; CI-NEXT:    v_mov_b32_e32 v6, s5
; CI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[0:1]|, 0.5
; CI-NEXT:    v_bfi_b32 v6, s2, v16, v6
; CI-NEXT:    v_cndmask_b32_e32 v1, 0, v6, vcc
; CI-NEXT:    v_trunc_f64_e32 v[6:7], s[10:11]
; CI-NEXT:    v_mov_b32_e32 v0, 0
; CI-NEXT:    v_add_f64 v[0:1], v[4:5], v[0:1]
; CI-NEXT:    v_add_f64 v[4:5], s[10:11], -v[6:7]
; CI-NEXT:    v_mov_b32_e32 v8, s11
; CI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5
; CI-NEXT:    v_bfi_b32 v8, s2, v16, v8
; CI-NEXT:    v_cndmask_b32_e32 v5, 0, v8, vcc
; CI-NEXT:    v_trunc_f64_e32 v[8:9], s[8:9]
; CI-NEXT:    v_mov_b32_e32 v4, 0
; CI-NEXT:    v_add_f64 v[6:7], v[6:7], v[4:5]
; CI-NEXT:    v_add_f64 v[4:5], s[8:9], -v[8:9]
; CI-NEXT:    v_mov_b32_e32 v10, s9
; CI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5
; CI-NEXT:    v_bfi_b32 v10, s2, v16, v10
; CI-NEXT:    v_cndmask_b32_e32 v5, 0, v10, vcc
; CI-NEXT:    v_mov_b32_e32 v4, 0
; CI-NEXT:    v_add_f64 v[4:5], v[8:9], v[4:5]
; CI-NEXT:    v_mov_b32_e32 v8, s15
; CI-NEXT:    v_bfi_b32 v18, s2, v16, v8
; CI-NEXT:    v_trunc_f64_e32 v[8:9], s[16:17]
; CI-NEXT:    v_trunc_f64_e32 v[10:11], s[18:19]
; CI-NEXT:    v_add_f64 v[14:15], s[16:17], -v[8:9]
; CI-NEXT:    v_mov_b32_e32 v19, s19
; CI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[14:15]|, 0.5
; CI-NEXT:    v_add_f64 v[14:15], s[18:19], -v[10:11]
; CI-NEXT:    v_mov_b32_e32 v17, s17
; CI-NEXT:    v_cmp_ge_f64_e64 s[0:1], |v[14:15]|, 0.5
; CI-NEXT:    v_bfi_b32 v19, s2, v16, v19
; CI-NEXT:    v_trunc_f64_e32 v[12:13], s[12:13]
; CI-NEXT:    v_bfi_b32 v17, s2, v16, v17
; CI-NEXT:    v_cndmask_b32_e64 v15, 0, v19, s[0:1]
; CI-NEXT:    v_mov_b32_e32 v14, 0
; CI-NEXT:    v_add_f64 v[10:11], v[10:11], v[14:15]
; CI-NEXT:    v_cndmask_b32_e32 v15, 0, v17, vcc
; CI-NEXT:    v_mov_b32_e32 v14, 0
; CI-NEXT:    v_mov_b32_e32 v17, s13
; CI-NEXT:    v_add_f64 v[8:9], v[8:9], v[14:15]
; CI-NEXT:    v_add_f64 v[14:15], s[12:13], -v[12:13]
; CI-NEXT:    v_bfi_b32 v19, s2, v16, v17
; CI-NEXT:    v_trunc_f64_e32 v[16:17], s[14:15]
; CI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[14:15]|, 0.5
; CI-NEXT:    v_add_f64 v[14:15], s[14:15], -v[16:17]
; CI-NEXT:    s_mov_b32 s22, -1
; CI-NEXT:    v_cmp_ge_f64_e64 s[0:1], |v[14:15]|, 0.5
; CI-NEXT:    v_mov_b32_e32 v14, 0
; CI-NEXT:    v_cndmask_b32_e64 v15, 0, v18, s[0:1]
; CI-NEXT:    v_add_f64 v[14:15], v[16:17], v[14:15]
; CI-NEXT:    v_cndmask_b32_e32 v17, 0, v19, vcc
; CI-NEXT:    v_mov_b32_e32 v16, 0
; CI-NEXT:    v_add_f64 v[12:13], v[12:13], v[16:17]
; CI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[20:23], 0 offset:48
; CI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[20:23], 0 offset:32
; CI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[20:23], 0 offset:16
; CI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[20:23], 0
; CI-NEXT:    s_endpgm
  %result = call <8 x double> @llvm.round.v8f64(<8 x double> %in) #1  ; elementwise round
  store <8 x double> %result, <8 x double> addrspace(1)* %out
  ret void
}
707
708declare i32 @llvm.amdgcn.workitem.id.x() #1
709
710declare double @llvm.round.f64(double) #1
711declare <2 x double> @llvm.round.v2f64(<2 x double>) #1
712declare <4 x double> @llvm.round.v4f64(<4 x double>) #1
713declare <8 x double> @llvm.round.v8f64(<8 x double>) #1
714
715attributes #0 = { nounwind }
716attributes #1 = { nounwind readnone }
717