1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -amdgpu-scalarize-global-loads=false -march=amdgcn | FileCheck %s -check-prefixes=FUNC,SI,GCN
3; RUN: llc < %s -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global | FileCheck %s -check-prefixes=FUNC,SI,TONGA
4; RUN: llc < %s -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global | FileCheck %s -check-prefixes=FUNC,SI,GFX9
5; RUN: llc < %s -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood | FileCheck %s -check-prefixes=FUNC,EG
6
7; The code generated by sdiv is long and complex and may frequently change.
8; The goal of this test is to make sure the ISel doesn't fail.
9;
10; This program was previously failing to compile when one of the selectcc
11; opcodes generated by the sdiv lowering was being legalized and optimized to:
12; selectcc Remainder -1, 0, -1, SETGT
13; This was fixed by adding an additional pattern in R600Instructions.td to
14; match this pattern with a CNDGE_INT.
15
; Scalar i32 signed division with a run-time denominator.
; The kernel loads the numerator from %in and the denominator from the i32
; element right after it, divides them with sdiv and stores the quotient to
; %out. The per-target assertions below are autogenerated (see the NOTE at the
; top of the file), so they should be regenerated rather than edited by hand.
16define amdgpu_kernel void @sdiv_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
17; GCN-LABEL: sdiv_i32:
18; GCN:       ; %bb.0:
19; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
20; GCN-NEXT:    s_mov_b32 s7, 0xf000
21; GCN-NEXT:    s_mov_b32 s6, -1
22; GCN-NEXT:    s_mov_b32 s10, s6
23; GCN-NEXT:    s_mov_b32 s11, s7
24; GCN-NEXT:    s_waitcnt lgkmcnt(0)
25; GCN-NEXT:    s_mov_b32 s8, s2
26; GCN-NEXT:    s_mov_b32 s9, s3
27; GCN-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
28; GCN-NEXT:    s_mov_b32 s4, s0
29; GCN-NEXT:    s_mov_b32 s5, s1
30; GCN-NEXT:    s_waitcnt vmcnt(0)
31; GCN-NEXT:    v_ashrrev_i32_e32 v2, 31, v0
32; GCN-NEXT:    v_ashrrev_i32_e32 v3, 31, v1
33; GCN-NEXT:    v_xor_b32_e32 v4, v2, v3
34; GCN-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
35; GCN-NEXT:    v_add_i32_e32 v1, vcc, v3, v1
36; GCN-NEXT:    v_xor_b32_e32 v0, v0, v2
37; GCN-NEXT:    v_xor_b32_e32 v1, v1, v3
38; GCN-NEXT:    v_cvt_f32_u32_e32 v2, v1
39; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v2
40; GCN-NEXT:    v_mul_f32_e32 v2, 0x4f800000, v2
41; GCN-NEXT:    v_cvt_u32_f32_e32 v2, v2
42; GCN-NEXT:    v_mul_hi_u32 v3, v2, v1
43; GCN-NEXT:    v_mul_lo_u32 v5, v2, v1
44; GCN-NEXT:    v_sub_i32_e32 v6, vcc, 0, v5
45; GCN-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v3
46; GCN-NEXT:    v_cndmask_b32_e64 v3, v5, v6, s[0:1]
47; GCN-NEXT:    v_mul_hi_u32 v3, v3, v2
48; GCN-NEXT:    v_add_i32_e32 v5, vcc, v3, v2
49; GCN-NEXT:    v_subrev_i32_e32 v2, vcc, v3, v2
50; GCN-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s[0:1]
51; GCN-NEXT:    v_mul_hi_u32 v2, v2, v0
52; GCN-NEXT:    v_mul_lo_u32 v3, v2, v1
53; GCN-NEXT:    v_add_i32_e32 v5, vcc, 1, v2
54; GCN-NEXT:    v_add_i32_e32 v6, vcc, -1, v2
55; GCN-NEXT:    v_subrev_i32_e32 v7, vcc, v3, v0
56; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v3
57; GCN-NEXT:    v_cmp_ge_u32_e64 s[0:1], v7, v1
58; GCN-NEXT:    s_and_b64 s[0:1], s[0:1], vcc
59; GCN-NEXT:    v_cndmask_b32_e64 v0, v2, v5, s[0:1]
60; GCN-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc
61; GCN-NEXT:    v_xor_b32_e32 v0, v0, v4
62; GCN-NEXT:    v_sub_i32_e32 v0, vcc, v0, v4
63; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
64; GCN-NEXT:    s_endpgm
65;
66; TONGA-LABEL: sdiv_i32:
67; TONGA:       ; %bb.0:
68; TONGA-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x24
69; TONGA-NEXT:    s_mov_b32 s7, 0xf000
70; TONGA-NEXT:    s_mov_b32 s6, -1
71; TONGA-NEXT:    s_mov_b32 s2, s6
72; TONGA-NEXT:    s_mov_b32 s3, s7
73; TONGA-NEXT:    s_waitcnt lgkmcnt(0)
74; TONGA-NEXT:    s_mov_b32 s0, s10
75; TONGA-NEXT:    s_mov_b32 s1, s11
76; TONGA-NEXT:    buffer_load_dwordx2 v[0:1], off, s[0:3], 0
77; TONGA-NEXT:    s_mov_b32 s4, s8
78; TONGA-NEXT:    s_mov_b32 s5, s9
79; TONGA-NEXT:    s_waitcnt vmcnt(0)
80; TONGA-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
81; TONGA-NEXT:    v_add_u32_e32 v1, vcc, v2, v1
82; TONGA-NEXT:    v_xor_b32_e32 v1, v1, v2
83; TONGA-NEXT:    v_cvt_f32_u32_e32 v3, v1
84; TONGA-NEXT:    v_ashrrev_i32_e32 v6, 31, v0
85; TONGA-NEXT:    v_add_u32_e32 v0, vcc, v6, v0
86; TONGA-NEXT:    v_xor_b32_e32 v0, v0, v6
87; TONGA-NEXT:    v_rcp_iflag_f32_e32 v3, v3
88; TONGA-NEXT:    v_xor_b32_e32 v2, v6, v2
89; TONGA-NEXT:    v_mul_f32_e32 v3, 0x4f800000, v3
90; TONGA-NEXT:    v_cvt_u32_f32_e32 v3, v3
91; TONGA-NEXT:    v_mul_lo_u32 v4, v3, v1
92; TONGA-NEXT:    v_mul_hi_u32 v5, v3, v1
93; TONGA-NEXT:    v_sub_u32_e32 v7, vcc, 0, v4
94; TONGA-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v5
95; TONGA-NEXT:    v_cndmask_b32_e64 v4, v4, v7, s[0:1]
96; TONGA-NEXT:    v_mul_hi_u32 v4, v4, v3
97; TONGA-NEXT:    v_add_u32_e32 v5, vcc, v4, v3
98; TONGA-NEXT:    v_subrev_u32_e32 v3, vcc, v4, v3
99; TONGA-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s[0:1]
100; TONGA-NEXT:    v_mul_hi_u32 v3, v3, v0
101; TONGA-NEXT:    v_mul_lo_u32 v4, v3, v1
102; TONGA-NEXT:    v_add_u32_e32 v5, vcc, 1, v3
103; TONGA-NEXT:    v_add_u32_e32 v6, vcc, -1, v3
104; TONGA-NEXT:    v_subrev_u32_e32 v7, vcc, v4, v0
105; TONGA-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v4
106; TONGA-NEXT:    v_cmp_ge_u32_e64 s[0:1], v7, v1
107; TONGA-NEXT:    s_and_b64 s[0:1], s[0:1], vcc
108; TONGA-NEXT:    v_cndmask_b32_e64 v0, v3, v5, s[0:1]
109; TONGA-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc
110; TONGA-NEXT:    v_xor_b32_e32 v0, v0, v2
111; TONGA-NEXT:    v_sub_u32_e32 v0, vcc, v0, v2
112; TONGA-NEXT:    buffer_store_dword v0, off, s[4:7], 0
113; TONGA-NEXT:    s_endpgm
114;
115; GFX9-LABEL: sdiv_i32:
116; GFX9:       ; %bb.0:
117; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
118; GFX9-NEXT:    s_mov_b32 s7, 0xf000
119; GFX9-NEXT:    s_mov_b32 s6, -1
120; GFX9-NEXT:    s_mov_b32 s10, s6
121; GFX9-NEXT:    s_mov_b32 s11, s7
122; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
123; GFX9-NEXT:    s_mov_b32 s8, s2
124; GFX9-NEXT:    s_mov_b32 s9, s3
125; GFX9-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
126; GFX9-NEXT:    s_mov_b32 s4, s0
127; GFX9-NEXT:    s_mov_b32 s5, s1
128; GFX9-NEXT:    s_waitcnt vmcnt(0)
129; GFX9-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
130; GFX9-NEXT:    v_add_u32_e32 v1, v1, v2
131; GFX9-NEXT:    v_xor_b32_e32 v1, v1, v2
132; GFX9-NEXT:    v_cvt_f32_u32_e32 v3, v1
133; GFX9-NEXT:    v_rcp_iflag_f32_e32 v3, v3
134; GFX9-NEXT:    v_mul_f32_e32 v3, 0x4f800000, v3
135; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v3
136; GFX9-NEXT:    v_mul_lo_u32 v4, v3, v1
137; GFX9-NEXT:    v_mul_hi_u32 v5, v3, v1
138; GFX9-NEXT:    v_sub_u32_e32 v6, 0, v4
139; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v5
140; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
141; GFX9-NEXT:    v_mul_hi_u32 v4, v4, v3
142; GFX9-NEXT:    v_ashrrev_i32_e32 v5, 31, v0
143; GFX9-NEXT:    v_add_u32_e32 v0, v0, v5
144; GFX9-NEXT:    v_xor_b32_e32 v0, v0, v5
145; GFX9-NEXT:    v_add_u32_e32 v6, v3, v4
146; GFX9-NEXT:    v_sub_u32_e32 v3, v3, v4
147; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
148; GFX9-NEXT:    v_mul_hi_u32 v3, v3, v0
149; GFX9-NEXT:    v_xor_b32_e32 v2, v5, v2
150; GFX9-NEXT:    v_mul_lo_u32 v4, v3, v1
151; GFX9-NEXT:    v_add_u32_e32 v5, 1, v3
152; GFX9-NEXT:    v_add_u32_e32 v6, -1, v3
153; GFX9-NEXT:    v_sub_u32_e32 v7, v0, v4
154; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v4
155; GFX9-NEXT:    v_cmp_ge_u32_e64 s[0:1], v7, v1
156; GFX9-NEXT:    s_and_b64 s[0:1], s[0:1], vcc
157; GFX9-NEXT:    v_cndmask_b32_e64 v0, v3, v5, s[0:1]
158; GFX9-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc
159; GFX9-NEXT:    v_xor_b32_e32 v0, v0, v2
160; GFX9-NEXT:    v_sub_u32_e32 v0, v0, v2
161; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
162; GFX9-NEXT:    s_endpgm
163;
164; EG-LABEL: sdiv_i32:
165; EG:       ; %bb.0:
166; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
167; EG-NEXT:    TEX 0 @6
168; EG-NEXT:    ALU 30, @9, KC0[CB0:0-32], KC1[]
169; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
170; EG-NEXT:    CF_END
171; EG-NEXT:    PAD
172; EG-NEXT:    Fetch clause starting at 6:
173; EG-NEXT:     VTX_READ_64 T0.XY, T0.X, 0, #1
174; EG-NEXT:    ALU clause starting at 8:
175; EG-NEXT:     MOV * T0.X, KC0[2].Z,
176; EG-NEXT:    ALU clause starting at 9:
177; EG-NEXT:     SETGT_INT * T0.W, 0.0, T0.Y,
178; EG-NEXT:     ADD_INT * T1.W, T0.Y, PV.W,
179; EG-NEXT:     XOR_INT * T1.W, PV.W, T0.W,
180; EG-NEXT:     RECIP_UINT * T0.Y, PV.W,
181; EG-NEXT:     MULLO_INT * T0.Z, PS, T1.W,
182; EG-NEXT:     SUB_INT T2.W, 0.0, PS,
183; EG-NEXT:     MULHI * T1.X, T0.Y, T1.W,
184; EG-NEXT:     CNDE_INT T2.W, PS, PV.W, T0.Z,
185; EG-NEXT:     SETGT_INT * T3.W, 0.0, T0.X,
186; EG-NEXT:     MULHI * T0.Z, PV.W, T0.Y,
187; EG-NEXT:     ADD_INT T1.Z, T0.X, T3.W,
188; EG-NEXT:     ADD_INT T2.W, T0.Y, PS,
189; EG-NEXT:     SUB_INT * T4.W, T0.Y, PS,
190; EG-NEXT:     CNDE_INT T2.W, T1.X, PV.W, PS,
191; EG-NEXT:     XOR_INT * T4.W, PV.Z, T3.W,
192; EG-NEXT:     MULHI * T0.X, PV.W, PS,
193; EG-NEXT:     MULLO_INT * T0.Y, PS, T1.W,
194; EG-NEXT:     SUB_INT * T2.W, T4.W, PS,
195; EG-NEXT:     SETGE_UINT T1.W, PV.W, T1.W,
196; EG-NEXT:     SETGE_UINT * T2.W, T4.W, T0.Y,
197; EG-NEXT:     AND_INT T1.W, PV.W, PS,
198; EG-NEXT:     ADD_INT * T4.W, T0.X, 1,
199; EG-NEXT:     CNDE_INT T1.W, PV.W, T0.X, PS,
200; EG-NEXT:     ADD_INT * T4.W, T0.X, literal.x,
201; EG-NEXT:    -1(nan), 0(0.000000e+00)
202; EG-NEXT:     CNDE_INT T1.W, T2.W, PS, PV.W,
203; EG-NEXT:     XOR_INT * T0.W, T3.W, T0.W,
204; EG-NEXT:     XOR_INT * T1.W, PV.W, PS,
205; EG-NEXT:     SUB_INT T0.X, PV.W, T0.W,
206; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
207; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
; IR under test: %den_ptr is %in advanced by one i32 element, so numerator and
; denominator are adjacent in memory.
208  %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
209  %num = load i32, i32 addrspace(1) * %in
210  %den = load i32, i32 addrspace(1) * %den_ptr
211  %result = sdiv i32 %num, %den
212  store i32 %result, i32 addrspace(1)* %out
213  ret void
214}
215
; Signed division of an i32 loaded from %in by the constant 4; the quotient is
; stored to %out.
; The expected output for every target is a short shift/add/shift sequence
; (arithmetic shift by 31, logical shift by 30, add, arithmetic shift by 2)
; with no reciprocal or multiply instructions.
216define amdgpu_kernel void @sdiv_i32_4(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
217; GCN-LABEL: sdiv_i32_4:
218; GCN:       ; %bb.0:
219; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
220; GCN-NEXT:    s_mov_b32 s7, 0xf000
221; GCN-NEXT:    s_mov_b32 s6, -1
222; GCN-NEXT:    s_mov_b32 s10, s6
223; GCN-NEXT:    s_mov_b32 s11, s7
224; GCN-NEXT:    s_waitcnt lgkmcnt(0)
225; GCN-NEXT:    s_mov_b32 s8, s2
226; GCN-NEXT:    s_mov_b32 s9, s3
227; GCN-NEXT:    buffer_load_dword v0, off, s[8:11], 0
228; GCN-NEXT:    s_mov_b32 s4, s0
229; GCN-NEXT:    s_mov_b32 s5, s1
230; GCN-NEXT:    s_waitcnt vmcnt(0)
231; GCN-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
232; GCN-NEXT:    v_lshrrev_b32_e32 v1, 30, v1
233; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
234; GCN-NEXT:    v_ashrrev_i32_e32 v0, 2, v0
235; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
236; GCN-NEXT:    s_endpgm
237;
238; TONGA-LABEL: sdiv_i32_4:
239; TONGA:       ; %bb.0:
240; TONGA-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
241; TONGA-NEXT:    s_mov_b32 s3, 0xf000
242; TONGA-NEXT:    s_mov_b32 s2, -1
243; TONGA-NEXT:    s_waitcnt lgkmcnt(0)
244; TONGA-NEXT:    s_mov_b32 s0, s4
245; TONGA-NEXT:    s_mov_b32 s1, s5
246; TONGA-NEXT:    s_mov_b32 s4, s6
247; TONGA-NEXT:    s_mov_b32 s5, s7
248; TONGA-NEXT:    s_mov_b32 s6, s2
249; TONGA-NEXT:    s_mov_b32 s7, s3
250; TONGA-NEXT:    buffer_load_dword v0, off, s[4:7], 0
251; TONGA-NEXT:    s_waitcnt vmcnt(0)
252; TONGA-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
253; TONGA-NEXT:    v_lshrrev_b32_e32 v1, 30, v1
254; TONGA-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
255; TONGA-NEXT:    v_ashrrev_i32_e32 v0, 2, v0
256; TONGA-NEXT:    buffer_store_dword v0, off, s[0:3], 0
257; TONGA-NEXT:    s_endpgm
258;
259; GFX9-LABEL: sdiv_i32_4:
260; GFX9:       ; %bb.0:
261; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
262; GFX9-NEXT:    s_mov_b32 s3, 0xf000
263; GFX9-NEXT:    s_mov_b32 s2, -1
264; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
265; GFX9-NEXT:    s_mov_b32 s0, s4
266; GFX9-NEXT:    s_mov_b32 s1, s5
267; GFX9-NEXT:    s_mov_b32 s4, s6
268; GFX9-NEXT:    s_mov_b32 s5, s7
269; GFX9-NEXT:    s_mov_b32 s6, s2
270; GFX9-NEXT:    s_mov_b32 s7, s3
271; GFX9-NEXT:    buffer_load_dword v0, off, s[4:7], 0
272; GFX9-NEXT:    s_waitcnt vmcnt(0)
273; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
274; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 30, v1
275; GFX9-NEXT:    v_add_u32_e32 v0, v0, v1
276; GFX9-NEXT:    v_ashrrev_i32_e32 v0, 2, v0
277; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
278; GFX9-NEXT:    s_endpgm
279;
280; EG-LABEL: sdiv_i32_4:
281; EG:       ; %bb.0:
282; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
283; EG-NEXT:    TEX 0 @6
284; EG-NEXT:    ALU 7, @9, KC0[CB0:0-32], KC1[]
285; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
286; EG-NEXT:    CF_END
287; EG-NEXT:    PAD
288; EG-NEXT:    Fetch clause starting at 6:
289; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
290; EG-NEXT:    ALU clause starting at 8:
291; EG-NEXT:     MOV * T0.X, KC0[2].Z,
292; EG-NEXT:    ALU clause starting at 9:
293; EG-NEXT:     ASHR * T0.W, T0.X, literal.x,
294; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
295; EG-NEXT:     LSHR * T0.W, PV.W, literal.x,
296; EG-NEXT:    30(4.203895e-44), 0(0.000000e+00)
297; EG-NEXT:     ADD_INT * T0.W, T0.X, PV.W,
298; EG-NEXT:     ASHR T0.X, PV.W, literal.x,
299; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
300; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
; IR under test: the divisor is the immediate constant 4.
301  %num = load i32, i32 addrspace(1) * %in
302  %result = sdiv i32 %num, 4
303  store i32 %result, i32 addrspace(1)* %out
304  ret void
305}
306
307; Divide by a non-power-of-two constant to make sure setIntDivIsCheap is
308; working: the checks below expect a multiply-high by a magic constant.
309
; Signed division of an i32 loaded from %in by the constant 3435; the quotient
; is stored to %out.
; The expected output uses a signed multiply-high by the constant 0x98a1930b
; (printed as -1734241525 in the EG checks), adds the numerator, and finishes
; with shifts by 11 and 31 plus one add. No reciprocal-based divide sequence
; appears in the expected output.
310define amdgpu_kernel void @slow_sdiv_i32_3435(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
311; GCN-LABEL: slow_sdiv_i32_3435:
312; GCN:       ; %bb.0:
313; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
314; GCN-NEXT:    s_mov_b32 s7, 0xf000
315; GCN-NEXT:    s_mov_b32 s6, -1
316; GCN-NEXT:    s_mov_b32 s10, s6
317; GCN-NEXT:    s_mov_b32 s11, s7
318; GCN-NEXT:    s_waitcnt lgkmcnt(0)
319; GCN-NEXT:    s_mov_b32 s8, s2
320; GCN-NEXT:    s_mov_b32 s9, s3
321; GCN-NEXT:    buffer_load_dword v0, off, s[8:11], 0
322; GCN-NEXT:    s_mov_b32 s2, 0x98a1930b
323; GCN-NEXT:    s_mov_b32 s4, s0
324; GCN-NEXT:    s_mov_b32 s5, s1
325; GCN-NEXT:    s_waitcnt vmcnt(0)
326; GCN-NEXT:    v_mul_hi_i32 v1, v0, s2
327; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
328; GCN-NEXT:    v_lshrrev_b32_e32 v1, 31, v0
329; GCN-NEXT:    v_ashrrev_i32_e32 v0, 11, v0
330; GCN-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
331; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
332; GCN-NEXT:    s_endpgm
333;
334; TONGA-LABEL: slow_sdiv_i32_3435:
335; TONGA:       ; %bb.0:
336; TONGA-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
337; TONGA-NEXT:    s_mov_b32 s3, 0xf000
338; TONGA-NEXT:    s_mov_b32 s2, -1
339; TONGA-NEXT:    s_mov_b32 s10, s2
340; TONGA-NEXT:    s_mov_b32 s11, s3
341; TONGA-NEXT:    s_waitcnt lgkmcnt(0)
342; TONGA-NEXT:    s_mov_b32 s8, s6
343; TONGA-NEXT:    s_mov_b32 s9, s7
344; TONGA-NEXT:    buffer_load_dword v0, off, s[8:11], 0
345; TONGA-NEXT:    s_mov_b32 s0, 0x98a1930b
346; TONGA-NEXT:    s_mov_b32 s1, s5
347; TONGA-NEXT:    s_waitcnt vmcnt(0)
348; TONGA-NEXT:    v_mul_hi_i32 v1, v0, s0
349; TONGA-NEXT:    s_mov_b32 s0, s4
350; TONGA-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
351; TONGA-NEXT:    v_lshrrev_b32_e32 v1, 31, v0
352; TONGA-NEXT:    v_ashrrev_i32_e32 v0, 11, v0
353; TONGA-NEXT:    v_add_u32_e32 v0, vcc, v1, v0
354; TONGA-NEXT:    buffer_store_dword v0, off, s[0:3], 0
355; TONGA-NEXT:    s_endpgm
356;
357; GFX9-LABEL: slow_sdiv_i32_3435:
358; GFX9:       ; %bb.0:
359; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
360; GFX9-NEXT:    s_mov_b32 s3, 0xf000
361; GFX9-NEXT:    s_mov_b32 s2, -1
362; GFX9-NEXT:    s_mov_b32 s10, s2
363; GFX9-NEXT:    s_mov_b32 s11, s3
364; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
365; GFX9-NEXT:    s_mov_b32 s8, s6
366; GFX9-NEXT:    s_mov_b32 s9, s7
367; GFX9-NEXT:    buffer_load_dword v0, off, s[8:11], 0
368; GFX9-NEXT:    s_mov_b32 s0, 0x98a1930b
369; GFX9-NEXT:    s_mov_b32 s1, s5
370; GFX9-NEXT:    s_waitcnt vmcnt(0)
371; GFX9-NEXT:    v_mul_hi_i32 v1, v0, s0
372; GFX9-NEXT:    s_mov_b32 s0, s4
373; GFX9-NEXT:    v_add_u32_e32 v0, v1, v0
374; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 31, v0
375; GFX9-NEXT:    v_ashrrev_i32_e32 v0, 11, v0
376; GFX9-NEXT:    v_add_u32_e32 v0, v0, v1
377; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
378; GFX9-NEXT:    s_endpgm
379;
380; EG-LABEL: slow_sdiv_i32_3435:
381; EG:       ; %bb.0:
382; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
383; EG-NEXT:    TEX 0 @6
384; EG-NEXT:    ALU 8, @9, KC0[CB0:0-32], KC1[]
385; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
386; EG-NEXT:    CF_END
387; EG-NEXT:    PAD
388; EG-NEXT:    Fetch clause starting at 6:
389; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
390; EG-NEXT:    ALU clause starting at 8:
391; EG-NEXT:     MOV * T0.X, KC0[2].Z,
392; EG-NEXT:    ALU clause starting at 9:
393; EG-NEXT:     MULHI_INT * T0.Y, T0.X, literal.x,
394; EG-NEXT:    -1734241525(-4.176600e-24), 0(0.000000e+00)
395; EG-NEXT:     ADD_INT * T0.W, PS, T0.X,
396; EG-NEXT:     ASHR T1.W, PV.W, literal.x,
397; EG-NEXT:     LSHR * T0.W, PV.W, literal.y,
398; EG-NEXT:    11(1.541428e-44), 31(4.344025e-44)
399; EG-NEXT:     ADD_INT T0.X, PV.W, PS,
400; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
401; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
; IR under test: the divisor is the immediate constant 3435.
402  %num = load i32, i32 addrspace(1) * %in
403  %result = sdiv i32 %num, 3435
404  store i32 %result, i32 addrspace(1)* %out
405  ret void
406}
407
; <2 x i32> signed division with a run-time denominator vector.
; The numerator vector is loaded from %in and the denominator vector from the
; <2 x i32> element right after it; the quotient vector is stored to %out.
; The expected output contains two reciprocal computations (v_rcp_iflag_f32 /
; RECIP_UINT), one for each denominator lane.
408define amdgpu_kernel void @sdiv_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
409; GCN-LABEL: sdiv_v2i32:
410; GCN:       ; %bb.0:
411; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
412; GCN-NEXT:    s_mov_b32 s11, 0xf000
413; GCN-NEXT:    s_mov_b32 s10, -1
414; GCN-NEXT:    s_mov_b32 s6, s10
415; GCN-NEXT:    s_mov_b32 s7, s11
416; GCN-NEXT:    s_waitcnt lgkmcnt(0)
417; GCN-NEXT:    s_mov_b32 s4, s2
418; GCN-NEXT:    s_mov_b32 s5, s3
419; GCN-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
420; GCN-NEXT:    s_mov_b32 s2, 0x4f800000
421; GCN-NEXT:    s_mov_b32 s8, s0
422; GCN-NEXT:    s_mov_b32 s9, s1
423; GCN-NEXT:    s_waitcnt vmcnt(0)
424; GCN-NEXT:    v_ashrrev_i32_e32 v4, 31, v0
425; GCN-NEXT:    v_ashrrev_i32_e32 v5, 31, v2
426; GCN-NEXT:    v_ashrrev_i32_e32 v6, 31, v1
427; GCN-NEXT:    v_ashrrev_i32_e32 v7, 31, v3
428; GCN-NEXT:    v_xor_b32_e32 v8, v4, v5
429; GCN-NEXT:    v_add_i32_e32 v0, vcc, v4, v0
430; GCN-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
431; GCN-NEXT:    v_xor_b32_e32 v9, v6, v7
432; GCN-NEXT:    v_add_i32_e32 v1, vcc, v6, v1
433; GCN-NEXT:    v_add_i32_e32 v3, vcc, v7, v3
434; GCN-NEXT:    v_xor_b32_e32 v0, v0, v4
435; GCN-NEXT:    v_xor_b32_e32 v2, v2, v5
436; GCN-NEXT:    v_xor_b32_e32 v1, v1, v6
437; GCN-NEXT:    v_xor_b32_e32 v3, v3, v7
438; GCN-NEXT:    v_cvt_f32_u32_e32 v4, v2
439; GCN-NEXT:    v_cvt_f32_u32_e32 v5, v3
440; GCN-NEXT:    v_rcp_iflag_f32_e32 v4, v4
441; GCN-NEXT:    v_rcp_iflag_f32_e32 v5, v5
442; GCN-NEXT:    v_mul_f32_e32 v4, s2, v4
443; GCN-NEXT:    v_mul_f32_e32 v5, s2, v5
444; GCN-NEXT:    v_cvt_u32_f32_e32 v4, v4
445; GCN-NEXT:    v_cvt_u32_f32_e32 v5, v5
446; GCN-NEXT:    v_mul_hi_u32 v6, v4, v2
447; GCN-NEXT:    v_mul_lo_u32 v7, v4, v2
448; GCN-NEXT:    v_mul_hi_u32 v10, v5, v3
449; GCN-NEXT:    v_mul_lo_u32 v11, v5, v3
450; GCN-NEXT:    v_sub_i32_e32 v12, vcc, 0, v7
451; GCN-NEXT:    v_sub_i32_e32 v13, vcc, 0, v11
452; GCN-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v6
453; GCN-NEXT:    v_cndmask_b32_e64 v6, v7, v12, s[0:1]
454; GCN-NEXT:    v_cmp_eq_u32_e64 s[2:3], 0, v10
455; GCN-NEXT:    v_cndmask_b32_e64 v7, v11, v13, s[2:3]
456; GCN-NEXT:    v_mul_hi_u32 v6, v6, v4
457; GCN-NEXT:    v_mul_hi_u32 v7, v7, v5
458; GCN-NEXT:    v_add_i32_e32 v10, vcc, v6, v4
459; GCN-NEXT:    v_subrev_i32_e32 v4, vcc, v6, v4
460; GCN-NEXT:    v_add_i32_e32 v6, vcc, v7, v5
461; GCN-NEXT:    v_subrev_i32_e32 v5, vcc, v7, v5
462; GCN-NEXT:    v_cndmask_b32_e64 v4, v4, v10, s[0:1]
463; GCN-NEXT:    v_cndmask_b32_e64 v5, v5, v6, s[2:3]
464; GCN-NEXT:    v_mul_hi_u32 v4, v4, v0
465; GCN-NEXT:    v_mul_hi_u32 v5, v5, v1
466; GCN-NEXT:    v_mul_lo_u32 v6, v4, v2
467; GCN-NEXT:    v_add_i32_e32 v7, vcc, 1, v4
468; GCN-NEXT:    v_add_i32_e32 v10, vcc, -1, v4
469; GCN-NEXT:    v_mul_lo_u32 v11, v5, v3
470; GCN-NEXT:    v_add_i32_e32 v12, vcc, 1, v5
471; GCN-NEXT:    v_add_i32_e32 v13, vcc, -1, v5
472; GCN-NEXT:    v_subrev_i32_e32 v14, vcc, v6, v0
473; GCN-NEXT:    v_cmp_ge_u32_e64 s[0:1], v0, v6
474; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, v11, v1
475; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v11
476; GCN-NEXT:    v_cmp_ge_u32_e64 s[2:3], v14, v2
477; GCN-NEXT:    v_cmp_ge_u32_e64 s[4:5], v0, v3
478; GCN-NEXT:    s_and_b64 s[2:3], s[2:3], s[0:1]
479; GCN-NEXT:    v_cndmask_b32_e64 v0, v4, v7, s[2:3]
480; GCN-NEXT:    s_and_b64 s[2:3], s[4:5], vcc
481; GCN-NEXT:    v_cndmask_b32_e64 v1, v5, v12, s[2:3]
482; GCN-NEXT:    v_cndmask_b32_e64 v0, v10, v0, s[0:1]
483; GCN-NEXT:    v_cndmask_b32_e32 v1, v13, v1, vcc
484; GCN-NEXT:    v_xor_b32_e32 v0, v0, v8
485; GCN-NEXT:    v_xor_b32_e32 v1, v1, v9
486; GCN-NEXT:    v_sub_i32_e32 v0, vcc, v0, v8
487; GCN-NEXT:    v_sub_i32_e32 v1, vcc, v1, v9
488; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
489; GCN-NEXT:    s_endpgm
490;
491; TONGA-LABEL: sdiv_v2i32:
492; TONGA:       ; %bb.0:
493; TONGA-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
494; TONGA-NEXT:    s_mov_b32 s11, 0xf000
495; TONGA-NEXT:    s_mov_b32 s10, -1
496; TONGA-NEXT:    s_mov_b32 s4, 0x4f800000
497; TONGA-NEXT:    s_waitcnt lgkmcnt(0)
498; TONGA-NEXT:    s_mov_b32 s8, s0
499; TONGA-NEXT:    s_mov_b32 s9, s1
500; TONGA-NEXT:    s_mov_b32 s0, s2
501; TONGA-NEXT:    s_mov_b32 s1, s3
502; TONGA-NEXT:    s_mov_b32 s2, s10
503; TONGA-NEXT:    s_mov_b32 s3, s11
504; TONGA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0
505; TONGA-NEXT:    s_waitcnt vmcnt(0)
506; TONGA-NEXT:    v_ashrrev_i32_e32 v5, 31, v2
507; TONGA-NEXT:    v_ashrrev_i32_e32 v7, 31, v3
508; TONGA-NEXT:    v_add_u32_e32 v2, vcc, v5, v2
509; TONGA-NEXT:    v_ashrrev_i32_e32 v4, 31, v0
510; TONGA-NEXT:    v_add_u32_e32 v3, vcc, v7, v3
511; TONGA-NEXT:    v_xor_b32_e32 v2, v2, v5
512; TONGA-NEXT:    v_ashrrev_i32_e32 v6, 31, v1
513; TONGA-NEXT:    v_xor_b32_e32 v8, v4, v5
514; TONGA-NEXT:    v_cvt_f32_u32_e32 v5, v2
515; TONGA-NEXT:    v_xor_b32_e32 v3, v3, v7
516; TONGA-NEXT:    v_xor_b32_e32 v9, v6, v7
517; TONGA-NEXT:    v_cvt_f32_u32_e32 v7, v3
518; TONGA-NEXT:    v_rcp_iflag_f32_e32 v5, v5
519; TONGA-NEXT:    v_add_u32_e32 v0, vcc, v4, v0
520; TONGA-NEXT:    v_xor_b32_e32 v0, v0, v4
521; TONGA-NEXT:    v_rcp_iflag_f32_e32 v7, v7
522; TONGA-NEXT:    v_mul_f32_e32 v4, s4, v5
523; TONGA-NEXT:    v_cvt_u32_f32_e32 v4, v4
524; TONGA-NEXT:    v_add_u32_e32 v1, vcc, v6, v1
525; TONGA-NEXT:    v_mul_f32_e32 v5, s4, v7
526; TONGA-NEXT:    v_cvt_u32_f32_e32 v5, v5
527; TONGA-NEXT:    v_xor_b32_e32 v1, v1, v6
528; TONGA-NEXT:    v_mul_hi_u32 v6, v4, v2
529; TONGA-NEXT:    v_mul_lo_u32 v7, v4, v2
530; TONGA-NEXT:    v_mul_hi_u32 v10, v5, v3
531; TONGA-NEXT:    v_mul_lo_u32 v11, v5, v3
532; TONGA-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v6
533; TONGA-NEXT:    v_sub_u32_e32 v12, vcc, 0, v7
534; TONGA-NEXT:    v_cndmask_b32_e64 v6, v7, v12, s[0:1]
535; TONGA-NEXT:    v_sub_u32_e32 v13, vcc, 0, v11
536; TONGA-NEXT:    v_cmp_eq_u32_e64 s[2:3], 0, v10
537; TONGA-NEXT:    v_cndmask_b32_e64 v7, v11, v13, s[2:3]
538; TONGA-NEXT:    v_mul_hi_u32 v6, v6, v4
539; TONGA-NEXT:    v_mul_hi_u32 v7, v7, v5
540; TONGA-NEXT:    v_add_u32_e32 v10, vcc, v6, v4
541; TONGA-NEXT:    v_subrev_u32_e32 v4, vcc, v6, v4
542; TONGA-NEXT:    v_cndmask_b32_e64 v4, v4, v10, s[0:1]
543; TONGA-NEXT:    v_add_u32_e32 v6, vcc, v7, v5
544; TONGA-NEXT:    v_subrev_u32_e32 v5, vcc, v7, v5
545; TONGA-NEXT:    v_mul_hi_u32 v4, v4, v0
546; TONGA-NEXT:    v_cndmask_b32_e64 v5, v5, v6, s[2:3]
547; TONGA-NEXT:    v_mul_hi_u32 v5, v5, v1
548; TONGA-NEXT:    v_mul_lo_u32 v6, v4, v2
549; TONGA-NEXT:    v_add_u32_e32 v7, vcc, 1, v4
550; TONGA-NEXT:    v_mul_lo_u32 v11, v5, v3
551; TONGA-NEXT:    v_add_u32_e32 v10, vcc, -1, v4
552; TONGA-NEXT:    v_subrev_u32_e32 v14, vcc, v6, v0
553; TONGA-NEXT:    v_cmp_ge_u32_e64 s[0:1], v0, v6
554; TONGA-NEXT:    v_cmp_ge_u32_e64 s[2:3], v14, v2
555; TONGA-NEXT:    v_subrev_u32_e32 v0, vcc, v11, v1
556; TONGA-NEXT:    v_add_u32_e32 v12, vcc, 1, v5
557; TONGA-NEXT:    v_add_u32_e32 v13, vcc, -1, v5
558; TONGA-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v11
559; TONGA-NEXT:    v_cmp_ge_u32_e64 s[4:5], v0, v3
560; TONGA-NEXT:    s_and_b64 s[2:3], s[2:3], s[0:1]
561; TONGA-NEXT:    v_cndmask_b32_e64 v0, v4, v7, s[2:3]
562; TONGA-NEXT:    s_and_b64 s[2:3], s[4:5], vcc
563; TONGA-NEXT:    v_cndmask_b32_e64 v1, v5, v12, s[2:3]
564; TONGA-NEXT:    v_cndmask_b32_e64 v0, v10, v0, s[0:1]
565; TONGA-NEXT:    v_cndmask_b32_e32 v1, v13, v1, vcc
566; TONGA-NEXT:    v_xor_b32_e32 v0, v0, v8
567; TONGA-NEXT:    v_xor_b32_e32 v1, v1, v9
568; TONGA-NEXT:    v_sub_u32_e32 v0, vcc, v0, v8
569; TONGA-NEXT:    v_sub_u32_e32 v1, vcc, v1, v9
570; TONGA-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
571; TONGA-NEXT:    s_endpgm
572;
573; GFX9-LABEL: sdiv_v2i32:
574; GFX9:       ; %bb.0:
575; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
576; GFX9-NEXT:    s_mov_b32 s11, 0xf000
577; GFX9-NEXT:    s_mov_b32 s10, -1
578; GFX9-NEXT:    s_mov_b32 s4, 0x4f800000
579; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
580; GFX9-NEXT:    s_mov_b32 s8, s0
581; GFX9-NEXT:    s_mov_b32 s9, s1
582; GFX9-NEXT:    s_mov_b32 s0, s2
583; GFX9-NEXT:    s_mov_b32 s1, s3
584; GFX9-NEXT:    s_mov_b32 s2, s10
585; GFX9-NEXT:    s_mov_b32 s3, s11
586; GFX9-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0
587; GFX9-NEXT:    s_waitcnt vmcnt(0)
588; GFX9-NEXT:    v_ashrrev_i32_e32 v5, 31, v2
589; GFX9-NEXT:    v_ashrrev_i32_e32 v6, 31, v3
590; GFX9-NEXT:    v_add_u32_e32 v2, v2, v5
591; GFX9-NEXT:    v_add_u32_e32 v3, v3, v6
592; GFX9-NEXT:    v_xor_b32_e32 v2, v2, v5
593; GFX9-NEXT:    v_cvt_f32_u32_e32 v7, v2
594; GFX9-NEXT:    v_xor_b32_e32 v3, v3, v6
595; GFX9-NEXT:    v_cvt_f32_u32_e32 v8, v3
596; GFX9-NEXT:    v_ashrrev_i32_e32 v4, 31, v0
597; GFX9-NEXT:    v_rcp_iflag_f32_e32 v7, v7
598; GFX9-NEXT:    v_add_u32_e32 v0, v0, v4
599; GFX9-NEXT:    v_rcp_iflag_f32_e32 v8, v8
600; GFX9-NEXT:    v_xor_b32_e32 v5, v4, v5
601; GFX9-NEXT:    v_mul_f32_e32 v7, s4, v7
602; GFX9-NEXT:    v_cvt_u32_f32_e32 v7, v7
603; GFX9-NEXT:    v_mul_f32_e32 v8, s4, v8
604; GFX9-NEXT:    v_cvt_u32_f32_e32 v8, v8
605; GFX9-NEXT:    v_xor_b32_e32 v0, v0, v4
606; GFX9-NEXT:    v_mul_lo_u32 v4, v7, v2
607; GFX9-NEXT:    v_mul_hi_u32 v11, v7, v2
608; GFX9-NEXT:    v_mul_lo_u32 v10, v8, v3
609; GFX9-NEXT:    v_mul_hi_u32 v12, v8, v3
610; GFX9-NEXT:    v_sub_u32_e32 v13, 0, v4
611; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v11
612; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v13, vcc
613; GFX9-NEXT:    v_sub_u32_e32 v14, 0, v10
614; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v12
615; GFX9-NEXT:    v_cndmask_b32_e64 v10, v10, v14, s[0:1]
616; GFX9-NEXT:    v_mul_hi_u32 v4, v4, v7
617; GFX9-NEXT:    v_mul_hi_u32 v10, v10, v8
618; GFX9-NEXT:    v_ashrrev_i32_e32 v9, 31, v1
619; GFX9-NEXT:    v_add_u32_e32 v1, v1, v9
620; GFX9-NEXT:    v_xor_b32_e32 v6, v9, v6
621; GFX9-NEXT:    v_xor_b32_e32 v1, v1, v9
622; GFX9-NEXT:    v_add_u32_e32 v9, v7, v4
623; GFX9-NEXT:    v_sub_u32_e32 v4, v7, v4
624; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v9, vcc
625; GFX9-NEXT:    v_add_u32_e32 v7, v8, v10
626; GFX9-NEXT:    v_sub_u32_e32 v8, v8, v10
627; GFX9-NEXT:    v_mul_hi_u32 v4, v4, v0
628; GFX9-NEXT:    v_cndmask_b32_e64 v7, v8, v7, s[0:1]
629; GFX9-NEXT:    v_mul_hi_u32 v7, v7, v1
630; GFX9-NEXT:    v_mul_lo_u32 v8, v4, v2
631; GFX9-NEXT:    v_add_u32_e32 v9, 1, v4
632; GFX9-NEXT:    v_mul_lo_u32 v11, v7, v3
633; GFX9-NEXT:    v_add_u32_e32 v12, 1, v7
634; GFX9-NEXT:    v_sub_u32_e32 v14, v0, v8
635; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v8
636; GFX9-NEXT:    v_cmp_ge_u32_e64 s[2:3], v14, v2
637; GFX9-NEXT:    v_sub_u32_e32 v0, v1, v11
638; GFX9-NEXT:    v_cmp_ge_u32_e64 s[0:1], v1, v11
639; GFX9-NEXT:    v_cmp_ge_u32_e64 s[4:5], v0, v3
640; GFX9-NEXT:    s_and_b64 s[2:3], s[2:3], vcc
641; GFX9-NEXT:    v_cndmask_b32_e64 v0, v4, v9, s[2:3]
642; GFX9-NEXT:    s_and_b64 s[2:3], s[4:5], s[0:1]
643; GFX9-NEXT:    v_add_u32_e32 v10, -1, v4
644; GFX9-NEXT:    v_add_u32_e32 v13, -1, v7
645; GFX9-NEXT:    v_cndmask_b32_e64 v1, v7, v12, s[2:3]
646; GFX9-NEXT:    v_cndmask_b32_e32 v0, v10, v0, vcc
647; GFX9-NEXT:    v_cndmask_b32_e64 v1, v13, v1, s[0:1]
648; GFX9-NEXT:    v_xor_b32_e32 v0, v0, v5
649; GFX9-NEXT:    v_xor_b32_e32 v1, v1, v6
650; GFX9-NEXT:    v_sub_u32_e32 v0, v0, v5
651; GFX9-NEXT:    v_sub_u32_e32 v1, v1, v6
652; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
653; GFX9-NEXT:    s_endpgm
654;
655; EG-LABEL: sdiv_v2i32:
656; EG:       ; %bb.0:
657; EG-NEXT:    ALU 0, @10, KC0[CB0:0-32], KC1[]
658; EG-NEXT:    TEX 1 @6
659; EG-NEXT:    ALU 59, @11, KC0[CB0:0-32], KC1[]
660; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
661; EG-NEXT:    CF_END
662; EG-NEXT:    PAD
663; EG-NEXT:    Fetch clause starting at 6:
664; EG-NEXT:     VTX_READ_64 T1.XY, T0.X, 8, #1
665; EG-NEXT:     VTX_READ_64 T0.XY, T0.X, 0, #1
666; EG-NEXT:    ALU clause starting at 10:
667; EG-NEXT:     MOV * T0.X, KC0[2].Z,
668; EG-NEXT:    ALU clause starting at 11:
669; EG-NEXT:     SETGT_INT * T0.W, 0.0, T1.Y,
670; EG-NEXT:     ADD_INT * T1.W, T1.Y, PV.W,
671; EG-NEXT:     XOR_INT T1.W, PV.W, T0.W,
672; EG-NEXT:     SETGT_INT * T2.W, 0.0, T1.X,
673; EG-NEXT:     ADD_INT T3.W, T1.X, PS,
674; EG-NEXT:     RECIP_UINT * T0.Z, PV.W,
675; EG-NEXT:     XOR_INT T3.W, PV.W, T2.W, BS:VEC_021/SCL_122
676; EG-NEXT:     MULLO_INT * T1.X, PS, T1.W,
677; EG-NEXT:     RECIP_UINT * T1.Y, PV.W,
678; EG-NEXT:     MULLO_INT * T1.Z, PS, T3.W,
679; EG-NEXT:     SUB_INT T4.W, 0.0, PS,
680; EG-NEXT:     MULHI * T2.X, T1.Y, T3.W,
681; EG-NEXT:     CNDE_INT T1.Z, PS, PV.W, T1.Z, BS:VEC_021/SCL_122
682; EG-NEXT:     SUB_INT T4.W, 0.0, T1.X,
683; EG-NEXT:     MULHI * T2.Y, T0.Z, T1.W,
684; EG-NEXT:     CNDE_INT T2.Z, PS, PV.W, T1.X,
685; EG-NEXT:     SETGT_INT T4.W, 0.0, T0.X,
686; EG-NEXT:     MULHI * T1.X, PV.Z, T1.Y,
687; EG-NEXT:     SETGT_INT T3.X, 0.0, T0.Y,
688; EG-NEXT:     ADD_INT T3.Y, T0.X, PV.W,
689; EG-NEXT:     ADD_INT T1.Z, T1.Y, PS,
690; EG-NEXT:     SUB_INT T5.W, T1.Y, PS,
691; EG-NEXT:     MULHI * T0.X, PV.Z, T0.Z,
692; EG-NEXT:     CNDE_INT T1.X, T2.X, PV.Z, PV.W,
693; EG-NEXT:     XOR_INT T1.Y, PV.Y, T4.W,
694; EG-NEXT:     ADD_INT T1.Z, T0.Y, PV.X,
695; EG-NEXT:     ADD_INT T5.W, T0.Z, PS,
696; EG-NEXT:     SUB_INT * T6.W, T0.Z, PS,
697; EG-NEXT:     CNDE_INT T0.Z, T2.Y, PV.W, PS,
698; EG-NEXT:     XOR_INT T5.W, PV.Z, T3.X,
699; EG-NEXT:     MULHI * T0.X, PV.X, PV.Y,
700; EG-NEXT:     MULHI * T0.Y, PV.Z, PV.W,
701; EG-NEXT:     MULLO_INT * T0.Z, PS, T1.W,
702; EG-NEXT:     SUB_INT T6.W, T5.W, PS,
703; EG-NEXT:     MULLO_INT * T1.X, T0.X, T3.W,
704; EG-NEXT:     SUB_INT T1.Z, T1.Y, PS,
705; EG-NEXT:     SETGE_UINT T1.W, PV.W, T1.W,
706; EG-NEXT:     SETGE_UINT * T5.W, T5.W, T0.Z,
707; EG-NEXT:     AND_INT T2.Y, PV.W, PS,
708; EG-NEXT:     ADD_INT T0.Z, T0.Y, 1,
709; EG-NEXT:     SETGE_UINT T1.W, PV.Z, T3.W,
710; EG-NEXT:     SETGE_UINT * T3.W, T1.Y, T1.X,
711; EG-NEXT:     AND_INT T1.Y, PV.W, PS,
712; EG-NEXT:     ADD_INT T1.Z, T0.X, 1,
713; EG-NEXT:     CNDE_INT T1.W, PV.Y, T0.Y, PV.Z,
714; EG-NEXT:     ADD_INT * T6.W, T0.Y, literal.x,
715; EG-NEXT:    -1(nan), 0(0.000000e+00)
716; EG-NEXT:     CNDE_INT T0.Y, T5.W, PS, PV.W,
717; EG-NEXT:     XOR_INT T0.Z, T3.X, T0.W,
718; EG-NEXT:     CNDE_INT T0.W, PV.Y, T0.X, PV.Z,
719; EG-NEXT:     ADD_INT * T1.W, T0.X, literal.x,
720; EG-NEXT:    -1(nan), 0(0.000000e+00)
721; EG-NEXT:     CNDE_INT T1.Z, T3.W, PS, PV.W,
722; EG-NEXT:     XOR_INT T0.W, T4.W, T2.W, BS:VEC_120/SCL_212
723; EG-NEXT:     XOR_INT * T1.W, PV.Y, PV.Z,
724; EG-NEXT:     SUB_INT T0.Y, PS, T0.Z,
725; EG-NEXT:     XOR_INT * T1.W, PV.Z, PV.W,
726; EG-NEXT:     SUB_INT T0.X, PV.W, T0.W,
727; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
728; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
; IR under test: %den_ptr is %in advanced by one <2 x i32> element, so the
; denominator vector directly follows the numerator vector in memory.
729  %den_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1
730  %num = load <2 x i32>, <2 x i32> addrspace(1) * %in
731  %den = load <2 x i32>, <2 x i32> addrspace(1) * %den_ptr
732  %result = sdiv <2 x i32> %num, %den
733  store <2 x i32> %result, <2 x i32> addrspace(1)* %out
734  ret void
735}
736
; Signed division of a <2 x i32> vector by the constant splat <4, 4>.
; The kernel loads the numerator from %in, divides it, and stores the
; quotient to %out.
;
; For every checked target the expected per-element code is a shift/add
; sequence (arithmetic shift right by 31, logical shift right by 30, add,
; arithmetic shift right by 2); no reciprocal or multiply appears.
; The assertions are autogenerated (see the NOTE at the top of the file), so
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
737define amdgpu_kernel void @sdiv_v2i32_4(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
738; GCN-LABEL: sdiv_v2i32_4:
739; GCN:       ; %bb.0:
740; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
741; GCN-NEXT:    s_mov_b32 s7, 0xf000
742; GCN-NEXT:    s_mov_b32 s6, -1
743; GCN-NEXT:    s_mov_b32 s10, s6
744; GCN-NEXT:    s_mov_b32 s11, s7
745; GCN-NEXT:    s_waitcnt lgkmcnt(0)
746; GCN-NEXT:    s_mov_b32 s8, s2
747; GCN-NEXT:    s_mov_b32 s9, s3
748; GCN-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
749; GCN-NEXT:    s_mov_b32 s4, s0
750; GCN-NEXT:    s_mov_b32 s5, s1
751; GCN-NEXT:    s_waitcnt vmcnt(0)
752; GCN-NEXT:    v_ashrrev_i32_e32 v2, 31, v0
753; GCN-NEXT:    v_ashrrev_i32_e32 v3, 31, v1
754; GCN-NEXT:    v_lshrrev_b32_e32 v2, 30, v2
755; GCN-NEXT:    v_lshrrev_b32_e32 v3, 30, v3
756; GCN-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
757; GCN-NEXT:    v_add_i32_e32 v1, vcc, v3, v1
758; GCN-NEXT:    v_ashrrev_i32_e32 v0, 2, v0
759; GCN-NEXT:    v_ashrrev_i32_e32 v1, 2, v1
760; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
761; GCN-NEXT:    s_endpgm
762;
763; TONGA-LABEL: sdiv_v2i32_4:
764; TONGA:       ; %bb.0:
765; TONGA-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
766; TONGA-NEXT:    s_mov_b32 s3, 0xf000
767; TONGA-NEXT:    s_mov_b32 s2, -1
768; TONGA-NEXT:    s_waitcnt lgkmcnt(0)
769; TONGA-NEXT:    s_mov_b32 s0, s4
770; TONGA-NEXT:    s_mov_b32 s1, s5
771; TONGA-NEXT:    s_mov_b32 s4, s6
772; TONGA-NEXT:    s_mov_b32 s5, s7
773; TONGA-NEXT:    s_mov_b32 s6, s2
774; TONGA-NEXT:    s_mov_b32 s7, s3
775; TONGA-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
776; TONGA-NEXT:    s_waitcnt vmcnt(0)
777; TONGA-NEXT:    v_ashrrev_i32_e32 v2, 31, v0
778; TONGA-NEXT:    v_ashrrev_i32_e32 v3, 31, v1
779; TONGA-NEXT:    v_lshrrev_b32_e32 v2, 30, v2
780; TONGA-NEXT:    v_lshrrev_b32_e32 v3, 30, v3
781; TONGA-NEXT:    v_add_u32_e32 v0, vcc, v2, v0
782; TONGA-NEXT:    v_add_u32_e32 v1, vcc, v3, v1
783; TONGA-NEXT:    v_ashrrev_i32_e32 v0, 2, v0
784; TONGA-NEXT:    v_ashrrev_i32_e32 v1, 2, v1
785; TONGA-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
786; TONGA-NEXT:    s_endpgm
787;
788; GFX9-LABEL: sdiv_v2i32_4:
789; GFX9:       ; %bb.0:
790; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
791; GFX9-NEXT:    s_mov_b32 s3, 0xf000
792; GFX9-NEXT:    s_mov_b32 s2, -1
793; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
794; GFX9-NEXT:    s_mov_b32 s0, s4
795; GFX9-NEXT:    s_mov_b32 s1, s5
796; GFX9-NEXT:    s_mov_b32 s4, s6
797; GFX9-NEXT:    s_mov_b32 s5, s7
798; GFX9-NEXT:    s_mov_b32 s6, s2
799; GFX9-NEXT:    s_mov_b32 s7, s3
800; GFX9-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
801; GFX9-NEXT:    s_waitcnt vmcnt(0)
802; GFX9-NEXT:    v_ashrrev_i32_e32 v2, 31, v0
803; GFX9-NEXT:    v_ashrrev_i32_e32 v3, 31, v1
804; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 30, v2
805; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 30, v3
806; GFX9-NEXT:    v_add_u32_e32 v0, v0, v2
807; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
808; GFX9-NEXT:    v_ashrrev_i32_e32 v0, 2, v0
809; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 2, v1
810; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
811; GFX9-NEXT:    s_endpgm
812;
813; EG-LABEL: sdiv_v2i32_4:
814; EG:       ; %bb.0:
815; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
816; EG-NEXT:    TEX 0 @6
817; EG-NEXT:    ALU 13, @9, KC0[CB0:0-32], KC1[]
818; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
819; EG-NEXT:    CF_END
820; EG-NEXT:    PAD
821; EG-NEXT:    Fetch clause starting at 6:
822; EG-NEXT:     VTX_READ_64 T0.XY, T0.X, 0, #1
823; EG-NEXT:    ALU clause starting at 8:
824; EG-NEXT:     MOV * T0.X, KC0[2].Z,
825; EG-NEXT:    ALU clause starting at 9:
826; EG-NEXT:     ASHR * T0.W, T0.Y, literal.x,
827; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
828; EG-NEXT:     LSHR T0.W, PV.W, literal.x,
829; EG-NEXT:     ASHR * T1.W, T0.X, literal.y,
830; EG-NEXT:    30(4.203895e-44), 31(4.344025e-44)
831; EG-NEXT:     LSHR T1.W, PS, literal.x,
832; EG-NEXT:     ADD_INT * T0.W, T0.Y, PV.W,
833; EG-NEXT:    30(4.203895e-44), 0(0.000000e+00)
834; EG-NEXT:     ASHR T0.Y, PS, literal.x,
835; EG-NEXT:     ADD_INT * T0.W, T0.X, PV.W,
836; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
837; EG-NEXT:     ASHR T0.X, PV.W, literal.x,
838; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
839; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
; The divisor is an immediate vector constant, so only the numerator is loaded.
840  %num = load <2 x i32>, <2 x i32> addrspace(1) * %in
841  %result = sdiv <2 x i32> %num, <i32 4, i32 4>
842  store <2 x i32> %result, <2 x i32> addrspace(1)* %out
843  ret void
844}
845
; General signed division of two <4 x i32> vectors.
; The numerator is loaded from %in and the denominator from the <4 x i32>
; that immediately follows it (the expected loads use byte offset 16); the
; quotient is stored to %out.
;
; The expected sequences contain no hardware divide: each lane is expanded
; around a reciprocal (v_rcp_iflag_f32 on the amdgcn targets, RECIP_UINT on
; the r600 target) followed by multiply-high, compare and select steps, and a
; final xor/subtract with a mask built from the arithmetic-shifted operands.
; The assertions are autogenerated (see the NOTE at the top of the file), so
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
846define amdgpu_kernel void @sdiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
847; GCN-LABEL: sdiv_v4i32:
848; GCN:       ; %bb.0:
849; GCN-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x9
850; GCN-NEXT:    s_mov_b32 s19, 0xf000
851; GCN-NEXT:    s_mov_b32 s18, -1
852; GCN-NEXT:    s_mov_b32 s2, s18
853; GCN-NEXT:    s_mov_b32 s3, s19
854; GCN-NEXT:    s_waitcnt lgkmcnt(0)
855; GCN-NEXT:    s_mov_b32 s0, s10
856; GCN-NEXT:    s_mov_b32 s1, s11
857; GCN-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0
858; GCN-NEXT:    buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:16
859; GCN-NEXT:    s_mov_b32 s6, 0x4f800000
860; GCN-NEXT:    s_waitcnt vmcnt(1)
861; GCN-NEXT:    v_ashrrev_i32_e32 v8, 31, v0
862; GCN-NEXT:    s_waitcnt vmcnt(0)
863; GCN-NEXT:    v_ashrrev_i32_e32 v9, 31, v4
864; GCN-NEXT:    v_ashrrev_i32_e32 v10, 31, v1
865; GCN-NEXT:    v_ashrrev_i32_e32 v11, 31, v5
866; GCN-NEXT:    v_ashrrev_i32_e32 v12, 31, v2
867; GCN-NEXT:    v_ashrrev_i32_e32 v13, 31, v6
868; GCN-NEXT:    v_ashrrev_i32_e32 v14, 31, v3
869; GCN-NEXT:    v_ashrrev_i32_e32 v15, 31, v7
870; GCN-NEXT:    v_xor_b32_e32 v16, v8, v9
871; GCN-NEXT:    v_add_i32_e32 v0, vcc, v8, v0
872; GCN-NEXT:    v_add_i32_e32 v1, vcc, v10, v1
873; GCN-NEXT:    v_add_i32_e32 v2, vcc, v12, v2
874; GCN-NEXT:    v_add_i32_e32 v3, vcc, v14, v3
875; GCN-NEXT:    v_add_i32_e32 v4, vcc, v9, v4
876; GCN-NEXT:    v_add_i32_e32 v5, vcc, v11, v5
877; GCN-NEXT:    v_add_i32_e32 v6, vcc, v13, v6
878; GCN-NEXT:    v_add_i32_e32 v7, vcc, v15, v7
879; GCN-NEXT:    v_xor_b32_e32 v17, v10, v11
880; GCN-NEXT:    v_xor_b32_e32 v18, v12, v13
881; GCN-NEXT:    v_xor_b32_e32 v19, v14, v15
882; GCN-NEXT:    v_xor_b32_e32 v0, v0, v8
883; GCN-NEXT:    v_xor_b32_e32 v4, v4, v9
884; GCN-NEXT:    v_xor_b32_e32 v1, v1, v10
885; GCN-NEXT:    v_xor_b32_e32 v5, v5, v11
886; GCN-NEXT:    v_xor_b32_e32 v2, v2, v12
887; GCN-NEXT:    v_xor_b32_e32 v6, v6, v13
888; GCN-NEXT:    v_xor_b32_e32 v3, v3, v14
889; GCN-NEXT:    v_xor_b32_e32 v7, v7, v15
890; GCN-NEXT:    v_cvt_f32_u32_e32 v8, v4
891; GCN-NEXT:    v_cvt_f32_u32_e32 v9, v5
892; GCN-NEXT:    v_cvt_f32_u32_e32 v10, v6
893; GCN-NEXT:    v_rcp_iflag_f32_e32 v8, v8
894; GCN-NEXT:    v_rcp_iflag_f32_e32 v9, v9
895; GCN-NEXT:    v_rcp_iflag_f32_e32 v10, v10
896; GCN-NEXT:    v_mul_f32_e32 v8, s6, v8
897; GCN-NEXT:    v_mul_f32_e32 v9, s6, v9
898; GCN-NEXT:    v_mul_f32_e32 v10, s6, v10
899; GCN-NEXT:    v_cvt_u32_f32_e32 v8, v8
900; GCN-NEXT:    v_cvt_u32_f32_e32 v9, v9
901; GCN-NEXT:    v_cvt_u32_f32_e32 v10, v10
902; GCN-NEXT:    v_mul_hi_u32 v11, v8, v4
903; GCN-NEXT:    v_mul_lo_u32 v12, v8, v4
904; GCN-NEXT:    v_mul_hi_u32 v13, v9, v5
905; GCN-NEXT:    v_mul_lo_u32 v14, v9, v5
906; GCN-NEXT:    v_sub_i32_e32 v15, vcc, 0, v12
907; GCN-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v11
908; GCN-NEXT:    v_mul_hi_u32 v11, v10, v6
909; GCN-NEXT:    v_cndmask_b32_e64 v12, v12, v15, s[0:1]
910; GCN-NEXT:    v_sub_i32_e32 v15, vcc, 0, v14
911; GCN-NEXT:    v_cmp_eq_u32_e64 s[2:3], 0, v13
912; GCN-NEXT:    v_mul_lo_u32 v13, v10, v6
913; GCN-NEXT:    v_cndmask_b32_e64 v14, v14, v15, s[2:3]
914; GCN-NEXT:    v_sub_i32_e32 v15, vcc, 0, v13
915; GCN-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v11
916; GCN-NEXT:    v_cvt_f32_u32_e32 v11, v7
917; GCN-NEXT:    v_rcp_iflag_f32_e32 v11, v11
918; GCN-NEXT:    v_mul_f32_e32 v11, s6, v11
919; GCN-NEXT:    v_cvt_u32_f32_e32 v11, v11
920; GCN-NEXT:    v_cndmask_b32_e64 v13, v13, v15, s[4:5]
921; GCN-NEXT:    v_mul_hi_u32 v15, v11, v7
922; GCN-NEXT:    v_mul_lo_u32 v20, v11, v7
923; GCN-NEXT:    v_sub_i32_e32 v21, vcc, 0, v20
924; GCN-NEXT:    v_cmp_eq_u32_e64 s[6:7], 0, v15
925; GCN-NEXT:    v_cndmask_b32_e64 v15, v20, v21, s[6:7]
926; GCN-NEXT:    v_mul_hi_u32 v12, v12, v8
927; GCN-NEXT:    v_add_i32_e32 v20, vcc, v12, v8
928; GCN-NEXT:    v_subrev_i32_e32 v8, vcc, v12, v8
929; GCN-NEXT:    v_mul_hi_u32 v12, v14, v9
930; GCN-NEXT:    v_add_i32_e32 v14, vcc, v12, v9
931; GCN-NEXT:    v_subrev_i32_e32 v9, vcc, v12, v9
932; GCN-NEXT:    v_mul_hi_u32 v12, v13, v10
933; GCN-NEXT:    v_add_i32_e32 v13, vcc, v12, v10
934; GCN-NEXT:    v_subrev_i32_e32 v10, vcc, v12, v10
935; GCN-NEXT:    v_mul_hi_u32 v12, v15, v11
936; GCN-NEXT:    v_add_i32_e32 v15, vcc, v12, v11
937; GCN-NEXT:    v_subrev_i32_e32 v11, vcc, v12, v11
938; GCN-NEXT:    s_mov_b32 s16, s8
939; GCN-NEXT:    s_mov_b32 s17, s9
940; GCN-NEXT:    v_cndmask_b32_e64 v8, v8, v20, s[0:1]
941; GCN-NEXT:    v_cndmask_b32_e64 v9, v9, v14, s[2:3]
942; GCN-NEXT:    v_cndmask_b32_e64 v10, v10, v13, s[4:5]
943; GCN-NEXT:    v_cndmask_b32_e64 v11, v11, v15, s[6:7]
944; GCN-NEXT:    v_mul_hi_u32 v8, v8, v0
945; GCN-NEXT:    v_mul_hi_u32 v9, v9, v1
946; GCN-NEXT:    v_mul_hi_u32 v10, v10, v2
947; GCN-NEXT:    v_mul_hi_u32 v11, v11, v3
948; GCN-NEXT:    v_mul_lo_u32 v12, v8, v4
949; GCN-NEXT:    v_add_i32_e32 v13, vcc, 1, v8
950; GCN-NEXT:    v_add_i32_e32 v14, vcc, -1, v8
951; GCN-NEXT:    v_mul_lo_u32 v15, v9, v5
952; GCN-NEXT:    v_cmp_ge_u32_e64 s[0:1], v0, v12
953; GCN-NEXT:    v_sub_i32_e32 v0, vcc, v0, v12
954; GCN-NEXT:    v_add_i32_e32 v12, vcc, 1, v9
955; GCN-NEXT:    v_cmp_ge_u32_e64 s[2:3], v0, v4
956; GCN-NEXT:    v_add_i32_e32 v0, vcc, -1, v9
957; GCN-NEXT:    v_mul_lo_u32 v4, v10, v6
958; GCN-NEXT:    v_cmp_ge_u32_e64 s[4:5], v1, v15
959; GCN-NEXT:    v_sub_i32_e32 v1, vcc, v1, v15
960; GCN-NEXT:    v_add_i32_e32 v15, vcc, 1, v10
961; GCN-NEXT:    v_cmp_ge_u32_e64 s[6:7], v1, v5
962; GCN-NEXT:    v_add_i32_e32 v1, vcc, -1, v10
963; GCN-NEXT:    v_mul_lo_u32 v5, v11, v7
964; GCN-NEXT:    v_cmp_ge_u32_e64 s[8:9], v2, v4
965; GCN-NEXT:    v_sub_i32_e32 v2, vcc, v2, v4
966; GCN-NEXT:    v_add_i32_e32 v4, vcc, -1, v11
967; GCN-NEXT:    v_cmp_ge_u32_e64 s[10:11], v3, v5
968; GCN-NEXT:    v_sub_i32_e32 v3, vcc, v3, v5
969; GCN-NEXT:    v_add_i32_e32 v5, vcc, 1, v11
970; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v2, v6
971; GCN-NEXT:    v_cmp_ge_u32_e64 s[12:13], v3, v7
972; GCN-NEXT:    s_and_b64 s[2:3], s[2:3], s[0:1]
973; GCN-NEXT:    v_cndmask_b32_e64 v2, v8, v13, s[2:3]
974; GCN-NEXT:    s_and_b64 s[2:3], s[6:7], s[4:5]
975; GCN-NEXT:    v_cndmask_b32_e64 v3, v9, v12, s[2:3]
976; GCN-NEXT:    s_and_b64 vcc, vcc, s[8:9]
977; GCN-NEXT:    v_cndmask_b32_e32 v6, v10, v15, vcc
978; GCN-NEXT:    s_and_b64 vcc, s[12:13], s[10:11]
979; GCN-NEXT:    v_cndmask_b32_e32 v5, v11, v5, vcc
980; GCN-NEXT:    v_cndmask_b32_e64 v2, v14, v2, s[0:1]
981; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v3, s[4:5]
982; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, v6, s[8:9]
983; GCN-NEXT:    v_cndmask_b32_e64 v3, v4, v5, s[10:11]
984; GCN-NEXT:    v_xor_b32_e32 v2, v2, v16
985; GCN-NEXT:    v_xor_b32_e32 v4, v0, v17
986; GCN-NEXT:    v_xor_b32_e32 v5, v1, v18
987; GCN-NEXT:    v_xor_b32_e32 v3, v3, v19
988; GCN-NEXT:    v_sub_i32_e32 v0, vcc, v2, v16
989; GCN-NEXT:    v_sub_i32_e32 v1, vcc, v4, v17
990; GCN-NEXT:    v_sub_i32_e32 v2, vcc, v5, v18
991; GCN-NEXT:    v_sub_i32_e32 v3, vcc, v3, v19
992; GCN-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
993; GCN-NEXT:    s_endpgm
994;
995; TONGA-LABEL: sdiv_v4i32:
996; TONGA:       ; %bb.0:
997; TONGA-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x24
998; TONGA-NEXT:    s_mov_b32 s11, 0xf000
999; TONGA-NEXT:    s_mov_b32 s10, -1
1000; TONGA-NEXT:    s_mov_b32 s2, s10
1001; TONGA-NEXT:    s_mov_b32 s3, s11
1002; TONGA-NEXT:    s_waitcnt lgkmcnt(0)
1003; TONGA-NEXT:    s_mov_b32 s0, s14
1004; TONGA-NEXT:    s_mov_b32 s1, s15
1005; TONGA-NEXT:    buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:16
1006; TONGA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0
1007; TONGA-NEXT:    s_mov_b32 s14, 0x4f800000
1008; TONGA-NEXT:    s_mov_b32 s8, s12
1009; TONGA-NEXT:    s_mov_b32 s9, s13
1010; TONGA-NEXT:    s_waitcnt vmcnt(1)
1011; TONGA-NEXT:    v_ashrrev_i32_e32 v9, 31, v4
1012; TONGA-NEXT:    v_add_u32_e32 v4, vcc, v9, v4
1013; TONGA-NEXT:    s_waitcnt vmcnt(0)
1014; TONGA-NEXT:    v_ashrrev_i32_e32 v8, 31, v0
1015; TONGA-NEXT:    v_xor_b32_e32 v4, v4, v9
1016; TONGA-NEXT:    v_xor_b32_e32 v15, v8, v9
1017; TONGA-NEXT:    v_cvt_f32_u32_e32 v9, v4
1018; TONGA-NEXT:    v_ashrrev_i32_e32 v11, 31, v5
1019; TONGA-NEXT:    v_add_u32_e32 v5, vcc, v11, v5
1020; TONGA-NEXT:    v_add_u32_e32 v0, vcc, v8, v0
1021; TONGA-NEXT:    v_rcp_iflag_f32_e32 v9, v9
1022; TONGA-NEXT:    v_xor_b32_e32 v5, v5, v11
1023; TONGA-NEXT:    v_xor_b32_e32 v0, v0, v8
1024; TONGA-NEXT:    v_cvt_f32_u32_e32 v8, v5
1025; TONGA-NEXT:    v_mul_f32_e32 v9, s14, v9
1026; TONGA-NEXT:    v_cvt_u32_f32_e32 v9, v9
1027; TONGA-NEXT:    v_ashrrev_i32_e32 v10, 31, v1
1028; TONGA-NEXT:    v_rcp_iflag_f32_e32 v8, v8
1029; TONGA-NEXT:    v_add_u32_e32 v1, vcc, v10, v1
1030; TONGA-NEXT:    v_xor_b32_e32 v16, v10, v11
1031; TONGA-NEXT:    v_xor_b32_e32 v1, v1, v10
1032; TONGA-NEXT:    v_mul_f32_e32 v8, s14, v8
1033; TONGA-NEXT:    v_mul_hi_u32 v11, v9, v4
1034; TONGA-NEXT:    v_mul_lo_u32 v10, v9, v4
1035; TONGA-NEXT:    v_cvt_u32_f32_e32 v8, v8
1036; TONGA-NEXT:    v_ashrrev_i32_e32 v12, 31, v2
1037; TONGA-NEXT:    v_ashrrev_i32_e32 v13, 31, v6
1038; TONGA-NEXT:    v_add_u32_e32 v2, vcc, v12, v2
1039; TONGA-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v11
1040; TONGA-NEXT:    v_xor_b32_e32 v17, v12, v13
1041; TONGA-NEXT:    v_xor_b32_e32 v2, v2, v12
1042; TONGA-NEXT:    v_sub_u32_e32 v12, vcc, 0, v10
1043; TONGA-NEXT:    v_cndmask_b32_e64 v10, v10, v12, s[0:1]
1044; TONGA-NEXT:    v_mul_hi_u32 v12, v8, v5
1045; TONGA-NEXT:    v_add_u32_e32 v6, vcc, v13, v6
1046; TONGA-NEXT:    v_xor_b32_e32 v6, v6, v13
1047; TONGA-NEXT:    v_mul_lo_u32 v11, v8, v5
1048; TONGA-NEXT:    v_cmp_eq_u32_e64 s[2:3], 0, v12
1049; TONGA-NEXT:    v_cvt_f32_u32_e32 v12, v6
1050; TONGA-NEXT:    v_mul_hi_u32 v10, v10, v9
1051; TONGA-NEXT:    v_sub_u32_e32 v13, vcc, 0, v11
1052; TONGA-NEXT:    v_cndmask_b32_e64 v11, v11, v13, s[2:3]
1053; TONGA-NEXT:    v_rcp_iflag_f32_e32 v12, v12
1054; TONGA-NEXT:    v_ashrrev_i32_e32 v14, 31, v7
1055; TONGA-NEXT:    v_add_u32_e32 v7, vcc, v14, v7
1056; TONGA-NEXT:    v_xor_b32_e32 v7, v7, v14
1057; TONGA-NEXT:    v_mul_f32_e32 v12, s14, v12
1058; TONGA-NEXT:    v_cvt_u32_f32_e32 v12, v12
1059; TONGA-NEXT:    v_mul_hi_u32 v18, v12, v6
1060; TONGA-NEXT:    v_mul_lo_u32 v13, v12, v6
1061; TONGA-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v18
1062; TONGA-NEXT:    v_add_u32_e32 v18, vcc, v10, v9
1063; TONGA-NEXT:    v_subrev_u32_e32 v9, vcc, v10, v9
1064; TONGA-NEXT:    v_mul_hi_u32 v10, v11, v8
1065; TONGA-NEXT:    v_cndmask_b32_e64 v9, v9, v18, s[0:1]
1066; TONGA-NEXT:    v_mul_hi_u32 v9, v9, v0
1067; TONGA-NEXT:    v_sub_u32_e32 v19, vcc, 0, v13
1068; TONGA-NEXT:    v_add_u32_e32 v11, vcc, v10, v8
1069; TONGA-NEXT:    v_subrev_u32_e32 v8, vcc, v10, v8
1070; TONGA-NEXT:    v_cndmask_b32_e64 v13, v13, v19, s[4:5]
1071; TONGA-NEXT:    v_cndmask_b32_e64 v8, v8, v11, s[2:3]
1072; TONGA-NEXT:    v_mul_hi_u32 v10, v13, v12
1073; TONGA-NEXT:    v_mul_lo_u32 v11, v9, v4
1074; TONGA-NEXT:    v_mul_hi_u32 v8, v8, v1
1075; TONGA-NEXT:    v_add_u32_e32 v13, vcc, v10, v12
1076; TONGA-NEXT:    v_subrev_u32_e32 v10, vcc, v10, v12
1077; TONGA-NEXT:    v_cmp_ge_u32_e64 s[0:1], v0, v11
1078; TONGA-NEXT:    v_sub_u32_e32 v0, vcc, v0, v11
1079; TONGA-NEXT:    v_cmp_ge_u32_e64 s[2:3], v0, v4
1080; TONGA-NEXT:    v_cndmask_b32_e64 v10, v10, v13, s[4:5]
1081; TONGA-NEXT:    v_mul_lo_u32 v0, v8, v5
1082; TONGA-NEXT:    v_mul_hi_u32 v4, v10, v2
1083; TONGA-NEXT:    v_add_u32_e32 v12, vcc, -1, v9
1084; TONGA-NEXT:    v_add_u32_e32 v10, vcc, -1, v8
1085; TONGA-NEXT:    v_cmp_ge_u32_e64 s[4:5], v1, v0
1086; TONGA-NEXT:    v_sub_u32_e32 v0, vcc, v1, v0
1087; TONGA-NEXT:    v_cmp_ge_u32_e64 s[6:7], v0, v5
1088; TONGA-NEXT:    v_mul_lo_u32 v5, v4, v6
1089; TONGA-NEXT:    v_add_u32_e32 v1, vcc, 1, v9
1090; TONGA-NEXT:    v_add_u32_e32 v0, vcc, 1, v8
1091; TONGA-NEXT:    s_and_b64 vcc, s[2:3], s[0:1]
1092; TONGA-NEXT:    v_cndmask_b32_e32 v1, v9, v1, vcc
1093; TONGA-NEXT:    v_sub_u32_e32 v9, vcc, v2, v5
1094; TONGA-NEXT:    s_and_b64 vcc, s[6:7], s[4:5]
1095; TONGA-NEXT:    v_cvt_f32_u32_e32 v11, v7
1096; TONGA-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc
1097; TONGA-NEXT:    v_cndmask_b32_e64 v1, v12, v1, s[0:1]
1098; TONGA-NEXT:    v_cndmask_b32_e64 v0, v10, v0, s[4:5]
1099; TONGA-NEXT:    v_xor_b32_e32 v1, v1, v15
1100; TONGA-NEXT:    v_xor_b32_e32 v8, v0, v16
1101; TONGA-NEXT:    v_sub_u32_e32 v0, vcc, v1, v15
1102; TONGA-NEXT:    v_sub_u32_e32 v1, vcc, v8, v16
1103; TONGA-NEXT:    v_rcp_iflag_f32_e32 v8, v11
1104; TONGA-NEXT:    v_cmp_ge_u32_e64 s[0:1], v9, v6
1105; TONGA-NEXT:    v_cmp_ge_u32_e64 s[2:3], v2, v5
1106; TONGA-NEXT:    v_ashrrev_i32_e32 v10, 31, v3
1107; TONGA-NEXT:    v_mul_f32_e32 v8, s14, v8
1108; TONGA-NEXT:    v_cvt_u32_f32_e32 v8, v8
1109; TONGA-NEXT:    v_add_u32_e32 v3, vcc, v10, v3
1110; TONGA-NEXT:    v_xor_b32_e32 v3, v3, v10
1111; TONGA-NEXT:    v_add_u32_e32 v6, vcc, -1, v4
1112; TONGA-NEXT:    v_mul_lo_u32 v5, v8, v7
1113; TONGA-NEXT:    v_mul_hi_u32 v9, v8, v7
1114; TONGA-NEXT:    v_add_u32_e32 v2, vcc, 1, v4
1115; TONGA-NEXT:    v_sub_u32_e32 v11, vcc, 0, v5
1116; TONGA-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v9
1117; TONGA-NEXT:    v_cndmask_b32_e64 v5, v5, v11, s[4:5]
1118; TONGA-NEXT:    v_mul_hi_u32 v5, v5, v8
1119; TONGA-NEXT:    v_add_u32_e32 v9, vcc, v5, v8
1120; TONGA-NEXT:    v_subrev_u32_e32 v5, vcc, v5, v8
1121; TONGA-NEXT:    v_cndmask_b32_e64 v5, v5, v9, s[4:5]
1122; TONGA-NEXT:    v_mul_hi_u32 v5, v5, v3
1123; TONGA-NEXT:    s_and_b64 vcc, s[0:1], s[2:3]
1124; TONGA-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
1125; TONGA-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s[2:3]
1126; TONGA-NEXT:    v_mul_lo_u32 v4, v5, v7
1127; TONGA-NEXT:    v_xor_b32_e32 v2, v2, v17
1128; TONGA-NEXT:    v_sub_u32_e32 v2, vcc, v2, v17
1129; TONGA-NEXT:    v_xor_b32_e32 v6, v10, v14
1130; TONGA-NEXT:    v_sub_u32_e32 v8, vcc, v3, v4
1131; TONGA-NEXT:    v_cmp_ge_u32_e64 s[0:1], v8, v7
1132; TONGA-NEXT:    v_cmp_ge_u32_e64 s[2:3], v3, v4
1133; TONGA-NEXT:    v_add_u32_e32 v7, vcc, -1, v5
1134; TONGA-NEXT:    v_add_u32_e32 v3, vcc, 1, v5
1135; TONGA-NEXT:    s_and_b64 vcc, s[0:1], s[2:3]
1136; TONGA-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
1137; TONGA-NEXT:    v_cndmask_b32_e64 v3, v7, v3, s[2:3]
1138; TONGA-NEXT:    v_xor_b32_e32 v3, v3, v6
1139; TONGA-NEXT:    v_sub_u32_e32 v3, vcc, v3, v6
1140; TONGA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
1141; TONGA-NEXT:    s_endpgm
1142;
1143; GFX9-LABEL: sdiv_v4i32:
1144; GFX9:       ; %bb.0:
1145; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x24
1146; GFX9-NEXT:    s_mov_b32 s15, 0xf000
1147; GFX9-NEXT:    s_mov_b32 s14, -1
1148; GFX9-NEXT:    s_mov_b32 s2, s14
1149; GFX9-NEXT:    s_mov_b32 s3, s15
1150; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1151; GFX9-NEXT:    s_mov_b32 s0, s10
1152; GFX9-NEXT:    s_mov_b32 s1, s11
1153; GFX9-NEXT:    buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:16
1154; GFX9-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0
1155; GFX9-NEXT:    s_mov_b32 s4, 0x4f800000
1156; GFX9-NEXT:    s_mov_b32 s12, s8
1157; GFX9-NEXT:    s_mov_b32 s13, s9
1158; GFX9-NEXT:    s_waitcnt vmcnt(1)
1159; GFX9-NEXT:    v_ashrrev_i32_e32 v9, 31, v4
1160; GFX9-NEXT:    s_waitcnt vmcnt(0)
1161; GFX9-NEXT:    v_ashrrev_i32_e32 v8, 31, v0
1162; GFX9-NEXT:    v_add_u32_e32 v4, v4, v9
1163; GFX9-NEXT:    v_add_u32_e32 v0, v0, v8
1164; GFX9-NEXT:    v_xor_b32_e32 v4, v4, v9
1165; GFX9-NEXT:    v_xor_b32_e32 v16, v8, v9
1166; GFX9-NEXT:    v_xor_b32_e32 v0, v0, v8
1167; GFX9-NEXT:    v_cvt_f32_u32_e32 v8, v4
1168; GFX9-NEXT:    v_ashrrev_i32_e32 v11, 31, v5
1169; GFX9-NEXT:    v_add_u32_e32 v5, v5, v11
1170; GFX9-NEXT:    v_xor_b32_e32 v5, v5, v11
1171; GFX9-NEXT:    v_cvt_f32_u32_e32 v9, v5
1172; GFX9-NEXT:    v_rcp_iflag_f32_e32 v8, v8
1173; GFX9-NEXT:    v_ashrrev_i32_e32 v13, 31, v6
1174; GFX9-NEXT:    v_ashrrev_i32_e32 v10, 31, v1
1175; GFX9-NEXT:    v_add_u32_e32 v6, v6, v13
1176; GFX9-NEXT:    v_add_u32_e32 v1, v1, v10
1177; GFX9-NEXT:    v_rcp_iflag_f32_e32 v9, v9
1178; GFX9-NEXT:    v_mul_f32_e32 v8, s4, v8
1179; GFX9-NEXT:    v_xor_b32_e32 v6, v6, v13
1180; GFX9-NEXT:    v_xor_b32_e32 v17, v10, v11
1181; GFX9-NEXT:    v_xor_b32_e32 v1, v1, v10
1182; GFX9-NEXT:    v_cvt_f32_u32_e32 v10, v6
1183; GFX9-NEXT:    v_cvt_u32_f32_e32 v8, v8
1184; GFX9-NEXT:    v_ashrrev_i32_e32 v12, 31, v2
1185; GFX9-NEXT:    v_add_u32_e32 v2, v2, v12
1186; GFX9-NEXT:    v_mul_f32_e32 v9, s4, v9
1187; GFX9-NEXT:    v_rcp_iflag_f32_e32 v10, v10
1188; GFX9-NEXT:    v_xor_b32_e32 v18, v12, v13
1189; GFX9-NEXT:    v_xor_b32_e32 v2, v2, v12
1190; GFX9-NEXT:    v_cvt_u32_f32_e32 v9, v9
1191; GFX9-NEXT:    v_mul_hi_u32 v12, v8, v4
1192; GFX9-NEXT:    v_mul_lo_u32 v11, v8, v4
1193; GFX9-NEXT:    v_mul_f32_e32 v10, s4, v10
1194; GFX9-NEXT:    v_mul_lo_u32 v13, v9, v5
1195; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v12
1196; GFX9-NEXT:    v_mul_hi_u32 v12, v9, v5
1197; GFX9-NEXT:    v_cvt_u32_f32_e32 v10, v10
1198; GFX9-NEXT:    v_sub_u32_e32 v19, 0, v11
1199; GFX9-NEXT:    v_cndmask_b32_e32 v11, v11, v19, vcc
1200; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v12
1201; GFX9-NEXT:    v_sub_u32_e32 v19, 0, v13
1202; GFX9-NEXT:    v_cndmask_b32_e64 v13, v13, v19, s[0:1]
1203; GFX9-NEXT:    v_mul_hi_u32 v19, v10, v6
1204; GFX9-NEXT:    v_ashrrev_i32_e32 v15, 31, v7
1205; GFX9-NEXT:    v_add_u32_e32 v7, v7, v15
1206; GFX9-NEXT:    v_xor_b32_e32 v7, v7, v15
1207; GFX9-NEXT:    v_cmp_eq_u32_e64 s[2:3], 0, v19
1208; GFX9-NEXT:    v_cvt_f32_u32_e32 v19, v7
1209; GFX9-NEXT:    v_mul_hi_u32 v11, v11, v8
1210; GFX9-NEXT:    v_mul_lo_u32 v12, v10, v6
1211; GFX9-NEXT:    v_ashrrev_i32_e32 v14, 31, v3
1212; GFX9-NEXT:    v_rcp_iflag_f32_e32 v19, v19
1213; GFX9-NEXT:    v_add_u32_e32 v3, v3, v14
1214; GFX9-NEXT:    v_sub_u32_e32 v20, 0, v12
1215; GFX9-NEXT:    v_cndmask_b32_e64 v12, v12, v20, s[2:3]
1216; GFX9-NEXT:    v_mul_f32_e32 v19, s4, v19
1217; GFX9-NEXT:    v_cvt_u32_f32_e32 v19, v19
1218; GFX9-NEXT:    v_xor_b32_e32 v3, v3, v14
1219; GFX9-NEXT:    v_mul_hi_u32 v21, v19, v7
1220; GFX9-NEXT:    v_mul_lo_u32 v20, v19, v7
1221; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v21
1222; GFX9-NEXT:    v_add_u32_e32 v21, v8, v11
1223; GFX9-NEXT:    v_sub_u32_e32 v8, v8, v11
1224; GFX9-NEXT:    v_mul_hi_u32 v11, v13, v9
1225; GFX9-NEXT:    v_cndmask_b32_e32 v8, v8, v21, vcc
1226; GFX9-NEXT:    v_mul_hi_u32 v8, v8, v0
1227; GFX9-NEXT:    v_sub_u32_e32 v22, 0, v20
1228; GFX9-NEXT:    v_add_u32_e32 v13, v9, v11
1229; GFX9-NEXT:    v_sub_u32_e32 v9, v9, v11
1230; GFX9-NEXT:    v_mul_hi_u32 v11, v12, v10
1231; GFX9-NEXT:    v_cndmask_b32_e64 v9, v9, v13, s[0:1]
1232; GFX9-NEXT:    v_mul_hi_u32 v9, v9, v1
1233; GFX9-NEXT:    v_cndmask_b32_e64 v20, v20, v22, s[4:5]
1234; GFX9-NEXT:    v_add_u32_e32 v12, v10, v11
1235; GFX9-NEXT:    v_sub_u32_e32 v10, v10, v11
1236; GFX9-NEXT:    v_cndmask_b32_e64 v10, v10, v12, s[2:3]
1237; GFX9-NEXT:    v_mul_lo_u32 v12, v8, v4
1238; GFX9-NEXT:    v_mul_hi_u32 v11, v20, v19
1239; GFX9-NEXT:    v_mul_hi_u32 v10, v10, v2
1240; GFX9-NEXT:    v_add_u32_e32 v13, 1, v8
1241; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v12
1242; GFX9-NEXT:    v_sub_u32_e32 v0, v0, v12
1243; GFX9-NEXT:    v_mul_lo_u32 v12, v9, v5
1244; GFX9-NEXT:    v_add_u32_e32 v20, v19, v11
1245; GFX9-NEXT:    v_sub_u32_e32 v11, v19, v11
1246; GFX9-NEXT:    v_cndmask_b32_e64 v11, v11, v20, s[4:5]
1247; GFX9-NEXT:    v_cmp_ge_u32_e64 s[2:3], v1, v12
1248; GFX9-NEXT:    v_sub_u32_e32 v1, v1, v12
1249; GFX9-NEXT:    v_mul_lo_u32 v12, v10, v6
1250; GFX9-NEXT:    v_mul_hi_u32 v11, v11, v3
1251; GFX9-NEXT:    v_cmp_ge_u32_e64 s[0:1], v0, v4
1252; GFX9-NEXT:    v_cmp_ge_u32_e64 s[4:5], v1, v5
1253; GFX9-NEXT:    v_cmp_ge_u32_e64 s[6:7], v2, v12
1254; GFX9-NEXT:    v_sub_u32_e32 v2, v2, v12
1255; GFX9-NEXT:    s_and_b64 s[0:1], s[0:1], vcc
1256; GFX9-NEXT:    v_cmp_ge_u32_e64 s[8:9], v2, v6
1257; GFX9-NEXT:    v_cndmask_b32_e64 v2, v8, v13, s[0:1]
1258; GFX9-NEXT:    v_add_u32_e32 v0, 1, v9
1259; GFX9-NEXT:    s_and_b64 s[0:1], s[4:5], s[2:3]
1260; GFX9-NEXT:    v_cndmask_b32_e64 v0, v9, v0, s[0:1]
1261; GFX9-NEXT:    v_add_u32_e32 v1, 1, v10
1262; GFX9-NEXT:    s_and_b64 s[0:1], s[8:9], s[6:7]
1263; GFX9-NEXT:    v_mul_lo_u32 v12, v11, v7
1264; GFX9-NEXT:    v_add_u32_e32 v19, -1, v8
1265; GFX9-NEXT:    v_cndmask_b32_e64 v1, v10, v1, s[0:1]
1266; GFX9-NEXT:    v_add_u32_e32 v5, -1, v10
1267; GFX9-NEXT:    v_cndmask_b32_e32 v2, v19, v2, vcc
1268; GFX9-NEXT:    v_cndmask_b32_e64 v1, v5, v1, s[6:7]
1269; GFX9-NEXT:    v_add_u32_e32 v4, -1, v9
1270; GFX9-NEXT:    v_cndmask_b32_e64 v0, v4, v0, s[2:3]
1271; GFX9-NEXT:    v_xor_b32_e32 v2, v2, v16
1272; GFX9-NEXT:    v_xor_b32_e32 v5, v1, v18
1273; GFX9-NEXT:    v_xor_b32_e32 v4, v0, v17
1274; GFX9-NEXT:    v_sub_u32_e32 v0, v2, v16
1275; GFX9-NEXT:    v_sub_u32_e32 v2, v5, v18
1276; GFX9-NEXT:    v_sub_u32_e32 v5, v3, v12
1277; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v5, v7
1278; GFX9-NEXT:    v_cmp_ge_u32_e64 s[0:1], v3, v12
1279; GFX9-NEXT:    v_add_u32_e32 v3, 1, v11
1280; GFX9-NEXT:    s_and_b64 vcc, vcc, s[0:1]
1281; GFX9-NEXT:    v_add_u32_e32 v5, -1, v11
1282; GFX9-NEXT:    v_cndmask_b32_e32 v3, v11, v3, vcc
1283; GFX9-NEXT:    v_sub_u32_e32 v1, v4, v17
1284; GFX9-NEXT:    v_xor_b32_e32 v4, v14, v15
1285; GFX9-NEXT:    v_cndmask_b32_e64 v3, v5, v3, s[0:1]
1286; GFX9-NEXT:    v_xor_b32_e32 v3, v3, v4
1287; GFX9-NEXT:    v_sub_u32_e32 v3, v3, v4
1288; GFX9-NEXT:    buffer_store_dwordx4 v[0:3], off, s[12:15], 0
1289; GFX9-NEXT:    s_endpgm
1290;
1291; EG-LABEL: sdiv_v4i32:
1292; EG:       ; %bb.0:
1293; EG-NEXT:    ALU 0, @12, KC0[CB0:0-32], KC1[]
1294; EG-NEXT:    TEX 0 @8
1295; EG-NEXT:    ALU 2, @13, KC0[], KC1[]
1296; EG-NEXT:    TEX 0 @10
1297; EG-NEXT:    ALU 114, @16, KC0[CB0:0-32], KC1[]
1298; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T3.XYZW, T0.X, 1
1299; EG-NEXT:    CF_END
1300; EG-NEXT:    PAD
1301; EG-NEXT:    Fetch clause starting at 8:
1302; EG-NEXT:     VTX_READ_128 T1.XYZW, T0.X, 16, #1
1303; EG-NEXT:    Fetch clause starting at 10:
1304; EG-NEXT:     VTX_READ_128 T3.XYZW, T0.X, 0, #1
1305; EG-NEXT:    ALU clause starting at 12:
1306; EG-NEXT:     MOV * T0.X, KC0[2].Z,
1307; EG-NEXT:    ALU clause starting at 13:
1308; EG-NEXT:     SETGT_INT * T0.W, 0.0, T1.Z,
1309; EG-NEXT:     ADD_INT * T2.W, T1.Z, PV.W,
1310; EG-NEXT:     XOR_INT * T2.W, PV.W, T0.W,
1311; EG-NEXT:    ALU clause starting at 16:
1312; EG-NEXT:     RECIP_UINT * T0.X, T2.W,
1313; EG-NEXT:     MULLO_INT * T0.Y, PS, T2.W,
1314; EG-NEXT:     SUB_INT T4.W, 0.0, PS,
1315; EG-NEXT:     MULHI * T0.Z, T0.X, T2.W,
1316; EG-NEXT:     CNDE_INT T4.W, PS, PV.W, T0.Y,
1317; EG-NEXT:     SETGT_INT * T5.W, 0.0, T3.Z,
1318; EG-NEXT:     MULHI * T0.Y, PV.W, T0.X,
1319; EG-NEXT:     SETGT_INT T2.Y, 0.0, T1.W,
1320; EG-NEXT:     ADD_INT T1.Z, T3.Z, T5.W, BS:VEC_021/SCL_122
1321; EG-NEXT:     ADD_INT T4.W, T0.X, PS,
1322; EG-NEXT:     SUB_INT * T6.W, T0.X, PS,
1323; EG-NEXT:     CNDE_INT T0.Z, T0.Z, PV.W, PS,
1324; EG-NEXT:     XOR_INT T4.W, PV.Z, T5.W,
1325; EG-NEXT:     ADD_INT * T1.W, T1.W, PV.Y,
1326; EG-NEXT:     XOR_INT T1.W, PS, T2.Y,
1327; EG-NEXT:     MULHI * T0.X, PV.Z, PV.W,
1328; EG-NEXT:     SETGT_INT T6.W, 0.0, T1.Y,
1329; EG-NEXT:     RECIP_UINT * T0.Y, PV.W,
1330; EG-NEXT:     ADD_INT T7.W, T1.Y, PV.W,
1331; EG-NEXT:     MULLO_INT * T0.Z, PS, T1.W,
1332; EG-NEXT:     XOR_INT T1.Z, PV.W, T6.W, BS:VEC_021/SCL_122
1333; EG-NEXT:     SUB_INT T7.W, 0.0, PS,
1334; EG-NEXT:     MULHI * T1.Y, T0.Y, T1.W,
1335; EG-NEXT:     CNDE_INT T7.W, PS, PV.W, T0.Z,
1336; EG-NEXT:     RECIP_UINT * T0.Z, PV.Z,
1337; EG-NEXT:     SETGT_INT T8.W, 0.0, T3.W,
1338; EG-NEXT:     MULHI * T2.X, PV.W, T0.Y,
1339; EG-NEXT:     ADD_INT T4.Y, T3.W, PV.W,
1340; EG-NEXT:     ADD_INT T2.Z, T0.Y, PS,
1341; EG-NEXT:     SUB_INT T3.W, T0.Y, PS,
1342; EG-NEXT:     MULLO_INT * T0.Y, T0.Z, T1.Z,
1343; EG-NEXT:     CNDE_INT T2.X, T1.Y, PV.Z, PV.W,
1344; EG-NEXT:     XOR_INT T1.Y, PV.Y, T8.W,
1345; EG-NEXT:     SETGT_INT T2.Z, 0.0, T1.X,
1346; EG-NEXT:     SUB_INT T3.W, 0.0, PS,
1347; EG-NEXT:     MULHI * T3.Z, T0.Z, T1.Z,
1348; EG-NEXT:     CNDE_INT T4.Z, PS, PV.W, T0.Y,
1349; EG-NEXT:     ADD_INT T3.W, T1.X, PV.Z,
1350; EG-NEXT:     MULHI * T0.Y, PV.X, PV.Y,
1351; EG-NEXT:     XOR_INT T3.W, PV.W, T2.Z, BS:VEC_021/SCL_122
1352; EG-NEXT:     MULHI * T1.X, PV.Z, T0.Z,
1353; EG-NEXT:     RECIP_UINT * T2.X, PV.W,
1354; EG-NEXT:     MULLO_INT * T4.X, PS, T3.W,
1355; EG-NEXT:     SETGT_INT T4.Z, 0.0, T3.Y,
1356; EG-NEXT:     SUB_INT T7.W, 0.0, PS,
1357; EG-NEXT:     MULHI * T4.Y, T2.X, T3.W,
1358; EG-NEXT:     CNDE_INT T4.X, PS, PV.W, T4.X,
1359; EG-NEXT:     ADD_INT T3.Y, T3.Y, PV.Z,
1360; EG-NEXT:     ADD_INT T5.Z, T0.Z, T1.X,
1361; EG-NEXT:     SUB_INT T7.W, T0.Z, T1.X,
1362; EG-NEXT:     MULLO_INT * T0.Z, T0.Y, T1.W,
1363; EG-NEXT:     CNDE_INT T5.Y, T3.Z, PV.Z, PV.W,
1364; EG-NEXT:     XOR_INT T3.Z, PV.Y, T4.Z,
1365; EG-NEXT:     SUB_INT T7.W, T1.Y, PS,
1366; EG-NEXT:     MULHI * T1.X, PV.X, T2.X,
1367; EG-NEXT:     SETGE_UINT T5.Z, PV.W, T1.W,
1368; EG-NEXT:     SETGE_UINT T1.W, T1.Y, T0.Z,
1369; EG-NEXT:     MULHI * T0.Z, PV.Y, PV.Z,
1370; EG-NEXT:     AND_INT T1.Y, PV.Z, PV.W,
1371; EG-NEXT:     ADD_INT T5.Z, T0.Y, 1,
1372; EG-NEXT:     SETGT_INT T7.W, 0.0, T3.X,
1373; EG-NEXT:     MULLO_INT * T3.Y, PS, T1.Z,
1374; EG-NEXT:     SUB_INT T4.X, T3.Z, PS,
1375; EG-NEXT:     ADD_INT T5.Y, T3.X, PV.W,
1376; EG-NEXT:     ADD_INT T6.Z, T2.X, T1.X, BS:VEC_120/SCL_212
1377; EG-NEXT:     SUB_INT * T9.W, T2.X, T1.X, BS:VEC_120/SCL_212
1378; EG-NEXT:     MULLO_INT * T1.X, T0.X, T2.W,
1379; EG-NEXT:     CNDE_INT T2.X, T4.Y, T6.Z, T9.W,
1380; EG-NEXT:     XOR_INT T4.Y, T5.Y, T7.W, BS:VEC_201
1381; EG-NEXT:     SUB_INT T6.Z, T4.W, PS, BS:VEC_120/SCL_212
1382; EG-NEXT:     SETGE_UINT T9.W, T4.X, T1.Z, BS:VEC_102/SCL_221
1383; EG-NEXT:     SETGE_UINT * T10.W, T3.Z, T3.Y,
1384; EG-NEXT:     AND_INT T3.X, PV.W, PS,
1385; EG-NEXT:     ADD_INT T3.Y, T0.Z, 1,
1386; EG-NEXT:     SETGE_UINT T1.Z, PV.Z, T2.W,
1387; EG-NEXT:     SETGE_UINT T2.W, T4.W, T1.X,
1388; EG-NEXT:     MULHI * T1.X, PV.X, PV.Y,
1389; EG-NEXT:     AND_INT T2.X, PV.Z, PV.W,
1390; EG-NEXT:     ADD_INT T5.Y, T0.X, 1,
1391; EG-NEXT:     CNDE_INT T1.Z, PV.X, T0.Z, PV.Y,
1392; EG-NEXT:     ADD_INT T4.W, T0.Z, literal.x,
1393; EG-NEXT:     MULLO_INT * T0.Z, PS, T3.W,
1394; EG-NEXT:    -1(nan), 0(0.000000e+00)
1395; EG-NEXT:     CNDE_INT T3.X, T10.W, PV.W, PV.Z,
1396; EG-NEXT:     CNDE_INT T3.Y, PV.X, T0.X, PV.Y,
1397; EG-NEXT:     CNDE_INT T1.Z, T1.Y, T0.Y, T5.Z,
1398; EG-NEXT:     ADD_INT T4.W, T0.Y, literal.x, BS:VEC_120/SCL_212
1399; EG-NEXT:     SUB_INT * T9.W, T4.Y, PS,
1400; EG-NEXT:    -1(nan), 0(0.000000e+00)
1401; EG-NEXT:     ADD_INT T0.X, T0.X, literal.x,
1402; EG-NEXT:     SETGE_UINT T0.Y, PS, T3.W,
1403; EG-NEXT:     SETGE_UINT T0.Z, T4.Y, T0.Z,
1404; EG-NEXT:     CNDE_INT T1.W, T1.W, PV.W, PV.Z,
1405; EG-NEXT:     XOR_INT * T3.W, T8.W, T2.Y,
1406; EG-NEXT:    -1(nan), 0(0.000000e+00)
1407; EG-NEXT:     XOR_INT T2.X, PV.W, PS,
1408; EG-NEXT:     AND_INT T0.Y, PV.Y, PV.Z,
1409; EG-NEXT:     ADD_INT T1.Z, T1.X, 1,
1410; EG-NEXT:     CNDE_INT T1.W, T2.W, PV.X, T3.Y,
1411; EG-NEXT:     XOR_INT * T0.W, T5.W, T0.W,
1412; EG-NEXT:     XOR_INT T0.X, T4.Z, T6.W, BS:VEC_021/SCL_122
1413; EG-NEXT:     XOR_INT T1.Y, PV.W, PS,
1414; EG-NEXT:     CNDE_INT T1.Z, PV.Y, T1.X, PV.Z,
1415; EG-NEXT:     ADD_INT T1.W, T1.X, literal.x,
1416; EG-NEXT:     SUB_INT * T3.W, PV.X, T3.W,
1417; EG-NEXT:    -1(nan), 0(0.000000e+00)
1418; EG-NEXT:     CNDE_INT T0.Y, T0.Z, PV.W, PV.Z,
1419; EG-NEXT:     SUB_INT T3.Z, PV.Y, T0.W,
1420; EG-NEXT:     XOR_INT T0.W, T7.W, T2.Z,
1421; EG-NEXT:     XOR_INT * T1.W, T3.X, PV.X,
1422; EG-NEXT:     SUB_INT T3.Y, PS, T0.X,
1423; EG-NEXT:     XOR_INT * T1.W, PV.Y, PV.W,
1424; EG-NEXT:     SUB_INT T3.X, PV.W, T0.W,
1425; EG-NEXT:     LSHR * T0.X, KC0[2].Y, literal.x,
1426; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
; %den_ptr addresses the <4 x i32> element directly after the numerator at %in.
1427  %den_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1
1428  %num = load <4 x i32>, <4 x i32> addrspace(1) * %in
1429  %den = load <4 x i32>, <4 x i32> addrspace(1) * %den_ptr
1430  %result = sdiv <4 x i32> %num, %den
1431  store <4 x i32> %result, <4 x i32> addrspace(1)* %out
1432  ret void
1433}
1434
; sdiv_v4i32_4: signed division of a loaded <4 x i32> by the splat constant 4,
; with the quotient vector stored to %out.
; The checks below expect every lane to be lowered to a shift sequence
; (arithmetic shift right by 31, logical shift right by 30, add, arithmetic
; shift right by 2); no reciprocal or multiply instructions appear in any of
; the expected outputs for this function.
1435define amdgpu_kernel void @sdiv_v4i32_4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
1436; GCN-LABEL: sdiv_v4i32_4:
1437; GCN:       ; %bb.0:
1438; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
1439; GCN-NEXT:    s_mov_b32 s7, 0xf000
1440; GCN-NEXT:    s_mov_b32 s6, -1
1441; GCN-NEXT:    s_mov_b32 s10, s6
1442; GCN-NEXT:    s_mov_b32 s11, s7
1443; GCN-NEXT:    s_waitcnt lgkmcnt(0)
1444; GCN-NEXT:    s_mov_b32 s8, s2
1445; GCN-NEXT:    s_mov_b32 s9, s3
1446; GCN-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
1447; GCN-NEXT:    s_mov_b32 s4, s0
1448; GCN-NEXT:    s_mov_b32 s5, s1
1449; GCN-NEXT:    s_waitcnt vmcnt(0)
1450; GCN-NEXT:    v_ashrrev_i32_e32 v4, 31, v0
1451; GCN-NEXT:    v_ashrrev_i32_e32 v5, 31, v1
1452; GCN-NEXT:    v_ashrrev_i32_e32 v6, 31, v2
1453; GCN-NEXT:    v_ashrrev_i32_e32 v7, 31, v3
1454; GCN-NEXT:    v_lshrrev_b32_e32 v4, 30, v4
1455; GCN-NEXT:    v_lshrrev_b32_e32 v5, 30, v5
1456; GCN-NEXT:    v_lshrrev_b32_e32 v6, 30, v6
1457; GCN-NEXT:    v_lshrrev_b32_e32 v7, 30, v7
1458; GCN-NEXT:    v_add_i32_e32 v0, vcc, v4, v0
1459; GCN-NEXT:    v_add_i32_e32 v1, vcc, v5, v1
1460; GCN-NEXT:    v_add_i32_e32 v2, vcc, v6, v2
1461; GCN-NEXT:    v_add_i32_e32 v3, vcc, v7, v3
1462; GCN-NEXT:    v_ashrrev_i32_e32 v0, 2, v0
1463; GCN-NEXT:    v_ashrrev_i32_e32 v1, 2, v1
1464; GCN-NEXT:    v_ashrrev_i32_e32 v2, 2, v2
1465; GCN-NEXT:    v_ashrrev_i32_e32 v3, 2, v3
1466; GCN-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
1467; GCN-NEXT:    s_endpgm
1468;
1469; TONGA-LABEL: sdiv_v4i32_4:
1470; TONGA:       ; %bb.0:
1471; TONGA-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1472; TONGA-NEXT:    s_mov_b32 s3, 0xf000
1473; TONGA-NEXT:    s_mov_b32 s2, -1
1474; TONGA-NEXT:    s_waitcnt lgkmcnt(0)
1475; TONGA-NEXT:    s_mov_b32 s0, s4
1476; TONGA-NEXT:    s_mov_b32 s1, s5
1477; TONGA-NEXT:    s_mov_b32 s4, s6
1478; TONGA-NEXT:    s_mov_b32 s5, s7
1479; TONGA-NEXT:    s_mov_b32 s6, s2
1480; TONGA-NEXT:    s_mov_b32 s7, s3
1481; TONGA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
1482; TONGA-NEXT:    s_waitcnt vmcnt(0)
1483; TONGA-NEXT:    v_ashrrev_i32_e32 v4, 31, v0
1484; TONGA-NEXT:    v_ashrrev_i32_e32 v5, 31, v1
1485; TONGA-NEXT:    v_ashrrev_i32_e32 v6, 31, v2
1486; TONGA-NEXT:    v_ashrrev_i32_e32 v7, 31, v3
1487; TONGA-NEXT:    v_lshrrev_b32_e32 v4, 30, v4
1488; TONGA-NEXT:    v_lshrrev_b32_e32 v5, 30, v5
1489; TONGA-NEXT:    v_lshrrev_b32_e32 v6, 30, v6
1490; TONGA-NEXT:    v_lshrrev_b32_e32 v7, 30, v7
1491; TONGA-NEXT:    v_add_u32_e32 v0, vcc, v4, v0
1492; TONGA-NEXT:    v_add_u32_e32 v1, vcc, v5, v1
1493; TONGA-NEXT:    v_add_u32_e32 v2, vcc, v6, v2
1494; TONGA-NEXT:    v_add_u32_e32 v3, vcc, v7, v3
1495; TONGA-NEXT:    v_ashrrev_i32_e32 v0, 2, v0
1496; TONGA-NEXT:    v_ashrrev_i32_e32 v1, 2, v1
1497; TONGA-NEXT:    v_ashrrev_i32_e32 v2, 2, v2
1498; TONGA-NEXT:    v_ashrrev_i32_e32 v3, 2, v3
1499; TONGA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
1500; TONGA-NEXT:    s_endpgm
1501;
1502; GFX9-LABEL: sdiv_v4i32_4:
1503; GFX9:       ; %bb.0:
1504; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1505; GFX9-NEXT:    s_mov_b32 s3, 0xf000
1506; GFX9-NEXT:    s_mov_b32 s2, -1
1507; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1508; GFX9-NEXT:    s_mov_b32 s0, s4
1509; GFX9-NEXT:    s_mov_b32 s1, s5
1510; GFX9-NEXT:    s_mov_b32 s4, s6
1511; GFX9-NEXT:    s_mov_b32 s5, s7
1512; GFX9-NEXT:    s_mov_b32 s6, s2
1513; GFX9-NEXT:    s_mov_b32 s7, s3
1514; GFX9-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
1515; GFX9-NEXT:    s_waitcnt vmcnt(0)
1516; GFX9-NEXT:    v_ashrrev_i32_e32 v4, 31, v0
1517; GFX9-NEXT:    v_ashrrev_i32_e32 v5, 31, v1
1518; GFX9-NEXT:    v_ashrrev_i32_e32 v6, 31, v2
1519; GFX9-NEXT:    v_ashrrev_i32_e32 v7, 31, v3
1520; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 30, v4
1521; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 30, v5
1522; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 30, v6
1523; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 30, v7
1524; GFX9-NEXT:    v_add_u32_e32 v0, v0, v4
1525; GFX9-NEXT:    v_add_u32_e32 v1, v1, v5
1526; GFX9-NEXT:    v_add_u32_e32 v2, v2, v6
1527; GFX9-NEXT:    v_add_u32_e32 v3, v3, v7
1528; GFX9-NEXT:    v_ashrrev_i32_e32 v0, 2, v0
1529; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 2, v1
1530; GFX9-NEXT:    v_ashrrev_i32_e32 v2, 2, v2
1531; GFX9-NEXT:    v_ashrrev_i32_e32 v3, 2, v3
1532; GFX9-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
1533; GFX9-NEXT:    s_endpgm
1534;
1535; EG-LABEL: sdiv_v4i32_4:
1536; EG:       ; %bb.0:
1537; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
1538; EG-NEXT:    TEX 0 @6
1539; EG-NEXT:    ALU 24, @9, KC0[CB0:0-32], KC1[]
1540; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T0.X, 1
1541; EG-NEXT:    CF_END
1542; EG-NEXT:    PAD
1543; EG-NEXT:    Fetch clause starting at 6:
1544; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 0, #1
1545; EG-NEXT:    ALU clause starting at 8:
1546; EG-NEXT:     MOV * T0.X, KC0[2].Z,
1547; EG-NEXT:    ALU clause starting at 9:
1548; EG-NEXT:     ASHR T1.W, T0.W, literal.x,
1549; EG-NEXT:     ASHR * T2.W, T0.Z, literal.x,
1550; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
1551; EG-NEXT:     LSHR * T1.W, PV.W, literal.x,
1552; EG-NEXT:    30(4.203895e-44), 0(0.000000e+00)
1553; EG-NEXT:     ADD_INT T1.Z, T0.W, PV.W,
1554; EG-NEXT:     LSHR T0.W, T2.W, literal.x, BS:VEC_120/SCL_212
1555; EG-NEXT:     ASHR * T1.W, T0.Y, literal.y,
1556; EG-NEXT:    30(4.203895e-44), 31(4.344025e-44)
1557; EG-NEXT:     LSHR T1.Y, PS, literal.x,
1558; EG-NEXT:     ASHR T2.Z, T0.X, literal.y,
1559; EG-NEXT:     ADD_INT T0.W, T0.Z, PV.W,
1560; EG-NEXT:     ASHR * T1.W, PV.Z, literal.z,
1561; EG-NEXT:    30(4.203895e-44), 31(4.344025e-44)
1562; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1563; EG-NEXT:     ASHR T1.Z, PV.W, literal.x,
1564; EG-NEXT:     LSHR T0.W, PV.Z, literal.y,
1565; EG-NEXT:     ADD_INT * T2.W, T0.Y, PV.Y,
1566; EG-NEXT:    2(2.802597e-45), 30(4.203895e-44)
1567; EG-NEXT:     ASHR T1.Y, PS, literal.x,
1568; EG-NEXT:     ADD_INT * T0.W, T0.X, PV.W,
1569; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1570; EG-NEXT:     ASHR T1.X, PV.W, literal.x,
1571; EG-NEXT:     LSHR * T0.X, KC0[2].Y, literal.x,
1572; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1573  %num = load <4 x i32>, <4 x i32> addrspace(1) * %in
1574  %result = sdiv <4 x i32> %num, <i32 4, i32 4, i32 4, i32 4>
1575  store <4 x i32> %result, <4 x i32> addrspace(1)* %out
1576  ret void
1577}
1578
; v_sdiv_i8: loads the numerator from %in and the denominator from the next
; i8 element, divides them as signed i8, sign-extends the quotient to i32 and
; stores it to %out.
; The checks below expect a float-based expansion: both operands are converted
; to f32, the quotient estimate is num * rcp(den) truncated toward zero, a
; compare of |remainder| against |den| selects a correction term that is added
; to the converted quotient, and the sum is sign-extended from 8 bits.
1579define amdgpu_kernel void @v_sdiv_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
1580; GCN-LABEL: v_sdiv_i8:
1581; GCN:       ; %bb.0:
1582; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
1583; GCN-NEXT:    s_mov_b32 s7, 0xf000
1584; GCN-NEXT:    s_mov_b32 s6, -1
1585; GCN-NEXT:    s_mov_b32 s10, s6
1586; GCN-NEXT:    s_mov_b32 s11, s7
1587; GCN-NEXT:    s_waitcnt lgkmcnt(0)
1588; GCN-NEXT:    s_mov_b32 s8, s2
1589; GCN-NEXT:    s_mov_b32 s9, s3
1590; GCN-NEXT:    buffer_load_sbyte v0, off, s[8:11], 0
1591; GCN-NEXT:    buffer_load_sbyte v1, off, s[8:11], 0 offset:1
1592; GCN-NEXT:    s_mov_b32 s4, s0
1593; GCN-NEXT:    s_mov_b32 s5, s1
1594; GCN-NEXT:    s_waitcnt vmcnt(0)
1595; GCN-NEXT:    v_xor_b32_e32 v2, v0, v1
1596; GCN-NEXT:    v_cvt_f32_i32_e32 v0, v0
1597; GCN-NEXT:    v_cvt_f32_i32_e32 v1, v1
1598; GCN-NEXT:    v_ashrrev_i32_e32 v2, 30, v2
1599; GCN-NEXT:    v_rcp_iflag_f32_e32 v3, v1
1600; GCN-NEXT:    v_or_b32_e32 v2, 1, v2
1601; GCN-NEXT:    v_mul_f32_e32 v3, v0, v3
1602; GCN-NEXT:    v_trunc_f32_e32 v3, v3
1603; GCN-NEXT:    v_mad_f32 v0, -v3, v1, v0
1604; GCN-NEXT:    v_cvt_i32_f32_e32 v3, v3
1605; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v0|, |v1|
1606; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v2, vcc
1607; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v3
1608; GCN-NEXT:    v_bfe_i32 v0, v0, 0, 8
1609; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1610; GCN-NEXT:    s_endpgm
1611;
1612; TONGA-LABEL: v_sdiv_i8:
1613; TONGA:       ; %bb.0:
1614; TONGA-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1615; TONGA-NEXT:    s_mov_b32 s3, 0xf000
1616; TONGA-NEXT:    s_mov_b32 s2, -1
1617; TONGA-NEXT:    s_waitcnt lgkmcnt(0)
1618; TONGA-NEXT:    s_mov_b32 s0, s4
1619; TONGA-NEXT:    s_mov_b32 s1, s5
1620; TONGA-NEXT:    s_mov_b32 s4, s6
1621; TONGA-NEXT:    s_mov_b32 s5, s7
1622; TONGA-NEXT:    s_mov_b32 s6, s2
1623; TONGA-NEXT:    s_mov_b32 s7, s3
1624; TONGA-NEXT:    buffer_load_sbyte v0, off, s[4:7], 0 offset:1
1625; TONGA-NEXT:    buffer_load_sbyte v2, off, s[4:7], 0
1626; TONGA-NEXT:    s_waitcnt vmcnt(1)
1627; TONGA-NEXT:    v_cvt_f32_i32_e32 v1, v0
1628; TONGA-NEXT:    s_waitcnt vmcnt(0)
1629; TONGA-NEXT:    v_cvt_f32_i32_e32 v3, v2
1630; TONGA-NEXT:    v_xor_b32_e32 v0, v2, v0
1631; TONGA-NEXT:    v_ashrrev_i32_e32 v0, 30, v0
1632; TONGA-NEXT:    v_rcp_iflag_f32_e32 v4, v1
1633; TONGA-NEXT:    v_or_b32_e32 v0, 1, v0
1634; TONGA-NEXT:    v_mul_f32_e32 v2, v3, v4
1635; TONGA-NEXT:    v_trunc_f32_e32 v2, v2
1636; TONGA-NEXT:    v_mad_f32 v3, -v2, v1, v3
1637; TONGA-NEXT:    v_cvt_i32_f32_e32 v2, v2
1638; TONGA-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, |v1|
1639; TONGA-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
1640; TONGA-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
1641; TONGA-NEXT:    v_bfe_i32 v0, v0, 0, 8
1642; TONGA-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1643; TONGA-NEXT:    s_endpgm
1644;
1645; GFX9-LABEL: v_sdiv_i8:
1646; GFX9:       ; %bb.0:
1647; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1648; GFX9-NEXT:    s_mov_b32 s3, 0xf000
1649; GFX9-NEXT:    s_mov_b32 s2, -1
1650; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1651; GFX9-NEXT:    s_mov_b32 s0, s4
1652; GFX9-NEXT:    s_mov_b32 s1, s5
1653; GFX9-NEXT:    s_mov_b32 s4, s6
1654; GFX9-NEXT:    s_mov_b32 s5, s7
1655; GFX9-NEXT:    s_mov_b32 s6, s2
1656; GFX9-NEXT:    s_mov_b32 s7, s3
1657; GFX9-NEXT:    buffer_load_sbyte v0, off, s[4:7], 0 offset:1
1658; GFX9-NEXT:    buffer_load_sbyte v2, off, s[4:7], 0
1659; GFX9-NEXT:    s_waitcnt vmcnt(1)
1660; GFX9-NEXT:    v_cvt_f32_i32_e32 v1, v0
1661; GFX9-NEXT:    s_waitcnt vmcnt(0)
1662; GFX9-NEXT:    v_cvt_f32_i32_e32 v3, v2
1663; GFX9-NEXT:    v_xor_b32_e32 v0, v2, v0
1664; GFX9-NEXT:    v_ashrrev_i32_e32 v0, 30, v0
1665; GFX9-NEXT:    v_rcp_iflag_f32_e32 v4, v1
1666; GFX9-NEXT:    v_or_b32_e32 v0, 1, v0
1667; GFX9-NEXT:    v_mul_f32_e32 v2, v3, v4
1668; GFX9-NEXT:    v_trunc_f32_e32 v2, v2
1669; GFX9-NEXT:    v_cvt_i32_f32_e32 v4, v2
1670; GFX9-NEXT:    v_mad_f32 v2, -v2, v1, v3
1671; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, |v1|
1672; GFX9-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
1673; GFX9-NEXT:    v_add_u32_e32 v0, v4, v0
1674; GFX9-NEXT:    v_bfe_i32 v0, v0, 0, 8
1675; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1676; GFX9-NEXT:    s_endpgm
1677;
1678; EG-LABEL: v_sdiv_i8:
1679; EG:       ; %bb.0:
1680; EG-NEXT:    ALU 0, @10, KC0[CB0:0-32], KC1[]
1681; EG-NEXT:    TEX 1 @6
1682; EG-NEXT:    ALU 21, @11, KC0[CB0:0-32], KC1[]
1683; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
1684; EG-NEXT:    CF_END
1685; EG-NEXT:    PAD
1686; EG-NEXT:    Fetch clause starting at 6:
1687; EG-NEXT:     VTX_READ_8 T1.X, T0.X, 1, #1
1688; EG-NEXT:     VTX_READ_8 T0.X, T0.X, 0, #1
1689; EG-NEXT:    ALU clause starting at 10:
1690; EG-NEXT:     MOV * T0.X, KC0[2].Z,
1691; EG-NEXT:    ALU clause starting at 11:
1692; EG-NEXT:     BFE_INT * T0.W, T1.X, 0.0, literal.x,
1693; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
1694; EG-NEXT:     INT_TO_FLT * T0.Y, PV.W,
1695; EG-NEXT:     BFE_INT T1.W, T0.X, 0.0, literal.x,
1696; EG-NEXT:     RECIP_IEEE * T0.X, PS,
1697; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
1698; EG-NEXT:     INT_TO_FLT * T0.Z, PV.W,
1699; EG-NEXT:     MUL_IEEE * T2.W, PS, T0.X,
1700; EG-NEXT:     TRUNC T2.W, PV.W,
1701; EG-NEXT:     XOR_INT * T0.W, T1.W, T0.W,
1702; EG-NEXT:     ASHR T0.W, PS, literal.x,
1703; EG-NEXT:     MULADD_IEEE * T1.W, -PV.W, T0.Y, T0.Z,
1704; EG-NEXT:    30(4.203895e-44), 0(0.000000e+00)
1705; EG-NEXT:     TRUNC T0.Z, T2.W,
1706; EG-NEXT:     SETGE T1.W, |PS|, |T0.Y|,
1707; EG-NEXT:     OR_INT * T0.W, PV.W, 1,
1708; EG-NEXT:     CNDE T0.W, PV.W, 0.0, PS,
1709; EG-NEXT:     FLT_TO_INT * T1.W, PV.Z,
1710; EG-NEXT:     ADD_INT * T0.W, PS, PV.W,
1711; EG-NEXT:     BFE_INT T0.X, PV.W, 0.0, literal.x,
1712; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
1713; EG-NEXT:    8(1.121039e-44), 2(2.802597e-45)
1714  %den_ptr = getelementptr i8, i8 addrspace(1)* %in, i8 1
1715  %num = load i8, i8 addrspace(1) * %in
1716  %den = load i8, i8 addrspace(1) * %den_ptr
1717  %result = sdiv i8 %num, %den
1718  %result.ext = sext i8 %result to i32
1719  store i32 %result.ext, i32 addrspace(1)* %out
1720  ret void
1721}
1722
; v_sdiv_i23: loads the numerator from %in and the denominator from the next
; i23 element, divides them as signed i23, sign-extends the quotient to i32 and
; stores it to %out.
; In the expected output each i23 operand is assembled from a 16-bit load and
; an 8-bit load (byte offsets 0/2 for the numerator, 4/6 for the denominator)
; and then sign-extended from 23 bits before the division.
; The division itself is expected to use the float-based expansion (convert to
; f32, multiply by the reciprocal, truncate, compare-and-select correction),
; and the final result is sign-extended from 23 bits again.
1723define amdgpu_kernel void @v_sdiv_i23(i32 addrspace(1)* %out, i23 addrspace(1)* %in) {
1724; GCN-LABEL: v_sdiv_i23:
1725; GCN:       ; %bb.0:
1726; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
1727; GCN-NEXT:    s_mov_b32 s7, 0xf000
1728; GCN-NEXT:    s_mov_b32 s6, -1
1729; GCN-NEXT:    s_mov_b32 s10, s6
1730; GCN-NEXT:    s_mov_b32 s11, s7
1731; GCN-NEXT:    s_waitcnt lgkmcnt(0)
1732; GCN-NEXT:    s_mov_b32 s4, s0
1733; GCN-NEXT:    s_mov_b32 s5, s1
1734; GCN-NEXT:    s_mov_b32 s8, s2
1735; GCN-NEXT:    s_mov_b32 s9, s3
1736; GCN-NEXT:    buffer_load_ubyte v1, off, s[8:11], 0 offset:2
1737; GCN-NEXT:    buffer_load_ubyte v3, off, s[8:11], 0 offset:6
1738; GCN-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
1739; GCN-NEXT:    buffer_load_ushort v2, off, s[8:11], 0 offset:4
1740; GCN-NEXT:    s_waitcnt vmcnt(3)
1741; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
1742; GCN-NEXT:    s_waitcnt vmcnt(2)
1743; GCN-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
1744; GCN-NEXT:    s_waitcnt vmcnt(1)
1745; GCN-NEXT:    v_or_b32_e32 v0, v0, v1
1746; GCN-NEXT:    s_waitcnt vmcnt(0)
1747; GCN-NEXT:    v_or_b32_e32 v1, v2, v3
1748; GCN-NEXT:    v_bfe_i32 v0, v0, 0, 23
1749; GCN-NEXT:    v_bfe_i32 v1, v1, 0, 23
1750; GCN-NEXT:    v_xor_b32_e32 v2, v0, v1
1751; GCN-NEXT:    v_cvt_f32_i32_e32 v0, v0
1752; GCN-NEXT:    v_cvt_f32_i32_e32 v1, v1
1753; GCN-NEXT:    v_ashrrev_i32_e32 v2, 30, v2
1754; GCN-NEXT:    v_rcp_iflag_f32_e32 v3, v1
1755; GCN-NEXT:    v_or_b32_e32 v2, 1, v2
1756; GCN-NEXT:    v_mul_f32_e32 v3, v0, v3
1757; GCN-NEXT:    v_trunc_f32_e32 v3, v3
1758; GCN-NEXT:    v_mad_f32 v0, -v3, v1, v0
1759; GCN-NEXT:    v_cvt_i32_f32_e32 v3, v3
1760; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v0|, |v1|
1761; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v2, vcc
1762; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v3
1763; GCN-NEXT:    v_bfe_i32 v0, v0, 0, 23
1764; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1765; GCN-NEXT:    s_endpgm
1766;
1767; TONGA-LABEL: v_sdiv_i23:
1768; TONGA:       ; %bb.0:
1769; TONGA-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1770; TONGA-NEXT:    s_mov_b32 s3, 0xf000
1771; TONGA-NEXT:    s_mov_b32 s2, -1
1772; TONGA-NEXT:    s_waitcnt lgkmcnt(0)
1773; TONGA-NEXT:    s_mov_b32 s0, s4
1774; TONGA-NEXT:    s_mov_b32 s1, s5
1775; TONGA-NEXT:    s_mov_b32 s4, s6
1776; TONGA-NEXT:    s_mov_b32 s5, s7
1777; TONGA-NEXT:    s_mov_b32 s6, s2
1778; TONGA-NEXT:    s_mov_b32 s7, s3
1779; TONGA-NEXT:    buffer_load_ubyte v0, off, s[4:7], 0 offset:2
1780; TONGA-NEXT:    buffer_load_ushort v1, off, s[4:7], 0 offset:4
1781; TONGA-NEXT:    buffer_load_ubyte v2, off, s[4:7], 0 offset:6
1782; TONGA-NEXT:    buffer_load_ushort v3, off, s[4:7], 0
1783; TONGA-NEXT:    s_waitcnt vmcnt(3)
1784; TONGA-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
1785; TONGA-NEXT:    s_waitcnt vmcnt(1)
1786; TONGA-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
1787; TONGA-NEXT:    v_or_b32_e32 v1, v1, v2
1788; TONGA-NEXT:    v_bfe_i32 v1, v1, 0, 23
1789; TONGA-NEXT:    v_cvt_f32_i32_e32 v2, v1
1790; TONGA-NEXT:    s_waitcnt vmcnt(0)
1791; TONGA-NEXT:    v_or_b32_e32 v0, v3, v0
1792; TONGA-NEXT:    v_bfe_i32 v0, v0, 0, 23
1793; TONGA-NEXT:    v_cvt_f32_i32_e32 v3, v0
1794; TONGA-NEXT:    v_rcp_iflag_f32_e32 v4, v2
1795; TONGA-NEXT:    v_xor_b32_e32 v0, v0, v1
1796; TONGA-NEXT:    v_ashrrev_i32_e32 v0, 30, v0
1797; TONGA-NEXT:    v_or_b32_e32 v0, 1, v0
1798; TONGA-NEXT:    v_mul_f32_e32 v1, v3, v4
1799; TONGA-NEXT:    v_trunc_f32_e32 v1, v1
1800; TONGA-NEXT:    v_mad_f32 v3, -v1, v2, v3
1801; TONGA-NEXT:    v_cvt_i32_f32_e32 v1, v1
1802; TONGA-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, |v2|
1803; TONGA-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
1804; TONGA-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
1805; TONGA-NEXT:    v_bfe_i32 v0, v0, 0, 23
1806; TONGA-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1807; TONGA-NEXT:    s_endpgm
1808;
1809; GFX9-LABEL: v_sdiv_i23:
1810; GFX9:       ; %bb.0:
1811; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1812; GFX9-NEXT:    s_mov_b32 s3, 0xf000
1813; GFX9-NEXT:    s_mov_b32 s2, -1
1814; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1815; GFX9-NEXT:    s_mov_b32 s0, s4
1816; GFX9-NEXT:    s_mov_b32 s1, s5
1817; GFX9-NEXT:    s_mov_b32 s4, s6
1818; GFX9-NEXT:    s_mov_b32 s5, s7
1819; GFX9-NEXT:    s_mov_b32 s6, s2
1820; GFX9-NEXT:    s_mov_b32 s7, s3
1821; GFX9-NEXT:    buffer_load_ubyte v0, off, s[4:7], 0 offset:2
1822; GFX9-NEXT:    buffer_load_ushort v1, off, s[4:7], 0 offset:4
1823; GFX9-NEXT:    buffer_load_ubyte v2, off, s[4:7], 0 offset:6
1824; GFX9-NEXT:    buffer_load_ushort v3, off, s[4:7], 0
1825; GFX9-NEXT:    s_waitcnt vmcnt(3)
1826; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
1827; GFX9-NEXT:    s_waitcnt vmcnt(1)
1828; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
1829; GFX9-NEXT:    v_or_b32_e32 v1, v1, v2
1830; GFX9-NEXT:    v_bfe_i32 v1, v1, 0, 23
1831; GFX9-NEXT:    v_cvt_f32_i32_e32 v2, v1
1832; GFX9-NEXT:    s_waitcnt vmcnt(0)
1833; GFX9-NEXT:    v_or_b32_e32 v0, v3, v0
1834; GFX9-NEXT:    v_bfe_i32 v0, v0, 0, 23
1835; GFX9-NEXT:    v_cvt_f32_i32_e32 v3, v0
1836; GFX9-NEXT:    v_rcp_iflag_f32_e32 v4, v2
1837; GFX9-NEXT:    v_xor_b32_e32 v0, v0, v1
1838; GFX9-NEXT:    v_ashrrev_i32_e32 v0, 30, v0
1839; GFX9-NEXT:    v_or_b32_e32 v0, 1, v0
1840; GFX9-NEXT:    v_mul_f32_e32 v1, v3, v4
1841; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
1842; GFX9-NEXT:    v_cvt_i32_f32_e32 v4, v1
1843; GFX9-NEXT:    v_mad_f32 v1, -v1, v2, v3
1844; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v2|
1845; GFX9-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
1846; GFX9-NEXT:    v_add_u32_e32 v0, v4, v0
1847; GFX9-NEXT:    v_bfe_i32 v0, v0, 0, 23
1848; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1849; GFX9-NEXT:    s_endpgm
1850;
1851; EG-LABEL: v_sdiv_i23:
1852; EG:       ; %bb.0:
1853; EG-NEXT:    ALU 0, @14, KC0[CB0:0-32], KC1[]
1854; EG-NEXT:    TEX 3 @6
1855; EG-NEXT:    ALU 33, @15, KC0[CB0:0-32], KC1[]
1856; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
1857; EG-NEXT:    CF_END
1858; EG-NEXT:    PAD
1859; EG-NEXT:    Fetch clause starting at 6:
1860; EG-NEXT:     VTX_READ_8 T1.X, T0.X, 6, #1
1861; EG-NEXT:     VTX_READ_16 T2.X, T0.X, 0, #1
1862; EG-NEXT:     VTX_READ_8 T3.X, T0.X, 2, #1
1863; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 4, #1
1864; EG-NEXT:    ALU clause starting at 14:
1865; EG-NEXT:     MOV * T0.X, KC0[2].Z,
1866; EG-NEXT:    ALU clause starting at 15:
1867; EG-NEXT:     LSHL * T0.W, T1.X, literal.x,
1868; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
1869; EG-NEXT:     OR_INT T0.W, T0.X, PV.W,
1870; EG-NEXT:     LSHL * T1.W, T3.X, literal.x,
1871; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
1872; EG-NEXT:     LSHL * T0.W, PV.W, literal.x,
1873; EG-NEXT:    9(1.261169e-44), 0(0.000000e+00)
1874; EG-NEXT:     ASHR T0.W, PV.W, literal.x,
1875; EG-NEXT:     OR_INT * T1.W, T2.X, T1.W,
1876; EG-NEXT:    9(1.261169e-44), 0(0.000000e+00)
1877; EG-NEXT:     LSHL T1.W, PS, literal.x,
1878; EG-NEXT:     INT_TO_FLT * T0.X, PV.W,
1879; EG-NEXT:    9(1.261169e-44), 0(0.000000e+00)
1880; EG-NEXT:     ASHR T1.W, PV.W, literal.x,
1881; EG-NEXT:     RECIP_IEEE * T0.Y, PS,
1882; EG-NEXT:    9(1.261169e-44), 0(0.000000e+00)
1883; EG-NEXT:     INT_TO_FLT * T0.Z, PV.W,
1884; EG-NEXT:     MUL_IEEE * T2.W, PS, T0.Y,
1885; EG-NEXT:     TRUNC T2.W, PV.W,
1886; EG-NEXT:     XOR_INT * T0.W, T1.W, T0.W,
1887; EG-NEXT:     ASHR T0.W, PS, literal.x,
1888; EG-NEXT:     MULADD_IEEE * T1.W, -PV.W, T0.X, T0.Z,
1889; EG-NEXT:    30(4.203895e-44), 0(0.000000e+00)
1890; EG-NEXT:     TRUNC T0.Z, T2.W,
1891; EG-NEXT:     SETGE T1.W, |PS|, |T0.X|,
1892; EG-NEXT:     OR_INT * T0.W, PV.W, 1,
1893; EG-NEXT:     CNDE T0.W, PV.W, 0.0, PS,
1894; EG-NEXT:     FLT_TO_INT * T1.W, PV.Z,
1895; EG-NEXT:     ADD_INT * T0.W, PS, PV.W,
1896; EG-NEXT:     LSHL * T0.W, PV.W, literal.x,
1897; EG-NEXT:    9(1.261169e-44), 0(0.000000e+00)
1898; EG-NEXT:     ASHR T0.X, PV.W, literal.x,
1899; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
1900; EG-NEXT:    9(1.261169e-44), 2(2.802597e-45)
1901  %den_ptr = getelementptr i23, i23 addrspace(1)* %in, i23 1
1902  %num = load i23, i23 addrspace(1) * %in
1903  %den = load i23, i23 addrspace(1) * %den_ptr
1904  %result = sdiv i23 %num, %den
1905  %result.ext = sext i23 %result to i32
1906  store i32 %result.ext, i32 addrspace(1)* %out
1907  ret void
1908}
1909
; v_sdiv_i24: loads the numerator from %in and the denominator from the next
; i24 element, divides them as signed i24, sign-extends the quotient to i32 and
; stores it to %out.
; In the expected output each i24 operand is assembled from a 16-bit load plus
; a sign-extended 8-bit load of the top byte (byte offsets 0/2 and 4/6).
; The GCN, TONGA and GFX9 checks expect the float-based expansion (convert to
; f32, multiply by the reciprocal, truncate, compare-and-select correction)
; followed by a sign extension from 24 bits.
; The EG checks instead show an integer expansion built from RECIP_UINT, MULHI
; and MULLO_INT with explicit sign handling.
; NOTE(review): the i23 test in this file gets the float-based sequence on EG
; while i24 does not; presumably the 24-bit fast path is not recognized on r600
; for this input. Confirm whether that difference is intended.
1910define amdgpu_kernel void @v_sdiv_i24(i32 addrspace(1)* %out, i24 addrspace(1)* %in) {
1911; GCN-LABEL: v_sdiv_i24:
1912; GCN:       ; %bb.0:
1913; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
1914; GCN-NEXT:    s_mov_b32 s7, 0xf000
1915; GCN-NEXT:    s_mov_b32 s6, -1
1916; GCN-NEXT:    s_mov_b32 s10, s6
1917; GCN-NEXT:    s_mov_b32 s11, s7
1918; GCN-NEXT:    s_waitcnt lgkmcnt(0)
1919; GCN-NEXT:    s_mov_b32 s4, s0
1920; GCN-NEXT:    s_mov_b32 s5, s1
1921; GCN-NEXT:    s_mov_b32 s8, s2
1922; GCN-NEXT:    s_mov_b32 s9, s3
1923; GCN-NEXT:    buffer_load_sbyte v1, off, s[8:11], 0 offset:2
1924; GCN-NEXT:    buffer_load_sbyte v3, off, s[8:11], 0 offset:6
1925; GCN-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
1926; GCN-NEXT:    buffer_load_ushort v2, off, s[8:11], 0 offset:4
1927; GCN-NEXT:    s_waitcnt vmcnt(3)
1928; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
1929; GCN-NEXT:    s_waitcnt vmcnt(2)
1930; GCN-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
1931; GCN-NEXT:    s_waitcnt vmcnt(1)
1932; GCN-NEXT:    v_or_b32_e32 v0, v0, v1
1933; GCN-NEXT:    s_waitcnt vmcnt(0)
1934; GCN-NEXT:    v_or_b32_e32 v2, v2, v3
1935; GCN-NEXT:    v_xor_b32_e32 v1, v1, v3
1936; GCN-NEXT:    v_ashrrev_i32_e32 v1, 30, v1
1937; GCN-NEXT:    v_cvt_f32_i32_e32 v0, v0
1938; GCN-NEXT:    v_cvt_f32_i32_e32 v2, v2
1939; GCN-NEXT:    v_or_b32_e32 v1, 1, v1
1940; GCN-NEXT:    v_rcp_iflag_f32_e32 v3, v2
1941; GCN-NEXT:    v_mul_f32_e32 v3, v0, v3
1942; GCN-NEXT:    v_trunc_f32_e32 v3, v3
1943; GCN-NEXT:    v_mad_f32 v0, -v3, v2, v0
1944; GCN-NEXT:    v_cvt_i32_f32_e32 v3, v3
1945; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v0|, |v2|
1946; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v1, vcc
1947; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v3
1948; GCN-NEXT:    v_bfe_i32 v0, v0, 0, 24
1949; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1950; GCN-NEXT:    s_endpgm
1951;
1952; TONGA-LABEL: v_sdiv_i24:
1953; TONGA:       ; %bb.0:
1954; TONGA-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1955; TONGA-NEXT:    s_mov_b32 s3, 0xf000
1956; TONGA-NEXT:    s_mov_b32 s2, -1
1957; TONGA-NEXT:    s_waitcnt lgkmcnt(0)
1958; TONGA-NEXT:    s_mov_b32 s0, s4
1959; TONGA-NEXT:    s_mov_b32 s1, s5
1960; TONGA-NEXT:    s_mov_b32 s4, s6
1961; TONGA-NEXT:    s_mov_b32 s5, s7
1962; TONGA-NEXT:    s_mov_b32 s6, s2
1963; TONGA-NEXT:    s_mov_b32 s7, s3
1964; TONGA-NEXT:    buffer_load_sbyte v0, off, s[4:7], 0 offset:2
1965; TONGA-NEXT:    buffer_load_ushort v1, off, s[4:7], 0 offset:4
1966; TONGA-NEXT:    buffer_load_sbyte v2, off, s[4:7], 0 offset:6
1967; TONGA-NEXT:    buffer_load_ushort v3, off, s[4:7], 0
1968; TONGA-NEXT:    s_waitcnt vmcnt(3)
1969; TONGA-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
1970; TONGA-NEXT:    s_waitcnt vmcnt(1)
1971; TONGA-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
1972; TONGA-NEXT:    v_or_b32_e32 v1, v1, v2
1973; TONGA-NEXT:    v_cvt_f32_i32_e32 v1, v1
1974; TONGA-NEXT:    s_waitcnt vmcnt(0)
1975; TONGA-NEXT:    v_or_b32_e32 v3, v3, v0
1976; TONGA-NEXT:    v_cvt_f32_i32_e32 v3, v3
1977; TONGA-NEXT:    v_xor_b32_e32 v0, v0, v2
1978; TONGA-NEXT:    v_rcp_iflag_f32_e32 v4, v1
1979; TONGA-NEXT:    v_ashrrev_i32_e32 v0, 30, v0
1980; TONGA-NEXT:    v_or_b32_e32 v0, 1, v0
1981; TONGA-NEXT:    v_mul_f32_e32 v2, v3, v4
1982; TONGA-NEXT:    v_trunc_f32_e32 v2, v2
1983; TONGA-NEXT:    v_mad_f32 v3, -v2, v1, v3
1984; TONGA-NEXT:    v_cvt_i32_f32_e32 v2, v2
1985; TONGA-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, |v1|
1986; TONGA-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
1987; TONGA-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
1988; TONGA-NEXT:    v_bfe_i32 v0, v0, 0, 24
1989; TONGA-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1990; TONGA-NEXT:    s_endpgm
1991;
1992; GFX9-LABEL: v_sdiv_i24:
1993; GFX9:       ; %bb.0:
1994; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1995; GFX9-NEXT:    s_mov_b32 s3, 0xf000
1996; GFX9-NEXT:    s_mov_b32 s2, -1
1997; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1998; GFX9-NEXT:    s_mov_b32 s0, s4
1999; GFX9-NEXT:    s_mov_b32 s1, s5
2000; GFX9-NEXT:    s_mov_b32 s4, s6
2001; GFX9-NEXT:    s_mov_b32 s5, s7
2002; GFX9-NEXT:    s_mov_b32 s6, s2
2003; GFX9-NEXT:    s_mov_b32 s7, s3
2004; GFX9-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
2005; GFX9-NEXT:    buffer_load_sbyte v1, off, s[4:7], 0 offset:2
2006; GFX9-NEXT:    buffer_load_ushort v2, off, s[4:7], 0 offset:4
2007; GFX9-NEXT:    buffer_load_sbyte v3, off, s[4:7], 0 offset:6
2008; GFX9-NEXT:    s_waitcnt vmcnt(2)
2009; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
2010; GFX9-NEXT:    v_or_b32_e32 v0, v0, v1
2011; GFX9-NEXT:    s_waitcnt vmcnt(0)
2012; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
2013; GFX9-NEXT:    v_or_b32_e32 v2, v2, v3
2014; GFX9-NEXT:    v_cvt_f32_i32_e32 v2, v2
2015; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, v0
2016; GFX9-NEXT:    v_xor_b32_e32 v1, v1, v3
2017; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 30, v1
2018; GFX9-NEXT:    v_rcp_iflag_f32_e32 v4, v2
2019; GFX9-NEXT:    v_or_b32_e32 v1, 1, v1
2020; GFX9-NEXT:    v_mul_f32_e32 v3, v0, v4
2021; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
2022; GFX9-NEXT:    v_cvt_i32_f32_e32 v4, v3
2023; GFX9-NEXT:    v_mad_f32 v0, -v3, v2, v0
2024; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v0|, |v2|
2025; GFX9-NEXT:    v_cndmask_b32_e32 v0, 0, v1, vcc
2026; GFX9-NEXT:    v_add_u32_e32 v0, v4, v0
2027; GFX9-NEXT:    v_bfe_i32 v0, v0, 0, 24
2028; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2029; GFX9-NEXT:    s_endpgm
2030;
2031; EG-LABEL: v_sdiv_i24:
2032; EG:       ; %bb.0:
2033; EG-NEXT:    ALU 0, @14, KC0[CB0:0-32], KC1[]
2034; EG-NEXT:    TEX 3 @6
2035; EG-NEXT:    ALU 43, @15, KC0[CB0:0-32], KC1[]
2036; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
2037; EG-NEXT:    CF_END
2038; EG-NEXT:    PAD
2039; EG-NEXT:    Fetch clause starting at 6:
2040; EG-NEXT:     VTX_READ_8 T1.X, T0.X, 6, #1
2041; EG-NEXT:     VTX_READ_16 T2.X, T0.X, 0, #1
2042; EG-NEXT:     VTX_READ_8 T3.X, T0.X, 2, #1
2043; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 4, #1
2044; EG-NEXT:    ALU clause starting at 14:
2045; EG-NEXT:     MOV * T0.X, KC0[2].Z,
2046; EG-NEXT:    ALU clause starting at 15:
2047; EG-NEXT:     BFE_INT * T0.W, T1.X, 0.0, literal.x,
2048; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
2049; EG-NEXT:     LSHL * T0.W, PV.W, literal.x,
2050; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
2051; EG-NEXT:     OR_INT * T0.W, T0.X, PV.W,
2052; EG-NEXT:     SETGT_INT * T1.W, 0.0, PV.W,
2053; EG-NEXT:     ADD_INT * T0.W, T0.W, PV.W,
2054; EG-NEXT:     XOR_INT * T0.W, PV.W, T1.W,
2055; EG-NEXT:     RECIP_UINT * T0.X, PV.W,
2056; EG-NEXT:     BFE_INT T2.W, T3.X, 0.0, literal.x,
2057; EG-NEXT:     MULLO_INT * T0.Y, PS, T0.W,
2058; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
2059; EG-NEXT:     LSHL T0.Z, PV.W, literal.x,
2060; EG-NEXT:     SUB_INT T2.W, 0.0, PS,
2061; EG-NEXT:     MULHI * T1.X, T0.X, T0.W,
2062; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
2063; EG-NEXT:     CNDE_INT T2.W, PS, PV.W, T0.Y,
2064; EG-NEXT:     OR_INT * T3.W, T2.X, PV.Z,
2065; EG-NEXT:     SETGT_INT T4.W, 0.0, PS,
2066; EG-NEXT:     MULHI * T0.Y, PV.W, T0.X,
2067; EG-NEXT:     ADD_INT T0.Z, T3.W, PV.W,
2068; EG-NEXT:     ADD_INT T2.W, T0.X, PS,
2069; EG-NEXT:     SUB_INT * T3.W, T0.X, PS,
2070; EG-NEXT:     CNDE_INT T2.W, T1.X, PV.W, PS,
2071; EG-NEXT:     XOR_INT * T3.W, PV.Z, T4.W,
2072; EG-NEXT:     MULHI * T0.X, PV.W, PS,
2073; EG-NEXT:     MULLO_INT * T0.Y, PS, T0.W,
2074; EG-NEXT:     SUB_INT * T2.W, T3.W, PS,
2075; EG-NEXT:     SETGE_UINT T0.W, PV.W, T0.W,
2076; EG-NEXT:     SETGE_UINT * T2.W, T3.W, T0.Y,
2077; EG-NEXT:     AND_INT T0.W, PV.W, PS,
2078; EG-NEXT:     ADD_INT * T3.W, T0.X, 1,
2079; EG-NEXT:     CNDE_INT T0.W, PV.W, T0.X, PS,
2080; EG-NEXT:     ADD_INT * T3.W, T0.X, literal.x,
2081; EG-NEXT:    -1(nan), 0(0.000000e+00)
2082; EG-NEXT:     CNDE_INT T0.W, T2.W, PS, PV.W,
2083; EG-NEXT:     XOR_INT * T1.W, T4.W, T1.W,
2084; EG-NEXT:     XOR_INT * T0.W, PV.W, PS,
2085; EG-NEXT:     SUB_INT * T0.W, PV.W, T1.W,
2086; EG-NEXT:     LSHL * T0.W, PV.W, literal.x,
2087; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
2088; EG-NEXT:     ASHR T0.X, PV.W, literal.x,
2089; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
2090; EG-NEXT:    8(1.121039e-44), 2(2.802597e-45)
2091  %den_ptr = getelementptr i24, i24 addrspace(1)* %in, i24 1
2092  %num = load i24, i24 addrspace(1) * %in
2093  %den = load i24, i24 addrspace(1) * %den_ptr
2094  %result = sdiv i24 %num, %den
2095  %result.ext = sext i24 %result to i32
2096  store i32 %result.ext, i32 addrspace(1)* %out
2097  ret void
2098}
2099
; v_sdiv_i25: signed division of two i25 values loaded from consecutive
; elements of %in (%in[0] / %in[1]). The quotient is sign-extended to i32 and
; stored to %out.
; The expected code sign-extends both operands from 25 bits (v_bfe_i32 with
; offset 0 and width 25 on the amdgcn targets, an LSHL/ASHR pair by 7 on r600),
; then runs a reciprocal-based expansion (v_rcp_iflag_f32 / RECIP_UINT followed
; by multiply-high correction steps) and sign-extends the 25-bit result again
; before the store.
; NOTE(review): presumably i25 was chosen as the first width above the i24 case
; tested just before this kernel -- confirm against the lowering code.
2100define amdgpu_kernel void @v_sdiv_i25(i32 addrspace(1)* %out, i25 addrspace(1)* %in) {
2101; GCN-LABEL: v_sdiv_i25:
2102; GCN:       ; %bb.0:
2103; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
2104; GCN-NEXT:    s_mov_b32 s7, 0xf000
2105; GCN-NEXT:    s_mov_b32 s6, -1
2106; GCN-NEXT:    s_mov_b32 s10, s6
2107; GCN-NEXT:    s_mov_b32 s11, s7
2108; GCN-NEXT:    s_waitcnt lgkmcnt(0)
2109; GCN-NEXT:    s_mov_b32 s8, s2
2110; GCN-NEXT:    s_mov_b32 s9, s3
2111; GCN-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
2112; GCN-NEXT:    s_mov_b32 s4, s0
2113; GCN-NEXT:    s_mov_b32 s5, s1
2114; GCN-NEXT:    s_waitcnt vmcnt(0)
2115; GCN-NEXT:    v_bfe_i32 v2, v0, 0, 25
2116; GCN-NEXT:    v_bfe_i32 v3, v1, 0, 25
2117; GCN-NEXT:    v_bfe_i32 v0, v0, 24, 1
2118; GCN-NEXT:    v_bfe_i32 v1, v1, 24, 1
2119; GCN-NEXT:    v_xor_b32_e32 v4, v0, v1
2120; GCN-NEXT:    v_add_i32_e32 v2, vcc, v0, v2
2121; GCN-NEXT:    v_add_i32_e32 v3, vcc, v1, v3
2122; GCN-NEXT:    v_xor_b32_e32 v0, v2, v0
2123; GCN-NEXT:    v_xor_b32_e32 v1, v3, v1
2124; GCN-NEXT:    v_cvt_f32_u32_e32 v2, v1
2125; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v2
2126; GCN-NEXT:    v_mul_f32_e32 v2, 0x4f800000, v2
2127; GCN-NEXT:    v_cvt_u32_f32_e32 v2, v2
2128; GCN-NEXT:    v_mul_hi_u32 v3, v2, v1
2129; GCN-NEXT:    v_mul_lo_u32 v5, v2, v1
2130; GCN-NEXT:    v_sub_i32_e32 v6, vcc, 0, v5
2131; GCN-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v3
2132; GCN-NEXT:    v_cndmask_b32_e64 v3, v5, v6, s[0:1]
2133; GCN-NEXT:    v_mul_hi_u32 v3, v3, v2
2134; GCN-NEXT:    v_add_i32_e32 v5, vcc, v3, v2
2135; GCN-NEXT:    v_subrev_i32_e32 v2, vcc, v3, v2
2136; GCN-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s[0:1]
2137; GCN-NEXT:    v_mul_hi_u32 v2, v2, v0
2138; GCN-NEXT:    v_mul_lo_u32 v3, v2, v1
2139; GCN-NEXT:    v_add_i32_e32 v5, vcc, 1, v2
2140; GCN-NEXT:    v_add_i32_e32 v6, vcc, -1, v2
2141; GCN-NEXT:    v_subrev_i32_e32 v7, vcc, v3, v0
2142; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v3
2143; GCN-NEXT:    v_cmp_ge_u32_e64 s[0:1], v7, v1
2144; GCN-NEXT:    s_and_b64 s[0:1], s[0:1], vcc
2145; GCN-NEXT:    v_cndmask_b32_e64 v0, v2, v5, s[0:1]
2146; GCN-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc
2147; GCN-NEXT:    v_xor_b32_e32 v0, v0, v4
2148; GCN-NEXT:    v_sub_i32_e32 v0, vcc, v0, v4
2149; GCN-NEXT:    v_bfe_i32 v0, v0, 0, 25
2150; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
2151; GCN-NEXT:    s_endpgm
2152;
2153; TONGA-LABEL: v_sdiv_i25:
2154; TONGA:       ; %bb.0:
2155; TONGA-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x24
2156; TONGA-NEXT:    s_mov_b32 s7, 0xf000
2157; TONGA-NEXT:    s_mov_b32 s6, -1
2158; TONGA-NEXT:    s_mov_b32 s2, s6
2159; TONGA-NEXT:    s_mov_b32 s3, s7
2160; TONGA-NEXT:    s_waitcnt lgkmcnt(0)
2161; TONGA-NEXT:    s_mov_b32 s0, s10
2162; TONGA-NEXT:    s_mov_b32 s1, s11
2163; TONGA-NEXT:    buffer_load_dwordx2 v[0:1], off, s[0:3], 0
2164; TONGA-NEXT:    s_mov_b32 s4, s8
2165; TONGA-NEXT:    s_mov_b32 s5, s9
2166; TONGA-NEXT:    s_waitcnt vmcnt(0)
2167; TONGA-NEXT:    v_bfe_i32 v2, v1, 0, 25
2168; TONGA-NEXT:    v_bfe_i32 v1, v1, 24, 1
2169; TONGA-NEXT:    v_add_u32_e32 v2, vcc, v1, v2
2170; TONGA-NEXT:    v_xor_b32_e32 v2, v2, v1
2171; TONGA-NEXT:    v_cvt_f32_u32_e32 v3, v2
2172; TONGA-NEXT:    v_bfe_i32 v4, v0, 0, 25
2173; TONGA-NEXT:    v_bfe_i32 v0, v0, 24, 1
2174; TONGA-NEXT:    v_add_u32_e32 v4, vcc, v0, v4
2175; TONGA-NEXT:    v_rcp_iflag_f32_e32 v3, v3
2176; TONGA-NEXT:    v_xor_b32_e32 v4, v4, v0
2177; TONGA-NEXT:    v_xor_b32_e32 v0, v0, v1
2178; TONGA-NEXT:    v_mul_f32_e32 v3, 0x4f800000, v3
2179; TONGA-NEXT:    v_cvt_u32_f32_e32 v3, v3
2180; TONGA-NEXT:    v_mul_lo_u32 v5, v3, v2
2181; TONGA-NEXT:    v_mul_hi_u32 v6, v3, v2
2182; TONGA-NEXT:    v_sub_u32_e32 v7, vcc, 0, v5
2183; TONGA-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v6
2184; TONGA-NEXT:    v_cndmask_b32_e64 v5, v5, v7, s[0:1]
2185; TONGA-NEXT:    v_mul_hi_u32 v5, v5, v3
2186; TONGA-NEXT:    v_add_u32_e32 v6, vcc, v5, v3
2187; TONGA-NEXT:    v_subrev_u32_e32 v3, vcc, v5, v3
2188; TONGA-NEXT:    v_cndmask_b32_e64 v3, v3, v6, s[0:1]
2189; TONGA-NEXT:    v_mul_hi_u32 v3, v3, v4
2190; TONGA-NEXT:    v_mul_lo_u32 v1, v3, v2
2191; TONGA-NEXT:    v_add_u32_e32 v5, vcc, 1, v3
2192; TONGA-NEXT:    v_add_u32_e32 v6, vcc, -1, v3
2193; TONGA-NEXT:    v_subrev_u32_e32 v7, vcc, v1, v4
2194; TONGA-NEXT:    v_cmp_ge_u32_e32 vcc, v4, v1
2195; TONGA-NEXT:    v_cmp_ge_u32_e64 s[0:1], v7, v2
2196; TONGA-NEXT:    s_and_b64 s[0:1], s[0:1], vcc
2197; TONGA-NEXT:    v_cndmask_b32_e64 v1, v3, v5, s[0:1]
2198; TONGA-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc
2199; TONGA-NEXT:    v_xor_b32_e32 v1, v1, v0
2200; TONGA-NEXT:    v_sub_u32_e32 v0, vcc, v1, v0
2201; TONGA-NEXT:    v_bfe_i32 v0, v0, 0, 25
2202; TONGA-NEXT:    buffer_store_dword v0, off, s[4:7], 0
2203; TONGA-NEXT:    s_endpgm
2204;
2205; GFX9-LABEL: v_sdiv_i25:
2206; GFX9:       ; %bb.0:
2207; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
2208; GFX9-NEXT:    s_mov_b32 s7, 0xf000
2209; GFX9-NEXT:    s_mov_b32 s6, -1
2210; GFX9-NEXT:    s_mov_b32 s10, s6
2211; GFX9-NEXT:    s_mov_b32 s11, s7
2212; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2213; GFX9-NEXT:    s_mov_b32 s8, s2
2214; GFX9-NEXT:    s_mov_b32 s9, s3
2215; GFX9-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
2216; GFX9-NEXT:    s_mov_b32 s4, s0
2217; GFX9-NEXT:    s_mov_b32 s5, s1
2218; GFX9-NEXT:    s_waitcnt vmcnt(0)
2219; GFX9-NEXT:    v_bfe_i32 v2, v1, 0, 25
2220; GFX9-NEXT:    v_bfe_i32 v1, v1, 24, 1
2221; GFX9-NEXT:    v_add_u32_e32 v2, v2, v1
2222; GFX9-NEXT:    v_xor_b32_e32 v2, v2, v1
2223; GFX9-NEXT:    v_cvt_f32_u32_e32 v3, v2
2224; GFX9-NEXT:    v_bfe_i32 v6, v0, 0, 25
2225; GFX9-NEXT:    v_bfe_i32 v0, v0, 24, 1
2226; GFX9-NEXT:    v_rcp_iflag_f32_e32 v3, v3
2227; GFX9-NEXT:    v_mul_f32_e32 v3, 0x4f800000, v3
2228; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v3
2229; GFX9-NEXT:    v_mul_lo_u32 v4, v3, v2
2230; GFX9-NEXT:    v_mul_hi_u32 v5, v3, v2
2231; GFX9-NEXT:    v_sub_u32_e32 v7, 0, v4
2232; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v5
2233; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v7, vcc
2234; GFX9-NEXT:    v_mul_hi_u32 v4, v4, v3
2235; GFX9-NEXT:    v_add_u32_e32 v5, v6, v0
2236; GFX9-NEXT:    v_xor_b32_e32 v5, v5, v0
2237; GFX9-NEXT:    v_xor_b32_e32 v0, v0, v1
2238; GFX9-NEXT:    v_add_u32_e32 v6, v3, v4
2239; GFX9-NEXT:    v_sub_u32_e32 v3, v3, v4
2240; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
2241; GFX9-NEXT:    v_mul_hi_u32 v3, v3, v5
2242; GFX9-NEXT:    v_mul_lo_u32 v4, v3, v2
2243; GFX9-NEXT:    v_add_u32_e32 v1, 1, v3
2244; GFX9-NEXT:    v_add_u32_e32 v6, -1, v3
2245; GFX9-NEXT:    v_sub_u32_e32 v7, v5, v4
2246; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v5, v4
2247; GFX9-NEXT:    v_cmp_ge_u32_e64 s[0:1], v7, v2
2248; GFX9-NEXT:    s_and_b64 s[0:1], s[0:1], vcc
2249; GFX9-NEXT:    v_cndmask_b32_e64 v1, v3, v1, s[0:1]
2250; GFX9-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc
2251; GFX9-NEXT:    v_xor_b32_e32 v1, v1, v0
2252; GFX9-NEXT:    v_sub_u32_e32 v0, v1, v0
2253; GFX9-NEXT:    v_bfe_i32 v0, v0, 0, 25
2254; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
2255; GFX9-NEXT:    s_endpgm
2256;
2257; EG-LABEL: v_sdiv_i25:
2258; EG:       ; %bb.0:
2259; EG-NEXT:    ALU 1, @10, KC0[CB0:0-32], KC1[]
2260; EG-NEXT:    TEX 1 @6
2261; EG-NEXT:    ALU 41, @12, KC0[CB0:0-32], KC1[]
2262; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
2263; EG-NEXT:    CF_END
2264; EG-NEXT:    PAD
2265; EG-NEXT:    Fetch clause starting at 6:
2266; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 4, #1
2267; EG-NEXT:     VTX_READ_32 T1.X, T1.X, 0, #1
2268; EG-NEXT:    ALU clause starting at 10:
2269; EG-NEXT:     MOV * T0.X, KC0[2].Z,
2270; EG-NEXT:     MOV * T1.X, PV.X,
2271; EG-NEXT:    ALU clause starting at 12:
2272; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
2273; EG-NEXT:    7(9.809089e-45), 0(0.000000e+00)
2274; EG-NEXT:     ASHR * T0.W, PV.W, literal.x,
2275; EG-NEXT:    7(9.809089e-45), 0(0.000000e+00)
2276; EG-NEXT:     SETGT_INT * T1.W, 0.0, PV.W,
2277; EG-NEXT:     ADD_INT * T0.W, T0.W, PV.W,
2278; EG-NEXT:     XOR_INT * T0.W, PV.W, T1.W,
2279; EG-NEXT:     RECIP_UINT * T0.X, PV.W,
2280; EG-NEXT:     MULLO_INT * T0.Y, PS, T0.W,
2281; EG-NEXT:     LSHL T0.Z, T1.X, literal.x,
2282; EG-NEXT:     SUB_INT T2.W, 0.0, PS,
2283; EG-NEXT:     MULHI * T1.X, T0.X, T0.W,
2284; EG-NEXT:    7(9.809089e-45), 0(0.000000e+00)
2285; EG-NEXT:     CNDE_INT T2.W, PS, PV.W, T0.Y,
2286; EG-NEXT:     ASHR * T3.W, PV.Z, literal.x,
2287; EG-NEXT:    7(9.809089e-45), 0(0.000000e+00)
2288; EG-NEXT:     SETGT_INT T4.W, 0.0, PS,
2289; EG-NEXT:     MULHI * T0.Y, PV.W, T0.X,
2290; EG-NEXT:     ADD_INT T0.Z, T3.W, PV.W,
2291; EG-NEXT:     ADD_INT T2.W, T0.X, PS,
2292; EG-NEXT:     SUB_INT * T3.W, T0.X, PS,
2293; EG-NEXT:     CNDE_INT T2.W, T1.X, PV.W, PS,
2294; EG-NEXT:     XOR_INT * T3.W, PV.Z, T4.W,
2295; EG-NEXT:     MULHI * T0.X, PV.W, PS,
2296; EG-NEXT:     MULLO_INT * T0.Y, PS, T0.W,
2297; EG-NEXT:     SUB_INT * T2.W, T3.W, PS,
2298; EG-NEXT:     SETGE_UINT T0.W, PV.W, T0.W,
2299; EG-NEXT:     SETGE_UINT * T2.W, T3.W, T0.Y,
2300; EG-NEXT:     AND_INT T0.W, PV.W, PS,
2301; EG-NEXT:     ADD_INT * T3.W, T0.X, 1,
2302; EG-NEXT:     CNDE_INT T0.W, PV.W, T0.X, PS,
2303; EG-NEXT:     ADD_INT * T3.W, T0.X, literal.x,
2304; EG-NEXT:    -1(nan), 0(0.000000e+00)
2305; EG-NEXT:     CNDE_INT T0.W, T2.W, PS, PV.W,
2306; EG-NEXT:     XOR_INT * T1.W, T4.W, T1.W,
2307; EG-NEXT:     XOR_INT * T0.W, PV.W, PS,
2308; EG-NEXT:     SUB_INT * T0.W, PV.W, T1.W,
2309; EG-NEXT:     LSHL * T0.W, PV.W, literal.x,
2310; EG-NEXT:    7(9.809089e-45), 0(0.000000e+00)
2311; EG-NEXT:     ASHR T0.X, PV.W, literal.x,
2312; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
2313; EG-NEXT:    7(9.809089e-45), 2(2.802597e-45)
; The numerator is element 0 of %in and the denominator is element 1.
2314  %den_ptr = getelementptr i25, i25 addrspace(1)* %in, i25 1
2315  %num = load i25, i25 addrspace(1) * %in
2316  %den = load i25, i25 addrspace(1) * %den_ptr
2317  %result = sdiv i25 %num, %den
2318  %result.ext = sext i25 %result to i32
2319  store i32 %result.ext, i32 addrspace(1)* %out
2320  ret void
2321}
2322
2323; Tests for 64-bit divide bypass. The three kernels below are commented out, so none of them is currently compiled or checked.
2324; define amdgpu_kernel void @test_get_quotient(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
2325;   %result = sdiv i64 %a, %b
2326;   store i64 %result, i64 addrspace(1)* %out, align 8
2327;   ret void
2328; }
2329
2330; define amdgpu_kernel void @test_get_remainder(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
2331;   %result = srem i64 %a, %b
2332;   store i64 %result, i64 addrspace(1)* %out, align 8
2333;   ret void
2334; }
2335
2336; define amdgpu_kernel void @test_get_quotient_and_remainder(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
2337;   %resultdiv = sdiv i64 %a, %b
2338;   %resultrem = srem i64 %a, %b
2339;   %result = add i64 %resultdiv, %resultrem
2340;   store i64 %result, i64 addrspace(1)* %out, align 8
2341;   ret void
2342; }
2343
; scalarize_mulhs_4xi32: signed division of a <4 x i32> loaded from %in by the
; splat constant 53668, with the result stored to %out. Note that %in is the
; first kernel argument and %out the second.
; The expected code contains no division sequence. Each of the four lanes is a
; signed multiply-high by 0x1389c755 (printed as 327796565 in the r600 output),
; followed by an arithmetic shift right by 12 and an add of the product's sign
; bit, which is obtained with a logical shift right by 31.
2344define amdgpu_kernel void @scalarize_mulhs_4xi32(<4 x i32> addrspace(1)* nocapture readonly %in, <4 x i32> addrspace(1)* nocapture %out) {
2345; GCN-LABEL: scalarize_mulhs_4xi32:
2346; GCN:       ; %bb.0:
2347; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
2348; GCN-NEXT:    s_mov_b32 s7, 0xf000
2349; GCN-NEXT:    s_mov_b32 s6, -1
2350; GCN-NEXT:    s_waitcnt lgkmcnt(0)
2351; GCN-NEXT:    s_mov_b32 s4, s0
2352; GCN-NEXT:    s_mov_b32 s5, s1
2353; GCN-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
2354; GCN-NEXT:    s_mov_b32 s0, 0x1389c755
2355; GCN-NEXT:    s_mov_b32 s4, s2
2356; GCN-NEXT:    s_mov_b32 s5, s3
2357; GCN-NEXT:    s_waitcnt vmcnt(0)
2358; GCN-NEXT:    v_mul_hi_i32 v0, v0, s0
2359; GCN-NEXT:    v_mul_hi_i32 v1, v1, s0
2360; GCN-NEXT:    v_mul_hi_i32 v2, v2, s0
2361; GCN-NEXT:    v_mul_hi_i32 v3, v3, s0
2362; GCN-NEXT:    v_lshrrev_b32_e32 v4, 31, v0
2363; GCN-NEXT:    v_ashrrev_i32_e32 v0, 12, v0
2364; GCN-NEXT:    v_lshrrev_b32_e32 v5, 31, v1
2365; GCN-NEXT:    v_ashrrev_i32_e32 v1, 12, v1
2366; GCN-NEXT:    v_lshrrev_b32_e32 v6, 31, v2
2367; GCN-NEXT:    v_ashrrev_i32_e32 v2, 12, v2
2368; GCN-NEXT:    v_lshrrev_b32_e32 v7, 31, v3
2369; GCN-NEXT:    v_ashrrev_i32_e32 v3, 12, v3
2370; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
2371; GCN-NEXT:    v_add_i32_e32 v1, vcc, v1, v5
2372; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v6
2373; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v7
2374; GCN-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
2375; GCN-NEXT:    s_endpgm
2376;
2377; TONGA-LABEL: scalarize_mulhs_4xi32:
2378; TONGA:       ; %bb.0:
2379; TONGA-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2380; TONGA-NEXT:    s_mov_b32 s3, 0xf000
2381; TONGA-NEXT:    s_mov_b32 s2, -1
2382; TONGA-NEXT:    s_waitcnt lgkmcnt(0)
2383; TONGA-NEXT:    s_mov_b32 s0, s4
2384; TONGA-NEXT:    s_mov_b32 s1, s5
2385; TONGA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0
2386; TONGA-NEXT:    s_mov_b32 s0, 0x1389c755
2387; TONGA-NEXT:    s_mov_b32 s4, s6
2388; TONGA-NEXT:    s_mov_b32 s5, s7
2389; TONGA-NEXT:    s_mov_b32 s6, s2
2390; TONGA-NEXT:    s_mov_b32 s7, s3
2391; TONGA-NEXT:    s_waitcnt vmcnt(0)
2392; TONGA-NEXT:    v_mul_hi_i32 v0, v0, s0
2393; TONGA-NEXT:    v_mul_hi_i32 v1, v1, s0
2394; TONGA-NEXT:    v_mul_hi_i32 v2, v2, s0
2395; TONGA-NEXT:    v_mul_hi_i32 v3, v3, s0
2396; TONGA-NEXT:    v_lshrrev_b32_e32 v4, 31, v0
2397; TONGA-NEXT:    v_ashrrev_i32_e32 v0, 12, v0
2398; TONGA-NEXT:    v_lshrrev_b32_e32 v5, 31, v1
2399; TONGA-NEXT:    v_ashrrev_i32_e32 v1, 12, v1
2400; TONGA-NEXT:    v_lshrrev_b32_e32 v6, 31, v2
2401; TONGA-NEXT:    v_ashrrev_i32_e32 v2, 12, v2
2402; TONGA-NEXT:    v_lshrrev_b32_e32 v7, 31, v3
2403; TONGA-NEXT:    v_ashrrev_i32_e32 v3, 12, v3
2404; TONGA-NEXT:    v_add_u32_e32 v0, vcc, v0, v4
2405; TONGA-NEXT:    v_add_u32_e32 v1, vcc, v1, v5
2406; TONGA-NEXT:    v_add_u32_e32 v2, vcc, v2, v6
2407; TONGA-NEXT:    v_add_u32_e32 v3, vcc, v3, v7
2408; TONGA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
2409; TONGA-NEXT:    s_endpgm
2410;
2411; GFX9-LABEL: scalarize_mulhs_4xi32:
2412; GFX9:       ; %bb.0:
2413; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2414; GFX9-NEXT:    s_mov_b32 s3, 0xf000
2415; GFX9-NEXT:    s_mov_b32 s2, -1
2416; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2417; GFX9-NEXT:    s_mov_b32 s0, s4
2418; GFX9-NEXT:    s_mov_b32 s1, s5
2419; GFX9-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0
2420; GFX9-NEXT:    s_mov_b32 s0, 0x1389c755
2421; GFX9-NEXT:    s_mov_b32 s4, s6
2422; GFX9-NEXT:    s_mov_b32 s5, s7
2423; GFX9-NEXT:    s_mov_b32 s6, s2
2424; GFX9-NEXT:    s_mov_b32 s7, s3
2425; GFX9-NEXT:    s_waitcnt vmcnt(0)
2426; GFX9-NEXT:    v_mul_hi_i32 v0, v0, s0
2427; GFX9-NEXT:    v_mul_hi_i32 v1, v1, s0
2428; GFX9-NEXT:    v_mul_hi_i32 v2, v2, s0
2429; GFX9-NEXT:    v_mul_hi_i32 v3, v3, s0
2430; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 31, v0
2431; GFX9-NEXT:    v_ashrrev_i32_e32 v0, 12, v0
2432; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 31, v1
2433; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 12, v1
2434; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 31, v2
2435; GFX9-NEXT:    v_ashrrev_i32_e32 v2, 12, v2
2436; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 31, v3
2437; GFX9-NEXT:    v_ashrrev_i32_e32 v3, 12, v3
2438; GFX9-NEXT:    v_add_u32_e32 v0, v0, v4
2439; GFX9-NEXT:    v_add_u32_e32 v1, v1, v5
2440; GFX9-NEXT:    v_add_u32_e32 v2, v2, v6
2441; GFX9-NEXT:    v_add_u32_e32 v3, v3, v7
2442; GFX9-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
2443; GFX9-NEXT:    s_endpgm
2444;
2445; EG-LABEL: scalarize_mulhs_4xi32:
2446; EG:       ; %bb.0:
2447; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
2448; EG-NEXT:    TEX 0 @6
2449; EG-NEXT:    ALU 25, @9, KC0[CB0:0-32], KC1[]
2450; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
2451; EG-NEXT:    CF_END
2452; EG-NEXT:    PAD
2453; EG-NEXT:    Fetch clause starting at 6:
2454; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 0, #1
2455; EG-NEXT:    ALU clause starting at 8:
2456; EG-NEXT:     MOV * T0.X, KC0[2].Y,
2457; EG-NEXT:    ALU clause starting at 9:
2458; EG-NEXT:     MULHI_INT * T0.W, T0.W, literal.x,
2459; EG-NEXT:    327796565(3.478022e-27), 0(0.000000e+00)
2460; EG-NEXT:     ASHR T1.Z, PS, literal.x,
2461; EG-NEXT:     LSHR T0.W, PS, literal.y,
2462; EG-NEXT:     MULHI_INT * T0.Z, T0.Z, literal.z,
2463; EG-NEXT:    12(1.681558e-44), 31(4.344025e-44)
2464; EG-NEXT:    327796565(3.478022e-27), 0(0.000000e+00)
2465; EG-NEXT:     ASHR T1.Y, PS, literal.x,
2466; EG-NEXT:     LSHR T0.Z, PS, literal.y,
2467; EG-NEXT:     ADD_INT T0.W, PV.Z, PV.W,
2468; EG-NEXT:     MULHI_INT * T0.Y, T0.Y, literal.z,
2469; EG-NEXT:    12(1.681558e-44), 31(4.344025e-44)
2470; EG-NEXT:    327796565(3.478022e-27), 0(0.000000e+00)
2471; EG-NEXT:     ASHR T2.Y, PS, literal.x,
2472; EG-NEXT:     ADD_INT T0.Z, PV.Y, PV.Z,
2473; EG-NEXT:     LSHR T1.W, PS, literal.y,
2474; EG-NEXT:     MULHI_INT * T0.X, T0.X, literal.z,
2475; EG-NEXT:    12(1.681558e-44), 31(4.344025e-44)
2476; EG-NEXT:    327796565(3.478022e-27), 0(0.000000e+00)
2477; EG-NEXT:     ADD_INT T0.Y, PV.Y, PV.W,
2478; EG-NEXT:     ASHR T1.W, PS, literal.x,
2479; EG-NEXT:     LSHR * T2.W, PS, literal.y,
2480; EG-NEXT:    12(1.681558e-44), 31(4.344025e-44)
2481; EG-NEXT:     ADD_INT T0.X, PV.W, PS,
2482; EG-NEXT:     LSHR * T1.X, KC0[2].Z, literal.x,
2483; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
2484  %1 = load <4 x i32>, <4 x i32> addrspace(1)* %in, align 16
2485  %2 = sdiv <4 x i32> %1, <i32 53668, i32 53668, i32 53668, i32 53668>
2486  store <4 x i32> %2, <4 x i32> addrspace(1)* %out, align 16
2487  ret void
2488}
2489