1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx600 | FileCheck %s --check-prefix=GCN
3; RUN: llc < %s -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global | FileCheck %s --check-prefix=TONGA
4; RUN: llc < %s -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global | FileCheck %s --check-prefix=GFX9
5; RUN: llc < %s -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG
6
7; The code generated by sdiv is long and complex and may frequently change.
8; The goal of this test is to make sure the ISel doesn't fail.
9;
10; This program was previously failing to compile when one of the selectcc
11; opcodes generated by the sdiv lowering was being legalized and optimized to:
12; selectcc Remainder -1, 0, -1, SETGT
13; This was fixed by adding an additional pattern in R600Instructions.td to
14; match this pattern with a CNDGE_INT.
15
; Signed 32-bit division where the divisor is only known at run time.
; The numerator is loaded from %in, the denominator from the i32 that directly
; follows it, and the quotient is stored to %out.
;
; In the expected gfx600 / tonga / gfx900 output below, both operands are first
; made non-negative (ashr 31, add, xor), a reciprocal estimate of the divisor is
; built around v_rcp_iflag_f32, the quotient estimate is corrected by two
; compare-and-select steps, and the sign is re-applied with a final xor / sub.
; The redwood (r600) output follows the same outline using RECIP_UINT.
16define amdgpu_kernel void @sdiv_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
17; GCN-LABEL: sdiv_i32:
18; GCN:       ; %bb.0:
19; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
20; GCN-NEXT:    s_mov_b32 s7, 0xf000
21; GCN-NEXT:    s_mov_b32 s6, -1
22; GCN-NEXT:    s_mov_b32 s10, s6
23; GCN-NEXT:    s_mov_b32 s11, s7
24; GCN-NEXT:    s_waitcnt lgkmcnt(0)
25; GCN-NEXT:    s_mov_b32 s8, s2
26; GCN-NEXT:    s_mov_b32 s9, s3
27; GCN-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
28; GCN-NEXT:    s_mov_b32 s4, s0
29; GCN-NEXT:    s_mov_b32 s5, s1
30; GCN-NEXT:    s_waitcnt vmcnt(0)
31; GCN-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
32; GCN-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
33; GCN-NEXT:    v_xor_b32_e32 v1, v1, v2
34; GCN-NEXT:    v_cvt_f32_u32_e32 v3, v1
35; GCN-NEXT:    v_sub_i32_e32 v4, vcc, 0, v1
36; GCN-NEXT:    v_ashrrev_i32_e32 v5, 31, v0
37; GCN-NEXT:    v_rcp_iflag_f32_e32 v3, v3
38; GCN-NEXT:    v_add_i32_e32 v0, vcc, v5, v0
39; GCN-NEXT:    v_xor_b32_e32 v0, v0, v5
40; GCN-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v3
41; GCN-NEXT:    v_cvt_u32_f32_e32 v3, v3
42; GCN-NEXT:    v_xor_b32_e32 v2, v5, v2
43; GCN-NEXT:    v_mul_lo_u32 v4, v4, v3
44; GCN-NEXT:    v_mul_hi_u32 v4, v3, v4
45; GCN-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
46; GCN-NEXT:    v_mul_hi_u32 v3, v0, v3
47; GCN-NEXT:    v_mul_lo_u32 v4, v3, v1
48; GCN-NEXT:    v_add_i32_e32 v5, vcc, 1, v3
49; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, v4, v0
50; GCN-NEXT:    v_cmp_ge_u32_e64 s[0:1], v0, v1
51; GCN-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s[0:1]
52; GCN-NEXT:    v_subrev_i32_e32 v4, vcc, v1, v0
53; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[0:1]
54; GCN-NEXT:    v_add_i32_e32 v4, vcc, 1, v3
55; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v1
56; GCN-NEXT:    v_cndmask_b32_e32 v0, v3, v4, vcc
57; GCN-NEXT:    v_xor_b32_e32 v0, v0, v2
58; GCN-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
59; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
60; GCN-NEXT:    s_endpgm
61;
62; TONGA-LABEL: sdiv_i32:
63; TONGA:       ; %bb.0:
64; TONGA-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
65; TONGA-NEXT:    s_mov_b32 s7, 0xf000
66; TONGA-NEXT:    s_mov_b32 s6, -1
67; TONGA-NEXT:    s_mov_b32 s10, s6
68; TONGA-NEXT:    s_mov_b32 s11, s7
69; TONGA-NEXT:    s_waitcnt lgkmcnt(0)
70; TONGA-NEXT:    s_mov_b32 s8, s2
71; TONGA-NEXT:    s_mov_b32 s9, s3
72; TONGA-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
73; TONGA-NEXT:    s_mov_b32 s4, s0
74; TONGA-NEXT:    s_mov_b32 s5, s1
75; TONGA-NEXT:    s_waitcnt vmcnt(0)
76; TONGA-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
77; TONGA-NEXT:    v_add_u32_e32 v1, vcc, v2, v1
78; TONGA-NEXT:    v_xor_b32_e32 v1, v1, v2
79; TONGA-NEXT:    v_cvt_f32_u32_e32 v3, v1
80; TONGA-NEXT:    v_sub_u32_e32 v4, vcc, 0, v1
81; TONGA-NEXT:    v_ashrrev_i32_e32 v5, 31, v0
82; TONGA-NEXT:    v_rcp_iflag_f32_e32 v3, v3
83; TONGA-NEXT:    v_add_u32_e32 v0, vcc, v5, v0
84; TONGA-NEXT:    v_xor_b32_e32 v0, v0, v5
85; TONGA-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v3
86; TONGA-NEXT:    v_cvt_u32_f32_e32 v3, v3
87; TONGA-NEXT:    v_xor_b32_e32 v2, v5, v2
88; TONGA-NEXT:    v_mul_lo_u32 v4, v4, v3
89; TONGA-NEXT:    v_mul_hi_u32 v4, v3, v4
90; TONGA-NEXT:    v_add_u32_e32 v3, vcc, v4, v3
91; TONGA-NEXT:    v_mul_hi_u32 v3, v0, v3
92; TONGA-NEXT:    v_mul_lo_u32 v4, v3, v1
93; TONGA-NEXT:    v_add_u32_e32 v5, vcc, 1, v3
94; TONGA-NEXT:    v_subrev_u32_e32 v0, vcc, v4, v0
95; TONGA-NEXT:    v_cmp_ge_u32_e64 s[0:1], v0, v1
96; TONGA-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s[0:1]
97; TONGA-NEXT:    v_subrev_u32_e32 v4, vcc, v1, v0
98; TONGA-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[0:1]
99; TONGA-NEXT:    v_add_u32_e32 v4, vcc, 1, v3
100; TONGA-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v1
101; TONGA-NEXT:    v_cndmask_b32_e32 v0, v3, v4, vcc
102; TONGA-NEXT:    v_xor_b32_e32 v0, v0, v2
103; TONGA-NEXT:    v_sub_u32_e32 v0, vcc, v0, v2
104; TONGA-NEXT:    buffer_store_dword v0, off, s[4:7], 0
105; TONGA-NEXT:    s_endpgm
106;
107; GFX9-LABEL: sdiv_i32:
108; GFX9:       ; %bb.0:
109; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
110; GFX9-NEXT:    s_mov_b32 s7, 0xf000
111; GFX9-NEXT:    s_mov_b32 s6, -1
112; GFX9-NEXT:    s_mov_b32 s10, s6
113; GFX9-NEXT:    s_mov_b32 s11, s7
114; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
115; GFX9-NEXT:    s_mov_b32 s8, s2
116; GFX9-NEXT:    s_mov_b32 s9, s3
117; GFX9-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
118; GFX9-NEXT:    s_mov_b32 s4, s0
119; GFX9-NEXT:    s_mov_b32 s5, s1
120; GFX9-NEXT:    s_waitcnt vmcnt(0)
121; GFX9-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
122; GFX9-NEXT:    v_add_u32_e32 v1, v1, v2
123; GFX9-NEXT:    v_xor_b32_e32 v1, v1, v2
124; GFX9-NEXT:    v_cvt_f32_u32_e32 v3, v1
125; GFX9-NEXT:    v_sub_u32_e32 v4, 0, v1
126; GFX9-NEXT:    v_ashrrev_i32_e32 v5, 31, v0
127; GFX9-NEXT:    v_add_u32_e32 v0, v0, v5
128; GFX9-NEXT:    v_rcp_iflag_f32_e32 v3, v3
129; GFX9-NEXT:    v_xor_b32_e32 v0, v0, v5
130; GFX9-NEXT:    v_xor_b32_e32 v2, v5, v2
131; GFX9-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v3
132; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v3
133; GFX9-NEXT:    v_mul_lo_u32 v4, v4, v3
134; GFX9-NEXT:    v_mul_hi_u32 v4, v3, v4
135; GFX9-NEXT:    v_add_u32_e32 v3, v3, v4
136; GFX9-NEXT:    v_mul_hi_u32 v3, v0, v3
137; GFX9-NEXT:    v_mul_lo_u32 v4, v3, v1
138; GFX9-NEXT:    v_add_u32_e32 v5, 1, v3
139; GFX9-NEXT:    v_sub_u32_e32 v0, v0, v4
140; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v1
141; GFX9-NEXT:    v_sub_u32_e32 v4, v0, v1
142; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
143; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
144; GFX9-NEXT:    v_add_u32_e32 v4, 1, v3
145; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v1
146; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v4, vcc
147; GFX9-NEXT:    v_xor_b32_e32 v0, v0, v2
148; GFX9-NEXT:    v_sub_u32_e32 v0, v0, v2
149; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
150; GFX9-NEXT:    s_endpgm
151;
152; EG-LABEL: sdiv_i32:
153; EG:       ; %bb.0:
154; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
155; EG-NEXT:    TEX 0 @6
156; EG-NEXT:    ALU 26, @9, KC0[CB0:0-32], KC1[]
157; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
158; EG-NEXT:    CF_END
159; EG-NEXT:    PAD
160; EG-NEXT:    Fetch clause starting at 6:
161; EG-NEXT:     VTX_READ_64 T0.XY, T0.X, 0, #1
162; EG-NEXT:    ALU clause starting at 8:
163; EG-NEXT:     MOV * T0.X, KC0[2].Z,
164; EG-NEXT:    ALU clause starting at 9:
165; EG-NEXT:     SETGT_INT * T0.W, 0.0, T0.Y,
166; EG-NEXT:     ADD_INT * T1.W, T0.Y, PV.W,
167; EG-NEXT:     XOR_INT * T1.W, PV.W, T0.W,
168; EG-NEXT:     SUB_INT T2.W, 0.0, PV.W,
169; EG-NEXT:     RECIP_UINT * T0.Y, PV.W,
170; EG-NEXT:     SETGT_INT T3.W, 0.0, T0.X,
171; EG-NEXT:     MULLO_INT * T0.Z, PV.W, PS,
172; EG-NEXT:     ADD_INT T2.W, T0.X, PV.W,
173; EG-NEXT:     MULHI * T0.X, T0.Y, PS,
174; EG-NEXT:     ADD_INT T4.W, T0.Y, PS,
175; EG-NEXT:     XOR_INT * T2.W, PV.W, T3.W,
176; EG-NEXT:     MULHI * T0.X, PS, PV.W,
177; EG-NEXT:     MULLO_INT * T0.Y, PS, T1.W,
178; EG-NEXT:     SUB_INT * T2.W, T2.W, PS,
179; EG-NEXT:     ADD_INT T0.Z, T0.X, 1,
180; EG-NEXT:     SETGE_UINT T4.W, PV.W, T1.W,
181; EG-NEXT:     SUB_INT * T5.W, PV.W, T1.W,
182; EG-NEXT:     CNDE_INT T2.W, PV.W, T2.W, PS,
183; EG-NEXT:     CNDE_INT * T4.W, PV.W, T0.X, PV.Z,
184; EG-NEXT:     ADD_INT T5.W, PS, 1,
185; EG-NEXT:     SETGE_UINT * T1.W, PV.W, T1.W,
186; EG-NEXT:     CNDE_INT T1.W, PS, T4.W, PV.W, BS:VEC_102/SCL_221
187; EG-NEXT:     XOR_INT * T0.W, T3.W, T0.W,
188; EG-NEXT:     XOR_INT * T1.W, PV.W, PS,
189; EG-NEXT:     SUB_INT T0.X, PV.W, T0.W,
190; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
191; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
; %den_ptr addresses the i32 right after the numerator; both values come from
; the same input buffer, so the targets above fetch them with one 64-bit load.
192  %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
193  %num = load i32, i32 addrspace(1) * %in
194  %den = load i32, i32 addrspace(1) * %den_ptr
195  %result = sdiv i32 %num, %den
196  store i32 %result, i32 addrspace(1)* %out
197  ret void
198}
199
; Signed 32-bit division by the constant 4 (a power of two).
; Loads one i32 from %in, divides it by 4 and stores the quotient to %out.
;
; The expected output on every target contains no multiply or reciprocal.
; The sign is spread across the register (ashr 31) and shifted down to a 0-or-3
; bias (lshr 30). The bias is added to the input, and the sum is arithmetically
; shifted right by 2.
200define amdgpu_kernel void @sdiv_i32_4(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
201; GCN-LABEL: sdiv_i32_4:
202; GCN:       ; %bb.0:
203; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
204; GCN-NEXT:    s_mov_b32 s7, 0xf000
205; GCN-NEXT:    s_mov_b32 s6, -1
206; GCN-NEXT:    s_mov_b32 s10, s6
207; GCN-NEXT:    s_mov_b32 s11, s7
208; GCN-NEXT:    s_waitcnt lgkmcnt(0)
209; GCN-NEXT:    s_mov_b32 s8, s2
210; GCN-NEXT:    s_mov_b32 s9, s3
211; GCN-NEXT:    buffer_load_dword v0, off, s[8:11], 0
212; GCN-NEXT:    s_mov_b32 s4, s0
213; GCN-NEXT:    s_mov_b32 s5, s1
214; GCN-NEXT:    s_waitcnt vmcnt(0)
215; GCN-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
216; GCN-NEXT:    v_lshrrev_b32_e32 v1, 30, v1
217; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
218; GCN-NEXT:    v_ashrrev_i32_e32 v0, 2, v0
219; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
220; GCN-NEXT:    s_endpgm
221;
222; TONGA-LABEL: sdiv_i32_4:
223; TONGA:       ; %bb.0:
224; TONGA-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
225; TONGA-NEXT:    s_mov_b32 s7, 0xf000
226; TONGA-NEXT:    s_mov_b32 s6, -1
227; TONGA-NEXT:    s_mov_b32 s10, s6
228; TONGA-NEXT:    s_mov_b32 s11, s7
229; TONGA-NEXT:    s_waitcnt lgkmcnt(0)
230; TONGA-NEXT:    s_mov_b32 s8, s2
231; TONGA-NEXT:    s_mov_b32 s9, s3
232; TONGA-NEXT:    buffer_load_dword v0, off, s[8:11], 0
233; TONGA-NEXT:    s_mov_b32 s4, s0
234; TONGA-NEXT:    s_mov_b32 s5, s1
235; TONGA-NEXT:    s_waitcnt vmcnt(0)
236; TONGA-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
237; TONGA-NEXT:    v_lshrrev_b32_e32 v1, 30, v1
238; TONGA-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
239; TONGA-NEXT:    v_ashrrev_i32_e32 v0, 2, v0
240; TONGA-NEXT:    buffer_store_dword v0, off, s[4:7], 0
241; TONGA-NEXT:    s_endpgm
242;
243; GFX9-LABEL: sdiv_i32_4:
244; GFX9:       ; %bb.0:
245; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
246; GFX9-NEXT:    s_mov_b32 s7, 0xf000
247; GFX9-NEXT:    s_mov_b32 s6, -1
248; GFX9-NEXT:    s_mov_b32 s10, s6
249; GFX9-NEXT:    s_mov_b32 s11, s7
250; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
251; GFX9-NEXT:    s_mov_b32 s8, s2
252; GFX9-NEXT:    s_mov_b32 s9, s3
253; GFX9-NEXT:    buffer_load_dword v0, off, s[8:11], 0
254; GFX9-NEXT:    s_mov_b32 s4, s0
255; GFX9-NEXT:    s_mov_b32 s5, s1
256; GFX9-NEXT:    s_waitcnt vmcnt(0)
257; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
258; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 30, v1
259; GFX9-NEXT:    v_add_u32_e32 v0, v0, v1
260; GFX9-NEXT:    v_ashrrev_i32_e32 v0, 2, v0
261; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
262; GFX9-NEXT:    s_endpgm
263;
264; EG-LABEL: sdiv_i32_4:
265; EG:       ; %bb.0:
266; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
267; EG-NEXT:    TEX 0 @6
268; EG-NEXT:    ALU 7, @9, KC0[CB0:0-32], KC1[]
269; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
270; EG-NEXT:    CF_END
271; EG-NEXT:    PAD
272; EG-NEXT:    Fetch clause starting at 6:
273; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
274; EG-NEXT:    ALU clause starting at 8:
275; EG-NEXT:     MOV * T0.X, KC0[2].Z,
276; EG-NEXT:    ALU clause starting at 9:
277; EG-NEXT:     ASHR * T0.W, T0.X, literal.x,
278; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
279; EG-NEXT:     LSHR * T0.W, PV.W, literal.x,
280; EG-NEXT:    30(4.203895e-44), 0(0.000000e+00)
281; EG-NEXT:     ADD_INT * T0.W, T0.X, PV.W,
282; EG-NEXT:     ASHR T0.X, PV.W, literal.x,
283; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
284; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
; Only the numerator is loaded; the divisor is the immediate 4.
285  %num = load i32, i32 addrspace(1) * %in
286  %result = sdiv i32 %num, 4
287  store i32 %result, i32 addrspace(1)* %out
288  ret void
289}
290
291; Divide by a weird (non-power-of-two) constant to make sure setIntDivIsCheap
292; is working: the division should be lowered to a multiply by a magic constant.
293
; Signed 32-bit division by the constant 3435, which is not a power of two.
; Loads one i32 from %in, divides it by 3435 and stores the quotient to %out.
;
; The expected output on every target replaces the division with a signed
; multiply-high by 0x98a1930b (printed as -1734241525 for redwood). The
; numerator is added back, and the result's sign bit (lshr 31) is added to its
; arithmetic shift right by 11.
294define amdgpu_kernel void @slow_sdiv_i32_3435(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
295; GCN-LABEL: slow_sdiv_i32_3435:
296; GCN:       ; %bb.0:
297; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
298; GCN-NEXT:    s_mov_b32 s7, 0xf000
299; GCN-NEXT:    s_mov_b32 s6, -1
300; GCN-NEXT:    s_mov_b32 s10, s6
301; GCN-NEXT:    s_mov_b32 s11, s7
302; GCN-NEXT:    s_waitcnt lgkmcnt(0)
303; GCN-NEXT:    s_mov_b32 s8, s2
304; GCN-NEXT:    s_mov_b32 s9, s3
305; GCN-NEXT:    buffer_load_dword v0, off, s[8:11], 0
306; GCN-NEXT:    s_mov_b32 s2, 0x98a1930b
307; GCN-NEXT:    s_mov_b32 s4, s0
308; GCN-NEXT:    s_mov_b32 s5, s1
309; GCN-NEXT:    s_waitcnt vmcnt(0)
310; GCN-NEXT:    v_mul_hi_i32 v1, v0, s2
311; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
312; GCN-NEXT:    v_lshrrev_b32_e32 v1, 31, v0
313; GCN-NEXT:    v_ashrrev_i32_e32 v0, 11, v0
314; GCN-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
315; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
316; GCN-NEXT:    s_endpgm
317;
318; TONGA-LABEL: slow_sdiv_i32_3435:
319; TONGA:       ; %bb.0:
320; TONGA-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
321; TONGA-NEXT:    s_mov_b32 s7, 0xf000
322; TONGA-NEXT:    s_mov_b32 s6, -1
323; TONGA-NEXT:    s_mov_b32 s10, s6
324; TONGA-NEXT:    s_mov_b32 s11, s7
325; TONGA-NEXT:    s_waitcnt lgkmcnt(0)
326; TONGA-NEXT:    s_mov_b32 s8, s2
327; TONGA-NEXT:    s_mov_b32 s9, s3
328; TONGA-NEXT:    buffer_load_dword v0, off, s[8:11], 0
329; TONGA-NEXT:    s_mov_b32 s2, 0x98a1930b
330; TONGA-NEXT:    s_mov_b32 s4, s0
331; TONGA-NEXT:    s_mov_b32 s5, s1
332; TONGA-NEXT:    s_waitcnt vmcnt(0)
333; TONGA-NEXT:    v_mul_hi_i32 v1, v0, s2
334; TONGA-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
335; TONGA-NEXT:    v_lshrrev_b32_e32 v1, 31, v0
336; TONGA-NEXT:    v_ashrrev_i32_e32 v0, 11, v0
337; TONGA-NEXT:    v_add_u32_e32 v0, vcc, v1, v0
338; TONGA-NEXT:    buffer_store_dword v0, off, s[4:7], 0
339; TONGA-NEXT:    s_endpgm
340;
341; GFX9-LABEL: slow_sdiv_i32_3435:
342; GFX9:       ; %bb.0:
343; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
344; GFX9-NEXT:    s_mov_b32 s7, 0xf000
345; GFX9-NEXT:    s_mov_b32 s6, -1
346; GFX9-NEXT:    s_mov_b32 s10, s6
347; GFX9-NEXT:    s_mov_b32 s11, s7
348; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
349; GFX9-NEXT:    s_mov_b32 s8, s2
350; GFX9-NEXT:    s_mov_b32 s9, s3
351; GFX9-NEXT:    buffer_load_dword v0, off, s[8:11], 0
352; GFX9-NEXT:    s_mov_b32 s2, 0x98a1930b
353; GFX9-NEXT:    s_mov_b32 s4, s0
354; GFX9-NEXT:    s_mov_b32 s5, s1
355; GFX9-NEXT:    s_waitcnt vmcnt(0)
356; GFX9-NEXT:    v_mul_hi_i32 v1, v0, s2
357; GFX9-NEXT:    v_add_u32_e32 v0, v1, v0
358; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 31, v0
359; GFX9-NEXT:    v_ashrrev_i32_e32 v0, 11, v0
360; GFX9-NEXT:    v_add_u32_e32 v0, v0, v1
361; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
362; GFX9-NEXT:    s_endpgm
363;
364; EG-LABEL: slow_sdiv_i32_3435:
365; EG:       ; %bb.0:
366; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
367; EG-NEXT:    TEX 0 @6
368; EG-NEXT:    ALU 8, @9, KC0[CB0:0-32], KC1[]
369; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
370; EG-NEXT:    CF_END
371; EG-NEXT:    PAD
372; EG-NEXT:    Fetch clause starting at 6:
373; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
374; EG-NEXT:    ALU clause starting at 8:
375; EG-NEXT:     MOV * T0.X, KC0[2].Z,
376; EG-NEXT:    ALU clause starting at 9:
377; EG-NEXT:     MULHI_INT * T0.Y, T0.X, literal.x,
378; EG-NEXT:    -1734241525(-4.176600e-24), 0(0.000000e+00)
379; EG-NEXT:     ADD_INT * T0.W, PS, T0.X,
380; EG-NEXT:     ASHR T1.W, PV.W, literal.x,
381; EG-NEXT:     LSHR * T0.W, PV.W, literal.y,
382; EG-NEXT:    11(1.541428e-44), 31(4.344025e-44)
383; EG-NEXT:     ADD_INT T0.X, PV.W, PS,
384; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
385; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
; Only the numerator is loaded; the divisor is the immediate 3435.
386  %num = load i32, i32 addrspace(1) * %in
387  %result = sdiv i32 %num, 3435
388  store i32 %result, i32 addrspace(1)* %out
389  ret void
390}
391
; Two-element vector version of the run-time-divisor case.
; The <2 x i32> numerator is loaded from %in, the <2 x i32> denominator from
; the vector that directly follows it, and the element-wise signed quotient is
; stored to %out.
;
; The expected output contains the scalar sequence (absolute values,
; reciprocal estimate, two compare-and-select corrections, sign fix-up) once per
; element, with the instructions of the two elements interleaved.
392define amdgpu_kernel void @sdiv_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
393; GCN-LABEL: sdiv_v2i32:
394; GCN:       ; %bb.0:
395; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
396; GCN-NEXT:    s_mov_b32 s7, 0xf000
397; GCN-NEXT:    s_mov_b32 s6, -1
398; GCN-NEXT:    s_mov_b32 s10, s6
399; GCN-NEXT:    s_mov_b32 s11, s7
400; GCN-NEXT:    s_waitcnt lgkmcnt(0)
401; GCN-NEXT:    s_mov_b32 s8, s2
402; GCN-NEXT:    s_mov_b32 s9, s3
403; GCN-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
404; GCN-NEXT:    s_mov_b32 s2, 0x4f7ffffe
405; GCN-NEXT:    s_mov_b32 s4, s0
406; GCN-NEXT:    s_mov_b32 s5, s1
407; GCN-NEXT:    s_waitcnt vmcnt(0)
408; GCN-NEXT:    v_ashrrev_i32_e32 v5, 31, v2
409; GCN-NEXT:    v_ashrrev_i32_e32 v7, 31, v3
410; GCN-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
411; GCN-NEXT:    v_add_i32_e32 v3, vcc, v7, v3
412; GCN-NEXT:    v_ashrrev_i32_e32 v4, 31, v0
413; GCN-NEXT:    v_ashrrev_i32_e32 v6, 31, v1
414; GCN-NEXT:    v_xor_b32_e32 v2, v2, v5
415; GCN-NEXT:    v_xor_b32_e32 v3, v3, v7
416; GCN-NEXT:    v_xor_b32_e32 v8, v4, v5
417; GCN-NEXT:    v_xor_b32_e32 v9, v6, v7
418; GCN-NEXT:    v_cvt_f32_u32_e32 v5, v2
419; GCN-NEXT:    v_cvt_f32_u32_e32 v7, v3
420; GCN-NEXT:    v_sub_i32_e32 v10, vcc, 0, v2
421; GCN-NEXT:    v_rcp_iflag_f32_e32 v5, v5
422; GCN-NEXT:    v_rcp_iflag_f32_e32 v7, v7
423; GCN-NEXT:    v_sub_i32_e32 v11, vcc, 0, v3
424; GCN-NEXT:    v_mul_f32_e32 v5, s2, v5
425; GCN-NEXT:    v_mul_f32_e32 v7, s2, v7
426; GCN-NEXT:    v_cvt_u32_f32_e32 v5, v5
427; GCN-NEXT:    v_cvt_u32_f32_e32 v7, v7
428; GCN-NEXT:    v_add_i32_e32 v0, vcc, v4, v0
429; GCN-NEXT:    v_mul_lo_u32 v10, v10, v5
430; GCN-NEXT:    v_mul_lo_u32 v11, v11, v7
431; GCN-NEXT:    v_add_i32_e32 v1, vcc, v6, v1
432; GCN-NEXT:    v_xor_b32_e32 v0, v0, v4
433; GCN-NEXT:    v_mul_hi_u32 v4, v5, v10
434; GCN-NEXT:    v_xor_b32_e32 v1, v1, v6
435; GCN-NEXT:    v_mul_hi_u32 v6, v7, v11
436; GCN-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
437; GCN-NEXT:    v_add_i32_e32 v5, vcc, v6, v7
438; GCN-NEXT:    v_mul_hi_u32 v4, v0, v4
439; GCN-NEXT:    v_mul_hi_u32 v5, v1, v5
440; GCN-NEXT:    v_mul_lo_u32 v6, v4, v2
441; GCN-NEXT:    v_mul_lo_u32 v10, v5, v3
442; GCN-NEXT:    v_add_i32_e32 v7, vcc, 1, v4
443; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, v6, v0
444; GCN-NEXT:    v_subrev_i32_e32 v1, vcc, v10, v1
445; GCN-NEXT:    v_add_i32_e32 v11, vcc, 1, v5
446; GCN-NEXT:    v_cmp_ge_u32_e64 s[0:1], v0, v2
447; GCN-NEXT:    v_cmp_ge_u32_e64 s[2:3], v1, v3
448; GCN-NEXT:    v_cndmask_b32_e64 v4, v4, v7, s[0:1]
449; GCN-NEXT:    v_subrev_i32_e32 v6, vcc, v2, v0
450; GCN-NEXT:    v_cndmask_b32_e64 v5, v5, v11, s[2:3]
451; GCN-NEXT:    v_subrev_i32_e32 v7, vcc, v3, v1
452; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s[0:1]
453; GCN-NEXT:    v_add_i32_e32 v6, vcc, 1, v4
454; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, v7, s[2:3]
455; GCN-NEXT:    v_add_i32_e32 v7, vcc, 1, v5
456; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
457; GCN-NEXT:    v_cndmask_b32_e32 v0, v4, v6, vcc
458; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v3
459; GCN-NEXT:    v_cndmask_b32_e32 v1, v5, v7, vcc
460; GCN-NEXT:    v_xor_b32_e32 v0, v0, v8
461; GCN-NEXT:    v_xor_b32_e32 v1, v1, v9
462; GCN-NEXT:    v_sub_i32_e32 v0, vcc, v0, v8
463; GCN-NEXT:    v_sub_i32_e32 v1, vcc, v1, v9
464; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
465; GCN-NEXT:    s_endpgm
466;
467; TONGA-LABEL: sdiv_v2i32:
468; TONGA:       ; %bb.0:
469; TONGA-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
470; TONGA-NEXT:    s_mov_b32 s7, 0xf000
471; TONGA-NEXT:    s_mov_b32 s6, -1
472; TONGA-NEXT:    s_mov_b32 s10, s6
473; TONGA-NEXT:    s_mov_b32 s11, s7
474; TONGA-NEXT:    s_waitcnt lgkmcnt(0)
475; TONGA-NEXT:    s_mov_b32 s8, s2
476; TONGA-NEXT:    s_mov_b32 s9, s3
477; TONGA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
478; TONGA-NEXT:    s_mov_b32 s2, 0x4f7ffffe
479; TONGA-NEXT:    s_mov_b32 s4, s0
480; TONGA-NEXT:    s_mov_b32 s5, s1
481; TONGA-NEXT:    s_waitcnt vmcnt(0)
482; TONGA-NEXT:    v_ashrrev_i32_e32 v5, 31, v2
483; TONGA-NEXT:    v_ashrrev_i32_e32 v7, 31, v3
484; TONGA-NEXT:    v_add_u32_e32 v2, vcc, v5, v2
485; TONGA-NEXT:    v_add_u32_e32 v3, vcc, v7, v3
486; TONGA-NEXT:    v_ashrrev_i32_e32 v4, 31, v0
487; TONGA-NEXT:    v_ashrrev_i32_e32 v6, 31, v1
488; TONGA-NEXT:    v_xor_b32_e32 v2, v2, v5
489; TONGA-NEXT:    v_xor_b32_e32 v3, v3, v7
490; TONGA-NEXT:    v_xor_b32_e32 v8, v4, v5
491; TONGA-NEXT:    v_xor_b32_e32 v9, v6, v7
492; TONGA-NEXT:    v_cvt_f32_u32_e32 v5, v2
493; TONGA-NEXT:    v_cvt_f32_u32_e32 v7, v3
494; TONGA-NEXT:    v_sub_u32_e32 v10, vcc, 0, v2
495; TONGA-NEXT:    v_rcp_iflag_f32_e32 v5, v5
496; TONGA-NEXT:    v_rcp_iflag_f32_e32 v7, v7
497; TONGA-NEXT:    v_sub_u32_e32 v11, vcc, 0, v3
498; TONGA-NEXT:    v_mul_f32_e32 v5, s2, v5
499; TONGA-NEXT:    v_mul_f32_e32 v7, s2, v7
500; TONGA-NEXT:    v_cvt_u32_f32_e32 v5, v5
501; TONGA-NEXT:    v_cvt_u32_f32_e32 v7, v7
502; TONGA-NEXT:    v_add_u32_e32 v0, vcc, v4, v0
503; TONGA-NEXT:    v_mul_lo_u32 v10, v10, v5
504; TONGA-NEXT:    v_mul_lo_u32 v11, v11, v7
505; TONGA-NEXT:    v_add_u32_e32 v1, vcc, v6, v1
506; TONGA-NEXT:    v_xor_b32_e32 v0, v0, v4
507; TONGA-NEXT:    v_mul_hi_u32 v4, v5, v10
508; TONGA-NEXT:    v_xor_b32_e32 v1, v1, v6
509; TONGA-NEXT:    v_mul_hi_u32 v6, v7, v11
510; TONGA-NEXT:    v_add_u32_e32 v4, vcc, v4, v5
511; TONGA-NEXT:    v_add_u32_e32 v5, vcc, v6, v7
512; TONGA-NEXT:    v_mul_hi_u32 v4, v0, v4
513; TONGA-NEXT:    v_mul_hi_u32 v5, v1, v5
514; TONGA-NEXT:    v_mul_lo_u32 v6, v4, v2
515; TONGA-NEXT:    v_mul_lo_u32 v10, v5, v3
516; TONGA-NEXT:    v_add_u32_e32 v7, vcc, 1, v4
517; TONGA-NEXT:    v_subrev_u32_e32 v0, vcc, v6, v0
518; TONGA-NEXT:    v_subrev_u32_e32 v1, vcc, v10, v1
519; TONGA-NEXT:    v_add_u32_e32 v11, vcc, 1, v5
520; TONGA-NEXT:    v_cmp_ge_u32_e64 s[0:1], v0, v2
521; TONGA-NEXT:    v_cmp_ge_u32_e64 s[2:3], v1, v3
522; TONGA-NEXT:    v_cndmask_b32_e64 v4, v4, v7, s[0:1]
523; TONGA-NEXT:    v_subrev_u32_e32 v6, vcc, v2, v0
524; TONGA-NEXT:    v_cndmask_b32_e64 v5, v5, v11, s[2:3]
525; TONGA-NEXT:    v_subrev_u32_e32 v7, vcc, v3, v1
526; TONGA-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s[0:1]
527; TONGA-NEXT:    v_add_u32_e32 v6, vcc, 1, v4
528; TONGA-NEXT:    v_cndmask_b32_e64 v1, v1, v7, s[2:3]
529; TONGA-NEXT:    v_add_u32_e32 v7, vcc, 1, v5
530; TONGA-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
531; TONGA-NEXT:    v_cndmask_b32_e32 v0, v4, v6, vcc
532; TONGA-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v3
533; TONGA-NEXT:    v_cndmask_b32_e32 v1, v5, v7, vcc
534; TONGA-NEXT:    v_xor_b32_e32 v0, v0, v8
535; TONGA-NEXT:    v_xor_b32_e32 v1, v1, v9
536; TONGA-NEXT:    v_sub_u32_e32 v0, vcc, v0, v8
537; TONGA-NEXT:    v_sub_u32_e32 v1, vcc, v1, v9
538; TONGA-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
539; TONGA-NEXT:    s_endpgm
540;
541; GFX9-LABEL: sdiv_v2i32:
542; GFX9:       ; %bb.0:
543; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
544; GFX9-NEXT:    s_mov_b32 s7, 0xf000
545; GFX9-NEXT:    s_mov_b32 s6, -1
546; GFX9-NEXT:    s_mov_b32 s10, s6
547; GFX9-NEXT:    s_mov_b32 s11, s7
548; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
549; GFX9-NEXT:    s_mov_b32 s8, s2
550; GFX9-NEXT:    s_mov_b32 s9, s3
551; GFX9-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
552; GFX9-NEXT:    s_mov_b32 s2, 0x4f7ffffe
553; GFX9-NEXT:    s_mov_b32 s4, s0
554; GFX9-NEXT:    s_mov_b32 s5, s1
555; GFX9-NEXT:    s_waitcnt vmcnt(0)
556; GFX9-NEXT:    v_ashrrev_i32_e32 v4, 31, v2
557; GFX9-NEXT:    v_ashrrev_i32_e32 v5, 31, v3
558; GFX9-NEXT:    v_add_u32_e32 v2, v2, v4
559; GFX9-NEXT:    v_add_u32_e32 v3, v3, v5
560; GFX9-NEXT:    v_xor_b32_e32 v2, v2, v4
561; GFX9-NEXT:    v_xor_b32_e32 v3, v3, v5
562; GFX9-NEXT:    v_cvt_f32_u32_e32 v6, v2
563; GFX9-NEXT:    v_cvt_f32_u32_e32 v7, v3
564; GFX9-NEXT:    v_sub_u32_e32 v10, 0, v2
565; GFX9-NEXT:    v_sub_u32_e32 v11, 0, v3
566; GFX9-NEXT:    v_rcp_iflag_f32_e32 v6, v6
567; GFX9-NEXT:    v_rcp_iflag_f32_e32 v7, v7
568; GFX9-NEXT:    v_ashrrev_i32_e32 v8, 31, v0
569; GFX9-NEXT:    v_ashrrev_i32_e32 v9, 31, v1
570; GFX9-NEXT:    v_mul_f32_e32 v6, s2, v6
571; GFX9-NEXT:    v_mul_f32_e32 v7, s2, v7
572; GFX9-NEXT:    v_cvt_u32_f32_e32 v6, v6
573; GFX9-NEXT:    v_cvt_u32_f32_e32 v7, v7
574; GFX9-NEXT:    v_add_u32_e32 v0, v0, v8
575; GFX9-NEXT:    v_add_u32_e32 v1, v1, v9
576; GFX9-NEXT:    v_mul_lo_u32 v10, v10, v6
577; GFX9-NEXT:    v_mul_lo_u32 v11, v11, v7
578; GFX9-NEXT:    v_xor_b32_e32 v0, v0, v8
579; GFX9-NEXT:    v_xor_b32_e32 v1, v1, v9
580; GFX9-NEXT:    v_mul_hi_u32 v10, v6, v10
581; GFX9-NEXT:    v_mul_hi_u32 v11, v7, v11
582; GFX9-NEXT:    v_xor_b32_e32 v4, v8, v4
583; GFX9-NEXT:    v_xor_b32_e32 v5, v9, v5
584; GFX9-NEXT:    v_add_u32_e32 v6, v6, v10
585; GFX9-NEXT:    v_add_u32_e32 v7, v7, v11
586; GFX9-NEXT:    v_mul_hi_u32 v6, v0, v6
587; GFX9-NEXT:    v_mul_hi_u32 v7, v1, v7
588; GFX9-NEXT:    v_mul_lo_u32 v8, v6, v2
589; GFX9-NEXT:    v_mul_lo_u32 v9, v7, v3
590; GFX9-NEXT:    v_add_u32_e32 v10, 1, v6
591; GFX9-NEXT:    v_add_u32_e32 v11, 1, v7
592; GFX9-NEXT:    v_sub_u32_e32 v0, v0, v8
593; GFX9-NEXT:    v_sub_u32_e32 v1, v1, v9
594; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
595; GFX9-NEXT:    v_sub_u32_e32 v8, v0, v2
596; GFX9-NEXT:    v_cndmask_b32_e32 v6, v6, v10, vcc
597; GFX9-NEXT:    v_cmp_ge_u32_e64 s[0:1], v1, v3
598; GFX9-NEXT:    v_sub_u32_e32 v9, v1, v3
599; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc
600; GFX9-NEXT:    v_cndmask_b32_e64 v7, v7, v11, s[0:1]
601; GFX9-NEXT:    v_add_u32_e32 v8, 1, v6
602; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v9, s[0:1]
603; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
604; GFX9-NEXT:    v_add_u32_e32 v9, 1, v7
605; GFX9-NEXT:    v_cndmask_b32_e32 v0, v6, v8, vcc
606; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v3
607; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v9, vcc
608; GFX9-NEXT:    v_xor_b32_e32 v0, v0, v4
609; GFX9-NEXT:    v_xor_b32_e32 v1, v1, v5
610; GFX9-NEXT:    v_sub_u32_e32 v0, v0, v4
611; GFX9-NEXT:    v_sub_u32_e32 v1, v1, v5
612; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
613; GFX9-NEXT:    s_endpgm
614;
615; EG-LABEL: sdiv_v2i32:
616; EG:       ; %bb.0:
617; EG-NEXT:    ALU 0, @10, KC0[CB0:0-32], KC1[]
618; EG-NEXT:    TEX 1 @6
619; EG-NEXT:    ALU 51, @11, KC0[CB0:0-32], KC1[]
620; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
621; EG-NEXT:    CF_END
622; EG-NEXT:    PAD
623; EG-NEXT:    Fetch clause starting at 6:
624; EG-NEXT:     VTX_READ_64 T1.XY, T0.X, 8, #1
625; EG-NEXT:     VTX_READ_64 T0.XY, T0.X, 0, #1
626; EG-NEXT:    ALU clause starting at 10:
627; EG-NEXT:     MOV * T0.X, KC0[2].Z,
628; EG-NEXT:    ALU clause starting at 11:
629; EG-NEXT:     SETGT_INT * T0.W, 0.0, T1.Y,
630; EG-NEXT:     ADD_INT T1.W, T1.Y, PV.W,
631; EG-NEXT:     SETGT_INT * T2.W, 0.0, T1.X,
632; EG-NEXT:     XOR_INT * T1.W, PV.W, T0.W,
633; EG-NEXT:     SUB_INT T0.Z, 0.0, PV.W,
634; EG-NEXT:     ADD_INT T3.W, T1.X, T2.W,
635; EG-NEXT:     RECIP_UINT * T1.X, PV.W,
636; EG-NEXT:     XOR_INT T3.W, PV.W, T2.W,
637; EG-NEXT:     MULLO_INT * T0.Z, PV.Z, PS,
638; EG-NEXT:     SUB_INT T4.W, 0.0, PV.W,
639; EG-NEXT:     RECIP_UINT * T1.Y, PV.W,
640; EG-NEXT:     SETGT_INT T5.W, 0.0, T0.X,
641; EG-NEXT:     MULLO_INT * T1.Z, PV.W, PS,
642; EG-NEXT:     SETGT_INT T2.Z, 0.0, T0.Y,
643; EG-NEXT:     ADD_INT T4.W, T0.X, PV.W,
644; EG-NEXT:     MULHI * T0.X, T1.Y, PS,
645; EG-NEXT:     ADD_INT T1.Y, T1.Y, PS,
646; EG-NEXT:     XOR_INT T1.Z, PV.W, T5.W,
647; EG-NEXT:     ADD_INT T4.W, T0.Y, PV.Z, BS:VEC_120/SCL_212
648; EG-NEXT:     MULHI * T0.X, T1.X, T0.Z,
649; EG-NEXT:     ADD_INT T0.Z, T1.X, PS,
650; EG-NEXT:     XOR_INT T4.W, PV.W, T2.Z,
651; EG-NEXT:     MULHI * T0.X, PV.Z, PV.Y,
652; EG-NEXT:     MULHI * T0.Y, PV.W, PV.Z,
653; EG-NEXT:     MULLO_INT * T0.Z, PS, T1.W,
654; EG-NEXT:     SUB_INT T4.W, T4.W, PS,
655; EG-NEXT:     MULLO_INT * T0.Z, T0.X, T3.W,
656; EG-NEXT:     SUB_INT T1.Y, T1.Z, PS,
657; EG-NEXT:     ADD_INT T0.Z, T0.Y, 1,
658; EG-NEXT:     SETGE_UINT T6.W, PV.W, T1.W,
659; EG-NEXT:     SUB_INT * T7.W, PV.W, T1.W,
660; EG-NEXT:     CNDE_INT T1.X, PV.W, T4.W, PS, BS:VEC_021/SCL_122
661; EG-NEXT:     CNDE_INT T0.Y, PV.W, T0.Y, PV.Z,
662; EG-NEXT:     ADD_INT T0.Z, T0.X, 1,
663; EG-NEXT:     SETGE_UINT T4.W, PV.Y, T3.W,
664; EG-NEXT:     SUB_INT * T6.W, PV.Y, T3.W,
665; EG-NEXT:     CNDE_INT T1.Y, PV.W, T1.Y, PS,
666; EG-NEXT:     CNDE_INT T0.Z, PV.W, T0.X, PV.Z,
667; EG-NEXT:     ADD_INT T4.W, PV.Y, 1,
668; EG-NEXT:     SETGE_UINT * T1.W, PV.X, T1.W,
669; EG-NEXT:     CNDE_INT T0.Y, PS, T0.Y, PV.W,
670; EG-NEXT:     XOR_INT T1.Z, T2.Z, T0.W, BS:VEC_021/SCL_122
671; EG-NEXT:     ADD_INT T0.W, PV.Z, 1,
672; EG-NEXT:     SETGE_UINT * T1.W, PV.Y, T3.W,
673; EG-NEXT:     CNDE_INT T0.Z, PS, T0.Z, PV.W,
674; EG-NEXT:     XOR_INT T0.W, T5.W, T2.W,
675; EG-NEXT:     XOR_INT * T1.W, PV.Y, PV.Z,
676; EG-NEXT:     SUB_INT T0.Y, PS, T1.Z,
677; EG-NEXT:     XOR_INT * T1.W, PV.Z, PV.W,
678; EG-NEXT:     SUB_INT T0.X, PV.W, T0.W,
679; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
680; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
; %den_ptr steps one whole <2 x i32> past %in, so the denominator vector sits
; 8 bytes after the numerator vector (see the offset-8 fetch expected above).
681  %den_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1
682  %num = load <2 x i32>, <2 x i32> addrspace(1) * %in
683  %den = load <2 x i32>, <2 x i32> addrspace(1) * %den_ptr
684  %result = sdiv <2 x i32> %num, %den
685  store <2 x i32> %result, <2 x i32> addrspace(1)* %out
686  ret void
687}
688
689define amdgpu_kernel void @sdiv_v2i32_4(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
690; GCN-LABEL: sdiv_v2i32_4:
691; GCN:       ; %bb.0:
692; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
693; GCN-NEXT:    s_mov_b32 s7, 0xf000
694; GCN-NEXT:    s_mov_b32 s6, -1
695; GCN-NEXT:    s_mov_b32 s10, s6
696; GCN-NEXT:    s_mov_b32 s11, s7
697; GCN-NEXT:    s_waitcnt lgkmcnt(0)
698; GCN-NEXT:    s_mov_b32 s8, s2
699; GCN-NEXT:    s_mov_b32 s9, s3
700; GCN-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
701; GCN-NEXT:    s_mov_b32 s4, s0
702; GCN-NEXT:    s_mov_b32 s5, s1
703; GCN-NEXT:    s_waitcnt vmcnt(0)
704; GCN-NEXT:    v_ashrrev_i32_e32 v2, 31, v0
705; GCN-NEXT:    v_ashrrev_i32_e32 v3, 31, v1
706; GCN-NEXT:    v_lshrrev_b32_e32 v2, 30, v2
707; GCN-NEXT:    v_lshrrev_b32_e32 v3, 30, v3
708; GCN-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
709; GCN-NEXT:    v_add_i32_e32 v1, vcc, v3, v1
710; GCN-NEXT:    v_ashrrev_i32_e32 v0, 2, v0
711; GCN-NEXT:    v_ashrrev_i32_e32 v1, 2, v1
712; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
713; GCN-NEXT:    s_endpgm
714;
715; TONGA-LABEL: sdiv_v2i32_4:
716; TONGA:       ; %bb.0:
717; TONGA-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
718; TONGA-NEXT:    s_mov_b32 s7, 0xf000
719; TONGA-NEXT:    s_mov_b32 s6, -1
720; TONGA-NEXT:    s_mov_b32 s10, s6
721; TONGA-NEXT:    s_mov_b32 s11, s7
722; TONGA-NEXT:    s_waitcnt lgkmcnt(0)
723; TONGA-NEXT:    s_mov_b32 s8, s2
724; TONGA-NEXT:    s_mov_b32 s9, s3
725; TONGA-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
726; TONGA-NEXT:    s_mov_b32 s4, s0
727; TONGA-NEXT:    s_mov_b32 s5, s1
728; TONGA-NEXT:    s_waitcnt vmcnt(0)
729; TONGA-NEXT:    v_ashrrev_i32_e32 v2, 31, v0
730; TONGA-NEXT:    v_ashrrev_i32_e32 v3, 31, v1
731; TONGA-NEXT:    v_lshrrev_b32_e32 v2, 30, v2
732; TONGA-NEXT:    v_lshrrev_b32_e32 v3, 30, v3
733; TONGA-NEXT:    v_add_u32_e32 v0, vcc, v2, v0
734; TONGA-NEXT:    v_add_u32_e32 v1, vcc, v3, v1
735; TONGA-NEXT:    v_ashrrev_i32_e32 v0, 2, v0
736; TONGA-NEXT:    v_ashrrev_i32_e32 v1, 2, v1
737; TONGA-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
738; TONGA-NEXT:    s_endpgm
739;
740; GFX9-LABEL: sdiv_v2i32_4:
741; GFX9:       ; %bb.0:
742; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
743; GFX9-NEXT:    s_mov_b32 s7, 0xf000
744; GFX9-NEXT:    s_mov_b32 s6, -1
745; GFX9-NEXT:    s_mov_b32 s10, s6
746; GFX9-NEXT:    s_mov_b32 s11, s7
747; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
748; GFX9-NEXT:    s_mov_b32 s8, s2
749; GFX9-NEXT:    s_mov_b32 s9, s3
750; GFX9-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
751; GFX9-NEXT:    s_mov_b32 s4, s0
752; GFX9-NEXT:    s_mov_b32 s5, s1
753; GFX9-NEXT:    s_waitcnt vmcnt(0)
754; GFX9-NEXT:    v_ashrrev_i32_e32 v2, 31, v0
755; GFX9-NEXT:    v_ashrrev_i32_e32 v3, 31, v1
756; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 30, v2
757; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 30, v3
758; GFX9-NEXT:    v_add_u32_e32 v0, v0, v2
759; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
760; GFX9-NEXT:    v_ashrrev_i32_e32 v0, 2, v0
761; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 2, v1
762; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
763; GFX9-NEXT:    s_endpgm
764;
765; EG-LABEL: sdiv_v2i32_4:
766; EG:       ; %bb.0:
767; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
768; EG-NEXT:    TEX 0 @6
769; EG-NEXT:    ALU 13, @9, KC0[CB0:0-32], KC1[]
770; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
771; EG-NEXT:    CF_END
772; EG-NEXT:    PAD
773; EG-NEXT:    Fetch clause starting at 6:
774; EG-NEXT:     VTX_READ_64 T0.XY, T0.X, 0, #1
775; EG-NEXT:    ALU clause starting at 8:
776; EG-NEXT:     MOV * T0.X, KC0[2].Z,
777; EG-NEXT:    ALU clause starting at 9:
778; EG-NEXT:     ASHR * T0.W, T0.Y, literal.x,
779; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
780; EG-NEXT:     LSHR T0.W, PV.W, literal.x,
781; EG-NEXT:     ASHR * T1.W, T0.X, literal.y,
782; EG-NEXT:    30(4.203895e-44), 31(4.344025e-44)
783; EG-NEXT:     LSHR T1.W, PS, literal.x,
784; EG-NEXT:     ADD_INT * T0.W, T0.Y, PV.W,
785; EG-NEXT:    30(4.203895e-44), 0(0.000000e+00)
786; EG-NEXT:     ASHR T0.Y, PS, literal.x,
787; EG-NEXT:     ADD_INT * T0.W, T0.X, PV.W,
788; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
789; EG-NEXT:     ASHR T0.X, PV.W, literal.x,
790; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
791; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
792  %num = load <2 x i32>, <2 x i32> addrspace(1) * %in
793  %result = sdiv <2 x i32> %num, <i32 4, i32 4>
794  store <2 x i32> %result, <2 x i32> addrspace(1)* %out
795  ret void
796}
797
; sdiv_v4i32: element-wise signed division of two <4 x i32> vectors where the
; divisor is loaded from memory (not a compile-time constant).
;   %in  - global (addrspace 1) pointer; the first <4 x i32> is loaded as the
;          numerator and the one that follows it (getelementptr index 1) as
;          the denominator.
;   %out - global (addrspace 1) pointer that receives the <4 x i32> quotient.
; The expected-output lines below were produced by
; utils/update_llc_test_checks.py (see the NOTE at the top of this file);
; regenerate them with that script rather than editing them by hand. For each
; run configuration they record a per-lane expansion built around a reciprocal
; estimate (v_rcp_iflag_f32 for the amdgcn targets, RECIP_UINT for r600),
; followed by compare/select corrections and a final xor/subtract with the
; combined sign masks.
798define amdgpu_kernel void @sdiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
799; GCN-LABEL: sdiv_v4i32:
800; GCN:       ; %bb.0:
801; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
802; GCN-NEXT:    s_mov_b32 s11, 0xf000
803; GCN-NEXT:    s_mov_b32 s10, -1
804; GCN-NEXT:    s_mov_b32 s6, s10
805; GCN-NEXT:    s_mov_b32 s7, s11
806; GCN-NEXT:    s_waitcnt lgkmcnt(0)
807; GCN-NEXT:    s_mov_b32 s4, s2
808; GCN-NEXT:    s_mov_b32 s5, s3
809; GCN-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
810; GCN-NEXT:    buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16
811; GCN-NEXT:    s_mov_b32 s2, 0x4f7ffffe
812; GCN-NEXT:    s_mov_b32 s8, s0
813; GCN-NEXT:    s_mov_b32 s9, s1
814; GCN-NEXT:    s_waitcnt vmcnt(1)
815; GCN-NEXT:    v_ashrrev_i32_e32 v8, 31, v0
816; GCN-NEXT:    s_waitcnt vmcnt(0)
817; GCN-NEXT:    v_ashrrev_i32_e32 v11, 31, v5
818; GCN-NEXT:    v_ashrrev_i32_e32 v9, 31, v4
819; GCN-NEXT:    v_add_i32_e32 v5, vcc, v11, v5
820; GCN-NEXT:    v_add_i32_e32 v4, vcc, v9, v4
821; GCN-NEXT:    v_xor_b32_e32 v5, v5, v11
822; GCN-NEXT:    v_xor_b32_e32 v15, v8, v9
823; GCN-NEXT:    v_xor_b32_e32 v4, v4, v9
824; GCN-NEXT:    v_cvt_f32_u32_e32 v9, v5
825; GCN-NEXT:    v_add_i32_e32 v0, vcc, v8, v0
826; GCN-NEXT:    v_xor_b32_e32 v0, v0, v8
827; GCN-NEXT:    v_cvt_f32_u32_e32 v8, v4
828; GCN-NEXT:    v_rcp_iflag_f32_e32 v9, v9
829; GCN-NEXT:    v_ashrrev_i32_e32 v13, 31, v6
830; GCN-NEXT:    v_add_i32_e32 v6, vcc, v13, v6
831; GCN-NEXT:    v_ashrrev_i32_e32 v10, 31, v1
832; GCN-NEXT:    v_rcp_iflag_f32_e32 v8, v8
833; GCN-NEXT:    v_xor_b32_e32 v6, v6, v13
834; GCN-NEXT:    v_mul_f32_e32 v9, s2, v9
835; GCN-NEXT:    v_xor_b32_e32 v16, v10, v11
836; GCN-NEXT:    v_cvt_f32_u32_e32 v11, v6
837; GCN-NEXT:    v_cvt_u32_f32_e32 v9, v9
838; GCN-NEXT:    v_ashrrev_i32_e32 v12, 31, v2
839; GCN-NEXT:    v_add_i32_e32 v2, vcc, v12, v2
840; GCN-NEXT:    v_xor_b32_e32 v17, v12, v13
841; GCN-NEXT:    v_xor_b32_e32 v2, v2, v12
842; GCN-NEXT:    v_mul_f32_e32 v8, s2, v8
843; GCN-NEXT:    v_sub_i32_e32 v12, vcc, 0, v5
844; GCN-NEXT:    v_cvt_u32_f32_e32 v8, v8
845; GCN-NEXT:    v_rcp_iflag_f32_e32 v11, v11
846; GCN-NEXT:    v_mul_lo_u32 v12, v12, v9
847; GCN-NEXT:    v_add_i32_e32 v1, vcc, v10, v1
848; GCN-NEXT:    v_xor_b32_e32 v1, v1, v10
849; GCN-NEXT:    v_sub_i32_e32 v10, vcc, 0, v4
850; GCN-NEXT:    v_mul_lo_u32 v10, v10, v8
851; GCN-NEXT:    v_mul_hi_u32 v12, v9, v12
852; GCN-NEXT:    v_mul_f32_e32 v11, s2, v11
853; GCN-NEXT:    v_cvt_u32_f32_e32 v11, v11
854; GCN-NEXT:    v_mul_hi_u32 v10, v8, v10
855; GCN-NEXT:    v_add_i32_e32 v9, vcc, v12, v9
856; GCN-NEXT:    v_sub_i32_e32 v12, vcc, 0, v6
857; GCN-NEXT:    v_mul_lo_u32 v12, v12, v11
858; GCN-NEXT:    v_add_i32_e32 v8, vcc, v10, v8
859; GCN-NEXT:    v_mul_hi_u32 v8, v0, v8
860; GCN-NEXT:    v_mul_hi_u32 v12, v11, v12
861; GCN-NEXT:    v_ashrrev_i32_e32 v14, 31, v7
862; GCN-NEXT:    v_add_i32_e32 v7, vcc, v14, v7
863; GCN-NEXT:    v_xor_b32_e32 v7, v7, v14
864; GCN-NEXT:    v_cvt_f32_u32_e32 v10, v7
865; GCN-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
866; GCN-NEXT:    v_mul_lo_u32 v12, v8, v4
867; GCN-NEXT:    v_rcp_iflag_f32_e32 v10, v10
868; GCN-NEXT:    v_mul_hi_u32 v9, v1, v9
869; GCN-NEXT:    v_mul_hi_u32 v11, v2, v11
870; GCN-NEXT:    v_sub_i32_e32 v0, vcc, v0, v12
871; GCN-NEXT:    v_add_i32_e32 v12, vcc, 1, v8
872; GCN-NEXT:    v_cmp_ge_u32_e64 s[0:1], v0, v4
873; GCN-NEXT:    v_cndmask_b32_e64 v8, v8, v12, s[0:1]
874; GCN-NEXT:    v_sub_i32_e32 v12, vcc, v0, v4
875; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v12, s[0:1]
876; GCN-NEXT:    v_mul_f32_e32 v10, s2, v10
877; GCN-NEXT:    v_cmp_ge_u32_e64 s[0:1], v0, v4
878; GCN-NEXT:    v_mul_lo_u32 v0, v9, v5
879; GCN-NEXT:    v_cvt_u32_f32_e32 v4, v10
880; GCN-NEXT:    v_mul_lo_u32 v10, v11, v6
881; GCN-NEXT:    v_add_i32_e32 v12, vcc, 1, v8
882; GCN-NEXT:    v_sub_i32_e32 v0, vcc, v1, v0
883; GCN-NEXT:    v_add_i32_e32 v1, vcc, 1, v9
884; GCN-NEXT:    v_sub_i32_e32 v2, vcc, v2, v10
885; GCN-NEXT:    v_cmp_ge_u32_e64 s[2:3], v0, v5
886; GCN-NEXT:    v_add_i32_e32 v10, vcc, 1, v11
887; GCN-NEXT:    v_cndmask_b32_e64 v1, v9, v1, s[2:3]
888; GCN-NEXT:    v_sub_i32_e32 v9, vcc, v0, v5
889; GCN-NEXT:    v_cmp_ge_u32_e64 s[4:5], v2, v6
890; GCN-NEXT:    v_cndmask_b32_e64 v10, v11, v10, s[4:5]
891; GCN-NEXT:    v_sub_i32_e32 v11, vcc, v2, v6
892; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v9, s[2:3]
893; GCN-NEXT:    v_add_i32_e32 v9, vcc, 1, v1
894; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v5
895; GCN-NEXT:    v_cndmask_b32_e64 v8, v8, v12, s[0:1]
896; GCN-NEXT:    v_cndmask_b32_e32 v0, v1, v9, vcc
897; GCN-NEXT:    v_xor_b32_e32 v1, v8, v15
898; GCN-NEXT:    v_xor_b32_e32 v5, v0, v16
899; GCN-NEXT:    v_sub_i32_e32 v0, vcc, v1, v15
900; GCN-NEXT:    v_sub_i32_e32 v1, vcc, v5, v16
901; GCN-NEXT:    v_sub_i32_e32 v5, vcc, 0, v7
902; GCN-NEXT:    v_mul_lo_u32 v5, v5, v4
903; GCN-NEXT:    v_ashrrev_i32_e32 v9, 31, v3
904; GCN-NEXT:    v_add_i32_e32 v3, vcc, v9, v3
905; GCN-NEXT:    v_mul_hi_u32 v5, v4, v5
906; GCN-NEXT:    v_xor_b32_e32 v3, v3, v9
907; GCN-NEXT:    v_cndmask_b32_e64 v2, v2, v11, s[4:5]
908; GCN-NEXT:    v_add_i32_e32 v8, vcc, 1, v10
909; GCN-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
910; GCN-NEXT:    v_mul_hi_u32 v4, v3, v4
911; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v2, v6
912; GCN-NEXT:    v_cndmask_b32_e32 v2, v10, v8, vcc
913; GCN-NEXT:    v_xor_b32_e32 v2, v2, v17
914; GCN-NEXT:    v_mul_lo_u32 v5, v4, v7
915; GCN-NEXT:    v_sub_i32_e32 v2, vcc, v2, v17
916; GCN-NEXT:    v_xor_b32_e32 v6, v9, v14
917; GCN-NEXT:    v_sub_i32_e32 v3, vcc, v3, v5
918; GCN-NEXT:    v_add_i32_e32 v5, vcc, 1, v4
919; GCN-NEXT:    v_cmp_ge_u32_e64 s[0:1], v3, v7
920; GCN-NEXT:    v_cndmask_b32_e64 v4, v4, v5, s[0:1]
921; GCN-NEXT:    v_sub_i32_e32 v5, vcc, v3, v7
922; GCN-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s[0:1]
923; GCN-NEXT:    v_add_i32_e32 v5, vcc, 1, v4
924; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v3, v7
925; GCN-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
926; GCN-NEXT:    v_xor_b32_e32 v3, v3, v6
927; GCN-NEXT:    v_sub_i32_e32 v3, vcc, v3, v6
928; GCN-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
929; GCN-NEXT:    s_endpgm
930;
931; TONGA-LABEL: sdiv_v4i32:
932; TONGA:       ; %bb.0:
933; TONGA-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
934; TONGA-NEXT:    s_mov_b32 s11, 0xf000
935; TONGA-NEXT:    s_mov_b32 s10, -1
936; TONGA-NEXT:    s_mov_b32 s6, s10
937; TONGA-NEXT:    s_mov_b32 s7, s11
938; TONGA-NEXT:    s_waitcnt lgkmcnt(0)
939; TONGA-NEXT:    s_mov_b32 s4, s2
940; TONGA-NEXT:    s_mov_b32 s5, s3
941; TONGA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
942; TONGA-NEXT:    buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16
943; TONGA-NEXT:    s_mov_b32 s2, 0x4f7ffffe
944; TONGA-NEXT:    s_mov_b32 s8, s0
945; TONGA-NEXT:    s_mov_b32 s9, s1
946; TONGA-NEXT:    s_waitcnt vmcnt(1)
947; TONGA-NEXT:    v_ashrrev_i32_e32 v8, 31, v0
948; TONGA-NEXT:    s_waitcnt vmcnt(0)
949; TONGA-NEXT:    v_ashrrev_i32_e32 v11, 31, v5
950; TONGA-NEXT:    v_ashrrev_i32_e32 v9, 31, v4
951; TONGA-NEXT:    v_add_u32_e32 v5, vcc, v11, v5
952; TONGA-NEXT:    v_add_u32_e32 v4, vcc, v9, v4
953; TONGA-NEXT:    v_xor_b32_e32 v5, v5, v11
954; TONGA-NEXT:    v_xor_b32_e32 v15, v8, v9
955; TONGA-NEXT:    v_xor_b32_e32 v4, v4, v9
956; TONGA-NEXT:    v_cvt_f32_u32_e32 v9, v5
957; TONGA-NEXT:    v_add_u32_e32 v0, vcc, v8, v0
958; TONGA-NEXT:    v_xor_b32_e32 v0, v0, v8
959; TONGA-NEXT:    v_cvt_f32_u32_e32 v8, v4
960; TONGA-NEXT:    v_rcp_iflag_f32_e32 v9, v9
961; TONGA-NEXT:    v_ashrrev_i32_e32 v13, 31, v6
962; TONGA-NEXT:    v_add_u32_e32 v6, vcc, v13, v6
963; TONGA-NEXT:    v_ashrrev_i32_e32 v10, 31, v1
964; TONGA-NEXT:    v_rcp_iflag_f32_e32 v8, v8
965; TONGA-NEXT:    v_xor_b32_e32 v6, v6, v13
966; TONGA-NEXT:    v_mul_f32_e32 v9, s2, v9
967; TONGA-NEXT:    v_xor_b32_e32 v16, v10, v11
968; TONGA-NEXT:    v_cvt_f32_u32_e32 v11, v6
969; TONGA-NEXT:    v_cvt_u32_f32_e32 v9, v9
970; TONGA-NEXT:    v_ashrrev_i32_e32 v12, 31, v2
971; TONGA-NEXT:    v_add_u32_e32 v2, vcc, v12, v2
972; TONGA-NEXT:    v_xor_b32_e32 v17, v12, v13
973; TONGA-NEXT:    v_xor_b32_e32 v2, v2, v12
974; TONGA-NEXT:    v_mul_f32_e32 v8, s2, v8
975; TONGA-NEXT:    v_sub_u32_e32 v12, vcc, 0, v5
976; TONGA-NEXT:    v_cvt_u32_f32_e32 v8, v8
977; TONGA-NEXT:    v_rcp_iflag_f32_e32 v11, v11
978; TONGA-NEXT:    v_mul_lo_u32 v12, v12, v9
979; TONGA-NEXT:    v_add_u32_e32 v1, vcc, v10, v1
980; TONGA-NEXT:    v_xor_b32_e32 v1, v1, v10
981; TONGA-NEXT:    v_sub_u32_e32 v10, vcc, 0, v4
982; TONGA-NEXT:    v_mul_lo_u32 v10, v10, v8
983; TONGA-NEXT:    v_mul_hi_u32 v12, v9, v12
984; TONGA-NEXT:    v_mul_f32_e32 v11, s2, v11
985; TONGA-NEXT:    v_cvt_u32_f32_e32 v11, v11
986; TONGA-NEXT:    v_mul_hi_u32 v10, v8, v10
987; TONGA-NEXT:    v_add_u32_e32 v9, vcc, v12, v9
988; TONGA-NEXT:    v_sub_u32_e32 v12, vcc, 0, v6
989; TONGA-NEXT:    v_mul_lo_u32 v12, v12, v11
990; TONGA-NEXT:    v_add_u32_e32 v8, vcc, v10, v8
991; TONGA-NEXT:    v_mul_hi_u32 v8, v0, v8
992; TONGA-NEXT:    v_mul_hi_u32 v12, v11, v12
993; TONGA-NEXT:    v_ashrrev_i32_e32 v14, 31, v7
994; TONGA-NEXT:    v_add_u32_e32 v7, vcc, v14, v7
995; TONGA-NEXT:    v_xor_b32_e32 v7, v7, v14
996; TONGA-NEXT:    v_cvt_f32_u32_e32 v10, v7
997; TONGA-NEXT:    v_add_u32_e32 v11, vcc, v12, v11
998; TONGA-NEXT:    v_mul_lo_u32 v12, v8, v4
999; TONGA-NEXT:    v_rcp_iflag_f32_e32 v10, v10
1000; TONGA-NEXT:    v_mul_hi_u32 v9, v1, v9
1001; TONGA-NEXT:    v_mul_hi_u32 v11, v2, v11
1002; TONGA-NEXT:    v_sub_u32_e32 v0, vcc, v0, v12
1003; TONGA-NEXT:    v_add_u32_e32 v12, vcc, 1, v8
1004; TONGA-NEXT:    v_cmp_ge_u32_e64 s[0:1], v0, v4
1005; TONGA-NEXT:    v_cndmask_b32_e64 v8, v8, v12, s[0:1]
1006; TONGA-NEXT:    v_sub_u32_e32 v12, vcc, v0, v4
1007; TONGA-NEXT:    v_cndmask_b32_e64 v0, v0, v12, s[0:1]
1008; TONGA-NEXT:    v_mul_f32_e32 v10, s2, v10
1009; TONGA-NEXT:    v_cmp_ge_u32_e64 s[0:1], v0, v4
1010; TONGA-NEXT:    v_mul_lo_u32 v0, v9, v5
1011; TONGA-NEXT:    v_cvt_u32_f32_e32 v4, v10
1012; TONGA-NEXT:    v_mul_lo_u32 v10, v11, v6
1013; TONGA-NEXT:    v_add_u32_e32 v12, vcc, 1, v8
1014; TONGA-NEXT:    v_sub_u32_e32 v0, vcc, v1, v0
1015; TONGA-NEXT:    v_add_u32_e32 v1, vcc, 1, v9
1016; TONGA-NEXT:    v_sub_u32_e32 v2, vcc, v2, v10
1017; TONGA-NEXT:    v_cmp_ge_u32_e64 s[2:3], v0, v5
1018; TONGA-NEXT:    v_add_u32_e32 v10, vcc, 1, v11
1019; TONGA-NEXT:    v_cndmask_b32_e64 v1, v9, v1, s[2:3]
1020; TONGA-NEXT:    v_sub_u32_e32 v9, vcc, v0, v5
1021; TONGA-NEXT:    v_cmp_ge_u32_e64 s[4:5], v2, v6
1022; TONGA-NEXT:    v_cndmask_b32_e64 v10, v11, v10, s[4:5]
1023; TONGA-NEXT:    v_sub_u32_e32 v11, vcc, v2, v6
1024; TONGA-NEXT:    v_cndmask_b32_e64 v0, v0, v9, s[2:3]
1025; TONGA-NEXT:    v_add_u32_e32 v9, vcc, 1, v1
1026; TONGA-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v5
1027; TONGA-NEXT:    v_cndmask_b32_e64 v8, v8, v12, s[0:1]
1028; TONGA-NEXT:    v_cndmask_b32_e32 v0, v1, v9, vcc
1029; TONGA-NEXT:    v_xor_b32_e32 v1, v8, v15
1030; TONGA-NEXT:    v_xor_b32_e32 v5, v0, v16
1031; TONGA-NEXT:    v_subrev_u32_e32 v0, vcc, v15, v1
1032; TONGA-NEXT:    v_subrev_u32_e32 v1, vcc, v16, v5
1033; TONGA-NEXT:    v_sub_u32_e32 v5, vcc, 0, v7
1034; TONGA-NEXT:    v_mul_lo_u32 v5, v5, v4
1035; TONGA-NEXT:    v_ashrrev_i32_e32 v9, 31, v3
1036; TONGA-NEXT:    v_add_u32_e32 v3, vcc, v9, v3
1037; TONGA-NEXT:    v_mul_hi_u32 v5, v4, v5
1038; TONGA-NEXT:    v_xor_b32_e32 v3, v3, v9
1039; TONGA-NEXT:    v_cndmask_b32_e64 v2, v2, v11, s[4:5]
1040; TONGA-NEXT:    v_add_u32_e32 v8, vcc, 1, v10
1041; TONGA-NEXT:    v_add_u32_e32 v4, vcc, v5, v4
1042; TONGA-NEXT:    v_mul_hi_u32 v4, v3, v4
1043; TONGA-NEXT:    v_cmp_ge_u32_e32 vcc, v2, v6
1044; TONGA-NEXT:    v_cndmask_b32_e32 v2, v10, v8, vcc
1045; TONGA-NEXT:    v_xor_b32_e32 v2, v2, v17
1046; TONGA-NEXT:    v_mul_lo_u32 v5, v4, v7
1047; TONGA-NEXT:    v_subrev_u32_e32 v2, vcc, v17, v2
1048; TONGA-NEXT:    v_xor_b32_e32 v6, v9, v14
1049; TONGA-NEXT:    v_sub_u32_e32 v3, vcc, v3, v5
1050; TONGA-NEXT:    v_add_u32_e32 v5, vcc, 1, v4
1051; TONGA-NEXT:    v_cmp_ge_u32_e64 s[0:1], v3, v7
1052; TONGA-NEXT:    v_cndmask_b32_e64 v4, v4, v5, s[0:1]
1053; TONGA-NEXT:    v_sub_u32_e32 v5, vcc, v3, v7
1054; TONGA-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s[0:1]
1055; TONGA-NEXT:    v_add_u32_e32 v5, vcc, 1, v4
1056; TONGA-NEXT:    v_cmp_ge_u32_e32 vcc, v3, v7
1057; TONGA-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
1058; TONGA-NEXT:    v_xor_b32_e32 v3, v3, v6
1059; TONGA-NEXT:    v_subrev_u32_e32 v3, vcc, v6, v3
1060; TONGA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
1061; TONGA-NEXT:    s_endpgm
1062;
1063; GFX9-LABEL: sdiv_v4i32:
1064; GFX9:       ; %bb.0:
1065; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1066; GFX9-NEXT:    s_mov_b32 s11, 0xf000
1067; GFX9-NEXT:    s_mov_b32 s10, -1
1068; GFX9-NEXT:    s_mov_b32 s6, s10
1069; GFX9-NEXT:    s_mov_b32 s7, s11
1070; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1071; GFX9-NEXT:    s_mov_b32 s4, s2
1072; GFX9-NEXT:    s_mov_b32 s5, s3
1073; GFX9-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
1074; GFX9-NEXT:    buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16
1075; GFX9-NEXT:    s_mov_b32 s2, 0x4f7ffffe
1076; GFX9-NEXT:    s_mov_b32 s8, s0
1077; GFX9-NEXT:    s_mov_b32 s9, s1
1078; GFX9-NEXT:    s_waitcnt vmcnt(1)
1079; GFX9-NEXT:    v_ashrrev_i32_e32 v8, 31, v0
1080; GFX9-NEXT:    s_waitcnt vmcnt(0)
1081; GFX9-NEXT:    v_ashrrev_i32_e32 v9, 31, v4
1082; GFX9-NEXT:    v_add_u32_e32 v4, v4, v9
1083; GFX9-NEXT:    v_ashrrev_i32_e32 v11, 31, v5
1084; GFX9-NEXT:    v_add_u32_e32 v0, v0, v8
1085; GFX9-NEXT:    v_xor_b32_e32 v4, v4, v9
1086; GFX9-NEXT:    v_ashrrev_i32_e32 v10, 31, v1
1087; GFX9-NEXT:    v_ashrrev_i32_e32 v13, 31, v6
1088; GFX9-NEXT:    v_xor_b32_e32 v16, v8, v9
1089; GFX9-NEXT:    v_add_u32_e32 v5, v5, v11
1090; GFX9-NEXT:    v_xor_b32_e32 v0, v0, v8
1091; GFX9-NEXT:    v_cvt_f32_u32_e32 v8, v4
1092; GFX9-NEXT:    v_ashrrev_i32_e32 v12, 31, v2
1093; GFX9-NEXT:    v_ashrrev_i32_e32 v15, 31, v7
1094; GFX9-NEXT:    v_add_u32_e32 v1, v1, v10
1095; GFX9-NEXT:    v_add_u32_e32 v6, v6, v13
1096; GFX9-NEXT:    v_xor_b32_e32 v5, v5, v11
1097; GFX9-NEXT:    v_ashrrev_i32_e32 v14, 31, v3
1098; GFX9-NEXT:    v_add_u32_e32 v2, v2, v12
1099; GFX9-NEXT:    v_add_u32_e32 v7, v7, v15
1100; GFX9-NEXT:    v_xor_b32_e32 v17, v10, v11
1101; GFX9-NEXT:    v_xor_b32_e32 v1, v1, v10
1102; GFX9-NEXT:    v_xor_b32_e32 v6, v6, v13
1103; GFX9-NEXT:    v_cvt_f32_u32_e32 v10, v5
1104; GFX9-NEXT:    v_add_u32_e32 v3, v3, v14
1105; GFX9-NEXT:    v_xor_b32_e32 v18, v12, v13
1106; GFX9-NEXT:    v_xor_b32_e32 v2, v2, v12
1107; GFX9-NEXT:    v_xor_b32_e32 v7, v7, v15
1108; GFX9-NEXT:    v_cvt_f32_u32_e32 v12, v6
1109; GFX9-NEXT:    v_xor_b32_e32 v19, v14, v15
1110; GFX9-NEXT:    v_xor_b32_e32 v3, v3, v14
1111; GFX9-NEXT:    v_cvt_f32_u32_e32 v14, v7
1112; GFX9-NEXT:    v_rcp_iflag_f32_e32 v8, v8
1113; GFX9-NEXT:    v_rcp_iflag_f32_e32 v10, v10
1114; GFX9-NEXT:    v_rcp_iflag_f32_e32 v12, v12
1115; GFX9-NEXT:    v_rcp_iflag_f32_e32 v14, v14
1116; GFX9-NEXT:    v_mul_f32_e32 v8, s2, v8
1117; GFX9-NEXT:    v_cvt_u32_f32_e32 v8, v8
1118; GFX9-NEXT:    v_mul_f32_e32 v10, s2, v10
1119; GFX9-NEXT:    v_mul_f32_e32 v12, s2, v12
1120; GFX9-NEXT:    v_cvt_u32_f32_e32 v10, v10
1121; GFX9-NEXT:    v_sub_u32_e32 v9, 0, v4
1122; GFX9-NEXT:    v_mul_f32_e32 v14, s2, v14
1123; GFX9-NEXT:    v_cvt_u32_f32_e32 v12, v12
1124; GFX9-NEXT:    v_cvt_u32_f32_e32 v14, v14
1125; GFX9-NEXT:    v_mul_lo_u32 v9, v9, v8
1126; GFX9-NEXT:    v_sub_u32_e32 v11, 0, v5
1127; GFX9-NEXT:    v_sub_u32_e32 v13, 0, v6
1128; GFX9-NEXT:    v_mul_lo_u32 v11, v11, v10
1129; GFX9-NEXT:    v_sub_u32_e32 v15, 0, v7
1130; GFX9-NEXT:    v_mul_lo_u32 v13, v13, v12
1131; GFX9-NEXT:    v_mul_lo_u32 v15, v15, v14
1132; GFX9-NEXT:    v_mul_hi_u32 v9, v8, v9
1133; GFX9-NEXT:    v_mul_hi_u32 v11, v10, v11
1134; GFX9-NEXT:    v_mul_hi_u32 v13, v12, v13
1135; GFX9-NEXT:    v_mul_hi_u32 v15, v14, v15
1136; GFX9-NEXT:    v_add_u32_e32 v8, v8, v9
1137; GFX9-NEXT:    v_mul_hi_u32 v8, v0, v8
1138; GFX9-NEXT:    v_add_u32_e32 v9, v10, v11
1139; GFX9-NEXT:    v_add_u32_e32 v10, v12, v13
1140; GFX9-NEXT:    v_mul_hi_u32 v9, v1, v9
1141; GFX9-NEXT:    v_add_u32_e32 v11, v14, v15
1142; GFX9-NEXT:    v_mul_hi_u32 v10, v2, v10
1143; GFX9-NEXT:    v_mul_hi_u32 v11, v3, v11
1144; GFX9-NEXT:    v_mul_lo_u32 v12, v8, v4
1145; GFX9-NEXT:    v_mul_lo_u32 v14, v9, v5
1146; GFX9-NEXT:    v_mul_lo_u32 v15, v10, v6
1147; GFX9-NEXT:    v_add_u32_e32 v13, 1, v8
1148; GFX9-NEXT:    v_sub_u32_e32 v0, v0, v12
1149; GFX9-NEXT:    v_mul_lo_u32 v12, v11, v7
1150; GFX9-NEXT:    v_sub_u32_e32 v1, v1, v14
1151; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v4
1152; GFX9-NEXT:    v_add_u32_e32 v14, 1, v9
1153; GFX9-NEXT:    v_sub_u32_e32 v2, v2, v15
1154; GFX9-NEXT:    v_cndmask_b32_e32 v8, v8, v13, vcc
1155; GFX9-NEXT:    v_sub_u32_e32 v13, v0, v4
1156; GFX9-NEXT:    v_cmp_ge_u32_e64 s[0:1], v1, v5
1157; GFX9-NEXT:    v_add_u32_e32 v15, 1, v10
1158; GFX9-NEXT:    v_sub_u32_e32 v3, v3, v12
1159; GFX9-NEXT:    v_cndmask_b32_e64 v9, v9, v14, s[0:1]
1160; GFX9-NEXT:    v_sub_u32_e32 v14, v1, v5
1161; GFX9-NEXT:    v_cmp_ge_u32_e64 s[2:3], v2, v6
1162; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v13, vcc
1163; GFX9-NEXT:    v_add_u32_e32 v12, 1, v11
1164; GFX9-NEXT:    v_cndmask_b32_e64 v10, v10, v15, s[2:3]
1165; GFX9-NEXT:    v_sub_u32_e32 v15, v2, v6
1166; GFX9-NEXT:    v_cmp_ge_u32_e64 s[4:5], v3, v7
1167; GFX9-NEXT:    v_add_u32_e32 v13, 1, v8
1168; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v14, s[0:1]
1169; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v4
1170; GFX9-NEXT:    v_cndmask_b32_e64 v11, v11, v12, s[4:5]
1171; GFX9-NEXT:    v_sub_u32_e32 v12, v3, v7
1172; GFX9-NEXT:    v_add_u32_e32 v14, 1, v9
1173; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v15, s[2:3]
1174; GFX9-NEXT:    v_cndmask_b32_e32 v0, v8, v13, vcc
1175; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v5
1176; GFX9-NEXT:    v_add_u32_e32 v15, 1, v10
1177; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v12, s[4:5]
1178; GFX9-NEXT:    v_cndmask_b32_e32 v1, v9, v14, vcc
1179; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v2, v6
1180; GFX9-NEXT:    v_add_u32_e32 v12, 1, v11
1181; GFX9-NEXT:    v_cndmask_b32_e32 v2, v10, v15, vcc
1182; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v3, v7
1183; GFX9-NEXT:    v_cndmask_b32_e32 v3, v11, v12, vcc
1184; GFX9-NEXT:    v_xor_b32_e32 v0, v0, v16
1185; GFX9-NEXT:    v_xor_b32_e32 v1, v1, v17
1186; GFX9-NEXT:    v_xor_b32_e32 v2, v2, v18
1187; GFX9-NEXT:    v_xor_b32_e32 v3, v3, v19
1188; GFX9-NEXT:    v_sub_u32_e32 v0, v0, v16
1189; GFX9-NEXT:    v_sub_u32_e32 v1, v1, v17
1190; GFX9-NEXT:    v_sub_u32_e32 v2, v2, v18
1191; GFX9-NEXT:    v_sub_u32_e32 v3, v3, v19
1192; GFX9-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
1193; GFX9-NEXT:    s_endpgm
1194;
1195; EG-LABEL: sdiv_v4i32:
1196; EG:       ; %bb.0:
1197; EG-NEXT:    ALU 0, @10, KC0[CB0:0-32], KC1[]
1198; EG-NEXT:    TEX 1 @6
1199; EG-NEXT:    ALU 101, @11, KC0[CB0:0-32], KC1[]
1200; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T0.X, 1
1201; EG-NEXT:    CF_END
1202; EG-NEXT:    PAD
1203; EG-NEXT:    Fetch clause starting at 6:
1204; EG-NEXT:     VTX_READ_128 T1.XYZW, T0.X, 16, #1
1205; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 0, #1
1206; EG-NEXT:    ALU clause starting at 10:
1207; EG-NEXT:     MOV * T0.X, KC0[2].Z,
1208; EG-NEXT:    ALU clause starting at 11:
1209; EG-NEXT:     SETGT_INT * T2.W, 0.0, T1.W,
1210; EG-NEXT:     ADD_INT * T1.W, T1.W, PV.W,
1211; EG-NEXT:     XOR_INT * T1.W, PV.W, T2.W,
1212; EG-NEXT:     SUB_INT T3.W, 0.0, PV.W,
1213; EG-NEXT:     RECIP_UINT * T2.X, PV.W,
1214; EG-NEXT:     SETGT_INT T4.W, 0.0, T0.W,
1215; EG-NEXT:     MULLO_INT * T2.Y, PV.W, PS,
1216; EG-NEXT:     SETGT_INT T2.Z, 0.0, T1.Y,
1217; EG-NEXT:     ADD_INT T0.W, T0.W, PV.W,
1218; EG-NEXT:     MULHI * T2.Y, T2.X, PS,
1219; EG-NEXT:     ADD_INT T3.Z, T2.X, PS,
1220; EG-NEXT:     XOR_INT T0.W, PV.W, T4.W,
1221; EG-NEXT:     ADD_INT * T3.W, T1.Y, PV.Z,
1222; EG-NEXT:     XOR_INT T3.W, PS, T2.Z,
1223; EG-NEXT:     MULHI * T1.Y, PV.W, PV.Z,
1224; EG-NEXT:     SUB_INT T5.W, 0.0, PV.W,
1225; EG-NEXT:     RECIP_UINT * T2.X, PV.W,
1226; EG-NEXT:     SETGT_INT T6.W, 0.0, T0.Y,
1227; EG-NEXT:     MULLO_INT * T2.Y, PV.W, PS,
1228; EG-NEXT:     ADD_INT T5.W, T0.Y, PV.W,
1229; EG-NEXT:     MULHI * T0.Y, T2.X, PS,
1230; EG-NEXT:     ADD_INT T0.Y, T2.X, PS,
1231; EG-NEXT:     XOR_INT T3.Z, PV.W, T6.W, BS:VEC_021/SCL_122
1232; EG-NEXT:     SETGT_INT T5.W, 0.0, T1.Z,
1233; EG-NEXT:     MULLO_INT * T2.X, T1.Y, T1.W,
1234; EG-NEXT:     ADD_INT T7.W, T1.Z, PV.W,
1235; EG-NEXT:     MULHI * T0.Y, PV.Z, PV.Y,
1236; EG-NEXT:     XOR_INT T7.W, PV.W, T5.W, BS:VEC_021/SCL_122
1237; EG-NEXT:     MULLO_INT * T1.Z, PS, T3.W,
1238; EG-NEXT:     SUB_INT T4.Z, 0.0, PV.W,
1239; EG-NEXT:     SETGT_INT T8.W, 0.0, T1.X,
1240; EG-NEXT:     RECIP_UINT * T2.Y, PV.W,
1241; EG-NEXT:     ADD_INT T9.W, T1.X, PV.W,
1242; EG-NEXT:     MULLO_INT * T1.X, PV.Z, PS,
1243; EG-NEXT:     SETGT_INT T4.Z, 0.0, T0.Z,
1244; EG-NEXT:     XOR_INT T9.W, PV.W, T8.W,
1245; EG-NEXT:     MULHI * T1.X, T2.Y, PS,
1246; EG-NEXT:     ADD_INT T1.X, T2.Y, PS,
1247; EG-NEXT:     SUB_INT T2.Y, 0.0, PV.W,
1248; EG-NEXT:     SUB_INT T1.Z, T3.Z, T1.Z,
1249; EG-NEXT:     ADD_INT T10.W, T0.Z, PV.Z, BS:VEC_201
1250; EG-NEXT:     RECIP_UINT * T0.Z, PV.W,
1251; EG-NEXT:     XOR_INT T3.X, PV.W, T4.Z,
1252; EG-NEXT:     ADD_INT T3.Y, T0.Y, 1,
1253; EG-NEXT:     SETGE_UINT T3.Z, PV.Z, T3.W,
1254; EG-NEXT:     SUB_INT T10.W, PV.Z, T3.W,
1255; EG-NEXT:     MULLO_INT * T2.Y, PV.Y, PS,
1256; EG-NEXT:     CNDE_INT T1.Z, PV.Z, T1.Z, PV.W,
1257; EG-NEXT:     CNDE_INT T10.W, PV.Z, T0.Y, PV.Y,
1258; EG-NEXT:     MULHI * T0.Y, PV.X, T1.X,
1259; EG-NEXT:     SETGT_INT T3.Y, 0.0, T0.X,
1260; EG-NEXT:     ADD_INT T3.Z, PV.W, 1,
1261; EG-NEXT:     SETGE_UINT T3.W, PV.Z, T3.W, BS:VEC_021/SCL_122
1262; EG-NEXT:     MULLO_INT * T1.X, PS, T7.W,
1263; EG-NEXT:     CNDE_INT T4.Y, PV.W, T10.W, PV.Z,
1264; EG-NEXT:     ADD_INT T1.Z, T0.X, PV.Y,
1265; EG-NEXT:     SUB_INT T3.W, T3.X, PS, BS:VEC_120/SCL_212
1266; EG-NEXT:     MULHI * T0.X, T0.Z, T2.Y,
1267; EG-NEXT:     ADD_INT T1.X, T0.Y, 1,
1268; EG-NEXT:     SETGE_UINT T2.Y, PV.W, T7.W,
1269; EG-NEXT:     ADD_INT T0.Z, T0.Z, PS,
1270; EG-NEXT:     XOR_INT T10.W, PV.Z, T3.Y,
1271; EG-NEXT:     SUB_INT * T0.W, T0.W, T2.X,
1272; EG-NEXT:     SUB_INT T0.X, T3.W, T7.W,
1273; EG-NEXT:     ADD_INT T5.Y, T1.Y, 1,
1274; EG-NEXT:     SETGE_UINT T1.Z, PS, T1.W, BS:VEC_021/SCL_122
1275; EG-NEXT:     SUB_INT T11.W, PS, T1.W, BS:VEC_021/SCL_122
1276; EG-NEXT:     MULHI * T0.Z, PV.W, PV.Z,
1277; EG-NEXT:     CNDE_INT T2.X, PV.Z, T0.W, PV.W, BS:VEC_021/SCL_122
1278; EG-NEXT:     CNDE_INT T1.Y, PV.Z, T1.Y, PV.Y,
1279; EG-NEXT:     CNDE_INT T1.Z, T2.Y, T3.W, PV.X, BS:VEC_201
1280; EG-NEXT:     CNDE_INT T0.W, T2.Y, T0.Y, T1.X, BS:VEC_201
1281; EG-NEXT:     MULLO_INT * T0.X, PS, T9.W,
1282; EG-NEXT:     ADD_INT T1.X, PV.W, 1,
1283; EG-NEXT:     SETGE_UINT T0.Y, PV.Z, T7.W,
1284; EG-NEXT:     ADD_INT T1.Z, PV.Y, 1,
1285; EG-NEXT:     SETGE_UINT T1.W, PV.X, T1.W, BS:VEC_102/SCL_221
1286; EG-NEXT:     SUB_INT * T3.W, T10.W, PS,
1287; EG-NEXT:     ADD_INT T0.X, T0.Z, 1,
1288; EG-NEXT:     SETGE_UINT T2.Y, PS, T9.W, BS:VEC_102/SCL_221
1289; EG-NEXT:     SUB_INT T3.Z, PS, T9.W, BS:VEC_102/SCL_221
1290; EG-NEXT:     CNDE_INT T1.W, PV.W, T1.Y, PV.Z,
1291; EG-NEXT:     XOR_INT * T2.W, T4.W, T2.W,
1292; EG-NEXT:     XOR_INT T2.X, PV.W, PS,
1293; EG-NEXT:     CNDE_INT T1.Y, PV.Y, T3.W, PV.Z, BS:VEC_021/SCL_122
1294; EG-NEXT:     CNDE_INT T0.Z, PV.Y, T0.Z, PV.X,
1295; EG-NEXT:     CNDE_INT T0.W, T0.Y, T0.W, T1.X, BS:VEC_102/SCL_221
1296; EG-NEXT:     XOR_INT * T1.W, T4.Z, T5.W,
1297; EG-NEXT:     XOR_INT T0.X, T6.W, T2.Z,
1298; EG-NEXT:     XOR_INT T0.Y, PV.W, PS,
1299; EG-NEXT:     ADD_INT T1.Z, PV.Z, 1,
1300; EG-NEXT:     SETGE_UINT T0.W, PV.Y, T9.W, BS:VEC_021/SCL_122
1301; EG-NEXT:     SUB_INT * T2.W, PV.X, T2.W,
1302; EG-NEXT:     CNDE_INT T1.Y, PV.W, T0.Z, PV.Z,
1303; EG-NEXT:     SUB_INT T2.Z, PV.Y, T1.W,
1304; EG-NEXT:     XOR_INT T0.W, T3.Y, T8.W, BS:VEC_021/SCL_122
1305; EG-NEXT:     XOR_INT * T1.W, T4.Y, PV.X,
1306; EG-NEXT:     SUB_INT T2.Y, PS, T0.X,
1307; EG-NEXT:     XOR_INT * T1.W, PV.Y, PV.W,
1308; EG-NEXT:     SUB_INT T2.X, PV.W, T0.W,
1309; EG-NEXT:     LSHR * T0.X, KC0[2].Y, literal.x,
1310; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  ; The denominator is the <4 x i32> that immediately follows the numerator
  ; in the input buffer.
1311  %den_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1
1312  %num = load <4 x i32>, <4 x i32> addrspace(1) * %in
1313  %den = load <4 x i32>, <4 x i32> addrspace(1) * %den_ptr
  ; Operation under test: signed division with a runtime divisor.
1314  %result = sdiv <4 x i32> %num, %den
1315  store <4 x i32> %result, <4 x i32> addrspace(1)* %out
1316  ret void
1317}
1318
; sdiv_v4i32_4: element-wise signed division of a <4 x i32> vector by the
; constant splat <4, 4, 4, 4>.
;   %in  - global (addrspace 1) pointer to the <4 x i32> numerator.
;   %out - global (addrspace 1) pointer that receives the <4 x i32> quotient.
; The expected-output lines below were produced by
; utils/update_llc_test_checks.py (see the NOTE at the top of this file);
; regenerate them with that script rather than editing them by hand. They
; contain no reciprocal or multiply instructions: for every run configuration
; each lane is an arithmetic shift right by 31, a logical shift right by 30,
; an add of that value to the input, and an arithmetic shift right by 2.
1319define amdgpu_kernel void @sdiv_v4i32_4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
1320; GCN-LABEL: sdiv_v4i32_4:
1321; GCN:       ; %bb.0:
1322; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
1323; GCN-NEXT:    s_mov_b32 s7, 0xf000
1324; GCN-NEXT:    s_mov_b32 s6, -1
1325; GCN-NEXT:    s_mov_b32 s10, s6
1326; GCN-NEXT:    s_mov_b32 s11, s7
1327; GCN-NEXT:    s_waitcnt lgkmcnt(0)
1328; GCN-NEXT:    s_mov_b32 s8, s2
1329; GCN-NEXT:    s_mov_b32 s9, s3
1330; GCN-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
1331; GCN-NEXT:    s_mov_b32 s4, s0
1332; GCN-NEXT:    s_mov_b32 s5, s1
1333; GCN-NEXT:    s_waitcnt vmcnt(0)
1334; GCN-NEXT:    v_ashrrev_i32_e32 v4, 31, v0
1335; GCN-NEXT:    v_ashrrev_i32_e32 v5, 31, v1
1336; GCN-NEXT:    v_ashrrev_i32_e32 v6, 31, v2
1337; GCN-NEXT:    v_ashrrev_i32_e32 v7, 31, v3
1338; GCN-NEXT:    v_lshrrev_b32_e32 v4, 30, v4
1339; GCN-NEXT:    v_lshrrev_b32_e32 v5, 30, v5
1340; GCN-NEXT:    v_lshrrev_b32_e32 v6, 30, v6
1341; GCN-NEXT:    v_lshrrev_b32_e32 v7, 30, v7
1342; GCN-NEXT:    v_add_i32_e32 v0, vcc, v4, v0
1343; GCN-NEXT:    v_add_i32_e32 v1, vcc, v5, v1
1344; GCN-NEXT:    v_add_i32_e32 v2, vcc, v6, v2
1345; GCN-NEXT:    v_add_i32_e32 v3, vcc, v7, v3
1346; GCN-NEXT:    v_ashrrev_i32_e32 v0, 2, v0
1347; GCN-NEXT:    v_ashrrev_i32_e32 v1, 2, v1
1348; GCN-NEXT:    v_ashrrev_i32_e32 v2, 2, v2
1349; GCN-NEXT:    v_ashrrev_i32_e32 v3, 2, v3
1350; GCN-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
1351; GCN-NEXT:    s_endpgm
1352;
1353; TONGA-LABEL: sdiv_v4i32_4:
1354; TONGA:       ; %bb.0:
1355; TONGA-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1356; TONGA-NEXT:    s_mov_b32 s7, 0xf000
1357; TONGA-NEXT:    s_mov_b32 s6, -1
1358; TONGA-NEXT:    s_mov_b32 s10, s6
1359; TONGA-NEXT:    s_mov_b32 s11, s7
1360; TONGA-NEXT:    s_waitcnt lgkmcnt(0)
1361; TONGA-NEXT:    s_mov_b32 s8, s2
1362; TONGA-NEXT:    s_mov_b32 s9, s3
1363; TONGA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
1364; TONGA-NEXT:    s_mov_b32 s4, s0
1365; TONGA-NEXT:    s_mov_b32 s5, s1
1366; TONGA-NEXT:    s_waitcnt vmcnt(0)
1367; TONGA-NEXT:    v_ashrrev_i32_e32 v4, 31, v0
1368; TONGA-NEXT:    v_ashrrev_i32_e32 v5, 31, v1
1369; TONGA-NEXT:    v_ashrrev_i32_e32 v6, 31, v2
1370; TONGA-NEXT:    v_ashrrev_i32_e32 v7, 31, v3
1371; TONGA-NEXT:    v_lshrrev_b32_e32 v4, 30, v4
1372; TONGA-NEXT:    v_lshrrev_b32_e32 v5, 30, v5
1373; TONGA-NEXT:    v_lshrrev_b32_e32 v6, 30, v6
1374; TONGA-NEXT:    v_lshrrev_b32_e32 v7, 30, v7
1375; TONGA-NEXT:    v_add_u32_e32 v0, vcc, v4, v0
1376; TONGA-NEXT:    v_add_u32_e32 v1, vcc, v5, v1
1377; TONGA-NEXT:    v_add_u32_e32 v2, vcc, v6, v2
1378; TONGA-NEXT:    v_add_u32_e32 v3, vcc, v7, v3
1379; TONGA-NEXT:    v_ashrrev_i32_e32 v0, 2, v0
1380; TONGA-NEXT:    v_ashrrev_i32_e32 v1, 2, v1
1381; TONGA-NEXT:    v_ashrrev_i32_e32 v2, 2, v2
1382; TONGA-NEXT:    v_ashrrev_i32_e32 v3, 2, v3
1383; TONGA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
1384; TONGA-NEXT:    s_endpgm
1385;
1386; GFX9-LABEL: sdiv_v4i32_4:
1387; GFX9:       ; %bb.0:
1388; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1389; GFX9-NEXT:    s_mov_b32 s7, 0xf000
1390; GFX9-NEXT:    s_mov_b32 s6, -1
1391; GFX9-NEXT:    s_mov_b32 s10, s6
1392; GFX9-NEXT:    s_mov_b32 s11, s7
1393; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1394; GFX9-NEXT:    s_mov_b32 s8, s2
1395; GFX9-NEXT:    s_mov_b32 s9, s3
1396; GFX9-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
1397; GFX9-NEXT:    s_mov_b32 s4, s0
1398; GFX9-NEXT:    s_mov_b32 s5, s1
1399; GFX9-NEXT:    s_waitcnt vmcnt(0)
1400; GFX9-NEXT:    v_ashrrev_i32_e32 v4, 31, v0
1401; GFX9-NEXT:    v_ashrrev_i32_e32 v5, 31, v1
1402; GFX9-NEXT:    v_ashrrev_i32_e32 v6, 31, v2
1403; GFX9-NEXT:    v_ashrrev_i32_e32 v7, 31, v3
1404; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 30, v4
1405; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 30, v5
1406; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 30, v6
1407; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 30, v7
1408; GFX9-NEXT:    v_add_u32_e32 v0, v0, v4
1409; GFX9-NEXT:    v_add_u32_e32 v1, v1, v5
1410; GFX9-NEXT:    v_add_u32_e32 v2, v2, v6
1411; GFX9-NEXT:    v_add_u32_e32 v3, v3, v7
1412; GFX9-NEXT:    v_ashrrev_i32_e32 v0, 2, v0
1413; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 2, v1
1414; GFX9-NEXT:    v_ashrrev_i32_e32 v2, 2, v2
1415; GFX9-NEXT:    v_ashrrev_i32_e32 v3, 2, v3
1416; GFX9-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
1417; GFX9-NEXT:    s_endpgm
1418;
1419; EG-LABEL: sdiv_v4i32_4:
1420; EG:       ; %bb.0:
1421; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
1422; EG-NEXT:    TEX 0 @6
1423; EG-NEXT:    ALU 24, @9, KC0[CB0:0-32], KC1[]
1424; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T0.X, 1
1425; EG-NEXT:    CF_END
1426; EG-NEXT:    PAD
1427; EG-NEXT:    Fetch clause starting at 6:
1428; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 0, #1
1429; EG-NEXT:    ALU clause starting at 8:
1430; EG-NEXT:     MOV * T0.X, KC0[2].Z,
1431; EG-NEXT:    ALU clause starting at 9:
1432; EG-NEXT:     ASHR T1.W, T0.W, literal.x,
1433; EG-NEXT:     ASHR * T2.W, T0.Z, literal.x,
1434; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
1435; EG-NEXT:     LSHR * T1.W, PV.W, literal.x,
1436; EG-NEXT:    30(4.203895e-44), 0(0.000000e+00)
1437; EG-NEXT:     ADD_INT T1.Z, T0.W, PV.W,
1438; EG-NEXT:     LSHR T0.W, T2.W, literal.x, BS:VEC_120/SCL_212
1439; EG-NEXT:     ASHR * T1.W, T0.Y, literal.y,
1440; EG-NEXT:    30(4.203895e-44), 31(4.344025e-44)
1441; EG-NEXT:     LSHR T1.Y, PS, literal.x,
1442; EG-NEXT:     ASHR T2.Z, T0.X, literal.y,
1443; EG-NEXT:     ADD_INT T0.W, T0.Z, PV.W,
1444; EG-NEXT:     ASHR * T1.W, PV.Z, literal.z,
1445; EG-NEXT:    30(4.203895e-44), 31(4.344025e-44)
1446; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1447; EG-NEXT:     ASHR T1.Z, PV.W, literal.x,
1448; EG-NEXT:     LSHR T0.W, PV.Z, literal.y,
1449; EG-NEXT:     ADD_INT * T2.W, T0.Y, PV.Y,
1450; EG-NEXT:    2(2.802597e-45), 30(4.203895e-44)
1451; EG-NEXT:     ASHR T1.Y, PS, literal.x,
1452; EG-NEXT:     ADD_INT * T0.W, T0.X, PV.W,
1453; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1454; EG-NEXT:     ASHR T1.X, PV.W, literal.x,
1455; EG-NEXT:     LSHR * T0.X, KC0[2].Y, literal.x,
1456; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1457  %num = load <4 x i32>, <4 x i32> addrspace(1) * %in
  ; Operation under test: signed division by a constant splat of 4.
1458  %result = sdiv <4 x i32> %num, <i32 4, i32 4, i32 4, i32 4>
1459  store <4 x i32> %result, <4 x i32> addrspace(1)* %out
1460  ret void
1461}
1462
; v_sdiv_i8: signed i8 division. The numerator is loaded from %in and the
; denominator from the next byte (%in + 1); the i8 quotient is sign-extended
; to i32 and stored to %out.
; In the checks below every target expands the divide through a float
; reciprocal (int-to-float convert, reciprocal, multiply, trunc, then a
; compare/select correction) and sign-extends the low 8 bits of the result
; before the store.
1463define amdgpu_kernel void @v_sdiv_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
1464; GCN-LABEL: v_sdiv_i8:
1465; GCN:       ; %bb.0:
1466; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
1467; GCN-NEXT:    s_mov_b32 s7, 0xf000
1468; GCN-NEXT:    s_mov_b32 s6, -1
1469; GCN-NEXT:    s_mov_b32 s10, s6
1470; GCN-NEXT:    s_mov_b32 s11, s7
1471; GCN-NEXT:    s_waitcnt lgkmcnt(0)
1472; GCN-NEXT:    s_mov_b32 s8, s2
1473; GCN-NEXT:    s_mov_b32 s9, s3
1474; GCN-NEXT:    buffer_load_sbyte v0, off, s[8:11], 0 offset:1
1475; GCN-NEXT:    buffer_load_sbyte v1, off, s[8:11], 0
1476; GCN-NEXT:    s_mov_b32 s4, s0
1477; GCN-NEXT:    s_mov_b32 s5, s1
1478; GCN-NEXT:    s_waitcnt vmcnt(1)
1479; GCN-NEXT:    v_cvt_f32_i32_e32 v2, v0
1480; GCN-NEXT:    s_waitcnt vmcnt(0)
1481; GCN-NEXT:    v_cvt_f32_i32_e32 v3, v1
1482; GCN-NEXT:    v_xor_b32_e32 v0, v1, v0
1483; GCN-NEXT:    v_ashrrev_i32_e32 v0, 30, v0
1484; GCN-NEXT:    v_rcp_iflag_f32_e32 v4, v2
1485; GCN-NEXT:    v_or_b32_e32 v0, 1, v0
1486; GCN-NEXT:    v_mul_f32_e32 v1, v3, v4
1487; GCN-NEXT:    v_trunc_f32_e32 v1, v1
1488; GCN-NEXT:    v_mad_f32 v3, -v1, v2, v3
1489; GCN-NEXT:    v_cvt_i32_f32_e32 v1, v1
1490; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, |v2|
1491; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
1492; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
1493; GCN-NEXT:    v_bfe_i32 v0, v0, 0, 8
1494; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1495; GCN-NEXT:    s_endpgm
1496;
1497; TONGA-LABEL: v_sdiv_i8:
1498; TONGA:       ; %bb.0:
1499; TONGA-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1500; TONGA-NEXT:    s_mov_b32 s7, 0xf000
1501; TONGA-NEXT:    s_mov_b32 s6, -1
1502; TONGA-NEXT:    s_mov_b32 s10, s6
1503; TONGA-NEXT:    s_mov_b32 s11, s7
1504; TONGA-NEXT:    s_waitcnt lgkmcnt(0)
1505; TONGA-NEXT:    s_mov_b32 s8, s2
1506; TONGA-NEXT:    s_mov_b32 s9, s3
1507; TONGA-NEXT:    buffer_load_sbyte v0, off, s[8:11], 0 offset:1
1508; TONGA-NEXT:    buffer_load_sbyte v1, off, s[8:11], 0
1509; TONGA-NEXT:    s_mov_b32 s4, s0
1510; TONGA-NEXT:    s_mov_b32 s5, s1
1511; TONGA-NEXT:    s_waitcnt vmcnt(1)
1512; TONGA-NEXT:    v_cvt_f32_i32_e32 v2, v0
1513; TONGA-NEXT:    s_waitcnt vmcnt(0)
1514; TONGA-NEXT:    v_cvt_f32_i32_e32 v3, v1
1515; TONGA-NEXT:    v_xor_b32_e32 v0, v1, v0
1516; TONGA-NEXT:    v_ashrrev_i32_e32 v0, 30, v0
1517; TONGA-NEXT:    v_rcp_iflag_f32_e32 v4, v2
1518; TONGA-NEXT:    v_or_b32_e32 v0, 1, v0
1519; TONGA-NEXT:    v_mul_f32_e32 v1, v3, v4
1520; TONGA-NEXT:    v_trunc_f32_e32 v1, v1
1521; TONGA-NEXT:    v_mad_f32 v3, -v1, v2, v3
1522; TONGA-NEXT:    v_cvt_i32_f32_e32 v1, v1
1523; TONGA-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, |v2|
1524; TONGA-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
1525; TONGA-NEXT:    v_add_u32_e32 v0, vcc, v1, v0
1526; TONGA-NEXT:    v_bfe_i32 v0, v0, 0, 8
1527; TONGA-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1528; TONGA-NEXT:    s_endpgm
1529;
1530; GFX9-LABEL: v_sdiv_i8:
1531; GFX9:       ; %bb.0:
1532; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1533; GFX9-NEXT:    s_mov_b32 s7, 0xf000
1534; GFX9-NEXT:    s_mov_b32 s6, -1
1535; GFX9-NEXT:    s_mov_b32 s10, s6
1536; GFX9-NEXT:    s_mov_b32 s11, s7
1537; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1538; GFX9-NEXT:    s_mov_b32 s8, s2
1539; GFX9-NEXT:    s_mov_b32 s9, s3
1540; GFX9-NEXT:    buffer_load_sbyte v0, off, s[8:11], 0 offset:1
1541; GFX9-NEXT:    buffer_load_sbyte v1, off, s[8:11], 0
1542; GFX9-NEXT:    s_mov_b32 s4, s0
1543; GFX9-NEXT:    s_mov_b32 s5, s1
1544; GFX9-NEXT:    s_waitcnt vmcnt(1)
1545; GFX9-NEXT:    v_cvt_f32_i32_e32 v2, v0
1546; GFX9-NEXT:    s_waitcnt vmcnt(0)
1547; GFX9-NEXT:    v_cvt_f32_i32_e32 v3, v1
1548; GFX9-NEXT:    v_xor_b32_e32 v0, v1, v0
1549; GFX9-NEXT:    v_ashrrev_i32_e32 v0, 30, v0
1550; GFX9-NEXT:    v_rcp_iflag_f32_e32 v4, v2
1551; GFX9-NEXT:    v_or_b32_e32 v0, 1, v0
1552; GFX9-NEXT:    v_mul_f32_e32 v1, v3, v4
1553; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
1554; GFX9-NEXT:    v_cvt_i32_f32_e32 v4, v1
1555; GFX9-NEXT:    v_mad_f32 v1, -v1, v2, v3
1556; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v2|
1557; GFX9-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
1558; GFX9-NEXT:    v_add_u32_e32 v0, v4, v0
1559; GFX9-NEXT:    v_bfe_i32 v0, v0, 0, 8
1560; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1561; GFX9-NEXT:    s_endpgm
1562;
1563; EG-LABEL: v_sdiv_i8:
1564; EG:       ; %bb.0:
1565; EG-NEXT:    ALU 0, @10, KC0[CB0:0-32], KC1[]
1566; EG-NEXT:    TEX 1 @6
1567; EG-NEXT:    ALU 21, @11, KC0[CB0:0-32], KC1[]
1568; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
1569; EG-NEXT:    CF_END
1570; EG-NEXT:    PAD
1571; EG-NEXT:    Fetch clause starting at 6:
1572; EG-NEXT:     VTX_READ_8 T1.X, T0.X, 1, #1
1573; EG-NEXT:     VTX_READ_8 T0.X, T0.X, 0, #1
1574; EG-NEXT:    ALU clause starting at 10:
1575; EG-NEXT:     MOV * T0.X, KC0[2].Z,
1576; EG-NEXT:    ALU clause starting at 11:
1577; EG-NEXT:     BFE_INT * T0.W, T1.X, 0.0, literal.x,
1578; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
1579; EG-NEXT:     INT_TO_FLT * T0.Y, PV.W,
1580; EG-NEXT:     BFE_INT T1.W, T0.X, 0.0, literal.x,
1581; EG-NEXT:     RECIP_IEEE * T0.X, PS,
1582; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
1583; EG-NEXT:     INT_TO_FLT * T0.Z, PV.W,
1584; EG-NEXT:     MUL_IEEE * T2.W, PS, T0.X,
1585; EG-NEXT:     TRUNC T2.W, PV.W,
1586; EG-NEXT:     XOR_INT * T0.W, T1.W, T0.W,
1587; EG-NEXT:     ASHR T0.W, PS, literal.x,
1588; EG-NEXT:     MULADD_IEEE * T1.W, -PV.W, T0.Y, T0.Z,
1589; EG-NEXT:    30(4.203895e-44), 0(0.000000e+00)
1590; EG-NEXT:     TRUNC T0.Z, T2.W,
1591; EG-NEXT:     SETGE T1.W, |PS|, |T0.Y|,
1592; EG-NEXT:     OR_INT * T0.W, PV.W, 1,
1593; EG-NEXT:     CNDE T0.W, PV.W, 0.0, PS,
1594; EG-NEXT:     FLT_TO_INT * T1.W, PV.Z,
1595; EG-NEXT:     ADD_INT * T0.W, PS, PV.W,
1596; EG-NEXT:     BFE_INT T0.X, PV.W, 0.0, literal.x,
1597; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
1598; EG-NEXT:    8(1.121039e-44), 2(2.802597e-45)
1599  %den_ptr = getelementptr i8, i8 addrspace(1)* %in, i8 1
1600  %num = load i8, i8 addrspace(1) * %in
1601  %den = load i8, i8 addrspace(1) * %den_ptr
1602  %result = sdiv i8 %num, %den
1603  %result.ext = sext i8 %result to i32
1604  store i32 %result.ext, i32 addrspace(1)* %out
1605  ret void
1606}
1607
; v_sdiv_i23: signed i23 division. The numerator is loaded from %in and the
; denominator from the following i23 element (the checks read it at byte
; offsets 4 and 6); the i23 quotient is sign-extended to i32 and stored to
; %out.
; In the checks below every target uses the float-reciprocal expansion
; (int-to-float convert, reciprocal, multiply, trunc, then a compare/select
; correction) and sign-extends the low 23 bits of the result before the store.
1608define amdgpu_kernel void @v_sdiv_i23(i32 addrspace(1)* %out, i23 addrspace(1)* %in) {
1609; GCN-LABEL: v_sdiv_i23:
1610; GCN:       ; %bb.0:
1611; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
1612; GCN-NEXT:    s_mov_b32 s7, 0xf000
1613; GCN-NEXT:    s_mov_b32 s6, -1
1614; GCN-NEXT:    s_mov_b32 s10, s6
1615; GCN-NEXT:    s_mov_b32 s11, s7
1616; GCN-NEXT:    s_waitcnt lgkmcnt(0)
1617; GCN-NEXT:    s_mov_b32 s8, s2
1618; GCN-NEXT:    s_mov_b32 s9, s3
1619; GCN-NEXT:    buffer_load_ubyte v0, off, s[8:11], 0 offset:2
1620; GCN-NEXT:    buffer_load_ubyte v1, off, s[8:11], 0 offset:6
1621; GCN-NEXT:    buffer_load_ushort v2, off, s[8:11], 0 offset:4
1622; GCN-NEXT:    buffer_load_ushort v3, off, s[8:11], 0
1623; GCN-NEXT:    s_mov_b32 s4, s0
1624; GCN-NEXT:    s_mov_b32 s5, s1
1625; GCN-NEXT:    s_waitcnt vmcnt(3)
1626; GCN-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
1627; GCN-NEXT:    s_waitcnt vmcnt(2)
1628; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
1629; GCN-NEXT:    s_waitcnt vmcnt(1)
1630; GCN-NEXT:    v_or_b32_e32 v1, v2, v1
1631; GCN-NEXT:    v_bfe_i32 v1, v1, 0, 23
1632; GCN-NEXT:    v_cvt_f32_i32_e32 v2, v1
1633; GCN-NEXT:    s_waitcnt vmcnt(0)
1634; GCN-NEXT:    v_or_b32_e32 v0, v3, v0
1635; GCN-NEXT:    v_bfe_i32 v0, v0, 0, 23
1636; GCN-NEXT:    v_cvt_f32_i32_e32 v3, v0
1637; GCN-NEXT:    v_rcp_iflag_f32_e32 v4, v2
1638; GCN-NEXT:    v_xor_b32_e32 v0, v0, v1
1639; GCN-NEXT:    v_ashrrev_i32_e32 v0, 30, v0
1640; GCN-NEXT:    v_or_b32_e32 v0, 1, v0
1641; GCN-NEXT:    v_mul_f32_e32 v1, v3, v4
1642; GCN-NEXT:    v_trunc_f32_e32 v1, v1
1643; GCN-NEXT:    v_mad_f32 v3, -v1, v2, v3
1644; GCN-NEXT:    v_cvt_i32_f32_e32 v1, v1
1645; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, |v2|
1646; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
1647; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
1648; GCN-NEXT:    v_bfe_i32 v0, v0, 0, 23
1649; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1650; GCN-NEXT:    s_endpgm
1651;
1652; TONGA-LABEL: v_sdiv_i23:
1653; TONGA:       ; %bb.0:
1654; TONGA-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1655; TONGA-NEXT:    s_mov_b32 s7, 0xf000
1656; TONGA-NEXT:    s_mov_b32 s6, -1
1657; TONGA-NEXT:    s_mov_b32 s10, s6
1658; TONGA-NEXT:    s_mov_b32 s11, s7
1659; TONGA-NEXT:    s_waitcnt lgkmcnt(0)
1660; TONGA-NEXT:    s_mov_b32 s8, s2
1661; TONGA-NEXT:    s_mov_b32 s9, s3
1662; TONGA-NEXT:    buffer_load_ubyte v0, off, s[8:11], 0 offset:2
1663; TONGA-NEXT:    buffer_load_ubyte v1, off, s[8:11], 0 offset:6
1664; TONGA-NEXT:    buffer_load_ushort v2, off, s[8:11], 0 offset:4
1665; TONGA-NEXT:    buffer_load_ushort v3, off, s[8:11], 0
1666; TONGA-NEXT:    s_mov_b32 s4, s0
1667; TONGA-NEXT:    s_mov_b32 s5, s1
1668; TONGA-NEXT:    s_waitcnt vmcnt(3)
1669; TONGA-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
1670; TONGA-NEXT:    s_waitcnt vmcnt(2)
1671; TONGA-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
1672; TONGA-NEXT:    s_waitcnt vmcnt(1)
1673; TONGA-NEXT:    v_or_b32_e32 v1, v2, v1
1674; TONGA-NEXT:    v_bfe_i32 v1, v1, 0, 23
1675; TONGA-NEXT:    v_cvt_f32_i32_e32 v2, v1
1676; TONGA-NEXT:    s_waitcnt vmcnt(0)
1677; TONGA-NEXT:    v_or_b32_e32 v0, v3, v0
1678; TONGA-NEXT:    v_bfe_i32 v0, v0, 0, 23
1679; TONGA-NEXT:    v_cvt_f32_i32_e32 v3, v0
1680; TONGA-NEXT:    v_rcp_iflag_f32_e32 v4, v2
1681; TONGA-NEXT:    v_xor_b32_e32 v0, v0, v1
1682; TONGA-NEXT:    v_ashrrev_i32_e32 v0, 30, v0
1683; TONGA-NEXT:    v_or_b32_e32 v0, 1, v0
1684; TONGA-NEXT:    v_mul_f32_e32 v1, v3, v4
1685; TONGA-NEXT:    v_trunc_f32_e32 v1, v1
1686; TONGA-NEXT:    v_mad_f32 v3, -v1, v2, v3
1687; TONGA-NEXT:    v_cvt_i32_f32_e32 v1, v1
1688; TONGA-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, |v2|
1689; TONGA-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
1690; TONGA-NEXT:    v_add_u32_e32 v0, vcc, v1, v0
1691; TONGA-NEXT:    v_bfe_i32 v0, v0, 0, 23
1692; TONGA-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1693; TONGA-NEXT:    s_endpgm
1694;
1695; GFX9-LABEL: v_sdiv_i23:
1696; GFX9:       ; %bb.0:
1697; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1698; GFX9-NEXT:    s_mov_b32 s7, 0xf000
1699; GFX9-NEXT:    s_mov_b32 s6, -1
1700; GFX9-NEXT:    s_mov_b32 s10, s6
1701; GFX9-NEXT:    s_mov_b32 s11, s7
1702; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1703; GFX9-NEXT:    s_mov_b32 s8, s2
1704; GFX9-NEXT:    s_mov_b32 s9, s3
1705; GFX9-NEXT:    buffer_load_ubyte v0, off, s[8:11], 0 offset:2
1706; GFX9-NEXT:    buffer_load_ubyte v1, off, s[8:11], 0 offset:6
1707; GFX9-NEXT:    buffer_load_ushort v2, off, s[8:11], 0 offset:4
1708; GFX9-NEXT:    buffer_load_ushort v3, off, s[8:11], 0
1709; GFX9-NEXT:    s_mov_b32 s4, s0
1710; GFX9-NEXT:    s_mov_b32 s5, s1
1711; GFX9-NEXT:    s_waitcnt vmcnt(3)
1712; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
1713; GFX9-NEXT:    s_waitcnt vmcnt(2)
1714; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
1715; GFX9-NEXT:    s_waitcnt vmcnt(1)
1716; GFX9-NEXT:    v_or_b32_e32 v1, v2, v1
1717; GFX9-NEXT:    v_bfe_i32 v1, v1, 0, 23
1718; GFX9-NEXT:    v_cvt_f32_i32_e32 v2, v1
1719; GFX9-NEXT:    s_waitcnt vmcnt(0)
1720; GFX9-NEXT:    v_or_b32_e32 v0, v3, v0
1721; GFX9-NEXT:    v_bfe_i32 v0, v0, 0, 23
1722; GFX9-NEXT:    v_cvt_f32_i32_e32 v3, v0
1723; GFX9-NEXT:    v_rcp_iflag_f32_e32 v4, v2
1724; GFX9-NEXT:    v_xor_b32_e32 v0, v0, v1
1725; GFX9-NEXT:    v_ashrrev_i32_e32 v0, 30, v0
1726; GFX9-NEXT:    v_or_b32_e32 v0, 1, v0
1727; GFX9-NEXT:    v_mul_f32_e32 v1, v3, v4
1728; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
1729; GFX9-NEXT:    v_cvt_i32_f32_e32 v4, v1
1730; GFX9-NEXT:    v_mad_f32 v1, -v1, v2, v3
1731; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v2|
1732; GFX9-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
1733; GFX9-NEXT:    v_add_u32_e32 v0, v4, v0
1734; GFX9-NEXT:    v_bfe_i32 v0, v0, 0, 23
1735; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1736; GFX9-NEXT:    s_endpgm
1737;
1738; EG-LABEL: v_sdiv_i23:
1739; EG:       ; %bb.0:
1740; EG-NEXT:    ALU 0, @14, KC0[CB0:0-32], KC1[]
1741; EG-NEXT:    TEX 3 @6
1742; EG-NEXT:    ALU 33, @15, KC0[CB0:0-32], KC1[]
1743; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
1744; EG-NEXT:    CF_END
1745; EG-NEXT:    PAD
1746; EG-NEXT:    Fetch clause starting at 6:
1747; EG-NEXT:     VTX_READ_8 T1.X, T0.X, 6, #1
1748; EG-NEXT:     VTX_READ_16 T2.X, T0.X, 0, #1
1749; EG-NEXT:     VTX_READ_8 T3.X, T0.X, 2, #1
1750; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 4, #1
1751; EG-NEXT:    ALU clause starting at 14:
1752; EG-NEXT:     MOV * T0.X, KC0[2].Z,
1753; EG-NEXT:    ALU clause starting at 15:
1754; EG-NEXT:     LSHL * T0.W, T1.X, literal.x,
1755; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
1756; EG-NEXT:     OR_INT T0.W, T0.X, PV.W,
1757; EG-NEXT:     LSHL * T1.W, T3.X, literal.x,
1758; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
1759; EG-NEXT:     LSHL * T0.W, PV.W, literal.x,
1760; EG-NEXT:    9(1.261169e-44), 0(0.000000e+00)
1761; EG-NEXT:     ASHR T0.W, PV.W, literal.x,
1762; EG-NEXT:     OR_INT * T1.W, T2.X, T1.W,
1763; EG-NEXT:    9(1.261169e-44), 0(0.000000e+00)
1764; EG-NEXT:     LSHL T1.W, PS, literal.x,
1765; EG-NEXT:     INT_TO_FLT * T0.X, PV.W,
1766; EG-NEXT:    9(1.261169e-44), 0(0.000000e+00)
1767; EG-NEXT:     ASHR T1.W, PV.W, literal.x,
1768; EG-NEXT:     RECIP_IEEE * T0.Y, PS,
1769; EG-NEXT:    9(1.261169e-44), 0(0.000000e+00)
1770; EG-NEXT:     INT_TO_FLT * T0.Z, PV.W,
1771; EG-NEXT:     MUL_IEEE * T2.W, PS, T0.Y,
1772; EG-NEXT:     TRUNC T2.W, PV.W,
1773; EG-NEXT:     XOR_INT * T0.W, T1.W, T0.W,
1774; EG-NEXT:     ASHR T0.W, PS, literal.x,
1775; EG-NEXT:     MULADD_IEEE * T1.W, -PV.W, T0.X, T0.Z,
1776; EG-NEXT:    30(4.203895e-44), 0(0.000000e+00)
1777; EG-NEXT:     TRUNC T0.Z, T2.W,
1778; EG-NEXT:     SETGE T1.W, |PS|, |T0.X|,
1779; EG-NEXT:     OR_INT * T0.W, PV.W, 1,
1780; EG-NEXT:     CNDE T0.W, PV.W, 0.0, PS,
1781; EG-NEXT:     FLT_TO_INT * T1.W, PV.Z,
1782; EG-NEXT:     ADD_INT * T0.W, PS, PV.W,
1783; EG-NEXT:     LSHL * T0.W, PV.W, literal.x,
1784; EG-NEXT:    9(1.261169e-44), 0(0.000000e+00)
1785; EG-NEXT:     ASHR T0.X, PV.W, literal.x,
1786; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
1787; EG-NEXT:    9(1.261169e-44), 2(2.802597e-45)
1788  %den_ptr = getelementptr i23, i23 addrspace(1)* %in, i23 1
1789  %num = load i23, i23 addrspace(1) * %in
1790  %den = load i23, i23 addrspace(1) * %den_ptr
1791  %result = sdiv i23 %num, %den
1792  %result.ext = sext i23 %result to i32
1793  store i32 %result.ext, i32 addrspace(1)* %out
1794  ret void
1795}
1796
; v_sdiv_i24: signed i24 division. The numerator is loaded from %in and the
; denominator from the following i24 element (the checks read it at byte
; offsets 4 and 6); the i24 quotient is sign-extended to i32 and stored to
; %out.
; In the checks below the three amdgcn runs (gfx600, tonga, gfx900) use the
; float-reciprocal expansion, while the r600 run shows the integer expansion
; (RECIP_UINT, MULHI / MULLO_INT and compare/select correction steps). All of
; them sign-extend the low 24 bits of the result before the store.
1797define amdgpu_kernel void @v_sdiv_i24(i32 addrspace(1)* %out, i24 addrspace(1)* %in) {
1798; GCN-LABEL: v_sdiv_i24:
1799; GCN:       ; %bb.0:
1800; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
1801; GCN-NEXT:    s_mov_b32 s7, 0xf000
1802; GCN-NEXT:    s_mov_b32 s6, -1
1803; GCN-NEXT:    s_mov_b32 s10, s6
1804; GCN-NEXT:    s_mov_b32 s11, s7
1805; GCN-NEXT:    s_waitcnt lgkmcnt(0)
1806; GCN-NEXT:    s_mov_b32 s8, s2
1807; GCN-NEXT:    s_mov_b32 s9, s3
1808; GCN-NEXT:    buffer_load_sbyte v0, off, s[8:11], 0 offset:6
1809; GCN-NEXT:    buffer_load_ushort v1, off, s[8:11], 0 offset:4
1810; GCN-NEXT:    buffer_load_sbyte v2, off, s[8:11], 0 offset:2
1811; GCN-NEXT:    buffer_load_ushort v3, off, s[8:11], 0
1812; GCN-NEXT:    s_mov_b32 s4, s0
1813; GCN-NEXT:    s_mov_b32 s5, s1
1814; GCN-NEXT:    s_waitcnt vmcnt(3)
1815; GCN-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
1816; GCN-NEXT:    s_waitcnt vmcnt(2)
1817; GCN-NEXT:    v_or_b32_e32 v1, v1, v4
1818; GCN-NEXT:    v_cvt_f32_i32_e32 v1, v1
1819; GCN-NEXT:    s_waitcnt vmcnt(1)
1820; GCN-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
1821; GCN-NEXT:    s_waitcnt vmcnt(0)
1822; GCN-NEXT:    v_or_b32_e32 v3, v3, v4
1823; GCN-NEXT:    v_cvt_f32_i32_e32 v3, v3
1824; GCN-NEXT:    v_rcp_iflag_f32_e32 v4, v1
1825; GCN-NEXT:    v_xor_b32_e32 v0, v2, v0
1826; GCN-NEXT:    v_ashrrev_i32_e32 v0, 30, v0
1827; GCN-NEXT:    v_or_b32_e32 v0, 1, v0
1828; GCN-NEXT:    v_mul_f32_e32 v2, v3, v4
1829; GCN-NEXT:    v_trunc_f32_e32 v2, v2
1830; GCN-NEXT:    v_mad_f32 v3, -v2, v1, v3
1831; GCN-NEXT:    v_cvt_i32_f32_e32 v2, v2
1832; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, |v1|
1833; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
1834; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
1835; GCN-NEXT:    v_bfe_i32 v0, v0, 0, 24
1836; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1837; GCN-NEXT:    s_endpgm
1838;
1839; TONGA-LABEL: v_sdiv_i24:
1840; TONGA:       ; %bb.0:
1841; TONGA-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1842; TONGA-NEXT:    s_mov_b32 s7, 0xf000
1843; TONGA-NEXT:    s_mov_b32 s6, -1
1844; TONGA-NEXT:    s_mov_b32 s10, s6
1845; TONGA-NEXT:    s_mov_b32 s11, s7
1846; TONGA-NEXT:    s_waitcnt lgkmcnt(0)
1847; TONGA-NEXT:    s_mov_b32 s8, s2
1848; TONGA-NEXT:    s_mov_b32 s9, s3
1849; TONGA-NEXT:    buffer_load_sbyte v0, off, s[8:11], 0 offset:6
1850; TONGA-NEXT:    buffer_load_ushort v1, off, s[8:11], 0 offset:4
1851; TONGA-NEXT:    buffer_load_sbyte v2, off, s[8:11], 0 offset:2
1852; TONGA-NEXT:    buffer_load_ushort v3, off, s[8:11], 0
1853; TONGA-NEXT:    s_mov_b32 s4, s0
1854; TONGA-NEXT:    s_mov_b32 s5, s1
1855; TONGA-NEXT:    s_waitcnt vmcnt(3)
1856; TONGA-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
1857; TONGA-NEXT:    s_waitcnt vmcnt(2)
1858; TONGA-NEXT:    v_or_b32_e32 v1, v1, v4
1859; TONGA-NEXT:    v_cvt_f32_i32_e32 v1, v1
1860; TONGA-NEXT:    s_waitcnt vmcnt(1)
1861; TONGA-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
1862; TONGA-NEXT:    s_waitcnt vmcnt(0)
1863; TONGA-NEXT:    v_or_b32_e32 v3, v3, v4
1864; TONGA-NEXT:    v_cvt_f32_i32_e32 v3, v3
1865; TONGA-NEXT:    v_rcp_iflag_f32_e32 v4, v1
1866; TONGA-NEXT:    v_xor_b32_e32 v0, v2, v0
1867; TONGA-NEXT:    v_ashrrev_i32_e32 v0, 30, v0
1868; TONGA-NEXT:    v_or_b32_e32 v0, 1, v0
1869; TONGA-NEXT:    v_mul_f32_e32 v2, v3, v4
1870; TONGA-NEXT:    v_trunc_f32_e32 v2, v2
1871; TONGA-NEXT:    v_mad_f32 v3, -v2, v1, v3
1872; TONGA-NEXT:    v_cvt_i32_f32_e32 v2, v2
1873; TONGA-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, |v1|
1874; TONGA-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
1875; TONGA-NEXT:    v_add_u32_e32 v0, vcc, v2, v0
1876; TONGA-NEXT:    v_bfe_i32 v0, v0, 0, 24
1877; TONGA-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1878; TONGA-NEXT:    s_endpgm
1879;
1880; GFX9-LABEL: v_sdiv_i24:
1881; GFX9:       ; %bb.0:
1882; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1883; GFX9-NEXT:    s_mov_b32 s7, 0xf000
1884; GFX9-NEXT:    s_mov_b32 s6, -1
1885; GFX9-NEXT:    s_mov_b32 s10, s6
1886; GFX9-NEXT:    s_mov_b32 s11, s7
1887; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1888; GFX9-NEXT:    s_mov_b32 s8, s2
1889; GFX9-NEXT:    s_mov_b32 s9, s3
1890; GFX9-NEXT:    buffer_load_sbyte v0, off, s[8:11], 0 offset:6
1891; GFX9-NEXT:    buffer_load_ushort v1, off, s[8:11], 0 offset:4
1892; GFX9-NEXT:    buffer_load_sbyte v2, off, s[8:11], 0 offset:2
1893; GFX9-NEXT:    buffer_load_ushort v3, off, s[8:11], 0
1894; GFX9-NEXT:    s_mov_b32 s4, s0
1895; GFX9-NEXT:    s_mov_b32 s5, s1
1896; GFX9-NEXT:    s_waitcnt vmcnt(3)
1897; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
1898; GFX9-NEXT:    s_waitcnt vmcnt(2)
1899; GFX9-NEXT:    v_or_b32_e32 v1, v1, v4
1900; GFX9-NEXT:    v_cvt_f32_i32_e32 v1, v1
1901; GFX9-NEXT:    s_waitcnt vmcnt(1)
1902; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
1903; GFX9-NEXT:    s_waitcnt vmcnt(0)
1904; GFX9-NEXT:    v_or_b32_e32 v3, v3, v4
1905; GFX9-NEXT:    v_cvt_f32_i32_e32 v3, v3
1906; GFX9-NEXT:    v_rcp_iflag_f32_e32 v4, v1
1907; GFX9-NEXT:    v_xor_b32_e32 v0, v2, v0
1908; GFX9-NEXT:    v_ashrrev_i32_e32 v0, 30, v0
1909; GFX9-NEXT:    v_or_b32_e32 v0, 1, v0
1910; GFX9-NEXT:    v_mul_f32_e32 v2, v3, v4
1911; GFX9-NEXT:    v_trunc_f32_e32 v2, v2
1912; GFX9-NEXT:    v_cvt_i32_f32_e32 v4, v2
1913; GFX9-NEXT:    v_mad_f32 v2, -v2, v1, v3
1914; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, |v1|
1915; GFX9-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
1916; GFX9-NEXT:    v_add_u32_e32 v0, v4, v0
1917; GFX9-NEXT:    v_bfe_i32 v0, v0, 0, 24
1918; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1919; GFX9-NEXT:    s_endpgm
1920;
1921; EG-LABEL: v_sdiv_i24:
1922; EG:       ; %bb.0:
1923; EG-NEXT:    ALU 0, @14, KC0[CB0:0-32], KC1[]
1924; EG-NEXT:    TEX 3 @6
1925; EG-NEXT:    ALU 39, @15, KC0[CB0:0-32], KC1[]
1926; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
1927; EG-NEXT:    CF_END
1928; EG-NEXT:    PAD
1929; EG-NEXT:    Fetch clause starting at 6:
1930; EG-NEXT:     VTX_READ_8 T1.X, T0.X, 6, #1
1931; EG-NEXT:     VTX_READ_16 T2.X, T0.X, 0, #1
1932; EG-NEXT:     VTX_READ_8 T3.X, T0.X, 2, #1
1933; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 4, #1
1934; EG-NEXT:    ALU clause starting at 14:
1935; EG-NEXT:     MOV * T0.X, KC0[2].Z,
1936; EG-NEXT:    ALU clause starting at 15:
1937; EG-NEXT:     BFE_INT * T0.W, T1.X, 0.0, literal.x,
1938; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
1939; EG-NEXT:     LSHL * T0.W, PV.W, literal.x,
1940; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
1941; EG-NEXT:     OR_INT * T0.W, T0.X, PV.W,
1942; EG-NEXT:     SETGT_INT * T1.W, 0.0, PV.W,
1943; EG-NEXT:     BFE_INT T2.W, T3.X, 0.0, literal.x,
1944; EG-NEXT:     ADD_INT * T0.W, T0.W, PV.W,
1945; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
1946; EG-NEXT:     LSHL T2.W, PV.W, literal.x,
1947; EG-NEXT:     XOR_INT * T0.W, PS, T1.W,
1948; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
1949; EG-NEXT:     SUB_INT T0.Z, 0.0, PS,
1950; EG-NEXT:     OR_INT T2.W, T2.X, PV.W,
1951; EG-NEXT:     RECIP_UINT * T0.X, PS,
1952; EG-NEXT:     SETGT_INT T3.W, 0.0, PV.W,
1953; EG-NEXT:     MULLO_INT * T0.Y, PV.Z, PS,
1954; EG-NEXT:     ADD_INT T2.W, T2.W, PV.W,
1955; EG-NEXT:     MULHI * T0.Y, T0.X, PS,
1956; EG-NEXT:     ADD_INT T4.W, T0.X, PS,
1957; EG-NEXT:     XOR_INT * T2.W, PV.W, T3.W,
1958; EG-NEXT:     MULHI * T0.X, PS, PV.W,
1959; EG-NEXT:     MULLO_INT * T0.Y, PS, T0.W,
1960; EG-NEXT:     SUB_INT * T2.W, T2.W, PS,
1961; EG-NEXT:     ADD_INT T0.Z, T0.X, 1,
1962; EG-NEXT:     SETGE_UINT T4.W, PV.W, T0.W,
1963; EG-NEXT:     SUB_INT * T5.W, PV.W, T0.W,
1964; EG-NEXT:     CNDE_INT T2.W, PV.W, T2.W, PS,
1965; EG-NEXT:     CNDE_INT * T4.W, PV.W, T0.X, PV.Z,
1966; EG-NEXT:     ADD_INT T5.W, PS, 1,
1967; EG-NEXT:     SETGE_UINT * T0.W, PV.W, T0.W,
1968; EG-NEXT:     CNDE_INT T0.W, PS, T4.W, PV.W, BS:VEC_102/SCL_221
1969; EG-NEXT:     XOR_INT * T1.W, T3.W, T1.W,
1970; EG-NEXT:     XOR_INT * T0.W, PV.W, PS,
1971; EG-NEXT:     SUB_INT * T0.W, PV.W, T1.W,
1972; EG-NEXT:     LSHL * T0.W, PV.W, literal.x,
1973; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
1974; EG-NEXT:     ASHR T0.X, PV.W, literal.x,
1975; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
1976; EG-NEXT:    8(1.121039e-44), 2(2.802597e-45)
1977  %den_ptr = getelementptr i24, i24 addrspace(1)* %in, i24 1
1978  %num = load i24, i24 addrspace(1) * %in
1979  %den = load i24, i24 addrspace(1) * %den_ptr
1980  %result = sdiv i24 %num, %den
1981  %result.ext = sext i24 %result to i32
1982  store i32 %result.ext, i32 addrspace(1)* %out
1983  ret void
1984}
1985
; v_sdiv_i25: signed i25 division. The numerator is loaded from %in and the
; denominator from the following i25 element (the checks read the two
; operands as adjacent 32-bit words); the i25 quotient is sign-extended to
; i32 and stored to %out.
; In the checks below every target uses the integer expansion (unsigned
; reciprocal estimate, mul_hi / mul_lo, and compare/select correction steps)
; instead of the float-reciprocal path, and sign-extends the low 25 bits of
; the result before the store.
1986define amdgpu_kernel void @v_sdiv_i25(i32 addrspace(1)* %out, i25 addrspace(1)* %in) {
1987; GCN-LABEL: v_sdiv_i25:
1988; GCN:       ; %bb.0:
1989; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
1990; GCN-NEXT:    s_mov_b32 s7, 0xf000
1991; GCN-NEXT:    s_mov_b32 s6, -1
1992; GCN-NEXT:    s_mov_b32 s10, s6
1993; GCN-NEXT:    s_mov_b32 s11, s7
1994; GCN-NEXT:    s_waitcnt lgkmcnt(0)
1995; GCN-NEXT:    s_mov_b32 s8, s2
1996; GCN-NEXT:    s_mov_b32 s9, s3
1997; GCN-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
1998; GCN-NEXT:    s_mov_b32 s4, s0
1999; GCN-NEXT:    s_mov_b32 s5, s1
2000; GCN-NEXT:    s_waitcnt vmcnt(0)
2001; GCN-NEXT:    v_bfe_i32 v2, v1, 0, 25
2002; GCN-NEXT:    v_bfe_i32 v1, v1, 24, 1
2003; GCN-NEXT:    v_add_i32_e32 v2, vcc, v1, v2
2004; GCN-NEXT:    v_xor_b32_e32 v2, v2, v1
2005; GCN-NEXT:    v_cvt_f32_u32_e32 v3, v2
2006; GCN-NEXT:    v_sub_i32_e32 v4, vcc, 0, v2
2007; GCN-NEXT:    v_bfe_i32 v5, v0, 0, 25
2008; GCN-NEXT:    v_rcp_iflag_f32_e32 v3, v3
2009; GCN-NEXT:    v_bfe_i32 v0, v0, 24, 1
2010; GCN-NEXT:    v_add_i32_e32 v5, vcc, v0, v5
2011; GCN-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v3
2012; GCN-NEXT:    v_cvt_u32_f32_e32 v3, v3
2013; GCN-NEXT:    v_xor_b32_e32 v5, v5, v0
2014; GCN-NEXT:    v_xor_b32_e32 v0, v0, v1
2015; GCN-NEXT:    v_mul_lo_u32 v4, v4, v3
2016; GCN-NEXT:    v_mul_hi_u32 v4, v3, v4
2017; GCN-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
2018; GCN-NEXT:    v_mul_hi_u32 v3, v5, v3
2019; GCN-NEXT:    v_mul_lo_u32 v1, v3, v2
2020; GCN-NEXT:    v_add_i32_e32 v4, vcc, 1, v3
2021; GCN-NEXT:    v_subrev_i32_e32 v1, vcc, v1, v5
2022; GCN-NEXT:    v_cmp_ge_u32_e64 s[0:1], v1, v2
2023; GCN-NEXT:    v_cndmask_b32_e64 v3, v3, v4, s[0:1]
2024; GCN-NEXT:    v_subrev_i32_e32 v4, vcc, v2, v1
2025; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, v4, s[0:1]
2026; GCN-NEXT:    v_add_i32_e32 v4, vcc, 1, v3
2027; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v2
2028; GCN-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
2029; GCN-NEXT:    v_xor_b32_e32 v1, v1, v0
2030; GCN-NEXT:    v_sub_i32_e32 v0, vcc, v1, v0
2031; GCN-NEXT:    v_bfe_i32 v0, v0, 0, 25
2032; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
2033; GCN-NEXT:    s_endpgm
2034;
2035; TONGA-LABEL: v_sdiv_i25:
2036; TONGA:       ; %bb.0:
2037; TONGA-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
2038; TONGA-NEXT:    s_mov_b32 s7, 0xf000
2039; TONGA-NEXT:    s_mov_b32 s6, -1
2040; TONGA-NEXT:    s_mov_b32 s10, s6
2041; TONGA-NEXT:    s_mov_b32 s11, s7
2042; TONGA-NEXT:    s_waitcnt lgkmcnt(0)
2043; TONGA-NEXT:    s_mov_b32 s8, s2
2044; TONGA-NEXT:    s_mov_b32 s9, s3
2045; TONGA-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
2046; TONGA-NEXT:    s_mov_b32 s4, s0
2047; TONGA-NEXT:    s_mov_b32 s5, s1
2048; TONGA-NEXT:    s_waitcnt vmcnt(0)
2049; TONGA-NEXT:    v_bfe_i32 v2, v1, 0, 25
2050; TONGA-NEXT:    v_bfe_i32 v1, v1, 24, 1
2051; TONGA-NEXT:    v_add_u32_e32 v2, vcc, v1, v2
2052; TONGA-NEXT:    v_xor_b32_e32 v2, v2, v1
2053; TONGA-NEXT:    v_cvt_f32_u32_e32 v3, v2
2054; TONGA-NEXT:    v_sub_u32_e32 v4, vcc, 0, v2
2055; TONGA-NEXT:    v_bfe_i32 v5, v0, 0, 25
2056; TONGA-NEXT:    v_rcp_iflag_f32_e32 v3, v3
2057; TONGA-NEXT:    v_bfe_i32 v0, v0, 24, 1
2058; TONGA-NEXT:    v_add_u32_e32 v5, vcc, v0, v5
2059; TONGA-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v3
2060; TONGA-NEXT:    v_cvt_u32_f32_e32 v3, v3
2061; TONGA-NEXT:    v_xor_b32_e32 v5, v5, v0
2062; TONGA-NEXT:    v_xor_b32_e32 v0, v0, v1
2063; TONGA-NEXT:    v_mul_lo_u32 v4, v4, v3
2064; TONGA-NEXT:    v_mul_hi_u32 v4, v3, v4
2065; TONGA-NEXT:    v_add_u32_e32 v3, vcc, v4, v3
2066; TONGA-NEXT:    v_mul_hi_u32 v3, v5, v3
2067; TONGA-NEXT:    v_mul_lo_u32 v1, v3, v2
2068; TONGA-NEXT:    v_add_u32_e32 v4, vcc, 1, v3
2069; TONGA-NEXT:    v_subrev_u32_e32 v1, vcc, v1, v5
2070; TONGA-NEXT:    v_cmp_ge_u32_e64 s[0:1], v1, v2
2071; TONGA-NEXT:    v_cndmask_b32_e64 v3, v3, v4, s[0:1]
2072; TONGA-NEXT:    v_subrev_u32_e32 v4, vcc, v2, v1
2073; TONGA-NEXT:    v_cndmask_b32_e64 v1, v1, v4, s[0:1]
2074; TONGA-NEXT:    v_add_u32_e32 v4, vcc, 1, v3
2075; TONGA-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v2
2076; TONGA-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
2077; TONGA-NEXT:    v_xor_b32_e32 v1, v1, v0
2078; TONGA-NEXT:    v_subrev_u32_e32 v0, vcc, v0, v1
2079; TONGA-NEXT:    v_bfe_i32 v0, v0, 0, 25
2080; TONGA-NEXT:    buffer_store_dword v0, off, s[4:7], 0
2081; TONGA-NEXT:    s_endpgm
2082;
2083; GFX9-LABEL: v_sdiv_i25:
2084; GFX9:       ; %bb.0:
2085; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
2086; GFX9-NEXT:    s_mov_b32 s7, 0xf000
2087; GFX9-NEXT:    s_mov_b32 s6, -1
2088; GFX9-NEXT:    s_mov_b32 s10, s6
2089; GFX9-NEXT:    s_mov_b32 s11, s7
2090; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2091; GFX9-NEXT:    s_mov_b32 s8, s2
2092; GFX9-NEXT:    s_mov_b32 s9, s3
2093; GFX9-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
2094; GFX9-NEXT:    s_mov_b32 s4, s0
2095; GFX9-NEXT:    s_mov_b32 s5, s1
2096; GFX9-NEXT:    s_waitcnt vmcnt(0)
2097; GFX9-NEXT:    v_bfe_i32 v2, v1, 0, 25
2098; GFX9-NEXT:    v_bfe_i32 v1, v1, 24, 1
2099; GFX9-NEXT:    v_add_u32_e32 v2, v2, v1
2100; GFX9-NEXT:    v_xor_b32_e32 v2, v2, v1
2101; GFX9-NEXT:    v_cvt_f32_u32_e32 v3, v2
2102; GFX9-NEXT:    v_sub_u32_e32 v4, 0, v2
2103; GFX9-NEXT:    v_bfe_i32 v5, v0, 0, 25
2104; GFX9-NEXT:    v_bfe_i32 v0, v0, 24, 1
2105; GFX9-NEXT:    v_rcp_iflag_f32_e32 v3, v3
2106; GFX9-NEXT:    v_add_u32_e32 v5, v5, v0
2107; GFX9-NEXT:    v_xor_b32_e32 v5, v5, v0
2108; GFX9-NEXT:    v_xor_b32_e32 v0, v0, v1
2109; GFX9-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v3
2110; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v3
2111; GFX9-NEXT:    v_mul_lo_u32 v4, v4, v3
2112; GFX9-NEXT:    v_mul_hi_u32 v4, v3, v4
2113; GFX9-NEXT:    v_add_u32_e32 v3, v3, v4
2114; GFX9-NEXT:    v_mul_hi_u32 v3, v5, v3
2115; GFX9-NEXT:    v_mul_lo_u32 v4, v3, v2
2116; GFX9-NEXT:    v_add_u32_e32 v1, 1, v3
2117; GFX9-NEXT:    v_sub_u32_e32 v4, v5, v4
2118; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v4, v2
2119; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
2120; GFX9-NEXT:    v_sub_u32_e32 v3, v4, v2
2121; GFX9-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
2122; GFX9-NEXT:    v_add_u32_e32 v4, 1, v1
2123; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v3, v2
2124; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
2125; GFX9-NEXT:    v_xor_b32_e32 v1, v1, v0
2126; GFX9-NEXT:    v_sub_u32_e32 v0, v1, v0
2127; GFX9-NEXT:    v_bfe_i32 v0, v0, 0, 25
2128; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
2129; GFX9-NEXT:    s_endpgm
2130;
2131; EG-LABEL: v_sdiv_i25:
2132; EG:       ; %bb.0:
2133; EG-NEXT:    ALU 1, @10, KC0[CB0:0-32], KC1[]
2134; EG-NEXT:    TEX 1 @6
2135; EG-NEXT:    ALU 37, @12, KC0[CB0:0-32], KC1[]
2136; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
2137; EG-NEXT:    CF_END
2138; EG-NEXT:    PAD
2139; EG-NEXT:    Fetch clause starting at 6:
2140; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 4, #1
2141; EG-NEXT:     VTX_READ_32 T1.X, T1.X, 0, #1
2142; EG-NEXT:    ALU clause starting at 10:
2143; EG-NEXT:     MOV * T0.X, KC0[2].Z,
2144; EG-NEXT:     MOV * T1.X, PV.X,
2145; EG-NEXT:    ALU clause starting at 12:
2146; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
2147; EG-NEXT:    7(9.809089e-45), 0(0.000000e+00)
2148; EG-NEXT:     ASHR * T0.W, PV.W, literal.x,
2149; EG-NEXT:    7(9.809089e-45), 0(0.000000e+00)
2150; EG-NEXT:     SETGT_INT * T1.W, 0.0, PV.W,
2151; EG-NEXT:     ADD_INT T0.W, T0.W, PV.W,
2152; EG-NEXT:     LSHL * T2.W, T1.X, literal.x,
2153; EG-NEXT:    7(9.809089e-45), 0(0.000000e+00)
2154; EG-NEXT:     XOR_INT * T0.W, PV.W, T1.W,
2155; EG-NEXT:     SUB_INT T0.Z, 0.0, PV.W,
2156; EG-NEXT:     ASHR T2.W, T2.W, literal.x,
2157; EG-NEXT:     RECIP_UINT * T0.X, PV.W,
2158; EG-NEXT:    7(9.809089e-45), 0(0.000000e+00)
2159; EG-NEXT:     SETGT_INT T3.W, 0.0, PV.W,
2160; EG-NEXT:     MULLO_INT * T0.Y, PV.Z, PS,
2161; EG-NEXT:     ADD_INT T2.W, T2.W, PV.W,
2162; EG-NEXT:     MULHI * T0.Y, T0.X, PS,
2163; EG-NEXT:     ADD_INT T4.W, T0.X, PS,
2164; EG-NEXT:     XOR_INT * T2.W, PV.W, T3.W,
2165; EG-NEXT:     MULHI * T0.X, PS, PV.W,
2166; EG-NEXT:     MULLO_INT * T0.Y, PS, T0.W,
2167; EG-NEXT:     SUB_INT * T2.W, T2.W, PS,
2168; EG-NEXT:     ADD_INT T0.Z, T0.X, 1,
2169; EG-NEXT:     SETGE_UINT T4.W, PV.W, T0.W,
2170; EG-NEXT:     SUB_INT * T5.W, PV.W, T0.W,
2171; EG-NEXT:     CNDE_INT T2.W, PV.W, T2.W, PS,
2172; EG-NEXT:     CNDE_INT * T4.W, PV.W, T0.X, PV.Z,
2173; EG-NEXT:     ADD_INT T5.W, PS, 1,
2174; EG-NEXT:     SETGE_UINT * T0.W, PV.W, T0.W,
2175; EG-NEXT:     CNDE_INT T0.W, PS, T4.W, PV.W, BS:VEC_102/SCL_221
2176; EG-NEXT:     XOR_INT * T1.W, T3.W, T1.W,
2177; EG-NEXT:     XOR_INT * T0.W, PV.W, PS,
2178; EG-NEXT:     SUB_INT * T0.W, PV.W, T1.W,
2179; EG-NEXT:     LSHL * T0.W, PV.W, literal.x,
2180; EG-NEXT:    7(9.809089e-45), 0(0.000000e+00)
2181; EG-NEXT:     ASHR T0.X, PV.W, literal.x,
2182; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
2183; EG-NEXT:    7(9.809089e-45), 2(2.802597e-45)
2184  %den_ptr = getelementptr i25, i25 addrspace(1)* %in, i25 1
2185  %num = load i25, i25 addrspace(1) * %in
2186  %den = load i25, i25 addrspace(1) * %den_ptr
2187  %result = sdiv i25 %num, %den
2188  %result.ext = sext i25 %result to i32
2189  store i32 %result.ext, i32 addrspace(1)* %out
2190  ret void
2191}
2192
2193; Tests for 64-bit divide bypass. The kernels below are commented out and are therefore not exercised by this test.
2194; define amdgpu_kernel void @test_get_quotient(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
2195;   %result = sdiv i64 %a, %b
2196;   store i64 %result, i64 addrspace(1)* %out, align 8
2197;   ret void
2198; }
2199
2200; define amdgpu_kernel void @test_get_remainder(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
2201;   %result = srem i64 %a, %b
2202;   store i64 %result, i64 addrspace(1)* %out, align 8
2203;   ret void
2204; }
2205
2206; define amdgpu_kernel void @test_get_quotient_and_remainder(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
2207;   %resultdiv = sdiv i64 %a, %b
2208;   %resultrem = srem i64 %a, %b
2209;   %result = add i64 %resultdiv, %resultrem
2210;   store i64 %result, i64 addrspace(1)* %out, align 8
2211;   ret void
2212; }
2213
; Signed division of a <4 x i32> vector by the splat constant 53668.
; The kernel loads the vector from %in, divides it, and stores it to %out.
; The checks below expect no real divide: each of the four lanes is lowered
; separately to a signed multiply-high by the constant 0x1389c755
; (327796565), an arithmetic shift right by 12, and an add of the
; multiply-high result's sign bit (obtained with a logical shift right by 31).
2214define amdgpu_kernel void @scalarize_mulhs_4xi32(<4 x i32> addrspace(1)* nocapture readonly %in, <4 x i32> addrspace(1)* nocapture %out) {
2215; GCN-LABEL: scalarize_mulhs_4xi32:
2216; GCN:       ; %bb.0:
2217; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
2218; GCN-NEXT:    s_mov_b32 s7, 0xf000
2219; GCN-NEXT:    s_mov_b32 s6, -1
2220; GCN-NEXT:    s_waitcnt lgkmcnt(0)
2221; GCN-NEXT:    s_mov_b32 s4, s0
2222; GCN-NEXT:    s_mov_b32 s5, s1
2223; GCN-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
2224; GCN-NEXT:    s_mov_b32 s0, 0x1389c755
2225; GCN-NEXT:    s_mov_b32 s4, s2
2226; GCN-NEXT:    s_mov_b32 s5, s3
2227; GCN-NEXT:    s_waitcnt vmcnt(0)
2228; GCN-NEXT:    v_mul_hi_i32 v0, v0, s0
2229; GCN-NEXT:    v_mul_hi_i32 v1, v1, s0
2230; GCN-NEXT:    v_mul_hi_i32 v2, v2, s0
2231; GCN-NEXT:    v_mul_hi_i32 v3, v3, s0
2232; GCN-NEXT:    v_lshrrev_b32_e32 v4, 31, v0
2233; GCN-NEXT:    v_ashrrev_i32_e32 v0, 12, v0
2234; GCN-NEXT:    v_lshrrev_b32_e32 v5, 31, v1
2235; GCN-NEXT:    v_ashrrev_i32_e32 v1, 12, v1
2236; GCN-NEXT:    v_lshrrev_b32_e32 v6, 31, v2
2237; GCN-NEXT:    v_ashrrev_i32_e32 v2, 12, v2
2238; GCN-NEXT:    v_lshrrev_b32_e32 v7, 31, v3
2239; GCN-NEXT:    v_ashrrev_i32_e32 v3, 12, v3
2240; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
2241; GCN-NEXT:    v_add_i32_e32 v1, vcc, v1, v5
2242; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v6
2243; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v7
2244; GCN-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
2245; GCN-NEXT:    s_endpgm
2246;
2247; TONGA-LABEL: scalarize_mulhs_4xi32:
2248; TONGA:       ; %bb.0:
2249; TONGA-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
2250; TONGA-NEXT:    s_mov_b32 s7, 0xf000
2251; TONGA-NEXT:    s_mov_b32 s6, -1
2252; TONGA-NEXT:    s_waitcnt lgkmcnt(0)
2253; TONGA-NEXT:    s_mov_b32 s4, s0
2254; TONGA-NEXT:    s_mov_b32 s5, s1
2255; TONGA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
2256; TONGA-NEXT:    s_mov_b32 s0, 0x1389c755
2257; TONGA-NEXT:    s_mov_b32 s4, s2
2258; TONGA-NEXT:    s_mov_b32 s5, s3
2259; TONGA-NEXT:    s_waitcnt vmcnt(0)
2260; TONGA-NEXT:    v_mul_hi_i32 v0, v0, s0
2261; TONGA-NEXT:    v_mul_hi_i32 v1, v1, s0
2262; TONGA-NEXT:    v_mul_hi_i32 v2, v2, s0
2263; TONGA-NEXT:    v_mul_hi_i32 v3, v3, s0
2264; TONGA-NEXT:    v_lshrrev_b32_e32 v4, 31, v0
2265; TONGA-NEXT:    v_ashrrev_i32_e32 v0, 12, v0
2266; TONGA-NEXT:    v_lshrrev_b32_e32 v5, 31, v1
2267; TONGA-NEXT:    v_ashrrev_i32_e32 v1, 12, v1
2268; TONGA-NEXT:    v_lshrrev_b32_e32 v6, 31, v2
2269; TONGA-NEXT:    v_ashrrev_i32_e32 v2, 12, v2
2270; TONGA-NEXT:    v_lshrrev_b32_e32 v7, 31, v3
2271; TONGA-NEXT:    v_ashrrev_i32_e32 v3, 12, v3
2272; TONGA-NEXT:    v_add_u32_e32 v0, vcc, v0, v4
2273; TONGA-NEXT:    v_add_u32_e32 v1, vcc, v1, v5
2274; TONGA-NEXT:    v_add_u32_e32 v2, vcc, v2, v6
2275; TONGA-NEXT:    v_add_u32_e32 v3, vcc, v3, v7
2276; TONGA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
2277; TONGA-NEXT:    s_endpgm
2278;
2279; GFX9-LABEL: scalarize_mulhs_4xi32:
2280; GFX9:       ; %bb.0:
2281; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
2282; GFX9-NEXT:    s_mov_b32 s7, 0xf000
2283; GFX9-NEXT:    s_mov_b32 s6, -1
2284; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2285; GFX9-NEXT:    s_mov_b32 s4, s0
2286; GFX9-NEXT:    s_mov_b32 s5, s1
2287; GFX9-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
2288; GFX9-NEXT:    s_mov_b32 s0, 0x1389c755
2289; GFX9-NEXT:    s_mov_b32 s4, s2
2290; GFX9-NEXT:    s_mov_b32 s5, s3
2291; GFX9-NEXT:    s_waitcnt vmcnt(0)
2292; GFX9-NEXT:    v_mul_hi_i32 v0, v0, s0
2293; GFX9-NEXT:    v_mul_hi_i32 v1, v1, s0
2294; GFX9-NEXT:    v_mul_hi_i32 v2, v2, s0
2295; GFX9-NEXT:    v_mul_hi_i32 v3, v3, s0
2296; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 31, v0
2297; GFX9-NEXT:    v_ashrrev_i32_e32 v0, 12, v0
2298; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 31, v1
2299; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 12, v1
2300; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 31, v2
2301; GFX9-NEXT:    v_ashrrev_i32_e32 v2, 12, v2
2302; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 31, v3
2303; GFX9-NEXT:    v_ashrrev_i32_e32 v3, 12, v3
2304; GFX9-NEXT:    v_add_u32_e32 v0, v0, v4
2305; GFX9-NEXT:    v_add_u32_e32 v1, v1, v5
2306; GFX9-NEXT:    v_add_u32_e32 v2, v2, v6
2307; GFX9-NEXT:    v_add_u32_e32 v3, v3, v7
2308; GFX9-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
2309; GFX9-NEXT:    s_endpgm
2310;
2311; EG-LABEL: scalarize_mulhs_4xi32:
2312; EG:       ; %bb.0:
2313; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
2314; EG-NEXT:    TEX 0 @6
2315; EG-NEXT:    ALU 25, @9, KC0[CB0:0-32], KC1[]
2316; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
2317; EG-NEXT:    CF_END
2318; EG-NEXT:    PAD
2319; EG-NEXT:    Fetch clause starting at 6:
2320; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 0, #1
2321; EG-NEXT:    ALU clause starting at 8:
2322; EG-NEXT:     MOV * T0.X, KC0[2].Y,
2323; EG-NEXT:    ALU clause starting at 9:
2324; EG-NEXT:     MULHI_INT * T0.W, T0.W, literal.x,
2325; EG-NEXT:    327796565(3.478022e-27), 0(0.000000e+00)
2326; EG-NEXT:     ASHR T1.Z, PS, literal.x,
2327; EG-NEXT:     LSHR T0.W, PS, literal.y,
2328; EG-NEXT:     MULHI_INT * T0.Z, T0.Z, literal.z,
2329; EG-NEXT:    12(1.681558e-44), 31(4.344025e-44)
2330; EG-NEXT:    327796565(3.478022e-27), 0(0.000000e+00)
2331; EG-NEXT:     ASHR T1.Y, PS, literal.x,
2332; EG-NEXT:     LSHR T0.Z, PS, literal.y,
2333; EG-NEXT:     ADD_INT T0.W, PV.Z, PV.W,
2334; EG-NEXT:     MULHI_INT * T0.Y, T0.Y, literal.z,
2335; EG-NEXT:    12(1.681558e-44), 31(4.344025e-44)
2336; EG-NEXT:    327796565(3.478022e-27), 0(0.000000e+00)
2337; EG-NEXT:     ASHR T2.Y, PS, literal.x,
2338; EG-NEXT:     ADD_INT T0.Z, PV.Y, PV.Z,
2339; EG-NEXT:     LSHR T1.W, PS, literal.y,
2340; EG-NEXT:     MULHI_INT * T0.X, T0.X, literal.z,
2341; EG-NEXT:    12(1.681558e-44), 31(4.344025e-44)
2342; EG-NEXT:    327796565(3.478022e-27), 0(0.000000e+00)
2343; EG-NEXT:     ADD_INT T0.Y, PV.Y, PV.W,
2344; EG-NEXT:     ASHR T1.W, PS, literal.x,
2345; EG-NEXT:     LSHR * T2.W, PS, literal.y,
2346; EG-NEXT:    12(1.681558e-44), 31(4.344025e-44)
2347; EG-NEXT:     ADD_INT T0.X, PV.W, PS,
2348; EG-NEXT:     LSHR * T1.X, KC0[2].Z, literal.x,
2349; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
2350  %1 = load <4 x i32>, <4 x i32> addrspace(1)* %in, align 16
2351  %2 = sdiv <4 x i32> %1, <i32 53668, i32 53668, i32 53668, i32 53668>
2352  store <4 x i32> %2, <4 x i32> addrspace(1)* %out, align 16
2353  ret void
2354}
2355