1; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
2; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --force-update
3; RUN: opt -S -mtriple=amdgcn-- -mcpu=tahiti -amdgpu-codegenprepare -amdgpu-bypass-slow-div=0 %s | FileCheck %s
4; RUN: llc -mtriple=amdgcn-- -mcpu=tahiti -amdgpu-bypass-slow-div=0 < %s | FileCheck -check-prefix=GFX6 %s
5; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -amdgpu-bypass-slow-div=0 < %s | FileCheck -check-prefix=GFX9 %s
6; RUN: llc -mtriple=amdgcn-- -mcpu=gfx90a -amdgpu-bypass-slow-div=0 < %s | FileCheck -check-prefix=GFX90A %s
7
; Kernel under test: unsigned 32-bit division. The body divides %x by %y with
; a single udiv and stores the quotient through the addrspace(1) pointer %out.
; All commented lines inside the body are autogenerated expectations (see the
; notes at the top of the file); regenerate them with the update scripts
; instead of editing them by hand. The default-prefix lines pin the IR printed
; by the opt invocation: a float-reciprocal estimate of 1/%y, one refinement
; step built from 64-bit multiplies, and two conditional quotient/remainder
; corrections. The GFX6, GFX9 and GFX90A groups pin the llc output for the
; tahiti, gfx900 and gfx90a invocations respectively.
8define amdgpu_kernel void @udiv_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) {
9; CHECK-LABEL: @udiv_i32(
10; CHECK-NEXT:    [[TMP1:%.*]] = uitofp i32 [[Y:%.*]] to float
11; CHECK-NEXT:    [[TMP2:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP1]])
12; CHECK-NEXT:    [[TMP3:%.*]] = fmul fast float [[TMP2]], 0x41EFFFFFC0000000
13; CHECK-NEXT:    [[TMP4:%.*]] = fptoui float [[TMP3]] to i32
14; CHECK-NEXT:    [[TMP5:%.*]] = sub i32 0, [[Y]]
15; CHECK-NEXT:    [[TMP6:%.*]] = mul i32 [[TMP5]], [[TMP4]]
16; CHECK-NEXT:    [[TMP7:%.*]] = zext i32 [[TMP4]] to i64
17; CHECK-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP6]] to i64
18; CHECK-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP7]], [[TMP8]]
19; CHECK-NEXT:    [[TMP10:%.*]] = trunc i64 [[TMP9]] to i32
20; CHECK-NEXT:    [[TMP11:%.*]] = lshr i64 [[TMP9]], 32
21; CHECK-NEXT:    [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32
22; CHECK-NEXT:    [[TMP13:%.*]] = add i32 [[TMP4]], [[TMP12]]
23; CHECK-NEXT:    [[TMP14:%.*]] = zext i32 [[X:%.*]] to i64
24; CHECK-NEXT:    [[TMP15:%.*]] = zext i32 [[TMP13]] to i64
25; CHECK-NEXT:    [[TMP16:%.*]] = mul i64 [[TMP14]], [[TMP15]]
26; CHECK-NEXT:    [[TMP17:%.*]] = trunc i64 [[TMP16]] to i32
27; CHECK-NEXT:    [[TMP18:%.*]] = lshr i64 [[TMP16]], 32
28; CHECK-NEXT:    [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
29; CHECK-NEXT:    [[TMP20:%.*]] = mul i32 [[TMP19]], [[Y]]
30; CHECK-NEXT:    [[TMP21:%.*]] = sub i32 [[X]], [[TMP20]]
31; CHECK-NEXT:    [[TMP22:%.*]] = icmp uge i32 [[TMP21]], [[Y]]
32; CHECK-NEXT:    [[TMP23:%.*]] = add i32 [[TMP19]], 1
33; CHECK-NEXT:    [[TMP24:%.*]] = select i1 [[TMP22]], i32 [[TMP23]], i32 [[TMP19]]
34; CHECK-NEXT:    [[TMP25:%.*]] = sub i32 [[TMP21]], [[Y]]
35; CHECK-NEXT:    [[TMP26:%.*]] = select i1 [[TMP22]], i32 [[TMP25]], i32 [[TMP21]]
36; CHECK-NEXT:    [[TMP27:%.*]] = icmp uge i32 [[TMP26]], [[Y]]
37; CHECK-NEXT:    [[TMP28:%.*]] = add i32 [[TMP24]], 1
38; CHECK-NEXT:    [[TMP29:%.*]] = select i1 [[TMP27]], i32 [[TMP28]], i32 [[TMP24]]
39; CHECK-NEXT:    store i32 [[TMP29]], i32 addrspace(1)* [[OUT:%.*]], align 4
40; CHECK-NEXT:    ret void
41;
42; GFX6-LABEL: udiv_i32:
43; GFX6:       ; %bb.0:
44; GFX6-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
45; GFX6-NEXT:    s_mov_b32 s7, 0xf000
46; GFX6-NEXT:    s_mov_b32 s6, -1
47; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
48; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s3
49; GFX6-NEXT:    s_sub_i32 s4, 0, s3
50; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
51; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
52; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
53; GFX6-NEXT:    v_mul_lo_u32 v1, s4, v0
54; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
55; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
56; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
57; GFX6-NEXT:    v_mul_hi_u32 v0, s2, v0
58; GFX6-NEXT:    v_mul_lo_u32 v1, v0, s3
59; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
60; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s2, v1
61; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s3, v1
62; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
63; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s3, v1
64; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, v2, s[0:1]
65; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
66; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s3, v1
67; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
68; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
69; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
70; GFX6-NEXT:    s_endpgm
71;
72; GFX9-LABEL: udiv_i32:
73; GFX9:       ; %bb.0:
74; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
75; GFX9-NEXT:    v_mov_b32_e32 v2, 0
76; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
77; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
78; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s3
79; GFX9-NEXT:    s_sub_i32 s4, 0, s3
80; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
81; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
82; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
83; GFX9-NEXT:    v_mul_lo_u32 v1, s4, v0
84; GFX9-NEXT:    v_mul_hi_u32 v1, v0, v1
85; GFX9-NEXT:    v_add_u32_e32 v0, v0, v1
86; GFX9-NEXT:    v_mul_hi_u32 v0, s2, v0
87; GFX9-NEXT:    v_mul_lo_u32 v1, v0, s3
88; GFX9-NEXT:    v_add_u32_e32 v3, 1, v0
89; GFX9-NEXT:    v_sub_u32_e32 v1, s2, v1
90; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v1
91; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
92; GFX9-NEXT:    v_subrev_u32_e32 v3, s3, v1
93; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
94; GFX9-NEXT:    v_add_u32_e32 v3, 1, v0
95; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v1
96; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
97; GFX9-NEXT:    global_store_dword v2, v0, s[0:1]
98; GFX9-NEXT:    s_endpgm
99;
100; GFX90A-LABEL: udiv_i32:
101; GFX90A:       ; %bb.0:
102; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
103; GFX90A-NEXT:    v_mov_b32_e32 v1, 0
104; GFX90A-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
105; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
106; GFX90A-NEXT:    v_cvt_f32_u32_e32 v0, s3
107; GFX90A-NEXT:    s_sub_i32 s4, 0, s3
108; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v0, v0
109; GFX90A-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
110; GFX90A-NEXT:    v_cvt_u32_f32_e32 v0, v0
111; GFX90A-NEXT:    v_mul_lo_u32 v2, s4, v0
112; GFX90A-NEXT:    v_mul_hi_u32 v2, v0, v2
113; GFX90A-NEXT:    v_add_u32_e32 v0, v0, v2
114; GFX90A-NEXT:    v_mul_hi_u32 v0, s2, v0
115; GFX90A-NEXT:    v_mul_lo_u32 v2, v0, s3
116; GFX90A-NEXT:    v_sub_u32_e32 v2, s2, v2
117; GFX90A-NEXT:    v_add_u32_e32 v3, 1, v0
118; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s3, v2
119; GFX90A-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
120; GFX90A-NEXT:    v_subrev_u32_e32 v3, s3, v2
121; GFX90A-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
122; GFX90A-NEXT:    v_add_u32_e32 v3, 1, v0
123; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s3, v2
124; GFX90A-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
125; GFX90A-NEXT:    global_store_dword v1, v0, s[0:1]
126; GFX90A-NEXT:    s_endpgm
127  %r = udiv i32 %x, %y
128  store i32 %r, i32 addrspace(1)* %out
129  ret void
130}
131
; Kernel under test: unsigned 32-bit remainder. The body computes %x urem %y
; and stores the result through the addrspace(1) pointer %out.
; All commented lines inside the body are autogenerated expectations (see the
; notes at the top of the file); regenerate them with the update scripts
; instead of editing them by hand. The expected IR uses the same reciprocal
; estimate and refinement as the 32-bit unsigned division test, but only the
; remainder is carried through the two conditional subtract steps (no quotient
; increment). The GFX6, GFX9 and GFX90A groups pin the llc output for the
; tahiti, gfx900 and gfx90a invocations respectively.
132define amdgpu_kernel void @urem_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) {
133; CHECK-LABEL: @urem_i32(
134; CHECK-NEXT:    [[TMP1:%.*]] = uitofp i32 [[Y:%.*]] to float
135; CHECK-NEXT:    [[TMP2:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP1]])
136; CHECK-NEXT:    [[TMP3:%.*]] = fmul fast float [[TMP2]], 0x41EFFFFFC0000000
137; CHECK-NEXT:    [[TMP4:%.*]] = fptoui float [[TMP3]] to i32
138; CHECK-NEXT:    [[TMP5:%.*]] = sub i32 0, [[Y]]
139; CHECK-NEXT:    [[TMP6:%.*]] = mul i32 [[TMP5]], [[TMP4]]
140; CHECK-NEXT:    [[TMP7:%.*]] = zext i32 [[TMP4]] to i64
141; CHECK-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP6]] to i64
142; CHECK-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP7]], [[TMP8]]
143; CHECK-NEXT:    [[TMP10:%.*]] = trunc i64 [[TMP9]] to i32
144; CHECK-NEXT:    [[TMP11:%.*]] = lshr i64 [[TMP9]], 32
145; CHECK-NEXT:    [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32
146; CHECK-NEXT:    [[TMP13:%.*]] = add i32 [[TMP4]], [[TMP12]]
147; CHECK-NEXT:    [[TMP14:%.*]] = zext i32 [[X:%.*]] to i64
148; CHECK-NEXT:    [[TMP15:%.*]] = zext i32 [[TMP13]] to i64
149; CHECK-NEXT:    [[TMP16:%.*]] = mul i64 [[TMP14]], [[TMP15]]
150; CHECK-NEXT:    [[TMP17:%.*]] = trunc i64 [[TMP16]] to i32
151; CHECK-NEXT:    [[TMP18:%.*]] = lshr i64 [[TMP16]], 32
152; CHECK-NEXT:    [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
153; CHECK-NEXT:    [[TMP20:%.*]] = mul i32 [[TMP19]], [[Y]]
154; CHECK-NEXT:    [[TMP21:%.*]] = sub i32 [[X]], [[TMP20]]
155; CHECK-NEXT:    [[TMP22:%.*]] = icmp uge i32 [[TMP21]], [[Y]]
156; CHECK-NEXT:    [[TMP23:%.*]] = sub i32 [[TMP21]], [[Y]]
157; CHECK-NEXT:    [[TMP24:%.*]] = select i1 [[TMP22]], i32 [[TMP23]], i32 [[TMP21]]
158; CHECK-NEXT:    [[TMP25:%.*]] = icmp uge i32 [[TMP24]], [[Y]]
159; CHECK-NEXT:    [[TMP26:%.*]] = sub i32 [[TMP24]], [[Y]]
160; CHECK-NEXT:    [[TMP27:%.*]] = select i1 [[TMP25]], i32 [[TMP26]], i32 [[TMP24]]
161; CHECK-NEXT:    store i32 [[TMP27]], i32 addrspace(1)* [[OUT:%.*]], align 4
162; CHECK-NEXT:    ret void
163;
164; GFX6-LABEL: urem_i32:
165; GFX6:       ; %bb.0:
166; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
167; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
168; GFX6-NEXT:    s_mov_b32 s3, 0xf000
169; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
170; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s5
171; GFX6-NEXT:    s_sub_i32 s2, 0, s5
172; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
173; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
174; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
175; GFX6-NEXT:    v_mul_lo_u32 v1, s2, v0
176; GFX6-NEXT:    s_mov_b32 s2, -1
177; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
178; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
179; GFX6-NEXT:    v_mul_hi_u32 v0, s4, v0
180; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s5
181; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
182; GFX6-NEXT:    v_subrev_i32_e32 v1, vcc, s5, v0
183; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s5, v0
184; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
185; GFX6-NEXT:    v_subrev_i32_e32 v1, vcc, s5, v0
186; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s5, v0
187; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
188; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
189; GFX6-NEXT:    s_endpgm
190;
191; GFX9-LABEL: urem_i32:
192; GFX9:       ; %bb.0:
193; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
194; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
195; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s3
196; GFX9-NEXT:    s_sub_i32 s4, 0, s3
197; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
198; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
199; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
200; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
201; GFX9-NEXT:    v_mul_lo_u32 v1, s4, v0
202; GFX9-NEXT:    v_mul_hi_u32 v1, v0, v1
203; GFX9-NEXT:    v_add_u32_e32 v0, v0, v1
204; GFX9-NEXT:    v_mul_hi_u32 v0, s2, v0
205; GFX9-NEXT:    v_mov_b32_e32 v1, 0
206; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s3
207; GFX9-NEXT:    v_sub_u32_e32 v0, s2, v0
208; GFX9-NEXT:    v_subrev_u32_e32 v2, s3, v0
209; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v0
210; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
211; GFX9-NEXT:    v_subrev_u32_e32 v2, s3, v0
212; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v0
213; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
214; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
215; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
216; GFX9-NEXT:    s_endpgm
217;
218; GFX90A-LABEL: urem_i32:
219; GFX90A:       ; %bb.0:
220; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
221; GFX90A-NEXT:    v_mov_b32_e32 v1, 0
222; GFX90A-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
223; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
224; GFX90A-NEXT:    v_cvt_f32_u32_e32 v0, s3
225; GFX90A-NEXT:    s_sub_i32 s4, 0, s3
226; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v0, v0
227; GFX90A-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
228; GFX90A-NEXT:    v_cvt_u32_f32_e32 v0, v0
229; GFX90A-NEXT:    v_mul_lo_u32 v2, s4, v0
230; GFX90A-NEXT:    v_mul_hi_u32 v2, v0, v2
231; GFX90A-NEXT:    v_add_u32_e32 v0, v0, v2
232; GFX90A-NEXT:    v_mul_hi_u32 v0, s2, v0
233; GFX90A-NEXT:    v_mul_lo_u32 v0, v0, s3
234; GFX90A-NEXT:    v_sub_u32_e32 v0, s2, v0
235; GFX90A-NEXT:    v_subrev_u32_e32 v2, s3, v0
236; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s3, v0
237; GFX90A-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
238; GFX90A-NEXT:    v_subrev_u32_e32 v2, s3, v0
239; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s3, v0
240; GFX90A-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
241; GFX90A-NEXT:    global_store_dword v1, v0, s[0:1]
242; GFX90A-NEXT:    s_endpgm
243  %r = urem i32 %x, %y
244  store i32 %r, i32 addrspace(1)* %out
245  ret void
246}
247
; Kernel under test: signed 32-bit division. The body computes %x sdiv %y and
; stores the quotient through the addrspace(1) pointer %out.
; All commented lines inside the body are autogenerated expectations (see the
; notes at the top of the file); regenerate them with the update scripts
; instead of editing them by hand. The expected IR first derives a sign mask
; for each operand (ashr by 31), turns both operands into magnitudes with
; add + xor, runs the unsigned reciprocal-based division on the magnitudes,
; and finally re-applies the sign with xor + sub using the xor of the two
; masks. The GFX6, GFX9 and GFX90A groups pin the llc output for the tahiti,
; gfx900 and gfx90a invocations respectively.
248define amdgpu_kernel void @sdiv_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) {
249; CHECK-LABEL: @sdiv_i32(
250; CHECK-NEXT:    [[TMP1:%.*]] = ashr i32 [[X:%.*]], 31
251; CHECK-NEXT:    [[TMP2:%.*]] = ashr i32 [[Y:%.*]], 31
252; CHECK-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
253; CHECK-NEXT:    [[TMP4:%.*]] = add i32 [[X]], [[TMP1]]
254; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[Y]], [[TMP2]]
255; CHECK-NEXT:    [[TMP6:%.*]] = xor i32 [[TMP4]], [[TMP1]]
256; CHECK-NEXT:    [[TMP7:%.*]] = xor i32 [[TMP5]], [[TMP2]]
257; CHECK-NEXT:    [[TMP8:%.*]] = uitofp i32 [[TMP7]] to float
258; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP8]])
259; CHECK-NEXT:    [[TMP10:%.*]] = fmul fast float [[TMP9]], 0x41EFFFFFC0000000
260; CHECK-NEXT:    [[TMP11:%.*]] = fptoui float [[TMP10]] to i32
261; CHECK-NEXT:    [[TMP12:%.*]] = sub i32 0, [[TMP7]]
262; CHECK-NEXT:    [[TMP13:%.*]] = mul i32 [[TMP12]], [[TMP11]]
263; CHECK-NEXT:    [[TMP14:%.*]] = zext i32 [[TMP11]] to i64
264; CHECK-NEXT:    [[TMP15:%.*]] = zext i32 [[TMP13]] to i64
265; CHECK-NEXT:    [[TMP16:%.*]] = mul i64 [[TMP14]], [[TMP15]]
266; CHECK-NEXT:    [[TMP17:%.*]] = trunc i64 [[TMP16]] to i32
267; CHECK-NEXT:    [[TMP18:%.*]] = lshr i64 [[TMP16]], 32
268; CHECK-NEXT:    [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
269; CHECK-NEXT:    [[TMP20:%.*]] = add i32 [[TMP11]], [[TMP19]]
270; CHECK-NEXT:    [[TMP21:%.*]] = zext i32 [[TMP6]] to i64
271; CHECK-NEXT:    [[TMP22:%.*]] = zext i32 [[TMP20]] to i64
272; CHECK-NEXT:    [[TMP23:%.*]] = mul i64 [[TMP21]], [[TMP22]]
273; CHECK-NEXT:    [[TMP24:%.*]] = trunc i64 [[TMP23]] to i32
274; CHECK-NEXT:    [[TMP25:%.*]] = lshr i64 [[TMP23]], 32
275; CHECK-NEXT:    [[TMP26:%.*]] = trunc i64 [[TMP25]] to i32
276; CHECK-NEXT:    [[TMP27:%.*]] = mul i32 [[TMP26]], [[TMP7]]
277; CHECK-NEXT:    [[TMP28:%.*]] = sub i32 [[TMP6]], [[TMP27]]
278; CHECK-NEXT:    [[TMP29:%.*]] = icmp uge i32 [[TMP28]], [[TMP7]]
279; CHECK-NEXT:    [[TMP30:%.*]] = add i32 [[TMP26]], 1
280; CHECK-NEXT:    [[TMP31:%.*]] = select i1 [[TMP29]], i32 [[TMP30]], i32 [[TMP26]]
281; CHECK-NEXT:    [[TMP32:%.*]] = sub i32 [[TMP28]], [[TMP7]]
282; CHECK-NEXT:    [[TMP33:%.*]] = select i1 [[TMP29]], i32 [[TMP32]], i32 [[TMP28]]
283; CHECK-NEXT:    [[TMP34:%.*]] = icmp uge i32 [[TMP33]], [[TMP7]]
284; CHECK-NEXT:    [[TMP35:%.*]] = add i32 [[TMP31]], 1
285; CHECK-NEXT:    [[TMP36:%.*]] = select i1 [[TMP34]], i32 [[TMP35]], i32 [[TMP31]]
286; CHECK-NEXT:    [[TMP37:%.*]] = xor i32 [[TMP36]], [[TMP3]]
287; CHECK-NEXT:    [[TMP38:%.*]] = sub i32 [[TMP37]], [[TMP3]]
288; CHECK-NEXT:    store i32 [[TMP38]], i32 addrspace(1)* [[OUT:%.*]], align 4
289; CHECK-NEXT:    ret void
290;
291; GFX6-LABEL: sdiv_i32:
292; GFX6:       ; %bb.0:
293; GFX6-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
294; GFX6-NEXT:    s_mov_b32 s7, 0xf000
295; GFX6-NEXT:    s_mov_b32 s6, -1
296; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
297; GFX6-NEXT:    s_ashr_i32 s8, s3, 31
298; GFX6-NEXT:    s_add_i32 s3, s3, s8
299; GFX6-NEXT:    s_xor_b32 s3, s3, s8
300; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s3
301; GFX6-NEXT:    s_sub_i32 s4, 0, s3
302; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
303; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
304; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
305; GFX6-NEXT:    v_mul_lo_u32 v1, s4, v0
306; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
307; GFX6-NEXT:    s_ashr_i32 s0, s2, 31
308; GFX6-NEXT:    s_add_i32 s1, s2, s0
309; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
310; GFX6-NEXT:    s_xor_b32 s1, s1, s0
311; GFX6-NEXT:    s_xor_b32 s2, s0, s8
312; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
313; GFX6-NEXT:    v_mul_hi_u32 v0, s1, v0
314; GFX6-NEXT:    v_mul_lo_u32 v1, v0, s3
315; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
316; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s1, v1
317; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s3, v1
318; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
319; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s3, v1
320; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, v2, s[0:1]
321; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
322; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s3, v1
323; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
324; GFX6-NEXT:    v_xor_b32_e32 v0, s2, v0
325; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s2, v0
326; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
327; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
328; GFX6-NEXT:    s_endpgm
329;
330; GFX9-LABEL: sdiv_i32:
331; GFX9:       ; %bb.0:
332; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
333; GFX9-NEXT:    v_mov_b32_e32 v2, 0
334; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
335; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
336; GFX9-NEXT:    s_ashr_i32 s4, s3, 31
337; GFX9-NEXT:    s_add_i32 s3, s3, s4
338; GFX9-NEXT:    s_xor_b32 s3, s3, s4
339; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s3
340; GFX9-NEXT:    s_sub_i32 s5, 0, s3
341; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
342; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
343; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
344; GFX9-NEXT:    v_mul_lo_u32 v1, s5, v0
345; GFX9-NEXT:    s_ashr_i32 s5, s2, 31
346; GFX9-NEXT:    s_add_i32 s2, s2, s5
347; GFX9-NEXT:    s_xor_b32 s2, s2, s5
348; GFX9-NEXT:    v_mul_hi_u32 v1, v0, v1
349; GFX9-NEXT:    s_xor_b32 s4, s5, s4
350; GFX9-NEXT:    v_add_u32_e32 v0, v0, v1
351; GFX9-NEXT:    v_mul_hi_u32 v0, s2, v0
352; GFX9-NEXT:    v_mul_lo_u32 v1, v0, s3
353; GFX9-NEXT:    v_add_u32_e32 v3, 1, v0
354; GFX9-NEXT:    v_sub_u32_e32 v1, s2, v1
355; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v1
356; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
357; GFX9-NEXT:    v_subrev_u32_e32 v3, s3, v1
358; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
359; GFX9-NEXT:    v_add_u32_e32 v3, 1, v0
360; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v1
361; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
362; GFX9-NEXT:    v_xor_b32_e32 v0, s4, v0
363; GFX9-NEXT:    v_subrev_u32_e32 v0, s4, v0
364; GFX9-NEXT:    global_store_dword v2, v0, s[0:1]
365; GFX9-NEXT:    s_endpgm
366;
367; GFX90A-LABEL: sdiv_i32:
368; GFX90A:       ; %bb.0:
369; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
370; GFX90A-NEXT:    v_mov_b32_e32 v1, 0
371; GFX90A-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
372; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
373; GFX90A-NEXT:    s_ashr_i32 s4, s3, 31
374; GFX90A-NEXT:    s_add_i32 s3, s3, s4
375; GFX90A-NEXT:    s_xor_b32 s3, s3, s4
376; GFX90A-NEXT:    v_cvt_f32_u32_e32 v0, s3
377; GFX90A-NEXT:    s_ashr_i32 s5, s2, 31
378; GFX90A-NEXT:    s_add_i32 s2, s2, s5
379; GFX90A-NEXT:    s_xor_b32 s4, s5, s4
380; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v0, v0
381; GFX90A-NEXT:    s_xor_b32 s2, s2, s5
382; GFX90A-NEXT:    s_sub_i32 s5, 0, s3
383; GFX90A-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
384; GFX90A-NEXT:    v_cvt_u32_f32_e32 v0, v0
385; GFX90A-NEXT:    v_mul_lo_u32 v2, s5, v0
386; GFX90A-NEXT:    v_mul_hi_u32 v2, v0, v2
387; GFX90A-NEXT:    v_add_u32_e32 v0, v0, v2
388; GFX90A-NEXT:    v_mul_hi_u32 v0, s2, v0
389; GFX90A-NEXT:    v_mul_lo_u32 v2, v0, s3
390; GFX90A-NEXT:    v_sub_u32_e32 v2, s2, v2
391; GFX90A-NEXT:    v_add_u32_e32 v3, 1, v0
392; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s3, v2
393; GFX90A-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
394; GFX90A-NEXT:    v_subrev_u32_e32 v3, s3, v2
395; GFX90A-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
396; GFX90A-NEXT:    v_add_u32_e32 v3, 1, v0
397; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s3, v2
398; GFX90A-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
399; GFX90A-NEXT:    v_xor_b32_e32 v0, s4, v0
400; GFX90A-NEXT:    v_subrev_u32_e32 v0, s4, v0
401; GFX90A-NEXT:    global_store_dword v1, v0, s[0:1]
402; GFX90A-NEXT:    s_endpgm
403  %r = sdiv i32 %x, %y
404  store i32 %r, i32 addrspace(1)* %out
405  ret void
406}
407
; Kernel under test: signed 32-bit remainder. The body computes %x srem %y and
; stores the result through the addrspace(1) pointer %out.
; All commented lines inside the body are autogenerated expectations (see the
; notes at the top of the file); regenerate them with the update scripts
; instead of editing them by hand. The expected IR converts both operands to
; magnitudes via a sign mask (ashr by 31, then add + xor), computes the
; unsigned remainder of the magnitudes with two conditional subtract steps,
; and re-applies the sign with xor + sub using the mask derived from %x only.
; The GFX6, GFX9 and GFX90A groups pin the llc output for the tahiti, gfx900
; and gfx90a invocations respectively.
408define amdgpu_kernel void @srem_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) {
409; CHECK-LABEL: @srem_i32(
410; CHECK-NEXT:    [[TMP1:%.*]] = ashr i32 [[X:%.*]], 31
411; CHECK-NEXT:    [[TMP2:%.*]] = ashr i32 [[Y:%.*]], 31
412; CHECK-NEXT:    [[TMP3:%.*]] = add i32 [[X]], [[TMP1]]
413; CHECK-NEXT:    [[TMP4:%.*]] = add i32 [[Y]], [[TMP2]]
414; CHECK-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP1]]
415; CHECK-NEXT:    [[TMP6:%.*]] = xor i32 [[TMP4]], [[TMP2]]
416; CHECK-NEXT:    [[TMP7:%.*]] = uitofp i32 [[TMP6]] to float
417; CHECK-NEXT:    [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]])
418; CHECK-NEXT:    [[TMP9:%.*]] = fmul fast float [[TMP8]], 0x41EFFFFFC0000000
419; CHECK-NEXT:    [[TMP10:%.*]] = fptoui float [[TMP9]] to i32
420; CHECK-NEXT:    [[TMP11:%.*]] = sub i32 0, [[TMP6]]
421; CHECK-NEXT:    [[TMP12:%.*]] = mul i32 [[TMP11]], [[TMP10]]
422; CHECK-NEXT:    [[TMP13:%.*]] = zext i32 [[TMP10]] to i64
423; CHECK-NEXT:    [[TMP14:%.*]] = zext i32 [[TMP12]] to i64
424; CHECK-NEXT:    [[TMP15:%.*]] = mul i64 [[TMP13]], [[TMP14]]
425; CHECK-NEXT:    [[TMP16:%.*]] = trunc i64 [[TMP15]] to i32
426; CHECK-NEXT:    [[TMP17:%.*]] = lshr i64 [[TMP15]], 32
427; CHECK-NEXT:    [[TMP18:%.*]] = trunc i64 [[TMP17]] to i32
428; CHECK-NEXT:    [[TMP19:%.*]] = add i32 [[TMP10]], [[TMP18]]
429; CHECK-NEXT:    [[TMP20:%.*]] = zext i32 [[TMP5]] to i64
430; CHECK-NEXT:    [[TMP21:%.*]] = zext i32 [[TMP19]] to i64
431; CHECK-NEXT:    [[TMP22:%.*]] = mul i64 [[TMP20]], [[TMP21]]
432; CHECK-NEXT:    [[TMP23:%.*]] = trunc i64 [[TMP22]] to i32
433; CHECK-NEXT:    [[TMP24:%.*]] = lshr i64 [[TMP22]], 32
434; CHECK-NEXT:    [[TMP25:%.*]] = trunc i64 [[TMP24]] to i32
435; CHECK-NEXT:    [[TMP26:%.*]] = mul i32 [[TMP25]], [[TMP6]]
436; CHECK-NEXT:    [[TMP27:%.*]] = sub i32 [[TMP5]], [[TMP26]]
437; CHECK-NEXT:    [[TMP28:%.*]] = icmp uge i32 [[TMP27]], [[TMP6]]
438; CHECK-NEXT:    [[TMP29:%.*]] = sub i32 [[TMP27]], [[TMP6]]
439; CHECK-NEXT:    [[TMP30:%.*]] = select i1 [[TMP28]], i32 [[TMP29]], i32 [[TMP27]]
440; CHECK-NEXT:    [[TMP31:%.*]] = icmp uge i32 [[TMP30]], [[TMP6]]
441; CHECK-NEXT:    [[TMP32:%.*]] = sub i32 [[TMP30]], [[TMP6]]
442; CHECK-NEXT:    [[TMP33:%.*]] = select i1 [[TMP31]], i32 [[TMP32]], i32 [[TMP30]]
443; CHECK-NEXT:    [[TMP34:%.*]] = xor i32 [[TMP33]], [[TMP1]]
444; CHECK-NEXT:    [[TMP35:%.*]] = sub i32 [[TMP34]], [[TMP1]]
445; CHECK-NEXT:    store i32 [[TMP35]], i32 addrspace(1)* [[OUT:%.*]], align 4
446; CHECK-NEXT:    ret void
447;
448; GFX6-LABEL: srem_i32:
449; GFX6:       ; %bb.0:
450; GFX6-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
451; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
452; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
453; GFX6-NEXT:    s_ashr_i32 s4, s3, 31
454; GFX6-NEXT:    s_add_i32 s3, s3, s4
455; GFX6-NEXT:    s_xor_b32 s4, s3, s4
456; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s4
457; GFX6-NEXT:    s_sub_i32 s3, 0, s4
458; GFX6-NEXT:    s_ashr_i32 s5, s2, 31
459; GFX6-NEXT:    s_add_i32 s2, s2, s5
460; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
461; GFX6-NEXT:    s_xor_b32 s6, s2, s5
462; GFX6-NEXT:    s_mov_b32 s2, -1
463; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
464; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
465; GFX6-NEXT:    v_mul_lo_u32 v1, s3, v0
466; GFX6-NEXT:    s_mov_b32 s3, 0xf000
467; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
468; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
469; GFX6-NEXT:    v_mul_hi_u32 v0, s6, v0
470; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s4
471; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s6, v0
472; GFX6-NEXT:    v_subrev_i32_e32 v1, vcc, s4, v0
473; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s4, v0
474; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
475; GFX6-NEXT:    v_subrev_i32_e32 v1, vcc, s4, v0
476; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s4, v0
477; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
478; GFX6-NEXT:    v_xor_b32_e32 v0, s5, v0
479; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s5, v0
480; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
481; GFX6-NEXT:    s_endpgm
482;
483; GFX9-LABEL: srem_i32:
484; GFX9:       ; %bb.0:
485; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
486; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
487; GFX9-NEXT:    s_ashr_i32 s4, s3, 31
488; GFX9-NEXT:    s_add_i32 s3, s3, s4
489; GFX9-NEXT:    s_xor_b32 s3, s3, s4
490; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s3
491; GFX9-NEXT:    s_sub_i32 s4, 0, s3
492; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
493; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
494; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
495; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
496; GFX9-NEXT:    v_mul_lo_u32 v1, s4, v0
497; GFX9-NEXT:    s_ashr_i32 s4, s2, 31
498; GFX9-NEXT:    s_add_i32 s2, s2, s4
499; GFX9-NEXT:    s_xor_b32 s2, s2, s4
500; GFX9-NEXT:    v_mul_hi_u32 v1, v0, v1
501; GFX9-NEXT:    v_add_u32_e32 v0, v0, v1
502; GFX9-NEXT:    v_mul_hi_u32 v0, s2, v0
503; GFX9-NEXT:    v_mov_b32_e32 v1, 0
504; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s3
505; GFX9-NEXT:    v_sub_u32_e32 v0, s2, v0
506; GFX9-NEXT:    v_subrev_u32_e32 v2, s3, v0
507; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v0
508; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
509; GFX9-NEXT:    v_subrev_u32_e32 v2, s3, v0
510; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v0
511; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
512; GFX9-NEXT:    v_xor_b32_e32 v0, s4, v0
513; GFX9-NEXT:    v_subrev_u32_e32 v0, s4, v0
514; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
515; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
516; GFX9-NEXT:    s_endpgm
517;
518; GFX90A-LABEL: srem_i32:
519; GFX90A:       ; %bb.0:
520; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
521; GFX90A-NEXT:    v_mov_b32_e32 v1, 0
522; GFX90A-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
523; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
524; GFX90A-NEXT:    s_ashr_i32 s4, s3, 31
525; GFX90A-NEXT:    s_add_i32 s3, s3, s4
526; GFX90A-NEXT:    s_xor_b32 s3, s3, s4
527; GFX90A-NEXT:    v_cvt_f32_u32_e32 v0, s3
528; GFX90A-NEXT:    s_sub_i32 s5, 0, s3
529; GFX90A-NEXT:    s_ashr_i32 s4, s2, 31
530; GFX90A-NEXT:    s_add_i32 s2, s2, s4
531; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v0, v0
532; GFX90A-NEXT:    s_xor_b32 s2, s2, s4
533; GFX90A-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
534; GFX90A-NEXT:    v_cvt_u32_f32_e32 v0, v0
535; GFX90A-NEXT:    v_mul_lo_u32 v2, s5, v0
536; GFX90A-NEXT:    v_mul_hi_u32 v2, v0, v2
537; GFX90A-NEXT:    v_add_u32_e32 v0, v0, v2
538; GFX90A-NEXT:    v_mul_hi_u32 v0, s2, v0
539; GFX90A-NEXT:    v_mul_lo_u32 v0, v0, s3
540; GFX90A-NEXT:    v_sub_u32_e32 v0, s2, v0
541; GFX90A-NEXT:    v_subrev_u32_e32 v2, s3, v0
542; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s3, v0
543; GFX90A-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
544; GFX90A-NEXT:    v_subrev_u32_e32 v2, s3, v0
545; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s3, v0
546; GFX90A-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
547; GFX90A-NEXT:    v_xor_b32_e32 v0, s4, v0
548; GFX90A-NEXT:    v_subrev_u32_e32 v0, s4, v0
549; GFX90A-NEXT:    global_store_dword v1, v0, s[0:1]
550; GFX90A-NEXT:    s_endpgm
551  %r = srem i32 %x, %y
552  store i32 %r, i32 addrspace(1)* %out
553  ret void
554}
555
; Kernel under test: unsigned 16-bit division. The body computes %x udiv %y on
; i16 operands and stores the quotient through the addrspace(1) pointer %out.
; All commented lines inside the body are autogenerated expectations (see the
; notes at the top of the file); regenerate them with the update scripts
; instead of editing them by hand. Unlike the 32-bit tests, the expected IR
; here zero-extends both operands to i32, converts them to float, multiplies
; the numerator by the reciprocal of the denominator, truncates, and adds a
; 0/1 correction chosen by comparing the fmad residual against the
; denominator; the result is masked to 16 bits and truncated back to i16.
; The GFX6, GFX9 and GFX90A groups pin the llc output for the tahiti, gfx900
; and gfx90a invocations respectively.
556define amdgpu_kernel void @udiv_i16(i16 addrspace(1)* %out, i16 %x, i16 %y) {
557; CHECK-LABEL: @udiv_i16(
558; CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[X:%.*]] to i32
559; CHECK-NEXT:    [[TMP2:%.*]] = zext i16 [[Y:%.*]] to i32
560; CHECK-NEXT:    [[TMP3:%.*]] = uitofp i32 [[TMP1]] to float
561; CHECK-NEXT:    [[TMP4:%.*]] = uitofp i32 [[TMP2]] to float
562; CHECK-NEXT:    [[TMP5:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP4]])
563; CHECK-NEXT:    [[TMP6:%.*]] = fmul fast float [[TMP3]], [[TMP5]]
564; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.trunc.f32(float [[TMP6]])
565; CHECK-NEXT:    [[TMP8:%.*]] = fneg fast float [[TMP7]]
566; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP8]], float [[TMP4]], float [[TMP3]])
567; CHECK-NEXT:    [[TMP10:%.*]] = fptoui float [[TMP7]] to i32
568; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
569; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.fabs.f32(float [[TMP4]])
570; CHECK-NEXT:    [[TMP13:%.*]] = fcmp fast oge float [[TMP11]], [[TMP12]]
571; CHECK-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 1, i32 0
572; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]]
573; CHECK-NEXT:    [[TMP16:%.*]] = and i32 [[TMP15]], 65535
574; CHECK-NEXT:    [[TMP17:%.*]] = trunc i32 [[TMP16]] to i16
575; CHECK-NEXT:    store i16 [[TMP17]], i16 addrspace(1)* [[OUT:%.*]], align 2
576; CHECK-NEXT:    ret void
577;
578; GFX6-LABEL: udiv_i16:
579; GFX6:       ; %bb.0:
580; GFX6-NEXT:    s_load_dword s2, s[0:1], 0xb
581; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
582; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
583; GFX6-NEXT:    s_lshr_b32 s3, s2, 16
584; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s3
585; GFX6-NEXT:    s_and_b32 s2, s2, 0xffff
586; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s2
587; GFX6-NEXT:    s_mov_b32 s3, 0xf000
588; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v0
589; GFX6-NEXT:    s_mov_b32 s2, -1
590; GFX6-NEXT:    v_mul_f32_e32 v2, v1, v2
591; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
592; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v2
593; GFX6-NEXT:    v_mad_f32 v1, -v2, v0, v1
594; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
595; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, 0, v3, vcc
596; GFX6-NEXT:    buffer_store_short v0, off, s[0:3], 0
597; GFX6-NEXT:    s_endpgm
598;
599; GFX9-LABEL: udiv_i16:
600; GFX9:       ; %bb.0:
601; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x2c
602; GFX9-NEXT:    v_mov_b32_e32 v3, 0
603; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
604; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
605; GFX9-NEXT:    s_lshr_b32 s3, s2, 16
606; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s3
607; GFX9-NEXT:    s_and_b32 s2, s2, 0xffff
608; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s2
609; GFX9-NEXT:    v_rcp_iflag_f32_e32 v2, v0
610; GFX9-NEXT:    v_mul_f32_e32 v2, v1, v2
611; GFX9-NEXT:    v_trunc_f32_e32 v2, v2
612; GFX9-NEXT:    v_cvt_u32_f32_e32 v4, v2
613; GFX9-NEXT:    v_mad_f32 v1, -v2, v0, v1
614; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
615; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v4, vcc
616; GFX9-NEXT:    global_store_short v3, v0, s[0:1]
617; GFX9-NEXT:    s_endpgm
618;
619; GFX90A-LABEL: udiv_i16:
620; GFX90A:       ; %bb.0:
621; GFX90A-NEXT:    s_load_dword s2, s[0:1], 0x2c
622; GFX90A-NEXT:    v_mov_b32_e32 v3, 0
623; GFX90A-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
624; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
625; GFX90A-NEXT:    s_lshr_b32 s3, s2, 16
626; GFX90A-NEXT:    v_cvt_f32_u32_e32 v0, s3
627; GFX90A-NEXT:    s_and_b32 s2, s2, 0xffff
628; GFX90A-NEXT:    v_cvt_f32_u32_e32 v1, s2
629; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v2, v0
630; GFX90A-NEXT:    v_mul_f32_e32 v2, v1, v2
631; GFX90A-NEXT:    v_trunc_f32_e32 v2, v2
632; GFX90A-NEXT:    v_cvt_u32_f32_e32 v4, v2
633; GFX90A-NEXT:    v_mad_f32 v1, -v2, v0, v1
634; GFX90A-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
635; GFX90A-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v4, vcc
636; GFX90A-NEXT:    global_store_short v3, v0, s[0:1]
637; GFX90A-NEXT:    s_endpgm
638  %r = udiv i16 %x, %y
639  store i16 %r, i16 addrspace(1)* %out
640  ret void
641}
642
; urem_i16: kernel that stores the unsigned remainder %x urem %y (i16) to %out.
; The assertions inside this function are autogenerated (see the NOTE lines at
; the top of the file); regenerate them with the scripts named there rather
; than editing them by hand.
; The opt assertions pin the IR expansion: both operands are zero-extended to
; i32 and converted to float, a quotient estimate is built from
; llvm.amdgcn.rcp.f32, fmul and llvm.trunc.f32, and 1 is added to it when the
; absolute value of the fmad residual is >= the absolute value of the divisor.
; The remainder is then x - q * y, masked to 16 bits and truncated to i16.
; The llc assertions pin the generated ISA for tahiti, gfx900 and gfx90a.
643define amdgpu_kernel void @urem_i16(i16 addrspace(1)* %out, i16 %x, i16 %y) {
644; CHECK-LABEL: @urem_i16(
645; CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[X:%.*]] to i32
646; CHECK-NEXT:    [[TMP2:%.*]] = zext i16 [[Y:%.*]] to i32
647; CHECK-NEXT:    [[TMP3:%.*]] = uitofp i32 [[TMP1]] to float
648; CHECK-NEXT:    [[TMP4:%.*]] = uitofp i32 [[TMP2]] to float
649; CHECK-NEXT:    [[TMP5:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP4]])
650; CHECK-NEXT:    [[TMP6:%.*]] = fmul fast float [[TMP3]], [[TMP5]]
651; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.trunc.f32(float [[TMP6]])
652; CHECK-NEXT:    [[TMP8:%.*]] = fneg fast float [[TMP7]]
653; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP8]], float [[TMP4]], float [[TMP3]])
654; CHECK-NEXT:    [[TMP10:%.*]] = fptoui float [[TMP7]] to i32
655; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
656; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.fabs.f32(float [[TMP4]])
657; CHECK-NEXT:    [[TMP13:%.*]] = fcmp fast oge float [[TMP11]], [[TMP12]]
658; CHECK-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 1, i32 0
659; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]]
660; CHECK-NEXT:    [[TMP16:%.*]] = mul i32 [[TMP15]], [[TMP2]]
661; CHECK-NEXT:    [[TMP17:%.*]] = sub i32 [[TMP1]], [[TMP16]]
662; CHECK-NEXT:    [[TMP18:%.*]] = and i32 [[TMP17]], 65535
663; CHECK-NEXT:    [[TMP19:%.*]] = trunc i32 [[TMP18]] to i16
664; CHECK-NEXT:    store i16 [[TMP19]], i16 addrspace(1)* [[OUT:%.*]], align 2
665; CHECK-NEXT:    ret void
666;
667; GFX6-LABEL: urem_i16:
668; GFX6:       ; %bb.0:
669; GFX6-NEXT:    s_load_dword s4, s[0:1], 0xb
670; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
671; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
672; GFX6-NEXT:    s_lshr_b32 s2, s4, 16
673; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s2
674; GFX6-NEXT:    s_and_b32 s3, s4, 0xffff
675; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s3
676; GFX6-NEXT:    s_mov_b32 s3, 0xf000
677; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v0
678; GFX6-NEXT:    v_mul_f32_e32 v2, v1, v2
679; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
680; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v2
681; GFX6-NEXT:    v_mad_f32 v1, -v2, v0, v1
682; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
683; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, 0, v3, vcc
684; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s2
685; GFX6-NEXT:    s_mov_b32 s2, -1
686; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
687; GFX6-NEXT:    buffer_store_short v0, off, s[0:3], 0
688; GFX6-NEXT:    s_endpgm
689;
690; GFX9-LABEL: urem_i16:
691; GFX9:       ; %bb.0:
692; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x2c
693; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
694; GFX9-NEXT:    s_lshr_b32 s3, s2, 16
695; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s3
696; GFX9-NEXT:    s_and_b32 s4, s2, 0xffff
697; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s4
698; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
699; GFX9-NEXT:    v_rcp_iflag_f32_e32 v2, v0
700; GFX9-NEXT:    v_mul_f32_e32 v2, v1, v2
701; GFX9-NEXT:    v_trunc_f32_e32 v2, v2
702; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v2
703; GFX9-NEXT:    v_mad_f32 v1, -v2, v0, v1
704; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
705; GFX9-NEXT:    v_mov_b32_e32 v1, 0
706; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v3, vcc
707; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s3
708; GFX9-NEXT:    v_sub_u32_e32 v0, s2, v0
709; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
710; GFX9-NEXT:    global_store_short v1, v0, s[0:1]
711; GFX9-NEXT:    s_endpgm
712;
713; GFX90A-LABEL: urem_i16:
714; GFX90A:       ; %bb.0:
715; GFX90A-NEXT:    s_load_dword s2, s[0:1], 0x2c
716; GFX90A-NEXT:    v_mov_b32_e32 v3, 0
717; GFX90A-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
718; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
719; GFX90A-NEXT:    s_lshr_b32 s3, s2, 16
720; GFX90A-NEXT:    v_cvt_f32_u32_e32 v0, s3
721; GFX90A-NEXT:    s_and_b32 s4, s2, 0xffff
722; GFX90A-NEXT:    v_cvt_f32_u32_e32 v1, s4
723; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v2, v0
724; GFX90A-NEXT:    v_mul_f32_e32 v2, v1, v2
725; GFX90A-NEXT:    v_trunc_f32_e32 v2, v2
726; GFX90A-NEXT:    v_cvt_u32_f32_e32 v4, v2
727; GFX90A-NEXT:    v_mad_f32 v1, -v2, v0, v1
728; GFX90A-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
729; GFX90A-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v4, vcc
730; GFX90A-NEXT:    v_mul_lo_u32 v0, v0, s3
731; GFX90A-NEXT:    v_sub_u32_e32 v0, s2, v0
732; GFX90A-NEXT:    global_store_short v3, v0, s[0:1]
733; GFX90A-NEXT:    s_endpgm
734  %r = urem i16 %x, %y
735  store i16 %r, i16 addrspace(1)* %out
736  ret void
737}
738
; sdiv_i16: kernel that stores the signed quotient %x sdiv %y (i16) to %out.
; The assertions inside this function are autogenerated (see the NOTE lines at
; the top of the file); regenerate them with the scripts named there rather
; than editing them by hand.
; The opt assertions pin the IR expansion: both operands are sign-extended to
; i32, a correction term ((x xor y) ashr 30) or 1 is computed, the operands are
; converted to float, and a quotient estimate is built from
; llvm.amdgcn.rcp.f32, fmul and llvm.trunc.f32. The correction term is added
; when the absolute value of the fmad residual is >= the absolute value of the
; divisor; the result is sign-extended from 16 bits (shl/ashr by 16) and
; truncated to i16.
; The llc assertions pin the generated ISA for tahiti, gfx900 and gfx90a.
739define amdgpu_kernel void @sdiv_i16(i16 addrspace(1)* %out, i16 %x, i16 %y) {
740; CHECK-LABEL: @sdiv_i16(
741; CHECK-NEXT:    [[TMP1:%.*]] = sext i16 [[X:%.*]] to i32
742; CHECK-NEXT:    [[TMP2:%.*]] = sext i16 [[Y:%.*]] to i32
743; CHECK-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
744; CHECK-NEXT:    [[TMP4:%.*]] = ashr i32 [[TMP3]], 30
745; CHECK-NEXT:    [[TMP5:%.*]] = or i32 [[TMP4]], 1
746; CHECK-NEXT:    [[TMP6:%.*]] = sitofp i32 [[TMP1]] to float
747; CHECK-NEXT:    [[TMP7:%.*]] = sitofp i32 [[TMP2]] to float
748; CHECK-NEXT:    [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]])
749; CHECK-NEXT:    [[TMP9:%.*]] = fmul fast float [[TMP6]], [[TMP8]]
750; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.trunc.f32(float [[TMP9]])
751; CHECK-NEXT:    [[TMP11:%.*]] = fneg fast float [[TMP10]]
752; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP11]], float [[TMP7]], float [[TMP6]])
753; CHECK-NEXT:    [[TMP13:%.*]] = fptosi float [[TMP10]] to i32
754; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP12]])
755; CHECK-NEXT:    [[TMP15:%.*]] = call fast float @llvm.fabs.f32(float [[TMP7]])
756; CHECK-NEXT:    [[TMP16:%.*]] = fcmp fast oge float [[TMP14]], [[TMP15]]
757; CHECK-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP5]], i32 0
758; CHECK-NEXT:    [[TMP18:%.*]] = add i32 [[TMP13]], [[TMP17]]
759; CHECK-NEXT:    [[TMP19:%.*]] = shl i32 [[TMP18]], 16
760; CHECK-NEXT:    [[TMP20:%.*]] = ashr i32 [[TMP19]], 16
761; CHECK-NEXT:    [[TMP21:%.*]] = trunc i32 [[TMP20]] to i16
762; CHECK-NEXT:    store i16 [[TMP21]], i16 addrspace(1)* [[OUT:%.*]], align 2
763; CHECK-NEXT:    ret void
764;
765; GFX6-LABEL: sdiv_i16:
766; GFX6:       ; %bb.0:
767; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
768; GFX6-NEXT:    s_load_dword s0, s[0:1], 0xb
769; GFX6-NEXT:    s_mov_b32 s7, 0xf000
770; GFX6-NEXT:    s_mov_b32 s6, -1
771; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
772; GFX6-NEXT:    s_ashr_i32 s1, s0, 16
773; GFX6-NEXT:    v_cvt_f32_i32_e32 v0, s1
774; GFX6-NEXT:    s_sext_i32_i16 s0, s0
775; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s0
776; GFX6-NEXT:    s_xor_b32 s0, s0, s1
777; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v0
778; GFX6-NEXT:    s_ashr_i32 s0, s0, 30
779; GFX6-NEXT:    s_or_b32 s0, s0, 1
780; GFX6-NEXT:    v_mov_b32_e32 v3, s0
781; GFX6-NEXT:    v_mul_f32_e32 v2, v1, v2
782; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
783; GFX6-NEXT:    v_mad_f32 v1, -v2, v0, v1
784; GFX6-NEXT:    v_cvt_i32_f32_e32 v2, v2
785; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
786; GFX6-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
787; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
788; GFX6-NEXT:    buffer_store_short v0, off, s[4:7], 0
789; GFX6-NEXT:    s_endpgm
790;
791; GFX9-LABEL: sdiv_i16:
792; GFX9:       ; %bb.0:
793; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
794; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x2c
795; GFX9-NEXT:    v_mov_b32_e32 v1, 0
796; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
797; GFX9-NEXT:    s_ashr_i32 s0, s4, 16
798; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s0
799; GFX9-NEXT:    s_sext_i32_i16 s1, s4
800; GFX9-NEXT:    v_cvt_f32_i32_e32 v2, s1
801; GFX9-NEXT:    s_xor_b32 s0, s1, s0
802; GFX9-NEXT:    v_rcp_iflag_f32_e32 v3, v0
803; GFX9-NEXT:    s_ashr_i32 s0, s0, 30
804; GFX9-NEXT:    s_or_b32 s4, s0, 1
805; GFX9-NEXT:    v_mul_f32_e32 v3, v2, v3
806; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
807; GFX9-NEXT:    v_mad_f32 v2, -v3, v0, v2
808; GFX9-NEXT:    v_cvt_i32_f32_e32 v3, v3
809; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v2|, |v0|
810; GFX9-NEXT:    s_and_b64 s[0:1], s[0:1], exec
811; GFX9-NEXT:    s_cselect_b32 s0, s4, 0
812; GFX9-NEXT:    v_add_u32_e32 v0, s0, v3
813; GFX9-NEXT:    global_store_short v1, v0, s[2:3]
814; GFX9-NEXT:    s_endpgm
815;
816; GFX90A-LABEL: sdiv_i16:
817; GFX90A:       ; %bb.0:
818; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
819; GFX90A-NEXT:    s_load_dword s4, s[0:1], 0x2c
820; GFX90A-NEXT:    v_mov_b32_e32 v1, 0
821; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
822; GFX90A-NEXT:    s_ashr_i32 s0, s4, 16
823; GFX90A-NEXT:    v_cvt_f32_i32_e32 v0, s0
824; GFX90A-NEXT:    s_sext_i32_i16 s1, s4
825; GFX90A-NEXT:    v_cvt_f32_i32_e32 v2, s1
826; GFX90A-NEXT:    s_xor_b32 s0, s1, s0
827; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v3, v0
828; GFX90A-NEXT:    s_ashr_i32 s0, s0, 30
829; GFX90A-NEXT:    s_or_b32 s4, s0, 1
830; GFX90A-NEXT:    v_mul_f32_e32 v3, v2, v3
831; GFX90A-NEXT:    v_trunc_f32_e32 v3, v3
832; GFX90A-NEXT:    v_mad_f32 v2, -v3, v0, v2
833; GFX90A-NEXT:    v_cvt_i32_f32_e32 v3, v3
834; GFX90A-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v2|, |v0|
835; GFX90A-NEXT:    s_and_b64 s[0:1], s[0:1], exec
836; GFX90A-NEXT:    s_cselect_b32 s0, s4, 0
837; GFX90A-NEXT:    v_add_u32_e32 v0, s0, v3
838; GFX90A-NEXT:    global_store_short v1, v0, s[2:3]
839; GFX90A-NEXT:    s_endpgm
840  %r = sdiv i16 %x, %y
841  store i16 %r, i16 addrspace(1)* %out
842  ret void
843}
844
; srem_i16: kernel that stores the signed remainder %x srem %y (i16) to %out.
; The assertions inside this function are autogenerated (see the NOTE lines at
; the top of the file); regenerate them with the scripts named there rather
; than editing them by hand.
; The opt assertions pin the IR expansion: both operands are sign-extended to
; i32, a correction term ((x xor y) ashr 30) or 1 is computed, the operands are
; converted to float, and a quotient estimate is built from
; llvm.amdgcn.rcp.f32, fmul and llvm.trunc.f32. The correction term is added
; when the absolute value of the fmad residual is >= the absolute value of the
; divisor. The remainder is then x - q * y, sign-extended from 16 bits
; (shl/ashr by 16) and truncated to i16.
; The llc assertions pin the generated ISA for tahiti, gfx900 and gfx90a.
845define amdgpu_kernel void @srem_i16(i16 addrspace(1)* %out, i16 %x, i16 %y) {
846; CHECK-LABEL: @srem_i16(
847; CHECK-NEXT:    [[TMP1:%.*]] = sext i16 [[X:%.*]] to i32
848; CHECK-NEXT:    [[TMP2:%.*]] = sext i16 [[Y:%.*]] to i32
849; CHECK-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
850; CHECK-NEXT:    [[TMP4:%.*]] = ashr i32 [[TMP3]], 30
851; CHECK-NEXT:    [[TMP5:%.*]] = or i32 [[TMP4]], 1
852; CHECK-NEXT:    [[TMP6:%.*]] = sitofp i32 [[TMP1]] to float
853; CHECK-NEXT:    [[TMP7:%.*]] = sitofp i32 [[TMP2]] to float
854; CHECK-NEXT:    [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]])
855; CHECK-NEXT:    [[TMP9:%.*]] = fmul fast float [[TMP6]], [[TMP8]]
856; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.trunc.f32(float [[TMP9]])
857; CHECK-NEXT:    [[TMP11:%.*]] = fneg fast float [[TMP10]]
858; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP11]], float [[TMP7]], float [[TMP6]])
859; CHECK-NEXT:    [[TMP13:%.*]] = fptosi float [[TMP10]] to i32
860; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP12]])
861; CHECK-NEXT:    [[TMP15:%.*]] = call fast float @llvm.fabs.f32(float [[TMP7]])
862; CHECK-NEXT:    [[TMP16:%.*]] = fcmp fast oge float [[TMP14]], [[TMP15]]
863; CHECK-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP5]], i32 0
864; CHECK-NEXT:    [[TMP18:%.*]] = add i32 [[TMP13]], [[TMP17]]
865; CHECK-NEXT:    [[TMP19:%.*]] = mul i32 [[TMP18]], [[TMP2]]
866; CHECK-NEXT:    [[TMP20:%.*]] = sub i32 [[TMP1]], [[TMP19]]
867; CHECK-NEXT:    [[TMP21:%.*]] = shl i32 [[TMP20]], 16
868; CHECK-NEXT:    [[TMP22:%.*]] = ashr i32 [[TMP21]], 16
869; CHECK-NEXT:    [[TMP23:%.*]] = trunc i32 [[TMP22]] to i16
870; CHECK-NEXT:    store i16 [[TMP23]], i16 addrspace(1)* [[OUT:%.*]], align 2
871; CHECK-NEXT:    ret void
872;
873; GFX6-LABEL: srem_i16:
874; GFX6:       ; %bb.0:
875; GFX6-NEXT:    s_load_dword s4, s[0:1], 0xb
876; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
877; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
878; GFX6-NEXT:    s_ashr_i32 s2, s4, 16
879; GFX6-NEXT:    v_cvt_f32_i32_e32 v0, s2
880; GFX6-NEXT:    s_sext_i32_i16 s3, s4
881; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s3
882; GFX6-NEXT:    s_xor_b32 s3, s3, s2
883; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v0
884; GFX6-NEXT:    s_ashr_i32 s3, s3, 30
885; GFX6-NEXT:    s_or_b32 s3, s3, 1
886; GFX6-NEXT:    v_mov_b32_e32 v3, s3
887; GFX6-NEXT:    v_mul_f32_e32 v2, v1, v2
888; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
889; GFX6-NEXT:    v_mad_f32 v1, -v2, v0, v1
890; GFX6-NEXT:    v_cvt_i32_f32_e32 v2, v2
891; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
892; GFX6-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
893; GFX6-NEXT:    s_mov_b32 s3, 0xf000
894; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
895; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s2
896; GFX6-NEXT:    s_mov_b32 s2, -1
897; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
898; GFX6-NEXT:    buffer_store_short v0, off, s[0:3], 0
899; GFX6-NEXT:    s_endpgm
900;
901; GFX9-LABEL: srem_i16:
902; GFX9:       ; %bb.0:
903; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x2c
904; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
905; GFX9-NEXT:    s_ashr_i32 s5, s4, 16
906; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s5
907; GFX9-NEXT:    s_sext_i32_i16 s2, s4
908; GFX9-NEXT:    v_cvt_f32_i32_e32 v1, s2
909; GFX9-NEXT:    s_xor_b32 s2, s2, s5
910; GFX9-NEXT:    v_rcp_iflag_f32_e32 v2, v0
911; GFX9-NEXT:    s_ashr_i32 s2, s2, 30
912; GFX9-NEXT:    s_or_b32 s6, s2, 1
913; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
914; GFX9-NEXT:    v_mul_f32_e32 v2, v1, v2
915; GFX9-NEXT:    v_trunc_f32_e32 v2, v2
916; GFX9-NEXT:    v_mad_f32 v1, -v2, v0, v1
917; GFX9-NEXT:    v_cvt_i32_f32_e32 v2, v2
918; GFX9-NEXT:    v_cmp_ge_f32_e64 s[2:3], |v1|, |v0|
919; GFX9-NEXT:    s_and_b64 s[2:3], s[2:3], exec
920; GFX9-NEXT:    s_cselect_b32 s2, s6, 0
921; GFX9-NEXT:    v_add_u32_e32 v0, s2, v2
922; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s5
923; GFX9-NEXT:    v_mov_b32_e32 v1, 0
924; GFX9-NEXT:    v_sub_u32_e32 v0, s4, v0
925; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
926; GFX9-NEXT:    global_store_short v1, v0, s[0:1]
927; GFX9-NEXT:    s_endpgm
928;
929; GFX90A-LABEL: srem_i16:
930; GFX90A:       ; %bb.0:
931; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
932; GFX90A-NEXT:    s_load_dword s4, s[0:1], 0x2c
933; GFX90A-NEXT:    v_mov_b32_e32 v1, 0
934; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
935; GFX90A-NEXT:    s_ashr_i32 s5, s4, 16
936; GFX90A-NEXT:    v_cvt_f32_i32_e32 v0, s5
937; GFX90A-NEXT:    s_sext_i32_i16 s0, s4
938; GFX90A-NEXT:    v_cvt_f32_i32_e32 v2, s0
939; GFX90A-NEXT:    s_xor_b32 s0, s0, s5
940; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v3, v0
941; GFX90A-NEXT:    s_ashr_i32 s0, s0, 30
942; GFX90A-NEXT:    s_or_b32 s6, s0, 1
943; GFX90A-NEXT:    v_mul_f32_e32 v3, v2, v3
944; GFX90A-NEXT:    v_trunc_f32_e32 v3, v3
945; GFX90A-NEXT:    v_mad_f32 v2, -v3, v0, v2
946; GFX90A-NEXT:    v_cvt_i32_f32_e32 v3, v3
947; GFX90A-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v2|, |v0|
948; GFX90A-NEXT:    s_and_b64 s[0:1], s[0:1], exec
949; GFX90A-NEXT:    s_cselect_b32 s0, s6, 0
950; GFX90A-NEXT:    v_add_u32_e32 v0, s0, v3
951; GFX90A-NEXT:    v_mul_lo_u32 v0, v0, s5
952; GFX90A-NEXT:    v_sub_u32_e32 v0, s4, v0
953; GFX90A-NEXT:    global_store_short v1, v0, s[2:3]
954; GFX90A-NEXT:    s_endpgm
955  %r = srem i16 %x, %y
956  store i16 %r, i16 addrspace(1)* %out
957  ret void
958}
959
; udiv_i8: kernel that stores the unsigned quotient %x udiv %y (i8) to %out.
; The assertions inside this function are autogenerated (see the NOTE lines at
; the top of the file); regenerate them with the scripts named there rather
; than editing them by hand.
; The opt assertions pin the IR expansion: both operands are zero-extended to
; i32 and converted to float, a quotient estimate is built from
; llvm.amdgcn.rcp.f32, fmul and llvm.trunc.f32, and 1 is added to it when the
; absolute value of the fmad residual is >= the absolute value of the divisor.
; The result is masked to 8 bits and truncated to i8.
; The llc assertions pin the generated ISA for tahiti, gfx900 and gfx90a.
960define amdgpu_kernel void @udiv_i8(i8 addrspace(1)* %out, i8 %x, i8 %y) {
961; CHECK-LABEL: @udiv_i8(
962; CHECK-NEXT:    [[TMP1:%.*]] = zext i8 [[X:%.*]] to i32
963; CHECK-NEXT:    [[TMP2:%.*]] = zext i8 [[Y:%.*]] to i32
964; CHECK-NEXT:    [[TMP3:%.*]] = uitofp i32 [[TMP1]] to float
965; CHECK-NEXT:    [[TMP4:%.*]] = uitofp i32 [[TMP2]] to float
966; CHECK-NEXT:    [[TMP5:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP4]])
967; CHECK-NEXT:    [[TMP6:%.*]] = fmul fast float [[TMP3]], [[TMP5]]
968; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.trunc.f32(float [[TMP6]])
969; CHECK-NEXT:    [[TMP8:%.*]] = fneg fast float [[TMP7]]
970; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP8]], float [[TMP4]], float [[TMP3]])
971; CHECK-NEXT:    [[TMP10:%.*]] = fptoui float [[TMP7]] to i32
972; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
973; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.fabs.f32(float [[TMP4]])
974; CHECK-NEXT:    [[TMP13:%.*]] = fcmp fast oge float [[TMP11]], [[TMP12]]
975; CHECK-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 1, i32 0
976; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]]
977; CHECK-NEXT:    [[TMP16:%.*]] = and i32 [[TMP15]], 255
978; CHECK-NEXT:    [[TMP17:%.*]] = trunc i32 [[TMP16]] to i8
979; CHECK-NEXT:    store i8 [[TMP17]], i8 addrspace(1)* [[OUT:%.*]], align 1
980; CHECK-NEXT:    ret void
981;
982; GFX6-LABEL: udiv_i8:
983; GFX6:       ; %bb.0:
984; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
985; GFX6-NEXT:    s_load_dword s0, s[0:1], 0xb
986; GFX6-NEXT:    s_mov_b32 s7, 0xf000
987; GFX6-NEXT:    s_mov_b32 s6, -1
988; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
989; GFX6-NEXT:    v_cvt_f32_ubyte1_e32 v0, s0
990; GFX6-NEXT:    v_rcp_iflag_f32_e32 v1, v0
991; GFX6-NEXT:    v_cvt_f32_ubyte0_e32 v2, s0
992; GFX6-NEXT:    v_mul_f32_e32 v1, v2, v1
993; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
994; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v1
995; GFX6-NEXT:    v_mad_f32 v1, -v1, v0, v2
996; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
997; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, 0, v3, vcc
998; GFX6-NEXT:    buffer_store_byte v0, off, s[4:7], 0
999; GFX6-NEXT:    s_endpgm
1000;
1001; GFX9-LABEL: udiv_i8:
1002; GFX9:       ; %bb.0:
1003; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x2c
1004; GFX9-NEXT:    v_mov_b32_e32 v2, 0
1005; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1006; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1007; GFX9-NEXT:    v_cvt_f32_ubyte1_e32 v0, s2
1008; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v0
1009; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v3, s2
1010; GFX9-NEXT:    v_mul_f32_e32 v1, v3, v1
1011; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
1012; GFX9-NEXT:    v_cvt_u32_f32_e32 v4, v1
1013; GFX9-NEXT:    v_mad_f32 v1, -v1, v0, v3
1014; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
1015; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v4, vcc
1016; GFX9-NEXT:    global_store_byte v2, v0, s[0:1]
1017; GFX9-NEXT:    s_endpgm
1018;
1019; GFX90A-LABEL: udiv_i8:
1020; GFX90A:       ; %bb.0:
1021; GFX90A-NEXT:    s_load_dword s2, s[0:1], 0x2c
1022; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
1023; GFX90A-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1024; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
1025; GFX90A-NEXT:    v_cvt_f32_ubyte1_e32 v0, s2
1026; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v1, v0
1027; GFX90A-NEXT:    v_cvt_f32_ubyte0_e32 v3, s2
1028; GFX90A-NEXT:    v_mul_f32_e32 v1, v3, v1
1029; GFX90A-NEXT:    v_trunc_f32_e32 v1, v1
1030; GFX90A-NEXT:    v_cvt_u32_f32_e32 v4, v1
1031; GFX90A-NEXT:    v_mad_f32 v1, -v1, v0, v3
1032; GFX90A-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
1033; GFX90A-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v4, vcc
1034; GFX90A-NEXT:    global_store_byte v2, v0, s[0:1]
1035; GFX90A-NEXT:    s_endpgm
1036  %r = udiv i8 %x, %y
1037  store i8 %r, i8 addrspace(1)* %out
1038  ret void
1039}
1040
; urem_i8: kernel that stores the unsigned remainder %x urem %y (i8) to %out.
; The assertions inside this function are autogenerated (see the NOTE lines at
; the top of the file); regenerate them with the scripts named there rather
; than editing them by hand.
; The opt assertions pin the IR expansion: both operands are zero-extended to
; i32 and converted to float, a quotient estimate is built from
; llvm.amdgcn.rcp.f32, fmul and llvm.trunc.f32, and 1 is added to it when the
; absolute value of the fmad residual is >= the absolute value of the divisor.
; The remainder is then x - q * y, masked to 8 bits and truncated to i8.
; The llc assertions pin the generated ISA for tahiti, gfx900 and gfx90a.
1041define amdgpu_kernel void @urem_i8(i8 addrspace(1)* %out, i8 %x, i8 %y) {
1042; CHECK-LABEL: @urem_i8(
1043; CHECK-NEXT:    [[TMP1:%.*]] = zext i8 [[X:%.*]] to i32
1044; CHECK-NEXT:    [[TMP2:%.*]] = zext i8 [[Y:%.*]] to i32
1045; CHECK-NEXT:    [[TMP3:%.*]] = uitofp i32 [[TMP1]] to float
1046; CHECK-NEXT:    [[TMP4:%.*]] = uitofp i32 [[TMP2]] to float
1047; CHECK-NEXT:    [[TMP5:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP4]])
1048; CHECK-NEXT:    [[TMP6:%.*]] = fmul fast float [[TMP3]], [[TMP5]]
1049; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.trunc.f32(float [[TMP6]])
1050; CHECK-NEXT:    [[TMP8:%.*]] = fneg fast float [[TMP7]]
1051; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP8]], float [[TMP4]], float [[TMP3]])
1052; CHECK-NEXT:    [[TMP10:%.*]] = fptoui float [[TMP7]] to i32
1053; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
1054; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.fabs.f32(float [[TMP4]])
1055; CHECK-NEXT:    [[TMP13:%.*]] = fcmp fast oge float [[TMP11]], [[TMP12]]
1056; CHECK-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 1, i32 0
1057; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]]
1058; CHECK-NEXT:    [[TMP16:%.*]] = mul i32 [[TMP15]], [[TMP2]]
1059; CHECK-NEXT:    [[TMP17:%.*]] = sub i32 [[TMP1]], [[TMP16]]
1060; CHECK-NEXT:    [[TMP18:%.*]] = and i32 [[TMP17]], 255
1061; CHECK-NEXT:    [[TMP19:%.*]] = trunc i32 [[TMP18]] to i8
1062; CHECK-NEXT:    store i8 [[TMP19]], i8 addrspace(1)* [[OUT:%.*]], align 1
1063; CHECK-NEXT:    ret void
1064;
1065; GFX6-LABEL: urem_i8:
1066; GFX6:       ; %bb.0:
1067; GFX6-NEXT:    s_load_dword s4, s[0:1], 0xb
1068; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1069; GFX6-NEXT:    s_mov_b32 s3, 0xf000
1070; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1071; GFX6-NEXT:    v_cvt_f32_ubyte1_e32 v0, s4
1072; GFX6-NEXT:    v_rcp_iflag_f32_e32 v1, v0
1073; GFX6-NEXT:    v_cvt_f32_ubyte0_e32 v2, s4
1074; GFX6-NEXT:    s_lshr_b32 s2, s4, 8
1075; GFX6-NEXT:    v_mul_f32_e32 v1, v2, v1
1076; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
1077; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v1
1078; GFX6-NEXT:    v_mad_f32 v1, -v1, v0, v2
1079; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
1080; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, 0, v3, vcc
1081; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s2
1082; GFX6-NEXT:    s_mov_b32 s2, -1
1083; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
1084; GFX6-NEXT:    buffer_store_byte v0, off, s[0:3], 0
1085; GFX6-NEXT:    s_endpgm
1086;
1087; GFX9-LABEL: urem_i8:
1088; GFX9:       ; %bb.0:
1089; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x2c
1090; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1091; GFX9-NEXT:    v_cvt_f32_ubyte1_e32 v0, s2
1092; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v0
1093; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v2, s2
1094; GFX9-NEXT:    s_lshr_b32 s3, s2, 8
1095; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1096; GFX9-NEXT:    v_mul_f32_e32 v1, v2, v1
1097; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
1098; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v1
1099; GFX9-NEXT:    v_mad_f32 v1, -v1, v0, v2
1100; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
1101; GFX9-NEXT:    v_mov_b32_e32 v1, 0
1102; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v3, vcc
1103; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s3
1104; GFX9-NEXT:    v_sub_u32_e32 v0, s2, v0
1105; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1106; GFX9-NEXT:    global_store_byte v1, v0, s[0:1]
1107; GFX9-NEXT:    s_endpgm
1108;
1109; GFX90A-LABEL: urem_i8:
1110; GFX90A:       ; %bb.0:
1111; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
1112; GFX90A-NEXT:    s_load_dword s4, s[0:1], 0x2c
1113; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
1114; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
1115; GFX90A-NEXT:    v_cvt_f32_ubyte1_e32 v0, s4
1116; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v1, v0
1117; GFX90A-NEXT:    v_cvt_f32_ubyte0_e32 v3, s4
1118; GFX90A-NEXT:    s_lshr_b32 s0, s4, 8
1119; GFX90A-NEXT:    v_mul_f32_e32 v1, v3, v1
1120; GFX90A-NEXT:    v_trunc_f32_e32 v1, v1
1121; GFX90A-NEXT:    v_cvt_u32_f32_e32 v4, v1
1122; GFX90A-NEXT:    v_mad_f32 v1, -v1, v0, v3
1123; GFX90A-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
1124; GFX90A-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v4, vcc
1125; GFX90A-NEXT:    v_mul_lo_u32 v0, v0, s0
1126; GFX90A-NEXT:    v_sub_u32_e32 v0, s4, v0
1127; GFX90A-NEXT:    global_store_byte v2, v0, s[2:3]
1128; GFX90A-NEXT:    s_endpgm
1129  %r = urem i8 %x, %y
1130  store i8 %r, i8 addrspace(1)* %out
1131  ret void
1132}
1133
; sdiv_i8: kernel that stores the signed quotient %x sdiv %y (i8) to %out.
; The assertions inside this function are autogenerated (see the NOTE lines at
; the top of the file); regenerate them with the scripts named there rather
; than editing them by hand.
; The opt assertions pin the IR expansion: both operands are sign-extended to
; i32, a correction term ((x xor y) ashr 30) or 1 is computed, the operands are
; converted to float, and a quotient estimate is built from
; llvm.amdgcn.rcp.f32, fmul and llvm.trunc.f32. The correction term is added
; when the absolute value of the fmad residual is >= the absolute value of the
; divisor; the result is sign-extended from 8 bits (shl/ashr by 24) and
; truncated to i8.
; The llc assertions pin the generated ISA for tahiti, gfx900 and gfx90a.
1134define amdgpu_kernel void @sdiv_i8(i8 addrspace(1)* %out, i8 %x, i8 %y) {
1135; CHECK-LABEL: @sdiv_i8(
1136; CHECK-NEXT:    [[TMP1:%.*]] = sext i8 [[X:%.*]] to i32
1137; CHECK-NEXT:    [[TMP2:%.*]] = sext i8 [[Y:%.*]] to i32
1138; CHECK-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
1139; CHECK-NEXT:    [[TMP4:%.*]] = ashr i32 [[TMP3]], 30
1140; CHECK-NEXT:    [[TMP5:%.*]] = or i32 [[TMP4]], 1
1141; CHECK-NEXT:    [[TMP6:%.*]] = sitofp i32 [[TMP1]] to float
1142; CHECK-NEXT:    [[TMP7:%.*]] = sitofp i32 [[TMP2]] to float
1143; CHECK-NEXT:    [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]])
1144; CHECK-NEXT:    [[TMP9:%.*]] = fmul fast float [[TMP6]], [[TMP8]]
1145; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.trunc.f32(float [[TMP9]])
1146; CHECK-NEXT:    [[TMP11:%.*]] = fneg fast float [[TMP10]]
1147; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP11]], float [[TMP7]], float [[TMP6]])
1148; CHECK-NEXT:    [[TMP13:%.*]] = fptosi float [[TMP10]] to i32
1149; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP12]])
1150; CHECK-NEXT:    [[TMP15:%.*]] = call fast float @llvm.fabs.f32(float [[TMP7]])
1151; CHECK-NEXT:    [[TMP16:%.*]] = fcmp fast oge float [[TMP14]], [[TMP15]]
1152; CHECK-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP5]], i32 0
1153; CHECK-NEXT:    [[TMP18:%.*]] = add i32 [[TMP13]], [[TMP17]]
1154; CHECK-NEXT:    [[TMP19:%.*]] = shl i32 [[TMP18]], 24
1155; CHECK-NEXT:    [[TMP20:%.*]] = ashr i32 [[TMP19]], 24
1156; CHECK-NEXT:    [[TMP21:%.*]] = trunc i32 [[TMP20]] to i8
1157; CHECK-NEXT:    store i8 [[TMP21]], i8 addrspace(1)* [[OUT:%.*]], align 1
1158; CHECK-NEXT:    ret void
1159;
1160; GFX6-LABEL: sdiv_i8:
1161; GFX6:       ; %bb.0:
1162; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
1163; GFX6-NEXT:    s_load_dword s0, s[0:1], 0xb
1164; GFX6-NEXT:    s_mov_b32 s7, 0xf000
1165; GFX6-NEXT:    s_mov_b32 s6, -1
1166; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1167; GFX6-NEXT:    s_bfe_i32 s1, s0, 0x80008
1168; GFX6-NEXT:    v_cvt_f32_i32_e32 v0, s1
1169; GFX6-NEXT:    s_sext_i32_i8 s0, s0
1170; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s0
1171; GFX6-NEXT:    s_xor_b32 s0, s0, s1
1172; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v0
1173; GFX6-NEXT:    s_ashr_i32 s0, s0, 30
1174; GFX6-NEXT:    s_or_b32 s0, s0, 1
1175; GFX6-NEXT:    v_mov_b32_e32 v3, s0
1176; GFX6-NEXT:    v_mul_f32_e32 v2, v1, v2
1177; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
1178; GFX6-NEXT:    v_mad_f32 v1, -v2, v0, v1
1179; GFX6-NEXT:    v_cvt_i32_f32_e32 v2, v2
1180; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
1181; GFX6-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
1182; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
1183; GFX6-NEXT:    buffer_store_byte v0, off, s[4:7], 0
1184; GFX6-NEXT:    s_endpgm
1185;
1186; GFX9-LABEL: sdiv_i8:
1187; GFX9:       ; %bb.0:
1188; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
1189; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x2c
1190; GFX9-NEXT:    v_mov_b32_e32 v1, 0
1191; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1192; GFX9-NEXT:    s_bfe_i32 s0, s4, 0x80008
1193; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s0
1194; GFX9-NEXT:    s_sext_i32_i8 s1, s4
1195; GFX9-NEXT:    v_cvt_f32_i32_e32 v2, s1
1196; GFX9-NEXT:    s_xor_b32 s0, s1, s0
1197; GFX9-NEXT:    v_rcp_iflag_f32_e32 v3, v0
1198; GFX9-NEXT:    s_ashr_i32 s0, s0, 30
1199; GFX9-NEXT:    s_or_b32 s4, s0, 1
1200; GFX9-NEXT:    v_mul_f32_e32 v3, v2, v3
1201; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
1202; GFX9-NEXT:    v_mad_f32 v2, -v3, v0, v2
1203; GFX9-NEXT:    v_cvt_i32_f32_e32 v3, v3
1204; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v2|, |v0|
1205; GFX9-NEXT:    s_and_b64 s[0:1], s[0:1], exec
1206; GFX9-NEXT:    s_cselect_b32 s0, s4, 0
1207; GFX9-NEXT:    v_add_u32_e32 v0, s0, v3
1208; GFX9-NEXT:    global_store_byte v1, v0, s[2:3]
1209; GFX9-NEXT:    s_endpgm
1210;
1211; GFX90A-LABEL: sdiv_i8:
1212; GFX90A:       ; %bb.0:
1213; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
1214; GFX90A-NEXT:    s_load_dword s4, s[0:1], 0x2c
1215; GFX90A-NEXT:    v_mov_b32_e32 v1, 0
1216; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
1217; GFX90A-NEXT:    s_bfe_i32 s0, s4, 0x80008
1218; GFX90A-NEXT:    v_cvt_f32_i32_e32 v0, s0
1219; GFX90A-NEXT:    s_sext_i32_i8 s1, s4
1220; GFX90A-NEXT:    v_cvt_f32_i32_e32 v2, s1
1221; GFX90A-NEXT:    s_xor_b32 s0, s1, s0
1222; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v3, v0
1223; GFX90A-NEXT:    s_ashr_i32 s0, s0, 30
1224; GFX90A-NEXT:    s_or_b32 s4, s0, 1
1225; GFX90A-NEXT:    v_mul_f32_e32 v3, v2, v3
1226; GFX90A-NEXT:    v_trunc_f32_e32 v3, v3
1227; GFX90A-NEXT:    v_mad_f32 v2, -v3, v0, v2
1228; GFX90A-NEXT:    v_cvt_i32_f32_e32 v3, v3
1229; GFX90A-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v2|, |v0|
1230; GFX90A-NEXT:    s_and_b64 s[0:1], s[0:1], exec
1231; GFX90A-NEXT:    s_cselect_b32 s0, s4, 0
1232; GFX90A-NEXT:    v_add_u32_e32 v0, s0, v3
1233; GFX90A-NEXT:    global_store_byte v1, v0, s[2:3]
1234; GFX90A-NEXT:    s_endpgm
1235  %r = sdiv i8 %x, %y
1236  store i8 %r, i8 addrspace(1)* %out
1237  ret void
1238}
1239
; srem_i8: kernel that stores the signed remainder %x srem %y (i8) to %out.
; The assertions inside this function are autogenerated (see the NOTE lines at
; the top of the file); regenerate them with the scripts named there rather
; than editing them by hand.
; The opt assertions pin the IR expansion: both operands are sign-extended to
; i32, a correction term ((x xor y) ashr 30) or 1 is computed, the operands are
; converted to float, and a quotient estimate is built from
; llvm.amdgcn.rcp.f32, fmul and llvm.trunc.f32. The correction term is added
; when the absolute value of the fmad residual is >= the absolute value of the
; divisor. The remainder is then x - q * y, sign-extended from 8 bits
; (shl/ashr by 24) and truncated to i8.
; The llc assertions pin the generated ISA for tahiti, gfx900 and gfx90a.
1240define amdgpu_kernel void @srem_i8(i8 addrspace(1)* %out, i8 %x, i8 %y) {
1241; CHECK-LABEL: @srem_i8(
1242; CHECK-NEXT:    [[TMP1:%.*]] = sext i8 [[X:%.*]] to i32
1243; CHECK-NEXT:    [[TMP2:%.*]] = sext i8 [[Y:%.*]] to i32
1244; CHECK-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
1245; CHECK-NEXT:    [[TMP4:%.*]] = ashr i32 [[TMP3]], 30
1246; CHECK-NEXT:    [[TMP5:%.*]] = or i32 [[TMP4]], 1
1247; CHECK-NEXT:    [[TMP6:%.*]] = sitofp i32 [[TMP1]] to float
1248; CHECK-NEXT:    [[TMP7:%.*]] = sitofp i32 [[TMP2]] to float
1249; CHECK-NEXT:    [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]])
1250; CHECK-NEXT:    [[TMP9:%.*]] = fmul fast float [[TMP6]], [[TMP8]]
1251; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.trunc.f32(float [[TMP9]])
1252; CHECK-NEXT:    [[TMP11:%.*]] = fneg fast float [[TMP10]]
1253; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP11]], float [[TMP7]], float [[TMP6]])
1254; CHECK-NEXT:    [[TMP13:%.*]] = fptosi float [[TMP10]] to i32
1255; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP12]])
1256; CHECK-NEXT:    [[TMP15:%.*]] = call fast float @llvm.fabs.f32(float [[TMP7]])
1257; CHECK-NEXT:    [[TMP16:%.*]] = fcmp fast oge float [[TMP14]], [[TMP15]]
1258; CHECK-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP5]], i32 0
1259; CHECK-NEXT:    [[TMP18:%.*]] = add i32 [[TMP13]], [[TMP17]]
1260; CHECK-NEXT:    [[TMP19:%.*]] = mul i32 [[TMP18]], [[TMP2]]
1261; CHECK-NEXT:    [[TMP20:%.*]] = sub i32 [[TMP1]], [[TMP19]]
1262; CHECK-NEXT:    [[TMP21:%.*]] = shl i32 [[TMP20]], 24
1263; CHECK-NEXT:    [[TMP22:%.*]] = ashr i32 [[TMP21]], 24
1264; CHECK-NEXT:    [[TMP23:%.*]] = trunc i32 [[TMP22]] to i8
1265; CHECK-NEXT:    store i8 [[TMP23]], i8 addrspace(1)* [[OUT:%.*]], align 1
1266; CHECK-NEXT:    ret void
1267;
1268; GFX6-LABEL: srem_i8:
1269; GFX6:       ; %bb.0:
1270; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
1271; GFX6-NEXT:    s_load_dword s0, s[0:1], 0xb
1272; GFX6-NEXT:    s_mov_b32 s7, 0xf000
1273; GFX6-NEXT:    s_mov_b32 s6, -1
1274; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1275; GFX6-NEXT:    s_bfe_i32 s1, s0, 0x80008
1276; GFX6-NEXT:    v_cvt_f32_i32_e32 v0, s1
1277; GFX6-NEXT:    s_sext_i32_i8 s3, s0
1278; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s3
1279; GFX6-NEXT:    s_xor_b32 s1, s3, s1
1280; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v0
1281; GFX6-NEXT:    s_ashr_i32 s1, s1, 30
1282; GFX6-NEXT:    s_or_b32 s1, s1, 1
1283; GFX6-NEXT:    v_mov_b32_e32 v3, s1
1284; GFX6-NEXT:    v_mul_f32_e32 v2, v1, v2
1285; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
1286; GFX6-NEXT:    v_mad_f32 v1, -v2, v0, v1
1287; GFX6-NEXT:    v_cvt_i32_f32_e32 v2, v2
1288; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
1289; GFX6-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
1290; GFX6-NEXT:    s_lshr_b32 s2, s0, 8
1291; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
1292; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s2
1293; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
1294; GFX6-NEXT:    buffer_store_byte v0, off, s[4:7], 0
1295; GFX6-NEXT:    s_endpgm
1296;
1297; GFX9-LABEL: srem_i8:
1298; GFX9:       ; %bb.0:
1299; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x2c
1300; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1301; GFX9-NEXT:    s_bfe_i32 s2, s4, 0x80008
1302; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s2
1303; GFX9-NEXT:    s_sext_i32_i8 s3, s4
1304; GFX9-NEXT:    v_cvt_f32_i32_e32 v1, s3
1305; GFX9-NEXT:    s_xor_b32 s2, s3, s2
1306; GFX9-NEXT:    v_rcp_iflag_f32_e32 v2, v0
1307; GFX9-NEXT:    s_ashr_i32 s2, s2, 30
1308; GFX9-NEXT:    s_lshr_b32 s5, s4, 8
1309; GFX9-NEXT:    s_or_b32 s6, s2, 1
1310; GFX9-NEXT:    v_mul_f32_e32 v2, v1, v2
1311; GFX9-NEXT:    v_trunc_f32_e32 v2, v2
1312; GFX9-NEXT:    v_mad_f32 v1, -v2, v0, v1
1313; GFX9-NEXT:    v_cvt_i32_f32_e32 v2, v2
1314; GFX9-NEXT:    v_cmp_ge_f32_e64 s[2:3], |v1|, |v0|
1315; GFX9-NEXT:    s_and_b64 s[2:3], s[2:3], exec
1316; GFX9-NEXT:    s_cselect_b32 s2, s6, 0
1317; GFX9-NEXT:    v_add_u32_e32 v0, s2, v2
1318; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s5
1319; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1320; GFX9-NEXT:    v_mov_b32_e32 v1, 0
1321; GFX9-NEXT:    v_sub_u32_e32 v0, s4, v0
1322; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1323; GFX9-NEXT:    global_store_byte v1, v0, s[0:1]
1324; GFX9-NEXT:    s_endpgm
1325;
1326; GFX90A-LABEL: srem_i8:
1327; GFX90A:       ; %bb.0:
1328; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
1329; GFX90A-NEXT:    s_load_dword s4, s[0:1], 0x2c
1330; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
1331; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
1332; GFX90A-NEXT:    s_bfe_i32 s0, s4, 0x80008
1333; GFX90A-NEXT:    v_cvt_f32_i32_e32 v1, s0
1334; GFX90A-NEXT:    s_sext_i32_i8 s1, s4
1335; GFX90A-NEXT:    v_cvt_f32_i32_e32 v2, s1
1336; GFX90A-NEXT:    s_xor_b32 s0, s1, s0
1337; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v3, v1
1338; GFX90A-NEXT:    s_ashr_i32 s0, s0, 30
1339; GFX90A-NEXT:    s_lshr_b32 s5, s4, 8
1340; GFX90A-NEXT:    s_or_b32 s6, s0, 1
1341; GFX90A-NEXT:    v_mul_f32_e32 v3, v2, v3
1342; GFX90A-NEXT:    v_trunc_f32_e32 v3, v3
1343; GFX90A-NEXT:    v_mad_f32 v2, -v3, v1, v2
1344; GFX90A-NEXT:    v_cvt_i32_f32_e32 v3, v3
1345; GFX90A-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v2|, |v1|
1346; GFX90A-NEXT:    s_and_b64 s[0:1], s[0:1], exec
1347; GFX90A-NEXT:    s_cselect_b32 s0, s6, 0
1348; GFX90A-NEXT:    v_add_u32_e32 v1, s0, v3
1349; GFX90A-NEXT:    v_mul_lo_u32 v1, v1, s5
1350; GFX90A-NEXT:    v_sub_u32_e32 v1, s4, v1
1351; GFX90A-NEXT:    global_store_byte v0, v1, s[2:3]
1352; GFX90A-NEXT:    s_endpgm
1353  %r = srem i8 %x, %y
1354  store i8 %r, i8 addrspace(1)* %out
1355  ret void
1356}
1357
; Test udiv_v4i32 -- unsigned division of two <4 x i32> kernel arguments
; (%x / %y), with the quotient vector stored to %out.
;
; The opt assertions below show the vector udiv being scalarized: each of the
; four lanes is extracted and expanded on its own into a uitofp +
; llvm.amdgcn.rcp.f32 reciprocal estimate, a refinement of that estimate using
; the high half of a 64-bit multiply, and two icmp uge / select correction
; steps, before being reinserted into the result vector.
; The llc assertions record the ISA produced for tahiti, gfx900 and gfx90a.
; All assertions are autogenerated (see the NOTE lines at the top of the file);
; regenerate them with the update scripts rather than editing them by hand.
1358define amdgpu_kernel void @udiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x, <4 x i32> %y) {
1359; CHECK-LABEL: @udiv_v4i32(
1360; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i32> [[X:%.*]], i64 0
1361; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i32> [[Y:%.*]], i64 0
1362; CHECK-NEXT:    [[TMP3:%.*]] = uitofp i32 [[TMP2]] to float
1363; CHECK-NEXT:    [[TMP4:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP3]])
1364; CHECK-NEXT:    [[TMP5:%.*]] = fmul fast float [[TMP4]], 0x41EFFFFFC0000000
1365; CHECK-NEXT:    [[TMP6:%.*]] = fptoui float [[TMP5]] to i32
1366; CHECK-NEXT:    [[TMP7:%.*]] = sub i32 0, [[TMP2]]
1367; CHECK-NEXT:    [[TMP8:%.*]] = mul i32 [[TMP7]], [[TMP6]]
1368; CHECK-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP6]] to i64
1369; CHECK-NEXT:    [[TMP10:%.*]] = zext i32 [[TMP8]] to i64
1370; CHECK-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP9]], [[TMP10]]
1371; CHECK-NEXT:    [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32
1372; CHECK-NEXT:    [[TMP13:%.*]] = lshr i64 [[TMP11]], 32
1373; CHECK-NEXT:    [[TMP14:%.*]] = trunc i64 [[TMP13]] to i32
1374; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[TMP6]], [[TMP14]]
1375; CHECK-NEXT:    [[TMP16:%.*]] = zext i32 [[TMP1]] to i64
1376; CHECK-NEXT:    [[TMP17:%.*]] = zext i32 [[TMP15]] to i64
1377; CHECK-NEXT:    [[TMP18:%.*]] = mul i64 [[TMP16]], [[TMP17]]
1378; CHECK-NEXT:    [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
1379; CHECK-NEXT:    [[TMP20:%.*]] = lshr i64 [[TMP18]], 32
1380; CHECK-NEXT:    [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32
1381; CHECK-NEXT:    [[TMP22:%.*]] = mul i32 [[TMP21]], [[TMP2]]
1382; CHECK-NEXT:    [[TMP23:%.*]] = sub i32 [[TMP1]], [[TMP22]]
1383; CHECK-NEXT:    [[TMP24:%.*]] = icmp uge i32 [[TMP23]], [[TMP2]]
1384; CHECK-NEXT:    [[TMP25:%.*]] = add i32 [[TMP21]], 1
1385; CHECK-NEXT:    [[TMP26:%.*]] = select i1 [[TMP24]], i32 [[TMP25]], i32 [[TMP21]]
1386; CHECK-NEXT:    [[TMP27:%.*]] = sub i32 [[TMP23]], [[TMP2]]
1387; CHECK-NEXT:    [[TMP28:%.*]] = select i1 [[TMP24]], i32 [[TMP27]], i32 [[TMP23]]
1388; CHECK-NEXT:    [[TMP29:%.*]] = icmp uge i32 [[TMP28]], [[TMP2]]
1389; CHECK-NEXT:    [[TMP30:%.*]] = add i32 [[TMP26]], 1
1390; CHECK-NEXT:    [[TMP31:%.*]] = select i1 [[TMP29]], i32 [[TMP30]], i32 [[TMP26]]
1391; CHECK-NEXT:    [[TMP32:%.*]] = insertelement <4 x i32> undef, i32 [[TMP31]], i64 0
1392; CHECK-NEXT:    [[TMP33:%.*]] = extractelement <4 x i32> [[X]], i64 1
1393; CHECK-NEXT:    [[TMP34:%.*]] = extractelement <4 x i32> [[Y]], i64 1
1394; CHECK-NEXT:    [[TMP35:%.*]] = uitofp i32 [[TMP34]] to float
1395; CHECK-NEXT:    [[TMP36:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP35]])
1396; CHECK-NEXT:    [[TMP37:%.*]] = fmul fast float [[TMP36]], 0x41EFFFFFC0000000
1397; CHECK-NEXT:    [[TMP38:%.*]] = fptoui float [[TMP37]] to i32
1398; CHECK-NEXT:    [[TMP39:%.*]] = sub i32 0, [[TMP34]]
1399; CHECK-NEXT:    [[TMP40:%.*]] = mul i32 [[TMP39]], [[TMP38]]
1400; CHECK-NEXT:    [[TMP41:%.*]] = zext i32 [[TMP38]] to i64
1401; CHECK-NEXT:    [[TMP42:%.*]] = zext i32 [[TMP40]] to i64
1402; CHECK-NEXT:    [[TMP43:%.*]] = mul i64 [[TMP41]], [[TMP42]]
1403; CHECK-NEXT:    [[TMP44:%.*]] = trunc i64 [[TMP43]] to i32
1404; CHECK-NEXT:    [[TMP45:%.*]] = lshr i64 [[TMP43]], 32
1405; CHECK-NEXT:    [[TMP46:%.*]] = trunc i64 [[TMP45]] to i32
1406; CHECK-NEXT:    [[TMP47:%.*]] = add i32 [[TMP38]], [[TMP46]]
1407; CHECK-NEXT:    [[TMP48:%.*]] = zext i32 [[TMP33]] to i64
1408; CHECK-NEXT:    [[TMP49:%.*]] = zext i32 [[TMP47]] to i64
1409; CHECK-NEXT:    [[TMP50:%.*]] = mul i64 [[TMP48]], [[TMP49]]
1410; CHECK-NEXT:    [[TMP51:%.*]] = trunc i64 [[TMP50]] to i32
1411; CHECK-NEXT:    [[TMP52:%.*]] = lshr i64 [[TMP50]], 32
1412; CHECK-NEXT:    [[TMP53:%.*]] = trunc i64 [[TMP52]] to i32
1413; CHECK-NEXT:    [[TMP54:%.*]] = mul i32 [[TMP53]], [[TMP34]]
1414; CHECK-NEXT:    [[TMP55:%.*]] = sub i32 [[TMP33]], [[TMP54]]
1415; CHECK-NEXT:    [[TMP56:%.*]] = icmp uge i32 [[TMP55]], [[TMP34]]
1416; CHECK-NEXT:    [[TMP57:%.*]] = add i32 [[TMP53]], 1
1417; CHECK-NEXT:    [[TMP58:%.*]] = select i1 [[TMP56]], i32 [[TMP57]], i32 [[TMP53]]
1418; CHECK-NEXT:    [[TMP59:%.*]] = sub i32 [[TMP55]], [[TMP34]]
1419; CHECK-NEXT:    [[TMP60:%.*]] = select i1 [[TMP56]], i32 [[TMP59]], i32 [[TMP55]]
1420; CHECK-NEXT:    [[TMP61:%.*]] = icmp uge i32 [[TMP60]], [[TMP34]]
1421; CHECK-NEXT:    [[TMP62:%.*]] = add i32 [[TMP58]], 1
1422; CHECK-NEXT:    [[TMP63:%.*]] = select i1 [[TMP61]], i32 [[TMP62]], i32 [[TMP58]]
1423; CHECK-NEXT:    [[TMP64:%.*]] = insertelement <4 x i32> [[TMP32]], i32 [[TMP63]], i64 1
1424; CHECK-NEXT:    [[TMP65:%.*]] = extractelement <4 x i32> [[X]], i64 2
1425; CHECK-NEXT:    [[TMP66:%.*]] = extractelement <4 x i32> [[Y]], i64 2
1426; CHECK-NEXT:    [[TMP67:%.*]] = uitofp i32 [[TMP66]] to float
1427; CHECK-NEXT:    [[TMP68:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP67]])
1428; CHECK-NEXT:    [[TMP69:%.*]] = fmul fast float [[TMP68]], 0x41EFFFFFC0000000
1429; CHECK-NEXT:    [[TMP70:%.*]] = fptoui float [[TMP69]] to i32
1430; CHECK-NEXT:    [[TMP71:%.*]] = sub i32 0, [[TMP66]]
1431; CHECK-NEXT:    [[TMP72:%.*]] = mul i32 [[TMP71]], [[TMP70]]
1432; CHECK-NEXT:    [[TMP73:%.*]] = zext i32 [[TMP70]] to i64
1433; CHECK-NEXT:    [[TMP74:%.*]] = zext i32 [[TMP72]] to i64
1434; CHECK-NEXT:    [[TMP75:%.*]] = mul i64 [[TMP73]], [[TMP74]]
1435; CHECK-NEXT:    [[TMP76:%.*]] = trunc i64 [[TMP75]] to i32
1436; CHECK-NEXT:    [[TMP77:%.*]] = lshr i64 [[TMP75]], 32
1437; CHECK-NEXT:    [[TMP78:%.*]] = trunc i64 [[TMP77]] to i32
1438; CHECK-NEXT:    [[TMP79:%.*]] = add i32 [[TMP70]], [[TMP78]]
1439; CHECK-NEXT:    [[TMP80:%.*]] = zext i32 [[TMP65]] to i64
1440; CHECK-NEXT:    [[TMP81:%.*]] = zext i32 [[TMP79]] to i64
1441; CHECK-NEXT:    [[TMP82:%.*]] = mul i64 [[TMP80]], [[TMP81]]
1442; CHECK-NEXT:    [[TMP83:%.*]] = trunc i64 [[TMP82]] to i32
1443; CHECK-NEXT:    [[TMP84:%.*]] = lshr i64 [[TMP82]], 32
1444; CHECK-NEXT:    [[TMP85:%.*]] = trunc i64 [[TMP84]] to i32
1445; CHECK-NEXT:    [[TMP86:%.*]] = mul i32 [[TMP85]], [[TMP66]]
1446; CHECK-NEXT:    [[TMP87:%.*]] = sub i32 [[TMP65]], [[TMP86]]
1447; CHECK-NEXT:    [[TMP88:%.*]] = icmp uge i32 [[TMP87]], [[TMP66]]
1448; CHECK-NEXT:    [[TMP89:%.*]] = add i32 [[TMP85]], 1
1449; CHECK-NEXT:    [[TMP90:%.*]] = select i1 [[TMP88]], i32 [[TMP89]], i32 [[TMP85]]
1450; CHECK-NEXT:    [[TMP91:%.*]] = sub i32 [[TMP87]], [[TMP66]]
1451; CHECK-NEXT:    [[TMP92:%.*]] = select i1 [[TMP88]], i32 [[TMP91]], i32 [[TMP87]]
1452; CHECK-NEXT:    [[TMP93:%.*]] = icmp uge i32 [[TMP92]], [[TMP66]]
1453; CHECK-NEXT:    [[TMP94:%.*]] = add i32 [[TMP90]], 1
1454; CHECK-NEXT:    [[TMP95:%.*]] = select i1 [[TMP93]], i32 [[TMP94]], i32 [[TMP90]]
1455; CHECK-NEXT:    [[TMP96:%.*]] = insertelement <4 x i32> [[TMP64]], i32 [[TMP95]], i64 2
1456; CHECK-NEXT:    [[TMP97:%.*]] = extractelement <4 x i32> [[X]], i64 3
1457; CHECK-NEXT:    [[TMP98:%.*]] = extractelement <4 x i32> [[Y]], i64 3
1458; CHECK-NEXT:    [[TMP99:%.*]] = uitofp i32 [[TMP98]] to float
1459; CHECK-NEXT:    [[TMP100:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP99]])
1460; CHECK-NEXT:    [[TMP101:%.*]] = fmul fast float [[TMP100]], 0x41EFFFFFC0000000
1461; CHECK-NEXT:    [[TMP102:%.*]] = fptoui float [[TMP101]] to i32
1462; CHECK-NEXT:    [[TMP103:%.*]] = sub i32 0, [[TMP98]]
1463; CHECK-NEXT:    [[TMP104:%.*]] = mul i32 [[TMP103]], [[TMP102]]
1464; CHECK-NEXT:    [[TMP105:%.*]] = zext i32 [[TMP102]] to i64
1465; CHECK-NEXT:    [[TMP106:%.*]] = zext i32 [[TMP104]] to i64
1466; CHECK-NEXT:    [[TMP107:%.*]] = mul i64 [[TMP105]], [[TMP106]]
1467; CHECK-NEXT:    [[TMP108:%.*]] = trunc i64 [[TMP107]] to i32
1468; CHECK-NEXT:    [[TMP109:%.*]] = lshr i64 [[TMP107]], 32
1469; CHECK-NEXT:    [[TMP110:%.*]] = trunc i64 [[TMP109]] to i32
1470; CHECK-NEXT:    [[TMP111:%.*]] = add i32 [[TMP102]], [[TMP110]]
1471; CHECK-NEXT:    [[TMP112:%.*]] = zext i32 [[TMP97]] to i64
1472; CHECK-NEXT:    [[TMP113:%.*]] = zext i32 [[TMP111]] to i64
1473; CHECK-NEXT:    [[TMP114:%.*]] = mul i64 [[TMP112]], [[TMP113]]
1474; CHECK-NEXT:    [[TMP115:%.*]] = trunc i64 [[TMP114]] to i32
1475; CHECK-NEXT:    [[TMP116:%.*]] = lshr i64 [[TMP114]], 32
1476; CHECK-NEXT:    [[TMP117:%.*]] = trunc i64 [[TMP116]] to i32
1477; CHECK-NEXT:    [[TMP118:%.*]] = mul i32 [[TMP117]], [[TMP98]]
1478; CHECK-NEXT:    [[TMP119:%.*]] = sub i32 [[TMP97]], [[TMP118]]
1479; CHECK-NEXT:    [[TMP120:%.*]] = icmp uge i32 [[TMP119]], [[TMP98]]
1480; CHECK-NEXT:    [[TMP121:%.*]] = add i32 [[TMP117]], 1
1481; CHECK-NEXT:    [[TMP122:%.*]] = select i1 [[TMP120]], i32 [[TMP121]], i32 [[TMP117]]
1482; CHECK-NEXT:    [[TMP123:%.*]] = sub i32 [[TMP119]], [[TMP98]]
1483; CHECK-NEXT:    [[TMP124:%.*]] = select i1 [[TMP120]], i32 [[TMP123]], i32 [[TMP119]]
1484; CHECK-NEXT:    [[TMP125:%.*]] = icmp uge i32 [[TMP124]], [[TMP98]]
1485; CHECK-NEXT:    [[TMP126:%.*]] = add i32 [[TMP122]], 1
1486; CHECK-NEXT:    [[TMP127:%.*]] = select i1 [[TMP125]], i32 [[TMP126]], i32 [[TMP122]]
1487; CHECK-NEXT:    [[TMP128:%.*]] = insertelement <4 x i32> [[TMP96]], i32 [[TMP127]], i64 3
1488; CHECK-NEXT:    store <4 x i32> [[TMP128]], <4 x i32> addrspace(1)* [[OUT:%.*]], align 16
1489; CHECK-NEXT:    ret void
1490;
1491; GFX6-LABEL: udiv_v4i32:
1492; GFX6:       ; %bb.0:
1493; GFX6-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0xd
1494; GFX6-NEXT:    s_mov_b32 s3, 0x4f7ffffe
1495; GFX6-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x9
1496; GFX6-NEXT:    s_mov_b32 s15, 0xf000
1497; GFX6-NEXT:    s_mov_b32 s14, -1
1498; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1499; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s8
1500; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s9
1501; GFX6-NEXT:    s_sub_i32 s2, 0, s8
1502; GFX6-NEXT:    v_cvt_f32_u32_e32 v4, s10
1503; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
1504; GFX6-NEXT:    v_rcp_iflag_f32_e32 v1, v1
1505; GFX6-NEXT:    v_cvt_f32_u32_e32 v6, s11
1506; GFX6-NEXT:    v_mul_f32_e32 v0, s3, v0
1507; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
1508; GFX6-NEXT:    v_mul_f32_e32 v1, s3, v1
1509; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
1510; GFX6-NEXT:    v_mul_lo_u32 v2, s2, v0
1511; GFX6-NEXT:    s_sub_i32 s2, 0, s9
1512; GFX6-NEXT:    v_mul_lo_u32 v3, s2, v1
1513; GFX6-NEXT:    s_sub_i32 s2, 0, s10
1514; GFX6-NEXT:    v_mul_hi_u32 v2, v0, v2
1515; GFX6-NEXT:    v_mul_hi_u32 v3, v1, v3
1516; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
1517; GFX6-NEXT:    v_mul_hi_u32 v0, s4, v0
1518; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v3, v1
1519; GFX6-NEXT:    v_mul_hi_u32 v1, s5, v1
1520; GFX6-NEXT:    v_mul_lo_u32 v2, v0, s8
1521; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 1, v0
1522; GFX6-NEXT:    v_mul_lo_u32 v5, v1, s9
1523; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s4, v2
1524; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s8, v2
1525; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v3, s[0:1]
1526; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s8, v2
1527; GFX6-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
1528; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 1, v0
1529; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s8, v2
1530; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v4
1531; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
1532; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s5, v5
1533; GFX6-NEXT:    v_mul_f32_e32 v2, s3, v2
1534; GFX6-NEXT:    v_cvt_u32_f32_e32 v2, v2
1535; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 1, v1
1536; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s9, v3
1537; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, v4, s[0:1]
1538; GFX6-NEXT:    v_mul_lo_u32 v4, s2, v2
1539; GFX6-NEXT:    v_subrev_i32_e32 v5, vcc, s9, v3
1540; GFX6-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s[0:1]
1541; GFX6-NEXT:    v_mul_hi_u32 v4, v2, v4
1542; GFX6-NEXT:    v_add_i32_e32 v5, vcc, 1, v1
1543; GFX6-NEXT:    s_sub_i32 s0, 0, s11
1544; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
1545; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v6
1546; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s9, v3
1547; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
1548; GFX6-NEXT:    v_mul_hi_u32 v2, s6, v2
1549; GFX6-NEXT:    v_mul_f32_e32 v4, s3, v4
1550; GFX6-NEXT:    v_cvt_u32_f32_e32 v4, v4
1551; GFX6-NEXT:    v_mul_lo_u32 v3, v2, s10
1552; GFX6-NEXT:    v_add_i32_e32 v6, vcc, 1, v2
1553; GFX6-NEXT:    v_mul_lo_u32 v5, s0, v4
1554; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s6, v3
1555; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s10, v3
1556; GFX6-NEXT:    v_mul_hi_u32 v5, v4, v5
1557; GFX6-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[0:1]
1558; GFX6-NEXT:    v_subrev_i32_e32 v6, vcc, s10, v3
1559; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
1560; GFX6-NEXT:    v_mul_hi_u32 v4, s7, v4
1561; GFX6-NEXT:    v_cndmask_b32_e64 v3, v3, v6, s[0:1]
1562; GFX6-NEXT:    v_add_i32_e32 v5, vcc, 1, v2
1563; GFX6-NEXT:    v_mul_lo_u32 v6, v4, s11
1564; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s10, v3
1565; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
1566; GFX6-NEXT:    v_add_i32_e32 v5, vcc, 1, v4
1567; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s7, v6
1568; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s11, v3
1569; GFX6-NEXT:    v_cndmask_b32_e64 v4, v4, v5, s[0:1]
1570; GFX6-NEXT:    v_subrev_i32_e32 v5, vcc, s11, v3
1571; GFX6-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s[0:1]
1572; GFX6-NEXT:    v_add_i32_e32 v5, vcc, 1, v4
1573; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s11, v3
1574; GFX6-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
1575; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[12:15], 0
1576; GFX6-NEXT:    s_endpgm
1577;
1578; GFX9-LABEL: udiv_v4i32:
1579; GFX9:       ; %bb.0:
1580; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x34
1581; GFX9-NEXT:    s_mov_b32 s12, 0x4f7ffffe
1582; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1583; GFX9-NEXT:    v_mov_b32_e32 v4, 0
1584; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1585; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s8
1586; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s9
1587; GFX9-NEXT:    s_sub_i32 s2, 0, s8
1588; GFX9-NEXT:    s_sub_i32 s3, 0, s9
1589; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
1590; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v1
1591; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, s10
1592; GFX9-NEXT:    v_mul_f32_e32 v0, s12, v0
1593; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
1594; GFX9-NEXT:    v_mul_f32_e32 v1, s12, v1
1595; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
1596; GFX9-NEXT:    v_rcp_iflag_f32_e32 v5, v5
1597; GFX9-NEXT:    v_mul_lo_u32 v2, s2, v0
1598; GFX9-NEXT:    s_sub_i32 s2, 0, s10
1599; GFX9-NEXT:    v_mul_lo_u32 v3, s3, v1
1600; GFX9-NEXT:    v_mul_hi_u32 v2, v0, v2
1601; GFX9-NEXT:    v_mul_hi_u32 v3, v1, v3
1602; GFX9-NEXT:    v_add_u32_e32 v0, v0, v2
1603; GFX9-NEXT:    v_mul_hi_u32 v0, s4, v0
1604; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
1605; GFX9-NEXT:    v_mul_f32_e32 v3, s12, v5
1606; GFX9-NEXT:    v_mul_hi_u32 v1, s5, v1
1607; GFX9-NEXT:    v_mul_lo_u32 v5, v0, s8
1608; GFX9-NEXT:    v_cvt_f32_u32_e32 v2, s11
1609; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v3
1610; GFX9-NEXT:    v_add_u32_e32 v7, 1, v0
1611; GFX9-NEXT:    v_sub_u32_e32 v5, s4, v5
1612; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s8, v5
1613; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v7, vcc
1614; GFX9-NEXT:    v_subrev_u32_e32 v7, s8, v5
1615; GFX9-NEXT:    v_mul_lo_u32 v6, v1, s9
1616; GFX9-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc
1617; GFX9-NEXT:    v_add_u32_e32 v7, 1, v0
1618; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s8, v5
1619; GFX9-NEXT:    v_rcp_iflag_f32_e32 v2, v2
1620; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v7, vcc
1621; GFX9-NEXT:    v_mul_lo_u32 v7, s2, v3
1622; GFX9-NEXT:    v_sub_u32_e32 v6, s5, v6
1623; GFX9-NEXT:    v_add_u32_e32 v5, 1, v1
1624; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s9, v6
1625; GFX9-NEXT:    v_mul_f32_e32 v2, s12, v2
1626; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
1627; GFX9-NEXT:    v_mul_hi_u32 v5, v3, v7
1628; GFX9-NEXT:    v_cvt_u32_f32_e32 v2, v2
1629; GFX9-NEXT:    s_sub_i32 s2, 0, s11
1630; GFX9-NEXT:    v_subrev_u32_e32 v7, s9, v6
1631; GFX9-NEXT:    v_add_u32_e32 v3, v3, v5
1632; GFX9-NEXT:    v_mul_lo_u32 v5, s2, v2
1633; GFX9-NEXT:    v_mul_hi_u32 v3, s6, v3
1634; GFX9-NEXT:    v_cndmask_b32_e32 v6, v6, v7, vcc
1635; GFX9-NEXT:    v_add_u32_e32 v7, 1, v1
1636; GFX9-NEXT:    v_mul_hi_u32 v5, v2, v5
1637; GFX9-NEXT:    v_mul_lo_u32 v8, v3, s10
1638; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s9, v6
1639; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v7, vcc
1640; GFX9-NEXT:    v_add_u32_e32 v2, v2, v5
1641; GFX9-NEXT:    v_mul_hi_u32 v5, s7, v2
1642; GFX9-NEXT:    v_sub_u32_e32 v6, s6, v8
1643; GFX9-NEXT:    v_add_u32_e32 v7, 1, v3
1644; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s10, v6
1645; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v7, vcc
1646; GFX9-NEXT:    v_subrev_u32_e32 v3, s10, v6
1647; GFX9-NEXT:    v_cndmask_b32_e32 v3, v6, v3, vcc
1648; GFX9-NEXT:    v_mul_lo_u32 v6, v5, s11
1649; GFX9-NEXT:    v_add_u32_e32 v7, 1, v2
1650; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s10, v3
1651; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v7, vcc
1652; GFX9-NEXT:    v_sub_u32_e32 v3, s7, v6
1653; GFX9-NEXT:    v_add_u32_e32 v6, 1, v5
1654; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s11, v3
1655; GFX9-NEXT:    v_cndmask_b32_e32 v5, v5, v6, vcc
1656; GFX9-NEXT:    v_subrev_u32_e32 v6, s11, v3
1657; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
1658; GFX9-NEXT:    v_add_u32_e32 v6, 1, v5
1659; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s11, v3
1660; GFX9-NEXT:    v_cndmask_b32_e32 v3, v5, v6, vcc
1661; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
1662; GFX9-NEXT:    s_endpgm
1663;
1664; GFX90A-LABEL: udiv_v4i32:
1665; GFX90A:       ; %bb.0:
1666; GFX90A-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x34
1667; GFX90A-NEXT:    s_mov_b32 s3, 0x4f7ffffe
1668; GFX90A-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1669; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
1670; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
1671; GFX90A-NEXT:    v_cvt_f32_u32_e32 v0, s8
1672; GFX90A-NEXT:    v_cvt_f32_u32_e32 v1, s9
1673; GFX90A-NEXT:    s_sub_i32 s2, 0, s8
1674; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v0, v0
1675; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v1, v1
1676; GFX90A-NEXT:    v_mul_f32_e32 v0, s3, v0
1677; GFX90A-NEXT:    v_cvt_u32_f32_e32 v0, v0
1678; GFX90A-NEXT:    v_mul_f32_e32 v1, s3, v1
1679; GFX90A-NEXT:    v_cvt_u32_f32_e32 v1, v1
1680; GFX90A-NEXT:    v_mul_lo_u32 v2, s2, v0
1681; GFX90A-NEXT:    v_mul_hi_u32 v2, v0, v2
1682; GFX90A-NEXT:    v_add_u32_e32 v0, v0, v2
1683; GFX90A-NEXT:    v_mul_hi_u32 v0, s4, v0
1684; GFX90A-NEXT:    v_mul_lo_u32 v2, v0, s8
1685; GFX90A-NEXT:    v_sub_u32_e32 v2, s4, v2
1686; GFX90A-NEXT:    v_add_u32_e32 v3, 1, v0
1687; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s8, v2
1688; GFX90A-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
1689; GFX90A-NEXT:    v_subrev_u32_e32 v3, s8, v2
1690; GFX90A-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
1691; GFX90A-NEXT:    s_sub_i32 s2, 0, s9
1692; GFX90A-NEXT:    v_add_u32_e32 v3, 1, v0
1693; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s8, v2
1694; GFX90A-NEXT:    v_mul_lo_u32 v2, s2, v1
1695; GFX90A-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
1696; GFX90A-NEXT:    v_mul_hi_u32 v2, v1, v2
1697; GFX90A-NEXT:    v_cvt_f32_u32_e32 v3, s10
1698; GFX90A-NEXT:    v_add_u32_e32 v1, v1, v2
1699; GFX90A-NEXT:    v_mul_hi_u32 v1, s5, v1
1700; GFX90A-NEXT:    v_mul_lo_u32 v2, v1, s9
1701; GFX90A-NEXT:    v_sub_u32_e32 v2, s5, v2
1702; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v3, v3
1703; GFX90A-NEXT:    v_add_u32_e32 v5, 1, v1
1704; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s9, v2
1705; GFX90A-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
1706; GFX90A-NEXT:    v_subrev_u32_e32 v5, s9, v2
1707; GFX90A-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
1708; GFX90A-NEXT:    v_add_u32_e32 v5, 1, v1
1709; GFX90A-NEXT:    v_mul_f32_e32 v3, s3, v3
1710; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s9, v2
1711; GFX90A-NEXT:    v_cvt_u32_f32_e32 v3, v3
1712; GFX90A-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
1713; GFX90A-NEXT:    v_cvt_f32_u32_e32 v5, s11
1714; GFX90A-NEXT:    s_sub_i32 s2, 0, s10
1715; GFX90A-NEXT:    v_mul_lo_u32 v2, s2, v3
1716; GFX90A-NEXT:    v_mul_hi_u32 v2, v3, v2
1717; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v5, v5
1718; GFX90A-NEXT:    v_add_u32_e32 v2, v3, v2
1719; GFX90A-NEXT:    v_mul_hi_u32 v2, s6, v2
1720; GFX90A-NEXT:    v_mul_lo_u32 v3, v2, s10
1721; GFX90A-NEXT:    v_mul_f32_e32 v5, s3, v5
1722; GFX90A-NEXT:    v_sub_u32_e32 v3, s6, v3
1723; GFX90A-NEXT:    v_cvt_u32_f32_e32 v5, v5
1724; GFX90A-NEXT:    v_add_u32_e32 v6, 1, v2
1725; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s10, v3
1726; GFX90A-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
1727; GFX90A-NEXT:    v_subrev_u32_e32 v6, s10, v3
1728; GFX90A-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
1729; GFX90A-NEXT:    s_sub_i32 s2, 0, s11
1730; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s10, v3
1731; GFX90A-NEXT:    v_mul_lo_u32 v3, s2, v5
1732; GFX90A-NEXT:    v_mul_hi_u32 v3, v5, v3
1733; GFX90A-NEXT:    v_add_u32_e32 v3, v5, v3
1734; GFX90A-NEXT:    v_mul_hi_u32 v3, s7, v3
1735; GFX90A-NEXT:    v_mul_lo_u32 v5, v3, s11
1736; GFX90A-NEXT:    v_add_u32_e32 v6, 1, v2
1737; GFX90A-NEXT:    v_sub_u32_e32 v5, s7, v5
1738; GFX90A-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
1739; GFX90A-NEXT:    v_add_u32_e32 v6, 1, v3
1740; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s11, v5
1741; GFX90A-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
1742; GFX90A-NEXT:    v_subrev_u32_e32 v6, s11, v5
1743; GFX90A-NEXT:    v_cndmask_b32_e32 v5, v5, v6, vcc
1744; GFX90A-NEXT:    v_add_u32_e32 v6, 1, v3
1745; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s11, v5
1746; GFX90A-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
1747; GFX90A-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
1748; GFX90A-NEXT:    s_endpgm
; IR under test -- a single vector udiv; every expansion asserted above is
; produced by the tools named in the RUN lines, not written here.
1749  %r = udiv <4 x i32> %x, %y
1750  store <4 x i32> %r, <4 x i32> addrspace(1)* %out
1751  ret void
1752}
1753
1754define amdgpu_kernel void @urem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x, <4 x i32> %y) {
1755; CHECK-LABEL: @urem_v4i32(
1756; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i32> [[X:%.*]], i64 0
1757; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i32> [[Y:%.*]], i64 0
1758; CHECK-NEXT:    [[TMP3:%.*]] = uitofp i32 [[TMP2]] to float
1759; CHECK-NEXT:    [[TMP4:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP3]])
1760; CHECK-NEXT:    [[TMP5:%.*]] = fmul fast float [[TMP4]], 0x41EFFFFFC0000000
1761; CHECK-NEXT:    [[TMP6:%.*]] = fptoui float [[TMP5]] to i32
1762; CHECK-NEXT:    [[TMP7:%.*]] = sub i32 0, [[TMP2]]
1763; CHECK-NEXT:    [[TMP8:%.*]] = mul i32 [[TMP7]], [[TMP6]]
1764; CHECK-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP6]] to i64
1765; CHECK-NEXT:    [[TMP10:%.*]] = zext i32 [[TMP8]] to i64
1766; CHECK-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP9]], [[TMP10]]
1767; CHECK-NEXT:    [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32
1768; CHECK-NEXT:    [[TMP13:%.*]] = lshr i64 [[TMP11]], 32
1769; CHECK-NEXT:    [[TMP14:%.*]] = trunc i64 [[TMP13]] to i32
1770; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[TMP6]], [[TMP14]]
1771; CHECK-NEXT:    [[TMP16:%.*]] = zext i32 [[TMP1]] to i64
1772; CHECK-NEXT:    [[TMP17:%.*]] = zext i32 [[TMP15]] to i64
1773; CHECK-NEXT:    [[TMP18:%.*]] = mul i64 [[TMP16]], [[TMP17]]
1774; CHECK-NEXT:    [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
1775; CHECK-NEXT:    [[TMP20:%.*]] = lshr i64 [[TMP18]], 32
1776; CHECK-NEXT:    [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32
1777; CHECK-NEXT:    [[TMP22:%.*]] = mul i32 [[TMP21]], [[TMP2]]
1778; CHECK-NEXT:    [[TMP23:%.*]] = sub i32 [[TMP1]], [[TMP22]]
1779; CHECK-NEXT:    [[TMP24:%.*]] = icmp uge i32 [[TMP23]], [[TMP2]]
1780; CHECK-NEXT:    [[TMP25:%.*]] = sub i32 [[TMP23]], [[TMP2]]
1781; CHECK-NEXT:    [[TMP26:%.*]] = select i1 [[TMP24]], i32 [[TMP25]], i32 [[TMP23]]
1782; CHECK-NEXT:    [[TMP27:%.*]] = icmp uge i32 [[TMP26]], [[TMP2]]
1783; CHECK-NEXT:    [[TMP28:%.*]] = sub i32 [[TMP26]], [[TMP2]]
1784; CHECK-NEXT:    [[TMP29:%.*]] = select i1 [[TMP27]], i32 [[TMP28]], i32 [[TMP26]]
1785; CHECK-NEXT:    [[TMP30:%.*]] = insertelement <4 x i32> undef, i32 [[TMP29]], i64 0
1786; CHECK-NEXT:    [[TMP31:%.*]] = extractelement <4 x i32> [[X]], i64 1
1787; CHECK-NEXT:    [[TMP32:%.*]] = extractelement <4 x i32> [[Y]], i64 1
1788; CHECK-NEXT:    [[TMP33:%.*]] = uitofp i32 [[TMP32]] to float
1789; CHECK-NEXT:    [[TMP34:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP33]])
1790; CHECK-NEXT:    [[TMP35:%.*]] = fmul fast float [[TMP34]], 0x41EFFFFFC0000000
1791; CHECK-NEXT:    [[TMP36:%.*]] = fptoui float [[TMP35]] to i32
1792; CHECK-NEXT:    [[TMP37:%.*]] = sub i32 0, [[TMP32]]
1793; CHECK-NEXT:    [[TMP38:%.*]] = mul i32 [[TMP37]], [[TMP36]]
1794; CHECK-NEXT:    [[TMP39:%.*]] = zext i32 [[TMP36]] to i64
1795; CHECK-NEXT:    [[TMP40:%.*]] = zext i32 [[TMP38]] to i64
1796; CHECK-NEXT:    [[TMP41:%.*]] = mul i64 [[TMP39]], [[TMP40]]
1797; CHECK-NEXT:    [[TMP42:%.*]] = trunc i64 [[TMP41]] to i32
1798; CHECK-NEXT:    [[TMP43:%.*]] = lshr i64 [[TMP41]], 32
1799; CHECK-NEXT:    [[TMP44:%.*]] = trunc i64 [[TMP43]] to i32
1800; CHECK-NEXT:    [[TMP45:%.*]] = add i32 [[TMP36]], [[TMP44]]
1801; CHECK-NEXT:    [[TMP46:%.*]] = zext i32 [[TMP31]] to i64
1802; CHECK-NEXT:    [[TMP47:%.*]] = zext i32 [[TMP45]] to i64
1803; CHECK-NEXT:    [[TMP48:%.*]] = mul i64 [[TMP46]], [[TMP47]]
1804; CHECK-NEXT:    [[TMP49:%.*]] = trunc i64 [[TMP48]] to i32
1805; CHECK-NEXT:    [[TMP50:%.*]] = lshr i64 [[TMP48]], 32
1806; CHECK-NEXT:    [[TMP51:%.*]] = trunc i64 [[TMP50]] to i32
1807; CHECK-NEXT:    [[TMP52:%.*]] = mul i32 [[TMP51]], [[TMP32]]
1808; CHECK-NEXT:    [[TMP53:%.*]] = sub i32 [[TMP31]], [[TMP52]]
1809; CHECK-NEXT:    [[TMP54:%.*]] = icmp uge i32 [[TMP53]], [[TMP32]]
1810; CHECK-NEXT:    [[TMP55:%.*]] = sub i32 [[TMP53]], [[TMP32]]
1811; CHECK-NEXT:    [[TMP56:%.*]] = select i1 [[TMP54]], i32 [[TMP55]], i32 [[TMP53]]
1812; CHECK-NEXT:    [[TMP57:%.*]] = icmp uge i32 [[TMP56]], [[TMP32]]
1813; CHECK-NEXT:    [[TMP58:%.*]] = sub i32 [[TMP56]], [[TMP32]]
1814; CHECK-NEXT:    [[TMP59:%.*]] = select i1 [[TMP57]], i32 [[TMP58]], i32 [[TMP56]]
1815; CHECK-NEXT:    [[TMP60:%.*]] = insertelement <4 x i32> [[TMP30]], i32 [[TMP59]], i64 1
1816; CHECK-NEXT:    [[TMP61:%.*]] = extractelement <4 x i32> [[X]], i64 2
1817; CHECK-NEXT:    [[TMP62:%.*]] = extractelement <4 x i32> [[Y]], i64 2
1818; CHECK-NEXT:    [[TMP63:%.*]] = uitofp i32 [[TMP62]] to float
1819; CHECK-NEXT:    [[TMP64:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP63]])
1820; CHECK-NEXT:    [[TMP65:%.*]] = fmul fast float [[TMP64]], 0x41EFFFFFC0000000
1821; CHECK-NEXT:    [[TMP66:%.*]] = fptoui float [[TMP65]] to i32
1822; CHECK-NEXT:    [[TMP67:%.*]] = sub i32 0, [[TMP62]]
1823; CHECK-NEXT:    [[TMP68:%.*]] = mul i32 [[TMP67]], [[TMP66]]
1824; CHECK-NEXT:    [[TMP69:%.*]] = zext i32 [[TMP66]] to i64
1825; CHECK-NEXT:    [[TMP70:%.*]] = zext i32 [[TMP68]] to i64
1826; CHECK-NEXT:    [[TMP71:%.*]] = mul i64 [[TMP69]], [[TMP70]]
1827; CHECK-NEXT:    [[TMP72:%.*]] = trunc i64 [[TMP71]] to i32
1828; CHECK-NEXT:    [[TMP73:%.*]] = lshr i64 [[TMP71]], 32
1829; CHECK-NEXT:    [[TMP74:%.*]] = trunc i64 [[TMP73]] to i32
1830; CHECK-NEXT:    [[TMP75:%.*]] = add i32 [[TMP66]], [[TMP74]]
1831; CHECK-NEXT:    [[TMP76:%.*]] = zext i32 [[TMP61]] to i64
1832; CHECK-NEXT:    [[TMP77:%.*]] = zext i32 [[TMP75]] to i64
1833; CHECK-NEXT:    [[TMP78:%.*]] = mul i64 [[TMP76]], [[TMP77]]
1834; CHECK-NEXT:    [[TMP79:%.*]] = trunc i64 [[TMP78]] to i32
1835; CHECK-NEXT:    [[TMP80:%.*]] = lshr i64 [[TMP78]], 32
1836; CHECK-NEXT:    [[TMP81:%.*]] = trunc i64 [[TMP80]] to i32
1837; CHECK-NEXT:    [[TMP82:%.*]] = mul i32 [[TMP81]], [[TMP62]]
1838; CHECK-NEXT:    [[TMP83:%.*]] = sub i32 [[TMP61]], [[TMP82]]
1839; CHECK-NEXT:    [[TMP84:%.*]] = icmp uge i32 [[TMP83]], [[TMP62]]
1840; CHECK-NEXT:    [[TMP85:%.*]] = sub i32 [[TMP83]], [[TMP62]]
1841; CHECK-NEXT:    [[TMP86:%.*]] = select i1 [[TMP84]], i32 [[TMP85]], i32 [[TMP83]]
1842; CHECK-NEXT:    [[TMP87:%.*]] = icmp uge i32 [[TMP86]], [[TMP62]]
1843; CHECK-NEXT:    [[TMP88:%.*]] = sub i32 [[TMP86]], [[TMP62]]
1844; CHECK-NEXT:    [[TMP89:%.*]] = select i1 [[TMP87]], i32 [[TMP88]], i32 [[TMP86]]
1845; CHECK-NEXT:    [[TMP90:%.*]] = insertelement <4 x i32> [[TMP60]], i32 [[TMP89]], i64 2
1846; CHECK-NEXT:    [[TMP91:%.*]] = extractelement <4 x i32> [[X]], i64 3
1847; CHECK-NEXT:    [[TMP92:%.*]] = extractelement <4 x i32> [[Y]], i64 3
1848; CHECK-NEXT:    [[TMP93:%.*]] = uitofp i32 [[TMP92]] to float
1849; CHECK-NEXT:    [[TMP94:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP93]])
1850; CHECK-NEXT:    [[TMP95:%.*]] = fmul fast float [[TMP94]], 0x41EFFFFFC0000000
1851; CHECK-NEXT:    [[TMP96:%.*]] = fptoui float [[TMP95]] to i32
1852; CHECK-NEXT:    [[TMP97:%.*]] = sub i32 0, [[TMP92]]
1853; CHECK-NEXT:    [[TMP98:%.*]] = mul i32 [[TMP97]], [[TMP96]]
1854; CHECK-NEXT:    [[TMP99:%.*]] = zext i32 [[TMP96]] to i64
1855; CHECK-NEXT:    [[TMP100:%.*]] = zext i32 [[TMP98]] to i64
1856; CHECK-NEXT:    [[TMP101:%.*]] = mul i64 [[TMP99]], [[TMP100]]
1857; CHECK-NEXT:    [[TMP102:%.*]] = trunc i64 [[TMP101]] to i32
1858; CHECK-NEXT:    [[TMP103:%.*]] = lshr i64 [[TMP101]], 32
1859; CHECK-NEXT:    [[TMP104:%.*]] = trunc i64 [[TMP103]] to i32
1860; CHECK-NEXT:    [[TMP105:%.*]] = add i32 [[TMP96]], [[TMP104]]
1861; CHECK-NEXT:    [[TMP106:%.*]] = zext i32 [[TMP91]] to i64
1862; CHECK-NEXT:    [[TMP107:%.*]] = zext i32 [[TMP105]] to i64
1863; CHECK-NEXT:    [[TMP108:%.*]] = mul i64 [[TMP106]], [[TMP107]]
1864; CHECK-NEXT:    [[TMP109:%.*]] = trunc i64 [[TMP108]] to i32
1865; CHECK-NEXT:    [[TMP110:%.*]] = lshr i64 [[TMP108]], 32
1866; CHECK-NEXT:    [[TMP111:%.*]] = trunc i64 [[TMP110]] to i32
1867; CHECK-NEXT:    [[TMP112:%.*]] = mul i32 [[TMP111]], [[TMP92]]
1868; CHECK-NEXT:    [[TMP113:%.*]] = sub i32 [[TMP91]], [[TMP112]]
1869; CHECK-NEXT:    [[TMP114:%.*]] = icmp uge i32 [[TMP113]], [[TMP92]]
1870; CHECK-NEXT:    [[TMP115:%.*]] = sub i32 [[TMP113]], [[TMP92]]
1871; CHECK-NEXT:    [[TMP116:%.*]] = select i1 [[TMP114]], i32 [[TMP115]], i32 [[TMP113]]
1872; CHECK-NEXT:    [[TMP117:%.*]] = icmp uge i32 [[TMP116]], [[TMP92]]
1873; CHECK-NEXT:    [[TMP118:%.*]] = sub i32 [[TMP116]], [[TMP92]]
1874; CHECK-NEXT:    [[TMP119:%.*]] = select i1 [[TMP117]], i32 [[TMP118]], i32 [[TMP116]]
1875; CHECK-NEXT:    [[TMP120:%.*]] = insertelement <4 x i32> [[TMP90]], i32 [[TMP119]], i64 3
1876; CHECK-NEXT:    store <4 x i32> [[TMP120]], <4 x i32> addrspace(1)* [[OUT:%.*]], align 16
1877; CHECK-NEXT:    ret void
1878;
1879; GFX6-LABEL: urem_v4i32:
1880; GFX6:       ; %bb.0:
1881; GFX6-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0xd
1882; GFX6-NEXT:    s_mov_b32 s13, 0x4f7ffffe
1883; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1884; GFX6-NEXT:    s_mov_b32 s3, 0xf000
1885; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1886; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s8
1887; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s9
1888; GFX6-NEXT:    s_sub_i32 s2, 0, s8
1889; GFX6-NEXT:    s_sub_i32 s12, 0, s9
1890; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
1891; GFX6-NEXT:    v_rcp_iflag_f32_e32 v1, v1
1892; GFX6-NEXT:    v_cvt_f32_u32_e32 v3, s10
1893; GFX6-NEXT:    v_cvt_f32_u32_e32 v5, s11
1894; GFX6-NEXT:    v_mul_f32_e32 v0, s13, v0
1895; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
1896; GFX6-NEXT:    v_mul_f32_e32 v1, s13, v1
1897; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
1898; GFX6-NEXT:    v_rcp_iflag_f32_e32 v3, v3
1899; GFX6-NEXT:    v_mul_lo_u32 v2, s2, v0
1900; GFX6-NEXT:    s_mov_b32 s2, -1
1901; GFX6-NEXT:    v_mul_lo_u32 v4, s12, v1
1902; GFX6-NEXT:    v_mul_hi_u32 v2, v0, v2
1903; GFX6-NEXT:    v_mul_hi_u32 v4, v1, v4
1904; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
1905; GFX6-NEXT:    v_mul_hi_u32 v0, s4, v0
1906; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v4, v1
1907; GFX6-NEXT:    v_mul_hi_u32 v1, s5, v1
1908; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s8
1909; GFX6-NEXT:    v_mul_f32_e32 v2, s13, v3
1910; GFX6-NEXT:    v_cvt_u32_f32_e32 v2, v2
1911; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s9
1912; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
1913; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s8, v0
1914; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s8, v0
1915; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
1916; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s8, v0
1917; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s8, v0
1918; GFX6-NEXT:    s_sub_i32 s4, 0, s10
1919; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
1920; GFX6-NEXT:    v_mul_lo_u32 v3, s4, v2
1921; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s5, v1
1922; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s9, v1
1923; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s9, v1
1924; GFX6-NEXT:    v_mul_hi_u32 v3, v2, v3
1925; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
1926; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v5
1927; GFX6-NEXT:    s_sub_i32 s4, 0, s11
1928; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
1929; GFX6-NEXT:    v_mul_f32_e32 v3, s13, v4
1930; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v3
1931; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s9, v1
1932; GFX6-NEXT:    v_mul_hi_u32 v2, s6, v2
1933; GFX6-NEXT:    v_mul_lo_u32 v5, s4, v3
1934; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s9, v1
1935; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
1936; GFX6-NEXT:    v_mul_lo_u32 v2, v2, s10
1937; GFX6-NEXT:    v_mul_hi_u32 v4, v3, v5
1938; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s6, v2
1939; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
1940; GFX6-NEXT:    v_mul_hi_u32 v3, s7, v3
1941; GFX6-NEXT:    v_subrev_i32_e32 v5, vcc, s10, v2
1942; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s10, v2
1943; GFX6-NEXT:    v_mul_lo_u32 v3, v3, s11
1944; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
1945; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s10, v2
1946; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s10, v2
1947; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
1948; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s7, v3
1949; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s11, v3
1950; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s11, v3
1951; GFX6-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
1952; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s11, v3
1953; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s11, v3
1954; GFX6-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
1955; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
1956; GFX6-NEXT:    s_endpgm
1957;
1958; GFX9-LABEL: urem_v4i32:
1959; GFX9:       ; %bb.0:
1960; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x34
1961; GFX9-NEXT:    s_mov_b32 s12, 0x4f7ffffe
1962; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1963; GFX9-NEXT:    v_mov_b32_e32 v4, 0
1964; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1965; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s8
1966; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s9
1967; GFX9-NEXT:    s_sub_i32 s2, 0, s8
1968; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, s10
1969; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
1970; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v1
1971; GFX9-NEXT:    s_sub_i32 s3, 0, s9
1972; GFX9-NEXT:    v_rcp_iflag_f32_e32 v5, v5
1973; GFX9-NEXT:    v_mul_f32_e32 v0, s12, v0
1974; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
1975; GFX9-NEXT:    v_mul_f32_e32 v1, s12, v1
1976; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
1977; GFX9-NEXT:    v_cvt_f32_u32_e32 v6, s11
1978; GFX9-NEXT:    v_mul_lo_u32 v2, s2, v0
1979; GFX9-NEXT:    s_sub_i32 s2, 0, s10
1980; GFX9-NEXT:    v_mul_lo_u32 v3, s3, v1
1981; GFX9-NEXT:    v_mul_hi_u32 v2, v0, v2
1982; GFX9-NEXT:    v_mul_hi_u32 v3, v1, v3
1983; GFX9-NEXT:    v_add_u32_e32 v0, v0, v2
1984; GFX9-NEXT:    v_mul_f32_e32 v2, s12, v5
1985; GFX9-NEXT:    v_cvt_u32_f32_e32 v2, v2
1986; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
1987; GFX9-NEXT:    v_rcp_iflag_f32_e32 v3, v6
1988; GFX9-NEXT:    v_mul_hi_u32 v0, s4, v0
1989; GFX9-NEXT:    v_mul_lo_u32 v5, s2, v2
1990; GFX9-NEXT:    s_sub_i32 s2, 0, s11
1991; GFX9-NEXT:    v_mul_f32_e32 v3, s12, v3
1992; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v3
1993; GFX9-NEXT:    v_mul_hi_u32 v5, v2, v5
1994; GFX9-NEXT:    v_mul_hi_u32 v1, s5, v1
1995; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s8
1996; GFX9-NEXT:    v_add_u32_e32 v2, v2, v5
1997; GFX9-NEXT:    v_mul_lo_u32 v5, s2, v3
1998; GFX9-NEXT:    v_mul_hi_u32 v2, s6, v2
1999; GFX9-NEXT:    v_mul_lo_u32 v1, v1, s9
2000; GFX9-NEXT:    v_sub_u32_e32 v0, s4, v0
2001; GFX9-NEXT:    v_mul_hi_u32 v5, v3, v5
2002; GFX9-NEXT:    v_subrev_u32_e32 v6, s8, v0
2003; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s8, v0
2004; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc
2005; GFX9-NEXT:    v_add_u32_e32 v3, v3, v5
2006; GFX9-NEXT:    v_mul_hi_u32 v3, s7, v3
2007; GFX9-NEXT:    v_mul_lo_u32 v2, v2, s10
2008; GFX9-NEXT:    v_sub_u32_e32 v1, s5, v1
2009; GFX9-NEXT:    v_subrev_u32_e32 v6, s8, v0
2010; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s8, v0
2011; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc
2012; GFX9-NEXT:    v_subrev_u32_e32 v6, s9, v1
2013; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s9, v1
2014; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
2015; GFX9-NEXT:    v_mul_lo_u32 v3, v3, s11
2016; GFX9-NEXT:    v_subrev_u32_e32 v6, s9, v1
2017; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s9, v1
2018; GFX9-NEXT:    v_sub_u32_e32 v2, s6, v2
2019; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
2020; GFX9-NEXT:    v_subrev_u32_e32 v5, s10, v2
2021; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s10, v2
2022; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
2023; GFX9-NEXT:    v_subrev_u32_e32 v5, s10, v2
2024; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s10, v2
2025; GFX9-NEXT:    v_sub_u32_e32 v3, s7, v3
2026; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
2027; GFX9-NEXT:    v_subrev_u32_e32 v5, s11, v3
2028; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s11, v3
2029; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
2030; GFX9-NEXT:    v_subrev_u32_e32 v5, s11, v3
2031; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s11, v3
2032; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
2033; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
2034; GFX9-NEXT:    s_endpgm
2035;
2036; GFX90A-LABEL: urem_v4i32:
2037; GFX90A:       ; %bb.0:
2038; GFX90A-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x34
2039; GFX90A-NEXT:    s_mov_b32 s12, 0x4f7ffffe
2040; GFX90A-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2041; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
2042; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
2043; GFX90A-NEXT:    v_cvt_f32_u32_e32 v0, s8
2044; GFX90A-NEXT:    s_sub_i32 s2, 0, s8
2045; GFX90A-NEXT:    v_cvt_f32_u32_e32 v1, s9
2046; GFX90A-NEXT:    s_sub_i32 s3, 0, s9
2047; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v0, v0
2048; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v1, v1
2049; GFX90A-NEXT:    v_mul_f32_e32 v0, s12, v0
2050; GFX90A-NEXT:    v_cvt_u32_f32_e32 v0, v0
2051; GFX90A-NEXT:    v_mul_f32_e32 v1, s12, v1
2052; GFX90A-NEXT:    v_cvt_u32_f32_e32 v1, v1
2053; GFX90A-NEXT:    v_mul_lo_u32 v2, s2, v0
2054; GFX90A-NEXT:    v_mul_hi_u32 v2, v0, v2
2055; GFX90A-NEXT:    v_add_u32_e32 v0, v0, v2
2056; GFX90A-NEXT:    v_mul_hi_u32 v0, s4, v0
2057; GFX90A-NEXT:    v_mul_lo_u32 v0, v0, s8
2058; GFX90A-NEXT:    v_sub_u32_e32 v0, s4, v0
2059; GFX90A-NEXT:    v_subrev_u32_e32 v2, s8, v0
2060; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s8, v0
2061; GFX90A-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
2062; GFX90A-NEXT:    v_subrev_u32_e32 v2, s8, v0
2063; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s8, v0
2064; GFX90A-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
2065; GFX90A-NEXT:    v_cvt_f32_u32_e32 v2, s10
2066; GFX90A-NEXT:    v_mul_lo_u32 v3, s3, v1
2067; GFX90A-NEXT:    v_mul_hi_u32 v3, v1, v3
2068; GFX90A-NEXT:    v_add_u32_e32 v1, v1, v3
2069; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v2, v2
2070; GFX90A-NEXT:    v_mul_hi_u32 v1, s5, v1
2071; GFX90A-NEXT:    v_mul_lo_u32 v1, v1, s9
2072; GFX90A-NEXT:    v_sub_u32_e32 v1, s5, v1
2073; GFX90A-NEXT:    v_mul_f32_e32 v2, s12, v2
2074; GFX90A-NEXT:    v_cvt_u32_f32_e32 v2, v2
2075; GFX90A-NEXT:    v_subrev_u32_e32 v3, s9, v1
2076; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s9, v1
2077; GFX90A-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
2078; GFX90A-NEXT:    v_subrev_u32_e32 v3, s9, v1
2079; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s9, v1
2080; GFX90A-NEXT:    s_sub_i32 s2, 0, s10
2081; GFX90A-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
2082; GFX90A-NEXT:    v_mul_lo_u32 v3, s2, v2
2083; GFX90A-NEXT:    v_mul_hi_u32 v3, v2, v3
2084; GFX90A-NEXT:    v_add_u32_e32 v2, v2, v3
2085; GFX90A-NEXT:    v_cvt_f32_u32_e32 v3, s11
2086; GFX90A-NEXT:    v_mul_hi_u32 v2, s6, v2
2087; GFX90A-NEXT:    v_mul_lo_u32 v2, v2, s10
2088; GFX90A-NEXT:    v_sub_u32_e32 v2, s6, v2
2089; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v3, v3
2090; GFX90A-NEXT:    v_subrev_u32_e32 v5, s10, v2
2091; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s10, v2
2092; GFX90A-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
2093; GFX90A-NEXT:    v_mul_f32_e32 v3, s12, v3
2094; GFX90A-NEXT:    v_cvt_u32_f32_e32 v3, v3
2095; GFX90A-NEXT:    v_subrev_u32_e32 v5, s10, v2
2096; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s10, v2
2097; GFX90A-NEXT:    s_sub_i32 s2, 0, s11
2098; GFX90A-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
2099; GFX90A-NEXT:    v_mul_lo_u32 v5, s2, v3
2100; GFX90A-NEXT:    v_mul_hi_u32 v5, v3, v5
2101; GFX90A-NEXT:    v_add_u32_e32 v3, v3, v5
2102; GFX90A-NEXT:    v_mul_hi_u32 v3, s7, v3
2103; GFX90A-NEXT:    v_mul_lo_u32 v3, v3, s11
2104; GFX90A-NEXT:    v_sub_u32_e32 v3, s7, v3
2105; GFX90A-NEXT:    v_subrev_u32_e32 v5, s11, v3
2106; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s11, v3
2107; GFX90A-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
2108; GFX90A-NEXT:    v_subrev_u32_e32 v5, s11, v3
2109; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s11, v3
2110; GFX90A-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
2111; GFX90A-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
2112; GFX90A-NEXT:    s_endpgm
2113  %r = urem <4 x i32> %x, %y
2114  store <4 x i32> %r, <4 x i32> addrspace(1)* %out
2115  ret void
2116}
2117
2118define amdgpu_kernel void @sdiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x, <4 x i32> %y) {
2119; CHECK-LABEL: @sdiv_v4i32(
2120; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i32> [[X:%.*]], i64 0
2121; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i32> [[Y:%.*]], i64 0
2122; CHECK-NEXT:    [[TMP3:%.*]] = ashr i32 [[TMP1]], 31
2123; CHECK-NEXT:    [[TMP4:%.*]] = ashr i32 [[TMP2]], 31
2124; CHECK-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]]
2125; CHECK-NEXT:    [[TMP6:%.*]] = add i32 [[TMP1]], [[TMP3]]
2126; CHECK-NEXT:    [[TMP7:%.*]] = add i32 [[TMP2]], [[TMP4]]
2127; CHECK-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP3]]
2128; CHECK-NEXT:    [[TMP9:%.*]] = xor i32 [[TMP7]], [[TMP4]]
2129; CHECK-NEXT:    [[TMP10:%.*]] = uitofp i32 [[TMP9]] to float
2130; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP10]])
2131; CHECK-NEXT:    [[TMP12:%.*]] = fmul fast float [[TMP11]], 0x41EFFFFFC0000000
2132; CHECK-NEXT:    [[TMP13:%.*]] = fptoui float [[TMP12]] to i32
2133; CHECK-NEXT:    [[TMP14:%.*]] = sub i32 0, [[TMP9]]
2134; CHECK-NEXT:    [[TMP15:%.*]] = mul i32 [[TMP14]], [[TMP13]]
2135; CHECK-NEXT:    [[TMP16:%.*]] = zext i32 [[TMP13]] to i64
2136; CHECK-NEXT:    [[TMP17:%.*]] = zext i32 [[TMP15]] to i64
2137; CHECK-NEXT:    [[TMP18:%.*]] = mul i64 [[TMP16]], [[TMP17]]
2138; CHECK-NEXT:    [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
2139; CHECK-NEXT:    [[TMP20:%.*]] = lshr i64 [[TMP18]], 32
2140; CHECK-NEXT:    [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32
2141; CHECK-NEXT:    [[TMP22:%.*]] = add i32 [[TMP13]], [[TMP21]]
2142; CHECK-NEXT:    [[TMP23:%.*]] = zext i32 [[TMP8]] to i64
2143; CHECK-NEXT:    [[TMP24:%.*]] = zext i32 [[TMP22]] to i64
2144; CHECK-NEXT:    [[TMP25:%.*]] = mul i64 [[TMP23]], [[TMP24]]
2145; CHECK-NEXT:    [[TMP26:%.*]] = trunc i64 [[TMP25]] to i32
2146; CHECK-NEXT:    [[TMP27:%.*]] = lshr i64 [[TMP25]], 32
2147; CHECK-NEXT:    [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32
2148; CHECK-NEXT:    [[TMP29:%.*]] = mul i32 [[TMP28]], [[TMP9]]
2149; CHECK-NEXT:    [[TMP30:%.*]] = sub i32 [[TMP8]], [[TMP29]]
2150; CHECK-NEXT:    [[TMP31:%.*]] = icmp uge i32 [[TMP30]], [[TMP9]]
2151; CHECK-NEXT:    [[TMP32:%.*]] = add i32 [[TMP28]], 1
2152; CHECK-NEXT:    [[TMP33:%.*]] = select i1 [[TMP31]], i32 [[TMP32]], i32 [[TMP28]]
2153; CHECK-NEXT:    [[TMP34:%.*]] = sub i32 [[TMP30]], [[TMP9]]
2154; CHECK-NEXT:    [[TMP35:%.*]] = select i1 [[TMP31]], i32 [[TMP34]], i32 [[TMP30]]
2155; CHECK-NEXT:    [[TMP36:%.*]] = icmp uge i32 [[TMP35]], [[TMP9]]
2156; CHECK-NEXT:    [[TMP37:%.*]] = add i32 [[TMP33]], 1
2157; CHECK-NEXT:    [[TMP38:%.*]] = select i1 [[TMP36]], i32 [[TMP37]], i32 [[TMP33]]
2158; CHECK-NEXT:    [[TMP39:%.*]] = xor i32 [[TMP38]], [[TMP5]]
2159; CHECK-NEXT:    [[TMP40:%.*]] = sub i32 [[TMP39]], [[TMP5]]
2160; CHECK-NEXT:    [[TMP41:%.*]] = insertelement <4 x i32> undef, i32 [[TMP40]], i64 0
2161; CHECK-NEXT:    [[TMP42:%.*]] = extractelement <4 x i32> [[X]], i64 1
2162; CHECK-NEXT:    [[TMP43:%.*]] = extractelement <4 x i32> [[Y]], i64 1
2163; CHECK-NEXT:    [[TMP44:%.*]] = ashr i32 [[TMP42]], 31
2164; CHECK-NEXT:    [[TMP45:%.*]] = ashr i32 [[TMP43]], 31
2165; CHECK-NEXT:    [[TMP46:%.*]] = xor i32 [[TMP44]], [[TMP45]]
2166; CHECK-NEXT:    [[TMP47:%.*]] = add i32 [[TMP42]], [[TMP44]]
2167; CHECK-NEXT:    [[TMP48:%.*]] = add i32 [[TMP43]], [[TMP45]]
2168; CHECK-NEXT:    [[TMP49:%.*]] = xor i32 [[TMP47]], [[TMP44]]
2169; CHECK-NEXT:    [[TMP50:%.*]] = xor i32 [[TMP48]], [[TMP45]]
2170; CHECK-NEXT:    [[TMP51:%.*]] = uitofp i32 [[TMP50]] to float
2171; CHECK-NEXT:    [[TMP52:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP51]])
2172; CHECK-NEXT:    [[TMP53:%.*]] = fmul fast float [[TMP52]], 0x41EFFFFFC0000000
2173; CHECK-NEXT:    [[TMP54:%.*]] = fptoui float [[TMP53]] to i32
2174; CHECK-NEXT:    [[TMP55:%.*]] = sub i32 0, [[TMP50]]
2175; CHECK-NEXT:    [[TMP56:%.*]] = mul i32 [[TMP55]], [[TMP54]]
2176; CHECK-NEXT:    [[TMP57:%.*]] = zext i32 [[TMP54]] to i64
2177; CHECK-NEXT:    [[TMP58:%.*]] = zext i32 [[TMP56]] to i64
2178; CHECK-NEXT:    [[TMP59:%.*]] = mul i64 [[TMP57]], [[TMP58]]
2179; CHECK-NEXT:    [[TMP60:%.*]] = trunc i64 [[TMP59]] to i32
2180; CHECK-NEXT:    [[TMP61:%.*]] = lshr i64 [[TMP59]], 32
2181; CHECK-NEXT:    [[TMP62:%.*]] = trunc i64 [[TMP61]] to i32
2182; CHECK-NEXT:    [[TMP63:%.*]] = add i32 [[TMP54]], [[TMP62]]
2183; CHECK-NEXT:    [[TMP64:%.*]] = zext i32 [[TMP49]] to i64
2184; CHECK-NEXT:    [[TMP65:%.*]] = zext i32 [[TMP63]] to i64
2185; CHECK-NEXT:    [[TMP66:%.*]] = mul i64 [[TMP64]], [[TMP65]]
2186; CHECK-NEXT:    [[TMP67:%.*]] = trunc i64 [[TMP66]] to i32
2187; CHECK-NEXT:    [[TMP68:%.*]] = lshr i64 [[TMP66]], 32
2188; CHECK-NEXT:    [[TMP69:%.*]] = trunc i64 [[TMP68]] to i32
2189; CHECK-NEXT:    [[TMP70:%.*]] = mul i32 [[TMP69]], [[TMP50]]
2190; CHECK-NEXT:    [[TMP71:%.*]] = sub i32 [[TMP49]], [[TMP70]]
2191; CHECK-NEXT:    [[TMP72:%.*]] = icmp uge i32 [[TMP71]], [[TMP50]]
2192; CHECK-NEXT:    [[TMP73:%.*]] = add i32 [[TMP69]], 1
2193; CHECK-NEXT:    [[TMP74:%.*]] = select i1 [[TMP72]], i32 [[TMP73]], i32 [[TMP69]]
2194; CHECK-NEXT:    [[TMP75:%.*]] = sub i32 [[TMP71]], [[TMP50]]
2195; CHECK-NEXT:    [[TMP76:%.*]] = select i1 [[TMP72]], i32 [[TMP75]], i32 [[TMP71]]
2196; CHECK-NEXT:    [[TMP77:%.*]] = icmp uge i32 [[TMP76]], [[TMP50]]
2197; CHECK-NEXT:    [[TMP78:%.*]] = add i32 [[TMP74]], 1
2198; CHECK-NEXT:    [[TMP79:%.*]] = select i1 [[TMP77]], i32 [[TMP78]], i32 [[TMP74]]
2199; CHECK-NEXT:    [[TMP80:%.*]] = xor i32 [[TMP79]], [[TMP46]]
2200; CHECK-NEXT:    [[TMP81:%.*]] = sub i32 [[TMP80]], [[TMP46]]
2201; CHECK-NEXT:    [[TMP82:%.*]] = insertelement <4 x i32> [[TMP41]], i32 [[TMP81]], i64 1
2202; CHECK-NEXT:    [[TMP83:%.*]] = extractelement <4 x i32> [[X]], i64 2
2203; CHECK-NEXT:    [[TMP84:%.*]] = extractelement <4 x i32> [[Y]], i64 2
2204; CHECK-NEXT:    [[TMP85:%.*]] = ashr i32 [[TMP83]], 31
2205; CHECK-NEXT:    [[TMP86:%.*]] = ashr i32 [[TMP84]], 31
2206; CHECK-NEXT:    [[TMP87:%.*]] = xor i32 [[TMP85]], [[TMP86]]
2207; CHECK-NEXT:    [[TMP88:%.*]] = add i32 [[TMP83]], [[TMP85]]
2208; CHECK-NEXT:    [[TMP89:%.*]] = add i32 [[TMP84]], [[TMP86]]
2209; CHECK-NEXT:    [[TMP90:%.*]] = xor i32 [[TMP88]], [[TMP85]]
2210; CHECK-NEXT:    [[TMP91:%.*]] = xor i32 [[TMP89]], [[TMP86]]
2211; CHECK-NEXT:    [[TMP92:%.*]] = uitofp i32 [[TMP91]] to float
2212; CHECK-NEXT:    [[TMP93:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP92]])
2213; CHECK-NEXT:    [[TMP94:%.*]] = fmul fast float [[TMP93]], 0x41EFFFFFC0000000
2214; CHECK-NEXT:    [[TMP95:%.*]] = fptoui float [[TMP94]] to i32
2215; CHECK-NEXT:    [[TMP96:%.*]] = sub i32 0, [[TMP91]]
2216; CHECK-NEXT:    [[TMP97:%.*]] = mul i32 [[TMP96]], [[TMP95]]
2217; CHECK-NEXT:    [[TMP98:%.*]] = zext i32 [[TMP95]] to i64
2218; CHECK-NEXT:    [[TMP99:%.*]] = zext i32 [[TMP97]] to i64
2219; CHECK-NEXT:    [[TMP100:%.*]] = mul i64 [[TMP98]], [[TMP99]]
2220; CHECK-NEXT:    [[TMP101:%.*]] = trunc i64 [[TMP100]] to i32
2221; CHECK-NEXT:    [[TMP102:%.*]] = lshr i64 [[TMP100]], 32
2222; CHECK-NEXT:    [[TMP103:%.*]] = trunc i64 [[TMP102]] to i32
2223; CHECK-NEXT:    [[TMP104:%.*]] = add i32 [[TMP95]], [[TMP103]]
2224; CHECK-NEXT:    [[TMP105:%.*]] = zext i32 [[TMP90]] to i64
2225; CHECK-NEXT:    [[TMP106:%.*]] = zext i32 [[TMP104]] to i64
2226; CHECK-NEXT:    [[TMP107:%.*]] = mul i64 [[TMP105]], [[TMP106]]
2227; CHECK-NEXT:    [[TMP108:%.*]] = trunc i64 [[TMP107]] to i32
2228; CHECK-NEXT:    [[TMP109:%.*]] = lshr i64 [[TMP107]], 32
2229; CHECK-NEXT:    [[TMP110:%.*]] = trunc i64 [[TMP109]] to i32
2230; CHECK-NEXT:    [[TMP111:%.*]] = mul i32 [[TMP110]], [[TMP91]]
2231; CHECK-NEXT:    [[TMP112:%.*]] = sub i32 [[TMP90]], [[TMP111]]
2232; CHECK-NEXT:    [[TMP113:%.*]] = icmp uge i32 [[TMP112]], [[TMP91]]
2233; CHECK-NEXT:    [[TMP114:%.*]] = add i32 [[TMP110]], 1
2234; CHECK-NEXT:    [[TMP115:%.*]] = select i1 [[TMP113]], i32 [[TMP114]], i32 [[TMP110]]
2235; CHECK-NEXT:    [[TMP116:%.*]] = sub i32 [[TMP112]], [[TMP91]]
2236; CHECK-NEXT:    [[TMP117:%.*]] = select i1 [[TMP113]], i32 [[TMP116]], i32 [[TMP112]]
2237; CHECK-NEXT:    [[TMP118:%.*]] = icmp uge i32 [[TMP117]], [[TMP91]]
2238; CHECK-NEXT:    [[TMP119:%.*]] = add i32 [[TMP115]], 1
2239; CHECK-NEXT:    [[TMP120:%.*]] = select i1 [[TMP118]], i32 [[TMP119]], i32 [[TMP115]]
2240; CHECK-NEXT:    [[TMP121:%.*]] = xor i32 [[TMP120]], [[TMP87]]
2241; CHECK-NEXT:    [[TMP122:%.*]] = sub i32 [[TMP121]], [[TMP87]]
2242; CHECK-NEXT:    [[TMP123:%.*]] = insertelement <4 x i32> [[TMP82]], i32 [[TMP122]], i64 2
2243; CHECK-NEXT:    [[TMP124:%.*]] = extractelement <4 x i32> [[X]], i64 3
2244; CHECK-NEXT:    [[TMP125:%.*]] = extractelement <4 x i32> [[Y]], i64 3
2245; CHECK-NEXT:    [[TMP126:%.*]] = ashr i32 [[TMP124]], 31
2246; CHECK-NEXT:    [[TMP127:%.*]] = ashr i32 [[TMP125]], 31
2247; CHECK-NEXT:    [[TMP128:%.*]] = xor i32 [[TMP126]], [[TMP127]]
2248; CHECK-NEXT:    [[TMP129:%.*]] = add i32 [[TMP124]], [[TMP126]]
2249; CHECK-NEXT:    [[TMP130:%.*]] = add i32 [[TMP125]], [[TMP127]]
2250; CHECK-NEXT:    [[TMP131:%.*]] = xor i32 [[TMP129]], [[TMP126]]
2251; CHECK-NEXT:    [[TMP132:%.*]] = xor i32 [[TMP130]], [[TMP127]]
2252; CHECK-NEXT:    [[TMP133:%.*]] = uitofp i32 [[TMP132]] to float
2253; CHECK-NEXT:    [[TMP134:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP133]])
2254; CHECK-NEXT:    [[TMP135:%.*]] = fmul fast float [[TMP134]], 0x41EFFFFFC0000000
2255; CHECK-NEXT:    [[TMP136:%.*]] = fptoui float [[TMP135]] to i32
2256; CHECK-NEXT:    [[TMP137:%.*]] = sub i32 0, [[TMP132]]
2257; CHECK-NEXT:    [[TMP138:%.*]] = mul i32 [[TMP137]], [[TMP136]]
2258; CHECK-NEXT:    [[TMP139:%.*]] = zext i32 [[TMP136]] to i64
2259; CHECK-NEXT:    [[TMP140:%.*]] = zext i32 [[TMP138]] to i64
2260; CHECK-NEXT:    [[TMP141:%.*]] = mul i64 [[TMP139]], [[TMP140]]
2261; CHECK-NEXT:    [[TMP142:%.*]] = trunc i64 [[TMP141]] to i32
2262; CHECK-NEXT:    [[TMP143:%.*]] = lshr i64 [[TMP141]], 32
2263; CHECK-NEXT:    [[TMP144:%.*]] = trunc i64 [[TMP143]] to i32
2264; CHECK-NEXT:    [[TMP145:%.*]] = add i32 [[TMP136]], [[TMP144]]
2265; CHECK-NEXT:    [[TMP146:%.*]] = zext i32 [[TMP131]] to i64
2266; CHECK-NEXT:    [[TMP147:%.*]] = zext i32 [[TMP145]] to i64
2267; CHECK-NEXT:    [[TMP148:%.*]] = mul i64 [[TMP146]], [[TMP147]]
2268; CHECK-NEXT:    [[TMP149:%.*]] = trunc i64 [[TMP148]] to i32
2269; CHECK-NEXT:    [[TMP150:%.*]] = lshr i64 [[TMP148]], 32
2270; CHECK-NEXT:    [[TMP151:%.*]] = trunc i64 [[TMP150]] to i32
2271; CHECK-NEXT:    [[TMP152:%.*]] = mul i32 [[TMP151]], [[TMP132]]
2272; CHECK-NEXT:    [[TMP153:%.*]] = sub i32 [[TMP131]], [[TMP152]]
2273; CHECK-NEXT:    [[TMP154:%.*]] = icmp uge i32 [[TMP153]], [[TMP132]]
2274; CHECK-NEXT:    [[TMP155:%.*]] = add i32 [[TMP151]], 1
2275; CHECK-NEXT:    [[TMP156:%.*]] = select i1 [[TMP154]], i32 [[TMP155]], i32 [[TMP151]]
2276; CHECK-NEXT:    [[TMP157:%.*]] = sub i32 [[TMP153]], [[TMP132]]
2277; CHECK-NEXT:    [[TMP158:%.*]] = select i1 [[TMP154]], i32 [[TMP157]], i32 [[TMP153]]
2278; CHECK-NEXT:    [[TMP159:%.*]] = icmp uge i32 [[TMP158]], [[TMP132]]
2279; CHECK-NEXT:    [[TMP160:%.*]] = add i32 [[TMP156]], 1
2280; CHECK-NEXT:    [[TMP161:%.*]] = select i1 [[TMP159]], i32 [[TMP160]], i32 [[TMP156]]
2281; CHECK-NEXT:    [[TMP162:%.*]] = xor i32 [[TMP161]], [[TMP128]]
2282; CHECK-NEXT:    [[TMP163:%.*]] = sub i32 [[TMP162]], [[TMP128]]
2283; CHECK-NEXT:    [[TMP164:%.*]] = insertelement <4 x i32> [[TMP123]], i32 [[TMP163]], i64 3
2284; CHECK-NEXT:    store <4 x i32> [[TMP164]], <4 x i32> addrspace(1)* [[OUT:%.*]], align 16
2285; CHECK-NEXT:    ret void
2286;
2287; GFX6-LABEL: sdiv_v4i32:
2288; GFX6:       ; %bb.0:
2289; GFX6-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0xd
2290; GFX6-NEXT:    s_mov_b32 s16, 0x4f7ffffe
2291; GFX6-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x9
2292; GFX6-NEXT:    s_mov_b32 s15, 0xf000
2293; GFX6-NEXT:    s_mov_b32 s14, -1
2294; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2295; GFX6-NEXT:    s_ashr_i32 s2, s8, 31
2296; GFX6-NEXT:    s_add_i32 s3, s8, s2
2297; GFX6-NEXT:    s_xor_b32 s3, s3, s2
2298; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s3
2299; GFX6-NEXT:    s_ashr_i32 s8, s9, 31
2300; GFX6-NEXT:    s_add_i32 s0, s9, s8
2301; GFX6-NEXT:    s_xor_b32 s9, s0, s8
2302; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
2303; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s9
2304; GFX6-NEXT:    s_sub_i32 s1, 0, s3
2305; GFX6-NEXT:    s_ashr_i32 s0, s4, 31
2306; GFX6-NEXT:    v_mul_f32_e32 v0, s16, v0
2307; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
2308; GFX6-NEXT:    v_rcp_iflag_f32_e32 v1, v1
2309; GFX6-NEXT:    s_xor_b32 s2, s0, s2
2310; GFX6-NEXT:    v_mul_lo_u32 v2, s1, v0
2311; GFX6-NEXT:    s_add_i32 s1, s4, s0
2312; GFX6-NEXT:    v_mul_f32_e32 v1, s16, v1
2313; GFX6-NEXT:    s_xor_b32 s1, s1, s0
2314; GFX6-NEXT:    v_mul_hi_u32 v2, v0, v2
2315; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
2316; GFX6-NEXT:    s_sub_i32 s0, 0, s9
2317; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
2318; GFX6-NEXT:    v_mul_hi_u32 v0, s1, v0
2319; GFX6-NEXT:    v_mul_lo_u32 v2, s0, v1
2320; GFX6-NEXT:    v_mul_lo_u32 v3, v0, s3
2321; GFX6-NEXT:    v_mul_hi_u32 v2, v1, v2
2322; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 1, v0
2323; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s1, v3
2324; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s3, v3
2325; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[0:1]
2326; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s3, v3
2327; GFX6-NEXT:    v_cndmask_b32_e64 v3, v3, v4, s[0:1]
2328; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 1, v0
2329; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
2330; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s3, v3
2331; GFX6-NEXT:    s_ashr_i32 s0, s5, 31
2332; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
2333; GFX6-NEXT:    s_add_i32 s1, s5, s0
2334; GFX6-NEXT:    v_xor_b32_e32 v0, s2, v0
2335; GFX6-NEXT:    s_ashr_i32 s3, s10, 31
2336; GFX6-NEXT:    s_xor_b32 s1, s1, s0
2337; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s2, v0
2338; GFX6-NEXT:    s_xor_b32 s2, s0, s8
2339; GFX6-NEXT:    s_add_i32 s0, s10, s3
2340; GFX6-NEXT:    s_xor_b32 s4, s0, s3
2341; GFX6-NEXT:    v_cvt_f32_u32_e32 v3, s4
2342; GFX6-NEXT:    v_mul_hi_u32 v1, s1, v1
2343; GFX6-NEXT:    v_rcp_iflag_f32_e32 v3, v3
2344; GFX6-NEXT:    v_mul_lo_u32 v2, v1, s9
2345; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 1, v1
2346; GFX6-NEXT:    v_mul_f32_e32 v3, s16, v3
2347; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s1, v2
2348; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v3
2349; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s9, v2
2350; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, v4, s[0:1]
2351; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s9, v2
2352; GFX6-NEXT:    v_cndmask_b32_e64 v2, v2, v4, s[0:1]
2353; GFX6-NEXT:    s_sub_i32 s0, 0, s4
2354; GFX6-NEXT:    v_mul_lo_u32 v5, s0, v3
2355; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 1, v1
2356; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s9, v2
2357; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
2358; GFX6-NEXT:    v_mul_hi_u32 v2, v3, v5
2359; GFX6-NEXT:    v_xor_b32_e32 v1, s2, v1
2360; GFX6-NEXT:    v_subrev_i32_e32 v1, vcc, s2, v1
2361; GFX6-NEXT:    s_ashr_i32 s2, s11, 31
2362; GFX6-NEXT:    s_ashr_i32 s0, s6, 31
2363; GFX6-NEXT:    s_add_i32 s5, s11, s2
2364; GFX6-NEXT:    s_add_i32 s1, s6, s0
2365; GFX6-NEXT:    s_xor_b32 s5, s5, s2
2366; GFX6-NEXT:    s_xor_b32 s1, s1, s0
2367; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
2368; GFX6-NEXT:    v_cvt_f32_u32_e32 v4, s5
2369; GFX6-NEXT:    v_mul_hi_u32 v2, s1, v2
2370; GFX6-NEXT:    s_xor_b32 s3, s0, s3
2371; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v4
2372; GFX6-NEXT:    v_mul_lo_u32 v3, v2, s4
2373; GFX6-NEXT:    v_add_i32_e32 v5, vcc, 1, v2
2374; GFX6-NEXT:    v_mul_f32_e32 v4, s16, v4
2375; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s1, v3
2376; GFX6-NEXT:    v_cvt_u32_f32_e32 v4, v4
2377; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s4, v3
2378; GFX6-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s[0:1]
2379; GFX6-NEXT:    v_subrev_i32_e32 v5, vcc, s4, v3
2380; GFX6-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s[0:1]
2381; GFX6-NEXT:    s_sub_i32 s0, 0, s5
2382; GFX6-NEXT:    v_mul_lo_u32 v5, s0, v4
2383; GFX6-NEXT:    s_ashr_i32 s0, s7, 31
2384; GFX6-NEXT:    s_add_i32 s1, s7, s0
2385; GFX6-NEXT:    s_xor_b32 s1, s1, s0
2386; GFX6-NEXT:    v_mul_hi_u32 v5, v4, v5
2387; GFX6-NEXT:    v_add_i32_e32 v6, vcc, 1, v2
2388; GFX6-NEXT:    s_xor_b32 s2, s0, s2
2389; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
2390; GFX6-NEXT:    v_mul_hi_u32 v4, s1, v4
2391; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s4, v3
2392; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
2393; GFX6-NEXT:    v_xor_b32_e32 v2, s3, v2
2394; GFX6-NEXT:    v_mul_lo_u32 v3, v4, s5
2395; GFX6-NEXT:    v_add_i32_e32 v5, vcc, 1, v4
2396; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s3, v2
2397; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s1, v3
2398; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s5, v3
2399; GFX6-NEXT:    v_cndmask_b32_e64 v4, v4, v5, s[0:1]
2400; GFX6-NEXT:    v_subrev_i32_e32 v5, vcc, s5, v3
2401; GFX6-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s[0:1]
2402; GFX6-NEXT:    v_add_i32_e32 v5, vcc, 1, v4
2403; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s5, v3
2404; GFX6-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
2405; GFX6-NEXT:    v_xor_b32_e32 v3, s2, v3
2406; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s2, v3
2407; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[12:15], 0
2408; GFX6-NEXT:    s_endpgm
2409;
2410; GFX9-LABEL: sdiv_v4i32:
2411; GFX9:       ; %bb.0:
2412; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x34
2413; GFX9-NEXT:    s_mov_b32 s15, 0x4f7ffffe
2414; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2415; GFX9-NEXT:    v_mov_b32_e32 v4, 0
2416; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2417; GFX9-NEXT:    s_ashr_i32 s2, s8, 31
2418; GFX9-NEXT:    s_add_i32 s3, s8, s2
2419; GFX9-NEXT:    s_xor_b32 s3, s3, s2
2420; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s3
2421; GFX9-NEXT:    s_ashr_i32 s12, s9, 31
2422; GFX9-NEXT:    s_add_i32 s9, s9, s12
2423; GFX9-NEXT:    s_xor_b32 s9, s9, s12
2424; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
2425; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s9
2426; GFX9-NEXT:    s_sub_i32 s14, 0, s3
2427; GFX9-NEXT:    s_ashr_i32 s8, s4, 31
2428; GFX9-NEXT:    v_mul_f32_e32 v0, s15, v0
2429; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
2430; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v1
2431; GFX9-NEXT:    s_add_i32 s4, s4, s8
2432; GFX9-NEXT:    s_xor_b32 s4, s4, s8
2433; GFX9-NEXT:    v_mul_lo_u32 v2, s14, v0
2434; GFX9-NEXT:    v_mul_f32_e32 v1, s15, v1
2435; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
2436; GFX9-NEXT:    s_sub_i32 s14, 0, s9
2437; GFX9-NEXT:    v_mul_hi_u32 v2, v0, v2
2438; GFX9-NEXT:    s_ashr_i32 s13, s5, 31
2439; GFX9-NEXT:    v_mul_lo_u32 v3, s14, v1
2440; GFX9-NEXT:    s_add_i32 s5, s5, s13
2441; GFX9-NEXT:    v_add_u32_e32 v0, v0, v2
2442; GFX9-NEXT:    v_mul_hi_u32 v0, s4, v0
2443; GFX9-NEXT:    v_mul_hi_u32 v2, v1, v3
2444; GFX9-NEXT:    s_xor_b32 s5, s5, s13
2445; GFX9-NEXT:    s_xor_b32 s2, s8, s2
2446; GFX9-NEXT:    v_mul_lo_u32 v3, v0, s3
2447; GFX9-NEXT:    v_add_u32_e32 v1, v1, v2
2448; GFX9-NEXT:    v_add_u32_e32 v2, 1, v0
2449; GFX9-NEXT:    v_mul_hi_u32 v1, s5, v1
2450; GFX9-NEXT:    v_sub_u32_e32 v3, s4, v3
2451; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v3
2452; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
2453; GFX9-NEXT:    v_subrev_u32_e32 v2, s3, v3
2454; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
2455; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v2
2456; GFX9-NEXT:    s_ashr_i32 s3, s10, 31
2457; GFX9-NEXT:    s_add_i32 s4, s10, s3
2458; GFX9-NEXT:    v_add_u32_e32 v3, 1, v0
2459; GFX9-NEXT:    s_xor_b32 s4, s4, s3
2460; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
2461; GFX9-NEXT:    v_cvt_f32_u32_e32 v3, s4
2462; GFX9-NEXT:    v_mul_lo_u32 v2, v1, s9
2463; GFX9-NEXT:    v_add_u32_e32 v5, 1, v1
2464; GFX9-NEXT:    s_ashr_i32 s8, s11, 31
2465; GFX9-NEXT:    v_rcp_iflag_f32_e32 v3, v3
2466; GFX9-NEXT:    v_sub_u32_e32 v2, s5, v2
2467; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s9, v2
2468; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
2469; GFX9-NEXT:    v_mul_f32_e32 v3, s15, v3
2470; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v3
2471; GFX9-NEXT:    v_subrev_u32_e32 v5, s9, v2
2472; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
2473; GFX9-NEXT:    s_sub_i32 s5, 0, s4
2474; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s9, v2
2475; GFX9-NEXT:    v_mul_lo_u32 v2, s5, v3
2476; GFX9-NEXT:    s_add_i32 s9, s11, s8
2477; GFX9-NEXT:    v_add_u32_e32 v5, 1, v1
2478; GFX9-NEXT:    s_xor_b32 s9, s9, s8
2479; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
2480; GFX9-NEXT:    v_mul_hi_u32 v2, v3, v2
2481; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, s9
2482; GFX9-NEXT:    s_ashr_i32 s5, s6, 31
2483; GFX9-NEXT:    s_add_i32 s6, s6, s5
2484; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
2485; GFX9-NEXT:    v_rcp_iflag_f32_e32 v3, v5
2486; GFX9-NEXT:    s_xor_b32 s6, s6, s5
2487; GFX9-NEXT:    v_mul_hi_u32 v2, s6, v2
2488; GFX9-NEXT:    v_xor_b32_e32 v0, s2, v0
2489; GFX9-NEXT:    v_mul_f32_e32 v3, s15, v3
2490; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v3
2491; GFX9-NEXT:    v_subrev_u32_e32 v0, s2, v0
2492; GFX9-NEXT:    s_xor_b32 s2, s13, s12
2493; GFX9-NEXT:    v_mul_lo_u32 v5, v2, s4
2494; GFX9-NEXT:    v_xor_b32_e32 v1, s2, v1
2495; GFX9-NEXT:    v_subrev_u32_e32 v1, s2, v1
2496; GFX9-NEXT:    s_xor_b32 s2, s5, s3
2497; GFX9-NEXT:    s_sub_i32 s3, 0, s9
2498; GFX9-NEXT:    v_mul_lo_u32 v7, s3, v3
2499; GFX9-NEXT:    v_sub_u32_e32 v5, s6, v5
2500; GFX9-NEXT:    v_add_u32_e32 v6, 1, v2
2501; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s4, v5
2502; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
2503; GFX9-NEXT:    v_subrev_u32_e32 v6, s4, v5
2504; GFX9-NEXT:    v_cndmask_b32_e32 v5, v5, v6, vcc
2505; GFX9-NEXT:    v_mul_hi_u32 v6, v3, v7
2506; GFX9-NEXT:    s_ashr_i32 s3, s7, 31
2507; GFX9-NEXT:    s_add_i32 s5, s7, s3
2508; GFX9-NEXT:    s_xor_b32 s5, s5, s3
2509; GFX9-NEXT:    v_add_u32_e32 v3, v3, v6
2510; GFX9-NEXT:    v_mul_hi_u32 v3, s5, v3
2511; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s4, v5
2512; GFX9-NEXT:    v_add_u32_e32 v6, 1, v2
2513; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
2514; GFX9-NEXT:    v_mul_lo_u32 v5, v3, s9
2515; GFX9-NEXT:    v_add_u32_e32 v6, 1, v3
2516; GFX9-NEXT:    v_xor_b32_e32 v2, s2, v2
2517; GFX9-NEXT:    v_subrev_u32_e32 v2, s2, v2
2518; GFX9-NEXT:    v_sub_u32_e32 v5, s5, v5
2519; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s9, v5
2520; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
2521; GFX9-NEXT:    v_subrev_u32_e32 v6, s9, v5
2522; GFX9-NEXT:    v_cndmask_b32_e32 v5, v5, v6, vcc
2523; GFX9-NEXT:    v_add_u32_e32 v6, 1, v3
2524; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s9, v5
2525; GFX9-NEXT:    s_xor_b32 s2, s3, s8
2526; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
2527; GFX9-NEXT:    v_xor_b32_e32 v3, s2, v3
2528; GFX9-NEXT:    v_subrev_u32_e32 v3, s2, v3
2529; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
2530; GFX9-NEXT:    s_endpgm
2531;
2532; GFX90A-LABEL: sdiv_v4i32:
2533; GFX90A:       ; %bb.0:
2534; GFX90A-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x34
2535; GFX90A-NEXT:    s_mov_b32 s13, 0x4f7ffffe
2536; GFX90A-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2537; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
2538; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
2539; GFX90A-NEXT:    s_ashr_i32 s2, s8, 31
2540; GFX90A-NEXT:    s_add_i32 s3, s8, s2
2541; GFX90A-NEXT:    s_xor_b32 s3, s3, s2
2542; GFX90A-NEXT:    v_cvt_f32_u32_e32 v0, s3
2543; GFX90A-NEXT:    s_ashr_i32 s8, s4, 31
2544; GFX90A-NEXT:    s_add_i32 s4, s4, s8
2545; GFX90A-NEXT:    s_xor_b32 s2, s8, s2
2546; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v0, v0
2547; GFX90A-NEXT:    s_xor_b32 s4, s4, s8
2548; GFX90A-NEXT:    s_sub_i32 s8, 0, s3
2549; GFX90A-NEXT:    s_ashr_i32 s12, s9, 31
2550; GFX90A-NEXT:    v_mul_f32_e32 v0, s13, v0
2551; GFX90A-NEXT:    v_cvt_u32_f32_e32 v0, v0
2552; GFX90A-NEXT:    v_mul_lo_u32 v1, s8, v0
2553; GFX90A-NEXT:    v_mul_hi_u32 v1, v0, v1
2554; GFX90A-NEXT:    v_add_u32_e32 v0, v0, v1
2555; GFX90A-NEXT:    v_mul_hi_u32 v0, s4, v0
2556; GFX90A-NEXT:    v_mul_lo_u32 v1, v0, s3
2557; GFX90A-NEXT:    v_sub_u32_e32 v1, s4, v1
2558; GFX90A-NEXT:    s_add_i32 s4, s9, s12
2559; GFX90A-NEXT:    s_xor_b32 s4, s4, s12
2560; GFX90A-NEXT:    v_cvt_f32_u32_e32 v3, s4
2561; GFX90A-NEXT:    v_add_u32_e32 v2, 1, v0
2562; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s3, v1
2563; GFX90A-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
2564; GFX90A-NEXT:    v_subrev_u32_e32 v2, s3, v1
2565; GFX90A-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
2566; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s3, v1
2567; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v1, v3
2568; GFX90A-NEXT:    v_add_u32_e32 v2, 1, v0
2569; GFX90A-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
2570; GFX90A-NEXT:    v_xor_b32_e32 v0, s2, v0
2571; GFX90A-NEXT:    v_mul_f32_e32 v1, s13, v1
2572; GFX90A-NEXT:    v_cvt_u32_f32_e32 v1, v1
2573; GFX90A-NEXT:    v_subrev_u32_e32 v0, s2, v0
2574; GFX90A-NEXT:    s_ashr_i32 s2, s5, 31
2575; GFX90A-NEXT:    s_add_i32 s5, s5, s2
2576; GFX90A-NEXT:    s_xor_b32 s3, s2, s12
2577; GFX90A-NEXT:    s_xor_b32 s2, s5, s2
2578; GFX90A-NEXT:    s_sub_i32 s5, 0, s4
2579; GFX90A-NEXT:    v_mul_lo_u32 v2, s5, v1
2580; GFX90A-NEXT:    v_mul_hi_u32 v2, v1, v2
2581; GFX90A-NEXT:    v_add_u32_e32 v1, v1, v2
2582; GFX90A-NEXT:    v_mul_hi_u32 v1, s2, v1
2583; GFX90A-NEXT:    v_mul_lo_u32 v2, v1, s4
2584; GFX90A-NEXT:    v_sub_u32_e32 v2, s2, v2
2585; GFX90A-NEXT:    s_ashr_i32 s2, s10, 31
2586; GFX90A-NEXT:    s_add_i32 s5, s10, s2
2587; GFX90A-NEXT:    s_xor_b32 s5, s5, s2
2588; GFX90A-NEXT:    v_cvt_f32_u32_e32 v5, s5
2589; GFX90A-NEXT:    v_add_u32_e32 v3, 1, v1
2590; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s4, v2
2591; GFX90A-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
2592; GFX90A-NEXT:    v_subrev_u32_e32 v3, s4, v2
2593; GFX90A-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
2594; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s4, v2
2595; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v2, v5
2596; GFX90A-NEXT:    v_add_u32_e32 v3, 1, v1
2597; GFX90A-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
2598; GFX90A-NEXT:    v_xor_b32_e32 v1, s3, v1
2599; GFX90A-NEXT:    v_mul_f32_e32 v2, s13, v2
2600; GFX90A-NEXT:    v_cvt_u32_f32_e32 v2, v2
2601; GFX90A-NEXT:    v_subrev_u32_e32 v1, s3, v1
2602; GFX90A-NEXT:    s_ashr_i32 s3, s6, 31
2603; GFX90A-NEXT:    s_add_i32 s4, s6, s3
2604; GFX90A-NEXT:    s_xor_b32 s2, s3, s2
2605; GFX90A-NEXT:    s_xor_b32 s3, s4, s3
2606; GFX90A-NEXT:    s_sub_i32 s4, 0, s5
2607; GFX90A-NEXT:    v_mul_lo_u32 v3, s4, v2
2608; GFX90A-NEXT:    v_mul_hi_u32 v3, v2, v3
2609; GFX90A-NEXT:    v_add_u32_e32 v2, v2, v3
2610; GFX90A-NEXT:    v_mul_hi_u32 v2, s3, v2
2611; GFX90A-NEXT:    v_mul_lo_u32 v3, v2, s5
2612; GFX90A-NEXT:    v_sub_u32_e32 v3, s3, v3
2613; GFX90A-NEXT:    s_ashr_i32 s3, s11, 31
2614; GFX90A-NEXT:    s_add_i32 s4, s11, s3
2615; GFX90A-NEXT:    s_xor_b32 s4, s4, s3
2616; GFX90A-NEXT:    v_cvt_f32_u32_e32 v6, s4
2617; GFX90A-NEXT:    v_add_u32_e32 v5, 1, v2
2618; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s5, v3
2619; GFX90A-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
2620; GFX90A-NEXT:    v_subrev_u32_e32 v5, s5, v3
2621; GFX90A-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
2622; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s5, v3
2623; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v3, v6
2624; GFX90A-NEXT:    v_add_u32_e32 v5, 1, v2
2625; GFX90A-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
2626; GFX90A-NEXT:    v_xor_b32_e32 v2, s2, v2
2627; GFX90A-NEXT:    v_mul_f32_e32 v3, s13, v3
2628; GFX90A-NEXT:    v_cvt_u32_f32_e32 v3, v3
2629; GFX90A-NEXT:    v_subrev_u32_e32 v2, s2, v2
2630; GFX90A-NEXT:    s_ashr_i32 s2, s7, 31
2631; GFX90A-NEXT:    s_add_i32 s5, s7, s2
2632; GFX90A-NEXT:    s_xor_b32 s3, s2, s3
2633; GFX90A-NEXT:    s_xor_b32 s2, s5, s2
2634; GFX90A-NEXT:    s_sub_i32 s5, 0, s4
2635; GFX90A-NEXT:    v_mul_lo_u32 v5, s5, v3
2636; GFX90A-NEXT:    v_mul_hi_u32 v5, v3, v5
2637; GFX90A-NEXT:    v_add_u32_e32 v3, v3, v5
2638; GFX90A-NEXT:    v_mul_hi_u32 v3, s2, v3
2639; GFX90A-NEXT:    v_mul_lo_u32 v5, v3, s4
2640; GFX90A-NEXT:    v_sub_u32_e32 v5, s2, v5
2641; GFX90A-NEXT:    v_add_u32_e32 v6, 1, v3
2642; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s4, v5
2643; GFX90A-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
2644; GFX90A-NEXT:    v_subrev_u32_e32 v6, s4, v5
2645; GFX90A-NEXT:    v_cndmask_b32_e32 v5, v5, v6, vcc
2646; GFX90A-NEXT:    v_add_u32_e32 v6, 1, v3
2647; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s4, v5
2648; GFX90A-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
2649; GFX90A-NEXT:    v_xor_b32_e32 v3, s3, v3
2650; GFX90A-NEXT:    v_subrev_u32_e32 v3, s3, v3
2651; GFX90A-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
2652; GFX90A-NEXT:    s_endpgm
2653  %r = sdiv <4 x i32> %x, %y
2654  store <4 x i32> %r, <4 x i32> addrspace(1)* %out
2655  ret void
2656}
2657
; Test srem_v4i32 -- signed remainder of two <4 x i32> kernel arguments, with
; the result stored through %out.
;
; The lines carrying the CHECK prefix match the IR printed by the opt RUN line
; (-amdgpu-codegenprepare). They show the vector srem scalarized lane by lane:
; each lane of %x and %y is extracted, both values are made non-negative
; (ashr 31 / add / xor), an unsigned remainder is formed from an
; llvm.amdgcn.rcp.f32 based reciprocal estimate followed by two
; compare / subtract / select correction steps, and the sign mask of the %x
; lane is applied again (xor / sub) before the lane is inserted into the result.
;
; The lines carrying the GFX6, GFX9 and GFX90A prefixes match the llc output
; for -mcpu=tahiti, -mcpu=gfx900 and -mcpu=gfx90a respectively.
;
; All assertions in this function are autogenerated (see the NOTE lines at the
; top of the file); regenerate them with the update scripts instead of editing
; them by hand.
2658define amdgpu_kernel void @srem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x, <4 x i32> %y) {
2659; CHECK-LABEL: @srem_v4i32(
2660; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i32> [[X:%.*]], i64 0
2661; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i32> [[Y:%.*]], i64 0
2662; CHECK-NEXT:    [[TMP3:%.*]] = ashr i32 [[TMP1]], 31
2663; CHECK-NEXT:    [[TMP4:%.*]] = ashr i32 [[TMP2]], 31
2664; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[TMP1]], [[TMP3]]
2665; CHECK-NEXT:    [[TMP6:%.*]] = add i32 [[TMP2]], [[TMP4]]
2666; CHECK-NEXT:    [[TMP7:%.*]] = xor i32 [[TMP5]], [[TMP3]]
2667; CHECK-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP4]]
2668; CHECK-NEXT:    [[TMP9:%.*]] = uitofp i32 [[TMP8]] to float
2669; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]])
2670; CHECK-NEXT:    [[TMP11:%.*]] = fmul fast float [[TMP10]], 0x41EFFFFFC0000000
2671; CHECK-NEXT:    [[TMP12:%.*]] = fptoui float [[TMP11]] to i32
2672; CHECK-NEXT:    [[TMP13:%.*]] = sub i32 0, [[TMP8]]
2673; CHECK-NEXT:    [[TMP14:%.*]] = mul i32 [[TMP13]], [[TMP12]]
2674; CHECK-NEXT:    [[TMP15:%.*]] = zext i32 [[TMP12]] to i64
2675; CHECK-NEXT:    [[TMP16:%.*]] = zext i32 [[TMP14]] to i64
2676; CHECK-NEXT:    [[TMP17:%.*]] = mul i64 [[TMP15]], [[TMP16]]
2677; CHECK-NEXT:    [[TMP18:%.*]] = trunc i64 [[TMP17]] to i32
2678; CHECK-NEXT:    [[TMP19:%.*]] = lshr i64 [[TMP17]], 32
2679; CHECK-NEXT:    [[TMP20:%.*]] = trunc i64 [[TMP19]] to i32
2680; CHECK-NEXT:    [[TMP21:%.*]] = add i32 [[TMP12]], [[TMP20]]
2681; CHECK-NEXT:    [[TMP22:%.*]] = zext i32 [[TMP7]] to i64
2682; CHECK-NEXT:    [[TMP23:%.*]] = zext i32 [[TMP21]] to i64
2683; CHECK-NEXT:    [[TMP24:%.*]] = mul i64 [[TMP22]], [[TMP23]]
2684; CHECK-NEXT:    [[TMP25:%.*]] = trunc i64 [[TMP24]] to i32
2685; CHECK-NEXT:    [[TMP26:%.*]] = lshr i64 [[TMP24]], 32
2686; CHECK-NEXT:    [[TMP27:%.*]] = trunc i64 [[TMP26]] to i32
2687; CHECK-NEXT:    [[TMP28:%.*]] = mul i32 [[TMP27]], [[TMP8]]
2688; CHECK-NEXT:    [[TMP29:%.*]] = sub i32 [[TMP7]], [[TMP28]]
2689; CHECK-NEXT:    [[TMP30:%.*]] = icmp uge i32 [[TMP29]], [[TMP8]]
2690; CHECK-NEXT:    [[TMP31:%.*]] = sub i32 [[TMP29]], [[TMP8]]
2691; CHECK-NEXT:    [[TMP32:%.*]] = select i1 [[TMP30]], i32 [[TMP31]], i32 [[TMP29]]
2692; CHECK-NEXT:    [[TMP33:%.*]] = icmp uge i32 [[TMP32]], [[TMP8]]
2693; CHECK-NEXT:    [[TMP34:%.*]] = sub i32 [[TMP32]], [[TMP8]]
2694; CHECK-NEXT:    [[TMP35:%.*]] = select i1 [[TMP33]], i32 [[TMP34]], i32 [[TMP32]]
2695; CHECK-NEXT:    [[TMP36:%.*]] = xor i32 [[TMP35]], [[TMP3]]
2696; CHECK-NEXT:    [[TMP37:%.*]] = sub i32 [[TMP36]], [[TMP3]]
2697; CHECK-NEXT:    [[TMP38:%.*]] = insertelement <4 x i32> undef, i32 [[TMP37]], i64 0
2698; CHECK-NEXT:    [[TMP39:%.*]] = extractelement <4 x i32> [[X]], i64 1
2699; CHECK-NEXT:    [[TMP40:%.*]] = extractelement <4 x i32> [[Y]], i64 1
2700; CHECK-NEXT:    [[TMP41:%.*]] = ashr i32 [[TMP39]], 31
2701; CHECK-NEXT:    [[TMP42:%.*]] = ashr i32 [[TMP40]], 31
2702; CHECK-NEXT:    [[TMP43:%.*]] = add i32 [[TMP39]], [[TMP41]]
2703; CHECK-NEXT:    [[TMP44:%.*]] = add i32 [[TMP40]], [[TMP42]]
2704; CHECK-NEXT:    [[TMP45:%.*]] = xor i32 [[TMP43]], [[TMP41]]
2705; CHECK-NEXT:    [[TMP46:%.*]] = xor i32 [[TMP44]], [[TMP42]]
2706; CHECK-NEXT:    [[TMP47:%.*]] = uitofp i32 [[TMP46]] to float
2707; CHECK-NEXT:    [[TMP48:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP47]])
2708; CHECK-NEXT:    [[TMP49:%.*]] = fmul fast float [[TMP48]], 0x41EFFFFFC0000000
2709; CHECK-NEXT:    [[TMP50:%.*]] = fptoui float [[TMP49]] to i32
2710; CHECK-NEXT:    [[TMP51:%.*]] = sub i32 0, [[TMP46]]
2711; CHECK-NEXT:    [[TMP52:%.*]] = mul i32 [[TMP51]], [[TMP50]]
2712; CHECK-NEXT:    [[TMP53:%.*]] = zext i32 [[TMP50]] to i64
2713; CHECK-NEXT:    [[TMP54:%.*]] = zext i32 [[TMP52]] to i64
2714; CHECK-NEXT:    [[TMP55:%.*]] = mul i64 [[TMP53]], [[TMP54]]
2715; CHECK-NEXT:    [[TMP56:%.*]] = trunc i64 [[TMP55]] to i32
2716; CHECK-NEXT:    [[TMP57:%.*]] = lshr i64 [[TMP55]], 32
2717; CHECK-NEXT:    [[TMP58:%.*]] = trunc i64 [[TMP57]] to i32
2718; CHECK-NEXT:    [[TMP59:%.*]] = add i32 [[TMP50]], [[TMP58]]
2719; CHECK-NEXT:    [[TMP60:%.*]] = zext i32 [[TMP45]] to i64
2720; CHECK-NEXT:    [[TMP61:%.*]] = zext i32 [[TMP59]] to i64
2721; CHECK-NEXT:    [[TMP62:%.*]] = mul i64 [[TMP60]], [[TMP61]]
2722; CHECK-NEXT:    [[TMP63:%.*]] = trunc i64 [[TMP62]] to i32
2723; CHECK-NEXT:    [[TMP64:%.*]] = lshr i64 [[TMP62]], 32
2724; CHECK-NEXT:    [[TMP65:%.*]] = trunc i64 [[TMP64]] to i32
2725; CHECK-NEXT:    [[TMP66:%.*]] = mul i32 [[TMP65]], [[TMP46]]
2726; CHECK-NEXT:    [[TMP67:%.*]] = sub i32 [[TMP45]], [[TMP66]]
2727; CHECK-NEXT:    [[TMP68:%.*]] = icmp uge i32 [[TMP67]], [[TMP46]]
2728; CHECK-NEXT:    [[TMP69:%.*]] = sub i32 [[TMP67]], [[TMP46]]
2729; CHECK-NEXT:    [[TMP70:%.*]] = select i1 [[TMP68]], i32 [[TMP69]], i32 [[TMP67]]
2730; CHECK-NEXT:    [[TMP71:%.*]] = icmp uge i32 [[TMP70]], [[TMP46]]
2731; CHECK-NEXT:    [[TMP72:%.*]] = sub i32 [[TMP70]], [[TMP46]]
2732; CHECK-NEXT:    [[TMP73:%.*]] = select i1 [[TMP71]], i32 [[TMP72]], i32 [[TMP70]]
2733; CHECK-NEXT:    [[TMP74:%.*]] = xor i32 [[TMP73]], [[TMP41]]
2734; CHECK-NEXT:    [[TMP75:%.*]] = sub i32 [[TMP74]], [[TMP41]]
2735; CHECK-NEXT:    [[TMP76:%.*]] = insertelement <4 x i32> [[TMP38]], i32 [[TMP75]], i64 1
2736; CHECK-NEXT:    [[TMP77:%.*]] = extractelement <4 x i32> [[X]], i64 2
2737; CHECK-NEXT:    [[TMP78:%.*]] = extractelement <4 x i32> [[Y]], i64 2
2738; CHECK-NEXT:    [[TMP79:%.*]] = ashr i32 [[TMP77]], 31
2739; CHECK-NEXT:    [[TMP80:%.*]] = ashr i32 [[TMP78]], 31
2740; CHECK-NEXT:    [[TMP81:%.*]] = add i32 [[TMP77]], [[TMP79]]
2741; CHECK-NEXT:    [[TMP82:%.*]] = add i32 [[TMP78]], [[TMP80]]
2742; CHECK-NEXT:    [[TMP83:%.*]] = xor i32 [[TMP81]], [[TMP79]]
2743; CHECK-NEXT:    [[TMP84:%.*]] = xor i32 [[TMP82]], [[TMP80]]
2744; CHECK-NEXT:    [[TMP85:%.*]] = uitofp i32 [[TMP84]] to float
2745; CHECK-NEXT:    [[TMP86:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP85]])
2746; CHECK-NEXT:    [[TMP87:%.*]] = fmul fast float [[TMP86]], 0x41EFFFFFC0000000
2747; CHECK-NEXT:    [[TMP88:%.*]] = fptoui float [[TMP87]] to i32
2748; CHECK-NEXT:    [[TMP89:%.*]] = sub i32 0, [[TMP84]]
2749; CHECK-NEXT:    [[TMP90:%.*]] = mul i32 [[TMP89]], [[TMP88]]
2750; CHECK-NEXT:    [[TMP91:%.*]] = zext i32 [[TMP88]] to i64
2751; CHECK-NEXT:    [[TMP92:%.*]] = zext i32 [[TMP90]] to i64
2752; CHECK-NEXT:    [[TMP93:%.*]] = mul i64 [[TMP91]], [[TMP92]]
2753; CHECK-NEXT:    [[TMP94:%.*]] = trunc i64 [[TMP93]] to i32
2754; CHECK-NEXT:    [[TMP95:%.*]] = lshr i64 [[TMP93]], 32
2755; CHECK-NEXT:    [[TMP96:%.*]] = trunc i64 [[TMP95]] to i32
2756; CHECK-NEXT:    [[TMP97:%.*]] = add i32 [[TMP88]], [[TMP96]]
2757; CHECK-NEXT:    [[TMP98:%.*]] = zext i32 [[TMP83]] to i64
2758; CHECK-NEXT:    [[TMP99:%.*]] = zext i32 [[TMP97]] to i64
2759; CHECK-NEXT:    [[TMP100:%.*]] = mul i64 [[TMP98]], [[TMP99]]
2760; CHECK-NEXT:    [[TMP101:%.*]] = trunc i64 [[TMP100]] to i32
2761; CHECK-NEXT:    [[TMP102:%.*]] = lshr i64 [[TMP100]], 32
2762; CHECK-NEXT:    [[TMP103:%.*]] = trunc i64 [[TMP102]] to i32
2763; CHECK-NEXT:    [[TMP104:%.*]] = mul i32 [[TMP103]], [[TMP84]]
2764; CHECK-NEXT:    [[TMP105:%.*]] = sub i32 [[TMP83]], [[TMP104]]
2765; CHECK-NEXT:    [[TMP106:%.*]] = icmp uge i32 [[TMP105]], [[TMP84]]
2766; CHECK-NEXT:    [[TMP107:%.*]] = sub i32 [[TMP105]], [[TMP84]]
2767; CHECK-NEXT:    [[TMP108:%.*]] = select i1 [[TMP106]], i32 [[TMP107]], i32 [[TMP105]]
2768; CHECK-NEXT:    [[TMP109:%.*]] = icmp uge i32 [[TMP108]], [[TMP84]]
2769; CHECK-NEXT:    [[TMP110:%.*]] = sub i32 [[TMP108]], [[TMP84]]
2770; CHECK-NEXT:    [[TMP111:%.*]] = select i1 [[TMP109]], i32 [[TMP110]], i32 [[TMP108]]
2771; CHECK-NEXT:    [[TMP112:%.*]] = xor i32 [[TMP111]], [[TMP79]]
2772; CHECK-NEXT:    [[TMP113:%.*]] = sub i32 [[TMP112]], [[TMP79]]
2773; CHECK-NEXT:    [[TMP114:%.*]] = insertelement <4 x i32> [[TMP76]], i32 [[TMP113]], i64 2
2774; CHECK-NEXT:    [[TMP115:%.*]] = extractelement <4 x i32> [[X]], i64 3
2775; CHECK-NEXT:    [[TMP116:%.*]] = extractelement <4 x i32> [[Y]], i64 3
2776; CHECK-NEXT:    [[TMP117:%.*]] = ashr i32 [[TMP115]], 31
2777; CHECK-NEXT:    [[TMP118:%.*]] = ashr i32 [[TMP116]], 31
2778; CHECK-NEXT:    [[TMP119:%.*]] = add i32 [[TMP115]], [[TMP117]]
2779; CHECK-NEXT:    [[TMP120:%.*]] = add i32 [[TMP116]], [[TMP118]]
2780; CHECK-NEXT:    [[TMP121:%.*]] = xor i32 [[TMP119]], [[TMP117]]
2781; CHECK-NEXT:    [[TMP122:%.*]] = xor i32 [[TMP120]], [[TMP118]]
2782; CHECK-NEXT:    [[TMP123:%.*]] = uitofp i32 [[TMP122]] to float
2783; CHECK-NEXT:    [[TMP124:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP123]])
2784; CHECK-NEXT:    [[TMP125:%.*]] = fmul fast float [[TMP124]], 0x41EFFFFFC0000000
2785; CHECK-NEXT:    [[TMP126:%.*]] = fptoui float [[TMP125]] to i32
2786; CHECK-NEXT:    [[TMP127:%.*]] = sub i32 0, [[TMP122]]
2787; CHECK-NEXT:    [[TMP128:%.*]] = mul i32 [[TMP127]], [[TMP126]]
2788; CHECK-NEXT:    [[TMP129:%.*]] = zext i32 [[TMP126]] to i64
2789; CHECK-NEXT:    [[TMP130:%.*]] = zext i32 [[TMP128]] to i64
2790; CHECK-NEXT:    [[TMP131:%.*]] = mul i64 [[TMP129]], [[TMP130]]
2791; CHECK-NEXT:    [[TMP132:%.*]] = trunc i64 [[TMP131]] to i32
2792; CHECK-NEXT:    [[TMP133:%.*]] = lshr i64 [[TMP131]], 32
2793; CHECK-NEXT:    [[TMP134:%.*]] = trunc i64 [[TMP133]] to i32
2794; CHECK-NEXT:    [[TMP135:%.*]] = add i32 [[TMP126]], [[TMP134]]
2795; CHECK-NEXT:    [[TMP136:%.*]] = zext i32 [[TMP121]] to i64
2796; CHECK-NEXT:    [[TMP137:%.*]] = zext i32 [[TMP135]] to i64
2797; CHECK-NEXT:    [[TMP138:%.*]] = mul i64 [[TMP136]], [[TMP137]]
2798; CHECK-NEXT:    [[TMP139:%.*]] = trunc i64 [[TMP138]] to i32
2799; CHECK-NEXT:    [[TMP140:%.*]] = lshr i64 [[TMP138]], 32
2800; CHECK-NEXT:    [[TMP141:%.*]] = trunc i64 [[TMP140]] to i32
2801; CHECK-NEXT:    [[TMP142:%.*]] = mul i32 [[TMP141]], [[TMP122]]
2802; CHECK-NEXT:    [[TMP143:%.*]] = sub i32 [[TMP121]], [[TMP142]]
2803; CHECK-NEXT:    [[TMP144:%.*]] = icmp uge i32 [[TMP143]], [[TMP122]]
2804; CHECK-NEXT:    [[TMP145:%.*]] = sub i32 [[TMP143]], [[TMP122]]
2805; CHECK-NEXT:    [[TMP146:%.*]] = select i1 [[TMP144]], i32 [[TMP145]], i32 [[TMP143]]
2806; CHECK-NEXT:    [[TMP147:%.*]] = icmp uge i32 [[TMP146]], [[TMP122]]
2807; CHECK-NEXT:    [[TMP148:%.*]] = sub i32 [[TMP146]], [[TMP122]]
2808; CHECK-NEXT:    [[TMP149:%.*]] = select i1 [[TMP147]], i32 [[TMP148]], i32 [[TMP146]]
2809; CHECK-NEXT:    [[TMP150:%.*]] = xor i32 [[TMP149]], [[TMP117]]
2810; CHECK-NEXT:    [[TMP151:%.*]] = sub i32 [[TMP150]], [[TMP117]]
2811; CHECK-NEXT:    [[TMP152:%.*]] = insertelement <4 x i32> [[TMP114]], i32 [[TMP151]], i64 3
2812; CHECK-NEXT:    store <4 x i32> [[TMP152]], <4 x i32> addrspace(1)* [[OUT:%.*]], align 16
2813; CHECK-NEXT:    ret void
2814;
2815; GFX6-LABEL: srem_v4i32:
2816; GFX6:       ; %bb.0:
2817; GFX6-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0xd
2818; GFX6-NEXT:    s_mov_b32 s14, 0x4f7ffffe
2819; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
2820; GFX6-NEXT:    s_mov_b32 s3, 0xf000
2821; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2822; GFX6-NEXT:    s_ashr_i32 s2, s8, 31
2823; GFX6-NEXT:    s_add_i32 s8, s8, s2
2824; GFX6-NEXT:    s_xor_b32 s8, s8, s2
2825; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s8
2826; GFX6-NEXT:    s_ashr_i32 s12, s9, 31
2827; GFX6-NEXT:    s_add_i32 s9, s9, s12
2828; GFX6-NEXT:    s_xor_b32 s9, s9, s12
2829; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
2830; GFX6-NEXT:    s_sub_i32 s13, 0, s8
2831; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s9
2832; GFX6-NEXT:    s_ashr_i32 s12, s4, 31
2833; GFX6-NEXT:    v_mul_f32_e32 v0, s14, v0
2834; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
2835; GFX6-NEXT:    v_rcp_iflag_f32_e32 v1, v1
2836; GFX6-NEXT:    s_add_i32 s4, s4, s12
2837; GFX6-NEXT:    s_xor_b32 s4, s4, s12
2838; GFX6-NEXT:    v_mul_lo_u32 v2, s13, v0
2839; GFX6-NEXT:    v_mul_f32_e32 v1, s14, v1
2840; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
2841; GFX6-NEXT:    s_sub_i32 s13, 0, s9
2842; GFX6-NEXT:    v_mul_hi_u32 v2, v0, v2
2843; GFX6-NEXT:    s_mov_b32 s2, -1
2844; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
2845; GFX6-NEXT:    v_mul_hi_u32 v0, s4, v0
2846; GFX6-NEXT:    v_mul_lo_u32 v2, s13, v1
2847; GFX6-NEXT:    s_ashr_i32 s13, s5, 31
2848; GFX6-NEXT:    s_add_i32 s5, s5, s13
2849; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s8
2850; GFX6-NEXT:    v_mul_hi_u32 v2, v1, v2
2851; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
2852; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s8, v0
2853; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s8, v0
2854; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
2855; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s8, v0
2856; GFX6-NEXT:    s_xor_b32 s4, s5, s13
2857; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
2858; GFX6-NEXT:    s_ashr_i32 s5, s10, 31
2859; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s8, v0
2860; GFX6-NEXT:    s_add_i32 s8, s10, s5
2861; GFX6-NEXT:    s_xor_b32 s5, s8, s5
2862; GFX6-NEXT:    v_cvt_f32_u32_e32 v2, s5
2863; GFX6-NEXT:    v_mul_hi_u32 v1, s4, v1
2864; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
2865; GFX6-NEXT:    v_xor_b32_e32 v0, s12, v0
2866; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v2
2867; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s9
2868; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s12, v0
2869; GFX6-NEXT:    v_mul_f32_e32 v2, s14, v2
2870; GFX6-NEXT:    v_cvt_u32_f32_e32 v2, v2
2871; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s4, v1
2872; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s9, v1
2873; GFX6-NEXT:    s_sub_i32 s4, 0, s5
2874; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s9, v1
2875; GFX6-NEXT:    v_mul_lo_u32 v4, s4, v2
2876; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
2877; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s9, v1
2878; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s9, v1
2879; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
2880; GFX6-NEXT:    v_mul_hi_u32 v3, v2, v4
2881; GFX6-NEXT:    s_ashr_i32 s8, s11, 31
2882; GFX6-NEXT:    s_add_i32 s9, s11, s8
2883; GFX6-NEXT:    s_ashr_i32 s4, s6, 31
2884; GFX6-NEXT:    s_xor_b32 s8, s9, s8
2885; GFX6-NEXT:    s_add_i32 s6, s6, s4
2886; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
2887; GFX6-NEXT:    v_cvt_f32_u32_e32 v3, s8
2888; GFX6-NEXT:    s_xor_b32 s6, s6, s4
2889; GFX6-NEXT:    v_mul_hi_u32 v2, s6, v2
2890; GFX6-NEXT:    v_xor_b32_e32 v1, s13, v1
2891; GFX6-NEXT:    v_rcp_iflag_f32_e32 v3, v3
2892; GFX6-NEXT:    v_subrev_i32_e32 v1, vcc, s13, v1
2893; GFX6-NEXT:    v_mul_lo_u32 v2, v2, s5
2894; GFX6-NEXT:    v_mul_f32_e32 v3, s14, v3
2895; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v3
2896; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s6, v2
2897; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s5, v2
2898; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s5, v2
2899; GFX6-NEXT:    s_sub_i32 s6, 0, s8
2900; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
2901; GFX6-NEXT:    v_mul_lo_u32 v4, s6, v3
2902; GFX6-NEXT:    s_ashr_i32 s6, s7, 31
2903; GFX6-NEXT:    s_add_i32 s7, s7, s6
2904; GFX6-NEXT:    s_xor_b32 s7, s7, s6
2905; GFX6-NEXT:    v_mul_hi_u32 v4, v3, v4
2906; GFX6-NEXT:    v_subrev_i32_e32 v5, vcc, s5, v2
2907; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
2908; GFX6-NEXT:    v_mul_hi_u32 v3, s7, v3
2909; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s5, v2
2910; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
2911; GFX6-NEXT:    v_xor_b32_e32 v2, s4, v2
2912; GFX6-NEXT:    v_mul_lo_u32 v3, v3, s8
2913; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s4, v2
2914; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s7, v3
2915; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s8, v3
2916; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s8, v3
2917; GFX6-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
2918; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s8, v3
2919; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s8, v3
2920; GFX6-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
2921; GFX6-NEXT:    v_xor_b32_e32 v3, s6, v3
2922; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s6, v3
2923; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
2924; GFX6-NEXT:    s_endpgm
2925;
2926; GFX9-LABEL: srem_v4i32:
2927; GFX9:       ; %bb.0:
2928; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x34
2929; GFX9-NEXT:    s_mov_b32 s13, 0x4f7ffffe
2930; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2931; GFX9-NEXT:    v_mov_b32_e32 v4, 0
2932; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2933; GFX9-NEXT:    s_ashr_i32 s2, s8, 31
2934; GFX9-NEXT:    s_add_i32 s8, s8, s2
2935; GFX9-NEXT:    s_xor_b32 s2, s8, s2
2936; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s2
2937; GFX9-NEXT:    s_ashr_i32 s3, s9, 31
2938; GFX9-NEXT:    s_sub_i32 s12, 0, s2
2939; GFX9-NEXT:    s_add_i32 s8, s9, s3
2940; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
2941; GFX9-NEXT:    s_xor_b32 s3, s8, s3
2942; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s3
2943; GFX9-NEXT:    s_ashr_i32 s8, s4, 31
2944; GFX9-NEXT:    v_mul_f32_e32 v0, s13, v0
2945; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
2946; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v1
2947; GFX9-NEXT:    s_add_i32 s4, s4, s8
2948; GFX9-NEXT:    s_xor_b32 s4, s4, s8
2949; GFX9-NEXT:    v_mul_lo_u32 v2, s12, v0
2950; GFX9-NEXT:    v_mul_f32_e32 v1, s13, v1
2951; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
2952; GFX9-NEXT:    s_sub_i32 s12, 0, s3
2953; GFX9-NEXT:    v_mul_hi_u32 v2, v0, v2
2954; GFX9-NEXT:    s_ashr_i32 s9, s5, 31
2955; GFX9-NEXT:    v_mul_lo_u32 v3, s12, v1
2956; GFX9-NEXT:    s_add_i32 s5, s5, s9
2957; GFX9-NEXT:    v_add_u32_e32 v0, v0, v2
2958; GFX9-NEXT:    v_mul_hi_u32 v0, s4, v0
2959; GFX9-NEXT:    v_mul_hi_u32 v3, v1, v3
2960; GFX9-NEXT:    s_xor_b32 s5, s5, s9
2961; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s2
2962; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
2963; GFX9-NEXT:    v_mul_hi_u32 v1, s5, v1
2964; GFX9-NEXT:    v_sub_u32_e32 v0, s4, v0
2965; GFX9-NEXT:    v_subrev_u32_e32 v2, s2, v0
2966; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s2, v0
2967; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
2968; GFX9-NEXT:    v_subrev_u32_e32 v2, s2, v0
2969; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s2, v0
2970; GFX9-NEXT:    s_ashr_i32 s2, s10, 31
2971; GFX9-NEXT:    s_add_i32 s4, s10, s2
2972; GFX9-NEXT:    s_xor_b32 s2, s4, s2
2973; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
2974; GFX9-NEXT:    v_cvt_f32_u32_e32 v2, s2
2975; GFX9-NEXT:    v_mul_lo_u32 v1, v1, s3
2976; GFX9-NEXT:    v_xor_b32_e32 v0, s8, v0
2977; GFX9-NEXT:    v_subrev_u32_e32 v0, s8, v0
2978; GFX9-NEXT:    v_rcp_iflag_f32_e32 v2, v2
2979; GFX9-NEXT:    v_sub_u32_e32 v1, s5, v1
2980; GFX9-NEXT:    v_subrev_u32_e32 v3, s3, v1
2981; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v1
2982; GFX9-NEXT:    v_mul_f32_e32 v2, s13, v2
2983; GFX9-NEXT:    v_cvt_u32_f32_e32 v2, v2
2984; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
2985; GFX9-NEXT:    v_subrev_u32_e32 v3, s3, v1
2986; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v1
2987; GFX9-NEXT:    s_sub_i32 s3, 0, s2
2988; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
2989; GFX9-NEXT:    v_mul_lo_u32 v3, s3, v2
2990; GFX9-NEXT:    s_ashr_i32 s3, s11, 31
2991; GFX9-NEXT:    s_add_i32 s4, s11, s3
2992; GFX9-NEXT:    s_xor_b32 s3, s4, s3
2993; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, s3
2994; GFX9-NEXT:    v_mul_hi_u32 v3, v2, v3
2995; GFX9-NEXT:    s_ashr_i32 s4, s6, 31
2996; GFX9-NEXT:    s_add_i32 s5, s6, s4
2997; GFX9-NEXT:    v_rcp_iflag_f32_e32 v5, v5
2998; GFX9-NEXT:    v_add_u32_e32 v2, v2, v3
2999; GFX9-NEXT:    s_xor_b32 s5, s5, s4
3000; GFX9-NEXT:    v_mul_hi_u32 v2, s5, v2
3001; GFX9-NEXT:    v_mul_f32_e32 v3, s13, v5
3002; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v3
3003; GFX9-NEXT:    s_sub_i32 s6, 0, s3
3004; GFX9-NEXT:    v_mul_lo_u32 v2, v2, s2
3005; GFX9-NEXT:    v_xor_b32_e32 v1, s9, v1
3006; GFX9-NEXT:    v_mul_lo_u32 v5, s6, v3
3007; GFX9-NEXT:    v_subrev_u32_e32 v1, s9, v1
3008; GFX9-NEXT:    v_sub_u32_e32 v2, s5, v2
3009; GFX9-NEXT:    s_ashr_i32 s5, s7, 31
3010; GFX9-NEXT:    v_mul_hi_u32 v5, v3, v5
3011; GFX9-NEXT:    s_add_i32 s6, s7, s5
3012; GFX9-NEXT:    s_xor_b32 s6, s6, s5
3013; GFX9-NEXT:    v_subrev_u32_e32 v6, s2, v2
3014; GFX9-NEXT:    v_add_u32_e32 v3, v3, v5
3015; GFX9-NEXT:    v_mul_hi_u32 v3, s6, v3
3016; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s2, v2
3017; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
3018; GFX9-NEXT:    v_subrev_u32_e32 v5, s2, v2
3019; GFX9-NEXT:    v_mul_lo_u32 v3, v3, s3
3020; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s2, v2
3021; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
3022; GFX9-NEXT:    v_xor_b32_e32 v2, s4, v2
3023; GFX9-NEXT:    v_sub_u32_e32 v3, s6, v3
3024; GFX9-NEXT:    v_subrev_u32_e32 v5, s3, v3
3025; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v3
3026; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
3027; GFX9-NEXT:    v_subrev_u32_e32 v5, s3, v3
3028; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v3
3029; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
3030; GFX9-NEXT:    v_xor_b32_e32 v3, s5, v3
3031; GFX9-NEXT:    v_subrev_u32_e32 v2, s4, v2
3032; GFX9-NEXT:    v_subrev_u32_e32 v3, s5, v3
3033; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
3034; GFX9-NEXT:    s_endpgm
3035;
3036; GFX90A-LABEL: srem_v4i32:
3037; GFX90A:       ; %bb.0:
3038; GFX90A-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x34
3039; GFX90A-NEXT:    s_mov_b32 s12, 0x4f7ffffe
3040; GFX90A-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3041; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
3042; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
3043; GFX90A-NEXT:    s_ashr_i32 s2, s8, 31
3044; GFX90A-NEXT:    s_add_i32 s3, s8, s2
3045; GFX90A-NEXT:    s_xor_b32 s2, s3, s2
3046; GFX90A-NEXT:    v_cvt_f32_u32_e32 v0, s2
3047; GFX90A-NEXT:    s_ashr_i32 s8, s9, 31
3048; GFX90A-NEXT:    s_add_i32 s9, s9, s8
3049; GFX90A-NEXT:    s_xor_b32 s8, s9, s8
3050; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v0, v0
3051; GFX90A-NEXT:    v_cvt_f32_u32_e32 v1, s8
3052; GFX90A-NEXT:    s_sub_i32 s9, 0, s2
3053; GFX90A-NEXT:    s_ashr_i32 s3, s4, 31
3054; GFX90A-NEXT:    v_mul_f32_e32 v0, s12, v0
3055; GFX90A-NEXT:    v_cvt_u32_f32_e32 v0, v0
3056; GFX90A-NEXT:    s_add_i32 s4, s4, s3
3057; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v1, v1
3058; GFX90A-NEXT:    s_xor_b32 s4, s4, s3
3059; GFX90A-NEXT:    v_mul_lo_u32 v2, s9, v0
3060; GFX90A-NEXT:    v_mul_hi_u32 v2, v0, v2
3061; GFX90A-NEXT:    v_add_u32_e32 v0, v0, v2
3062; GFX90A-NEXT:    v_mul_hi_u32 v0, s4, v0
3063; GFX90A-NEXT:    v_mul_lo_u32 v0, v0, s2
3064; GFX90A-NEXT:    v_sub_u32_e32 v0, s4, v0
3065; GFX90A-NEXT:    v_mul_f32_e32 v1, s12, v1
3066; GFX90A-NEXT:    v_subrev_u32_e32 v2, s2, v0
3067; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s2, v0
3068; GFX90A-NEXT:    v_cvt_u32_f32_e32 v1, v1
3069; GFX90A-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
3070; GFX90A-NEXT:    v_subrev_u32_e32 v2, s2, v0
3071; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s2, v0
3072; GFX90A-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
3073; GFX90A-NEXT:    s_sub_i32 s4, 0, s8
3074; GFX90A-NEXT:    v_xor_b32_e32 v0, s3, v0
3075; GFX90A-NEXT:    s_ashr_i32 s2, s5, 31
3076; GFX90A-NEXT:    v_mul_lo_u32 v2, s4, v1
3077; GFX90A-NEXT:    v_subrev_u32_e32 v0, s3, v0
3078; GFX90A-NEXT:    s_add_i32 s3, s5, s2
3079; GFX90A-NEXT:    v_mul_hi_u32 v2, v1, v2
3080; GFX90A-NEXT:    s_xor_b32 s3, s3, s2
3081; GFX90A-NEXT:    v_add_u32_e32 v1, v1, v2
3082; GFX90A-NEXT:    v_mul_hi_u32 v1, s3, v1
3083; GFX90A-NEXT:    v_mul_lo_u32 v1, v1, s8
3084; GFX90A-NEXT:    v_sub_u32_e32 v1, s3, v1
3085; GFX90A-NEXT:    s_ashr_i32 s3, s10, 31
3086; GFX90A-NEXT:    s_add_i32 s4, s10, s3
3087; GFX90A-NEXT:    v_subrev_u32_e32 v2, s8, v1
3088; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s8, v1
3089; GFX90A-NEXT:    s_xor_b32 s3, s4, s3
3090; GFX90A-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
3091; GFX90A-NEXT:    v_cvt_f32_u32_e32 v2, s3
3092; GFX90A-NEXT:    v_subrev_u32_e32 v3, s8, v1
3093; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s8, v1
3094; GFX90A-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
3095; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v2, v2
3096; GFX90A-NEXT:    v_xor_b32_e32 v1, s2, v1
3097; GFX90A-NEXT:    s_sub_i32 s5, 0, s3
3098; GFX90A-NEXT:    v_subrev_u32_e32 v1, s2, v1
3099; GFX90A-NEXT:    v_mul_f32_e32 v2, s12, v2
3100; GFX90A-NEXT:    v_cvt_u32_f32_e32 v2, v2
3101; GFX90A-NEXT:    s_ashr_i32 s2, s6, 31
3102; GFX90A-NEXT:    s_add_i32 s4, s6, s2
3103; GFX90A-NEXT:    s_xor_b32 s4, s4, s2
3104; GFX90A-NEXT:    v_mul_lo_u32 v3, s5, v2
3105; GFX90A-NEXT:    v_mul_hi_u32 v3, v2, v3
3106; GFX90A-NEXT:    v_add_u32_e32 v2, v2, v3
3107; GFX90A-NEXT:    v_mul_hi_u32 v2, s4, v2
3108; GFX90A-NEXT:    v_mul_lo_u32 v2, v2, s3
3109; GFX90A-NEXT:    v_sub_u32_e32 v2, s4, v2
3110; GFX90A-NEXT:    s_ashr_i32 s4, s11, 31
3111; GFX90A-NEXT:    s_add_i32 s5, s11, s4
3112; GFX90A-NEXT:    v_subrev_u32_e32 v3, s3, v2
3113; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s3, v2
3114; GFX90A-NEXT:    s_xor_b32 s4, s5, s4
3115; GFX90A-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
3116; GFX90A-NEXT:    v_cvt_f32_u32_e32 v3, s4
3117; GFX90A-NEXT:    v_subrev_u32_e32 v5, s3, v2
3118; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s3, v2
3119; GFX90A-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
3120; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v3, v3
3121; GFX90A-NEXT:    v_xor_b32_e32 v2, s2, v2
3122; GFX90A-NEXT:    s_sub_i32 s5, 0, s4
3123; GFX90A-NEXT:    v_subrev_u32_e32 v2, s2, v2
3124; GFX90A-NEXT:    v_mul_f32_e32 v3, s12, v3
3125; GFX90A-NEXT:    v_cvt_u32_f32_e32 v3, v3
3126; GFX90A-NEXT:    s_ashr_i32 s2, s7, 31
3127; GFX90A-NEXT:    s_add_i32 s3, s7, s2
3128; GFX90A-NEXT:    s_xor_b32 s3, s3, s2
3129; GFX90A-NEXT:    v_mul_lo_u32 v5, s5, v3
3130; GFX90A-NEXT:    v_mul_hi_u32 v5, v3, v5
3131; GFX90A-NEXT:    v_add_u32_e32 v3, v3, v5
3132; GFX90A-NEXT:    v_mul_hi_u32 v3, s3, v3
3133; GFX90A-NEXT:    v_mul_lo_u32 v3, v3, s4
3134; GFX90A-NEXT:    v_sub_u32_e32 v3, s3, v3
3135; GFX90A-NEXT:    v_subrev_u32_e32 v5, s4, v3
3136; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s4, v3
3137; GFX90A-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
3138; GFX90A-NEXT:    v_subrev_u32_e32 v5, s4, v3
3139; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s4, v3
3140; GFX90A-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
3141; GFX90A-NEXT:    v_xor_b32_e32 v3, s2, v3
3142; GFX90A-NEXT:    v_subrev_u32_e32 v3, s2, v3
3143; GFX90A-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
3144; GFX90A-NEXT:    s_endpgm
  ; Input under test: one vector srem; everything asserted above is derived
  ; from these three instructions.
3145  %r = srem <4 x i32> %x, %y
3146  store <4 x i32> %r, <4 x i32> addrspace(1)* %out
3147  ret void
3148}
3149
; Kernel under test: lane-wise unsigned division of two <4 x i16> kernel
; arguments, with the <4 x i16> quotient stored to %out.
;
; The opt-run assertions below show the expected IR expansion, repeated once
; per lane (indices 0-3): both operands are extracted and zero-extended to i32,
; converted to float with uitofp, and the quotient is estimated as
; trunc(x * rcp(y)). A single correction step follows: fmad.ftz forms x - q*y,
; and the quotient is bumped by 1 when |x - q*y| >= |y|. The result is masked
; with 65535, truncated to i16 and inserted into the result vector.
;
; The three assembly sections hold the expected llc output for the tahiti,
; gfx900 and gfx90a RUN lines at the top of the file. All assertions are
; autogenerated (see the NOTE lines at the top of the file); regenerate them
; with the update scripts instead of editing them by hand.
3150define amdgpu_kernel void @udiv_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x, <4 x i16> %y) {
3151; CHECK-LABEL: @udiv_v4i16(
3152; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i16> [[X:%.*]], i64 0
3153; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i16> [[Y:%.*]], i64 0
3154; CHECK-NEXT:    [[TMP3:%.*]] = zext i16 [[TMP1]] to i32
3155; CHECK-NEXT:    [[TMP4:%.*]] = zext i16 [[TMP2]] to i32
3156; CHECK-NEXT:    [[TMP5:%.*]] = uitofp i32 [[TMP3]] to float
3157; CHECK-NEXT:    [[TMP6:%.*]] = uitofp i32 [[TMP4]] to float
3158; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP6]])
3159; CHECK-NEXT:    [[TMP8:%.*]] = fmul fast float [[TMP5]], [[TMP7]]
3160; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.trunc.f32(float [[TMP8]])
3161; CHECK-NEXT:    [[TMP10:%.*]] = fneg fast float [[TMP9]]
3162; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP10]], float [[TMP6]], float [[TMP5]])
3163; CHECK-NEXT:    [[TMP12:%.*]] = fptoui float [[TMP9]] to i32
3164; CHECK-NEXT:    [[TMP13:%.*]] = call fast float @llvm.fabs.f32(float [[TMP11]])
3165; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP6]])
3166; CHECK-NEXT:    [[TMP15:%.*]] = fcmp fast oge float [[TMP13]], [[TMP14]]
3167; CHECK-NEXT:    [[TMP16:%.*]] = select i1 [[TMP15]], i32 1, i32 0
3168; CHECK-NEXT:    [[TMP17:%.*]] = add i32 [[TMP12]], [[TMP16]]
3169; CHECK-NEXT:    [[TMP18:%.*]] = and i32 [[TMP17]], 65535
3170; CHECK-NEXT:    [[TMP19:%.*]] = trunc i32 [[TMP18]] to i16
3171; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <4 x i16> undef, i16 [[TMP19]], i64 0
3172; CHECK-NEXT:    [[TMP21:%.*]] = extractelement <4 x i16> [[X]], i64 1
3173; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <4 x i16> [[Y]], i64 1
3174; CHECK-NEXT:    [[TMP23:%.*]] = zext i16 [[TMP21]] to i32
3175; CHECK-NEXT:    [[TMP24:%.*]] = zext i16 [[TMP22]] to i32
3176; CHECK-NEXT:    [[TMP25:%.*]] = uitofp i32 [[TMP23]] to float
3177; CHECK-NEXT:    [[TMP26:%.*]] = uitofp i32 [[TMP24]] to float
3178; CHECK-NEXT:    [[TMP27:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP26]])
3179; CHECK-NEXT:    [[TMP28:%.*]] = fmul fast float [[TMP25]], [[TMP27]]
3180; CHECK-NEXT:    [[TMP29:%.*]] = call fast float @llvm.trunc.f32(float [[TMP28]])
3181; CHECK-NEXT:    [[TMP30:%.*]] = fneg fast float [[TMP29]]
3182; CHECK-NEXT:    [[TMP31:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP30]], float [[TMP26]], float [[TMP25]])
3183; CHECK-NEXT:    [[TMP32:%.*]] = fptoui float [[TMP29]] to i32
3184; CHECK-NEXT:    [[TMP33:%.*]] = call fast float @llvm.fabs.f32(float [[TMP31]])
3185; CHECK-NEXT:    [[TMP34:%.*]] = call fast float @llvm.fabs.f32(float [[TMP26]])
3186; CHECK-NEXT:    [[TMP35:%.*]] = fcmp fast oge float [[TMP33]], [[TMP34]]
3187; CHECK-NEXT:    [[TMP36:%.*]] = select i1 [[TMP35]], i32 1, i32 0
3188; CHECK-NEXT:    [[TMP37:%.*]] = add i32 [[TMP32]], [[TMP36]]
3189; CHECK-NEXT:    [[TMP38:%.*]] = and i32 [[TMP37]], 65535
3190; CHECK-NEXT:    [[TMP39:%.*]] = trunc i32 [[TMP38]] to i16
3191; CHECK-NEXT:    [[TMP40:%.*]] = insertelement <4 x i16> [[TMP20]], i16 [[TMP39]], i64 1
3192; CHECK-NEXT:    [[TMP41:%.*]] = extractelement <4 x i16> [[X]], i64 2
3193; CHECK-NEXT:    [[TMP42:%.*]] = extractelement <4 x i16> [[Y]], i64 2
3194; CHECK-NEXT:    [[TMP43:%.*]] = zext i16 [[TMP41]] to i32
3195; CHECK-NEXT:    [[TMP44:%.*]] = zext i16 [[TMP42]] to i32
3196; CHECK-NEXT:    [[TMP45:%.*]] = uitofp i32 [[TMP43]] to float
3197; CHECK-NEXT:    [[TMP46:%.*]] = uitofp i32 [[TMP44]] to float
3198; CHECK-NEXT:    [[TMP47:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP46]])
3199; CHECK-NEXT:    [[TMP48:%.*]] = fmul fast float [[TMP45]], [[TMP47]]
3200; CHECK-NEXT:    [[TMP49:%.*]] = call fast float @llvm.trunc.f32(float [[TMP48]])
3201; CHECK-NEXT:    [[TMP50:%.*]] = fneg fast float [[TMP49]]
3202; CHECK-NEXT:    [[TMP51:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP50]], float [[TMP46]], float [[TMP45]])
3203; CHECK-NEXT:    [[TMP52:%.*]] = fptoui float [[TMP49]] to i32
3204; CHECK-NEXT:    [[TMP53:%.*]] = call fast float @llvm.fabs.f32(float [[TMP51]])
3205; CHECK-NEXT:    [[TMP54:%.*]] = call fast float @llvm.fabs.f32(float [[TMP46]])
3206; CHECK-NEXT:    [[TMP55:%.*]] = fcmp fast oge float [[TMP53]], [[TMP54]]
3207; CHECK-NEXT:    [[TMP56:%.*]] = select i1 [[TMP55]], i32 1, i32 0
3208; CHECK-NEXT:    [[TMP57:%.*]] = add i32 [[TMP52]], [[TMP56]]
3209; CHECK-NEXT:    [[TMP58:%.*]] = and i32 [[TMP57]], 65535
3210; CHECK-NEXT:    [[TMP59:%.*]] = trunc i32 [[TMP58]] to i16
3211; CHECK-NEXT:    [[TMP60:%.*]] = insertelement <4 x i16> [[TMP40]], i16 [[TMP59]], i64 2
3212; CHECK-NEXT:    [[TMP61:%.*]] = extractelement <4 x i16> [[X]], i64 3
3213; CHECK-NEXT:    [[TMP62:%.*]] = extractelement <4 x i16> [[Y]], i64 3
3214; CHECK-NEXT:    [[TMP63:%.*]] = zext i16 [[TMP61]] to i32
3215; CHECK-NEXT:    [[TMP64:%.*]] = zext i16 [[TMP62]] to i32
3216; CHECK-NEXT:    [[TMP65:%.*]] = uitofp i32 [[TMP63]] to float
3217; CHECK-NEXT:    [[TMP66:%.*]] = uitofp i32 [[TMP64]] to float
3218; CHECK-NEXT:    [[TMP67:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP66]])
3219; CHECK-NEXT:    [[TMP68:%.*]] = fmul fast float [[TMP65]], [[TMP67]]
3220; CHECK-NEXT:    [[TMP69:%.*]] = call fast float @llvm.trunc.f32(float [[TMP68]])
3221; CHECK-NEXT:    [[TMP70:%.*]] = fneg fast float [[TMP69]]
3222; CHECK-NEXT:    [[TMP71:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP70]], float [[TMP66]], float [[TMP65]])
3223; CHECK-NEXT:    [[TMP72:%.*]] = fptoui float [[TMP69]] to i32
3224; CHECK-NEXT:    [[TMP73:%.*]] = call fast float @llvm.fabs.f32(float [[TMP71]])
3225; CHECK-NEXT:    [[TMP74:%.*]] = call fast float @llvm.fabs.f32(float [[TMP66]])
3226; CHECK-NEXT:    [[TMP75:%.*]] = fcmp fast oge float [[TMP73]], [[TMP74]]
3227; CHECK-NEXT:    [[TMP76:%.*]] = select i1 [[TMP75]], i32 1, i32 0
3228; CHECK-NEXT:    [[TMP77:%.*]] = add i32 [[TMP72]], [[TMP76]]
3229; CHECK-NEXT:    [[TMP78:%.*]] = and i32 [[TMP77]], 65535
3230; CHECK-NEXT:    [[TMP79:%.*]] = trunc i32 [[TMP78]] to i16
3231; CHECK-NEXT:    [[TMP80:%.*]] = insertelement <4 x i16> [[TMP60]], i16 [[TMP79]], i64 3
3232; CHECK-NEXT:    store <4 x i16> [[TMP80]], <4 x i16> addrspace(1)* [[OUT:%.*]], align 8
3233; CHECK-NEXT:    ret void
3234;
3235; GFX6-LABEL: udiv_v4i16:
3236; GFX6:       ; %bb.0:
3237; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
3238; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xb
3239; GFX6-NEXT:    s_mov_b32 s8, 0xffff
3240; GFX6-NEXT:    s_mov_b32 s7, 0xf000
3241; GFX6-NEXT:    s_mov_b32 s6, -1
3242; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
3243; GFX6-NEXT:    s_and_b32 s9, s2, s8
3244; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s9
3245; GFX6-NEXT:    s_lshr_b32 s9, s0, 16
3246; GFX6-NEXT:    s_and_b32 s0, s0, s8
3247; GFX6-NEXT:    s_lshr_b32 s2, s2, 16
3248; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s0
3249; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v0
3250; GFX6-NEXT:    v_cvt_f32_u32_e32 v3, s2
3251; GFX6-NEXT:    v_cvt_f32_u32_e32 v4, s9
3252; GFX6-NEXT:    s_and_b32 s2, s3, s8
3253; GFX6-NEXT:    v_mul_f32_e32 v2, v1, v2
3254; GFX6-NEXT:    v_rcp_iflag_f32_e32 v5, v3
3255; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
3256; GFX6-NEXT:    v_mad_f32 v1, -v2, v0, v1
3257; GFX6-NEXT:    v_cvt_u32_f32_e32 v2, v2
3258; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
3259; GFX6-NEXT:    v_mul_f32_e32 v1, v4, v5
3260; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
3261; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, 0, v2, vcc
3262; GFX6-NEXT:    v_mad_f32 v2, -v1, v3, v4
3263; GFX6-NEXT:    v_cvt_f32_u32_e32 v4, s2
3264; GFX6-NEXT:    s_lshr_b32 s0, s1, 16
3265; GFX6-NEXT:    s_lshr_b32 s10, s3, 16
3266; GFX6-NEXT:    s_and_b32 s1, s1, s8
3267; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
3268; GFX6-NEXT:    v_cvt_f32_u32_e32 v5, s1
3269; GFX6-NEXT:    v_rcp_iflag_f32_e32 v6, v4
3270; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v3
3271; GFX6-NEXT:    v_cvt_f32_u32_e32 v3, s10
3272; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, 0, v1, vcc
3273; GFX6-NEXT:    v_mul_f32_e32 v1, v5, v6
3274; GFX6-NEXT:    v_cvt_f32_u32_e32 v6, s0
3275; GFX6-NEXT:    v_rcp_iflag_f32_e32 v7, v3
3276; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
3277; GFX6-NEXT:    v_mad_f32 v5, -v1, v4, v5
3278; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
3279; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v5|, v4
3280; GFX6-NEXT:    v_mul_f32_e32 v4, v6, v7
3281; GFX6-NEXT:    v_trunc_f32_e32 v4, v4
3282; GFX6-NEXT:    v_cvt_u32_f32_e32 v5, v4
3283; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
3284; GFX6-NEXT:    v_mad_f32 v4, -v4, v3, v6
3285; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v4|, v3
3286; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
3287; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
3288; GFX6-NEXT:    v_and_b32_e32 v1, s8, v1
3289; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
3290; GFX6-NEXT:    v_and_b32_e32 v0, s8, v0
3291; GFX6-NEXT:    v_or_b32_e32 v1, v1, v3
3292; GFX6-NEXT:    v_or_b32_e32 v0, v0, v2
3293; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
3294; GFX6-NEXT:    s_endpgm
3295;
3296; GFX9-LABEL: udiv_v4i16:
3297; GFX9:       ; %bb.0:
3298; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
3299; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
3300; GFX9-NEXT:    s_mov_b32 s8, 0xffff
3301; GFX9-NEXT:    v_mov_b32_e32 v2, 0
3302; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3303; GFX9-NEXT:    s_and_b32 s1, s6, s8
3304; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s1
3305; GFX9-NEXT:    s_lshr_b32 s0, s4, 16
3306; GFX9-NEXT:    s_and_b32 s4, s4, s8
3307; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s4
3308; GFX9-NEXT:    s_lshr_b32 s4, s6, 16
3309; GFX9-NEXT:    v_rcp_iflag_f32_e32 v3, v0
3310; GFX9-NEXT:    v_cvt_f32_u32_e32 v4, s4
3311; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, s0
3312; GFX9-NEXT:    s_and_b32 s0, s7, s8
3313; GFX9-NEXT:    v_mul_f32_e32 v3, v1, v3
3314; GFX9-NEXT:    v_rcp_iflag_f32_e32 v6, v4
3315; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
3316; GFX9-NEXT:    v_mad_f32 v1, -v3, v0, v1
3317; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v3
3318; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
3319; GFX9-NEXT:    v_mul_f32_e32 v1, v5, v6
3320; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
3321; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v3, vcc
3322; GFX9-NEXT:    v_mad_f32 v3, -v1, v4, v5
3323; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, s0
3324; GFX9-NEXT:    s_lshr_b32 s6, s7, 16
3325; GFX9-NEXT:    s_and_b32 s0, s5, s8
3326; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
3327; GFX9-NEXT:    v_cvt_f32_u32_e32 v6, s0
3328; GFX9-NEXT:    v_rcp_iflag_f32_e32 v7, v5
3329; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v4
3330; GFX9-NEXT:    v_cvt_f32_u32_e32 v4, s6
3331; GFX9-NEXT:    s_lshr_b32 s1, s5, 16
3332; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
3333; GFX9-NEXT:    v_mul_f32_e32 v1, v6, v7
3334; GFX9-NEXT:    v_cvt_f32_u32_e32 v7, s1
3335; GFX9-NEXT:    v_rcp_iflag_f32_e32 v8, v4
3336; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
3337; GFX9-NEXT:    v_mad_f32 v6, -v1, v5, v6
3338; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
3339; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v6|, v5
3340; GFX9-NEXT:    v_mul_f32_e32 v5, v7, v8
3341; GFX9-NEXT:    v_trunc_f32_e32 v5, v5
3342; GFX9-NEXT:    v_cvt_u32_f32_e32 v6, v5
3343; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
3344; GFX9-NEXT:    v_mad_f32 v5, -v5, v4, v7
3345; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v5|, v4
3346; GFX9-NEXT:    v_mov_b32_e32 v5, 0xffff
3347; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v6, vcc
3348; GFX9-NEXT:    v_and_b32_e32 v1, v5, v1
3349; GFX9-NEXT:    v_and_b32_e32 v0, v5, v0
3350; GFX9-NEXT:    v_lshl_or_b32 v1, v4, 16, v1
3351; GFX9-NEXT:    v_lshl_or_b32 v0, v3, 16, v0
3352; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
3353; GFX9-NEXT:    s_endpgm
3354;
3355; GFX90A-LABEL: udiv_v4i16:
3356; GFX90A:       ; %bb.0:
3357; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
3358; GFX90A-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
3359; GFX90A-NEXT:    s_mov_b32 s8, 0xffff
3360; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
3361; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
3362; GFX90A-NEXT:    s_and_b32 s1, s6, s8
3363; GFX90A-NEXT:    v_cvt_f32_u32_e32 v0, s1
3364; GFX90A-NEXT:    s_lshr_b32 s0, s4, 16
3365; GFX90A-NEXT:    s_and_b32 s4, s4, s8
3366; GFX90A-NEXT:    v_cvt_f32_u32_e32 v1, s4
3367; GFX90A-NEXT:    s_lshr_b32 s4, s6, 16
3368; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v3, v0
3369; GFX90A-NEXT:    v_cvt_f32_u32_e32 v4, s4
3370; GFX90A-NEXT:    v_cvt_f32_u32_e32 v5, s0
3371; GFX90A-NEXT:    s_and_b32 s0, s7, s8
3372; GFX90A-NEXT:    v_mul_f32_e32 v3, v1, v3
3373; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v6, v4
3374; GFX90A-NEXT:    v_trunc_f32_e32 v3, v3
3375; GFX90A-NEXT:    v_mad_f32 v1, -v3, v0, v1
3376; GFX90A-NEXT:    v_cvt_u32_f32_e32 v3, v3
3377; GFX90A-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
3378; GFX90A-NEXT:    v_mul_f32_e32 v1, v5, v6
3379; GFX90A-NEXT:    v_trunc_f32_e32 v1, v1
3380; GFX90A-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v3, vcc
3381; GFX90A-NEXT:    v_mad_f32 v3, -v1, v4, v5
3382; GFX90A-NEXT:    v_cvt_f32_u32_e32 v5, s0
3383; GFX90A-NEXT:    s_lshr_b32 s6, s7, 16
3384; GFX90A-NEXT:    s_and_b32 s0, s5, s8
3385; GFX90A-NEXT:    v_cvt_u32_f32_e32 v1, v1
3386; GFX90A-NEXT:    v_cvt_f32_u32_e32 v6, s0
3387; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v7, v5
3388; GFX90A-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v4
3389; GFX90A-NEXT:    v_cvt_f32_u32_e32 v4, s6
3390; GFX90A-NEXT:    s_lshr_b32 s1, s5, 16
3391; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
3392; GFX90A-NEXT:    v_mul_f32_e32 v1, v6, v7
3393; GFX90A-NEXT:    v_cvt_f32_u32_e32 v7, s1
3394; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v8, v4
3395; GFX90A-NEXT:    v_trunc_f32_e32 v1, v1
3396; GFX90A-NEXT:    v_mad_f32 v6, -v1, v5, v6
3397; GFX90A-NEXT:    v_cvt_u32_f32_e32 v1, v1
3398; GFX90A-NEXT:    v_cmp_ge_f32_e64 vcc, |v6|, v5
3399; GFX90A-NEXT:    v_mul_f32_e32 v5, v7, v8
3400; GFX90A-NEXT:    v_trunc_f32_e32 v5, v5
3401; GFX90A-NEXT:    v_cvt_u32_f32_e32 v6, v5
3402; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
3403; GFX90A-NEXT:    v_mad_f32 v5, -v5, v4, v7
3404; GFX90A-NEXT:    v_cmp_ge_f32_e64 vcc, |v5|, v4
3405; GFX90A-NEXT:    v_mov_b32_e32 v5, 0xffff
3406; GFX90A-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v6, vcc
3407; GFX90A-NEXT:    v_and_b32_e32 v1, v5, v1
3408; GFX90A-NEXT:    v_and_b32_e32 v0, v5, v0
3409; GFX90A-NEXT:    v_lshl_or_b32 v1, v4, 16, v1
3410; GFX90A-NEXT:    v_lshl_or_b32 v0, v3, 16, v0
3411; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
3412; GFX90A-NEXT:    s_endpgm
3413  %r = udiv <4 x i16> %x, %y
3414  store <4 x i16> %r, <4 x i16> addrspace(1)* %out
3415  ret void
3416}
3417
; Kernel under test: lane-wise unsigned remainder of two <4 x i16> kernel
; arguments, with the <4 x i16> result stored to %out.
;
; The opt-run assertions below show the expected IR expansion, repeated once
; per lane (indices 0-3): both operands are extracted and zero-extended to i32,
; converted to float with uitofp, and a quotient q is estimated as
; trunc(x * rcp(y)), then bumped by 1 when |x - q*y| >= |y| (the x - q*y term
; is formed with fmad.ftz). The remainder is then recovered in integer
; arithmetic as x - q*y (mul followed by sub on the zero-extended i32 values),
; masked with 65535, truncated to i16 and inserted into the result vector.
;
; The three assembly sections hold the expected llc output for the tahiti,
; gfx900 and gfx90a RUN lines at the top of the file. All assertions are
; autogenerated (see the NOTE lines at the top of the file); regenerate them
; with the update scripts instead of editing them by hand.
3418define amdgpu_kernel void @urem_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x, <4 x i16> %y) {
3419; CHECK-LABEL: @urem_v4i16(
3420; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i16> [[X:%.*]], i64 0
3421; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i16> [[Y:%.*]], i64 0
3422; CHECK-NEXT:    [[TMP3:%.*]] = zext i16 [[TMP1]] to i32
3423; CHECK-NEXT:    [[TMP4:%.*]] = zext i16 [[TMP2]] to i32
3424; CHECK-NEXT:    [[TMP5:%.*]] = uitofp i32 [[TMP3]] to float
3425; CHECK-NEXT:    [[TMP6:%.*]] = uitofp i32 [[TMP4]] to float
3426; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP6]])
3427; CHECK-NEXT:    [[TMP8:%.*]] = fmul fast float [[TMP5]], [[TMP7]]
3428; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.trunc.f32(float [[TMP8]])
3429; CHECK-NEXT:    [[TMP10:%.*]] = fneg fast float [[TMP9]]
3430; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP10]], float [[TMP6]], float [[TMP5]])
3431; CHECK-NEXT:    [[TMP12:%.*]] = fptoui float [[TMP9]] to i32
3432; CHECK-NEXT:    [[TMP13:%.*]] = call fast float @llvm.fabs.f32(float [[TMP11]])
3433; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP6]])
3434; CHECK-NEXT:    [[TMP15:%.*]] = fcmp fast oge float [[TMP13]], [[TMP14]]
3435; CHECK-NEXT:    [[TMP16:%.*]] = select i1 [[TMP15]], i32 1, i32 0
3436; CHECK-NEXT:    [[TMP17:%.*]] = add i32 [[TMP12]], [[TMP16]]
3437; CHECK-NEXT:    [[TMP18:%.*]] = mul i32 [[TMP17]], [[TMP4]]
3438; CHECK-NEXT:    [[TMP19:%.*]] = sub i32 [[TMP3]], [[TMP18]]
3439; CHECK-NEXT:    [[TMP20:%.*]] = and i32 [[TMP19]], 65535
3440; CHECK-NEXT:    [[TMP21:%.*]] = trunc i32 [[TMP20]] to i16
3441; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <4 x i16> undef, i16 [[TMP21]], i64 0
3442; CHECK-NEXT:    [[TMP23:%.*]] = extractelement <4 x i16> [[X]], i64 1
3443; CHECK-NEXT:    [[TMP24:%.*]] = extractelement <4 x i16> [[Y]], i64 1
3444; CHECK-NEXT:    [[TMP25:%.*]] = zext i16 [[TMP23]] to i32
3445; CHECK-NEXT:    [[TMP26:%.*]] = zext i16 [[TMP24]] to i32
3446; CHECK-NEXT:    [[TMP27:%.*]] = uitofp i32 [[TMP25]] to float
3447; CHECK-NEXT:    [[TMP28:%.*]] = uitofp i32 [[TMP26]] to float
3448; CHECK-NEXT:    [[TMP29:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP28]])
3449; CHECK-NEXT:    [[TMP30:%.*]] = fmul fast float [[TMP27]], [[TMP29]]
3450; CHECK-NEXT:    [[TMP31:%.*]] = call fast float @llvm.trunc.f32(float [[TMP30]])
3451; CHECK-NEXT:    [[TMP32:%.*]] = fneg fast float [[TMP31]]
3452; CHECK-NEXT:    [[TMP33:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP32]], float [[TMP28]], float [[TMP27]])
3453; CHECK-NEXT:    [[TMP34:%.*]] = fptoui float [[TMP31]] to i32
3454; CHECK-NEXT:    [[TMP35:%.*]] = call fast float @llvm.fabs.f32(float [[TMP33]])
3455; CHECK-NEXT:    [[TMP36:%.*]] = call fast float @llvm.fabs.f32(float [[TMP28]])
3456; CHECK-NEXT:    [[TMP37:%.*]] = fcmp fast oge float [[TMP35]], [[TMP36]]
3457; CHECK-NEXT:    [[TMP38:%.*]] = select i1 [[TMP37]], i32 1, i32 0
3458; CHECK-NEXT:    [[TMP39:%.*]] = add i32 [[TMP34]], [[TMP38]]
3459; CHECK-NEXT:    [[TMP40:%.*]] = mul i32 [[TMP39]], [[TMP26]]
3460; CHECK-NEXT:    [[TMP41:%.*]] = sub i32 [[TMP25]], [[TMP40]]
3461; CHECK-NEXT:    [[TMP42:%.*]] = and i32 [[TMP41]], 65535
3462; CHECK-NEXT:    [[TMP43:%.*]] = trunc i32 [[TMP42]] to i16
3463; CHECK-NEXT:    [[TMP44:%.*]] = insertelement <4 x i16> [[TMP22]], i16 [[TMP43]], i64 1
3464; CHECK-NEXT:    [[TMP45:%.*]] = extractelement <4 x i16> [[X]], i64 2
3465; CHECK-NEXT:    [[TMP46:%.*]] = extractelement <4 x i16> [[Y]], i64 2
3466; CHECK-NEXT:    [[TMP47:%.*]] = zext i16 [[TMP45]] to i32
3467; CHECK-NEXT:    [[TMP48:%.*]] = zext i16 [[TMP46]] to i32
3468; CHECK-NEXT:    [[TMP49:%.*]] = uitofp i32 [[TMP47]] to float
3469; CHECK-NEXT:    [[TMP50:%.*]] = uitofp i32 [[TMP48]] to float
3470; CHECK-NEXT:    [[TMP51:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP50]])
3471; CHECK-NEXT:    [[TMP52:%.*]] = fmul fast float [[TMP49]], [[TMP51]]
3472; CHECK-NEXT:    [[TMP53:%.*]] = call fast float @llvm.trunc.f32(float [[TMP52]])
3473; CHECK-NEXT:    [[TMP54:%.*]] = fneg fast float [[TMP53]]
3474; CHECK-NEXT:    [[TMP55:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP54]], float [[TMP50]], float [[TMP49]])
3475; CHECK-NEXT:    [[TMP56:%.*]] = fptoui float [[TMP53]] to i32
3476; CHECK-NEXT:    [[TMP57:%.*]] = call fast float @llvm.fabs.f32(float [[TMP55]])
3477; CHECK-NEXT:    [[TMP58:%.*]] = call fast float @llvm.fabs.f32(float [[TMP50]])
3478; CHECK-NEXT:    [[TMP59:%.*]] = fcmp fast oge float [[TMP57]], [[TMP58]]
3479; CHECK-NEXT:    [[TMP60:%.*]] = select i1 [[TMP59]], i32 1, i32 0
3480; CHECK-NEXT:    [[TMP61:%.*]] = add i32 [[TMP56]], [[TMP60]]
3481; CHECK-NEXT:    [[TMP62:%.*]] = mul i32 [[TMP61]], [[TMP48]]
3482; CHECK-NEXT:    [[TMP63:%.*]] = sub i32 [[TMP47]], [[TMP62]]
3483; CHECK-NEXT:    [[TMP64:%.*]] = and i32 [[TMP63]], 65535
3484; CHECK-NEXT:    [[TMP65:%.*]] = trunc i32 [[TMP64]] to i16
3485; CHECK-NEXT:    [[TMP66:%.*]] = insertelement <4 x i16> [[TMP44]], i16 [[TMP65]], i64 2
3486; CHECK-NEXT:    [[TMP67:%.*]] = extractelement <4 x i16> [[X]], i64 3
3487; CHECK-NEXT:    [[TMP68:%.*]] = extractelement <4 x i16> [[Y]], i64 3
3488; CHECK-NEXT:    [[TMP69:%.*]] = zext i16 [[TMP67]] to i32
3489; CHECK-NEXT:    [[TMP70:%.*]] = zext i16 [[TMP68]] to i32
3490; CHECK-NEXT:    [[TMP71:%.*]] = uitofp i32 [[TMP69]] to float
3491; CHECK-NEXT:    [[TMP72:%.*]] = uitofp i32 [[TMP70]] to float
3492; CHECK-NEXT:    [[TMP73:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP72]])
3493; CHECK-NEXT:    [[TMP74:%.*]] = fmul fast float [[TMP71]], [[TMP73]]
3494; CHECK-NEXT:    [[TMP75:%.*]] = call fast float @llvm.trunc.f32(float [[TMP74]])
3495; CHECK-NEXT:    [[TMP76:%.*]] = fneg fast float [[TMP75]]
3496; CHECK-NEXT:    [[TMP77:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP76]], float [[TMP72]], float [[TMP71]])
3497; CHECK-NEXT:    [[TMP78:%.*]] = fptoui float [[TMP75]] to i32
3498; CHECK-NEXT:    [[TMP79:%.*]] = call fast float @llvm.fabs.f32(float [[TMP77]])
3499; CHECK-NEXT:    [[TMP80:%.*]] = call fast float @llvm.fabs.f32(float [[TMP72]])
3500; CHECK-NEXT:    [[TMP81:%.*]] = fcmp fast oge float [[TMP79]], [[TMP80]]
3501; CHECK-NEXT:    [[TMP82:%.*]] = select i1 [[TMP81]], i32 1, i32 0
3502; CHECK-NEXT:    [[TMP83:%.*]] = add i32 [[TMP78]], [[TMP82]]
3503; CHECK-NEXT:    [[TMP84:%.*]] = mul i32 [[TMP83]], [[TMP70]]
3504; CHECK-NEXT:    [[TMP85:%.*]] = sub i32 [[TMP69]], [[TMP84]]
3505; CHECK-NEXT:    [[TMP86:%.*]] = and i32 [[TMP85]], 65535
3506; CHECK-NEXT:    [[TMP87:%.*]] = trunc i32 [[TMP86]] to i16
3507; CHECK-NEXT:    [[TMP88:%.*]] = insertelement <4 x i16> [[TMP66]], i16 [[TMP87]], i64 3
3508; CHECK-NEXT:    store <4 x i16> [[TMP88]], <4 x i16> addrspace(1)* [[OUT:%.*]], align 8
3509; CHECK-NEXT:    ret void
3510;
3511; GFX6-LABEL: urem_v4i16:
3512; GFX6:       ; %bb.0:
3513; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
3514; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xb
3515; GFX6-NEXT:    s_mov_b32 s8, 0xffff
3516; GFX6-NEXT:    s_mov_b32 s7, 0xf000
3517; GFX6-NEXT:    s_mov_b32 s6, -1
3518; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
3519; GFX6-NEXT:    s_and_b32 s9, s2, s8
3520; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s9
3521; GFX6-NEXT:    s_and_b32 s10, s0, s8
3522; GFX6-NEXT:    s_lshr_b32 s11, s2, 16
3523; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s10
3524; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v0
3525; GFX6-NEXT:    v_cvt_f32_u32_e32 v3, s11
3526; GFX6-NEXT:    s_lshr_b32 s9, s0, 16
3527; GFX6-NEXT:    v_cvt_f32_u32_e32 v4, s9
3528; GFX6-NEXT:    v_mul_f32_e32 v2, v1, v2
3529; GFX6-NEXT:    v_rcp_iflag_f32_e32 v5, v3
3530; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
3531; GFX6-NEXT:    v_mad_f32 v1, -v2, v0, v1
3532; GFX6-NEXT:    v_cvt_u32_f32_e32 v2, v2
3533; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
3534; GFX6-NEXT:    v_mul_f32_e32 v1, v4, v5
3535; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
3536; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, 0, v2, vcc
3537; GFX6-NEXT:    v_cvt_u32_f32_e32 v2, v1
3538; GFX6-NEXT:    v_mad_f32 v1, -v1, v3, v4
3539; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s2
3540; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v3
3541; GFX6-NEXT:    s_and_b32 s2, s3, s8
3542; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v2, vcc
3543; GFX6-NEXT:    v_cvt_f32_u32_e32 v2, s2
3544; GFX6-NEXT:    s_and_b32 s2, s1, s8
3545; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s11
3546; GFX6-NEXT:    v_cvt_f32_u32_e32 v3, s2
3547; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v2
3548; GFX6-NEXT:    s_lshr_b32 s12, s3, 16
3549; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, s9, v1
3550; GFX6-NEXT:    v_mul_f32_e32 v1, v3, v4
3551; GFX6-NEXT:    v_cvt_f32_u32_e32 v4, s12
3552; GFX6-NEXT:    s_lshr_b32 s10, s1, 16
3553; GFX6-NEXT:    v_cvt_f32_u32_e32 v6, s10
3554; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
3555; GFX6-NEXT:    v_rcp_iflag_f32_e32 v7, v4
3556; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
3557; GFX6-NEXT:    v_mad_f32 v3, -v1, v2, v3
3558; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
3559; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v2
3560; GFX6-NEXT:    v_mul_f32_e32 v2, v6, v7
3561; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
3562; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v2
3563; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
3564; GFX6-NEXT:    v_mad_f32 v2, -v2, v4, v6
3565; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v4
3566; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, 0, v3, vcc
3567; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s3
3568; GFX6-NEXT:    v_mul_lo_u32 v2, v2, s12
3569; GFX6-NEXT:    v_and_b32_e32 v0, s8, v0
3570; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s1, v1
3571; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s10, v2
3572; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
3573; GFX6-NEXT:    v_and_b32_e32 v1, s8, v1
3574; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
3575; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
3576; GFX6-NEXT:    v_or_b32_e32 v0, v0, v2
3577; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
3578; GFX6-NEXT:    s_endpgm
3579;
3580; GFX9-LABEL: urem_v4i16:
3581; GFX9:       ; %bb.0:
3582; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
3583; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
3584; GFX9-NEXT:    s_mov_b32 s8, 0xffff
3585; GFX9-NEXT:    v_mov_b32_e32 v2, 0
3586; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3587; GFX9-NEXT:    s_and_b32 s1, s6, s8
3588; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s1
3589; GFX9-NEXT:    s_and_b32 s9, s4, s8
3590; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s9
3591; GFX9-NEXT:    s_lshr_b32 s9, s6, 16
3592; GFX9-NEXT:    v_rcp_iflag_f32_e32 v3, v0
3593; GFX9-NEXT:    v_cvt_f32_u32_e32 v4, s9
3594; GFX9-NEXT:    s_lshr_b32 s0, s4, 16
3595; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, s0
3596; GFX9-NEXT:    v_mul_f32_e32 v3, v1, v3
3597; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
3598; GFX9-NEXT:    v_mad_f32 v1, -v3, v0, v1
3599; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v3
3600; GFX9-NEXT:    v_rcp_iflag_f32_e32 v6, v4
3601; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
3602; GFX9-NEXT:    s_lshr_b32 s10, s7, 16
3603; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v3, vcc
3604; GFX9-NEXT:    v_mul_f32_e32 v1, v5, v6
3605; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s6
3606; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
3607; GFX9-NEXT:    s_and_b32 s6, s7, s8
3608; GFX9-NEXT:    v_mad_f32 v3, -v1, v4, v5
3609; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, s6
3610; GFX9-NEXT:    s_and_b32 s6, s5, s8
3611; GFX9-NEXT:    v_cvt_f32_u32_e32 v6, s6
3612; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v4
3613; GFX9-NEXT:    v_rcp_iflag_f32_e32 v7, v5
3614; GFX9-NEXT:    v_cvt_f32_u32_e32 v4, s10
3615; GFX9-NEXT:    s_lshr_b32 s1, s5, 16
3616; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
3617; GFX9-NEXT:    v_mul_f32_e32 v3, v6, v7
3618; GFX9-NEXT:    v_cvt_f32_u32_e32 v7, s1
3619; GFX9-NEXT:    v_rcp_iflag_f32_e32 v8, v4
3620; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
3621; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
3622; GFX9-NEXT:    v_mad_f32 v6, -v3, v5, v6
3623; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v3
3624; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v6|, v5
3625; GFX9-NEXT:    v_mul_f32_e32 v5, v7, v8
3626; GFX9-NEXT:    v_trunc_f32_e32 v5, v5
3627; GFX9-NEXT:    v_cvt_u32_f32_e32 v6, v5
3628; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
3629; GFX9-NEXT:    v_mad_f32 v5, -v5, v4, v7
3630; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v5|, v4
3631; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v6, vcc
3632; GFX9-NEXT:    v_mul_lo_u32 v1, v1, s9
3633; GFX9-NEXT:    v_mul_lo_u32 v3, v3, s7
3634; GFX9-NEXT:    v_mul_lo_u32 v4, v4, s10
3635; GFX9-NEXT:    v_sub_u32_e32 v0, s4, v0
3636; GFX9-NEXT:    v_sub_u32_e32 v5, s0, v1
3637; GFX9-NEXT:    v_sub_u32_e32 v1, s5, v3
3638; GFX9-NEXT:    v_sub_u32_e32 v3, s1, v4
3639; GFX9-NEXT:    v_mov_b32_e32 v4, 0xffff
3640; GFX9-NEXT:    v_and_b32_e32 v1, v4, v1
3641; GFX9-NEXT:    v_and_b32_e32 v0, v4, v0
3642; GFX9-NEXT:    v_lshl_or_b32 v1, v3, 16, v1
3643; GFX9-NEXT:    v_lshl_or_b32 v0, v5, 16, v0
3644; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
3645; GFX9-NEXT:    s_endpgm
3646;
3647; GFX90A-LABEL: urem_v4i16:
3648; GFX90A:       ; %bb.0:
3649; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
3650; GFX90A-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
3651; GFX90A-NEXT:    s_mov_b32 s8, 0xffff
3652; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
3653; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
3654; GFX90A-NEXT:    s_and_b32 s1, s6, s8
3655; GFX90A-NEXT:    v_cvt_f32_u32_e32 v0, s1
3656; GFX90A-NEXT:    s_and_b32 s9, s4, s8
3657; GFX90A-NEXT:    v_cvt_f32_u32_e32 v1, s9
3658; GFX90A-NEXT:    s_lshr_b32 s9, s6, 16
3659; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v3, v0
3660; GFX90A-NEXT:    v_cvt_f32_u32_e32 v4, s9
3661; GFX90A-NEXT:    s_lshr_b32 s0, s4, 16
3662; GFX90A-NEXT:    v_cvt_f32_u32_e32 v5, s0
3663; GFX90A-NEXT:    v_mul_f32_e32 v3, v1, v3
3664; GFX90A-NEXT:    v_trunc_f32_e32 v3, v3
3665; GFX90A-NEXT:    v_mad_f32 v1, -v3, v0, v1
3666; GFX90A-NEXT:    v_cvt_u32_f32_e32 v3, v3
3667; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v6, v4
3668; GFX90A-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
3669; GFX90A-NEXT:    s_lshr_b32 s10, s7, 16
3670; GFX90A-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v3, vcc
3671; GFX90A-NEXT:    v_mul_lo_u32 v0, v0, s6
3672; GFX90A-NEXT:    v_mul_f32_e32 v1, v5, v6
3673; GFX90A-NEXT:    v_sub_u32_e32 v0, s4, v0
3674; GFX90A-NEXT:    v_trunc_f32_e32 v1, v1
3675; GFX90A-NEXT:    s_and_b32 s4, s7, s8
3676; GFX90A-NEXT:    v_mad_f32 v3, -v1, v4, v5
3677; GFX90A-NEXT:    v_cvt_f32_u32_e32 v5, s4
3678; GFX90A-NEXT:    v_cvt_u32_f32_e32 v1, v1
3679; GFX90A-NEXT:    s_and_b32 s4, s5, s8
3680; GFX90A-NEXT:    v_cvt_f32_u32_e32 v6, s4
3681; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v7, v5
3682; GFX90A-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v4
3683; GFX90A-NEXT:    v_cvt_f32_u32_e32 v4, s10
3684; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
3685; GFX90A-NEXT:    s_lshr_b32 s1, s5, 16
3686; GFX90A-NEXT:    v_mul_lo_u32 v1, v1, s9
3687; GFX90A-NEXT:    v_sub_u32_e32 v3, s0, v1
3688; GFX90A-NEXT:    v_mul_f32_e32 v1, v6, v7
3689; GFX90A-NEXT:    v_cvt_f32_u32_e32 v7, s1
3690; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v8, v4
3691; GFX90A-NEXT:    v_trunc_f32_e32 v1, v1
3692; GFX90A-NEXT:    v_mad_f32 v6, -v1, v5, v6
3693; GFX90A-NEXT:    v_cvt_u32_f32_e32 v1, v1
3694; GFX90A-NEXT:    v_cmp_ge_f32_e64 vcc, |v6|, v5
3695; GFX90A-NEXT:    v_mul_f32_e32 v5, v7, v8
3696; GFX90A-NEXT:    v_trunc_f32_e32 v5, v5
3697; GFX90A-NEXT:    v_cvt_u32_f32_e32 v6, v5
3698; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
3699; GFX90A-NEXT:    v_mad_f32 v5, -v5, v4, v7
3700; GFX90A-NEXT:    v_cmp_ge_f32_e64 vcc, |v5|, v4
3701; GFX90A-NEXT:    v_mul_lo_u32 v1, v1, s7
3702; GFX90A-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v6, vcc
3703; GFX90A-NEXT:    v_sub_u32_e32 v1, s5, v1
3704; GFX90A-NEXT:    v_mul_lo_u32 v4, v4, s10
3705; GFX90A-NEXT:    v_mov_b32_e32 v5, 0xffff
3706; GFX90A-NEXT:    v_sub_u32_e32 v4, s1, v4
3707; GFX90A-NEXT:    v_and_b32_e32 v1, v5, v1
3708; GFX90A-NEXT:    v_and_b32_e32 v0, v5, v0
3709; GFX90A-NEXT:    v_lshl_or_b32 v1, v4, 16, v1
3710; GFX90A-NEXT:    v_lshl_or_b32 v0, v3, 16, v0
3711; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
3712; GFX90A-NEXT:    s_endpgm
3713  %r = urem <4 x i16> %x, %y
3714  store <4 x i16> %r, <4 x i16> addrspace(1)* %out
3715  ret void
3716}
3717
; sdiv_v4i16: signed division of two <4 x i16> kernel arguments; the quotient
; vector is stored to %out.
;
; The assertions below are autogenerated (see the NOTE lines at the top of this
; file), so regenerate them with the update scripts rather than editing by hand.
;
; The IR assertions (opt run with -amdgpu-codegenprepare) show each of the four
; lanes being expanded separately: both operands are sign-extended to i32 and
; converted to float, a quotient estimate is formed from the rcp intrinsic and
; truncated, and that estimate is adjusted by +1 or -1 (taken from the sign of
; x xor y) when |x - q*y| >= |y|. The lane result is then narrowed back to i16
; and inserted into the result vector.
;
; The GFX6, GFX9 and GFX90A assertion groups pin the llc output of the RUN lines
; for tahiti, gfx900 and gfx90a respectively.
3718define amdgpu_kernel void @sdiv_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x, <4 x i16> %y) {
3719; CHECK-LABEL: @sdiv_v4i16(
3720; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i16> [[X:%.*]], i64 0
3721; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i16> [[Y:%.*]], i64 0
3722; CHECK-NEXT:    [[TMP3:%.*]] = sext i16 [[TMP1]] to i32
3723; CHECK-NEXT:    [[TMP4:%.*]] = sext i16 [[TMP2]] to i32
3724; CHECK-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]]
3725; CHECK-NEXT:    [[TMP6:%.*]] = ashr i32 [[TMP5]], 30
3726; CHECK-NEXT:    [[TMP7:%.*]] = or i32 [[TMP6]], 1
3727; CHECK-NEXT:    [[TMP8:%.*]] = sitofp i32 [[TMP3]] to float
3728; CHECK-NEXT:    [[TMP9:%.*]] = sitofp i32 [[TMP4]] to float
3729; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]])
3730; CHECK-NEXT:    [[TMP11:%.*]] = fmul fast float [[TMP8]], [[TMP10]]
3731; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.trunc.f32(float [[TMP11]])
3732; CHECK-NEXT:    [[TMP13:%.*]] = fneg fast float [[TMP12]]
3733; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP13]], float [[TMP9]], float [[TMP8]])
3734; CHECK-NEXT:    [[TMP15:%.*]] = fptosi float [[TMP12]] to i32
3735; CHECK-NEXT:    [[TMP16:%.*]] = call fast float @llvm.fabs.f32(float [[TMP14]])
3736; CHECK-NEXT:    [[TMP17:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
3737; CHECK-NEXT:    [[TMP18:%.*]] = fcmp fast oge float [[TMP16]], [[TMP17]]
3738; CHECK-NEXT:    [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP7]], i32 0
3739; CHECK-NEXT:    [[TMP20:%.*]] = add i32 [[TMP15]], [[TMP19]]
3740; CHECK-NEXT:    [[TMP21:%.*]] = shl i32 [[TMP20]], 16
3741; CHECK-NEXT:    [[TMP22:%.*]] = ashr i32 [[TMP21]], 16
3742; CHECK-NEXT:    [[TMP23:%.*]] = trunc i32 [[TMP22]] to i16
3743; CHECK-NEXT:    [[TMP24:%.*]] = insertelement <4 x i16> undef, i16 [[TMP23]], i64 0
3744; CHECK-NEXT:    [[TMP25:%.*]] = extractelement <4 x i16> [[X]], i64 1
3745; CHECK-NEXT:    [[TMP26:%.*]] = extractelement <4 x i16> [[Y]], i64 1
3746; CHECK-NEXT:    [[TMP27:%.*]] = sext i16 [[TMP25]] to i32
3747; CHECK-NEXT:    [[TMP28:%.*]] = sext i16 [[TMP26]] to i32
3748; CHECK-NEXT:    [[TMP29:%.*]] = xor i32 [[TMP27]], [[TMP28]]
3749; CHECK-NEXT:    [[TMP30:%.*]] = ashr i32 [[TMP29]], 30
3750; CHECK-NEXT:    [[TMP31:%.*]] = or i32 [[TMP30]], 1
3751; CHECK-NEXT:    [[TMP32:%.*]] = sitofp i32 [[TMP27]] to float
3752; CHECK-NEXT:    [[TMP33:%.*]] = sitofp i32 [[TMP28]] to float
3753; CHECK-NEXT:    [[TMP34:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP33]])
3754; CHECK-NEXT:    [[TMP35:%.*]] = fmul fast float [[TMP32]], [[TMP34]]
3755; CHECK-NEXT:    [[TMP36:%.*]] = call fast float @llvm.trunc.f32(float [[TMP35]])
3756; CHECK-NEXT:    [[TMP37:%.*]] = fneg fast float [[TMP36]]
3757; CHECK-NEXT:    [[TMP38:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP37]], float [[TMP33]], float [[TMP32]])
3758; CHECK-NEXT:    [[TMP39:%.*]] = fptosi float [[TMP36]] to i32
3759; CHECK-NEXT:    [[TMP40:%.*]] = call fast float @llvm.fabs.f32(float [[TMP38]])
3760; CHECK-NEXT:    [[TMP41:%.*]] = call fast float @llvm.fabs.f32(float [[TMP33]])
3761; CHECK-NEXT:    [[TMP42:%.*]] = fcmp fast oge float [[TMP40]], [[TMP41]]
3762; CHECK-NEXT:    [[TMP43:%.*]] = select i1 [[TMP42]], i32 [[TMP31]], i32 0
3763; CHECK-NEXT:    [[TMP44:%.*]] = add i32 [[TMP39]], [[TMP43]]
3764; CHECK-NEXT:    [[TMP45:%.*]] = shl i32 [[TMP44]], 16
3765; CHECK-NEXT:    [[TMP46:%.*]] = ashr i32 [[TMP45]], 16
3766; CHECK-NEXT:    [[TMP47:%.*]] = trunc i32 [[TMP46]] to i16
3767; CHECK-NEXT:    [[TMP48:%.*]] = insertelement <4 x i16> [[TMP24]], i16 [[TMP47]], i64 1
3768; CHECK-NEXT:    [[TMP49:%.*]] = extractelement <4 x i16> [[X]], i64 2
3769; CHECK-NEXT:    [[TMP50:%.*]] = extractelement <4 x i16> [[Y]], i64 2
3770; CHECK-NEXT:    [[TMP51:%.*]] = sext i16 [[TMP49]] to i32
3771; CHECK-NEXT:    [[TMP52:%.*]] = sext i16 [[TMP50]] to i32
3772; CHECK-NEXT:    [[TMP53:%.*]] = xor i32 [[TMP51]], [[TMP52]]
3773; CHECK-NEXT:    [[TMP54:%.*]] = ashr i32 [[TMP53]], 30
3774; CHECK-NEXT:    [[TMP55:%.*]] = or i32 [[TMP54]], 1
3775; CHECK-NEXT:    [[TMP56:%.*]] = sitofp i32 [[TMP51]] to float
3776; CHECK-NEXT:    [[TMP57:%.*]] = sitofp i32 [[TMP52]] to float
3777; CHECK-NEXT:    [[TMP58:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP57]])
3778; CHECK-NEXT:    [[TMP59:%.*]] = fmul fast float [[TMP56]], [[TMP58]]
3779; CHECK-NEXT:    [[TMP60:%.*]] = call fast float @llvm.trunc.f32(float [[TMP59]])
3780; CHECK-NEXT:    [[TMP61:%.*]] = fneg fast float [[TMP60]]
3781; CHECK-NEXT:    [[TMP62:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP61]], float [[TMP57]], float [[TMP56]])
3782; CHECK-NEXT:    [[TMP63:%.*]] = fptosi float [[TMP60]] to i32
3783; CHECK-NEXT:    [[TMP64:%.*]] = call fast float @llvm.fabs.f32(float [[TMP62]])
3784; CHECK-NEXT:    [[TMP65:%.*]] = call fast float @llvm.fabs.f32(float [[TMP57]])
3785; CHECK-NEXT:    [[TMP66:%.*]] = fcmp fast oge float [[TMP64]], [[TMP65]]
3786; CHECK-NEXT:    [[TMP67:%.*]] = select i1 [[TMP66]], i32 [[TMP55]], i32 0
3787; CHECK-NEXT:    [[TMP68:%.*]] = add i32 [[TMP63]], [[TMP67]]
3788; CHECK-NEXT:    [[TMP69:%.*]] = shl i32 [[TMP68]], 16
3789; CHECK-NEXT:    [[TMP70:%.*]] = ashr i32 [[TMP69]], 16
3790; CHECK-NEXT:    [[TMP71:%.*]] = trunc i32 [[TMP70]] to i16
3791; CHECK-NEXT:    [[TMP72:%.*]] = insertelement <4 x i16> [[TMP48]], i16 [[TMP71]], i64 2
3792; CHECK-NEXT:    [[TMP73:%.*]] = extractelement <4 x i16> [[X]], i64 3
3793; CHECK-NEXT:    [[TMP74:%.*]] = extractelement <4 x i16> [[Y]], i64 3
3794; CHECK-NEXT:    [[TMP75:%.*]] = sext i16 [[TMP73]] to i32
3795; CHECK-NEXT:    [[TMP76:%.*]] = sext i16 [[TMP74]] to i32
3796; CHECK-NEXT:    [[TMP77:%.*]] = xor i32 [[TMP75]], [[TMP76]]
3797; CHECK-NEXT:    [[TMP78:%.*]] = ashr i32 [[TMP77]], 30
3798; CHECK-NEXT:    [[TMP79:%.*]] = or i32 [[TMP78]], 1
3799; CHECK-NEXT:    [[TMP80:%.*]] = sitofp i32 [[TMP75]] to float
3800; CHECK-NEXT:    [[TMP81:%.*]] = sitofp i32 [[TMP76]] to float
3801; CHECK-NEXT:    [[TMP82:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP81]])
3802; CHECK-NEXT:    [[TMP83:%.*]] = fmul fast float [[TMP80]], [[TMP82]]
3803; CHECK-NEXT:    [[TMP84:%.*]] = call fast float @llvm.trunc.f32(float [[TMP83]])
3804; CHECK-NEXT:    [[TMP85:%.*]] = fneg fast float [[TMP84]]
3805; CHECK-NEXT:    [[TMP86:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP85]], float [[TMP81]], float [[TMP80]])
3806; CHECK-NEXT:    [[TMP87:%.*]] = fptosi float [[TMP84]] to i32
3807; CHECK-NEXT:    [[TMP88:%.*]] = call fast float @llvm.fabs.f32(float [[TMP86]])
3808; CHECK-NEXT:    [[TMP89:%.*]] = call fast float @llvm.fabs.f32(float [[TMP81]])
3809; CHECK-NEXT:    [[TMP90:%.*]] = fcmp fast oge float [[TMP88]], [[TMP89]]
3810; CHECK-NEXT:    [[TMP91:%.*]] = select i1 [[TMP90]], i32 [[TMP79]], i32 0
3811; CHECK-NEXT:    [[TMP92:%.*]] = add i32 [[TMP87]], [[TMP91]]
3812; CHECK-NEXT:    [[TMP93:%.*]] = shl i32 [[TMP92]], 16
3813; CHECK-NEXT:    [[TMP94:%.*]] = ashr i32 [[TMP93]], 16
3814; CHECK-NEXT:    [[TMP95:%.*]] = trunc i32 [[TMP94]] to i16
3815; CHECK-NEXT:    [[TMP96:%.*]] = insertelement <4 x i16> [[TMP72]], i16 [[TMP95]], i64 3
3816; CHECK-NEXT:    store <4 x i16> [[TMP96]], <4 x i16> addrspace(1)* [[OUT:%.*]], align 8
3817; CHECK-NEXT:    ret void
3818;
3819; GFX6-LABEL: sdiv_v4i16:
3820; GFX6:       ; %bb.0:
3821; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
3822; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xb
3823; GFX6-NEXT:    s_mov_b32 s7, 0xf000
3824; GFX6-NEXT:    s_mov_b32 s6, -1
3825; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
3826; GFX6-NEXT:    s_sext_i32_i16 s8, s2
3827; GFX6-NEXT:    v_cvt_f32_i32_e32 v0, s8
3828; GFX6-NEXT:    s_sext_i32_i16 s9, s0
3829; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s9
3830; GFX6-NEXT:    s_xor_b32 s8, s9, s8
3831; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v0
3832; GFX6-NEXT:    s_ashr_i32 s2, s2, 16
3833; GFX6-NEXT:    s_ashr_i32 s8, s8, 30
3834; GFX6-NEXT:    s_or_b32 s8, s8, 1
3835; GFX6-NEXT:    v_mul_f32_e32 v2, v1, v2
3836; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
3837; GFX6-NEXT:    v_mad_f32 v1, -v2, v0, v1
3838; GFX6-NEXT:    v_cvt_i32_f32_e32 v2, v2
3839; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
3840; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s2
3841; GFX6-NEXT:    v_mov_b32_e32 v3, s8
3842; GFX6-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
3843; GFX6-NEXT:    s_ashr_i32 s0, s0, 16
3844; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
3845; GFX6-NEXT:    v_cvt_f32_i32_e32 v2, s0
3846; GFX6-NEXT:    v_rcp_iflag_f32_e32 v3, v1
3847; GFX6-NEXT:    s_xor_b32 s0, s0, s2
3848; GFX6-NEXT:    s_ashr_i32 s0, s0, 30
3849; GFX6-NEXT:    s_or_b32 s0, s0, 1
3850; GFX6-NEXT:    v_mul_f32_e32 v3, v2, v3
3851; GFX6-NEXT:    v_trunc_f32_e32 v3, v3
3852; GFX6-NEXT:    v_mad_f32 v2, -v3, v1, v2
3853; GFX6-NEXT:    v_mov_b32_e32 v4, s0
3854; GFX6-NEXT:    s_sext_i32_i16 s0, s3
3855; GFX6-NEXT:    v_cvt_i32_f32_e32 v3, v3
3856; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, |v1|
3857; GFX6-NEXT:    v_cvt_f32_i32_e32 v2, s0
3858; GFX6-NEXT:    v_cndmask_b32_e32 v1, 0, v4, vcc
3859; GFX6-NEXT:    s_sext_i32_i16 s2, s1
3860; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v1, v3
3861; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s2
3862; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v2
3863; GFX6-NEXT:    s_xor_b32 s0, s2, s0
3864; GFX6-NEXT:    s_ashr_i32 s0, s0, 30
3865; GFX6-NEXT:    s_or_b32 s0, s0, 1
3866; GFX6-NEXT:    v_mul_f32_e32 v4, v1, v4
3867; GFX6-NEXT:    v_trunc_f32_e32 v4, v4
3868; GFX6-NEXT:    v_mad_f32 v1, -v4, v2, v1
3869; GFX6-NEXT:    v_mov_b32_e32 v5, s0
3870; GFX6-NEXT:    s_ashr_i32 s0, s3, 16
3871; GFX6-NEXT:    v_cvt_i32_f32_e32 v4, v4
3872; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v2|
3873; GFX6-NEXT:    v_cvt_f32_i32_e32 v2, s0
3874; GFX6-NEXT:    v_cndmask_b32_e32 v1, 0, v5, vcc
3875; GFX6-NEXT:    s_ashr_i32 s1, s1, 16
3876; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v4
3877; GFX6-NEXT:    v_cvt_f32_i32_e32 v4, s1
3878; GFX6-NEXT:    v_rcp_iflag_f32_e32 v5, v2
3879; GFX6-NEXT:    s_xor_b32 s0, s1, s0
3880; GFX6-NEXT:    s_ashr_i32 s0, s0, 30
3881; GFX6-NEXT:    s_or_b32 s0, s0, 1
3882; GFX6-NEXT:    v_mul_f32_e32 v5, v4, v5
3883; GFX6-NEXT:    v_trunc_f32_e32 v5, v5
3884; GFX6-NEXT:    v_mad_f32 v4, -v5, v2, v4
3885; GFX6-NEXT:    v_cvt_i32_f32_e32 v5, v5
3886; GFX6-NEXT:    v_mov_b32_e32 v6, s0
3887; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v4|, |v2|
3888; GFX6-NEXT:    v_cndmask_b32_e32 v2, 0, v6, vcc
3889; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
3890; GFX6-NEXT:    s_mov_b32 s0, 0xffff
3891; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
3892; GFX6-NEXT:    v_and_b32_e32 v1, s0, v1
3893; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
3894; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
3895; GFX6-NEXT:    v_and_b32_e32 v0, s0, v0
3896; GFX6-NEXT:    v_or_b32_e32 v0, v0, v2
3897; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
3898; GFX6-NEXT:    s_endpgm
3899;
3900; GFX9-LABEL: sdiv_v4i16:
3901; GFX9:       ; %bb.0:
3902; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
3903; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
3904; GFX9-NEXT:    v_mov_b32_e32 v2, 0
3905; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3906; GFX9-NEXT:    s_sext_i32_i16 s0, s6
3907; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s0
3908; GFX9-NEXT:    s_sext_i32_i16 s1, s4
3909; GFX9-NEXT:    v_cvt_f32_i32_e32 v1, s1
3910; GFX9-NEXT:    s_xor_b32 s0, s1, s0
3911; GFX9-NEXT:    v_rcp_iflag_f32_e32 v3, v0
3912; GFX9-NEXT:    s_ashr_i32 s0, s0, 30
3913; GFX9-NEXT:    s_or_b32 s8, s0, 1
3914; GFX9-NEXT:    v_mul_f32_e32 v3, v1, v3
3915; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
3916; GFX9-NEXT:    v_mad_f32 v1, -v3, v0, v1
3917; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v1|, |v0|
3918; GFX9-NEXT:    s_and_b64 s[0:1], s[0:1], exec
3919; GFX9-NEXT:    s_cselect_b32 s0, s8, 0
3920; GFX9-NEXT:    s_ashr_i32 s1, s6, 16
3921; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s1
3922; GFX9-NEXT:    s_ashr_i32 s4, s4, 16
3923; GFX9-NEXT:    v_cvt_f32_i32_e32 v1, s4
3924; GFX9-NEXT:    v_cvt_i32_f32_e32 v3, v3
3925; GFX9-NEXT:    v_rcp_iflag_f32_e32 v4, v0
3926; GFX9-NEXT:    v_add_u32_e32 v3, s0, v3
3927; GFX9-NEXT:    v_mul_f32_e32 v4, v1, v4
3928; GFX9-NEXT:    s_xor_b32 s0, s4, s1
3929; GFX9-NEXT:    v_trunc_f32_e32 v4, v4
3930; GFX9-NEXT:    s_ashr_i32 s0, s0, 30
3931; GFX9-NEXT:    v_mad_f32 v1, -v4, v0, v1
3932; GFX9-NEXT:    s_or_b32 s4, s0, 1
3933; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v1|, |v0|
3934; GFX9-NEXT:    s_and_b64 s[0:1], s[0:1], exec
3935; GFX9-NEXT:    v_cvt_i32_f32_e32 v4, v4
3936; GFX9-NEXT:    s_sext_i32_i16 s1, s7
3937; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s1
3938; GFX9-NEXT:    s_cselect_b32 s0, s4, 0
3939; GFX9-NEXT:    v_add_u32_e32 v4, s0, v4
3940; GFX9-NEXT:    s_sext_i32_i16 s0, s5
3941; GFX9-NEXT:    v_cvt_f32_i32_e32 v1, s0
3942; GFX9-NEXT:    v_rcp_iflag_f32_e32 v5, v0
3943; GFX9-NEXT:    s_xor_b32 s0, s0, s1
3944; GFX9-NEXT:    s_ashr_i32 s0, s0, 30
3945; GFX9-NEXT:    s_or_b32 s4, s0, 1
3946; GFX9-NEXT:    v_mul_f32_e32 v5, v1, v5
3947; GFX9-NEXT:    v_trunc_f32_e32 v5, v5
3948; GFX9-NEXT:    v_mad_f32 v1, -v5, v0, v1
3949; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v1|, |v0|
3950; GFX9-NEXT:    s_and_b64 s[0:1], s[0:1], exec
3951; GFX9-NEXT:    v_cvt_i32_f32_e32 v5, v5
3952; GFX9-NEXT:    s_cselect_b32 s0, s4, 0
3953; GFX9-NEXT:    s_ashr_i32 s1, s7, 16
3954; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s1
3955; GFX9-NEXT:    v_add_u32_e32 v1, s0, v5
3956; GFX9-NEXT:    s_ashr_i32 s0, s5, 16
3957; GFX9-NEXT:    v_cvt_f32_i32_e32 v5, s0
3958; GFX9-NEXT:    v_rcp_iflag_f32_e32 v6, v0
3959; GFX9-NEXT:    s_xor_b32 s0, s0, s1
3960; GFX9-NEXT:    s_ashr_i32 s0, s0, 30
3961; GFX9-NEXT:    s_or_b32 s4, s0, 1
3962; GFX9-NEXT:    v_mul_f32_e32 v6, v5, v6
3963; GFX9-NEXT:    v_trunc_f32_e32 v6, v6
3964; GFX9-NEXT:    v_mad_f32 v5, -v6, v0, v5
3965; GFX9-NEXT:    v_cvt_i32_f32_e32 v6, v6
3966; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v5|, |v0|
3967; GFX9-NEXT:    s_and_b64 s[0:1], s[0:1], exec
3968; GFX9-NEXT:    s_cselect_b32 s0, s4, 0
3969; GFX9-NEXT:    v_mov_b32_e32 v5, 0xffff
3970; GFX9-NEXT:    v_add_u32_e32 v0, s0, v6
3971; GFX9-NEXT:    v_and_b32_e32 v1, v5, v1
3972; GFX9-NEXT:    v_lshl_or_b32 v1, v0, 16, v1
3973; GFX9-NEXT:    v_and_b32_e32 v0, v5, v3
3974; GFX9-NEXT:    v_lshl_or_b32 v0, v4, 16, v0
3975; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
3976; GFX9-NEXT:    s_endpgm
3977;
3978; GFX90A-LABEL: sdiv_v4i16:
3979; GFX90A:       ; %bb.0:
3980; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
3981; GFX90A-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
3982; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
3983; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
3984; GFX90A-NEXT:    s_sext_i32_i16 s0, s6
3985; GFX90A-NEXT:    v_cvt_f32_i32_e32 v0, s0
3986; GFX90A-NEXT:    s_sext_i32_i16 s1, s4
3987; GFX90A-NEXT:    v_cvt_f32_i32_e32 v1, s1
3988; GFX90A-NEXT:    s_xor_b32 s0, s1, s0
3989; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v3, v0
3990; GFX90A-NEXT:    s_ashr_i32 s0, s0, 30
3991; GFX90A-NEXT:    s_or_b32 s8, s0, 1
3992; GFX90A-NEXT:    v_mul_f32_e32 v3, v1, v3
3993; GFX90A-NEXT:    v_trunc_f32_e32 v3, v3
3994; GFX90A-NEXT:    v_mad_f32 v1, -v3, v0, v1
3995; GFX90A-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v1|, |v0|
3996; GFX90A-NEXT:    s_and_b64 s[0:1], s[0:1], exec
3997; GFX90A-NEXT:    s_cselect_b32 s0, s8, 0
3998; GFX90A-NEXT:    s_ashr_i32 s1, s6, 16
3999; GFX90A-NEXT:    v_cvt_f32_i32_e32 v0, s1
4000; GFX90A-NEXT:    s_ashr_i32 s4, s4, 16
4001; GFX90A-NEXT:    v_cvt_f32_i32_e32 v1, s4
4002; GFX90A-NEXT:    v_cvt_i32_f32_e32 v3, v3
4003; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v4, v0
4004; GFX90A-NEXT:    v_add_u32_e32 v3, s0, v3
4005; GFX90A-NEXT:    v_mul_f32_e32 v4, v1, v4
4006; GFX90A-NEXT:    s_xor_b32 s0, s4, s1
4007; GFX90A-NEXT:    v_trunc_f32_e32 v4, v4
4008; GFX90A-NEXT:    s_ashr_i32 s0, s0, 30
4009; GFX90A-NEXT:    v_mad_f32 v1, -v4, v0, v1
4010; GFX90A-NEXT:    s_or_b32 s4, s0, 1
4011; GFX90A-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v1|, |v0|
4012; GFX90A-NEXT:    s_and_b64 s[0:1], s[0:1], exec
4013; GFX90A-NEXT:    v_cvt_i32_f32_e32 v4, v4
4014; GFX90A-NEXT:    s_sext_i32_i16 s1, s7
4015; GFX90A-NEXT:    v_cvt_f32_i32_e32 v0, s1
4016; GFX90A-NEXT:    s_cselect_b32 s0, s4, 0
4017; GFX90A-NEXT:    v_add_u32_e32 v4, s0, v4
4018; GFX90A-NEXT:    s_sext_i32_i16 s0, s5
4019; GFX90A-NEXT:    v_cvt_f32_i32_e32 v1, s0
4020; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v5, v0
4021; GFX90A-NEXT:    s_xor_b32 s0, s0, s1
4022; GFX90A-NEXT:    s_ashr_i32 s0, s0, 30
4023; GFX90A-NEXT:    s_or_b32 s4, s0, 1
4024; GFX90A-NEXT:    v_mul_f32_e32 v5, v1, v5
4025; GFX90A-NEXT:    v_trunc_f32_e32 v5, v5
4026; GFX90A-NEXT:    v_mad_f32 v1, -v5, v0, v1
4027; GFX90A-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v1|, |v0|
4028; GFX90A-NEXT:    s_and_b64 s[0:1], s[0:1], exec
4029; GFX90A-NEXT:    v_cvt_i32_f32_e32 v5, v5
4030; GFX90A-NEXT:    s_cselect_b32 s0, s4, 0
4031; GFX90A-NEXT:    s_ashr_i32 s1, s7, 16
4032; GFX90A-NEXT:    v_cvt_f32_i32_e32 v0, s1
4033; GFX90A-NEXT:    v_add_u32_e32 v1, s0, v5
4034; GFX90A-NEXT:    s_ashr_i32 s0, s5, 16
4035; GFX90A-NEXT:    v_cvt_f32_i32_e32 v5, s0
4036; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v6, v0
4037; GFX90A-NEXT:    s_xor_b32 s0, s0, s1
4038; GFX90A-NEXT:    s_ashr_i32 s0, s0, 30
4039; GFX90A-NEXT:    s_or_b32 s4, s0, 1
4040; GFX90A-NEXT:    v_mul_f32_e32 v6, v5, v6
4041; GFX90A-NEXT:    v_trunc_f32_e32 v6, v6
4042; GFX90A-NEXT:    v_mad_f32 v5, -v6, v0, v5
4043; GFX90A-NEXT:    v_cvt_i32_f32_e32 v6, v6
4044; GFX90A-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v5|, |v0|
4045; GFX90A-NEXT:    s_and_b64 s[0:1], s[0:1], exec
4046; GFX90A-NEXT:    s_cselect_b32 s0, s4, 0
4047; GFX90A-NEXT:    v_mov_b32_e32 v5, 0xffff
4048; GFX90A-NEXT:    v_add_u32_e32 v0, s0, v6
4049; GFX90A-NEXT:    v_and_b32_e32 v1, v5, v1
4050; GFX90A-NEXT:    v_lshl_or_b32 v1, v0, 16, v1
4051; GFX90A-NEXT:    v_and_b32_e32 v0, v5, v3
4052; GFX90A-NEXT:    v_lshl_or_b32 v0, v4, 16, v0
4053; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
4054; GFX90A-NEXT:    s_endpgm
; Operation under test: a single vector sdiv whose result is written to %out.
4055  %r = sdiv <4 x i16> %x, %y
4056  store <4 x i16> %r, <4 x i16> addrspace(1)* %out
4057  ret void
4058}
4059
4060define amdgpu_kernel void @srem_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x, <4 x i16> %y) {
4061; CHECK-LABEL: @srem_v4i16(
4062; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i16> [[X:%.*]], i64 0
4063; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i16> [[Y:%.*]], i64 0
4064; CHECK-NEXT:    [[TMP3:%.*]] = sext i16 [[TMP1]] to i32
4065; CHECK-NEXT:    [[TMP4:%.*]] = sext i16 [[TMP2]] to i32
4066; CHECK-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]]
4067; CHECK-NEXT:    [[TMP6:%.*]] = ashr i32 [[TMP5]], 30
4068; CHECK-NEXT:    [[TMP7:%.*]] = or i32 [[TMP6]], 1
4069; CHECK-NEXT:    [[TMP8:%.*]] = sitofp i32 [[TMP3]] to float
4070; CHECK-NEXT:    [[TMP9:%.*]] = sitofp i32 [[TMP4]] to float
4071; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]])
4072; CHECK-NEXT:    [[TMP11:%.*]] = fmul fast float [[TMP8]], [[TMP10]]
4073; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.trunc.f32(float [[TMP11]])
4074; CHECK-NEXT:    [[TMP13:%.*]] = fneg fast float [[TMP12]]
4075; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP13]], float [[TMP9]], float [[TMP8]])
4076; CHECK-NEXT:    [[TMP15:%.*]] = fptosi float [[TMP12]] to i32
4077; CHECK-NEXT:    [[TMP16:%.*]] = call fast float @llvm.fabs.f32(float [[TMP14]])
4078; CHECK-NEXT:    [[TMP17:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
4079; CHECK-NEXT:    [[TMP18:%.*]] = fcmp fast oge float [[TMP16]], [[TMP17]]
4080; CHECK-NEXT:    [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP7]], i32 0
4081; CHECK-NEXT:    [[TMP20:%.*]] = add i32 [[TMP15]], [[TMP19]]
4082; CHECK-NEXT:    [[TMP21:%.*]] = mul i32 [[TMP20]], [[TMP4]]
4083; CHECK-NEXT:    [[TMP22:%.*]] = sub i32 [[TMP3]], [[TMP21]]
4084; CHECK-NEXT:    [[TMP23:%.*]] = shl i32 [[TMP22]], 16
4085; CHECK-NEXT:    [[TMP24:%.*]] = ashr i32 [[TMP23]], 16
4086; CHECK-NEXT:    [[TMP25:%.*]] = trunc i32 [[TMP24]] to i16
4087; CHECK-NEXT:    [[TMP26:%.*]] = insertelement <4 x i16> undef, i16 [[TMP25]], i64 0
4088; CHECK-NEXT:    [[TMP27:%.*]] = extractelement <4 x i16> [[X]], i64 1
4089; CHECK-NEXT:    [[TMP28:%.*]] = extractelement <4 x i16> [[Y]], i64 1
4090; CHECK-NEXT:    [[TMP29:%.*]] = sext i16 [[TMP27]] to i32
4091; CHECK-NEXT:    [[TMP30:%.*]] = sext i16 [[TMP28]] to i32
4092; CHECK-NEXT:    [[TMP31:%.*]] = xor i32 [[TMP29]], [[TMP30]]
4093; CHECK-NEXT:    [[TMP32:%.*]] = ashr i32 [[TMP31]], 30
4094; CHECK-NEXT:    [[TMP33:%.*]] = or i32 [[TMP32]], 1
4095; CHECK-NEXT:    [[TMP34:%.*]] = sitofp i32 [[TMP29]] to float
4096; CHECK-NEXT:    [[TMP35:%.*]] = sitofp i32 [[TMP30]] to float
4097; CHECK-NEXT:    [[TMP36:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP35]])
4098; CHECK-NEXT:    [[TMP37:%.*]] = fmul fast float [[TMP34]], [[TMP36]]
4099; CHECK-NEXT:    [[TMP38:%.*]] = call fast float @llvm.trunc.f32(float [[TMP37]])
4100; CHECK-NEXT:    [[TMP39:%.*]] = fneg fast float [[TMP38]]
4101; CHECK-NEXT:    [[TMP40:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP39]], float [[TMP35]], float [[TMP34]])
4102; CHECK-NEXT:    [[TMP41:%.*]] = fptosi float [[TMP38]] to i32
4103; CHECK-NEXT:    [[TMP42:%.*]] = call fast float @llvm.fabs.f32(float [[TMP40]])
4104; CHECK-NEXT:    [[TMP43:%.*]] = call fast float @llvm.fabs.f32(float [[TMP35]])
4105; CHECK-NEXT:    [[TMP44:%.*]] = fcmp fast oge float [[TMP42]], [[TMP43]]
4106; CHECK-NEXT:    [[TMP45:%.*]] = select i1 [[TMP44]], i32 [[TMP33]], i32 0
4107; CHECK-NEXT:    [[TMP46:%.*]] = add i32 [[TMP41]], [[TMP45]]
4108; CHECK-NEXT:    [[TMP47:%.*]] = mul i32 [[TMP46]], [[TMP30]]
4109; CHECK-NEXT:    [[TMP48:%.*]] = sub i32 [[TMP29]], [[TMP47]]
4110; CHECK-NEXT:    [[TMP49:%.*]] = shl i32 [[TMP48]], 16
4111; CHECK-NEXT:    [[TMP50:%.*]] = ashr i32 [[TMP49]], 16
4112; CHECK-NEXT:    [[TMP51:%.*]] = trunc i32 [[TMP50]] to i16
4113; CHECK-NEXT:    [[TMP52:%.*]] = insertelement <4 x i16> [[TMP26]], i16 [[TMP51]], i64 1
4114; CHECK-NEXT:    [[TMP53:%.*]] = extractelement <4 x i16> [[X]], i64 2
4115; CHECK-NEXT:    [[TMP54:%.*]] = extractelement <4 x i16> [[Y]], i64 2
4116; CHECK-NEXT:    [[TMP55:%.*]] = sext i16 [[TMP53]] to i32
4117; CHECK-NEXT:    [[TMP56:%.*]] = sext i16 [[TMP54]] to i32
4118; CHECK-NEXT:    [[TMP57:%.*]] = xor i32 [[TMP55]], [[TMP56]]
4119; CHECK-NEXT:    [[TMP58:%.*]] = ashr i32 [[TMP57]], 30
4120; CHECK-NEXT:    [[TMP59:%.*]] = or i32 [[TMP58]], 1
4121; CHECK-NEXT:    [[TMP60:%.*]] = sitofp i32 [[TMP55]] to float
4122; CHECK-NEXT:    [[TMP61:%.*]] = sitofp i32 [[TMP56]] to float
4123; CHECK-NEXT:    [[TMP62:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP61]])
4124; CHECK-NEXT:    [[TMP63:%.*]] = fmul fast float [[TMP60]], [[TMP62]]
4125; CHECK-NEXT:    [[TMP64:%.*]] = call fast float @llvm.trunc.f32(float [[TMP63]])
4126; CHECK-NEXT:    [[TMP65:%.*]] = fneg fast float [[TMP64]]
4127; CHECK-NEXT:    [[TMP66:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP65]], float [[TMP61]], float [[TMP60]])
4128; CHECK-NEXT:    [[TMP67:%.*]] = fptosi float [[TMP64]] to i32
4129; CHECK-NEXT:    [[TMP68:%.*]] = call fast float @llvm.fabs.f32(float [[TMP66]])
4130; CHECK-NEXT:    [[TMP69:%.*]] = call fast float @llvm.fabs.f32(float [[TMP61]])
4131; CHECK-NEXT:    [[TMP70:%.*]] = fcmp fast oge float [[TMP68]], [[TMP69]]
4132; CHECK-NEXT:    [[TMP71:%.*]] = select i1 [[TMP70]], i32 [[TMP59]], i32 0
4133; CHECK-NEXT:    [[TMP72:%.*]] = add i32 [[TMP67]], [[TMP71]]
4134; CHECK-NEXT:    [[TMP73:%.*]] = mul i32 [[TMP72]], [[TMP56]]
4135; CHECK-NEXT:    [[TMP74:%.*]] = sub i32 [[TMP55]], [[TMP73]]
4136; CHECK-NEXT:    [[TMP75:%.*]] = shl i32 [[TMP74]], 16
4137; CHECK-NEXT:    [[TMP76:%.*]] = ashr i32 [[TMP75]], 16
4138; CHECK-NEXT:    [[TMP77:%.*]] = trunc i32 [[TMP76]] to i16
4139; CHECK-NEXT:    [[TMP78:%.*]] = insertelement <4 x i16> [[TMP52]], i16 [[TMP77]], i64 2
4140; CHECK-NEXT:    [[TMP79:%.*]] = extractelement <4 x i16> [[X]], i64 3
4141; CHECK-NEXT:    [[TMP80:%.*]] = extractelement <4 x i16> [[Y]], i64 3
4142; CHECK-NEXT:    [[TMP81:%.*]] = sext i16 [[TMP79]] to i32
4143; CHECK-NEXT:    [[TMP82:%.*]] = sext i16 [[TMP80]] to i32
4144; CHECK-NEXT:    [[TMP83:%.*]] = xor i32 [[TMP81]], [[TMP82]]
4145; CHECK-NEXT:    [[TMP84:%.*]] = ashr i32 [[TMP83]], 30
4146; CHECK-NEXT:    [[TMP85:%.*]] = or i32 [[TMP84]], 1
4147; CHECK-NEXT:    [[TMP86:%.*]] = sitofp i32 [[TMP81]] to float
4148; CHECK-NEXT:    [[TMP87:%.*]] = sitofp i32 [[TMP82]] to float
4149; CHECK-NEXT:    [[TMP88:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP87]])
4150; CHECK-NEXT:    [[TMP89:%.*]] = fmul fast float [[TMP86]], [[TMP88]]
4151; CHECK-NEXT:    [[TMP90:%.*]] = call fast float @llvm.trunc.f32(float [[TMP89]])
4152; CHECK-NEXT:    [[TMP91:%.*]] = fneg fast float [[TMP90]]
4153; CHECK-NEXT:    [[TMP92:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP91]], float [[TMP87]], float [[TMP86]])
4154; CHECK-NEXT:    [[TMP93:%.*]] = fptosi float [[TMP90]] to i32
4155; CHECK-NEXT:    [[TMP94:%.*]] = call fast float @llvm.fabs.f32(float [[TMP92]])
4156; CHECK-NEXT:    [[TMP95:%.*]] = call fast float @llvm.fabs.f32(float [[TMP87]])
4157; CHECK-NEXT:    [[TMP96:%.*]] = fcmp fast oge float [[TMP94]], [[TMP95]]
4158; CHECK-NEXT:    [[TMP97:%.*]] = select i1 [[TMP96]], i32 [[TMP85]], i32 0
4159; CHECK-NEXT:    [[TMP98:%.*]] = add i32 [[TMP93]], [[TMP97]]
4160; CHECK-NEXT:    [[TMP99:%.*]] = mul i32 [[TMP98]], [[TMP82]]
4161; CHECK-NEXT:    [[TMP100:%.*]] = sub i32 [[TMP81]], [[TMP99]]
4162; CHECK-NEXT:    [[TMP101:%.*]] = shl i32 [[TMP100]], 16
4163; CHECK-NEXT:    [[TMP102:%.*]] = ashr i32 [[TMP101]], 16
4164; CHECK-NEXT:    [[TMP103:%.*]] = trunc i32 [[TMP102]] to i16
4165; CHECK-NEXT:    [[TMP104:%.*]] = insertelement <4 x i16> [[TMP78]], i16 [[TMP103]], i64 3
4166; CHECK-NEXT:    store <4 x i16> [[TMP104]], <4 x i16> addrspace(1)* [[OUT:%.*]], align 8
4167; CHECK-NEXT:    ret void
4168;
4169; GFX6-LABEL: srem_v4i16:
4170; GFX6:       ; %bb.0:
4171; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
4172; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xb
4173; GFX6-NEXT:    s_mov_b32 s7, 0xf000
4174; GFX6-NEXT:    s_mov_b32 s6, -1
4175; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
4176; GFX6-NEXT:    s_sext_i32_i16 s8, s2
4177; GFX6-NEXT:    v_cvt_f32_i32_e32 v0, s8
4178; GFX6-NEXT:    s_sext_i32_i16 s9, s0
4179; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s9
4180; GFX6-NEXT:    s_xor_b32 s8, s9, s8
4181; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v0
4182; GFX6-NEXT:    s_ashr_i32 s8, s8, 30
4183; GFX6-NEXT:    s_or_b32 s8, s8, 1
4184; GFX6-NEXT:    v_mov_b32_e32 v3, s8
4185; GFX6-NEXT:    v_mul_f32_e32 v2, v1, v2
4186; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
4187; GFX6-NEXT:    v_mad_f32 v1, -v2, v0, v1
4188; GFX6-NEXT:    v_cvt_i32_f32_e32 v2, v2
4189; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
4190; GFX6-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
4191; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
4192; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s2
4193; GFX6-NEXT:    s_ashr_i32 s2, s2, 16
4194; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s2
4195; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
4196; GFX6-NEXT:    s_ashr_i32 s0, s0, 16
4197; GFX6-NEXT:    v_cvt_f32_i32_e32 v2, s0
4198; GFX6-NEXT:    v_rcp_iflag_f32_e32 v3, v1
4199; GFX6-NEXT:    s_xor_b32 s8, s0, s2
4200; GFX6-NEXT:    s_ashr_i32 s8, s8, 30
4201; GFX6-NEXT:    s_or_b32 s8, s8, 1
4202; GFX6-NEXT:    v_mul_f32_e32 v3, v2, v3
4203; GFX6-NEXT:    v_trunc_f32_e32 v3, v3
4204; GFX6-NEXT:    v_mad_f32 v2, -v3, v1, v2
4205; GFX6-NEXT:    v_cvt_i32_f32_e32 v3, v3
4206; GFX6-NEXT:    v_mov_b32_e32 v4, s8
4207; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, |v1|
4208; GFX6-NEXT:    v_cndmask_b32_e32 v1, 0, v4, vcc
4209; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
4210; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s2
4211; GFX6-NEXT:    s_sext_i32_i16 s2, s3
4212; GFX6-NEXT:    v_cvt_f32_i32_e32 v2, s2
4213; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s0, v1
4214; GFX6-NEXT:    s_sext_i32_i16 s0, s1
4215; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s0
4216; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v2
4217; GFX6-NEXT:    s_xor_b32 s0, s0, s2
4218; GFX6-NEXT:    s_ashr_i32 s0, s0, 30
4219; GFX6-NEXT:    s_or_b32 s0, s0, 1
4220; GFX6-NEXT:    v_mul_f32_e32 v4, v1, v4
4221; GFX6-NEXT:    v_trunc_f32_e32 v4, v4
4222; GFX6-NEXT:    v_mad_f32 v1, -v4, v2, v1
4223; GFX6-NEXT:    v_mov_b32_e32 v5, s0
4224; GFX6-NEXT:    s_ashr_i32 s0, s3, 16
4225; GFX6-NEXT:    v_cvt_i32_f32_e32 v4, v4
4226; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v2|
4227; GFX6-NEXT:    v_cvt_f32_i32_e32 v2, s0
4228; GFX6-NEXT:    v_cndmask_b32_e32 v1, 0, v5, vcc
4229; GFX6-NEXT:    s_ashr_i32 s2, s1, 16
4230; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v4
4231; GFX6-NEXT:    v_cvt_f32_i32_e32 v4, s2
4232; GFX6-NEXT:    v_rcp_iflag_f32_e32 v5, v2
4233; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s3
4234; GFX6-NEXT:    s_xor_b32 s3, s2, s0
4235; GFX6-NEXT:    s_ashr_i32 s3, s3, 30
4236; GFX6-NEXT:    v_mul_f32_e32 v5, v4, v5
4237; GFX6-NEXT:    v_trunc_f32_e32 v5, v5
4238; GFX6-NEXT:    v_mad_f32 v4, -v5, v2, v4
4239; GFX6-NEXT:    v_cvt_i32_f32_e32 v5, v5
4240; GFX6-NEXT:    s_or_b32 s3, s3, 1
4241; GFX6-NEXT:    v_mov_b32_e32 v6, s3
4242; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v4|, |v2|
4243; GFX6-NEXT:    v_cndmask_b32_e32 v2, 0, v6, vcc
4244; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
4245; GFX6-NEXT:    v_mul_lo_u32 v2, v2, s0
4246; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s1, v1
4247; GFX6-NEXT:    s_mov_b32 s0, 0xffff
4248; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s2, v2
4249; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
4250; GFX6-NEXT:    v_and_b32_e32 v1, s0, v1
4251; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
4252; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
4253; GFX6-NEXT:    v_and_b32_e32 v0, s0, v0
4254; GFX6-NEXT:    v_or_b32_e32 v0, v0, v2
4255; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
4256; GFX6-NEXT:    s_endpgm
4257;
4258; GFX9-LABEL: srem_v4i16:
4259; GFX9:       ; %bb.0:
4260; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
4261; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
4262; GFX9-NEXT:    v_mov_b32_e32 v2, 0
4263; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4264; GFX9-NEXT:    s_sext_i32_i16 s0, s6
4265; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s0
4266; GFX9-NEXT:    s_sext_i32_i16 s1, s4
4267; GFX9-NEXT:    v_cvt_f32_i32_e32 v1, s1
4268; GFX9-NEXT:    s_xor_b32 s0, s1, s0
4269; GFX9-NEXT:    v_rcp_iflag_f32_e32 v3, v0
4270; GFX9-NEXT:    s_ashr_i32 s0, s0, 30
4271; GFX9-NEXT:    s_or_b32 s8, s0, 1
4272; GFX9-NEXT:    v_mul_f32_e32 v3, v1, v3
4273; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
4274; GFX9-NEXT:    v_mad_f32 v1, -v3, v0, v1
4275; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v1|, |v0|
4276; GFX9-NEXT:    s_and_b64 s[0:1], s[0:1], exec
4277; GFX9-NEXT:    s_cselect_b32 s0, s8, 0
4278; GFX9-NEXT:    s_ashr_i32 s9, s6, 16
4279; GFX9-NEXT:    v_cvt_i32_f32_e32 v3, v3
4280; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s9
4281; GFX9-NEXT:    s_ashr_i32 s8, s4, 16
4282; GFX9-NEXT:    v_add_u32_e32 v1, s0, v3
4283; GFX9-NEXT:    v_cvt_f32_i32_e32 v3, s8
4284; GFX9-NEXT:    v_rcp_iflag_f32_e32 v4, v0
4285; GFX9-NEXT:    s_xor_b32 s0, s8, s9
4286; GFX9-NEXT:    s_ashr_i32 s0, s0, 30
4287; GFX9-NEXT:    v_mul_lo_u32 v1, v1, s6
4288; GFX9-NEXT:    v_mul_f32_e32 v4, v3, v4
4289; GFX9-NEXT:    v_trunc_f32_e32 v4, v4
4290; GFX9-NEXT:    v_mad_f32 v3, -v4, v0, v3
4291; GFX9-NEXT:    v_cvt_i32_f32_e32 v4, v4
4292; GFX9-NEXT:    s_or_b32 s6, s0, 1
4293; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v3|, |v0|
4294; GFX9-NEXT:    s_and_b64 s[0:1], s[0:1], exec
4295; GFX9-NEXT:    s_cselect_b32 s0, s6, 0
4296; GFX9-NEXT:    v_add_u32_e32 v0, s0, v4
4297; GFX9-NEXT:    s_sext_i32_i16 s0, s7
4298; GFX9-NEXT:    v_cvt_f32_i32_e32 v3, s0
4299; GFX9-NEXT:    s_sext_i32_i16 s1, s5
4300; GFX9-NEXT:    v_cvt_f32_i32_e32 v4, s1
4301; GFX9-NEXT:    s_xor_b32 s0, s1, s0
4302; GFX9-NEXT:    v_rcp_iflag_f32_e32 v5, v3
4303; GFX9-NEXT:    s_ashr_i32 s0, s0, 30
4304; GFX9-NEXT:    s_or_b32 s6, s0, 1
4305; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s9
4306; GFX9-NEXT:    v_mul_f32_e32 v5, v4, v5
4307; GFX9-NEXT:    v_trunc_f32_e32 v5, v5
4308; GFX9-NEXT:    v_mad_f32 v4, -v5, v3, v4
4309; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v4|, |v3|
4310; GFX9-NEXT:    v_cvt_i32_f32_e32 v5, v5
4311; GFX9-NEXT:    s_and_b64 s[0:1], s[0:1], exec
4312; GFX9-NEXT:    s_cselect_b32 s0, s6, 0
4313; GFX9-NEXT:    s_ashr_i32 s6, s7, 16
4314; GFX9-NEXT:    v_cvt_f32_i32_e32 v4, s6
4315; GFX9-NEXT:    v_add_u32_e32 v3, s0, v5
4316; GFX9-NEXT:    v_mul_lo_u32 v3, v3, s7
4317; GFX9-NEXT:    s_ashr_i32 s7, s5, 16
4318; GFX9-NEXT:    v_cvt_f32_i32_e32 v5, s7
4319; GFX9-NEXT:    v_rcp_iflag_f32_e32 v6, v4
4320; GFX9-NEXT:    s_xor_b32 s0, s7, s6
4321; GFX9-NEXT:    s_ashr_i32 s0, s0, 30
4322; GFX9-NEXT:    s_or_b32 s9, s0, 1
4323; GFX9-NEXT:    v_mul_f32_e32 v6, v5, v6
4324; GFX9-NEXT:    v_trunc_f32_e32 v6, v6
4325; GFX9-NEXT:    v_mad_f32 v5, -v6, v4, v5
4326; GFX9-NEXT:    v_cvt_i32_f32_e32 v6, v6
4327; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v5|, |v4|
4328; GFX9-NEXT:    s_and_b64 s[0:1], s[0:1], exec
4329; GFX9-NEXT:    s_cselect_b32 s0, s9, 0
4330; GFX9-NEXT:    v_add_u32_e32 v4, s0, v6
4331; GFX9-NEXT:    v_mul_lo_u32 v4, v4, s6
4332; GFX9-NEXT:    v_sub_u32_e32 v5, s4, v1
4333; GFX9-NEXT:    v_sub_u32_e32 v1, s5, v3
4334; GFX9-NEXT:    v_sub_u32_e32 v0, s8, v0
4335; GFX9-NEXT:    v_sub_u32_e32 v3, s7, v4
4336; GFX9-NEXT:    v_mov_b32_e32 v4, 0xffff
4337; GFX9-NEXT:    v_and_b32_e32 v1, v4, v1
4338; GFX9-NEXT:    v_lshl_or_b32 v1, v3, 16, v1
4339; GFX9-NEXT:    v_and_b32_e32 v3, v4, v5
4340; GFX9-NEXT:    v_lshl_or_b32 v0, v0, 16, v3
4341; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
4342; GFX9-NEXT:    s_endpgm
4343;
4344; GFX90A-LABEL: srem_v4i16:
4345; GFX90A:       ; %bb.0:
4346; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
4347; GFX90A-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
4348; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
4349; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
4350; GFX90A-NEXT:    s_sext_i32_i16 s0, s6
4351; GFX90A-NEXT:    v_cvt_f32_i32_e32 v0, s0
4352; GFX90A-NEXT:    s_sext_i32_i16 s1, s4
4353; GFX90A-NEXT:    v_cvt_f32_i32_e32 v1, s1
4354; GFX90A-NEXT:    s_xor_b32 s0, s1, s0
4355; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v3, v0
4356; GFX90A-NEXT:    s_ashr_i32 s0, s0, 30
4357; GFX90A-NEXT:    s_or_b32 s8, s0, 1
4358; GFX90A-NEXT:    v_mul_f32_e32 v3, v1, v3
4359; GFX90A-NEXT:    v_trunc_f32_e32 v3, v3
4360; GFX90A-NEXT:    v_mad_f32 v1, -v3, v0, v1
4361; GFX90A-NEXT:    v_cvt_i32_f32_e32 v3, v3
4362; GFX90A-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v1|, |v0|
4363; GFX90A-NEXT:    s_and_b64 s[0:1], s[0:1], exec
4364; GFX90A-NEXT:    s_cselect_b32 s0, s8, 0
4365; GFX90A-NEXT:    s_ashr_i32 s8, s6, 16
4366; GFX90A-NEXT:    v_cvt_f32_i32_e32 v1, s8
4367; GFX90A-NEXT:    v_add_u32_e32 v0, s0, v3
4368; GFX90A-NEXT:    v_mul_lo_u32 v0, v0, s6
4369; GFX90A-NEXT:    v_sub_u32_e32 v0, s4, v0
4370; GFX90A-NEXT:    s_ashr_i32 s4, s4, 16
4371; GFX90A-NEXT:    v_cvt_f32_i32_e32 v3, s4
4372; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v4, v1
4373; GFX90A-NEXT:    s_xor_b32 s0, s4, s8
4374; GFX90A-NEXT:    s_ashr_i32 s0, s0, 30
4375; GFX90A-NEXT:    s_or_b32 s6, s0, 1
4376; GFX90A-NEXT:    v_mul_f32_e32 v4, v3, v4
4377; GFX90A-NEXT:    v_trunc_f32_e32 v4, v4
4378; GFX90A-NEXT:    v_mad_f32 v3, -v4, v1, v3
4379; GFX90A-NEXT:    v_cvt_i32_f32_e32 v4, v4
4380; GFX90A-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v3|, |v1|
4381; GFX90A-NEXT:    s_and_b64 s[0:1], s[0:1], exec
4382; GFX90A-NEXT:    s_cselect_b32 s0, s6, 0
4383; GFX90A-NEXT:    v_add_u32_e32 v1, s0, v4
4384; GFX90A-NEXT:    s_sext_i32_i16 s0, s7
4385; GFX90A-NEXT:    v_cvt_f32_i32_e32 v3, s0
4386; GFX90A-NEXT:    v_mul_lo_u32 v1, v1, s8
4387; GFX90A-NEXT:    s_sext_i32_i16 s1, s5
4388; GFX90A-NEXT:    v_sub_u32_e32 v4, s4, v1
4389; GFX90A-NEXT:    v_cvt_f32_i32_e32 v1, s1
4390; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v5, v3
4391; GFX90A-NEXT:    s_xor_b32 s0, s1, s0
4392; GFX90A-NEXT:    s_ashr_i32 s0, s0, 30
4393; GFX90A-NEXT:    s_or_b32 s4, s0, 1
4394; GFX90A-NEXT:    v_mul_f32_e32 v5, v1, v5
4395; GFX90A-NEXT:    v_trunc_f32_e32 v5, v5
4396; GFX90A-NEXT:    v_mad_f32 v1, -v5, v3, v1
4397; GFX90A-NEXT:    v_cvt_i32_f32_e32 v5, v5
4398; GFX90A-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v1|, |v3|
4399; GFX90A-NEXT:    s_and_b64 s[0:1], s[0:1], exec
4400; GFX90A-NEXT:    s_cselect_b32 s0, s4, 0
4401; GFX90A-NEXT:    s_ashr_i32 s4, s7, 16
4402; GFX90A-NEXT:    v_cvt_f32_i32_e32 v3, s4
4403; GFX90A-NEXT:    v_add_u32_e32 v1, s0, v5
4404; GFX90A-NEXT:    v_mul_lo_u32 v1, v1, s7
4405; GFX90A-NEXT:    v_sub_u32_e32 v1, s5, v1
4406; GFX90A-NEXT:    s_ashr_i32 s5, s5, 16
4407; GFX90A-NEXT:    v_cvt_f32_i32_e32 v5, s5
4408; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v6, v3
4409; GFX90A-NEXT:    s_xor_b32 s0, s5, s4
4410; GFX90A-NEXT:    s_ashr_i32 s0, s0, 30
4411; GFX90A-NEXT:    s_or_b32 s6, s0, 1
4412; GFX90A-NEXT:    v_mul_f32_e32 v6, v5, v6
4413; GFX90A-NEXT:    v_trunc_f32_e32 v6, v6
4414; GFX90A-NEXT:    v_mad_f32 v5, -v6, v3, v5
4415; GFX90A-NEXT:    v_cvt_i32_f32_e32 v6, v6
4416; GFX90A-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v5|, |v3|
4417; GFX90A-NEXT:    s_and_b64 s[0:1], s[0:1], exec
4418; GFX90A-NEXT:    s_cselect_b32 s0, s6, 0
4419; GFX90A-NEXT:    v_add_u32_e32 v3, s0, v6
4420; GFX90A-NEXT:    v_mul_lo_u32 v3, v3, s4
4421; GFX90A-NEXT:    v_mov_b32_e32 v5, 0xffff
4422; GFX90A-NEXT:    v_sub_u32_e32 v3, s5, v3
4423; GFX90A-NEXT:    v_and_b32_e32 v1, v5, v1
4424; GFX90A-NEXT:    v_and_b32_e32 v0, v5, v0
4425; GFX90A-NEXT:    v_lshl_or_b32 v1, v3, 16, v1
4426; GFX90A-NEXT:    v_lshl_or_b32 v0, v4, 16, v0
4427; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
4428; GFX90A-NEXT:    s_endpgm
4429  %r = srem <4 x i16> %x, %y
4430  store <4 x i16> %r, <4 x i16> addrspace(1)* %out
4431  ret void
4432}
4433
; Unsigned division on the odd-width type i3: the kernel divides %x by %y and
; stores the quotient to %out.
; The CHECK lines (opt run with -amdgpu-codegenprepare) expect the udiv to be
; replaced by a float sequence: both operands are zero-extended to i32 and
; converted to float, x * rcp(y) is truncated to a candidate quotient q, and 1
; is added when |fmad(-q, y, x)| >= |y|. The sum is masked with 7 and truncated
; back to i3 before the store.
; The GFX6, GFX9 and GFX90A lines are the llc output for tahiti, gfx900 and
; gfx90a respectively (see the RUN lines at the top of the file).
; All assertions are autogenerated (see the NOTE lines at the top of the file);
; regenerate them with the update scripts rather than editing them by hand.
4434define amdgpu_kernel void @udiv_i3(i3 addrspace(1)* %out, i3 %x, i3 %y) {
4435; CHECK-LABEL: @udiv_i3(
4436; CHECK-NEXT:    [[TMP1:%.*]] = zext i3 [[X:%.*]] to i32
4437; CHECK-NEXT:    [[TMP2:%.*]] = zext i3 [[Y:%.*]] to i32
4438; CHECK-NEXT:    [[TMP3:%.*]] = uitofp i32 [[TMP1]] to float
4439; CHECK-NEXT:    [[TMP4:%.*]] = uitofp i32 [[TMP2]] to float
4440; CHECK-NEXT:    [[TMP5:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP4]])
4441; CHECK-NEXT:    [[TMP6:%.*]] = fmul fast float [[TMP3]], [[TMP5]]
4442; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.trunc.f32(float [[TMP6]])
4443; CHECK-NEXT:    [[TMP8:%.*]] = fneg fast float [[TMP7]]
4444; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP8]], float [[TMP4]], float [[TMP3]])
4445; CHECK-NEXT:    [[TMP10:%.*]] = fptoui float [[TMP7]] to i32
4446; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
4447; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.fabs.f32(float [[TMP4]])
4448; CHECK-NEXT:    [[TMP13:%.*]] = fcmp fast oge float [[TMP11]], [[TMP12]]
4449; CHECK-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 1, i32 0
4450; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]]
4451; CHECK-NEXT:    [[TMP16:%.*]] = and i32 [[TMP15]], 7
4452; CHECK-NEXT:    [[TMP17:%.*]] = trunc i32 [[TMP16]] to i3
4453; CHECK-NEXT:    store i3 [[TMP17]], i3 addrspace(1)* [[OUT:%.*]], align 1
4454; CHECK-NEXT:    ret void
4455;
4456; GFX6-LABEL: udiv_i3:
4457; GFX6:       ; %bb.0:
4458; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
4459; GFX6-NEXT:    s_load_dword s0, s[0:1], 0xb
4460; GFX6-NEXT:    s_mov_b32 s7, 0xf000
4461; GFX6-NEXT:    s_mov_b32 s6, -1
4462; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
4463; GFX6-NEXT:    s_bfe_u32 s1, s0, 0x30008
4464; GFX6-NEXT:    v_cvt_f32_ubyte0_e32 v0, s1
4465; GFX6-NEXT:    v_rcp_iflag_f32_e32 v1, v0
4466; GFX6-NEXT:    s_and_b32 s0, s0, 7
4467; GFX6-NEXT:    v_cvt_f32_ubyte0_e32 v2, s0
4468; GFX6-NEXT:    v_mul_f32_e32 v1, v2, v1
4469; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
4470; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v1
4471; GFX6-NEXT:    v_mad_f32 v1, -v1, v0, v2
4472; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
4473; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, 0, v3, vcc
4474; GFX6-NEXT:    v_and_b32_e32 v0, 7, v0
4475; GFX6-NEXT:    buffer_store_byte v0, off, s[4:7], 0
4476; GFX6-NEXT:    s_endpgm
4477;
4478; GFX9-LABEL: udiv_i3:
4479; GFX9:       ; %bb.0:
4480; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
4481; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x2c
4482; GFX9-NEXT:    v_mov_b32_e32 v2, 0
4483; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4484; GFX9-NEXT:    s_bfe_u32 s0, s4, 0x30008
4485; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v0, s0
4486; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v0
4487; GFX9-NEXT:    s_and_b32 s0, s4, 7
4488; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v3, s0
4489; GFX9-NEXT:    v_mul_f32_e32 v1, v3, v1
4490; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
4491; GFX9-NEXT:    v_cvt_u32_f32_e32 v4, v1
4492; GFX9-NEXT:    v_mad_f32 v1, -v1, v0, v3
4493; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
4494; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v4, vcc
4495; GFX9-NEXT:    v_and_b32_e32 v0, 7, v0
4496; GFX9-NEXT:    global_store_byte v2, v0, s[2:3]
4497; GFX9-NEXT:    s_endpgm
4498;
4499; GFX90A-LABEL: udiv_i3:
4500; GFX90A:       ; %bb.0:
4501; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
4502; GFX90A-NEXT:    s_load_dword s4, s[0:1], 0x2c
4503; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
4504; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
4505; GFX90A-NEXT:    s_bfe_u32 s0, s4, 0x30008
4506; GFX90A-NEXT:    v_cvt_f32_ubyte0_e32 v0, s0
4507; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v1, v0
4508; GFX90A-NEXT:    s_and_b32 s0, s4, 7
4509; GFX90A-NEXT:    v_cvt_f32_ubyte0_e32 v3, s0
4510; GFX90A-NEXT:    v_mul_f32_e32 v1, v3, v1
4511; GFX90A-NEXT:    v_trunc_f32_e32 v1, v1
4512; GFX90A-NEXT:    v_cvt_u32_f32_e32 v4, v1
4513; GFX90A-NEXT:    v_mad_f32 v1, -v1, v0, v3
4514; GFX90A-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
4515; GFX90A-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v4, vcc
4516; GFX90A-NEXT:    v_and_b32_e32 v0, 7, v0
4517; GFX90A-NEXT:    global_store_byte v2, v0, s[2:3]
4518; GFX90A-NEXT:    s_endpgm
4519  %r = udiv i3 %x, %y
4520  store i3 %r, i3 addrspace(1)* %out
4521  ret void
4522}
4523
; Unsigned remainder on the odd-width type i3: the kernel computes %x urem %y
; and stores the result to %out.
; The CHECK lines (opt run with -amdgpu-codegenprepare) expect the same float
; quotient sequence as the unsigned i3 division test (zext to i32, uitofp,
; x * rcp(y), trunc, +1 correction when |fmad(-q, y, x)| >= |y|), followed by
; x - q * y in i32, a mask with 7 and a trunc back to i3.
; The GFX6, GFX9 and GFX90A lines are the llc output for tahiti, gfx900 and
; gfx90a respectively (see the RUN lines at the top of the file).
; All assertions are autogenerated (see the NOTE lines at the top of the file);
; regenerate them with the update scripts rather than editing them by hand.
4524define amdgpu_kernel void @urem_i3(i3 addrspace(1)* %out, i3 %x, i3 %y) {
4525; CHECK-LABEL: @urem_i3(
4526; CHECK-NEXT:    [[TMP1:%.*]] = zext i3 [[X:%.*]] to i32
4527; CHECK-NEXT:    [[TMP2:%.*]] = zext i3 [[Y:%.*]] to i32
4528; CHECK-NEXT:    [[TMP3:%.*]] = uitofp i32 [[TMP1]] to float
4529; CHECK-NEXT:    [[TMP4:%.*]] = uitofp i32 [[TMP2]] to float
4530; CHECK-NEXT:    [[TMP5:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP4]])
4531; CHECK-NEXT:    [[TMP6:%.*]] = fmul fast float [[TMP3]], [[TMP5]]
4532; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.trunc.f32(float [[TMP6]])
4533; CHECK-NEXT:    [[TMP8:%.*]] = fneg fast float [[TMP7]]
4534; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP8]], float [[TMP4]], float [[TMP3]])
4535; CHECK-NEXT:    [[TMP10:%.*]] = fptoui float [[TMP7]] to i32
4536; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
4537; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.fabs.f32(float [[TMP4]])
4538; CHECK-NEXT:    [[TMP13:%.*]] = fcmp fast oge float [[TMP11]], [[TMP12]]
4539; CHECK-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 1, i32 0
4540; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]]
4541; CHECK-NEXT:    [[TMP16:%.*]] = mul i32 [[TMP15]], [[TMP2]]
4542; CHECK-NEXT:    [[TMP17:%.*]] = sub i32 [[TMP1]], [[TMP16]]
4543; CHECK-NEXT:    [[TMP18:%.*]] = and i32 [[TMP17]], 7
4544; CHECK-NEXT:    [[TMP19:%.*]] = trunc i32 [[TMP18]] to i3
4545; CHECK-NEXT:    store i3 [[TMP19]], i3 addrspace(1)* [[OUT:%.*]], align 1
4546; CHECK-NEXT:    ret void
4547;
4548; GFX6-LABEL: urem_i3:
4549; GFX6:       ; %bb.0:
4550; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
4551; GFX6-NEXT:    s_load_dword s0, s[0:1], 0xb
4552; GFX6-NEXT:    s_mov_b32 s7, 0xf000
4553; GFX6-NEXT:    s_mov_b32 s6, -1
4554; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
4555; GFX6-NEXT:    s_bfe_u32 s1, s0, 0x30008
4556; GFX6-NEXT:    v_cvt_f32_ubyte0_e32 v0, s1
4557; GFX6-NEXT:    v_rcp_iflag_f32_e32 v1, v0
4558; GFX6-NEXT:    s_and_b32 s2, s0, 7
4559; GFX6-NEXT:    v_cvt_f32_ubyte0_e32 v2, s2
4560; GFX6-NEXT:    s_lshr_b32 s1, s0, 8
4561; GFX6-NEXT:    v_mul_f32_e32 v1, v2, v1
4562; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
4563; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v1
4564; GFX6-NEXT:    v_mad_f32 v1, -v1, v0, v2
4565; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
4566; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, 0, v3, vcc
4567; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s1
4568; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
4569; GFX6-NEXT:    v_and_b32_e32 v0, 7, v0
4570; GFX6-NEXT:    buffer_store_byte v0, off, s[4:7], 0
4571; GFX6-NEXT:    s_endpgm
4572;
4573; GFX9-LABEL: urem_i3:
4574; GFX9:       ; %bb.0:
4575; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x2c
4576; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4577; GFX9-NEXT:    s_bfe_u32 s3, s2, 0x30008
4578; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v0, s3
4579; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v0
4580; GFX9-NEXT:    s_and_b32 s4, s2, 7
4581; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v2, s4
4582; GFX9-NEXT:    s_lshr_b32 s3, s2, 8
4583; GFX9-NEXT:    v_mul_f32_e32 v1, v2, v1
4584; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
4585; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v1
4586; GFX9-NEXT:    v_mad_f32 v1, -v1, v0, v2
4587; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
4588; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4589; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v3, vcc
4590; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s3
4591; GFX9-NEXT:    v_mov_b32_e32 v1, 0
4592; GFX9-NEXT:    v_sub_u32_e32 v0, s2, v0
4593; GFX9-NEXT:    v_and_b32_e32 v0, 7, v0
4594; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4595; GFX9-NEXT:    global_store_byte v1, v0, s[0:1]
4596; GFX9-NEXT:    s_endpgm
4597;
4598; GFX90A-LABEL: urem_i3:
4599; GFX90A:       ; %bb.0:
4600; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
4601; GFX90A-NEXT:    s_load_dword s4, s[0:1], 0x2c
4602; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
4603; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
4604; GFX90A-NEXT:    s_bfe_u32 s0, s4, 0x30008
4605; GFX90A-NEXT:    v_cvt_f32_ubyte0_e32 v1, s0
4606; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v2, v1
4607; GFX90A-NEXT:    s_and_b32 s1, s4, 7
4608; GFX90A-NEXT:    v_cvt_f32_ubyte0_e32 v3, s1
4609; GFX90A-NEXT:    s_lshr_b32 s0, s4, 8
4610; GFX90A-NEXT:    v_mul_f32_e32 v2, v3, v2
4611; GFX90A-NEXT:    v_trunc_f32_e32 v2, v2
4612; GFX90A-NEXT:    v_cvt_u32_f32_e32 v4, v2
4613; GFX90A-NEXT:    v_mad_f32 v2, -v2, v1, v3
4614; GFX90A-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v1
4615; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v4, vcc
4616; GFX90A-NEXT:    v_mul_lo_u32 v1, v1, s0
4617; GFX90A-NEXT:    v_sub_u32_e32 v1, s4, v1
4618; GFX90A-NEXT:    v_and_b32_e32 v1, 7, v1
4619; GFX90A-NEXT:    global_store_byte v0, v1, s[2:3]
4620; GFX90A-NEXT:    s_endpgm
4621  %r = urem i3 %x, %y
4622  store i3 %r, i3 addrspace(1)* %out
4623  ret void
4624}
4625
; Signed division on the odd-width type i3: the kernel divides %x by %y and
; stores the quotient to %out.
; The CHECK lines (opt run with -amdgpu-codegenprepare) expect the sdiv to be
; replaced by a float sequence: both operands are sign-extended to i32, and
; ((x ^ y) ashr 30) | 1 is computed as the correction term. The operands are
; converted with sitofp, x * rcp(y) is truncated to a candidate quotient q, and
; the correction term is added when |fmad(-q, y, x)| >= |y|. The sum is
; re-sign-extended from 3 bits (shl 29 / ashr 29) and truncated back to i3.
; The GFX6, GFX9 and GFX90A lines are the llc output for tahiti, gfx900 and
; gfx90a respectively (see the RUN lines at the top of the file).
; All assertions are autogenerated (see the NOTE lines at the top of the file);
; regenerate them with the update scripts rather than editing them by hand.
4626define amdgpu_kernel void @sdiv_i3(i3 addrspace(1)* %out, i3 %x, i3 %y) {
4627; CHECK-LABEL: @sdiv_i3(
4628; CHECK-NEXT:    [[TMP1:%.*]] = sext i3 [[X:%.*]] to i32
4629; CHECK-NEXT:    [[TMP2:%.*]] = sext i3 [[Y:%.*]] to i32
4630; CHECK-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
4631; CHECK-NEXT:    [[TMP4:%.*]] = ashr i32 [[TMP3]], 30
4632; CHECK-NEXT:    [[TMP5:%.*]] = or i32 [[TMP4]], 1
4633; CHECK-NEXT:    [[TMP6:%.*]] = sitofp i32 [[TMP1]] to float
4634; CHECK-NEXT:    [[TMP7:%.*]] = sitofp i32 [[TMP2]] to float
4635; CHECK-NEXT:    [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]])
4636; CHECK-NEXT:    [[TMP9:%.*]] = fmul fast float [[TMP6]], [[TMP8]]
4637; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.trunc.f32(float [[TMP9]])
4638; CHECK-NEXT:    [[TMP11:%.*]] = fneg fast float [[TMP10]]
4639; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP11]], float [[TMP7]], float [[TMP6]])
4640; CHECK-NEXT:    [[TMP13:%.*]] = fptosi float [[TMP10]] to i32
4641; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP12]])
4642; CHECK-NEXT:    [[TMP15:%.*]] = call fast float @llvm.fabs.f32(float [[TMP7]])
4643; CHECK-NEXT:    [[TMP16:%.*]] = fcmp fast oge float [[TMP14]], [[TMP15]]
4644; CHECK-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP5]], i32 0
4645; CHECK-NEXT:    [[TMP18:%.*]] = add i32 [[TMP13]], [[TMP17]]
4646; CHECK-NEXT:    [[TMP19:%.*]] = shl i32 [[TMP18]], 29
4647; CHECK-NEXT:    [[TMP20:%.*]] = ashr i32 [[TMP19]], 29
4648; CHECK-NEXT:    [[TMP21:%.*]] = trunc i32 [[TMP20]] to i3
4649; CHECK-NEXT:    store i3 [[TMP21]], i3 addrspace(1)* [[OUT:%.*]], align 1
4650; CHECK-NEXT:    ret void
4651;
4652; GFX6-LABEL: sdiv_i3:
4653; GFX6:       ; %bb.0:
4654; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
4655; GFX6-NEXT:    s_load_dword s0, s[0:1], 0xb
4656; GFX6-NEXT:    s_mov_b32 s7, 0xf000
4657; GFX6-NEXT:    s_mov_b32 s6, -1
4658; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
4659; GFX6-NEXT:    s_bfe_i32 s1, s0, 0x30008
4660; GFX6-NEXT:    v_cvt_f32_i32_e32 v0, s1
4661; GFX6-NEXT:    s_bfe_i32 s0, s0, 0x30000
4662; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s0
4663; GFX6-NEXT:    s_xor_b32 s0, s0, s1
4664; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v0
4665; GFX6-NEXT:    s_ashr_i32 s0, s0, 30
4666; GFX6-NEXT:    s_or_b32 s0, s0, 1
4667; GFX6-NEXT:    v_mov_b32_e32 v3, s0
4668; GFX6-NEXT:    v_mul_f32_e32 v2, v1, v2
4669; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
4670; GFX6-NEXT:    v_mad_f32 v1, -v2, v0, v1
4671; GFX6-NEXT:    v_cvt_i32_f32_e32 v2, v2
4672; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
4673; GFX6-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
4674; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
4675; GFX6-NEXT:    v_and_b32_e32 v0, 7, v0
4676; GFX6-NEXT:    buffer_store_byte v0, off, s[4:7], 0
4677; GFX6-NEXT:    s_endpgm
4678;
4679; GFX9-LABEL: sdiv_i3:
4680; GFX9:       ; %bb.0:
4681; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
4682; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x2c
4683; GFX9-NEXT:    v_mov_b32_e32 v1, 0
4684; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4685; GFX9-NEXT:    s_bfe_i32 s0, s4, 0x30008
4686; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s0
4687; GFX9-NEXT:    s_bfe_i32 s1, s4, 0x30000
4688; GFX9-NEXT:    v_cvt_f32_i32_e32 v2, s1
4689; GFX9-NEXT:    s_xor_b32 s0, s1, s0
4690; GFX9-NEXT:    v_rcp_iflag_f32_e32 v3, v0
4691; GFX9-NEXT:    s_ashr_i32 s0, s0, 30
4692; GFX9-NEXT:    s_or_b32 s4, s0, 1
4693; GFX9-NEXT:    v_mul_f32_e32 v3, v2, v3
4694; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
4695; GFX9-NEXT:    v_mad_f32 v2, -v3, v0, v2
4696; GFX9-NEXT:    v_cvt_i32_f32_e32 v3, v3
4697; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v2|, |v0|
4698; GFX9-NEXT:    s_and_b64 s[0:1], s[0:1], exec
4699; GFX9-NEXT:    s_cselect_b32 s0, s4, 0
4700; GFX9-NEXT:    v_add_u32_e32 v0, s0, v3
4701; GFX9-NEXT:    v_and_b32_e32 v0, 7, v0
4702; GFX9-NEXT:    global_store_byte v1, v0, s[2:3]
4703; GFX9-NEXT:    s_endpgm
4704;
4705; GFX90A-LABEL: sdiv_i3:
4706; GFX90A:       ; %bb.0:
4707; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
4708; GFX90A-NEXT:    s_load_dword s4, s[0:1], 0x2c
4709; GFX90A-NEXT:    v_mov_b32_e32 v1, 0
4710; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
4711; GFX90A-NEXT:    s_bfe_i32 s0, s4, 0x30008
4712; GFX90A-NEXT:    v_cvt_f32_i32_e32 v0, s0
4713; GFX90A-NEXT:    s_bfe_i32 s1, s4, 0x30000
4714; GFX90A-NEXT:    v_cvt_f32_i32_e32 v2, s1
4715; GFX90A-NEXT:    s_xor_b32 s0, s1, s0
4716; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v3, v0
4717; GFX90A-NEXT:    s_ashr_i32 s0, s0, 30
4718; GFX90A-NEXT:    s_or_b32 s4, s0, 1
4719; GFX90A-NEXT:    v_mul_f32_e32 v3, v2, v3
4720; GFX90A-NEXT:    v_trunc_f32_e32 v3, v3
4721; GFX90A-NEXT:    v_mad_f32 v2, -v3, v0, v2
4722; GFX90A-NEXT:    v_cvt_i32_f32_e32 v3, v3
4723; GFX90A-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v2|, |v0|
4724; GFX90A-NEXT:    s_and_b64 s[0:1], s[0:1], exec
4725; GFX90A-NEXT:    s_cselect_b32 s0, s4, 0
4726; GFX90A-NEXT:    v_add_u32_e32 v0, s0, v3
4727; GFX90A-NEXT:    v_and_b32_e32 v0, 7, v0
4728; GFX90A-NEXT:    global_store_byte v1, v0, s[2:3]
4729; GFX90A-NEXT:    s_endpgm
4730  %r = sdiv i3 %x, %y
4731  store i3 %r, i3 addrspace(1)* %out
4732  ret void
4733}
4734
; Signed remainder on the odd-width type i3: the kernel computes %x srem %y and
; stores the result to %out.
; The CHECK lines (opt run with -amdgpu-codegenprepare) expect the same float
; quotient sequence as the signed i3 division test (sext to i32, correction
; term ((x ^ y) ashr 30) | 1, sitofp, x * rcp(y), trunc, correction added when
; |fmad(-q, y, x)| >= |y|), followed by x - q * y in i32. That difference is
; re-sign-extended from 3 bits (shl 29 / ashr 29) and truncated back to i3.
; The GFX6, GFX9 and GFX90A lines are the llc output for tahiti, gfx900 and
; gfx90a respectively (see the RUN lines at the top of the file).
; All assertions are autogenerated (see the NOTE lines at the top of the file);
; regenerate them with the update scripts rather than editing them by hand.
4735define amdgpu_kernel void @srem_i3(i3 addrspace(1)* %out, i3 %x, i3 %y) {
4736; CHECK-LABEL: @srem_i3(
4737; CHECK-NEXT:    [[TMP1:%.*]] = sext i3 [[X:%.*]] to i32
4738; CHECK-NEXT:    [[TMP2:%.*]] = sext i3 [[Y:%.*]] to i32
4739; CHECK-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
4740; CHECK-NEXT:    [[TMP4:%.*]] = ashr i32 [[TMP3]], 30
4741; CHECK-NEXT:    [[TMP5:%.*]] = or i32 [[TMP4]], 1
4742; CHECK-NEXT:    [[TMP6:%.*]] = sitofp i32 [[TMP1]] to float
4743; CHECK-NEXT:    [[TMP7:%.*]] = sitofp i32 [[TMP2]] to float
4744; CHECK-NEXT:    [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]])
4745; CHECK-NEXT:    [[TMP9:%.*]] = fmul fast float [[TMP6]], [[TMP8]]
4746; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.trunc.f32(float [[TMP9]])
4747; CHECK-NEXT:    [[TMP11:%.*]] = fneg fast float [[TMP10]]
4748; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP11]], float [[TMP7]], float [[TMP6]])
4749; CHECK-NEXT:    [[TMP13:%.*]] = fptosi float [[TMP10]] to i32
4750; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP12]])
4751; CHECK-NEXT:    [[TMP15:%.*]] = call fast float @llvm.fabs.f32(float [[TMP7]])
4752; CHECK-NEXT:    [[TMP16:%.*]] = fcmp fast oge float [[TMP14]], [[TMP15]]
4753; CHECK-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP5]], i32 0
4754; CHECK-NEXT:    [[TMP18:%.*]] = add i32 [[TMP13]], [[TMP17]]
4755; CHECK-NEXT:    [[TMP19:%.*]] = mul i32 [[TMP18]], [[TMP2]]
4756; CHECK-NEXT:    [[TMP20:%.*]] = sub i32 [[TMP1]], [[TMP19]]
4757; CHECK-NEXT:    [[TMP21:%.*]] = shl i32 [[TMP20]], 29
4758; CHECK-NEXT:    [[TMP22:%.*]] = ashr i32 [[TMP21]], 29
4759; CHECK-NEXT:    [[TMP23:%.*]] = trunc i32 [[TMP22]] to i3
4760; CHECK-NEXT:    store i3 [[TMP23]], i3 addrspace(1)* [[OUT:%.*]], align 1
4761; CHECK-NEXT:    ret void
4762;
4763; GFX6-LABEL: srem_i3:
4764; GFX6:       ; %bb.0:
4765; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
4766; GFX6-NEXT:    s_load_dword s0, s[0:1], 0xb
4767; GFX6-NEXT:    s_mov_b32 s7, 0xf000
4768; GFX6-NEXT:    s_mov_b32 s6, -1
4769; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
4770; GFX6-NEXT:    s_bfe_i32 s1, s0, 0x30008
4771; GFX6-NEXT:    v_cvt_f32_i32_e32 v0, s1
4772; GFX6-NEXT:    s_bfe_i32 s3, s0, 0x30000
4773; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s3
4774; GFX6-NEXT:    s_xor_b32 s1, s3, s1
4775; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v0
4776; GFX6-NEXT:    s_ashr_i32 s1, s1, 30
4777; GFX6-NEXT:    s_or_b32 s1, s1, 1
4778; GFX6-NEXT:    v_mov_b32_e32 v3, s1
4779; GFX6-NEXT:    v_mul_f32_e32 v2, v1, v2
4780; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
4781; GFX6-NEXT:    v_mad_f32 v1, -v2, v0, v1
4782; GFX6-NEXT:    v_cvt_i32_f32_e32 v2, v2
4783; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
4784; GFX6-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
4785; GFX6-NEXT:    s_lshr_b32 s2, s0, 8
4786; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
4787; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s2
4788; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
4789; GFX6-NEXT:    v_and_b32_e32 v0, 7, v0
4790; GFX6-NEXT:    buffer_store_byte v0, off, s[4:7], 0
4791; GFX6-NEXT:    s_endpgm
4792;
4793; GFX9-LABEL: srem_i3:
4794; GFX9:       ; %bb.0:
4795; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x2c
4796; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4797; GFX9-NEXT:    s_bfe_i32 s2, s4, 0x30008
4798; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s2
4799; GFX9-NEXT:    s_bfe_i32 s3, s4, 0x30000
4800; GFX9-NEXT:    v_cvt_f32_i32_e32 v1, s3
4801; GFX9-NEXT:    s_xor_b32 s2, s3, s2
4802; GFX9-NEXT:    v_rcp_iflag_f32_e32 v2, v0
4803; GFX9-NEXT:    s_ashr_i32 s2, s2, 30
4804; GFX9-NEXT:    s_lshr_b32 s5, s4, 8
4805; GFX9-NEXT:    s_or_b32 s6, s2, 1
4806; GFX9-NEXT:    v_mul_f32_e32 v2, v1, v2
4807; GFX9-NEXT:    v_trunc_f32_e32 v2, v2
4808; GFX9-NEXT:    v_mad_f32 v1, -v2, v0, v1
4809; GFX9-NEXT:    v_cvt_i32_f32_e32 v2, v2
4810; GFX9-NEXT:    v_cmp_ge_f32_e64 s[2:3], |v1|, |v0|
4811; GFX9-NEXT:    s_and_b64 s[2:3], s[2:3], exec
4812; GFX9-NEXT:    s_cselect_b32 s2, s6, 0
4813; GFX9-NEXT:    v_add_u32_e32 v0, s2, v2
4814; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s5
4815; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4816; GFX9-NEXT:    v_mov_b32_e32 v1, 0
4817; GFX9-NEXT:    v_sub_u32_e32 v0, s4, v0
4818; GFX9-NEXT:    v_and_b32_e32 v0, 7, v0
4819; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4820; GFX9-NEXT:    global_store_byte v1, v0, s[0:1]
4821; GFX9-NEXT:    s_endpgm
4822;
4823; GFX90A-LABEL: srem_i3:
4824; GFX90A:       ; %bb.0:
4825; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
4826; GFX90A-NEXT:    s_load_dword s4, s[0:1], 0x2c
4827; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
4828; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
4829; GFX90A-NEXT:    s_bfe_i32 s0, s4, 0x30008
4830; GFX90A-NEXT:    v_cvt_f32_i32_e32 v1, s0
4831; GFX90A-NEXT:    s_bfe_i32 s1, s4, 0x30000
4832; GFX90A-NEXT:    v_cvt_f32_i32_e32 v2, s1
4833; GFX90A-NEXT:    s_xor_b32 s0, s1, s0
4834; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v3, v1
4835; GFX90A-NEXT:    s_ashr_i32 s0, s0, 30
4836; GFX90A-NEXT:    s_lshr_b32 s5, s4, 8
4837; GFX90A-NEXT:    s_or_b32 s6, s0, 1
4838; GFX90A-NEXT:    v_mul_f32_e32 v3, v2, v3
4839; GFX90A-NEXT:    v_trunc_f32_e32 v3, v3
4840; GFX90A-NEXT:    v_mad_f32 v2, -v3, v1, v2
4841; GFX90A-NEXT:    v_cvt_i32_f32_e32 v3, v3
4842; GFX90A-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v2|, |v1|
4843; GFX90A-NEXT:    s_and_b64 s[0:1], s[0:1], exec
4844; GFX90A-NEXT:    s_cselect_b32 s0, s6, 0
4845; GFX90A-NEXT:    v_add_u32_e32 v1, s0, v3
4846; GFX90A-NEXT:    v_mul_lo_u32 v1, v1, s5
4847; GFX90A-NEXT:    v_sub_u32_e32 v1, s4, v1
4848; GFX90A-NEXT:    v_and_b32_e32 v1, 7, v1
4849; GFX90A-NEXT:    global_store_byte v0, v1, s[2:3]
4850; GFX90A-NEXT:    s_endpgm
4851  %r = srem i3 %x, %y
4852  store i3 %r, i3 addrspace(1)* %out
4853  ret void
4854}
4855
4856define amdgpu_kernel void @udiv_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x, <3 x i16> %y) {
4857; CHECK-LABEL: @udiv_v3i16(
4858; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <3 x i16> [[X:%.*]], i64 0
4859; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <3 x i16> [[Y:%.*]], i64 0
4860; CHECK-NEXT:    [[TMP3:%.*]] = zext i16 [[TMP1]] to i32
4861; CHECK-NEXT:    [[TMP4:%.*]] = zext i16 [[TMP2]] to i32
4862; CHECK-NEXT:    [[TMP5:%.*]] = uitofp i32 [[TMP3]] to float
4863; CHECK-NEXT:    [[TMP6:%.*]] = uitofp i32 [[TMP4]] to float
4864; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP6]])
4865; CHECK-NEXT:    [[TMP8:%.*]] = fmul fast float [[TMP5]], [[TMP7]]
4866; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.trunc.f32(float [[TMP8]])
4867; CHECK-NEXT:    [[TMP10:%.*]] = fneg fast float [[TMP9]]
4868; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP10]], float [[TMP6]], float [[TMP5]])
4869; CHECK-NEXT:    [[TMP12:%.*]] = fptoui float [[TMP9]] to i32
4870; CHECK-NEXT:    [[TMP13:%.*]] = call fast float @llvm.fabs.f32(float [[TMP11]])
4871; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP6]])
4872; CHECK-NEXT:    [[TMP15:%.*]] = fcmp fast oge float [[TMP13]], [[TMP14]]
4873; CHECK-NEXT:    [[TMP16:%.*]] = select i1 [[TMP15]], i32 1, i32 0
4874; CHECK-NEXT:    [[TMP17:%.*]] = add i32 [[TMP12]], [[TMP16]]
4875; CHECK-NEXT:    [[TMP18:%.*]] = and i32 [[TMP17]], 65535
4876; CHECK-NEXT:    [[TMP19:%.*]] = trunc i32 [[TMP18]] to i16
4877; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <3 x i16> undef, i16 [[TMP19]], i64 0
4878; CHECK-NEXT:    [[TMP21:%.*]] = extractelement <3 x i16> [[X]], i64 1
4879; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <3 x i16> [[Y]], i64 1
4880; CHECK-NEXT:    [[TMP23:%.*]] = zext i16 [[TMP21]] to i32
4881; CHECK-NEXT:    [[TMP24:%.*]] = zext i16 [[TMP22]] to i32
4882; CHECK-NEXT:    [[TMP25:%.*]] = uitofp i32 [[TMP23]] to float
4883; CHECK-NEXT:    [[TMP26:%.*]] = uitofp i32 [[TMP24]] to float
4884; CHECK-NEXT:    [[TMP27:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP26]])
4885; CHECK-NEXT:    [[TMP28:%.*]] = fmul fast float [[TMP25]], [[TMP27]]
4886; CHECK-NEXT:    [[TMP29:%.*]] = call fast float @llvm.trunc.f32(float [[TMP28]])
4887; CHECK-NEXT:    [[TMP30:%.*]] = fneg fast float [[TMP29]]
4888; CHECK-NEXT:    [[TMP31:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP30]], float [[TMP26]], float [[TMP25]])
4889; CHECK-NEXT:    [[TMP32:%.*]] = fptoui float [[TMP29]] to i32
4890; CHECK-NEXT:    [[TMP33:%.*]] = call fast float @llvm.fabs.f32(float [[TMP31]])
4891; CHECK-NEXT:    [[TMP34:%.*]] = call fast float @llvm.fabs.f32(float [[TMP26]])
4892; CHECK-NEXT:    [[TMP35:%.*]] = fcmp fast oge float [[TMP33]], [[TMP34]]
4893; CHECK-NEXT:    [[TMP36:%.*]] = select i1 [[TMP35]], i32 1, i32 0
4894; CHECK-NEXT:    [[TMP37:%.*]] = add i32 [[TMP32]], [[TMP36]]
4895; CHECK-NEXT:    [[TMP38:%.*]] = and i32 [[TMP37]], 65535
4896; CHECK-NEXT:    [[TMP39:%.*]] = trunc i32 [[TMP38]] to i16
4897; CHECK-NEXT:    [[TMP40:%.*]] = insertelement <3 x i16> [[TMP20]], i16 [[TMP39]], i64 1
4898; CHECK-NEXT:    [[TMP41:%.*]] = extractelement <3 x i16> [[X]], i64 2
4899; CHECK-NEXT:    [[TMP42:%.*]] = extractelement <3 x i16> [[Y]], i64 2
4900; CHECK-NEXT:    [[TMP43:%.*]] = zext i16 [[TMP41]] to i32
4901; CHECK-NEXT:    [[TMP44:%.*]] = zext i16 [[TMP42]] to i32
4902; CHECK-NEXT:    [[TMP45:%.*]] = uitofp i32 [[TMP43]] to float
4903; CHECK-NEXT:    [[TMP46:%.*]] = uitofp i32 [[TMP44]] to float
4904; CHECK-NEXT:    [[TMP47:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP46]])
4905; CHECK-NEXT:    [[TMP48:%.*]] = fmul fast float [[TMP45]], [[TMP47]]
4906; CHECK-NEXT:    [[TMP49:%.*]] = call fast float @llvm.trunc.f32(float [[TMP48]])
4907; CHECK-NEXT:    [[TMP50:%.*]] = fneg fast float [[TMP49]]
4908; CHECK-NEXT:    [[TMP51:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP50]], float [[TMP46]], float [[TMP45]])
4909; CHECK-NEXT:    [[TMP52:%.*]] = fptoui float [[TMP49]] to i32
4910; CHECK-NEXT:    [[TMP53:%.*]] = call fast float @llvm.fabs.f32(float [[TMP51]])
4911; CHECK-NEXT:    [[TMP54:%.*]] = call fast float @llvm.fabs.f32(float [[TMP46]])
4912; CHECK-NEXT:    [[TMP55:%.*]] = fcmp fast oge float [[TMP53]], [[TMP54]]
4913; CHECK-NEXT:    [[TMP56:%.*]] = select i1 [[TMP55]], i32 1, i32 0
4914; CHECK-NEXT:    [[TMP57:%.*]] = add i32 [[TMP52]], [[TMP56]]
4915; CHECK-NEXT:    [[TMP58:%.*]] = and i32 [[TMP57]], 65535
4916; CHECK-NEXT:    [[TMP59:%.*]] = trunc i32 [[TMP58]] to i16
4917; CHECK-NEXT:    [[TMP60:%.*]] = insertelement <3 x i16> [[TMP40]], i16 [[TMP59]], i64 2
4918; CHECK-NEXT:    store <3 x i16> [[TMP60]], <3 x i16> addrspace(1)* [[OUT:%.*]], align 8
4919; CHECK-NEXT:    ret void
4920;
4921; GFX6-LABEL: udiv_v3i16:
4922; GFX6:       ; %bb.0:
4923; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
4924; GFX6-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
4925; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
4926; GFX6-NEXT:    s_mov_b32 s8, 0xffff
4927; GFX6-NEXT:    s_mov_b32 s7, 0xf000
4928; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
4929; GFX6-NEXT:    s_and_b32 s6, s0, s8
4930; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s6
4931; GFX6-NEXT:    s_and_b32 s6, s2, s8
4932; GFX6-NEXT:    s_lshr_b32 s0, s0, 16
4933; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s6
4934; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v0
4935; GFX6-NEXT:    v_cvt_f32_u32_e32 v3, s0
4936; GFX6-NEXT:    s_lshr_b32 s0, s2, 16
4937; GFX6-NEXT:    v_cvt_f32_u32_e32 v4, s0
4938; GFX6-NEXT:    v_mul_f32_e32 v2, v1, v2
4939; GFX6-NEXT:    v_rcp_iflag_f32_e32 v5, v3
4940; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
4941; GFX6-NEXT:    v_mad_f32 v1, -v2, v0, v1
4942; GFX6-NEXT:    v_cvt_u32_f32_e32 v2, v2
4943; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
4944; GFX6-NEXT:    v_mul_f32_e32 v1, v4, v5
4945; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
4946; GFX6-NEXT:    s_and_b32 s0, s1, s8
4947; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, 0, v2, vcc
4948; GFX6-NEXT:    v_mad_f32 v2, -v1, v3, v4
4949; GFX6-NEXT:    v_cvt_f32_u32_e32 v4, s0
4950; GFX6-NEXT:    s_and_b32 s0, s3, s8
4951; GFX6-NEXT:    v_cvt_f32_u32_e32 v5, s0
4952; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
4953; GFX6-NEXT:    v_rcp_iflag_f32_e32 v6, v4
4954; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v3
4955; GFX6-NEXT:    s_mov_b32 s6, -1
4956; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
4957; GFX6-NEXT:    v_mul_f32_e32 v2, v5, v6
4958; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
4959; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v2
4960; GFX6-NEXT:    v_mad_f32 v2, -v2, v4, v5
4961; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v4
4962; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
4963; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, 0, v3, vcc
4964; GFX6-NEXT:    v_and_b32_e32 v0, s8, v0
4965; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
4966; GFX6-NEXT:    buffer_store_short v2, off, s[4:7], 0 offset:4
4967; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
4968; GFX6-NEXT:    s_endpgm
4969;
4970; GFX9-LABEL: udiv_v3i16:
4971; GFX9:       ; %bb.0:
4972; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
4973; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
4974; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x34
4975; GFX9-NEXT:    s_mov_b32 s8, 0xffff
4976; GFX9-NEXT:    v_mov_b32_e32 v1, 0
4977; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4978; GFX9-NEXT:    s_and_b32 s0, s6, s8
4979; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s0
4980; GFX9-NEXT:    s_and_b32 s0, s4, s8
4981; GFX9-NEXT:    s_lshr_b32 s1, s6, 16
4982; GFX9-NEXT:    v_cvt_f32_u32_e32 v2, s0
4983; GFX9-NEXT:    v_rcp_iflag_f32_e32 v3, v0
4984; GFX9-NEXT:    v_cvt_f32_u32_e32 v4, s1
4985; GFX9-NEXT:    s_lshr_b32 s0, s4, 16
4986; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, s0
4987; GFX9-NEXT:    v_mul_f32_e32 v3, v2, v3
4988; GFX9-NEXT:    v_rcp_iflag_f32_e32 v6, v4
4989; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
4990; GFX9-NEXT:    v_mad_f32 v2, -v3, v0, v2
4991; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v3
4992; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v0
4993; GFX9-NEXT:    v_mul_f32_e32 v2, v5, v6
4994; GFX9-NEXT:    v_trunc_f32_e32 v2, v2
4995; GFX9-NEXT:    s_and_b32 s0, s7, s8
4996; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v3, vcc
4997; GFX9-NEXT:    v_mad_f32 v3, -v2, v4, v5
4998; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, s0
4999; GFX9-NEXT:    s_and_b32 s0, s5, s8
5000; GFX9-NEXT:    v_cvt_f32_u32_e32 v6, s0
5001; GFX9-NEXT:    v_cvt_u32_f32_e32 v2, v2
5002; GFX9-NEXT:    v_rcp_iflag_f32_e32 v7, v5
5003; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v4
5004; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
5005; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, 0, v2, vcc
5006; GFX9-NEXT:    v_mul_f32_e32 v3, v6, v7
5007; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
5008; GFX9-NEXT:    v_cvt_u32_f32_e32 v4, v3
5009; GFX9-NEXT:    v_mad_f32 v3, -v3, v5, v6
5010; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v5
5011; GFX9-NEXT:    v_lshl_or_b32 v0, v2, 16, v0
5012; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
5013; GFX9-NEXT:    global_store_short v1, v3, s[2:3] offset:4
5014; GFX9-NEXT:    global_store_dword v1, v0, s[2:3]
5015; GFX9-NEXT:    s_endpgm
5016;
5017; GFX90A-LABEL: udiv_v3i16:
5018; GFX90A:       ; %bb.0:
5019; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
5020; GFX90A-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
5021; GFX90A-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x34
5022; GFX90A-NEXT:    s_mov_b32 s8, 0xffff
5023; GFX90A-NEXT:    v_mov_b32_e32 v1, 0
5024; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
5025; GFX90A-NEXT:    s_and_b32 s0, s6, s8
5026; GFX90A-NEXT:    v_cvt_f32_u32_e32 v0, s0
5027; GFX90A-NEXT:    s_and_b32 s0, s4, s8
5028; GFX90A-NEXT:    s_lshr_b32 s1, s6, 16
5029; GFX90A-NEXT:    v_cvt_f32_u32_e32 v2, s0
5030; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v3, v0
5031; GFX90A-NEXT:    v_cvt_f32_u32_e32 v4, s1
5032; GFX90A-NEXT:    s_lshr_b32 s0, s4, 16
5033; GFX90A-NEXT:    v_cvt_f32_u32_e32 v5, s0
5034; GFX90A-NEXT:    v_mul_f32_e32 v3, v2, v3
5035; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v6, v4
5036; GFX90A-NEXT:    v_trunc_f32_e32 v3, v3
5037; GFX90A-NEXT:    v_mad_f32 v2, -v3, v0, v2
5038; GFX90A-NEXT:    v_cvt_u32_f32_e32 v3, v3
5039; GFX90A-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v0
5040; GFX90A-NEXT:    v_mul_f32_e32 v2, v5, v6
5041; GFX90A-NEXT:    v_trunc_f32_e32 v2, v2
5042; GFX90A-NEXT:    s_and_b32 s0, s7, s8
5043; GFX90A-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v3, vcc
5044; GFX90A-NEXT:    v_mad_f32 v3, -v2, v4, v5
5045; GFX90A-NEXT:    v_cvt_f32_u32_e32 v5, s0
5046; GFX90A-NEXT:    s_and_b32 s0, s5, s8
5047; GFX90A-NEXT:    v_cvt_f32_u32_e32 v6, s0
5048; GFX90A-NEXT:    v_cvt_u32_f32_e32 v2, v2
5049; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v7, v5
5050; GFX90A-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v4
5051; GFX90A-NEXT:    v_and_b32_e32 v0, 0xffff, v0
5052; GFX90A-NEXT:    v_addc_co_u32_e32 v2, vcc, 0, v2, vcc
5053; GFX90A-NEXT:    v_mul_f32_e32 v3, v6, v7
5054; GFX90A-NEXT:    v_trunc_f32_e32 v3, v3
5055; GFX90A-NEXT:    v_cvt_u32_f32_e32 v4, v3
5056; GFX90A-NEXT:    v_mad_f32 v3, -v3, v5, v6
5057; GFX90A-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v5
5058; GFX90A-NEXT:    v_lshl_or_b32 v0, v2, 16, v0
5059; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
5060; GFX90A-NEXT:    global_store_short v1, v3, s[2:3] offset:4
5061; GFX90A-NEXT:    global_store_dword v1, v0, s[2:3]
5062; GFX90A-NEXT:    s_endpgm
5063  %r = udiv <3 x i16> %x, %y
5064  store <3 x i16> %r, <3 x i16> addrspace(1)* %out
5065  ret void
5066}
5067
; urem_v3i16: unsigned remainder of two <3 x i16> kernel arguments, with the
; result stored through %out (an addrspace(1) pointer).
;
; The assertions below are autogenerated (see the NOTE lines at the top of the
; file); regenerate them with the update scripts rather than editing by hand.
;   * CHECK  - IR after `opt -amdgpu-codegenprepare`. Each of the three lanes is
;              extracted, zero-extended to i32 and converted to float; a
;              quotient is formed from rcp/fmul/trunc and adjusted by 0 or 1
;              using the fmad.ftz result compared against the divisor; the
;              remainder is then x - q * y, masked with 65535, truncated to
;              i16 and inserted back into the result vector.
;   * GFX6 / GFX9 / GFX90A - `llc` output for -mcpu=tahiti, gfx900 and gfx90a.
;              In all three the result is written as one dword store (lanes 0
;              and 1 packed together) plus one short store at offset 4 (lane 2).
5068define amdgpu_kernel void @urem_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x, <3 x i16> %y) {
5069; CHECK-LABEL: @urem_v3i16(
5070; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <3 x i16> [[X:%.*]], i64 0
5071; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <3 x i16> [[Y:%.*]], i64 0
5072; CHECK-NEXT:    [[TMP3:%.*]] = zext i16 [[TMP1]] to i32
5073; CHECK-NEXT:    [[TMP4:%.*]] = zext i16 [[TMP2]] to i32
5074; CHECK-NEXT:    [[TMP5:%.*]] = uitofp i32 [[TMP3]] to float
5075; CHECK-NEXT:    [[TMP6:%.*]] = uitofp i32 [[TMP4]] to float
5076; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP6]])
5077; CHECK-NEXT:    [[TMP8:%.*]] = fmul fast float [[TMP5]], [[TMP7]]
5078; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.trunc.f32(float [[TMP8]])
5079; CHECK-NEXT:    [[TMP10:%.*]] = fneg fast float [[TMP9]]
5080; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP10]], float [[TMP6]], float [[TMP5]])
5081; CHECK-NEXT:    [[TMP12:%.*]] = fptoui float [[TMP9]] to i32
5082; CHECK-NEXT:    [[TMP13:%.*]] = call fast float @llvm.fabs.f32(float [[TMP11]])
5083; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP6]])
5084; CHECK-NEXT:    [[TMP15:%.*]] = fcmp fast oge float [[TMP13]], [[TMP14]]
5085; CHECK-NEXT:    [[TMP16:%.*]] = select i1 [[TMP15]], i32 1, i32 0
5086; CHECK-NEXT:    [[TMP17:%.*]] = add i32 [[TMP12]], [[TMP16]]
5087; CHECK-NEXT:    [[TMP18:%.*]] = mul i32 [[TMP17]], [[TMP4]]
5088; CHECK-NEXT:    [[TMP19:%.*]] = sub i32 [[TMP3]], [[TMP18]]
5089; CHECK-NEXT:    [[TMP20:%.*]] = and i32 [[TMP19]], 65535
5090; CHECK-NEXT:    [[TMP21:%.*]] = trunc i32 [[TMP20]] to i16
5091; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <3 x i16> undef, i16 [[TMP21]], i64 0
5092; CHECK-NEXT:    [[TMP23:%.*]] = extractelement <3 x i16> [[X]], i64 1
5093; CHECK-NEXT:    [[TMP24:%.*]] = extractelement <3 x i16> [[Y]], i64 1
5094; CHECK-NEXT:    [[TMP25:%.*]] = zext i16 [[TMP23]] to i32
5095; CHECK-NEXT:    [[TMP26:%.*]] = zext i16 [[TMP24]] to i32
5096; CHECK-NEXT:    [[TMP27:%.*]] = uitofp i32 [[TMP25]] to float
5097; CHECK-NEXT:    [[TMP28:%.*]] = uitofp i32 [[TMP26]] to float
5098; CHECK-NEXT:    [[TMP29:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP28]])
5099; CHECK-NEXT:    [[TMP30:%.*]] = fmul fast float [[TMP27]], [[TMP29]]
5100; CHECK-NEXT:    [[TMP31:%.*]] = call fast float @llvm.trunc.f32(float [[TMP30]])
5101; CHECK-NEXT:    [[TMP32:%.*]] = fneg fast float [[TMP31]]
5102; CHECK-NEXT:    [[TMP33:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP32]], float [[TMP28]], float [[TMP27]])
5103; CHECK-NEXT:    [[TMP34:%.*]] = fptoui float [[TMP31]] to i32
5104; CHECK-NEXT:    [[TMP35:%.*]] = call fast float @llvm.fabs.f32(float [[TMP33]])
5105; CHECK-NEXT:    [[TMP36:%.*]] = call fast float @llvm.fabs.f32(float [[TMP28]])
5106; CHECK-NEXT:    [[TMP37:%.*]] = fcmp fast oge float [[TMP35]], [[TMP36]]
5107; CHECK-NEXT:    [[TMP38:%.*]] = select i1 [[TMP37]], i32 1, i32 0
5108; CHECK-NEXT:    [[TMP39:%.*]] = add i32 [[TMP34]], [[TMP38]]
5109; CHECK-NEXT:    [[TMP40:%.*]] = mul i32 [[TMP39]], [[TMP26]]
5110; CHECK-NEXT:    [[TMP41:%.*]] = sub i32 [[TMP25]], [[TMP40]]
5111; CHECK-NEXT:    [[TMP42:%.*]] = and i32 [[TMP41]], 65535
5112; CHECK-NEXT:    [[TMP43:%.*]] = trunc i32 [[TMP42]] to i16
5113; CHECK-NEXT:    [[TMP44:%.*]] = insertelement <3 x i16> [[TMP22]], i16 [[TMP43]], i64 1
5114; CHECK-NEXT:    [[TMP45:%.*]] = extractelement <3 x i16> [[X]], i64 2
5115; CHECK-NEXT:    [[TMP46:%.*]] = extractelement <3 x i16> [[Y]], i64 2
5116; CHECK-NEXT:    [[TMP47:%.*]] = zext i16 [[TMP45]] to i32
5117; CHECK-NEXT:    [[TMP48:%.*]] = zext i16 [[TMP46]] to i32
5118; CHECK-NEXT:    [[TMP49:%.*]] = uitofp i32 [[TMP47]] to float
5119; CHECK-NEXT:    [[TMP50:%.*]] = uitofp i32 [[TMP48]] to float
5120; CHECK-NEXT:    [[TMP51:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP50]])
5121; CHECK-NEXT:    [[TMP52:%.*]] = fmul fast float [[TMP49]], [[TMP51]]
5122; CHECK-NEXT:    [[TMP53:%.*]] = call fast float @llvm.trunc.f32(float [[TMP52]])
5123; CHECK-NEXT:    [[TMP54:%.*]] = fneg fast float [[TMP53]]
5124; CHECK-NEXT:    [[TMP55:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP54]], float [[TMP50]], float [[TMP49]])
5125; CHECK-NEXT:    [[TMP56:%.*]] = fptoui float [[TMP53]] to i32
5126; CHECK-NEXT:    [[TMP57:%.*]] = call fast float @llvm.fabs.f32(float [[TMP55]])
5127; CHECK-NEXT:    [[TMP58:%.*]] = call fast float @llvm.fabs.f32(float [[TMP50]])
5128; CHECK-NEXT:    [[TMP59:%.*]] = fcmp fast oge float [[TMP57]], [[TMP58]]
5129; CHECK-NEXT:    [[TMP60:%.*]] = select i1 [[TMP59]], i32 1, i32 0
5130; CHECK-NEXT:    [[TMP61:%.*]] = add i32 [[TMP56]], [[TMP60]]
5131; CHECK-NEXT:    [[TMP62:%.*]] = mul i32 [[TMP61]], [[TMP48]]
5132; CHECK-NEXT:    [[TMP63:%.*]] = sub i32 [[TMP47]], [[TMP62]]
5133; CHECK-NEXT:    [[TMP64:%.*]] = and i32 [[TMP63]], 65535
5134; CHECK-NEXT:    [[TMP65:%.*]] = trunc i32 [[TMP64]] to i16
5135; CHECK-NEXT:    [[TMP66:%.*]] = insertelement <3 x i16> [[TMP44]], i16 [[TMP65]], i64 2
5136; CHECK-NEXT:    store <3 x i16> [[TMP66]], <3 x i16> addrspace(1)* [[OUT:%.*]], align 8
5137; CHECK-NEXT:    ret void
5138;
5139; GFX6-LABEL: urem_v3i16:
5140; GFX6:       ; %bb.0:
5141; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
5142; GFX6-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
5143; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
5144; GFX6-NEXT:    s_mov_b32 s8, 0xffff
5145; GFX6-NEXT:    s_mov_b32 s7, 0xf000
5146; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
5147; GFX6-NEXT:    v_mov_b32_e32 v1, s2
5148; GFX6-NEXT:    s_and_b32 s6, s0, s8
5149; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s6
5150; GFX6-NEXT:    s_and_b32 s6, s2, s8
5151; GFX6-NEXT:    v_cvt_f32_u32_e32 v2, s6
5152; GFX6-NEXT:    v_mov_b32_e32 v4, s0
5153; GFX6-NEXT:    v_rcp_iflag_f32_e32 v3, v0
5154; GFX6-NEXT:    v_alignbit_b32 v4, s1, v4, 16
5155; GFX6-NEXT:    v_and_b32_e32 v5, s8, v4
5156; GFX6-NEXT:    v_alignbit_b32 v1, s3, v1, 16
5157; GFX6-NEXT:    v_mul_f32_e32 v3, v2, v3
5158; GFX6-NEXT:    v_trunc_f32_e32 v3, v3
5159; GFX6-NEXT:    v_cvt_u32_f32_e32 v6, v3
5160; GFX6-NEXT:    v_mad_f32 v2, -v3, v0, v2
5161; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v0
5162; GFX6-NEXT:    v_cvt_f32_u32_e32 v2, v5
5163; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, 0, v6, vcc
5164; GFX6-NEXT:    v_and_b32_e32 v3, s8, v1
5165; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s0
5166; GFX6-NEXT:    v_cvt_f32_u32_e32 v3, v3
5167; GFX6-NEXT:    v_rcp_iflag_f32_e32 v5, v2
5168; GFX6-NEXT:    s_and_b32 s0, s1, s8
5169; GFX6-NEXT:    v_cvt_f32_u32_e32 v6, s0
5170; GFX6-NEXT:    s_and_b32 s0, s3, s8
5171; GFX6-NEXT:    v_mul_f32_e32 v5, v3, v5
5172; GFX6-NEXT:    v_trunc_f32_e32 v5, v5
5173; GFX6-NEXT:    v_cvt_f32_u32_e32 v7, s0
5174; GFX6-NEXT:    v_rcp_iflag_f32_e32 v8, v6
5175; GFX6-NEXT:    v_mad_f32 v3, -v5, v2, v3
5176; GFX6-NEXT:    v_cvt_u32_f32_e32 v5, v5
5177; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s2, v0
5178; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v2
5179; GFX6-NEXT:    v_mul_f32_e32 v3, v7, v8
5180; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, 0, v5, vcc
5181; GFX6-NEXT:    v_trunc_f32_e32 v3, v3
5182; GFX6-NEXT:    v_mul_lo_u32 v2, v2, v4
5183; GFX6-NEXT:    v_cvt_u32_f32_e32 v4, v3
5184; GFX6-NEXT:    v_mad_f32 v3, -v3, v6, v7
5185; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v6
5186; GFX6-NEXT:    s_mov_b32 s6, -1
5187; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
5188; GFX6-NEXT:    v_mul_lo_u32 v3, v3, s1
5189; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v2
5190; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
5191; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s3, v3
5192; GFX6-NEXT:    v_and_b32_e32 v0, s8, v0
5193; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
5194; GFX6-NEXT:    buffer_store_short v2, off, s[4:7], 0 offset:4
5195; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
5196; GFX6-NEXT:    s_endpgm
5197;
5198; GFX9-LABEL: urem_v3i16:
5199; GFX9:       ; %bb.0:
5200; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
5201; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
5202; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x34
5203; GFX9-NEXT:    s_mov_b32 s8, 0xffff
5204; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5205; GFX9-NEXT:    s_and_b32 s0, s4, s8
5206; GFX9-NEXT:    s_and_b32 s1, s6, s8
5207; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s1
5208; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s0
5209; GFX9-NEXT:    s_lshr_b32 s6, s6, 16
5210; GFX9-NEXT:    v_cvt_f32_u32_e32 v2, s6
5211; GFX9-NEXT:    v_rcp_iflag_f32_e32 v3, v0
5212; GFX9-NEXT:    s_lshr_b32 s4, s4, 16
5213; GFX9-NEXT:    v_cvt_f32_u32_e32 v4, s4
5214; GFX9-NEXT:    v_rcp_iflag_f32_e32 v5, v2
5215; GFX9-NEXT:    v_mul_f32_e32 v3, v1, v3
5216; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
5217; GFX9-NEXT:    v_cvt_u32_f32_e32 v6, v3
5218; GFX9-NEXT:    v_mad_f32 v1, -v3, v0, v1
5219; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
5220; GFX9-NEXT:    v_mul_f32_e32 v1, v4, v5
5221; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v6, vcc
5222; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s1
5223; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
5224; GFX9-NEXT:    s_and_b32 s1, s7, s8
5225; GFX9-NEXT:    v_mad_f32 v3, -v1, v2, v4
5226; GFX9-NEXT:    v_cvt_f32_u32_e32 v4, s1
5227; GFX9-NEXT:    s_and_b32 s5, s5, s8
5228; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, s5
5229; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
5230; GFX9-NEXT:    v_rcp_iflag_f32_e32 v6, v4
5231; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v2
5232; GFX9-NEXT:    v_sub_u32_e32 v0, s0, v0
5233; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
5234; GFX9-NEXT:    v_mul_f32_e32 v2, v5, v6
5235; GFX9-NEXT:    v_trunc_f32_e32 v2, v2
5236; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v2
5237; GFX9-NEXT:    v_mad_f32 v2, -v2, v4, v5
5238; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v4
5239; GFX9-NEXT:    v_mul_lo_u32 v1, v1, s6
5240; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, 0, v3, vcc
5241; GFX9-NEXT:    v_mul_lo_u32 v2, v2, s1
5242; GFX9-NEXT:    v_mov_b32_e32 v3, 0
5243; GFX9-NEXT:    v_sub_u32_e32 v1, s4, v1
5244; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
5245; GFX9-NEXT:    v_sub_u32_e32 v2, s5, v2
5246; GFX9-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
5247; GFX9-NEXT:    global_store_short v3, v2, s[2:3] offset:4
5248; GFX9-NEXT:    global_store_dword v3, v0, s[2:3]
5249; GFX9-NEXT:    s_endpgm
5250;
5251; GFX90A-LABEL: urem_v3i16:
5252; GFX90A:       ; %bb.0:
5253; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
5254; GFX90A-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
5255; GFX90A-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x34
5256; GFX90A-NEXT:    s_mov_b32 s8, 0xffff
5257; GFX90A-NEXT:    v_mov_b32_e32 v1, 0
5258; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
5259; GFX90A-NEXT:    s_and_b32 s1, s4, s8
5260; GFX90A-NEXT:    s_and_b32 s0, s6, s8
5261; GFX90A-NEXT:    v_cvt_f32_u32_e32 v0, s0
5262; GFX90A-NEXT:    v_cvt_f32_u32_e32 v2, s1
5263; GFX90A-NEXT:    s_lshr_b32 s6, s6, 16
5264; GFX90A-NEXT:    v_cvt_f32_u32_e32 v4, s6
5265; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v3, v0
5266; GFX90A-NEXT:    s_lshr_b32 s4, s4, 16
5267; GFX90A-NEXT:    v_cvt_f32_u32_e32 v5, s4
5268; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v6, v4
5269; GFX90A-NEXT:    v_mul_f32_e32 v3, v2, v3
5270; GFX90A-NEXT:    v_trunc_f32_e32 v3, v3
5271; GFX90A-NEXT:    v_mad_f32 v2, -v3, v0, v2
5272; GFX90A-NEXT:    v_cvt_u32_f32_e32 v3, v3
5273; GFX90A-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v0
5274; GFX90A-NEXT:    v_mul_f32_e32 v2, v5, v6
5275; GFX90A-NEXT:    v_trunc_f32_e32 v2, v2
5276; GFX90A-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v3, vcc
5277; GFX90A-NEXT:    v_mul_lo_u32 v0, v0, s0
5278; GFX90A-NEXT:    s_and_b32 s0, s7, s8
5279; GFX90A-NEXT:    v_mad_f32 v3, -v2, v4, v5
5280; GFX90A-NEXT:    v_cvt_f32_u32_e32 v5, s0
5281; GFX90A-NEXT:    v_sub_u32_e32 v0, s1, v0
5282; GFX90A-NEXT:    s_and_b32 s1, s5, s8
5283; GFX90A-NEXT:    v_cvt_f32_u32_e32 v6, s1
5284; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v7, v5
5285; GFX90A-NEXT:    v_cvt_u32_f32_e32 v2, v2
5286; GFX90A-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v4
5287; GFX90A-NEXT:    v_and_b32_e32 v0, 0xffff, v0
5288; GFX90A-NEXT:    v_mul_f32_e32 v3, v6, v7
5289; GFX90A-NEXT:    v_trunc_f32_e32 v3, v3
5290; GFX90A-NEXT:    v_cvt_u32_f32_e32 v4, v3
5291; GFX90A-NEXT:    v_addc_co_u32_e32 v2, vcc, 0, v2, vcc
5292; GFX90A-NEXT:    v_mad_f32 v3, -v3, v5, v6
5293; GFX90A-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v5
5294; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
5295; GFX90A-NEXT:    v_mul_lo_u32 v2, v2, s6
5296; GFX90A-NEXT:    v_mul_lo_u32 v3, v3, s0
5297; GFX90A-NEXT:    v_sub_u32_e32 v2, s4, v2
5298; GFX90A-NEXT:    v_sub_u32_e32 v3, s1, v3
5299; GFX90A-NEXT:    v_lshl_or_b32 v0, v2, 16, v0
5300; GFX90A-NEXT:    global_store_short v1, v3, s[2:3] offset:4
5301; GFX90A-NEXT:    global_store_dword v1, v0, s[2:3]
5302; GFX90A-NEXT:    s_endpgm
  ; Input IR under test: a single vector urem whose result is stored to %out.
5303  %r = urem <3 x i16> %x, %y
5304  store <3 x i16> %r, <3 x i16> addrspace(1)* %out
5305  ret void
5306}
5307
; sdiv_v3i16: signed division of two <3 x i16> kernel arguments, with the
; result stored through %out (an addrspace(1) pointer).
;
; The assertions below are autogenerated (see the NOTE lines at the top of the
; file); regenerate them with the update scripts rather than editing by hand.
;   * CHECK  - IR after `opt -amdgpu-codegenprepare`. Each of the three lanes is
;              extracted, sign-extended to i32 and converted to float with
;              sitofp. `xor` of the two operands followed by `ashr 30` and
;              `or 1` yields the correction term (+1 when the operand signs
;              agree, -1 when they differ). The quotient comes from
;              rcp/fmul/trunc/fptosi, and the correction is added when
;              |fmad.ftz result| >= |divisor|. The sum is re-sign-extended from
;              16 bits (shl 16 / ashr 16), truncated to i16 and inserted back
;              into the result vector.
;   * GFX6 / GFX9 / GFX90A - `llc` output for -mcpu=tahiti, gfx900 and gfx90a.
;              GFX6 selects the correction with v_cndmask_b32, whereas the
;              GFX9 and GFX90A output uses s_and_b64 ... exec + s_cselect_b32.
;              In all three the result is written as one dword store (lanes 0
;              and 1 packed together) plus one short store at offset 4 (lane 2).
5308define amdgpu_kernel void @sdiv_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x, <3 x i16> %y) {
5309; CHECK-LABEL: @sdiv_v3i16(
5310; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <3 x i16> [[X:%.*]], i64 0
5311; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <3 x i16> [[Y:%.*]], i64 0
5312; CHECK-NEXT:    [[TMP3:%.*]] = sext i16 [[TMP1]] to i32
5313; CHECK-NEXT:    [[TMP4:%.*]] = sext i16 [[TMP2]] to i32
5314; CHECK-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]]
5315; CHECK-NEXT:    [[TMP6:%.*]] = ashr i32 [[TMP5]], 30
5316; CHECK-NEXT:    [[TMP7:%.*]] = or i32 [[TMP6]], 1
5317; CHECK-NEXT:    [[TMP8:%.*]] = sitofp i32 [[TMP3]] to float
5318; CHECK-NEXT:    [[TMP9:%.*]] = sitofp i32 [[TMP4]] to float
5319; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]])
5320; CHECK-NEXT:    [[TMP11:%.*]] = fmul fast float [[TMP8]], [[TMP10]]
5321; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.trunc.f32(float [[TMP11]])
5322; CHECK-NEXT:    [[TMP13:%.*]] = fneg fast float [[TMP12]]
5323; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP13]], float [[TMP9]], float [[TMP8]])
5324; CHECK-NEXT:    [[TMP15:%.*]] = fptosi float [[TMP12]] to i32
5325; CHECK-NEXT:    [[TMP16:%.*]] = call fast float @llvm.fabs.f32(float [[TMP14]])
5326; CHECK-NEXT:    [[TMP17:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
5327; CHECK-NEXT:    [[TMP18:%.*]] = fcmp fast oge float [[TMP16]], [[TMP17]]
5328; CHECK-NEXT:    [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP7]], i32 0
5329; CHECK-NEXT:    [[TMP20:%.*]] = add i32 [[TMP15]], [[TMP19]]
5330; CHECK-NEXT:    [[TMP21:%.*]] = shl i32 [[TMP20]], 16
5331; CHECK-NEXT:    [[TMP22:%.*]] = ashr i32 [[TMP21]], 16
5332; CHECK-NEXT:    [[TMP23:%.*]] = trunc i32 [[TMP22]] to i16
5333; CHECK-NEXT:    [[TMP24:%.*]] = insertelement <3 x i16> undef, i16 [[TMP23]], i64 0
5334; CHECK-NEXT:    [[TMP25:%.*]] = extractelement <3 x i16> [[X]], i64 1
5335; CHECK-NEXT:    [[TMP26:%.*]] = extractelement <3 x i16> [[Y]], i64 1
5336; CHECK-NEXT:    [[TMP27:%.*]] = sext i16 [[TMP25]] to i32
5337; CHECK-NEXT:    [[TMP28:%.*]] = sext i16 [[TMP26]] to i32
5338; CHECK-NEXT:    [[TMP29:%.*]] = xor i32 [[TMP27]], [[TMP28]]
5339; CHECK-NEXT:    [[TMP30:%.*]] = ashr i32 [[TMP29]], 30
5340; CHECK-NEXT:    [[TMP31:%.*]] = or i32 [[TMP30]], 1
5341; CHECK-NEXT:    [[TMP32:%.*]] = sitofp i32 [[TMP27]] to float
5342; CHECK-NEXT:    [[TMP33:%.*]] = sitofp i32 [[TMP28]] to float
5343; CHECK-NEXT:    [[TMP34:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP33]])
5344; CHECK-NEXT:    [[TMP35:%.*]] = fmul fast float [[TMP32]], [[TMP34]]
5345; CHECK-NEXT:    [[TMP36:%.*]] = call fast float @llvm.trunc.f32(float [[TMP35]])
5346; CHECK-NEXT:    [[TMP37:%.*]] = fneg fast float [[TMP36]]
5347; CHECK-NEXT:    [[TMP38:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP37]], float [[TMP33]], float [[TMP32]])
5348; CHECK-NEXT:    [[TMP39:%.*]] = fptosi float [[TMP36]] to i32
5349; CHECK-NEXT:    [[TMP40:%.*]] = call fast float @llvm.fabs.f32(float [[TMP38]])
5350; CHECK-NEXT:    [[TMP41:%.*]] = call fast float @llvm.fabs.f32(float [[TMP33]])
5351; CHECK-NEXT:    [[TMP42:%.*]] = fcmp fast oge float [[TMP40]], [[TMP41]]
5352; CHECK-NEXT:    [[TMP43:%.*]] = select i1 [[TMP42]], i32 [[TMP31]], i32 0
5353; CHECK-NEXT:    [[TMP44:%.*]] = add i32 [[TMP39]], [[TMP43]]
5354; CHECK-NEXT:    [[TMP45:%.*]] = shl i32 [[TMP44]], 16
5355; CHECK-NEXT:    [[TMP46:%.*]] = ashr i32 [[TMP45]], 16
5356; CHECK-NEXT:    [[TMP47:%.*]] = trunc i32 [[TMP46]] to i16
5357; CHECK-NEXT:    [[TMP48:%.*]] = insertelement <3 x i16> [[TMP24]], i16 [[TMP47]], i64 1
5358; CHECK-NEXT:    [[TMP49:%.*]] = extractelement <3 x i16> [[X]], i64 2
5359; CHECK-NEXT:    [[TMP50:%.*]] = extractelement <3 x i16> [[Y]], i64 2
5360; CHECK-NEXT:    [[TMP51:%.*]] = sext i16 [[TMP49]] to i32
5361; CHECK-NEXT:    [[TMP52:%.*]] = sext i16 [[TMP50]] to i32
5362; CHECK-NEXT:    [[TMP53:%.*]] = xor i32 [[TMP51]], [[TMP52]]
5363; CHECK-NEXT:    [[TMP54:%.*]] = ashr i32 [[TMP53]], 30
5364; CHECK-NEXT:    [[TMP55:%.*]] = or i32 [[TMP54]], 1
5365; CHECK-NEXT:    [[TMP56:%.*]] = sitofp i32 [[TMP51]] to float
5366; CHECK-NEXT:    [[TMP57:%.*]] = sitofp i32 [[TMP52]] to float
5367; CHECK-NEXT:    [[TMP58:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP57]])
5368; CHECK-NEXT:    [[TMP59:%.*]] = fmul fast float [[TMP56]], [[TMP58]]
5369; CHECK-NEXT:    [[TMP60:%.*]] = call fast float @llvm.trunc.f32(float [[TMP59]])
5370; CHECK-NEXT:    [[TMP61:%.*]] = fneg fast float [[TMP60]]
5371; CHECK-NEXT:    [[TMP62:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP61]], float [[TMP57]], float [[TMP56]])
5372; CHECK-NEXT:    [[TMP63:%.*]] = fptosi float [[TMP60]] to i32
5373; CHECK-NEXT:    [[TMP64:%.*]] = call fast float @llvm.fabs.f32(float [[TMP62]])
5374; CHECK-NEXT:    [[TMP65:%.*]] = call fast float @llvm.fabs.f32(float [[TMP57]])
5375; CHECK-NEXT:    [[TMP66:%.*]] = fcmp fast oge float [[TMP64]], [[TMP65]]
5376; CHECK-NEXT:    [[TMP67:%.*]] = select i1 [[TMP66]], i32 [[TMP55]], i32 0
5377; CHECK-NEXT:    [[TMP68:%.*]] = add i32 [[TMP63]], [[TMP67]]
5378; CHECK-NEXT:    [[TMP69:%.*]] = shl i32 [[TMP68]], 16
5379; CHECK-NEXT:    [[TMP70:%.*]] = ashr i32 [[TMP69]], 16
5380; CHECK-NEXT:    [[TMP71:%.*]] = trunc i32 [[TMP70]] to i16
5381; CHECK-NEXT:    [[TMP72:%.*]] = insertelement <3 x i16> [[TMP48]], i16 [[TMP71]], i64 2
5382; CHECK-NEXT:    store <3 x i16> [[TMP72]], <3 x i16> addrspace(1)* [[OUT:%.*]], align 8
5383; CHECK-NEXT:    ret void
5384;
5385; GFX6-LABEL: sdiv_v3i16:
5386; GFX6:       ; %bb.0:
5387; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
5388; GFX6-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
5389; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
5390; GFX6-NEXT:    s_mov_b32 s7, 0xf000
5391; GFX6-NEXT:    s_mov_b32 s6, -1
5392; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
5393; GFX6-NEXT:    s_sext_i32_i16 s9, s2
5394; GFX6-NEXT:    s_sext_i32_i16 s8, s0
5395; GFX6-NEXT:    v_cvt_f32_i32_e32 v0, s8
5396; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s9
5397; GFX6-NEXT:    s_xor_b32 s8, s9, s8
5398; GFX6-NEXT:    s_ashr_i32 s0, s0, 16
5399; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v0
5400; GFX6-NEXT:    s_ashr_i32 s8, s8, 30
5401; GFX6-NEXT:    s_or_b32 s8, s8, 1
5402; GFX6-NEXT:    v_mov_b32_e32 v3, s8
5403; GFX6-NEXT:    v_mul_f32_e32 v2, v1, v2
5404; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
5405; GFX6-NEXT:    v_mad_f32 v1, -v2, v0, v1
5406; GFX6-NEXT:    v_cvt_i32_f32_e32 v2, v2
5407; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
5408; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s0
5409; GFX6-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
5410; GFX6-NEXT:    s_ashr_i32 s2, s2, 16
5411; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
5412; GFX6-NEXT:    v_cvt_f32_i32_e32 v2, s2
5413; GFX6-NEXT:    v_rcp_iflag_f32_e32 v3, v1
5414; GFX6-NEXT:    s_xor_b32 s0, s2, s0
5415; GFX6-NEXT:    s_ashr_i32 s0, s0, 30
5416; GFX6-NEXT:    s_or_b32 s0, s0, 1
5417; GFX6-NEXT:    v_mul_f32_e32 v3, v2, v3
5418; GFX6-NEXT:    v_trunc_f32_e32 v3, v3
5419; GFX6-NEXT:    v_mad_f32 v2, -v3, v1, v2
5420; GFX6-NEXT:    v_mov_b32_e32 v4, s0
5421; GFX6-NEXT:    s_sext_i32_i16 s0, s1
5422; GFX6-NEXT:    v_cvt_i32_f32_e32 v3, v3
5423; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, |v1|
5424; GFX6-NEXT:    v_cvt_f32_i32_e32 v2, s0
5425; GFX6-NEXT:    v_cndmask_b32_e32 v1, 0, v4, vcc
5426; GFX6-NEXT:    s_sext_i32_i16 s1, s3
5427; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
5428; GFX6-NEXT:    v_cvt_f32_i32_e32 v3, s1
5429; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v2
5430; GFX6-NEXT:    s_xor_b32 s0, s1, s0
5431; GFX6-NEXT:    s_ashr_i32 s0, s0, 30
5432; GFX6-NEXT:    s_or_b32 s0, s0, 1
5433; GFX6-NEXT:    v_mul_f32_e32 v4, v3, v4
5434; GFX6-NEXT:    v_trunc_f32_e32 v4, v4
5435; GFX6-NEXT:    v_mad_f32 v3, -v4, v2, v3
5436; GFX6-NEXT:    v_cvt_i32_f32_e32 v4, v4
5437; GFX6-NEXT:    v_mov_b32_e32 v5, s0
5438; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, |v2|
5439; GFX6-NEXT:    v_cndmask_b32_e32 v2, 0, v5, vcc
5440; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
5441; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
5442; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
5443; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
5444; GFX6-NEXT:    buffer_store_short v2, off, s[4:7], 0 offset:4
5445; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
5446; GFX6-NEXT:    s_endpgm
5447;
5448; GFX9-LABEL: sdiv_v3i16:
5449; GFX9:       ; %bb.0:
5450; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
5451; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
5452; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x34
5453; GFX9-NEXT:    v_mov_b32_e32 v1, 0
5454; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5455; GFX9-NEXT:    s_sext_i32_i16 s1, s4
5456; GFX9-NEXT:    s_sext_i32_i16 s0, s6
5457; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s0
5458; GFX9-NEXT:    v_cvt_f32_i32_e32 v2, s1
5459; GFX9-NEXT:    s_xor_b32 s0, s1, s0
5460; GFX9-NEXT:    s_ashr_i32 s0, s0, 30
5461; GFX9-NEXT:    v_rcp_iflag_f32_e32 v3, v0
5462; GFX9-NEXT:    s_or_b32 s8, s0, 1
5463; GFX9-NEXT:    v_mul_f32_e32 v3, v2, v3
5464; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
5465; GFX9-NEXT:    v_mad_f32 v2, -v3, v0, v2
5466; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v2|, |v0|
5467; GFX9-NEXT:    s_and_b64 s[0:1], s[0:1], exec
5468; GFX9-NEXT:    s_cselect_b32 s0, s8, 0
5469; GFX9-NEXT:    s_ashr_i32 s1, s6, 16
5470; GFX9-NEXT:    v_cvt_i32_f32_e32 v3, v3
5471; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s1
5472; GFX9-NEXT:    s_ashr_i32 s4, s4, 16
5473; GFX9-NEXT:    v_add_u32_e32 v2, s0, v3
5474; GFX9-NEXT:    v_cvt_f32_i32_e32 v3, s4
5475; GFX9-NEXT:    v_rcp_iflag_f32_e32 v4, v0
5476; GFX9-NEXT:    s_xor_b32 s0, s4, s1
5477; GFX9-NEXT:    s_ashr_i32 s0, s0, 30
5478; GFX9-NEXT:    s_or_b32 s4, s0, 1
5479; GFX9-NEXT:    v_mul_f32_e32 v4, v3, v4
5480; GFX9-NEXT:    v_trunc_f32_e32 v4, v4
5481; GFX9-NEXT:    v_mad_f32 v3, -v4, v0, v3
5482; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v3|, |v0|
5483; GFX9-NEXT:    s_and_b64 s[0:1], s[0:1], exec
5484; GFX9-NEXT:    v_cvt_i32_f32_e32 v4, v4
5485; GFX9-NEXT:    s_sext_i32_i16 s1, s7
5486; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s1
5487; GFX9-NEXT:    s_cselect_b32 s0, s4, 0
5488; GFX9-NEXT:    v_add_u32_e32 v3, s0, v4
5489; GFX9-NEXT:    s_sext_i32_i16 s0, s5
5490; GFX9-NEXT:    v_cvt_f32_i32_e32 v4, s0
5491; GFX9-NEXT:    v_rcp_iflag_f32_e32 v5, v0
5492; GFX9-NEXT:    s_xor_b32 s0, s0, s1
5493; GFX9-NEXT:    s_ashr_i32 s0, s0, 30
5494; GFX9-NEXT:    s_or_b32 s4, s0, 1
5495; GFX9-NEXT:    v_mul_f32_e32 v5, v4, v5
5496; GFX9-NEXT:    v_trunc_f32_e32 v5, v5
5497; GFX9-NEXT:    v_mad_f32 v4, -v5, v0, v4
5498; GFX9-NEXT:    v_cvt_i32_f32_e32 v5, v5
5499; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v4|, |v0|
5500; GFX9-NEXT:    s_and_b64 s[0:1], s[0:1], exec
5501; GFX9-NEXT:    s_cselect_b32 s0, s4, 0
5502; GFX9-NEXT:    v_add_u32_e32 v0, s0, v5
5503; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff, v2
5504; GFX9-NEXT:    v_lshl_or_b32 v2, v3, 16, v2
5505; GFX9-NEXT:    global_store_short v1, v0, s[2:3] offset:4
5506; GFX9-NEXT:    global_store_dword v1, v2, s[2:3]
5507; GFX9-NEXT:    s_endpgm
5508;
5509; GFX90A-LABEL: sdiv_v3i16:
5510; GFX90A:       ; %bb.0:
5511; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
5512; GFX90A-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
5513; GFX90A-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x34
5514; GFX90A-NEXT:    v_mov_b32_e32 v1, 0
5515; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
5516; GFX90A-NEXT:    s_sext_i32_i16 s1, s4
5517; GFX90A-NEXT:    s_sext_i32_i16 s0, s6
5518; GFX90A-NEXT:    v_cvt_f32_i32_e32 v0, s0
5519; GFX90A-NEXT:    v_cvt_f32_i32_e32 v2, s1
5520; GFX90A-NEXT:    s_xor_b32 s0, s1, s0
5521; GFX90A-NEXT:    s_ashr_i32 s0, s0, 30
5522; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v3, v0
5523; GFX90A-NEXT:    s_or_b32 s8, s0, 1
5524; GFX90A-NEXT:    v_mul_f32_e32 v3, v2, v3
5525; GFX90A-NEXT:    v_trunc_f32_e32 v3, v3
5526; GFX90A-NEXT:    v_mad_f32 v2, -v3, v0, v2
5527; GFX90A-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v2|, |v0|
5528; GFX90A-NEXT:    s_and_b64 s[0:1], s[0:1], exec
5529; GFX90A-NEXT:    s_cselect_b32 s0, s8, 0
5530; GFX90A-NEXT:    s_ashr_i32 s1, s6, 16
5531; GFX90A-NEXT:    v_cvt_i32_f32_e32 v3, v3
5532; GFX90A-NEXT:    v_cvt_f32_i32_e32 v0, s1
5533; GFX90A-NEXT:    s_ashr_i32 s4, s4, 16
5534; GFX90A-NEXT:    v_add_u32_e32 v2, s0, v3
5535; GFX90A-NEXT:    v_cvt_f32_i32_e32 v3, s4
5536; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v4, v0
5537; GFX90A-NEXT:    s_xor_b32 s0, s4, s1
5538; GFX90A-NEXT:    s_ashr_i32 s0, s0, 30
5539; GFX90A-NEXT:    s_or_b32 s4, s0, 1
5540; GFX90A-NEXT:    v_mul_f32_e32 v4, v3, v4
5541; GFX90A-NEXT:    v_trunc_f32_e32 v4, v4
5542; GFX90A-NEXT:    v_mad_f32 v3, -v4, v0, v3
5543; GFX90A-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v3|, |v0|
5544; GFX90A-NEXT:    s_and_b64 s[0:1], s[0:1], exec
5545; GFX90A-NEXT:    v_cvt_i32_f32_e32 v4, v4
5546; GFX90A-NEXT:    s_sext_i32_i16 s1, s7
5547; GFX90A-NEXT:    v_cvt_f32_i32_e32 v0, s1
5548; GFX90A-NEXT:    s_cselect_b32 s0, s4, 0
5549; GFX90A-NEXT:    v_add_u32_e32 v3, s0, v4
5550; GFX90A-NEXT:    s_sext_i32_i16 s0, s5
5551; GFX90A-NEXT:    v_cvt_f32_i32_e32 v4, s0
5552; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v5, v0
5553; GFX90A-NEXT:    s_xor_b32 s0, s0, s1
5554; GFX90A-NEXT:    s_ashr_i32 s0, s0, 30
5555; GFX90A-NEXT:    s_or_b32 s4, s0, 1
5556; GFX90A-NEXT:    v_mul_f32_e32 v5, v4, v5
5557; GFX90A-NEXT:    v_trunc_f32_e32 v5, v5
5558; GFX90A-NEXT:    v_mad_f32 v4, -v5, v0, v4
5559; GFX90A-NEXT:    v_cvt_i32_f32_e32 v5, v5
5560; GFX90A-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v4|, |v0|
5561; GFX90A-NEXT:    s_and_b64 s[0:1], s[0:1], exec
5562; GFX90A-NEXT:    s_cselect_b32 s0, s4, 0
5563; GFX90A-NEXT:    v_add_u32_e32 v0, s0, v5
5564; GFX90A-NEXT:    v_and_b32_e32 v2, 0xffff, v2
5565; GFX90A-NEXT:    v_lshl_or_b32 v2, v3, 16, v2
5566; GFX90A-NEXT:    global_store_short v1, v0, s[2:3] offset:4
5567; GFX90A-NEXT:    global_store_dword v1, v2, s[2:3]
5568; GFX90A-NEXT:    s_endpgm
  ; Input IR under test: a single vector sdiv whose result is stored to %out.
5569  %r = sdiv <3 x i16> %x, %y
5570  store <3 x i16> %r, <3 x i16> addrspace(1)* %out
5571  ret void
5572}
5573
; srem_v3i16: signed remainder on <3 x i16> kernel arguments, stored to %out.
; The CHECK lines pin the IR produced by opt with -amdgpu-codegenprepare; the
; GFX6, GFX9 and GFX90A lines pin llc output for tahiti, gfx900 and gfx90a
; (see the command lines at the top of the file).
; Expected IR shape, repeated once per vector element:
;   - both operands are sign-extended to i32 and converted to float;
;   - the quotient q is estimated as trunc(x * rcp(y)) and converted with fptosi;
;   - q is adjusted by ((x ^ y) >> 30) | 1 when |fmad(-q, y, x)| >= |y|;
;   - the remainder is x - q * y, re-sign-extended from 16 bits (shl/ashr 16)
;     before being truncated back to i16.
; All assertions below are autogenerated (utils/update_test_checks.py and
; utils/update_llc_test_checks.py); regenerate them instead of hand-editing.
5574define amdgpu_kernel void @srem_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x, <3 x i16> %y) {
5575; CHECK-LABEL: @srem_v3i16(
5576; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <3 x i16> [[X:%.*]], i64 0
5577; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <3 x i16> [[Y:%.*]], i64 0
5578; CHECK-NEXT:    [[TMP3:%.*]] = sext i16 [[TMP1]] to i32
5579; CHECK-NEXT:    [[TMP4:%.*]] = sext i16 [[TMP2]] to i32
5580; CHECK-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]]
5581; CHECK-NEXT:    [[TMP6:%.*]] = ashr i32 [[TMP5]], 30
5582; CHECK-NEXT:    [[TMP7:%.*]] = or i32 [[TMP6]], 1
5583; CHECK-NEXT:    [[TMP8:%.*]] = sitofp i32 [[TMP3]] to float
5584; CHECK-NEXT:    [[TMP9:%.*]] = sitofp i32 [[TMP4]] to float
5585; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]])
5586; CHECK-NEXT:    [[TMP11:%.*]] = fmul fast float [[TMP8]], [[TMP10]]
5587; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.trunc.f32(float [[TMP11]])
5588; CHECK-NEXT:    [[TMP13:%.*]] = fneg fast float [[TMP12]]
5589; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP13]], float [[TMP9]], float [[TMP8]])
5590; CHECK-NEXT:    [[TMP15:%.*]] = fptosi float [[TMP12]] to i32
5591; CHECK-NEXT:    [[TMP16:%.*]] = call fast float @llvm.fabs.f32(float [[TMP14]])
5592; CHECK-NEXT:    [[TMP17:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
5593; CHECK-NEXT:    [[TMP18:%.*]] = fcmp fast oge float [[TMP16]], [[TMP17]]
5594; CHECK-NEXT:    [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP7]], i32 0
5595; CHECK-NEXT:    [[TMP20:%.*]] = add i32 [[TMP15]], [[TMP19]]
5596; CHECK-NEXT:    [[TMP21:%.*]] = mul i32 [[TMP20]], [[TMP4]]
5597; CHECK-NEXT:    [[TMP22:%.*]] = sub i32 [[TMP3]], [[TMP21]]
5598; CHECK-NEXT:    [[TMP23:%.*]] = shl i32 [[TMP22]], 16
5599; CHECK-NEXT:    [[TMP24:%.*]] = ashr i32 [[TMP23]], 16
5600; CHECK-NEXT:    [[TMP25:%.*]] = trunc i32 [[TMP24]] to i16
5601; CHECK-NEXT:    [[TMP26:%.*]] = insertelement <3 x i16> undef, i16 [[TMP25]], i64 0
5602; CHECK-NEXT:    [[TMP27:%.*]] = extractelement <3 x i16> [[X]], i64 1
5603; CHECK-NEXT:    [[TMP28:%.*]] = extractelement <3 x i16> [[Y]], i64 1
5604; CHECK-NEXT:    [[TMP29:%.*]] = sext i16 [[TMP27]] to i32
5605; CHECK-NEXT:    [[TMP30:%.*]] = sext i16 [[TMP28]] to i32
5606; CHECK-NEXT:    [[TMP31:%.*]] = xor i32 [[TMP29]], [[TMP30]]
5607; CHECK-NEXT:    [[TMP32:%.*]] = ashr i32 [[TMP31]], 30
5608; CHECK-NEXT:    [[TMP33:%.*]] = or i32 [[TMP32]], 1
5609; CHECK-NEXT:    [[TMP34:%.*]] = sitofp i32 [[TMP29]] to float
5610; CHECK-NEXT:    [[TMP35:%.*]] = sitofp i32 [[TMP30]] to float
5611; CHECK-NEXT:    [[TMP36:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP35]])
5612; CHECK-NEXT:    [[TMP37:%.*]] = fmul fast float [[TMP34]], [[TMP36]]
5613; CHECK-NEXT:    [[TMP38:%.*]] = call fast float @llvm.trunc.f32(float [[TMP37]])
5614; CHECK-NEXT:    [[TMP39:%.*]] = fneg fast float [[TMP38]]
5615; CHECK-NEXT:    [[TMP40:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP39]], float [[TMP35]], float [[TMP34]])
5616; CHECK-NEXT:    [[TMP41:%.*]] = fptosi float [[TMP38]] to i32
5617; CHECK-NEXT:    [[TMP42:%.*]] = call fast float @llvm.fabs.f32(float [[TMP40]])
5618; CHECK-NEXT:    [[TMP43:%.*]] = call fast float @llvm.fabs.f32(float [[TMP35]])
5619; CHECK-NEXT:    [[TMP44:%.*]] = fcmp fast oge float [[TMP42]], [[TMP43]]
5620; CHECK-NEXT:    [[TMP45:%.*]] = select i1 [[TMP44]], i32 [[TMP33]], i32 0
5621; CHECK-NEXT:    [[TMP46:%.*]] = add i32 [[TMP41]], [[TMP45]]
5622; CHECK-NEXT:    [[TMP47:%.*]] = mul i32 [[TMP46]], [[TMP30]]
5623; CHECK-NEXT:    [[TMP48:%.*]] = sub i32 [[TMP29]], [[TMP47]]
5624; CHECK-NEXT:    [[TMP49:%.*]] = shl i32 [[TMP48]], 16
5625; CHECK-NEXT:    [[TMP50:%.*]] = ashr i32 [[TMP49]], 16
5626; CHECK-NEXT:    [[TMP51:%.*]] = trunc i32 [[TMP50]] to i16
5627; CHECK-NEXT:    [[TMP52:%.*]] = insertelement <3 x i16> [[TMP26]], i16 [[TMP51]], i64 1
5628; CHECK-NEXT:    [[TMP53:%.*]] = extractelement <3 x i16> [[X]], i64 2
5629; CHECK-NEXT:    [[TMP54:%.*]] = extractelement <3 x i16> [[Y]], i64 2
5630; CHECK-NEXT:    [[TMP55:%.*]] = sext i16 [[TMP53]] to i32
5631; CHECK-NEXT:    [[TMP56:%.*]] = sext i16 [[TMP54]] to i32
5632; CHECK-NEXT:    [[TMP57:%.*]] = xor i32 [[TMP55]], [[TMP56]]
5633; CHECK-NEXT:    [[TMP58:%.*]] = ashr i32 [[TMP57]], 30
5634; CHECK-NEXT:    [[TMP59:%.*]] = or i32 [[TMP58]], 1
5635; CHECK-NEXT:    [[TMP60:%.*]] = sitofp i32 [[TMP55]] to float
5636; CHECK-NEXT:    [[TMP61:%.*]] = sitofp i32 [[TMP56]] to float
5637; CHECK-NEXT:    [[TMP62:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP61]])
5638; CHECK-NEXT:    [[TMP63:%.*]] = fmul fast float [[TMP60]], [[TMP62]]
5639; CHECK-NEXT:    [[TMP64:%.*]] = call fast float @llvm.trunc.f32(float [[TMP63]])
5640; CHECK-NEXT:    [[TMP65:%.*]] = fneg fast float [[TMP64]]
5641; CHECK-NEXT:    [[TMP66:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP65]], float [[TMP61]], float [[TMP60]])
5642; CHECK-NEXT:    [[TMP67:%.*]] = fptosi float [[TMP64]] to i32
5643; CHECK-NEXT:    [[TMP68:%.*]] = call fast float @llvm.fabs.f32(float [[TMP66]])
5644; CHECK-NEXT:    [[TMP69:%.*]] = call fast float @llvm.fabs.f32(float [[TMP61]])
5645; CHECK-NEXT:    [[TMP70:%.*]] = fcmp fast oge float [[TMP68]], [[TMP69]]
5646; CHECK-NEXT:    [[TMP71:%.*]] = select i1 [[TMP70]], i32 [[TMP59]], i32 0
5647; CHECK-NEXT:    [[TMP72:%.*]] = add i32 [[TMP67]], [[TMP71]]
5648; CHECK-NEXT:    [[TMP73:%.*]] = mul i32 [[TMP72]], [[TMP56]]
5649; CHECK-NEXT:    [[TMP74:%.*]] = sub i32 [[TMP55]], [[TMP73]]
5650; CHECK-NEXT:    [[TMP75:%.*]] = shl i32 [[TMP74]], 16
5651; CHECK-NEXT:    [[TMP76:%.*]] = ashr i32 [[TMP75]], 16
5652; CHECK-NEXT:    [[TMP77:%.*]] = trunc i32 [[TMP76]] to i16
5653; CHECK-NEXT:    [[TMP78:%.*]] = insertelement <3 x i16> [[TMP52]], i16 [[TMP77]], i64 2
5654; CHECK-NEXT:    store <3 x i16> [[TMP78]], <3 x i16> addrspace(1)* [[OUT:%.*]], align 8
5655; CHECK-NEXT:    ret void
5656;
5657; GFX6-LABEL: srem_v3i16:
5658; GFX6:       ; %bb.0:
5659; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
5660; GFX6-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
5661; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
5662; GFX6-NEXT:    s_mov_b32 s7, 0xf000
5663; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
5664; GFX6-NEXT:    s_sext_i32_i16 s8, s2
5665; GFX6-NEXT:    s_sext_i32_i16 s6, s0
5666; GFX6-NEXT:    v_cvt_f32_i32_e32 v0, s6
5667; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s8
5668; GFX6-NEXT:    s_xor_b32 s6, s8, s6
5669; GFX6-NEXT:    s_ashr_i32 s6, s6, 30
5670; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v0
5671; GFX6-NEXT:    s_or_b32 s6, s6, 1
5672; GFX6-NEXT:    v_mov_b32_e32 v3, s6
5673; GFX6-NEXT:    s_mov_b32 s6, -1
5674; GFX6-NEXT:    v_mul_f32_e32 v2, v1, v2
5675; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
5676; GFX6-NEXT:    v_mad_f32 v1, -v2, v0, v1
5677; GFX6-NEXT:    v_cvt_i32_f32_e32 v2, v2
5678; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
5679; GFX6-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
5680; GFX6-NEXT:    v_mov_b32_e32 v1, s2
5681; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
5682; GFX6-NEXT:    v_mov_b32_e32 v2, s0
5683; GFX6-NEXT:    v_alignbit_b32 v2, s1, v2, 16
5684; GFX6-NEXT:    v_bfe_i32 v3, v2, 0, 16
5685; GFX6-NEXT:    v_cvt_f32_i32_e32 v4, v3
5686; GFX6-NEXT:    v_alignbit_b32 v1, s3, v1, 16
5687; GFX6-NEXT:    v_bfe_i32 v5, v1, 0, 16
5688; GFX6-NEXT:    v_cvt_f32_i32_e32 v6, v5
5689; GFX6-NEXT:    v_rcp_iflag_f32_e32 v7, v4
5690; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s0
5691; GFX6-NEXT:    v_xor_b32_e32 v3, v5, v3
5692; GFX6-NEXT:    s_sext_i32_i16 s0, s1
5693; GFX6-NEXT:    v_mul_f32_e32 v5, v6, v7
5694; GFX6-NEXT:    v_trunc_f32_e32 v5, v5
5695; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s2, v0
5696; GFX6-NEXT:    v_mad_f32 v6, -v5, v4, v6
5697; GFX6-NEXT:    v_cvt_i32_f32_e32 v5, v5
5698; GFX6-NEXT:    v_ashrrev_i32_e32 v3, 30, v3
5699; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v6|, |v4|
5700; GFX6-NEXT:    v_cvt_f32_i32_e32 v4, s0
5701; GFX6-NEXT:    v_or_b32_e32 v3, 1, v3
5702; GFX6-NEXT:    v_cndmask_b32_e32 v3, 0, v3, vcc
5703; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
5704; GFX6-NEXT:    s_sext_i32_i16 s2, s3
5705; GFX6-NEXT:    v_mul_lo_u32 v2, v3, v2
5706; GFX6-NEXT:    v_cvt_f32_i32_e32 v3, s2
5707; GFX6-NEXT:    v_rcp_iflag_f32_e32 v5, v4
5708; GFX6-NEXT:    s_xor_b32 s0, s2, s0
5709; GFX6-NEXT:    s_ashr_i32 s0, s0, 30
5710; GFX6-NEXT:    s_or_b32 s0, s0, 1
5711; GFX6-NEXT:    v_mul_f32_e32 v5, v3, v5
5712; GFX6-NEXT:    v_trunc_f32_e32 v5, v5
5713; GFX6-NEXT:    v_mad_f32 v3, -v5, v4, v3
5714; GFX6-NEXT:    v_cvt_i32_f32_e32 v5, v5
5715; GFX6-NEXT:    v_mov_b32_e32 v6, s0
5716; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, |v4|
5717; GFX6-NEXT:    v_cndmask_b32_e32 v3, 0, v6, vcc
5718; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
5719; GFX6-NEXT:    v_mul_lo_u32 v3, v3, s1
5720; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v2
5721; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
5722; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s3, v3
5723; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
5724; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
5725; GFX6-NEXT:    buffer_store_short v2, off, s[4:7], 0 offset:4
5726; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
5727; GFX6-NEXT:    s_endpgm
5728;
5729; GFX9-LABEL: srem_v3i16:
5730; GFX9:       ; %bb.0:
5731; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
5732; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
5733; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x2c
5734; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5735; GFX9-NEXT:    s_sext_i32_i16 s8, s2
5736; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s8
5737; GFX9-NEXT:    s_sext_i32_i16 s9, s6
5738; GFX9-NEXT:    v_cvt_f32_i32_e32 v1, s9
5739; GFX9-NEXT:    s_xor_b32 s0, s9, s8
5740; GFX9-NEXT:    v_rcp_iflag_f32_e32 v2, v0
5741; GFX9-NEXT:    s_ashr_i32 s0, s0, 30
5742; GFX9-NEXT:    s_or_b32 s10, s0, 1
5743; GFX9-NEXT:    s_sext_i32_i16 s3, s3
5744; GFX9-NEXT:    v_mul_f32_e32 v2, v1, v2
5745; GFX9-NEXT:    v_trunc_f32_e32 v2, v2
5746; GFX9-NEXT:    v_mad_f32 v1, -v2, v0, v1
5747; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v1|, |v0|
5748; GFX9-NEXT:    s_and_b64 s[0:1], s[0:1], exec
5749; GFX9-NEXT:    s_cselect_b32 s0, s10, 0
5750; GFX9-NEXT:    s_ashr_i32 s2, s2, 16
5751; GFX9-NEXT:    v_cvt_i32_f32_e32 v2, v2
5752; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s2
5753; GFX9-NEXT:    s_ashr_i32 s6, s6, 16
5754; GFX9-NEXT:    v_add_u32_e32 v1, s0, v2
5755; GFX9-NEXT:    v_cvt_f32_i32_e32 v2, s6
5756; GFX9-NEXT:    v_rcp_iflag_f32_e32 v3, v0
5757; GFX9-NEXT:    s_xor_b32 s0, s6, s2
5758; GFX9-NEXT:    s_ashr_i32 s0, s0, 30
5759; GFX9-NEXT:    v_mul_lo_u32 v1, v1, s8
5760; GFX9-NEXT:    v_mul_f32_e32 v3, v2, v3
5761; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
5762; GFX9-NEXT:    v_mad_f32 v2, -v3, v0, v2
5763; GFX9-NEXT:    v_cvt_i32_f32_e32 v3, v3
5764; GFX9-NEXT:    s_or_b32 s8, s0, 1
5765; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v2|, |v0|
5766; GFX9-NEXT:    s_and_b64 s[0:1], s[0:1], exec
5767; GFX9-NEXT:    v_cvt_f32_i32_e32 v2, s3
5768; GFX9-NEXT:    s_cselect_b32 s0, s8, 0
5769; GFX9-NEXT:    v_add_u32_e32 v0, s0, v3
5770; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s2
5771; GFX9-NEXT:    s_sext_i32_i16 s2, s7
5772; GFX9-NEXT:    v_cvt_f32_i32_e32 v3, s2
5773; GFX9-NEXT:    v_rcp_iflag_f32_e32 v4, v2
5774; GFX9-NEXT:    s_xor_b32 s0, s2, s3
5775; GFX9-NEXT:    s_ashr_i32 s0, s0, 30
5776; GFX9-NEXT:    s_or_b32 s7, s0, 1
5777; GFX9-NEXT:    v_mul_f32_e32 v4, v3, v4
5778; GFX9-NEXT:    v_trunc_f32_e32 v4, v4
5779; GFX9-NEXT:    v_mad_f32 v3, -v4, v2, v3
5780; GFX9-NEXT:    v_cvt_i32_f32_e32 v4, v4
5781; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v3|, |v2|
5782; GFX9-NEXT:    s_and_b64 s[0:1], s[0:1], exec
5783; GFX9-NEXT:    s_cselect_b32 s0, s7, 0
5784; GFX9-NEXT:    v_add_u32_e32 v2, s0, v4
5785; GFX9-NEXT:    v_mul_lo_u32 v2, v2, s3
5786; GFX9-NEXT:    v_sub_u32_e32 v1, s9, v1
5787; GFX9-NEXT:    v_mov_b32_e32 v3, 0
5788; GFX9-NEXT:    v_sub_u32_e32 v0, s6, v0
5789; GFX9-NEXT:    v_sub_u32_e32 v2, s2, v2
5790; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
5791; GFX9-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
5792; GFX9-NEXT:    global_store_short v3, v2, s[4:5] offset:4
5793; GFX9-NEXT:    global_store_dword v3, v0, s[4:5]
5794; GFX9-NEXT:    s_endpgm
5795;
5796; GFX90A-LABEL: srem_v3i16:
5797; GFX90A:       ; %bb.0:
5798; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
5799; GFX90A-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
5800; GFX90A-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x34
5801; GFX90A-NEXT:    v_mov_b32_e32 v1, 0
5802; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
5803; GFX90A-NEXT:    s_sext_i32_i16 s9, s4
5804; GFX90A-NEXT:    s_sext_i32_i16 s8, s6
5805; GFX90A-NEXT:    v_cvt_f32_i32_e32 v0, s8
5806; GFX90A-NEXT:    v_cvt_f32_i32_e32 v2, s9
5807; GFX90A-NEXT:    s_xor_b32 s0, s9, s8
5808; GFX90A-NEXT:    s_ashr_i32 s0, s0, 30
5809; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v3, v0
5810; GFX90A-NEXT:    s_or_b32 s10, s0, 1
5811; GFX90A-NEXT:    v_mul_f32_e32 v3, v2, v3
5812; GFX90A-NEXT:    v_trunc_f32_e32 v3, v3
5813; GFX90A-NEXT:    v_mad_f32 v2, -v3, v0, v2
5814; GFX90A-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v2|, |v0|
5815; GFX90A-NEXT:    s_and_b64 s[0:1], s[0:1], exec
5816; GFX90A-NEXT:    s_cselect_b32 s0, s10, 0
5817; GFX90A-NEXT:    s_ashr_i32 s6, s6, 16
5818; GFX90A-NEXT:    v_cvt_i32_f32_e32 v3, v3
5819; GFX90A-NEXT:    v_cvt_f32_i32_e32 v2, s6
5820; GFX90A-NEXT:    s_ashr_i32 s4, s4, 16
5821; GFX90A-NEXT:    v_add_u32_e32 v0, s0, v3
5822; GFX90A-NEXT:    v_cvt_f32_i32_e32 v3, s4
5823; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v4, v2
5824; GFX90A-NEXT:    s_xor_b32 s0, s4, s6
5825; GFX90A-NEXT:    s_ashr_i32 s0, s0, 30
5826; GFX90A-NEXT:    v_mul_lo_u32 v0, v0, s8
5827; GFX90A-NEXT:    v_mul_f32_e32 v4, v3, v4
5828; GFX90A-NEXT:    v_trunc_f32_e32 v4, v4
5829; GFX90A-NEXT:    v_mad_f32 v3, -v4, v2, v3
5830; GFX90A-NEXT:    v_cvt_i32_f32_e32 v4, v4
5831; GFX90A-NEXT:    s_or_b32 s8, s0, 1
5832; GFX90A-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v3|, |v2|
5833; GFX90A-NEXT:    s_and_b64 s[0:1], s[0:1], exec
5834; GFX90A-NEXT:    s_cselect_b32 s0, s8, 0
5835; GFX90A-NEXT:    v_add_u32_e32 v2, s0, v4
5836; GFX90A-NEXT:    v_mul_lo_u32 v2, v2, s6
5837; GFX90A-NEXT:    s_sext_i32_i16 s6, s7
5838; GFX90A-NEXT:    v_cvt_f32_i32_e32 v3, s6
5839; GFX90A-NEXT:    v_sub_u32_e32 v2, s4, v2
5840; GFX90A-NEXT:    s_sext_i32_i16 s4, s5
5841; GFX90A-NEXT:    v_cvt_f32_i32_e32 v4, s4
5842; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v5, v3
5843; GFX90A-NEXT:    s_xor_b32 s0, s4, s6
5844; GFX90A-NEXT:    s_ashr_i32 s0, s0, 30
5845; GFX90A-NEXT:    s_or_b32 s5, s0, 1
5846; GFX90A-NEXT:    v_mul_f32_e32 v5, v4, v5
5847; GFX90A-NEXT:    v_trunc_f32_e32 v5, v5
5848; GFX90A-NEXT:    v_mad_f32 v4, -v5, v3, v4
5849; GFX90A-NEXT:    v_cvt_i32_f32_e32 v5, v5
5850; GFX90A-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v4|, |v3|
5851; GFX90A-NEXT:    s_and_b64 s[0:1], s[0:1], exec
5852; GFX90A-NEXT:    s_cselect_b32 s0, s5, 0
5853; GFX90A-NEXT:    v_add_u32_e32 v3, s0, v5
5854; GFX90A-NEXT:    v_sub_u32_e32 v0, s9, v0
5855; GFX90A-NEXT:    v_mul_lo_u32 v3, v3, s6
5856; GFX90A-NEXT:    v_sub_u32_e32 v3, s4, v3
5857; GFX90A-NEXT:    v_and_b32_e32 v0, 0xffff, v0
5858; GFX90A-NEXT:    v_lshl_or_b32 v0, v2, 16, v0
5859; GFX90A-NEXT:    global_store_short v1, v3, s[2:3] offset:4
5860; GFX90A-NEXT:    global_store_dword v1, v0, s[2:3]
5861; GFX90A-NEXT:    s_endpgm
; Input under test: one vector srem whose result is stored through %out.
5862  %r = srem <3 x i16> %x, %y
5863  store <3 x i16> %r, <3 x i16> addrspace(1)* %out
5864  ret void
5865}
5866
; udiv_v3i15: unsigned division on <3 x i15> kernel arguments, stored to %out.
; The CHECK lines pin the IR produced by opt with -amdgpu-codegenprepare; the
; GFX6, GFX9 and GFX90A lines pin llc output for tahiti, gfx900 and gfx90a
; (see the command lines at the top of the file).
; Expected IR shape, repeated once per vector element:
;   - both operands are zero-extended to i32 and converted to float;
;   - the quotient q is estimated as trunc(x * rcp(y)) and converted with fptoui;
;   - 1 is added to q when |fmad(-q, y, x)| >= |y|;
;   - the result is masked with 32767 before being truncated back to i15.
; In the llc output the first two quotients are masked with 0x7fff, the second
; is shifted left by 15 and the third by 30 (64-bit shift), and the OR-ed value
; is written as a dword plus a short at offset 4 masked with 0x1fff.
; All assertions below are autogenerated (utils/update_test_checks.py and
; utils/update_llc_test_checks.py); regenerate them instead of hand-editing.
5867define amdgpu_kernel void @udiv_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x, <3 x i15> %y) {
5868; CHECK-LABEL: @udiv_v3i15(
5869; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <3 x i15> [[X:%.*]], i64 0
5870; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <3 x i15> [[Y:%.*]], i64 0
5871; CHECK-NEXT:    [[TMP3:%.*]] = zext i15 [[TMP1]] to i32
5872; CHECK-NEXT:    [[TMP4:%.*]] = zext i15 [[TMP2]] to i32
5873; CHECK-NEXT:    [[TMP5:%.*]] = uitofp i32 [[TMP3]] to float
5874; CHECK-NEXT:    [[TMP6:%.*]] = uitofp i32 [[TMP4]] to float
5875; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP6]])
5876; CHECK-NEXT:    [[TMP8:%.*]] = fmul fast float [[TMP5]], [[TMP7]]
5877; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.trunc.f32(float [[TMP8]])
5878; CHECK-NEXT:    [[TMP10:%.*]] = fneg fast float [[TMP9]]
5879; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP10]], float [[TMP6]], float [[TMP5]])
5880; CHECK-NEXT:    [[TMP12:%.*]] = fptoui float [[TMP9]] to i32
5881; CHECK-NEXT:    [[TMP13:%.*]] = call fast float @llvm.fabs.f32(float [[TMP11]])
5882; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP6]])
5883; CHECK-NEXT:    [[TMP15:%.*]] = fcmp fast oge float [[TMP13]], [[TMP14]]
5884; CHECK-NEXT:    [[TMP16:%.*]] = select i1 [[TMP15]], i32 1, i32 0
5885; CHECK-NEXT:    [[TMP17:%.*]] = add i32 [[TMP12]], [[TMP16]]
5886; CHECK-NEXT:    [[TMP18:%.*]] = and i32 [[TMP17]], 32767
5887; CHECK-NEXT:    [[TMP19:%.*]] = trunc i32 [[TMP18]] to i15
5888; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <3 x i15> undef, i15 [[TMP19]], i64 0
5889; CHECK-NEXT:    [[TMP21:%.*]] = extractelement <3 x i15> [[X]], i64 1
5890; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <3 x i15> [[Y]], i64 1
5891; CHECK-NEXT:    [[TMP23:%.*]] = zext i15 [[TMP21]] to i32
5892; CHECK-NEXT:    [[TMP24:%.*]] = zext i15 [[TMP22]] to i32
5893; CHECK-NEXT:    [[TMP25:%.*]] = uitofp i32 [[TMP23]] to float
5894; CHECK-NEXT:    [[TMP26:%.*]] = uitofp i32 [[TMP24]] to float
5895; CHECK-NEXT:    [[TMP27:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP26]])
5896; CHECK-NEXT:    [[TMP28:%.*]] = fmul fast float [[TMP25]], [[TMP27]]
5897; CHECK-NEXT:    [[TMP29:%.*]] = call fast float @llvm.trunc.f32(float [[TMP28]])
5898; CHECK-NEXT:    [[TMP30:%.*]] = fneg fast float [[TMP29]]
5899; CHECK-NEXT:    [[TMP31:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP30]], float [[TMP26]], float [[TMP25]])
5900; CHECK-NEXT:    [[TMP32:%.*]] = fptoui float [[TMP29]] to i32
5901; CHECK-NEXT:    [[TMP33:%.*]] = call fast float @llvm.fabs.f32(float [[TMP31]])
5902; CHECK-NEXT:    [[TMP34:%.*]] = call fast float @llvm.fabs.f32(float [[TMP26]])
5903; CHECK-NEXT:    [[TMP35:%.*]] = fcmp fast oge float [[TMP33]], [[TMP34]]
5904; CHECK-NEXT:    [[TMP36:%.*]] = select i1 [[TMP35]], i32 1, i32 0
5905; CHECK-NEXT:    [[TMP37:%.*]] = add i32 [[TMP32]], [[TMP36]]
5906; CHECK-NEXT:    [[TMP38:%.*]] = and i32 [[TMP37]], 32767
5907; CHECK-NEXT:    [[TMP39:%.*]] = trunc i32 [[TMP38]] to i15
5908; CHECK-NEXT:    [[TMP40:%.*]] = insertelement <3 x i15> [[TMP20]], i15 [[TMP39]], i64 1
5909; CHECK-NEXT:    [[TMP41:%.*]] = extractelement <3 x i15> [[X]], i64 2
5910; CHECK-NEXT:    [[TMP42:%.*]] = extractelement <3 x i15> [[Y]], i64 2
5911; CHECK-NEXT:    [[TMP43:%.*]] = zext i15 [[TMP41]] to i32
5912; CHECK-NEXT:    [[TMP44:%.*]] = zext i15 [[TMP42]] to i32
5913; CHECK-NEXT:    [[TMP45:%.*]] = uitofp i32 [[TMP43]] to float
5914; CHECK-NEXT:    [[TMP46:%.*]] = uitofp i32 [[TMP44]] to float
5915; CHECK-NEXT:    [[TMP47:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP46]])
5916; CHECK-NEXT:    [[TMP48:%.*]] = fmul fast float [[TMP45]], [[TMP47]]
5917; CHECK-NEXT:    [[TMP49:%.*]] = call fast float @llvm.trunc.f32(float [[TMP48]])
5918; CHECK-NEXT:    [[TMP50:%.*]] = fneg fast float [[TMP49]]
5919; CHECK-NEXT:    [[TMP51:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP50]], float [[TMP46]], float [[TMP45]])
5920; CHECK-NEXT:    [[TMP52:%.*]] = fptoui float [[TMP49]] to i32
5921; CHECK-NEXT:    [[TMP53:%.*]] = call fast float @llvm.fabs.f32(float [[TMP51]])
5922; CHECK-NEXT:    [[TMP54:%.*]] = call fast float @llvm.fabs.f32(float [[TMP46]])
5923; CHECK-NEXT:    [[TMP55:%.*]] = fcmp fast oge float [[TMP53]], [[TMP54]]
5924; CHECK-NEXT:    [[TMP56:%.*]] = select i1 [[TMP55]], i32 1, i32 0
5925; CHECK-NEXT:    [[TMP57:%.*]] = add i32 [[TMP52]], [[TMP56]]
5926; CHECK-NEXT:    [[TMP58:%.*]] = and i32 [[TMP57]], 32767
5927; CHECK-NEXT:    [[TMP59:%.*]] = trunc i32 [[TMP58]] to i15
5928; CHECK-NEXT:    [[TMP60:%.*]] = insertelement <3 x i15> [[TMP40]], i15 [[TMP59]], i64 2
5929; CHECK-NEXT:    store <3 x i15> [[TMP60]], <3 x i15> addrspace(1)* [[OUT:%.*]], align 8
5930; CHECK-NEXT:    ret void
5931;
5932; GFX6-LABEL: udiv_v3i15:
5933; GFX6:       ; %bb.0:
5934; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
5935; GFX6-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
5936; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
5937; GFX6-NEXT:    s_mov_b32 s7, 0xf000
5938; GFX6-NEXT:    s_mov_b32 s6, -1
5939; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
5940; GFX6-NEXT:    v_mov_b32_e32 v0, s2
5941; GFX6-NEXT:    v_alignbit_b32 v0, s3, v0, 30
5942; GFX6-NEXT:    s_movk_i32 s3, 0x7fff
5943; GFX6-NEXT:    s_and_b32 s9, s0, s3
5944; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s9
5945; GFX6-NEXT:    s_and_b32 s8, s2, s3
5946; GFX6-NEXT:    v_mov_b32_e32 v2, s0
5947; GFX6-NEXT:    s_bfe_u32 s0, s0, 0xf000f
5948; GFX6-NEXT:    v_cvt_f32_u32_e32 v3, s8
5949; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v1
5950; GFX6-NEXT:    v_cvt_f32_u32_e32 v5, s0
5951; GFX6-NEXT:    s_bfe_u32 s2, s2, 0xf000f
5952; GFX6-NEXT:    v_alignbit_b32 v2, s1, v2, 30
5953; GFX6-NEXT:    v_mul_f32_e32 v4, v3, v4
5954; GFX6-NEXT:    v_cvt_f32_u32_e32 v6, s2
5955; GFX6-NEXT:    v_rcp_iflag_f32_e32 v7, v5
5956; GFX6-NEXT:    v_and_b32_e32 v2, s3, v2
5957; GFX6-NEXT:    v_trunc_f32_e32 v4, v4
5958; GFX6-NEXT:    v_mad_f32 v3, -v4, v1, v3
5959; GFX6-NEXT:    v_cvt_u32_f32_e32 v4, v4
5960; GFX6-NEXT:    v_cvt_f32_u32_e32 v2, v2
5961; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v1
5962; GFX6-NEXT:    v_mul_f32_e32 v1, v6, v7
5963; GFX6-NEXT:    v_and_b32_e32 v0, s3, v0
5964; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
5965; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
5966; GFX6-NEXT:    v_mad_f32 v4, -v1, v5, v6
5967; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
5968; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, v0
5969; GFX6-NEXT:    v_rcp_iflag_f32_e32 v6, v2
5970; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v4|, v5
5971; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
5972; GFX6-NEXT:    v_mul_f32_e32 v1, v0, v6
5973; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
5974; GFX6-NEXT:    v_cvt_u32_f32_e32 v5, v1
5975; GFX6-NEXT:    v_mad_f32 v0, -v1, v2, v0
5976; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v0|, v2
5977; GFX6-NEXT:    v_and_b32_e32 v2, s3, v3
5978; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, 0, v5, vcc
5979; GFX6-NEXT:    v_and_b32_e32 v3, s3, v4
5980; GFX6-NEXT:    v_lshl_b64 v[0:1], v[0:1], 30
5981; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 15, v3
5982; GFX6-NEXT:    v_or_b32_e32 v2, v2, v3
5983; GFX6-NEXT:    v_or_b32_e32 v0, v2, v0
5984; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
5985; GFX6-NEXT:    s_waitcnt expcnt(0)
5986; GFX6-NEXT:    v_and_b32_e32 v0, 0x1fff, v1
5987; GFX6-NEXT:    buffer_store_short v0, off, s[4:7], 0 offset:4
5988; GFX6-NEXT:    s_endpgm
5989;
5990; GFX9-LABEL: udiv_v3i15:
5991; GFX9:       ; %bb.0:
5992; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
5993; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
5994; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x34
5995; GFX9-NEXT:    s_movk_i32 s8, 0x7fff
5996; GFX9-NEXT:    v_mov_b32_e32 v2, 0
5997; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5998; GFX9-NEXT:    s_and_b32 s0, s4, s8
5999; GFX9-NEXT:    s_and_b32 s1, s6, s8
6000; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s1
6001; GFX9-NEXT:    v_cvt_f32_u32_e32 v4, s0
6002; GFX9-NEXT:    s_bfe_u32 s0, s6, 0xf000f
6003; GFX9-NEXT:    v_cvt_f32_u32_e32 v6, s0
6004; GFX9-NEXT:    v_rcp_iflag_f32_e32 v5, v1
6005; GFX9-NEXT:    s_bfe_u32 s1, s4, 0xf000f
6006; GFX9-NEXT:    v_mov_b32_e32 v3, s6
6007; GFX9-NEXT:    v_alignbit_b32 v3, s7, v3, 30
6008; GFX9-NEXT:    v_mul_f32_e32 v5, v4, v5
6009; GFX9-NEXT:    v_cvt_f32_u32_e32 v7, s1
6010; GFX9-NEXT:    v_rcp_iflag_f32_e32 v8, v6
6011; GFX9-NEXT:    v_and_b32_e32 v3, s8, v3
6012; GFX9-NEXT:    v_trunc_f32_e32 v5, v5
6013; GFX9-NEXT:    v_mad_f32 v4, -v5, v1, v4
6014; GFX9-NEXT:    v_cvt_u32_f32_e32 v5, v5
6015; GFX9-NEXT:    v_cvt_f32_u32_e32 v3, v3
6016; GFX9-NEXT:    v_mov_b32_e32 v0, s4
6017; GFX9-NEXT:    v_alignbit_b32 v0, s5, v0, 30
6018; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v4|, v1
6019; GFX9-NEXT:    v_mul_f32_e32 v1, v7, v8
6020; GFX9-NEXT:    v_and_b32_e32 v0, s8, v0
6021; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
6022; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v5, vcc
6023; GFX9-NEXT:    v_mad_f32 v5, -v1, v6, v7
6024; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
6025; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, v0
6026; GFX9-NEXT:    v_rcp_iflag_f32_e32 v7, v3
6027; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v5|, v6
6028; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
6029; GFX9-NEXT:    v_mul_f32_e32 v1, v0, v7
6030; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
6031; GFX9-NEXT:    v_cvt_u32_f32_e32 v6, v1
6032; GFX9-NEXT:    v_mad_f32 v0, -v1, v3, v0
6033; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v0|, v3
6034; GFX9-NEXT:    v_and_b32_e32 v3, s8, v4
6035; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v6, vcc
6036; GFX9-NEXT:    v_and_b32_e32 v4, s8, v5
6037; GFX9-NEXT:    v_lshlrev_b64 v[0:1], 30, v[0:1]
6038; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 15, v4
6039; GFX9-NEXT:    v_or_b32_e32 v3, v3, v4
6040; GFX9-NEXT:    v_or_b32_e32 v0, v3, v0
6041; GFX9-NEXT:    global_store_dword v2, v0, s[2:3]
6042; GFX9-NEXT:    v_and_b32_e32 v0, 0x1fff, v1
6043; GFX9-NEXT:    global_store_short v2, v0, s[2:3] offset:4
6044; GFX9-NEXT:    s_endpgm
6045;
6046; GFX90A-LABEL: udiv_v3i15:
6047; GFX90A:       ; %bb.0:
6048; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
6049; GFX90A-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
6050; GFX90A-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x34
6051; GFX90A-NEXT:    s_movk_i32 s8, 0x7fff
6052; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
6053; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
6054; GFX90A-NEXT:    s_and_b32 s0, s4, s8
6055; GFX90A-NEXT:    s_and_b32 s1, s6, s8
6056; GFX90A-NEXT:    v_cvt_f32_u32_e32 v1, s1
6057; GFX90A-NEXT:    v_cvt_f32_u32_e32 v4, s0
6058; GFX90A-NEXT:    s_bfe_u32 s0, s6, 0xf000f
6059; GFX90A-NEXT:    v_cvt_f32_u32_e32 v6, s0
6060; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v5, v1
6061; GFX90A-NEXT:    s_bfe_u32 s1, s4, 0xf000f
6062; GFX90A-NEXT:    v_mov_b32_e32 v3, s6
6063; GFX90A-NEXT:    v_alignbit_b32 v3, s7, v3, 30
6064; GFX90A-NEXT:    v_mul_f32_e32 v5, v4, v5
6065; GFX90A-NEXT:    v_cvt_f32_u32_e32 v7, s1
6066; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v8, v6
6067; GFX90A-NEXT:    v_and_b32_e32 v3, s8, v3
6068; GFX90A-NEXT:    v_trunc_f32_e32 v5, v5
6069; GFX90A-NEXT:    v_mad_f32 v4, -v5, v1, v4
6070; GFX90A-NEXT:    v_cvt_u32_f32_e32 v5, v5
6071; GFX90A-NEXT:    v_cvt_f32_u32_e32 v3, v3
6072; GFX90A-NEXT:    v_mov_b32_e32 v0, s4
6073; GFX90A-NEXT:    v_alignbit_b32 v0, s5, v0, 30
6074; GFX90A-NEXT:    v_cmp_ge_f32_e64 vcc, |v4|, v1
6075; GFX90A-NEXT:    v_mul_f32_e32 v1, v7, v8
6076; GFX90A-NEXT:    v_and_b32_e32 v0, s8, v0
6077; GFX90A-NEXT:    v_trunc_f32_e32 v1, v1
6078; GFX90A-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v5, vcc
6079; GFX90A-NEXT:    v_mad_f32 v5, -v1, v6, v7
6080; GFX90A-NEXT:    v_cvt_u32_f32_e32 v1, v1
6081; GFX90A-NEXT:    v_cvt_f32_u32_e32 v0, v0
6082; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v7, v3
6083; GFX90A-NEXT:    v_cmp_ge_f32_e64 vcc, |v5|, v6
6084; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
6085; GFX90A-NEXT:    v_mul_f32_e32 v1, v0, v7
6086; GFX90A-NEXT:    v_trunc_f32_e32 v1, v1
6087; GFX90A-NEXT:    v_cvt_u32_f32_e32 v6, v1
6088; GFX90A-NEXT:    v_mad_f32 v0, -v1, v3, v0
6089; GFX90A-NEXT:    v_cmp_ge_f32_e64 vcc, |v0|, v3
6090; GFX90A-NEXT:    v_and_b32_e32 v3, s8, v4
6091; GFX90A-NEXT:    v_and_b32_e32 v4, s8, v5
6092; GFX90A-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v6, vcc
6093; GFX90A-NEXT:    v_lshlrev_b32_e32 v4, 15, v4
6094; GFX90A-NEXT:    v_lshlrev_b64 v[0:1], 30, v[0:1]
6095; GFX90A-NEXT:    v_or_b32_e32 v3, v3, v4
6096; GFX90A-NEXT:    v_or_b32_e32 v0, v3, v0
6097; GFX90A-NEXT:    global_store_dword v2, v0, s[2:3]
6098; GFX90A-NEXT:    v_and_b32_e32 v0, 0x1fff, v1
6099; GFX90A-NEXT:    global_store_short v2, v0, s[2:3] offset:4
6100; GFX90A-NEXT:    s_endpgm
; Input under test: one vector udiv whose result is stored through %out.
6101  %r = udiv <3 x i15> %x, %y
6102  store <3 x i15> %r, <3 x i15> addrspace(1)* %out
6103  ret void
6104}
6105
6106define amdgpu_kernel void @urem_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x, <3 x i15> %y) {
6107; CHECK-LABEL: @urem_v3i15(
6108; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <3 x i15> [[X:%.*]], i64 0
6109; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <3 x i15> [[Y:%.*]], i64 0
6110; CHECK-NEXT:    [[TMP3:%.*]] = zext i15 [[TMP1]] to i32
6111; CHECK-NEXT:    [[TMP4:%.*]] = zext i15 [[TMP2]] to i32
6112; CHECK-NEXT:    [[TMP5:%.*]] = uitofp i32 [[TMP3]] to float
6113; CHECK-NEXT:    [[TMP6:%.*]] = uitofp i32 [[TMP4]] to float
6114; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP6]])
6115; CHECK-NEXT:    [[TMP8:%.*]] = fmul fast float [[TMP5]], [[TMP7]]
6116; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.trunc.f32(float [[TMP8]])
6117; CHECK-NEXT:    [[TMP10:%.*]] = fneg fast float [[TMP9]]
6118; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP10]], float [[TMP6]], float [[TMP5]])
6119; CHECK-NEXT:    [[TMP12:%.*]] = fptoui float [[TMP9]] to i32
6120; CHECK-NEXT:    [[TMP13:%.*]] = call fast float @llvm.fabs.f32(float [[TMP11]])
6121; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP6]])
6122; CHECK-NEXT:    [[TMP15:%.*]] = fcmp fast oge float [[TMP13]], [[TMP14]]
6123; CHECK-NEXT:    [[TMP16:%.*]] = select i1 [[TMP15]], i32 1, i32 0
6124; CHECK-NEXT:    [[TMP17:%.*]] = add i32 [[TMP12]], [[TMP16]]
6125; CHECK-NEXT:    [[TMP18:%.*]] = mul i32 [[TMP17]], [[TMP4]]
6126; CHECK-NEXT:    [[TMP19:%.*]] = sub i32 [[TMP3]], [[TMP18]]
6127; CHECK-NEXT:    [[TMP20:%.*]] = and i32 [[TMP19]], 32767
6128; CHECK-NEXT:    [[TMP21:%.*]] = trunc i32 [[TMP20]] to i15
6129; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <3 x i15> undef, i15 [[TMP21]], i64 0
6130; CHECK-NEXT:    [[TMP23:%.*]] = extractelement <3 x i15> [[X]], i64 1
6131; CHECK-NEXT:    [[TMP24:%.*]] = extractelement <3 x i15> [[Y]], i64 1
6132; CHECK-NEXT:    [[TMP25:%.*]] = zext i15 [[TMP23]] to i32
6133; CHECK-NEXT:    [[TMP26:%.*]] = zext i15 [[TMP24]] to i32
6134; CHECK-NEXT:    [[TMP27:%.*]] = uitofp i32 [[TMP25]] to float
6135; CHECK-NEXT:    [[TMP28:%.*]] = uitofp i32 [[TMP26]] to float
6136; CHECK-NEXT:    [[TMP29:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP28]])
6137; CHECK-NEXT:    [[TMP30:%.*]] = fmul fast float [[TMP27]], [[TMP29]]
6138; CHECK-NEXT:    [[TMP31:%.*]] = call fast float @llvm.trunc.f32(float [[TMP30]])
6139; CHECK-NEXT:    [[TMP32:%.*]] = fneg fast float [[TMP31]]
6140; CHECK-NEXT:    [[TMP33:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP32]], float [[TMP28]], float [[TMP27]])
6141; CHECK-NEXT:    [[TMP34:%.*]] = fptoui float [[TMP31]] to i32
6142; CHECK-NEXT:    [[TMP35:%.*]] = call fast float @llvm.fabs.f32(float [[TMP33]])
6143; CHECK-NEXT:    [[TMP36:%.*]] = call fast float @llvm.fabs.f32(float [[TMP28]])
6144; CHECK-NEXT:    [[TMP37:%.*]] = fcmp fast oge float [[TMP35]], [[TMP36]]
6145; CHECK-NEXT:    [[TMP38:%.*]] = select i1 [[TMP37]], i32 1, i32 0
6146; CHECK-NEXT:    [[TMP39:%.*]] = add i32 [[TMP34]], [[TMP38]]
6147; CHECK-NEXT:    [[TMP40:%.*]] = mul i32 [[TMP39]], [[TMP26]]
6148; CHECK-NEXT:    [[TMP41:%.*]] = sub i32 [[TMP25]], [[TMP40]]
6149; CHECK-NEXT:    [[TMP42:%.*]] = and i32 [[TMP41]], 32767
6150; CHECK-NEXT:    [[TMP43:%.*]] = trunc i32 [[TMP42]] to i15
6151; CHECK-NEXT:    [[TMP44:%.*]] = insertelement <3 x i15> [[TMP22]], i15 [[TMP43]], i64 1
6152; CHECK-NEXT:    [[TMP45:%.*]] = extractelement <3 x i15> [[X]], i64 2
6153; CHECK-NEXT:    [[TMP46:%.*]] = extractelement <3 x i15> [[Y]], i64 2
6154; CHECK-NEXT:    [[TMP47:%.*]] = zext i15 [[TMP45]] to i32
6155; CHECK-NEXT:    [[TMP48:%.*]] = zext i15 [[TMP46]] to i32
6156; CHECK-NEXT:    [[TMP49:%.*]] = uitofp i32 [[TMP47]] to float
6157; CHECK-NEXT:    [[TMP50:%.*]] = uitofp i32 [[TMP48]] to float
6158; CHECK-NEXT:    [[TMP51:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP50]])
6159; CHECK-NEXT:    [[TMP52:%.*]] = fmul fast float [[TMP49]], [[TMP51]]
6160; CHECK-NEXT:    [[TMP53:%.*]] = call fast float @llvm.trunc.f32(float [[TMP52]])
6161; CHECK-NEXT:    [[TMP54:%.*]] = fneg fast float [[TMP53]]
6162; CHECK-NEXT:    [[TMP55:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP54]], float [[TMP50]], float [[TMP49]])
6163; CHECK-NEXT:    [[TMP56:%.*]] = fptoui float [[TMP53]] to i32
6164; CHECK-NEXT:    [[TMP57:%.*]] = call fast float @llvm.fabs.f32(float [[TMP55]])
6165; CHECK-NEXT:    [[TMP58:%.*]] = call fast float @llvm.fabs.f32(float [[TMP50]])
6166; CHECK-NEXT:    [[TMP59:%.*]] = fcmp fast oge float [[TMP57]], [[TMP58]]
6167; CHECK-NEXT:    [[TMP60:%.*]] = select i1 [[TMP59]], i32 1, i32 0
6168; CHECK-NEXT:    [[TMP61:%.*]] = add i32 [[TMP56]], [[TMP60]]
6169; CHECK-NEXT:    [[TMP62:%.*]] = mul i32 [[TMP61]], [[TMP48]]
6170; CHECK-NEXT:    [[TMP63:%.*]] = sub i32 [[TMP47]], [[TMP62]]
6171; CHECK-NEXT:    [[TMP64:%.*]] = and i32 [[TMP63]], 32767
6172; CHECK-NEXT:    [[TMP65:%.*]] = trunc i32 [[TMP64]] to i15
6173; CHECK-NEXT:    [[TMP66:%.*]] = insertelement <3 x i15> [[TMP44]], i15 [[TMP65]], i64 2
6174; CHECK-NEXT:    store <3 x i15> [[TMP66]], <3 x i15> addrspace(1)* [[OUT:%.*]], align 8
6175; CHECK-NEXT:    ret void
6176;
6177; GFX6-LABEL: urem_v3i15:
6178; GFX6:       ; %bb.0:
6179; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
6180; GFX6-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
6181; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
6182; GFX6-NEXT:    s_mov_b32 s7, 0xf000
6183; GFX6-NEXT:    s_mov_b32 s6, -1
6184; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
6185; GFX6-NEXT:    v_mov_b32_e32 v0, s2
6186; GFX6-NEXT:    v_alignbit_b32 v0, s3, v0, 30
6187; GFX6-NEXT:    s_movk_i32 s3, 0x7fff
6188; GFX6-NEXT:    s_and_b32 s10, s0, s3
6189; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s10
6190; GFX6-NEXT:    s_and_b32 s9, s2, s3
6191; GFX6-NEXT:    v_cvt_f32_u32_e32 v3, s9
6192; GFX6-NEXT:    v_mov_b32_e32 v2, s0
6193; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v1
6194; GFX6-NEXT:    v_alignbit_b32 v2, s1, v2, 30
6195; GFX6-NEXT:    s_bfe_u32 s1, s0, 0xf000f
6196; GFX6-NEXT:    v_cvt_f32_u32_e32 v5, s1
6197; GFX6-NEXT:    v_mul_f32_e32 v4, v3, v4
6198; GFX6-NEXT:    v_trunc_f32_e32 v4, v4
6199; GFX6-NEXT:    v_mad_f32 v3, -v4, v1, v3
6200; GFX6-NEXT:    v_cvt_u32_f32_e32 v4, v4
6201; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v1
6202; GFX6-NEXT:    s_bfe_u32 s10, s2, 0xf000f
6203; GFX6-NEXT:    v_cvt_f32_u32_e32 v3, s10
6204; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v4, vcc
6205; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s0
6206; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v5
6207; GFX6-NEXT:    v_and_b32_e32 v2, s3, v2
6208; GFX6-NEXT:    v_and_b32_e32 v0, s3, v0
6209; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, s2, v1
6210; GFX6-NEXT:    v_mul_f32_e32 v1, v3, v4
6211; GFX6-NEXT:    v_cvt_f32_u32_e32 v4, v2
6212; GFX6-NEXT:    v_cvt_f32_u32_e32 v7, v0
6213; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
6214; GFX6-NEXT:    v_mad_f32 v3, -v1, v5, v3
6215; GFX6-NEXT:    v_rcp_iflag_f32_e32 v8, v4
6216; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
6217; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v5
6218; GFX6-NEXT:    s_lshr_b32 s0, s0, 15
6219; GFX6-NEXT:    v_mul_f32_e32 v3, v7, v8
6220; GFX6-NEXT:    v_trunc_f32_e32 v3, v3
6221; GFX6-NEXT:    v_cvt_u32_f32_e32 v5, v3
6222; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
6223; GFX6-NEXT:    v_mad_f32 v3, -v3, v4, v7
6224; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v4
6225; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s0
6226; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
6227; GFX6-NEXT:    v_mul_lo_u32 v2, v3, v2
6228; GFX6-NEXT:    s_lshr_b32 s8, s2, 15
6229; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s8, v1
6230; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, v2, v0
6231; GFX6-NEXT:    v_and_b32_e32 v3, s3, v3
6232; GFX6-NEXT:    v_lshl_b64 v[0:1], v[0:1], 30
6233; GFX6-NEXT:    v_and_b32_e32 v2, s3, v6
6234; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 15, v3
6235; GFX6-NEXT:    v_or_b32_e32 v2, v2, v3
6236; GFX6-NEXT:    v_or_b32_e32 v0, v2, v0
6237; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
6238; GFX6-NEXT:    s_waitcnt expcnt(0)
6239; GFX6-NEXT:    v_and_b32_e32 v0, 0x1fff, v1
6240; GFX6-NEXT:    buffer_store_short v0, off, s[4:7], 0 offset:4
6241; GFX6-NEXT:    s_endpgm
6242;
6243; GFX9-LABEL: urem_v3i15:
6244; GFX9:       ; %bb.0:
6245; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
6246; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
6247; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x34
6248; GFX9-NEXT:    s_movk_i32 s8, 0x7fff
6249; GFX9-NEXT:    v_mov_b32_e32 v2, 0
6250; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
6251; GFX9-NEXT:    v_mov_b32_e32 v0, s4
6252; GFX9-NEXT:    v_alignbit_b32 v0, s5, v0, 30
6253; GFX9-NEXT:    s_and_b32 s5, s6, s8
6254; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s5
6255; GFX9-NEXT:    s_and_b32 s0, s4, s8
6256; GFX9-NEXT:    v_cvt_f32_u32_e32 v4, s0
6257; GFX9-NEXT:    s_bfe_u32 s5, s6, 0xf000f
6258; GFX9-NEXT:    v_rcp_iflag_f32_e32 v5, v1
6259; GFX9-NEXT:    v_cvt_f32_u32_e32 v6, s5
6260; GFX9-NEXT:    v_mov_b32_e32 v3, s6
6261; GFX9-NEXT:    v_alignbit_b32 v3, s7, v3, 30
6262; GFX9-NEXT:    v_mul_f32_e32 v5, v4, v5
6263; GFX9-NEXT:    v_trunc_f32_e32 v5, v5
6264; GFX9-NEXT:    v_mad_f32 v4, -v5, v1, v4
6265; GFX9-NEXT:    v_cvt_u32_f32_e32 v5, v5
6266; GFX9-NEXT:    s_bfe_u32 s1, s4, 0xf000f
6267; GFX9-NEXT:    v_and_b32_e32 v3, s8, v3
6268; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v4|, v1
6269; GFX9-NEXT:    v_cvt_f32_u32_e32 v7, s1
6270; GFX9-NEXT:    v_rcp_iflag_f32_e32 v8, v6
6271; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v5, vcc
6272; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, v3
6273; GFX9-NEXT:    v_and_b32_e32 v0, s8, v0
6274; GFX9-NEXT:    v_mul_f32_e32 v4, v7, v8
6275; GFX9-NEXT:    v_cvt_f32_u32_e32 v8, v0
6276; GFX9-NEXT:    v_rcp_iflag_f32_e32 v9, v5
6277; GFX9-NEXT:    v_trunc_f32_e32 v4, v4
6278; GFX9-NEXT:    v_mad_f32 v7, -v4, v6, v7
6279; GFX9-NEXT:    v_cvt_u32_f32_e32 v4, v4
6280; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v7|, v6
6281; GFX9-NEXT:    v_mul_f32_e32 v6, v8, v9
6282; GFX9-NEXT:    v_trunc_f32_e32 v6, v6
6283; GFX9-NEXT:    v_cvt_u32_f32_e32 v7, v6
6284; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
6285; GFX9-NEXT:    v_mad_f32 v6, -v6, v5, v8
6286; GFX9-NEXT:    s_lshr_b32 s0, s6, 15
6287; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v6|, v5
6288; GFX9-NEXT:    v_mul_lo_u32 v4, v4, s0
6289; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v7, vcc
6290; GFX9-NEXT:    v_mul_lo_u32 v1, v1, s6
6291; GFX9-NEXT:    v_mul_lo_u32 v3, v5, v3
6292; GFX9-NEXT:    s_lshr_b32 s0, s4, 15
6293; GFX9-NEXT:    v_sub_u32_e32 v4, s0, v4
6294; GFX9-NEXT:    v_sub_u32_e32 v5, s4, v1
6295; GFX9-NEXT:    v_sub_u32_e32 v0, v0, v3
6296; GFX9-NEXT:    v_and_b32_e32 v4, s8, v4
6297; GFX9-NEXT:    v_lshlrev_b64 v[0:1], 30, v[0:1]
6298; GFX9-NEXT:    v_and_b32_e32 v3, s8, v5
6299; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 15, v4
6300; GFX9-NEXT:    v_or_b32_e32 v3, v3, v4
6301; GFX9-NEXT:    v_or_b32_e32 v0, v3, v0
6302; GFX9-NEXT:    global_store_dword v2, v0, s[2:3]
6303; GFX9-NEXT:    v_and_b32_e32 v0, 0x1fff, v1
6304; GFX9-NEXT:    global_store_short v2, v0, s[2:3] offset:4
6305; GFX9-NEXT:    s_endpgm
6306;
6307; GFX90A-LABEL: urem_v3i15:
6308; GFX90A:       ; %bb.0:
6309; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
6310; GFX90A-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
6311; GFX90A-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x34
6312; GFX90A-NEXT:    s_movk_i32 s8, 0x7fff
6313; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
6314; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
6315; GFX90A-NEXT:    s_and_b32 s1, s4, s8
6316; GFX90A-NEXT:    s_and_b32 s9, s6, s8
6317; GFX90A-NEXT:    v_cvt_f32_u32_e32 v1, s9
6318; GFX90A-NEXT:    v_cvt_f32_u32_e32 v4, s1
6319; GFX90A-NEXT:    v_mov_b32_e32 v3, s6
6320; GFX90A-NEXT:    v_alignbit_b32 v3, s7, v3, 30
6321; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v5, v1
6322; GFX90A-NEXT:    s_bfe_u32 s7, s6, 0xf000f
6323; GFX90A-NEXT:    v_cvt_f32_u32_e32 v6, s7
6324; GFX90A-NEXT:    v_mov_b32_e32 v0, s4
6325; GFX90A-NEXT:    v_mul_f32_e32 v5, v4, v5
6326; GFX90A-NEXT:    v_trunc_f32_e32 v5, v5
6327; GFX90A-NEXT:    v_mad_f32 v4, -v5, v1, v4
6328; GFX90A-NEXT:    v_cvt_u32_f32_e32 v5, v5
6329; GFX90A-NEXT:    v_alignbit_b32 v0, s5, v0, 30
6330; GFX90A-NEXT:    s_bfe_u32 s5, s4, 0xf000f
6331; GFX90A-NEXT:    v_cvt_f32_u32_e32 v7, s5
6332; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v8, v6
6333; GFX90A-NEXT:    v_cmp_ge_f32_e64 vcc, |v4|, v1
6334; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v5, vcc
6335; GFX90A-NEXT:    v_and_b32_e32 v3, s8, v3
6336; GFX90A-NEXT:    v_mul_lo_u32 v1, v1, s6
6337; GFX90A-NEXT:    v_sub_u32_e32 v4, s4, v1
6338; GFX90A-NEXT:    v_mul_f32_e32 v1, v7, v8
6339; GFX90A-NEXT:    v_cvt_f32_u32_e32 v5, v3
6340; GFX90A-NEXT:    v_trunc_f32_e32 v1, v1
6341; GFX90A-NEXT:    v_mad_f32 v7, -v1, v6, v7
6342; GFX90A-NEXT:    v_cvt_u32_f32_e32 v1, v1
6343; GFX90A-NEXT:    v_and_b32_e32 v0, s8, v0
6344; GFX90A-NEXT:    v_cvt_f32_u32_e32 v8, v0
6345; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v9, v5
6346; GFX90A-NEXT:    v_cmp_ge_f32_e64 vcc, |v7|, v6
6347; GFX90A-NEXT:    s_lshr_b32 s1, s6, 15
6348; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
6349; GFX90A-NEXT:    s_lshr_b32 s0, s4, 15
6350; GFX90A-NEXT:    v_mul_lo_u32 v1, v1, s1
6351; GFX90A-NEXT:    v_sub_u32_e32 v6, s0, v1
6352; GFX90A-NEXT:    v_mul_f32_e32 v1, v8, v9
6353; GFX90A-NEXT:    v_trunc_f32_e32 v1, v1
6354; GFX90A-NEXT:    v_cvt_u32_f32_e32 v7, v1
6355; GFX90A-NEXT:    v_mad_f32 v1, -v1, v5, v8
6356; GFX90A-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v5
6357; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v7, vcc
6358; GFX90A-NEXT:    v_mul_lo_u32 v1, v1, v3
6359; GFX90A-NEXT:    v_and_b32_e32 v3, s8, v4
6360; GFX90A-NEXT:    v_and_b32_e32 v4, s8, v6
6361; GFX90A-NEXT:    v_sub_u32_e32 v0, v0, v1
6362; GFX90A-NEXT:    v_lshlrev_b32_e32 v4, 15, v4
6363; GFX90A-NEXT:    v_lshlrev_b64 v[0:1], 30, v[0:1]
6364; GFX90A-NEXT:    v_or_b32_e32 v3, v3, v4
6365; GFX90A-NEXT:    v_or_b32_e32 v0, v3, v0
6366; GFX90A-NEXT:    global_store_dword v2, v0, s[2:3]
6367; GFX90A-NEXT:    v_and_b32_e32 v0, 0x1fff, v1
6368; GFX90A-NEXT:    global_store_short v2, v0, s[2:3] offset:4
6369; GFX90A-NEXT:    s_endpgm
6370  %r = urem <3 x i15> %x, %y
6371  store <3 x i15> %r, <3 x i15> addrspace(1)* %out
6372  ret void
6373}
6374
; sdiv_v3i15: signed division of two <3 x i15> kernel arguments, with the
; quotient vector stored to %out. Both the element width (15) and the lane
; count (3) are not powers of two.
;
; What the opt assertions below pin down, per lane (repeated three times):
;   * both operands are extracted, sign-extended to i32 and converted to float;
;   * the quotient estimate is q = trunc(x * rcp(y));
;   * a correction of ((x ^ y) >> 30) | 1 -- i.e. +1 when the operand signs
;     agree, -1 when they differ -- is added when |x - q*y| >= |y|;
;   * the result is re-sign-extended from 15 bits (shl 17 / ashr 17), truncated
;     to i15 and inserted back into the result vector.
;
; What the llc assertions below pin down (tahiti, gfx900 and gfx90a runs):
;   * the same float-based sequence after instruction selection;
;   * the three 15-bit results packed into 45 bits and written as one dword
;     plus one short at offset 4, the short being masked with 0x1fff (13 bits).
;
; All assertion lines are autogenerated; regenerate them with the update
; scripts named at the top of the file rather than editing them by hand.
6375define amdgpu_kernel void @sdiv_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x, <3 x i15> %y) {
6376; CHECK-LABEL: @sdiv_v3i15(
6377; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <3 x i15> [[X:%.*]], i64 0
6378; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <3 x i15> [[Y:%.*]], i64 0
6379; CHECK-NEXT:    [[TMP3:%.*]] = sext i15 [[TMP1]] to i32
6380; CHECK-NEXT:    [[TMP4:%.*]] = sext i15 [[TMP2]] to i32
6381; CHECK-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]]
6382; CHECK-NEXT:    [[TMP6:%.*]] = ashr i32 [[TMP5]], 30
6383; CHECK-NEXT:    [[TMP7:%.*]] = or i32 [[TMP6]], 1
6384; CHECK-NEXT:    [[TMP8:%.*]] = sitofp i32 [[TMP3]] to float
6385; CHECK-NEXT:    [[TMP9:%.*]] = sitofp i32 [[TMP4]] to float
6386; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]])
6387; CHECK-NEXT:    [[TMP11:%.*]] = fmul fast float [[TMP8]], [[TMP10]]
6388; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.trunc.f32(float [[TMP11]])
6389; CHECK-NEXT:    [[TMP13:%.*]] = fneg fast float [[TMP12]]
6390; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP13]], float [[TMP9]], float [[TMP8]])
6391; CHECK-NEXT:    [[TMP15:%.*]] = fptosi float [[TMP12]] to i32
6392; CHECK-NEXT:    [[TMP16:%.*]] = call fast float @llvm.fabs.f32(float [[TMP14]])
6393; CHECK-NEXT:    [[TMP17:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
6394; CHECK-NEXT:    [[TMP18:%.*]] = fcmp fast oge float [[TMP16]], [[TMP17]]
6395; CHECK-NEXT:    [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP7]], i32 0
6396; CHECK-NEXT:    [[TMP20:%.*]] = add i32 [[TMP15]], [[TMP19]]
6397; CHECK-NEXT:    [[TMP21:%.*]] = shl i32 [[TMP20]], 17
6398; CHECK-NEXT:    [[TMP22:%.*]] = ashr i32 [[TMP21]], 17
6399; CHECK-NEXT:    [[TMP23:%.*]] = trunc i32 [[TMP22]] to i15
6400; CHECK-NEXT:    [[TMP24:%.*]] = insertelement <3 x i15> undef, i15 [[TMP23]], i64 0
6401; CHECK-NEXT:    [[TMP25:%.*]] = extractelement <3 x i15> [[X]], i64 1
6402; CHECK-NEXT:    [[TMP26:%.*]] = extractelement <3 x i15> [[Y]], i64 1
6403; CHECK-NEXT:    [[TMP27:%.*]] = sext i15 [[TMP25]] to i32
6404; CHECK-NEXT:    [[TMP28:%.*]] = sext i15 [[TMP26]] to i32
6405; CHECK-NEXT:    [[TMP29:%.*]] = xor i32 [[TMP27]], [[TMP28]]
6406; CHECK-NEXT:    [[TMP30:%.*]] = ashr i32 [[TMP29]], 30
6407; CHECK-NEXT:    [[TMP31:%.*]] = or i32 [[TMP30]], 1
6408; CHECK-NEXT:    [[TMP32:%.*]] = sitofp i32 [[TMP27]] to float
6409; CHECK-NEXT:    [[TMP33:%.*]] = sitofp i32 [[TMP28]] to float
6410; CHECK-NEXT:    [[TMP34:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP33]])
6411; CHECK-NEXT:    [[TMP35:%.*]] = fmul fast float [[TMP32]], [[TMP34]]
6412; CHECK-NEXT:    [[TMP36:%.*]] = call fast float @llvm.trunc.f32(float [[TMP35]])
6413; CHECK-NEXT:    [[TMP37:%.*]] = fneg fast float [[TMP36]]
6414; CHECK-NEXT:    [[TMP38:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP37]], float [[TMP33]], float [[TMP32]])
6415; CHECK-NEXT:    [[TMP39:%.*]] = fptosi float [[TMP36]] to i32
6416; CHECK-NEXT:    [[TMP40:%.*]] = call fast float @llvm.fabs.f32(float [[TMP38]])
6417; CHECK-NEXT:    [[TMP41:%.*]] = call fast float @llvm.fabs.f32(float [[TMP33]])
6418; CHECK-NEXT:    [[TMP42:%.*]] = fcmp fast oge float [[TMP40]], [[TMP41]]
6419; CHECK-NEXT:    [[TMP43:%.*]] = select i1 [[TMP42]], i32 [[TMP31]], i32 0
6420; CHECK-NEXT:    [[TMP44:%.*]] = add i32 [[TMP39]], [[TMP43]]
6421; CHECK-NEXT:    [[TMP45:%.*]] = shl i32 [[TMP44]], 17
6422; CHECK-NEXT:    [[TMP46:%.*]] = ashr i32 [[TMP45]], 17
6423; CHECK-NEXT:    [[TMP47:%.*]] = trunc i32 [[TMP46]] to i15
6424; CHECK-NEXT:    [[TMP48:%.*]] = insertelement <3 x i15> [[TMP24]], i15 [[TMP47]], i64 1
6425; CHECK-NEXT:    [[TMP49:%.*]] = extractelement <3 x i15> [[X]], i64 2
6426; CHECK-NEXT:    [[TMP50:%.*]] = extractelement <3 x i15> [[Y]], i64 2
6427; CHECK-NEXT:    [[TMP51:%.*]] = sext i15 [[TMP49]] to i32
6428; CHECK-NEXT:    [[TMP52:%.*]] = sext i15 [[TMP50]] to i32
6429; CHECK-NEXT:    [[TMP53:%.*]] = xor i32 [[TMP51]], [[TMP52]]
6430; CHECK-NEXT:    [[TMP54:%.*]] = ashr i32 [[TMP53]], 30
6431; CHECK-NEXT:    [[TMP55:%.*]] = or i32 [[TMP54]], 1
6432; CHECK-NEXT:    [[TMP56:%.*]] = sitofp i32 [[TMP51]] to float
6433; CHECK-NEXT:    [[TMP57:%.*]] = sitofp i32 [[TMP52]] to float
6434; CHECK-NEXT:    [[TMP58:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP57]])
6435; CHECK-NEXT:    [[TMP59:%.*]] = fmul fast float [[TMP56]], [[TMP58]]
6436; CHECK-NEXT:    [[TMP60:%.*]] = call fast float @llvm.trunc.f32(float [[TMP59]])
6437; CHECK-NEXT:    [[TMP61:%.*]] = fneg fast float [[TMP60]]
6438; CHECK-NEXT:    [[TMP62:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP61]], float [[TMP57]], float [[TMP56]])
6439; CHECK-NEXT:    [[TMP63:%.*]] = fptosi float [[TMP60]] to i32
6440; CHECK-NEXT:    [[TMP64:%.*]] = call fast float @llvm.fabs.f32(float [[TMP62]])
6441; CHECK-NEXT:    [[TMP65:%.*]] = call fast float @llvm.fabs.f32(float [[TMP57]])
6442; CHECK-NEXT:    [[TMP66:%.*]] = fcmp fast oge float [[TMP64]], [[TMP65]]
6443; CHECK-NEXT:    [[TMP67:%.*]] = select i1 [[TMP66]], i32 [[TMP55]], i32 0
6444; CHECK-NEXT:    [[TMP68:%.*]] = add i32 [[TMP63]], [[TMP67]]
6445; CHECK-NEXT:    [[TMP69:%.*]] = shl i32 [[TMP68]], 17
6446; CHECK-NEXT:    [[TMP70:%.*]] = ashr i32 [[TMP69]], 17
6447; CHECK-NEXT:    [[TMP71:%.*]] = trunc i32 [[TMP70]] to i15
6448; CHECK-NEXT:    [[TMP72:%.*]] = insertelement <3 x i15> [[TMP48]], i15 [[TMP71]], i64 2
6449; CHECK-NEXT:    store <3 x i15> [[TMP72]], <3 x i15> addrspace(1)* [[OUT:%.*]], align 8
6450; CHECK-NEXT:    ret void
6451;
6452; GFX6-LABEL: sdiv_v3i15:
6453; GFX6:       ; %bb.0:
6454; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
6455; GFX6-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
6456; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
6457; GFX6-NEXT:    s_mov_b32 s7, 0xf000
6458; GFX6-NEXT:    s_mov_b32 s6, -1
6459; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
6460; GFX6-NEXT:    v_mov_b32_e32 v0, s2
6461; GFX6-NEXT:    v_alignbit_b32 v0, s3, v0, 30
6462; GFX6-NEXT:    s_bfe_i32 s3, s0, 0xf0000
6463; GFX6-NEXT:    v_cvt_f32_i32_e32 v2, s3
6464; GFX6-NEXT:    v_mov_b32_e32 v1, s0
6465; GFX6-NEXT:    v_alignbit_b32 v1, s1, v1, 30
6466; GFX6-NEXT:    s_bfe_i32 s1, s2, 0xf0000
6467; GFX6-NEXT:    v_cvt_f32_i32_e32 v3, s1
6468; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v2
6469; GFX6-NEXT:    s_xor_b32 s1, s1, s3
6470; GFX6-NEXT:    s_bfe_i32 s0, s0, 0xf000f
6471; GFX6-NEXT:    s_ashr_i32 s1, s1, 30
6472; GFX6-NEXT:    v_mul_f32_e32 v4, v3, v4
6473; GFX6-NEXT:    v_trunc_f32_e32 v4, v4
6474; GFX6-NEXT:    v_mad_f32 v3, -v4, v2, v3
6475; GFX6-NEXT:    v_cvt_i32_f32_e32 v4, v4
6476; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, |v2|
6477; GFX6-NEXT:    v_cvt_f32_i32_e32 v3, s0
6478; GFX6-NEXT:    s_or_b32 s1, s1, 1
6479; GFX6-NEXT:    v_mov_b32_e32 v5, s1
6480; GFX6-NEXT:    v_cndmask_b32_e32 v2, 0, v5, vcc
6481; GFX6-NEXT:    s_bfe_i32 s1, s2, 0xf000f
6482; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
6483; GFX6-NEXT:    v_cvt_f32_i32_e32 v4, s1
6484; GFX6-NEXT:    v_rcp_iflag_f32_e32 v5, v3
6485; GFX6-NEXT:    s_xor_b32 s0, s1, s0
6486; GFX6-NEXT:    v_bfe_i32 v1, v1, 0, 15
6487; GFX6-NEXT:    s_ashr_i32 s0, s0, 30
6488; GFX6-NEXT:    v_mul_f32_e32 v5, v4, v5
6489; GFX6-NEXT:    v_trunc_f32_e32 v5, v5
6490; GFX6-NEXT:    v_mad_f32 v4, -v5, v3, v4
6491; GFX6-NEXT:    v_cvt_i32_f32_e32 v5, v5
6492; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v4|, |v3|
6493; GFX6-NEXT:    v_cvt_f32_i32_e32 v4, v1
6494; GFX6-NEXT:    s_or_b32 s0, s0, 1
6495; GFX6-NEXT:    v_mov_b32_e32 v6, s0
6496; GFX6-NEXT:    v_cndmask_b32_e32 v3, 0, v6, vcc
6497; GFX6-NEXT:    v_bfe_i32 v0, v0, 0, 15
6498; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
6499; GFX6-NEXT:    v_cvt_f32_i32_e32 v5, v0
6500; GFX6-NEXT:    v_rcp_iflag_f32_e32 v6, v4
6501; GFX6-NEXT:    v_xor_b32_e32 v0, v0, v1
6502; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 30, v0
6503; GFX6-NEXT:    v_or_b32_e32 v0, 1, v0
6504; GFX6-NEXT:    v_mul_f32_e32 v1, v5, v6
6505; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
6506; GFX6-NEXT:    v_mad_f32 v5, -v1, v4, v5
6507; GFX6-NEXT:    v_cvt_i32_f32_e32 v1, v1
6508; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v5|, |v4|
6509; GFX6-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
6510; GFX6-NEXT:    s_movk_i32 s0, 0x7fff
6511; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
6512; GFX6-NEXT:    v_and_b32_e32 v3, s0, v3
6513; GFX6-NEXT:    v_lshl_b64 v[0:1], v[0:1], 30
6514; GFX6-NEXT:    v_and_b32_e32 v2, s0, v2
6515; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 15, v3
6516; GFX6-NEXT:    v_or_b32_e32 v2, v2, v3
6517; GFX6-NEXT:    v_or_b32_e32 v0, v2, v0
6518; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
6519; GFX6-NEXT:    s_waitcnt expcnt(0)
6520; GFX6-NEXT:    v_and_b32_e32 v0, 0x1fff, v1
6521; GFX6-NEXT:    buffer_store_short v0, off, s[4:7], 0 offset:4
6522; GFX6-NEXT:    s_endpgm
6523;
6524; GFX9-LABEL: sdiv_v3i15:
6525; GFX9:       ; %bb.0:
6526; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
6527; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
6528; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x34
6529; GFX9-NEXT:    v_mov_b32_e32 v2, 0
6530; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
6531; GFX9-NEXT:    s_bfe_i32 s1, s4, 0xf0000
6532; GFX9-NEXT:    s_bfe_i32 s0, s6, 0xf0000
6533; GFX9-NEXT:    v_cvt_f32_i32_e32 v3, s0
6534; GFX9-NEXT:    v_cvt_f32_i32_e32 v4, s1
6535; GFX9-NEXT:    s_xor_b32 s0, s1, s0
6536; GFX9-NEXT:    v_mov_b32_e32 v0, s4
6537; GFX9-NEXT:    v_rcp_iflag_f32_e32 v5, v3
6538; GFX9-NEXT:    s_ashr_i32 s0, s0, 30
6539; GFX9-NEXT:    v_alignbit_b32 v0, s5, v0, 30
6540; GFX9-NEXT:    s_or_b32 s5, s0, 1
6541; GFX9-NEXT:    v_mul_f32_e32 v5, v4, v5
6542; GFX9-NEXT:    v_trunc_f32_e32 v5, v5
6543; GFX9-NEXT:    v_mad_f32 v4, -v5, v3, v4
6544; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v4|, |v3|
6545; GFX9-NEXT:    s_and_b64 s[0:1], s[0:1], exec
6546; GFX9-NEXT:    v_cvt_i32_f32_e32 v5, v5
6547; GFX9-NEXT:    s_cselect_b32 s0, s5, 0
6548; GFX9-NEXT:    s_bfe_i32 s1, s6, 0xf000f
6549; GFX9-NEXT:    v_cvt_f32_i32_e32 v3, s1
6550; GFX9-NEXT:    v_add_u32_e32 v4, s0, v5
6551; GFX9-NEXT:    s_bfe_i32 s0, s4, 0xf000f
6552; GFX9-NEXT:    v_cvt_f32_i32_e32 v5, s0
6553; GFX9-NEXT:    v_rcp_iflag_f32_e32 v6, v3
6554; GFX9-NEXT:    v_mov_b32_e32 v1, s6
6555; GFX9-NEXT:    v_alignbit_b32 v1, s7, v1, 30
6556; GFX9-NEXT:    s_xor_b32 s0, s0, s1
6557; GFX9-NEXT:    v_mul_f32_e32 v6, v5, v6
6558; GFX9-NEXT:    v_trunc_f32_e32 v6, v6
6559; GFX9-NEXT:    s_ashr_i32 s0, s0, 30
6560; GFX9-NEXT:    v_mad_f32 v5, -v6, v3, v5
6561; GFX9-NEXT:    v_bfe_i32 v1, v1, 0, 15
6562; GFX9-NEXT:    s_or_b32 s4, s0, 1
6563; GFX9-NEXT:    v_cvt_i32_f32_e32 v6, v6
6564; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v5|, |v3|
6565; GFX9-NEXT:    v_cvt_f32_i32_e32 v3, v1
6566; GFX9-NEXT:    s_and_b64 s[0:1], s[0:1], exec
6567; GFX9-NEXT:    s_cselect_b32 s0, s4, 0
6568; GFX9-NEXT:    v_bfe_i32 v0, v0, 0, 15
6569; GFX9-NEXT:    v_add_u32_e32 v5, s0, v6
6570; GFX9-NEXT:    v_cvt_f32_i32_e32 v6, v0
6571; GFX9-NEXT:    v_rcp_iflag_f32_e32 v7, v3
6572; GFX9-NEXT:    v_xor_b32_e32 v0, v0, v1
6573; GFX9-NEXT:    v_ashrrev_i32_e32 v0, 30, v0
6574; GFX9-NEXT:    v_or_b32_e32 v0, 1, v0
6575; GFX9-NEXT:    v_mul_f32_e32 v1, v6, v7
6576; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
6577; GFX9-NEXT:    v_cvt_i32_f32_e32 v7, v1
6578; GFX9-NEXT:    v_mad_f32 v1, -v1, v3, v6
6579; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v3|
6580; GFX9-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
6581; GFX9-NEXT:    s_movk_i32 s0, 0x7fff
6582; GFX9-NEXT:    v_add_u32_e32 v0, v7, v0
6583; GFX9-NEXT:    v_and_b32_e32 v3, s0, v4
6584; GFX9-NEXT:    v_and_b32_e32 v4, s0, v5
6585; GFX9-NEXT:    v_lshlrev_b64 v[0:1], 30, v[0:1]
6586; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 15, v4
6587; GFX9-NEXT:    v_or_b32_e32 v3, v3, v4
6588; GFX9-NEXT:    v_or_b32_e32 v0, v3, v0
6589; GFX9-NEXT:    global_store_dword v2, v0, s[2:3]
6590; GFX9-NEXT:    v_and_b32_e32 v0, 0x1fff, v1
6591; GFX9-NEXT:    global_store_short v2, v0, s[2:3] offset:4
6592; GFX9-NEXT:    s_endpgm
6593;
6594; GFX90A-LABEL: sdiv_v3i15:
6595; GFX90A:       ; %bb.0:
6596; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
6597; GFX90A-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
6598; GFX90A-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x34
6599; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
6600; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
6601; GFX90A-NEXT:    s_bfe_i32 s1, s4, 0xf0000
6602; GFX90A-NEXT:    s_bfe_i32 s0, s6, 0xf0000
6603; GFX90A-NEXT:    v_cvt_f32_i32_e32 v3, s0
6604; GFX90A-NEXT:    v_cvt_f32_i32_e32 v4, s1
6605; GFX90A-NEXT:    s_xor_b32 s0, s1, s0
6606; GFX90A-NEXT:    v_mov_b32_e32 v0, s4
6607; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v5, v3
6608; GFX90A-NEXT:    s_ashr_i32 s0, s0, 30
6609; GFX90A-NEXT:    v_alignbit_b32 v0, s5, v0, 30
6610; GFX90A-NEXT:    s_or_b32 s5, s0, 1
6611; GFX90A-NEXT:    v_mul_f32_e32 v5, v4, v5
6612; GFX90A-NEXT:    v_trunc_f32_e32 v5, v5
6613; GFX90A-NEXT:    v_mad_f32 v4, -v5, v3, v4
6614; GFX90A-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v4|, |v3|
6615; GFX90A-NEXT:    s_and_b64 s[0:1], s[0:1], exec
6616; GFX90A-NEXT:    v_cvt_i32_f32_e32 v5, v5
6617; GFX90A-NEXT:    s_cselect_b32 s0, s5, 0
6618; GFX90A-NEXT:    s_bfe_i32 s1, s6, 0xf000f
6619; GFX90A-NEXT:    v_cvt_f32_i32_e32 v3, s1
6620; GFX90A-NEXT:    v_add_u32_e32 v4, s0, v5
6621; GFX90A-NEXT:    s_bfe_i32 s0, s4, 0xf000f
6622; GFX90A-NEXT:    v_cvt_f32_i32_e32 v5, s0
6623; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v6, v3
6624; GFX90A-NEXT:    v_mov_b32_e32 v1, s6
6625; GFX90A-NEXT:    v_alignbit_b32 v1, s7, v1, 30
6626; GFX90A-NEXT:    s_xor_b32 s0, s0, s1
6627; GFX90A-NEXT:    v_mul_f32_e32 v6, v5, v6
6628; GFX90A-NEXT:    v_trunc_f32_e32 v6, v6
6629; GFX90A-NEXT:    s_ashr_i32 s0, s0, 30
6630; GFX90A-NEXT:    v_mad_f32 v5, -v6, v3, v5
6631; GFX90A-NEXT:    v_bfe_i32 v1, v1, 0, 15
6632; GFX90A-NEXT:    s_or_b32 s4, s0, 1
6633; GFX90A-NEXT:    v_cvt_i32_f32_e32 v6, v6
6634; GFX90A-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v5|, |v3|
6635; GFX90A-NEXT:    v_cvt_f32_i32_e32 v3, v1
6636; GFX90A-NEXT:    s_and_b64 s[0:1], s[0:1], exec
6637; GFX90A-NEXT:    s_cselect_b32 s0, s4, 0
6638; GFX90A-NEXT:    v_bfe_i32 v0, v0, 0, 15
6639; GFX90A-NEXT:    v_add_u32_e32 v5, s0, v6
6640; GFX90A-NEXT:    v_cvt_f32_i32_e32 v6, v0
6641; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v7, v3
6642; GFX90A-NEXT:    v_xor_b32_e32 v0, v0, v1
6643; GFX90A-NEXT:    v_ashrrev_i32_e32 v0, 30, v0
6644; GFX90A-NEXT:    v_or_b32_e32 v0, 1, v0
6645; GFX90A-NEXT:    v_mul_f32_e32 v1, v6, v7
6646; GFX90A-NEXT:    v_trunc_f32_e32 v1, v1
6647; GFX90A-NEXT:    v_cvt_i32_f32_e32 v7, v1
6648; GFX90A-NEXT:    v_mad_f32 v1, -v1, v3, v6
6649; GFX90A-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v3|
6650; GFX90A-NEXT:    s_movk_i32 s0, 0x7fff
6651; GFX90A-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
6652; GFX90A-NEXT:    v_and_b32_e32 v3, s0, v4
6653; GFX90A-NEXT:    v_and_b32_e32 v4, s0, v5
6654; GFX90A-NEXT:    v_add_u32_e32 v0, v7, v0
6655; GFX90A-NEXT:    v_lshlrev_b32_e32 v4, 15, v4
6656; GFX90A-NEXT:    v_lshlrev_b64 v[0:1], 30, v[0:1]
6657; GFX90A-NEXT:    v_or_b32_e32 v3, v3, v4
6658; GFX90A-NEXT:    v_or_b32_e32 v0, v3, v0
6659; GFX90A-NEXT:    global_store_dword v2, v0, s[2:3]
6660; GFX90A-NEXT:    v_and_b32_e32 v0, 0x1fff, v1
6661; GFX90A-NEXT:    global_store_short v2, v0, s[2:3] offset:4
6662; GFX90A-NEXT:    s_endpgm
; Input IR under test: one vector sdiv whose result is stored to %out.
6663  %r = sdiv <3 x i15> %x, %y
6664  store <3 x i15> %r, <3 x i15> addrspace(1)* %out
6665  ret void
6666}
6667
6668define amdgpu_kernel void @srem_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x, <3 x i15> %y) {
6669; CHECK-LABEL: @srem_v3i15(
6670; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <3 x i15> [[X:%.*]], i64 0
6671; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <3 x i15> [[Y:%.*]], i64 0
6672; CHECK-NEXT:    [[TMP3:%.*]] = sext i15 [[TMP1]] to i32
6673; CHECK-NEXT:    [[TMP4:%.*]] = sext i15 [[TMP2]] to i32
6674; CHECK-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]]
6675; CHECK-NEXT:    [[TMP6:%.*]] = ashr i32 [[TMP5]], 30
6676; CHECK-NEXT:    [[TMP7:%.*]] = or i32 [[TMP6]], 1
6677; CHECK-NEXT:    [[TMP8:%.*]] = sitofp i32 [[TMP3]] to float
6678; CHECK-NEXT:    [[TMP9:%.*]] = sitofp i32 [[TMP4]] to float
6679; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]])
6680; CHECK-NEXT:    [[TMP11:%.*]] = fmul fast float [[TMP8]], [[TMP10]]
6681; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.trunc.f32(float [[TMP11]])
6682; CHECK-NEXT:    [[TMP13:%.*]] = fneg fast float [[TMP12]]
6683; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP13]], float [[TMP9]], float [[TMP8]])
6684; CHECK-NEXT:    [[TMP15:%.*]] = fptosi float [[TMP12]] to i32
6685; CHECK-NEXT:    [[TMP16:%.*]] = call fast float @llvm.fabs.f32(float [[TMP14]])
6686; CHECK-NEXT:    [[TMP17:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
6687; CHECK-NEXT:    [[TMP18:%.*]] = fcmp fast oge float [[TMP16]], [[TMP17]]
6688; CHECK-NEXT:    [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP7]], i32 0
6689; CHECK-NEXT:    [[TMP20:%.*]] = add i32 [[TMP15]], [[TMP19]]
6690; CHECK-NEXT:    [[TMP21:%.*]] = mul i32 [[TMP20]], [[TMP4]]
6691; CHECK-NEXT:    [[TMP22:%.*]] = sub i32 [[TMP3]], [[TMP21]]
6692; CHECK-NEXT:    [[TMP23:%.*]] = shl i32 [[TMP22]], 17
6693; CHECK-NEXT:    [[TMP24:%.*]] = ashr i32 [[TMP23]], 17
6694; CHECK-NEXT:    [[TMP25:%.*]] = trunc i32 [[TMP24]] to i15
6695; CHECK-NEXT:    [[TMP26:%.*]] = insertelement <3 x i15> undef, i15 [[TMP25]], i64 0
6696; CHECK-NEXT:    [[TMP27:%.*]] = extractelement <3 x i15> [[X]], i64 1
6697; CHECK-NEXT:    [[TMP28:%.*]] = extractelement <3 x i15> [[Y]], i64 1
6698; CHECK-NEXT:    [[TMP29:%.*]] = sext i15 [[TMP27]] to i32
6699; CHECK-NEXT:    [[TMP30:%.*]] = sext i15 [[TMP28]] to i32
6700; CHECK-NEXT:    [[TMP31:%.*]] = xor i32 [[TMP29]], [[TMP30]]
6701; CHECK-NEXT:    [[TMP32:%.*]] = ashr i32 [[TMP31]], 30
6702; CHECK-NEXT:    [[TMP33:%.*]] = or i32 [[TMP32]], 1
6703; CHECK-NEXT:    [[TMP34:%.*]] = sitofp i32 [[TMP29]] to float
6704; CHECK-NEXT:    [[TMP35:%.*]] = sitofp i32 [[TMP30]] to float
6705; CHECK-NEXT:    [[TMP36:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP35]])
6706; CHECK-NEXT:    [[TMP37:%.*]] = fmul fast float [[TMP34]], [[TMP36]]
6707; CHECK-NEXT:    [[TMP38:%.*]] = call fast float @llvm.trunc.f32(float [[TMP37]])
6708; CHECK-NEXT:    [[TMP39:%.*]] = fneg fast float [[TMP38]]
6709; CHECK-NEXT:    [[TMP40:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP39]], float [[TMP35]], float [[TMP34]])
6710; CHECK-NEXT:    [[TMP41:%.*]] = fptosi float [[TMP38]] to i32
6711; CHECK-NEXT:    [[TMP42:%.*]] = call fast float @llvm.fabs.f32(float [[TMP40]])
6712; CHECK-NEXT:    [[TMP43:%.*]] = call fast float @llvm.fabs.f32(float [[TMP35]])
6713; CHECK-NEXT:    [[TMP44:%.*]] = fcmp fast oge float [[TMP42]], [[TMP43]]
6714; CHECK-NEXT:    [[TMP45:%.*]] = select i1 [[TMP44]], i32 [[TMP33]], i32 0
6715; CHECK-NEXT:    [[TMP46:%.*]] = add i32 [[TMP41]], [[TMP45]]
6716; CHECK-NEXT:    [[TMP47:%.*]] = mul i32 [[TMP46]], [[TMP30]]
6717; CHECK-NEXT:    [[TMP48:%.*]] = sub i32 [[TMP29]], [[TMP47]]
6718; CHECK-NEXT:    [[TMP49:%.*]] = shl i32 [[TMP48]], 17
6719; CHECK-NEXT:    [[TMP50:%.*]] = ashr i32 [[TMP49]], 17
6720; CHECK-NEXT:    [[TMP51:%.*]] = trunc i32 [[TMP50]] to i15
6721; CHECK-NEXT:    [[TMP52:%.*]] = insertelement <3 x i15> [[TMP26]], i15 [[TMP51]], i64 1
6722; CHECK-NEXT:    [[TMP53:%.*]] = extractelement <3 x i15> [[X]], i64 2
6723; CHECK-NEXT:    [[TMP54:%.*]] = extractelement <3 x i15> [[Y]], i64 2
6724; CHECK-NEXT:    [[TMP55:%.*]] = sext i15 [[TMP53]] to i32
6725; CHECK-NEXT:    [[TMP56:%.*]] = sext i15 [[TMP54]] to i32
6726; CHECK-NEXT:    [[TMP57:%.*]] = xor i32 [[TMP55]], [[TMP56]]
6727; CHECK-NEXT:    [[TMP58:%.*]] = ashr i32 [[TMP57]], 30
6728; CHECK-NEXT:    [[TMP59:%.*]] = or i32 [[TMP58]], 1
6729; CHECK-NEXT:    [[TMP60:%.*]] = sitofp i32 [[TMP55]] to float
6730; CHECK-NEXT:    [[TMP61:%.*]] = sitofp i32 [[TMP56]] to float
6731; CHECK-NEXT:    [[TMP62:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP61]])
6732; CHECK-NEXT:    [[TMP63:%.*]] = fmul fast float [[TMP60]], [[TMP62]]
6733; CHECK-NEXT:    [[TMP64:%.*]] = call fast float @llvm.trunc.f32(float [[TMP63]])
6734; CHECK-NEXT:    [[TMP65:%.*]] = fneg fast float [[TMP64]]
6735; CHECK-NEXT:    [[TMP66:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP65]], float [[TMP61]], float [[TMP60]])
6736; CHECK-NEXT:    [[TMP67:%.*]] = fptosi float [[TMP64]] to i32
6737; CHECK-NEXT:    [[TMP68:%.*]] = call fast float @llvm.fabs.f32(float [[TMP66]])
6738; CHECK-NEXT:    [[TMP69:%.*]] = call fast float @llvm.fabs.f32(float [[TMP61]])
6739; CHECK-NEXT:    [[TMP70:%.*]] = fcmp fast oge float [[TMP68]], [[TMP69]]
6740; CHECK-NEXT:    [[TMP71:%.*]] = select i1 [[TMP70]], i32 [[TMP59]], i32 0
6741; CHECK-NEXT:    [[TMP72:%.*]] = add i32 [[TMP67]], [[TMP71]]
6742; CHECK-NEXT:    [[TMP73:%.*]] = mul i32 [[TMP72]], [[TMP56]]
6743; CHECK-NEXT:    [[TMP74:%.*]] = sub i32 [[TMP55]], [[TMP73]]
6744; CHECK-NEXT:    [[TMP75:%.*]] = shl i32 [[TMP74]], 17
6745; CHECK-NEXT:    [[TMP76:%.*]] = ashr i32 [[TMP75]], 17
6746; CHECK-NEXT:    [[TMP77:%.*]] = trunc i32 [[TMP76]] to i15
6747; CHECK-NEXT:    [[TMP78:%.*]] = insertelement <3 x i15> [[TMP52]], i15 [[TMP77]], i64 2
6748; CHECK-NEXT:    store <3 x i15> [[TMP78]], <3 x i15> addrspace(1)* [[OUT:%.*]], align 8
6749; CHECK-NEXT:    ret void
6750;
6751; GFX6-LABEL: srem_v3i15:
6752; GFX6:       ; %bb.0:
6753; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
6754; GFX6-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
6755; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
6756; GFX6-NEXT:    s_mov_b32 s7, 0xf000
6757; GFX6-NEXT:    s_mov_b32 s6, -1
6758; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
6759; GFX6-NEXT:    v_mov_b32_e32 v0, s2
6760; GFX6-NEXT:    v_alignbit_b32 v0, s3, v0, 30
6761; GFX6-NEXT:    s_movk_i32 s3, 0x7fff
6762; GFX6-NEXT:    s_and_b32 s11, s0, s3
6763; GFX6-NEXT:    s_bfe_i32 s11, s11, 0xf0000
6764; GFX6-NEXT:    v_cvt_f32_i32_e32 v2, s11
6765; GFX6-NEXT:    s_and_b32 s9, s2, s3
6766; GFX6-NEXT:    s_bfe_i32 s9, s9, 0xf0000
6767; GFX6-NEXT:    v_cvt_f32_i32_e32 v3, s9
6768; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v2
6769; GFX6-NEXT:    s_xor_b32 s9, s9, s11
6770; GFX6-NEXT:    s_ashr_i32 s9, s9, 30
6771; GFX6-NEXT:    s_or_b32 s9, s9, 1
6772; GFX6-NEXT:    v_mul_f32_e32 v4, v3, v4
6773; GFX6-NEXT:    v_trunc_f32_e32 v4, v4
6774; GFX6-NEXT:    v_mad_f32 v3, -v4, v2, v3
6775; GFX6-NEXT:    v_cvt_i32_f32_e32 v4, v4
6776; GFX6-NEXT:    v_mov_b32_e32 v5, s9
6777; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, |v2|
6778; GFX6-NEXT:    v_cndmask_b32_e32 v2, 0, v5, vcc
6779; GFX6-NEXT:    v_mov_b32_e32 v1, s0
6780; GFX6-NEXT:    s_bfe_u32 s12, s0, 0xf000f
6781; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
6782; GFX6-NEXT:    v_alignbit_b32 v1, s1, v1, 30
6783; GFX6-NEXT:    s_lshr_b32 s1, s0, 15
6784; GFX6-NEXT:    v_mul_lo_u32 v2, v2, s0
6785; GFX6-NEXT:    s_bfe_i32 s0, s12, 0xf0000
6786; GFX6-NEXT:    v_cvt_f32_i32_e32 v3, s0
6787; GFX6-NEXT:    s_bfe_u32 s10, s2, 0xf000f
6788; GFX6-NEXT:    s_lshr_b32 s8, s2, 15
6789; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s2, v2
6790; GFX6-NEXT:    s_bfe_i32 s2, s10, 0xf0000
6791; GFX6-NEXT:    v_cvt_f32_i32_e32 v4, s2
6792; GFX6-NEXT:    v_rcp_iflag_f32_e32 v5, v3
6793; GFX6-NEXT:    s_xor_b32 s0, s2, s0
6794; GFX6-NEXT:    s_ashr_i32 s0, s0, 30
6795; GFX6-NEXT:    s_or_b32 s0, s0, 1
6796; GFX6-NEXT:    v_mul_f32_e32 v5, v4, v5
6797; GFX6-NEXT:    v_trunc_f32_e32 v5, v5
6798; GFX6-NEXT:    v_mad_f32 v4, -v5, v3, v4
6799; GFX6-NEXT:    v_cvt_i32_f32_e32 v5, v5
6800; GFX6-NEXT:    v_and_b32_e32 v1, s3, v1
6801; GFX6-NEXT:    v_mov_b32_e32 v6, s0
6802; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v4|, |v3|
6803; GFX6-NEXT:    v_cndmask_b32_e32 v3, 0, v6, vcc
6804; GFX6-NEXT:    v_bfe_i32 v4, v1, 0, 15
6805; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
6806; GFX6-NEXT:    v_cvt_f32_i32_e32 v5, v4
6807; GFX6-NEXT:    v_and_b32_e32 v0, s3, v0
6808; GFX6-NEXT:    v_bfe_i32 v6, v0, 0, 15
6809; GFX6-NEXT:    v_cvt_f32_i32_e32 v7, v6
6810; GFX6-NEXT:    v_rcp_iflag_f32_e32 v8, v5
6811; GFX6-NEXT:    v_xor_b32_e32 v4, v6, v4
6812; GFX6-NEXT:    v_ashrrev_i32_e32 v4, 30, v4
6813; GFX6-NEXT:    v_or_b32_e32 v4, 1, v4
6814; GFX6-NEXT:    v_mul_f32_e32 v6, v7, v8
6815; GFX6-NEXT:    v_trunc_f32_e32 v6, v6
6816; GFX6-NEXT:    v_mad_f32 v7, -v6, v5, v7
6817; GFX6-NEXT:    v_cvt_i32_f32_e32 v6, v6
6818; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v7|, |v5|
6819; GFX6-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
6820; GFX6-NEXT:    v_mul_lo_u32 v3, v3, s1
6821; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
6822; GFX6-NEXT:    v_mul_lo_u32 v1, v4, v1
6823; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s8, v3
6824; GFX6-NEXT:    v_and_b32_e32 v3, s3, v3
6825; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, v1, v0
6826; GFX6-NEXT:    v_lshl_b64 v[0:1], v[0:1], 30
6827; GFX6-NEXT:    v_and_b32_e32 v2, s3, v2
6828; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 15, v3
6829; GFX6-NEXT:    v_or_b32_e32 v2, v2, v3
6830; GFX6-NEXT:    v_or_b32_e32 v0, v2, v0
6831; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
6832; GFX6-NEXT:    s_waitcnt expcnt(0)
6833; GFX6-NEXT:    v_and_b32_e32 v0, 0x1fff, v1
6834; GFX6-NEXT:    buffer_store_short v0, off, s[4:7], 0 offset:4
6835; GFX6-NEXT:    s_endpgm
6836;
6837; GFX9-LABEL: srem_v3i15:
6838; GFX9:       ; %bb.0:
6839; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
6840; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
6841; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x34
6842; GFX9-NEXT:    s_movk_i32 s8, 0x7fff
6843; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
6844; GFX9-NEXT:    s_and_b32 s0, s4, s8
6845; GFX9-NEXT:    s_and_b32 s1, s6, s8
6846; GFX9-NEXT:    s_bfe_i32 s1, s1, 0xf0000
6847; GFX9-NEXT:    v_cvt_f32_i32_e32 v2, s1
6848; GFX9-NEXT:    s_bfe_i32 s0, s0, 0xf0000
6849; GFX9-NEXT:    v_cvt_f32_i32_e32 v3, s0
6850; GFX9-NEXT:    s_xor_b32 s0, s0, s1
6851; GFX9-NEXT:    v_rcp_iflag_f32_e32 v4, v2
6852; GFX9-NEXT:    v_mov_b32_e32 v0, s4
6853; GFX9-NEXT:    v_mov_b32_e32 v1, s6
6854; GFX9-NEXT:    s_ashr_i32 s0, s0, 30
6855; GFX9-NEXT:    v_mul_f32_e32 v4, v3, v4
6856; GFX9-NEXT:    v_trunc_f32_e32 v4, v4
6857; GFX9-NEXT:    v_mad_f32 v3, -v4, v2, v3
6858; GFX9-NEXT:    v_cvt_i32_f32_e32 v4, v4
6859; GFX9-NEXT:    s_lshr_b32 s9, s4, 15
6860; GFX9-NEXT:    v_alignbit_b32 v0, s5, v0, 30
6861; GFX9-NEXT:    s_bfe_u32 s5, s4, 0xf000f
6862; GFX9-NEXT:    v_alignbit_b32 v1, s7, v1, 30
6863; GFX9-NEXT:    s_lshr_b32 s7, s6, 15
6864; GFX9-NEXT:    s_bfe_u32 s10, s6, 0xf000f
6865; GFX9-NEXT:    s_or_b32 s11, s0, 1
6866; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v3|, |v2|
6867; GFX9-NEXT:    s_and_b64 s[0:1], s[0:1], exec
6868; GFX9-NEXT:    s_cselect_b32 s0, s11, 0
6869; GFX9-NEXT:    v_add_u32_e32 v2, s0, v4
6870; GFX9-NEXT:    s_bfe_i32 s0, s10, 0xf0000
6871; GFX9-NEXT:    v_cvt_f32_i32_e32 v3, s0
6872; GFX9-NEXT:    s_bfe_i32 s1, s5, 0xf0000
6873; GFX9-NEXT:    v_cvt_f32_i32_e32 v4, s1
6874; GFX9-NEXT:    s_xor_b32 s0, s1, s0
6875; GFX9-NEXT:    v_rcp_iflag_f32_e32 v5, v3
6876; GFX9-NEXT:    s_ashr_i32 s0, s0, 30
6877; GFX9-NEXT:    s_or_b32 s5, s0, 1
6878; GFX9-NEXT:    v_and_b32_e32 v1, s8, v1
6879; GFX9-NEXT:    v_mul_f32_e32 v5, v4, v5
6880; GFX9-NEXT:    v_trunc_f32_e32 v5, v5
6881; GFX9-NEXT:    v_mad_f32 v4, -v5, v3, v4
6882; GFX9-NEXT:    v_cvt_i32_f32_e32 v5, v5
6883; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v4|, |v3|
6884; GFX9-NEXT:    s_and_b64 s[0:1], s[0:1], exec
6885; GFX9-NEXT:    s_cselect_b32 s0, s5, 0
6886; GFX9-NEXT:    v_bfe_i32 v4, v1, 0, 15
6887; GFX9-NEXT:    v_add_u32_e32 v3, s0, v5
6888; GFX9-NEXT:    v_cvt_f32_i32_e32 v5, v4
6889; GFX9-NEXT:    v_and_b32_e32 v0, s8, v0
6890; GFX9-NEXT:    v_bfe_i32 v6, v0, 0, 15
6891; GFX9-NEXT:    v_cvt_f32_i32_e32 v7, v6
6892; GFX9-NEXT:    v_rcp_iflag_f32_e32 v8, v5
6893; GFX9-NEXT:    v_xor_b32_e32 v4, v6, v4
6894; GFX9-NEXT:    v_ashrrev_i32_e32 v4, 30, v4
6895; GFX9-NEXT:    v_or_b32_e32 v4, 1, v4
6896; GFX9-NEXT:    v_mul_f32_e32 v6, v7, v8
6897; GFX9-NEXT:    v_trunc_f32_e32 v6, v6
6898; GFX9-NEXT:    v_cvt_i32_f32_e32 v8, v6
6899; GFX9-NEXT:    v_mad_f32 v6, -v6, v5, v7
6900; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v6|, |v5|
6901; GFX9-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
6902; GFX9-NEXT:    v_mul_lo_u32 v3, v3, s7
6903; GFX9-NEXT:    v_add_u32_e32 v4, v8, v4
6904; GFX9-NEXT:    v_mul_lo_u32 v2, v2, s6
6905; GFX9-NEXT:    v_mul_lo_u32 v1, v4, v1
6906; GFX9-NEXT:    v_sub_u32_e32 v3, s9, v3
6907; GFX9-NEXT:    v_and_b32_e32 v3, s8, v3
6908; GFX9-NEXT:    v_sub_u32_e32 v2, s4, v2
6909; GFX9-NEXT:    v_sub_u32_e32 v0, v0, v1
6910; GFX9-NEXT:    v_lshlrev_b64 v[0:1], 30, v[0:1]
6911; GFX9-NEXT:    v_and_b32_e32 v2, s8, v2
6912; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 15, v3
6913; GFX9-NEXT:    v_or_b32_e32 v2, v2, v3
6914; GFX9-NEXT:    v_mov_b32_e32 v4, 0
6915; GFX9-NEXT:    v_or_b32_e32 v0, v2, v0
6916; GFX9-NEXT:    global_store_dword v4, v0, s[2:3]
6917; GFX9-NEXT:    v_and_b32_e32 v0, 0x1fff, v1
6918; GFX9-NEXT:    global_store_short v4, v0, s[2:3] offset:4
6919; GFX9-NEXT:    s_endpgm
6920;
6921; GFX90A-LABEL: srem_v3i15:
6922; GFX90A:       ; %bb.0:
6923; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
6924; GFX90A-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
6925; GFX90A-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x34
6926; GFX90A-NEXT:    s_movk_i32 s8, 0x7fff
6927; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
6928; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
6929; GFX90A-NEXT:    s_and_b32 s0, s4, s8
6930; GFX90A-NEXT:    s_and_b32 s1, s6, s8
6931; GFX90A-NEXT:    s_bfe_i32 s1, s1, 0xf0000
6932; GFX90A-NEXT:    v_cvt_f32_i32_e32 v3, s1
6933; GFX90A-NEXT:    s_bfe_i32 s0, s0, 0xf0000
6934; GFX90A-NEXT:    v_cvt_f32_i32_e32 v4, s0
6935; GFX90A-NEXT:    s_xor_b32 s0, s0, s1
6936; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v5, v3
6937; GFX90A-NEXT:    v_mov_b32_e32 v0, s4
6938; GFX90A-NEXT:    v_mov_b32_e32 v1, s6
6939; GFX90A-NEXT:    s_ashr_i32 s0, s0, 30
6940; GFX90A-NEXT:    v_mul_f32_e32 v5, v4, v5
6941; GFX90A-NEXT:    v_trunc_f32_e32 v5, v5
6942; GFX90A-NEXT:    v_mad_f32 v4, -v5, v3, v4
6943; GFX90A-NEXT:    v_cvt_i32_f32_e32 v5, v5
6944; GFX90A-NEXT:    v_alignbit_b32 v0, s5, v0, 30
6945; GFX90A-NEXT:    s_lshr_b32 s5, s4, 15
6946; GFX90A-NEXT:    s_bfe_u32 s9, s4, 0xf000f
6947; GFX90A-NEXT:    v_alignbit_b32 v1, s7, v1, 30
6948; GFX90A-NEXT:    s_lshr_b32 s7, s6, 15
6949; GFX90A-NEXT:    s_bfe_u32 s10, s6, 0xf000f
6950; GFX90A-NEXT:    s_or_b32 s11, s0, 1
6951; GFX90A-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v4|, |v3|
6952; GFX90A-NEXT:    s_and_b64 s[0:1], s[0:1], exec
6953; GFX90A-NEXT:    s_cselect_b32 s0, s11, 0
6954; GFX90A-NEXT:    v_add_u32_e32 v3, s0, v5
6955; GFX90A-NEXT:    s_bfe_i32 s0, s10, 0xf0000
6956; GFX90A-NEXT:    v_cvt_f32_i32_e32 v4, s0
6957; GFX90A-NEXT:    s_bfe_i32 s1, s9, 0xf0000
6958; GFX90A-NEXT:    v_cvt_f32_i32_e32 v5, s1
6959; GFX90A-NEXT:    s_xor_b32 s0, s1, s0
6960; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v6, v4
6961; GFX90A-NEXT:    v_mul_lo_u32 v3, v3, s6
6962; GFX90A-NEXT:    s_ashr_i32 s0, s0, 30
6963; GFX90A-NEXT:    v_sub_u32_e32 v3, s4, v3
6964; GFX90A-NEXT:    v_mul_f32_e32 v6, v5, v6
6965; GFX90A-NEXT:    v_trunc_f32_e32 v6, v6
6966; GFX90A-NEXT:    v_mad_f32 v5, -v6, v4, v5
6967; GFX90A-NEXT:    v_cvt_i32_f32_e32 v6, v6
6968; GFX90A-NEXT:    s_or_b32 s4, s0, 1
6969; GFX90A-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v5|, |v4|
6970; GFX90A-NEXT:    v_and_b32_e32 v1, s8, v1
6971; GFX90A-NEXT:    s_and_b64 s[0:1], s[0:1], exec
6972; GFX90A-NEXT:    s_cselect_b32 s0, s4, 0
6973; GFX90A-NEXT:    v_bfe_i32 v5, v1, 0, 15
6974; GFX90A-NEXT:    v_add_u32_e32 v4, s0, v6
6975; GFX90A-NEXT:    v_cvt_f32_i32_e32 v6, v5
6976; GFX90A-NEXT:    v_and_b32_e32 v0, s8, v0
6977; GFX90A-NEXT:    v_bfe_i32 v7, v0, 0, 15
6978; GFX90A-NEXT:    v_cvt_f32_i32_e32 v8, v7
6979; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v9, v6
6980; GFX90A-NEXT:    v_xor_b32_e32 v5, v7, v5
6981; GFX90A-NEXT:    v_ashrrev_i32_e32 v5, 30, v5
6982; GFX90A-NEXT:    v_or_b32_e32 v5, 1, v5
6983; GFX90A-NEXT:    v_mul_f32_e32 v7, v8, v9
6984; GFX90A-NEXT:    v_trunc_f32_e32 v7, v7
6985; GFX90A-NEXT:    v_cvt_i32_f32_e32 v9, v7
6986; GFX90A-NEXT:    v_mad_f32 v7, -v7, v6, v8
6987; GFX90A-NEXT:    v_cmp_ge_f32_e64 vcc, |v7|, |v6|
6988; GFX90A-NEXT:    v_mul_lo_u32 v4, v4, s7
6989; GFX90A-NEXT:    v_cndmask_b32_e32 v5, 0, v5, vcc
6990; GFX90A-NEXT:    v_sub_u32_e32 v4, s5, v4
6991; GFX90A-NEXT:    v_add_u32_e32 v5, v9, v5
6992; GFX90A-NEXT:    v_mul_lo_u32 v1, v5, v1
6993; GFX90A-NEXT:    v_and_b32_e32 v4, s8, v4
6994; GFX90A-NEXT:    v_sub_u32_e32 v0, v0, v1
6995; GFX90A-NEXT:    v_and_b32_e32 v3, s8, v3
6996; GFX90A-NEXT:    v_lshlrev_b32_e32 v4, 15, v4
6997; GFX90A-NEXT:    v_lshlrev_b64 v[0:1], 30, v[0:1]
6998; GFX90A-NEXT:    v_or_b32_e32 v3, v3, v4
6999; GFX90A-NEXT:    v_or_b32_e32 v0, v3, v0
7000; GFX90A-NEXT:    global_store_dword v2, v0, s[2:3]
7001; GFX90A-NEXT:    v_and_b32_e32 v0, 0x1fff, v1
7002; GFX90A-NEXT:    global_store_short v2, v0, s[2:3] offset:4
7003; GFX90A-NEXT:    s_endpgm
7004  %r = srem <3 x i15> %x, %y
7005  store <3 x i15> %r, <3 x i15> addrspace(1)* %out
7006  ret void
7007}
7008
; Unsigned i32 division by the odd constant 1235195.
; The opt run (-amdgpu-codegenprepare) is expected to leave the udiv as written.
; The three llc runs are expected to contain no divide expansion: the quotient
; is built from a multiply-high by 0xb2a50881, a subtract, a shift right by 1,
; an add, and a final shift right by 20. The tahiti output does this with
; vector ALU instructions; the gfx900 and gfx90a outputs use scalar ones.
7009define amdgpu_kernel void @udiv_i32_oddk_denom(i32 addrspace(1)* %out, i32 %x) {
7010; CHECK-LABEL: @udiv_i32_oddk_denom(
7011; CHECK-NEXT:    [[R:%.*]] = udiv i32 [[X:%.*]], 1235195
7012; CHECK-NEXT:    store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4
7013; CHECK-NEXT:    ret void
7014;
7015; GFX6-LABEL: udiv_i32_oddk_denom:
7016; GFX6:       ; %bb.0:
7017; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
7018; GFX6-NEXT:    s_load_dword s0, s[0:1], 0xb
7019; GFX6-NEXT:    v_mov_b32_e32 v0, 0xb2a50881
7020; GFX6-NEXT:    s_mov_b32 s7, 0xf000
7021; GFX6-NEXT:    s_mov_b32 s6, -1
7022; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
7023; GFX6-NEXT:    v_mul_hi_u32 v0, s0, v0
7024; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s0, v0
7025; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 1, v1
7026; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
7027; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 20, v0
7028; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
7029; GFX6-NEXT:    s_endpgm
7030;
7031; GFX9-LABEL: udiv_i32_oddk_denom:
7032; GFX9:       ; %bb.0:
7033; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
7034; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x2c
7035; GFX9-NEXT:    v_mov_b32_e32 v0, 0
7036; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
7037; GFX9-NEXT:    s_mul_hi_u32 s0, s4, 0xb2a50881
7038; GFX9-NEXT:    s_sub_i32 s1, s4, s0
7039; GFX9-NEXT:    s_lshr_b32 s1, s1, 1
7040; GFX9-NEXT:    s_add_i32 s1, s1, s0
7041; GFX9-NEXT:    s_lshr_b32 s0, s1, 20
7042; GFX9-NEXT:    v_mov_b32_e32 v1, s0
7043; GFX9-NEXT:    global_store_dword v0, v1, s[2:3]
7044; GFX9-NEXT:    s_endpgm
7045;
7046; GFX90A-LABEL: udiv_i32_oddk_denom:
7047; GFX90A:       ; %bb.0:
7048; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
7049; GFX90A-NEXT:    s_load_dword s4, s[0:1], 0x2c
7050; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
7051; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
7052; GFX90A-NEXT:    s_mul_hi_u32 s0, s4, 0xb2a50881
7053; GFX90A-NEXT:    s_sub_i32 s1, s4, s0
7054; GFX90A-NEXT:    s_lshr_b32 s1, s1, 1
7055; GFX90A-NEXT:    s_add_i32 s1, s1, s0
7056; GFX90A-NEXT:    s_lshr_b32 s0, s1, 20
7057; GFX90A-NEXT:    v_mov_b32_e32 v1, s0
7058; GFX90A-NEXT:    global_store_dword v0, v1, s[2:3]
7059; GFX90A-NEXT:    s_endpgm
7060  %r = udiv i32 %x, 1235195
7061  store i32 %r, i32 addrspace(1)* %out
7062  ret void
7063}
7064
; Unsigned i32 division by the power-of-two constant 4096.
; The opt run (-amdgpu-codegenprepare) is expected to leave the udiv as written.
; Each llc run is expected to reduce the division to a single scalar logical
; shift right by 12 (s_lshr_b32), followed by the store of the result.
7065define amdgpu_kernel void @udiv_i32_pow2k_denom(i32 addrspace(1)* %out, i32 %x) {
7066; CHECK-LABEL: @udiv_i32_pow2k_denom(
7067; CHECK-NEXT:    [[R:%.*]] = udiv i32 [[X:%.*]], 4096
7068; CHECK-NEXT:    store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4
7069; CHECK-NEXT:    ret void
7070;
7071; GFX6-LABEL: udiv_i32_pow2k_denom:
7072; GFX6:       ; %bb.0:
7073; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
7074; GFX6-NEXT:    s_load_dword s0, s[0:1], 0xb
7075; GFX6-NEXT:    s_mov_b32 s7, 0xf000
7076; GFX6-NEXT:    s_mov_b32 s6, -1
7077; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
7078; GFX6-NEXT:    s_lshr_b32 s0, s0, 12
7079; GFX6-NEXT:    v_mov_b32_e32 v0, s0
7080; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
7081; GFX6-NEXT:    s_endpgm
7082;
7083; GFX9-LABEL: udiv_i32_pow2k_denom:
7084; GFX9:       ; %bb.0:
7085; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
7086; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x2c
7087; GFX9-NEXT:    v_mov_b32_e32 v0, 0
7088; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
7089; GFX9-NEXT:    s_lshr_b32 s0, s4, 12
7090; GFX9-NEXT:    v_mov_b32_e32 v1, s0
7091; GFX9-NEXT:    global_store_dword v0, v1, s[2:3]
7092; GFX9-NEXT:    s_endpgm
7093;
7094; GFX90A-LABEL: udiv_i32_pow2k_denom:
7095; GFX90A:       ; %bb.0:
7096; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
7097; GFX90A-NEXT:    s_load_dword s4, s[0:1], 0x2c
7098; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
7099; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
7100; GFX90A-NEXT:    s_lshr_b32 s0, s4, 12
7101; GFX90A-NEXT:    v_mov_b32_e32 v1, s0
7102; GFX90A-NEXT:    global_store_dword v0, v1, s[2:3]
7103; GFX90A-NEXT:    s_endpgm
7104  %r = udiv i32 %x, 4096
7105  store i32 %r, i32 addrspace(1)* %out
7106  ret void
7107}
7108
; Unsigned i32 division where the divisor is 4096 shifted left by the runtime
; value %y.
; The opt run (-amdgpu-codegenprepare) is expected to keep both the shl and the
; udiv as written, with no expansion.
; Each llc run is expected to emit no divide sequence: 12 is added to %y
; (s_add_i32) and %x is shifted right by that sum (s_lshr_b32).
7109define amdgpu_kernel void @udiv_i32_pow2_shl_denom(i32 addrspace(1)* %out, i32 %x, i32 %y) {
7110; CHECK-LABEL: @udiv_i32_pow2_shl_denom(
7111; CHECK-NEXT:    [[SHL_Y:%.*]] = shl i32 4096, [[Y:%.*]]
7112; CHECK-NEXT:    [[R:%.*]] = udiv i32 [[X:%.*]], [[SHL_Y]]
7113; CHECK-NEXT:    store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4
7114; CHECK-NEXT:    ret void
7115;
7116; GFX6-LABEL: udiv_i32_pow2_shl_denom:
7117; GFX6:       ; %bb.0:
7118; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
7119; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
7120; GFX6-NEXT:    s_mov_b32 s7, 0xf000
7121; GFX6-NEXT:    s_mov_b32 s6, -1
7122; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
7123; GFX6-NEXT:    s_add_i32 s1, s1, 12
7124; GFX6-NEXT:    s_lshr_b32 s0, s0, s1
7125; GFX6-NEXT:    v_mov_b32_e32 v0, s0
7126; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
7127; GFX6-NEXT:    s_endpgm
7128;
7129; GFX9-LABEL: udiv_i32_pow2_shl_denom:
7130; GFX9:       ; %bb.0:
7131; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
7132; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
7133; GFX9-NEXT:    v_mov_b32_e32 v0, 0
7134; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
7135; GFX9-NEXT:    s_add_i32 s0, s5, 12
7136; GFX9-NEXT:    s_lshr_b32 s0, s4, s0
7137; GFX9-NEXT:    v_mov_b32_e32 v1, s0
7138; GFX9-NEXT:    global_store_dword v0, v1, s[2:3]
7139; GFX9-NEXT:    s_endpgm
7140;
7141; GFX90A-LABEL: udiv_i32_pow2_shl_denom:
7142; GFX90A:       ; %bb.0:
7143; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
7144; GFX90A-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
7145; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
7146; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
7147; GFX90A-NEXT:    s_add_i32 s0, s5, 12
7148; GFX90A-NEXT:    s_lshr_b32 s0, s4, s0
7149; GFX90A-NEXT:    v_mov_b32_e32 v1, s0
7150; GFX90A-NEXT:    global_store_dword v0, v1, s[2:3]
7151; GFX90A-NEXT:    s_endpgm
7152  %shl.y = shl i32 4096, %y
7153  %r = udiv i32 %x, %shl.y
7154  store i32 %r, i32 addrspace(1)* %out
7155  ret void
7156}
7157
; Unsigned <2 x i32> division by the splat constant <4096, 4096>.
; The opt run (-amdgpu-codegenprepare) is expected to scalarize the vector
; udiv: each lane is extracted, divided by 4096 with a scalar udiv, and
; inserted back into the result vector.
; Each llc run is expected to emit one scalar shift right by 12 per lane and a
; single 64-bit store of both results.
7158define amdgpu_kernel void @udiv_v2i32_pow2k_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x) {
7159; CHECK-LABEL: @udiv_v2i32_pow2k_denom(
7160; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0
7161; CHECK-NEXT:    [[TMP2:%.*]] = udiv i32 [[TMP1]], 4096
7162; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i64 0
7163; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1
7164; CHECK-NEXT:    [[TMP5:%.*]] = udiv i32 [[TMP4]], 4096
7165; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1
7166; CHECK-NEXT:    store <2 x i32> [[TMP6]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8
7167; CHECK-NEXT:    ret void
7168;
7169; GFX6-LABEL: udiv_v2i32_pow2k_denom:
7170; GFX6:       ; %bb.0:
7171; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
7172; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
7173; GFX6-NEXT:    s_mov_b32 s7, 0xf000
7174; GFX6-NEXT:    s_mov_b32 s6, -1
7175; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
7176; GFX6-NEXT:    s_lshr_b32 s0, s0, 12
7177; GFX6-NEXT:    s_lshr_b32 s1, s1, 12
7178; GFX6-NEXT:    v_mov_b32_e32 v0, s0
7179; GFX6-NEXT:    v_mov_b32_e32 v1, s1
7180; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
7181; GFX6-NEXT:    s_endpgm
7182;
7183; GFX9-LABEL: udiv_v2i32_pow2k_denom:
7184; GFX9:       ; %bb.0:
7185; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
7186; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
7187; GFX9-NEXT:    v_mov_b32_e32 v2, 0
7188; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
7189; GFX9-NEXT:    s_lshr_b32 s0, s4, 12
7190; GFX9-NEXT:    s_lshr_b32 s1, s5, 12
7191; GFX9-NEXT:    v_mov_b32_e32 v0, s0
7192; GFX9-NEXT:    v_mov_b32_e32 v1, s1
7193; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
7194; GFX9-NEXT:    s_endpgm
7195;
7196; GFX90A-LABEL: udiv_v2i32_pow2k_denom:
7197; GFX90A:       ; %bb.0:
7198; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
7199; GFX90A-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
7200; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
7201; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
7202; GFX90A-NEXT:    s_lshr_b32 s0, s4, 12
7203; GFX90A-NEXT:    s_lshr_b32 s1, s5, 12
7204; GFX90A-NEXT:    v_mov_b32_e32 v0, s0
7205; GFX90A-NEXT:    v_mov_b32_e32 v1, s1
7206; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
7207; GFX90A-NEXT:    s_endpgm
7208  %r = udiv <2 x i32> %x, <i32 4096, i32 4096>
7209  store <2 x i32> %r, <2 x i32> addrspace(1)* %out
7210  ret void
7211}
7212
; Unsigned <2 x i32> division by the non-splat constant <4096, 4095>: lane 0
; divides by a power of two, lane 1 does not.
; The opt run (-amdgpu-codegenprepare) is expected to scalarize the vector udiv
; into one scalar udiv per lane, keeping the constants 4096 and 4095.
; Each llc run is expected to handle the lanes differently: lane 0 becomes a
; shift right by 12, while lane 1 becomes a multiply-high by 0x100101 followed
; by subtract, shift right by 1, add, and shift right by 11. The tahiti output
; uses vector ALU instructions for lane 1; gfx900 and gfx90a use scalar ones.
7213define amdgpu_kernel void @udiv_v2i32_mixed_pow2k_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x) {
7214; CHECK-LABEL: @udiv_v2i32_mixed_pow2k_denom(
7215; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0
7216; CHECK-NEXT:    [[TMP2:%.*]] = udiv i32 [[TMP1]], 4096
7217; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i64 0
7218; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1
7219; CHECK-NEXT:    [[TMP5:%.*]] = udiv i32 [[TMP4]], 4095
7220; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1
7221; CHECK-NEXT:    store <2 x i32> [[TMP6]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8
7222; CHECK-NEXT:    ret void
7223;
7224; GFX6-LABEL: udiv_v2i32_mixed_pow2k_denom:
7225; GFX6:       ; %bb.0:
7226; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
7227; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
7228; GFX6-NEXT:    v_mov_b32_e32 v0, 0x100101
7229; GFX6-NEXT:    s_mov_b32 s7, 0xf000
7230; GFX6-NEXT:    s_mov_b32 s6, -1
7231; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
7232; GFX6-NEXT:    v_mul_hi_u32 v0, s1, v0
7233; GFX6-NEXT:    s_lshr_b32 s0, s0, 12
7234; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s1, v0
7235; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 1, v1
7236; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
7237; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 11, v0
7238; GFX6-NEXT:    v_mov_b32_e32 v0, s0
7239; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
7240; GFX6-NEXT:    s_endpgm
7241;
7242; GFX9-LABEL: udiv_v2i32_mixed_pow2k_denom:
7243; GFX9:       ; %bb.0:
7244; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
7245; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
7246; GFX9-NEXT:    v_mov_b32_e32 v2, 0
7247; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
7248; GFX9-NEXT:    s_mul_hi_u32 s1, s5, 0x100101
7249; GFX9-NEXT:    s_lshr_b32 s0, s4, 12
7250; GFX9-NEXT:    s_sub_i32 s4, s5, s1
7251; GFX9-NEXT:    s_lshr_b32 s4, s4, 1
7252; GFX9-NEXT:    s_add_i32 s4, s4, s1
7253; GFX9-NEXT:    s_lshr_b32 s1, s4, 11
7254; GFX9-NEXT:    v_mov_b32_e32 v0, s0
7255; GFX9-NEXT:    v_mov_b32_e32 v1, s1
7256; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
7257; GFX9-NEXT:    s_endpgm
7258;
7259; GFX90A-LABEL: udiv_v2i32_mixed_pow2k_denom:
7260; GFX90A:       ; %bb.0:
7261; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
7262; GFX90A-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
7263; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
7264; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
7265; GFX90A-NEXT:    s_mul_hi_u32 s1, s5, 0x100101
7266; GFX90A-NEXT:    s_lshr_b32 s0, s4, 12
7267; GFX90A-NEXT:    s_sub_i32 s4, s5, s1
7268; GFX90A-NEXT:    s_lshr_b32 s4, s4, 1
7269; GFX90A-NEXT:    s_add_i32 s4, s4, s1
7270; GFX90A-NEXT:    s_lshr_b32 s1, s4, 11
7271; GFX90A-NEXT:    v_mov_b32_e32 v0, s0
7272; GFX90A-NEXT:    v_mov_b32_e32 v1, s1
7273; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
7274; GFX90A-NEXT:    s_endpgm
7275  %r = udiv <2 x i32> %x, <i32 4096, i32 4095>
7276  store <2 x i32> %r, <2 x i32> addrspace(1)* %out
7277  ret void
7278}
7279
7280define amdgpu_kernel void @udiv_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x, <2 x i32> %y) {
7281; CHECK-LABEL: @udiv_v2i32_pow2_shl_denom(
7282; CHECK-NEXT:    [[SHL_Y:%.*]] = shl <2 x i32> <i32 4096, i32 4096>, [[Y:%.*]]
7283; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0
7284; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 0
7285; CHECK-NEXT:    [[TMP3:%.*]] = uitofp i32 [[TMP2]] to float
7286; CHECK-NEXT:    [[TMP4:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP3]])
7287; CHECK-NEXT:    [[TMP5:%.*]] = fmul fast float [[TMP4]], 0x41EFFFFFC0000000
7288; CHECK-NEXT:    [[TMP6:%.*]] = fptoui float [[TMP5]] to i32
7289; CHECK-NEXT:    [[TMP7:%.*]] = sub i32 0, [[TMP2]]
7290; CHECK-NEXT:    [[TMP8:%.*]] = mul i32 [[TMP7]], [[TMP6]]
7291; CHECK-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP6]] to i64
7292; CHECK-NEXT:    [[TMP10:%.*]] = zext i32 [[TMP8]] to i64
7293; CHECK-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP9]], [[TMP10]]
7294; CHECK-NEXT:    [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32
7295; CHECK-NEXT:    [[TMP13:%.*]] = lshr i64 [[TMP11]], 32
7296; CHECK-NEXT:    [[TMP14:%.*]] = trunc i64 [[TMP13]] to i32
7297; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[TMP6]], [[TMP14]]
7298; CHECK-NEXT:    [[TMP16:%.*]] = zext i32 [[TMP1]] to i64
7299; CHECK-NEXT:    [[TMP17:%.*]] = zext i32 [[TMP15]] to i64
7300; CHECK-NEXT:    [[TMP18:%.*]] = mul i64 [[TMP16]], [[TMP17]]
7301; CHECK-NEXT:    [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
7302; CHECK-NEXT:    [[TMP20:%.*]] = lshr i64 [[TMP18]], 32
7303; CHECK-NEXT:    [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32
7304; CHECK-NEXT:    [[TMP22:%.*]] = mul i32 [[TMP21]], [[TMP2]]
7305; CHECK-NEXT:    [[TMP23:%.*]] = sub i32 [[TMP1]], [[TMP22]]
7306; CHECK-NEXT:    [[TMP24:%.*]] = icmp uge i32 [[TMP23]], [[TMP2]]
7307; CHECK-NEXT:    [[TMP25:%.*]] = add i32 [[TMP21]], 1
7308; CHECK-NEXT:    [[TMP26:%.*]] = select i1 [[TMP24]], i32 [[TMP25]], i32 [[TMP21]]
7309; CHECK-NEXT:    [[TMP27:%.*]] = sub i32 [[TMP23]], [[TMP2]]
7310; CHECK-NEXT:    [[TMP28:%.*]] = select i1 [[TMP24]], i32 [[TMP27]], i32 [[TMP23]]
7311; CHECK-NEXT:    [[TMP29:%.*]] = icmp uge i32 [[TMP28]], [[TMP2]]
7312; CHECK-NEXT:    [[TMP30:%.*]] = add i32 [[TMP26]], 1
7313; CHECK-NEXT:    [[TMP31:%.*]] = select i1 [[TMP29]], i32 [[TMP30]], i32 [[TMP26]]
7314; CHECK-NEXT:    [[TMP32:%.*]] = insertelement <2 x i32> undef, i32 [[TMP31]], i64 0
7315; CHECK-NEXT:    [[TMP33:%.*]] = extractelement <2 x i32> [[X]], i64 1
7316; CHECK-NEXT:    [[TMP34:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 1
7317; CHECK-NEXT:    [[TMP35:%.*]] = uitofp i32 [[TMP34]] to float
7318; CHECK-NEXT:    [[TMP36:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP35]])
7319; CHECK-NEXT:    [[TMP37:%.*]] = fmul fast float [[TMP36]], 0x41EFFFFFC0000000
7320; CHECK-NEXT:    [[TMP38:%.*]] = fptoui float [[TMP37]] to i32
7321; CHECK-NEXT:    [[TMP39:%.*]] = sub i32 0, [[TMP34]]
7322; CHECK-NEXT:    [[TMP40:%.*]] = mul i32 [[TMP39]], [[TMP38]]
7323; CHECK-NEXT:    [[TMP41:%.*]] = zext i32 [[TMP38]] to i64
7324; CHECK-NEXT:    [[TMP42:%.*]] = zext i32 [[TMP40]] to i64
7325; CHECK-NEXT:    [[TMP43:%.*]] = mul i64 [[TMP41]], [[TMP42]]
7326; CHECK-NEXT:    [[TMP44:%.*]] = trunc i64 [[TMP43]] to i32
7327; CHECK-NEXT:    [[TMP45:%.*]] = lshr i64 [[TMP43]], 32
7328; CHECK-NEXT:    [[TMP46:%.*]] = trunc i64 [[TMP45]] to i32
7329; CHECK-NEXT:    [[TMP47:%.*]] = add i32 [[TMP38]], [[TMP46]]
7330; CHECK-NEXT:    [[TMP48:%.*]] = zext i32 [[TMP33]] to i64
7331; CHECK-NEXT:    [[TMP49:%.*]] = zext i32 [[TMP47]] to i64
7332; CHECK-NEXT:    [[TMP50:%.*]] = mul i64 [[TMP48]], [[TMP49]]
7333; CHECK-NEXT:    [[TMP51:%.*]] = trunc i64 [[TMP50]] to i32
7334; CHECK-NEXT:    [[TMP52:%.*]] = lshr i64 [[TMP50]], 32
7335; CHECK-NEXT:    [[TMP53:%.*]] = trunc i64 [[TMP52]] to i32
7336; CHECK-NEXT:    [[TMP54:%.*]] = mul i32 [[TMP53]], [[TMP34]]
7337; CHECK-NEXT:    [[TMP55:%.*]] = sub i32 [[TMP33]], [[TMP54]]
7338; CHECK-NEXT:    [[TMP56:%.*]] = icmp uge i32 [[TMP55]], [[TMP34]]
7339; CHECK-NEXT:    [[TMP57:%.*]] = add i32 [[TMP53]], 1
7340; CHECK-NEXT:    [[TMP58:%.*]] = select i1 [[TMP56]], i32 [[TMP57]], i32 [[TMP53]]
7341; CHECK-NEXT:    [[TMP59:%.*]] = sub i32 [[TMP55]], [[TMP34]]
7342; CHECK-NEXT:    [[TMP60:%.*]] = select i1 [[TMP56]], i32 [[TMP59]], i32 [[TMP55]]
7343; CHECK-NEXT:    [[TMP61:%.*]] = icmp uge i32 [[TMP60]], [[TMP34]]
7344; CHECK-NEXT:    [[TMP62:%.*]] = add i32 [[TMP58]], 1
7345; CHECK-NEXT:    [[TMP63:%.*]] = select i1 [[TMP61]], i32 [[TMP62]], i32 [[TMP58]]
7346; CHECK-NEXT:    [[TMP64:%.*]] = insertelement <2 x i32> [[TMP32]], i32 [[TMP63]], i64 1
7347; CHECK-NEXT:    store <2 x i32> [[TMP64]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8
7348; CHECK-NEXT:    ret void
7349;
7350; GFX6-LABEL: udiv_v2i32_pow2_shl_denom:
7351; GFX6:       ; %bb.0:
7352; GFX6-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
7353; GFX6-NEXT:    s_movk_i32 s4, 0x1000
7354; GFX6-NEXT:    s_mov_b32 s7, 0xf000
7355; GFX6-NEXT:    s_mov_b32 s6, -1
7356; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
7357; GFX6-NEXT:    s_lshl_b32 s8, s4, s2
7358; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s8
7359; GFX6-NEXT:    s_lshl_b32 s9, s4, s3
7360; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s9
7361; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
7362; GFX6-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
7363; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
7364; GFX6-NEXT:    s_mov_b32 s0, 0x4f7ffffe
7365; GFX6-NEXT:    v_rcp_iflag_f32_e32 v1, v1
7366; GFX6-NEXT:    v_mul_f32_e32 v0, s0, v0
7367; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
7368; GFX6-NEXT:    v_mul_f32_e32 v1, s0, v1
7369; GFX6-NEXT:    s_sub_i32 s0, 0, s8
7370; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
7371; GFX6-NEXT:    v_mul_lo_u32 v2, s0, v0
7372; GFX6-NEXT:    s_sub_i32 s0, 0, s9
7373; GFX6-NEXT:    v_mul_lo_u32 v3, s0, v1
7374; GFX6-NEXT:    v_mul_hi_u32 v2, v0, v2
7375; GFX6-NEXT:    v_mul_hi_u32 v3, v1, v3
7376; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
7377; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
7378; GFX6-NEXT:    v_mul_hi_u32 v0, s2, v0
7379; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v3, v1
7380; GFX6-NEXT:    v_mul_hi_u32 v1, s3, v1
7381; GFX6-NEXT:    v_mul_lo_u32 v2, v0, s8
7382; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 1, v0
7383; GFX6-NEXT:    v_mul_lo_u32 v4, v1, s9
7384; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s2, v2
7385; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s8, v2
7386; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v3, s[0:1]
7387; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s8, v2
7388; GFX6-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
7389; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 1, v0
7390; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s8, v2
7391; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
7392; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s3, v4
7393; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 1, v1
7394; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s9, v2
7395; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
7396; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s9, v2
7397; GFX6-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
7398; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 1, v1
7399; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s9, v2
7400; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
7401; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
7402; GFX6-NEXT:    s_endpgm
7403;
7404; GFX9-LABEL: udiv_v2i32_pow2_shl_denom:
7405; GFX9:       ; %bb.0:
7406; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
7407; GFX9-NEXT:    s_movk_i32 s4, 0x1000
7408; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
7409; GFX9-NEXT:    s_lshl_b32 s5, s4, s3
7410; GFX9-NEXT:    s_lshl_b32 s4, s4, s2
7411; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s4
7412; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s5
7413; GFX9-NEXT:    s_mov_b32 s2, 0x4f7ffffe
7414; GFX9-NEXT:    s_sub_i32 s3, 0, s5
7415; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
7416; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v1
7417; GFX9-NEXT:    v_mul_f32_e32 v0, s2, v0
7418; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
7419; GFX9-NEXT:    v_mul_f32_e32 v1, s2, v1
7420; GFX9-NEXT:    s_sub_i32 s2, 0, s4
7421; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
7422; GFX9-NEXT:    v_mul_lo_u32 v2, s2, v0
7423; GFX9-NEXT:    v_mul_lo_u32 v3, s3, v1
7424; GFX9-NEXT:    v_mul_hi_u32 v2, v0, v2
7425; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
7426; GFX9-NEXT:    v_mul_hi_u32 v3, v1, v3
7427; GFX9-NEXT:    v_add_u32_e32 v0, v0, v2
7428; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
7429; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
7430; GFX9-NEXT:    v_mul_hi_u32 v0, s2, v0
7431; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
7432; GFX9-NEXT:    v_mul_hi_u32 v1, s3, v1
7433; GFX9-NEXT:    v_mov_b32_e32 v2, 0
7434; GFX9-NEXT:    v_mul_lo_u32 v3, v0, s4
7435; GFX9-NEXT:    v_add_u32_e32 v5, 1, v0
7436; GFX9-NEXT:    v_mul_lo_u32 v4, v1, s5
7437; GFX9-NEXT:    v_add_u32_e32 v6, 1, v1
7438; GFX9-NEXT:    v_sub_u32_e32 v3, s2, v3
7439; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s4, v3
7440; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
7441; GFX9-NEXT:    v_subrev_u32_e32 v5, s4, v3
7442; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
7443; GFX9-NEXT:    v_sub_u32_e32 v4, s3, v4
7444; GFX9-NEXT:    v_add_u32_e32 v5, 1, v0
7445; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s4, v3
7446; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
7447; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s5, v4
7448; GFX9-NEXT:    v_subrev_u32_e32 v3, s5, v4
7449; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
7450; GFX9-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
7451; GFX9-NEXT:    v_add_u32_e32 v4, 1, v1
7452; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s5, v3
7453; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
7454; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
7455; GFX9-NEXT:    s_endpgm
7456;
7457; GFX90A-LABEL: udiv_v2i32_pow2_shl_denom:
7458; GFX90A:       ; %bb.0:
7459; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
7460; GFX90A-NEXT:    s_movk_i32 s8, 0x1000
7461; GFX90A-NEXT:    s_mov_b32 s9, 0x4f7ffffe
7462; GFX90A-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
7463; GFX90A-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x2c
7464; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
7465; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
7466; GFX90A-NEXT:    s_lshl_b32 s2, s8, s2
7467; GFX90A-NEXT:    v_cvt_f32_u32_e32 v0, s2
7468; GFX90A-NEXT:    s_lshl_b32 s0, s8, s3
7469; GFX90A-NEXT:    v_cvt_f32_u32_e32 v1, s0
7470; GFX90A-NEXT:    s_sub_i32 s1, 0, s2
7471; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v0, v0
7472; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v1, v1
7473; GFX90A-NEXT:    v_mul_f32_e32 v0, s9, v0
7474; GFX90A-NEXT:    v_cvt_u32_f32_e32 v0, v0
7475; GFX90A-NEXT:    v_mul_f32_e32 v1, s9, v1
7476; GFX90A-NEXT:    v_cvt_u32_f32_e32 v1, v1
7477; GFX90A-NEXT:    v_mul_lo_u32 v3, s1, v0
7478; GFX90A-NEXT:    v_mul_hi_u32 v3, v0, v3
7479; GFX90A-NEXT:    v_add_u32_e32 v0, v0, v3
7480; GFX90A-NEXT:    v_mul_hi_u32 v0, s6, v0
7481; GFX90A-NEXT:    v_mul_lo_u32 v3, v0, s2
7482; GFX90A-NEXT:    v_sub_u32_e32 v3, s6, v3
7483; GFX90A-NEXT:    v_add_u32_e32 v4, 1, v0
7484; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s2, v3
7485; GFX90A-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
7486; GFX90A-NEXT:    v_subrev_u32_e32 v4, s2, v3
7487; GFX90A-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
7488; GFX90A-NEXT:    s_sub_i32 s1, 0, s0
7489; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s2, v3
7490; GFX90A-NEXT:    v_mul_lo_u32 v3, s1, v1
7491; GFX90A-NEXT:    v_mul_hi_u32 v3, v1, v3
7492; GFX90A-NEXT:    v_add_u32_e32 v1, v1, v3
7493; GFX90A-NEXT:    v_mul_hi_u32 v1, s7, v1
7494; GFX90A-NEXT:    v_mul_lo_u32 v3, v1, s0
7495; GFX90A-NEXT:    v_add_u32_e32 v4, 1, v0
7496; GFX90A-NEXT:    v_sub_u32_e32 v3, s7, v3
7497; GFX90A-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
7498; GFX90A-NEXT:    v_add_u32_e32 v4, 1, v1
7499; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s0, v3
7500; GFX90A-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
7501; GFX90A-NEXT:    v_subrev_u32_e32 v4, s0, v3
7502; GFX90A-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
7503; GFX90A-NEXT:    v_add_u32_e32 v4, 1, v1
7504; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s0, v3
7505; GFX90A-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
7506; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
7507; GFX90A-NEXT:    s_endpgm
7508  %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y
7509  %r = udiv <2 x i32> %x, %shl.y
7510  store <2 x i32> %r, <2 x i32> addrspace(1)* %out
7511  ret void
7512}
7513
; Unsigned remainder of a scalar i32 kernel argument by the odd constant
; 1235195 (0x12d8fb).
; The opt-level assertions expect the urem to be left untouched. The llc
; assertions expect a multiply-high by 0xb2a50881, a subtract / shift-by-1 /
; add fixup, a shift right by 20, then a multiply by 0x12d8fb whose result is
; subtracted from the original value. The tahiti run does this with v_*
; instructions, while the gfx900 and gfx90a runs use s_* instructions.
7514define amdgpu_kernel void @urem_i32_oddk_denom(i32 addrspace(1)* %out, i32 %x) {
7515; CHECK-LABEL: @urem_i32_oddk_denom(
7516; CHECK-NEXT:    [[R:%.*]] = urem i32 [[X:%.*]], 1235195
7517; CHECK-NEXT:    store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4
7518; CHECK-NEXT:    ret void
7519;
7520; GFX6-LABEL: urem_i32_oddk_denom:
7521; GFX6:       ; %bb.0:
7522; GFX6-NEXT:    s_load_dword s4, s[0:1], 0xb
7523; GFX6-NEXT:    v_mov_b32_e32 v0, 0xb2a50881
7524; GFX6-NEXT:    s_mov_b32 s2, 0x12d8fb
7525; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
7526; GFX6-NEXT:    s_mov_b32 s3, 0xf000
7527; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
7528; GFX6-NEXT:    v_mul_hi_u32 v0, s4, v0
7529; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s4, v0
7530; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 1, v1
7531; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
7532; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 20, v0
7533; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s2
7534; GFX6-NEXT:    s_mov_b32 s2, -1
7535; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
7536; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
7537; GFX6-NEXT:    s_endpgm
7538;
7539; GFX9-LABEL: urem_i32_oddk_denom:
7540; GFX9:       ; %bb.0:
7541; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
7542; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x2c
7543; GFX9-NEXT:    v_mov_b32_e32 v0, 0
7544; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
7545; GFX9-NEXT:    s_mul_hi_u32 s0, s4, 0xb2a50881
7546; GFX9-NEXT:    s_sub_i32 s1, s4, s0
7547; GFX9-NEXT:    s_lshr_b32 s1, s1, 1
7548; GFX9-NEXT:    s_add_i32 s1, s1, s0
7549; GFX9-NEXT:    s_lshr_b32 s0, s1, 20
7550; GFX9-NEXT:    s_mul_i32 s0, s0, 0x12d8fb
7551; GFX9-NEXT:    s_sub_i32 s0, s4, s0
7552; GFX9-NEXT:    v_mov_b32_e32 v1, s0
7553; GFX9-NEXT:    global_store_dword v0, v1, s[2:3]
7554; GFX9-NEXT:    s_endpgm
7555;
7556; GFX90A-LABEL: urem_i32_oddk_denom:
7557; GFX90A:       ; %bb.0:
7558; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
7559; GFX90A-NEXT:    s_load_dword s4, s[0:1], 0x2c
7560; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
7561; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
7562; GFX90A-NEXT:    s_mul_hi_u32 s0, s4, 0xb2a50881
7563; GFX90A-NEXT:    s_sub_i32 s1, s4, s0
7564; GFX90A-NEXT:    s_lshr_b32 s1, s1, 1
7565; GFX90A-NEXT:    s_add_i32 s1, s1, s0
7566; GFX90A-NEXT:    s_lshr_b32 s0, s1, 20
7567; GFX90A-NEXT:    s_mul_i32 s0, s0, 0x12d8fb
7568; GFX90A-NEXT:    s_sub_i32 s0, s4, s0
7569; GFX90A-NEXT:    v_mov_b32_e32 v1, s0
7570; GFX90A-NEXT:    global_store_dword v0, v1, s[2:3]
7571; GFX90A-NEXT:    s_endpgm
7572  %r = urem i32 %x, 1235195
7573  store i32 %r, i32 addrspace(1)* %out
7574  ret void
7575}
7576
; Unsigned remainder of a scalar i32 kernel argument by the power-of-two
; constant 4096.
; The opt-level assertions expect the urem to be left untouched. All three
; llc runs expect the remainder to be a single s_and_b32 with the mask 0xfff
; (4096 - 1), with no multiply or reciprocal sequence.
7577define amdgpu_kernel void @urem_i32_pow2k_denom(i32 addrspace(1)* %out, i32 %x) {
7578; CHECK-LABEL: @urem_i32_pow2k_denom(
7579; CHECK-NEXT:    [[R:%.*]] = urem i32 [[X:%.*]], 4096
7580; CHECK-NEXT:    store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4
7581; CHECK-NEXT:    ret void
7582;
7583; GFX6-LABEL: urem_i32_pow2k_denom:
7584; GFX6:       ; %bb.0:
7585; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
7586; GFX6-NEXT:    s_load_dword s0, s[0:1], 0xb
7587; GFX6-NEXT:    s_mov_b32 s7, 0xf000
7588; GFX6-NEXT:    s_mov_b32 s6, -1
7589; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
7590; GFX6-NEXT:    s_and_b32 s0, s0, 0xfff
7591; GFX6-NEXT:    v_mov_b32_e32 v0, s0
7592; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
7593; GFX6-NEXT:    s_endpgm
7594;
7595; GFX9-LABEL: urem_i32_pow2k_denom:
7596; GFX9:       ; %bb.0:
7597; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
7598; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x2c
7599; GFX9-NEXT:    v_mov_b32_e32 v0, 0
7600; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
7601; GFX9-NEXT:    s_and_b32 s0, s4, 0xfff
7602; GFX9-NEXT:    v_mov_b32_e32 v1, s0
7603; GFX9-NEXT:    global_store_dword v0, v1, s[2:3]
7604; GFX9-NEXT:    s_endpgm
7605;
7606; GFX90A-LABEL: urem_i32_pow2k_denom:
7607; GFX90A:       ; %bb.0:
7608; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
7609; GFX90A-NEXT:    s_load_dword s4, s[0:1], 0x2c
7610; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
7611; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
7612; GFX90A-NEXT:    s_and_b32 s0, s4, 0xfff
7613; GFX90A-NEXT:    v_mov_b32_e32 v1, s0
7614; GFX90A-NEXT:    global_store_dword v0, v1, s[2:3]
7615; GFX90A-NEXT:    s_endpgm
7616  %r = urem i32 %x, 4096
7617  store i32 %r, i32 addrspace(1)* %out
7618  ret void
7619}
7620
; Unsigned remainder of a scalar i32 kernel argument by a divisor built as
; `shl i32 4096, %y`.
; The opt-level assertions expect both the shl and the urem to be left
; untouched. All three llc runs expect the divisor to be formed with
; s_lshl_b32 from 0x1000, decremented with s_add_i32 ... -1, and then applied
; to the dividend as a mask with s_and_b32.
7621define amdgpu_kernel void @urem_i32_pow2_shl_denom(i32 addrspace(1)* %out, i32 %x, i32 %y) {
7622; CHECK-LABEL: @urem_i32_pow2_shl_denom(
7623; CHECK-NEXT:    [[SHL_Y:%.*]] = shl i32 4096, [[Y:%.*]]
7624; CHECK-NEXT:    [[R:%.*]] = urem i32 [[X:%.*]], [[SHL_Y]]
7625; CHECK-NEXT:    store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4
7626; CHECK-NEXT:    ret void
7627;
7628; GFX6-LABEL: urem_i32_pow2_shl_denom:
7629; GFX6:       ; %bb.0:
7630; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
7631; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
7632; GFX6-NEXT:    s_mov_b32 s7, 0xf000
7633; GFX6-NEXT:    s_mov_b32 s6, -1
7634; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
7635; GFX6-NEXT:    s_lshl_b32 s1, 0x1000, s1
7636; GFX6-NEXT:    s_add_i32 s1, s1, -1
7637; GFX6-NEXT:    s_and_b32 s0, s0, s1
7638; GFX6-NEXT:    v_mov_b32_e32 v0, s0
7639; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
7640; GFX6-NEXT:    s_endpgm
7641;
7642; GFX9-LABEL: urem_i32_pow2_shl_denom:
7643; GFX9:       ; %bb.0:
7644; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
7645; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
7646; GFX9-NEXT:    v_mov_b32_e32 v0, 0
7647; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
7648; GFX9-NEXT:    s_lshl_b32 s0, 0x1000, s5
7649; GFX9-NEXT:    s_add_i32 s0, s0, -1
7650; GFX9-NEXT:    s_and_b32 s0, s4, s0
7651; GFX9-NEXT:    v_mov_b32_e32 v1, s0
7652; GFX9-NEXT:    global_store_dword v0, v1, s[2:3]
7653; GFX9-NEXT:    s_endpgm
7654;
7655; GFX90A-LABEL: urem_i32_pow2_shl_denom:
7656; GFX90A:       ; %bb.0:
7657; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
7658; GFX90A-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
7659; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
7660; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
7661; GFX90A-NEXT:    s_lshl_b32 s0, 0x1000, s5
7662; GFX90A-NEXT:    s_add_i32 s0, s0, -1
7663; GFX90A-NEXT:    s_and_b32 s0, s4, s0
7664; GFX90A-NEXT:    v_mov_b32_e32 v1, s0
7665; GFX90A-NEXT:    global_store_dword v0, v1, s[2:3]
7666; GFX90A-NEXT:    s_endpgm
7667  %shl.y = shl i32 4096, %y
7668  %r = urem i32 %x, %shl.y
7669  store i32 %r, i32 addrspace(1)* %out
7670  ret void
7671}
7672
; Unsigned remainder of a <2 x i32> kernel argument by the splat constant
; <4096, 4096>.
; The opt-level assertions expect the vector urem to be split per element
; (extractelement / urem i32 ..., 4096 / insertelement), with each scalar urem
; otherwise left as-is. All three llc runs expect each element to be masked
; with s_and_b32 against 0xfff held in a scalar register, followed by a single
; two-dword store.
7673define amdgpu_kernel void @urem_v2i32_pow2k_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x) {
7674; CHECK-LABEL: @urem_v2i32_pow2k_denom(
7675; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0
7676; CHECK-NEXT:    [[TMP2:%.*]] = urem i32 [[TMP1]], 4096
7677; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i64 0
7678; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1
7679; CHECK-NEXT:    [[TMP5:%.*]] = urem i32 [[TMP4]], 4096
7680; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1
7681; CHECK-NEXT:    store <2 x i32> [[TMP6]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8
7682; CHECK-NEXT:    ret void
7683;
7684; GFX6-LABEL: urem_v2i32_pow2k_denom:
7685; GFX6:       ; %bb.0:
7686; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
7687; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
7688; GFX6-NEXT:    s_movk_i32 s2, 0xfff
7689; GFX6-NEXT:    s_mov_b32 s7, 0xf000
7690; GFX6-NEXT:    s_mov_b32 s6, -1
7691; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
7692; GFX6-NEXT:    s_and_b32 s0, s0, s2
7693; GFX6-NEXT:    s_and_b32 s1, s1, s2
7694; GFX6-NEXT:    v_mov_b32_e32 v0, s0
7695; GFX6-NEXT:    v_mov_b32_e32 v1, s1
7696; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
7697; GFX6-NEXT:    s_endpgm
7698;
7699; GFX9-LABEL: urem_v2i32_pow2k_denom:
7700; GFX9:       ; %bb.0:
7701; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
7702; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
7703; GFX9-NEXT:    s_movk_i32 s0, 0xfff
7704; GFX9-NEXT:    v_mov_b32_e32 v2, 0
7705; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
7706; GFX9-NEXT:    s_and_b32 s1, s4, s0
7707; GFX9-NEXT:    s_and_b32 s0, s5, s0
7708; GFX9-NEXT:    v_mov_b32_e32 v0, s1
7709; GFX9-NEXT:    v_mov_b32_e32 v1, s0
7710; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
7711; GFX9-NEXT:    s_endpgm
7712;
7713; GFX90A-LABEL: urem_v2i32_pow2k_denom:
7714; GFX90A:       ; %bb.0:
7715; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
7716; GFX90A-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
7717; GFX90A-NEXT:    s_movk_i32 s0, 0xfff
7718; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
7719; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
7720; GFX90A-NEXT:    s_and_b32 s1, s4, s0
7721; GFX90A-NEXT:    s_and_b32 s0, s5, s0
7722; GFX90A-NEXT:    v_mov_b32_e32 v0, s1
7723; GFX90A-NEXT:    v_mov_b32_e32 v1, s0
7724; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
7725; GFX90A-NEXT:    s_endpgm
7726  %r = urem <2 x i32> %x, <i32 4096, i32 4096>
7727  store <2 x i32> %r, <2 x i32> addrspace(1)* %out
7728  ret void
7729}
7730
; Unsigned remainder of a <2 x i32> kernel argument by a divisor built as
; `shl <2 x i32> <4096, 4096>, %y`.
; The opt-level assertions expect each element to be expanded into a
; reciprocal-based sequence: uitofp of the divisor, llvm.amdgcn.rcp.f32, fmul
; by 0x41EFFFFFC0000000, fptoui, one mul / 64-bit-multiply-high / add
; adjustment of that estimate, a 64-bit-multiply-high with the dividend, a
; mul + sub producing a candidate remainder, and two icmp uge / sub / select
; steps on that remainder.
; The llc assertions expect the matching machine sequence per element
; (v_cvt_f32_u32, v_rcp_iflag_f32, v_mul_f32 by 0x4f7ffffe, v_cvt_u32_f32,
; v_mul_lo_u32 / v_mul_hi_u32, then two compare-and-conditional-subtract
; steps). No and-with-mask form appears in the expected output for this
; vector case.
7731define amdgpu_kernel void @urem_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x, <2 x i32> %y) {
7732; CHECK-LABEL: @urem_v2i32_pow2_shl_denom(
7733; CHECK-NEXT:    [[SHL_Y:%.*]] = shl <2 x i32> <i32 4096, i32 4096>, [[Y:%.*]]
7734; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0
7735; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 0
7736; CHECK-NEXT:    [[TMP3:%.*]] = uitofp i32 [[TMP2]] to float
7737; CHECK-NEXT:    [[TMP4:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP3]])
7738; CHECK-NEXT:    [[TMP5:%.*]] = fmul fast float [[TMP4]], 0x41EFFFFFC0000000
7739; CHECK-NEXT:    [[TMP6:%.*]] = fptoui float [[TMP5]] to i32
7740; CHECK-NEXT:    [[TMP7:%.*]] = sub i32 0, [[TMP2]]
7741; CHECK-NEXT:    [[TMP8:%.*]] = mul i32 [[TMP7]], [[TMP6]]
7742; CHECK-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP6]] to i64
7743; CHECK-NEXT:    [[TMP10:%.*]] = zext i32 [[TMP8]] to i64
7744; CHECK-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP9]], [[TMP10]]
7745; CHECK-NEXT:    [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32
7746; CHECK-NEXT:    [[TMP13:%.*]] = lshr i64 [[TMP11]], 32
7747; CHECK-NEXT:    [[TMP14:%.*]] = trunc i64 [[TMP13]] to i32
7748; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[TMP6]], [[TMP14]]
7749; CHECK-NEXT:    [[TMP16:%.*]] = zext i32 [[TMP1]] to i64
7750; CHECK-NEXT:    [[TMP17:%.*]] = zext i32 [[TMP15]] to i64
7751; CHECK-NEXT:    [[TMP18:%.*]] = mul i64 [[TMP16]], [[TMP17]]
7752; CHECK-NEXT:    [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
7753; CHECK-NEXT:    [[TMP20:%.*]] = lshr i64 [[TMP18]], 32
7754; CHECK-NEXT:    [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32
7755; CHECK-NEXT:    [[TMP22:%.*]] = mul i32 [[TMP21]], [[TMP2]]
7756; CHECK-NEXT:    [[TMP23:%.*]] = sub i32 [[TMP1]], [[TMP22]]
7757; CHECK-NEXT:    [[TMP24:%.*]] = icmp uge i32 [[TMP23]], [[TMP2]]
7758; CHECK-NEXT:    [[TMP25:%.*]] = sub i32 [[TMP23]], [[TMP2]]
7759; CHECK-NEXT:    [[TMP26:%.*]] = select i1 [[TMP24]], i32 [[TMP25]], i32 [[TMP23]]
7760; CHECK-NEXT:    [[TMP27:%.*]] = icmp uge i32 [[TMP26]], [[TMP2]]
7761; CHECK-NEXT:    [[TMP28:%.*]] = sub i32 [[TMP26]], [[TMP2]]
7762; CHECK-NEXT:    [[TMP29:%.*]] = select i1 [[TMP27]], i32 [[TMP28]], i32 [[TMP26]]
7763; CHECK-NEXT:    [[TMP30:%.*]] = insertelement <2 x i32> undef, i32 [[TMP29]], i64 0
7764; CHECK-NEXT:    [[TMP31:%.*]] = extractelement <2 x i32> [[X]], i64 1
7765; CHECK-NEXT:    [[TMP32:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 1
7766; CHECK-NEXT:    [[TMP33:%.*]] = uitofp i32 [[TMP32]] to float
7767; CHECK-NEXT:    [[TMP34:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP33]])
7768; CHECK-NEXT:    [[TMP35:%.*]] = fmul fast float [[TMP34]], 0x41EFFFFFC0000000
7769; CHECK-NEXT:    [[TMP36:%.*]] = fptoui float [[TMP35]] to i32
7770; CHECK-NEXT:    [[TMP37:%.*]] = sub i32 0, [[TMP32]]
7771; CHECK-NEXT:    [[TMP38:%.*]] = mul i32 [[TMP37]], [[TMP36]]
7772; CHECK-NEXT:    [[TMP39:%.*]] = zext i32 [[TMP36]] to i64
7773; CHECK-NEXT:    [[TMP40:%.*]] = zext i32 [[TMP38]] to i64
7774; CHECK-NEXT:    [[TMP41:%.*]] = mul i64 [[TMP39]], [[TMP40]]
7775; CHECK-NEXT:    [[TMP42:%.*]] = trunc i64 [[TMP41]] to i32
7776; CHECK-NEXT:    [[TMP43:%.*]] = lshr i64 [[TMP41]], 32
7777; CHECK-NEXT:    [[TMP44:%.*]] = trunc i64 [[TMP43]] to i32
7778; CHECK-NEXT:    [[TMP45:%.*]] = add i32 [[TMP36]], [[TMP44]]
7779; CHECK-NEXT:    [[TMP46:%.*]] = zext i32 [[TMP31]] to i64
7780; CHECK-NEXT:    [[TMP47:%.*]] = zext i32 [[TMP45]] to i64
7781; CHECK-NEXT:    [[TMP48:%.*]] = mul i64 [[TMP46]], [[TMP47]]
7782; CHECK-NEXT:    [[TMP49:%.*]] = trunc i64 [[TMP48]] to i32
7783; CHECK-NEXT:    [[TMP50:%.*]] = lshr i64 [[TMP48]], 32
7784; CHECK-NEXT:    [[TMP51:%.*]] = trunc i64 [[TMP50]] to i32
7785; CHECK-NEXT:    [[TMP52:%.*]] = mul i32 [[TMP51]], [[TMP32]]
7786; CHECK-NEXT:    [[TMP53:%.*]] = sub i32 [[TMP31]], [[TMP52]]
7787; CHECK-NEXT:    [[TMP54:%.*]] = icmp uge i32 [[TMP53]], [[TMP32]]
7788; CHECK-NEXT:    [[TMP55:%.*]] = sub i32 [[TMP53]], [[TMP32]]
7789; CHECK-NEXT:    [[TMP56:%.*]] = select i1 [[TMP54]], i32 [[TMP55]], i32 [[TMP53]]
7790; CHECK-NEXT:    [[TMP57:%.*]] = icmp uge i32 [[TMP56]], [[TMP32]]
7791; CHECK-NEXT:    [[TMP58:%.*]] = sub i32 [[TMP56]], [[TMP32]]
7792; CHECK-NEXT:    [[TMP59:%.*]] = select i1 [[TMP57]], i32 [[TMP58]], i32 [[TMP56]]
7793; CHECK-NEXT:    [[TMP60:%.*]] = insertelement <2 x i32> [[TMP30]], i32 [[TMP59]], i64 1
7794; CHECK-NEXT:    store <2 x i32> [[TMP60]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8
7795; CHECK-NEXT:    ret void
7796;
7797; GFX6-LABEL: urem_v2i32_pow2_shl_denom:
7798; GFX6:       ; %bb.0:
7799; GFX6-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
7800; GFX6-NEXT:    s_movk_i32 s4, 0x1000
7801; GFX6-NEXT:    s_mov_b32 s5, 0x4f7ffffe
7802; GFX6-NEXT:    s_mov_b32 s7, 0xf000
7803; GFX6-NEXT:    s_mov_b32 s6, -1
7804; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
7805; GFX6-NEXT:    s_lshl_b32 s2, s4, s2
7806; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s2
7807; GFX6-NEXT:    s_lshl_b32 s3, s4, s3
7808; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s3
7809; GFX6-NEXT:    s_sub_i32 s4, 0, s2
7810; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
7811; GFX6-NEXT:    v_rcp_iflag_f32_e32 v1, v1
7812; GFX6-NEXT:    v_mul_f32_e32 v0, s5, v0
7813; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
7814; GFX6-NEXT:    v_mul_f32_e32 v1, s5, v1
7815; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
7816; GFX6-NEXT:    v_mul_lo_u32 v2, s4, v0
7817; GFX6-NEXT:    s_sub_i32 s4, 0, s3
7818; GFX6-NEXT:    v_mul_lo_u32 v3, s4, v1
7819; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
7820; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
7821; GFX6-NEXT:    v_mul_hi_u32 v2, v0, v2
7822; GFX6-NEXT:    v_mul_hi_u32 v3, v1, v3
7823; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
7824; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
7825; GFX6-NEXT:    v_mul_hi_u32 v0, s0, v0
7826; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v3, v1
7827; GFX6-NEXT:    v_mul_hi_u32 v1, s1, v1
7828; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s2
7829; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s3
7830; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
7831; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s2, v0
7832; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s2, v0
7833; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
7834; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s2, v0
7835; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s2, v0
7836; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
7837; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s1, v1
7838; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s3, v1
7839; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s3, v1
7840; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
7841; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s3, v1
7842; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s3, v1
7843; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
7844; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
7845; GFX6-NEXT:    s_endpgm
7846;
7847; GFX9-LABEL: urem_v2i32_pow2_shl_denom:
7848; GFX9:       ; %bb.0:
7849; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
7850; GFX9-NEXT:    s_movk_i32 s4, 0x1000
7851; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
7852; GFX9-NEXT:    s_lshl_b32 s5, s4, s3
7853; GFX9-NEXT:    s_lshl_b32 s4, s4, s2
7854; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s4
7855; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s5
7856; GFX9-NEXT:    s_mov_b32 s2, 0x4f7ffffe
7857; GFX9-NEXT:    s_sub_i32 s3, 0, s5
7858; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
7859; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v1
7860; GFX9-NEXT:    v_mul_f32_e32 v0, s2, v0
7861; GFX9-NEXT:    v_mul_f32_e32 v1, s2, v1
7862; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
7863; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
7864; GFX9-NEXT:    s_sub_i32 s2, 0, s4
7865; GFX9-NEXT:    v_mul_lo_u32 v2, s2, v0
7866; GFX9-NEXT:    v_mul_lo_u32 v3, s3, v1
7867; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
7868; GFX9-NEXT:    v_mul_hi_u32 v2, v0, v2
7869; GFX9-NEXT:    v_mul_hi_u32 v3, v1, v3
7870; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
7871; GFX9-NEXT:    v_add_u32_e32 v0, v0, v2
7872; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
7873; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
7874; GFX9-NEXT:    v_mul_hi_u32 v0, s2, v0
7875; GFX9-NEXT:    v_mul_hi_u32 v1, s3, v1
7876; GFX9-NEXT:    v_mov_b32_e32 v2, 0
7877; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s4
7878; GFX9-NEXT:    v_mul_lo_u32 v1, v1, s5
7879; GFX9-NEXT:    v_sub_u32_e32 v0, s2, v0
7880; GFX9-NEXT:    v_sub_u32_e32 v1, s3, v1
7881; GFX9-NEXT:    v_subrev_u32_e32 v3, s4, v0
7882; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s4, v0
7883; GFX9-NEXT:    v_subrev_u32_e32 v4, s5, v1
7884; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
7885; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s5, v1
7886; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
7887; GFX9-NEXT:    v_subrev_u32_e32 v3, s4, v0
7888; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s4, v0
7889; GFX9-NEXT:    v_subrev_u32_e32 v4, s5, v1
7890; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
7891; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s5, v1
7892; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
7893; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
7894; GFX9-NEXT:    s_endpgm
7895;
7896; GFX90A-LABEL: urem_v2i32_pow2_shl_denom:
7897; GFX90A:       ; %bb.0:
7898; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
7899; GFX90A-NEXT:    s_movk_i32 s8, 0x1000
7900; GFX90A-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
7901; GFX90A-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x2c
7902; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
7903; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
7904; GFX90A-NEXT:    s_lshl_b32 s2, s8, s2
7905; GFX90A-NEXT:    v_cvt_f32_u32_e32 v0, s2
7906; GFX90A-NEXT:    s_lshl_b32 s0, s8, s3
7907; GFX90A-NEXT:    s_mov_b32 s3, 0x4f7ffffe
7908; GFX90A-NEXT:    v_cvt_f32_u32_e32 v1, s0
7909; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v0, v0
7910; GFX90A-NEXT:    s_sub_i32 s1, 0, s2
7911; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v1, v1
7912; GFX90A-NEXT:    v_mul_f32_e32 v0, s3, v0
7913; GFX90A-NEXT:    v_cvt_u32_f32_e32 v0, v0
7914; GFX90A-NEXT:    v_mul_f32_e32 v1, s3, v1
7915; GFX90A-NEXT:    v_cvt_u32_f32_e32 v1, v1
7916; GFX90A-NEXT:    v_mul_lo_u32 v3, s1, v0
7917; GFX90A-NEXT:    v_mul_hi_u32 v3, v0, v3
7918; GFX90A-NEXT:    v_add_u32_e32 v0, v0, v3
7919; GFX90A-NEXT:    v_mul_hi_u32 v0, s6, v0
7920; GFX90A-NEXT:    v_mul_lo_u32 v0, v0, s2
7921; GFX90A-NEXT:    v_sub_u32_e32 v0, s6, v0
7922; GFX90A-NEXT:    v_subrev_u32_e32 v3, s2, v0
7923; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s2, v0
7924; GFX90A-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
7925; GFX90A-NEXT:    v_subrev_u32_e32 v3, s2, v0
7926; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s2, v0
7927; GFX90A-NEXT:    s_sub_i32 s1, 0, s0
7928; GFX90A-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
7929; GFX90A-NEXT:    v_mul_lo_u32 v3, s1, v1
7930; GFX90A-NEXT:    v_mul_hi_u32 v3, v1, v3
7931; GFX90A-NEXT:    v_add_u32_e32 v1, v1, v3
7932; GFX90A-NEXT:    v_mul_hi_u32 v1, s7, v1
7933; GFX90A-NEXT:    v_mul_lo_u32 v1, v1, s0
7934; GFX90A-NEXT:    v_sub_u32_e32 v1, s7, v1
7935; GFX90A-NEXT:    v_subrev_u32_e32 v3, s0, v1
7936; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s0, v1
7937; GFX90A-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
7938; GFX90A-NEXT:    v_subrev_u32_e32 v3, s0, v1
7939; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s0, v1
7940; GFX90A-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
7941; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
7942; GFX90A-NEXT:    s_endpgm
7943  %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y
7944  %r = urem <2 x i32> %x, %shl.y
7945  store <2 x i32> %r, <2 x i32> addrspace(1)* %out
7946  ret void
7947}
7948
; Signed division of a scalar i32 kernel argument by the odd constant 1235195.
; The opt-level assertions expect the sdiv to be left untouched. The llc
; assertions expect a signed multiply-high by 0xd9528441, an add of the
; original dividend, and then the sum of that value's top bit (logical shift
; right by 31) and its arithmetic shift right by 20. The tahiti run does this
; with v_* instructions, while the gfx900 and gfx90a runs use s_*
; instructions.
7949define amdgpu_kernel void @sdiv_i32_oddk_denom(i32 addrspace(1)* %out, i32 %x) {
7950; CHECK-LABEL: @sdiv_i32_oddk_denom(
7951; CHECK-NEXT:    [[R:%.*]] = sdiv i32 [[X:%.*]], 1235195
7952; CHECK-NEXT:    store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4
7953; CHECK-NEXT:    ret void
7954;
7955; GFX6-LABEL: sdiv_i32_oddk_denom:
7956; GFX6:       ; %bb.0:
7957; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
7958; GFX6-NEXT:    s_load_dword s0, s[0:1], 0xb
7959; GFX6-NEXT:    v_mov_b32_e32 v0, 0xd9528441
7960; GFX6-NEXT:    s_mov_b32 s7, 0xf000
7961; GFX6-NEXT:    s_mov_b32 s6, -1
7962; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
7963; GFX6-NEXT:    v_mul_hi_i32 v0, s0, v0
7964; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
7965; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 31, v0
7966; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 20, v0
7967; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
7968; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
7969; GFX6-NEXT:    s_endpgm
7970;
7971; GFX9-LABEL: sdiv_i32_oddk_denom:
7972; GFX9:       ; %bb.0:
7973; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
7974; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x2c
7975; GFX9-NEXT:    v_mov_b32_e32 v0, 0
7976; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
7977; GFX9-NEXT:    s_mul_hi_i32 s0, s4, 0xd9528441
7978; GFX9-NEXT:    s_add_i32 s0, s0, s4
7979; GFX9-NEXT:    s_lshr_b32 s1, s0, 31
7980; GFX9-NEXT:    s_ashr_i32 s0, s0, 20
7981; GFX9-NEXT:    s_add_i32 s0, s0, s1
7982; GFX9-NEXT:    v_mov_b32_e32 v1, s0
7983; GFX9-NEXT:    global_store_dword v0, v1, s[2:3]
7984; GFX9-NEXT:    s_endpgm
7985;
7986; GFX90A-LABEL: sdiv_i32_oddk_denom:
7987; GFX90A:       ; %bb.0:
7988; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
7989; GFX90A-NEXT:    s_load_dword s4, s[0:1], 0x2c
7990; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
7991; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
7992; GFX90A-NEXT:    s_mul_hi_i32 s0, s4, 0xd9528441
7993; GFX90A-NEXT:    s_add_i32 s0, s0, s4
7994; GFX90A-NEXT:    s_lshr_b32 s1, s0, 31
7995; GFX90A-NEXT:    s_ashr_i32 s0, s0, 20
7996; GFX90A-NEXT:    s_add_i32 s0, s0, s1
7997; GFX90A-NEXT:    v_mov_b32_e32 v1, s0
7998; GFX90A-NEXT:    global_store_dword v0, v1, s[2:3]
7999; GFX90A-NEXT:    s_endpgm
8000  %r = sdiv i32 %x, 1235195
8001  store i32 %r, i32 addrspace(1)* %out
8002  ret void
8003}
8004
; Signed division of a scalar i32 kernel argument by the power-of-two
; constant 4096.
; The opt-level assertions expect the sdiv to be left untouched. All three
; llc runs expect a four-instruction scalar sequence: arithmetic shift right
; by 31 of the dividend, logical shift right of that by 20 (yielding 0xfff for
; a negative dividend and 0 otherwise), add to the dividend, then arithmetic
; shift right by 12.
8005define amdgpu_kernel void @sdiv_i32_pow2k_denom(i32 addrspace(1)* %out, i32 %x) {
8006; CHECK-LABEL: @sdiv_i32_pow2k_denom(
8007; CHECK-NEXT:    [[R:%.*]] = sdiv i32 [[X:%.*]], 4096
8008; CHECK-NEXT:    store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4
8009; CHECK-NEXT:    ret void
8010;
8011; GFX6-LABEL: sdiv_i32_pow2k_denom:
8012; GFX6:       ; %bb.0:
8013; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
8014; GFX6-NEXT:    s_load_dword s0, s[0:1], 0xb
8015; GFX6-NEXT:    s_mov_b32 s7, 0xf000
8016; GFX6-NEXT:    s_mov_b32 s6, -1
8017; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
8018; GFX6-NEXT:    s_ashr_i32 s1, s0, 31
8019; GFX6-NEXT:    s_lshr_b32 s1, s1, 20
8020; GFX6-NEXT:    s_add_i32 s0, s0, s1
8021; GFX6-NEXT:    s_ashr_i32 s0, s0, 12
8022; GFX6-NEXT:    v_mov_b32_e32 v0, s0
8023; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
8024; GFX6-NEXT:    s_endpgm
8025;
8026; GFX9-LABEL: sdiv_i32_pow2k_denom:
8027; GFX9:       ; %bb.0:
8028; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
8029; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x2c
8030; GFX9-NEXT:    v_mov_b32_e32 v0, 0
8031; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
8032; GFX9-NEXT:    s_ashr_i32 s0, s4, 31
8033; GFX9-NEXT:    s_lshr_b32 s0, s0, 20
8034; GFX9-NEXT:    s_add_i32 s4, s4, s0
8035; GFX9-NEXT:    s_ashr_i32 s0, s4, 12
8036; GFX9-NEXT:    v_mov_b32_e32 v1, s0
8037; GFX9-NEXT:    global_store_dword v0, v1, s[2:3]
8038; GFX9-NEXT:    s_endpgm
8039;
8040; GFX90A-LABEL: sdiv_i32_pow2k_denom:
8041; GFX90A:       ; %bb.0:
8042; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
8043; GFX90A-NEXT:    s_load_dword s4, s[0:1], 0x2c
8044; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
8045; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
8046; GFX90A-NEXT:    s_ashr_i32 s0, s4, 31
8047; GFX90A-NEXT:    s_lshr_b32 s0, s0, 20
8048; GFX90A-NEXT:    s_add_i32 s4, s4, s0
8049; GFX90A-NEXT:    s_ashr_i32 s0, s4, 12
8050; GFX90A-NEXT:    v_mov_b32_e32 v1, s0
8051; GFX90A-NEXT:    global_store_dword v0, v1, s[2:3]
8052; GFX90A-NEXT:    s_endpgm
8053  %r = sdiv i32 %x, 4096
8054  store i32 %r, i32 addrspace(1)* %out
8055  ret void
8056}
8057
; Signed division of a scalar i32 kernel argument by a divisor built as
; `shl i32 4096, %y`.
; The opt-level assertions expect both the shl and the sdiv to be left
; untouched. The llc assertions expect no shift-only shortcut. Instead they
; expect the following sequence:
;   - each operand is combined with its own sign mask (s_ashr_i32 by 31, then
;     s_add_i32 and s_xor_b32);
;   - an unsigned quotient is produced from a float reciprocal estimate
;     (v_cvt_f32_u32, v_rcp_iflag_f32, v_mul_f32 by 0x4f7ffffe, v_cvt_u32_f32,
;     v_mul_lo_u32 / v_mul_hi_u32) with two compare-and-select increments;
;   - the result is xor'ed with, and then has subtracted from it, the xor of
;     the two sign masks.
8058define amdgpu_kernel void @sdiv_i32_pow2_shl_denom(i32 addrspace(1)* %out, i32 %x, i32 %y) {
8059; CHECK-LABEL: @sdiv_i32_pow2_shl_denom(
8060; CHECK-NEXT:    [[SHL_Y:%.*]] = shl i32 4096, [[Y:%.*]]
8061; CHECK-NEXT:    [[R:%.*]] = sdiv i32 [[X:%.*]], [[SHL_Y]]
8062; CHECK-NEXT:    store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4
8063; CHECK-NEXT:    ret void
8064;
8065; GFX6-LABEL: sdiv_i32_pow2_shl_denom:
8066; GFX6:       ; %bb.0:
8067; GFX6-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
8068; GFX6-NEXT:    s_mov_b32 s7, 0xf000
8069; GFX6-NEXT:    s_mov_b32 s6, -1
8070; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
8071; GFX6-NEXT:    s_lshl_b32 s3, 0x1000, s3
8072; GFX6-NEXT:    s_ashr_i32 s8, s3, 31
8073; GFX6-NEXT:    s_add_i32 s3, s3, s8
8074; GFX6-NEXT:    s_xor_b32 s3, s3, s8
8075; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s3
8076; GFX6-NEXT:    s_sub_i32 s4, 0, s3
8077; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
8078; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
8079; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
8080; GFX6-NEXT:    v_mul_lo_u32 v1, s4, v0
8081; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
8082; GFX6-NEXT:    s_ashr_i32 s0, s2, 31
8083; GFX6-NEXT:    s_add_i32 s1, s2, s0
8084; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
8085; GFX6-NEXT:    s_xor_b32 s1, s1, s0
8086; GFX6-NEXT:    s_xor_b32 s2, s0, s8
8087; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
8088; GFX6-NEXT:    v_mul_hi_u32 v0, s1, v0
8089; GFX6-NEXT:    v_mul_lo_u32 v1, v0, s3
8090; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
8091; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s1, v1
8092; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s3, v1
8093; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
8094; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s3, v1
8095; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 1, v0
8096; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, v2, s[0:1]
8097; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s3, v1
8098; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
8099; GFX6-NEXT:    v_xor_b32_e32 v0, s2, v0
8100; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s2, v0
8101; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
8102; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
8103; GFX6-NEXT:    s_endpgm
8104;
8105; GFX9-LABEL: sdiv_i32_pow2_shl_denom:
8106; GFX9:       ; %bb.0:
8107; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
8108; GFX9-NEXT:    v_mov_b32_e32 v2, 0
8109; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
8110; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
8111; GFX9-NEXT:    s_lshl_b32 s3, 0x1000, s3
8112; GFX9-NEXT:    s_ashr_i32 s4, s3, 31
8113; GFX9-NEXT:    s_add_i32 s3, s3, s4
8114; GFX9-NEXT:    s_xor_b32 s3, s3, s4
8115; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s3
8116; GFX9-NEXT:    s_sub_i32 s5, 0, s3
8117; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
8118; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
8119; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
8120; GFX9-NEXT:    v_mul_lo_u32 v1, s5, v0
8121; GFX9-NEXT:    s_ashr_i32 s5, s2, 31
8122; GFX9-NEXT:    s_add_i32 s2, s2, s5
8123; GFX9-NEXT:    s_xor_b32 s2, s2, s5
8124; GFX9-NEXT:    v_mul_hi_u32 v1, v0, v1
8125; GFX9-NEXT:    v_add_u32_e32 v0, v0, v1
8126; GFX9-NEXT:    v_mul_hi_u32 v0, s2, v0
8127; GFX9-NEXT:    v_mul_lo_u32 v1, v0, s3
8128; GFX9-NEXT:    v_add_u32_e32 v3, 1, v0
8129; GFX9-NEXT:    v_sub_u32_e32 v1, s2, v1
8130; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v1
8131; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
8132; GFX9-NEXT:    v_subrev_u32_e32 v3, s3, v1
8133; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
8134; GFX9-NEXT:    v_add_u32_e32 v4, 1, v0
8135; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v1
8136; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
8137; GFX9-NEXT:    s_xor_b32 s2, s5, s4
8138; GFX9-NEXT:    v_xor_b32_e32 v0, s2, v0
8139; GFX9-NEXT:    v_subrev_u32_e32 v0, s2, v0
8140; GFX9-NEXT:    global_store_dword v2, v0, s[0:1]
8141; GFX9-NEXT:    s_endpgm
8142;
8143; GFX90A-LABEL: sdiv_i32_pow2_shl_denom:
8144; GFX90A:       ; %bb.0:
8145; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
8146; GFX90A-NEXT:    v_mov_b32_e32 v1, 0
8147; GFX90A-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
8148; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
8149; GFX90A-NEXT:    s_lshl_b32 s3, 0x1000, s3
8150; GFX90A-NEXT:    s_ashr_i32 s4, s3, 31
8151; GFX90A-NEXT:    s_add_i32 s3, s3, s4
8152; GFX90A-NEXT:    s_xor_b32 s3, s3, s4
8153; GFX90A-NEXT:    v_cvt_f32_u32_e32 v0, s3
8154; GFX90A-NEXT:    s_sub_i32 s6, 0, s3
8155; GFX90A-NEXT:    s_ashr_i32 s5, s2, 31
8156; GFX90A-NEXT:    s_add_i32 s2, s2, s5
8157; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v0, v0
8158; GFX90A-NEXT:    s_xor_b32 s2, s2, s5
8159; GFX90A-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
8160; GFX90A-NEXT:    v_cvt_u32_f32_e32 v0, v0
8161; GFX90A-NEXT:    v_mul_lo_u32 v2, s6, v0
8162; GFX90A-NEXT:    v_mul_hi_u32 v2, v0, v2
8163; GFX90A-NEXT:    v_add_u32_e32 v0, v0, v2
8164; GFX90A-NEXT:    v_mul_hi_u32 v0, s2, v0
8165; GFX90A-NEXT:    v_mul_lo_u32 v3, v0, s3
8166; GFX90A-NEXT:    v_sub_u32_e32 v3, s2, v3
8167; GFX90A-NEXT:    v_add_u32_e32 v2, 1, v0
8168; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s3, v3
8169; GFX90A-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
8170; GFX90A-NEXT:    v_subrev_u32_e32 v2, s3, v3
8171; GFX90A-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
8172; GFX90A-NEXT:    v_add_u32_e32 v4, 1, v0
8173; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s3, v2
8174; GFX90A-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
8175; GFX90A-NEXT:    s_xor_b32 s2, s5, s4
8176; GFX90A-NEXT:    v_xor_b32_e32 v0, s2, v0
8177; GFX90A-NEXT:    v_subrev_u32_e32 v0, s2, v0
8178; GFX90A-NEXT:    global_store_dword v1, v0, s[0:1]
8179; GFX90A-NEXT:    s_endpgm
8180  %shl.y = shl i32 4096, %y
8181  %r = sdiv i32 %x, %shl.y
8182  store i32 %r, i32 addrspace(1)* %out
8183  ret void
8184}
8185
; Signed division of a <2 x i32> value by the splat constant 4096 (a power
; of two); the quotient vector is stored to %out.
; Expected results recorded below:
;  - opt output: the vector sdiv is split into one scalar `sdiv i32 ..., 4096`
;    per element and the results are reassembled with insertelement.
;  - llc output (tahiti, gfx900, gfx90a): each lane is computed with scalar
;    shifts and an add only (ashr by 31, lshr by 20, add, ashr by 12).
8186define amdgpu_kernel void @sdiv_v2i32_pow2k_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x) {
8187; CHECK-LABEL: @sdiv_v2i32_pow2k_denom(
8188; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0
8189; CHECK-NEXT:    [[TMP2:%.*]] = sdiv i32 [[TMP1]], 4096
8190; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i64 0
8191; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1
8192; CHECK-NEXT:    [[TMP5:%.*]] = sdiv i32 [[TMP4]], 4096
8193; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1
8194; CHECK-NEXT:    store <2 x i32> [[TMP6]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8
8195; CHECK-NEXT:    ret void
8196;
8197; GFX6-LABEL: sdiv_v2i32_pow2k_denom:
8198; GFX6:       ; %bb.0:
8199; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
8200; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
8201; GFX6-NEXT:    s_mov_b32 s7, 0xf000
8202; GFX6-NEXT:    s_mov_b32 s6, -1
8203; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
8204; GFX6-NEXT:    s_ashr_i32 s2, s0, 31
8205; GFX6-NEXT:    s_ashr_i32 s3, s1, 31
8206; GFX6-NEXT:    s_lshr_b32 s2, s2, 20
8207; GFX6-NEXT:    s_add_i32 s0, s0, s2
8208; GFX6-NEXT:    s_lshr_b32 s2, s3, 20
8209; GFX6-NEXT:    s_add_i32 s1, s1, s2
8210; GFX6-NEXT:    s_ashr_i32 s0, s0, 12
8211; GFX6-NEXT:    s_ashr_i32 s1, s1, 12
8212; GFX6-NEXT:    v_mov_b32_e32 v0, s0
8213; GFX6-NEXT:    v_mov_b32_e32 v1, s1
8214; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
8215; GFX6-NEXT:    s_endpgm
8216;
8217; GFX9-LABEL: sdiv_v2i32_pow2k_denom:
8218; GFX9:       ; %bb.0:
8219; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
8220; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
8221; GFX9-NEXT:    v_mov_b32_e32 v2, 0
8222; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
8223; GFX9-NEXT:    s_ashr_i32 s0, s4, 31
8224; GFX9-NEXT:    s_ashr_i32 s1, s5, 31
8225; GFX9-NEXT:    s_lshr_b32 s0, s0, 20
8226; GFX9-NEXT:    s_lshr_b32 s1, s1, 20
8227; GFX9-NEXT:    s_add_i32 s0, s4, s0
8228; GFX9-NEXT:    s_add_i32 s1, s5, s1
8229; GFX9-NEXT:    s_ashr_i32 s0, s0, 12
8230; GFX9-NEXT:    s_ashr_i32 s1, s1, 12
8231; GFX9-NEXT:    v_mov_b32_e32 v0, s0
8232; GFX9-NEXT:    v_mov_b32_e32 v1, s1
8233; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
8234; GFX9-NEXT:    s_endpgm
8235;
8236; GFX90A-LABEL: sdiv_v2i32_pow2k_denom:
8237; GFX90A:       ; %bb.0:
8238; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
8239; GFX90A-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
8240; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
8241; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
8242; GFX90A-NEXT:    s_ashr_i32 s0, s4, 31
8243; GFX90A-NEXT:    s_ashr_i32 s1, s5, 31
8244; GFX90A-NEXT:    s_lshr_b32 s0, s0, 20
8245; GFX90A-NEXT:    s_lshr_b32 s1, s1, 20
8246; GFX90A-NEXT:    s_add_i32 s0, s4, s0
8247; GFX90A-NEXT:    s_add_i32 s1, s5, s1
8248; GFX90A-NEXT:    s_ashr_i32 s0, s0, 12
8249; GFX90A-NEXT:    s_ashr_i32 s1, s1, 12
8250; GFX90A-NEXT:    v_mov_b32_e32 v0, s0
8251; GFX90A-NEXT:    v_mov_b32_e32 v1, s1
8252; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
8253; GFX90A-NEXT:    s_endpgm
8254  %r = sdiv <2 x i32> %x, <i32 4096, i32 4096>
8255  store <2 x i32> %r, <2 x i32> addrspace(1)* %out
8256  ret void
8257}
8258
; Signed division of a <2 x i32> value by the constant vector <4096, 4095>,
; i.e. one power-of-two divisor lane and one non-power-of-two divisor lane.
; Expected results recorded below:
;  - opt output: split into `sdiv i32 ..., 4096` for element 0 and
;    `sdiv i32 ..., 4095` for element 1, reassembled with insertelement.
;  - llc output: lane 0 uses the shift/add sequence (ashr 31, lshr 20, add,
;    ashr 12); lane 1 uses a signed high-half multiply by 0x80080081, adds
;    the dividend, then combines an ashr by 11 with the extracted sign bit.
;    tahiti performs the multiply with v_mul_hi_i32, whereas gfx900 and
;    gfx90a use s_mul_hi_i32.
; The "ssdiv" spelling in the function name looks like a typo for "sdiv"
; (TODO confirm); it is left as is because the label checks reference it.
8259define amdgpu_kernel void @ssdiv_v2i32_mixed_pow2k_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x) {
8260; CHECK-LABEL: @ssdiv_v2i32_mixed_pow2k_denom(
8261; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0
8262; CHECK-NEXT:    [[TMP2:%.*]] = sdiv i32 [[TMP1]], 4096
8263; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i64 0
8264; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1
8265; CHECK-NEXT:    [[TMP5:%.*]] = sdiv i32 [[TMP4]], 4095
8266; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1
8267; CHECK-NEXT:    store <2 x i32> [[TMP6]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8
8268; CHECK-NEXT:    ret void
8269;
8270; GFX6-LABEL: ssdiv_v2i32_mixed_pow2k_denom:
8271; GFX6:       ; %bb.0:
8272; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
8273; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
8274; GFX6-NEXT:    v_mov_b32_e32 v0, 0x80080081
8275; GFX6-NEXT:    s_mov_b32 s7, 0xf000
8276; GFX6-NEXT:    s_mov_b32 s6, -1
8277; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
8278; GFX6-NEXT:    v_mul_hi_i32 v0, s1, v0
8279; GFX6-NEXT:    s_ashr_i32 s2, s0, 31
8280; GFX6-NEXT:    s_lshr_b32 s2, s2, 20
8281; GFX6-NEXT:    s_add_i32 s0, s0, s2
8282; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s1, v0
8283; GFX6-NEXT:    s_ashr_i32 s0, s0, 12
8284; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 31, v0
8285; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 11, v0
8286; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v0
8287; GFX6-NEXT:    v_mov_b32_e32 v0, s0
8288; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
8289; GFX6-NEXT:    s_endpgm
8290;
8291; GFX9-LABEL: ssdiv_v2i32_mixed_pow2k_denom:
8292; GFX9:       ; %bb.0:
8293; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
8294; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
8295; GFX9-NEXT:    v_mov_b32_e32 v2, 0
8296; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
8297; GFX9-NEXT:    s_ashr_i32 s0, s4, 31
8298; GFX9-NEXT:    s_mul_hi_i32 s1, s5, 0x80080081
8299; GFX9-NEXT:    s_lshr_b32 s0, s0, 20
8300; GFX9-NEXT:    s_add_i32 s1, s1, s5
8301; GFX9-NEXT:    s_add_i32 s0, s4, s0
8302; GFX9-NEXT:    s_lshr_b32 s4, s1, 31
8303; GFX9-NEXT:    s_ashr_i32 s1, s1, 11
8304; GFX9-NEXT:    s_ashr_i32 s0, s0, 12
8305; GFX9-NEXT:    s_add_i32 s1, s1, s4
8306; GFX9-NEXT:    v_mov_b32_e32 v0, s0
8307; GFX9-NEXT:    v_mov_b32_e32 v1, s1
8308; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
8309; GFX9-NEXT:    s_endpgm
8310;
8311; GFX90A-LABEL: ssdiv_v2i32_mixed_pow2k_denom:
8312; GFX90A:       ; %bb.0:
8313; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
8314; GFX90A-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
8315; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
8316; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
8317; GFX90A-NEXT:    s_ashr_i32 s0, s4, 31
8318; GFX90A-NEXT:    s_mul_hi_i32 s1, s5, 0x80080081
8319; GFX90A-NEXT:    s_lshr_b32 s0, s0, 20
8320; GFX90A-NEXT:    s_add_i32 s1, s1, s5
8321; GFX90A-NEXT:    s_add_i32 s0, s4, s0
8322; GFX90A-NEXT:    s_lshr_b32 s4, s1, 31
8323; GFX90A-NEXT:    s_ashr_i32 s1, s1, 11
8324; GFX90A-NEXT:    s_ashr_i32 s0, s0, 12
8325; GFX90A-NEXT:    s_add_i32 s1, s1, s4
8326; GFX90A-NEXT:    v_mov_b32_e32 v0, s0
8327; GFX90A-NEXT:    v_mov_b32_e32 v1, s1
8328; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
8329; GFX90A-NEXT:    s_endpgm
8330  %r = sdiv <2 x i32> %x, <i32 4096, i32 4095>
8331  store <2 x i32> %r, <2 x i32> addrspace(1)* %out
8332  ret void
8333}
8334
; Signed division of a <2 x i32> value by `shl <4096, 4096>, %y`, so the
; divisor is only known at run time; the quotient vector is stored to %out.
; Expected results recorded below:
;  - opt output: the shl stays a vector op, while the sdiv is expanded per
;    element into the following straight-line sequence:
;      1. sign masks via ashr by 31; both operands are turned into
;         magnitudes with add + xor, and the xor of the two masks is kept
;         for the final sign fix-up;
;      2. a reciprocal estimate of the divisor magnitude: uitofp,
;         llvm.amdgcn.rcp.f32, fmul by 0x41EFFFFFC0000000, fptoui;
;      3. the estimate is adjusted by adding the high 32 bits of
;         estimate * ((0 - divisor) * estimate);
;      4. quotient guess = high 32 bits of dividend * adjusted estimate,
;         remainder guess = dividend - guess * divisor;
;      5. two rounds of "if remainder uge divisor then quotient + 1" (the
;         first round also subtracts the divisor from the remainder);
;      6. sign restore with xor / sub against the combined sign mask.
;  - llc output (tahiti, gfx900, gfx90a): the same computation in ISA form,
;    using v_rcp_iflag_f32 and a multiply by 0x4f7ffffe for step 2, with the
;    two lanes' instruction streams interleaved by the scheduler.
8335define amdgpu_kernel void @sdiv_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x, <2 x i32> %y) {
8336; CHECK-LABEL: @sdiv_v2i32_pow2_shl_denom(
8337; CHECK-NEXT:    [[SHL_Y:%.*]] = shl <2 x i32> <i32 4096, i32 4096>, [[Y:%.*]]
8338; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0
8339; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 0
8340; CHECK-NEXT:    [[TMP3:%.*]] = ashr i32 [[TMP1]], 31
8341; CHECK-NEXT:    [[TMP4:%.*]] = ashr i32 [[TMP2]], 31
8342; CHECK-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]]
8343; CHECK-NEXT:    [[TMP6:%.*]] = add i32 [[TMP1]], [[TMP3]]
8344; CHECK-NEXT:    [[TMP7:%.*]] = add i32 [[TMP2]], [[TMP4]]
8345; CHECK-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP3]]
8346; CHECK-NEXT:    [[TMP9:%.*]] = xor i32 [[TMP7]], [[TMP4]]
8347; CHECK-NEXT:    [[TMP10:%.*]] = uitofp i32 [[TMP9]] to float
8348; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP10]])
8349; CHECK-NEXT:    [[TMP12:%.*]] = fmul fast float [[TMP11]], 0x41EFFFFFC0000000
8350; CHECK-NEXT:    [[TMP13:%.*]] = fptoui float [[TMP12]] to i32
8351; CHECK-NEXT:    [[TMP14:%.*]] = sub i32 0, [[TMP9]]
8352; CHECK-NEXT:    [[TMP15:%.*]] = mul i32 [[TMP14]], [[TMP13]]
8353; CHECK-NEXT:    [[TMP16:%.*]] = zext i32 [[TMP13]] to i64
8354; CHECK-NEXT:    [[TMP17:%.*]] = zext i32 [[TMP15]] to i64
8355; CHECK-NEXT:    [[TMP18:%.*]] = mul i64 [[TMP16]], [[TMP17]]
8356; CHECK-NEXT:    [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
8357; CHECK-NEXT:    [[TMP20:%.*]] = lshr i64 [[TMP18]], 32
8358; CHECK-NEXT:    [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32
8359; CHECK-NEXT:    [[TMP22:%.*]] = add i32 [[TMP13]], [[TMP21]]
8360; CHECK-NEXT:    [[TMP23:%.*]] = zext i32 [[TMP8]] to i64
8361; CHECK-NEXT:    [[TMP24:%.*]] = zext i32 [[TMP22]] to i64
8362; CHECK-NEXT:    [[TMP25:%.*]] = mul i64 [[TMP23]], [[TMP24]]
8363; CHECK-NEXT:    [[TMP26:%.*]] = trunc i64 [[TMP25]] to i32
8364; CHECK-NEXT:    [[TMP27:%.*]] = lshr i64 [[TMP25]], 32
8365; CHECK-NEXT:    [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32
8366; CHECK-NEXT:    [[TMP29:%.*]] = mul i32 [[TMP28]], [[TMP9]]
8367; CHECK-NEXT:    [[TMP30:%.*]] = sub i32 [[TMP8]], [[TMP29]]
8368; CHECK-NEXT:    [[TMP31:%.*]] = icmp uge i32 [[TMP30]], [[TMP9]]
8369; CHECK-NEXT:    [[TMP32:%.*]] = add i32 [[TMP28]], 1
8370; CHECK-NEXT:    [[TMP33:%.*]] = select i1 [[TMP31]], i32 [[TMP32]], i32 [[TMP28]]
8371; CHECK-NEXT:    [[TMP34:%.*]] = sub i32 [[TMP30]], [[TMP9]]
8372; CHECK-NEXT:    [[TMP35:%.*]] = select i1 [[TMP31]], i32 [[TMP34]], i32 [[TMP30]]
8373; CHECK-NEXT:    [[TMP36:%.*]] = icmp uge i32 [[TMP35]], [[TMP9]]
8374; CHECK-NEXT:    [[TMP37:%.*]] = add i32 [[TMP33]], 1
8375; CHECK-NEXT:    [[TMP38:%.*]] = select i1 [[TMP36]], i32 [[TMP37]], i32 [[TMP33]]
8376; CHECK-NEXT:    [[TMP39:%.*]] = xor i32 [[TMP38]], [[TMP5]]
8377; CHECK-NEXT:    [[TMP40:%.*]] = sub i32 [[TMP39]], [[TMP5]]
8378; CHECK-NEXT:    [[TMP41:%.*]] = insertelement <2 x i32> undef, i32 [[TMP40]], i64 0
8379; CHECK-NEXT:    [[TMP42:%.*]] = extractelement <2 x i32> [[X]], i64 1
8380; CHECK-NEXT:    [[TMP43:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 1
8381; CHECK-NEXT:    [[TMP44:%.*]] = ashr i32 [[TMP42]], 31
8382; CHECK-NEXT:    [[TMP45:%.*]] = ashr i32 [[TMP43]], 31
8383; CHECK-NEXT:    [[TMP46:%.*]] = xor i32 [[TMP44]], [[TMP45]]
8384; CHECK-NEXT:    [[TMP47:%.*]] = add i32 [[TMP42]], [[TMP44]]
8385; CHECK-NEXT:    [[TMP48:%.*]] = add i32 [[TMP43]], [[TMP45]]
8386; CHECK-NEXT:    [[TMP49:%.*]] = xor i32 [[TMP47]], [[TMP44]]
8387; CHECK-NEXT:    [[TMP50:%.*]] = xor i32 [[TMP48]], [[TMP45]]
8388; CHECK-NEXT:    [[TMP51:%.*]] = uitofp i32 [[TMP50]] to float
8389; CHECK-NEXT:    [[TMP52:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP51]])
8390; CHECK-NEXT:    [[TMP53:%.*]] = fmul fast float [[TMP52]], 0x41EFFFFFC0000000
8391; CHECK-NEXT:    [[TMP54:%.*]] = fptoui float [[TMP53]] to i32
8392; CHECK-NEXT:    [[TMP55:%.*]] = sub i32 0, [[TMP50]]
8393; CHECK-NEXT:    [[TMP56:%.*]] = mul i32 [[TMP55]], [[TMP54]]
8394; CHECK-NEXT:    [[TMP57:%.*]] = zext i32 [[TMP54]] to i64
8395; CHECK-NEXT:    [[TMP58:%.*]] = zext i32 [[TMP56]] to i64
8396; CHECK-NEXT:    [[TMP59:%.*]] = mul i64 [[TMP57]], [[TMP58]]
8397; CHECK-NEXT:    [[TMP60:%.*]] = trunc i64 [[TMP59]] to i32
8398; CHECK-NEXT:    [[TMP61:%.*]] = lshr i64 [[TMP59]], 32
8399; CHECK-NEXT:    [[TMP62:%.*]] = trunc i64 [[TMP61]] to i32
8400; CHECK-NEXT:    [[TMP63:%.*]] = add i32 [[TMP54]], [[TMP62]]
8401; CHECK-NEXT:    [[TMP64:%.*]] = zext i32 [[TMP49]] to i64
8402; CHECK-NEXT:    [[TMP65:%.*]] = zext i32 [[TMP63]] to i64
8403; CHECK-NEXT:    [[TMP66:%.*]] = mul i64 [[TMP64]], [[TMP65]]
8404; CHECK-NEXT:    [[TMP67:%.*]] = trunc i64 [[TMP66]] to i32
8405; CHECK-NEXT:    [[TMP68:%.*]] = lshr i64 [[TMP66]], 32
8406; CHECK-NEXT:    [[TMP69:%.*]] = trunc i64 [[TMP68]] to i32
8407; CHECK-NEXT:    [[TMP70:%.*]] = mul i32 [[TMP69]], [[TMP50]]
8408; CHECK-NEXT:    [[TMP71:%.*]] = sub i32 [[TMP49]], [[TMP70]]
8409; CHECK-NEXT:    [[TMP72:%.*]] = icmp uge i32 [[TMP71]], [[TMP50]]
8410; CHECK-NEXT:    [[TMP73:%.*]] = add i32 [[TMP69]], 1
8411; CHECK-NEXT:    [[TMP74:%.*]] = select i1 [[TMP72]], i32 [[TMP73]], i32 [[TMP69]]
8412; CHECK-NEXT:    [[TMP75:%.*]] = sub i32 [[TMP71]], [[TMP50]]
8413; CHECK-NEXT:    [[TMP76:%.*]] = select i1 [[TMP72]], i32 [[TMP75]], i32 [[TMP71]]
8414; CHECK-NEXT:    [[TMP77:%.*]] = icmp uge i32 [[TMP76]], [[TMP50]]
8415; CHECK-NEXT:    [[TMP78:%.*]] = add i32 [[TMP74]], 1
8416; CHECK-NEXT:    [[TMP79:%.*]] = select i1 [[TMP77]], i32 [[TMP78]], i32 [[TMP74]]
8417; CHECK-NEXT:    [[TMP80:%.*]] = xor i32 [[TMP79]], [[TMP46]]
8418; CHECK-NEXT:    [[TMP81:%.*]] = sub i32 [[TMP80]], [[TMP46]]
8419; CHECK-NEXT:    [[TMP82:%.*]] = insertelement <2 x i32> [[TMP41]], i32 [[TMP81]], i64 1
8420; CHECK-NEXT:    store <2 x i32> [[TMP82]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8
8421; CHECK-NEXT:    ret void
8422;
8423; GFX6-LABEL: sdiv_v2i32_pow2_shl_denom:
8424; GFX6:       ; %bb.0:
8425; GFX6-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
8426; GFX6-NEXT:    s_movk_i32 s10, 0x1000
8427; GFX6-NEXT:    s_mov_b32 s12, 0x4f7ffffe
8428; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
8429; GFX6-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xb
8430; GFX6-NEXT:    s_mov_b32 s7, 0xf000
8431; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
8432; GFX6-NEXT:    s_lshl_b32 s2, s10, s2
8433; GFX6-NEXT:    s_ashr_i32 s11, s2, 31
8434; GFX6-NEXT:    s_add_i32 s2, s2, s11
8435; GFX6-NEXT:    s_xor_b32 s2, s2, s11
8436; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s2
8437; GFX6-NEXT:    s_lshl_b32 s0, s10, s3
8438; GFX6-NEXT:    s_sub_i32 s10, 0, s2
8439; GFX6-NEXT:    s_ashr_i32 s3, s0, 31
8440; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
8441; GFX6-NEXT:    s_add_i32 s0, s0, s3
8442; GFX6-NEXT:    s_ashr_i32 s1, s8, 31
8443; GFX6-NEXT:    s_mov_b32 s6, -1
8444; GFX6-NEXT:    v_mul_f32_e32 v0, s12, v0
8445; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
8446; GFX6-NEXT:    v_mul_lo_u32 v1, s10, v0
8447; GFX6-NEXT:    s_xor_b32 s10, s0, s3
8448; GFX6-NEXT:    v_cvt_f32_u32_e32 v2, s10
8449; GFX6-NEXT:    s_add_i32 s0, s8, s1
8450; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
8451; GFX6-NEXT:    s_xor_b32 s0, s0, s1
8452; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v2
8453; GFX6-NEXT:    s_xor_b32 s8, s1, s11
8454; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
8455; GFX6-NEXT:    v_mul_hi_u32 v0, s0, v0
8456; GFX6-NEXT:    v_mul_f32_e32 v1, s12, v2
8457; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
8458; GFX6-NEXT:    v_mul_lo_u32 v2, v0, s2
8459; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 1, v0
8460; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s0, v2
8461; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s2, v2
8462; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v3, s[0:1]
8463; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s2, v2
8464; GFX6-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
8465; GFX6-NEXT:    s_sub_i32 s0, 0, s10
8466; GFX6-NEXT:    v_mul_lo_u32 v3, s0, v1
8467; GFX6-NEXT:    s_ashr_i32 s0, s9, 31
8468; GFX6-NEXT:    s_add_i32 s1, s9, s0
8469; GFX6-NEXT:    s_xor_b32 s1, s1, s0
8470; GFX6-NEXT:    v_mul_hi_u32 v3, v1, v3
8471; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 1, v0
8472; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v3, v1
8473; GFX6-NEXT:    v_mul_hi_u32 v1, s1, v1
8474; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s2, v2
8475; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
8476; GFX6-NEXT:    s_xor_b32 s2, s0, s3
8477; GFX6-NEXT:    v_mul_lo_u32 v2, v1, s10
8478; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 1, v1
8479; GFX6-NEXT:    v_xor_b32_e32 v0, s8, v0
8480; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s1, v2
8481; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s10, v2
8482; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
8483; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s10, v2
8484; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s8, v0
8485; GFX6-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
8486; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 1, v1
8487; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s10, v2
8488; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
8489; GFX6-NEXT:    v_xor_b32_e32 v1, s2, v1
8490; GFX6-NEXT:    v_subrev_i32_e32 v1, vcc, s2, v1
8491; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
8492; GFX6-NEXT:    s_endpgm
8493;
8494; GFX9-LABEL: sdiv_v2i32_pow2_shl_denom:
8495; GFX9:       ; %bb.0:
8496; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
8497; GFX9-NEXT:    s_movk_i32 s8, 0x1000
8498; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
8499; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x2c
8500; GFX9-NEXT:    s_mov_b32 s10, 0x4f7ffffe
8501; GFX9-NEXT:    v_mov_b32_e32 v2, 0
8502; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
8503; GFX9-NEXT:    s_lshl_b32 s2, s8, s2
8504; GFX9-NEXT:    s_ashr_i32 s9, s2, 31
8505; GFX9-NEXT:    s_add_i32 s2, s2, s9
8506; GFX9-NEXT:    s_xor_b32 s2, s2, s9
8507; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s2
8508; GFX9-NEXT:    s_lshl_b32 s0, s8, s3
8509; GFX9-NEXT:    s_ashr_i32 s1, s0, 31
8510; GFX9-NEXT:    s_add_i32 s0, s0, s1
8511; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
8512; GFX9-NEXT:    s_xor_b32 s0, s0, s1
8513; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s0
8514; GFX9-NEXT:    s_sub_i32 s3, 0, s2
8515; GFX9-NEXT:    v_mul_f32_e32 v0, s10, v0
8516; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
8517; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v1
8518; GFX9-NEXT:    s_sub_i32 s8, 0, s0
8519; GFX9-NEXT:    v_mul_lo_u32 v3, s3, v0
8520; GFX9-NEXT:    v_mul_f32_e32 v1, s10, v1
8521; GFX9-NEXT:    s_ashr_i32 s3, s6, 31
8522; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
8523; GFX9-NEXT:    v_mul_hi_u32 v3, v0, v3
8524; GFX9-NEXT:    s_add_i32 s6, s6, s3
8525; GFX9-NEXT:    s_xor_b32 s6, s6, s3
8526; GFX9-NEXT:    s_xor_b32 s3, s3, s9
8527; GFX9-NEXT:    v_add_u32_e32 v0, v0, v3
8528; GFX9-NEXT:    v_mul_hi_u32 v0, s6, v0
8529; GFX9-NEXT:    v_mul_lo_u32 v3, s8, v1
8530; GFX9-NEXT:    s_ashr_i32 s8, s7, 31
8531; GFX9-NEXT:    s_xor_b32 s1, s8, s1
8532; GFX9-NEXT:    v_mul_lo_u32 v4, v0, s2
8533; GFX9-NEXT:    v_mul_hi_u32 v3, v1, v3
8534; GFX9-NEXT:    v_add_u32_e32 v5, 1, v0
8535; GFX9-NEXT:    v_sub_u32_e32 v4, s6, v4
8536; GFX9-NEXT:    s_add_i32 s6, s7, s8
8537; GFX9-NEXT:    s_xor_b32 s6, s6, s8
8538; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
8539; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s2, v4
8540; GFX9-NEXT:    v_mul_hi_u32 v1, s6, v1
8541; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
8542; GFX9-NEXT:    v_subrev_u32_e32 v5, s2, v4
8543; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
8544; GFX9-NEXT:    v_add_u32_e32 v3, 1, v0
8545; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s2, v4
8546; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
8547; GFX9-NEXT:    v_mul_lo_u32 v3, v1, s0
8548; GFX9-NEXT:    v_add_u32_e32 v4, 1, v1
8549; GFX9-NEXT:    v_xor_b32_e32 v0, s3, v0
8550; GFX9-NEXT:    v_subrev_u32_e32 v0, s3, v0
8551; GFX9-NEXT:    v_sub_u32_e32 v3, s6, v3
8552; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s0, v3
8553; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
8554; GFX9-NEXT:    v_subrev_u32_e32 v4, s0, v3
8555; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
8556; GFX9-NEXT:    v_add_u32_e32 v4, 1, v1
8557; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s0, v3
8558; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
8559; GFX9-NEXT:    v_xor_b32_e32 v1, s1, v1
8560; GFX9-NEXT:    v_subrev_u32_e32 v1, s1, v1
8561; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
8562; GFX9-NEXT:    s_endpgm
8563;
8564; GFX90A-LABEL: sdiv_v2i32_pow2_shl_denom:
8565; GFX90A:       ; %bb.0:
8566; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
8567; GFX90A-NEXT:    s_movk_i32 s8, 0x1000
8568; GFX90A-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
8569; GFX90A-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x2c
8570; GFX90A-NEXT:    s_mov_b32 s10, 0x4f7ffffe
8571; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
8572; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
8573; GFX90A-NEXT:    s_lshl_b32 s2, s8, s2
8574; GFX90A-NEXT:    s_ashr_i32 s9, s2, 31
8575; GFX90A-NEXT:    s_add_i32 s2, s2, s9
8576; GFX90A-NEXT:    s_xor_b32 s2, s2, s9
8577; GFX90A-NEXT:    v_cvt_f32_u32_e32 v0, s2
8578; GFX90A-NEXT:    s_ashr_i32 s1, s6, 31
8579; GFX90A-NEXT:    s_lshl_b32 s0, s8, s3
8580; GFX90A-NEXT:    s_add_i32 s3, s6, s1
8581; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v0, v0
8582; GFX90A-NEXT:    s_xor_b32 s6, s1, s9
8583; GFX90A-NEXT:    s_xor_b32 s1, s3, s1
8584; GFX90A-NEXT:    s_sub_i32 s3, 0, s2
8585; GFX90A-NEXT:    v_mul_f32_e32 v0, s10, v0
8586; GFX90A-NEXT:    v_cvt_u32_f32_e32 v0, v0
8587; GFX90A-NEXT:    v_mul_lo_u32 v1, s3, v0
8588; GFX90A-NEXT:    v_mul_hi_u32 v1, v0, v1
8589; GFX90A-NEXT:    v_add_u32_e32 v0, v0, v1
8590; GFX90A-NEXT:    v_mul_hi_u32 v0, s1, v0
8591; GFX90A-NEXT:    v_mul_lo_u32 v1, v0, s2
8592; GFX90A-NEXT:    v_sub_u32_e32 v1, s1, v1
8593; GFX90A-NEXT:    s_ashr_i32 s1, s0, 31
8594; GFX90A-NEXT:    s_add_i32 s0, s0, s1
8595; GFX90A-NEXT:    s_xor_b32 s0, s0, s1
8596; GFX90A-NEXT:    v_cvt_f32_u32_e32 v4, s0
8597; GFX90A-NEXT:    v_add_u32_e32 v3, 1, v0
8598; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s2, v1
8599; GFX90A-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
8600; GFX90A-NEXT:    v_subrev_u32_e32 v3, s2, v1
8601; GFX90A-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
8602; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s2, v1
8603; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v1, v4
8604; GFX90A-NEXT:    s_ashr_i32 s2, s7, 31
8605; GFX90A-NEXT:    s_add_i32 s3, s7, s2
8606; GFX90A-NEXT:    v_add_u32_e32 v3, 1, v0
8607; GFX90A-NEXT:    v_mul_f32_e32 v1, s10, v1
8608; GFX90A-NEXT:    v_cvt_u32_f32_e32 v1, v1
8609; GFX90A-NEXT:    s_xor_b32 s1, s2, s1
8610; GFX90A-NEXT:    s_xor_b32 s2, s3, s2
8611; GFX90A-NEXT:    s_sub_i32 s3, 0, s0
8612; GFX90A-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
8613; GFX90A-NEXT:    v_mul_lo_u32 v3, s3, v1
8614; GFX90A-NEXT:    v_mul_hi_u32 v3, v1, v3
8615; GFX90A-NEXT:    v_add_u32_e32 v1, v1, v3
8616; GFX90A-NEXT:    v_mul_hi_u32 v1, s2, v1
8617; GFX90A-NEXT:    v_mul_lo_u32 v3, v1, s0
8618; GFX90A-NEXT:    v_sub_u32_e32 v3, s2, v3
8619; GFX90A-NEXT:    v_add_u32_e32 v4, 1, v1
8620; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s0, v3
8621; GFX90A-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
8622; GFX90A-NEXT:    v_subrev_u32_e32 v4, s0, v3
8623; GFX90A-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
8624; GFX90A-NEXT:    v_add_u32_e32 v4, 1, v1
8625; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s0, v3
8626; GFX90A-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
8627; GFX90A-NEXT:    v_xor_b32_e32 v0, s6, v0
8628; GFX90A-NEXT:    v_xor_b32_e32 v1, s1, v1
8629; GFX90A-NEXT:    v_subrev_u32_e32 v0, s6, v0
8630; GFX90A-NEXT:    v_subrev_u32_e32 v1, s1, v1
8631; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
8632; GFX90A-NEXT:    s_endpgm
8633  %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y
8634  %r = sdiv <2 x i32> %x, %shl.y
8635  store <2 x i32> %r, <2 x i32> addrspace(1)* %out
8636  ret void
8637}
8638
; Signed remainder of an i32 by the odd constant 1235195 (0x12d8fb); the
; result is stored to %out.
; Expected results recorded below:
;  - opt output: the srem instruction is left untouched.
;  - llc output (tahiti, gfx900, gfx90a): no division is emitted. The
;    quotient is formed from a signed high-half multiply by 0xd9528441, an
;    add of the dividend, and an ashr by 20 combined with the extracted sign
;    bit; the remainder is then x - quotient * 0x12d8fb. tahiti uses vector
;    multiplies (v_mul_hi_i32 / v_mul_lo_u32), while gfx900 and gfx90a use
;    the scalar forms (s_mul_hi_i32 / s_mul_i32).
8639define amdgpu_kernel void @srem_i32_oddk_denom(i32 addrspace(1)* %out, i32 %x) {
8640; CHECK-LABEL: @srem_i32_oddk_denom(
8641; CHECK-NEXT:    [[R:%.*]] = srem i32 [[X:%.*]], 1235195
8642; CHECK-NEXT:    store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4
8643; CHECK-NEXT:    ret void
8644;
8645; GFX6-LABEL: srem_i32_oddk_denom:
8646; GFX6:       ; %bb.0:
8647; GFX6-NEXT:    s_load_dword s4, s[0:1], 0xb
8648; GFX6-NEXT:    v_mov_b32_e32 v0, 0xd9528441
8649; GFX6-NEXT:    s_mov_b32 s2, 0x12d8fb
8650; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
8651; GFX6-NEXT:    s_mov_b32 s3, 0xf000
8652; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
8653; GFX6-NEXT:    v_mul_hi_i32 v0, s4, v0
8654; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s4, v0
8655; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 31, v0
8656; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 20, v0
8657; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
8658; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s2
8659; GFX6-NEXT:    s_mov_b32 s2, -1
8660; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
8661; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
8662; GFX6-NEXT:    s_endpgm
8663;
8664; GFX9-LABEL: srem_i32_oddk_denom:
8665; GFX9:       ; %bb.0:
8666; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
8667; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x2c
8668; GFX9-NEXT:    v_mov_b32_e32 v0, 0
8669; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
8670; GFX9-NEXT:    s_mul_hi_i32 s0, s4, 0xd9528441
8671; GFX9-NEXT:    s_add_i32 s0, s0, s4
8672; GFX9-NEXT:    s_lshr_b32 s1, s0, 31
8673; GFX9-NEXT:    s_ashr_i32 s0, s0, 20
8674; GFX9-NEXT:    s_add_i32 s0, s0, s1
8675; GFX9-NEXT:    s_mul_i32 s0, s0, 0x12d8fb
8676; GFX9-NEXT:    s_sub_i32 s0, s4, s0
8677; GFX9-NEXT:    v_mov_b32_e32 v1, s0
8678; GFX9-NEXT:    global_store_dword v0, v1, s[2:3]
8679; GFX9-NEXT:    s_endpgm
8680;
8681; GFX90A-LABEL: srem_i32_oddk_denom:
8682; GFX90A:       ; %bb.0:
8683; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
8684; GFX90A-NEXT:    s_load_dword s4, s[0:1], 0x2c
8685; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
8686; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
8687; GFX90A-NEXT:    s_mul_hi_i32 s0, s4, 0xd9528441
8688; GFX90A-NEXT:    s_add_i32 s0, s0, s4
8689; GFX90A-NEXT:    s_lshr_b32 s1, s0, 31
8690; GFX90A-NEXT:    s_ashr_i32 s0, s0, 20
8691; GFX90A-NEXT:    s_add_i32 s0, s0, s1
8692; GFX90A-NEXT:    s_mul_i32 s0, s0, 0x12d8fb
8693; GFX90A-NEXT:    s_sub_i32 s0, s4, s0
8694; GFX90A-NEXT:    v_mov_b32_e32 v1, s0
8695; GFX90A-NEXT:    global_store_dword v0, v1, s[2:3]
8696; GFX90A-NEXT:    s_endpgm
8697  %r = srem i32 %x, 1235195
8698  store i32 %r, i32 addrspace(1)* %out
8699  ret void
8700}
8701
; Signed remainder of an i32 by the constant 4096 (a power of two); the
; result is stored to %out.
; Expected results recorded below:
;  - opt output: the srem instruction is left untouched.
;  - llc output (tahiti, gfx900, gfx90a): a purely scalar sequence with no
;    multiply or divide: ashr by 31, lshr by 20, add to x, mask with
;    0xfffff000, and subtract the masked value from x.
8702define amdgpu_kernel void @srem_i32_pow2k_denom(i32 addrspace(1)* %out, i32 %x) {
8703; CHECK-LABEL: @srem_i32_pow2k_denom(
8704; CHECK-NEXT:    [[R:%.*]] = srem i32 [[X:%.*]], 4096
8705; CHECK-NEXT:    store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4
8706; CHECK-NEXT:    ret void
8707;
8708; GFX6-LABEL: srem_i32_pow2k_denom:
8709; GFX6:       ; %bb.0:
8710; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
8711; GFX6-NEXT:    s_load_dword s0, s[0:1], 0xb
8712; GFX6-NEXT:    s_mov_b32 s7, 0xf000
8713; GFX6-NEXT:    s_mov_b32 s6, -1
8714; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
8715; GFX6-NEXT:    s_ashr_i32 s1, s0, 31
8716; GFX6-NEXT:    s_lshr_b32 s1, s1, 20
8717; GFX6-NEXT:    s_add_i32 s1, s0, s1
8718; GFX6-NEXT:    s_and_b32 s1, s1, 0xfffff000
8719; GFX6-NEXT:    s_sub_i32 s0, s0, s1
8720; GFX6-NEXT:    v_mov_b32_e32 v0, s0
8721; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
8722; GFX6-NEXT:    s_endpgm
8723;
8724; GFX9-LABEL: srem_i32_pow2k_denom:
8725; GFX9:       ; %bb.0:
8726; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
8727; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x2c
8728; GFX9-NEXT:    v_mov_b32_e32 v0, 0
8729; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
8730; GFX9-NEXT:    s_ashr_i32 s0, s4, 31
8731; GFX9-NEXT:    s_lshr_b32 s0, s0, 20
8732; GFX9-NEXT:    s_add_i32 s0, s4, s0
8733; GFX9-NEXT:    s_and_b32 s0, s0, 0xfffff000
8734; GFX9-NEXT:    s_sub_i32 s0, s4, s0
8735; GFX9-NEXT:    v_mov_b32_e32 v1, s0
8736; GFX9-NEXT:    global_store_dword v0, v1, s[2:3]
8737; GFX9-NEXT:    s_endpgm
8738;
8739; GFX90A-LABEL: srem_i32_pow2k_denom:
8740; GFX90A:       ; %bb.0:
8741; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
8742; GFX90A-NEXT:    s_load_dword s4, s[0:1], 0x2c
8743; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
8744; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
8745; GFX90A-NEXT:    s_ashr_i32 s0, s4, 31
8746; GFX90A-NEXT:    s_lshr_b32 s0, s0, 20
8747; GFX90A-NEXT:    s_add_i32 s0, s4, s0
8748; GFX90A-NEXT:    s_and_b32 s0, s0, 0xfffff000
8749; GFX90A-NEXT:    s_sub_i32 s0, s4, s0
8750; GFX90A-NEXT:    v_mov_b32_e32 v1, s0
8751; GFX90A-NEXT:    global_store_dword v0, v1, s[2:3]
8752; GFX90A-NEXT:    s_endpgm
8753  %r = srem i32 %x, 4096
8754  store i32 %r, i32 addrspace(1)* %out
8755  ret void
8756}
8757
; Signed remainder of an i32 by `shl i32 4096, %y`, so the divisor is only
; known at run time; the result is stored to %out.
; Expected results recorded below:
;  - opt output: both the shl and the srem are left untouched.
;  - llc output (tahiti, gfx900, gfx90a): the remainder is expanded inline.
;    Both operands are reduced to magnitudes (ashr 31, add, xor), a
;    reciprocal estimate of the divisor is built with v_cvt_f32_u32,
;    v_rcp_iflag_f32, a multiply by 0x4f7ffffe and v_cvt_u32_f32, and the
;    estimate is adjusted with a mul_lo / mul_hi / add step. The remainder
;    guess is dividend - mul_hi(dividend, estimate) * divisor, followed by
;    two conditional subtractions of the divisor (v_cmp_le_u32 +
;    v_cndmask_b32). The final xor / subrev uses the sign mask of the
;    dividend x only, not the divisor's.
8758define amdgpu_kernel void @srem_i32_pow2_shl_denom(i32 addrspace(1)* %out, i32 %x, i32 %y) {
8759; CHECK-LABEL: @srem_i32_pow2_shl_denom(
8760; CHECK-NEXT:    [[SHL_Y:%.*]] = shl i32 4096, [[Y:%.*]]
8761; CHECK-NEXT:    [[R:%.*]] = srem i32 [[X:%.*]], [[SHL_Y]]
8762; CHECK-NEXT:    store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4
8763; CHECK-NEXT:    ret void
8764;
8765; GFX6-LABEL: srem_i32_pow2_shl_denom:
8766; GFX6:       ; %bb.0:
8767; GFX6-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
8768; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
8769; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
8770; GFX6-NEXT:    s_lshl_b32 s3, 0x1000, s3
8771; GFX6-NEXT:    s_ashr_i32 s4, s3, 31
8772; GFX6-NEXT:    s_add_i32 s3, s3, s4
8773; GFX6-NEXT:    s_xor_b32 s4, s3, s4
8774; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s4
8775; GFX6-NEXT:    s_sub_i32 s3, 0, s4
8776; GFX6-NEXT:    s_ashr_i32 s5, s2, 31
8777; GFX6-NEXT:    s_add_i32 s2, s2, s5
8778; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
8779; GFX6-NEXT:    s_xor_b32 s6, s2, s5
8780; GFX6-NEXT:    s_mov_b32 s2, -1
8781; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
8782; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
8783; GFX6-NEXT:    v_mul_lo_u32 v1, s3, v0
8784; GFX6-NEXT:    s_mov_b32 s3, 0xf000
8785; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
8786; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
8787; GFX6-NEXT:    v_mul_hi_u32 v0, s6, v0
8788; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s4
8789; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s6, v0
8790; GFX6-NEXT:    v_subrev_i32_e32 v1, vcc, s4, v0
8791; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s4, v0
8792; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
8793; GFX6-NEXT:    v_subrev_i32_e32 v1, vcc, s4, v0
8794; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s4, v0
8795; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
8796; GFX6-NEXT:    v_xor_b32_e32 v0, s5, v0
8797; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s5, v0
8798; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
8799; GFX6-NEXT:    s_endpgm
8800;
8801; GFX9-LABEL: srem_i32_pow2_shl_denom:
8802; GFX9:       ; %bb.0:
8803; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
8804; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
8805; GFX9-NEXT:    s_lshl_b32 s3, 0x1000, s3
8806; GFX9-NEXT:    s_ashr_i32 s4, s3, 31
8807; GFX9-NEXT:    s_add_i32 s3, s3, s4
8808; GFX9-NEXT:    s_xor_b32 s3, s3, s4
8809; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s3
8810; GFX9-NEXT:    s_sub_i32 s4, 0, s3
8811; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
8812; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
8813; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
8814; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
8815; GFX9-NEXT:    v_mul_lo_u32 v1, s4, v0
8816; GFX9-NEXT:    s_ashr_i32 s4, s2, 31
8817; GFX9-NEXT:    s_add_i32 s2, s2, s4
8818; GFX9-NEXT:    s_xor_b32 s2, s2, s4
8819; GFX9-NEXT:    v_mul_hi_u32 v1, v0, v1
8820; GFX9-NEXT:    v_add_u32_e32 v0, v0, v1
8821; GFX9-NEXT:    v_mul_hi_u32 v0, s2, v0
8822; GFX9-NEXT:    v_mov_b32_e32 v1, 0
8823; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s3
8824; GFX9-NEXT:    v_sub_u32_e32 v0, s2, v0
8825; GFX9-NEXT:    v_subrev_u32_e32 v2, s3, v0
8826; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v0
8827; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
8828; GFX9-NEXT:    v_subrev_u32_e32 v2, s3, v0
8829; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v0
8830; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
8831; GFX9-NEXT:    v_xor_b32_e32 v0, s4, v0
8832; GFX9-NEXT:    v_subrev_u32_e32 v0, s4, v0
8833; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
8834; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
8835; GFX9-NEXT:    s_endpgm
8836;
8837; GFX90A-LABEL: srem_i32_pow2_shl_denom:
8838; GFX90A:       ; %bb.0:
8839; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
8840; GFX90A-NEXT:    v_mov_b32_e32 v1, 0
8841; GFX90A-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
8842; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
8843; GFX90A-NEXT:    s_lshl_b32 s3, 0x1000, s3
8844; GFX90A-NEXT:    s_ashr_i32 s4, s3, 31
8845; GFX90A-NEXT:    s_add_i32 s3, s3, s4
8846; GFX90A-NEXT:    s_xor_b32 s3, s3, s4
8847; GFX90A-NEXT:    v_cvt_f32_u32_e32 v0, s3
8848; GFX90A-NEXT:    s_sub_i32 s5, 0, s3
8849; GFX90A-NEXT:    s_ashr_i32 s4, s2, 31
8850; GFX90A-NEXT:    s_add_i32 s2, s2, s4
8851; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v0, v0
8852; GFX90A-NEXT:    s_xor_b32 s2, s2, s4
8853; GFX90A-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
8854; GFX90A-NEXT:    v_cvt_u32_f32_e32 v0, v0
8855; GFX90A-NEXT:    v_mul_lo_u32 v2, s5, v0
8856; GFX90A-NEXT:    v_mul_hi_u32 v2, v0, v2
8857; GFX90A-NEXT:    v_add_u32_e32 v0, v0, v2
8858; GFX90A-NEXT:    v_mul_hi_u32 v0, s2, v0
8859; GFX90A-NEXT:    v_mul_lo_u32 v0, v0, s3
8860; GFX90A-NEXT:    v_sub_u32_e32 v0, s2, v0
8861; GFX90A-NEXT:    v_subrev_u32_e32 v2, s3, v0
8862; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s3, v0
8863; GFX90A-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
8864; GFX90A-NEXT:    v_subrev_u32_e32 v2, s3, v0
8865; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s3, v0
8866; GFX90A-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
8867; GFX90A-NEXT:    v_xor_b32_e32 v0, s4, v0
8868; GFX90A-NEXT:    v_subrev_u32_e32 v0, s4, v0
8869; GFX90A-NEXT:    global_store_dword v1, v0, s[0:1]
8870; GFX90A-NEXT:    s_endpgm
8871  %shl.y = shl i32 4096, %y
8872  %r = srem i32 %x, %shl.y
8873  store i32 %r, i32 addrspace(1)* %out
8874  ret void
8875}
8876
; Signed remainder of the <2 x i32> argument %x by the splat constant 4096;
; the result vector is stored to %out.
;
; The opt assertions below expect the vector srem to be split into one scalar
; `srem i32 ..., 4096` per element, re-assembled with insertelement.
; The llc assertions (tahiti, gfx900 and gfx90a, per the RUN lines at the top
; of the file) expect each lane to be computed with scalar
; ashr / lshr / add / and / sub instructions; no reciprocal or multiply
; instructions appear in the expected output.
;
; All assertion lines in this function are autogenerated (see the NOTE lines
; at the top of the file); regenerate them instead of editing by hand.
8877define amdgpu_kernel void @srem_v2i32_pow2k_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x) {
8878; CHECK-LABEL: @srem_v2i32_pow2k_denom(
8879; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0
8880; CHECK-NEXT:    [[TMP2:%.*]] = srem i32 [[TMP1]], 4096
8881; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i64 0
8882; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1
8883; CHECK-NEXT:    [[TMP5:%.*]] = srem i32 [[TMP4]], 4096
8884; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1
8885; CHECK-NEXT:    store <2 x i32> [[TMP6]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8
8886; CHECK-NEXT:    ret void
8887;
8888; GFX6-LABEL: srem_v2i32_pow2k_denom:
8889; GFX6:       ; %bb.0:
8890; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
8891; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
8892; GFX6-NEXT:    s_movk_i32 s2, 0xf000
8893; GFX6-NEXT:    s_mov_b32 s7, 0xf000
8894; GFX6-NEXT:    s_mov_b32 s6, -1
8895; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
8896; GFX6-NEXT:    s_ashr_i32 s3, s0, 31
8897; GFX6-NEXT:    s_lshr_b32 s3, s3, 20
8898; GFX6-NEXT:    s_add_i32 s3, s0, s3
8899; GFX6-NEXT:    s_and_b32 s3, s3, s2
8900; GFX6-NEXT:    s_sub_i32 s0, s0, s3
8901; GFX6-NEXT:    s_ashr_i32 s3, s1, 31
8902; GFX6-NEXT:    s_lshr_b32 s3, s3, 20
8903; GFX6-NEXT:    s_add_i32 s3, s1, s3
8904; GFX6-NEXT:    s_and_b32 s2, s3, s2
8905; GFX6-NEXT:    s_sub_i32 s1, s1, s2
8906; GFX6-NEXT:    v_mov_b32_e32 v0, s0
8907; GFX6-NEXT:    v_mov_b32_e32 v1, s1
8908; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
8909; GFX6-NEXT:    s_endpgm
8910;
8911; GFX9-LABEL: srem_v2i32_pow2k_denom:
8912; GFX9:       ; %bb.0:
8913; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
8914; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
8915; GFX9-NEXT:    s_movk_i32 s6, 0xf000
8916; GFX9-NEXT:    v_mov_b32_e32 v2, 0
8917; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
8918; GFX9-NEXT:    s_ashr_i32 s0, s4, 31
8919; GFX9-NEXT:    s_ashr_i32 s1, s5, 31
8920; GFX9-NEXT:    s_lshr_b32 s0, s0, 20
8921; GFX9-NEXT:    s_lshr_b32 s1, s1, 20
8922; GFX9-NEXT:    s_add_i32 s0, s4, s0
8923; GFX9-NEXT:    s_add_i32 s1, s5, s1
8924; GFX9-NEXT:    s_and_b32 s0, s0, s6
8925; GFX9-NEXT:    s_and_b32 s1, s1, s6
8926; GFX9-NEXT:    s_sub_i32 s0, s4, s0
8927; GFX9-NEXT:    s_sub_i32 s1, s5, s1
8928; GFX9-NEXT:    v_mov_b32_e32 v0, s0
8929; GFX9-NEXT:    v_mov_b32_e32 v1, s1
8930; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
8931; GFX9-NEXT:    s_endpgm
8932;
8933; GFX90A-LABEL: srem_v2i32_pow2k_denom:
8934; GFX90A:       ; %bb.0:
8935; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
8936; GFX90A-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
8937; GFX90A-NEXT:    s_movk_i32 s6, 0xf000
8938; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
8939; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
8940; GFX90A-NEXT:    s_ashr_i32 s0, s4, 31
8941; GFX90A-NEXT:    s_ashr_i32 s1, s5, 31
8942; GFX90A-NEXT:    s_lshr_b32 s0, s0, 20
8943; GFX90A-NEXT:    s_lshr_b32 s1, s1, 20
8944; GFX90A-NEXT:    s_add_i32 s0, s4, s0
8945; GFX90A-NEXT:    s_add_i32 s1, s5, s1
8946; GFX90A-NEXT:    s_and_b32 s0, s0, s6
8947; GFX90A-NEXT:    s_and_b32 s1, s1, s6
8948; GFX90A-NEXT:    s_sub_i32 s0, s4, s0
8949; GFX90A-NEXT:    s_sub_i32 s1, s5, s1
8950; GFX90A-NEXT:    v_mov_b32_e32 v0, s0
8951; GFX90A-NEXT:    v_mov_b32_e32 v1, s1
8952; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
8953; GFX90A-NEXT:    s_endpgm
8954  %r = srem <2 x i32> %x, <i32 4096, i32 4096>
8955  store <2 x i32> %r, <2 x i32> addrspace(1)* %out
8956  ret void
8957}
8958
; Signed remainder of the <2 x i32> argument %x by a non-constant denominator,
; `shl <4096, 4096>, %y`; the result vector is stored to %out.
;
; The opt assertions below expect the vector shl to stay as a single vector
; instruction and each lane of the srem to be expanded separately:
;   1. both operands go through the ashr 31 / add / xor sign-mask idiom;
;   2. a 32-bit reciprocal estimate of the denominator is built from
;      uitofp, llvm.amdgcn.rcp.f32, fmul by 0x41EFFFFFC0000000 and fptoui,
;      then adjusted once using a 64-bit multiply and its high half;
;   3. the high half of (numerator * estimate) is multiplied by the
;      denominator and subtracted from the numerator;
;   4. two icmp uge / sub / select steps conditionally subtract the
;      denominator;
;   5. xor / sub with the numerator's sign mask produces the lane result.
; The llc assertions (tahiti, gfx900 and gfx90a, per the RUN lines at the top
; of the file) pin the corresponding instruction sequence for each target.
;
; All assertion lines in this function are autogenerated (see the NOTE lines
; at the top of the file); regenerate them instead of editing by hand.
8959define amdgpu_kernel void @srem_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x, <2 x i32> %y) {
8960; CHECK-LABEL: @srem_v2i32_pow2_shl_denom(
8961; CHECK-NEXT:    [[SHL_Y:%.*]] = shl <2 x i32> <i32 4096, i32 4096>, [[Y:%.*]]
8962; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0
8963; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 0
8964; CHECK-NEXT:    [[TMP3:%.*]] = ashr i32 [[TMP1]], 31
8965; CHECK-NEXT:    [[TMP4:%.*]] = ashr i32 [[TMP2]], 31
8966; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[TMP1]], [[TMP3]]
8967; CHECK-NEXT:    [[TMP6:%.*]] = add i32 [[TMP2]], [[TMP4]]
8968; CHECK-NEXT:    [[TMP7:%.*]] = xor i32 [[TMP5]], [[TMP3]]
8969; CHECK-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP4]]
8970; CHECK-NEXT:    [[TMP9:%.*]] = uitofp i32 [[TMP8]] to float
8971; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]])
8972; CHECK-NEXT:    [[TMP11:%.*]] = fmul fast float [[TMP10]], 0x41EFFFFFC0000000
8973; CHECK-NEXT:    [[TMP12:%.*]] = fptoui float [[TMP11]] to i32
8974; CHECK-NEXT:    [[TMP13:%.*]] = sub i32 0, [[TMP8]]
8975; CHECK-NEXT:    [[TMP14:%.*]] = mul i32 [[TMP13]], [[TMP12]]
8976; CHECK-NEXT:    [[TMP15:%.*]] = zext i32 [[TMP12]] to i64
8977; CHECK-NEXT:    [[TMP16:%.*]] = zext i32 [[TMP14]] to i64
8978; CHECK-NEXT:    [[TMP17:%.*]] = mul i64 [[TMP15]], [[TMP16]]
8979; CHECK-NEXT:    [[TMP18:%.*]] = trunc i64 [[TMP17]] to i32
8980; CHECK-NEXT:    [[TMP19:%.*]] = lshr i64 [[TMP17]], 32
8981; CHECK-NEXT:    [[TMP20:%.*]] = trunc i64 [[TMP19]] to i32
8982; CHECK-NEXT:    [[TMP21:%.*]] = add i32 [[TMP12]], [[TMP20]]
8983; CHECK-NEXT:    [[TMP22:%.*]] = zext i32 [[TMP7]] to i64
8984; CHECK-NEXT:    [[TMP23:%.*]] = zext i32 [[TMP21]] to i64
8985; CHECK-NEXT:    [[TMP24:%.*]] = mul i64 [[TMP22]], [[TMP23]]
8986; CHECK-NEXT:    [[TMP25:%.*]] = trunc i64 [[TMP24]] to i32
8987; CHECK-NEXT:    [[TMP26:%.*]] = lshr i64 [[TMP24]], 32
8988; CHECK-NEXT:    [[TMP27:%.*]] = trunc i64 [[TMP26]] to i32
8989; CHECK-NEXT:    [[TMP28:%.*]] = mul i32 [[TMP27]], [[TMP8]]
8990; CHECK-NEXT:    [[TMP29:%.*]] = sub i32 [[TMP7]], [[TMP28]]
8991; CHECK-NEXT:    [[TMP30:%.*]] = icmp uge i32 [[TMP29]], [[TMP8]]
8992; CHECK-NEXT:    [[TMP31:%.*]] = sub i32 [[TMP29]], [[TMP8]]
8993; CHECK-NEXT:    [[TMP32:%.*]] = select i1 [[TMP30]], i32 [[TMP31]], i32 [[TMP29]]
8994; CHECK-NEXT:    [[TMP33:%.*]] = icmp uge i32 [[TMP32]], [[TMP8]]
8995; CHECK-NEXT:    [[TMP34:%.*]] = sub i32 [[TMP32]], [[TMP8]]
8996; CHECK-NEXT:    [[TMP35:%.*]] = select i1 [[TMP33]], i32 [[TMP34]], i32 [[TMP32]]
8997; CHECK-NEXT:    [[TMP36:%.*]] = xor i32 [[TMP35]], [[TMP3]]
8998; CHECK-NEXT:    [[TMP37:%.*]] = sub i32 [[TMP36]], [[TMP3]]
8999; CHECK-NEXT:    [[TMP38:%.*]] = insertelement <2 x i32> undef, i32 [[TMP37]], i64 0
9000; CHECK-NEXT:    [[TMP39:%.*]] = extractelement <2 x i32> [[X]], i64 1
9001; CHECK-NEXT:    [[TMP40:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 1
9002; CHECK-NEXT:    [[TMP41:%.*]] = ashr i32 [[TMP39]], 31
9003; CHECK-NEXT:    [[TMP42:%.*]] = ashr i32 [[TMP40]], 31
9004; CHECK-NEXT:    [[TMP43:%.*]] = add i32 [[TMP39]], [[TMP41]]
9005; CHECK-NEXT:    [[TMP44:%.*]] = add i32 [[TMP40]], [[TMP42]]
9006; CHECK-NEXT:    [[TMP45:%.*]] = xor i32 [[TMP43]], [[TMP41]]
9007; CHECK-NEXT:    [[TMP46:%.*]] = xor i32 [[TMP44]], [[TMP42]]
9008; CHECK-NEXT:    [[TMP47:%.*]] = uitofp i32 [[TMP46]] to float
9009; CHECK-NEXT:    [[TMP48:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP47]])
9010; CHECK-NEXT:    [[TMP49:%.*]] = fmul fast float [[TMP48]], 0x41EFFFFFC0000000
9011; CHECK-NEXT:    [[TMP50:%.*]] = fptoui float [[TMP49]] to i32
9012; CHECK-NEXT:    [[TMP51:%.*]] = sub i32 0, [[TMP46]]
9013; CHECK-NEXT:    [[TMP52:%.*]] = mul i32 [[TMP51]], [[TMP50]]
9014; CHECK-NEXT:    [[TMP53:%.*]] = zext i32 [[TMP50]] to i64
9015; CHECK-NEXT:    [[TMP54:%.*]] = zext i32 [[TMP52]] to i64
9016; CHECK-NEXT:    [[TMP55:%.*]] = mul i64 [[TMP53]], [[TMP54]]
9017; CHECK-NEXT:    [[TMP56:%.*]] = trunc i64 [[TMP55]] to i32
9018; CHECK-NEXT:    [[TMP57:%.*]] = lshr i64 [[TMP55]], 32
9019; CHECK-NEXT:    [[TMP58:%.*]] = trunc i64 [[TMP57]] to i32
9020; CHECK-NEXT:    [[TMP59:%.*]] = add i32 [[TMP50]], [[TMP58]]
9021; CHECK-NEXT:    [[TMP60:%.*]] = zext i32 [[TMP45]] to i64
9022; CHECK-NEXT:    [[TMP61:%.*]] = zext i32 [[TMP59]] to i64
9023; CHECK-NEXT:    [[TMP62:%.*]] = mul i64 [[TMP60]], [[TMP61]]
9024; CHECK-NEXT:    [[TMP63:%.*]] = trunc i64 [[TMP62]] to i32
9025; CHECK-NEXT:    [[TMP64:%.*]] = lshr i64 [[TMP62]], 32
9026; CHECK-NEXT:    [[TMP65:%.*]] = trunc i64 [[TMP64]] to i32
9027; CHECK-NEXT:    [[TMP66:%.*]] = mul i32 [[TMP65]], [[TMP46]]
9028; CHECK-NEXT:    [[TMP67:%.*]] = sub i32 [[TMP45]], [[TMP66]]
9029; CHECK-NEXT:    [[TMP68:%.*]] = icmp uge i32 [[TMP67]], [[TMP46]]
9030; CHECK-NEXT:    [[TMP69:%.*]] = sub i32 [[TMP67]], [[TMP46]]
9031; CHECK-NEXT:    [[TMP70:%.*]] = select i1 [[TMP68]], i32 [[TMP69]], i32 [[TMP67]]
9032; CHECK-NEXT:    [[TMP71:%.*]] = icmp uge i32 [[TMP70]], [[TMP46]]
9033; CHECK-NEXT:    [[TMP72:%.*]] = sub i32 [[TMP70]], [[TMP46]]
9034; CHECK-NEXT:    [[TMP73:%.*]] = select i1 [[TMP71]], i32 [[TMP72]], i32 [[TMP70]]
9035; CHECK-NEXT:    [[TMP74:%.*]] = xor i32 [[TMP73]], [[TMP41]]
9036; CHECK-NEXT:    [[TMP75:%.*]] = sub i32 [[TMP74]], [[TMP41]]
9037; CHECK-NEXT:    [[TMP76:%.*]] = insertelement <2 x i32> [[TMP38]], i32 [[TMP75]], i64 1
9038; CHECK-NEXT:    store <2 x i32> [[TMP76]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8
9039; CHECK-NEXT:    ret void
9040;
9041; GFX6-LABEL: srem_v2i32_pow2_shl_denom:
9042; GFX6:       ; %bb.0:
9043; GFX6-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
9044; GFX6-NEXT:    s_movk_i32 s6, 0x1000
9045; GFX6-NEXT:    s_mov_b32 s10, 0x4f7ffffe
9046; GFX6-NEXT:    s_mov_b32 s7, 0xf000
9047; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
9048; GFX6-NEXT:    s_lshl_b32 s2, s6, s2
9049; GFX6-NEXT:    s_ashr_i32 s4, s2, 31
9050; GFX6-NEXT:    s_add_i32 s2, s2, s4
9051; GFX6-NEXT:    s_xor_b32 s2, s2, s4
9052; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s2
9053; GFX6-NEXT:    s_lshl_b32 s3, s6, s3
9054; GFX6-NEXT:    s_ashr_i32 s6, s3, 31
9055; GFX6-NEXT:    s_add_i32 s3, s3, s6
9056; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
9057; GFX6-NEXT:    s_sub_i32 s9, 0, s2
9058; GFX6-NEXT:    s_xor_b32 s3, s3, s6
9059; GFX6-NEXT:    v_cvt_f32_u32_e32 v2, s3
9060; GFX6-NEXT:    v_mul_f32_e32 v0, s10, v0
9061; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
9062; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
9063; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
9064; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v2
9065; GFX6-NEXT:    s_mov_b32 s6, -1
9066; GFX6-NEXT:    v_mul_lo_u32 v1, s9, v0
9067; GFX6-NEXT:    s_sub_i32 s9, 0, s3
9068; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
9069; GFX6-NEXT:    s_ashr_i32 s8, s0, 31
9070; GFX6-NEXT:    s_add_i32 s0, s0, s8
9071; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
9072; GFX6-NEXT:    s_xor_b32 s0, s0, s8
9073; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
9074; GFX6-NEXT:    v_mul_f32_e32 v1, s10, v2
9075; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
9076; GFX6-NEXT:    v_mul_hi_u32 v0, s0, v0
9077; GFX6-NEXT:    v_mul_lo_u32 v2, s9, v1
9078; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s2
9079; GFX6-NEXT:    v_mul_hi_u32 v2, v1, v2
9080; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
9081; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s2, v0
9082; GFX6-NEXT:    s_ashr_i32 s0, s1, 31
9083; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s2, v0
9084; GFX6-NEXT:    s_add_i32 s1, s1, s0
9085; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
9086; GFX6-NEXT:    s_xor_b32 s1, s1, s0
9087; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
9088; GFX6-NEXT:    v_mul_hi_u32 v1, s1, v1
9089; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s2, v0
9090; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s2, v0
9091; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s3
9092; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
9093; GFX6-NEXT:    v_xor_b32_e32 v0, s8, v0
9094; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s8, v0
9095; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s1, v1
9096; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s3, v1
9097; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s3, v1
9098; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
9099; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s3, v1
9100; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s3, v1
9101; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
9102; GFX6-NEXT:    v_xor_b32_e32 v1, s0, v1
9103; GFX6-NEXT:    v_subrev_i32_e32 v1, vcc, s0, v1
9104; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
9105; GFX6-NEXT:    s_endpgm
9106;
9107; GFX9-LABEL: srem_v2i32_pow2_shl_denom:
9108; GFX9:       ; %bb.0:
9109; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
9110; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
9111; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x34
9112; GFX9-NEXT:    s_movk_i32 s8, 0x1000
9113; GFX9-NEXT:    s_mov_b32 s9, 0x4f7ffffe
9114; GFX9-NEXT:    v_mov_b32_e32 v2, 0
9115; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
9116; GFX9-NEXT:    s_lshl_b32 s0, s8, s6
9117; GFX9-NEXT:    s_ashr_i32 s1, s0, 31
9118; GFX9-NEXT:    s_add_i32 s0, s0, s1
9119; GFX9-NEXT:    s_xor_b32 s0, s0, s1
9120; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s0
9121; GFX9-NEXT:    s_lshl_b32 s1, s8, s7
9122; GFX9-NEXT:    s_ashr_i32 s6, s1, 31
9123; GFX9-NEXT:    s_add_i32 s1, s1, s6
9124; GFX9-NEXT:    s_xor_b32 s1, s1, s6
9125; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
9126; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s1
9127; GFX9-NEXT:    s_sub_i32 s7, 0, s0
9128; GFX9-NEXT:    s_ashr_i32 s6, s4, 31
9129; GFX9-NEXT:    v_mul_f32_e32 v0, s9, v0
9130; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v1
9131; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
9132; GFX9-NEXT:    s_add_i32 s4, s4, s6
9133; GFX9-NEXT:    s_xor_b32 s4, s4, s6
9134; GFX9-NEXT:    v_mul_f32_e32 v1, s9, v1
9135; GFX9-NEXT:    v_mul_lo_u32 v3, s7, v0
9136; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
9137; GFX9-NEXT:    s_sub_i32 s7, 0, s1
9138; GFX9-NEXT:    v_mul_hi_u32 v3, v0, v3
9139; GFX9-NEXT:    v_mul_lo_u32 v4, s7, v1
9140; GFX9-NEXT:    s_ashr_i32 s7, s5, 31
9141; GFX9-NEXT:    s_add_i32 s5, s5, s7
9142; GFX9-NEXT:    v_add_u32_e32 v0, v0, v3
9143; GFX9-NEXT:    v_mul_hi_u32 v3, v1, v4
9144; GFX9-NEXT:    v_mul_hi_u32 v0, s4, v0
9145; GFX9-NEXT:    s_xor_b32 s5, s5, s7
9146; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
9147; GFX9-NEXT:    v_mul_hi_u32 v1, s5, v1
9148; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s0
9149; GFX9-NEXT:    v_mul_lo_u32 v1, v1, s1
9150; GFX9-NEXT:    v_sub_u32_e32 v0, s4, v0
9151; GFX9-NEXT:    v_subrev_u32_e32 v3, s0, v0
9152; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s0, v0
9153; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
9154; GFX9-NEXT:    v_subrev_u32_e32 v3, s0, v0
9155; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s0, v0
9156; GFX9-NEXT:    v_sub_u32_e32 v1, s5, v1
9157; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
9158; GFX9-NEXT:    v_subrev_u32_e32 v3, s1, v1
9159; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s1, v1
9160; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
9161; GFX9-NEXT:    v_subrev_u32_e32 v3, s1, v1
9162; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s1, v1
9163; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
9164; GFX9-NEXT:    v_xor_b32_e32 v0, s6, v0
9165; GFX9-NEXT:    v_xor_b32_e32 v1, s7, v1
9166; GFX9-NEXT:    v_subrev_u32_e32 v0, s6, v0
9167; GFX9-NEXT:    v_subrev_u32_e32 v1, s7, v1
9168; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
9169; GFX9-NEXT:    s_endpgm
9170;
9171; GFX90A-LABEL: srem_v2i32_pow2_shl_denom:
9172; GFX90A:       ; %bb.0:
9173; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
9174; GFX90A-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
9175; GFX90A-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x34
9176; GFX90A-NEXT:    s_movk_i32 s8, 0x1000
9177; GFX90A-NEXT:    s_mov_b32 s9, 0x4f7ffffe
9178; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
9179; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
9180; GFX90A-NEXT:    s_lshl_b32 s0, s8, s6
9181; GFX90A-NEXT:    s_ashr_i32 s1, s0, 31
9182; GFX90A-NEXT:    s_add_i32 s0, s0, s1
9183; GFX90A-NEXT:    s_xor_b32 s0, s0, s1
9184; GFX90A-NEXT:    v_cvt_f32_u32_e32 v0, s0
9185; GFX90A-NEXT:    s_lshl_b32 s1, s8, s7
9186; GFX90A-NEXT:    s_sub_i32 s8, 0, s0
9187; GFX90A-NEXT:    s_ashr_i32 s6, s4, 31
9188; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v0, v0
9189; GFX90A-NEXT:    s_add_i32 s4, s4, s6
9190; GFX90A-NEXT:    s_xor_b32 s4, s4, s6
9191; GFX90A-NEXT:    s_ashr_i32 s7, s1, 31
9192; GFX90A-NEXT:    v_mul_f32_e32 v0, s9, v0
9193; GFX90A-NEXT:    v_cvt_u32_f32_e32 v0, v0
9194; GFX90A-NEXT:    s_add_i32 s1, s1, s7
9195; GFX90A-NEXT:    s_xor_b32 s1, s1, s7
9196; GFX90A-NEXT:    v_mul_lo_u32 v1, s8, v0
9197; GFX90A-NEXT:    v_mul_hi_u32 v1, v0, v1
9198; GFX90A-NEXT:    v_add_u32_e32 v0, v0, v1
9199; GFX90A-NEXT:    v_mul_hi_u32 v0, s4, v0
9200; GFX90A-NEXT:    v_mul_lo_u32 v0, v0, s0
9201; GFX90A-NEXT:    v_sub_u32_e32 v0, s4, v0
9202; GFX90A-NEXT:    v_subrev_u32_e32 v1, s0, v0
9203; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s0, v0
9204; GFX90A-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
9205; GFX90A-NEXT:    v_cvt_f32_u32_e32 v1, s1
9206; GFX90A-NEXT:    v_subrev_u32_e32 v3, s0, v0
9207; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s0, v0
9208; GFX90A-NEXT:    s_ashr_i32 s0, s5, 31
9209; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v1, v1
9210; GFX90A-NEXT:    s_add_i32 s4, s5, s0
9211; GFX90A-NEXT:    s_sub_i32 s5, 0, s1
9212; GFX90A-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
9213; GFX90A-NEXT:    v_mul_f32_e32 v1, s9, v1
9214; GFX90A-NEXT:    v_cvt_u32_f32_e32 v1, v1
9215; GFX90A-NEXT:    s_xor_b32 s4, s4, s0
9216; GFX90A-NEXT:    v_xor_b32_e32 v0, s6, v0
9217; GFX90A-NEXT:    v_subrev_u32_e32 v0, s6, v0
9218; GFX90A-NEXT:    v_mul_lo_u32 v3, s5, v1
9219; GFX90A-NEXT:    v_mul_hi_u32 v3, v1, v3
9220; GFX90A-NEXT:    v_add_u32_e32 v1, v1, v3
9221; GFX90A-NEXT:    v_mul_hi_u32 v1, s4, v1
9222; GFX90A-NEXT:    v_mul_lo_u32 v1, v1, s1
9223; GFX90A-NEXT:    v_sub_u32_e32 v1, s4, v1
9224; GFX90A-NEXT:    v_subrev_u32_e32 v3, s1, v1
9225; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s1, v1
9226; GFX90A-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
9227; GFX90A-NEXT:    v_subrev_u32_e32 v3, s1, v1
9228; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s1, v1
9229; GFX90A-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
9230; GFX90A-NEXT:    v_xor_b32_e32 v1, s0, v1
9231; GFX90A-NEXT:    v_subrev_u32_e32 v1, s0, v1
9232; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
9233; GFX90A-NEXT:    s_endpgm
9234  %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y
9235  %r = srem <2 x i32> %x, %shl.y
9236  store <2 x i32> %r, <2 x i32> addrspace(1)* %out
9237  ret void
9238}
9239
9240define amdgpu_kernel void @udiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
9241; CHECK-LABEL: @udiv_i64_oddk_denom(
9242; CHECK-NEXT:    [[R:%.*]] = udiv i64 [[X:%.*]], 1235195949943
9243; CHECK-NEXT:    store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4
9244; CHECK-NEXT:    ret void
9245;
9246; GFX6-LABEL: udiv_i64_oddk_denom:
9247; GFX6:       ; %bb.0:
9248; GFX6-NEXT:    v_mov_b32_e32 v0, 0x4f176a73
9249; GFX6-NEXT:    v_mov_b32_e32 v1, 0x4f800000
9250; GFX6-NEXT:    v_madmk_f32 v0, v1, 0x438f8000, v0
9251; GFX6-NEXT:    v_rcp_f32_e32 v0, v0
9252; GFX6-NEXT:    s_movk_i32 s2, 0xfee0
9253; GFX6-NEXT:    s_mov_b32 s3, 0x68958c89
9254; GFX6-NEXT:    v_mov_b32_e32 v8, 0
9255; GFX6-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
9256; GFX6-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
9257; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
9258; GFX6-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
9259; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
9260; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
9261; GFX6-NEXT:    v_mov_b32_e32 v7, 0
9262; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
9263; GFX6-NEXT:    v_mul_lo_u32 v2, v0, s2
9264; GFX6-NEXT:    v_mul_hi_u32 v3, v0, s3
9265; GFX6-NEXT:    v_mul_lo_u32 v4, v1, s3
9266; GFX6-NEXT:    s_mov_b32 s11, 0xf000
9267; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
9268; GFX6-NEXT:    s_mov_b32 s8, s4
9269; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
9270; GFX6-NEXT:    v_mul_lo_u32 v3, v0, s3
9271; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
9272; GFX6-NEXT:    v_mul_lo_u32 v5, v0, v2
9273; GFX6-NEXT:    v_mul_hi_u32 v6, v0, v3
9274; GFX6-NEXT:    v_mul_hi_u32 v4, v0, v2
9275; GFX6-NEXT:    v_mul_hi_u32 v9, v1, v2
9276; GFX6-NEXT:    v_mul_lo_u32 v2, v1, v2
9277; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
9278; GFX6-NEXT:    v_mul_lo_u32 v6, v1, v3
9279; GFX6-NEXT:    v_mul_hi_u32 v3, v1, v3
9280; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, v8, v4, vcc
9281; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
9282; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v4, v3, vcc
9283; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, v9, v7, vcc
9284; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
9285; GFX6-NEXT:    v_add_i32_e64 v0, s[0:1], v0, v2
9286; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v8, v4, vcc
9287; GFX6-NEXT:    v_mul_lo_u32 v4, v0, s2
9288; GFX6-NEXT:    v_mul_hi_u32 v5, v0, s3
9289; GFX6-NEXT:    v_addc_u32_e64 v2, vcc, v1, v3, s[0:1]
9290; GFX6-NEXT:    v_mul_lo_u32 v6, v2, s3
9291; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
9292; GFX6-NEXT:    v_mul_lo_u32 v5, v0, s3
9293; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
9294; GFX6-NEXT:    v_mul_lo_u32 v6, v0, v4
9295; GFX6-NEXT:    v_mul_hi_u32 v9, v0, v5
9296; GFX6-NEXT:    v_mul_hi_u32 v10, v0, v4
9297; GFX6-NEXT:    v_mul_hi_u32 v11, v2, v4
9298; GFX6-NEXT:    s_movk_i32 s2, 0x11f
9299; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v9, v6
9300; GFX6-NEXT:    v_addc_u32_e32 v9, vcc, v8, v10, vcc
9301; GFX6-NEXT:    v_mul_lo_u32 v10, v2, v5
9302; GFX6-NEXT:    v_mul_hi_u32 v5, v2, v5
9303; GFX6-NEXT:    v_mul_lo_u32 v2, v2, v4
9304; GFX6-NEXT:    s_mov_b32 s3, 0x976a7377
9305; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v6, v10
9306; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v9, v5, vcc
9307; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, v11, v7, vcc
9308; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
9309; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, v8, v4, vcc
9310; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
9311; GFX6-NEXT:    v_addc_u32_e64 v1, vcc, v1, v4, s[0:1]
9312; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
9313; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
9314; GFX6-NEXT:    v_mul_lo_u32 v2, s6, v1
9315; GFX6-NEXT:    v_mul_hi_u32 v3, s6, v0
9316; GFX6-NEXT:    v_mul_hi_u32 v4, s6, v1
9317; GFX6-NEXT:    v_mul_hi_u32 v5, s7, v1
9318; GFX6-NEXT:    v_mul_lo_u32 v1, s7, v1
9319; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
9320; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v8, v4, vcc
9321; GFX6-NEXT:    v_mul_lo_u32 v4, s7, v0
9322; GFX6-NEXT:    v_mul_hi_u32 v0, s7, v0
9323; GFX6-NEXT:    s_mov_b32 s4, 0x976a7376
9324; GFX6-NEXT:    s_mov_b32 s10, -1
9325; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
9326; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, v3, v0, vcc
9327; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, v5, v7, vcc
9328; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
9329; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v8, v2, vcc
9330; GFX6-NEXT:    v_mul_lo_u32 v2, v0, s2
9331; GFX6-NEXT:    v_mul_hi_u32 v3, v0, s3
9332; GFX6-NEXT:    v_mul_lo_u32 v4, v1, s3
9333; GFX6-NEXT:    v_mov_b32_e32 v5, s2
9334; GFX6-NEXT:    s_mov_b32 s9, s5
9335; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
9336; GFX6-NEXT:    v_mul_lo_u32 v3, v0, s3
9337; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
9338; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, s7, v2
9339; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s6, v3
9340; GFX6-NEXT:    v_subb_u32_e64 v4, s[0:1], v4, v5, vcc
9341; GFX6-NEXT:    v_subrev_i32_e64 v5, s[0:1], s3, v3
9342; GFX6-NEXT:    v_subbrev_u32_e64 v4, s[0:1], 0, v4, s[0:1]
9343; GFX6-NEXT:    s_movk_i32 s3, 0x11e
9344; GFX6-NEXT:    v_cmp_lt_u32_e64 s[0:1], s3, v4
9345; GFX6-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[0:1]
9346; GFX6-NEXT:    v_cmp_lt_u32_e64 s[0:1], s4, v5
9347; GFX6-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[0:1]
9348; GFX6-NEXT:    v_cmp_eq_u32_e64 s[0:1], s2, v4
9349; GFX6-NEXT:    v_cndmask_b32_e64 v4, v6, v5, s[0:1]
9350; GFX6-NEXT:    v_add_i32_e64 v5, s[0:1], 2, v0
9351; GFX6-NEXT:    v_addc_u32_e64 v6, s[0:1], 0, v1, s[0:1]
9352; GFX6-NEXT:    v_add_i32_e64 v7, s[0:1], 1, v0
9353; GFX6-NEXT:    v_addc_u32_e64 v8, s[0:1], 0, v1, s[0:1]
9354; GFX6-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v4
9355; GFX6-NEXT:    v_cndmask_b32_e64 v4, v8, v6, s[0:1]
9356; GFX6-NEXT:    v_mov_b32_e32 v6, s7
9357; GFX6-NEXT:    v_subb_u32_e32 v2, vcc, v6, v2, vcc
9358; GFX6-NEXT:    v_cmp_lt_u32_e32 vcc, s3, v2
9359; GFX6-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
9360; GFX6-NEXT:    v_cmp_lt_u32_e32 vcc, s4, v3
9361; GFX6-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
9362; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, s2, v2
9363; GFX6-NEXT:    v_cndmask_b32_e32 v2, v6, v3, vcc
9364; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
9365; GFX6-NEXT:    v_cndmask_b32_e64 v2, v7, v5, s[0:1]
9366; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
9367; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
9368; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
9369; GFX6-NEXT:    s_endpgm
9370;
9371; GFX9-LABEL: udiv_i64_oddk_denom:
9372; GFX9:       ; %bb.0:
9373; GFX9-NEXT:    v_mov_b32_e32 v0, 0x4f176a73
9374; GFX9-NEXT:    v_mov_b32_e32 v1, 0x4f800000
9375; GFX9-NEXT:    v_madmk_f32 v0, v1, 0x438f8000, v0
9376; GFX9-NEXT:    v_rcp_f32_e32 v0, v0
9377; GFX9-NEXT:    s_movk_i32 s4, 0xfee0
9378; GFX9-NEXT:    s_mov_b32 s5, 0x68958c89
9379; GFX9-NEXT:    v_mov_b32_e32 v8, 0
9380; GFX9-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
9381; GFX9-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
9382; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
9383; GFX9-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
9384; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
9385; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
9386; GFX9-NEXT:    v_mov_b32_e32 v5, 0
9387; GFX9-NEXT:    v_mul_lo_u32 v2, v0, s4
9388; GFX9-NEXT:    v_mul_hi_u32 v3, v0, s5
9389; GFX9-NEXT:    v_mul_lo_u32 v4, v1, s5
9390; GFX9-NEXT:    v_mul_lo_u32 v6, v0, s5
9391; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
9392; GFX9-NEXT:    v_add_u32_e32 v2, v2, v4
9393; GFX9-NEXT:    v_mul_lo_u32 v3, v0, v2
9394; GFX9-NEXT:    v_mul_hi_u32 v4, v0, v6
9395; GFX9-NEXT:    v_mul_hi_u32 v7, v0, v2
9396; GFX9-NEXT:    v_mul_hi_u32 v9, v1, v2
9397; GFX9-NEXT:    v_mul_lo_u32 v2, v1, v2
9398; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v4, v3
9399; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v8, v7, vcc
9400; GFX9-NEXT:    v_mul_lo_u32 v7, v1, v6
9401; GFX9-NEXT:    v_mul_hi_u32 v6, v1, v6
9402; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v7
9403; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v4, v6, vcc
9404; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v9, v5, vcc
9405; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
9406; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v8, v4, vcc
9407; GFX9-NEXT:    v_add_co_u32_e64 v0, s[2:3], v0, v2
9408; GFX9-NEXT:    v_addc_co_u32_e64 v2, vcc, v1, v3, s[2:3]
9409; GFX9-NEXT:    v_mul_lo_u32 v4, v0, s4
9410; GFX9-NEXT:    v_mul_hi_u32 v6, v0, s5
9411; GFX9-NEXT:    v_mul_lo_u32 v7, v2, s5
9412; GFX9-NEXT:    v_mul_lo_u32 v9, v0, s5
9413; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
9414; GFX9-NEXT:    v_add_u32_e32 v4, v6, v4
9415; GFX9-NEXT:    v_add_u32_e32 v4, v4, v7
9416; GFX9-NEXT:    v_mul_lo_u32 v6, v0, v4
9417; GFX9-NEXT:    v_mul_hi_u32 v7, v0, v9
9418; GFX9-NEXT:    v_mul_hi_u32 v10, v0, v4
9419; GFX9-NEXT:    v_mul_hi_u32 v11, v2, v4
9420; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
9421; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, v7, v6
9422; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, v8, v10, vcc
9423; GFX9-NEXT:    v_mul_lo_u32 v10, v2, v9
9424; GFX9-NEXT:    v_mul_hi_u32 v9, v2, v9
9425; GFX9-NEXT:    v_mul_lo_u32 v2, v2, v4
9426; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, v6, v10
9427; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, v7, v9, vcc
9428; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v11, v5, vcc
9429; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v6, v2
9430; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v8, v4, vcc
9431; GFX9-NEXT:    v_addc_co_u32_e64 v1, vcc, v1, v4, s[2:3]
9432; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
9433; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
9434; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
9435; GFX9-NEXT:    v_mul_lo_u32 v2, s6, v1
9436; GFX9-NEXT:    v_mul_hi_u32 v3, s6, v0
9437; GFX9-NEXT:    v_mul_hi_u32 v4, s6, v1
9438; GFX9-NEXT:    v_mul_hi_u32 v6, s7, v1
9439; GFX9-NEXT:    v_mul_lo_u32 v1, s7, v1
9440; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
9441; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v8, v4, vcc
9442; GFX9-NEXT:    v_mul_lo_u32 v4, s7, v0
9443; GFX9-NEXT:    v_mul_hi_u32 v0, s7, v0
9444; GFX9-NEXT:    s_movk_i32 s2, 0x11f
9445; GFX9-NEXT:    s_mov_b32 s3, 0x976a7377
9446; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
9447; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, v3, v0, vcc
9448; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, v6, v5, vcc
9449; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v1
9450; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v8, v2, vcc
9451; GFX9-NEXT:    v_mul_lo_u32 v2, v0, s2
9452; GFX9-NEXT:    v_mul_hi_u32 v3, v0, s3
9453; GFX9-NEXT:    v_mul_lo_u32 v4, v1, s3
9454; GFX9-NEXT:    v_mov_b32_e32 v6, s2
9455; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
9456; GFX9-NEXT:    v_mul_lo_u32 v3, v0, s3
9457; GFX9-NEXT:    v_add_u32_e32 v2, v2, v4
9458; GFX9-NEXT:    v_sub_u32_e32 v4, s7, v2
9459; GFX9-NEXT:    v_sub_co_u32_e32 v3, vcc, s6, v3
9460; GFX9-NEXT:    v_subb_co_u32_e64 v4, s[0:1], v4, v6, vcc
9461; GFX9-NEXT:    v_subrev_co_u32_e64 v6, s[0:1], s3, v3
9462; GFX9-NEXT:    v_subbrev_co_u32_e64 v4, s[0:1], 0, v4, s[0:1]
9463; GFX9-NEXT:    s_movk_i32 s3, 0x11e
9464; GFX9-NEXT:    v_cmp_lt_u32_e64 s[0:1], s3, v4
9465; GFX9-NEXT:    s_mov_b32 s6, 0x976a7376
9466; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[0:1]
9467; GFX9-NEXT:    v_cmp_lt_u32_e64 s[0:1], s6, v6
9468; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[0:1]
9469; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s2, v4
9470; GFX9-NEXT:    v_cndmask_b32_e64 v4, v7, v6, s[0:1]
9471; GFX9-NEXT:    v_mov_b32_e32 v7, s7
9472; GFX9-NEXT:    v_subb_co_u32_e32 v2, vcc, v7, v2, vcc
9473; GFX9-NEXT:    v_cmp_lt_u32_e32 vcc, s3, v2
9474; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v4
9475; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
9476; GFX9-NEXT:    v_cmp_lt_u32_e32 vcc, s6, v3
9477; GFX9-NEXT:    v_cndmask_b32_e64 v4, 1, 2, s[0:1]
9478; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
9479; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, s2, v2
9480; GFX9-NEXT:    v_add_co_u32_e64 v4, s[0:1], v0, v4
9481; GFX9-NEXT:    v_cndmask_b32_e32 v2, v7, v3, vcc
9482; GFX9-NEXT:    v_addc_co_u32_e64 v6, s[0:1], 0, v1, s[0:1]
9483; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
9484; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
9485; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
9486; GFX9-NEXT:    global_store_dwordx2 v5, v[0:1], s[4:5]
9487; GFX9-NEXT:    s_endpgm
9488;
9489; GFX90A-LABEL: udiv_i64_oddk_denom:
9490; GFX90A:       ; %bb.0:
9491; GFX90A-NEXT:    v_mov_b32_e32 v0, 0x4f176a73
9492; GFX90A-NEXT:    v_mov_b32_e32 v1, 0x4f800000
9493; GFX90A-NEXT:    v_madmk_f32 v0, v1, 0x438f8000, v0
9494; GFX90A-NEXT:    v_rcp_f32_e32 v0, v0
9495; GFX90A-NEXT:    s_movk_i32 s2, 0xfee0
9496; GFX90A-NEXT:    s_mov_b32 s3, 0x68958c89
9497; GFX90A-NEXT:    v_mov_b32_e32 v8, 0
9498; GFX90A-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
9499; GFX90A-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
9500; GFX90A-NEXT:    v_trunc_f32_e32 v1, v1
9501; GFX90A-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
9502; GFX90A-NEXT:    v_cvt_u32_f32_e32 v0, v0
9503; GFX90A-NEXT:    v_cvt_u32_f32_e32 v1, v1
9504; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
9505; GFX90A-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
9506; GFX90A-NEXT:    v_mul_lo_u32 v3, v0, s2
9507; GFX90A-NEXT:    v_mul_hi_u32 v4, v0, s3
9508; GFX90A-NEXT:    v_add_u32_e32 v3, v4, v3
9509; GFX90A-NEXT:    v_mul_lo_u32 v4, v1, s3
9510; GFX90A-NEXT:    v_add_u32_e32 v3, v3, v4
9511; GFX90A-NEXT:    v_mul_lo_u32 v6, v0, s3
9512; GFX90A-NEXT:    v_mul_lo_u32 v5, v0, v3
9513; GFX90A-NEXT:    v_mul_hi_u32 v7, v0, v6
9514; GFX90A-NEXT:    v_mul_hi_u32 v4, v0, v3
9515; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v7, v5
9516; GFX90A-NEXT:    v_addc_co_u32_e32 v4, vcc, v8, v4, vcc
9517; GFX90A-NEXT:    v_mul_hi_u32 v9, v1, v6
9518; GFX90A-NEXT:    v_mul_lo_u32 v6, v1, v6
9519; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v5, v6
9520; GFX90A-NEXT:    v_mul_hi_u32 v7, v1, v3
9521; GFX90A-NEXT:    v_addc_co_u32_e32 v4, vcc, v4, v9, vcc
9522; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v7, v2, vcc
9523; GFX90A-NEXT:    v_mul_lo_u32 v3, v1, v3
9524; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, v4, v3
9525; GFX90A-NEXT:    v_addc_co_u32_e32 v4, vcc, v8, v5, vcc
9526; GFX90A-NEXT:    v_add_co_u32_e64 v0, s[0:1], v0, v3
9527; GFX90A-NEXT:    v_addc_co_u32_e64 v3, vcc, v1, v4, s[0:1]
9528; GFX90A-NEXT:    v_mul_lo_u32 v6, v0, s2
9529; GFX90A-NEXT:    v_mul_hi_u32 v7, v0, s3
9530; GFX90A-NEXT:    v_mul_lo_u32 v5, v3, s3
9531; GFX90A-NEXT:    v_add_u32_e32 v6, v7, v6
9532; GFX90A-NEXT:    v_add_u32_e32 v5, v6, v5
9533; GFX90A-NEXT:    v_mul_lo_u32 v9, v0, s3
9534; GFX90A-NEXT:    v_mul_lo_u32 v7, v0, v5
9535; GFX90A-NEXT:    v_mul_hi_u32 v10, v0, v9
9536; GFX90A-NEXT:    v_mul_hi_u32 v6, v0, v5
9537; GFX90A-NEXT:    v_add_co_u32_e32 v7, vcc, v10, v7
9538; GFX90A-NEXT:    v_addc_co_u32_e32 v6, vcc, v8, v6, vcc
9539; GFX90A-NEXT:    v_mul_hi_u32 v11, v3, v9
9540; GFX90A-NEXT:    v_mul_lo_u32 v9, v3, v9
9541; GFX90A-NEXT:    v_add_co_u32_e32 v7, vcc, v7, v9
9542; GFX90A-NEXT:    v_mul_hi_u32 v10, v3, v5
9543; GFX90A-NEXT:    v_addc_co_u32_e32 v6, vcc, v6, v11, vcc
9544; GFX90A-NEXT:    v_addc_co_u32_e32 v7, vcc, v10, v2, vcc
9545; GFX90A-NEXT:    v_mul_lo_u32 v3, v3, v5
9546; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, v6, v3
9547; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v8, v7, vcc
9548; GFX90A-NEXT:    v_add_u32_e32 v1, v1, v4
9549; GFX90A-NEXT:    v_addc_co_u32_e64 v1, vcc, v1, v5, s[0:1]
9550; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v3
9551; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
9552; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
9553; GFX90A-NEXT:    v_mul_lo_u32 v4, s6, v1
9554; GFX90A-NEXT:    v_mul_hi_u32 v5, s6, v0
9555; GFX90A-NEXT:    v_mul_hi_u32 v3, s6, v1
9556; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, v5, v4
9557; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, v8, v3, vcc
9558; GFX90A-NEXT:    v_mul_hi_u32 v6, s7, v0
9559; GFX90A-NEXT:    v_mul_lo_u32 v0, s7, v0
9560; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v4, v0
9561; GFX90A-NEXT:    v_mul_hi_u32 v5, s7, v1
9562; GFX90A-NEXT:    v_addc_co_u32_e32 v0, vcc, v3, v6, vcc
9563; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, v5, v2, vcc
9564; GFX90A-NEXT:    v_mul_lo_u32 v1, s7, v1
9565; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v1
9566; GFX90A-NEXT:    s_movk_i32 s2, 0x11f
9567; GFX90A-NEXT:    s_mov_b32 s3, 0x976a7377
9568; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, v8, v3, vcc
9569; GFX90A-NEXT:    v_mul_lo_u32 v3, v0, s2
9570; GFX90A-NEXT:    v_mul_hi_u32 v4, v0, s3
9571; GFX90A-NEXT:    v_add_u32_e32 v3, v4, v3
9572; GFX90A-NEXT:    v_mul_lo_u32 v4, v1, s3
9573; GFX90A-NEXT:    v_add_u32_e32 v3, v3, v4
9574; GFX90A-NEXT:    v_mul_lo_u32 v5, v0, s3
9575; GFX90A-NEXT:    v_sub_u32_e32 v4, s7, v3
9576; GFX90A-NEXT:    v_mov_b32_e32 v6, s2
9577; GFX90A-NEXT:    v_sub_co_u32_e32 v5, vcc, s6, v5
9578; GFX90A-NEXT:    v_subb_co_u32_e64 v4, s[0:1], v4, v6, vcc
9579; GFX90A-NEXT:    v_subrev_co_u32_e64 v6, s[0:1], s3, v5
9580; GFX90A-NEXT:    v_subbrev_co_u32_e64 v4, s[0:1], 0, v4, s[0:1]
9581; GFX90A-NEXT:    s_movk_i32 s3, 0x11e
9582; GFX90A-NEXT:    v_cmp_lt_u32_e64 s[0:1], s3, v4
9583; GFX90A-NEXT:    s_mov_b32 s6, 0x976a7376
9584; GFX90A-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[0:1]
9585; GFX90A-NEXT:    v_cmp_lt_u32_e64 s[0:1], s6, v6
9586; GFX90A-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[0:1]
9587; GFX90A-NEXT:    v_cmp_eq_u32_e64 s[0:1], s2, v4
9588; GFX90A-NEXT:    v_cndmask_b32_e64 v4, v7, v6, s[0:1]
9589; GFX90A-NEXT:    v_mov_b32_e32 v7, s7
9590; GFX90A-NEXT:    v_subb_co_u32_e32 v3, vcc, v7, v3, vcc
9591; GFX90A-NEXT:    v_cmp_lt_u32_e32 vcc, s3, v3
9592; GFX90A-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v4
9593; GFX90A-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
9594; GFX90A-NEXT:    v_cmp_lt_u32_e32 vcc, s6, v5
9595; GFX90A-NEXT:    v_cndmask_b32_e64 v4, 1, 2, s[0:1]
9596; GFX90A-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
9597; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, s2, v3
9598; GFX90A-NEXT:    v_add_co_u32_e64 v4, s[0:1], v0, v4
9599; GFX90A-NEXT:    v_cndmask_b32_e32 v3, v7, v5, vcc
9600; GFX90A-NEXT:    v_addc_co_u32_e64 v6, s[0:1], 0, v1, s[0:1]
9601; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
9602; GFX90A-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
9603; GFX90A-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
9604; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
9605; GFX90A-NEXT:    s_endpgm
9606  %r = udiv i64 %x, 1235195949943
9607  store i64 %r, i64 addrspace(1)* %out
9608  ret void
9609}
9610
; Unsigned 64-bit division of the kernel argument %x by the constant 4096,
; with the quotient stored through %out.
; The opt assertions below show the udiv left untouched by
; -amdgpu-codegenprepare; the llc assertions for all three targets show the
; division emitted as a single s_lshr_b64 by 12.
9611define amdgpu_kernel void @udiv_i64_pow2k_denom(i64 addrspace(1)* %out, i64 %x) {
9612; CHECK-LABEL: @udiv_i64_pow2k_denom(
9613; CHECK-NEXT:    [[R:%.*]] = udiv i64 [[X:%.*]], 4096
9614; CHECK-NEXT:    store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4
9615; CHECK-NEXT:    ret void
9616;
9617; GFX6-LABEL: udiv_i64_pow2k_denom:
9618; GFX6:       ; %bb.0:
9619; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
9620; GFX6-NEXT:    s_mov_b32 s7, 0xf000
9621; GFX6-NEXT:    s_mov_b32 s6, -1
9622; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
9623; GFX6-NEXT:    s_mov_b32 s4, s0
9624; GFX6-NEXT:    s_mov_b32 s5, s1
9625; GFX6-NEXT:    s_lshr_b64 s[0:1], s[2:3], 12
9626; GFX6-NEXT:    v_mov_b32_e32 v0, s0
9627; GFX6-NEXT:    v_mov_b32_e32 v1, s1
9628; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
9629; GFX6-NEXT:    s_endpgm
9630;
9631; GFX9-LABEL: udiv_i64_pow2k_denom:
9632; GFX9:       ; %bb.0:
9633; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
9634; GFX9-NEXT:    v_mov_b32_e32 v2, 0
9635; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
9636; GFX9-NEXT:    s_lshr_b64 s[2:3], s[2:3], 12
9637; GFX9-NEXT:    v_mov_b32_e32 v0, s2
9638; GFX9-NEXT:    v_mov_b32_e32 v1, s3
9639; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
9640; GFX9-NEXT:    s_endpgm
9641;
9642; GFX90A-LABEL: udiv_i64_pow2k_denom:
9643; GFX90A:       ; %bb.0:
9644; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
9645; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
9646; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
9647; GFX90A-NEXT:    s_lshr_b64 s[2:3], s[2:3], 12
9648; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
9649; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
9650; GFX90A-NEXT:    s_endpgm
9651  %r = udiv i64 %x, 4096
9652  store i64 %r, i64 addrspace(1)* %out
9653  ret void
9654}
9655
; Unsigned 64-bit division of %x by a divisor computed at run time as
; (4096 << %y), with the quotient stored through %out.
; The opt assertions below show both the shl and the udiv left untouched by
; -amdgpu-codegenprepare; the llc assertions show no divide sequence at all,
; only an s_add_i32 of 12 feeding the shift amount of one s_lshr_b64.
9656define amdgpu_kernel void @udiv_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %x, i64 %y) {
9657; CHECK-LABEL: @udiv_i64_pow2_shl_denom(
9658; CHECK-NEXT:    [[SHL_Y:%.*]] = shl i64 4096, [[Y:%.*]]
9659; CHECK-NEXT:    [[R:%.*]] = udiv i64 [[X:%.*]], [[SHL_Y]]
9660; CHECK-NEXT:    store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4
9661; CHECK-NEXT:    ret void
9662;
9663; GFX6-LABEL: udiv_i64_pow2_shl_denom:
9664; GFX6:       ; %bb.0:
9665; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
9666; GFX6-NEXT:    s_load_dword s8, s[0:1], 0xd
9667; GFX6-NEXT:    s_mov_b32 s3, 0xf000
9668; GFX6-NEXT:    s_mov_b32 s2, -1
9669; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
9670; GFX6-NEXT:    s_mov_b32 s0, s4
9671; GFX6-NEXT:    s_add_i32 s8, s8, 12
9672; GFX6-NEXT:    s_mov_b32 s1, s5
9673; GFX6-NEXT:    s_lshr_b64 s[4:5], s[6:7], s8
9674; GFX6-NEXT:    v_mov_b32_e32 v0, s4
9675; GFX6-NEXT:    v_mov_b32_e32 v1, s5
9676; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
9677; GFX6-NEXT:    s_endpgm
9678;
9679; GFX9-LABEL: udiv_i64_pow2_shl_denom:
9680; GFX9:       ; %bb.0:
9681; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
9682; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x34
9683; GFX9-NEXT:    v_mov_b32_e32 v2, 0
9684; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
9685; GFX9-NEXT:    s_add_i32 s2, s2, 12
9686; GFX9-NEXT:    s_lshr_b64 s[0:1], s[6:7], s2
9687; GFX9-NEXT:    v_mov_b32_e32 v0, s0
9688; GFX9-NEXT:    v_mov_b32_e32 v1, s1
9689; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
9690; GFX9-NEXT:    s_endpgm
9691;
9692; GFX90A-LABEL: udiv_i64_pow2_shl_denom:
9693; GFX90A:       ; %bb.0:
9694; GFX90A-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
9695; GFX90A-NEXT:    s_load_dword s2, s[0:1], 0x34
9696; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
9697; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
9698; GFX90A-NEXT:    s_add_i32 s2, s2, 12
9699; GFX90A-NEXT:    s_lshr_b64 s[0:1], s[6:7], s2
9700; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
9701; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
9702; GFX90A-NEXT:    s_endpgm
9703  %shl.y = shl i64 4096, %y
9704  %r = udiv i64 %x, %shl.y
9705  store i64 %r, i64 addrspace(1)* %out
9706  ret void
9707}
9708
; Unsigned division of the <2 x i64> kernel argument %x by the splat constant
; <4096, 4096>, with the quotient vector stored through %out.
; The opt assertions below show -amdgpu-codegenprepare splitting the vector
; udiv into one scalar udiv by 4096 per element (extractelement /
; insertelement); the llc assertions show each element emitted as an
; s_lshr_b64 by 12 followed by a single 4-dword store.
9709define amdgpu_kernel void @udiv_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) {
9710; CHECK-LABEL: @udiv_v2i64_pow2k_denom(
9711; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0
9712; CHECK-NEXT:    [[TMP2:%.*]] = udiv i64 [[TMP1]], 4096
9713; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i64 0
9714; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1
9715; CHECK-NEXT:    [[TMP5:%.*]] = udiv i64 [[TMP4]], 4096
9716; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1
9717; CHECK-NEXT:    store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16
9718; CHECK-NEXT:    ret void
9719;
9720; GFX6-LABEL: udiv_v2i64_pow2k_denom:
9721; GFX6:       ; %bb.0:
9722; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
9723; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xd
9724; GFX6-NEXT:    s_mov_b32 s7, 0xf000
9725; GFX6-NEXT:    s_mov_b32 s6, -1
9726; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
9727; GFX6-NEXT:    s_lshr_b64 s[0:1], s[0:1], 12
9728; GFX6-NEXT:    s_lshr_b64 s[2:3], s[2:3], 12
9729; GFX6-NEXT:    v_mov_b32_e32 v0, s0
9730; GFX6-NEXT:    v_mov_b32_e32 v1, s1
9731; GFX6-NEXT:    v_mov_b32_e32 v2, s2
9732; GFX6-NEXT:    v_mov_b32_e32 v3, s3
9733; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
9734; GFX6-NEXT:    s_endpgm
9735;
9736; GFX9-LABEL: udiv_v2i64_pow2k_denom:
9737; GFX9:       ; %bb.0:
9738; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
9739; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
9740; GFX9-NEXT:    v_mov_b32_e32 v4, 0
9741; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
9742; GFX9-NEXT:    s_lshr_b64 s[0:1], s[4:5], 12
9743; GFX9-NEXT:    s_lshr_b64 s[4:5], s[6:7], 12
9744; GFX9-NEXT:    v_mov_b32_e32 v0, s0
9745; GFX9-NEXT:    v_mov_b32_e32 v1, s1
9746; GFX9-NEXT:    v_mov_b32_e32 v2, s4
9747; GFX9-NEXT:    v_mov_b32_e32 v3, s5
9748; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
9749; GFX9-NEXT:    s_endpgm
9750;
9751; GFX90A-LABEL: udiv_v2i64_pow2k_denom:
9752; GFX90A:       ; %bb.0:
9753; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
9754; GFX90A-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
9755; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
9756; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
9757; GFX90A-NEXT:    s_lshr_b64 s[0:1], s[4:5], 12
9758; GFX90A-NEXT:    s_lshr_b64 s[4:5], s[6:7], 12
9759; GFX90A-NEXT:    v_mov_b32_e32 v0, s0
9760; GFX90A-NEXT:    v_mov_b32_e32 v1, s1
9761; GFX90A-NEXT:    v_mov_b32_e32 v2, s4
9762; GFX90A-NEXT:    v_mov_b32_e32 v3, s5
9763; GFX90A-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
9764; GFX90A-NEXT:    s_endpgm
9765  %r = udiv <2 x i64> %x, <i64 4096, i64 4096>
9766  store <2 x i64> %r, <2 x i64> addrspace(1)* %out
9767  ret void
9768}
9769
; Unsigned division of the <2 x i64> kernel argument %x by the non-uniform
; constant vector <4096, 4095>, with the quotient vector stored through %out.
; The opt assertions below show -amdgpu-codegenprepare splitting the vector
; udiv into one scalar udiv per element (by 4096 and by 4095 respectively).
; In the llc assertions the two elements are lowered differently:
;   - element 0 (divisor 4096) becomes a single s_lshr_b64 by 12;
;   - element 1 (divisor 4095) becomes a long sequence that starts from
;     v_rcp_f32, runs through 32-bit mul_lo/mul_hi and add-with-carry chains,
;     and ends with compares against 0xffe and v_cndmask selects.
; Both results are written with one 4-dword store.
9770define amdgpu_kernel void @udiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) {
9771; CHECK-LABEL: @udiv_v2i64_mixed_pow2k_denom(
9772; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0
9773; CHECK-NEXT:    [[TMP2:%.*]] = udiv i64 [[TMP1]], 4096
9774; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i64 0
9775; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1
9776; CHECK-NEXT:    [[TMP5:%.*]] = udiv i64 [[TMP4]], 4095
9777; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1
9778; CHECK-NEXT:    store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16
9779; CHECK-NEXT:    ret void
9780;
9781; GFX6-LABEL: udiv_v2i64_mixed_pow2k_denom:
9782; GFX6:       ; %bb.0:
9783; GFX6-NEXT:    v_mov_b32_e32 v0, 0x4f800000
9784; GFX6-NEXT:    v_madak_f32 v0, 0, v0, 0x457ff000
9785; GFX6-NEXT:    v_rcp_f32_e32 v0, v0
9786; GFX6-NEXT:    s_movk_i32 s2, 0xf001
9787; GFX6-NEXT:    v_mov_b32_e32 v8, 0
9788; GFX6-NEXT:    v_mov_b32_e32 v7, 0
9789; GFX6-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
9790; GFX6-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
9791; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
9792; GFX6-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
9793; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
9794; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
9795; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
9796; GFX6-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0xd
9797; GFX6-NEXT:    s_mov_b32 s7, 0xf000
9798; GFX6-NEXT:    v_mul_hi_u32 v2, v0, s2
9799; GFX6-NEXT:    v_mul_lo_u32 v3, v1, s2
9800; GFX6-NEXT:    v_mul_lo_u32 v4, v0, s2
9801; GFX6-NEXT:    s_mov_b32 s6, -1
9802; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, v0, v2
9803; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
9804; GFX6-NEXT:    v_mul_lo_u32 v5, v0, v2
9805; GFX6-NEXT:    v_mul_hi_u32 v6, v0, v4
9806; GFX6-NEXT:    v_mul_hi_u32 v3, v0, v2
9807; GFX6-NEXT:    v_mul_hi_u32 v9, v1, v2
9808; GFX6-NEXT:    v_mul_lo_u32 v2, v1, v2
9809; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
9810; GFX6-NEXT:    v_mul_lo_u32 v6, v1, v4
9811; GFX6-NEXT:    v_mul_hi_u32 v4, v1, v4
9812; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v8, v3, vcc
9813; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
9814; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v3, v4, vcc
9815; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, v9, v7, vcc
9816; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
9817; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v8, v4, vcc
9818; GFX6-NEXT:    v_add_i32_e64 v0, s[0:1], v0, v2
9819; GFX6-NEXT:    v_addc_u32_e64 v2, vcc, v1, v3, s[0:1]
9820; GFX6-NEXT:    v_mul_hi_u32 v4, v0, s2
9821; GFX6-NEXT:    v_mul_lo_u32 v5, v2, s2
9822; GFX6-NEXT:    v_mul_lo_u32 v6, v0, s2
9823; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
9824; GFX6-NEXT:    s_lshr_b64 s[2:3], s[8:9], 12
9825; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, v0, v4
9826; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
9827; GFX6-NEXT:    v_mul_lo_u32 v5, v0, v4
9828; GFX6-NEXT:    v_mul_hi_u32 v9, v0, v6
9829; GFX6-NEXT:    v_mul_hi_u32 v10, v0, v4
9830; GFX6-NEXT:    v_mul_hi_u32 v11, v2, v4
9831; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v9, v5
9832; GFX6-NEXT:    v_addc_u32_e32 v9, vcc, v8, v10, vcc
9833; GFX6-NEXT:    v_mul_lo_u32 v10, v2, v6
9834; GFX6-NEXT:    v_mul_hi_u32 v6, v2, v6
9835; GFX6-NEXT:    v_mul_lo_u32 v2, v2, v4
9836; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v5, v10
9837; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v9, v6, vcc
9838; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, v11, v7, vcc
9839; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
9840; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, v8, v4, vcc
9841; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
9842; GFX6-NEXT:    v_addc_u32_e64 v1, vcc, v1, v4, s[0:1]
9843; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
9844; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
9845; GFX6-NEXT:    v_mul_lo_u32 v2, s10, v1
9846; GFX6-NEXT:    v_mul_hi_u32 v3, s10, v0
9847; GFX6-NEXT:    v_mul_hi_u32 v4, s10, v1
9848; GFX6-NEXT:    v_mul_hi_u32 v5, s11, v1
9849; GFX6-NEXT:    v_mul_lo_u32 v1, s11, v1
9850; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
9851; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v8, v4, vcc
9852; GFX6-NEXT:    v_mul_lo_u32 v4, s11, v0
9853; GFX6-NEXT:    v_mul_hi_u32 v0, s11, v0
9854; GFX6-NEXT:    s_movk_i32 s0, 0xfff
9855; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
9856; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, v3, v0, vcc
9857; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, v5, v7, vcc
9858; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
9859; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v8, v2, vcc
9860; GFX6-NEXT:    v_mul_lo_u32 v4, v1, s0
9861; GFX6-NEXT:    v_mul_hi_u32 v5, v0, s0
9862; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 2, v0
9863; GFX6-NEXT:    v_mul_lo_u32 v8, v0, s0
9864; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
9865; GFX6-NEXT:    v_add_i32_e32 v6, vcc, 1, v0
9866; GFX6-NEXT:    v_addc_u32_e32 v7, vcc, 0, v1, vcc
9867; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
9868; GFX6-NEXT:    v_mov_b32_e32 v5, s11
9869; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, s10, v8
9870; GFX6-NEXT:    v_subb_u32_e32 v4, vcc, v5, v4, vcc
9871; GFX6-NEXT:    v_subrev_i32_e32 v5, vcc, s0, v8
9872; GFX6-NEXT:    v_subbrev_u32_e32 v9, vcc, 0, v4, vcc
9873; GFX6-NEXT:    s_movk_i32 s0, 0xffe
9874; GFX6-NEXT:    v_cmp_lt_u32_e32 vcc, s0, v5
9875; GFX6-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
9876; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v9
9877; GFX6-NEXT:    v_cndmask_b32_e32 v5, -1, v5, vcc
9878; GFX6-NEXT:    v_cmp_lt_u32_e64 s[0:1], s0, v8
9879; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
9880; GFX6-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[0:1]
9881; GFX6-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v4
9882; GFX6-NEXT:    v_cndmask_b32_e64 v4, -1, v5, s[0:1]
9883; GFX6-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
9884; GFX6-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v4
9885; GFX6-NEXT:    v_cndmask_b32_e64 v3, v1, v3, s[0:1]
9886; GFX6-NEXT:    v_cndmask_b32_e32 v1, v6, v2, vcc
9887; GFX6-NEXT:    v_cndmask_b32_e64 v2, v0, v1, s[0:1]
9888; GFX6-NEXT:    v_mov_b32_e32 v0, s2
9889; GFX6-NEXT:    v_mov_b32_e32 v1, s3
9890; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
9891; GFX6-NEXT:    s_endpgm
9892;
9893; GFX9-LABEL: udiv_v2i64_mixed_pow2k_denom:
9894; GFX9:       ; %bb.0:
9895; GFX9-NEXT:    v_mov_b32_e32 v0, 0x4f800000
9896; GFX9-NEXT:    v_madak_f32 v0, 0, v0, 0x457ff000
9897; GFX9-NEXT:    v_rcp_f32_e32 v0, v0
9898; GFX9-NEXT:    s_movk_i32 s4, 0xf001
9899; GFX9-NEXT:    v_mov_b32_e32 v7, 0
9900; GFX9-NEXT:    v_mov_b32_e32 v5, 0
9901; GFX9-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
9902; GFX9-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
9903; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
9904; GFX9-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
9905; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
9906; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
9907; GFX9-NEXT:    s_movk_i32 s8, 0xfff
9908; GFX9-NEXT:    v_mul_hi_u32 v2, v0, s4
9909; GFX9-NEXT:    v_mul_lo_u32 v4, v1, s4
9910; GFX9-NEXT:    v_mul_lo_u32 v3, v0, s4
9911; GFX9-NEXT:    v_sub_u32_e32 v2, v2, v0
9912; GFX9-NEXT:    v_add_u32_e32 v2, v2, v4
9913; GFX9-NEXT:    v_mul_hi_u32 v6, v0, v3
9914; GFX9-NEXT:    v_mul_lo_u32 v4, v0, v2
9915; GFX9-NEXT:    v_mul_hi_u32 v8, v0, v2
9916; GFX9-NEXT:    v_mul_hi_u32 v9, v1, v2
9917; GFX9-NEXT:    v_mul_lo_u32 v2, v1, v2
9918; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v6, v4
9919; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, v7, v8, vcc
9920; GFX9-NEXT:    v_mul_lo_u32 v8, v1, v3
9921; GFX9-NEXT:    v_mul_hi_u32 v3, v1, v3
9922; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v8
9923; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v6, v3, vcc
9924; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v9, v5, vcc
9925; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
9926; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v7, v4, vcc
9927; GFX9-NEXT:    v_add_co_u32_e64 v0, s[2:3], v0, v2
9928; GFX9-NEXT:    v_addc_co_u32_e64 v2, vcc, v1, v3, s[2:3]
9929; GFX9-NEXT:    v_mul_hi_u32 v4, v0, s4
9930; GFX9-NEXT:    v_mul_lo_u32 v6, v2, s4
9931; GFX9-NEXT:    v_mul_lo_u32 v8, v0, s4
9932; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
9933; GFX9-NEXT:    v_sub_u32_e32 v4, v4, v0
9934; GFX9-NEXT:    v_add_u32_e32 v4, v4, v6
9935; GFX9-NEXT:    v_mul_lo_u32 v6, v0, v4
9936; GFX9-NEXT:    v_mul_hi_u32 v9, v0, v8
9937; GFX9-NEXT:    v_mul_hi_u32 v10, v0, v4
9938; GFX9-NEXT:    v_mul_hi_u32 v11, v2, v4
9939; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
9940; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, v9, v6
9941; GFX9-NEXT:    v_addc_co_u32_e32 v9, vcc, v7, v10, vcc
9942; GFX9-NEXT:    v_mul_lo_u32 v10, v2, v8
9943; GFX9-NEXT:    v_mul_hi_u32 v8, v2, v8
9944; GFX9-NEXT:    v_mul_lo_u32 v2, v2, v4
9945; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
9946; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, v6, v10
9947; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, v9, v8, vcc
9948; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v11, v5, vcc
9949; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v6, v2
9950; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v7, v4, vcc
9951; GFX9-NEXT:    v_addc_co_u32_e64 v1, vcc, v1, v4, s[2:3]
9952; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
9953; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
9954; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
9955; GFX9-NEXT:    v_mul_lo_u32 v2, s6, v1
9956; GFX9-NEXT:    v_mul_hi_u32 v3, s6, v0
9957; GFX9-NEXT:    v_mul_hi_u32 v4, s6, v1
9958; GFX9-NEXT:    v_mul_hi_u32 v6, s7, v1
9959; GFX9-NEXT:    v_mul_lo_u32 v1, s7, v1
9960; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
9961; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v7, v4, vcc
9962; GFX9-NEXT:    v_mul_lo_u32 v4, s7, v0
9963; GFX9-NEXT:    v_mul_hi_u32 v0, s7, v0
9964; GFX9-NEXT:    s_lshr_b64 s[2:3], s[4:5], 12
9965; GFX9-NEXT:    s_movk_i32 s4, 0xffe
9966; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
9967; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, v3, v0, vcc
9968; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, v6, v5, vcc
9969; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v1
9970; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v7, v2, vcc
9971; GFX9-NEXT:    v_mul_lo_u32 v2, v1, s8
9972; GFX9-NEXT:    v_mul_hi_u32 v3, v0, s8
9973; GFX9-NEXT:    v_mul_lo_u32 v4, v0, s8
9974; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
9975; GFX9-NEXT:    v_mov_b32_e32 v3, s7
9976; GFX9-NEXT:    v_sub_co_u32_e32 v4, vcc, s6, v4
9977; GFX9-NEXT:    v_subb_co_u32_e32 v2, vcc, v3, v2, vcc
9978; GFX9-NEXT:    v_subrev_co_u32_e32 v3, vcc, s8, v4
9979; GFX9-NEXT:    v_subbrev_co_u32_e32 v6, vcc, 0, v2, vcc
9980; GFX9-NEXT:    v_cmp_lt_u32_e32 vcc, s4, v3
9981; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
9982; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v6
9983; GFX9-NEXT:    v_cndmask_b32_e32 v3, -1, v3, vcc
9984; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
9985; GFX9-NEXT:    v_cndmask_b32_e64 v3, 1, 2, vcc
9986; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v0, v3
9987; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, 0, v1, vcc
9988; GFX9-NEXT:    v_cmp_lt_u32_e32 vcc, s4, v4
9989; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, -1, vcc
9990; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
9991; GFX9-NEXT:    v_cndmask_b32_e32 v2, -1, v4, vcc
9992; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
9993; GFX9-NEXT:    v_cndmask_b32_e32 v2, v0, v3, vcc
9994; GFX9-NEXT:    v_cndmask_b32_e32 v3, v1, v6, vcc
9995; GFX9-NEXT:    v_mov_b32_e32 v0, s2
9996; GFX9-NEXT:    v_mov_b32_e32 v1, s3
9997; GFX9-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1]
9998; GFX9-NEXT:    s_endpgm
9999;
10000; GFX90A-LABEL: udiv_v2i64_mixed_pow2k_denom:
10001; GFX90A:       ; %bb.0:
10002; GFX90A-NEXT:    v_mov_b32_e32 v0, 0x4f800000
10003; GFX90A-NEXT:    v_madak_f32 v0, 0, v0, 0x457ff000
10004; GFX90A-NEXT:    v_rcp_f32_e32 v0, v0
10005; GFX90A-NEXT:    s_movk_i32 s8, 0xf001
10006; GFX90A-NEXT:    v_mov_b32_e32 v8, 0
10007; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
10008; GFX90A-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
10009; GFX90A-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
10010; GFX90A-NEXT:    v_trunc_f32_e32 v1, v1
10011; GFX90A-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
10012; GFX90A-NEXT:    v_cvt_u32_f32_e32 v0, v0
10013; GFX90A-NEXT:    v_cvt_u32_f32_e32 v1, v1
10014; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
10015; GFX90A-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
10016; GFX90A-NEXT:    v_mul_hi_u32 v2, v0, s8
10017; GFX90A-NEXT:    v_sub_u32_e32 v2, v2, v0
10018; GFX90A-NEXT:    v_mul_lo_u32 v3, v1, s8
10019; GFX90A-NEXT:    v_add_u32_e32 v2, v2, v3
10020; GFX90A-NEXT:    v_mul_lo_u32 v6, v0, s8
10021; GFX90A-NEXT:    v_mul_lo_u32 v5, v0, v2
10022; GFX90A-NEXT:    v_mul_hi_u32 v7, v0, v6
10023; GFX90A-NEXT:    v_mul_hi_u32 v3, v0, v2
10024; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v7, v5
10025; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, v8, v3, vcc
10026; GFX90A-NEXT:    v_mul_hi_u32 v9, v1, v6
10027; GFX90A-NEXT:    v_mul_lo_u32 v6, v1, v6
10028; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v5, v6
10029; GFX90A-NEXT:    v_mul_hi_u32 v7, v1, v2
10030; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, v3, v9, vcc
10031; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v7, v4, vcc
10032; GFX90A-NEXT:    v_mul_lo_u32 v2, v1, v2
10033; GFX90A-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
10034; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, v8, v5, vcc
10035; GFX90A-NEXT:    v_add_co_u32_e64 v0, s[0:1], v0, v2
10036; GFX90A-NEXT:    v_addc_co_u32_e64 v2, vcc, v1, v3, s[0:1]
10037; GFX90A-NEXT:    v_mul_hi_u32 v6, v0, s8
10038; GFX90A-NEXT:    v_mul_lo_u32 v5, v2, s8
10039; GFX90A-NEXT:    v_sub_u32_e32 v6, v6, v0
10040; GFX90A-NEXT:    v_add_u32_e32 v5, v6, v5
10041; GFX90A-NEXT:    v_mul_lo_u32 v9, v0, s8
10042; GFX90A-NEXT:    v_mul_lo_u32 v7, v0, v5
10043; GFX90A-NEXT:    v_mul_hi_u32 v10, v0, v9
10044; GFX90A-NEXT:    v_mul_hi_u32 v6, v0, v5
10045; GFX90A-NEXT:    v_add_co_u32_e32 v7, vcc, v10, v7
10046; GFX90A-NEXT:    v_addc_co_u32_e32 v6, vcc, v8, v6, vcc
10047; GFX90A-NEXT:    v_mul_hi_u32 v11, v2, v9
10048; GFX90A-NEXT:    v_mul_lo_u32 v9, v2, v9
10049; GFX90A-NEXT:    v_add_co_u32_e32 v7, vcc, v7, v9
10050; GFX90A-NEXT:    v_mul_hi_u32 v10, v2, v5
10051; GFX90A-NEXT:    v_addc_co_u32_e32 v6, vcc, v6, v11, vcc
10052; GFX90A-NEXT:    v_addc_co_u32_e32 v7, vcc, v10, v4, vcc
10053; GFX90A-NEXT:    v_mul_lo_u32 v2, v2, v5
10054; GFX90A-NEXT:    v_add_co_u32_e32 v2, vcc, v6, v2
10055; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v8, v7, vcc
10056; GFX90A-NEXT:    v_add_u32_e32 v1, v1, v3
10057; GFX90A-NEXT:    v_addc_co_u32_e64 v1, vcc, v1, v5, s[0:1]
10058; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
10059; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
10060; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
10061; GFX90A-NEXT:    v_mul_lo_u32 v3, s6, v1
10062; GFX90A-NEXT:    v_mul_hi_u32 v5, s6, v0
10063; GFX90A-NEXT:    v_mul_hi_u32 v2, s6, v1
10064; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, v5, v3
10065; GFX90A-NEXT:    v_addc_co_u32_e32 v2, vcc, v8, v2, vcc
10066; GFX90A-NEXT:    v_mul_hi_u32 v6, s7, v0
10067; GFX90A-NEXT:    v_mul_lo_u32 v0, s7, v0
10068; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v3, v0
10069; GFX90A-NEXT:    v_mul_hi_u32 v5, s7, v1
10070; GFX90A-NEXT:    v_addc_co_u32_e32 v0, vcc, v2, v6, vcc
10071; GFX90A-NEXT:    v_addc_co_u32_e32 v2, vcc, v5, v4, vcc
10072; GFX90A-NEXT:    v_mul_lo_u32 v1, s7, v1
10073; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v1
10074; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, v8, v2, vcc
10075; GFX90A-NEXT:    s_movk_i32 s0, 0xfff
10076; GFX90A-NEXT:    v_mul_lo_u32 v2, v1, s0
10077; GFX90A-NEXT:    v_mul_hi_u32 v3, v0, s0
10078; GFX90A-NEXT:    v_add_u32_e32 v2, v3, v2
10079; GFX90A-NEXT:    v_mul_lo_u32 v3, v0, s0
10080; GFX90A-NEXT:    v_mov_b32_e32 v5, s7
10081; GFX90A-NEXT:    v_sub_co_u32_e32 v3, vcc, s6, v3
10082; GFX90A-NEXT:    v_subb_co_u32_e32 v2, vcc, v5, v2, vcc
10083; GFX90A-NEXT:    v_subrev_co_u32_e32 v5, vcc, s0, v3
10084; GFX90A-NEXT:    v_subbrev_co_u32_e32 v6, vcc, 0, v2, vcc
10085; GFX90A-NEXT:    s_movk_i32 s0, 0xffe
10086; GFX90A-NEXT:    v_cmp_lt_u32_e32 vcc, s0, v5
10087; GFX90A-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
10088; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v6
10089; GFX90A-NEXT:    v_cndmask_b32_e32 v5, -1, v5, vcc
10090; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
10091; GFX90A-NEXT:    v_cndmask_b32_e64 v5, 1, 2, vcc
10092; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v0, v5
10093; GFX90A-NEXT:    v_addc_co_u32_e32 v6, vcc, 0, v1, vcc
10094; GFX90A-NEXT:    v_cmp_lt_u32_e32 vcc, s0, v3
10095; GFX90A-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
10096; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
10097; GFX90A-NEXT:    v_cndmask_b32_e32 v2, -1, v3, vcc
10098; GFX90A-NEXT:    s_lshr_b64 s[4:5], s[4:5], 12
10099; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
10100; GFX90A-NEXT:    v_cndmask_b32_e32 v2, v0, v5, vcc
10101; GFX90A-NEXT:    v_cndmask_b32_e32 v3, v1, v6, vcc
10102; GFX90A-NEXT:    v_mov_b32_e32 v0, s4
10103; GFX90A-NEXT:    v_mov_b32_e32 v1, s5
10104; GFX90A-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
10105; GFX90A-NEXT:    s_endpgm
10106  %r = udiv <2 x i64> %x, <i64 4096, i64 4095>
10107  store <2 x i64> %r, <2 x i64> addrspace(1)* %out
10108  ret void
10109}
10110
; Unsigned division of the <2 x i64> kernel argument %x by a divisor vector
; computed at run time as (<4096, 4096> << %y), with the quotient vector
; stored through %out.
; The opt assertions below show -amdgpu-codegenprepare keeping the vector shl
; and splitting the vector udiv into one scalar udiv per element. The llc
; assertions show no divide sequence: each element is an s_add_i32 of 12
; feeding the shift amount of an s_lshr_b64, followed by one 4-dword store.
10111define amdgpu_kernel void @udiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x, <2 x i64> %y) {
10112; CHECK-LABEL: @udiv_v2i64_pow2_shl_denom(
10113; CHECK-NEXT:    [[SHL_Y:%.*]] = shl <2 x i64> <i64 4096, i64 4096>, [[Y:%.*]]
10114; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0
10115; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 0
10116; CHECK-NEXT:    [[TMP3:%.*]] = udiv i64 [[TMP1]], [[TMP2]]
10117; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x i64> undef, i64 [[TMP3]], i64 0
10118; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[X]], i64 1
10119; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 1
10120; CHECK-NEXT:    [[TMP7:%.*]] = udiv i64 [[TMP5]], [[TMP6]]
10121; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[TMP7]], i64 1
10122; CHECK-NEXT:    store <2 x i64> [[TMP8]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16
10123; CHECK-NEXT:    ret void
10124;
10125; GFX6-LABEL: udiv_v2i64_pow2_shl_denom:
10126; GFX6:       ; %bb.0:
10127; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
10128; GFX6-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0xd
10129; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x11
10130; GFX6-NEXT:    s_mov_b32 s7, 0xf000
10131; GFX6-NEXT:    s_mov_b32 s6, -1
10132; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
10133; GFX6-NEXT:    s_add_i32 s0, s0, 12
10134; GFX6-NEXT:    s_add_i32 s2, s2, 12
10135; GFX6-NEXT:    s_lshr_b64 s[0:1], s[8:9], s0
10136; GFX6-NEXT:    s_lshr_b64 s[2:3], s[10:11], s2
10137; GFX6-NEXT:    v_mov_b32_e32 v0, s0
10138; GFX6-NEXT:    v_mov_b32_e32 v1, s1
10139; GFX6-NEXT:    v_mov_b32_e32 v2, s2
10140; GFX6-NEXT:    v_mov_b32_e32 v3, s3
10141; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
10142; GFX6-NEXT:    s_endpgm
10143;
10144; GFX9-LABEL: udiv_v2i64_pow2_shl_denom:
10145; GFX9:       ; %bb.0:
10146; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
10147; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
10148; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x44
10149; GFX9-NEXT:    v_mov_b32_e32 v4, 0
10150; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
10151; GFX9-NEXT:    s_add_i32 s0, s8, 12
10152; GFX9-NEXT:    s_add_i32 s8, s10, 12
10153; GFX9-NEXT:    s_lshr_b64 s[0:1], s[4:5], s0
10154; GFX9-NEXT:    s_lshr_b64 s[4:5], s[6:7], s8
10155; GFX9-NEXT:    v_mov_b32_e32 v0, s0
10156; GFX9-NEXT:    v_mov_b32_e32 v1, s1
10157; GFX9-NEXT:    v_mov_b32_e32 v2, s4
10158; GFX9-NEXT:    v_mov_b32_e32 v3, s5
10159; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
10160; GFX9-NEXT:    s_endpgm
10161;
10162; GFX90A-LABEL: udiv_v2i64_pow2_shl_denom:
10163; GFX90A:       ; %bb.0:
10164; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
10165; GFX90A-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
10166; GFX90A-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x44
10167; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
10168; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
10169; GFX90A-NEXT:    s_add_i32 s0, s8, 12
10170; GFX90A-NEXT:    s_add_i32 s8, s10, 12
10171; GFX90A-NEXT:    s_lshr_b64 s[0:1], s[4:5], s0
10172; GFX90A-NEXT:    s_lshr_b64 s[4:5], s[6:7], s8
10173; GFX90A-NEXT:    v_mov_b32_e32 v0, s0
10174; GFX90A-NEXT:    v_mov_b32_e32 v1, s1
10175; GFX90A-NEXT:    v_mov_b32_e32 v2, s4
10176; GFX90A-NEXT:    v_mov_b32_e32 v3, s5
10177; GFX90A-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
10178; GFX90A-NEXT:    s_endpgm
10179  %shl.y = shl <2 x i64> <i64 4096, i64 4096>, %y
10180  %r = udiv <2 x i64> %x, %shl.y
10181  store <2 x i64> %r, <2 x i64> addrspace(1)* %out
10182  ret void
10183}
10184
; Test: unsigned 64-bit remainder of %x by the odd constant 1235195393993,
; result stored to %out.
; 1235195393993 == 0x11F9761F7C9; its 32-bit halves 0x11f (high) and
; 0x9761f7c9 (low) appear as literals in the llc assertions below.
; The opt-level assertions show the urem left unchanged by the IR pass.
; The llc assertions show it expanded inline into a long straight-line
; sequence: a v_rcp_f32-based initial estimate, 32-bit multiplies
; (v_mul_lo_u32 / v_mul_hi_u32), carry-chained adds and subtracts, and
; compare/select instructions that pick the final remainder.
; The assertion lines are autogenerated; regenerate them with the update
; scripts named at the top of the file rather than editing them by hand.
10185define amdgpu_kernel void @urem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
10186; CHECK-LABEL: @urem_i64_oddk_denom(
10187; CHECK-NEXT:    [[R:%.*]] = urem i64 [[X:%.*]], 1235195393993
10188; CHECK-NEXT:    store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4
10189; CHECK-NEXT:    ret void
10190;
10191; GFX6-LABEL: urem_i64_oddk_denom:
10192; GFX6:       ; %bb.0:
10193; GFX6-NEXT:    v_mov_b32_e32 v0, 0x4f1761f8
10194; GFX6-NEXT:    v_mov_b32_e32 v1, 0x4f800000
10195; GFX6-NEXT:    v_madmk_f32 v0, v1, 0x438f8000, v0
10196; GFX6-NEXT:    v_rcp_f32_e32 v0, v0
10197; GFX6-NEXT:    s_movk_i32 s2, 0xfee0
10198; GFX6-NEXT:    s_mov_b32 s3, 0x689e0837
10199; GFX6-NEXT:    v_mov_b32_e32 v8, 0
10200; GFX6-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
10201; GFX6-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
10202; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
10203; GFX6-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
10204; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
10205; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
10206; GFX6-NEXT:    v_mov_b32_e32 v7, 0
10207; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
10208; GFX6-NEXT:    v_mul_lo_u32 v2, v0, s2
10209; GFX6-NEXT:    v_mul_hi_u32 v3, v0, s3
10210; GFX6-NEXT:    v_mul_lo_u32 v4, v1, s3
10211; GFX6-NEXT:    s_mov_b32 s12, 0x9761f7c9
10212; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
10213; GFX6-NEXT:    s_mov_b32 s8, s4
10214; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
10215; GFX6-NEXT:    v_mul_lo_u32 v3, v0, s3
10216; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
10217; GFX6-NEXT:    v_mul_lo_u32 v5, v0, v2
10218; GFX6-NEXT:    v_mul_hi_u32 v6, v0, v3
10219; GFX6-NEXT:    v_mul_hi_u32 v4, v0, v2
10220; GFX6-NEXT:    v_mul_hi_u32 v9, v1, v2
10221; GFX6-NEXT:    v_mul_lo_u32 v2, v1, v2
10222; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
10223; GFX6-NEXT:    v_mul_lo_u32 v6, v1, v3
10224; GFX6-NEXT:    v_mul_hi_u32 v3, v1, v3
10225; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, v8, v4, vcc
10226; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
10227; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v4, v3, vcc
10228; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, v9, v7, vcc
10229; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
10230; GFX6-NEXT:    v_add_i32_e64 v0, s[0:1], v0, v2
10231; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v8, v4, vcc
10232; GFX6-NEXT:    v_mul_lo_u32 v4, v0, s2
10233; GFX6-NEXT:    v_mul_hi_u32 v5, v0, s3
10234; GFX6-NEXT:    v_addc_u32_e64 v2, vcc, v1, v3, s[0:1]
10235; GFX6-NEXT:    v_mul_lo_u32 v6, v2, s3
10236; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
10237; GFX6-NEXT:    v_mul_lo_u32 v5, v0, s3
10238; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
10239; GFX6-NEXT:    v_mul_lo_u32 v6, v0, v4
10240; GFX6-NEXT:    v_mul_hi_u32 v9, v0, v5
10241; GFX6-NEXT:    v_mul_hi_u32 v10, v0, v4
10242; GFX6-NEXT:    v_mul_hi_u32 v11, v2, v4
10243; GFX6-NEXT:    s_movk_i32 s4, 0x11f
10244; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v9, v6
10245; GFX6-NEXT:    v_addc_u32_e32 v9, vcc, v8, v10, vcc
10246; GFX6-NEXT:    v_mul_lo_u32 v10, v2, v5
10247; GFX6-NEXT:    v_mul_hi_u32 v5, v2, v5
10248; GFX6-NEXT:    v_mul_lo_u32 v2, v2, v4
10249; GFX6-NEXT:    s_mov_b32 s9, s5
10250; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v6, v10
10251; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v9, v5, vcc
10252; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, v11, v7, vcc
10253; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
10254; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, v8, v4, vcc
10255; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
10256; GFX6-NEXT:    v_addc_u32_e64 v1, vcc, v1, v4, s[0:1]
10257; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
10258; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
10259; GFX6-NEXT:    v_mul_lo_u32 v2, s6, v1
10260; GFX6-NEXT:    v_mul_hi_u32 v3, s6, v0
10261; GFX6-NEXT:    v_mul_hi_u32 v4, s6, v1
10262; GFX6-NEXT:    v_mul_hi_u32 v5, s7, v1
10263; GFX6-NEXT:    v_mul_lo_u32 v1, s7, v1
10264; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
10265; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v8, v4, vcc
10266; GFX6-NEXT:    v_mul_lo_u32 v4, s7, v0
10267; GFX6-NEXT:    v_mul_hi_u32 v0, s7, v0
10268; GFX6-NEXT:    s_movk_i32 s5, 0x11e
10269; GFX6-NEXT:    s_mov_b32 s11, 0xf000
10270; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
10271; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, v3, v0, vcc
10272; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, v5, v7, vcc
10273; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
10274; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v8, v2, vcc
10275; GFX6-NEXT:    v_mul_lo_u32 v2, v0, s4
10276; GFX6-NEXT:    v_mul_hi_u32 v3, v0, s12
10277; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s12
10278; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s12
10279; GFX6-NEXT:    s_mov_b32 s10, -1
10280; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
10281; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
10282; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s7, v1
10283; GFX6-NEXT:    v_mov_b32_e32 v3, s4
10284; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s6, v0
10285; GFX6-NEXT:    v_subb_u32_e64 v2, s[0:1], v2, v3, vcc
10286; GFX6-NEXT:    v_subrev_i32_e64 v4, s[0:1], s12, v0
10287; GFX6-NEXT:    v_subbrev_u32_e64 v5, s[2:3], 0, v2, s[0:1]
10288; GFX6-NEXT:    v_cmp_lt_u32_e64 s[2:3], s5, v5
10289; GFX6-NEXT:    s_mov_b32 s6, 0x9761f7c8
10290; GFX6-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[2:3]
10291; GFX6-NEXT:    v_cmp_lt_u32_e64 s[2:3], s6, v4
10292; GFX6-NEXT:    v_subb_u32_e64 v2, s[0:1], v2, v3, s[0:1]
10293; GFX6-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[2:3]
10294; GFX6-NEXT:    v_cmp_eq_u32_e64 s[2:3], s4, v5
10295; GFX6-NEXT:    v_subrev_i32_e64 v3, s[0:1], s12, v4
10296; GFX6-NEXT:    v_cndmask_b32_e64 v6, v6, v7, s[2:3]
10297; GFX6-NEXT:    v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1]
10298; GFX6-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v6
10299; GFX6-NEXT:    v_cndmask_b32_e64 v2, v5, v2, s[0:1]
10300; GFX6-NEXT:    v_mov_b32_e32 v5, s7
10301; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v5, v1, vcc
10302; GFX6-NEXT:    v_cmp_lt_u32_e32 vcc, s5, v1
10303; GFX6-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
10304; GFX6-NEXT:    v_cmp_lt_u32_e32 vcc, s6, v0
10305; GFX6-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
10306; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, s4, v1
10307; GFX6-NEXT:    v_cndmask_b32_e32 v5, v5, v6, vcc
10308; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
10309; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
10310; GFX6-NEXT:    v_cndmask_b32_e64 v2, v4, v3, s[0:1]
10311; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
10312; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
10313; GFX6-NEXT:    s_endpgm
10314;
10315; GFX9-LABEL: urem_i64_oddk_denom:
10316; GFX9:       ; %bb.0:
10317; GFX9-NEXT:    v_mov_b32_e32 v0, 0x4f1761f8
10318; GFX9-NEXT:    v_mov_b32_e32 v1, 0x4f800000
10319; GFX9-NEXT:    v_madmk_f32 v0, v1, 0x438f8000, v0
10320; GFX9-NEXT:    v_rcp_f32_e32 v0, v0
10321; GFX9-NEXT:    s_movk_i32 s4, 0xfee0
10322; GFX9-NEXT:    s_mov_b32 s5, 0x689e0837
10323; GFX9-NEXT:    v_mov_b32_e32 v8, 0
10324; GFX9-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
10325; GFX9-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
10326; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
10327; GFX9-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
10328; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
10329; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
10330; GFX9-NEXT:    v_mov_b32_e32 v5, 0
10331; GFX9-NEXT:    s_movk_i32 s8, 0x11f
10332; GFX9-NEXT:    v_mul_lo_u32 v2, v0, s4
10333; GFX9-NEXT:    v_mul_hi_u32 v3, v0, s5
10334; GFX9-NEXT:    v_mul_lo_u32 v4, v1, s5
10335; GFX9-NEXT:    v_mul_lo_u32 v6, v0, s5
10336; GFX9-NEXT:    s_mov_b32 s9, 0x9761f7c9
10337; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
10338; GFX9-NEXT:    v_add_u32_e32 v2, v2, v4
10339; GFX9-NEXT:    v_mul_lo_u32 v3, v0, v2
10340; GFX9-NEXT:    v_mul_hi_u32 v4, v0, v6
10341; GFX9-NEXT:    v_mul_hi_u32 v7, v0, v2
10342; GFX9-NEXT:    v_mul_hi_u32 v9, v1, v2
10343; GFX9-NEXT:    v_mul_lo_u32 v2, v1, v2
10344; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v4, v3
10345; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v8, v7, vcc
10346; GFX9-NEXT:    v_mul_lo_u32 v7, v1, v6
10347; GFX9-NEXT:    v_mul_hi_u32 v6, v1, v6
10348; GFX9-NEXT:    s_mov_b32 s10, 0x9761f7c8
10349; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v7
10350; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v4, v6, vcc
10351; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v9, v5, vcc
10352; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
10353; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v8, v4, vcc
10354; GFX9-NEXT:    v_add_co_u32_e64 v0, s[2:3], v0, v2
10355; GFX9-NEXT:    v_addc_co_u32_e64 v2, vcc, v1, v3, s[2:3]
10356; GFX9-NEXT:    v_mul_lo_u32 v4, v0, s4
10357; GFX9-NEXT:    v_mul_hi_u32 v6, v0, s5
10358; GFX9-NEXT:    v_mul_lo_u32 v7, v2, s5
10359; GFX9-NEXT:    v_mul_lo_u32 v9, v0, s5
10360; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
10361; GFX9-NEXT:    v_add_u32_e32 v4, v6, v4
10362; GFX9-NEXT:    v_add_u32_e32 v4, v4, v7
10363; GFX9-NEXT:    v_mul_lo_u32 v6, v0, v4
10364; GFX9-NEXT:    v_mul_hi_u32 v7, v0, v9
10365; GFX9-NEXT:    v_mul_hi_u32 v10, v0, v4
10366; GFX9-NEXT:    v_mul_hi_u32 v11, v2, v4
10367; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
10368; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, v7, v6
10369; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, v8, v10, vcc
10370; GFX9-NEXT:    v_mul_lo_u32 v10, v2, v9
10371; GFX9-NEXT:    v_mul_hi_u32 v9, v2, v9
10372; GFX9-NEXT:    v_mul_lo_u32 v2, v2, v4
10373; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, v6, v10
10374; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, v7, v9, vcc
10375; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v11, v5, vcc
10376; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v6, v2
10377; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v8, v4, vcc
10378; GFX9-NEXT:    v_addc_co_u32_e64 v1, vcc, v1, v4, s[2:3]
10379; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
10380; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
10381; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
10382; GFX9-NEXT:    v_mul_lo_u32 v2, s6, v1
10383; GFX9-NEXT:    v_mul_hi_u32 v3, s6, v0
10384; GFX9-NEXT:    v_mul_hi_u32 v4, s6, v1
10385; GFX9-NEXT:    v_mul_hi_u32 v6, s7, v1
10386; GFX9-NEXT:    v_mul_lo_u32 v1, s7, v1
10387; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
10388; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v8, v4, vcc
10389; GFX9-NEXT:    v_mul_lo_u32 v4, s7, v0
10390; GFX9-NEXT:    v_mul_hi_u32 v0, s7, v0
10391; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
10392; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, v3, v0, vcc
10393; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, v6, v5, vcc
10394; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v1
10395; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v8, v2, vcc
10396; GFX9-NEXT:    v_mul_lo_u32 v2, v0, s8
10397; GFX9-NEXT:    v_mul_hi_u32 v3, v0, s9
10398; GFX9-NEXT:    v_mul_lo_u32 v1, v1, s9
10399; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s9
10400; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
10401; GFX9-NEXT:    v_add_u32_e32 v1, v2, v1
10402; GFX9-NEXT:    v_sub_u32_e32 v2, s7, v1
10403; GFX9-NEXT:    v_mov_b32_e32 v3, s8
10404; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s6, v0
10405; GFX9-NEXT:    v_subb_co_u32_e64 v2, s[0:1], v2, v3, vcc
10406; GFX9-NEXT:    v_subrev_co_u32_e64 v4, s[0:1], s9, v0
10407; GFX9-NEXT:    v_subbrev_co_u32_e64 v6, s[2:3], 0, v2, s[0:1]
10408; GFX9-NEXT:    s_movk_i32 s6, 0x11e
10409; GFX9-NEXT:    v_cmp_lt_u32_e64 s[2:3], s6, v6
10410; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[2:3]
10411; GFX9-NEXT:    v_cmp_lt_u32_e64 s[2:3], s10, v4
10412; GFX9-NEXT:    v_subb_co_u32_e64 v2, s[0:1], v2, v3, s[0:1]
10413; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[2:3]
10414; GFX9-NEXT:    v_cmp_eq_u32_e64 s[2:3], s8, v6
10415; GFX9-NEXT:    v_subrev_co_u32_e64 v3, s[0:1], s9, v4
10416; GFX9-NEXT:    v_cndmask_b32_e64 v7, v7, v8, s[2:3]
10417; GFX9-NEXT:    v_subbrev_co_u32_e64 v2, s[0:1], 0, v2, s[0:1]
10418; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v7
10419; GFX9-NEXT:    v_cndmask_b32_e64 v3, v4, v3, s[0:1]
10420; GFX9-NEXT:    v_mov_b32_e32 v4, s7
10421; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v4, v1, vcc
10422; GFX9-NEXT:    v_cmp_lt_u32_e32 vcc, s6, v1
10423; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, -1, vcc
10424; GFX9-NEXT:    v_cmp_lt_u32_e32 vcc, s10, v0
10425; GFX9-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s[0:1]
10426; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
10427; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, s8, v1
10428; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
10429; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
10430; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
10431; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
10432; GFX9-NEXT:    global_store_dwordx2 v5, v[0:1], s[4:5]
10433; GFX9-NEXT:    s_endpgm
10434;
10435; GFX90A-LABEL: urem_i64_oddk_denom:
10436; GFX90A:       ; %bb.0:
10437; GFX90A-NEXT:    v_mov_b32_e32 v0, 0x4f1761f8
10438; GFX90A-NEXT:    v_mov_b32_e32 v1, 0x4f800000
10439; GFX90A-NEXT:    v_madmk_f32 v0, v1, 0x438f8000, v0
10440; GFX90A-NEXT:    v_rcp_f32_e32 v0, v0
10441; GFX90A-NEXT:    s_movk_i32 s2, 0xfee0
10442; GFX90A-NEXT:    s_mov_b32 s3, 0x689e0837
10443; GFX90A-NEXT:    v_mov_b32_e32 v8, 0
10444; GFX90A-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
10445; GFX90A-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
10446; GFX90A-NEXT:    v_trunc_f32_e32 v1, v1
10447; GFX90A-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
10448; GFX90A-NEXT:    v_cvt_u32_f32_e32 v0, v0
10449; GFX90A-NEXT:    v_cvt_u32_f32_e32 v1, v1
10450; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
10451; GFX90A-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
10452; GFX90A-NEXT:    v_mul_lo_u32 v3, v0, s2
10453; GFX90A-NEXT:    v_mul_hi_u32 v4, v0, s3
10454; GFX90A-NEXT:    v_add_u32_e32 v3, v4, v3
10455; GFX90A-NEXT:    v_mul_lo_u32 v4, v1, s3
10456; GFX90A-NEXT:    v_add_u32_e32 v3, v3, v4
10457; GFX90A-NEXT:    v_mul_lo_u32 v6, v0, s3
10458; GFX90A-NEXT:    v_mul_lo_u32 v5, v0, v3
10459; GFX90A-NEXT:    v_mul_hi_u32 v7, v0, v6
10460; GFX90A-NEXT:    v_mul_hi_u32 v4, v0, v3
10461; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v7, v5
10462; GFX90A-NEXT:    v_addc_co_u32_e32 v4, vcc, v8, v4, vcc
10463; GFX90A-NEXT:    v_mul_hi_u32 v9, v1, v6
10464; GFX90A-NEXT:    v_mul_lo_u32 v6, v1, v6
10465; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v5, v6
10466; GFX90A-NEXT:    v_mul_hi_u32 v7, v1, v3
10467; GFX90A-NEXT:    v_addc_co_u32_e32 v4, vcc, v4, v9, vcc
10468; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v7, v2, vcc
10469; GFX90A-NEXT:    v_mul_lo_u32 v3, v1, v3
10470; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, v4, v3
10471; GFX90A-NEXT:    v_addc_co_u32_e32 v4, vcc, v8, v5, vcc
10472; GFX90A-NEXT:    v_add_co_u32_e64 v0, s[0:1], v0, v3
10473; GFX90A-NEXT:    v_addc_co_u32_e64 v3, vcc, v1, v4, s[0:1]
10474; GFX90A-NEXT:    v_mul_lo_u32 v6, v0, s2
10475; GFX90A-NEXT:    v_mul_hi_u32 v7, v0, s3
10476; GFX90A-NEXT:    v_mul_lo_u32 v5, v3, s3
10477; GFX90A-NEXT:    v_add_u32_e32 v6, v7, v6
10478; GFX90A-NEXT:    v_add_u32_e32 v5, v6, v5
10479; GFX90A-NEXT:    v_mul_lo_u32 v9, v0, s3
10480; GFX90A-NEXT:    v_mul_lo_u32 v7, v0, v5
10481; GFX90A-NEXT:    v_mul_hi_u32 v10, v0, v9
10482; GFX90A-NEXT:    v_mul_hi_u32 v6, v0, v5
10483; GFX90A-NEXT:    v_add_co_u32_e32 v7, vcc, v10, v7
10484; GFX90A-NEXT:    v_addc_co_u32_e32 v6, vcc, v8, v6, vcc
10485; GFX90A-NEXT:    v_mul_hi_u32 v11, v3, v9
10486; GFX90A-NEXT:    v_mul_lo_u32 v9, v3, v9
10487; GFX90A-NEXT:    v_add_co_u32_e32 v7, vcc, v7, v9
10488; GFX90A-NEXT:    v_mul_hi_u32 v10, v3, v5
10489; GFX90A-NEXT:    v_addc_co_u32_e32 v6, vcc, v6, v11, vcc
10490; GFX90A-NEXT:    v_addc_co_u32_e32 v7, vcc, v10, v2, vcc
10491; GFX90A-NEXT:    v_mul_lo_u32 v3, v3, v5
10492; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, v6, v3
10493; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v8, v7, vcc
10494; GFX90A-NEXT:    v_add_u32_e32 v1, v1, v4
10495; GFX90A-NEXT:    v_addc_co_u32_e64 v1, vcc, v1, v5, s[0:1]
10496; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v3
10497; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
10498; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
10499; GFX90A-NEXT:    v_mul_lo_u32 v4, s6, v1
10500; GFX90A-NEXT:    v_mul_hi_u32 v5, s6, v0
10501; GFX90A-NEXT:    v_mul_hi_u32 v3, s6, v1
10502; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, v5, v4
10503; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, v8, v3, vcc
10504; GFX90A-NEXT:    v_mul_hi_u32 v6, s7, v0
10505; GFX90A-NEXT:    v_mul_lo_u32 v0, s7, v0
10506; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v4, v0
10507; GFX90A-NEXT:    v_mul_hi_u32 v5, s7, v1
10508; GFX90A-NEXT:    v_addc_co_u32_e32 v0, vcc, v3, v6, vcc
10509; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, v5, v2, vcc
10510; GFX90A-NEXT:    v_mul_lo_u32 v1, s7, v1
10511; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v1
10512; GFX90A-NEXT:    s_movk_i32 s8, 0x11f
10513; GFX90A-NEXT:    s_mov_b32 s9, 0x9761f7c9
10514; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, v8, v3, vcc
10515; GFX90A-NEXT:    v_mul_lo_u32 v3, v0, s8
10516; GFX90A-NEXT:    v_mul_hi_u32 v4, v0, s9
10517; GFX90A-NEXT:    v_add_u32_e32 v3, v4, v3
10518; GFX90A-NEXT:    v_mul_lo_u32 v1, v1, s9
10519; GFX90A-NEXT:    v_add_u32_e32 v1, v3, v1
10520; GFX90A-NEXT:    v_mul_lo_u32 v0, v0, s9
10521; GFX90A-NEXT:    v_sub_u32_e32 v3, s7, v1
10522; GFX90A-NEXT:    v_mov_b32_e32 v4, s8
10523; GFX90A-NEXT:    v_sub_co_u32_e32 v0, vcc, s6, v0
10524; GFX90A-NEXT:    v_subb_co_u32_e64 v3, s[0:1], v3, v4, vcc
10525; GFX90A-NEXT:    v_subrev_co_u32_e64 v5, s[0:1], s9, v0
10526; GFX90A-NEXT:    v_subbrev_co_u32_e64 v6, s[2:3], 0, v3, s[0:1]
10527; GFX90A-NEXT:    s_movk_i32 s6, 0x11e
10528; GFX90A-NEXT:    v_cmp_lt_u32_e64 s[2:3], s6, v6
10529; GFX90A-NEXT:    s_mov_b32 s10, 0x9761f7c8
10530; GFX90A-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[2:3]
10531; GFX90A-NEXT:    v_cmp_lt_u32_e64 s[2:3], s10, v5
10532; GFX90A-NEXT:    v_subb_co_u32_e64 v3, s[0:1], v3, v4, s[0:1]
10533; GFX90A-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[2:3]
10534; GFX90A-NEXT:    v_cmp_eq_u32_e64 s[2:3], s8, v6
10535; GFX90A-NEXT:    v_subrev_co_u32_e64 v4, s[0:1], s9, v5
10536; GFX90A-NEXT:    v_cndmask_b32_e64 v7, v7, v8, s[2:3]
10537; GFX90A-NEXT:    v_subbrev_co_u32_e64 v3, s[0:1], 0, v3, s[0:1]
10538; GFX90A-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v7
10539; GFX90A-NEXT:    v_cndmask_b32_e64 v4, v5, v4, s[0:1]
10540; GFX90A-NEXT:    v_mov_b32_e32 v5, s7
10541; GFX90A-NEXT:    v_subb_co_u32_e32 v1, vcc, v5, v1, vcc
10542; GFX90A-NEXT:    v_cmp_lt_u32_e32 vcc, s6, v1
10543; GFX90A-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
10544; GFX90A-NEXT:    v_cmp_lt_u32_e32 vcc, s10, v0
10545; GFX90A-NEXT:    v_cndmask_b32_e64 v3, v6, v3, s[0:1]
10546; GFX90A-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
10547; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, s8, v1
10548; GFX90A-NEXT:    v_cndmask_b32_e32 v5, v5, v6, vcc
10549; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
10550; GFX90A-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
10551; GFX90A-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
10552; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
10553; GFX90A-NEXT:    s_endpgm
10554  %r = urem i64 %x, 1235195393993
10555  store i64 %r, i64 addrspace(1)* %out
10556  ret void
10557}
10558
; Test: unsigned 64-bit remainder of %x by the constant power of two 4096,
; result stored to %out.
; The opt-level assertions show the urem left unchanged by the IR pass.
; The llc assertions show the remainder reduced to a single s_and_b32 with
; 0xfff (4096 - 1); the other dword of the stored 64-bit value is a
; constant 0 (v1).
; The assertion lines are autogenerated; regenerate them with the update
; scripts named at the top of the file rather than editing them by hand.
10559define amdgpu_kernel void @urem_i64_pow2k_denom(i64 addrspace(1)* %out, i64 %x) {
10560; CHECK-LABEL: @urem_i64_pow2k_denom(
10561; CHECK-NEXT:    [[R:%.*]] = urem i64 [[X:%.*]], 4096
10562; CHECK-NEXT:    store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4
10563; CHECK-NEXT:    ret void
10564;
10565; GFX6-LABEL: urem_i64_pow2k_denom:
10566; GFX6:       ; %bb.0:
10567; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
10568; GFX6-NEXT:    s_mov_b32 s3, 0xf000
10569; GFX6-NEXT:    s_mov_b32 s2, -1
10570; GFX6-NEXT:    v_mov_b32_e32 v1, 0
10571; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
10572; GFX6-NEXT:    s_mov_b32 s0, s4
10573; GFX6-NEXT:    s_and_b32 s4, s6, 0xfff
10574; GFX6-NEXT:    s_mov_b32 s1, s5
10575; GFX6-NEXT:    v_mov_b32_e32 v0, s4
10576; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
10577; GFX6-NEXT:    s_endpgm
10578;
10579; GFX9-LABEL: urem_i64_pow2k_denom:
10580; GFX9:       ; %bb.0:
10581; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
10582; GFX9-NEXT:    v_mov_b32_e32 v1, 0
10583; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
10584; GFX9-NEXT:    s_and_b32 s2, s2, 0xfff
10585; GFX9-NEXT:    v_mov_b32_e32 v0, s2
10586; GFX9-NEXT:    global_store_dwordx2 v1, v[0:1], s[0:1]
10587; GFX9-NEXT:    s_endpgm
10588;
10589; GFX90A-LABEL: urem_i64_pow2k_denom:
10590; GFX90A:       ; %bb.0:
10591; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
10592; GFX90A-NEXT:    v_mov_b32_e32 v1, 0
10593; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
10594; GFX90A-NEXT:    s_and_b32 s2, s2, 0xfff
10595; GFX90A-NEXT:    v_mov_b32_e32 v0, s2
10596; GFX90A-NEXT:    global_store_dwordx2 v1, v[0:1], s[0:1]
10597; GFX90A-NEXT:    s_endpgm
10598  %r = urem i64 %x, 4096
10599  store i64 %r, i64 addrspace(1)* %out
10600  ret void
10601}
10602
; Test: unsigned 64-bit remainder of %x by (4096 << %y), result stored to
; %out.
; The opt-level assertions show the shl and urem left unchanged by the IR
; pass.
; The llc assertions show the remainder computed as a mask: 0x1000 is shifted
; left with s_lshl_b64, -1 is added across both dwords
; (s_add_u32 / s_addc_u32), and the result is ANDed with the dividend via
; s_and_b64.
; The assertion lines are autogenerated; regenerate them with the update
; scripts named at the top of the file rather than editing them by hand.
10603define amdgpu_kernel void @urem_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %x, i64 %y) {
10604; CHECK-LABEL: @urem_i64_pow2_shl_denom(
10605; CHECK-NEXT:    [[SHL_Y:%.*]] = shl i64 4096, [[Y:%.*]]
10606; CHECK-NEXT:    [[R:%.*]] = urem i64 [[X:%.*]], [[SHL_Y]]
10607; CHECK-NEXT:    store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4
10608; CHECK-NEXT:    ret void
10609;
10610; GFX6-LABEL: urem_i64_pow2_shl_denom:
10611; GFX6:       ; %bb.0:
10612; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
10613; GFX6-NEXT:    s_load_dword s8, s[0:1], 0xd
10614; GFX6-NEXT:    s_mov_b32 s3, 0xf000
10615; GFX6-NEXT:    s_mov_b32 s2, -1
10616; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
10617; GFX6-NEXT:    s_mov_b32 s0, s4
10618; GFX6-NEXT:    s_mov_b32 s1, s5
10619; GFX6-NEXT:    s_mov_b64 s[4:5], 0x1000
10620; GFX6-NEXT:    s_lshl_b64 s[4:5], s[4:5], s8
10621; GFX6-NEXT:    s_add_u32 s4, s4, -1
10622; GFX6-NEXT:    s_addc_u32 s5, s5, -1
10623; GFX6-NEXT:    s_and_b64 s[4:5], s[6:7], s[4:5]
10624; GFX6-NEXT:    v_mov_b32_e32 v0, s4
10625; GFX6-NEXT:    v_mov_b32_e32 v1, s5
10626; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
10627; GFX6-NEXT:    s_endpgm
10628;
10629; GFX9-LABEL: urem_i64_pow2_shl_denom:
10630; GFX9:       ; %bb.0:
10631; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
10632; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x34
10633; GFX9-NEXT:    s_mov_b64 s[0:1], 0x1000
10634; GFX9-NEXT:    v_mov_b32_e32 v2, 0
10635; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
10636; GFX9-NEXT:    s_lshl_b64 s[0:1], s[0:1], s2
10637; GFX9-NEXT:    s_add_u32 s0, s0, -1
10638; GFX9-NEXT:    s_addc_u32 s1, s1, -1
10639; GFX9-NEXT:    s_and_b64 s[0:1], s[6:7], s[0:1]
10640; GFX9-NEXT:    v_mov_b32_e32 v0, s0
10641; GFX9-NEXT:    v_mov_b32_e32 v1, s1
10642; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
10643; GFX9-NEXT:    s_endpgm
10644;
10645; GFX90A-LABEL: urem_i64_pow2_shl_denom:
10646; GFX90A:       ; %bb.0:
10647; GFX90A-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
10648; GFX90A-NEXT:    s_load_dword s2, s[0:1], 0x34
10649; GFX90A-NEXT:    s_mov_b64 s[0:1], 0x1000
10650; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
10651; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
10652; GFX90A-NEXT:    s_lshl_b64 s[0:1], s[0:1], s2
10653; GFX90A-NEXT:    s_add_u32 s0, s0, -1
10654; GFX90A-NEXT:    s_addc_u32 s1, s1, -1
10655; GFX90A-NEXT:    s_and_b64 s[0:1], s[6:7], s[0:1]
10656; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
10657; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
10658; GFX90A-NEXT:    s_endpgm
10659  %shl.y = shl i64 4096, %y
10660  %r = urem i64 %x, %shl.y
10661  store i64 %r, i64 addrspace(1)* %out
10662  ret void
10663}
10664
; Test: <2 x i64> unsigned remainder of %x by the splat constant 4096, result
; stored to %out.
; The opt-level assertions show the vector urem split into one scalar
; i64 urem-by-4096 per element (extractelement / urem / insertelement).
; The llc assertions show one s_and_b32 with 0xfff (4096 - 1) per element;
; the remaining two dwords of the stored vector are constant 0
; (v1, and v3 copied from v1).
; The assertion lines are autogenerated; regenerate them with the update
; scripts named at the top of the file rather than editing them by hand.
10665define amdgpu_kernel void @urem_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) {
10666; CHECK-LABEL: @urem_v2i64_pow2k_denom(
10667; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0
10668; CHECK-NEXT:    [[TMP2:%.*]] = urem i64 [[TMP1]], 4096
10669; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i64 0
10670; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1
10671; CHECK-NEXT:    [[TMP5:%.*]] = urem i64 [[TMP4]], 4096
10672; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1
10673; CHECK-NEXT:    store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16
10674; CHECK-NEXT:    ret void
10675;
10676; GFX6-LABEL: urem_v2i64_pow2k_denom:
10677; GFX6:       ; %bb.0:
10678; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
10679; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xd
10680; GFX6-NEXT:    s_movk_i32 s8, 0xfff
10681; GFX6-NEXT:    v_mov_b32_e32 v1, 0
10682; GFX6-NEXT:    s_mov_b32 s7, 0xf000
10683; GFX6-NEXT:    s_mov_b32 s6, -1
10684; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
10685; GFX6-NEXT:    s_and_b32 s0, s0, s8
10686; GFX6-NEXT:    s_and_b32 s1, s2, s8
10687; GFX6-NEXT:    v_mov_b32_e32 v0, s0
10688; GFX6-NEXT:    v_mov_b32_e32 v2, s1
10689; GFX6-NEXT:    v_mov_b32_e32 v3, v1
10690; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
10691; GFX6-NEXT:    s_endpgm
10692;
10693; GFX9-LABEL: urem_v2i64_pow2k_denom:
10694; GFX9:       ; %bb.0:
10695; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
10696; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
10697; GFX9-NEXT:    s_movk_i32 s0, 0xfff
10698; GFX9-NEXT:    v_mov_b32_e32 v1, 0
10699; GFX9-NEXT:    v_mov_b32_e32 v3, v1
10700; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
10701; GFX9-NEXT:    s_and_b32 s1, s4, s0
10702; GFX9-NEXT:    s_and_b32 s0, s6, s0
10703; GFX9-NEXT:    v_mov_b32_e32 v0, s1
10704; GFX9-NEXT:    v_mov_b32_e32 v2, s0
10705; GFX9-NEXT:    global_store_dwordx4 v1, v[0:3], s[2:3]
10706; GFX9-NEXT:    s_endpgm
10707;
10708; GFX90A-LABEL: urem_v2i64_pow2k_denom:
10709; GFX90A:       ; %bb.0:
10710; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
10711; GFX90A-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
10712; GFX90A-NEXT:    s_movk_i32 s0, 0xfff
10713; GFX90A-NEXT:    v_mov_b32_e32 v1, 0
10714; GFX90A-NEXT:    v_mov_b32_e32 v3, v1
10715; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
10716; GFX90A-NEXT:    s_and_b32 s1, s4, s0
10717; GFX90A-NEXT:    s_and_b32 s0, s6, s0
10718; GFX90A-NEXT:    v_mov_b32_e32 v0, s1
10719; GFX90A-NEXT:    v_mov_b32_e32 v2, s0
10720; GFX90A-NEXT:    global_store_dwordx4 v1, v[0:3], s[2:3]
10721; GFX90A-NEXT:    s_endpgm
10722  %r = urem <2 x i64> %x, <i64 4096, i64 4096>
10723  store <2 x i64> %r, <2 x i64> addrspace(1)* %out
10724  ret void
10725}
10726
; Test: <2 x i64> unsigned remainder of %x by (4096 << %y), result stored to
; %out.
; The opt-level assertions show the vector urem split into one scalar
; i64 urem per element (extractelement / urem / insertelement) while the
; vector shl is kept.
; The llc assertions show each element computed as a mask: 0x1000 is shifted
; left with s_lshl_b64, -1 is added across both dwords
; (s_add_u32 / s_addc_u32), and the result is ANDed with the matching
; dividend element via s_and_b64.
; The assertion lines are autogenerated; regenerate them with the update
; scripts named at the top of the file rather than editing them by hand.
10727define amdgpu_kernel void @urem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x, <2 x i64> %y) {
10728; CHECK-LABEL: @urem_v2i64_pow2_shl_denom(
10729; CHECK-NEXT:    [[SHL_Y:%.*]] = shl <2 x i64> <i64 4096, i64 4096>, [[Y:%.*]]
10730; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0
10731; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 0
10732; CHECK-NEXT:    [[TMP3:%.*]] = urem i64 [[TMP1]], [[TMP2]]
10733; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x i64> undef, i64 [[TMP3]], i64 0
10734; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[X]], i64 1
10735; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 1
10736; CHECK-NEXT:    [[TMP7:%.*]] = urem i64 [[TMP5]], [[TMP6]]
10737; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[TMP7]], i64 1
10738; CHECK-NEXT:    store <2 x i64> [[TMP8]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16
10739; CHECK-NEXT:    ret void
10740;
10741; GFX6-LABEL: urem_v2i64_pow2_shl_denom:
10742; GFX6:       ; %bb.0:
10743; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
10744; GFX6-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0xd
10745; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x11
10746; GFX6-NEXT:    s_mov_b64 s[12:13], 0x1000
10747; GFX6-NEXT:    s_mov_b32 s7, 0xf000
10748; GFX6-NEXT:    s_mov_b32 s6, -1
10749; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
10750; GFX6-NEXT:    s_lshl_b64 s[2:3], s[12:13], s2
10751; GFX6-NEXT:    s_lshl_b64 s[0:1], s[12:13], s0
10752; GFX6-NEXT:    s_add_u32 s0, s0, -1
10753; GFX6-NEXT:    s_addc_u32 s1, s1, -1
10754; GFX6-NEXT:    s_and_b64 s[0:1], s[8:9], s[0:1]
10755; GFX6-NEXT:    s_add_u32 s2, s2, -1
10756; GFX6-NEXT:    s_addc_u32 s3, s3, -1
10757; GFX6-NEXT:    s_and_b64 s[2:3], s[10:11], s[2:3]
10758; GFX6-NEXT:    v_mov_b32_e32 v0, s0
10759; GFX6-NEXT:    v_mov_b32_e32 v1, s1
10760; GFX6-NEXT:    v_mov_b32_e32 v2, s2
10761; GFX6-NEXT:    v_mov_b32_e32 v3, s3
10762; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
10763; GFX6-NEXT:    s_endpgm
10764;
10765; GFX9-LABEL: urem_v2i64_pow2_shl_denom:
10766; GFX9:       ; %bb.0:
10767; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
10768; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
10769; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x44
10770; GFX9-NEXT:    s_mov_b64 s[0:1], 0x1000
10771; GFX9-NEXT:    v_mov_b32_e32 v4, 0
10772; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
10773; GFX9-NEXT:    s_lshl_b64 s[10:11], s[0:1], s10
10774; GFX9-NEXT:    s_lshl_b64 s[0:1], s[0:1], s8
10775; GFX9-NEXT:    s_add_u32 s0, s0, -1
10776; GFX9-NEXT:    s_addc_u32 s1, s1, -1
10777; GFX9-NEXT:    s_and_b64 s[0:1], s[4:5], s[0:1]
10778; GFX9-NEXT:    s_add_u32 s4, s10, -1
10779; GFX9-NEXT:    s_addc_u32 s5, s11, -1
10780; GFX9-NEXT:    s_and_b64 s[4:5], s[6:7], s[4:5]
10781; GFX9-NEXT:    v_mov_b32_e32 v0, s0
10782; GFX9-NEXT:    v_mov_b32_e32 v1, s1
10783; GFX9-NEXT:    v_mov_b32_e32 v2, s4
10784; GFX9-NEXT:    v_mov_b32_e32 v3, s5
10785; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
10786; GFX9-NEXT:    s_endpgm
10787;
10788; GFX90A-LABEL: urem_v2i64_pow2_shl_denom:
10789; GFX90A:       ; %bb.0:
10790; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
10791; GFX90A-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
10792; GFX90A-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x44
10793; GFX90A-NEXT:    s_mov_b64 s[0:1], 0x1000
10794; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
10795; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
10796; GFX90A-NEXT:    s_lshl_b64 s[10:11], s[0:1], s10
10797; GFX90A-NEXT:    s_lshl_b64 s[0:1], s[0:1], s8
10798; GFX90A-NEXT:    s_add_u32 s0, s0, -1
10799; GFX90A-NEXT:    s_addc_u32 s1, s1, -1
10800; GFX90A-NEXT:    s_and_b64 s[0:1], s[4:5], s[0:1]
10801; GFX90A-NEXT:    s_add_u32 s4, s10, -1
10802; GFX90A-NEXT:    s_addc_u32 s5, s11, -1
10803; GFX90A-NEXT:    s_and_b64 s[4:5], s[6:7], s[4:5]
10804; GFX90A-NEXT:    v_mov_b32_e32 v0, s0
10805; GFX90A-NEXT:    v_mov_b32_e32 v1, s1
10806; GFX90A-NEXT:    v_mov_b32_e32 v2, s4
10807; GFX90A-NEXT:    v_mov_b32_e32 v3, s5
10808; GFX90A-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
10809; GFX90A-NEXT:    s_endpgm
10810  %shl.y = shl <2 x i64> <i64 4096, i64 4096>, %y
10811  %r = urem <2 x i64> %x, %shl.y
10812  store <2 x i64> %r, <2 x i64> addrspace(1)* %out
10813  ret void
10814}
10815
; sdiv_i64_oddk_denom: signed 64-bit division of the kernel argument %x by the
; constant 1235195 (0x12d8fb); the quotient is stored to %out.
;
; What the assertions below pin down:
;  - opt run (-amdgpu-codegenprepare): the sdiv is expected to stay a plain
;    i64 sdiv by the constant.
;  - llc runs (tahiti, gfx900, gfx90a): the division is expanded inline. Each
;    sequence starts from a v_rcp_f32 reciprocal estimate, continues with
;    32-bit mul_lo/mul_hi and add-with-carry chains, and ends with unsigned
;    compares feeding v_cndmask selects. The final v_xor/v_sub pair uses the
;    mask produced by "s_ashr_i32 ..., 31" on the high dword of %x.
;  - Constants visible in the expected code: 0x4996c7d8 is 1235195.0 as an
;    f32 bit pattern, 0xffed2705 is the low 32 bits of -1235195, and 0x12d8fa
;    is 1235195 - 1.
; NOTE(review): "oddk" presumably stands for "odd constant" denominator and the
; mul/add rounds look like reciprocal refinement steps - confirm against the
; lowering code before relying on either reading.
; The assertion lines are autogenerated (see the NOTE lines at the top of this
; file); regenerate them with the update scripts instead of editing by hand.
10816define amdgpu_kernel void @sdiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
10817; CHECK-LABEL: @sdiv_i64_oddk_denom(
10818; CHECK-NEXT:    [[R:%.*]] = sdiv i64 [[X:%.*]], 1235195
10819; CHECK-NEXT:    store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4
10820; CHECK-NEXT:    ret void
10821;
10822; GFX6-LABEL: sdiv_i64_oddk_denom:
10823; GFX6:       ; %bb.0:
10824; GFX6-NEXT:    v_mov_b32_e32 v0, 0x4f800000
10825; GFX6-NEXT:    v_madak_f32 v0, 0, v0, 0x4996c7d8
10826; GFX6-NEXT:    v_rcp_f32_e32 v0, v0
10827; GFX6-NEXT:    s_mov_b32 s2, 0xffed2705
10828; GFX6-NEXT:    v_mov_b32_e32 v8, 0
10829; GFX6-NEXT:    v_mov_b32_e32 v7, 0
10830; GFX6-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
10831; GFX6-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
10832; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
10833; GFX6-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
10834; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
10835; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
10836; GFX6-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x9
10837; GFX6-NEXT:    s_mov_b32 s7, 0xf000
10838; GFX6-NEXT:    v_mul_lo_u32 v2, v1, s2
10839; GFX6-NEXT:    v_mul_hi_u32 v3, v0, s2
10840; GFX6-NEXT:    v_mul_lo_u32 v4, v0, s2
10841; GFX6-NEXT:    s_mov_b32 s6, -1
10842; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
10843; GFX6-NEXT:    s_mov_b32 s4, s8
10844; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
10845; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, v0, v2
10846; GFX6-NEXT:    v_mul_lo_u32 v5, v0, v2
10847; GFX6-NEXT:    v_mul_hi_u32 v6, v0, v4
10848; GFX6-NEXT:    v_mul_hi_u32 v3, v0, v2
10849; GFX6-NEXT:    v_mul_hi_u32 v9, v1, v2
10850; GFX6-NEXT:    v_mul_lo_u32 v2, v1, v2
10851; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
10852; GFX6-NEXT:    v_mul_lo_u32 v6, v1, v4
10853; GFX6-NEXT:    v_mul_hi_u32 v4, v1, v4
10854; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v8, v3, vcc
10855; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
10856; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v3, v4, vcc
10857; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, v9, v7, vcc
10858; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
10859; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v8, v4, vcc
10860; GFX6-NEXT:    v_add_i32_e64 v0, s[0:1], v0, v2
10861; GFX6-NEXT:    v_addc_u32_e64 v2, vcc, v1, v3, s[0:1]
10862; GFX6-NEXT:    v_mul_lo_u32 v4, v2, s2
10863; GFX6-NEXT:    v_mul_hi_u32 v5, v0, s2
10864; GFX6-NEXT:    s_mov_b32 s5, s9
10865; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
10866; GFX6-NEXT:    v_mul_lo_u32 v5, v0, s2
10867; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, v0, v4
10868; GFX6-NEXT:    v_mul_lo_u32 v10, v0, v4
10869; GFX6-NEXT:    v_mul_hi_u32 v11, v0, v5
10870; GFX6-NEXT:    v_mul_hi_u32 v12, v0, v4
10871; GFX6-NEXT:    v_mul_hi_u32 v9, v2, v5
10872; GFX6-NEXT:    v_mul_lo_u32 v5, v2, v5
10873; GFX6-NEXT:    v_mul_hi_u32 v6, v2, v4
10874; GFX6-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
10875; GFX6-NEXT:    v_addc_u32_e32 v11, vcc, v8, v12, vcc
10876; GFX6-NEXT:    v_mul_lo_u32 v2, v2, v4
10877; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v10, v5
10878; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v11, v9, vcc
10879; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, v6, v7, vcc
10880; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
10881; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, v8, v4, vcc
10882; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
10883; GFX6-NEXT:    s_ashr_i32 s2, s11, 31
10884; GFX6-NEXT:    v_addc_u32_e64 v1, vcc, v1, v4, s[0:1]
10885; GFX6-NEXT:    s_add_u32 s0, s10, s2
10886; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
10887; GFX6-NEXT:    s_mov_b32 s3, s2
10888; GFX6-NEXT:    s_addc_u32 s1, s11, s2
10889; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
10890; GFX6-NEXT:    s_xor_b64 s[0:1], s[0:1], s[2:3]
10891; GFX6-NEXT:    v_mul_lo_u32 v2, s0, v1
10892; GFX6-NEXT:    v_mul_hi_u32 v3, s0, v0
10893; GFX6-NEXT:    v_mul_hi_u32 v4, s0, v1
10894; GFX6-NEXT:    v_mul_hi_u32 v5, s1, v1
10895; GFX6-NEXT:    v_mul_lo_u32 v1, s1, v1
10896; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
10897; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v8, v4, vcc
10898; GFX6-NEXT:    v_mul_lo_u32 v4, s1, v0
10899; GFX6-NEXT:    v_mul_hi_u32 v0, s1, v0
10900; GFX6-NEXT:    s_mov_b32 s3, 0x12d8fb
10901; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
10902; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, v3, v0, vcc
10903; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, v5, v7, vcc
10904; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
10905; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v8, v2, vcc
10906; GFX6-NEXT:    v_mul_lo_u32 v4, v1, s3
10907; GFX6-NEXT:    v_mul_hi_u32 v5, v0, s3
10908; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 2, v0
10909; GFX6-NEXT:    v_mul_lo_u32 v8, v0, s3
10910; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
10911; GFX6-NEXT:    v_add_i32_e32 v6, vcc, 1, v0
10912; GFX6-NEXT:    v_addc_u32_e32 v7, vcc, 0, v1, vcc
10913; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
10914; GFX6-NEXT:    v_mov_b32_e32 v5, s1
10915; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, s0, v8
10916; GFX6-NEXT:    v_subb_u32_e32 v4, vcc, v5, v4, vcc
10917; GFX6-NEXT:    v_subrev_i32_e32 v5, vcc, s3, v8
10918; GFX6-NEXT:    v_subbrev_u32_e32 v9, vcc, 0, v4, vcc
10919; GFX6-NEXT:    s_mov_b32 s0, 0x12d8fa
10920; GFX6-NEXT:    v_cmp_lt_u32_e32 vcc, s0, v5
10921; GFX6-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
10922; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v9
10923; GFX6-NEXT:    v_cndmask_b32_e32 v5, -1, v5, vcc
10924; GFX6-NEXT:    v_cmp_lt_u32_e64 s[0:1], s0, v8
10925; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
10926; GFX6-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[0:1]
10927; GFX6-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v4
10928; GFX6-NEXT:    v_cndmask_b32_e64 v4, -1, v5, s[0:1]
10929; GFX6-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v4
10930; GFX6-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc
10931; GFX6-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
10932; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
10933; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
10934; GFX6-NEXT:    v_xor_b32_e32 v0, s2, v0
10935; GFX6-NEXT:    v_xor_b32_e32 v1, s2, v1
10936; GFX6-NEXT:    v_mov_b32_e32 v2, s2
10937; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s2, v0
10938; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v1, v2, vcc
10939; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
10940; GFX6-NEXT:    s_endpgm
10941;
10942; GFX9-LABEL: sdiv_i64_oddk_denom:
10943; GFX9:       ; %bb.0:
10944; GFX9-NEXT:    v_mov_b32_e32 v0, 0x4f800000
10945; GFX9-NEXT:    v_madak_f32 v0, 0, v0, 0x4996c7d8
10946; GFX9-NEXT:    v_rcp_f32_e32 v0, v0
10947; GFX9-NEXT:    s_mov_b32 s8, 0xffed2705
10948; GFX9-NEXT:    v_mov_b32_e32 v7, 0
10949; GFX9-NEXT:    v_mov_b32_e32 v5, 0
10950; GFX9-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
10951; GFX9-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
10952; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
10953; GFX9-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
10954; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
10955; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
10956; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
10957; GFX9-NEXT:    v_mul_lo_u32 v2, v1, s8
10958; GFX9-NEXT:    v_mul_hi_u32 v3, v0, s8
10959; GFX9-NEXT:    v_mul_lo_u32 v4, v0, s8
10960; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
10961; GFX9-NEXT:    s_ashr_i32 s0, s7, 31
10962; GFX9-NEXT:    s_mov_b32 s1, s0
10963; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
10964; GFX9-NEXT:    v_sub_u32_e32 v2, v2, v0
10965; GFX9-NEXT:    v_mul_hi_u32 v3, v0, v4
10966; GFX9-NEXT:    v_mul_lo_u32 v6, v0, v2
10967; GFX9-NEXT:    v_mul_hi_u32 v8, v0, v2
10968; GFX9-NEXT:    v_mul_hi_u32 v9, v1, v2
10969; GFX9-NEXT:    v_mul_lo_u32 v2, v1, v2
10970; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v6
10971; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, v7, v8, vcc
10972; GFX9-NEXT:    v_mul_lo_u32 v8, v1, v4
10973; GFX9-NEXT:    v_mul_hi_u32 v4, v1, v4
10974; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v8
10975; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v6, v4, vcc
10976; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v9, v5, vcc
10977; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
10978; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v7, v4, vcc
10979; GFX9-NEXT:    v_add_co_u32_e64 v0, s[2:3], v0, v2
10980; GFX9-NEXT:    v_addc_co_u32_e64 v2, vcc, v1, v3, s[2:3]
10981; GFX9-NEXT:    v_mul_lo_u32 v4, v2, s8
10982; GFX9-NEXT:    v_mul_hi_u32 v6, v0, s8
10983; GFX9-NEXT:    v_mul_lo_u32 v8, v0, s8
10984; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
10985; GFX9-NEXT:    v_add_u32_e32 v4, v6, v4
10986; GFX9-NEXT:    v_sub_u32_e32 v4, v4, v0
10987; GFX9-NEXT:    v_mul_lo_u32 v10, v0, v4
10988; GFX9-NEXT:    v_mul_hi_u32 v11, v0, v8
10989; GFX9-NEXT:    v_mul_hi_u32 v12, v0, v4
10990; GFX9-NEXT:    v_mul_hi_u32 v9, v2, v8
10991; GFX9-NEXT:    v_mul_lo_u32 v8, v2, v8
10992; GFX9-NEXT:    v_mul_hi_u32 v6, v2, v4
10993; GFX9-NEXT:    v_add_co_u32_e32 v10, vcc, v11, v10
10994; GFX9-NEXT:    v_addc_co_u32_e32 v11, vcc, v7, v12, vcc
10995; GFX9-NEXT:    v_mul_lo_u32 v2, v2, v4
10996; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, v10, v8
10997; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, v11, v9, vcc
10998; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v6, v5, vcc
10999; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v8, v2
11000; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v7, v4, vcc
11001; GFX9-NEXT:    v_addc_co_u32_e64 v1, vcc, v1, v4, s[2:3]
11002; GFX9-NEXT:    s_add_u32 s2, s6, s0
11003; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
11004; GFX9-NEXT:    s_addc_u32 s3, s7, s0
11005; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
11006; GFX9-NEXT:    s_xor_b64 s[2:3], s[2:3], s[0:1]
11007; GFX9-NEXT:    v_mul_lo_u32 v2, s2, v1
11008; GFX9-NEXT:    v_mul_hi_u32 v3, s2, v0
11009; GFX9-NEXT:    v_mul_hi_u32 v4, s2, v1
11010; GFX9-NEXT:    v_mul_hi_u32 v6, s3, v1
11011; GFX9-NEXT:    v_mul_lo_u32 v1, s3, v1
11012; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
11013; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v7, v4, vcc
11014; GFX9-NEXT:    v_mul_lo_u32 v4, s3, v0
11015; GFX9-NEXT:    v_mul_hi_u32 v0, s3, v0
11016; GFX9-NEXT:    s_mov_b32 s1, 0x12d8fb
11017; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
11018; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, v3, v0, vcc
11019; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, v6, v5, vcc
11020; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v1
11021; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v7, v2, vcc
11022; GFX9-NEXT:    v_mul_lo_u32 v2, v1, s1
11023; GFX9-NEXT:    v_mul_hi_u32 v3, v0, s1
11024; GFX9-NEXT:    v_mul_lo_u32 v4, v0, s1
11025; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
11026; GFX9-NEXT:    v_mov_b32_e32 v3, s3
11027; GFX9-NEXT:    v_sub_co_u32_e32 v4, vcc, s2, v4
11028; GFX9-NEXT:    v_subb_co_u32_e32 v2, vcc, v3, v2, vcc
11029; GFX9-NEXT:    v_subrev_co_u32_e32 v3, vcc, s1, v4
11030; GFX9-NEXT:    v_subbrev_co_u32_e32 v6, vcc, 0, v2, vcc
11031; GFX9-NEXT:    s_mov_b32 s1, 0x12d8fa
11032; GFX9-NEXT:    v_cmp_lt_u32_e32 vcc, s1, v3
11033; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
11034; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v6
11035; GFX9-NEXT:    v_cndmask_b32_e32 v3, -1, v3, vcc
11036; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
11037; GFX9-NEXT:    v_cndmask_b32_e64 v3, 1, 2, vcc
11038; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v0, v3
11039; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, 0, v1, vcc
11040; GFX9-NEXT:    v_cmp_lt_u32_e32 vcc, s1, v4
11041; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, -1, vcc
11042; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
11043; GFX9-NEXT:    v_cndmask_b32_e32 v2, -1, v4, vcc
11044; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
11045; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
11046; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
11047; GFX9-NEXT:    v_xor_b32_e32 v0, s0, v0
11048; GFX9-NEXT:    v_xor_b32_e32 v1, s0, v1
11049; GFX9-NEXT:    v_mov_b32_e32 v2, s0
11050; GFX9-NEXT:    v_subrev_co_u32_e32 v0, vcc, s0, v0
11051; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v2, vcc
11052; GFX9-NEXT:    global_store_dwordx2 v5, v[0:1], s[4:5]
11053; GFX9-NEXT:    s_endpgm
11054;
11055; GFX90A-LABEL: sdiv_i64_oddk_denom:
11056; GFX90A:       ; %bb.0:
11057; GFX90A-NEXT:    v_mov_b32_e32 v0, 0x4f800000
11058; GFX90A-NEXT:    v_madak_f32 v0, 0, v0, 0x4996c7d8
11059; GFX90A-NEXT:    v_rcp_f32_e32 v0, v0
11060; GFX90A-NEXT:    s_mov_b32 s2, 0xffed2705
11061; GFX90A-NEXT:    v_mov_b32_e32 v8, 0
11062; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
11063; GFX90A-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
11064; GFX90A-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
11065; GFX90A-NEXT:    v_trunc_f32_e32 v1, v1
11066; GFX90A-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
11067; GFX90A-NEXT:    v_cvt_u32_f32_e32 v1, v1
11068; GFX90A-NEXT:    v_cvt_u32_f32_e32 v0, v0
11069; GFX90A-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
11070; GFX90A-NEXT:    v_mul_lo_u32 v3, v1, s2
11071; GFX90A-NEXT:    v_mul_hi_u32 v4, v0, s2
11072; GFX90A-NEXT:    v_add_u32_e32 v3, v4, v3
11073; GFX90A-NEXT:    v_sub_u32_e32 v3, v3, v0
11074; GFX90A-NEXT:    v_mul_lo_u32 v6, v0, s2
11075; GFX90A-NEXT:    v_mul_lo_u32 v5, v0, v3
11076; GFX90A-NEXT:    v_mul_hi_u32 v7, v0, v6
11077; GFX90A-NEXT:    v_mul_hi_u32 v4, v0, v3
11078; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v7, v5
11079; GFX90A-NEXT:    v_addc_co_u32_e32 v4, vcc, v8, v4, vcc
11080; GFX90A-NEXT:    v_mul_hi_u32 v9, v1, v6
11081; GFX90A-NEXT:    v_mul_lo_u32 v6, v1, v6
11082; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v5, v6
11083; GFX90A-NEXT:    v_mul_hi_u32 v7, v1, v3
11084; GFX90A-NEXT:    v_addc_co_u32_e32 v4, vcc, v4, v9, vcc
11085; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v7, v2, vcc
11086; GFX90A-NEXT:    v_mul_lo_u32 v3, v1, v3
11087; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, v4, v3
11088; GFX90A-NEXT:    v_addc_co_u32_e32 v4, vcc, v8, v5, vcc
11089; GFX90A-NEXT:    v_add_co_u32_e64 v0, s[0:1], v0, v3
11090; GFX90A-NEXT:    v_addc_co_u32_e64 v3, vcc, v1, v4, s[0:1]
11091; GFX90A-NEXT:    v_mul_lo_u32 v5, v3, s2
11092; GFX90A-NEXT:    v_mul_hi_u32 v6, v0, s2
11093; GFX90A-NEXT:    v_add_u32_e32 v5, v6, v5
11094; GFX90A-NEXT:    v_sub_u32_e32 v5, v5, v0
11095; GFX90A-NEXT:    v_mul_lo_u32 v7, v0, s2
11096; GFX90A-NEXT:    v_mul_hi_u32 v9, v3, v7
11097; GFX90A-NEXT:    v_mul_lo_u32 v10, v3, v7
11098; GFX90A-NEXT:    v_mul_lo_u32 v12, v0, v5
11099; GFX90A-NEXT:    v_mul_hi_u32 v7, v0, v7
11100; GFX90A-NEXT:    v_mul_hi_u32 v11, v0, v5
11101; GFX90A-NEXT:    v_add_co_u32_e32 v7, vcc, v7, v12
11102; GFX90A-NEXT:    v_addc_co_u32_e32 v11, vcc, v8, v11, vcc
11103; GFX90A-NEXT:    v_add_co_u32_e32 v7, vcc, v7, v10
11104; GFX90A-NEXT:    v_mul_hi_u32 v6, v3, v5
11105; GFX90A-NEXT:    v_addc_co_u32_e32 v7, vcc, v11, v9, vcc
11106; GFX90A-NEXT:    v_addc_co_u32_e32 v6, vcc, v6, v2, vcc
11107; GFX90A-NEXT:    v_mul_lo_u32 v3, v3, v5
11108; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, v7, v3
11109; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v8, v6, vcc
11110; GFX90A-NEXT:    v_add_u32_e32 v1, v1, v4
11111; GFX90A-NEXT:    v_addc_co_u32_e64 v1, vcc, v1, v5, s[0:1]
11112; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
11113; GFX90A-NEXT:    s_ashr_i32 s0, s7, 31
11114; GFX90A-NEXT:    s_add_u32 s2, s6, s0
11115; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v3
11116; GFX90A-NEXT:    s_mov_b32 s1, s0
11117; GFX90A-NEXT:    s_addc_u32 s3, s7, s0
11118; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
11119; GFX90A-NEXT:    s_xor_b64 s[2:3], s[2:3], s[0:1]
11120; GFX90A-NEXT:    v_mul_lo_u32 v4, s2, v1
11121; GFX90A-NEXT:    v_mul_hi_u32 v5, s2, v0
11122; GFX90A-NEXT:    v_mul_hi_u32 v3, s2, v1
11123; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, v5, v4
11124; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, v8, v3, vcc
11125; GFX90A-NEXT:    v_mul_hi_u32 v6, s3, v0
11126; GFX90A-NEXT:    v_mul_lo_u32 v0, s3, v0
11127; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v4, v0
11128; GFX90A-NEXT:    v_mul_hi_u32 v5, s3, v1
11129; GFX90A-NEXT:    v_addc_co_u32_e32 v0, vcc, v3, v6, vcc
11130; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, v5, v2, vcc
11131; GFX90A-NEXT:    v_mul_lo_u32 v1, s3, v1
11132; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v1
11133; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, v8, v3, vcc
11134; GFX90A-NEXT:    s_mov_b32 s1, 0x12d8fb
11135; GFX90A-NEXT:    v_mul_lo_u32 v3, v1, s1
11136; GFX90A-NEXT:    v_mul_hi_u32 v4, v0, s1
11137; GFX90A-NEXT:    v_add_u32_e32 v3, v4, v3
11138; GFX90A-NEXT:    v_mul_lo_u32 v4, v0, s1
11139; GFX90A-NEXT:    v_mov_b32_e32 v5, s3
11140; GFX90A-NEXT:    v_sub_co_u32_e32 v4, vcc, s2, v4
11141; GFX90A-NEXT:    v_subb_co_u32_e32 v3, vcc, v5, v3, vcc
11142; GFX90A-NEXT:    v_subrev_co_u32_e32 v5, vcc, s1, v4
11143; GFX90A-NEXT:    v_subbrev_co_u32_e32 v6, vcc, 0, v3, vcc
11144; GFX90A-NEXT:    s_mov_b32 s1, 0x12d8fa
11145; GFX90A-NEXT:    v_cmp_lt_u32_e32 vcc, s1, v5
11146; GFX90A-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
11147; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v6
11148; GFX90A-NEXT:    v_cndmask_b32_e32 v5, -1, v5, vcc
11149; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
11150; GFX90A-NEXT:    v_cndmask_b32_e64 v5, 1, 2, vcc
11151; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v0, v5
11152; GFX90A-NEXT:    v_addc_co_u32_e32 v6, vcc, 0, v1, vcc
11153; GFX90A-NEXT:    v_cmp_lt_u32_e32 vcc, s1, v4
11154; GFX90A-NEXT:    v_cndmask_b32_e64 v4, 0, -1, vcc
11155; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
11156; GFX90A-NEXT:    v_cndmask_b32_e32 v3, -1, v4, vcc
11157; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
11158; GFX90A-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
11159; GFX90A-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
11160; GFX90A-NEXT:    v_xor_b32_e32 v0, s0, v0
11161; GFX90A-NEXT:    v_xor_b32_e32 v1, s0, v1
11162; GFX90A-NEXT:    v_mov_b32_e32 v3, s0
11163; GFX90A-NEXT:    v_subrev_co_u32_e32 v0, vcc, s0, v0
11164; GFX90A-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
11165; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
11166; GFX90A-NEXT:    s_endpgm
11167  %r = sdiv i64 %x, 1235195
11168  store i64 %r, i64 addrspace(1)* %out
11169  ret void
11170}
11171
; sdiv_i64_pow2k_denom: signed 64-bit division of the kernel argument %x by the
; constant 4096 (2^12); the quotient is stored to %out.
;
; What the assertions below pin down:
;  - opt run (-amdgpu-codegenprepare): the sdiv is expected to stay a plain
;    i64 sdiv by 4096.
;  - llc runs (tahiti, gfx900, gfx90a): no reciprocal sequence is expected.
;    The expected code is shift-based: "s_ashr_i32 ..., 31" on the high dword
;    of %x yields 0 or all-ones, "s_lshr_b32 ..., 20" reduces that to 0 or
;    4095, s_add_u32/s_addc_u32 add it to the 64-bit value, and
;    "s_ashr_i64 ..., 12" produces the quotient.
;  - The gfx90a expectations use a single v_pk_mov_b32 to copy the 64-bit
;    result into VGPRs where the other two targets use two v_mov_b32_e32.
; The assertion lines are autogenerated (see the NOTE lines at the top of this
; file); regenerate them with the update scripts instead of editing by hand.
11172define amdgpu_kernel void @sdiv_i64_pow2k_denom(i64 addrspace(1)* %out, i64 %x) {
11173; CHECK-LABEL: @sdiv_i64_pow2k_denom(
11174; CHECK-NEXT:    [[R:%.*]] = sdiv i64 [[X:%.*]], 4096
11175; CHECK-NEXT:    store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4
11176; CHECK-NEXT:    ret void
11177;
11178; GFX6-LABEL: sdiv_i64_pow2k_denom:
11179; GFX6:       ; %bb.0:
11180; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
11181; GFX6-NEXT:    s_mov_b32 s7, 0xf000
11182; GFX6-NEXT:    s_mov_b32 s6, -1
11183; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
11184; GFX6-NEXT:    s_mov_b32 s4, s0
11185; GFX6-NEXT:    s_ashr_i32 s0, s3, 31
11186; GFX6-NEXT:    s_lshr_b32 s0, s0, 20
11187; GFX6-NEXT:    s_add_u32 s0, s2, s0
11188; GFX6-NEXT:    s_mov_b32 s5, s1
11189; GFX6-NEXT:    s_addc_u32 s1, s3, 0
11190; GFX6-NEXT:    s_ashr_i64 s[0:1], s[0:1], 12
11191; GFX6-NEXT:    v_mov_b32_e32 v0, s0
11192; GFX6-NEXT:    v_mov_b32_e32 v1, s1
11193; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
11194; GFX6-NEXT:    s_endpgm
11195;
11196; GFX9-LABEL: sdiv_i64_pow2k_denom:
11197; GFX9:       ; %bb.0:
11198; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
11199; GFX9-NEXT:    v_mov_b32_e32 v2, 0
11200; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
11201; GFX9-NEXT:    s_ashr_i32 s4, s3, 31
11202; GFX9-NEXT:    s_lshr_b32 s4, s4, 20
11203; GFX9-NEXT:    s_add_u32 s2, s2, s4
11204; GFX9-NEXT:    s_addc_u32 s3, s3, 0
11205; GFX9-NEXT:    s_ashr_i64 s[2:3], s[2:3], 12
11206; GFX9-NEXT:    v_mov_b32_e32 v0, s2
11207; GFX9-NEXT:    v_mov_b32_e32 v1, s3
11208; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
11209; GFX9-NEXT:    s_endpgm
11210;
11211; GFX90A-LABEL: sdiv_i64_pow2k_denom:
11212; GFX90A:       ; %bb.0:
11213; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
11214; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
11215; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
11216; GFX90A-NEXT:    s_ashr_i32 s4, s3, 31
11217; GFX90A-NEXT:    s_lshr_b32 s4, s4, 20
11218; GFX90A-NEXT:    s_add_u32 s2, s2, s4
11219; GFX90A-NEXT:    s_addc_u32 s3, s3, 0
11220; GFX90A-NEXT:    s_ashr_i64 s[2:3], s[2:3], 12
11221; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
11222; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
11223; GFX90A-NEXT:    s_endpgm
11224  %r = sdiv i64 %x, 4096
11225  store i64 %r, i64 addrspace(1)* %out
11226  ret void
11227}
11228
11229define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %x, i64 %y) {
11230; CHECK-LABEL: @sdiv_i64_pow2_shl_denom(
11231; CHECK-NEXT:    [[SHL_Y:%.*]] = shl i64 4096, [[Y:%.*]]
11232; CHECK-NEXT:    [[R:%.*]] = sdiv i64 [[X:%.*]], [[SHL_Y]]
11233; CHECK-NEXT:    store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4
11234; CHECK-NEXT:    ret void
11235;
11236; GFX6-LABEL: sdiv_i64_pow2_shl_denom:
11237; GFX6:       ; %bb.0:
11238; GFX6-NEXT:    s_load_dword s4, s[0:1], 0xd
11239; GFX6-NEXT:    s_mov_b64 s[2:3], 0x1000
11240; GFX6-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x9
11241; GFX6-NEXT:    s_mov_b32 s7, 0xf000
11242; GFX6-NEXT:    s_mov_b32 s6, -1
11243; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
11244; GFX6-NEXT:    s_lshl_b64 s[4:5], s[2:3], s4
11245; GFX6-NEXT:    s_ashr_i32 s2, s5, 31
11246; GFX6-NEXT:    s_add_u32 s4, s4, s2
11247; GFX6-NEXT:    s_mov_b32 s3, s2
11248; GFX6-NEXT:    s_addc_u32 s5, s5, s2
11249; GFX6-NEXT:    s_xor_b64 s[12:13], s[4:5], s[2:3]
11250; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s12
11251; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s13
11252; GFX6-NEXT:    s_sub_u32 s4, 0, s12
11253; GFX6-NEXT:    s_subb_u32 s5, 0, s13
11254; GFX6-NEXT:    s_ashr_i32 s14, s11, 31
11255; GFX6-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
11256; GFX6-NEXT:    v_rcp_f32_e32 v0, v0
11257; GFX6-NEXT:    s_mov_b32 s15, s14
11258; GFX6-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
11259; GFX6-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
11260; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
11261; GFX6-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
11262; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
11263; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
11264; GFX6-NEXT:    v_mul_lo_u32 v2, s4, v1
11265; GFX6-NEXT:    v_mul_hi_u32 v3, s4, v0
11266; GFX6-NEXT:    v_mul_lo_u32 v5, s5, v0
11267; GFX6-NEXT:    v_mul_lo_u32 v4, s4, v0
11268; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
11269; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
11270; GFX6-NEXT:    v_mul_hi_u32 v3, v0, v4
11271; GFX6-NEXT:    v_mul_lo_u32 v5, v0, v2
11272; GFX6-NEXT:    v_mul_hi_u32 v6, v0, v2
11273; GFX6-NEXT:    v_mul_hi_u32 v7, v1, v2
11274; GFX6-NEXT:    v_mul_lo_u32 v2, v1, v2
11275; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
11276; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, 0, v6, vcc
11277; GFX6-NEXT:    v_mul_lo_u32 v6, v1, v4
11278; GFX6-NEXT:    v_mul_hi_u32 v4, v1, v4
11279; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
11280; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v5, v4, vcc
11281; GFX6-NEXT:    v_mov_b32_e32 v4, 0
11282; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v7, v4, vcc
11283; GFX6-NEXT:    v_mov_b32_e32 v6, 0
11284; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
11285; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v6, v5, vcc
11286; GFX6-NEXT:    v_add_i32_e64 v0, s[0:1], v0, v2
11287; GFX6-NEXT:    v_addc_u32_e64 v2, vcc, v1, v3, s[0:1]
11288; GFX6-NEXT:    v_mul_lo_u32 v5, s4, v2
11289; GFX6-NEXT:    v_mul_hi_u32 v7, s4, v0
11290; GFX6-NEXT:    v_mul_lo_u32 v8, s5, v0
11291; GFX6-NEXT:    s_mov_b32 s5, s9
11292; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
11293; GFX6-NEXT:    v_mul_lo_u32 v7, s4, v0
11294; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
11295; GFX6-NEXT:    v_mul_lo_u32 v10, v0, v5
11296; GFX6-NEXT:    v_mul_hi_u32 v11, v0, v7
11297; GFX6-NEXT:    v_mul_hi_u32 v12, v0, v5
11298; GFX6-NEXT:    v_mul_hi_u32 v9, v2, v7
11299; GFX6-NEXT:    v_mul_lo_u32 v7, v2, v7
11300; GFX6-NEXT:    v_mul_hi_u32 v8, v2, v5
11301; GFX6-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
11302; GFX6-NEXT:    v_addc_u32_e32 v11, vcc, 0, v12, vcc
11303; GFX6-NEXT:    v_mul_lo_u32 v2, v2, v5
11304; GFX6-NEXT:    v_add_i32_e32 v7, vcc, v10, v7
11305; GFX6-NEXT:    v_addc_u32_e32 v7, vcc, v11, v9, vcc
11306; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v8, v4, vcc
11307; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v7, v2
11308; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v6, v5, vcc
11309; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
11310; GFX6-NEXT:    v_addc_u32_e64 v1, vcc, v1, v5, s[0:1]
11311; GFX6-NEXT:    s_add_u32 s0, s10, s14
11312; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
11313; GFX6-NEXT:    s_addc_u32 s1, s11, s14
11314; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
11315; GFX6-NEXT:    s_xor_b64 s[10:11], s[0:1], s[14:15]
11316; GFX6-NEXT:    v_mul_lo_u32 v2, s10, v1
11317; GFX6-NEXT:    v_mul_hi_u32 v3, s10, v0
11318; GFX6-NEXT:    v_mul_hi_u32 v5, s10, v1
11319; GFX6-NEXT:    v_mul_hi_u32 v7, s11, v1
11320; GFX6-NEXT:    v_mul_lo_u32 v1, s11, v1
11321; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
11322; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
11323; GFX6-NEXT:    v_mul_lo_u32 v5, s11, v0
11324; GFX6-NEXT:    v_mul_hi_u32 v0, s11, v0
11325; GFX6-NEXT:    s_mov_b32 s4, s8
11326; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
11327; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, v3, v0, vcc
11328; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, v7, v4, vcc
11329; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
11330; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v6, v2, vcc
11331; GFX6-NEXT:    v_mul_lo_u32 v2, s12, v1
11332; GFX6-NEXT:    v_mul_hi_u32 v3, s12, v0
11333; GFX6-NEXT:    v_mul_lo_u32 v4, s13, v0
11334; GFX6-NEXT:    v_mov_b32_e32 v5, s13
11335; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
11336; GFX6-NEXT:    v_mul_lo_u32 v3, s12, v0
11337; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
11338; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, s11, v2
11339; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s10, v3
11340; GFX6-NEXT:    v_subb_u32_e64 v4, s[0:1], v4, v5, vcc
11341; GFX6-NEXT:    v_subrev_i32_e64 v5, s[0:1], s12, v3
11342; GFX6-NEXT:    v_subbrev_u32_e64 v4, s[0:1], 0, v4, s[0:1]
11343; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s13, v4
11344; GFX6-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[0:1]
11345; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s12, v5
11346; GFX6-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[0:1]
11347; GFX6-NEXT:    v_cmp_eq_u32_e64 s[0:1], s13, v4
11348; GFX6-NEXT:    v_cndmask_b32_e64 v4, v6, v5, s[0:1]
11349; GFX6-NEXT:    v_add_i32_e64 v5, s[0:1], 2, v0
11350; GFX6-NEXT:    v_addc_u32_e64 v6, s[0:1], 0, v1, s[0:1]
11351; GFX6-NEXT:    v_add_i32_e64 v7, s[0:1], 1, v0
11352; GFX6-NEXT:    v_addc_u32_e64 v8, s[0:1], 0, v1, s[0:1]
11353; GFX6-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v4
11354; GFX6-NEXT:    v_cndmask_b32_e64 v4, v8, v6, s[0:1]
11355; GFX6-NEXT:    v_mov_b32_e32 v6, s11
11356; GFX6-NEXT:    v_subb_u32_e32 v2, vcc, v6, v2, vcc
11357; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s13, v2
11358; GFX6-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
11359; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s12, v3
11360; GFX6-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
11361; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, s13, v2
11362; GFX6-NEXT:    v_cndmask_b32_e32 v2, v6, v3, vcc
11363; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
11364; GFX6-NEXT:    v_cndmask_b32_e64 v2, v7, v5, s[0:1]
11365; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
11366; GFX6-NEXT:    s_xor_b64 s[0:1], s[14:15], s[2:3]
11367; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
11368; GFX6-NEXT:    v_xor_b32_e32 v0, s0, v0
11369; GFX6-NEXT:    v_xor_b32_e32 v1, s1, v1
11370; GFX6-NEXT:    v_mov_b32_e32 v2, s1
11371; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s0, v0
11372; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v1, v2, vcc
11373; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
11374; GFX6-NEXT:    s_endpgm
11375;
11376; GFX9-LABEL: sdiv_i64_pow2_shl_denom:
11377; GFX9:       ; %bb.0:
11378; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x34
11379; GFX9-NEXT:    s_mov_b64 s[2:3], 0x1000
11380; GFX9-NEXT:    v_mov_b32_e32 v2, 0
11381; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
11382; GFX9-NEXT:    s_lshl_b64 s[2:3], s[2:3], s4
11383; GFX9-NEXT:    s_ashr_i32 s8, s3, 31
11384; GFX9-NEXT:    s_add_u32 s2, s2, s8
11385; GFX9-NEXT:    s_mov_b32 s9, s8
11386; GFX9-NEXT:    s_addc_u32 s3, s3, s8
11387; GFX9-NEXT:    s_xor_b64 s[10:11], s[2:3], s[8:9]
11388; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s10
11389; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s11
11390; GFX9-NEXT:    s_sub_u32 s12, 0, s10
11391; GFX9-NEXT:    s_subb_u32 s4, 0, s11
11392; GFX9-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
11393; GFX9-NEXT:    v_rcp_f32_e32 v0, v0
11394; GFX9-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
11395; GFX9-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
11396; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
11397; GFX9-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
11398; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
11399; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
11400; GFX9-NEXT:    v_mul_lo_u32 v3, s12, v1
11401; GFX9-NEXT:    v_mul_hi_u32 v4, s12, v0
11402; GFX9-NEXT:    v_mul_lo_u32 v6, s4, v0
11403; GFX9-NEXT:    v_mul_lo_u32 v5, s12, v0
11404; GFX9-NEXT:    v_add_u32_e32 v3, v4, v3
11405; GFX9-NEXT:    v_add_u32_e32 v3, v3, v6
11406; GFX9-NEXT:    v_mul_hi_u32 v4, v0, v5
11407; GFX9-NEXT:    v_mul_lo_u32 v6, v0, v3
11408; GFX9-NEXT:    v_mul_hi_u32 v8, v0, v3
11409; GFX9-NEXT:    v_mul_hi_u32 v7, v1, v5
11410; GFX9-NEXT:    v_mul_lo_u32 v5, v1, v5
11411; GFX9-NEXT:    v_mul_hi_u32 v9, v1, v3
11412; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v6
11413; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, 0, v8, vcc
11414; GFX9-NEXT:    v_mul_lo_u32 v3, v1, v3
11415; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v5
11416; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v6, v7, vcc
11417; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v9, v2, vcc
11418; GFX9-NEXT:    v_mov_b32_e32 v6, 0
11419; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v4, v3
11420; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v6, v5, vcc
11421; GFX9-NEXT:    v_add_co_u32_e64 v0, s[2:3], v0, v3
11422; GFX9-NEXT:    v_addc_co_u32_e64 v3, vcc, v1, v4, s[2:3]
11423; GFX9-NEXT:    v_mul_lo_u32 v5, s12, v3
11424; GFX9-NEXT:    v_mul_hi_u32 v7, s12, v0
11425; GFX9-NEXT:    v_mul_lo_u32 v8, s4, v0
11426; GFX9-NEXT:    v_mul_lo_u32 v9, s12, v0
11427; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
11428; GFX9-NEXT:    v_add_u32_e32 v5, v7, v5
11429; GFX9-NEXT:    v_add_u32_e32 v5, v5, v8
11430; GFX9-NEXT:    v_mul_lo_u32 v10, v0, v5
11431; GFX9-NEXT:    v_mul_hi_u32 v11, v0, v9
11432; GFX9-NEXT:    v_mul_hi_u32 v12, v0, v5
11433; GFX9-NEXT:    v_mul_hi_u32 v8, v3, v9
11434; GFX9-NEXT:    v_mul_lo_u32 v9, v3, v9
11435; GFX9-NEXT:    v_mul_hi_u32 v7, v3, v5
11436; GFX9-NEXT:    v_add_co_u32_e32 v10, vcc, v11, v10
11437; GFX9-NEXT:    v_addc_co_u32_e32 v11, vcc, 0, v12, vcc
11438; GFX9-NEXT:    v_mul_lo_u32 v3, v3, v5
11439; GFX9-NEXT:    v_add_co_u32_e32 v9, vcc, v10, v9
11440; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, v11, v8, vcc
11441; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v7, v2, vcc
11442; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v8, v3
11443; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v6, v5, vcc
11444; GFX9-NEXT:    v_add_u32_e32 v1, v1, v4
11445; GFX9-NEXT:    v_addc_co_u32_e64 v1, vcc, v1, v5, s[2:3]
11446; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
11447; GFX9-NEXT:    s_ashr_i32 s2, s7, 31
11448; GFX9-NEXT:    s_add_u32 s0, s6, s2
11449; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v3
11450; GFX9-NEXT:    s_mov_b32 s3, s2
11451; GFX9-NEXT:    s_addc_u32 s1, s7, s2
11452; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
11453; GFX9-NEXT:    s_xor_b64 s[6:7], s[0:1], s[2:3]
11454; GFX9-NEXT:    v_mul_lo_u32 v3, s6, v1
11455; GFX9-NEXT:    v_mul_hi_u32 v4, s6, v0
11456; GFX9-NEXT:    v_mul_hi_u32 v5, s6, v1
11457; GFX9-NEXT:    v_mul_hi_u32 v7, s7, v1
11458; GFX9-NEXT:    v_mul_lo_u32 v1, s7, v1
11459; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v4, v3
11460; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v5, vcc
11461; GFX9-NEXT:    v_mul_lo_u32 v5, s7, v0
11462; GFX9-NEXT:    v_mul_hi_u32 v0, s7, v0
11463; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v5
11464; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, v4, v0, vcc
11465; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v7, v2, vcc
11466; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v1
11467; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v6, v3, vcc
11468; GFX9-NEXT:    v_mul_lo_u32 v3, s10, v1
11469; GFX9-NEXT:    v_mul_hi_u32 v4, s10, v0
11470; GFX9-NEXT:    v_mul_lo_u32 v5, s11, v0
11471; GFX9-NEXT:    v_mov_b32_e32 v6, s11
11472; GFX9-NEXT:    v_add_u32_e32 v3, v4, v3
11473; GFX9-NEXT:    v_mul_lo_u32 v4, s10, v0
11474; GFX9-NEXT:    v_add_u32_e32 v3, v3, v5
11475; GFX9-NEXT:    v_sub_u32_e32 v5, s7, v3
11476; GFX9-NEXT:    v_sub_co_u32_e32 v4, vcc, s6, v4
11477; GFX9-NEXT:    v_subb_co_u32_e64 v5, s[0:1], v5, v6, vcc
11478; GFX9-NEXT:    v_subrev_co_u32_e64 v6, s[0:1], s10, v4
11479; GFX9-NEXT:    v_subbrev_co_u32_e64 v5, s[0:1], 0, v5, s[0:1]
11480; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s11, v5
11481; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[0:1]
11482; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s10, v6
11483; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[0:1]
11484; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s11, v5
11485; GFX9-NEXT:    v_cndmask_b32_e64 v5, v7, v6, s[0:1]
11486; GFX9-NEXT:    v_mov_b32_e32 v7, s7
11487; GFX9-NEXT:    v_subb_co_u32_e32 v3, vcc, v7, v3, vcc
11488; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s11, v3
11489; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v5
11490; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
11491; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s10, v4
11492; GFX9-NEXT:    v_cndmask_b32_e64 v5, 1, 2, s[0:1]
11493; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, -1, vcc
11494; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, s11, v3
11495; GFX9-NEXT:    v_add_co_u32_e64 v5, s[0:1], v0, v5
11496; GFX9-NEXT:    v_cndmask_b32_e32 v3, v7, v4, vcc
11497; GFX9-NEXT:    v_addc_co_u32_e64 v6, s[0:1], 0, v1, s[0:1]
11498; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
11499; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
11500; GFX9-NEXT:    s_xor_b64 s[0:1], s[2:3], s[8:9]
11501; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
11502; GFX9-NEXT:    v_xor_b32_e32 v0, s0, v0
11503; GFX9-NEXT:    v_xor_b32_e32 v1, s1, v1
11504; GFX9-NEXT:    v_mov_b32_e32 v3, s1
11505; GFX9-NEXT:    v_subrev_co_u32_e32 v0, vcc, s0, v0
11506; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
11507; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
11508; GFX9-NEXT:    s_endpgm
11509;
11510; GFX90A-LABEL: sdiv_i64_pow2_shl_denom:
11511; GFX90A:       ; %bb.0:
11512; GFX90A-NEXT:    s_load_dword s4, s[0:1], 0x34
11513; GFX90A-NEXT:    s_mov_b64 s[2:3], 0x1000
11514; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
11515; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
11516; GFX90A-NEXT:    s_lshl_b64 s[4:5], s[2:3], s4
11517; GFX90A-NEXT:    s_ashr_i32 s2, s5, 31
11518; GFX90A-NEXT:    s_add_u32 s4, s4, s2
11519; GFX90A-NEXT:    s_mov_b32 s3, s2
11520; GFX90A-NEXT:    s_addc_u32 s5, s5, s2
11521; GFX90A-NEXT:    s_xor_b64 s[8:9], s[4:5], s[2:3]
11522; GFX90A-NEXT:    v_cvt_f32_u32_e32 v0, s8
11523; GFX90A-NEXT:    v_cvt_f32_u32_e32 v1, s9
11524; GFX90A-NEXT:    s_sub_u32 s10, 0, s8
11525; GFX90A-NEXT:    s_subb_u32 s11, 0, s9
11526; GFX90A-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
11527; GFX90A-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
11528; GFX90A-NEXT:    v_rcp_f32_e32 v0, v0
11529; GFX90A-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
11530; GFX90A-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
11531; GFX90A-NEXT:    v_trunc_f32_e32 v1, v1
11532; GFX90A-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
11533; GFX90A-NEXT:    v_cvt_u32_f32_e32 v1, v1
11534; GFX90A-NEXT:    v_cvt_u32_f32_e32 v0, v0
11535; GFX90A-NEXT:    v_mul_lo_u32 v3, s10, v1
11536; GFX90A-NEXT:    v_mul_hi_u32 v5, s10, v0
11537; GFX90A-NEXT:    v_mul_lo_u32 v4, s11, v0
11538; GFX90A-NEXT:    v_add_u32_e32 v3, v5, v3
11539; GFX90A-NEXT:    v_mul_lo_u32 v6, s10, v0
11540; GFX90A-NEXT:    v_add_u32_e32 v3, v3, v4
11541; GFX90A-NEXT:    v_mul_lo_u32 v5, v0, v3
11542; GFX90A-NEXT:    v_mul_hi_u32 v7, v0, v6
11543; GFX90A-NEXT:    v_mul_hi_u32 v4, v0, v3
11544; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v7, v5
11545; GFX90A-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
11546; GFX90A-NEXT:    v_mul_hi_u32 v8, v1, v6
11547; GFX90A-NEXT:    v_mul_lo_u32 v6, v1, v6
11548; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v5, v6
11549; GFX90A-NEXT:    v_mul_hi_u32 v7, v1, v3
11550; GFX90A-NEXT:    v_addc_co_u32_e32 v4, vcc, v4, v8, vcc
11551; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v7, v2, vcc
11552; GFX90A-NEXT:    v_mul_lo_u32 v3, v1, v3
11553; GFX90A-NEXT:    v_mov_b32_e32 v6, 0
11554; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, v4, v3
11555; GFX90A-NEXT:    v_addc_co_u32_e32 v4, vcc, v6, v5, vcc
11556; GFX90A-NEXT:    v_add_co_u32_e64 v0, s[0:1], v0, v3
11557; GFX90A-NEXT:    v_addc_co_u32_e64 v3, vcc, v1, v4, s[0:1]
11558; GFX90A-NEXT:    v_mul_lo_u32 v5, s10, v3
11559; GFX90A-NEXT:    v_mul_hi_u32 v7, s10, v0
11560; GFX90A-NEXT:    v_add_u32_e32 v5, v7, v5
11561; GFX90A-NEXT:    v_mul_lo_u32 v7, s11, v0
11562; GFX90A-NEXT:    v_add_u32_e32 v5, v5, v7
11563; GFX90A-NEXT:    v_mul_lo_u32 v8, s10, v0
11564; GFX90A-NEXT:    v_mul_hi_u32 v9, v3, v8
11565; GFX90A-NEXT:    v_mul_lo_u32 v10, v3, v8
11566; GFX90A-NEXT:    v_mul_lo_u32 v12, v0, v5
11567; GFX90A-NEXT:    v_mul_hi_u32 v8, v0, v8
11568; GFX90A-NEXT:    v_mul_hi_u32 v11, v0, v5
11569; GFX90A-NEXT:    v_add_co_u32_e32 v8, vcc, v8, v12
11570; GFX90A-NEXT:    v_addc_co_u32_e32 v11, vcc, 0, v11, vcc
11571; GFX90A-NEXT:    v_add_co_u32_e32 v8, vcc, v8, v10
11572; GFX90A-NEXT:    v_mul_hi_u32 v7, v3, v5
11573; GFX90A-NEXT:    v_addc_co_u32_e32 v8, vcc, v11, v9, vcc
11574; GFX90A-NEXT:    v_addc_co_u32_e32 v7, vcc, v7, v2, vcc
11575; GFX90A-NEXT:    v_mul_lo_u32 v3, v3, v5
11576; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, v8, v3
11577; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v6, v7, vcc
11578; GFX90A-NEXT:    v_add_u32_e32 v1, v1, v4
11579; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
11580; GFX90A-NEXT:    s_ashr_i32 s10, s7, 31
11581; GFX90A-NEXT:    v_addc_co_u32_e64 v1, vcc, v1, v5, s[0:1]
11582; GFX90A-NEXT:    s_add_u32 s0, s6, s10
11583; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v3
11584; GFX90A-NEXT:    s_mov_b32 s11, s10
11585; GFX90A-NEXT:    s_addc_u32 s1, s7, s10
11586; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
11587; GFX90A-NEXT:    s_xor_b64 s[6:7], s[0:1], s[10:11]
11588; GFX90A-NEXT:    v_mul_lo_u32 v4, s6, v1
11589; GFX90A-NEXT:    v_mul_hi_u32 v5, s6, v0
11590; GFX90A-NEXT:    v_mul_hi_u32 v3, s6, v1
11591; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, v5, v4
11592; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
11593; GFX90A-NEXT:    v_mul_hi_u32 v7, s7, v0
11594; GFX90A-NEXT:    v_mul_lo_u32 v0, s7, v0
11595; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v4, v0
11596; GFX90A-NEXT:    v_mul_hi_u32 v5, s7, v1
11597; GFX90A-NEXT:    v_addc_co_u32_e32 v0, vcc, v3, v7, vcc
11598; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, v5, v2, vcc
11599; GFX90A-NEXT:    v_mul_lo_u32 v1, s7, v1
11600; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v1
11601; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, v6, v3, vcc
11602; GFX90A-NEXT:    v_mul_lo_u32 v3, s8, v1
11603; GFX90A-NEXT:    v_mul_hi_u32 v4, s8, v0
11604; GFX90A-NEXT:    v_add_u32_e32 v3, v4, v3
11605; GFX90A-NEXT:    v_mul_lo_u32 v4, s9, v0
11606; GFX90A-NEXT:    v_add_u32_e32 v3, v3, v4
11607; GFX90A-NEXT:    v_mul_lo_u32 v5, s8, v0
11608; GFX90A-NEXT:    v_sub_u32_e32 v4, s7, v3
11609; GFX90A-NEXT:    v_mov_b32_e32 v6, s9
11610; GFX90A-NEXT:    v_sub_co_u32_e32 v5, vcc, s6, v5
11611; GFX90A-NEXT:    v_subb_co_u32_e64 v4, s[0:1], v4, v6, vcc
11612; GFX90A-NEXT:    v_subrev_co_u32_e64 v6, s[0:1], s8, v5
11613; GFX90A-NEXT:    v_subbrev_co_u32_e64 v4, s[0:1], 0, v4, s[0:1]
11614; GFX90A-NEXT:    v_cmp_le_u32_e64 s[0:1], s9, v4
11615; GFX90A-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[0:1]
11616; GFX90A-NEXT:    v_cmp_le_u32_e64 s[0:1], s8, v6
11617; GFX90A-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[0:1]
11618; GFX90A-NEXT:    v_cmp_eq_u32_e64 s[0:1], s9, v4
11619; GFX90A-NEXT:    v_cndmask_b32_e64 v4, v7, v6, s[0:1]
11620; GFX90A-NEXT:    v_mov_b32_e32 v7, s7
11621; GFX90A-NEXT:    v_subb_co_u32_e32 v3, vcc, v7, v3, vcc
11622; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s9, v3
11623; GFX90A-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v4
11624; GFX90A-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
11625; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s8, v5
11626; GFX90A-NEXT:    v_cndmask_b32_e64 v4, 1, 2, s[0:1]
11627; GFX90A-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
11628; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, s9, v3
11629; GFX90A-NEXT:    v_add_co_u32_e64 v4, s[0:1], v0, v4
11630; GFX90A-NEXT:    v_cndmask_b32_e32 v3, v7, v5, vcc
11631; GFX90A-NEXT:    v_addc_co_u32_e64 v6, s[0:1], 0, v1, s[0:1]
11632; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
11633; GFX90A-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
11634; GFX90A-NEXT:    s_xor_b64 s[0:1], s[10:11], s[2:3]
11635; GFX90A-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
11636; GFX90A-NEXT:    v_xor_b32_e32 v0, s0, v0
11637; GFX90A-NEXT:    v_xor_b32_e32 v1, s1, v1
11638; GFX90A-NEXT:    v_mov_b32_e32 v3, s1
11639; GFX90A-NEXT:    v_subrev_co_u32_e32 v0, vcc, s0, v0
11640; GFX90A-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
11641; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
11642; GFX90A-NEXT:    s_endpgm
11643  %shl.y = shl i64 4096, %y
11644  %r = sdiv i64 %x, %shl.y
11645  store i64 %r, i64 addrspace(1)* %out
11646  ret void
11647}
11648
; Test: signed division of a <2 x i64> vector by the splat constant 4096 (2^12),
; with the quotient stored to %out.
; The opt assertions below show the vector sdiv being split into one scalar
; "sdiv i64 ..., 4096" per element and the results re-assembled with insertelement.
; The llc assertions below show each element lowered to a short scalar sequence
; (sign bits via s_ashr_i32 by 31, s_lshr_b32 by 20, add with carry, then
; s_ashr_i64 by 12) followed by a single 128-bit store.
11649define amdgpu_kernel void @sdiv_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) {
11650; CHECK-LABEL: @sdiv_v2i64_pow2k_denom(
11651; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0
11652; CHECK-NEXT:    [[TMP2:%.*]] = sdiv i64 [[TMP1]], 4096
11653; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i64 0
11654; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1
11655; CHECK-NEXT:    [[TMP5:%.*]] = sdiv i64 [[TMP4]], 4096
11656; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1
11657; CHECK-NEXT:    store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16
11658; CHECK-NEXT:    ret void
11659;
11660; GFX6-LABEL: sdiv_v2i64_pow2k_denom:
11661; GFX6:       ; %bb.0:
11662; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
11663; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xd
11664; GFX6-NEXT:    s_mov_b32 s7, 0xf000
11665; GFX6-NEXT:    s_mov_b32 s6, -1
11666; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
11667; GFX6-NEXT:    s_ashr_i32 s8, s1, 31
11668; GFX6-NEXT:    s_lshr_b32 s8, s8, 20
11669; GFX6-NEXT:    s_add_u32 s0, s0, s8
11670; GFX6-NEXT:    s_addc_u32 s1, s1, 0
11671; GFX6-NEXT:    s_ashr_i32 s8, s3, 31
11672; GFX6-NEXT:    s_ashr_i64 s[0:1], s[0:1], 12
11673; GFX6-NEXT:    s_lshr_b32 s8, s8, 20
11674; GFX6-NEXT:    s_add_u32 s2, s2, s8
11675; GFX6-NEXT:    s_addc_u32 s3, s3, 0
11676; GFX6-NEXT:    s_ashr_i64 s[2:3], s[2:3], 12
11677; GFX6-NEXT:    v_mov_b32_e32 v0, s0
11678; GFX6-NEXT:    v_mov_b32_e32 v1, s1
11679; GFX6-NEXT:    v_mov_b32_e32 v2, s2
11680; GFX6-NEXT:    v_mov_b32_e32 v3, s3
11681; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
11682; GFX6-NEXT:    s_endpgm
11683;
11684; GFX9-LABEL: sdiv_v2i64_pow2k_denom:
11685; GFX9:       ; %bb.0:
11686; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
11687; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
11688; GFX9-NEXT:    v_mov_b32_e32 v4, 0
11689; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
11690; GFX9-NEXT:    s_ashr_i32 s0, s5, 31
11691; GFX9-NEXT:    s_lshr_b32 s0, s0, 20
11692; GFX9-NEXT:    s_add_u32 s0, s4, s0
11693; GFX9-NEXT:    s_addc_u32 s1, s5, 0
11694; GFX9-NEXT:    s_ashr_i32 s4, s7, 31
11695; GFX9-NEXT:    s_ashr_i64 s[0:1], s[0:1], 12
11696; GFX9-NEXT:    s_lshr_b32 s4, s4, 20
11697; GFX9-NEXT:    s_add_u32 s4, s6, s4
11698; GFX9-NEXT:    s_addc_u32 s5, s7, 0
11699; GFX9-NEXT:    s_ashr_i64 s[4:5], s[4:5], 12
11700; GFX9-NEXT:    v_mov_b32_e32 v0, s0
11701; GFX9-NEXT:    v_mov_b32_e32 v1, s1
11702; GFX9-NEXT:    v_mov_b32_e32 v2, s4
11703; GFX9-NEXT:    v_mov_b32_e32 v3, s5
11704; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
11705; GFX9-NEXT:    s_endpgm
11706;
11707; GFX90A-LABEL: sdiv_v2i64_pow2k_denom:
11708; GFX90A:       ; %bb.0:
11709; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
11710; GFX90A-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
11711; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
11712; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
11713; GFX90A-NEXT:    s_ashr_i32 s0, s5, 31
11714; GFX90A-NEXT:    s_lshr_b32 s0, s0, 20
11715; GFX90A-NEXT:    s_add_u32 s0, s4, s0
11716; GFX90A-NEXT:    s_addc_u32 s1, s5, 0
11717; GFX90A-NEXT:    s_ashr_i32 s4, s7, 31
11718; GFX90A-NEXT:    s_ashr_i64 s[0:1], s[0:1], 12
11719; GFX90A-NEXT:    s_lshr_b32 s4, s4, 20
11720; GFX90A-NEXT:    s_add_u32 s4, s6, s4
11721; GFX90A-NEXT:    s_addc_u32 s5, s7, 0
11722; GFX90A-NEXT:    s_ashr_i64 s[4:5], s[4:5], 12
11723; GFX90A-NEXT:    v_mov_b32_e32 v0, s0
11724; GFX90A-NEXT:    v_mov_b32_e32 v1, s1
11725; GFX90A-NEXT:    v_mov_b32_e32 v2, s4
11726; GFX90A-NEXT:    v_mov_b32_e32 v3, s5
11727; GFX90A-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
11728; GFX90A-NEXT:    s_endpgm
; Input IR under test: one vector sdiv with both lanes dividing by 4096.
11729  %r = sdiv <2 x i64> %x, <i64 4096, i64 4096>
11730  store <2 x i64> %r, <2 x i64> addrspace(1)* %out
11731  ret void
11732}
11733
; Test: signed division of a <2 x i64> vector by the constant vector
; <4096, 4095>, i.e. lane 0 divides by a power of two (2^12) and lane 1 by a
; non-power-of-two, with the quotient stored to %out.
; The opt assertions below show the vector sdiv being split into two scalar
; sdiv instructions (by 4096 and by 4095 respectively).
; The llc assertions below show lane 0 lowered to a short scalar sequence ending
; in s_ashr_i64 by 12, while lane 1 goes through a long multiply-based expansion
; that starts from v_rcp_f32 and ends with a conditional quotient correction and
; sign fix-up (v_xor / v_subrev / v_subb).
; NOTE(review): the "ssdiv" spelling looks like a typo for "sdiv" (compare the
; neighbouring sdiv_v2i64_* tests); it is left as-is because the label
; assertions are generated from this name.
11734define amdgpu_kernel void @ssdiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) {
11735; CHECK-LABEL: @ssdiv_v2i64_mixed_pow2k_denom(
11736; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0
11737; CHECK-NEXT:    [[TMP2:%.*]] = sdiv i64 [[TMP1]], 4096
11738; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i64 0
11739; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1
11740; CHECK-NEXT:    [[TMP5:%.*]] = sdiv i64 [[TMP4]], 4095
11741; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1
11742; CHECK-NEXT:    store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16
11743; CHECK-NEXT:    ret void
11744;
11745; GFX6-LABEL: ssdiv_v2i64_mixed_pow2k_denom:
11746; GFX6:       ; %bb.0:
11747; GFX6-NEXT:    v_mov_b32_e32 v0, 0x457ff000
11748; GFX6-NEXT:    v_mov_b32_e32 v1, 0x4f800000
11749; GFX6-NEXT:    v_mac_f32_e32 v0, 0, v1
11750; GFX6-NEXT:    v_rcp_f32_e32 v0, v0
11751; GFX6-NEXT:    s_movk_i32 s6, 0xf001
11752; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
11753; GFX6-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0xd
11754; GFX6-NEXT:    s_mov_b32 s7, 0xf000
11755; GFX6-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
11756; GFX6-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
11757; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
11758; GFX6-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
11759; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
11760; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
11761; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
11762; GFX6-NEXT:    s_ashr_i32 s0, s9, 31
11763; GFX6-NEXT:    s_lshr_b32 s0, s0, 20
11764; GFX6-NEXT:    v_mul_hi_u32 v2, v0, s6
11765; GFX6-NEXT:    v_mul_lo_u32 v3, v1, s6
11766; GFX6-NEXT:    s_add_u32 s2, s8, s0
11767; GFX6-NEXT:    s_addc_u32 s3, s9, 0
11768; GFX6-NEXT:    s_ashr_i64 s[2:3], s[2:3], 12
11769; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
11770; GFX6-NEXT:    v_mul_lo_u32 v3, v0, s6
11771; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, v0, v2
11772; GFX6-NEXT:    v_mul_lo_u32 v4, v0, v2
11773; GFX6-NEXT:    v_mul_hi_u32 v5, v0, v3
11774; GFX6-NEXT:    v_mul_hi_u32 v6, v0, v2
11775; GFX6-NEXT:    v_mul_hi_u32 v7, v1, v2
11776; GFX6-NEXT:    v_mul_lo_u32 v2, v1, v2
11777; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
11778; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, 0, v6, vcc
11779; GFX6-NEXT:    v_mul_lo_u32 v6, v1, v3
11780; GFX6-NEXT:    v_mul_hi_u32 v3, v1, v3
11781; GFX6-NEXT:    s_ashr_i32 s8, s11, 31
11782; GFX6-NEXT:    s_mov_b32 s9, s8
11783; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
11784; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v5, v3, vcc
11785; GFX6-NEXT:    v_mov_b32_e32 v4, 0
11786; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v7, v4, vcc
11787; GFX6-NEXT:    v_mov_b32_e32 v6, 0
11788; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
11789; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v6, v5, vcc
11790; GFX6-NEXT:    v_add_i32_e64 v0, s[0:1], v0, v2
11791; GFX6-NEXT:    v_addc_u32_e64 v2, vcc, v1, v3, s[0:1]
11792; GFX6-NEXT:    v_mul_lo_u32 v5, v2, s6
11793; GFX6-NEXT:    v_mul_hi_u32 v7, v0, s6
11794; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
11795; GFX6-NEXT:    v_mul_lo_u32 v7, v0, s6
11796; GFX6-NEXT:    v_subrev_i32_e32 v5, vcc, v0, v5
11797; GFX6-NEXT:    v_mul_lo_u32 v10, v0, v5
11798; GFX6-NEXT:    v_mul_hi_u32 v11, v0, v7
11799; GFX6-NEXT:    v_mul_hi_u32 v12, v0, v5
11800; GFX6-NEXT:    v_mul_hi_u32 v9, v2, v7
11801; GFX6-NEXT:    v_mul_lo_u32 v7, v2, v7
11802; GFX6-NEXT:    v_mul_hi_u32 v8, v2, v5
11803; GFX6-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
11804; GFX6-NEXT:    v_addc_u32_e32 v11, vcc, 0, v12, vcc
11805; GFX6-NEXT:    v_mul_lo_u32 v2, v2, v5
11806; GFX6-NEXT:    v_add_i32_e32 v7, vcc, v10, v7
11807; GFX6-NEXT:    v_addc_u32_e32 v7, vcc, v11, v9, vcc
11808; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v8, v4, vcc
11809; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v7, v2
11810; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v6, v5, vcc
11811; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
11812; GFX6-NEXT:    v_addc_u32_e64 v1, vcc, v1, v5, s[0:1]
11813; GFX6-NEXT:    s_add_u32 s0, s10, s8
11814; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
11815; GFX6-NEXT:    s_addc_u32 s1, s11, s8
11816; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
11817; GFX6-NEXT:    s_xor_b64 s[0:1], s[0:1], s[8:9]
11818; GFX6-NEXT:    v_mul_lo_u32 v2, s0, v1
11819; GFX6-NEXT:    v_mul_hi_u32 v3, s0, v0
11820; GFX6-NEXT:    v_mul_hi_u32 v5, s0, v1
11821; GFX6-NEXT:    v_mul_hi_u32 v7, s1, v1
11822; GFX6-NEXT:    v_mul_lo_u32 v1, s1, v1
11823; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
11824; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
11825; GFX6-NEXT:    v_mul_lo_u32 v5, s1, v0
11826; GFX6-NEXT:    v_mul_hi_u32 v0, s1, v0
11827; GFX6-NEXT:    s_movk_i32 s9, 0xfff
11828; GFX6-NEXT:    s_mov_b32 s6, -1
11829; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
11830; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, v3, v0, vcc
11831; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, v7, v4, vcc
11832; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
11833; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v6, v2, vcc
11834; GFX6-NEXT:    v_mul_lo_u32 v4, v1, s9
11835; GFX6-NEXT:    v_mul_hi_u32 v5, v0, s9
11836; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 2, v0
11837; GFX6-NEXT:    v_mul_lo_u32 v8, v0, s9
11838; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
11839; GFX6-NEXT:    v_add_i32_e32 v6, vcc, 1, v0
11840; GFX6-NEXT:    v_addc_u32_e32 v7, vcc, 0, v1, vcc
11841; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
11842; GFX6-NEXT:    v_mov_b32_e32 v5, s1
11843; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, s0, v8
11844; GFX6-NEXT:    v_subb_u32_e32 v4, vcc, v5, v4, vcc
11845; GFX6-NEXT:    v_subrev_i32_e32 v5, vcc, s9, v8
11846; GFX6-NEXT:    v_subbrev_u32_e32 v9, vcc, 0, v4, vcc
11847; GFX6-NEXT:    s_movk_i32 s0, 0xffe
11848; GFX6-NEXT:    v_cmp_lt_u32_e32 vcc, s0, v5
11849; GFX6-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
11850; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v9
11851; GFX6-NEXT:    v_cndmask_b32_e32 v5, -1, v5, vcc
11852; GFX6-NEXT:    v_cmp_lt_u32_e64 s[0:1], s0, v8
11853; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
11854; GFX6-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[0:1]
11855; GFX6-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v4
11856; GFX6-NEXT:    v_cndmask_b32_e64 v4, -1, v5, s[0:1]
11857; GFX6-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v4
11858; GFX6-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc
11859; GFX6-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
11860; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
11861; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
11862; GFX6-NEXT:    v_xor_b32_e32 v0, s8, v0
11863; GFX6-NEXT:    v_xor_b32_e32 v1, s8, v1
11864; GFX6-NEXT:    v_mov_b32_e32 v3, s8
11865; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s8, v0
11866; GFX6-NEXT:    v_subb_u32_e32 v3, vcc, v1, v3, vcc
11867; GFX6-NEXT:    v_mov_b32_e32 v0, s2
11868; GFX6-NEXT:    v_mov_b32_e32 v1, s3
11869; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
11870; GFX6-NEXT:    s_endpgm
11871;
11872; GFX9-LABEL: ssdiv_v2i64_mixed_pow2k_denom:
11873; GFX9:       ; %bb.0:
11874; GFX9-NEXT:    v_mov_b32_e32 v0, 0x457ff000
11875; GFX9-NEXT:    v_mov_b32_e32 v1, 0x4f800000
11876; GFX9-NEXT:    v_mac_f32_e32 v0, 0, v1
11877; GFX9-NEXT:    v_rcp_f32_e32 v0, v0
11878; GFX9-NEXT:    s_movk_i32 s8, 0xf001
11879; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
11880; GFX9-NEXT:    v_mov_b32_e32 v4, 0
11881; GFX9-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
11882; GFX9-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
11883; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
11884; GFX9-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
11885; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
11886; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
11887; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
11888; GFX9-NEXT:    s_ashr_i32 s2, s5, 31
11889; GFX9-NEXT:    s_lshr_b32 s2, s2, 20
11890; GFX9-NEXT:    v_mul_hi_u32 v2, v0, s8
11891; GFX9-NEXT:    v_mul_lo_u32 v3, v1, s8
11892; GFX9-NEXT:    v_mul_lo_u32 v5, v0, s8
11893; GFX9-NEXT:    s_add_u32 s4, s4, s2
11894; GFX9-NEXT:    s_addc_u32 s5, s5, 0
11895; GFX9-NEXT:    v_add_u32_e32 v2, v2, v3
11896; GFX9-NEXT:    v_sub_u32_e32 v2, v2, v0
11897; GFX9-NEXT:    v_mul_lo_u32 v3, v0, v2
11898; GFX9-NEXT:    v_mul_hi_u32 v6, v0, v5
11899; GFX9-NEXT:    v_mul_hi_u32 v7, v0, v2
11900; GFX9-NEXT:    v_mul_hi_u32 v8, v1, v2
11901; GFX9-NEXT:    v_mul_lo_u32 v2, v1, v2
11902; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v6, v3
11903; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, 0, v7, vcc
11904; GFX9-NEXT:    v_mul_lo_u32 v7, v1, v5
11905; GFX9-NEXT:    v_mul_hi_u32 v5, v1, v5
11906; GFX9-NEXT:    s_ashr_i64 s[4:5], s[4:5], 12
11907; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
11908; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v7
11909; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v6, v5, vcc
11910; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v8, v4, vcc
11911; GFX9-NEXT:    v_mov_b32_e32 v6, 0
11912; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
11913; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v6, v5, vcc
11914; GFX9-NEXT:    v_add_co_u32_e64 v0, s[2:3], v0, v2
11915; GFX9-NEXT:    v_addc_co_u32_e64 v2, vcc, v1, v3, s[2:3]
11916; GFX9-NEXT:    v_mul_lo_u32 v5, v2, s8
11917; GFX9-NEXT:    v_mul_hi_u32 v7, v0, s8
11918; GFX9-NEXT:    v_mul_lo_u32 v8, v0, s8
11919; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
11920; GFX9-NEXT:    v_add_u32_e32 v5, v7, v5
11921; GFX9-NEXT:    v_sub_u32_e32 v5, v5, v0
11922; GFX9-NEXT:    v_mul_lo_u32 v10, v0, v5
11923; GFX9-NEXT:    v_mul_hi_u32 v11, v0, v8
11924; GFX9-NEXT:    v_mul_hi_u32 v12, v0, v5
11925; GFX9-NEXT:    v_mul_hi_u32 v9, v2, v8
11926; GFX9-NEXT:    v_mul_lo_u32 v8, v2, v8
11927; GFX9-NEXT:    v_mul_hi_u32 v7, v2, v5
11928; GFX9-NEXT:    v_add_co_u32_e32 v10, vcc, v11, v10
11929; GFX9-NEXT:    v_addc_co_u32_e32 v11, vcc, 0, v12, vcc
11930; GFX9-NEXT:    v_mul_lo_u32 v2, v2, v5
11931; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, v10, v8
11932; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, v11, v9, vcc
11933; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v7, v4, vcc
11934; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v8, v2
11935; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v6, v5, vcc
11936; GFX9-NEXT:    v_addc_co_u32_e64 v1, vcc, v1, v5, s[2:3]
11937; GFX9-NEXT:    s_ashr_i32 s2, s7, 31
11938; GFX9-NEXT:    s_add_u32 s6, s6, s2
11939; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
11940; GFX9-NEXT:    s_mov_b32 s3, s2
11941; GFX9-NEXT:    s_addc_u32 s7, s7, s2
11942; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
11943; GFX9-NEXT:    s_xor_b64 s[6:7], s[6:7], s[2:3]
11944; GFX9-NEXT:    v_mul_lo_u32 v2, s6, v1
11945; GFX9-NEXT:    v_mul_hi_u32 v3, s6, v0
11946; GFX9-NEXT:    v_mul_hi_u32 v5, s6, v1
11947; GFX9-NEXT:    v_mul_hi_u32 v7, s7, v1
11948; GFX9-NEXT:    v_mul_lo_u32 v1, s7, v1
11949; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
11950; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v5, vcc
11951; GFX9-NEXT:    v_mul_lo_u32 v5, s7, v0
11952; GFX9-NEXT:    v_mul_hi_u32 v0, s7, v0
11953; GFX9-NEXT:    s_movk_i32 s3, 0xfff
11954; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v5
11955; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, v3, v0, vcc
11956; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, v7, v4, vcc
11957; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v1
11958; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v6, v2, vcc
11959; GFX9-NEXT:    v_mul_lo_u32 v2, v1, s3
11960; GFX9-NEXT:    v_mul_hi_u32 v3, v0, s3
11961; GFX9-NEXT:    v_mul_lo_u32 v5, v0, s3
11962; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
11963; GFX9-NEXT:    v_mov_b32_e32 v3, s7
11964; GFX9-NEXT:    v_sub_co_u32_e32 v5, vcc, s6, v5
11965; GFX9-NEXT:    v_subb_co_u32_e32 v2, vcc, v3, v2, vcc
11966; GFX9-NEXT:    v_subrev_co_u32_e32 v3, vcc, s3, v5
11967; GFX9-NEXT:    v_subbrev_co_u32_e32 v6, vcc, 0, v2, vcc
11968; GFX9-NEXT:    s_movk_i32 s3, 0xffe
11969; GFX9-NEXT:    v_cmp_lt_u32_e32 vcc, s3, v3
11970; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
11971; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v6
11972; GFX9-NEXT:    v_cndmask_b32_e32 v3, -1, v3, vcc
11973; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
11974; GFX9-NEXT:    v_cndmask_b32_e64 v3, 1, 2, vcc
11975; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v0, v3
11976; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, 0, v1, vcc
11977; GFX9-NEXT:    v_cmp_lt_u32_e32 vcc, s3, v5
11978; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
11979; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
11980; GFX9-NEXT:    v_cndmask_b32_e32 v2, -1, v5, vcc
11981; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
11982; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
11983; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
11984; GFX9-NEXT:    v_xor_b32_e32 v0, s2, v0
11985; GFX9-NEXT:    v_xor_b32_e32 v1, s2, v1
11986; GFX9-NEXT:    v_mov_b32_e32 v3, s2
11987; GFX9-NEXT:    v_subrev_co_u32_e32 v2, vcc, s2, v0
11988; GFX9-NEXT:    v_subb_co_u32_e32 v3, vcc, v1, v3, vcc
11989; GFX9-NEXT:    v_mov_b32_e32 v0, s4
11990; GFX9-NEXT:    v_mov_b32_e32 v1, s5
11991; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
11992; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
11993; GFX9-NEXT:    s_endpgm
11994;
11995; GFX90A-LABEL: ssdiv_v2i64_mixed_pow2k_denom:
11996; GFX90A:       ; %bb.0:
11997; GFX90A-NEXT:    v_mov_b32_e32 v0, 0x457ff000
11998; GFX90A-NEXT:    v_mov_b32_e32 v1, 0x4f800000
11999; GFX90A-NEXT:    v_mac_f32_e32 v0, 0, v1
12000; GFX90A-NEXT:    v_rcp_f32_e32 v0, v0
12001; GFX90A-NEXT:    s_movk_i32 s8, 0xf001
12002; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
12003; GFX90A-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
12004; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
12005; GFX90A-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
12006; GFX90A-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
12007; GFX90A-NEXT:    v_trunc_f32_e32 v1, v1
12008; GFX90A-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
12009; GFX90A-NEXT:    v_cvt_u32_f32_e32 v0, v0
12010; GFX90A-NEXT:    v_cvt_u32_f32_e32 v1, v1
12011; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
12012; GFX90A-NEXT:    s_ashr_i32 s0, s5, 31
12013; GFX90A-NEXT:    s_lshr_b32 s0, s0, 20
12014; GFX90A-NEXT:    v_mul_hi_u32 v2, v0, s8
12015; GFX90A-NEXT:    v_mul_lo_u32 v3, v1, s8
12016; GFX90A-NEXT:    v_add_u32_e32 v2, v2, v3
12017; GFX90A-NEXT:    v_sub_u32_e32 v2, v2, v0
12018; GFX90A-NEXT:    v_mul_lo_u32 v6, v0, s8
12019; GFX90A-NEXT:    v_mul_lo_u32 v5, v0, v2
12020; GFX90A-NEXT:    v_mul_hi_u32 v7, v0, v6
12021; GFX90A-NEXT:    v_mul_hi_u32 v3, v0, v2
12022; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v7, v5
12023; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
12024; GFX90A-NEXT:    v_mul_hi_u32 v8, v1, v6
12025; GFX90A-NEXT:    v_mul_lo_u32 v6, v1, v6
12026; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v5, v6
12027; GFX90A-NEXT:    v_mul_hi_u32 v7, v1, v2
12028; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, v3, v8, vcc
12029; GFX90A-NEXT:    s_add_u32 s0, s4, s0
12030; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v7, v4, vcc
12031; GFX90A-NEXT:    v_mul_lo_u32 v2, v1, v2
12032; GFX90A-NEXT:    s_addc_u32 s1, s5, 0
12033; GFX90A-NEXT:    v_mov_b32_e32 v6, 0
12034; GFX90A-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
12035; GFX90A-NEXT:    s_ashr_i64 s[4:5], s[0:1], 12
12036; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, v6, v5, vcc
12037; GFX90A-NEXT:    v_add_co_u32_e64 v0, s[0:1], v0, v2
12038; GFX90A-NEXT:    v_addc_co_u32_e64 v2, vcc, v1, v3, s[0:1]
12039; GFX90A-NEXT:    v_mul_lo_u32 v5, v2, s8
12040; GFX90A-NEXT:    v_mul_hi_u32 v7, v0, s8
12041; GFX90A-NEXT:    v_add_u32_e32 v5, v7, v5
12042; GFX90A-NEXT:    v_sub_u32_e32 v5, v5, v0
12043; GFX90A-NEXT:    v_mul_lo_u32 v8, v0, s8
12044; GFX90A-NEXT:    v_mul_hi_u32 v9, v2, v8
12045; GFX90A-NEXT:    v_mul_lo_u32 v10, v2, v8
12046; GFX90A-NEXT:    v_mul_lo_u32 v12, v0, v5
12047; GFX90A-NEXT:    v_mul_hi_u32 v8, v0, v8
12048; GFX90A-NEXT:    v_mul_hi_u32 v11, v0, v5
12049; GFX90A-NEXT:    v_add_co_u32_e32 v8, vcc, v8, v12
12050; GFX90A-NEXT:    v_addc_co_u32_e32 v11, vcc, 0, v11, vcc
12051; GFX90A-NEXT:    v_add_co_u32_e32 v8, vcc, v8, v10
12052; GFX90A-NEXT:    v_mul_hi_u32 v7, v2, v5
12053; GFX90A-NEXT:    v_addc_co_u32_e32 v8, vcc, v11, v9, vcc
12054; GFX90A-NEXT:    v_addc_co_u32_e32 v7, vcc, v7, v4, vcc
12055; GFX90A-NEXT:    v_mul_lo_u32 v2, v2, v5
12056; GFX90A-NEXT:    v_add_co_u32_e32 v2, vcc, v8, v2
12057; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v6, v7, vcc
12058; GFX90A-NEXT:    v_add_u32_e32 v1, v1, v3
12059; GFX90A-NEXT:    v_addc_co_u32_e64 v1, vcc, v1, v5, s[0:1]
12060; GFX90A-NEXT:    s_ashr_i32 s0, s7, 31
12061; GFX90A-NEXT:    s_add_u32 s6, s6, s0
12062; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
12063; GFX90A-NEXT:    s_mov_b32 s1, s0
12064; GFX90A-NEXT:    s_addc_u32 s7, s7, s0
12065; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
12066; GFX90A-NEXT:    s_xor_b64 s[6:7], s[6:7], s[0:1]
12067; GFX90A-NEXT:    v_mul_lo_u32 v3, s6, v1
12068; GFX90A-NEXT:    v_mul_hi_u32 v5, s6, v0
12069; GFX90A-NEXT:    v_mul_hi_u32 v2, s6, v1
12070; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, v5, v3
12071; GFX90A-NEXT:    v_addc_co_u32_e32 v2, vcc, 0, v2, vcc
12072; GFX90A-NEXT:    v_mul_hi_u32 v7, s7, v0
12073; GFX90A-NEXT:    v_mul_lo_u32 v0, s7, v0
12074; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v3, v0
12075; GFX90A-NEXT:    v_mul_hi_u32 v5, s7, v1
12076; GFX90A-NEXT:    v_addc_co_u32_e32 v0, vcc, v2, v7, vcc
12077; GFX90A-NEXT:    v_addc_co_u32_e32 v2, vcc, v5, v4, vcc
12078; GFX90A-NEXT:    v_mul_lo_u32 v1, s7, v1
12079; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v1
12080; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, v6, v2, vcc
12081; GFX90A-NEXT:    s_movk_i32 s1, 0xfff
12082; GFX90A-NEXT:    v_mul_lo_u32 v2, v1, s1
12083; GFX90A-NEXT:    v_mul_hi_u32 v3, v0, s1
12084; GFX90A-NEXT:    v_add_u32_e32 v2, v3, v2
12085; GFX90A-NEXT:    v_mul_lo_u32 v3, v0, s1
12086; GFX90A-NEXT:    v_mov_b32_e32 v5, s7
12087; GFX90A-NEXT:    v_sub_co_u32_e32 v3, vcc, s6, v3
12088; GFX90A-NEXT:    v_subb_co_u32_e32 v2, vcc, v5, v2, vcc
12089; GFX90A-NEXT:    v_subrev_co_u32_e32 v5, vcc, s1, v3
12090; GFX90A-NEXT:    v_subbrev_co_u32_e32 v6, vcc, 0, v2, vcc
12091; GFX90A-NEXT:    s_movk_i32 s1, 0xffe
12092; GFX90A-NEXT:    v_cmp_lt_u32_e32 vcc, s1, v5
12093; GFX90A-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
12094; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v6
12095; GFX90A-NEXT:    v_cndmask_b32_e32 v5, -1, v5, vcc
12096; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
12097; GFX90A-NEXT:    v_cndmask_b32_e64 v5, 1, 2, vcc
12098; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v0, v5
12099; GFX90A-NEXT:    v_addc_co_u32_e32 v6, vcc, 0, v1, vcc
12100; GFX90A-NEXT:    v_cmp_lt_u32_e32 vcc, s1, v3
12101; GFX90A-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
12102; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
12103; GFX90A-NEXT:    v_cndmask_b32_e32 v2, -1, v3, vcc
12104; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
12105; GFX90A-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
12106; GFX90A-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
12107; GFX90A-NEXT:    v_xor_b32_e32 v0, s0, v0
12108; GFX90A-NEXT:    v_xor_b32_e32 v1, s0, v1
12109; GFX90A-NEXT:    v_mov_b32_e32 v3, s0
12110; GFX90A-NEXT:    v_subrev_co_u32_e32 v2, vcc, s0, v0
12111; GFX90A-NEXT:    v_subb_co_u32_e32 v3, vcc, v1, v3, vcc
12112; GFX90A-NEXT:    v_mov_b32_e32 v0, s4
12113; GFX90A-NEXT:    v_mov_b32_e32 v1, s5
12114; GFX90A-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
12115; GFX90A-NEXT:    s_endpgm
; Input IR under test: one vector sdiv whose lanes divide by 4096 and 4095.
12116  %r = sdiv <2 x i64> %x, <i64 4096, i64 4095>
12117  store <2 x i64> %r, <2 x i64> addrspace(1)* %out
12118  ret void
12119}
12120
12121define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x, <2 x i64> %y) {
12122; CHECK-LABEL: @sdiv_v2i64_pow2_shl_denom(
12123; CHECK-NEXT:    [[SHL_Y:%.*]] = shl <2 x i64> <i64 4096, i64 4096>, [[Y:%.*]]
12124; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0
12125; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 0
12126; CHECK-NEXT:    [[TMP3:%.*]] = sdiv i64 [[TMP1]], [[TMP2]]
12127; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x i64> undef, i64 [[TMP3]], i64 0
12128; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[X]], i64 1
12129; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 1
12130; CHECK-NEXT:    [[TMP7:%.*]] = sdiv i64 [[TMP5]], [[TMP6]]
12131; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[TMP7]], i64 1
12132; CHECK-NEXT:    store <2 x i64> [[TMP8]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16
12133; CHECK-NEXT:    ret void
12134;
12135; GFX6-LABEL: sdiv_v2i64_pow2_shl_denom:
12136; GFX6:       ; %bb.0:
12137; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x11
12138; GFX6-NEXT:    s_mov_b64 s[2:3], 0x1000
12139; GFX6-NEXT:    s_mov_b32 s18, 0x4f800000
12140; GFX6-NEXT:    s_mov_b32 s19, 0x5f7ffffc
12141; GFX6-NEXT:    s_mov_b32 s20, 0x2f800000
12142; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
12143; GFX6-NEXT:    s_lshl_b64 s[12:13], s[2:3], s6
12144; GFX6-NEXT:    s_lshl_b64 s[2:3], s[2:3], s4
12145; GFX6-NEXT:    s_ashr_i32 s16, s3, 31
12146; GFX6-NEXT:    s_add_u32 s2, s2, s16
12147; GFX6-NEXT:    s_mov_b32 s17, s16
12148; GFX6-NEXT:    s_addc_u32 s3, s3, s16
12149; GFX6-NEXT:    s_xor_b64 s[14:15], s[2:3], s[16:17]
12150; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s14
12151; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s15
12152; GFX6-NEXT:    s_mov_b32 s21, 0xcf800000
12153; GFX6-NEXT:    s_sub_u32 s6, 0, s14
12154; GFX6-NEXT:    s_subb_u32 s7, 0, s15
12155; GFX6-NEXT:    v_mac_f32_e32 v0, s18, v1
12156; GFX6-NEXT:    v_rcp_f32_e32 v0, v0
12157; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
12158; GFX6-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0xd
12159; GFX6-NEXT:    v_mul_f32_e32 v0, s19, v0
12160; GFX6-NEXT:    v_mul_f32_e32 v1, s20, v0
12161; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
12162; GFX6-NEXT:    v_mac_f32_e32 v0, s21, v1
12163; GFX6-NEXT:    v_cvt_u32_f32_e32 v2, v1
12164; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v0
12165; GFX6-NEXT:    v_mul_lo_u32 v0, s6, v2
12166; GFX6-NEXT:    v_mul_hi_u32 v1, s6, v3
12167; GFX6-NEXT:    v_mul_lo_u32 v4, s7, v3
12168; GFX6-NEXT:    v_mul_lo_u32 v5, s6, v3
12169; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
12170; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v0, v4
12171; GFX6-NEXT:    v_mul_lo_u32 v0, v3, v1
12172; GFX6-NEXT:    v_mul_hi_u32 v4, v3, v5
12173; GFX6-NEXT:    v_mul_hi_u32 v6, v3, v1
12174; GFX6-NEXT:    v_mul_hi_u32 v7, v2, v1
12175; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v4, v0
12176; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, 0, v6, vcc
12177; GFX6-NEXT:    v_mul_lo_u32 v6, v2, v5
12178; GFX6-NEXT:    v_mul_hi_u32 v5, v2, v5
12179; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v6
12180; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, v4, v5, vcc
12181; GFX6-NEXT:    v_mul_lo_u32 v5, v2, v1
12182; GFX6-NEXT:    v_mov_b32_e32 v0, 0
12183; GFX6-NEXT:    v_addc_u32_e32 v6, vcc, v7, v0, vcc
12184; GFX6-NEXT:    v_mov_b32_e32 v1, 0
12185; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
12186; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v1, v6, vcc
12187; GFX6-NEXT:    v_add_i32_e64 v3, s[2:3], v3, v4
12188; GFX6-NEXT:    v_addc_u32_e64 v4, vcc, v2, v5, s[2:3]
12189; GFX6-NEXT:    v_mul_lo_u32 v6, s6, v4
12190; GFX6-NEXT:    v_mul_hi_u32 v7, s6, v3
12191; GFX6-NEXT:    v_mul_lo_u32 v8, s7, v3
12192; GFX6-NEXT:    s_mov_b32 s7, 0xf000
12193; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
12194; GFX6-NEXT:    v_mul_lo_u32 v7, s6, v3
12195; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v8, v6
12196; GFX6-NEXT:    v_mul_lo_u32 v10, v3, v6
12197; GFX6-NEXT:    v_mul_hi_u32 v11, v3, v7
12198; GFX6-NEXT:    v_mul_hi_u32 v12, v3, v6
12199; GFX6-NEXT:    v_mul_hi_u32 v9, v4, v7
12200; GFX6-NEXT:    v_mul_lo_u32 v7, v4, v7
12201; GFX6-NEXT:    v_mul_hi_u32 v8, v4, v6
12202; GFX6-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
12203; GFX6-NEXT:    v_addc_u32_e32 v11, vcc, 0, v12, vcc
12204; GFX6-NEXT:    v_mul_lo_u32 v4, v4, v6
12205; GFX6-NEXT:    v_add_i32_e32 v7, vcc, v10, v7
12206; GFX6-NEXT:    v_addc_u32_e32 v7, vcc, v11, v9, vcc
12207; GFX6-NEXT:    v_addc_u32_e32 v6, vcc, v8, v0, vcc
12208; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v7, v4
12209; GFX6-NEXT:    v_addc_u32_e32 v6, vcc, v1, v6, vcc
12210; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
12211; GFX6-NEXT:    v_addc_u32_e64 v2, vcc, v2, v6, s[2:3]
12212; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
12213; GFX6-NEXT:    s_ashr_i32 s2, s9, 31
12214; GFX6-NEXT:    s_add_u32 s0, s8, s2
12215; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
12216; GFX6-NEXT:    s_mov_b32 s3, s2
12217; GFX6-NEXT:    s_addc_u32 s1, s9, s2
12218; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
12219; GFX6-NEXT:    s_xor_b64 s[8:9], s[0:1], s[2:3]
12220; GFX6-NEXT:    v_mul_lo_u32 v4, s8, v2
12221; GFX6-NEXT:    v_mul_hi_u32 v5, s8, v3
12222; GFX6-NEXT:    v_mul_hi_u32 v6, s8, v2
12223; GFX6-NEXT:    v_mul_hi_u32 v7, s9, v2
12224; GFX6-NEXT:    v_mul_lo_u32 v2, s9, v2
12225; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
12226; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, 0, v6, vcc
12227; GFX6-NEXT:    v_mul_lo_u32 v6, s9, v3
12228; GFX6-NEXT:    v_mul_hi_u32 v3, s9, v3
12229; GFX6-NEXT:    s_xor_b64 s[16:17], s[2:3], s[16:17]
12230; GFX6-NEXT:    s_ashr_i32 s2, s13, 31
12231; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
12232; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v5, v3, vcc
12233; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, v7, v0, vcc
12234; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
12235; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v1, v4, vcc
12236; GFX6-NEXT:    v_mul_lo_u32 v4, s14, v3
12237; GFX6-NEXT:    v_mul_hi_u32 v5, s14, v2
12238; GFX6-NEXT:    v_mul_lo_u32 v6, s15, v2
12239; GFX6-NEXT:    v_mov_b32_e32 v7, s15
12240; GFX6-NEXT:    s_mov_b32 s3, s2
12241; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
12242; GFX6-NEXT:    v_mul_lo_u32 v5, s14, v2
12243; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
12244; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, s9, v4
12245; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, s8, v5
12246; GFX6-NEXT:    v_subb_u32_e64 v6, s[0:1], v6, v7, vcc
12247; GFX6-NEXT:    v_subrev_i32_e64 v7, s[0:1], s14, v5
12248; GFX6-NEXT:    v_subbrev_u32_e64 v6, s[0:1], 0, v6, s[0:1]
12249; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s15, v6
12250; GFX6-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[0:1]
12251; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s14, v7
12252; GFX6-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[0:1]
12253; GFX6-NEXT:    v_cmp_eq_u32_e64 s[0:1], s15, v6
12254; GFX6-NEXT:    v_cndmask_b32_e64 v6, v8, v7, s[0:1]
12255; GFX6-NEXT:    v_add_i32_e64 v7, s[0:1], 2, v2
12256; GFX6-NEXT:    v_addc_u32_e64 v8, s[0:1], 0, v3, s[0:1]
12257; GFX6-NEXT:    v_add_i32_e64 v9, s[0:1], 1, v2
12258; GFX6-NEXT:    v_addc_u32_e64 v10, s[0:1], 0, v3, s[0:1]
12259; GFX6-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v6
12260; GFX6-NEXT:    s_add_u32 s8, s12, s2
12261; GFX6-NEXT:    v_cndmask_b32_e64 v6, v10, v8, s[0:1]
12262; GFX6-NEXT:    v_mov_b32_e32 v8, s9
12263; GFX6-NEXT:    s_addc_u32 s9, s13, s2
12264; GFX6-NEXT:    s_xor_b64 s[8:9], s[8:9], s[2:3]
12265; GFX6-NEXT:    v_cvt_f32_u32_e32 v10, s8
12266; GFX6-NEXT:    v_cvt_f32_u32_e32 v11, s9
12267; GFX6-NEXT:    v_subb_u32_e32 v4, vcc, v8, v4, vcc
12268; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s15, v4
12269; GFX6-NEXT:    v_cndmask_b32_e64 v8, 0, -1, vcc
12270; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s14, v5
12271; GFX6-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
12272; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, s15, v4
12273; GFX6-NEXT:    v_mac_f32_e32 v10, s18, v11
12274; GFX6-NEXT:    v_cndmask_b32_e32 v4, v8, v5, vcc
12275; GFX6-NEXT:    v_rcp_f32_e32 v5, v10
12276; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
12277; GFX6-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
12278; GFX6-NEXT:    v_cndmask_b32_e64 v4, v9, v7, s[0:1]
12279; GFX6-NEXT:    v_mul_f32_e32 v5, s19, v5
12280; GFX6-NEXT:    v_mul_f32_e32 v6, s20, v5
12281; GFX6-NEXT:    v_trunc_f32_e32 v6, v6
12282; GFX6-NEXT:    v_mac_f32_e32 v5, s21, v6
12283; GFX6-NEXT:    v_cvt_u32_f32_e32 v5, v5
12284; GFX6-NEXT:    v_cvt_u32_f32_e32 v6, v6
12285; GFX6-NEXT:    s_sub_u32 s12, 0, s8
12286; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
12287; GFX6-NEXT:    v_mul_hi_u32 v4, s12, v5
12288; GFX6-NEXT:    v_mul_lo_u32 v7, s12, v6
12289; GFX6-NEXT:    s_subb_u32 s13, 0, s9
12290; GFX6-NEXT:    v_mul_lo_u32 v8, s13, v5
12291; GFX6-NEXT:    v_xor_b32_e32 v2, s16, v2
12292; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v7
12293; GFX6-NEXT:    v_mul_lo_u32 v7, s12, v5
12294; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v8
12295; GFX6-NEXT:    v_mul_lo_u32 v8, v5, v4
12296; GFX6-NEXT:    v_mul_hi_u32 v9, v5, v7
12297; GFX6-NEXT:    v_mul_hi_u32 v10, v5, v4
12298; GFX6-NEXT:    v_mul_hi_u32 v11, v6, v4
12299; GFX6-NEXT:    v_mul_lo_u32 v4, v6, v4
12300; GFX6-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
12301; GFX6-NEXT:    v_addc_u32_e32 v9, vcc, 0, v10, vcc
12302; GFX6-NEXT:    v_mul_lo_u32 v10, v6, v7
12303; GFX6-NEXT:    v_mul_hi_u32 v7, v6, v7
12304; GFX6-NEXT:    v_xor_b32_e32 v3, s17, v3
12305; GFX6-NEXT:    s_mov_b32 s6, -1
12306; GFX6-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
12307; GFX6-NEXT:    v_addc_u32_e32 v7, vcc, v9, v7, vcc
12308; GFX6-NEXT:    v_addc_u32_e32 v8, vcc, v11, v0, vcc
12309; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v7, v4
12310; GFX6-NEXT:    v_addc_u32_e32 v7, vcc, v1, v8, vcc
12311; GFX6-NEXT:    v_add_i32_e64 v4, s[0:1], v5, v4
12312; GFX6-NEXT:    v_addc_u32_e64 v5, vcc, v6, v7, s[0:1]
12313; GFX6-NEXT:    v_mul_lo_u32 v8, s12, v5
12314; GFX6-NEXT:    v_mul_hi_u32 v9, s12, v4
12315; GFX6-NEXT:    v_mul_lo_u32 v10, s13, v4
12316; GFX6-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
12317; GFX6-NEXT:    v_mul_lo_u32 v9, s12, v4
12318; GFX6-NEXT:    v_add_i32_e32 v8, vcc, v10, v8
12319; GFX6-NEXT:    v_mul_lo_u32 v12, v4, v8
12320; GFX6-NEXT:    v_mul_hi_u32 v13, v4, v9
12321; GFX6-NEXT:    v_mul_hi_u32 v14, v4, v8
12322; GFX6-NEXT:    v_mul_hi_u32 v11, v5, v9
12323; GFX6-NEXT:    v_mul_lo_u32 v9, v5, v9
12324; GFX6-NEXT:    v_mul_hi_u32 v10, v5, v8
12325; GFX6-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
12326; GFX6-NEXT:    v_addc_u32_e32 v13, vcc, 0, v14, vcc
12327; GFX6-NEXT:    v_mul_lo_u32 v5, v5, v8
12328; GFX6-NEXT:    v_add_i32_e32 v9, vcc, v12, v9
12329; GFX6-NEXT:    v_addc_u32_e32 v9, vcc, v13, v11, vcc
12330; GFX6-NEXT:    v_addc_u32_e32 v8, vcc, v10, v0, vcc
12331; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v9, v5
12332; GFX6-NEXT:    v_addc_u32_e32 v8, vcc, v1, v8, vcc
12333; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v6, v7
12334; GFX6-NEXT:    s_ashr_i32 s12, s11, 31
12335; GFX6-NEXT:    v_addc_u32_e64 v6, vcc, v6, v8, s[0:1]
12336; GFX6-NEXT:    s_add_u32 s0, s10, s12
12337; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
12338; GFX6-NEXT:    s_mov_b32 s13, s12
12339; GFX6-NEXT:    s_addc_u32 s1, s11, s12
12340; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, 0, v6, vcc
12341; GFX6-NEXT:    s_xor_b64 s[10:11], s[0:1], s[12:13]
12342; GFX6-NEXT:    v_mul_lo_u32 v6, s10, v5
12343; GFX6-NEXT:    v_mul_hi_u32 v7, s10, v4
12344; GFX6-NEXT:    v_mul_hi_u32 v9, s10, v5
12345; GFX6-NEXT:    v_mul_hi_u32 v10, s11, v5
12346; GFX6-NEXT:    v_mul_lo_u32 v5, s11, v5
12347; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
12348; GFX6-NEXT:    v_addc_u32_e32 v7, vcc, 0, v9, vcc
12349; GFX6-NEXT:    v_mul_lo_u32 v9, s11, v4
12350; GFX6-NEXT:    v_mul_hi_u32 v4, s11, v4
12351; GFX6-NEXT:    v_mov_b32_e32 v8, s17
12352; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v6, v9
12353; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, v7, v4, vcc
12354; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, v10, v0, vcc
12355; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
12356; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v1, v0, vcc
12357; GFX6-NEXT:    v_mul_lo_u32 v6, s8, v5
12358; GFX6-NEXT:    v_mul_hi_u32 v7, s8, v4
12359; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s16, v2
12360; GFX6-NEXT:    v_mul_lo_u32 v2, s9, v4
12361; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v3, v8, vcc
12362; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v7, v6
12363; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
12364; GFX6-NEXT:    v_mul_lo_u32 v3, s8, v4
12365; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, s11, v2
12366; GFX6-NEXT:    v_mov_b32_e32 v7, s9
12367; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s10, v3
12368; GFX6-NEXT:    v_subb_u32_e64 v6, s[0:1], v6, v7, vcc
12369; GFX6-NEXT:    v_subrev_i32_e64 v7, s[0:1], s8, v3
12370; GFX6-NEXT:    v_subbrev_u32_e64 v6, s[0:1], 0, v6, s[0:1]
12371; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s9, v6
12372; GFX6-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[0:1]
12373; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s8, v7
12374; GFX6-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[0:1]
12375; GFX6-NEXT:    v_cmp_eq_u32_e64 s[0:1], s9, v6
12376; GFX6-NEXT:    v_cndmask_b32_e64 v6, v8, v7, s[0:1]
12377; GFX6-NEXT:    v_add_i32_e64 v7, s[0:1], 2, v4
12378; GFX6-NEXT:    v_addc_u32_e64 v8, s[0:1], 0, v5, s[0:1]
12379; GFX6-NEXT:    v_add_i32_e64 v9, s[0:1], 1, v4
12380; GFX6-NEXT:    v_addc_u32_e64 v10, s[0:1], 0, v5, s[0:1]
12381; GFX6-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v6
12382; GFX6-NEXT:    v_cndmask_b32_e64 v6, v10, v8, s[0:1]
12383; GFX6-NEXT:    v_mov_b32_e32 v8, s11
12384; GFX6-NEXT:    v_subb_u32_e32 v2, vcc, v8, v2, vcc
12385; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s9, v2
12386; GFX6-NEXT:    v_cndmask_b32_e64 v8, 0, -1, vcc
12387; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s8, v3
12388; GFX6-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
12389; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, s9, v2
12390; GFX6-NEXT:    v_cndmask_b32_e32 v2, v8, v3, vcc
12391; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
12392; GFX6-NEXT:    v_cndmask_b32_e64 v3, v9, v7, s[0:1]
12393; GFX6-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
12394; GFX6-NEXT:    s_xor_b64 s[0:1], s[12:13], s[2:3]
12395; GFX6-NEXT:    v_cndmask_b32_e32 v2, v5, v6, vcc
12396; GFX6-NEXT:    v_xor_b32_e32 v3, s0, v3
12397; GFX6-NEXT:    v_xor_b32_e32 v4, s1, v2
12398; GFX6-NEXT:    v_mov_b32_e32 v5, s1
12399; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s0, v3
12400; GFX6-NEXT:    v_subb_u32_e32 v3, vcc, v4, v5, vcc
12401; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
12402; GFX6-NEXT:    s_endpgm
12403;
12404; GFX9-LABEL: sdiv_v2i64_pow2_shl_denom:
12405; GFX9:       ; %bb.0:
12406; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x44
12407; GFX9-NEXT:    s_mov_b64 s[2:3], 0x1000
12408; GFX9-NEXT:    s_mov_b32 s16, 0x4f800000
12409; GFX9-NEXT:    s_mov_b32 s17, 0x5f7ffffc
12410; GFX9-NEXT:    s_mov_b32 s18, 0x2f800000
12411; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
12412; GFX9-NEXT:    s_lshl_b64 s[8:9], s[2:3], s6
12413; GFX9-NEXT:    s_lshl_b64 s[2:3], s[2:3], s4
12414; GFX9-NEXT:    s_ashr_i32 s12, s3, 31
12415; GFX9-NEXT:    s_add_u32 s2, s2, s12
12416; GFX9-NEXT:    s_mov_b32 s13, s12
12417; GFX9-NEXT:    s_addc_u32 s3, s3, s12
12418; GFX9-NEXT:    s_xor_b64 s[10:11], s[2:3], s[12:13]
12419; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s10
12420; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s11
12421; GFX9-NEXT:    s_mov_b32 s19, 0xcf800000
12422; GFX9-NEXT:    s_sub_u32 s14, 0, s10
12423; GFX9-NEXT:    s_subb_u32 s4, 0, s11
12424; GFX9-NEXT:    v_mac_f32_e32 v0, s16, v1
12425; GFX9-NEXT:    v_rcp_f32_e32 v0, v0
12426; GFX9-NEXT:    v_mul_f32_e32 v0, s17, v0
12427; GFX9-NEXT:    v_mul_f32_e32 v1, s18, v0
12428; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
12429; GFX9-NEXT:    v_mac_f32_e32 v0, s19, v1
12430; GFX9-NEXT:    v_cvt_u32_f32_e32 v2, v1
12431; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v0
12432; GFX9-NEXT:    v_mul_lo_u32 v0, s14, v2
12433; GFX9-NEXT:    v_mul_hi_u32 v1, s14, v3
12434; GFX9-NEXT:    v_mul_lo_u32 v5, s4, v3
12435; GFX9-NEXT:    v_mul_lo_u32 v4, s14, v3
12436; GFX9-NEXT:    v_add_u32_e32 v0, v1, v0
12437; GFX9-NEXT:    v_add_u32_e32 v5, v0, v5
12438; GFX9-NEXT:    v_mul_hi_u32 v1, v3, v4
12439; GFX9-NEXT:    v_mul_lo_u32 v6, v3, v5
12440; GFX9-NEXT:    v_mul_hi_u32 v7, v3, v5
12441; GFX9-NEXT:    v_mul_hi_u32 v8, v2, v5
12442; GFX9-NEXT:    v_mul_lo_u32 v5, v2, v5
12443; GFX9-NEXT:    v_add_co_u32_e32 v1, vcc, v1, v6
12444; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, 0, v7, vcc
12445; GFX9-NEXT:    v_mul_lo_u32 v7, v2, v4
12446; GFX9-NEXT:    v_mul_hi_u32 v4, v2, v4
12447; GFX9-NEXT:    v_mov_b32_e32 v0, 0
12448; GFX9-NEXT:    v_add_co_u32_e32 v1, vcc, v1, v7
12449; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v6, v4, vcc
12450; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, v8, v0, vcc
12451; GFX9-NEXT:    v_mov_b32_e32 v1, 0
12452; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v5
12453; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v1, v6, vcc
12454; GFX9-NEXT:    v_add_co_u32_e64 v3, s[2:3], v3, v4
12455; GFX9-NEXT:    v_addc_co_u32_e64 v4, vcc, v2, v5, s[2:3]
12456; GFX9-NEXT:    v_mul_lo_u32 v6, s14, v4
12457; GFX9-NEXT:    v_mul_hi_u32 v7, s14, v3
12458; GFX9-NEXT:    v_mul_lo_u32 v8, s4, v3
12459; GFX9-NEXT:    v_mul_lo_u32 v9, s14, v3
12460; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
12461; GFX9-NEXT:    v_add_u32_e32 v6, v7, v6
12462; GFX9-NEXT:    v_add_u32_e32 v6, v6, v8
12463; GFX9-NEXT:    v_mul_lo_u32 v10, v3, v6
12464; GFX9-NEXT:    v_mul_hi_u32 v11, v3, v9
12465; GFX9-NEXT:    v_mul_hi_u32 v12, v3, v6
12466; GFX9-NEXT:    v_mul_hi_u32 v8, v4, v9
12467; GFX9-NEXT:    v_mul_lo_u32 v9, v4, v9
12468; GFX9-NEXT:    v_mul_hi_u32 v7, v4, v6
12469; GFX9-NEXT:    v_add_co_u32_e32 v10, vcc, v11, v10
12470; GFX9-NEXT:    v_addc_co_u32_e32 v11, vcc, 0, v12, vcc
12471; GFX9-NEXT:    v_mul_lo_u32 v4, v4, v6
12472; GFX9-NEXT:    v_add_co_u32_e32 v9, vcc, v10, v9
12473; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, v11, v8, vcc
12474; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, v7, v0, vcc
12475; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v8, v4
12476; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, v1, v6, vcc
12477; GFX9-NEXT:    v_add_u32_e32 v2, v2, v5
12478; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
12479; GFX9-NEXT:    s_ashr_i32 s14, s5, 31
12480; GFX9-NEXT:    v_addc_co_u32_e64 v2, vcc, v2, v6, s[2:3]
12481; GFX9-NEXT:    s_add_u32 s2, s4, s14
12482; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v4
12483; GFX9-NEXT:    s_mov_b32 s15, s14
12484; GFX9-NEXT:    s_addc_u32 s3, s5, s14
12485; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, 0, v2, vcc
12486; GFX9-NEXT:    s_xor_b64 s[4:5], s[2:3], s[14:15]
12487; GFX9-NEXT:    v_mul_lo_u32 v4, s4, v2
12488; GFX9-NEXT:    v_mul_hi_u32 v5, s4, v3
12489; GFX9-NEXT:    v_mul_hi_u32 v6, s4, v2
12490; GFX9-NEXT:    v_mul_hi_u32 v7, s5, v2
12491; GFX9-NEXT:    v_mul_lo_u32 v2, s5, v2
12492; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v5, v4
12493; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v6, vcc
12494; GFX9-NEXT:    v_mul_lo_u32 v6, s5, v3
12495; GFX9-NEXT:    v_mul_hi_u32 v3, s5, v3
12496; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
12497; GFX9-NEXT:    s_xor_b64 s[12:13], s[14:15], s[12:13]
12498; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v6
12499; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v5, v3, vcc
12500; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v7, v0, vcc
12501; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
12502; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v1, v4, vcc
12503; GFX9-NEXT:    v_mul_lo_u32 v4, s10, v3
12504; GFX9-NEXT:    v_mul_hi_u32 v5, s10, v2
12505; GFX9-NEXT:    v_mul_lo_u32 v6, s11, v2
12506; GFX9-NEXT:    v_mov_b32_e32 v7, s11
12507; GFX9-NEXT:    v_add_u32_e32 v4, v5, v4
12508; GFX9-NEXT:    v_mul_lo_u32 v5, s10, v2
12509; GFX9-NEXT:    v_add_u32_e32 v4, v4, v6
12510; GFX9-NEXT:    v_sub_u32_e32 v6, s5, v4
12511; GFX9-NEXT:    v_sub_co_u32_e32 v5, vcc, s4, v5
12512; GFX9-NEXT:    v_subb_co_u32_e64 v6, s[0:1], v6, v7, vcc
12513; GFX9-NEXT:    v_subrev_co_u32_e64 v7, s[0:1], s10, v5
12514; GFX9-NEXT:    v_subbrev_co_u32_e64 v6, s[0:1], 0, v6, s[0:1]
12515; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s11, v6
12516; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[0:1]
12517; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s10, v7
12518; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[0:1]
12519; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s11, v6
12520; GFX9-NEXT:    v_cndmask_b32_e64 v6, v8, v7, s[0:1]
12521; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v6
12522; GFX9-NEXT:    v_cndmask_b32_e64 v6, 1, 2, s[0:1]
12523; GFX9-NEXT:    v_add_co_u32_e64 v6, s[0:1], v2, v6
12524; GFX9-NEXT:    v_addc_co_u32_e64 v7, s[0:1], 0, v3, s[0:1]
12525; GFX9-NEXT:    s_ashr_i32 s4, s9, 31
12526; GFX9-NEXT:    s_add_u32 s0, s8, s4
12527; GFX9-NEXT:    v_mov_b32_e32 v8, s5
12528; GFX9-NEXT:    s_mov_b32 s5, s4
12529; GFX9-NEXT:    s_addc_u32 s1, s9, s4
12530; GFX9-NEXT:    s_xor_b64 s[8:9], s[0:1], s[4:5]
12531; GFX9-NEXT:    v_subb_co_u32_e32 v4, vcc, v8, v4, vcc
12532; GFX9-NEXT:    v_cvt_f32_u32_e32 v8, s8
12533; GFX9-NEXT:    v_cvt_f32_u32_e32 v9, s9
12534; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s11, v4
12535; GFX9-NEXT:    v_cndmask_b32_e64 v10, 0, -1, vcc
12536; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s10, v5
12537; GFX9-NEXT:    v_mac_f32_e32 v8, s16, v9
12538; GFX9-NEXT:    v_rcp_f32_e32 v8, v8
12539; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
12540; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, s11, v4
12541; GFX9-NEXT:    v_cndmask_b32_e32 v4, v10, v5, vcc
12542; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
12543; GFX9-NEXT:    v_mul_f32_e32 v4, s17, v8
12544; GFX9-NEXT:    v_mul_f32_e32 v5, s18, v4
12545; GFX9-NEXT:    v_trunc_f32_e32 v5, v5
12546; GFX9-NEXT:    v_mac_f32_e32 v4, s19, v5
12547; GFX9-NEXT:    v_cvt_u32_f32_e32 v4, v4
12548; GFX9-NEXT:    v_cvt_u32_f32_e32 v5, v5
12549; GFX9-NEXT:    s_sub_u32 s10, 0, s8
12550; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
12551; GFX9-NEXT:    s_subb_u32 s11, 0, s9
12552; GFX9-NEXT:    v_mul_hi_u32 v6, s10, v4
12553; GFX9-NEXT:    v_mul_lo_u32 v8, s10, v5
12554; GFX9-NEXT:    v_mul_lo_u32 v9, s11, v4
12555; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc
12556; GFX9-NEXT:    v_mul_lo_u32 v7, s10, v4
12557; GFX9-NEXT:    v_add_u32_e32 v6, v6, v8
12558; GFX9-NEXT:    v_add_u32_e32 v6, v6, v9
12559; GFX9-NEXT:    v_mul_lo_u32 v8, v4, v6
12560; GFX9-NEXT:    v_mul_hi_u32 v9, v4, v7
12561; GFX9-NEXT:    v_mul_hi_u32 v10, v4, v6
12562; GFX9-NEXT:    v_mul_hi_u32 v11, v5, v6
12563; GFX9-NEXT:    v_mul_lo_u32 v6, v5, v6
12564; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, v9, v8
12565; GFX9-NEXT:    v_addc_co_u32_e32 v9, vcc, 0, v10, vcc
12566; GFX9-NEXT:    v_mul_lo_u32 v10, v5, v7
12567; GFX9-NEXT:    v_mul_hi_u32 v7, v5, v7
12568; GFX9-NEXT:    v_xor_b32_e32 v2, s12, v2
12569; GFX9-NEXT:    v_xor_b32_e32 v3, s13, v3
12570; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, v8, v10
12571; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, v9, v7, vcc
12572; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, v11, v0, vcc
12573; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, v7, v6
12574; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, v1, v8, vcc
12575; GFX9-NEXT:    v_add_co_u32_e64 v4, s[0:1], v4, v6
12576; GFX9-NEXT:    v_addc_co_u32_e64 v6, vcc, v5, v7, s[0:1]
12577; GFX9-NEXT:    v_mul_lo_u32 v8, s10, v6
12578; GFX9-NEXT:    v_mul_hi_u32 v9, s10, v4
12579; GFX9-NEXT:    v_mul_lo_u32 v10, s11, v4
12580; GFX9-NEXT:    v_mul_lo_u32 v11, s10, v4
12581; GFX9-NEXT:    v_add_u32_e32 v5, v5, v7
12582; GFX9-NEXT:    v_add_u32_e32 v8, v9, v8
12583; GFX9-NEXT:    v_add_u32_e32 v8, v8, v10
12584; GFX9-NEXT:    v_mul_lo_u32 v12, v4, v8
12585; GFX9-NEXT:    v_mul_hi_u32 v13, v4, v11
12586; GFX9-NEXT:    v_mul_hi_u32 v14, v4, v8
12587; GFX9-NEXT:    v_mul_hi_u32 v10, v6, v11
12588; GFX9-NEXT:    v_mul_lo_u32 v11, v6, v11
12589; GFX9-NEXT:    v_mul_hi_u32 v9, v6, v8
12590; GFX9-NEXT:    v_add_co_u32_e32 v12, vcc, v13, v12
12591; GFX9-NEXT:    v_addc_co_u32_e32 v13, vcc, 0, v14, vcc
12592; GFX9-NEXT:    v_mul_lo_u32 v6, v6, v8
12593; GFX9-NEXT:    v_add_co_u32_e32 v11, vcc, v12, v11
12594; GFX9-NEXT:    v_addc_co_u32_e32 v10, vcc, v13, v10, vcc
12595; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, v9, v0, vcc
12596; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, v10, v6
12597; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, v1, v8, vcc
12598; GFX9-NEXT:    s_ashr_i32 s10, s7, 31
12599; GFX9-NEXT:    v_addc_co_u32_e64 v5, vcc, v5, v8, s[0:1]
12600; GFX9-NEXT:    s_add_u32 s0, s6, s10
12601; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v6
12602; GFX9-NEXT:    s_mov_b32 s11, s10
12603; GFX9-NEXT:    s_addc_u32 s1, s7, s10
12604; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
12605; GFX9-NEXT:    s_xor_b64 s[6:7], s[0:1], s[10:11]
12606; GFX9-NEXT:    v_mul_lo_u32 v6, s6, v5
12607; GFX9-NEXT:    v_mul_hi_u32 v7, s6, v4
12608; GFX9-NEXT:    v_mul_hi_u32 v9, s6, v5
12609; GFX9-NEXT:    v_mul_hi_u32 v10, s7, v5
12610; GFX9-NEXT:    v_mul_lo_u32 v5, s7, v5
12611; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, v7, v6
12612; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v9, vcc
12613; GFX9-NEXT:    v_mul_lo_u32 v9, s7, v4
12614; GFX9-NEXT:    v_mul_hi_u32 v4, s7, v4
12615; GFX9-NEXT:    v_mov_b32_e32 v8, s13
12616; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, v6, v9
12617; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v7, v4, vcc
12618; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, v10, v0, vcc
12619; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v5
12620; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v1, v6, vcc
12621; GFX9-NEXT:    v_mul_lo_u32 v6, s8, v5
12622; GFX9-NEXT:    v_mul_hi_u32 v7, s8, v4
12623; GFX9-NEXT:    v_mul_lo_u32 v9, s9, v4
12624; GFX9-NEXT:    v_subrev_co_u32_e32 v1, vcc, s12, v2
12625; GFX9-NEXT:    v_subb_co_u32_e32 v2, vcc, v3, v8, vcc
12626; GFX9-NEXT:    v_add_u32_e32 v3, v7, v6
12627; GFX9-NEXT:    v_mul_lo_u32 v6, s8, v4
12628; GFX9-NEXT:    v_add_u32_e32 v3, v3, v9
12629; GFX9-NEXT:    v_sub_u32_e32 v7, s7, v3
12630; GFX9-NEXT:    v_mov_b32_e32 v8, s9
12631; GFX9-NEXT:    v_sub_co_u32_e32 v6, vcc, s6, v6
12632; GFX9-NEXT:    v_subb_co_u32_e64 v7, s[0:1], v7, v8, vcc
12633; GFX9-NEXT:    v_subrev_co_u32_e64 v8, s[0:1], s8, v6
12634; GFX9-NEXT:    v_subbrev_co_u32_e64 v7, s[0:1], 0, v7, s[0:1]
12635; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s9, v7
12636; GFX9-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[0:1]
12637; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s8, v8
12638; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[0:1]
12639; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s9, v7
12640; GFX9-NEXT:    v_cndmask_b32_e64 v7, v9, v8, s[0:1]
12641; GFX9-NEXT:    v_mov_b32_e32 v9, s7
12642; GFX9-NEXT:    v_subb_co_u32_e32 v3, vcc, v9, v3, vcc
12643; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s9, v3
12644; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v7
12645; GFX9-NEXT:    v_cndmask_b32_e64 v9, 0, -1, vcc
12646; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s8, v6
12647; GFX9-NEXT:    v_cndmask_b32_e64 v7, 1, 2, s[0:1]
12648; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
12649; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, s9, v3
12650; GFX9-NEXT:    v_add_co_u32_e64 v7, s[0:1], v4, v7
12651; GFX9-NEXT:    v_cndmask_b32_e32 v3, v9, v6, vcc
12652; GFX9-NEXT:    v_addc_co_u32_e64 v8, s[0:1], 0, v5, s[0:1]
12653; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
12654; GFX9-NEXT:    v_cndmask_b32_e32 v3, v4, v7, vcc
12655; GFX9-NEXT:    s_xor_b64 s[0:1], s[10:11], s[4:5]
12656; GFX9-NEXT:    v_cndmask_b32_e32 v4, v5, v8, vcc
12657; GFX9-NEXT:    v_xor_b32_e32 v3, s0, v3
12658; GFX9-NEXT:    v_xor_b32_e32 v4, s1, v4
12659; GFX9-NEXT:    v_mov_b32_e32 v5, s1
12660; GFX9-NEXT:    v_subrev_co_u32_e32 v3, vcc, s0, v3
12661; GFX9-NEXT:    v_subb_co_u32_e32 v4, vcc, v4, v5, vcc
12662; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
12663; GFX9-NEXT:    global_store_dwordx4 v0, v[1:4], s[2:3]
12664; GFX9-NEXT:    s_endpgm
12665;
12666; GFX90A-LABEL: sdiv_v2i64_pow2_shl_denom:
12667; GFX90A:       ; %bb.0:
12668; GFX90A-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x44
12669; GFX90A-NEXT:    s_mov_b64 s[2:3], 0x1000
12670; GFX90A-NEXT:    s_mov_b32 s16, 0x4f800000
12671; GFX90A-NEXT:    s_mov_b32 s17, 0x5f7ffffc
12672; GFX90A-NEXT:    s_mov_b32 s18, 0x2f800000
12673; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
12674; GFX90A-NEXT:    s_lshl_b64 s[8:9], s[2:3], s6
12675; GFX90A-NEXT:    s_lshl_b64 s[2:3], s[2:3], s4
12676; GFX90A-NEXT:    s_ashr_i32 s10, s3, 31
12677; GFX90A-NEXT:    s_add_u32 s2, s2, s10
12678; GFX90A-NEXT:    s_mov_b32 s11, s10
12679; GFX90A-NEXT:    s_addc_u32 s3, s3, s10
12680; GFX90A-NEXT:    s_xor_b64 s[12:13], s[2:3], s[10:11]
12681; GFX90A-NEXT:    v_cvt_f32_u32_e32 v0, s12
12682; GFX90A-NEXT:    v_cvt_f32_u32_e32 v1, s13
12683; GFX90A-NEXT:    s_mov_b32 s19, 0xcf800000
12684; GFX90A-NEXT:    s_sub_u32 s14, 0, s12
12685; GFX90A-NEXT:    s_subb_u32 s15, 0, s13
12686; GFX90A-NEXT:    v_mac_f32_e32 v0, s16, v1
12687; GFX90A-NEXT:    v_rcp_f32_e32 v0, v0
12688; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
12689; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
12690; GFX90A-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
12691; GFX90A-NEXT:    v_mul_f32_e32 v0, s17, v0
12692; GFX90A-NEXT:    v_mul_f32_e32 v1, s18, v0
12693; GFX90A-NEXT:    v_trunc_f32_e32 v1, v1
12694; GFX90A-NEXT:    v_mac_f32_e32 v0, s19, v1
12695; GFX90A-NEXT:    v_cvt_u32_f32_e32 v0, v0
12696; GFX90A-NEXT:    v_cvt_u32_f32_e32 v1, v1
12697; GFX90A-NEXT:    v_mul_hi_u32 v3, s14, v0
12698; GFX90A-NEXT:    v_mul_lo_u32 v5, s14, v1
12699; GFX90A-NEXT:    v_mul_lo_u32 v2, s15, v0
12700; GFX90A-NEXT:    v_add_u32_e32 v3, v3, v5
12701; GFX90A-NEXT:    v_add_u32_e32 v2, v3, v2
12702; GFX90A-NEXT:    v_mul_lo_u32 v6, s14, v0
12703; GFX90A-NEXT:    v_mul_lo_u32 v5, v0, v2
12704; GFX90A-NEXT:    v_mul_hi_u32 v7, v0, v6
12705; GFX90A-NEXT:    v_mul_hi_u32 v3, v0, v2
12706; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v7, v5
12707; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
12708; GFX90A-NEXT:    v_mul_hi_u32 v8, v1, v6
12709; GFX90A-NEXT:    v_mul_lo_u32 v6, v1, v6
12710; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v5, v6
12711; GFX90A-NEXT:    v_mul_hi_u32 v7, v1, v2
12712; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, v3, v8, vcc
12713; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v7, v4, vcc
12714; GFX90A-NEXT:    v_mul_lo_u32 v6, v1, v2
12715; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
12716; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v6
12717; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v2, v5, vcc
12718; GFX90A-NEXT:    v_add_co_u32_e64 v0, s[0:1], v0, v3
12719; GFX90A-NEXT:    v_addc_co_u32_e64 v3, vcc, v1, v5, s[0:1]
12720; GFX90A-NEXT:    v_mul_lo_u32 v6, s14, v3
12721; GFX90A-NEXT:    v_mul_hi_u32 v7, s14, v0
12722; GFX90A-NEXT:    v_add_u32_e32 v6, v7, v6
12723; GFX90A-NEXT:    v_mul_lo_u32 v7, s15, v0
12724; GFX90A-NEXT:    v_add_u32_e32 v6, v6, v7
12725; GFX90A-NEXT:    v_mul_lo_u32 v8, s14, v0
12726; GFX90A-NEXT:    v_mul_hi_u32 v9, v3, v8
12727; GFX90A-NEXT:    v_mul_lo_u32 v10, v3, v8
12728; GFX90A-NEXT:    v_mul_lo_u32 v12, v0, v6
12729; GFX90A-NEXT:    v_mul_hi_u32 v8, v0, v8
12730; GFX90A-NEXT:    v_mul_hi_u32 v11, v0, v6
12731; GFX90A-NEXT:    v_add_co_u32_e32 v8, vcc, v8, v12
12732; GFX90A-NEXT:    v_addc_co_u32_e32 v11, vcc, 0, v11, vcc
12733; GFX90A-NEXT:    v_add_co_u32_e32 v8, vcc, v8, v10
12734; GFX90A-NEXT:    v_mul_hi_u32 v7, v3, v6
12735; GFX90A-NEXT:    v_addc_co_u32_e32 v8, vcc, v11, v9, vcc
12736; GFX90A-NEXT:    v_addc_co_u32_e32 v7, vcc, v7, v4, vcc
12737; GFX90A-NEXT:    v_mul_lo_u32 v3, v3, v6
12738; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, v8, v3
12739; GFX90A-NEXT:    v_addc_co_u32_e32 v6, vcc, v2, v7, vcc
12740; GFX90A-NEXT:    v_add_u32_e32 v1, v1, v5
12741; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
12742; GFX90A-NEXT:    s_ashr_i32 s14, s5, 31
12743; GFX90A-NEXT:    v_addc_co_u32_e64 v1, vcc, v1, v6, s[0:1]
12744; GFX90A-NEXT:    s_add_u32 s0, s4, s14
12745; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v3
12746; GFX90A-NEXT:    s_mov_b32 s15, s14
12747; GFX90A-NEXT:    s_addc_u32 s1, s5, s14
12748; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
12749; GFX90A-NEXT:    s_xor_b64 s[4:5], s[0:1], s[14:15]
12750; GFX90A-NEXT:    v_mul_lo_u32 v5, s4, v1
12751; GFX90A-NEXT:    v_mul_hi_u32 v6, s4, v0
12752; GFX90A-NEXT:    v_mul_hi_u32 v3, s4, v1
12753; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v6, v5
12754; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
12755; GFX90A-NEXT:    v_mul_hi_u32 v7, s5, v0
12756; GFX90A-NEXT:    v_mul_lo_u32 v0, s5, v0
12757; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v5, v0
12758; GFX90A-NEXT:    v_mul_hi_u32 v6, s5, v1
12759; GFX90A-NEXT:    v_addc_co_u32_e32 v0, vcc, v3, v7, vcc
12760; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, v6, v4, vcc
12761; GFX90A-NEXT:    v_mul_lo_u32 v1, s5, v1
12762; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v1
12763; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, v2, v3, vcc
12764; GFX90A-NEXT:    v_mul_lo_u32 v3, s12, v1
12765; GFX90A-NEXT:    v_mul_hi_u32 v5, s12, v0
12766; GFX90A-NEXT:    v_add_u32_e32 v3, v5, v3
12767; GFX90A-NEXT:    v_mul_lo_u32 v5, s13, v0
12768; GFX90A-NEXT:    v_add_u32_e32 v3, v3, v5
12769; GFX90A-NEXT:    v_mul_lo_u32 v6, s12, v0
12770; GFX90A-NEXT:    v_sub_u32_e32 v5, s5, v3
12771; GFX90A-NEXT:    v_mov_b32_e32 v7, s13
12772; GFX90A-NEXT:    v_sub_co_u32_e32 v6, vcc, s4, v6
12773; GFX90A-NEXT:    v_subb_co_u32_e64 v5, s[0:1], v5, v7, vcc
12774; GFX90A-NEXT:    v_subrev_co_u32_e64 v7, s[0:1], s12, v6
12775; GFX90A-NEXT:    v_subbrev_co_u32_e64 v5, s[0:1], 0, v5, s[0:1]
12776; GFX90A-NEXT:    v_cmp_le_u32_e64 s[0:1], s13, v5
12777; GFX90A-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[0:1]
12778; GFX90A-NEXT:    v_cmp_le_u32_e64 s[0:1], s12, v7
12779; GFX90A-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[0:1]
12780; GFX90A-NEXT:    v_cmp_eq_u32_e64 s[0:1], s13, v5
12781; GFX90A-NEXT:    v_cndmask_b32_e64 v5, v8, v7, s[0:1]
12782; GFX90A-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v5
12783; GFX90A-NEXT:    v_cndmask_b32_e64 v5, 1, 2, s[0:1]
12784; GFX90A-NEXT:    v_mov_b32_e32 v8, s5
12785; GFX90A-NEXT:    v_add_co_u32_e64 v5, s[0:1], v0, v5
12786; GFX90A-NEXT:    v_subb_co_u32_e32 v3, vcc, v8, v3, vcc
12787; GFX90A-NEXT:    v_addc_co_u32_e64 v7, s[0:1], 0, v1, s[0:1]
12788; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s13, v3
12789; GFX90A-NEXT:    v_cndmask_b32_e64 v8, 0, -1, vcc
12790; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s12, v6
12791; GFX90A-NEXT:    s_xor_b64 s[0:1], s[14:15], s[10:11]
12792; GFX90A-NEXT:    s_ashr_i32 s4, s9, 31
12793; GFX90A-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
12794; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, s13, v3
12795; GFX90A-NEXT:    s_add_u32 s8, s8, s4
12796; GFX90A-NEXT:    v_cndmask_b32_e32 v3, v8, v6, vcc
12797; GFX90A-NEXT:    s_mov_b32 s5, s4
12798; GFX90A-NEXT:    s_addc_u32 s9, s9, s4
12799; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
12800; GFX90A-NEXT:    s_xor_b64 s[8:9], s[8:9], s[4:5]
12801; GFX90A-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
12802; GFX90A-NEXT:    v_cvt_f32_u32_e32 v3, s8
12803; GFX90A-NEXT:    v_cvt_f32_u32_e32 v5, s9
12804; GFX90A-NEXT:    v_cndmask_b32_e32 v1, v1, v7, vcc
12805; GFX90A-NEXT:    v_xor_b32_e32 v0, s0, v0
12806; GFX90A-NEXT:    s_sub_u32 s10, 0, s8
12807; GFX90A-NEXT:    v_mac_f32_e32 v3, s16, v5
12808; GFX90A-NEXT:    v_rcp_f32_e32 v3, v3
12809; GFX90A-NEXT:    v_xor_b32_e32 v1, s1, v1
12810; GFX90A-NEXT:    v_mov_b32_e32 v6, s1
12811; GFX90A-NEXT:    v_subrev_co_u32_e32 v0, vcc, s0, v0
12812; GFX90A-NEXT:    v_mul_f32_e32 v3, s17, v3
12813; GFX90A-NEXT:    v_mul_f32_e32 v5, s18, v3
12814; GFX90A-NEXT:    v_trunc_f32_e32 v5, v5
12815; GFX90A-NEXT:    v_mac_f32_e32 v3, s19, v5
12816; GFX90A-NEXT:    v_cvt_u32_f32_e32 v3, v3
12817; GFX90A-NEXT:    v_cvt_u32_f32_e32 v5, v5
12818; GFX90A-NEXT:    s_subb_u32 s11, 0, s9
12819; GFX90A-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v6, vcc
12820; GFX90A-NEXT:    v_mul_hi_u32 v7, s10, v3
12821; GFX90A-NEXT:    v_mul_lo_u32 v8, s10, v5
12822; GFX90A-NEXT:    v_mul_lo_u32 v6, s11, v3
12823; GFX90A-NEXT:    v_add_u32_e32 v7, v7, v8
12824; GFX90A-NEXT:    v_add_u32_e32 v6, v7, v6
12825; GFX90A-NEXT:    v_mul_lo_u32 v9, s10, v3
12826; GFX90A-NEXT:    v_mul_lo_u32 v8, v3, v6
12827; GFX90A-NEXT:    v_mul_hi_u32 v10, v3, v9
12828; GFX90A-NEXT:    v_mul_hi_u32 v7, v3, v6
12829; GFX90A-NEXT:    v_add_co_u32_e32 v8, vcc, v10, v8
12830; GFX90A-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v7, vcc
12831; GFX90A-NEXT:    v_mul_hi_u32 v11, v5, v9
12832; GFX90A-NEXT:    v_mul_lo_u32 v9, v5, v9
12833; GFX90A-NEXT:    v_add_co_u32_e32 v8, vcc, v8, v9
12834; GFX90A-NEXT:    v_mul_hi_u32 v10, v5, v6
12835; GFX90A-NEXT:    v_addc_co_u32_e32 v7, vcc, v7, v11, vcc
12836; GFX90A-NEXT:    v_addc_co_u32_e32 v8, vcc, v10, v4, vcc
12837; GFX90A-NEXT:    v_mul_lo_u32 v6, v5, v6
12838; GFX90A-NEXT:    v_add_co_u32_e32 v6, vcc, v7, v6
12839; GFX90A-NEXT:    v_addc_co_u32_e32 v7, vcc, v2, v8, vcc
12840; GFX90A-NEXT:    v_add_co_u32_e64 v3, s[0:1], v3, v6
12841; GFX90A-NEXT:    v_addc_co_u32_e64 v6, vcc, v5, v7, s[0:1]
12842; GFX90A-NEXT:    v_mul_lo_u32 v8, s10, v6
12843; GFX90A-NEXT:    v_mul_hi_u32 v9, s10, v3
12844; GFX90A-NEXT:    v_add_u32_e32 v8, v9, v8
12845; GFX90A-NEXT:    v_mul_lo_u32 v9, s11, v3
12846; GFX90A-NEXT:    v_add_u32_e32 v8, v8, v9
12847; GFX90A-NEXT:    v_mul_lo_u32 v10, s10, v3
12848; GFX90A-NEXT:    v_mul_hi_u32 v11, v6, v10
12849; GFX90A-NEXT:    v_mul_lo_u32 v12, v6, v10
12850; GFX90A-NEXT:    v_mul_lo_u32 v14, v3, v8
12851; GFX90A-NEXT:    v_mul_hi_u32 v10, v3, v10
12852; GFX90A-NEXT:    v_mul_hi_u32 v13, v3, v8
12853; GFX90A-NEXT:    v_add_co_u32_e32 v10, vcc, v10, v14
12854; GFX90A-NEXT:    v_addc_co_u32_e32 v13, vcc, 0, v13, vcc
12855; GFX90A-NEXT:    v_add_co_u32_e32 v10, vcc, v10, v12
12856; GFX90A-NEXT:    v_mul_hi_u32 v9, v6, v8
12857; GFX90A-NEXT:    v_addc_co_u32_e32 v10, vcc, v13, v11, vcc
12858; GFX90A-NEXT:    v_addc_co_u32_e32 v9, vcc, v9, v4, vcc
12859; GFX90A-NEXT:    v_mul_lo_u32 v6, v6, v8
12860; GFX90A-NEXT:    v_add_co_u32_e32 v6, vcc, v10, v6
12861; GFX90A-NEXT:    v_addc_co_u32_e32 v8, vcc, v2, v9, vcc
12862; GFX90A-NEXT:    v_add_u32_e32 v5, v5, v7
12863; GFX90A-NEXT:    s_ashr_i32 s10, s7, 31
12864; GFX90A-NEXT:    v_addc_co_u32_e64 v5, vcc, v5, v8, s[0:1]
12865; GFX90A-NEXT:    s_add_u32 s0, s6, s10
12866; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v6
12867; GFX90A-NEXT:    s_mov_b32 s11, s10
12868; GFX90A-NEXT:    s_addc_u32 s1, s7, s10
12869; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
12870; GFX90A-NEXT:    s_xor_b64 s[6:7], s[0:1], s[10:11]
12871; GFX90A-NEXT:    v_mul_lo_u32 v7, s6, v5
12872; GFX90A-NEXT:    v_mul_hi_u32 v8, s6, v3
12873; GFX90A-NEXT:    v_mul_hi_u32 v6, s6, v5
12874; GFX90A-NEXT:    v_add_co_u32_e32 v7, vcc, v8, v7
12875; GFX90A-NEXT:    v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
12876; GFX90A-NEXT:    v_mul_hi_u32 v9, s7, v3
12877; GFX90A-NEXT:    v_mul_lo_u32 v3, s7, v3
12878; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, v7, v3
12879; GFX90A-NEXT:    v_mul_hi_u32 v8, s7, v5
12880; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, v6, v9, vcc
12881; GFX90A-NEXT:    v_addc_co_u32_e32 v6, vcc, v8, v4, vcc
12882; GFX90A-NEXT:    v_mul_lo_u32 v5, s7, v5
12883; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v5
12884; GFX90A-NEXT:    v_addc_co_u32_e32 v2, vcc, v2, v6, vcc
12885; GFX90A-NEXT:    v_mul_lo_u32 v5, s8, v2
12886; GFX90A-NEXT:    v_mul_hi_u32 v6, s8, v3
12887; GFX90A-NEXT:    v_add_u32_e32 v5, v6, v5
12888; GFX90A-NEXT:    v_mul_lo_u32 v6, s9, v3
12889; GFX90A-NEXT:    v_add_u32_e32 v5, v5, v6
12890; GFX90A-NEXT:    v_mul_lo_u32 v7, s8, v3
12891; GFX90A-NEXT:    v_sub_u32_e32 v6, s7, v5
12892; GFX90A-NEXT:    v_mov_b32_e32 v8, s9
12893; GFX90A-NEXT:    v_sub_co_u32_e32 v7, vcc, s6, v7
12894; GFX90A-NEXT:    v_subb_co_u32_e64 v6, s[0:1], v6, v8, vcc
12895; GFX90A-NEXT:    v_subrev_co_u32_e64 v8, s[0:1], s8, v7
12896; GFX90A-NEXT:    v_subbrev_co_u32_e64 v6, s[0:1], 0, v6, s[0:1]
12897; GFX90A-NEXT:    v_cmp_le_u32_e64 s[0:1], s9, v6
12898; GFX90A-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[0:1]
12899; GFX90A-NEXT:    v_cmp_le_u32_e64 s[0:1], s8, v8
12900; GFX90A-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[0:1]
12901; GFX90A-NEXT:    v_cmp_eq_u32_e64 s[0:1], s9, v6
12902; GFX90A-NEXT:    v_cndmask_b32_e64 v6, v9, v8, s[0:1]
12903; GFX90A-NEXT:    v_mov_b32_e32 v9, s7
12904; GFX90A-NEXT:    v_subb_co_u32_e32 v5, vcc, v9, v5, vcc
12905; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s9, v5
12906; GFX90A-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v6
12907; GFX90A-NEXT:    v_cndmask_b32_e64 v9, 0, -1, vcc
12908; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s8, v7
12909; GFX90A-NEXT:    v_cndmask_b32_e64 v6, 1, 2, s[0:1]
12910; GFX90A-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
12911; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, s9, v5
12912; GFX90A-NEXT:    v_add_co_u32_e64 v6, s[0:1], v3, v6
12913; GFX90A-NEXT:    v_cndmask_b32_e32 v5, v9, v7, vcc
12914; GFX90A-NEXT:    v_addc_co_u32_e64 v8, s[0:1], 0, v2, s[0:1]
12915; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
12916; GFX90A-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
12917; GFX90A-NEXT:    s_xor_b64 s[0:1], s[10:11], s[4:5]
12918; GFX90A-NEXT:    v_cndmask_b32_e32 v2, v2, v8, vcc
12919; GFX90A-NEXT:    v_xor_b32_e32 v3, s0, v3
12920; GFX90A-NEXT:    v_xor_b32_e32 v5, s1, v2
12921; GFX90A-NEXT:    v_mov_b32_e32 v6, s1
12922; GFX90A-NEXT:    v_subrev_co_u32_e32 v2, vcc, s0, v3
12923; GFX90A-NEXT:    v_subb_co_u32_e32 v3, vcc, v5, v6, vcc
12924; GFX90A-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
12925; GFX90A-NEXT:    s_endpgm
12926  %shl.y = shl <2 x i64> <i64 4096, i64 4096>, %y
12927  %r = sdiv <2 x i64> %x, %shl.y
12928  store <2 x i64> %r, <2 x i64> addrspace(1)* %out
12929  ret void
12930}
12931
; Test: signed 64-bit remainder of the kernel argument %x by the odd constant
; 1235195 (0x12d8fb); the result is stored through %out.
;
; What the assertions below pin down:
;  * opt level (-amdgpu-codegenprepare): the srem instruction is expected to be
;    left unchanged in the IR.
;  * llc level (tahiti, gfx900, gfx90a): the full expanded machine sequence.
;    As visible in the expected output, it starts from a v_rcp_f32 reciprocal
;    estimate of the divisor (0x4996c7d8 is 1235195.0 as an f32), works on
;    32-bit halves with v_mul_lo_u32 / v_mul_hi_u32 and carry chains using
;    0xffed2705 (the low 32 bits of -1235195), multiplies the quotient estimate
;    back by 0x12d8fb, selects between r, r - d and r - 2*d using compares
;    against 0x12d8fa (d - 1), and finally restores the sign with the
;    s_ashr_i32 ..., 31 mask (xor followed by a subtract).
;
; The assertion lines are autogenerated by the update scripts named in the
; header lines at the top of this file; regenerate them instead of editing by
; hand, because they must match the compiler output instruction for instruction.
12932define amdgpu_kernel void @srem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
12933; CHECK-LABEL: @srem_i64_oddk_denom(
12934; CHECK-NEXT:    [[R:%.*]] = srem i64 [[X:%.*]], 1235195
12935; CHECK-NEXT:    store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4
12936; CHECK-NEXT:    ret void
12937;
12938; GFX6-LABEL: srem_i64_oddk_denom:
12939; GFX6:       ; %bb.0:
12940; GFX6-NEXT:    v_mov_b32_e32 v0, 0x4f800000
12941; GFX6-NEXT:    v_madak_f32 v0, 0, v0, 0x4996c7d8
12942; GFX6-NEXT:    v_rcp_f32_e32 v0, v0
12943; GFX6-NEXT:    s_mov_b32 s2, 0xffed2705
12944; GFX6-NEXT:    v_mov_b32_e32 v8, 0
12945; GFX6-NEXT:    v_mov_b32_e32 v7, 0
12946; GFX6-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
12947; GFX6-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
12948; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
12949; GFX6-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
12950; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
12951; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
12952; GFX6-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x9
12953; GFX6-NEXT:    s_mov_b32 s7, 0xf000
12954; GFX6-NEXT:    v_mul_lo_u32 v2, v1, s2
12955; GFX6-NEXT:    v_mul_hi_u32 v3, v0, s2
12956; GFX6-NEXT:    v_mul_lo_u32 v4, v0, s2
12957; GFX6-NEXT:    s_mov_b32 s6, -1
12958; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
12959; GFX6-NEXT:    s_mov_b32 s4, s8
12960; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
12961; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, v0, v2
12962; GFX6-NEXT:    v_mul_lo_u32 v5, v0, v2
12963; GFX6-NEXT:    v_mul_hi_u32 v6, v0, v4
12964; GFX6-NEXT:    v_mul_hi_u32 v3, v0, v2
12965; GFX6-NEXT:    v_mul_hi_u32 v9, v1, v2
12966; GFX6-NEXT:    v_mul_lo_u32 v2, v1, v2
12967; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
12968; GFX6-NEXT:    v_mul_lo_u32 v6, v1, v4
12969; GFX6-NEXT:    v_mul_hi_u32 v4, v1, v4
12970; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v8, v3, vcc
12971; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
12972; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v3, v4, vcc
12973; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, v9, v7, vcc
12974; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
12975; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v8, v4, vcc
12976; GFX6-NEXT:    v_add_i32_e64 v0, s[0:1], v0, v2
12977; GFX6-NEXT:    v_addc_u32_e64 v2, vcc, v1, v3, s[0:1]
12978; GFX6-NEXT:    v_mul_lo_u32 v4, v2, s2
12979; GFX6-NEXT:    v_mul_hi_u32 v5, v0, s2
12980; GFX6-NEXT:    s_mov_b32 s5, s9
12981; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
12982; GFX6-NEXT:    v_mul_lo_u32 v5, v0, s2
12983; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, v0, v4
12984; GFX6-NEXT:    v_mul_lo_u32 v10, v0, v4
12985; GFX6-NEXT:    v_mul_hi_u32 v11, v0, v5
12986; GFX6-NEXT:    v_mul_hi_u32 v12, v0, v4
12987; GFX6-NEXT:    v_mul_hi_u32 v9, v2, v5
12988; GFX6-NEXT:    v_mul_lo_u32 v5, v2, v5
12989; GFX6-NEXT:    v_mul_hi_u32 v6, v2, v4
12990; GFX6-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
12991; GFX6-NEXT:    v_addc_u32_e32 v11, vcc, v8, v12, vcc
12992; GFX6-NEXT:    v_mul_lo_u32 v2, v2, v4
12993; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v10, v5
12994; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v11, v9, vcc
12995; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, v6, v7, vcc
12996; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
12997; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, v8, v4, vcc
12998; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
12999; GFX6-NEXT:    s_ashr_i32 s2, s11, 31
13000; GFX6-NEXT:    v_addc_u32_e64 v1, vcc, v1, v4, s[0:1]
13001; GFX6-NEXT:    s_add_u32 s0, s10, s2
13002; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
13003; GFX6-NEXT:    s_mov_b32 s3, s2
13004; GFX6-NEXT:    s_addc_u32 s1, s11, s2
13005; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
13006; GFX6-NEXT:    s_xor_b64 s[0:1], s[0:1], s[2:3]
13007; GFX6-NEXT:    v_mul_lo_u32 v2, s0, v1
13008; GFX6-NEXT:    v_mul_hi_u32 v3, s0, v0
13009; GFX6-NEXT:    v_mul_hi_u32 v4, s0, v1
13010; GFX6-NEXT:    v_mul_hi_u32 v5, s1, v1
13011; GFX6-NEXT:    v_mul_lo_u32 v1, s1, v1
13012; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
13013; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v8, v4, vcc
13014; GFX6-NEXT:    v_mul_lo_u32 v4, s1, v0
13015; GFX6-NEXT:    v_mul_hi_u32 v0, s1, v0
13016; GFX6-NEXT:    s_mov_b32 s3, 0x12d8fb
13017; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
13018; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, v3, v0, vcc
13019; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, v5, v7, vcc
13020; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
13021; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v8, v2, vcc
13022; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s3
13023; GFX6-NEXT:    v_mul_hi_u32 v2, v0, s3
13024; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s3
13025; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
13026; GFX6-NEXT:    v_mov_b32_e32 v2, s1
13027; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
13028; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v2, v1, vcc
13029; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s3, v0
13030; GFX6-NEXT:    v_subbrev_u32_e32 v3, vcc, 0, v1, vcc
13031; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s3, v2
13032; GFX6-NEXT:    v_subbrev_u32_e32 v5, vcc, 0, v3, vcc
13033; GFX6-NEXT:    s_mov_b32 s0, 0x12d8fa
13034; GFX6-NEXT:    v_cmp_lt_u32_e32 vcc, s0, v2
13035; GFX6-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
13036; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
13037; GFX6-NEXT:    v_cndmask_b32_e32 v6, -1, v6, vcc
13038; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
13039; GFX6-NEXT:    v_cmp_lt_u32_e64 s[0:1], s0, v0
13040; GFX6-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
13041; GFX6-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[0:1]
13042; GFX6-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v1
13043; GFX6-NEXT:    v_cndmask_b32_e64 v5, -1, v5, s[0:1]
13044; GFX6-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v5
13045; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
13046; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
13047; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
13048; GFX6-NEXT:    v_xor_b32_e32 v0, s2, v0
13049; GFX6-NEXT:    v_xor_b32_e32 v1, s2, v1
13050; GFX6-NEXT:    v_mov_b32_e32 v2, s2
13051; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s2, v0
13052; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v1, v2, vcc
13053; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
13054; GFX6-NEXT:    s_endpgm
13055;
13056; GFX9-LABEL: srem_i64_oddk_denom:
13057; GFX9:       ; %bb.0:
13058; GFX9-NEXT:    v_mov_b32_e32 v0, 0x4f800000
13059; GFX9-NEXT:    v_madak_f32 v0, 0, v0, 0x4996c7d8
13060; GFX9-NEXT:    v_rcp_f32_e32 v0, v0
13061; GFX9-NEXT:    s_mov_b32 s8, 0xffed2705
13062; GFX9-NEXT:    v_mov_b32_e32 v7, 0
13063; GFX9-NEXT:    v_mov_b32_e32 v5, 0
13064; GFX9-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
13065; GFX9-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
13066; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
13067; GFX9-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
13068; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
13069; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
13070; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
13071; GFX9-NEXT:    v_mul_lo_u32 v2, v1, s8
13072; GFX9-NEXT:    v_mul_hi_u32 v3, v0, s8
13073; GFX9-NEXT:    v_mul_lo_u32 v4, v0, s8
13074; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
13075; GFX9-NEXT:    s_ashr_i32 s0, s7, 31
13076; GFX9-NEXT:    s_mov_b32 s1, s0
13077; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
13078; GFX9-NEXT:    v_sub_u32_e32 v2, v2, v0
13079; GFX9-NEXT:    v_mul_hi_u32 v3, v0, v4
13080; GFX9-NEXT:    v_mul_lo_u32 v6, v0, v2
13081; GFX9-NEXT:    v_mul_hi_u32 v8, v0, v2
13082; GFX9-NEXT:    v_mul_hi_u32 v9, v1, v2
13083; GFX9-NEXT:    v_mul_lo_u32 v2, v1, v2
13084; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v6
13085; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, v7, v8, vcc
13086; GFX9-NEXT:    v_mul_lo_u32 v8, v1, v4
13087; GFX9-NEXT:    v_mul_hi_u32 v4, v1, v4
13088; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v8
13089; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v6, v4, vcc
13090; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v9, v5, vcc
13091; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
13092; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v7, v4, vcc
13093; GFX9-NEXT:    v_add_co_u32_e64 v0, s[2:3], v0, v2
13094; GFX9-NEXT:    v_addc_co_u32_e64 v2, vcc, v1, v3, s[2:3]
13095; GFX9-NEXT:    v_mul_lo_u32 v4, v2, s8
13096; GFX9-NEXT:    v_mul_hi_u32 v6, v0, s8
13097; GFX9-NEXT:    v_mul_lo_u32 v8, v0, s8
13098; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
13099; GFX9-NEXT:    v_add_u32_e32 v4, v6, v4
13100; GFX9-NEXT:    v_sub_u32_e32 v4, v4, v0
13101; GFX9-NEXT:    v_mul_lo_u32 v10, v0, v4
13102; GFX9-NEXT:    v_mul_hi_u32 v11, v0, v8
13103; GFX9-NEXT:    v_mul_hi_u32 v12, v0, v4
13104; GFX9-NEXT:    v_mul_hi_u32 v9, v2, v8
13105; GFX9-NEXT:    v_mul_lo_u32 v8, v2, v8
13106; GFX9-NEXT:    v_mul_hi_u32 v6, v2, v4
13107; GFX9-NEXT:    v_add_co_u32_e32 v10, vcc, v11, v10
13108; GFX9-NEXT:    v_addc_co_u32_e32 v11, vcc, v7, v12, vcc
13109; GFX9-NEXT:    v_mul_lo_u32 v2, v2, v4
13110; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, v10, v8
13111; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, v11, v9, vcc
13112; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v6, v5, vcc
13113; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v8, v2
13114; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v7, v4, vcc
13115; GFX9-NEXT:    v_addc_co_u32_e64 v1, vcc, v1, v4, s[2:3]
13116; GFX9-NEXT:    s_add_u32 s2, s6, s0
13117; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
13118; GFX9-NEXT:    s_addc_u32 s3, s7, s0
13119; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
13120; GFX9-NEXT:    s_xor_b64 s[2:3], s[2:3], s[0:1]
13121; GFX9-NEXT:    v_mul_lo_u32 v2, s2, v1
13122; GFX9-NEXT:    v_mul_hi_u32 v3, s2, v0
13123; GFX9-NEXT:    v_mul_hi_u32 v4, s2, v1
13124; GFX9-NEXT:    v_mul_hi_u32 v6, s3, v1
13125; GFX9-NEXT:    v_mul_lo_u32 v1, s3, v1
13126; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
13127; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v7, v4, vcc
13128; GFX9-NEXT:    v_mul_lo_u32 v4, s3, v0
13129; GFX9-NEXT:    v_mul_hi_u32 v0, s3, v0
13130; GFX9-NEXT:    s_mov_b32 s1, 0x12d8fb
13131; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
13132; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, v3, v0, vcc
13133; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, v6, v5, vcc
13134; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v1
13135; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v7, v2, vcc
13136; GFX9-NEXT:    v_mul_lo_u32 v1, v1, s1
13137; GFX9-NEXT:    v_mul_hi_u32 v2, v0, s1
13138; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s1
13139; GFX9-NEXT:    v_add_u32_e32 v1, v2, v1
13140; GFX9-NEXT:    v_mov_b32_e32 v2, s3
13141; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s2, v0
13142; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v2, v1, vcc
13143; GFX9-NEXT:    v_subrev_co_u32_e32 v2, vcc, s1, v0
13144; GFX9-NEXT:    v_subbrev_co_u32_e32 v3, vcc, 0, v1, vcc
13145; GFX9-NEXT:    v_subrev_co_u32_e32 v4, vcc, s1, v2
13146; GFX9-NEXT:    v_subbrev_co_u32_e32 v6, vcc, 0, v3, vcc
13147; GFX9-NEXT:    s_mov_b32 s1, 0x12d8fa
13148; GFX9-NEXT:    v_cmp_lt_u32_e32 vcc, s1, v2
13149; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
13150; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
13151; GFX9-NEXT:    v_cndmask_b32_e32 v7, -1, v7, vcc
13152; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v7
13153; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
13154; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
13155; GFX9-NEXT:    v_cmp_lt_u32_e32 vcc, s1, v0
13156; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, -1, vcc
13157; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
13158; GFX9-NEXT:    v_cndmask_b32_e32 v4, -1, v4, vcc
13159; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
13160; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
13161; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
13162; GFX9-NEXT:    v_xor_b32_e32 v0, s0, v0
13163; GFX9-NEXT:    v_xor_b32_e32 v1, s0, v1
13164; GFX9-NEXT:    v_mov_b32_e32 v2, s0
13165; GFX9-NEXT:    v_subrev_co_u32_e32 v0, vcc, s0, v0
13166; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v2, vcc
13167; GFX9-NEXT:    global_store_dwordx2 v5, v[0:1], s[4:5]
13168; GFX9-NEXT:    s_endpgm
13169;
13170; GFX90A-LABEL: srem_i64_oddk_denom:
13171; GFX90A:       ; %bb.0:
13172; GFX90A-NEXT:    v_mov_b32_e32 v0, 0x4f800000
13173; GFX90A-NEXT:    v_madak_f32 v0, 0, v0, 0x4996c7d8
13174; GFX90A-NEXT:    v_rcp_f32_e32 v0, v0
13175; GFX90A-NEXT:    s_mov_b32 s2, 0xffed2705
13176; GFX90A-NEXT:    v_mov_b32_e32 v8, 0
13177; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
13178; GFX90A-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
13179; GFX90A-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
13180; GFX90A-NEXT:    v_trunc_f32_e32 v1, v1
13181; GFX90A-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
13182; GFX90A-NEXT:    v_cvt_u32_f32_e32 v1, v1
13183; GFX90A-NEXT:    v_cvt_u32_f32_e32 v0, v0
13184; GFX90A-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
13185; GFX90A-NEXT:    v_mul_lo_u32 v3, v1, s2
13186; GFX90A-NEXT:    v_mul_hi_u32 v4, v0, s2
13187; GFX90A-NEXT:    v_add_u32_e32 v3, v4, v3
13188; GFX90A-NEXT:    v_sub_u32_e32 v3, v3, v0
13189; GFX90A-NEXT:    v_mul_lo_u32 v6, v0, s2
13190; GFX90A-NEXT:    v_mul_lo_u32 v5, v0, v3
13191; GFX90A-NEXT:    v_mul_hi_u32 v7, v0, v6
13192; GFX90A-NEXT:    v_mul_hi_u32 v4, v0, v3
13193; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v7, v5
13194; GFX90A-NEXT:    v_addc_co_u32_e32 v4, vcc, v8, v4, vcc
13195; GFX90A-NEXT:    v_mul_hi_u32 v9, v1, v6
13196; GFX90A-NEXT:    v_mul_lo_u32 v6, v1, v6
13197; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v5, v6
13198; GFX90A-NEXT:    v_mul_hi_u32 v7, v1, v3
13199; GFX90A-NEXT:    v_addc_co_u32_e32 v4, vcc, v4, v9, vcc
13200; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v7, v2, vcc
13201; GFX90A-NEXT:    v_mul_lo_u32 v3, v1, v3
13202; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, v4, v3
13203; GFX90A-NEXT:    v_addc_co_u32_e32 v4, vcc, v8, v5, vcc
13204; GFX90A-NEXT:    v_add_co_u32_e64 v0, s[0:1], v0, v3
13205; GFX90A-NEXT:    v_addc_co_u32_e64 v3, vcc, v1, v4, s[0:1]
13206; GFX90A-NEXT:    v_mul_lo_u32 v5, v3, s2
13207; GFX90A-NEXT:    v_mul_hi_u32 v6, v0, s2
13208; GFX90A-NEXT:    v_add_u32_e32 v5, v6, v5
13209; GFX90A-NEXT:    v_sub_u32_e32 v5, v5, v0
13210; GFX90A-NEXT:    v_mul_lo_u32 v7, v0, s2
13211; GFX90A-NEXT:    v_mul_hi_u32 v9, v3, v7
13212; GFX90A-NEXT:    v_mul_lo_u32 v10, v3, v7
13213; GFX90A-NEXT:    v_mul_lo_u32 v12, v0, v5
13214; GFX90A-NEXT:    v_mul_hi_u32 v7, v0, v7
13215; GFX90A-NEXT:    v_mul_hi_u32 v11, v0, v5
13216; GFX90A-NEXT:    v_add_co_u32_e32 v7, vcc, v7, v12
13217; GFX90A-NEXT:    v_addc_co_u32_e32 v11, vcc, v8, v11, vcc
13218; GFX90A-NEXT:    v_add_co_u32_e32 v7, vcc, v7, v10
13219; GFX90A-NEXT:    v_mul_hi_u32 v6, v3, v5
13220; GFX90A-NEXT:    v_addc_co_u32_e32 v7, vcc, v11, v9, vcc
13221; GFX90A-NEXT:    v_addc_co_u32_e32 v6, vcc, v6, v2, vcc
13222; GFX90A-NEXT:    v_mul_lo_u32 v3, v3, v5
13223; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, v7, v3
13224; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v8, v6, vcc
13225; GFX90A-NEXT:    v_add_u32_e32 v1, v1, v4
13226; GFX90A-NEXT:    v_addc_co_u32_e64 v1, vcc, v1, v5, s[0:1]
13227; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
13228; GFX90A-NEXT:    s_ashr_i32 s0, s7, 31
13229; GFX90A-NEXT:    s_add_u32 s2, s6, s0
13230; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v3
13231; GFX90A-NEXT:    s_mov_b32 s1, s0
13232; GFX90A-NEXT:    s_addc_u32 s3, s7, s0
13233; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
13234; GFX90A-NEXT:    s_xor_b64 s[2:3], s[2:3], s[0:1]
13235; GFX90A-NEXT:    v_mul_lo_u32 v4, s2, v1
13236; GFX90A-NEXT:    v_mul_hi_u32 v5, s2, v0
13237; GFX90A-NEXT:    v_mul_hi_u32 v3, s2, v1
13238; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, v5, v4
13239; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, v8, v3, vcc
13240; GFX90A-NEXT:    v_mul_hi_u32 v6, s3, v0
13241; GFX90A-NEXT:    v_mul_lo_u32 v0, s3, v0
13242; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v4, v0
13243; GFX90A-NEXT:    v_mul_hi_u32 v5, s3, v1
13244; GFX90A-NEXT:    v_addc_co_u32_e32 v0, vcc, v3, v6, vcc
13245; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, v5, v2, vcc
13246; GFX90A-NEXT:    v_mul_lo_u32 v1, s3, v1
13247; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v1
13248; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, v8, v3, vcc
13249; GFX90A-NEXT:    s_mov_b32 s1, 0x12d8fb
13250; GFX90A-NEXT:    v_mul_lo_u32 v1, v1, s1
13251; GFX90A-NEXT:    v_mul_hi_u32 v3, v0, s1
13252; GFX90A-NEXT:    v_mul_lo_u32 v0, v0, s1
13253; GFX90A-NEXT:    v_add_u32_e32 v1, v3, v1
13254; GFX90A-NEXT:    v_mov_b32_e32 v3, s3
13255; GFX90A-NEXT:    v_sub_co_u32_e32 v0, vcc, s2, v0
13256; GFX90A-NEXT:    v_subb_co_u32_e32 v1, vcc, v3, v1, vcc
13257; GFX90A-NEXT:    v_subrev_co_u32_e32 v3, vcc, s1, v0
13258; GFX90A-NEXT:    v_subbrev_co_u32_e32 v4, vcc, 0, v1, vcc
13259; GFX90A-NEXT:    v_subrev_co_u32_e32 v5, vcc, s1, v3
13260; GFX90A-NEXT:    v_subbrev_co_u32_e32 v6, vcc, 0, v4, vcc
13261; GFX90A-NEXT:    s_mov_b32 s1, 0x12d8fa
13262; GFX90A-NEXT:    v_cmp_lt_u32_e32 vcc, s1, v3
13263; GFX90A-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
13264; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
13265; GFX90A-NEXT:    v_cndmask_b32_e32 v7, -1, v7, vcc
13266; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v7
13267; GFX90A-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
13268; GFX90A-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
13269; GFX90A-NEXT:    v_cmp_lt_u32_e32 vcc, s1, v0
13270; GFX90A-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
13271; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
13272; GFX90A-NEXT:    v_cndmask_b32_e32 v5, -1, v5, vcc
13273; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
13274; GFX90A-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
13275; GFX90A-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
13276; GFX90A-NEXT:    v_xor_b32_e32 v0, s0, v0
13277; GFX90A-NEXT:    v_xor_b32_e32 v1, s0, v1
13278; GFX90A-NEXT:    v_mov_b32_e32 v3, s0
13279; GFX90A-NEXT:    v_subrev_co_u32_e32 v0, vcc, s0, v0
13280; GFX90A-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
13281; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
13282; GFX90A-NEXT:    s_endpgm
; IR under test: a single srem by the literal constant, then a store of the result.
13283  %r = srem i64 %x, 1235195
13284  store i64 %r, i64 addrspace(1)* %out
13285  ret void
13286}
13287
; Test: signed 64-bit remainder of the kernel argument %x by the power-of-two
; constant 4096 (0x1000); the result is stored through %out.
;
; What the assertions below pin down:
;  * opt level (-amdgpu-codegenprepare): the srem instruction is expected to be
;    left unchanged in the IR.
;  * llc level (tahiti, gfx900, gfx90a): a short, purely scalar sequence with no
;    reciprocal or multiply. As visible in the expected output: the sign mask
;    of the high dword (s_ashr_i32 ..., 31) is shifted right by 20, giving a
;    bias of 0 or 4095; the bias is added to x with carry; the low dword of the
;    sum is masked with 0xfffff000; and the masked value is subtracted from x
;    with borrow. The gfx90a output differs only in copying the 64-bit result
;    with a single v_pk_mov_b32 instead of two v_mov_b32.
;
; The assertion lines are autogenerated by the update scripts named in the
; header lines at the top of this file; regenerate them instead of editing by
; hand.
13288define amdgpu_kernel void @srem_i64_pow2k_denom(i64 addrspace(1)* %out, i64 %x) {
13289; CHECK-LABEL: @srem_i64_pow2k_denom(
13290; CHECK-NEXT:    [[R:%.*]] = srem i64 [[X:%.*]], 4096
13291; CHECK-NEXT:    store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4
13292; CHECK-NEXT:    ret void
13293;
13294; GFX6-LABEL: srem_i64_pow2k_denom:
13295; GFX6:       ; %bb.0:
13296; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
13297; GFX6-NEXT:    s_mov_b32 s3, 0xf000
13298; GFX6-NEXT:    s_mov_b32 s2, -1
13299; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
13300; GFX6-NEXT:    s_mov_b32 s0, s4
13301; GFX6-NEXT:    s_ashr_i32 s4, s7, 31
13302; GFX6-NEXT:    s_lshr_b32 s4, s4, 20
13303; GFX6-NEXT:    s_add_u32 s4, s6, s4
13304; GFX6-NEXT:    s_mov_b32 s1, s5
13305; GFX6-NEXT:    s_addc_u32 s5, s7, 0
13306; GFX6-NEXT:    s_and_b32 s4, s4, 0xfffff000
13307; GFX6-NEXT:    s_sub_u32 s4, s6, s4
13308; GFX6-NEXT:    s_subb_u32 s5, s7, s5
13309; GFX6-NEXT:    v_mov_b32_e32 v0, s4
13310; GFX6-NEXT:    v_mov_b32_e32 v1, s5
13311; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
13312; GFX6-NEXT:    s_endpgm
13313;
13314; GFX9-LABEL: srem_i64_pow2k_denom:
13315; GFX9:       ; %bb.0:
13316; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
13317; GFX9-NEXT:    v_mov_b32_e32 v2, 0
13318; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
13319; GFX9-NEXT:    s_ashr_i32 s4, s3, 31
13320; GFX9-NEXT:    s_lshr_b32 s4, s4, 20
13321; GFX9-NEXT:    s_add_u32 s4, s2, s4
13322; GFX9-NEXT:    s_addc_u32 s5, s3, 0
13323; GFX9-NEXT:    s_and_b32 s4, s4, 0xfffff000
13324; GFX9-NEXT:    s_sub_u32 s2, s2, s4
13325; GFX9-NEXT:    s_subb_u32 s3, s3, s5
13326; GFX9-NEXT:    v_mov_b32_e32 v0, s2
13327; GFX9-NEXT:    v_mov_b32_e32 v1, s3
13328; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
13329; GFX9-NEXT:    s_endpgm
13330;
13331; GFX90A-LABEL: srem_i64_pow2k_denom:
13332; GFX90A:       ; %bb.0:
13333; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
13334; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
13335; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
13336; GFX90A-NEXT:    s_ashr_i32 s4, s3, 31
13337; GFX90A-NEXT:    s_lshr_b32 s4, s4, 20
13338; GFX90A-NEXT:    s_add_u32 s4, s2, s4
13339; GFX90A-NEXT:    s_addc_u32 s5, s3, 0
13340; GFX90A-NEXT:    s_and_b32 s4, s4, 0xfffff000
13341; GFX90A-NEXT:    s_sub_u32 s2, s2, s4
13342; GFX90A-NEXT:    s_subb_u32 s3, s3, s5
13343; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
13344; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
13345; GFX90A-NEXT:    s_endpgm
; IR under test: a single srem by the literal constant, then a store of the result.
13346  %r = srem i64 %x, 4096
13347  store i64 %r, i64 addrspace(1)* %out
13348  ret void
13349}
13350
; srem of an i64 by a divisor computed at run time as (4096 << %y).
; opt assertions: the shl and the 64-bit srem are expected to come out of the
; opt invocation unchanged; the store, which has no explicit alignment in the
; input IR, is printed with align 4.
; llc assertions (one group per -mcpu used in the file header): a single-block,
; straight-line sequence built from v_rcp_f32, 32-bit v_mul_lo/v_mul_hi and
; add/sub-with-carry chains, ending in a dwordx2 store of the result.
; The assertion lines are autogenerated (see the notes at the top of the file);
; regenerate them with the listed scripts rather than editing them by hand.
13351define amdgpu_kernel void @srem_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %x, i64 %y) {
13352; CHECK-LABEL: @srem_i64_pow2_shl_denom(
13353; CHECK-NEXT:    [[SHL_Y:%.*]] = shl i64 4096, [[Y:%.*]]
13354; CHECK-NEXT:    [[R:%.*]] = srem i64 [[X:%.*]], [[SHL_Y]]
13355; CHECK-NEXT:    store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4
13356; CHECK-NEXT:    ret void
13357;
13358; GFX6-LABEL: srem_i64_pow2_shl_denom:
13359; GFX6:       ; %bb.0:
13360; GFX6-NEXT:    s_load_dword s4, s[0:1], 0xd
13361; GFX6-NEXT:    s_mov_b64 s[2:3], 0x1000
13362; GFX6-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x9
13363; GFX6-NEXT:    s_mov_b32 s7, 0xf000
13364; GFX6-NEXT:    s_mov_b32 s6, -1
13365; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
13366; GFX6-NEXT:    s_lshl_b64 s[2:3], s[2:3], s4
13367; GFX6-NEXT:    s_ashr_i32 s4, s3, 31
13368; GFX6-NEXT:    s_add_u32 s2, s2, s4
13369; GFX6-NEXT:    s_mov_b32 s5, s4
13370; GFX6-NEXT:    s_addc_u32 s3, s3, s4
13371; GFX6-NEXT:    s_xor_b64 s[12:13], s[2:3], s[4:5]
13372; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s12
13373; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s13
13374; GFX6-NEXT:    s_sub_u32 s2, 0, s12
13375; GFX6-NEXT:    s_subb_u32 s3, 0, s13
13376; GFX6-NEXT:    s_ashr_i32 s14, s11, 31
13377; GFX6-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
13378; GFX6-NEXT:    v_rcp_f32_e32 v0, v0
13379; GFX6-NEXT:    s_mov_b32 s15, s14
13380; GFX6-NEXT:    s_mov_b32 s4, s8
13381; GFX6-NEXT:    s_mov_b32 s5, s9
13382; GFX6-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
13383; GFX6-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
13384; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
13385; GFX6-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
13386; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
13387; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
13388; GFX6-NEXT:    v_mul_lo_u32 v2, s2, v1
13389; GFX6-NEXT:    v_mul_hi_u32 v3, s2, v0
13390; GFX6-NEXT:    v_mul_lo_u32 v5, s3, v0
13391; GFX6-NEXT:    v_mul_lo_u32 v4, s2, v0
13392; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
13393; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
13394; GFX6-NEXT:    v_mul_hi_u32 v3, v0, v4
13395; GFX6-NEXT:    v_mul_lo_u32 v5, v0, v2
13396; GFX6-NEXT:    v_mul_hi_u32 v6, v0, v2
13397; GFX6-NEXT:    v_mul_hi_u32 v7, v1, v2
13398; GFX6-NEXT:    v_mul_lo_u32 v2, v1, v2
13399; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
13400; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, 0, v6, vcc
13401; GFX6-NEXT:    v_mul_lo_u32 v6, v1, v4
13402; GFX6-NEXT:    v_mul_hi_u32 v4, v1, v4
13403; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
13404; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v5, v4, vcc
13405; GFX6-NEXT:    v_mov_b32_e32 v4, 0
13406; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v7, v4, vcc
13407; GFX6-NEXT:    v_mov_b32_e32 v6, 0
13408; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
13409; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v6, v5, vcc
13410; GFX6-NEXT:    v_add_i32_e64 v0, s[0:1], v0, v2
13411; GFX6-NEXT:    v_addc_u32_e64 v2, vcc, v1, v3, s[0:1]
13412; GFX6-NEXT:    v_mul_lo_u32 v5, s2, v2
13413; GFX6-NEXT:    v_mul_hi_u32 v7, s2, v0
13414; GFX6-NEXT:    v_mul_lo_u32 v8, s3, v0
13415; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
13416; GFX6-NEXT:    v_mul_lo_u32 v7, s2, v0
13417; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
13418; GFX6-NEXT:    v_mul_lo_u32 v10, v0, v5
13419; GFX6-NEXT:    v_mul_hi_u32 v11, v0, v7
13420; GFX6-NEXT:    v_mul_hi_u32 v12, v0, v5
13421; GFX6-NEXT:    v_mul_hi_u32 v9, v2, v7
13422; GFX6-NEXT:    v_mul_lo_u32 v7, v2, v7
13423; GFX6-NEXT:    v_mul_hi_u32 v8, v2, v5
13424; GFX6-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
13425; GFX6-NEXT:    v_addc_u32_e32 v11, vcc, 0, v12, vcc
13426; GFX6-NEXT:    v_mul_lo_u32 v2, v2, v5
13427; GFX6-NEXT:    v_add_i32_e32 v7, vcc, v10, v7
13428; GFX6-NEXT:    v_addc_u32_e32 v7, vcc, v11, v9, vcc
13429; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v8, v4, vcc
13430; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v7, v2
13431; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v6, v5, vcc
13432; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
13433; GFX6-NEXT:    v_addc_u32_e64 v1, vcc, v1, v5, s[0:1]
13434; GFX6-NEXT:    s_add_u32 s0, s10, s14
13435; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
13436; GFX6-NEXT:    s_addc_u32 s1, s11, s14
13437; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
13438; GFX6-NEXT:    s_xor_b64 s[10:11], s[0:1], s[14:15]
13439; GFX6-NEXT:    v_mul_lo_u32 v2, s10, v1
13440; GFX6-NEXT:    v_mul_hi_u32 v3, s10, v0
13441; GFX6-NEXT:    v_mul_hi_u32 v5, s10, v1
13442; GFX6-NEXT:    v_mul_hi_u32 v7, s11, v1
13443; GFX6-NEXT:    v_mul_lo_u32 v1, s11, v1
13444; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
13445; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
13446; GFX6-NEXT:    v_mul_lo_u32 v5, s11, v0
13447; GFX6-NEXT:    v_mul_hi_u32 v0, s11, v0
13448; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
13449; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, v3, v0, vcc
13450; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, v7, v4, vcc
13451; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
13452; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v6, v2, vcc
13453; GFX6-NEXT:    v_mul_lo_u32 v1, s12, v1
13454; GFX6-NEXT:    v_mul_hi_u32 v2, s12, v0
13455; GFX6-NEXT:    v_mul_lo_u32 v3, s13, v0
13456; GFX6-NEXT:    v_mul_lo_u32 v0, s12, v0
13457; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
13458; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
13459; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s11, v1
13460; GFX6-NEXT:    v_mov_b32_e32 v3, s13
13461; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s10, v0
13462; GFX6-NEXT:    v_subb_u32_e64 v2, s[0:1], v2, v3, vcc
13463; GFX6-NEXT:    v_subrev_i32_e64 v4, s[0:1], s12, v0
13464; GFX6-NEXT:    v_subbrev_u32_e64 v5, s[2:3], 0, v2, s[0:1]
13465; GFX6-NEXT:    v_cmp_le_u32_e64 s[2:3], s13, v5
13466; GFX6-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[2:3]
13467; GFX6-NEXT:    v_cmp_le_u32_e64 s[2:3], s12, v4
13468; GFX6-NEXT:    v_subb_u32_e64 v2, s[0:1], v2, v3, s[0:1]
13469; GFX6-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[2:3]
13470; GFX6-NEXT:    v_cmp_eq_u32_e64 s[2:3], s13, v5
13471; GFX6-NEXT:    v_subrev_i32_e64 v3, s[0:1], s12, v4
13472; GFX6-NEXT:    v_cndmask_b32_e64 v6, v6, v7, s[2:3]
13473; GFX6-NEXT:    v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1]
13474; GFX6-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v6
13475; GFX6-NEXT:    v_cndmask_b32_e64 v2, v5, v2, s[0:1]
13476; GFX6-NEXT:    v_mov_b32_e32 v5, s11
13477; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v5, v1, vcc
13478; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s13, v1
13479; GFX6-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
13480; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s12, v0
13481; GFX6-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
13482; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, s13, v1
13483; GFX6-NEXT:    v_cndmask_b32_e32 v5, v5, v6, vcc
13484; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
13485; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
13486; GFX6-NEXT:    v_cndmask_b32_e64 v2, v4, v3, s[0:1]
13487; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
13488; GFX6-NEXT:    v_xor_b32_e32 v0, s14, v0
13489; GFX6-NEXT:    v_xor_b32_e32 v1, s14, v1
13490; GFX6-NEXT:    v_mov_b32_e32 v2, s14
13491; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s14, v0
13492; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v1, v2, vcc
13493; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
13494; GFX6-NEXT:    s_endpgm
13495;
13496; GFX9-LABEL: srem_i64_pow2_shl_denom:
13497; GFX9:       ; %bb.0:
13498; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x34
13499; GFX9-NEXT:    s_mov_b64 s[2:3], 0x1000
13500; GFX9-NEXT:    v_mov_b32_e32 v2, 0
13501; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
13502; GFX9-NEXT:    s_lshl_b64 s[2:3], s[2:3], s4
13503; GFX9-NEXT:    s_ashr_i32 s4, s3, 31
13504; GFX9-NEXT:    s_add_u32 s2, s2, s4
13505; GFX9-NEXT:    s_mov_b32 s5, s4
13506; GFX9-NEXT:    s_addc_u32 s3, s3, s4
13507; GFX9-NEXT:    s_xor_b64 s[8:9], s[2:3], s[4:5]
13508; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s8
13509; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s9
13510; GFX9-NEXT:    s_sub_u32 s10, 0, s8
13511; GFX9-NEXT:    s_subb_u32 s4, 0, s9
13512; GFX9-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
13513; GFX9-NEXT:    v_rcp_f32_e32 v0, v0
13514; GFX9-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
13515; GFX9-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
13516; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
13517; GFX9-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
13518; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
13519; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
13520; GFX9-NEXT:    v_mul_lo_u32 v3, s10, v1
13521; GFX9-NEXT:    v_mul_hi_u32 v4, s10, v0
13522; GFX9-NEXT:    v_mul_lo_u32 v6, s4, v0
13523; GFX9-NEXT:    v_mul_lo_u32 v5, s10, v0
13524; GFX9-NEXT:    v_add_u32_e32 v3, v4, v3
13525; GFX9-NEXT:    v_add_u32_e32 v3, v3, v6
13526; GFX9-NEXT:    v_mul_hi_u32 v4, v0, v5
13527; GFX9-NEXT:    v_mul_lo_u32 v6, v0, v3
13528; GFX9-NEXT:    v_mul_hi_u32 v8, v0, v3
13529; GFX9-NEXT:    v_mul_hi_u32 v7, v1, v5
13530; GFX9-NEXT:    v_mul_lo_u32 v5, v1, v5
13531; GFX9-NEXT:    v_mul_hi_u32 v9, v1, v3
13532; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v6
13533; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, 0, v8, vcc
13534; GFX9-NEXT:    v_mul_lo_u32 v3, v1, v3
13535; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v5
13536; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v6, v7, vcc
13537; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v9, v2, vcc
13538; GFX9-NEXT:    v_mov_b32_e32 v6, 0
13539; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v4, v3
13540; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v6, v5, vcc
13541; GFX9-NEXT:    v_add_co_u32_e64 v0, s[2:3], v0, v3
13542; GFX9-NEXT:    v_addc_co_u32_e64 v3, vcc, v1, v4, s[2:3]
13543; GFX9-NEXT:    v_mul_lo_u32 v5, s10, v3
13544; GFX9-NEXT:    v_mul_hi_u32 v7, s10, v0
13545; GFX9-NEXT:    v_mul_lo_u32 v8, s4, v0
13546; GFX9-NEXT:    v_mul_lo_u32 v9, s10, v0
13547; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
13548; GFX9-NEXT:    v_add_u32_e32 v5, v7, v5
13549; GFX9-NEXT:    v_add_u32_e32 v5, v5, v8
13550; GFX9-NEXT:    v_mul_lo_u32 v10, v0, v5
13551; GFX9-NEXT:    v_mul_hi_u32 v11, v0, v9
13552; GFX9-NEXT:    v_mul_hi_u32 v12, v0, v5
13553; GFX9-NEXT:    v_mul_hi_u32 v8, v3, v9
13554; GFX9-NEXT:    v_mul_lo_u32 v9, v3, v9
13555; GFX9-NEXT:    v_mul_hi_u32 v7, v3, v5
13556; GFX9-NEXT:    v_add_co_u32_e32 v10, vcc, v11, v10
13557; GFX9-NEXT:    v_addc_co_u32_e32 v11, vcc, 0, v12, vcc
13558; GFX9-NEXT:    v_mul_lo_u32 v3, v3, v5
13559; GFX9-NEXT:    v_add_co_u32_e32 v9, vcc, v10, v9
13560; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, v11, v8, vcc
13561; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v7, v2, vcc
13562; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v8, v3
13563; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v6, v5, vcc
13564; GFX9-NEXT:    v_add_u32_e32 v1, v1, v4
13565; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
13566; GFX9-NEXT:    s_ashr_i32 s10, s7, 31
13567; GFX9-NEXT:    v_addc_co_u32_e64 v1, vcc, v1, v5, s[2:3]
13568; GFX9-NEXT:    s_add_u32 s0, s6, s10
13569; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v3
13570; GFX9-NEXT:    s_mov_b32 s11, s10
13571; GFX9-NEXT:    s_addc_u32 s1, s7, s10
13572; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
13573; GFX9-NEXT:    s_xor_b64 s[6:7], s[0:1], s[10:11]
13574; GFX9-NEXT:    v_mul_lo_u32 v3, s6, v1
13575; GFX9-NEXT:    v_mul_hi_u32 v4, s6, v0
13576; GFX9-NEXT:    v_mul_hi_u32 v5, s6, v1
13577; GFX9-NEXT:    v_mul_hi_u32 v7, s7, v1
13578; GFX9-NEXT:    v_mul_lo_u32 v1, s7, v1
13579; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v4, v3
13580; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v5, vcc
13581; GFX9-NEXT:    v_mul_lo_u32 v5, s7, v0
13582; GFX9-NEXT:    v_mul_hi_u32 v0, s7, v0
13583; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v5
13584; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, v4, v0, vcc
13585; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v7, v2, vcc
13586; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v1
13587; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v6, v3, vcc
13588; GFX9-NEXT:    v_mul_lo_u32 v1, s8, v1
13589; GFX9-NEXT:    v_mul_hi_u32 v3, s8, v0
13590; GFX9-NEXT:    v_mul_lo_u32 v4, s9, v0
13591; GFX9-NEXT:    v_mul_lo_u32 v0, s8, v0
13592; GFX9-NEXT:    v_add_u32_e32 v1, v3, v1
13593; GFX9-NEXT:    v_add_u32_e32 v1, v1, v4
13594; GFX9-NEXT:    v_sub_u32_e32 v3, s7, v1
13595; GFX9-NEXT:    v_mov_b32_e32 v4, s9
13596; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s6, v0
13597; GFX9-NEXT:    v_subb_co_u32_e64 v3, s[0:1], v3, v4, vcc
13598; GFX9-NEXT:    v_subrev_co_u32_e64 v5, s[0:1], s8, v0
13599; GFX9-NEXT:    v_subbrev_co_u32_e64 v6, s[2:3], 0, v3, s[0:1]
13600; GFX9-NEXT:    v_cmp_le_u32_e64 s[2:3], s9, v6
13601; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[2:3]
13602; GFX9-NEXT:    v_cmp_le_u32_e64 s[2:3], s8, v5
13603; GFX9-NEXT:    v_subb_co_u32_e64 v3, s[0:1], v3, v4, s[0:1]
13604; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[2:3]
13605; GFX9-NEXT:    v_cmp_eq_u32_e64 s[2:3], s9, v6
13606; GFX9-NEXT:    v_subrev_co_u32_e64 v4, s[0:1], s8, v5
13607; GFX9-NEXT:    v_cndmask_b32_e64 v7, v7, v8, s[2:3]
13608; GFX9-NEXT:    v_subbrev_co_u32_e64 v3, s[0:1], 0, v3, s[0:1]
13609; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v7
13610; GFX9-NEXT:    v_cndmask_b32_e64 v4, v5, v4, s[0:1]
13611; GFX9-NEXT:    v_mov_b32_e32 v5, s7
13612; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v5, v1, vcc
13613; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s9, v1
13614; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
13615; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s8, v0
13616; GFX9-NEXT:    v_cndmask_b32_e64 v3, v6, v3, s[0:1]
13617; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
13618; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, s9, v1
13619; GFX9-NEXT:    v_cndmask_b32_e32 v5, v5, v6, vcc
13620; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
13621; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
13622; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
13623; GFX9-NEXT:    v_xor_b32_e32 v0, s10, v0
13624; GFX9-NEXT:    v_xor_b32_e32 v1, s10, v1
13625; GFX9-NEXT:    v_mov_b32_e32 v3, s10
13626; GFX9-NEXT:    v_subrev_co_u32_e32 v0, vcc, s10, v0
13627; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
13628; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
13629; GFX9-NEXT:    s_endpgm
13630;
13631; GFX90A-LABEL: srem_i64_pow2_shl_denom:
13632; GFX90A:       ; %bb.0:
13633; GFX90A-NEXT:    s_load_dword s4, s[0:1], 0x34
13634; GFX90A-NEXT:    s_mov_b64 s[2:3], 0x1000
13635; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
13636; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
13637; GFX90A-NEXT:    s_lshl_b64 s[2:3], s[2:3], s4
13638; GFX90A-NEXT:    s_ashr_i32 s4, s3, 31
13639; GFX90A-NEXT:    s_add_u32 s2, s2, s4
13640; GFX90A-NEXT:    s_mov_b32 s5, s4
13641; GFX90A-NEXT:    s_addc_u32 s3, s3, s4
13642; GFX90A-NEXT:    s_xor_b64 s[8:9], s[2:3], s[4:5]
13643; GFX90A-NEXT:    v_cvt_f32_u32_e32 v0, s8
13644; GFX90A-NEXT:    v_cvt_f32_u32_e32 v1, s9
13645; GFX90A-NEXT:    s_sub_u32 s2, 0, s8
13646; GFX90A-NEXT:    s_subb_u32 s3, 0, s9
13647; GFX90A-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
13648; GFX90A-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
13649; GFX90A-NEXT:    v_rcp_f32_e32 v0, v0
13650; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
13651; GFX90A-NEXT:    s_ashr_i32 s10, s7, 31
13652; GFX90A-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
13653; GFX90A-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
13654; GFX90A-NEXT:    v_trunc_f32_e32 v1, v1
13655; GFX90A-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
13656; GFX90A-NEXT:    v_cvt_u32_f32_e32 v1, v1
13657; GFX90A-NEXT:    v_cvt_u32_f32_e32 v0, v0
13658; GFX90A-NEXT:    s_mov_b32 s11, s10
13659; GFX90A-NEXT:    v_mul_lo_u32 v3, s2, v1
13660; GFX90A-NEXT:    v_mul_hi_u32 v5, s2, v0
13661; GFX90A-NEXT:    v_mul_lo_u32 v4, s3, v0
13662; GFX90A-NEXT:    v_add_u32_e32 v3, v5, v3
13663; GFX90A-NEXT:    v_mul_lo_u32 v6, s2, v0
13664; GFX90A-NEXT:    v_add_u32_e32 v3, v3, v4
13665; GFX90A-NEXT:    v_mul_lo_u32 v5, v0, v3
13666; GFX90A-NEXT:    v_mul_hi_u32 v7, v0, v6
13667; GFX90A-NEXT:    v_mul_hi_u32 v4, v0, v3
13668; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v7, v5
13669; GFX90A-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
13670; GFX90A-NEXT:    v_mul_hi_u32 v8, v1, v6
13671; GFX90A-NEXT:    v_mul_lo_u32 v6, v1, v6
13672; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v5, v6
13673; GFX90A-NEXT:    v_mul_hi_u32 v7, v1, v3
13674; GFX90A-NEXT:    v_addc_co_u32_e32 v4, vcc, v4, v8, vcc
13675; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v7, v2, vcc
13676; GFX90A-NEXT:    v_mul_lo_u32 v3, v1, v3
13677; GFX90A-NEXT:    v_mov_b32_e32 v6, 0
13678; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, v4, v3
13679; GFX90A-NEXT:    v_addc_co_u32_e32 v4, vcc, v6, v5, vcc
13680; GFX90A-NEXT:    v_add_co_u32_e64 v0, s[0:1], v0, v3
13681; GFX90A-NEXT:    v_addc_co_u32_e64 v3, vcc, v1, v4, s[0:1]
13682; GFX90A-NEXT:    v_mul_lo_u32 v5, s2, v3
13683; GFX90A-NEXT:    v_mul_hi_u32 v7, s2, v0
13684; GFX90A-NEXT:    v_add_u32_e32 v5, v7, v5
13685; GFX90A-NEXT:    v_mul_lo_u32 v7, s3, v0
13686; GFX90A-NEXT:    v_add_u32_e32 v5, v5, v7
13687; GFX90A-NEXT:    v_mul_lo_u32 v8, s2, v0
13688; GFX90A-NEXT:    v_mul_hi_u32 v9, v3, v8
13689; GFX90A-NEXT:    v_mul_lo_u32 v10, v3, v8
13690; GFX90A-NEXT:    v_mul_lo_u32 v12, v0, v5
13691; GFX90A-NEXT:    v_mul_hi_u32 v8, v0, v8
13692; GFX90A-NEXT:    v_mul_hi_u32 v11, v0, v5
13693; GFX90A-NEXT:    v_add_co_u32_e32 v8, vcc, v8, v12
13694; GFX90A-NEXT:    v_addc_co_u32_e32 v11, vcc, 0, v11, vcc
13695; GFX90A-NEXT:    v_add_co_u32_e32 v8, vcc, v8, v10
13696; GFX90A-NEXT:    v_mul_hi_u32 v7, v3, v5
13697; GFX90A-NEXT:    v_addc_co_u32_e32 v8, vcc, v11, v9, vcc
13698; GFX90A-NEXT:    v_addc_co_u32_e32 v7, vcc, v7, v2, vcc
13699; GFX90A-NEXT:    v_mul_lo_u32 v3, v3, v5
13700; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, v8, v3
13701; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v6, v7, vcc
13702; GFX90A-NEXT:    v_add_u32_e32 v1, v1, v4
13703; GFX90A-NEXT:    v_addc_co_u32_e64 v1, vcc, v1, v5, s[0:1]
13704; GFX90A-NEXT:    s_add_u32 s0, s6, s10
13705; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v3
13706; GFX90A-NEXT:    s_addc_u32 s1, s7, s10
13707; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
13708; GFX90A-NEXT:    s_xor_b64 s[6:7], s[0:1], s[10:11]
13709; GFX90A-NEXT:    v_mul_lo_u32 v4, s6, v1
13710; GFX90A-NEXT:    v_mul_hi_u32 v5, s6, v0
13711; GFX90A-NEXT:    v_mul_hi_u32 v3, s6, v1
13712; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, v5, v4
13713; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
13714; GFX90A-NEXT:    v_mul_hi_u32 v7, s7, v0
13715; GFX90A-NEXT:    v_mul_lo_u32 v0, s7, v0
13716; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v4, v0
13717; GFX90A-NEXT:    v_mul_hi_u32 v5, s7, v1
13718; GFX90A-NEXT:    v_addc_co_u32_e32 v0, vcc, v3, v7, vcc
13719; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, v5, v2, vcc
13720; GFX90A-NEXT:    v_mul_lo_u32 v1, s7, v1
13721; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v1
13722; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, v6, v3, vcc
13723; GFX90A-NEXT:    v_mul_lo_u32 v1, s8, v1
13724; GFX90A-NEXT:    v_mul_hi_u32 v3, s8, v0
13725; GFX90A-NEXT:    v_add_u32_e32 v1, v3, v1
13726; GFX90A-NEXT:    v_mul_lo_u32 v3, s9, v0
13727; GFX90A-NEXT:    v_add_u32_e32 v1, v1, v3
13728; GFX90A-NEXT:    v_mul_lo_u32 v0, s8, v0
13729; GFX90A-NEXT:    v_sub_u32_e32 v3, s7, v1
13730; GFX90A-NEXT:    v_mov_b32_e32 v4, s9
13731; GFX90A-NEXT:    v_sub_co_u32_e32 v0, vcc, s6, v0
13732; GFX90A-NEXT:    v_subb_co_u32_e64 v3, s[0:1], v3, v4, vcc
13733; GFX90A-NEXT:    v_subrev_co_u32_e64 v5, s[0:1], s8, v0
13734; GFX90A-NEXT:    v_subbrev_co_u32_e64 v6, s[2:3], 0, v3, s[0:1]
13735; GFX90A-NEXT:    v_cmp_le_u32_e64 s[2:3], s9, v6
13736; GFX90A-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[2:3]
13737; GFX90A-NEXT:    v_cmp_le_u32_e64 s[2:3], s8, v5
13738; GFX90A-NEXT:    v_subb_co_u32_e64 v3, s[0:1], v3, v4, s[0:1]
13739; GFX90A-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[2:3]
13740; GFX90A-NEXT:    v_cmp_eq_u32_e64 s[2:3], s9, v6
13741; GFX90A-NEXT:    v_subrev_co_u32_e64 v4, s[0:1], s8, v5
13742; GFX90A-NEXT:    v_cndmask_b32_e64 v7, v7, v8, s[2:3]
13743; GFX90A-NEXT:    v_subbrev_co_u32_e64 v3, s[0:1], 0, v3, s[0:1]
13744; GFX90A-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v7
13745; GFX90A-NEXT:    v_cndmask_b32_e64 v4, v5, v4, s[0:1]
13746; GFX90A-NEXT:    v_mov_b32_e32 v5, s7
13747; GFX90A-NEXT:    v_subb_co_u32_e32 v1, vcc, v5, v1, vcc
13748; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s9, v1
13749; GFX90A-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
13750; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s8, v0
13751; GFX90A-NEXT:    v_cndmask_b32_e64 v3, v6, v3, s[0:1]
13752; GFX90A-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
13753; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, s9, v1
13754; GFX90A-NEXT:    v_cndmask_b32_e32 v5, v5, v6, vcc
13755; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
13756; GFX90A-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
13757; GFX90A-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
13758; GFX90A-NEXT:    v_xor_b32_e32 v0, s10, v0
13759; GFX90A-NEXT:    v_xor_b32_e32 v1, s10, v1
13760; GFX90A-NEXT:    v_mov_b32_e32 v3, s10
13761; GFX90A-NEXT:    v_subrev_co_u32_e32 v0, vcc, s10, v0
13762; GFX90A-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
13763; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
13764; GFX90A-NEXT:    s_endpgm
  ; Divisor: the constant 4096 shifted left by the kernel argument %y.
13765  %shl.y = shl i64 4096, %y
  ; Signed remainder of %x by that divisor; the result is written to %out.
13766  %r = srem i64 %x, %shl.y
13767  store i64 %r, i64 addrspace(1)* %out
13768  ret void
13769}
13770
; srem of a <2 x i64> by the constant splat <4096, 4096>.
; opt assertions: the vector srem is split into two scalar "srem i64 ..., 4096"
; operations (extractelement / insertelement, starting from undef); the store,
; which has no explicit alignment in the input IR, is printed with align 16.
; llc assertions: no reciprocal or multiply instructions appear; each element
; is handled with scalar shifts (ashr by 31, lshr by 20), add/addc, an and with
; the 0xf000 constant loaded by s_movk_i32, and sub/subb, after which the four
; dwords are copied to VGPRs for a single dwordx4 store.
; The assertion lines are autogenerated (see the notes at the top of the file);
; regenerate them with the listed scripts rather than editing them by hand.
13771define amdgpu_kernel void @srem_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) {
13772; CHECK-LABEL: @srem_v2i64_pow2k_denom(
13773; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0
13774; CHECK-NEXT:    [[TMP2:%.*]] = srem i64 [[TMP1]], 4096
13775; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i64 0
13776; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1
13777; CHECK-NEXT:    [[TMP5:%.*]] = srem i64 [[TMP4]], 4096
13778; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1
13779; CHECK-NEXT:    store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16
13780; CHECK-NEXT:    ret void
13781;
13782; GFX6-LABEL: srem_v2i64_pow2k_denom:
13783; GFX6:       ; %bb.0:
13784; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
13785; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xd
13786; GFX6-NEXT:    s_movk_i32 s8, 0xf000
13787; GFX6-NEXT:    s_mov_b32 s7, 0xf000
13788; GFX6-NEXT:    s_mov_b32 s6, -1
13789; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
13790; GFX6-NEXT:    s_ashr_i32 s9, s1, 31
13791; GFX6-NEXT:    s_lshr_b32 s9, s9, 20
13792; GFX6-NEXT:    s_add_u32 s9, s0, s9
13793; GFX6-NEXT:    s_addc_u32 s10, s1, 0
13794; GFX6-NEXT:    s_and_b32 s9, s9, s8
13795; GFX6-NEXT:    s_sub_u32 s0, s0, s9
13796; GFX6-NEXT:    s_subb_u32 s1, s1, s10
13797; GFX6-NEXT:    s_ashr_i32 s9, s3, 31
13798; GFX6-NEXT:    s_lshr_b32 s9, s9, 20
13799; GFX6-NEXT:    s_add_u32 s9, s2, s9
13800; GFX6-NEXT:    s_addc_u32 s10, s3, 0
13801; GFX6-NEXT:    s_and_b32 s8, s9, s8
13802; GFX6-NEXT:    s_sub_u32 s2, s2, s8
13803; GFX6-NEXT:    s_subb_u32 s3, s3, s10
13804; GFX6-NEXT:    v_mov_b32_e32 v0, s0
13805; GFX6-NEXT:    v_mov_b32_e32 v1, s1
13806; GFX6-NEXT:    v_mov_b32_e32 v2, s2
13807; GFX6-NEXT:    v_mov_b32_e32 v3, s3
13808; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
13809; GFX6-NEXT:    s_endpgm
13810;
13811; GFX9-LABEL: srem_v2i64_pow2k_denom:
13812; GFX9:       ; %bb.0:
13813; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
13814; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
13815; GFX9-NEXT:    s_movk_i32 s8, 0xf000
13816; GFX9-NEXT:    v_mov_b32_e32 v4, 0
13817; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
13818; GFX9-NEXT:    s_ashr_i32 s0, s5, 31
13819; GFX9-NEXT:    s_lshr_b32 s0, s0, 20
13820; GFX9-NEXT:    s_add_u32 s0, s4, s0
13821; GFX9-NEXT:    s_addc_u32 s1, s5, 0
13822; GFX9-NEXT:    s_and_b32 s0, s0, s8
13823; GFX9-NEXT:    s_sub_u32 s0, s4, s0
13824; GFX9-NEXT:    s_subb_u32 s1, s5, s1
13825; GFX9-NEXT:    s_ashr_i32 s4, s7, 31
13826; GFX9-NEXT:    s_lshr_b32 s4, s4, 20
13827; GFX9-NEXT:    s_add_u32 s4, s6, s4
13828; GFX9-NEXT:    s_addc_u32 s5, s7, 0
13829; GFX9-NEXT:    s_and_b32 s4, s4, s8
13830; GFX9-NEXT:    s_sub_u32 s4, s6, s4
13831; GFX9-NEXT:    s_subb_u32 s5, s7, s5
13832; GFX9-NEXT:    v_mov_b32_e32 v0, s0
13833; GFX9-NEXT:    v_mov_b32_e32 v1, s1
13834; GFX9-NEXT:    v_mov_b32_e32 v2, s4
13835; GFX9-NEXT:    v_mov_b32_e32 v3, s5
13836; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
13837; GFX9-NEXT:    s_endpgm
13838;
13839; GFX90A-LABEL: srem_v2i64_pow2k_denom:
13840; GFX90A:       ; %bb.0:
13841; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
13842; GFX90A-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
13843; GFX90A-NEXT:    s_movk_i32 s8, 0xf000
13844; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
13845; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
13846; GFX90A-NEXT:    s_ashr_i32 s0, s5, 31
13847; GFX90A-NEXT:    s_lshr_b32 s0, s0, 20
13848; GFX90A-NEXT:    s_add_u32 s0, s4, s0
13849; GFX90A-NEXT:    s_addc_u32 s1, s5, 0
13850; GFX90A-NEXT:    s_and_b32 s0, s0, s8
13851; GFX90A-NEXT:    s_sub_u32 s0, s4, s0
13852; GFX90A-NEXT:    s_subb_u32 s1, s5, s1
13853; GFX90A-NEXT:    s_ashr_i32 s4, s7, 31
13854; GFX90A-NEXT:    s_lshr_b32 s4, s4, 20
13855; GFX90A-NEXT:    s_add_u32 s4, s6, s4
13856; GFX90A-NEXT:    s_addc_u32 s5, s7, 0
13857; GFX90A-NEXT:    s_and_b32 s4, s4, s8
13858; GFX90A-NEXT:    s_sub_u32 s4, s6, s4
13859; GFX90A-NEXT:    s_subb_u32 s5, s7, s5
13860; GFX90A-NEXT:    v_mov_b32_e32 v0, s0
13861; GFX90A-NEXT:    v_mov_b32_e32 v1, s1
13862; GFX90A-NEXT:    v_mov_b32_e32 v2, s4
13863; GFX90A-NEXT:    v_mov_b32_e32 v3, s5
13864; GFX90A-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
13865; GFX90A-NEXT:    s_endpgm
  ; Both lanes use the same compile-time constant divisor, 4096.
13866  %r = srem <2 x i64> %x, <i64 4096, i64 4096>
13867  store <2 x i64> %r, <2 x i64> addrspace(1)* %out
13868  ret void
13869}
13870
13871define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x, <2 x i64> %y) {
13872; CHECK-LABEL: @srem_v2i64_pow2_shl_denom(
13873; CHECK-NEXT:    [[SHL_Y:%.*]] = shl <2 x i64> <i64 4096, i64 4096>, [[Y:%.*]]
13874; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0
13875; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 0
13876; CHECK-NEXT:    [[TMP3:%.*]] = srem i64 [[TMP1]], [[TMP2]]
13877; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x i64> undef, i64 [[TMP3]], i64 0
13878; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[X]], i64 1
13879; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 1
13880; CHECK-NEXT:    [[TMP7:%.*]] = srem i64 [[TMP5]], [[TMP6]]
13881; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[TMP7]], i64 1
13882; CHECK-NEXT:    store <2 x i64> [[TMP8]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16
13883; CHECK-NEXT:    ret void
13884;
13885; GFX6-LABEL: srem_v2i64_pow2_shl_denom:
13886; GFX6:       ; %bb.0:
13887; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x11
13888; GFX6-NEXT:    s_mov_b64 s[2:3], 0x1000
13889; GFX6-NEXT:    s_mov_b32 s18, 0x4f800000
13890; GFX6-NEXT:    s_mov_b32 s19, 0x5f7ffffc
13891; GFX6-NEXT:    s_mov_b32 s20, 0x2f800000
13892; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
13893; GFX6-NEXT:    s_lshl_b64 s[14:15], s[2:3], s6
13894; GFX6-NEXT:    s_lshl_b64 s[2:3], s[2:3], s4
13895; GFX6-NEXT:    s_ashr_i32 s4, s3, 31
13896; GFX6-NEXT:    s_add_u32 s2, s2, s4
13897; GFX6-NEXT:    s_mov_b32 s5, s4
13898; GFX6-NEXT:    s_addc_u32 s3, s3, s4
13899; GFX6-NEXT:    s_xor_b64 s[16:17], s[2:3], s[4:5]
13900; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s16
13901; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s17
13902; GFX6-NEXT:    s_mov_b32 s21, 0xcf800000
13903; GFX6-NEXT:    s_sub_u32 s6, 0, s16
13904; GFX6-NEXT:    s_subb_u32 s7, 0, s17
13905; GFX6-NEXT:    v_mac_f32_e32 v0, s18, v1
13906; GFX6-NEXT:    v_rcp_f32_e32 v0, v0
13907; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
13908; GFX6-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0xd
13909; GFX6-NEXT:    v_mul_f32_e32 v0, s19, v0
13910; GFX6-NEXT:    v_mul_f32_e32 v1, s20, v0
13911; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
13912; GFX6-NEXT:    v_mac_f32_e32 v0, s21, v1
13913; GFX6-NEXT:    v_cvt_u32_f32_e32 v2, v1
13914; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v0
13915; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
13916; GFX6-NEXT:    s_ashr_i32 s12, s9, 31
13917; GFX6-NEXT:    s_add_u32 s0, s8, s12
13918; GFX6-NEXT:    v_mul_lo_u32 v0, s6, v2
13919; GFX6-NEXT:    v_mul_hi_u32 v1, s6, v3
13920; GFX6-NEXT:    v_mul_lo_u32 v4, s7, v3
13921; GFX6-NEXT:    v_mul_lo_u32 v5, s6, v3
13922; GFX6-NEXT:    s_mov_b32 s13, s12
13923; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
13924; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v0, v4
13925; GFX6-NEXT:    v_mul_lo_u32 v0, v3, v1
13926; GFX6-NEXT:    v_mul_hi_u32 v4, v3, v5
13927; GFX6-NEXT:    v_mul_hi_u32 v6, v3, v1
13928; GFX6-NEXT:    v_mul_hi_u32 v7, v2, v1
13929; GFX6-NEXT:    s_addc_u32 s1, s9, s12
13930; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v4, v0
13931; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, 0, v6, vcc
13932; GFX6-NEXT:    v_mul_lo_u32 v6, v2, v5
13933; GFX6-NEXT:    v_mul_hi_u32 v5, v2, v5
13934; GFX6-NEXT:    s_xor_b64 s[8:9], s[0:1], s[12:13]
13935; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v6
13936; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, v4, v5, vcc
13937; GFX6-NEXT:    v_mul_lo_u32 v5, v2, v1
13938; GFX6-NEXT:    v_mov_b32_e32 v0, 0
13939; GFX6-NEXT:    v_addc_u32_e32 v6, vcc, v7, v0, vcc
13940; GFX6-NEXT:    v_mov_b32_e32 v1, 0
13941; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
13942; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v1, v6, vcc
13943; GFX6-NEXT:    v_add_i32_e64 v3, s[2:3], v3, v4
13944; GFX6-NEXT:    v_addc_u32_e64 v4, vcc, v2, v5, s[2:3]
13945; GFX6-NEXT:    v_mul_lo_u32 v6, s6, v4
13946; GFX6-NEXT:    v_mul_hi_u32 v7, s6, v3
13947; GFX6-NEXT:    v_mul_lo_u32 v8, s7, v3
13948; GFX6-NEXT:    s_mov_b32 s7, 0xf000
13949; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
13950; GFX6-NEXT:    v_mul_lo_u32 v7, s6, v3
13951; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v8, v6
13952; GFX6-NEXT:    v_mul_lo_u32 v10, v3, v6
13953; GFX6-NEXT:    v_mul_hi_u32 v11, v3, v7
13954; GFX6-NEXT:    v_mul_hi_u32 v12, v3, v6
13955; GFX6-NEXT:    v_mul_hi_u32 v9, v4, v7
13956; GFX6-NEXT:    v_mul_lo_u32 v7, v4, v7
13957; GFX6-NEXT:    v_mul_hi_u32 v8, v4, v6
13958; GFX6-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
13959; GFX6-NEXT:    v_addc_u32_e32 v11, vcc, 0, v12, vcc
13960; GFX6-NEXT:    v_mul_lo_u32 v4, v4, v6
13961; GFX6-NEXT:    v_add_i32_e32 v7, vcc, v10, v7
13962; GFX6-NEXT:    v_addc_u32_e32 v7, vcc, v11, v9, vcc
13963; GFX6-NEXT:    v_addc_u32_e32 v6, vcc, v8, v0, vcc
13964; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v7, v4
13965; GFX6-NEXT:    v_addc_u32_e32 v6, vcc, v1, v6, vcc
13966; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
13967; GFX6-NEXT:    v_addc_u32_e64 v2, vcc, v2, v6, s[2:3]
13968; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
13969; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
13970; GFX6-NEXT:    v_mul_lo_u32 v4, s8, v2
13971; GFX6-NEXT:    v_mul_hi_u32 v5, s8, v3
13972; GFX6-NEXT:    v_mul_hi_u32 v6, s8, v2
13973; GFX6-NEXT:    v_mul_hi_u32 v7, s9, v2
13974; GFX6-NEXT:    v_mul_lo_u32 v2, s9, v2
13975; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
13976; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, 0, v6, vcc
13977; GFX6-NEXT:    v_mul_lo_u32 v6, s9, v3
13978; GFX6-NEXT:    v_mul_hi_u32 v3, s9, v3
13979; GFX6-NEXT:    s_mov_b32 s6, -1
13980; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
13981; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v5, v3, vcc
13982; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, v7, v0, vcc
13983; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
13984; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v1, v4, vcc
13985; GFX6-NEXT:    v_mul_lo_u32 v3, s16, v3
13986; GFX6-NEXT:    v_mul_hi_u32 v4, s16, v2
13987; GFX6-NEXT:    v_mul_lo_u32 v5, s17, v2
13988; GFX6-NEXT:    v_mul_lo_u32 v2, s16, v2
13989; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
13990; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
13991; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, s9, v3
13992; GFX6-NEXT:    v_mov_b32_e32 v5, s17
13993; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s8, v2
13994; GFX6-NEXT:    v_subb_u32_e64 v4, s[0:1], v4, v5, vcc
13995; GFX6-NEXT:    v_subrev_i32_e64 v6, s[0:1], s16, v2
13996; GFX6-NEXT:    v_subbrev_u32_e64 v7, s[2:3], 0, v4, s[0:1]
13997; GFX6-NEXT:    v_cmp_le_u32_e64 s[2:3], s17, v7
13998; GFX6-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[2:3]
13999; GFX6-NEXT:    v_cmp_le_u32_e64 s[2:3], s16, v6
14000; GFX6-NEXT:    v_subb_u32_e64 v4, s[0:1], v4, v5, s[0:1]
14001; GFX6-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[2:3]
14002; GFX6-NEXT:    v_cmp_eq_u32_e64 s[2:3], s17, v7
14003; GFX6-NEXT:    v_subrev_i32_e64 v5, s[0:1], s16, v6
14004; GFX6-NEXT:    v_cndmask_b32_e64 v8, v8, v9, s[2:3]
14005; GFX6-NEXT:    v_subbrev_u32_e64 v4, s[0:1], 0, v4, s[0:1]
14006; GFX6-NEXT:    s_ashr_i32 s2, s15, 31
14007; GFX6-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v8
14008; GFX6-NEXT:    s_add_u32 s8, s14, s2
14009; GFX6-NEXT:    v_cndmask_b32_e64 v4, v7, v4, s[0:1]
14010; GFX6-NEXT:    v_mov_b32_e32 v7, s9
14011; GFX6-NEXT:    s_mov_b32 s3, s2
14012; GFX6-NEXT:    s_addc_u32 s9, s15, s2
14013; GFX6-NEXT:    s_xor_b64 s[8:9], s[8:9], s[2:3]
14014; GFX6-NEXT:    v_cvt_f32_u32_e32 v8, s8
14015; GFX6-NEXT:    v_cvt_f32_u32_e32 v9, s9
14016; GFX6-NEXT:    v_subb_u32_e32 v3, vcc, v7, v3, vcc
14017; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s17, v3
14018; GFX6-NEXT:    v_mac_f32_e32 v8, s18, v9
14019; GFX6-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
14020; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s16, v2
14021; GFX6-NEXT:    v_rcp_f32_e32 v8, v8
14022; GFX6-NEXT:    v_cndmask_b32_e64 v10, 0, -1, vcc
14023; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, s17, v3
14024; GFX6-NEXT:    v_cndmask_b32_e32 v7, v7, v10, vcc
14025; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v7
14026; GFX6-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
14027; GFX6-NEXT:    v_cndmask_b32_e64 v4, v6, v5, s[0:1]
14028; GFX6-NEXT:    v_mul_f32_e32 v5, s19, v8
14029; GFX6-NEXT:    v_mul_f32_e32 v6, s20, v5
14030; GFX6-NEXT:    v_trunc_f32_e32 v6, v6
14031; GFX6-NEXT:    v_mac_f32_e32 v5, s21, v6
14032; GFX6-NEXT:    v_cvt_u32_f32_e32 v5, v5
14033; GFX6-NEXT:    v_cvt_u32_f32_e32 v6, v6
14034; GFX6-NEXT:    s_sub_u32 s2, 0, s8
14035; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
14036; GFX6-NEXT:    v_mul_hi_u32 v4, s2, v5
14037; GFX6-NEXT:    v_mul_lo_u32 v7, s2, v6
14038; GFX6-NEXT:    s_subb_u32 s3, 0, s9
14039; GFX6-NEXT:    v_mul_lo_u32 v8, s3, v5
14040; GFX6-NEXT:    s_ashr_i32 s14, s11, 31
14041; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v7
14042; GFX6-NEXT:    v_mul_lo_u32 v7, s2, v5
14043; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v8
14044; GFX6-NEXT:    v_mul_lo_u32 v8, v5, v4
14045; GFX6-NEXT:    v_mul_hi_u32 v9, v5, v7
14046; GFX6-NEXT:    v_mul_hi_u32 v10, v5, v4
14047; GFX6-NEXT:    v_mul_hi_u32 v11, v6, v4
14048; GFX6-NEXT:    v_mul_lo_u32 v4, v6, v4
14049; GFX6-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
14050; GFX6-NEXT:    v_addc_u32_e32 v9, vcc, 0, v10, vcc
14051; GFX6-NEXT:    v_mul_lo_u32 v10, v6, v7
14052; GFX6-NEXT:    v_mul_hi_u32 v7, v6, v7
14053; GFX6-NEXT:    s_mov_b32 s15, s14
14054; GFX6-NEXT:    v_xor_b32_e32 v2, s12, v2
14055; GFX6-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
14056; GFX6-NEXT:    v_addc_u32_e32 v7, vcc, v9, v7, vcc
14057; GFX6-NEXT:    v_addc_u32_e32 v8, vcc, v11, v0, vcc
14058; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v7, v4
14059; GFX6-NEXT:    v_addc_u32_e32 v7, vcc, v1, v8, vcc
14060; GFX6-NEXT:    v_add_i32_e64 v4, s[0:1], v5, v4
14061; GFX6-NEXT:    v_addc_u32_e64 v5, vcc, v6, v7, s[0:1]
14062; GFX6-NEXT:    v_mul_lo_u32 v8, s2, v5
14063; GFX6-NEXT:    v_mul_hi_u32 v9, s2, v4
14064; GFX6-NEXT:    v_mul_lo_u32 v10, s3, v4
14065; GFX6-NEXT:    v_xor_b32_e32 v3, s12, v3
14066; GFX6-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
14067; GFX6-NEXT:    v_mul_lo_u32 v9, s2, v4
14068; GFX6-NEXT:    v_add_i32_e32 v8, vcc, v10, v8
14069; GFX6-NEXT:    v_mul_lo_u32 v12, v4, v8
14070; GFX6-NEXT:    v_mul_hi_u32 v13, v4, v9
14071; GFX6-NEXT:    v_mul_hi_u32 v14, v4, v8
14072; GFX6-NEXT:    v_mul_hi_u32 v11, v5, v9
14073; GFX6-NEXT:    v_mul_lo_u32 v9, v5, v9
14074; GFX6-NEXT:    v_mul_hi_u32 v10, v5, v8
14075; GFX6-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
14076; GFX6-NEXT:    v_addc_u32_e32 v13, vcc, 0, v14, vcc
14077; GFX6-NEXT:    v_mul_lo_u32 v5, v5, v8
14078; GFX6-NEXT:    v_add_i32_e32 v9, vcc, v12, v9
14079; GFX6-NEXT:    v_addc_u32_e32 v9, vcc, v13, v11, vcc
14080; GFX6-NEXT:    v_addc_u32_e32 v8, vcc, v10, v0, vcc
14081; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v9, v5
14082; GFX6-NEXT:    v_addc_u32_e32 v8, vcc, v1, v8, vcc
14083; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v6, v7
14084; GFX6-NEXT:    v_addc_u32_e64 v6, vcc, v6, v8, s[0:1]
14085; GFX6-NEXT:    s_add_u32 s0, s10, s14
14086; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
14087; GFX6-NEXT:    s_addc_u32 s1, s11, s14
14088; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, 0, v6, vcc
14089; GFX6-NEXT:    s_xor_b64 s[10:11], s[0:1], s[14:15]
14090; GFX6-NEXT:    v_mul_lo_u32 v6, s10, v5
14091; GFX6-NEXT:    v_mul_hi_u32 v7, s10, v4
14092; GFX6-NEXT:    v_mul_hi_u32 v9, s10, v5
14093; GFX6-NEXT:    v_mul_hi_u32 v10, s11, v5
14094; GFX6-NEXT:    v_mul_lo_u32 v5, s11, v5
14095; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
14096; GFX6-NEXT:    v_addc_u32_e32 v7, vcc, 0, v9, vcc
14097; GFX6-NEXT:    v_mul_lo_u32 v9, s11, v4
14098; GFX6-NEXT:    v_mul_hi_u32 v4, s11, v4
14099; GFX6-NEXT:    v_mov_b32_e32 v8, s12
14100; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v6, v9
14101; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, v7, v4, vcc
14102; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, v10, v0, vcc
14103; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
14104; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, v1, v0, vcc
14105; GFX6-NEXT:    v_mul_lo_u32 v5, s8, v0
14106; GFX6-NEXT:    v_mul_hi_u32 v6, s8, v4
14107; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s12, v2
14108; GFX6-NEXT:    v_mul_lo_u32 v2, s9, v4
14109; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v3, v8, vcc
14110; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v6, v5
14111; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
14112; GFX6-NEXT:    v_mul_lo_u32 v3, s8, v4
14113; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, s11, v2
14114; GFX6-NEXT:    v_mov_b32_e32 v5, s9
14115; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s10, v3
14116; GFX6-NEXT:    v_subb_u32_e64 v4, s[0:1], v4, v5, vcc
14117; GFX6-NEXT:    v_subrev_i32_e64 v6, s[0:1], s8, v3
14118; GFX6-NEXT:    v_subbrev_u32_e64 v7, s[2:3], 0, v4, s[0:1]
14119; GFX6-NEXT:    v_cmp_le_u32_e64 s[2:3], s9, v7
14120; GFX6-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[2:3]
14121; GFX6-NEXT:    v_cmp_le_u32_e64 s[2:3], s8, v6
14122; GFX6-NEXT:    v_subb_u32_e64 v4, s[0:1], v4, v5, s[0:1]
14123; GFX6-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[2:3]
14124; GFX6-NEXT:    v_cmp_eq_u32_e64 s[2:3], s9, v7
14125; GFX6-NEXT:    v_subrev_i32_e64 v5, s[0:1], s8, v6
14126; GFX6-NEXT:    v_cndmask_b32_e64 v8, v8, v9, s[2:3]
14127; GFX6-NEXT:    v_subbrev_u32_e64 v4, s[0:1], 0, v4, s[0:1]
14128; GFX6-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v8
14129; GFX6-NEXT:    v_cndmask_b32_e64 v4, v7, v4, s[0:1]
14130; GFX6-NEXT:    v_mov_b32_e32 v7, s11
14131; GFX6-NEXT:    v_subb_u32_e32 v2, vcc, v7, v2, vcc
14132; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s9, v2
14133; GFX6-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
14134; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s8, v3
14135; GFX6-NEXT:    v_cndmask_b32_e64 v8, 0, -1, vcc
14136; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, s9, v2
14137; GFX6-NEXT:    v_cndmask_b32_e32 v7, v7, v8, vcc
14138; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v7
14139; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
14140; GFX6-NEXT:    v_cndmask_b32_e64 v4, v6, v5, s[0:1]
14141; GFX6-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
14142; GFX6-NEXT:    v_xor_b32_e32 v3, s14, v3
14143; GFX6-NEXT:    v_xor_b32_e32 v4, s14, v2
14144; GFX6-NEXT:    v_mov_b32_e32 v5, s14
14145; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s14, v3
14146; GFX6-NEXT:    v_subb_u32_e32 v3, vcc, v4, v5, vcc
14147; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
14148; GFX6-NEXT:    s_endpgm
14149;
14150; GFX9-LABEL: srem_v2i64_pow2_shl_denom:
14151; GFX9:       ; %bb.0:
14152; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x44
14153; GFX9-NEXT:    s_mov_b64 s[2:3], 0x1000
14154; GFX9-NEXT:    s_mov_b32 s16, 0x4f800000
14155; GFX9-NEXT:    s_mov_b32 s17, 0x5f7ffffc
14156; GFX9-NEXT:    s_mov_b32 s18, 0x2f800000
14157; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
14158; GFX9-NEXT:    s_lshl_b64 s[10:11], s[2:3], s6
14159; GFX9-NEXT:    s_lshl_b64 s[2:3], s[2:3], s4
14160; GFX9-NEXT:    s_ashr_i32 s4, s3, 31
14161; GFX9-NEXT:    s_add_u32 s2, s2, s4
14162; GFX9-NEXT:    s_mov_b32 s5, s4
14163; GFX9-NEXT:    s_addc_u32 s3, s3, s4
14164; GFX9-NEXT:    s_xor_b64 s[12:13], s[2:3], s[4:5]
14165; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s12
14166; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s13
14167; GFX9-NEXT:    s_mov_b32 s19, 0xcf800000
14168; GFX9-NEXT:    s_sub_u32 s8, 0, s12
14169; GFX9-NEXT:    s_subb_u32 s4, 0, s13
14170; GFX9-NEXT:    v_mac_f32_e32 v0, s16, v1
14171; GFX9-NEXT:    v_rcp_f32_e32 v0, v0
14172; GFX9-NEXT:    v_mul_f32_e32 v0, s17, v0
14173; GFX9-NEXT:    v_mul_f32_e32 v1, s18, v0
14174; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
14175; GFX9-NEXT:    v_mac_f32_e32 v0, s19, v1
14176; GFX9-NEXT:    v_cvt_u32_f32_e32 v2, v1
14177; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v0
14178; GFX9-NEXT:    v_mul_lo_u32 v0, s8, v2
14179; GFX9-NEXT:    v_mul_hi_u32 v1, s8, v3
14180; GFX9-NEXT:    v_mul_lo_u32 v5, s4, v3
14181; GFX9-NEXT:    v_mul_lo_u32 v4, s8, v3
14182; GFX9-NEXT:    v_add_u32_e32 v0, v1, v0
14183; GFX9-NEXT:    v_add_u32_e32 v5, v0, v5
14184; GFX9-NEXT:    v_mul_hi_u32 v1, v3, v4
14185; GFX9-NEXT:    v_mul_lo_u32 v6, v3, v5
14186; GFX9-NEXT:    v_mul_hi_u32 v7, v3, v5
14187; GFX9-NEXT:    v_mul_hi_u32 v8, v2, v5
14188; GFX9-NEXT:    v_mul_lo_u32 v5, v2, v5
14189; GFX9-NEXT:    v_add_co_u32_e32 v1, vcc, v1, v6
14190; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, 0, v7, vcc
14191; GFX9-NEXT:    v_mul_lo_u32 v7, v2, v4
14192; GFX9-NEXT:    v_mul_hi_u32 v4, v2, v4
14193; GFX9-NEXT:    v_mov_b32_e32 v0, 0
14194; GFX9-NEXT:    v_add_co_u32_e32 v1, vcc, v1, v7
14195; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v6, v4, vcc
14196; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, v8, v0, vcc
14197; GFX9-NEXT:    v_mov_b32_e32 v1, 0
14198; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v5
14199; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v1, v6, vcc
14200; GFX9-NEXT:    v_add_co_u32_e64 v3, s[2:3], v3, v4
14201; GFX9-NEXT:    v_addc_co_u32_e64 v4, vcc, v2, v5, s[2:3]
14202; GFX9-NEXT:    v_mul_lo_u32 v6, s8, v4
14203; GFX9-NEXT:    v_mul_hi_u32 v7, s8, v3
14204; GFX9-NEXT:    v_mul_lo_u32 v8, s4, v3
14205; GFX9-NEXT:    v_mul_lo_u32 v9, s8, v3
14206; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
14207; GFX9-NEXT:    v_add_u32_e32 v6, v7, v6
14208; GFX9-NEXT:    v_add_u32_e32 v6, v6, v8
14209; GFX9-NEXT:    v_mul_lo_u32 v10, v3, v6
14210; GFX9-NEXT:    v_mul_hi_u32 v11, v3, v9
14211; GFX9-NEXT:    v_mul_hi_u32 v12, v3, v6
14212; GFX9-NEXT:    v_mul_hi_u32 v8, v4, v9
14213; GFX9-NEXT:    v_mul_lo_u32 v9, v4, v9
14214; GFX9-NEXT:    v_mul_hi_u32 v7, v4, v6
14215; GFX9-NEXT:    v_add_co_u32_e32 v10, vcc, v11, v10
14216; GFX9-NEXT:    v_addc_co_u32_e32 v11, vcc, 0, v12, vcc
14217; GFX9-NEXT:    v_mul_lo_u32 v4, v4, v6
14218; GFX9-NEXT:    v_add_co_u32_e32 v9, vcc, v10, v9
14219; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, v11, v8, vcc
14220; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, v7, v0, vcc
14221; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v8, v4
14222; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, v1, v6, vcc
14223; GFX9-NEXT:    v_add_u32_e32 v2, v2, v5
14224; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
14225; GFX9-NEXT:    s_ashr_i32 s8, s5, 31
14226; GFX9-NEXT:    v_addc_co_u32_e64 v2, vcc, v2, v6, s[2:3]
14227; GFX9-NEXT:    s_add_u32 s2, s4, s8
14228; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v4
14229; GFX9-NEXT:    s_mov_b32 s9, s8
14230; GFX9-NEXT:    s_addc_u32 s3, s5, s8
14231; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, 0, v2, vcc
14232; GFX9-NEXT:    s_xor_b64 s[14:15], s[2:3], s[8:9]
14233; GFX9-NEXT:    v_mul_lo_u32 v4, s14, v2
14234; GFX9-NEXT:    v_mul_hi_u32 v5, s14, v3
14235; GFX9-NEXT:    v_mul_hi_u32 v6, s14, v2
14236; GFX9-NEXT:    v_mul_hi_u32 v7, s15, v2
14237; GFX9-NEXT:    v_mul_lo_u32 v2, s15, v2
14238; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v5, v4
14239; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v6, vcc
14240; GFX9-NEXT:    v_mul_lo_u32 v6, s15, v3
14241; GFX9-NEXT:    v_mul_hi_u32 v3, s15, v3
14242; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
14243; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v6
14244; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v5, v3, vcc
14245; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v7, v0, vcc
14246; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
14247; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v1, v4, vcc
14248; GFX9-NEXT:    v_mul_lo_u32 v3, s12, v3
14249; GFX9-NEXT:    v_mul_hi_u32 v4, s12, v2
14250; GFX9-NEXT:    v_mul_lo_u32 v5, s13, v2
14251; GFX9-NEXT:    v_mul_lo_u32 v2, s12, v2
14252; GFX9-NEXT:    v_add_u32_e32 v3, v4, v3
14253; GFX9-NEXT:    v_add_u32_e32 v3, v3, v5
14254; GFX9-NEXT:    v_sub_u32_e32 v4, s15, v3
14255; GFX9-NEXT:    v_mov_b32_e32 v5, s13
14256; GFX9-NEXT:    v_sub_co_u32_e32 v2, vcc, s14, v2
14257; GFX9-NEXT:    v_subb_co_u32_e64 v4, s[0:1], v4, v5, vcc
14258; GFX9-NEXT:    v_subrev_co_u32_e64 v6, s[0:1], s12, v2
14259; GFX9-NEXT:    v_subbrev_co_u32_e64 v7, s[2:3], 0, v4, s[0:1]
14260; GFX9-NEXT:    v_cmp_le_u32_e64 s[2:3], s13, v7
14261; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[2:3]
14262; GFX9-NEXT:    v_cmp_le_u32_e64 s[2:3], s12, v6
14263; GFX9-NEXT:    v_subb_co_u32_e64 v4, s[0:1], v4, v5, s[0:1]
14264; GFX9-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[2:3]
14265; GFX9-NEXT:    v_cmp_eq_u32_e64 s[2:3], s13, v7
14266; GFX9-NEXT:    v_subrev_co_u32_e64 v5, s[0:1], s12, v6
14267; GFX9-NEXT:    v_cndmask_b32_e64 v8, v8, v9, s[2:3]
14268; GFX9-NEXT:    v_subbrev_co_u32_e64 v4, s[0:1], 0, v4, s[0:1]
14269; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v8
14270; GFX9-NEXT:    v_cndmask_b32_e64 v5, v6, v5, s[0:1]
14271; GFX9-NEXT:    v_cndmask_b32_e64 v4, v7, v4, s[0:1]
14272; GFX9-NEXT:    s_ashr_i32 s0, s11, 31
14273; GFX9-NEXT:    s_add_u32 s2, s10, s0
14274; GFX9-NEXT:    s_mov_b32 s1, s0
14275; GFX9-NEXT:    s_addc_u32 s3, s11, s0
14276; GFX9-NEXT:    v_mov_b32_e32 v6, s15
14277; GFX9-NEXT:    s_xor_b64 s[10:11], s[2:3], s[0:1]
14278; GFX9-NEXT:    v_subb_co_u32_e32 v3, vcc, v6, v3, vcc
14279; GFX9-NEXT:    v_cvt_f32_u32_e32 v6, s10
14280; GFX9-NEXT:    v_cvt_f32_u32_e32 v7, s11
14281; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s13, v3
14282; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, -1, vcc
14283; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s12, v2
14284; GFX9-NEXT:    v_mac_f32_e32 v6, s16, v7
14285; GFX9-NEXT:    v_rcp_f32_e32 v6, v6
14286; GFX9-NEXT:    v_cndmask_b32_e64 v9, 0, -1, vcc
14287; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, s13, v3
14288; GFX9-NEXT:    v_cndmask_b32_e32 v7, v8, v9, vcc
14289; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v7
14290; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
14291; GFX9-NEXT:    v_mul_f32_e32 v5, s17, v6
14292; GFX9-NEXT:    v_mul_f32_e32 v6, s18, v5
14293; GFX9-NEXT:    v_trunc_f32_e32 v6, v6
14294; GFX9-NEXT:    v_mac_f32_e32 v5, s19, v6
14295; GFX9-NEXT:    v_cvt_u32_f32_e32 v5, v5
14296; GFX9-NEXT:    v_cvt_u32_f32_e32 v6, v6
14297; GFX9-NEXT:    s_sub_u32 s2, 0, s10
14298; GFX9-NEXT:    s_subb_u32 s3, 0, s11
14299; GFX9-NEXT:    v_mul_hi_u32 v7, s2, v5
14300; GFX9-NEXT:    v_mul_lo_u32 v8, s2, v6
14301; GFX9-NEXT:    v_mul_lo_u32 v9, s3, v5
14302; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
14303; GFX9-NEXT:    v_mul_lo_u32 v4, s2, v5
14304; GFX9-NEXT:    v_add_u32_e32 v7, v7, v8
14305; GFX9-NEXT:    v_add_u32_e32 v7, v7, v9
14306; GFX9-NEXT:    v_mul_lo_u32 v8, v5, v7
14307; GFX9-NEXT:    v_mul_hi_u32 v9, v5, v4
14308; GFX9-NEXT:    v_mul_hi_u32 v10, v5, v7
14309; GFX9-NEXT:    v_mul_hi_u32 v11, v6, v7
14310; GFX9-NEXT:    v_mul_lo_u32 v7, v6, v7
14311; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, v9, v8
14312; GFX9-NEXT:    v_addc_co_u32_e32 v9, vcc, 0, v10, vcc
14313; GFX9-NEXT:    v_mul_lo_u32 v10, v6, v4
14314; GFX9-NEXT:    v_mul_hi_u32 v4, v6, v4
14315; GFX9-NEXT:    s_ashr_i32 s12, s7, 31
14316; GFX9-NEXT:    s_mov_b32 s13, s12
14317; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, v8, v10
14318; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v9, v4, vcc
14319; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, v11, v0, vcc
14320; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v7
14321; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, v1, v8, vcc
14322; GFX9-NEXT:    v_add_co_u32_e64 v4, s[0:1], v5, v4
14323; GFX9-NEXT:    v_addc_co_u32_e64 v5, vcc, v6, v7, s[0:1]
14324; GFX9-NEXT:    v_mul_lo_u32 v8, s2, v5
14325; GFX9-NEXT:    v_mul_hi_u32 v9, s2, v4
14326; GFX9-NEXT:    v_mul_lo_u32 v10, s3, v4
14327; GFX9-NEXT:    v_mul_lo_u32 v11, s2, v4
14328; GFX9-NEXT:    v_add_u32_e32 v6, v6, v7
14329; GFX9-NEXT:    v_add_u32_e32 v8, v9, v8
14330; GFX9-NEXT:    v_add_u32_e32 v8, v8, v10
14331; GFX9-NEXT:    v_mul_lo_u32 v12, v4, v8
14332; GFX9-NEXT:    v_mul_hi_u32 v13, v4, v11
14333; GFX9-NEXT:    v_mul_hi_u32 v14, v4, v8
14334; GFX9-NEXT:    v_mul_hi_u32 v10, v5, v11
14335; GFX9-NEXT:    v_mul_lo_u32 v11, v5, v11
14336; GFX9-NEXT:    v_mul_hi_u32 v9, v5, v8
14337; GFX9-NEXT:    v_add_co_u32_e32 v12, vcc, v13, v12
14338; GFX9-NEXT:    v_addc_co_u32_e32 v13, vcc, 0, v14, vcc
14339; GFX9-NEXT:    v_mul_lo_u32 v5, v5, v8
14340; GFX9-NEXT:    v_add_co_u32_e32 v11, vcc, v12, v11
14341; GFX9-NEXT:    v_addc_co_u32_e32 v10, vcc, v13, v10, vcc
14342; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, v9, v0, vcc
14343; GFX9-NEXT:    v_add_co_u32_e32 v5, vcc, v10, v5
14344; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, v1, v8, vcc
14345; GFX9-NEXT:    v_addc_co_u32_e64 v6, vcc, v6, v8, s[0:1]
14346; GFX9-NEXT:    s_add_u32 s0, s6, s12
14347; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v5
14348; GFX9-NEXT:    s_addc_u32 s1, s7, s12
14349; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v6, vcc
14350; GFX9-NEXT:    s_xor_b64 s[6:7], s[0:1], s[12:13]
14351; GFX9-NEXT:    v_mul_lo_u32 v6, s6, v5
14352; GFX9-NEXT:    v_mul_hi_u32 v7, s6, v4
14353; GFX9-NEXT:    v_mul_hi_u32 v9, s6, v5
14354; GFX9-NEXT:    v_mul_hi_u32 v10, s7, v5
14355; GFX9-NEXT:    v_mul_lo_u32 v5, s7, v5
14356; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, v7, v6
14357; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v9, vcc
14358; GFX9-NEXT:    v_mul_lo_u32 v9, s7, v4
14359; GFX9-NEXT:    v_mul_hi_u32 v4, s7, v4
14360; GFX9-NEXT:    v_xor_b32_e32 v2, s8, v2
14361; GFX9-NEXT:    v_xor_b32_e32 v3, s8, v3
14362; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, v6, v9
14363; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v7, v4, vcc
14364; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, v10, v0, vcc
14365; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v5
14366; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v6, vcc
14367; GFX9-NEXT:    v_mul_lo_u32 v5, s10, v1
14368; GFX9-NEXT:    v_mul_hi_u32 v6, s10, v4
14369; GFX9-NEXT:    v_mul_lo_u32 v7, s11, v4
14370; GFX9-NEXT:    v_mul_lo_u32 v4, s10, v4
14371; GFX9-NEXT:    v_mov_b32_e32 v8, s8
14372; GFX9-NEXT:    v_subrev_co_u32_e32 v1, vcc, s8, v2
14373; GFX9-NEXT:    v_subb_co_u32_e32 v2, vcc, v3, v8, vcc
14374; GFX9-NEXT:    v_add_u32_e32 v3, v6, v5
14375; GFX9-NEXT:    v_add_u32_e32 v3, v3, v7
14376; GFX9-NEXT:    v_sub_u32_e32 v5, s7, v3
14377; GFX9-NEXT:    v_mov_b32_e32 v6, s11
14378; GFX9-NEXT:    v_sub_co_u32_e32 v4, vcc, s6, v4
14379; GFX9-NEXT:    v_subb_co_u32_e64 v5, s[0:1], v5, v6, vcc
14380; GFX9-NEXT:    v_subrev_co_u32_e64 v7, s[0:1], s10, v4
14381; GFX9-NEXT:    v_subbrev_co_u32_e64 v8, s[2:3], 0, v5, s[0:1]
14382; GFX9-NEXT:    v_cmp_le_u32_e64 s[2:3], s11, v8
14383; GFX9-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[2:3]
14384; GFX9-NEXT:    v_cmp_le_u32_e64 s[2:3], s10, v7
14385; GFX9-NEXT:    v_subb_co_u32_e64 v5, s[0:1], v5, v6, s[0:1]
14386; GFX9-NEXT:    v_cndmask_b32_e64 v10, 0, -1, s[2:3]
14387; GFX9-NEXT:    v_cmp_eq_u32_e64 s[2:3], s11, v8
14388; GFX9-NEXT:    v_subrev_co_u32_e64 v6, s[0:1], s10, v7
14389; GFX9-NEXT:    v_cndmask_b32_e64 v9, v9, v10, s[2:3]
14390; GFX9-NEXT:    v_subbrev_co_u32_e64 v5, s[0:1], 0, v5, s[0:1]
14391; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v9
14392; GFX9-NEXT:    v_cndmask_b32_e64 v6, v7, v6, s[0:1]
14393; GFX9-NEXT:    v_mov_b32_e32 v7, s7
14394; GFX9-NEXT:    v_subb_co_u32_e32 v3, vcc, v7, v3, vcc
14395; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s11, v3
14396; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
14397; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s10, v4
14398; GFX9-NEXT:    v_cndmask_b32_e64 v5, v8, v5, s[0:1]
14399; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, -1, vcc
14400; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, s11, v3
14401; GFX9-NEXT:    v_cndmask_b32_e32 v7, v7, v8, vcc
14402; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v7
14403; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
14404; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
14405; GFX9-NEXT:    v_xor_b32_e32 v4, s12, v4
14406; GFX9-NEXT:    v_xor_b32_e32 v5, s12, v3
14407; GFX9-NEXT:    v_mov_b32_e32 v6, s12
14408; GFX9-NEXT:    v_subrev_co_u32_e32 v3, vcc, s12, v4
14409; GFX9-NEXT:    v_subb_co_u32_e32 v4, vcc, v5, v6, vcc
14410; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
14411; GFX9-NEXT:    global_store_dwordx4 v0, v[1:4], s[4:5]
14412; GFX9-NEXT:    s_endpgm
14413;
14414; GFX90A-LABEL: srem_v2i64_pow2_shl_denom:
14415; GFX90A:       ; %bb.0:
14416; GFX90A-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x44
14417; GFX90A-NEXT:    s_mov_b64 s[2:3], 0x1000
14418; GFX90A-NEXT:    s_mov_b32 s16, 0x4f800000
14419; GFX90A-NEXT:    s_mov_b32 s17, 0x5f7ffffc
14420; GFX90A-NEXT:    s_mov_b32 s18, 0x2f800000
14421; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
14422; GFX90A-NEXT:    s_lshl_b64 s[10:11], s[2:3], s6
14423; GFX90A-NEXT:    s_lshl_b64 s[2:3], s[2:3], s4
14424; GFX90A-NEXT:    s_ashr_i32 s4, s3, 31
14425; GFX90A-NEXT:    s_add_u32 s2, s2, s4
14426; GFX90A-NEXT:    s_mov_b32 s5, s4
14427; GFX90A-NEXT:    s_addc_u32 s3, s3, s4
14428; GFX90A-NEXT:    s_xor_b64 s[12:13], s[2:3], s[4:5]
14429; GFX90A-NEXT:    v_cvt_f32_u32_e32 v0, s12
14430; GFX90A-NEXT:    v_cvt_f32_u32_e32 v1, s13
14431; GFX90A-NEXT:    s_mov_b32 s19, 0xcf800000
14432; GFX90A-NEXT:    s_sub_u32 s2, 0, s12
14433; GFX90A-NEXT:    s_subb_u32 s3, 0, s13
14434; GFX90A-NEXT:    v_mac_f32_e32 v0, s16, v1
14435; GFX90A-NEXT:    v_rcp_f32_e32 v0, v0
14436; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
14437; GFX90A-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x24
14438; GFX90A-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
14439; GFX90A-NEXT:    v_mul_f32_e32 v0, s17, v0
14440; GFX90A-NEXT:    v_mul_f32_e32 v1, s18, v0
14441; GFX90A-NEXT:    v_trunc_f32_e32 v1, v1
14442; GFX90A-NEXT:    v_mac_f32_e32 v0, s19, v1
14443; GFX90A-NEXT:    v_cvt_u32_f32_e32 v0, v0
14444; GFX90A-NEXT:    v_cvt_u32_f32_e32 v1, v1
14445; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
14446; GFX90A-NEXT:    s_ashr_i32 s14, s5, 31
14447; GFX90A-NEXT:    s_mov_b32 s15, s14
14448; GFX90A-NEXT:    v_mul_hi_u32 v3, s2, v0
14449; GFX90A-NEXT:    v_mul_lo_u32 v5, s2, v1
14450; GFX90A-NEXT:    v_mul_lo_u32 v2, s3, v0
14451; GFX90A-NEXT:    v_add_u32_e32 v3, v3, v5
14452; GFX90A-NEXT:    v_add_u32_e32 v2, v3, v2
14453; GFX90A-NEXT:    v_mul_lo_u32 v6, s2, v0
14454; GFX90A-NEXT:    v_mul_lo_u32 v5, v0, v2
14455; GFX90A-NEXT:    v_mul_hi_u32 v7, v0, v6
14456; GFX90A-NEXT:    v_mul_hi_u32 v3, v0, v2
14457; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v7, v5
14458; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
14459; GFX90A-NEXT:    v_mul_hi_u32 v8, v1, v6
14460; GFX90A-NEXT:    v_mul_lo_u32 v6, v1, v6
14461; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v5, v6
14462; GFX90A-NEXT:    v_mul_hi_u32 v7, v1, v2
14463; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, v3, v8, vcc
14464; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v7, v4, vcc
14465; GFX90A-NEXT:    v_mul_lo_u32 v6, v1, v2
14466; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
14467; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v6
14468; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v2, v5, vcc
14469; GFX90A-NEXT:    v_add_co_u32_e64 v0, s[0:1], v0, v3
14470; GFX90A-NEXT:    v_addc_co_u32_e64 v3, vcc, v1, v5, s[0:1]
14471; GFX90A-NEXT:    v_mul_lo_u32 v6, s2, v3
14472; GFX90A-NEXT:    v_mul_hi_u32 v7, s2, v0
14473; GFX90A-NEXT:    v_add_u32_e32 v6, v7, v6
14474; GFX90A-NEXT:    v_mul_lo_u32 v7, s3, v0
14475; GFX90A-NEXT:    v_add_u32_e32 v6, v6, v7
14476; GFX90A-NEXT:    v_mul_lo_u32 v8, s2, v0
14477; GFX90A-NEXT:    v_mul_hi_u32 v9, v3, v8
14478; GFX90A-NEXT:    v_mul_lo_u32 v10, v3, v8
14479; GFX90A-NEXT:    v_mul_lo_u32 v12, v0, v6
14480; GFX90A-NEXT:    v_mul_hi_u32 v8, v0, v8
14481; GFX90A-NEXT:    v_mul_hi_u32 v11, v0, v6
14482; GFX90A-NEXT:    v_add_co_u32_e32 v8, vcc, v8, v12
14483; GFX90A-NEXT:    v_addc_co_u32_e32 v11, vcc, 0, v11, vcc
14484; GFX90A-NEXT:    v_add_co_u32_e32 v8, vcc, v8, v10
14485; GFX90A-NEXT:    v_mul_hi_u32 v7, v3, v6
14486; GFX90A-NEXT:    v_addc_co_u32_e32 v8, vcc, v11, v9, vcc
14487; GFX90A-NEXT:    v_addc_co_u32_e32 v7, vcc, v7, v4, vcc
14488; GFX90A-NEXT:    v_mul_lo_u32 v3, v3, v6
14489; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, v8, v3
14490; GFX90A-NEXT:    v_addc_co_u32_e32 v6, vcc, v2, v7, vcc
14491; GFX90A-NEXT:    v_add_u32_e32 v1, v1, v5
14492; GFX90A-NEXT:    v_addc_co_u32_e64 v1, vcc, v1, v6, s[0:1]
14493; GFX90A-NEXT:    s_add_u32 s0, s4, s14
14494; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v3
14495; GFX90A-NEXT:    s_addc_u32 s1, s5, s14
14496; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
14497; GFX90A-NEXT:    s_xor_b64 s[4:5], s[0:1], s[14:15]
14498; GFX90A-NEXT:    v_mul_lo_u32 v5, s4, v1
14499; GFX90A-NEXT:    v_mul_hi_u32 v6, s4, v0
14500; GFX90A-NEXT:    v_mul_hi_u32 v3, s4, v1
14501; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v6, v5
14502; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
14503; GFX90A-NEXT:    v_mul_hi_u32 v7, s5, v0
14504; GFX90A-NEXT:    v_mul_lo_u32 v0, s5, v0
14505; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v5, v0
14506; GFX90A-NEXT:    v_mul_hi_u32 v6, s5, v1
14507; GFX90A-NEXT:    v_addc_co_u32_e32 v0, vcc, v3, v7, vcc
14508; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, v6, v4, vcc
14509; GFX90A-NEXT:    v_mul_lo_u32 v1, s5, v1
14510; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v1
14511; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, v2, v3, vcc
14512; GFX90A-NEXT:    v_mul_lo_u32 v1, s12, v1
14513; GFX90A-NEXT:    v_mul_hi_u32 v3, s12, v0
14514; GFX90A-NEXT:    v_add_u32_e32 v1, v3, v1
14515; GFX90A-NEXT:    v_mul_lo_u32 v3, s13, v0
14516; GFX90A-NEXT:    v_add_u32_e32 v1, v1, v3
14517; GFX90A-NEXT:    v_mul_lo_u32 v0, s12, v0
14518; GFX90A-NEXT:    v_sub_u32_e32 v3, s5, v1
14519; GFX90A-NEXT:    v_mov_b32_e32 v5, s13
14520; GFX90A-NEXT:    v_sub_co_u32_e32 v0, vcc, s4, v0
14521; GFX90A-NEXT:    v_subb_co_u32_e64 v3, s[0:1], v3, v5, vcc
14522; GFX90A-NEXT:    v_subrev_co_u32_e64 v6, s[0:1], s12, v0
14523; GFX90A-NEXT:    v_subbrev_co_u32_e64 v7, s[2:3], 0, v3, s[0:1]
14524; GFX90A-NEXT:    v_cmp_le_u32_e64 s[2:3], s13, v7
14525; GFX90A-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[2:3]
14526; GFX90A-NEXT:    v_cmp_le_u32_e64 s[2:3], s12, v6
14527; GFX90A-NEXT:    v_subb_co_u32_e64 v3, s[0:1], v3, v5, s[0:1]
14528; GFX90A-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[2:3]
14529; GFX90A-NEXT:    v_cmp_eq_u32_e64 s[2:3], s13, v7
14530; GFX90A-NEXT:    v_subrev_co_u32_e64 v5, s[0:1], s12, v6
14531; GFX90A-NEXT:    v_cndmask_b32_e64 v8, v8, v9, s[2:3]
14532; GFX90A-NEXT:    v_subbrev_co_u32_e64 v3, s[0:1], 0, v3, s[0:1]
14533; GFX90A-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v8
14534; GFX90A-NEXT:    v_cndmask_b32_e64 v5, v6, v5, s[0:1]
14535; GFX90A-NEXT:    v_mov_b32_e32 v6, s5
14536; GFX90A-NEXT:    v_subb_co_u32_e32 v1, vcc, v6, v1, vcc
14537; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s13, v1
14538; GFX90A-NEXT:    v_cndmask_b32_e64 v3, v7, v3, s[0:1]
14539; GFX90A-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
14540; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s12, v0
14541; GFX90A-NEXT:    s_ashr_i32 s0, s11, 31
14542; GFX90A-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
14543; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, s13, v1
14544; GFX90A-NEXT:    s_add_u32 s2, s10, s0
14545; GFX90A-NEXT:    v_cndmask_b32_e32 v6, v6, v7, vcc
14546; GFX90A-NEXT:    s_mov_b32 s1, s0
14547; GFX90A-NEXT:    s_addc_u32 s3, s11, s0
14548; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
14549; GFX90A-NEXT:    s_xor_b64 s[4:5], s[2:3], s[0:1]
14550; GFX90A-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
14551; GFX90A-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
14552; GFX90A-NEXT:    v_cvt_f32_u32_e32 v3, s4
14553; GFX90A-NEXT:    v_cvt_f32_u32_e32 v5, s5
14554; GFX90A-NEXT:    v_xor_b32_e32 v0, s14, v0
14555; GFX90A-NEXT:    s_sub_u32 s2, 0, s4
14556; GFX90A-NEXT:    v_xor_b32_e32 v1, s14, v1
14557; GFX90A-NEXT:    v_mac_f32_e32 v3, s16, v5
14558; GFX90A-NEXT:    v_rcp_f32_e32 v3, v3
14559; GFX90A-NEXT:    v_mov_b32_e32 v6, s14
14560; GFX90A-NEXT:    v_subrev_co_u32_e32 v0, vcc, s14, v0
14561; GFX90A-NEXT:    v_mul_f32_e32 v3, s17, v3
14562; GFX90A-NEXT:    v_mul_f32_e32 v5, s18, v3
14563; GFX90A-NEXT:    v_trunc_f32_e32 v5, v5
14564; GFX90A-NEXT:    v_mac_f32_e32 v3, s19, v5
14565; GFX90A-NEXT:    v_cvt_u32_f32_e32 v3, v3
14566; GFX90A-NEXT:    v_cvt_u32_f32_e32 v5, v5
14567; GFX90A-NEXT:    s_subb_u32 s3, 0, s5
14568; GFX90A-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v6, vcc
14569; GFX90A-NEXT:    v_mul_hi_u32 v7, s2, v3
14570; GFX90A-NEXT:    v_mul_lo_u32 v8, s2, v5
14571; GFX90A-NEXT:    v_mul_lo_u32 v6, s3, v3
14572; GFX90A-NEXT:    v_add_u32_e32 v7, v7, v8
14573; GFX90A-NEXT:    v_add_u32_e32 v6, v7, v6
14574; GFX90A-NEXT:    v_mul_lo_u32 v9, s2, v3
14575; GFX90A-NEXT:    v_mul_lo_u32 v8, v3, v6
14576; GFX90A-NEXT:    v_mul_hi_u32 v10, v3, v9
14577; GFX90A-NEXT:    v_mul_hi_u32 v7, v3, v6
14578; GFX90A-NEXT:    v_add_co_u32_e32 v8, vcc, v10, v8
14579; GFX90A-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v7, vcc
14580; GFX90A-NEXT:    v_mul_hi_u32 v11, v5, v9
14581; GFX90A-NEXT:    v_mul_lo_u32 v9, v5, v9
14582; GFX90A-NEXT:    v_add_co_u32_e32 v8, vcc, v8, v9
14583; GFX90A-NEXT:    v_mul_hi_u32 v10, v5, v6
14584; GFX90A-NEXT:    v_addc_co_u32_e32 v7, vcc, v7, v11, vcc
14585; GFX90A-NEXT:    v_addc_co_u32_e32 v8, vcc, v10, v4, vcc
14586; GFX90A-NEXT:    v_mul_lo_u32 v6, v5, v6
14587; GFX90A-NEXT:    v_add_co_u32_e32 v6, vcc, v7, v6
14588; GFX90A-NEXT:    v_addc_co_u32_e32 v7, vcc, v2, v8, vcc
14589; GFX90A-NEXT:    v_add_co_u32_e64 v3, s[0:1], v3, v6
14590; GFX90A-NEXT:    v_addc_co_u32_e64 v6, vcc, v5, v7, s[0:1]
14591; GFX90A-NEXT:    v_mul_lo_u32 v8, s2, v6
14592; GFX90A-NEXT:    v_mul_hi_u32 v9, s2, v3
14593; GFX90A-NEXT:    v_add_u32_e32 v8, v9, v8
14594; GFX90A-NEXT:    v_mul_lo_u32 v9, s3, v3
14595; GFX90A-NEXT:    v_add_u32_e32 v8, v8, v9
14596; GFX90A-NEXT:    v_mul_lo_u32 v10, s2, v3
14597; GFX90A-NEXT:    v_mul_hi_u32 v11, v6, v10
14598; GFX90A-NEXT:    v_mul_lo_u32 v12, v6, v10
14599; GFX90A-NEXT:    v_mul_lo_u32 v14, v3, v8
14600; GFX90A-NEXT:    v_mul_hi_u32 v10, v3, v10
14601; GFX90A-NEXT:    v_mul_hi_u32 v13, v3, v8
14602; GFX90A-NEXT:    v_add_co_u32_e32 v10, vcc, v10, v14
14603; GFX90A-NEXT:    v_addc_co_u32_e32 v13, vcc, 0, v13, vcc
14604; GFX90A-NEXT:    v_add_co_u32_e32 v10, vcc, v10, v12
14605; GFX90A-NEXT:    v_mul_hi_u32 v9, v6, v8
14606; GFX90A-NEXT:    v_addc_co_u32_e32 v10, vcc, v13, v11, vcc
14607; GFX90A-NEXT:    v_addc_co_u32_e32 v9, vcc, v9, v4, vcc
14608; GFX90A-NEXT:    v_mul_lo_u32 v6, v6, v8
14609; GFX90A-NEXT:    v_add_co_u32_e32 v6, vcc, v10, v6
14610; GFX90A-NEXT:    v_addc_co_u32_e32 v8, vcc, v2, v9, vcc
14611; GFX90A-NEXT:    v_add_u32_e32 v5, v5, v7
14612; GFX90A-NEXT:    s_ashr_i32 s10, s7, 31
14613; GFX90A-NEXT:    v_addc_co_u32_e64 v5, vcc, v5, v8, s[0:1]
14614; GFX90A-NEXT:    s_add_u32 s0, s6, s10
14615; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v6
14616; GFX90A-NEXT:    s_mov_b32 s11, s10
14617; GFX90A-NEXT:    s_addc_u32 s1, s7, s10
14618; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
14619; GFX90A-NEXT:    s_xor_b64 s[6:7], s[0:1], s[10:11]
14620; GFX90A-NEXT:    v_mul_lo_u32 v7, s6, v5
14621; GFX90A-NEXT:    v_mul_hi_u32 v8, s6, v3
14622; GFX90A-NEXT:    v_mul_hi_u32 v6, s6, v5
14623; GFX90A-NEXT:    v_add_co_u32_e32 v7, vcc, v8, v7
14624; GFX90A-NEXT:    v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
14625; GFX90A-NEXT:    v_mul_hi_u32 v9, s7, v3
14626; GFX90A-NEXT:    v_mul_lo_u32 v3, s7, v3
14627; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, v7, v3
14628; GFX90A-NEXT:    v_mul_hi_u32 v8, s7, v5
14629; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, v6, v9, vcc
14630; GFX90A-NEXT:    v_addc_co_u32_e32 v6, vcc, v8, v4, vcc
14631; GFX90A-NEXT:    v_mul_lo_u32 v5, s7, v5
14632; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v5
14633; GFX90A-NEXT:    v_addc_co_u32_e32 v2, vcc, v2, v6, vcc
14634; GFX90A-NEXT:    v_mul_lo_u32 v2, s4, v2
14635; GFX90A-NEXT:    v_mul_hi_u32 v5, s4, v3
14636; GFX90A-NEXT:    v_add_u32_e32 v2, v5, v2
14637; GFX90A-NEXT:    v_mul_lo_u32 v5, s5, v3
14638; GFX90A-NEXT:    v_add_u32_e32 v2, v2, v5
14639; GFX90A-NEXT:    v_mul_lo_u32 v3, s4, v3
14640; GFX90A-NEXT:    v_sub_u32_e32 v5, s7, v2
14641; GFX90A-NEXT:    v_mov_b32_e32 v6, s5
14642; GFX90A-NEXT:    v_sub_co_u32_e32 v3, vcc, s6, v3
14643; GFX90A-NEXT:    v_subb_co_u32_e64 v5, s[0:1], v5, v6, vcc
14644; GFX90A-NEXT:    v_subrev_co_u32_e64 v7, s[0:1], s4, v3
14645; GFX90A-NEXT:    v_subbrev_co_u32_e64 v8, s[2:3], 0, v5, s[0:1]
14646; GFX90A-NEXT:    v_cmp_le_u32_e64 s[2:3], s5, v8
14647; GFX90A-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[2:3]
14648; GFX90A-NEXT:    v_cmp_le_u32_e64 s[2:3], s4, v7
14649; GFX90A-NEXT:    v_subb_co_u32_e64 v5, s[0:1], v5, v6, s[0:1]
14650; GFX90A-NEXT:    v_cndmask_b32_e64 v10, 0, -1, s[2:3]
14651; GFX90A-NEXT:    v_cmp_eq_u32_e64 s[2:3], s5, v8
14652; GFX90A-NEXT:    v_subrev_co_u32_e64 v6, s[0:1], s4, v7
14653; GFX90A-NEXT:    v_cndmask_b32_e64 v9, v9, v10, s[2:3]
14654; GFX90A-NEXT:    v_subbrev_co_u32_e64 v5, s[0:1], 0, v5, s[0:1]
14655; GFX90A-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v9
14656; GFX90A-NEXT:    v_cndmask_b32_e64 v6, v7, v6, s[0:1]
14657; GFX90A-NEXT:    v_mov_b32_e32 v7, s7
14658; GFX90A-NEXT:    v_subb_co_u32_e32 v2, vcc, v7, v2, vcc
14659; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s5, v2
14660; GFX90A-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
14661; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s4, v3
14662; GFX90A-NEXT:    v_cndmask_b32_e64 v5, v8, v5, s[0:1]
14663; GFX90A-NEXT:    v_cndmask_b32_e64 v8, 0, -1, vcc
14664; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, s5, v2
14665; GFX90A-NEXT:    v_cndmask_b32_e32 v7, v7, v8, vcc
14666; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v7
14667; GFX90A-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
14668; GFX90A-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
14669; GFX90A-NEXT:    v_xor_b32_e32 v3, s10, v3
14670; GFX90A-NEXT:    v_xor_b32_e32 v5, s10, v2
14671; GFX90A-NEXT:    v_mov_b32_e32 v6, s10
14672; GFX90A-NEXT:    v_subrev_co_u32_e32 v2, vcc, s10, v3
14673; GFX90A-NEXT:    v_subb_co_u32_e32 v3, vcc, v5, v6, vcc
14674; GFX90A-NEXT:    global_store_dwordx4 v4, v[0:3], s[8:9]
14675; GFX90A-NEXT:    s_endpgm
14676  %shl.y = shl <2 x i64> <i64 4096, i64 4096>, %y
14677  %r = srem <2 x i64> %x, %shl.y
14678  store <2 x i64> %r, <2 x i64> addrspace(1)* %out
14679  ret void
14680}
14681