1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --force-update
2; RUN: opt -S -mtriple=amdgcn-- -mcpu=tahiti -amdgpu-codegenprepare -amdgpu-bypass-slow-div=0 %s | FileCheck %s
3; RUN: llc -mtriple=amdgcn-- -mcpu=tahiti -amdgpu-bypass-slow-div=0 < %s | FileCheck -check-prefix=GFX6 %s
4; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -amdgpu-bypass-slow-div=0 < %s | FileCheck -check-prefix=GFX9 %s
5
; Test: unsigned 32-bit division of two kernel arguments; the quotient is
; stored to %out.
;
; CHECK lines (opt -amdgpu-codegenprepare RUN line) pin the IR expansion:
;   * %y is converted to float, passed through llvm.amdgcn.rcp.f32, scaled by
;     the constant 0x41EFFFFFC0000000 and converted back to i32 to form an
;     initial reciprocal estimate;
;   * the estimate is refined once using the high 32 bits of a 64-bit multiply
;     with (0 - %y) * estimate;
;   * the quotient estimate is the high 32 bits of %x * refined reciprocal;
;   * two "icmp uge remainder, %y" + select steps then bump the quotient by 1
;     (the first step also reduces the remainder by %y).
; GFX6 / GFX9 lines pin the final ISA from the llc RUN lines for
; -mcpu=tahiti and -mcpu=gfx900 respectively.
6define amdgpu_kernel void @udiv_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) {
7; CHECK-LABEL: @udiv_i32(
8; CHECK-NEXT:    [[TMP1:%.*]] = uitofp i32 [[Y:%.*]] to float
9; CHECK-NEXT:    [[TMP2:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP1]])
10; CHECK-NEXT:    [[TMP3:%.*]] = fmul fast float [[TMP2]], 0x41EFFFFFC0000000
11; CHECK-NEXT:    [[TMP4:%.*]] = fptoui float [[TMP3]] to i32
12; CHECK-NEXT:    [[TMP5:%.*]] = sub i32 0, [[Y]]
13; CHECK-NEXT:    [[TMP6:%.*]] = mul i32 [[TMP5]], [[TMP4]]
14; CHECK-NEXT:    [[TMP7:%.*]] = zext i32 [[TMP4]] to i64
15; CHECK-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP6]] to i64
16; CHECK-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP7]], [[TMP8]]
17; CHECK-NEXT:    [[TMP10:%.*]] = trunc i64 [[TMP9]] to i32
18; CHECK-NEXT:    [[TMP11:%.*]] = lshr i64 [[TMP9]], 32
19; CHECK-NEXT:    [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32
20; CHECK-NEXT:    [[TMP13:%.*]] = add i32 [[TMP4]], [[TMP12]]
21; CHECK-NEXT:    [[TMP14:%.*]] = zext i32 [[X:%.*]] to i64
22; CHECK-NEXT:    [[TMP15:%.*]] = zext i32 [[TMP13]] to i64
23; CHECK-NEXT:    [[TMP16:%.*]] = mul i64 [[TMP14]], [[TMP15]]
24; CHECK-NEXT:    [[TMP17:%.*]] = trunc i64 [[TMP16]] to i32
25; CHECK-NEXT:    [[TMP18:%.*]] = lshr i64 [[TMP16]], 32
26; CHECK-NEXT:    [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
27; CHECK-NEXT:    [[TMP20:%.*]] = mul i32 [[TMP19]], [[Y]]
28; CHECK-NEXT:    [[TMP21:%.*]] = sub i32 [[X]], [[TMP20]]
29; CHECK-NEXT:    [[TMP22:%.*]] = icmp uge i32 [[TMP21]], [[Y]]
30; CHECK-NEXT:    [[TMP23:%.*]] = add i32 [[TMP19]], 1
31; CHECK-NEXT:    [[TMP24:%.*]] = select i1 [[TMP22]], i32 [[TMP23]], i32 [[TMP19]]
32; CHECK-NEXT:    [[TMP25:%.*]] = sub i32 [[TMP21]], [[Y]]
33; CHECK-NEXT:    [[TMP26:%.*]] = select i1 [[TMP22]], i32 [[TMP25]], i32 [[TMP21]]
34; CHECK-NEXT:    [[TMP27:%.*]] = icmp uge i32 [[TMP26]], [[Y]]
35; CHECK-NEXT:    [[TMP28:%.*]] = add i32 [[TMP24]], 1
36; CHECK-NEXT:    [[TMP29:%.*]] = select i1 [[TMP27]], i32 [[TMP28]], i32 [[TMP24]]
37; CHECK-NEXT:    store i32 [[TMP29]], i32 addrspace(1)* [[OUT:%.*]], align 4
38; CHECK-NEXT:    ret void
39;
40; GFX6-LABEL: udiv_i32:
41; GFX6:       ; %bb.0:
42; GFX6-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
43; GFX6-NEXT:    s_mov_b32 s7, 0xf000
44; GFX6-NEXT:    s_mov_b32 s6, -1
45; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
46; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s3
47; GFX6-NEXT:    s_sub_i32 s4, 0, s3
48; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
49; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
50; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
51; GFX6-NEXT:    v_mul_lo_u32 v1, s4, v0
52; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
53; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
54; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
55; GFX6-NEXT:    v_mul_hi_u32 v0, s2, v0
56; GFX6-NEXT:    v_mul_lo_u32 v1, v0, s3
57; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
58; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s2, v1
59; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s3, v1
60; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
61; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s3, v1
62; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, v2, s[0:1]
63; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
64; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s3, v1
65; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
66; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
67; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
68; GFX6-NEXT:    s_endpgm
69;
70; GFX9-LABEL: udiv_i32:
71; GFX9:       ; %bb.0:
72; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
73; GFX9-NEXT:    v_mov_b32_e32 v2, 0
74; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
75; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
76; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s3
77; GFX9-NEXT:    s_sub_i32 s4, 0, s3
78; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
79; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
80; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
81; GFX9-NEXT:    v_mul_lo_u32 v1, s4, v0
82; GFX9-NEXT:    v_mul_hi_u32 v1, v0, v1
83; GFX9-NEXT:    v_add_u32_e32 v0, v0, v1
84; GFX9-NEXT:    v_mul_hi_u32 v0, s2, v0
85; GFX9-NEXT:    v_mul_lo_u32 v1, v0, s3
86; GFX9-NEXT:    v_add_u32_e32 v3, 1, v0
87; GFX9-NEXT:    v_sub_u32_e32 v1, s2, v1
88; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v1
89; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
90; GFX9-NEXT:    v_subrev_u32_e32 v3, s3, v1
91; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
92; GFX9-NEXT:    v_add_u32_e32 v3, 1, v0
93; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v1
94; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
95; GFX9-NEXT:    global_store_dword v2, v0, s[0:1]
96; GFX9-NEXT:    s_endpgm
; Input IR under test: a single udiv whose result is stored through %out.
97  %r = udiv i32 %x, %y
98  store i32 %r, i32 addrspace(1)* %out
99  ret void
100}
101
; Test: unsigned 32-bit remainder of two kernel arguments; the remainder is
; stored to %out.
;
; CHECK lines (opt -amdgpu-codegenprepare RUN line) pin the IR expansion:
;   * a reciprocal estimate of %y is built via uitofp, llvm.amdgcn.rcp.f32,
;     a multiply by 0x41EFFFFFC0000000 and fptoui, then refined once with the
;     high 32 bits of a 64-bit multiply;
;   * the quotient estimate (high 32 bits of %x * reciprocal) is multiplied by
;     %y and subtracted from %x to give a remainder estimate;
;   * two "icmp uge remainder, %y" + select steps conditionally subtract %y.
;     Unlike a division expansion, no quotient adjustment is emitted.
; GFX6 / GFX9 lines pin the final ISA from the llc RUN lines for
; -mcpu=tahiti and -mcpu=gfx900 respectively.
102define amdgpu_kernel void @urem_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) {
103; CHECK-LABEL: @urem_i32(
104; CHECK-NEXT:    [[TMP1:%.*]] = uitofp i32 [[Y:%.*]] to float
105; CHECK-NEXT:    [[TMP2:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP1]])
106; CHECK-NEXT:    [[TMP3:%.*]] = fmul fast float [[TMP2]], 0x41EFFFFFC0000000
107; CHECK-NEXT:    [[TMP4:%.*]] = fptoui float [[TMP3]] to i32
108; CHECK-NEXT:    [[TMP5:%.*]] = sub i32 0, [[Y]]
109; CHECK-NEXT:    [[TMP6:%.*]] = mul i32 [[TMP5]], [[TMP4]]
110; CHECK-NEXT:    [[TMP7:%.*]] = zext i32 [[TMP4]] to i64
111; CHECK-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP6]] to i64
112; CHECK-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP7]], [[TMP8]]
113; CHECK-NEXT:    [[TMP10:%.*]] = trunc i64 [[TMP9]] to i32
114; CHECK-NEXT:    [[TMP11:%.*]] = lshr i64 [[TMP9]], 32
115; CHECK-NEXT:    [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32
116; CHECK-NEXT:    [[TMP13:%.*]] = add i32 [[TMP4]], [[TMP12]]
117; CHECK-NEXT:    [[TMP14:%.*]] = zext i32 [[X:%.*]] to i64
118; CHECK-NEXT:    [[TMP15:%.*]] = zext i32 [[TMP13]] to i64
119; CHECK-NEXT:    [[TMP16:%.*]] = mul i64 [[TMP14]], [[TMP15]]
120; CHECK-NEXT:    [[TMP17:%.*]] = trunc i64 [[TMP16]] to i32
121; CHECK-NEXT:    [[TMP18:%.*]] = lshr i64 [[TMP16]], 32
122; CHECK-NEXT:    [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
123; CHECK-NEXT:    [[TMP20:%.*]] = mul i32 [[TMP19]], [[Y]]
124; CHECK-NEXT:    [[TMP21:%.*]] = sub i32 [[X]], [[TMP20]]
125; CHECK-NEXT:    [[TMP22:%.*]] = icmp uge i32 [[TMP21]], [[Y]]
126; CHECK-NEXT:    [[TMP23:%.*]] = sub i32 [[TMP21]], [[Y]]
127; CHECK-NEXT:    [[TMP24:%.*]] = select i1 [[TMP22]], i32 [[TMP23]], i32 [[TMP21]]
128; CHECK-NEXT:    [[TMP25:%.*]] = icmp uge i32 [[TMP24]], [[Y]]
129; CHECK-NEXT:    [[TMP26:%.*]] = sub i32 [[TMP24]], [[Y]]
130; CHECK-NEXT:    [[TMP27:%.*]] = select i1 [[TMP25]], i32 [[TMP26]], i32 [[TMP24]]
131; CHECK-NEXT:    store i32 [[TMP27]], i32 addrspace(1)* [[OUT:%.*]], align 4
132; CHECK-NEXT:    ret void
133;
134; GFX6-LABEL: urem_i32:
135; GFX6:       ; %bb.0:
136; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
137; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
138; GFX6-NEXT:    s_mov_b32 s3, 0xf000
139; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
140; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s5
141; GFX6-NEXT:    s_sub_i32 s2, 0, s5
142; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
143; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
144; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
145; GFX6-NEXT:    v_mul_lo_u32 v1, s2, v0
146; GFX6-NEXT:    s_mov_b32 s2, -1
147; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
148; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
149; GFX6-NEXT:    v_mul_hi_u32 v0, s4, v0
150; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s5
151; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
152; GFX6-NEXT:    v_subrev_i32_e32 v1, vcc, s5, v0
153; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s5, v0
154; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
155; GFX6-NEXT:    v_subrev_i32_e32 v1, vcc, s5, v0
156; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s5, v0
157; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
158; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
159; GFX6-NEXT:    s_endpgm
160;
161; GFX9-LABEL: urem_i32:
162; GFX9:       ; %bb.0:
163; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
164; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
165; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s3
166; GFX9-NEXT:    s_sub_i32 s4, 0, s3
167; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
168; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
169; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
170; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
171; GFX9-NEXT:    v_mul_lo_u32 v1, s4, v0
172; GFX9-NEXT:    v_mul_hi_u32 v1, v0, v1
173; GFX9-NEXT:    v_add_u32_e32 v0, v0, v1
174; GFX9-NEXT:    v_mul_hi_u32 v0, s2, v0
175; GFX9-NEXT:    v_mov_b32_e32 v1, 0
176; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s3
177; GFX9-NEXT:    v_sub_u32_e32 v0, s2, v0
178; GFX9-NEXT:    v_subrev_u32_e32 v2, s3, v0
179; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v0
180; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
181; GFX9-NEXT:    v_subrev_u32_e32 v2, s3, v0
182; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v0
183; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
184; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
185; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
186; GFX9-NEXT:    s_endpgm
; Input IR under test: a single urem whose result is stored through %out.
187  %r = urem i32 %x, %y
188  store i32 %r, i32 addrspace(1)* %out
189  ret void
190}
191
; Test: signed 32-bit division of two kernel arguments; the quotient is stored
; to %out.
;
; CHECK lines (opt -amdgpu-codegenprepare RUN line) pin the IR expansion:
;   * sign masks are taken with "ashr ..., 31" for both operands and combined
;     with xor to give the sign mask of the result;
;   * each operand is made non-negative with "add mask" followed by "xor mask";
;   * the unsigned sequence follows: float reciprocal estimate, one refinement
;     via the high half of a 64-bit multiply, quotient estimate, and two
;     compare-and-select correction steps;
;   * the final "xor" / "sub" with the combined sign mask re-applies the sign.
; GFX6 / GFX9 lines pin the final ISA from the llc RUN lines for
; -mcpu=tahiti and -mcpu=gfx900 respectively.
192define amdgpu_kernel void @sdiv_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) {
193; CHECK-LABEL: @sdiv_i32(
194; CHECK-NEXT:    [[TMP1:%.*]] = ashr i32 [[X:%.*]], 31
195; CHECK-NEXT:    [[TMP2:%.*]] = ashr i32 [[Y:%.*]], 31
196; CHECK-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
197; CHECK-NEXT:    [[TMP4:%.*]] = add i32 [[X]], [[TMP1]]
198; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[Y]], [[TMP2]]
199; CHECK-NEXT:    [[TMP6:%.*]] = xor i32 [[TMP4]], [[TMP1]]
200; CHECK-NEXT:    [[TMP7:%.*]] = xor i32 [[TMP5]], [[TMP2]]
201; CHECK-NEXT:    [[TMP8:%.*]] = uitofp i32 [[TMP7]] to float
202; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP8]])
203; CHECK-NEXT:    [[TMP10:%.*]] = fmul fast float [[TMP9]], 0x41EFFFFFC0000000
204; CHECK-NEXT:    [[TMP11:%.*]] = fptoui float [[TMP10]] to i32
205; CHECK-NEXT:    [[TMP12:%.*]] = sub i32 0, [[TMP7]]
206; CHECK-NEXT:    [[TMP13:%.*]] = mul i32 [[TMP12]], [[TMP11]]
207; CHECK-NEXT:    [[TMP14:%.*]] = zext i32 [[TMP11]] to i64
208; CHECK-NEXT:    [[TMP15:%.*]] = zext i32 [[TMP13]] to i64
209; CHECK-NEXT:    [[TMP16:%.*]] = mul i64 [[TMP14]], [[TMP15]]
210; CHECK-NEXT:    [[TMP17:%.*]] = trunc i64 [[TMP16]] to i32
211; CHECK-NEXT:    [[TMP18:%.*]] = lshr i64 [[TMP16]], 32
212; CHECK-NEXT:    [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
213; CHECK-NEXT:    [[TMP20:%.*]] = add i32 [[TMP11]], [[TMP19]]
214; CHECK-NEXT:    [[TMP21:%.*]] = zext i32 [[TMP6]] to i64
215; CHECK-NEXT:    [[TMP22:%.*]] = zext i32 [[TMP20]] to i64
216; CHECK-NEXT:    [[TMP23:%.*]] = mul i64 [[TMP21]], [[TMP22]]
217; CHECK-NEXT:    [[TMP24:%.*]] = trunc i64 [[TMP23]] to i32
218; CHECK-NEXT:    [[TMP25:%.*]] = lshr i64 [[TMP23]], 32
219; CHECK-NEXT:    [[TMP26:%.*]] = trunc i64 [[TMP25]] to i32
220; CHECK-NEXT:    [[TMP27:%.*]] = mul i32 [[TMP26]], [[TMP7]]
221; CHECK-NEXT:    [[TMP28:%.*]] = sub i32 [[TMP6]], [[TMP27]]
222; CHECK-NEXT:    [[TMP29:%.*]] = icmp uge i32 [[TMP28]], [[TMP7]]
223; CHECK-NEXT:    [[TMP30:%.*]] = add i32 [[TMP26]], 1
224; CHECK-NEXT:    [[TMP31:%.*]] = select i1 [[TMP29]], i32 [[TMP30]], i32 [[TMP26]]
225; CHECK-NEXT:    [[TMP32:%.*]] = sub i32 [[TMP28]], [[TMP7]]
226; CHECK-NEXT:    [[TMP33:%.*]] = select i1 [[TMP29]], i32 [[TMP32]], i32 [[TMP28]]
227; CHECK-NEXT:    [[TMP34:%.*]] = icmp uge i32 [[TMP33]], [[TMP7]]
228; CHECK-NEXT:    [[TMP35:%.*]] = add i32 [[TMP31]], 1
229; CHECK-NEXT:    [[TMP36:%.*]] = select i1 [[TMP34]], i32 [[TMP35]], i32 [[TMP31]]
230; CHECK-NEXT:    [[TMP37:%.*]] = xor i32 [[TMP36]], [[TMP3]]
231; CHECK-NEXT:    [[TMP38:%.*]] = sub i32 [[TMP37]], [[TMP3]]
232; CHECK-NEXT:    store i32 [[TMP38]], i32 addrspace(1)* [[OUT:%.*]], align 4
233; CHECK-NEXT:    ret void
234;
235; GFX6-LABEL: sdiv_i32:
236; GFX6:       ; %bb.0:
237; GFX6-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
238; GFX6-NEXT:    s_mov_b32 s7, 0xf000
239; GFX6-NEXT:    s_mov_b32 s6, -1
240; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
241; GFX6-NEXT:    s_ashr_i32 s8, s3, 31
242; GFX6-NEXT:    s_add_i32 s3, s3, s8
243; GFX6-NEXT:    s_xor_b32 s3, s3, s8
244; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s3
245; GFX6-NEXT:    s_sub_i32 s4, 0, s3
246; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
247; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
248; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
249; GFX6-NEXT:    v_mul_lo_u32 v1, s4, v0
250; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
251; GFX6-NEXT:    s_ashr_i32 s0, s2, 31
252; GFX6-NEXT:    s_add_i32 s1, s2, s0
253; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
254; GFX6-NEXT:    s_xor_b32 s1, s1, s0
255; GFX6-NEXT:    s_xor_b32 s2, s0, s8
256; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
257; GFX6-NEXT:    v_mul_hi_u32 v0, s1, v0
258; GFX6-NEXT:    v_mul_lo_u32 v1, v0, s3
259; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
260; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s1, v1
261; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s3, v1
262; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
263; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s3, v1
264; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, v2, s[0:1]
265; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
266; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s3, v1
267; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
268; GFX6-NEXT:    v_xor_b32_e32 v0, s2, v0
269; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s2, v0
270; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
271; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
272; GFX6-NEXT:    s_endpgm
273;
274; GFX9-LABEL: sdiv_i32:
275; GFX9:       ; %bb.0:
276; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
277; GFX9-NEXT:    v_mov_b32_e32 v2, 0
278; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
279; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
280; GFX9-NEXT:    s_ashr_i32 s4, s3, 31
281; GFX9-NEXT:    s_add_i32 s3, s3, s4
282; GFX9-NEXT:    s_xor_b32 s3, s3, s4
283; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s3
284; GFX9-NEXT:    s_sub_i32 s5, 0, s3
285; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
286; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
287; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
288; GFX9-NEXT:    v_mul_lo_u32 v1, s5, v0
289; GFX9-NEXT:    s_ashr_i32 s5, s2, 31
290; GFX9-NEXT:    s_add_i32 s2, s2, s5
291; GFX9-NEXT:    s_xor_b32 s2, s2, s5
292; GFX9-NEXT:    v_mul_hi_u32 v1, v0, v1
293; GFX9-NEXT:    s_xor_b32 s4, s5, s4
294; GFX9-NEXT:    v_add_u32_e32 v0, v0, v1
295; GFX9-NEXT:    v_mul_hi_u32 v0, s2, v0
296; GFX9-NEXT:    v_mul_lo_u32 v1, v0, s3
297; GFX9-NEXT:    v_add_u32_e32 v3, 1, v0
298; GFX9-NEXT:    v_sub_u32_e32 v1, s2, v1
299; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v1
300; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
301; GFX9-NEXT:    v_subrev_u32_e32 v3, s3, v1
302; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
303; GFX9-NEXT:    v_add_u32_e32 v3, 1, v0
304; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v1
305; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
306; GFX9-NEXT:    v_xor_b32_e32 v0, s4, v0
307; GFX9-NEXT:    v_subrev_u32_e32 v0, s4, v0
308; GFX9-NEXT:    global_store_dword v2, v0, s[0:1]
309; GFX9-NEXT:    s_endpgm
; Input IR under test: a single sdiv whose result is stored through %out.
310  %r = sdiv i32 %x, %y
311  store i32 %r, i32 addrspace(1)* %out
312  ret void
313}
314
; Test: signed 32-bit remainder of two kernel arguments; the remainder is
; stored to %out.
;
; CHECK lines (opt -amdgpu-codegenprepare RUN line) pin the IR expansion:
;   * sign masks are taken with "ashr ..., 31" and each operand is made
;     non-negative with "add mask" followed by "xor mask";
;   * the unsigned remainder sequence follows: float reciprocal estimate, one
;     refinement via the high half of a 64-bit multiply, quotient estimate,
;     remainder estimate, and two conditional subtractions of the divisor;
;   * the final "xor" / "sub" use the sign mask of %x only (TMP1), so the
;     result takes the sign of the dividend.
; GFX6 / GFX9 lines pin the final ISA from the llc RUN lines for
; -mcpu=tahiti and -mcpu=gfx900 respectively.
315define amdgpu_kernel void @srem_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) {
316; CHECK-LABEL: @srem_i32(
317; CHECK-NEXT:    [[TMP1:%.*]] = ashr i32 [[X:%.*]], 31
318; CHECK-NEXT:    [[TMP2:%.*]] = ashr i32 [[Y:%.*]], 31
319; CHECK-NEXT:    [[TMP3:%.*]] = add i32 [[X]], [[TMP1]]
320; CHECK-NEXT:    [[TMP4:%.*]] = add i32 [[Y]], [[TMP2]]
321; CHECK-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP1]]
322; CHECK-NEXT:    [[TMP6:%.*]] = xor i32 [[TMP4]], [[TMP2]]
323; CHECK-NEXT:    [[TMP7:%.*]] = uitofp i32 [[TMP6]] to float
324; CHECK-NEXT:    [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]])
325; CHECK-NEXT:    [[TMP9:%.*]] = fmul fast float [[TMP8]], 0x41EFFFFFC0000000
326; CHECK-NEXT:    [[TMP10:%.*]] = fptoui float [[TMP9]] to i32
327; CHECK-NEXT:    [[TMP11:%.*]] = sub i32 0, [[TMP6]]
328; CHECK-NEXT:    [[TMP12:%.*]] = mul i32 [[TMP11]], [[TMP10]]
329; CHECK-NEXT:    [[TMP13:%.*]] = zext i32 [[TMP10]] to i64
330; CHECK-NEXT:    [[TMP14:%.*]] = zext i32 [[TMP12]] to i64
331; CHECK-NEXT:    [[TMP15:%.*]] = mul i64 [[TMP13]], [[TMP14]]
332; CHECK-NEXT:    [[TMP16:%.*]] = trunc i64 [[TMP15]] to i32
333; CHECK-NEXT:    [[TMP17:%.*]] = lshr i64 [[TMP15]], 32
334; CHECK-NEXT:    [[TMP18:%.*]] = trunc i64 [[TMP17]] to i32
335; CHECK-NEXT:    [[TMP19:%.*]] = add i32 [[TMP10]], [[TMP18]]
336; CHECK-NEXT:    [[TMP20:%.*]] = zext i32 [[TMP5]] to i64
337; CHECK-NEXT:    [[TMP21:%.*]] = zext i32 [[TMP19]] to i64
338; CHECK-NEXT:    [[TMP22:%.*]] = mul i64 [[TMP20]], [[TMP21]]
339; CHECK-NEXT:    [[TMP23:%.*]] = trunc i64 [[TMP22]] to i32
340; CHECK-NEXT:    [[TMP24:%.*]] = lshr i64 [[TMP22]], 32
341; CHECK-NEXT:    [[TMP25:%.*]] = trunc i64 [[TMP24]] to i32
342; CHECK-NEXT:    [[TMP26:%.*]] = mul i32 [[TMP25]], [[TMP6]]
343; CHECK-NEXT:    [[TMP27:%.*]] = sub i32 [[TMP5]], [[TMP26]]
344; CHECK-NEXT:    [[TMP28:%.*]] = icmp uge i32 [[TMP27]], [[TMP6]]
345; CHECK-NEXT:    [[TMP29:%.*]] = sub i32 [[TMP27]], [[TMP6]]
346; CHECK-NEXT:    [[TMP30:%.*]] = select i1 [[TMP28]], i32 [[TMP29]], i32 [[TMP27]]
347; CHECK-NEXT:    [[TMP31:%.*]] = icmp uge i32 [[TMP30]], [[TMP6]]
348; CHECK-NEXT:    [[TMP32:%.*]] = sub i32 [[TMP30]], [[TMP6]]
349; CHECK-NEXT:    [[TMP33:%.*]] = select i1 [[TMP31]], i32 [[TMP32]], i32 [[TMP30]]
350; CHECK-NEXT:    [[TMP34:%.*]] = xor i32 [[TMP33]], [[TMP1]]
351; CHECK-NEXT:    [[TMP35:%.*]] = sub i32 [[TMP34]], [[TMP1]]
352; CHECK-NEXT:    store i32 [[TMP35]], i32 addrspace(1)* [[OUT:%.*]], align 4
353; CHECK-NEXT:    ret void
354;
355; GFX6-LABEL: srem_i32:
356; GFX6:       ; %bb.0:
357; GFX6-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
358; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
359; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
360; GFX6-NEXT:    s_ashr_i32 s4, s3, 31
361; GFX6-NEXT:    s_add_i32 s3, s3, s4
362; GFX6-NEXT:    s_xor_b32 s4, s3, s4
363; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s4
364; GFX6-NEXT:    s_sub_i32 s3, 0, s4
365; GFX6-NEXT:    s_ashr_i32 s5, s2, 31
366; GFX6-NEXT:    s_add_i32 s2, s2, s5
367; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
368; GFX6-NEXT:    s_xor_b32 s6, s2, s5
369; GFX6-NEXT:    s_mov_b32 s2, -1
370; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
371; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
372; GFX6-NEXT:    v_mul_lo_u32 v1, s3, v0
373; GFX6-NEXT:    s_mov_b32 s3, 0xf000
374; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
375; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
376; GFX6-NEXT:    v_mul_hi_u32 v0, s6, v0
377; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s4
378; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s6, v0
379; GFX6-NEXT:    v_subrev_i32_e32 v1, vcc, s4, v0
380; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s4, v0
381; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
382; GFX6-NEXT:    v_subrev_i32_e32 v1, vcc, s4, v0
383; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s4, v0
384; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
385; GFX6-NEXT:    v_xor_b32_e32 v0, s5, v0
386; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s5, v0
387; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
388; GFX6-NEXT:    s_endpgm
389;
390; GFX9-LABEL: srem_i32:
391; GFX9:       ; %bb.0:
392; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
393; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
394; GFX9-NEXT:    s_ashr_i32 s4, s3, 31
395; GFX9-NEXT:    s_add_i32 s3, s3, s4
396; GFX9-NEXT:    s_xor_b32 s3, s3, s4
397; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s3
398; GFX9-NEXT:    s_sub_i32 s4, 0, s3
399; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
400; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
401; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
402; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
403; GFX9-NEXT:    v_mul_lo_u32 v1, s4, v0
404; GFX9-NEXT:    s_ashr_i32 s4, s2, 31
405; GFX9-NEXT:    s_add_i32 s2, s2, s4
406; GFX9-NEXT:    s_xor_b32 s2, s2, s4
407; GFX9-NEXT:    v_mul_hi_u32 v1, v0, v1
408; GFX9-NEXT:    v_add_u32_e32 v0, v0, v1
409; GFX9-NEXT:    v_mul_hi_u32 v0, s2, v0
410; GFX9-NEXT:    v_mov_b32_e32 v1, 0
411; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s3
412; GFX9-NEXT:    v_sub_u32_e32 v0, s2, v0
413; GFX9-NEXT:    v_subrev_u32_e32 v2, s3, v0
414; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v0
415; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
416; GFX9-NEXT:    v_subrev_u32_e32 v2, s3, v0
417; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v0
418; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
419; GFX9-NEXT:    v_xor_b32_e32 v0, s4, v0
420; GFX9-NEXT:    v_subrev_u32_e32 v0, s4, v0
421; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
422; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
423; GFX9-NEXT:    s_endpgm
; Input IR under test: a single srem whose result is stored through %out.
424  %r = srem i32 %x, %y
425  store i32 %r, i32 addrspace(1)* %out
426  ret void
427}
428
; Test: unsigned 16-bit division of two kernel arguments; the quotient is
; stored to %out.
;
; CHECK lines (opt -amdgpu-codegenprepare RUN line) pin the float-based IR
; expansion used for this narrow type:
;   * both operands are zero-extended to i32 and converted to float;
;   * the quotient estimate is trunc(x * rcp(y));
;   * llvm.amdgcn.fmad.ftz.f32(-q, y, x) gives the residual, and
;     "fcmp oge |residual|, |y|" selects a correction of 1 or 0 that is added
;     to the integer quotient;
;   * the result is masked with 65535 and truncated to i16.
; GFX6 / GFX9 lines pin the final ISA from the llc RUN lines for
; -mcpu=tahiti and -mcpu=gfx900 respectively.
429define amdgpu_kernel void @udiv_i16(i16 addrspace(1)* %out, i16 %x, i16 %y) {
430; CHECK-LABEL: @udiv_i16(
431; CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[X:%.*]] to i32
432; CHECK-NEXT:    [[TMP2:%.*]] = zext i16 [[Y:%.*]] to i32
433; CHECK-NEXT:    [[TMP3:%.*]] = uitofp i32 [[TMP1]] to float
434; CHECK-NEXT:    [[TMP4:%.*]] = uitofp i32 [[TMP2]] to float
435; CHECK-NEXT:    [[TMP5:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP4]])
436; CHECK-NEXT:    [[TMP6:%.*]] = fmul fast float [[TMP3]], [[TMP5]]
437; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.trunc.f32(float [[TMP6]])
438; CHECK-NEXT:    [[TMP8:%.*]] = fneg fast float [[TMP7]]
439; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP8]], float [[TMP4]], float [[TMP3]])
440; CHECK-NEXT:    [[TMP10:%.*]] = fptoui float [[TMP7]] to i32
441; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
442; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.fabs.f32(float [[TMP4]])
443; CHECK-NEXT:    [[TMP13:%.*]] = fcmp fast oge float [[TMP11]], [[TMP12]]
444; CHECK-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 1, i32 0
445; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]]
446; CHECK-NEXT:    [[TMP16:%.*]] = and i32 [[TMP15]], 65535
447; CHECK-NEXT:    [[TMP17:%.*]] = trunc i32 [[TMP16]] to i16
448; CHECK-NEXT:    store i16 [[TMP17]], i16 addrspace(1)* [[OUT:%.*]], align 2
449; CHECK-NEXT:    ret void
450;
451; GFX6-LABEL: udiv_i16:
452; GFX6:       ; %bb.0:
453; GFX6-NEXT:    s_load_dword s2, s[0:1], 0xb
454; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
455; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
456; GFX6-NEXT:    s_lshr_b32 s3, s2, 16
457; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s3
458; GFX6-NEXT:    s_and_b32 s2, s2, 0xffff
459; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s2
460; GFX6-NEXT:    s_mov_b32 s3, 0xf000
461; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v0
462; GFX6-NEXT:    s_mov_b32 s2, -1
463; GFX6-NEXT:    v_mul_f32_e32 v2, v1, v2
464; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
465; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v2
466; GFX6-NEXT:    v_mad_f32 v1, -v2, v0, v1
467; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
468; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, 0, v3, vcc
469; GFX6-NEXT:    buffer_store_short v0, off, s[0:3], 0
470; GFX6-NEXT:    s_endpgm
471;
472; GFX9-LABEL: udiv_i16:
473; GFX9:       ; %bb.0:
474; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x2c
475; GFX9-NEXT:    v_mov_b32_e32 v3, 0
476; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
477; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
478; GFX9-NEXT:    s_lshr_b32 s3, s2, 16
479; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s3
480; GFX9-NEXT:    s_and_b32 s2, s2, 0xffff
481; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s2
482; GFX9-NEXT:    v_rcp_iflag_f32_e32 v2, v0
483; GFX9-NEXT:    v_mul_f32_e32 v2, v1, v2
484; GFX9-NEXT:    v_trunc_f32_e32 v2, v2
485; GFX9-NEXT:    v_cvt_u32_f32_e32 v4, v2
486; GFX9-NEXT:    v_mad_f32 v1, -v2, v0, v1
487; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
488; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v4, vcc
489; GFX9-NEXT:    global_store_short v3, v0, s[0:1]
490; GFX9-NEXT:    s_endpgm
; Input IR under test: a single i16 udiv whose result is stored through %out.
491  %r = udiv i16 %x, %y
492  store i16 %r, i16 addrspace(1)* %out
493  ret void
494}
495
; Test: unsigned 16-bit remainder of two kernel arguments; the remainder is
; stored to %out.
;
; CHECK lines (opt -amdgpu-codegenprepare RUN line) pin the float-based IR
; expansion used for this narrow type:
;   * both operands are zero-extended to i32 and converted to float;
;   * the quotient is trunc(x * rcp(y)) plus a 1-or-0 correction chosen by
;     comparing |fmad.ftz(-q, y, x)| against |y|;
;   * the remainder is then formed in integer arithmetic as x - quotient * y,
;     masked with 65535 and truncated to i16.
; GFX6 / GFX9 lines pin the final ISA from the llc RUN lines for
; -mcpu=tahiti and -mcpu=gfx900 respectively.
496define amdgpu_kernel void @urem_i16(i16 addrspace(1)* %out, i16 %x, i16 %y) {
497; CHECK-LABEL: @urem_i16(
498; CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[X:%.*]] to i32
499; CHECK-NEXT:    [[TMP2:%.*]] = zext i16 [[Y:%.*]] to i32
500; CHECK-NEXT:    [[TMP3:%.*]] = uitofp i32 [[TMP1]] to float
501; CHECK-NEXT:    [[TMP4:%.*]] = uitofp i32 [[TMP2]] to float
502; CHECK-NEXT:    [[TMP5:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP4]])
503; CHECK-NEXT:    [[TMP6:%.*]] = fmul fast float [[TMP3]], [[TMP5]]
504; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.trunc.f32(float [[TMP6]])
505; CHECK-NEXT:    [[TMP8:%.*]] = fneg fast float [[TMP7]]
506; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP8]], float [[TMP4]], float [[TMP3]])
507; CHECK-NEXT:    [[TMP10:%.*]] = fptoui float [[TMP7]] to i32
508; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
509; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.fabs.f32(float [[TMP4]])
510; CHECK-NEXT:    [[TMP13:%.*]] = fcmp fast oge float [[TMP11]], [[TMP12]]
511; CHECK-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 1, i32 0
512; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]]
513; CHECK-NEXT:    [[TMP16:%.*]] = mul i32 [[TMP15]], [[TMP2]]
514; CHECK-NEXT:    [[TMP17:%.*]] = sub i32 [[TMP1]], [[TMP16]]
515; CHECK-NEXT:    [[TMP18:%.*]] = and i32 [[TMP17]], 65535
516; CHECK-NEXT:    [[TMP19:%.*]] = trunc i32 [[TMP18]] to i16
517; CHECK-NEXT:    store i16 [[TMP19]], i16 addrspace(1)* [[OUT:%.*]], align 2
518; CHECK-NEXT:    ret void
519;
520; GFX6-LABEL: urem_i16:
521; GFX6:       ; %bb.0:
522; GFX6-NEXT:    s_load_dword s4, s[0:1], 0xb
523; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
524; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
525; GFX6-NEXT:    s_lshr_b32 s2, s4, 16
526; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s2
527; GFX6-NEXT:    s_and_b32 s3, s4, 0xffff
528; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s3
529; GFX6-NEXT:    s_mov_b32 s3, 0xf000
530; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v0
531; GFX6-NEXT:    v_mul_f32_e32 v2, v1, v2
532; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
533; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v2
534; GFX6-NEXT:    v_mad_f32 v1, -v2, v0, v1
535; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
536; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, 0, v3, vcc
537; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s2
538; GFX6-NEXT:    s_mov_b32 s2, -1
539; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
540; GFX6-NEXT:    buffer_store_short v0, off, s[0:3], 0
541; GFX6-NEXT:    s_endpgm
542;
543; GFX9-LABEL: urem_i16:
544; GFX9:       ; %bb.0:
545; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x2c
546; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
547; GFX9-NEXT:    s_lshr_b32 s3, s2, 16
548; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s3
549; GFX9-NEXT:    s_and_b32 s4, s2, 0xffff
550; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s4
551; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
552; GFX9-NEXT:    v_rcp_iflag_f32_e32 v2, v0
553; GFX9-NEXT:    v_mul_f32_e32 v2, v1, v2
554; GFX9-NEXT:    v_trunc_f32_e32 v2, v2
555; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v2
556; GFX9-NEXT:    v_mad_f32 v1, -v2, v0, v1
557; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
558; GFX9-NEXT:    v_mov_b32_e32 v1, 0
559; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v3, vcc
560; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s3
561; GFX9-NEXT:    v_sub_u32_e32 v0, s2, v0
562; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
563; GFX9-NEXT:    global_store_short v1, v0, s[0:1]
564; GFX9-NEXT:    s_endpgm
; Input IR under test: a single i16 urem whose result is stored through %out.
565  %r = urem i16 %x, %y
566  store i16 %r, i16 addrspace(1)* %out
567  ret void
568}
569
; Test: signed 16-bit division of two kernel arguments; the quotient is stored
; to %out.
;
; CHECK lines (opt -amdgpu-codegenprepare RUN line) pin the float-based IR
; expansion used for this narrow type:
;   * both operands are sign-extended to i32;
;   * "ashr (x xor y), 30" or'ed with 1 forms the signed correction term
;     (+1 when the operand signs match, -1 when they differ, given the
;     sign-extended inputs);
;   * operands are converted with sitofp and the quotient estimate is
;     trunc(x * rcp(y)), converted back with fptosi;
;   * "fcmp oge |fmad.ftz(-q, y, x)|, |y|" selects between the correction term
;     and 0, which is added to the quotient;
;   * "shl 16" / "ashr 16" sign-extend the low 16 bits before the trunc to i16.
; GFX6 / GFX9 lines pin the final ISA from the llc RUN lines for
; -mcpu=tahiti and -mcpu=gfx900 respectively.
570define amdgpu_kernel void @sdiv_i16(i16 addrspace(1)* %out, i16 %x, i16 %y) {
571; CHECK-LABEL: @sdiv_i16(
572; CHECK-NEXT:    [[TMP1:%.*]] = sext i16 [[X:%.*]] to i32
573; CHECK-NEXT:    [[TMP2:%.*]] = sext i16 [[Y:%.*]] to i32
574; CHECK-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
575; CHECK-NEXT:    [[TMP4:%.*]] = ashr i32 [[TMP3]], 30
576; CHECK-NEXT:    [[TMP5:%.*]] = or i32 [[TMP4]], 1
577; CHECK-NEXT:    [[TMP6:%.*]] = sitofp i32 [[TMP1]] to float
578; CHECK-NEXT:    [[TMP7:%.*]] = sitofp i32 [[TMP2]] to float
579; CHECK-NEXT:    [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]])
580; CHECK-NEXT:    [[TMP9:%.*]] = fmul fast float [[TMP6]], [[TMP8]]
581; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.trunc.f32(float [[TMP9]])
582; CHECK-NEXT:    [[TMP11:%.*]] = fneg fast float [[TMP10]]
583; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP11]], float [[TMP7]], float [[TMP6]])
584; CHECK-NEXT:    [[TMP13:%.*]] = fptosi float [[TMP10]] to i32
585; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP12]])
586; CHECK-NEXT:    [[TMP15:%.*]] = call fast float @llvm.fabs.f32(float [[TMP7]])
587; CHECK-NEXT:    [[TMP16:%.*]] = fcmp fast oge float [[TMP14]], [[TMP15]]
588; CHECK-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP5]], i32 0
589; CHECK-NEXT:    [[TMP18:%.*]] = add i32 [[TMP13]], [[TMP17]]
590; CHECK-NEXT:    [[TMP19:%.*]] = shl i32 [[TMP18]], 16
591; CHECK-NEXT:    [[TMP20:%.*]] = ashr i32 [[TMP19]], 16
592; CHECK-NEXT:    [[TMP21:%.*]] = trunc i32 [[TMP20]] to i16
593; CHECK-NEXT:    store i16 [[TMP21]], i16 addrspace(1)* [[OUT:%.*]], align 2
594; CHECK-NEXT:    ret void
595;
596; GFX6-LABEL: sdiv_i16:
597; GFX6:       ; %bb.0:
598; GFX6-NEXT:    s_load_dword s4, s[0:1], 0xb
599; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
600; GFX6-NEXT:    s_mov_b32 s3, 0xf000
601; GFX6-NEXT:    s_mov_b32 s2, -1
602; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
603; GFX6-NEXT:    s_ashr_i32 s5, s4, 16
604; GFX6-NEXT:    v_cvt_f32_i32_e32 v0, s5
605; GFX6-NEXT:    s_sext_i32_i16 s4, s4
606; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s4
607; GFX6-NEXT:    s_xor_b32 s4, s4, s5
608; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v0
609; GFX6-NEXT:    s_ashr_i32 s4, s4, 30
610; GFX6-NEXT:    s_or_b32 s4, s4, 1
611; GFX6-NEXT:    v_mov_b32_e32 v3, s4
612; GFX6-NEXT:    v_mul_f32_e32 v2, v1, v2
613; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
614; GFX6-NEXT:    v_mad_f32 v1, -v2, v0, v1
615; GFX6-NEXT:    v_cvt_i32_f32_e32 v2, v2
616; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
617; GFX6-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
618; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
619; GFX6-NEXT:    buffer_store_short v0, off, s[0:3], 0
620; GFX6-NEXT:    s_endpgm
621;
622; GFX9-LABEL: sdiv_i16:
623; GFX9:       ; %bb.0:
624; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x2c
625; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
626; GFX9-NEXT:    v_mov_b32_e32 v1, 0
627; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
628; GFX9-NEXT:    s_ashr_i32 s0, s4, 16
629; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s0
630; GFX9-NEXT:    s_sext_i32_i16 s1, s4
631; GFX9-NEXT:    v_cvt_f32_i32_e32 v2, s1
632; GFX9-NEXT:    s_xor_b32 s0, s1, s0
633; GFX9-NEXT:    v_rcp_iflag_f32_e32 v3, v0
634; GFX9-NEXT:    s_ashr_i32 s0, s0, 30
635; GFX9-NEXT:    s_or_b32 s4, s0, 1
636; GFX9-NEXT:    v_mul_f32_e32 v3, v2, v3
637; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
638; GFX9-NEXT:    v_mad_f32 v2, -v3, v0, v2
639; GFX9-NEXT:    v_cvt_i32_f32_e32 v3, v3
640; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v2|, |v0|
641; GFX9-NEXT:    s_and_b64 s[0:1], s[0:1], exec
642; GFX9-NEXT:    s_cselect_b32 s0, s4, 0
643; GFX9-NEXT:    v_add_u32_e32 v0, s0, v3
644; GFX9-NEXT:    global_store_short v1, v0, s[2:3]
645; GFX9-NEXT:    s_endpgm
; Input IR under test: a single i16 sdiv whose result is stored through %out.
646  %r = sdiv i16 %x, %y
647  store i16 %r, i16 addrspace(1)* %out
648  ret void
649}
650
; srem_i16: signed 16-bit remainder of two kernel arguments, stored to %out.
;
; The assertions below are autogenerated (see the NOTE and RUN lines at the top
; of the file); each prefix corresponds to one RUN line:
;   CHECK - IR produced by opt -amdgpu-codegenprepare. Both operands are
;           sign-extended to i32 and converted to float, the quotient is
;           estimated as trunc(x * rcp(y)), and a correction term
;           (((x ^ y) >> 30) | 1) is added when |x - q*y| >= |y|. The
;           remainder is then x - q*y, sign-extended from 16 bits with a
;           shl/ashr pair before being truncated back to i16.
;   GFX6  - llc output for -mcpu=tahiti.
;   GFX9  - llc output for -mcpu=gfx900.
651define amdgpu_kernel void @srem_i16(i16 addrspace(1)* %out, i16 %x, i16 %y) {
652; CHECK-LABEL: @srem_i16(
653; CHECK-NEXT:    [[TMP1:%.*]] = sext i16 [[X:%.*]] to i32
654; CHECK-NEXT:    [[TMP2:%.*]] = sext i16 [[Y:%.*]] to i32
655; CHECK-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
656; CHECK-NEXT:    [[TMP4:%.*]] = ashr i32 [[TMP3]], 30
657; CHECK-NEXT:    [[TMP5:%.*]] = or i32 [[TMP4]], 1
658; CHECK-NEXT:    [[TMP6:%.*]] = sitofp i32 [[TMP1]] to float
659; CHECK-NEXT:    [[TMP7:%.*]] = sitofp i32 [[TMP2]] to float
660; CHECK-NEXT:    [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]])
661; CHECK-NEXT:    [[TMP9:%.*]] = fmul fast float [[TMP6]], [[TMP8]]
662; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.trunc.f32(float [[TMP9]])
663; CHECK-NEXT:    [[TMP11:%.*]] = fneg fast float [[TMP10]]
664; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP11]], float [[TMP7]], float [[TMP6]])
665; CHECK-NEXT:    [[TMP13:%.*]] = fptosi float [[TMP10]] to i32
666; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP12]])
667; CHECK-NEXT:    [[TMP15:%.*]] = call fast float @llvm.fabs.f32(float [[TMP7]])
668; CHECK-NEXT:    [[TMP16:%.*]] = fcmp fast oge float [[TMP14]], [[TMP15]]
669; CHECK-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP5]], i32 0
670; CHECK-NEXT:    [[TMP18:%.*]] = add i32 [[TMP13]], [[TMP17]]
671; CHECK-NEXT:    [[TMP19:%.*]] = mul i32 [[TMP18]], [[TMP2]]
672; CHECK-NEXT:    [[TMP20:%.*]] = sub i32 [[TMP1]], [[TMP19]]
673; CHECK-NEXT:    [[TMP21:%.*]] = shl i32 [[TMP20]], 16
674; CHECK-NEXT:    [[TMP22:%.*]] = ashr i32 [[TMP21]], 16
675; CHECK-NEXT:    [[TMP23:%.*]] = trunc i32 [[TMP22]] to i16
676; CHECK-NEXT:    store i16 [[TMP23]], i16 addrspace(1)* [[OUT:%.*]], align 2
677; CHECK-NEXT:    ret void
678;
679; GFX6-LABEL: srem_i16:
680; GFX6:       ; %bb.0:
681; GFX6-NEXT:    s_load_dword s4, s[0:1], 0xb
682; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
683; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
684; GFX6-NEXT:    s_ashr_i32 s2, s4, 16
685; GFX6-NEXT:    v_cvt_f32_i32_e32 v0, s2
686; GFX6-NEXT:    s_sext_i32_i16 s3, s4
687; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s3
688; GFX6-NEXT:    s_xor_b32 s3, s3, s2
689; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v0
690; GFX6-NEXT:    s_ashr_i32 s3, s3, 30
691; GFX6-NEXT:    s_or_b32 s3, s3, 1
692; GFX6-NEXT:    v_mov_b32_e32 v3, s3
693; GFX6-NEXT:    v_mul_f32_e32 v2, v1, v2
694; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
695; GFX6-NEXT:    v_mad_f32 v1, -v2, v0, v1
696; GFX6-NEXT:    v_cvt_i32_f32_e32 v2, v2
697; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
698; GFX6-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
699; GFX6-NEXT:    s_mov_b32 s3, 0xf000
700; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
701; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s2
702; GFX6-NEXT:    s_mov_b32 s2, -1
703; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
704; GFX6-NEXT:    buffer_store_short v0, off, s[0:3], 0
705; GFX6-NEXT:    s_endpgm
706;
707; GFX9-LABEL: srem_i16:
708; GFX9:       ; %bb.0:
709; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x2c
710; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
711; GFX9-NEXT:    s_ashr_i32 s5, s4, 16
712; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s5
713; GFX9-NEXT:    s_sext_i32_i16 s2, s4
714; GFX9-NEXT:    v_cvt_f32_i32_e32 v1, s2
715; GFX9-NEXT:    s_xor_b32 s2, s2, s5
716; GFX9-NEXT:    v_rcp_iflag_f32_e32 v2, v0
717; GFX9-NEXT:    s_ashr_i32 s2, s2, 30
718; GFX9-NEXT:    s_or_b32 s6, s2, 1
719; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
720; GFX9-NEXT:    v_mul_f32_e32 v2, v1, v2
721; GFX9-NEXT:    v_trunc_f32_e32 v2, v2
722; GFX9-NEXT:    v_mad_f32 v1, -v2, v0, v1
723; GFX9-NEXT:    v_cvt_i32_f32_e32 v2, v2
724; GFX9-NEXT:    v_cmp_ge_f32_e64 s[2:3], |v1|, |v0|
725; GFX9-NEXT:    s_and_b64 s[2:3], s[2:3], exec
726; GFX9-NEXT:    s_cselect_b32 s2, s6, 0
727; GFX9-NEXT:    v_add_u32_e32 v0, s2, v2
728; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s5
729; GFX9-NEXT:    v_mov_b32_e32 v1, 0
730; GFX9-NEXT:    v_sub_u32_e32 v0, s4, v0
731; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
732; GFX9-NEXT:    global_store_short v1, v0, s[0:1]
733; GFX9-NEXT:    s_endpgm
; Input IR under test: a single i16 srem whose result is stored to %out.
734  %r = srem i16 %x, %y
735  store i16 %r, i16 addrspace(1)* %out
736  ret void
737}
738
; udiv_i8: unsigned 8-bit division of two kernel arguments, stored to %out.
;
; The assertions below are autogenerated (see the NOTE and RUN lines at the top
; of the file); each prefix corresponds to one RUN line:
;   CHECK - IR produced by opt -amdgpu-codegenprepare. Both operands are
;           zero-extended to i32 and converted to float, the quotient is
;           estimated as trunc(x * rcp(y)), and 1 is added when
;           |x - q*y| >= |y|. The result is masked with 255 before being
;           truncated back to i8.
;   GFX6  - llc output for -mcpu=tahiti.
;   GFX9  - llc output for -mcpu=gfx900.
739define amdgpu_kernel void @udiv_i8(i8 addrspace(1)* %out, i8 %x, i8 %y) {
740; CHECK-LABEL: @udiv_i8(
741; CHECK-NEXT:    [[TMP1:%.*]] = zext i8 [[X:%.*]] to i32
742; CHECK-NEXT:    [[TMP2:%.*]] = zext i8 [[Y:%.*]] to i32
743; CHECK-NEXT:    [[TMP3:%.*]] = uitofp i32 [[TMP1]] to float
744; CHECK-NEXT:    [[TMP4:%.*]] = uitofp i32 [[TMP2]] to float
745; CHECK-NEXT:    [[TMP5:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP4]])
746; CHECK-NEXT:    [[TMP6:%.*]] = fmul fast float [[TMP3]], [[TMP5]]
747; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.trunc.f32(float [[TMP6]])
748; CHECK-NEXT:    [[TMP8:%.*]] = fneg fast float [[TMP7]]
749; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP8]], float [[TMP4]], float [[TMP3]])
750; CHECK-NEXT:    [[TMP10:%.*]] = fptoui float [[TMP7]] to i32
751; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
752; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.fabs.f32(float [[TMP4]])
753; CHECK-NEXT:    [[TMP13:%.*]] = fcmp fast oge float [[TMP11]], [[TMP12]]
754; CHECK-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 1, i32 0
755; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]]
756; CHECK-NEXT:    [[TMP16:%.*]] = and i32 [[TMP15]], 255
757; CHECK-NEXT:    [[TMP17:%.*]] = trunc i32 [[TMP16]] to i8
758; CHECK-NEXT:    store i8 [[TMP17]], i8 addrspace(1)* [[OUT:%.*]], align 1
759; CHECK-NEXT:    ret void
760;
761; GFX6-LABEL: udiv_i8:
762; GFX6:       ; %bb.0:
763; GFX6-NEXT:    s_load_dword s4, s[0:1], 0xb
764; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
765; GFX6-NEXT:    s_mov_b32 s3, 0xf000
766; GFX6-NEXT:    s_mov_b32 s2, -1
767; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
768; GFX6-NEXT:    v_cvt_f32_ubyte1_e32 v0, s4
769; GFX6-NEXT:    v_rcp_iflag_f32_e32 v1, v0
770; GFX6-NEXT:    v_cvt_f32_ubyte0_e32 v2, s4
771; GFX6-NEXT:    v_mul_f32_e32 v1, v2, v1
772; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
773; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v1
774; GFX6-NEXT:    v_mad_f32 v1, -v1, v0, v2
775; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
776; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, 0, v3, vcc
777; GFX6-NEXT:    buffer_store_byte v0, off, s[0:3], 0
778; GFX6-NEXT:    s_endpgm
779;
780; GFX9-LABEL: udiv_i8:
781; GFX9:       ; %bb.0:
782; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x2c
783; GFX9-NEXT:    v_mov_b32_e32 v2, 0
784; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
785; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
786; GFX9-NEXT:    v_cvt_f32_ubyte1_e32 v0, s2
787; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v0
788; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v3, s2
789; GFX9-NEXT:    v_mul_f32_e32 v1, v3, v1
790; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
791; GFX9-NEXT:    v_cvt_u32_f32_e32 v4, v1
792; GFX9-NEXT:    v_mad_f32 v1, -v1, v0, v3
793; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
794; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v4, vcc
795; GFX9-NEXT:    global_store_byte v2, v0, s[0:1]
796; GFX9-NEXT:    s_endpgm
; Input IR under test: a single i8 udiv whose result is stored to %out.
797  %r = udiv i8 %x, %y
798  store i8 %r, i8 addrspace(1)* %out
799  ret void
800}
801
; urem_i8: unsigned 8-bit remainder of two kernel arguments, stored to %out.
;
; The assertions below are autogenerated (see the NOTE and RUN lines at the top
; of the file); each prefix corresponds to one RUN line:
;   CHECK - IR produced by opt -amdgpu-codegenprepare. Both operands are
;           zero-extended to i32 and converted to float, the quotient is
;           estimated as trunc(x * rcp(y)), and 1 is added when
;           |x - q*y| >= |y|. The remainder is then x - q*y, masked with 255
;           before being truncated back to i8.
;   GFX6  - llc output for -mcpu=tahiti.
;   GFX9  - llc output for -mcpu=gfx900.
802define amdgpu_kernel void @urem_i8(i8 addrspace(1)* %out, i8 %x, i8 %y) {
803; CHECK-LABEL: @urem_i8(
804; CHECK-NEXT:    [[TMP1:%.*]] = zext i8 [[X:%.*]] to i32
805; CHECK-NEXT:    [[TMP2:%.*]] = zext i8 [[Y:%.*]] to i32
806; CHECK-NEXT:    [[TMP3:%.*]] = uitofp i32 [[TMP1]] to float
807; CHECK-NEXT:    [[TMP4:%.*]] = uitofp i32 [[TMP2]] to float
808; CHECK-NEXT:    [[TMP5:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP4]])
809; CHECK-NEXT:    [[TMP6:%.*]] = fmul fast float [[TMP3]], [[TMP5]]
810; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.trunc.f32(float [[TMP6]])
811; CHECK-NEXT:    [[TMP8:%.*]] = fneg fast float [[TMP7]]
812; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP8]], float [[TMP4]], float [[TMP3]])
813; CHECK-NEXT:    [[TMP10:%.*]] = fptoui float [[TMP7]] to i32
814; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
815; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.fabs.f32(float [[TMP4]])
816; CHECK-NEXT:    [[TMP13:%.*]] = fcmp fast oge float [[TMP11]], [[TMP12]]
817; CHECK-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 1, i32 0
818; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]]
819; CHECK-NEXT:    [[TMP16:%.*]] = mul i32 [[TMP15]], [[TMP2]]
820; CHECK-NEXT:    [[TMP17:%.*]] = sub i32 [[TMP1]], [[TMP16]]
821; CHECK-NEXT:    [[TMP18:%.*]] = and i32 [[TMP17]], 255
822; CHECK-NEXT:    [[TMP19:%.*]] = trunc i32 [[TMP18]] to i8
823; CHECK-NEXT:    store i8 [[TMP19]], i8 addrspace(1)* [[OUT:%.*]], align 1
824; CHECK-NEXT:    ret void
825;
826; GFX6-LABEL: urem_i8:
827; GFX6:       ; %bb.0:
828; GFX6-NEXT:    s_load_dword s4, s[0:1], 0xb
829; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
830; GFX6-NEXT:    s_mov_b32 s3, 0xf000
831; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
832; GFX6-NEXT:    v_cvt_f32_ubyte1_e32 v0, s4
833; GFX6-NEXT:    v_rcp_iflag_f32_e32 v1, v0
834; GFX6-NEXT:    v_cvt_f32_ubyte0_e32 v2, s4
835; GFX6-NEXT:    s_lshr_b32 s2, s4, 8
836; GFX6-NEXT:    v_mul_f32_e32 v1, v2, v1
837; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
838; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v1
839; GFX6-NEXT:    v_mad_f32 v1, -v1, v0, v2
840; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
841; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, 0, v3, vcc
842; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s2
843; GFX6-NEXT:    s_mov_b32 s2, -1
844; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
845; GFX6-NEXT:    buffer_store_byte v0, off, s[0:3], 0
846; GFX6-NEXT:    s_endpgm
847;
848; GFX9-LABEL: urem_i8:
849; GFX9:       ; %bb.0:
850; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x2c
851; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
852; GFX9-NEXT:    v_cvt_f32_ubyte1_e32 v0, s2
853; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v0
854; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v2, s2
855; GFX9-NEXT:    s_lshr_b32 s3, s2, 8
856; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
857; GFX9-NEXT:    v_mul_f32_e32 v1, v2, v1
858; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
859; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v1
860; GFX9-NEXT:    v_mad_f32 v1, -v1, v0, v2
861; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
862; GFX9-NEXT:    v_mov_b32_e32 v1, 0
863; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v3, vcc
864; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s3
865; GFX9-NEXT:    v_sub_u32_e32 v0, s2, v0
866; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
867; GFX9-NEXT:    global_store_byte v1, v0, s[0:1]
868; GFX9-NEXT:    s_endpgm
; Input IR under test: a single i8 urem whose result is stored to %out.
869  %r = urem i8 %x, %y
870  store i8 %r, i8 addrspace(1)* %out
871  ret void
872}
873
; sdiv_i8: signed 8-bit division of two kernel arguments, stored to %out.
;
; The assertions below are autogenerated (see the NOTE and RUN lines at the top
; of the file); each prefix corresponds to one RUN line:
;   CHECK - IR produced by opt -amdgpu-codegenprepare. Both operands are
;           sign-extended to i32 and converted to float, the quotient is
;           estimated as trunc(x * rcp(y)), and a correction term
;           (((x ^ y) >> 30) | 1) is added when |x - q*y| >= |y|. The
;           result is sign-extended from 8 bits with a shl/ashr pair before
;           being truncated back to i8.
;   GFX6  - llc output for -mcpu=tahiti.
;   GFX9  - llc output for -mcpu=gfx900.
874define amdgpu_kernel void @sdiv_i8(i8 addrspace(1)* %out, i8 %x, i8 %y) {
875; CHECK-LABEL: @sdiv_i8(
876; CHECK-NEXT:    [[TMP1:%.*]] = sext i8 [[X:%.*]] to i32
877; CHECK-NEXT:    [[TMP2:%.*]] = sext i8 [[Y:%.*]] to i32
878; CHECK-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
879; CHECK-NEXT:    [[TMP4:%.*]] = ashr i32 [[TMP3]], 30
880; CHECK-NEXT:    [[TMP5:%.*]] = or i32 [[TMP4]], 1
881; CHECK-NEXT:    [[TMP6:%.*]] = sitofp i32 [[TMP1]] to float
882; CHECK-NEXT:    [[TMP7:%.*]] = sitofp i32 [[TMP2]] to float
883; CHECK-NEXT:    [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]])
884; CHECK-NEXT:    [[TMP9:%.*]] = fmul fast float [[TMP6]], [[TMP8]]
885; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.trunc.f32(float [[TMP9]])
886; CHECK-NEXT:    [[TMP11:%.*]] = fneg fast float [[TMP10]]
887; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP11]], float [[TMP7]], float [[TMP6]])
888; CHECK-NEXT:    [[TMP13:%.*]] = fptosi float [[TMP10]] to i32
889; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP12]])
890; CHECK-NEXT:    [[TMP15:%.*]] = call fast float @llvm.fabs.f32(float [[TMP7]])
891; CHECK-NEXT:    [[TMP16:%.*]] = fcmp fast oge float [[TMP14]], [[TMP15]]
892; CHECK-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP5]], i32 0
893; CHECK-NEXT:    [[TMP18:%.*]] = add i32 [[TMP13]], [[TMP17]]
894; CHECK-NEXT:    [[TMP19:%.*]] = shl i32 [[TMP18]], 24
895; CHECK-NEXT:    [[TMP20:%.*]] = ashr i32 [[TMP19]], 24
896; CHECK-NEXT:    [[TMP21:%.*]] = trunc i32 [[TMP20]] to i8
897; CHECK-NEXT:    store i8 [[TMP21]], i8 addrspace(1)* [[OUT:%.*]], align 1
898; CHECK-NEXT:    ret void
899;
900; GFX6-LABEL: sdiv_i8:
901; GFX6:       ; %bb.0:
902; GFX6-NEXT:    s_load_dword s4, s[0:1], 0xb
903; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
904; GFX6-NEXT:    s_mov_b32 s3, 0xf000
905; GFX6-NEXT:    s_mov_b32 s2, -1
906; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
907; GFX6-NEXT:    s_bfe_i32 s5, s4, 0x80008
908; GFX6-NEXT:    v_cvt_f32_i32_e32 v0, s5
909; GFX6-NEXT:    s_sext_i32_i8 s4, s4
910; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s4
911; GFX6-NEXT:    s_xor_b32 s4, s4, s5
912; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v0
913; GFX6-NEXT:    s_ashr_i32 s4, s4, 30
914; GFX6-NEXT:    s_or_b32 s4, s4, 1
915; GFX6-NEXT:    v_mov_b32_e32 v3, s4
916; GFX6-NEXT:    v_mul_f32_e32 v2, v1, v2
917; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
918; GFX6-NEXT:    v_mad_f32 v1, -v2, v0, v1
919; GFX6-NEXT:    v_cvt_i32_f32_e32 v2, v2
920; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
921; GFX6-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
922; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
923; GFX6-NEXT:    buffer_store_byte v0, off, s[0:3], 0
924; GFX6-NEXT:    s_endpgm
925;
926; GFX9-LABEL: sdiv_i8:
927; GFX9:       ; %bb.0:
928; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x2c
929; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
930; GFX9-NEXT:    v_mov_b32_e32 v1, 0
931; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
932; GFX9-NEXT:    s_bfe_i32 s0, s4, 0x80008
933; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s0
934; GFX9-NEXT:    s_sext_i32_i8 s1, s4
935; GFX9-NEXT:    v_cvt_f32_i32_e32 v2, s1
936; GFX9-NEXT:    s_xor_b32 s0, s1, s0
937; GFX9-NEXT:    v_rcp_iflag_f32_e32 v3, v0
938; GFX9-NEXT:    s_ashr_i32 s0, s0, 30
939; GFX9-NEXT:    s_or_b32 s4, s0, 1
940; GFX9-NEXT:    v_mul_f32_e32 v3, v2, v3
941; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
942; GFX9-NEXT:    v_mad_f32 v2, -v3, v0, v2
943; GFX9-NEXT:    v_cvt_i32_f32_e32 v3, v3
944; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v2|, |v0|
945; GFX9-NEXT:    s_and_b64 s[0:1], s[0:1], exec
946; GFX9-NEXT:    s_cselect_b32 s0, s4, 0
947; GFX9-NEXT:    v_add_u32_e32 v0, s0, v3
948; GFX9-NEXT:    global_store_byte v1, v0, s[2:3]
949; GFX9-NEXT:    s_endpgm
; Input IR under test: a single i8 sdiv whose result is stored to %out.
950  %r = sdiv i8 %x, %y
951  store i8 %r, i8 addrspace(1)* %out
952  ret void
953}
954
; srem_i8: signed 8-bit remainder of two kernel arguments, stored to %out.
;
; The assertions below are autogenerated (see the NOTE and RUN lines at the top
; of the file); each prefix corresponds to one RUN line:
;   CHECK - IR produced by opt -amdgpu-codegenprepare. Both operands are
;           sign-extended to i32 and converted to float, the quotient is
;           estimated as trunc(x * rcp(y)), and a correction term
;           (((x ^ y) >> 30) | 1) is added when |x - q*y| >= |y|. The
;           remainder is then x - q*y, sign-extended from 8 bits with a
;           shl/ashr pair before being truncated back to i8.
;   GFX6  - llc output for -mcpu=tahiti.
;   GFX9  - llc output for -mcpu=gfx900.
955define amdgpu_kernel void @srem_i8(i8 addrspace(1)* %out, i8 %x, i8 %y) {
956; CHECK-LABEL: @srem_i8(
957; CHECK-NEXT:    [[TMP1:%.*]] = sext i8 [[X:%.*]] to i32
958; CHECK-NEXT:    [[TMP2:%.*]] = sext i8 [[Y:%.*]] to i32
959; CHECK-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
960; CHECK-NEXT:    [[TMP4:%.*]] = ashr i32 [[TMP3]], 30
961; CHECK-NEXT:    [[TMP5:%.*]] = or i32 [[TMP4]], 1
962; CHECK-NEXT:    [[TMP6:%.*]] = sitofp i32 [[TMP1]] to float
963; CHECK-NEXT:    [[TMP7:%.*]] = sitofp i32 [[TMP2]] to float
964; CHECK-NEXT:    [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]])
965; CHECK-NEXT:    [[TMP9:%.*]] = fmul fast float [[TMP6]], [[TMP8]]
966; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.trunc.f32(float [[TMP9]])
967; CHECK-NEXT:    [[TMP11:%.*]] = fneg fast float [[TMP10]]
968; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP11]], float [[TMP7]], float [[TMP6]])
969; CHECK-NEXT:    [[TMP13:%.*]] = fptosi float [[TMP10]] to i32
970; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP12]])
971; CHECK-NEXT:    [[TMP15:%.*]] = call fast float @llvm.fabs.f32(float [[TMP7]])
972; CHECK-NEXT:    [[TMP16:%.*]] = fcmp fast oge float [[TMP14]], [[TMP15]]
973; CHECK-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP5]], i32 0
974; CHECK-NEXT:    [[TMP18:%.*]] = add i32 [[TMP13]], [[TMP17]]
975; CHECK-NEXT:    [[TMP19:%.*]] = mul i32 [[TMP18]], [[TMP2]]
976; CHECK-NEXT:    [[TMP20:%.*]] = sub i32 [[TMP1]], [[TMP19]]
977; CHECK-NEXT:    [[TMP21:%.*]] = shl i32 [[TMP20]], 24
978; CHECK-NEXT:    [[TMP22:%.*]] = ashr i32 [[TMP21]], 24
979; CHECK-NEXT:    [[TMP23:%.*]] = trunc i32 [[TMP22]] to i8
980; CHECK-NEXT:    store i8 [[TMP23]], i8 addrspace(1)* [[OUT:%.*]], align 1
981; CHECK-NEXT:    ret void
982;
983; GFX6-LABEL: srem_i8:
984; GFX6:       ; %bb.0:
985; GFX6-NEXT:    s_load_dword s4, s[0:1], 0xb
986; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
987; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
988; GFX6-NEXT:    s_bfe_i32 s2, s4, 0x80008
989; GFX6-NEXT:    v_cvt_f32_i32_e32 v0, s2
990; GFX6-NEXT:    s_sext_i32_i8 s5, s4
991; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s5
992; GFX6-NEXT:    s_xor_b32 s2, s5, s2
993; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v0
994; GFX6-NEXT:    s_ashr_i32 s2, s2, 30
995; GFX6-NEXT:    s_or_b32 s2, s2, 1
996; GFX6-NEXT:    v_mov_b32_e32 v3, s2
997; GFX6-NEXT:    v_mul_f32_e32 v2, v1, v2
998; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
999; GFX6-NEXT:    v_mad_f32 v1, -v2, v0, v1
1000; GFX6-NEXT:    v_cvt_i32_f32_e32 v2, v2
1001; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
1002; GFX6-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
1003; GFX6-NEXT:    s_lshr_b32 s3, s4, 8
1004; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
1005; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s3
1006; GFX6-NEXT:    s_mov_b32 s3, 0xf000
1007; GFX6-NEXT:    s_mov_b32 s2, -1
1008; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
1009; GFX6-NEXT:    buffer_store_byte v0, off, s[0:3], 0
1010; GFX6-NEXT:    s_endpgm
1011;
1012; GFX9-LABEL: srem_i8:
1013; GFX9:       ; %bb.0:
1014; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x2c
1015; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
1016; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1017; GFX9-NEXT:    s_bfe_i32 s0, s4, 0x80008
1018; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s0
1019; GFX9-NEXT:    s_sext_i32_i8 s1, s4
1020; GFX9-NEXT:    v_cvt_f32_i32_e32 v1, s1
1021; GFX9-NEXT:    s_xor_b32 s0, s1, s0
1022; GFX9-NEXT:    v_rcp_iflag_f32_e32 v2, v0
1023; GFX9-NEXT:    s_ashr_i32 s0, s0, 30
1024; GFX9-NEXT:    s_lshr_b32 s5, s4, 8
1025; GFX9-NEXT:    s_or_b32 s6, s0, 1
1026; GFX9-NEXT:    v_mul_f32_e32 v2, v1, v2
1027; GFX9-NEXT:    v_trunc_f32_e32 v2, v2
1028; GFX9-NEXT:    v_mad_f32 v1, -v2, v0, v1
1029; GFX9-NEXT:    v_cvt_i32_f32_e32 v2, v2
1030; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v1|, |v0|
1031; GFX9-NEXT:    s_and_b64 s[0:1], s[0:1], exec
1032; GFX9-NEXT:    s_cselect_b32 s0, s6, 0
1033; GFX9-NEXT:    v_add_u32_e32 v0, s0, v2
1034; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s5
1035; GFX9-NEXT:    v_mov_b32_e32 v1, 0
1036; GFX9-NEXT:    v_sub_u32_e32 v0, s4, v0
1037; GFX9-NEXT:    global_store_byte v1, v0, s[2:3]
1038; GFX9-NEXT:    s_endpgm
; Input IR under test: a single i8 srem whose result is stored to %out.
1039  %r = srem i8 %x, %y
1040  store i8 %r, i8 addrspace(1)* %out
1041  ret void
1042}
1043
1044define amdgpu_kernel void @udiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x, <4 x i32> %y) {
1045; CHECK-LABEL: @udiv_v4i32(
1046; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i32> [[X:%.*]], i64 0
1047; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i32> [[Y:%.*]], i64 0
1048; CHECK-NEXT:    [[TMP3:%.*]] = uitofp i32 [[TMP2]] to float
1049; CHECK-NEXT:    [[TMP4:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP3]])
1050; CHECK-NEXT:    [[TMP5:%.*]] = fmul fast float [[TMP4]], 0x41EFFFFFC0000000
1051; CHECK-NEXT:    [[TMP6:%.*]] = fptoui float [[TMP5]] to i32
1052; CHECK-NEXT:    [[TMP7:%.*]] = sub i32 0, [[TMP2]]
1053; CHECK-NEXT:    [[TMP8:%.*]] = mul i32 [[TMP7]], [[TMP6]]
1054; CHECK-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP6]] to i64
1055; CHECK-NEXT:    [[TMP10:%.*]] = zext i32 [[TMP8]] to i64
1056; CHECK-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP9]], [[TMP10]]
1057; CHECK-NEXT:    [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32
1058; CHECK-NEXT:    [[TMP13:%.*]] = lshr i64 [[TMP11]], 32
1059; CHECK-NEXT:    [[TMP14:%.*]] = trunc i64 [[TMP13]] to i32
1060; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[TMP6]], [[TMP14]]
1061; CHECK-NEXT:    [[TMP16:%.*]] = zext i32 [[TMP1]] to i64
1062; CHECK-NEXT:    [[TMP17:%.*]] = zext i32 [[TMP15]] to i64
1063; CHECK-NEXT:    [[TMP18:%.*]] = mul i64 [[TMP16]], [[TMP17]]
1064; CHECK-NEXT:    [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
1065; CHECK-NEXT:    [[TMP20:%.*]] = lshr i64 [[TMP18]], 32
1066; CHECK-NEXT:    [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32
1067; CHECK-NEXT:    [[TMP22:%.*]] = mul i32 [[TMP21]], [[TMP2]]
1068; CHECK-NEXT:    [[TMP23:%.*]] = sub i32 [[TMP1]], [[TMP22]]
1069; CHECK-NEXT:    [[TMP24:%.*]] = icmp uge i32 [[TMP23]], [[TMP2]]
1070; CHECK-NEXT:    [[TMP25:%.*]] = add i32 [[TMP21]], 1
1071; CHECK-NEXT:    [[TMP26:%.*]] = select i1 [[TMP24]], i32 [[TMP25]], i32 [[TMP21]]
1072; CHECK-NEXT:    [[TMP27:%.*]] = sub i32 [[TMP23]], [[TMP2]]
1073; CHECK-NEXT:    [[TMP28:%.*]] = select i1 [[TMP24]], i32 [[TMP27]], i32 [[TMP23]]
1074; CHECK-NEXT:    [[TMP29:%.*]] = icmp uge i32 [[TMP28]], [[TMP2]]
1075; CHECK-NEXT:    [[TMP30:%.*]] = add i32 [[TMP26]], 1
1076; CHECK-NEXT:    [[TMP31:%.*]] = select i1 [[TMP29]], i32 [[TMP30]], i32 [[TMP26]]
1077; CHECK-NEXT:    [[TMP32:%.*]] = insertelement <4 x i32> undef, i32 [[TMP31]], i64 0
1078; CHECK-NEXT:    [[TMP33:%.*]] = extractelement <4 x i32> [[X]], i64 1
1079; CHECK-NEXT:    [[TMP34:%.*]] = extractelement <4 x i32> [[Y]], i64 1
1080; CHECK-NEXT:    [[TMP35:%.*]] = uitofp i32 [[TMP34]] to float
1081; CHECK-NEXT:    [[TMP36:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP35]])
1082; CHECK-NEXT:    [[TMP37:%.*]] = fmul fast float [[TMP36]], 0x41EFFFFFC0000000
1083; CHECK-NEXT:    [[TMP38:%.*]] = fptoui float [[TMP37]] to i32
1084; CHECK-NEXT:    [[TMP39:%.*]] = sub i32 0, [[TMP34]]
1085; CHECK-NEXT:    [[TMP40:%.*]] = mul i32 [[TMP39]], [[TMP38]]
1086; CHECK-NEXT:    [[TMP41:%.*]] = zext i32 [[TMP38]] to i64
1087; CHECK-NEXT:    [[TMP42:%.*]] = zext i32 [[TMP40]] to i64
1088; CHECK-NEXT:    [[TMP43:%.*]] = mul i64 [[TMP41]], [[TMP42]]
1089; CHECK-NEXT:    [[TMP44:%.*]] = trunc i64 [[TMP43]] to i32
1090; CHECK-NEXT:    [[TMP45:%.*]] = lshr i64 [[TMP43]], 32
1091; CHECK-NEXT:    [[TMP46:%.*]] = trunc i64 [[TMP45]] to i32
1092; CHECK-NEXT:    [[TMP47:%.*]] = add i32 [[TMP38]], [[TMP46]]
1093; CHECK-NEXT:    [[TMP48:%.*]] = zext i32 [[TMP33]] to i64
1094; CHECK-NEXT:    [[TMP49:%.*]] = zext i32 [[TMP47]] to i64
1095; CHECK-NEXT:    [[TMP50:%.*]] = mul i64 [[TMP48]], [[TMP49]]
1096; CHECK-NEXT:    [[TMP51:%.*]] = trunc i64 [[TMP50]] to i32
1097; CHECK-NEXT:    [[TMP52:%.*]] = lshr i64 [[TMP50]], 32
1098; CHECK-NEXT:    [[TMP53:%.*]] = trunc i64 [[TMP52]] to i32
1099; CHECK-NEXT:    [[TMP54:%.*]] = mul i32 [[TMP53]], [[TMP34]]
1100; CHECK-NEXT:    [[TMP55:%.*]] = sub i32 [[TMP33]], [[TMP54]]
1101; CHECK-NEXT:    [[TMP56:%.*]] = icmp uge i32 [[TMP55]], [[TMP34]]
1102; CHECK-NEXT:    [[TMP57:%.*]] = add i32 [[TMP53]], 1
1103; CHECK-NEXT:    [[TMP58:%.*]] = select i1 [[TMP56]], i32 [[TMP57]], i32 [[TMP53]]
1104; CHECK-NEXT:    [[TMP59:%.*]] = sub i32 [[TMP55]], [[TMP34]]
1105; CHECK-NEXT:    [[TMP60:%.*]] = select i1 [[TMP56]], i32 [[TMP59]], i32 [[TMP55]]
1106; CHECK-NEXT:    [[TMP61:%.*]] = icmp uge i32 [[TMP60]], [[TMP34]]
1107; CHECK-NEXT:    [[TMP62:%.*]] = add i32 [[TMP58]], 1
1108; CHECK-NEXT:    [[TMP63:%.*]] = select i1 [[TMP61]], i32 [[TMP62]], i32 [[TMP58]]
1109; CHECK-NEXT:    [[TMP64:%.*]] = insertelement <4 x i32> [[TMP32]], i32 [[TMP63]], i64 1
1110; CHECK-NEXT:    [[TMP65:%.*]] = extractelement <4 x i32> [[X]], i64 2
1111; CHECK-NEXT:    [[TMP66:%.*]] = extractelement <4 x i32> [[Y]], i64 2
1112; CHECK-NEXT:    [[TMP67:%.*]] = uitofp i32 [[TMP66]] to float
1113; CHECK-NEXT:    [[TMP68:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP67]])
1114; CHECK-NEXT:    [[TMP69:%.*]] = fmul fast float [[TMP68]], 0x41EFFFFFC0000000
1115; CHECK-NEXT:    [[TMP70:%.*]] = fptoui float [[TMP69]] to i32
1116; CHECK-NEXT:    [[TMP71:%.*]] = sub i32 0, [[TMP66]]
1117; CHECK-NEXT:    [[TMP72:%.*]] = mul i32 [[TMP71]], [[TMP70]]
1118; CHECK-NEXT:    [[TMP73:%.*]] = zext i32 [[TMP70]] to i64
1119; CHECK-NEXT:    [[TMP74:%.*]] = zext i32 [[TMP72]] to i64
1120; CHECK-NEXT:    [[TMP75:%.*]] = mul i64 [[TMP73]], [[TMP74]]
1121; CHECK-NEXT:    [[TMP76:%.*]] = trunc i64 [[TMP75]] to i32
1122; CHECK-NEXT:    [[TMP77:%.*]] = lshr i64 [[TMP75]], 32
1123; CHECK-NEXT:    [[TMP78:%.*]] = trunc i64 [[TMP77]] to i32
1124; CHECK-NEXT:    [[TMP79:%.*]] = add i32 [[TMP70]], [[TMP78]]
1125; CHECK-NEXT:    [[TMP80:%.*]] = zext i32 [[TMP65]] to i64
1126; CHECK-NEXT:    [[TMP81:%.*]] = zext i32 [[TMP79]] to i64
1127; CHECK-NEXT:    [[TMP82:%.*]] = mul i64 [[TMP80]], [[TMP81]]
1128; CHECK-NEXT:    [[TMP83:%.*]] = trunc i64 [[TMP82]] to i32
1129; CHECK-NEXT:    [[TMP84:%.*]] = lshr i64 [[TMP82]], 32
1130; CHECK-NEXT:    [[TMP85:%.*]] = trunc i64 [[TMP84]] to i32
1131; CHECK-NEXT:    [[TMP86:%.*]] = mul i32 [[TMP85]], [[TMP66]]
1132; CHECK-NEXT:    [[TMP87:%.*]] = sub i32 [[TMP65]], [[TMP86]]
1133; CHECK-NEXT:    [[TMP88:%.*]] = icmp uge i32 [[TMP87]], [[TMP66]]
1134; CHECK-NEXT:    [[TMP89:%.*]] = add i32 [[TMP85]], 1
1135; CHECK-NEXT:    [[TMP90:%.*]] = select i1 [[TMP88]], i32 [[TMP89]], i32 [[TMP85]]
1136; CHECK-NEXT:    [[TMP91:%.*]] = sub i32 [[TMP87]], [[TMP66]]
1137; CHECK-NEXT:    [[TMP92:%.*]] = select i1 [[TMP88]], i32 [[TMP91]], i32 [[TMP87]]
1138; CHECK-NEXT:    [[TMP93:%.*]] = icmp uge i32 [[TMP92]], [[TMP66]]
1139; CHECK-NEXT:    [[TMP94:%.*]] = add i32 [[TMP90]], 1
1140; CHECK-NEXT:    [[TMP95:%.*]] = select i1 [[TMP93]], i32 [[TMP94]], i32 [[TMP90]]
1141; CHECK-NEXT:    [[TMP96:%.*]] = insertelement <4 x i32> [[TMP64]], i32 [[TMP95]], i64 2
1142; CHECK-NEXT:    [[TMP97:%.*]] = extractelement <4 x i32> [[X]], i64 3
1143; CHECK-NEXT:    [[TMP98:%.*]] = extractelement <4 x i32> [[Y]], i64 3
1144; CHECK-NEXT:    [[TMP99:%.*]] = uitofp i32 [[TMP98]] to float
1145; CHECK-NEXT:    [[TMP100:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP99]])
1146; CHECK-NEXT:    [[TMP101:%.*]] = fmul fast float [[TMP100]], 0x41EFFFFFC0000000
1147; CHECK-NEXT:    [[TMP102:%.*]] = fptoui float [[TMP101]] to i32
1148; CHECK-NEXT:    [[TMP103:%.*]] = sub i32 0, [[TMP98]]
1149; CHECK-NEXT:    [[TMP104:%.*]] = mul i32 [[TMP103]], [[TMP102]]
1150; CHECK-NEXT:    [[TMP105:%.*]] = zext i32 [[TMP102]] to i64
1151; CHECK-NEXT:    [[TMP106:%.*]] = zext i32 [[TMP104]] to i64
1152; CHECK-NEXT:    [[TMP107:%.*]] = mul i64 [[TMP105]], [[TMP106]]
1153; CHECK-NEXT:    [[TMP108:%.*]] = trunc i64 [[TMP107]] to i32
1154; CHECK-NEXT:    [[TMP109:%.*]] = lshr i64 [[TMP107]], 32
1155; CHECK-NEXT:    [[TMP110:%.*]] = trunc i64 [[TMP109]] to i32
1156; CHECK-NEXT:    [[TMP111:%.*]] = add i32 [[TMP102]], [[TMP110]]
1157; CHECK-NEXT:    [[TMP112:%.*]] = zext i32 [[TMP97]] to i64
1158; CHECK-NEXT:    [[TMP113:%.*]] = zext i32 [[TMP111]] to i64
1159; CHECK-NEXT:    [[TMP114:%.*]] = mul i64 [[TMP112]], [[TMP113]]
1160; CHECK-NEXT:    [[TMP115:%.*]] = trunc i64 [[TMP114]] to i32
1161; CHECK-NEXT:    [[TMP116:%.*]] = lshr i64 [[TMP114]], 32
1162; CHECK-NEXT:    [[TMP117:%.*]] = trunc i64 [[TMP116]] to i32
1163; CHECK-NEXT:    [[TMP118:%.*]] = mul i32 [[TMP117]], [[TMP98]]
1164; CHECK-NEXT:    [[TMP119:%.*]] = sub i32 [[TMP97]], [[TMP118]]
1165; CHECK-NEXT:    [[TMP120:%.*]] = icmp uge i32 [[TMP119]], [[TMP98]]
1166; CHECK-NEXT:    [[TMP121:%.*]] = add i32 [[TMP117]], 1
1167; CHECK-NEXT:    [[TMP122:%.*]] = select i1 [[TMP120]], i32 [[TMP121]], i32 [[TMP117]]
1168; CHECK-NEXT:    [[TMP123:%.*]] = sub i32 [[TMP119]], [[TMP98]]
1169; CHECK-NEXT:    [[TMP124:%.*]] = select i1 [[TMP120]], i32 [[TMP123]], i32 [[TMP119]]
1170; CHECK-NEXT:    [[TMP125:%.*]] = icmp uge i32 [[TMP124]], [[TMP98]]
1171; CHECK-NEXT:    [[TMP126:%.*]] = add i32 [[TMP122]], 1
1172; CHECK-NEXT:    [[TMP127:%.*]] = select i1 [[TMP125]], i32 [[TMP126]], i32 [[TMP122]]
1173; CHECK-NEXT:    [[TMP128:%.*]] = insertelement <4 x i32> [[TMP96]], i32 [[TMP127]], i64 3
1174; CHECK-NEXT:    store <4 x i32> [[TMP128]], <4 x i32> addrspace(1)* [[OUT:%.*]], align 16
1175; CHECK-NEXT:    ret void
1176;
1177; GFX6-LABEL: udiv_v4i32:
1178; GFX6:       ; %bb.0:
1179; GFX6-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0xd
1180; GFX6-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x9
1181; GFX6-NEXT:    s_mov_b32 s15, 0xf000
1182; GFX6-NEXT:    s_mov_b32 s14, -1
1183; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1184; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s8
1185; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s9
1186; GFX6-NEXT:    s_sub_i32 s2, 0, s8
1187; GFX6-NEXT:    v_cvt_f32_u32_e32 v4, s10
1188; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
1189; GFX6-NEXT:    v_rcp_iflag_f32_e32 v1, v1
1190; GFX6-NEXT:    v_cvt_f32_u32_e32 v6, s11
1191; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
1192; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
1193; GFX6-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
1194; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
1195; GFX6-NEXT:    v_mul_lo_u32 v2, s2, v0
1196; GFX6-NEXT:    s_sub_i32 s2, 0, s9
1197; GFX6-NEXT:    v_mul_lo_u32 v3, s2, v1
1198; GFX6-NEXT:    s_sub_i32 s2, 0, s10
1199; GFX6-NEXT:    v_mul_hi_u32 v2, v0, v2
1200; GFX6-NEXT:    v_mul_hi_u32 v3, v1, v3
1201; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
1202; GFX6-NEXT:    v_mul_hi_u32 v0, s4, v0
1203; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v3, v1
1204; GFX6-NEXT:    v_mul_hi_u32 v1, s5, v1
1205; GFX6-NEXT:    v_mul_lo_u32 v2, v0, s8
1206; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 1, v0
1207; GFX6-NEXT:    v_mul_lo_u32 v5, v1, s9
1208; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s4, v2
1209; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s8, v2
1210; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v3, s[0:1]
1211; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s8, v2
1212; GFX6-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
1213; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 1, v0
1214; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s8, v2
1215; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v4
1216; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
1217; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s5, v5
1218; GFX6-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
1219; GFX6-NEXT:    v_cvt_u32_f32_e32 v2, v2
1220; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 1, v1
1221; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s9, v3
1222; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, v4, s[0:1]
1223; GFX6-NEXT:    v_mul_lo_u32 v4, s2, v2
1224; GFX6-NEXT:    v_subrev_i32_e32 v5, vcc, s9, v3
1225; GFX6-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s[0:1]
1226; GFX6-NEXT:    v_mul_hi_u32 v4, v2, v4
1227; GFX6-NEXT:    v_add_i32_e32 v5, vcc, 1, v1
1228; GFX6-NEXT:    s_sub_i32 s0, 0, s11
1229; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
1230; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v6
1231; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s9, v3
1232; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
1233; GFX6-NEXT:    v_mul_hi_u32 v2, s6, v2
1234; GFX6-NEXT:    v_mul_f32_e32 v4, 0x4f7ffffe, v4
1235; GFX6-NEXT:    v_cvt_u32_f32_e32 v4, v4
1236; GFX6-NEXT:    v_mul_lo_u32 v3, v2, s10
1237; GFX6-NEXT:    v_add_i32_e32 v6, vcc, 1, v2
1238; GFX6-NEXT:    v_mul_lo_u32 v5, s0, v4
1239; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s6, v3
1240; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s10, v3
1241; GFX6-NEXT:    v_mul_hi_u32 v5, v4, v5
1242; GFX6-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[0:1]
1243; GFX6-NEXT:    v_subrev_i32_e32 v6, vcc, s10, v3
1244; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
1245; GFX6-NEXT:    v_mul_hi_u32 v4, s7, v4
1246; GFX6-NEXT:    v_cndmask_b32_e64 v3, v3, v6, s[0:1]
1247; GFX6-NEXT:    v_add_i32_e32 v5, vcc, 1, v2
1248; GFX6-NEXT:    v_mul_lo_u32 v6, v4, s11
1249; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s10, v3
1250; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
1251; GFX6-NEXT:    v_add_i32_e32 v5, vcc, 1, v4
1252; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s7, v6
1253; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s11, v3
1254; GFX6-NEXT:    v_cndmask_b32_e64 v4, v4, v5, s[0:1]
1255; GFX6-NEXT:    v_subrev_i32_e32 v5, vcc, s11, v3
1256; GFX6-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s[0:1]
1257; GFX6-NEXT:    v_add_i32_e32 v5, vcc, 1, v4
1258; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s11, v3
1259; GFX6-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
1260; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[12:15], 0
1261; GFX6-NEXT:    s_endpgm
1262;
1263; GFX9-LABEL: udiv_v4i32:
1264; GFX9:       ; %bb.0:
1265; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x34
1266; GFX9-NEXT:    v_mov_b32_e32 v4, 0
1267; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1268; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1269; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s8
1270; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s9
1271; GFX9-NEXT:    s_sub_i32 s2, 0, s8
1272; GFX9-NEXT:    s_sub_i32 s3, 0, s9
1273; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
1274; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v1
1275; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, s10
1276; GFX9-NEXT:    v_cvt_f32_u32_e32 v6, s11
1277; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
1278; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
1279; GFX9-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
1280; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
1281; GFX9-NEXT:    v_rcp_iflag_f32_e32 v5, v5
1282; GFX9-NEXT:    v_mul_lo_u32 v2, s2, v0
1283; GFX9-NEXT:    s_sub_i32 s2, 0, s10
1284; GFX9-NEXT:    v_mul_lo_u32 v3, s3, v1
1285; GFX9-NEXT:    v_rcp_iflag_f32_e32 v6, v6
1286; GFX9-NEXT:    v_mul_hi_u32 v2, v0, v2
1287; GFX9-NEXT:    v_mul_hi_u32 v3, v1, v3
1288; GFX9-NEXT:    v_mul_f32_e32 v6, 0x4f7ffffe, v6
1289; GFX9-NEXT:    v_add_u32_e32 v0, v0, v2
1290; GFX9-NEXT:    v_mul_hi_u32 v0, s4, v0
1291; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
1292; GFX9-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v5
1293; GFX9-NEXT:    v_cvt_u32_f32_e32 v2, v2
1294; GFX9-NEXT:    v_mul_lo_u32 v3, v0, s8
1295; GFX9-NEXT:    v_add_u32_e32 v7, 1, v0
1296; GFX9-NEXT:    v_cvt_u32_f32_e32 v6, v6
1297; GFX9-NEXT:    v_mul_hi_u32 v1, s5, v1
1298; GFX9-NEXT:    v_sub_u32_e32 v3, s4, v3
1299; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s8, v3
1300; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v7, vcc
1301; GFX9-NEXT:    v_subrev_u32_e32 v7, s8, v3
1302; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc
1303; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s8, v3
1304; GFX9-NEXT:    v_mul_lo_u32 v3, s2, v2
1305; GFX9-NEXT:    s_sub_i32 s2, 0, s11
1306; GFX9-NEXT:    v_mul_lo_u32 v5, v1, s9
1307; GFX9-NEXT:    v_add_u32_e32 v7, 1, v0
1308; GFX9-NEXT:    v_mul_hi_u32 v3, v2, v3
1309; GFX9-NEXT:    v_add_u32_e32 v8, 1, v1
1310; GFX9-NEXT:    v_sub_u32_e32 v5, s5, v5
1311; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v7, vcc
1312; GFX9-NEXT:    v_add_u32_e32 v2, v2, v3
1313; GFX9-NEXT:    v_mul_lo_u32 v3, s2, v6
1314; GFX9-NEXT:    v_mul_hi_u32 v2, s6, v2
1315; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s9, v5
1316; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v8, vcc
1317; GFX9-NEXT:    v_mul_hi_u32 v3, v6, v3
1318; GFX9-NEXT:    v_mul_lo_u32 v8, v2, s10
1319; GFX9-NEXT:    v_subrev_u32_e32 v7, s9, v5
1320; GFX9-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc
1321; GFX9-NEXT:    v_add_u32_e32 v3, v6, v3
1322; GFX9-NEXT:    v_mul_hi_u32 v3, s7, v3
1323; GFX9-NEXT:    v_add_u32_e32 v7, 1, v1
1324; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s9, v5
1325; GFX9-NEXT:    v_sub_u32_e32 v5, s6, v8
1326; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v7, vcc
1327; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s10, v5
1328; GFX9-NEXT:    v_subrev_u32_e32 v6, s10, v5
1329; GFX9-NEXT:    v_cndmask_b32_e32 v5, v5, v6, vcc
1330; GFX9-NEXT:    v_mul_lo_u32 v6, v3, s11
1331; GFX9-NEXT:    v_add_u32_e32 v7, 1, v2
1332; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v7, vcc
1333; GFX9-NEXT:    v_add_u32_e32 v7, 1, v2
1334; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s10, v5
1335; GFX9-NEXT:    v_sub_u32_e32 v5, s7, v6
1336; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v7, vcc
1337; GFX9-NEXT:    v_add_u32_e32 v6, 1, v3
1338; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s11, v5
1339; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
1340; GFX9-NEXT:    v_subrev_u32_e32 v6, s11, v5
1341; GFX9-NEXT:    v_cndmask_b32_e32 v5, v5, v6, vcc
1342; GFX9-NEXT:    v_add_u32_e32 v6, 1, v3
1343; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s11, v5
1344; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
1345; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
1346; GFX9-NEXT:    s_endpgm
1347  %r = udiv <4 x i32> %x, %y
1348  store <4 x i32> %r, <4 x i32> addrspace(1)* %out
1349  ret void
1350}
1351
; urem_v4i32: unsigned remainder of two <4 x i32> kernel arguments, with the
; result stored through %out. The assertions below are autogenerated (see the
; NOTE on the first line of this file); regenerate them with that script rather
; than editing them by hand.
;   * opt assertions (CHECK prefix): each of the four lanes is extracted and
;     expanded into a scalar sequence -- a uitofp / amdgcn.rcp / fmul / fptoui
;     estimate, 64-bit multiplies whose high 32 bits are taken via lshr + trunc,
;     then two icmp uge / sub / select correction steps -- and the lane result
;     is re-inserted into the output vector.
;   * llc assertions for tahiti (GFX6 prefix) and gfx900 (GFX9 prefix): the
;     final ISA emitted for the same function.
1352define amdgpu_kernel void @urem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x, <4 x i32> %y) {
1353; CHECK-LABEL: @urem_v4i32(
1354; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i32> [[X:%.*]], i64 0
1355; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i32> [[Y:%.*]], i64 0
1356; CHECK-NEXT:    [[TMP3:%.*]] = uitofp i32 [[TMP2]] to float
1357; CHECK-NEXT:    [[TMP4:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP3]])
1358; CHECK-NEXT:    [[TMP5:%.*]] = fmul fast float [[TMP4]], 0x41EFFFFFC0000000
1359; CHECK-NEXT:    [[TMP6:%.*]] = fptoui float [[TMP5]] to i32
1360; CHECK-NEXT:    [[TMP7:%.*]] = sub i32 0, [[TMP2]]
1361; CHECK-NEXT:    [[TMP8:%.*]] = mul i32 [[TMP7]], [[TMP6]]
1362; CHECK-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP6]] to i64
1363; CHECK-NEXT:    [[TMP10:%.*]] = zext i32 [[TMP8]] to i64
1364; CHECK-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP9]], [[TMP10]]
1365; CHECK-NEXT:    [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32
1366; CHECK-NEXT:    [[TMP13:%.*]] = lshr i64 [[TMP11]], 32
1367; CHECK-NEXT:    [[TMP14:%.*]] = trunc i64 [[TMP13]] to i32
1368; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[TMP6]], [[TMP14]]
1369; CHECK-NEXT:    [[TMP16:%.*]] = zext i32 [[TMP1]] to i64
1370; CHECK-NEXT:    [[TMP17:%.*]] = zext i32 [[TMP15]] to i64
1371; CHECK-NEXT:    [[TMP18:%.*]] = mul i64 [[TMP16]], [[TMP17]]
1372; CHECK-NEXT:    [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
1373; CHECK-NEXT:    [[TMP20:%.*]] = lshr i64 [[TMP18]], 32
1374; CHECK-NEXT:    [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32
1375; CHECK-NEXT:    [[TMP22:%.*]] = mul i32 [[TMP21]], [[TMP2]]
1376; CHECK-NEXT:    [[TMP23:%.*]] = sub i32 [[TMP1]], [[TMP22]]
1377; CHECK-NEXT:    [[TMP24:%.*]] = icmp uge i32 [[TMP23]], [[TMP2]]
1378; CHECK-NEXT:    [[TMP25:%.*]] = sub i32 [[TMP23]], [[TMP2]]
1379; CHECK-NEXT:    [[TMP26:%.*]] = select i1 [[TMP24]], i32 [[TMP25]], i32 [[TMP23]]
1380; CHECK-NEXT:    [[TMP27:%.*]] = icmp uge i32 [[TMP26]], [[TMP2]]
1381; CHECK-NEXT:    [[TMP28:%.*]] = sub i32 [[TMP26]], [[TMP2]]
1382; CHECK-NEXT:    [[TMP29:%.*]] = select i1 [[TMP27]], i32 [[TMP28]], i32 [[TMP26]]
1383; CHECK-NEXT:    [[TMP30:%.*]] = insertelement <4 x i32> undef, i32 [[TMP29]], i64 0
1384; CHECK-NEXT:    [[TMP31:%.*]] = extractelement <4 x i32> [[X]], i64 1
1385; CHECK-NEXT:    [[TMP32:%.*]] = extractelement <4 x i32> [[Y]], i64 1
1386; CHECK-NEXT:    [[TMP33:%.*]] = uitofp i32 [[TMP32]] to float
1387; CHECK-NEXT:    [[TMP34:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP33]])
1388; CHECK-NEXT:    [[TMP35:%.*]] = fmul fast float [[TMP34]], 0x41EFFFFFC0000000
1389; CHECK-NEXT:    [[TMP36:%.*]] = fptoui float [[TMP35]] to i32
1390; CHECK-NEXT:    [[TMP37:%.*]] = sub i32 0, [[TMP32]]
1391; CHECK-NEXT:    [[TMP38:%.*]] = mul i32 [[TMP37]], [[TMP36]]
1392; CHECK-NEXT:    [[TMP39:%.*]] = zext i32 [[TMP36]] to i64
1393; CHECK-NEXT:    [[TMP40:%.*]] = zext i32 [[TMP38]] to i64
1394; CHECK-NEXT:    [[TMP41:%.*]] = mul i64 [[TMP39]], [[TMP40]]
1395; CHECK-NEXT:    [[TMP42:%.*]] = trunc i64 [[TMP41]] to i32
1396; CHECK-NEXT:    [[TMP43:%.*]] = lshr i64 [[TMP41]], 32
1397; CHECK-NEXT:    [[TMP44:%.*]] = trunc i64 [[TMP43]] to i32
1398; CHECK-NEXT:    [[TMP45:%.*]] = add i32 [[TMP36]], [[TMP44]]
1399; CHECK-NEXT:    [[TMP46:%.*]] = zext i32 [[TMP31]] to i64
1400; CHECK-NEXT:    [[TMP47:%.*]] = zext i32 [[TMP45]] to i64
1401; CHECK-NEXT:    [[TMP48:%.*]] = mul i64 [[TMP46]], [[TMP47]]
1402; CHECK-NEXT:    [[TMP49:%.*]] = trunc i64 [[TMP48]] to i32
1403; CHECK-NEXT:    [[TMP50:%.*]] = lshr i64 [[TMP48]], 32
1404; CHECK-NEXT:    [[TMP51:%.*]] = trunc i64 [[TMP50]] to i32
1405; CHECK-NEXT:    [[TMP52:%.*]] = mul i32 [[TMP51]], [[TMP32]]
1406; CHECK-NEXT:    [[TMP53:%.*]] = sub i32 [[TMP31]], [[TMP52]]
1407; CHECK-NEXT:    [[TMP54:%.*]] = icmp uge i32 [[TMP53]], [[TMP32]]
1408; CHECK-NEXT:    [[TMP55:%.*]] = sub i32 [[TMP53]], [[TMP32]]
1409; CHECK-NEXT:    [[TMP56:%.*]] = select i1 [[TMP54]], i32 [[TMP55]], i32 [[TMP53]]
1410; CHECK-NEXT:    [[TMP57:%.*]] = icmp uge i32 [[TMP56]], [[TMP32]]
1411; CHECK-NEXT:    [[TMP58:%.*]] = sub i32 [[TMP56]], [[TMP32]]
1412; CHECK-NEXT:    [[TMP59:%.*]] = select i1 [[TMP57]], i32 [[TMP58]], i32 [[TMP56]]
1413; CHECK-NEXT:    [[TMP60:%.*]] = insertelement <4 x i32> [[TMP30]], i32 [[TMP59]], i64 1
1414; CHECK-NEXT:    [[TMP61:%.*]] = extractelement <4 x i32> [[X]], i64 2
1415; CHECK-NEXT:    [[TMP62:%.*]] = extractelement <4 x i32> [[Y]], i64 2
1416; CHECK-NEXT:    [[TMP63:%.*]] = uitofp i32 [[TMP62]] to float
1417; CHECK-NEXT:    [[TMP64:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP63]])
1418; CHECK-NEXT:    [[TMP65:%.*]] = fmul fast float [[TMP64]], 0x41EFFFFFC0000000
1419; CHECK-NEXT:    [[TMP66:%.*]] = fptoui float [[TMP65]] to i32
1420; CHECK-NEXT:    [[TMP67:%.*]] = sub i32 0, [[TMP62]]
1421; CHECK-NEXT:    [[TMP68:%.*]] = mul i32 [[TMP67]], [[TMP66]]
1422; CHECK-NEXT:    [[TMP69:%.*]] = zext i32 [[TMP66]] to i64
1423; CHECK-NEXT:    [[TMP70:%.*]] = zext i32 [[TMP68]] to i64
1424; CHECK-NEXT:    [[TMP71:%.*]] = mul i64 [[TMP69]], [[TMP70]]
1425; CHECK-NEXT:    [[TMP72:%.*]] = trunc i64 [[TMP71]] to i32
1426; CHECK-NEXT:    [[TMP73:%.*]] = lshr i64 [[TMP71]], 32
1427; CHECK-NEXT:    [[TMP74:%.*]] = trunc i64 [[TMP73]] to i32
1428; CHECK-NEXT:    [[TMP75:%.*]] = add i32 [[TMP66]], [[TMP74]]
1429; CHECK-NEXT:    [[TMP76:%.*]] = zext i32 [[TMP61]] to i64
1430; CHECK-NEXT:    [[TMP77:%.*]] = zext i32 [[TMP75]] to i64
1431; CHECK-NEXT:    [[TMP78:%.*]] = mul i64 [[TMP76]], [[TMP77]]
1432; CHECK-NEXT:    [[TMP79:%.*]] = trunc i64 [[TMP78]] to i32
1433; CHECK-NEXT:    [[TMP80:%.*]] = lshr i64 [[TMP78]], 32
1434; CHECK-NEXT:    [[TMP81:%.*]] = trunc i64 [[TMP80]] to i32
1435; CHECK-NEXT:    [[TMP82:%.*]] = mul i32 [[TMP81]], [[TMP62]]
1436; CHECK-NEXT:    [[TMP83:%.*]] = sub i32 [[TMP61]], [[TMP82]]
1437; CHECK-NEXT:    [[TMP84:%.*]] = icmp uge i32 [[TMP83]], [[TMP62]]
1438; CHECK-NEXT:    [[TMP85:%.*]] = sub i32 [[TMP83]], [[TMP62]]
1439; CHECK-NEXT:    [[TMP86:%.*]] = select i1 [[TMP84]], i32 [[TMP85]], i32 [[TMP83]]
1440; CHECK-NEXT:    [[TMP87:%.*]] = icmp uge i32 [[TMP86]], [[TMP62]]
1441; CHECK-NEXT:    [[TMP88:%.*]] = sub i32 [[TMP86]], [[TMP62]]
1442; CHECK-NEXT:    [[TMP89:%.*]] = select i1 [[TMP87]], i32 [[TMP88]], i32 [[TMP86]]
1443; CHECK-NEXT:    [[TMP90:%.*]] = insertelement <4 x i32> [[TMP60]], i32 [[TMP89]], i64 2
1444; CHECK-NEXT:    [[TMP91:%.*]] = extractelement <4 x i32> [[X]], i64 3
1445; CHECK-NEXT:    [[TMP92:%.*]] = extractelement <4 x i32> [[Y]], i64 3
1446; CHECK-NEXT:    [[TMP93:%.*]] = uitofp i32 [[TMP92]] to float
1447; CHECK-NEXT:    [[TMP94:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP93]])
1448; CHECK-NEXT:    [[TMP95:%.*]] = fmul fast float [[TMP94]], 0x41EFFFFFC0000000
1449; CHECK-NEXT:    [[TMP96:%.*]] = fptoui float [[TMP95]] to i32
1450; CHECK-NEXT:    [[TMP97:%.*]] = sub i32 0, [[TMP92]]
1451; CHECK-NEXT:    [[TMP98:%.*]] = mul i32 [[TMP97]], [[TMP96]]
1452; CHECK-NEXT:    [[TMP99:%.*]] = zext i32 [[TMP96]] to i64
1453; CHECK-NEXT:    [[TMP100:%.*]] = zext i32 [[TMP98]] to i64
1454; CHECK-NEXT:    [[TMP101:%.*]] = mul i64 [[TMP99]], [[TMP100]]
1455; CHECK-NEXT:    [[TMP102:%.*]] = trunc i64 [[TMP101]] to i32
1456; CHECK-NEXT:    [[TMP103:%.*]] = lshr i64 [[TMP101]], 32
1457; CHECK-NEXT:    [[TMP104:%.*]] = trunc i64 [[TMP103]] to i32
1458; CHECK-NEXT:    [[TMP105:%.*]] = add i32 [[TMP96]], [[TMP104]]
1459; CHECK-NEXT:    [[TMP106:%.*]] = zext i32 [[TMP91]] to i64
1460; CHECK-NEXT:    [[TMP107:%.*]] = zext i32 [[TMP105]] to i64
1461; CHECK-NEXT:    [[TMP108:%.*]] = mul i64 [[TMP106]], [[TMP107]]
1462; CHECK-NEXT:    [[TMP109:%.*]] = trunc i64 [[TMP108]] to i32
1463; CHECK-NEXT:    [[TMP110:%.*]] = lshr i64 [[TMP108]], 32
1464; CHECK-NEXT:    [[TMP111:%.*]] = trunc i64 [[TMP110]] to i32
1465; CHECK-NEXT:    [[TMP112:%.*]] = mul i32 [[TMP111]], [[TMP92]]
1466; CHECK-NEXT:    [[TMP113:%.*]] = sub i32 [[TMP91]], [[TMP112]]
1467; CHECK-NEXT:    [[TMP114:%.*]] = icmp uge i32 [[TMP113]], [[TMP92]]
1468; CHECK-NEXT:    [[TMP115:%.*]] = sub i32 [[TMP113]], [[TMP92]]
1469; CHECK-NEXT:    [[TMP116:%.*]] = select i1 [[TMP114]], i32 [[TMP115]], i32 [[TMP113]]
1470; CHECK-NEXT:    [[TMP117:%.*]] = icmp uge i32 [[TMP116]], [[TMP92]]
1471; CHECK-NEXT:    [[TMP118:%.*]] = sub i32 [[TMP116]], [[TMP92]]
1472; CHECK-NEXT:    [[TMP119:%.*]] = select i1 [[TMP117]], i32 [[TMP118]], i32 [[TMP116]]
1473; CHECK-NEXT:    [[TMP120:%.*]] = insertelement <4 x i32> [[TMP90]], i32 [[TMP119]], i64 3
1474; CHECK-NEXT:    store <4 x i32> [[TMP120]], <4 x i32> addrspace(1)* [[OUT:%.*]], align 16
1475; CHECK-NEXT:    ret void
1476;
1477; GFX6-LABEL: urem_v4i32:
1478; GFX6:       ; %bb.0:
1479; GFX6-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0xd
1480; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1481; GFX6-NEXT:    s_mov_b32 s3, 0xf000
1482; GFX6-NEXT:    s_mov_b32 s2, -1
1483; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1484; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s8
1485; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s9
1486; GFX6-NEXT:    s_sub_i32 s12, 0, s8
1487; GFX6-NEXT:    s_sub_i32 s13, 0, s9
1488; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
1489; GFX6-NEXT:    v_rcp_iflag_f32_e32 v1, v1
1490; GFX6-NEXT:    v_cvt_f32_u32_e32 v3, s10
1491; GFX6-NEXT:    v_cvt_f32_u32_e32 v5, s11
1492; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
1493; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
1494; GFX6-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
1495; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
1496; GFX6-NEXT:    v_rcp_iflag_f32_e32 v3, v3
1497; GFX6-NEXT:    v_mul_lo_u32 v2, s12, v0
1498; GFX6-NEXT:    v_mul_lo_u32 v4, s13, v1
1499; GFX6-NEXT:    v_mul_hi_u32 v2, v0, v2
1500; GFX6-NEXT:    v_mul_hi_u32 v4, v1, v4
1501; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
1502; GFX6-NEXT:    v_mul_hi_u32 v0, s4, v0
1503; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v4, v1
1504; GFX6-NEXT:    v_mul_hi_u32 v1, s5, v1
1505; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s8
1506; GFX6-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v3
1507; GFX6-NEXT:    v_cvt_u32_f32_e32 v2, v2
1508; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s9
1509; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
1510; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s8, v0
1511; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s8, v0
1512; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
1513; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s8, v0
1514; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s8, v0
1515; GFX6-NEXT:    s_sub_i32 s4, 0, s10
1516; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
1517; GFX6-NEXT:    v_mul_lo_u32 v3, s4, v2
1518; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s5, v1
1519; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s9, v1
1520; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s9, v1
1521; GFX6-NEXT:    v_mul_hi_u32 v3, v2, v3
1522; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
1523; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v5
1524; GFX6-NEXT:    s_sub_i32 s4, 0, s11
1525; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
1526; GFX6-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v4
1527; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v3
1528; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s9, v1
1529; GFX6-NEXT:    v_mul_hi_u32 v2, s6, v2
1530; GFX6-NEXT:    v_mul_lo_u32 v5, s4, v3
1531; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s9, v1
1532; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
1533; GFX6-NEXT:    v_mul_lo_u32 v2, v2, s10
1534; GFX6-NEXT:    v_mul_hi_u32 v4, v3, v5
1535; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s6, v2
1536; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
1537; GFX6-NEXT:    v_mul_hi_u32 v3, s7, v3
1538; GFX6-NEXT:    v_subrev_i32_e32 v5, vcc, s10, v2
1539; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s10, v2
1540; GFX6-NEXT:    v_mul_lo_u32 v3, v3, s11
1541; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
1542; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s10, v2
1543; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s10, v2
1544; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
1545; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s7, v3
1546; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s11, v3
1547; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s11, v3
1548; GFX6-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
1549; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s11, v3
1550; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s11, v3
1551; GFX6-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
1552; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
1553; GFX6-NEXT:    s_endpgm
1554;
1555; GFX9-LABEL: urem_v4i32:
1556; GFX9:       ; %bb.0:
1557; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x34
1558; GFX9-NEXT:    v_mov_b32_e32 v4, 0
1559; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1560; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1561; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s8
1562; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s9
1563; GFX9-NEXT:    s_sub_i32 s2, 0, s8
1564; GFX9-NEXT:    v_cvt_f32_u32_e32 v2, s10
1565; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
1566; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v1
1567; GFX9-NEXT:    s_sub_i32 s3, 0, s9
1568; GFX9-NEXT:    v_rcp_iflag_f32_e32 v2, v2
1569; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
1570; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
1571; GFX9-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
1572; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
1573; GFX9-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
1574; GFX9-NEXT:    v_mul_lo_u32 v3, s2, v0
1575; GFX9-NEXT:    v_cvt_u32_f32_e32 v2, v2
1576; GFX9-NEXT:    v_mul_lo_u32 v5, s3, v1
1577; GFX9-NEXT:    s_sub_i32 s2, 0, s10
1578; GFX9-NEXT:    v_mul_hi_u32 v3, v0, v3
1579; GFX9-NEXT:    v_mul_hi_u32 v5, v1, v5
1580; GFX9-NEXT:    v_add_u32_e32 v0, v0, v3
1581; GFX9-NEXT:    v_cvt_f32_u32_e32 v3, s11
1582; GFX9-NEXT:    v_add_u32_e32 v1, v1, v5
1583; GFX9-NEXT:    v_mul_lo_u32 v5, s2, v2
1584; GFX9-NEXT:    s_sub_i32 s2, 0, s11
1585; GFX9-NEXT:    v_rcp_iflag_f32_e32 v3, v3
1586; GFX9-NEXT:    v_mul_hi_u32 v0, s4, v0
1587; GFX9-NEXT:    v_mul_hi_u32 v5, v2, v5
1588; GFX9-NEXT:    v_mul_hi_u32 v1, s5, v1
1589; GFX9-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v3
1590; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v3
1591; GFX9-NEXT:    v_add_u32_e32 v2, v2, v5
1592; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s8
1593; GFX9-NEXT:    v_mul_hi_u32 v2, s6, v2
1594; GFX9-NEXT:    v_mul_lo_u32 v5, s2, v3
1595; GFX9-NEXT:    v_mul_lo_u32 v1, v1, s9
1596; GFX9-NEXT:    v_sub_u32_e32 v0, s4, v0
1597; GFX9-NEXT:    v_subrev_u32_e32 v6, s8, v0
1598; GFX9-NEXT:    v_mul_hi_u32 v5, v3, v5
1599; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s8, v0
1600; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc
1601; GFX9-NEXT:    v_mul_lo_u32 v2, v2, s10
1602; GFX9-NEXT:    v_add_u32_e32 v3, v3, v5
1603; GFX9-NEXT:    v_mul_hi_u32 v3, s7, v3
1604; GFX9-NEXT:    v_sub_u32_e32 v1, s5, v1
1605; GFX9-NEXT:    v_subrev_u32_e32 v6, s8, v0
1606; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s8, v0
1607; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc
1608; GFX9-NEXT:    v_subrev_u32_e32 v6, s9, v1
1609; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s9, v1
1610; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
1611; GFX9-NEXT:    v_mul_lo_u32 v3, v3, s11
1612; GFX9-NEXT:    v_subrev_u32_e32 v6, s9, v1
1613; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s9, v1
1614; GFX9-NEXT:    v_sub_u32_e32 v2, s6, v2
1615; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
1616; GFX9-NEXT:    v_subrev_u32_e32 v5, s10, v2
1617; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s10, v2
1618; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
1619; GFX9-NEXT:    v_subrev_u32_e32 v5, s10, v2
1620; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s10, v2
1621; GFX9-NEXT:    v_sub_u32_e32 v3, s7, v3
1622; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
1623; GFX9-NEXT:    v_subrev_u32_e32 v5, s11, v3
1624; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s11, v3
1625; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
1626; GFX9-NEXT:    v_subrev_u32_e32 v5, s11, v3
1627; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s11, v3
1628; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
1629; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
1630; GFX9-NEXT:    s_endpgm
; IR under test: a single vector urem whose result is stored to %out.
1631  %r = urem <4 x i32> %x, %y
1632  store <4 x i32> %r, <4 x i32> addrspace(1)* %out
1633  ret void
1634}
1635
1636define amdgpu_kernel void @sdiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x, <4 x i32> %y) {
1637; CHECK-LABEL: @sdiv_v4i32(
1638; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i32> [[X:%.*]], i64 0
1639; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i32> [[Y:%.*]], i64 0
1640; CHECK-NEXT:    [[TMP3:%.*]] = ashr i32 [[TMP1]], 31
1641; CHECK-NEXT:    [[TMP4:%.*]] = ashr i32 [[TMP2]], 31
1642; CHECK-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]]
1643; CHECK-NEXT:    [[TMP6:%.*]] = add i32 [[TMP1]], [[TMP3]]
1644; CHECK-NEXT:    [[TMP7:%.*]] = add i32 [[TMP2]], [[TMP4]]
1645; CHECK-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP3]]
1646; CHECK-NEXT:    [[TMP9:%.*]] = xor i32 [[TMP7]], [[TMP4]]
1647; CHECK-NEXT:    [[TMP10:%.*]] = uitofp i32 [[TMP9]] to float
1648; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP10]])
1649; CHECK-NEXT:    [[TMP12:%.*]] = fmul fast float [[TMP11]], 0x41EFFFFFC0000000
1650; CHECK-NEXT:    [[TMP13:%.*]] = fptoui float [[TMP12]] to i32
1651; CHECK-NEXT:    [[TMP14:%.*]] = sub i32 0, [[TMP9]]
1652; CHECK-NEXT:    [[TMP15:%.*]] = mul i32 [[TMP14]], [[TMP13]]
1653; CHECK-NEXT:    [[TMP16:%.*]] = zext i32 [[TMP13]] to i64
1654; CHECK-NEXT:    [[TMP17:%.*]] = zext i32 [[TMP15]] to i64
1655; CHECK-NEXT:    [[TMP18:%.*]] = mul i64 [[TMP16]], [[TMP17]]
1656; CHECK-NEXT:    [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
1657; CHECK-NEXT:    [[TMP20:%.*]] = lshr i64 [[TMP18]], 32
1658; CHECK-NEXT:    [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32
1659; CHECK-NEXT:    [[TMP22:%.*]] = add i32 [[TMP13]], [[TMP21]]
1660; CHECK-NEXT:    [[TMP23:%.*]] = zext i32 [[TMP8]] to i64
1661; CHECK-NEXT:    [[TMP24:%.*]] = zext i32 [[TMP22]] to i64
1662; CHECK-NEXT:    [[TMP25:%.*]] = mul i64 [[TMP23]], [[TMP24]]
1663; CHECK-NEXT:    [[TMP26:%.*]] = trunc i64 [[TMP25]] to i32
1664; CHECK-NEXT:    [[TMP27:%.*]] = lshr i64 [[TMP25]], 32
1665; CHECK-NEXT:    [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32
1666; CHECK-NEXT:    [[TMP29:%.*]] = mul i32 [[TMP28]], [[TMP9]]
1667; CHECK-NEXT:    [[TMP30:%.*]] = sub i32 [[TMP8]], [[TMP29]]
1668; CHECK-NEXT:    [[TMP31:%.*]] = icmp uge i32 [[TMP30]], [[TMP9]]
1669; CHECK-NEXT:    [[TMP32:%.*]] = add i32 [[TMP28]], 1
1670; CHECK-NEXT:    [[TMP33:%.*]] = select i1 [[TMP31]], i32 [[TMP32]], i32 [[TMP28]]
1671; CHECK-NEXT:    [[TMP34:%.*]] = sub i32 [[TMP30]], [[TMP9]]
1672; CHECK-NEXT:    [[TMP35:%.*]] = select i1 [[TMP31]], i32 [[TMP34]], i32 [[TMP30]]
1673; CHECK-NEXT:    [[TMP36:%.*]] = icmp uge i32 [[TMP35]], [[TMP9]]
1674; CHECK-NEXT:    [[TMP37:%.*]] = add i32 [[TMP33]], 1
1675; CHECK-NEXT:    [[TMP38:%.*]] = select i1 [[TMP36]], i32 [[TMP37]], i32 [[TMP33]]
1676; CHECK-NEXT:    [[TMP39:%.*]] = xor i32 [[TMP38]], [[TMP5]]
1677; CHECK-NEXT:    [[TMP40:%.*]] = sub i32 [[TMP39]], [[TMP5]]
1678; CHECK-NEXT:    [[TMP41:%.*]] = insertelement <4 x i32> undef, i32 [[TMP40]], i64 0
1679; CHECK-NEXT:    [[TMP42:%.*]] = extractelement <4 x i32> [[X]], i64 1
1680; CHECK-NEXT:    [[TMP43:%.*]] = extractelement <4 x i32> [[Y]], i64 1
1681; CHECK-NEXT:    [[TMP44:%.*]] = ashr i32 [[TMP42]], 31
1682; CHECK-NEXT:    [[TMP45:%.*]] = ashr i32 [[TMP43]], 31
1683; CHECK-NEXT:    [[TMP46:%.*]] = xor i32 [[TMP44]], [[TMP45]]
1684; CHECK-NEXT:    [[TMP47:%.*]] = add i32 [[TMP42]], [[TMP44]]
1685; CHECK-NEXT:    [[TMP48:%.*]] = add i32 [[TMP43]], [[TMP45]]
1686; CHECK-NEXT:    [[TMP49:%.*]] = xor i32 [[TMP47]], [[TMP44]]
1687; CHECK-NEXT:    [[TMP50:%.*]] = xor i32 [[TMP48]], [[TMP45]]
1688; CHECK-NEXT:    [[TMP51:%.*]] = uitofp i32 [[TMP50]] to float
1689; CHECK-NEXT:    [[TMP52:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP51]])
1690; CHECK-NEXT:    [[TMP53:%.*]] = fmul fast float [[TMP52]], 0x41EFFFFFC0000000
1691; CHECK-NEXT:    [[TMP54:%.*]] = fptoui float [[TMP53]] to i32
1692; CHECK-NEXT:    [[TMP55:%.*]] = sub i32 0, [[TMP50]]
1693; CHECK-NEXT:    [[TMP56:%.*]] = mul i32 [[TMP55]], [[TMP54]]
1694; CHECK-NEXT:    [[TMP57:%.*]] = zext i32 [[TMP54]] to i64
1695; CHECK-NEXT:    [[TMP58:%.*]] = zext i32 [[TMP56]] to i64
1696; CHECK-NEXT:    [[TMP59:%.*]] = mul i64 [[TMP57]], [[TMP58]]
1697; CHECK-NEXT:    [[TMP60:%.*]] = trunc i64 [[TMP59]] to i32
1698; CHECK-NEXT:    [[TMP61:%.*]] = lshr i64 [[TMP59]], 32
1699; CHECK-NEXT:    [[TMP62:%.*]] = trunc i64 [[TMP61]] to i32
1700; CHECK-NEXT:    [[TMP63:%.*]] = add i32 [[TMP54]], [[TMP62]]
1701; CHECK-NEXT:    [[TMP64:%.*]] = zext i32 [[TMP49]] to i64
1702; CHECK-NEXT:    [[TMP65:%.*]] = zext i32 [[TMP63]] to i64
1703; CHECK-NEXT:    [[TMP66:%.*]] = mul i64 [[TMP64]], [[TMP65]]
1704; CHECK-NEXT:    [[TMP67:%.*]] = trunc i64 [[TMP66]] to i32
1705; CHECK-NEXT:    [[TMP68:%.*]] = lshr i64 [[TMP66]], 32
1706; CHECK-NEXT:    [[TMP69:%.*]] = trunc i64 [[TMP68]] to i32
1707; CHECK-NEXT:    [[TMP70:%.*]] = mul i32 [[TMP69]], [[TMP50]]
1708; CHECK-NEXT:    [[TMP71:%.*]] = sub i32 [[TMP49]], [[TMP70]]
1709; CHECK-NEXT:    [[TMP72:%.*]] = icmp uge i32 [[TMP71]], [[TMP50]]
1710; CHECK-NEXT:    [[TMP73:%.*]] = add i32 [[TMP69]], 1
1711; CHECK-NEXT:    [[TMP74:%.*]] = select i1 [[TMP72]], i32 [[TMP73]], i32 [[TMP69]]
1712; CHECK-NEXT:    [[TMP75:%.*]] = sub i32 [[TMP71]], [[TMP50]]
1713; CHECK-NEXT:    [[TMP76:%.*]] = select i1 [[TMP72]], i32 [[TMP75]], i32 [[TMP71]]
1714; CHECK-NEXT:    [[TMP77:%.*]] = icmp uge i32 [[TMP76]], [[TMP50]]
1715; CHECK-NEXT:    [[TMP78:%.*]] = add i32 [[TMP74]], 1
1716; CHECK-NEXT:    [[TMP79:%.*]] = select i1 [[TMP77]], i32 [[TMP78]], i32 [[TMP74]]
1717; CHECK-NEXT:    [[TMP80:%.*]] = xor i32 [[TMP79]], [[TMP46]]
1718; CHECK-NEXT:    [[TMP81:%.*]] = sub i32 [[TMP80]], [[TMP46]]
1719; CHECK-NEXT:    [[TMP82:%.*]] = insertelement <4 x i32> [[TMP41]], i32 [[TMP81]], i64 1
1720; CHECK-NEXT:    [[TMP83:%.*]] = extractelement <4 x i32> [[X]], i64 2
1721; CHECK-NEXT:    [[TMP84:%.*]] = extractelement <4 x i32> [[Y]], i64 2
1722; CHECK-NEXT:    [[TMP85:%.*]] = ashr i32 [[TMP83]], 31
1723; CHECK-NEXT:    [[TMP86:%.*]] = ashr i32 [[TMP84]], 31
1724; CHECK-NEXT:    [[TMP87:%.*]] = xor i32 [[TMP85]], [[TMP86]]
1725; CHECK-NEXT:    [[TMP88:%.*]] = add i32 [[TMP83]], [[TMP85]]
1726; CHECK-NEXT:    [[TMP89:%.*]] = add i32 [[TMP84]], [[TMP86]]
1727; CHECK-NEXT:    [[TMP90:%.*]] = xor i32 [[TMP88]], [[TMP85]]
1728; CHECK-NEXT:    [[TMP91:%.*]] = xor i32 [[TMP89]], [[TMP86]]
1729; CHECK-NEXT:    [[TMP92:%.*]] = uitofp i32 [[TMP91]] to float
1730; CHECK-NEXT:    [[TMP93:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP92]])
1731; CHECK-NEXT:    [[TMP94:%.*]] = fmul fast float [[TMP93]], 0x41EFFFFFC0000000
1732; CHECK-NEXT:    [[TMP95:%.*]] = fptoui float [[TMP94]] to i32
1733; CHECK-NEXT:    [[TMP96:%.*]] = sub i32 0, [[TMP91]]
1734; CHECK-NEXT:    [[TMP97:%.*]] = mul i32 [[TMP96]], [[TMP95]]
1735; CHECK-NEXT:    [[TMP98:%.*]] = zext i32 [[TMP95]] to i64
1736; CHECK-NEXT:    [[TMP99:%.*]] = zext i32 [[TMP97]] to i64
1737; CHECK-NEXT:    [[TMP100:%.*]] = mul i64 [[TMP98]], [[TMP99]]
1738; CHECK-NEXT:    [[TMP101:%.*]] = trunc i64 [[TMP100]] to i32
1739; CHECK-NEXT:    [[TMP102:%.*]] = lshr i64 [[TMP100]], 32
1740; CHECK-NEXT:    [[TMP103:%.*]] = trunc i64 [[TMP102]] to i32
1741; CHECK-NEXT:    [[TMP104:%.*]] = add i32 [[TMP95]], [[TMP103]]
1742; CHECK-NEXT:    [[TMP105:%.*]] = zext i32 [[TMP90]] to i64
1743; CHECK-NEXT:    [[TMP106:%.*]] = zext i32 [[TMP104]] to i64
1744; CHECK-NEXT:    [[TMP107:%.*]] = mul i64 [[TMP105]], [[TMP106]]
1745; CHECK-NEXT:    [[TMP108:%.*]] = trunc i64 [[TMP107]] to i32
1746; CHECK-NEXT:    [[TMP109:%.*]] = lshr i64 [[TMP107]], 32
1747; CHECK-NEXT:    [[TMP110:%.*]] = trunc i64 [[TMP109]] to i32
1748; CHECK-NEXT:    [[TMP111:%.*]] = mul i32 [[TMP110]], [[TMP91]]
1749; CHECK-NEXT:    [[TMP112:%.*]] = sub i32 [[TMP90]], [[TMP111]]
1750; CHECK-NEXT:    [[TMP113:%.*]] = icmp uge i32 [[TMP112]], [[TMP91]]
1751; CHECK-NEXT:    [[TMP114:%.*]] = add i32 [[TMP110]], 1
1752; CHECK-NEXT:    [[TMP115:%.*]] = select i1 [[TMP113]], i32 [[TMP114]], i32 [[TMP110]]
1753; CHECK-NEXT:    [[TMP116:%.*]] = sub i32 [[TMP112]], [[TMP91]]
1754; CHECK-NEXT:    [[TMP117:%.*]] = select i1 [[TMP113]], i32 [[TMP116]], i32 [[TMP112]]
1755; CHECK-NEXT:    [[TMP118:%.*]] = icmp uge i32 [[TMP117]], [[TMP91]]
1756; CHECK-NEXT:    [[TMP119:%.*]] = add i32 [[TMP115]], 1
1757; CHECK-NEXT:    [[TMP120:%.*]] = select i1 [[TMP118]], i32 [[TMP119]], i32 [[TMP115]]
1758; CHECK-NEXT:    [[TMP121:%.*]] = xor i32 [[TMP120]], [[TMP87]]
1759; CHECK-NEXT:    [[TMP122:%.*]] = sub i32 [[TMP121]], [[TMP87]]
1760; CHECK-NEXT:    [[TMP123:%.*]] = insertelement <4 x i32> [[TMP82]], i32 [[TMP122]], i64 2
1761; CHECK-NEXT:    [[TMP124:%.*]] = extractelement <4 x i32> [[X]], i64 3
1762; CHECK-NEXT:    [[TMP125:%.*]] = extractelement <4 x i32> [[Y]], i64 3
1763; CHECK-NEXT:    [[TMP126:%.*]] = ashr i32 [[TMP124]], 31
1764; CHECK-NEXT:    [[TMP127:%.*]] = ashr i32 [[TMP125]], 31
1765; CHECK-NEXT:    [[TMP128:%.*]] = xor i32 [[TMP126]], [[TMP127]]
1766; CHECK-NEXT:    [[TMP129:%.*]] = add i32 [[TMP124]], [[TMP126]]
1767; CHECK-NEXT:    [[TMP130:%.*]] = add i32 [[TMP125]], [[TMP127]]
1768; CHECK-NEXT:    [[TMP131:%.*]] = xor i32 [[TMP129]], [[TMP126]]
1769; CHECK-NEXT:    [[TMP132:%.*]] = xor i32 [[TMP130]], [[TMP127]]
1770; CHECK-NEXT:    [[TMP133:%.*]] = uitofp i32 [[TMP132]] to float
1771; CHECK-NEXT:    [[TMP134:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP133]])
1772; CHECK-NEXT:    [[TMP135:%.*]] = fmul fast float [[TMP134]], 0x41EFFFFFC0000000
1773; CHECK-NEXT:    [[TMP136:%.*]] = fptoui float [[TMP135]] to i32
1774; CHECK-NEXT:    [[TMP137:%.*]] = sub i32 0, [[TMP132]]
1775; CHECK-NEXT:    [[TMP138:%.*]] = mul i32 [[TMP137]], [[TMP136]]
1776; CHECK-NEXT:    [[TMP139:%.*]] = zext i32 [[TMP136]] to i64
1777; CHECK-NEXT:    [[TMP140:%.*]] = zext i32 [[TMP138]] to i64
1778; CHECK-NEXT:    [[TMP141:%.*]] = mul i64 [[TMP139]], [[TMP140]]
1779; CHECK-NEXT:    [[TMP142:%.*]] = trunc i64 [[TMP141]] to i32
1780; CHECK-NEXT:    [[TMP143:%.*]] = lshr i64 [[TMP141]], 32
1781; CHECK-NEXT:    [[TMP144:%.*]] = trunc i64 [[TMP143]] to i32
1782; CHECK-NEXT:    [[TMP145:%.*]] = add i32 [[TMP136]], [[TMP144]]
1783; CHECK-NEXT:    [[TMP146:%.*]] = zext i32 [[TMP131]] to i64
1784; CHECK-NEXT:    [[TMP147:%.*]] = zext i32 [[TMP145]] to i64
1785; CHECK-NEXT:    [[TMP148:%.*]] = mul i64 [[TMP146]], [[TMP147]]
1786; CHECK-NEXT:    [[TMP149:%.*]] = trunc i64 [[TMP148]] to i32
1787; CHECK-NEXT:    [[TMP150:%.*]] = lshr i64 [[TMP148]], 32
1788; CHECK-NEXT:    [[TMP151:%.*]] = trunc i64 [[TMP150]] to i32
1789; CHECK-NEXT:    [[TMP152:%.*]] = mul i32 [[TMP151]], [[TMP132]]
1790; CHECK-NEXT:    [[TMP153:%.*]] = sub i32 [[TMP131]], [[TMP152]]
1791; CHECK-NEXT:    [[TMP154:%.*]] = icmp uge i32 [[TMP153]], [[TMP132]]
1792; CHECK-NEXT:    [[TMP155:%.*]] = add i32 [[TMP151]], 1
1793; CHECK-NEXT:    [[TMP156:%.*]] = select i1 [[TMP154]], i32 [[TMP155]], i32 [[TMP151]]
1794; CHECK-NEXT:    [[TMP157:%.*]] = sub i32 [[TMP153]], [[TMP132]]
1795; CHECK-NEXT:    [[TMP158:%.*]] = select i1 [[TMP154]], i32 [[TMP157]], i32 [[TMP153]]
1796; CHECK-NEXT:    [[TMP159:%.*]] = icmp uge i32 [[TMP158]], [[TMP132]]
1797; CHECK-NEXT:    [[TMP160:%.*]] = add i32 [[TMP156]], 1
1798; CHECK-NEXT:    [[TMP161:%.*]] = select i1 [[TMP159]], i32 [[TMP160]], i32 [[TMP156]]
1799; CHECK-NEXT:    [[TMP162:%.*]] = xor i32 [[TMP161]], [[TMP128]]
1800; CHECK-NEXT:    [[TMP163:%.*]] = sub i32 [[TMP162]], [[TMP128]]
1801; CHECK-NEXT:    [[TMP164:%.*]] = insertelement <4 x i32> [[TMP123]], i32 [[TMP163]], i64 3
1802; CHECK-NEXT:    store <4 x i32> [[TMP164]], <4 x i32> addrspace(1)* [[OUT:%.*]], align 16
1803; CHECK-NEXT:    ret void
1804;
1805; GFX6-LABEL: sdiv_v4i32:
1806; GFX6:       ; %bb.0:
1807; GFX6-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0xd
1808; GFX6-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x9
1809; GFX6-NEXT:    s_mov_b32 s15, 0xf000
1810; GFX6-NEXT:    s_mov_b32 s14, -1
1811; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1812; GFX6-NEXT:    s_ashr_i32 s2, s8, 31
1813; GFX6-NEXT:    s_add_i32 s3, s8, s2
1814; GFX6-NEXT:    s_xor_b32 s3, s3, s2
1815; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s3
1816; GFX6-NEXT:    s_ashr_i32 s8, s9, 31
1817; GFX6-NEXT:    s_add_i32 s0, s9, s8
1818; GFX6-NEXT:    s_xor_b32 s9, s0, s8
1819; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
1820; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s9
1821; GFX6-NEXT:    s_sub_i32 s1, 0, s3
1822; GFX6-NEXT:    s_ashr_i32 s0, s4, 31
1823; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
1824; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
1825; GFX6-NEXT:    v_rcp_iflag_f32_e32 v1, v1
1826; GFX6-NEXT:    s_xor_b32 s2, s0, s2
1827; GFX6-NEXT:    v_mul_lo_u32 v2, s1, v0
1828; GFX6-NEXT:    s_add_i32 s1, s4, s0
1829; GFX6-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
1830; GFX6-NEXT:    s_xor_b32 s1, s1, s0
1831; GFX6-NEXT:    v_mul_hi_u32 v2, v0, v2
1832; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
1833; GFX6-NEXT:    s_sub_i32 s0, 0, s9
1834; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
1835; GFX6-NEXT:    v_mul_hi_u32 v0, s1, v0
1836; GFX6-NEXT:    v_mul_lo_u32 v2, s0, v1
1837; GFX6-NEXT:    v_mul_lo_u32 v3, v0, s3
1838; GFX6-NEXT:    v_mul_hi_u32 v2, v1, v2
1839; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 1, v0
1840; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s1, v3
1841; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s3, v3
1842; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[0:1]
1843; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s3, v3
1844; GFX6-NEXT:    v_cndmask_b32_e64 v3, v3, v4, s[0:1]
1845; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 1, v0
1846; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
1847; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s3, v3
1848; GFX6-NEXT:    s_ashr_i32 s0, s5, 31
1849; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
1850; GFX6-NEXT:    s_add_i32 s1, s5, s0
1851; GFX6-NEXT:    v_xor_b32_e32 v0, s2, v0
1852; GFX6-NEXT:    s_ashr_i32 s3, s10, 31
1853; GFX6-NEXT:    s_xor_b32 s1, s1, s0
1854; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s2, v0
1855; GFX6-NEXT:    s_xor_b32 s2, s0, s8
1856; GFX6-NEXT:    s_add_i32 s0, s10, s3
1857; GFX6-NEXT:    s_xor_b32 s4, s0, s3
1858; GFX6-NEXT:    v_cvt_f32_u32_e32 v3, s4
1859; GFX6-NEXT:    v_mul_hi_u32 v1, s1, v1
1860; GFX6-NEXT:    v_rcp_iflag_f32_e32 v3, v3
1861; GFX6-NEXT:    v_mul_lo_u32 v2, v1, s9
1862; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 1, v1
1863; GFX6-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v3
1864; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s1, v2
1865; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v3
1866; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s9, v2
1867; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, v4, s[0:1]
1868; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s9, v2
1869; GFX6-NEXT:    v_cndmask_b32_e64 v2, v2, v4, s[0:1]
1870; GFX6-NEXT:    s_sub_i32 s0, 0, s4
1871; GFX6-NEXT:    v_mul_lo_u32 v5, s0, v3
1872; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 1, v1
1873; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s9, v2
1874; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
1875; GFX6-NEXT:    v_mul_hi_u32 v2, v3, v5
1876; GFX6-NEXT:    v_xor_b32_e32 v1, s2, v1
1877; GFX6-NEXT:    v_subrev_i32_e32 v1, vcc, s2, v1
1878; GFX6-NEXT:    s_ashr_i32 s2, s11, 31
1879; GFX6-NEXT:    s_ashr_i32 s0, s6, 31
1880; GFX6-NEXT:    s_add_i32 s5, s11, s2
1881; GFX6-NEXT:    s_add_i32 s1, s6, s0
1882; GFX6-NEXT:    s_xor_b32 s5, s5, s2
1883; GFX6-NEXT:    s_xor_b32 s1, s1, s0
1884; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
1885; GFX6-NEXT:    v_cvt_f32_u32_e32 v4, s5
1886; GFX6-NEXT:    v_mul_hi_u32 v2, s1, v2
1887; GFX6-NEXT:    s_xor_b32 s3, s0, s3
1888; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v4
1889; GFX6-NEXT:    v_mul_lo_u32 v3, v2, s4
1890; GFX6-NEXT:    v_add_i32_e32 v5, vcc, 1, v2
1891; GFX6-NEXT:    v_mul_f32_e32 v4, 0x4f7ffffe, v4
1892; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s1, v3
1893; GFX6-NEXT:    v_cvt_u32_f32_e32 v4, v4
1894; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s4, v3
1895; GFX6-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s[0:1]
1896; GFX6-NEXT:    v_subrev_i32_e32 v5, vcc, s4, v3
1897; GFX6-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s[0:1]
1898; GFX6-NEXT:    s_sub_i32 s0, 0, s5
1899; GFX6-NEXT:    v_mul_lo_u32 v5, s0, v4
1900; GFX6-NEXT:    s_ashr_i32 s0, s7, 31
1901; GFX6-NEXT:    s_add_i32 s1, s7, s0
1902; GFX6-NEXT:    s_xor_b32 s1, s1, s0
1903; GFX6-NEXT:    v_mul_hi_u32 v5, v4, v5
1904; GFX6-NEXT:    v_add_i32_e32 v6, vcc, 1, v2
1905; GFX6-NEXT:    s_xor_b32 s2, s0, s2
1906; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
1907; GFX6-NEXT:    v_mul_hi_u32 v4, s1, v4
1908; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s4, v3
1909; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
1910; GFX6-NEXT:    v_xor_b32_e32 v2, s3, v2
1911; GFX6-NEXT:    v_mul_lo_u32 v3, v4, s5
1912; GFX6-NEXT:    v_add_i32_e32 v5, vcc, 1, v4
1913; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s3, v2
1914; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s1, v3
1915; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s5, v3
1916; GFX6-NEXT:    v_cndmask_b32_e64 v4, v4, v5, s[0:1]
1917; GFX6-NEXT:    v_subrev_i32_e32 v5, vcc, s5, v3
1918; GFX6-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s[0:1]
1919; GFX6-NEXT:    v_add_i32_e32 v5, vcc, 1, v4
1920; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s5, v3
1921; GFX6-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
1922; GFX6-NEXT:    v_xor_b32_e32 v3, s2, v3
1923; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s2, v3
1924; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[12:15], 0
1925; GFX6-NEXT:    s_endpgm
1926;
1927; GFX9-LABEL: sdiv_v4i32:
1928; GFX9:       ; %bb.0:
1929; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x34
1930; GFX9-NEXT:    v_mov_b32_e32 v4, 0
1931; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1932; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1933; GFX9-NEXT:    s_ashr_i32 s2, s8, 31
1934; GFX9-NEXT:    s_add_i32 s3, s8, s2
1935; GFX9-NEXT:    s_xor_b32 s3, s3, s2
1936; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s3
1937; GFX9-NEXT:    s_ashr_i32 s12, s9, 31
1938; GFX9-NEXT:    s_add_i32 s9, s9, s12
1939; GFX9-NEXT:    s_xor_b32 s9, s9, s12
1940; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
1941; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s9
1942; GFX9-NEXT:    s_sub_i32 s14, 0, s3
1943; GFX9-NEXT:    s_ashr_i32 s8, s4, 31
1944; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
1945; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
1946; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v1
1947; GFX9-NEXT:    s_add_i32 s4, s4, s8
1948; GFX9-NEXT:    s_xor_b32 s4, s4, s8
1949; GFX9-NEXT:    v_mul_lo_u32 v2, s14, v0
1950; GFX9-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
1951; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
1952; GFX9-NEXT:    s_sub_i32 s14, 0, s9
1953; GFX9-NEXT:    v_mul_hi_u32 v2, v0, v2
1954; GFX9-NEXT:    s_ashr_i32 s13, s5, 31
1955; GFX9-NEXT:    v_mul_lo_u32 v3, s14, v1
1956; GFX9-NEXT:    s_add_i32 s5, s5, s13
1957; GFX9-NEXT:    v_add_u32_e32 v0, v0, v2
1958; GFX9-NEXT:    v_mul_hi_u32 v0, s4, v0
1959; GFX9-NEXT:    v_mul_hi_u32 v2, v1, v3
1960; GFX9-NEXT:    s_xor_b32 s5, s5, s13
1961; GFX9-NEXT:    s_xor_b32 s2, s8, s2
1962; GFX9-NEXT:    v_mul_lo_u32 v3, v0, s3
1963; GFX9-NEXT:    v_add_u32_e32 v1, v1, v2
1964; GFX9-NEXT:    v_add_u32_e32 v2, 1, v0
1965; GFX9-NEXT:    v_mul_hi_u32 v1, s5, v1
1966; GFX9-NEXT:    v_sub_u32_e32 v3, s4, v3
1967; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v3
1968; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
1969; GFX9-NEXT:    v_subrev_u32_e32 v2, s3, v3
1970; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
1971; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v2
1972; GFX9-NEXT:    s_ashr_i32 s3, s10, 31
1973; GFX9-NEXT:    s_add_i32 s4, s10, s3
1974; GFX9-NEXT:    v_add_u32_e32 v3, 1, v0
1975; GFX9-NEXT:    s_xor_b32 s4, s4, s3
1976; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
1977; GFX9-NEXT:    v_cvt_f32_u32_e32 v3, s4
1978; GFX9-NEXT:    v_mul_lo_u32 v2, v1, s9
1979; GFX9-NEXT:    v_add_u32_e32 v5, 1, v1
1980; GFX9-NEXT:    s_ashr_i32 s8, s11, 31
1981; GFX9-NEXT:    v_rcp_iflag_f32_e32 v3, v3
1982; GFX9-NEXT:    v_sub_u32_e32 v2, s5, v2
1983; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s9, v2
1984; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
1985; GFX9-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v3
1986; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v3
1987; GFX9-NEXT:    v_subrev_u32_e32 v5, s9, v2
1988; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
1989; GFX9-NEXT:    s_sub_i32 s5, 0, s4
1990; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s9, v2
1991; GFX9-NEXT:    v_mul_lo_u32 v2, s5, v3
1992; GFX9-NEXT:    s_add_i32 s9, s11, s8
1993; GFX9-NEXT:    v_add_u32_e32 v5, 1, v1
1994; GFX9-NEXT:    s_xor_b32 s9, s9, s8
1995; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
1996; GFX9-NEXT:    v_mul_hi_u32 v2, v3, v2
1997; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, s9
1998; GFX9-NEXT:    s_ashr_i32 s5, s6, 31
1999; GFX9-NEXT:    s_add_i32 s6, s6, s5
2000; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
2001; GFX9-NEXT:    v_rcp_iflag_f32_e32 v3, v5
2002; GFX9-NEXT:    s_xor_b32 s6, s6, s5
2003; GFX9-NEXT:    v_mul_hi_u32 v2, s6, v2
2004; GFX9-NEXT:    v_xor_b32_e32 v0, s2, v0
2005; GFX9-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v3
2006; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v3
2007; GFX9-NEXT:    v_subrev_u32_e32 v0, s2, v0
2008; GFX9-NEXT:    s_xor_b32 s2, s13, s12
2009; GFX9-NEXT:    v_mul_lo_u32 v5, v2, s4
2010; GFX9-NEXT:    v_xor_b32_e32 v1, s2, v1
2011; GFX9-NEXT:    v_subrev_u32_e32 v1, s2, v1
2012; GFX9-NEXT:    s_xor_b32 s2, s5, s3
2013; GFX9-NEXT:    s_sub_i32 s3, 0, s9
2014; GFX9-NEXT:    v_mul_lo_u32 v7, s3, v3
2015; GFX9-NEXT:    v_sub_u32_e32 v5, s6, v5
2016; GFX9-NEXT:    v_add_u32_e32 v6, 1, v2
2017; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s4, v5
2018; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
2019; GFX9-NEXT:    v_subrev_u32_e32 v6, s4, v5
2020; GFX9-NEXT:    v_cndmask_b32_e32 v5, v5, v6, vcc
2021; GFX9-NEXT:    v_mul_hi_u32 v6, v3, v7
2022; GFX9-NEXT:    s_ashr_i32 s3, s7, 31
2023; GFX9-NEXT:    s_add_i32 s5, s7, s3
2024; GFX9-NEXT:    s_xor_b32 s5, s5, s3
2025; GFX9-NEXT:    v_add_u32_e32 v3, v3, v6
2026; GFX9-NEXT:    v_mul_hi_u32 v3, s5, v3
2027; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s4, v5
2028; GFX9-NEXT:    v_add_u32_e32 v6, 1, v2
2029; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
2030; GFX9-NEXT:    v_mul_lo_u32 v5, v3, s9
2031; GFX9-NEXT:    v_add_u32_e32 v6, 1, v3
2032; GFX9-NEXT:    v_xor_b32_e32 v2, s2, v2
2033; GFX9-NEXT:    v_subrev_u32_e32 v2, s2, v2
2034; GFX9-NEXT:    v_sub_u32_e32 v5, s5, v5
2035; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s9, v5
2036; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
2037; GFX9-NEXT:    v_subrev_u32_e32 v6, s9, v5
2038; GFX9-NEXT:    v_cndmask_b32_e32 v5, v5, v6, vcc
2039; GFX9-NEXT:    v_add_u32_e32 v6, 1, v3
2040; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s9, v5
2041; GFX9-NEXT:    s_xor_b32 s2, s3, s8
2042; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
2043; GFX9-NEXT:    v_xor_b32_e32 v3, s2, v3
2044; GFX9-NEXT:    v_subrev_u32_e32 v3, s2, v3
2045; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
2046; GFX9-NEXT:    s_endpgm
2047  %r = sdiv <4 x i32> %x, %y
2048  store <4 x i32> %r, <4 x i32> addrspace(1)* %out
2049  ret void
2050}
2051
; srem_v4i32 - signed remainder of two <4 x i32> kernel arguments; the result
; vector is stored to %out.
;
; The assertions below are autogenerated (see the NOTE on the first line of the
; file); regenerate them with the update script rather than editing by hand.
;
; What the opt-level assertions (plain CHECK prefix) pin down:
;   * the vector srem is scalarized: each of the four lanes is extracted,
;     processed as i32 and re-inserted with insertelement;
;   * per lane, "ashr 31" produces a sign mask for each operand, and add+xor
;     with that mask feeds the unsigned core;
;   * the unsigned core starts from a float reciprocal of the divisor
;     (uitofp, llvm.amdgcn.rcp.f32, fmul, fptoui), refines it with one
;     mul / mul-high step, forms quotient*divisor and subtracts it from the
;     numerator, then applies two compare-and-subtract corrections;
;   * the final xor/sub uses the sign mask of the numerator lane (the X
;     element), not the divisor.
;
; The GFX6 and GFX9 prefixes hold the expected llc output for the two
; llc RUN lines at the top of the file.
2052define amdgpu_kernel void @srem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x, <4 x i32> %y) {
2053; CHECK-LABEL: @srem_v4i32(
2054; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i32> [[X:%.*]], i64 0
2055; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i32> [[Y:%.*]], i64 0
2056; CHECK-NEXT:    [[TMP3:%.*]] = ashr i32 [[TMP1]], 31
2057; CHECK-NEXT:    [[TMP4:%.*]] = ashr i32 [[TMP2]], 31
2058; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[TMP1]], [[TMP3]]
2059; CHECK-NEXT:    [[TMP6:%.*]] = add i32 [[TMP2]], [[TMP4]]
2060; CHECK-NEXT:    [[TMP7:%.*]] = xor i32 [[TMP5]], [[TMP3]]
2061; CHECK-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP4]]
2062; CHECK-NEXT:    [[TMP9:%.*]] = uitofp i32 [[TMP8]] to float
2063; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]])
2064; CHECK-NEXT:    [[TMP11:%.*]] = fmul fast float [[TMP10]], 0x41EFFFFFC0000000
2065; CHECK-NEXT:    [[TMP12:%.*]] = fptoui float [[TMP11]] to i32
2066; CHECK-NEXT:    [[TMP13:%.*]] = sub i32 0, [[TMP8]]
2067; CHECK-NEXT:    [[TMP14:%.*]] = mul i32 [[TMP13]], [[TMP12]]
2068; CHECK-NEXT:    [[TMP15:%.*]] = zext i32 [[TMP12]] to i64
2069; CHECK-NEXT:    [[TMP16:%.*]] = zext i32 [[TMP14]] to i64
2070; CHECK-NEXT:    [[TMP17:%.*]] = mul i64 [[TMP15]], [[TMP16]]
2071; CHECK-NEXT:    [[TMP18:%.*]] = trunc i64 [[TMP17]] to i32
2072; CHECK-NEXT:    [[TMP19:%.*]] = lshr i64 [[TMP17]], 32
2073; CHECK-NEXT:    [[TMP20:%.*]] = trunc i64 [[TMP19]] to i32
2074; CHECK-NEXT:    [[TMP21:%.*]] = add i32 [[TMP12]], [[TMP20]]
2075; CHECK-NEXT:    [[TMP22:%.*]] = zext i32 [[TMP7]] to i64
2076; CHECK-NEXT:    [[TMP23:%.*]] = zext i32 [[TMP21]] to i64
2077; CHECK-NEXT:    [[TMP24:%.*]] = mul i64 [[TMP22]], [[TMP23]]
2078; CHECK-NEXT:    [[TMP25:%.*]] = trunc i64 [[TMP24]] to i32
2079; CHECK-NEXT:    [[TMP26:%.*]] = lshr i64 [[TMP24]], 32
2080; CHECK-NEXT:    [[TMP27:%.*]] = trunc i64 [[TMP26]] to i32
2081; CHECK-NEXT:    [[TMP28:%.*]] = mul i32 [[TMP27]], [[TMP8]]
2082; CHECK-NEXT:    [[TMP29:%.*]] = sub i32 [[TMP7]], [[TMP28]]
2083; CHECK-NEXT:    [[TMP30:%.*]] = icmp uge i32 [[TMP29]], [[TMP8]]
2084; CHECK-NEXT:    [[TMP31:%.*]] = sub i32 [[TMP29]], [[TMP8]]
2085; CHECK-NEXT:    [[TMP32:%.*]] = select i1 [[TMP30]], i32 [[TMP31]], i32 [[TMP29]]
2086; CHECK-NEXT:    [[TMP33:%.*]] = icmp uge i32 [[TMP32]], [[TMP8]]
2087; CHECK-NEXT:    [[TMP34:%.*]] = sub i32 [[TMP32]], [[TMP8]]
2088; CHECK-NEXT:    [[TMP35:%.*]] = select i1 [[TMP33]], i32 [[TMP34]], i32 [[TMP32]]
2089; CHECK-NEXT:    [[TMP36:%.*]] = xor i32 [[TMP35]], [[TMP3]]
2090; CHECK-NEXT:    [[TMP37:%.*]] = sub i32 [[TMP36]], [[TMP3]]
2091; CHECK-NEXT:    [[TMP38:%.*]] = insertelement <4 x i32> undef, i32 [[TMP37]], i64 0
2092; CHECK-NEXT:    [[TMP39:%.*]] = extractelement <4 x i32> [[X]], i64 1
2093; CHECK-NEXT:    [[TMP40:%.*]] = extractelement <4 x i32> [[Y]], i64 1
2094; CHECK-NEXT:    [[TMP41:%.*]] = ashr i32 [[TMP39]], 31
2095; CHECK-NEXT:    [[TMP42:%.*]] = ashr i32 [[TMP40]], 31
2096; CHECK-NEXT:    [[TMP43:%.*]] = add i32 [[TMP39]], [[TMP41]]
2097; CHECK-NEXT:    [[TMP44:%.*]] = add i32 [[TMP40]], [[TMP42]]
2098; CHECK-NEXT:    [[TMP45:%.*]] = xor i32 [[TMP43]], [[TMP41]]
2099; CHECK-NEXT:    [[TMP46:%.*]] = xor i32 [[TMP44]], [[TMP42]]
2100; CHECK-NEXT:    [[TMP47:%.*]] = uitofp i32 [[TMP46]] to float
2101; CHECK-NEXT:    [[TMP48:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP47]])
2102; CHECK-NEXT:    [[TMP49:%.*]] = fmul fast float [[TMP48]], 0x41EFFFFFC0000000
2103; CHECK-NEXT:    [[TMP50:%.*]] = fptoui float [[TMP49]] to i32
2104; CHECK-NEXT:    [[TMP51:%.*]] = sub i32 0, [[TMP46]]
2105; CHECK-NEXT:    [[TMP52:%.*]] = mul i32 [[TMP51]], [[TMP50]]
2106; CHECK-NEXT:    [[TMP53:%.*]] = zext i32 [[TMP50]] to i64
2107; CHECK-NEXT:    [[TMP54:%.*]] = zext i32 [[TMP52]] to i64
2108; CHECK-NEXT:    [[TMP55:%.*]] = mul i64 [[TMP53]], [[TMP54]]
2109; CHECK-NEXT:    [[TMP56:%.*]] = trunc i64 [[TMP55]] to i32
2110; CHECK-NEXT:    [[TMP57:%.*]] = lshr i64 [[TMP55]], 32
2111; CHECK-NEXT:    [[TMP58:%.*]] = trunc i64 [[TMP57]] to i32
2112; CHECK-NEXT:    [[TMP59:%.*]] = add i32 [[TMP50]], [[TMP58]]
2113; CHECK-NEXT:    [[TMP60:%.*]] = zext i32 [[TMP45]] to i64
2114; CHECK-NEXT:    [[TMP61:%.*]] = zext i32 [[TMP59]] to i64
2115; CHECK-NEXT:    [[TMP62:%.*]] = mul i64 [[TMP60]], [[TMP61]]
2116; CHECK-NEXT:    [[TMP63:%.*]] = trunc i64 [[TMP62]] to i32
2117; CHECK-NEXT:    [[TMP64:%.*]] = lshr i64 [[TMP62]], 32
2118; CHECK-NEXT:    [[TMP65:%.*]] = trunc i64 [[TMP64]] to i32
2119; CHECK-NEXT:    [[TMP66:%.*]] = mul i32 [[TMP65]], [[TMP46]]
2120; CHECK-NEXT:    [[TMP67:%.*]] = sub i32 [[TMP45]], [[TMP66]]
2121; CHECK-NEXT:    [[TMP68:%.*]] = icmp uge i32 [[TMP67]], [[TMP46]]
2122; CHECK-NEXT:    [[TMP69:%.*]] = sub i32 [[TMP67]], [[TMP46]]
2123; CHECK-NEXT:    [[TMP70:%.*]] = select i1 [[TMP68]], i32 [[TMP69]], i32 [[TMP67]]
2124; CHECK-NEXT:    [[TMP71:%.*]] = icmp uge i32 [[TMP70]], [[TMP46]]
2125; CHECK-NEXT:    [[TMP72:%.*]] = sub i32 [[TMP70]], [[TMP46]]
2126; CHECK-NEXT:    [[TMP73:%.*]] = select i1 [[TMP71]], i32 [[TMP72]], i32 [[TMP70]]
2127; CHECK-NEXT:    [[TMP74:%.*]] = xor i32 [[TMP73]], [[TMP41]]
2128; CHECK-NEXT:    [[TMP75:%.*]] = sub i32 [[TMP74]], [[TMP41]]
2129; CHECK-NEXT:    [[TMP76:%.*]] = insertelement <4 x i32> [[TMP38]], i32 [[TMP75]], i64 1
2130; CHECK-NEXT:    [[TMP77:%.*]] = extractelement <4 x i32> [[X]], i64 2
2131; CHECK-NEXT:    [[TMP78:%.*]] = extractelement <4 x i32> [[Y]], i64 2
2132; CHECK-NEXT:    [[TMP79:%.*]] = ashr i32 [[TMP77]], 31
2133; CHECK-NEXT:    [[TMP80:%.*]] = ashr i32 [[TMP78]], 31
2134; CHECK-NEXT:    [[TMP81:%.*]] = add i32 [[TMP77]], [[TMP79]]
2135; CHECK-NEXT:    [[TMP82:%.*]] = add i32 [[TMP78]], [[TMP80]]
2136; CHECK-NEXT:    [[TMP83:%.*]] = xor i32 [[TMP81]], [[TMP79]]
2137; CHECK-NEXT:    [[TMP84:%.*]] = xor i32 [[TMP82]], [[TMP80]]
2138; CHECK-NEXT:    [[TMP85:%.*]] = uitofp i32 [[TMP84]] to float
2139; CHECK-NEXT:    [[TMP86:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP85]])
2140; CHECK-NEXT:    [[TMP87:%.*]] = fmul fast float [[TMP86]], 0x41EFFFFFC0000000
2141; CHECK-NEXT:    [[TMP88:%.*]] = fptoui float [[TMP87]] to i32
2142; CHECK-NEXT:    [[TMP89:%.*]] = sub i32 0, [[TMP84]]
2143; CHECK-NEXT:    [[TMP90:%.*]] = mul i32 [[TMP89]], [[TMP88]]
2144; CHECK-NEXT:    [[TMP91:%.*]] = zext i32 [[TMP88]] to i64
2145; CHECK-NEXT:    [[TMP92:%.*]] = zext i32 [[TMP90]] to i64
2146; CHECK-NEXT:    [[TMP93:%.*]] = mul i64 [[TMP91]], [[TMP92]]
2147; CHECK-NEXT:    [[TMP94:%.*]] = trunc i64 [[TMP93]] to i32
2148; CHECK-NEXT:    [[TMP95:%.*]] = lshr i64 [[TMP93]], 32
2149; CHECK-NEXT:    [[TMP96:%.*]] = trunc i64 [[TMP95]] to i32
2150; CHECK-NEXT:    [[TMP97:%.*]] = add i32 [[TMP88]], [[TMP96]]
2151; CHECK-NEXT:    [[TMP98:%.*]] = zext i32 [[TMP83]] to i64
2152; CHECK-NEXT:    [[TMP99:%.*]] = zext i32 [[TMP97]] to i64
2153; CHECK-NEXT:    [[TMP100:%.*]] = mul i64 [[TMP98]], [[TMP99]]
2154; CHECK-NEXT:    [[TMP101:%.*]] = trunc i64 [[TMP100]] to i32
2155; CHECK-NEXT:    [[TMP102:%.*]] = lshr i64 [[TMP100]], 32
2156; CHECK-NEXT:    [[TMP103:%.*]] = trunc i64 [[TMP102]] to i32
2157; CHECK-NEXT:    [[TMP104:%.*]] = mul i32 [[TMP103]], [[TMP84]]
2158; CHECK-NEXT:    [[TMP105:%.*]] = sub i32 [[TMP83]], [[TMP104]]
2159; CHECK-NEXT:    [[TMP106:%.*]] = icmp uge i32 [[TMP105]], [[TMP84]]
2160; CHECK-NEXT:    [[TMP107:%.*]] = sub i32 [[TMP105]], [[TMP84]]
2161; CHECK-NEXT:    [[TMP108:%.*]] = select i1 [[TMP106]], i32 [[TMP107]], i32 [[TMP105]]
2162; CHECK-NEXT:    [[TMP109:%.*]] = icmp uge i32 [[TMP108]], [[TMP84]]
2163; CHECK-NEXT:    [[TMP110:%.*]] = sub i32 [[TMP108]], [[TMP84]]
2164; CHECK-NEXT:    [[TMP111:%.*]] = select i1 [[TMP109]], i32 [[TMP110]], i32 [[TMP108]]
2165; CHECK-NEXT:    [[TMP112:%.*]] = xor i32 [[TMP111]], [[TMP79]]
2166; CHECK-NEXT:    [[TMP113:%.*]] = sub i32 [[TMP112]], [[TMP79]]
2167; CHECK-NEXT:    [[TMP114:%.*]] = insertelement <4 x i32> [[TMP76]], i32 [[TMP113]], i64 2
2168; CHECK-NEXT:    [[TMP115:%.*]] = extractelement <4 x i32> [[X]], i64 3
2169; CHECK-NEXT:    [[TMP116:%.*]] = extractelement <4 x i32> [[Y]], i64 3
2170; CHECK-NEXT:    [[TMP117:%.*]] = ashr i32 [[TMP115]], 31
2171; CHECK-NEXT:    [[TMP118:%.*]] = ashr i32 [[TMP116]], 31
2172; CHECK-NEXT:    [[TMP119:%.*]] = add i32 [[TMP115]], [[TMP117]]
2173; CHECK-NEXT:    [[TMP120:%.*]] = add i32 [[TMP116]], [[TMP118]]
2174; CHECK-NEXT:    [[TMP121:%.*]] = xor i32 [[TMP119]], [[TMP117]]
2175; CHECK-NEXT:    [[TMP122:%.*]] = xor i32 [[TMP120]], [[TMP118]]
2176; CHECK-NEXT:    [[TMP123:%.*]] = uitofp i32 [[TMP122]] to float
2177; CHECK-NEXT:    [[TMP124:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP123]])
2178; CHECK-NEXT:    [[TMP125:%.*]] = fmul fast float [[TMP124]], 0x41EFFFFFC0000000
2179; CHECK-NEXT:    [[TMP126:%.*]] = fptoui float [[TMP125]] to i32
2180; CHECK-NEXT:    [[TMP127:%.*]] = sub i32 0, [[TMP122]]
2181; CHECK-NEXT:    [[TMP128:%.*]] = mul i32 [[TMP127]], [[TMP126]]
2182; CHECK-NEXT:    [[TMP129:%.*]] = zext i32 [[TMP126]] to i64
2183; CHECK-NEXT:    [[TMP130:%.*]] = zext i32 [[TMP128]] to i64
2184; CHECK-NEXT:    [[TMP131:%.*]] = mul i64 [[TMP129]], [[TMP130]]
2185; CHECK-NEXT:    [[TMP132:%.*]] = trunc i64 [[TMP131]] to i32
2186; CHECK-NEXT:    [[TMP133:%.*]] = lshr i64 [[TMP131]], 32
2187; CHECK-NEXT:    [[TMP134:%.*]] = trunc i64 [[TMP133]] to i32
2188; CHECK-NEXT:    [[TMP135:%.*]] = add i32 [[TMP126]], [[TMP134]]
2189; CHECK-NEXT:    [[TMP136:%.*]] = zext i32 [[TMP121]] to i64
2190; CHECK-NEXT:    [[TMP137:%.*]] = zext i32 [[TMP135]] to i64
2191; CHECK-NEXT:    [[TMP138:%.*]] = mul i64 [[TMP136]], [[TMP137]]
2192; CHECK-NEXT:    [[TMP139:%.*]] = trunc i64 [[TMP138]] to i32
2193; CHECK-NEXT:    [[TMP140:%.*]] = lshr i64 [[TMP138]], 32
2194; CHECK-NEXT:    [[TMP141:%.*]] = trunc i64 [[TMP140]] to i32
2195; CHECK-NEXT:    [[TMP142:%.*]] = mul i32 [[TMP141]], [[TMP122]]
2196; CHECK-NEXT:    [[TMP143:%.*]] = sub i32 [[TMP121]], [[TMP142]]
2197; CHECK-NEXT:    [[TMP144:%.*]] = icmp uge i32 [[TMP143]], [[TMP122]]
2198; CHECK-NEXT:    [[TMP145:%.*]] = sub i32 [[TMP143]], [[TMP122]]
2199; CHECK-NEXT:    [[TMP146:%.*]] = select i1 [[TMP144]], i32 [[TMP145]], i32 [[TMP143]]
2200; CHECK-NEXT:    [[TMP147:%.*]] = icmp uge i32 [[TMP146]], [[TMP122]]
2201; CHECK-NEXT:    [[TMP148:%.*]] = sub i32 [[TMP146]], [[TMP122]]
2202; CHECK-NEXT:    [[TMP149:%.*]] = select i1 [[TMP147]], i32 [[TMP148]], i32 [[TMP146]]
2203; CHECK-NEXT:    [[TMP150:%.*]] = xor i32 [[TMP149]], [[TMP117]]
2204; CHECK-NEXT:    [[TMP151:%.*]] = sub i32 [[TMP150]], [[TMP117]]
2205; CHECK-NEXT:    [[TMP152:%.*]] = insertelement <4 x i32> [[TMP114]], i32 [[TMP151]], i64 3
2206; CHECK-NEXT:    store <4 x i32> [[TMP152]], <4 x i32> addrspace(1)* [[OUT:%.*]], align 16
2207; CHECK-NEXT:    ret void
2208;
2209; GFX6-LABEL: srem_v4i32:
2210; GFX6:       ; %bb.0:
2211; GFX6-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0xd
2212; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
2213; GFX6-NEXT:    s_mov_b32 s3, 0xf000
2214; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2215; GFX6-NEXT:    s_ashr_i32 s2, s8, 31
2216; GFX6-NEXT:    s_add_i32 s8, s8, s2
2217; GFX6-NEXT:    s_xor_b32 s8, s8, s2
2218; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s8
2219; GFX6-NEXT:    s_ashr_i32 s13, s9, 31
2220; GFX6-NEXT:    s_add_i32 s9, s9, s13
2221; GFX6-NEXT:    s_xor_b32 s9, s9, s13
2222; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
2223; GFX6-NEXT:    s_sub_i32 s14, 0, s8
2224; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s9
2225; GFX6-NEXT:    s_ashr_i32 s12, s4, 31
2226; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
2227; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
2228; GFX6-NEXT:    v_rcp_iflag_f32_e32 v1, v1
2229; GFX6-NEXT:    s_add_i32 s4, s4, s12
2230; GFX6-NEXT:    s_xor_b32 s4, s4, s12
2231; GFX6-NEXT:    v_mul_lo_u32 v2, s14, v0
2232; GFX6-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
2233; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
2234; GFX6-NEXT:    s_sub_i32 s14, 0, s9
2235; GFX6-NEXT:    v_mul_hi_u32 v2, v0, v2
2236; GFX6-NEXT:    s_ashr_i32 s13, s5, 31
2237; GFX6-NEXT:    s_add_i32 s5, s5, s13
2238; GFX6-NEXT:    s_xor_b32 s5, s5, s13
2239; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
2240; GFX6-NEXT:    v_mul_hi_u32 v0, s4, v0
2241; GFX6-NEXT:    v_mul_lo_u32 v2, s14, v1
2242; GFX6-NEXT:    s_mov_b32 s2, -1
2243; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s8
2244; GFX6-NEXT:    v_mul_hi_u32 v2, v1, v2
2245; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
2246; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s8, v0
2247; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s8, v0
2248; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
2249; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s8, v0
2250; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
2251; GFX6-NEXT:    s_ashr_i32 s4, s10, 31
2252; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s8, v0
2253; GFX6-NEXT:    s_add_i32 s8, s10, s4
2254; GFX6-NEXT:    s_xor_b32 s4, s8, s4
2255; GFX6-NEXT:    v_cvt_f32_u32_e32 v2, s4
2256; GFX6-NEXT:    v_mul_hi_u32 v1, s5, v1
2257; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
2258; GFX6-NEXT:    v_xor_b32_e32 v0, s12, v0
2259; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v2
2260; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s9
2261; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s12, v0
2262; GFX6-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
2263; GFX6-NEXT:    v_cvt_u32_f32_e32 v2, v2
2264; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s5, v1
2265; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s9, v1
2266; GFX6-NEXT:    s_sub_i32 s5, 0, s4
2267; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s9, v1
2268; GFX6-NEXT:    v_mul_lo_u32 v4, s5, v2
2269; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
2270; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s9, v1
2271; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s9, v1
2272; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
2273; GFX6-NEXT:    v_mul_hi_u32 v3, v2, v4
2274; GFX6-NEXT:    s_ashr_i32 s8, s11, 31
2275; GFX6-NEXT:    s_add_i32 s9, s11, s8
2276; GFX6-NEXT:    s_ashr_i32 s5, s6, 31
2277; GFX6-NEXT:    s_xor_b32 s8, s9, s8
2278; GFX6-NEXT:    s_add_i32 s6, s6, s5
2279; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
2280; GFX6-NEXT:    v_cvt_f32_u32_e32 v3, s8
2281; GFX6-NEXT:    s_xor_b32 s6, s6, s5
2282; GFX6-NEXT:    v_mul_hi_u32 v2, s6, v2
2283; GFX6-NEXT:    v_xor_b32_e32 v1, s13, v1
2284; GFX6-NEXT:    v_rcp_iflag_f32_e32 v3, v3
2285; GFX6-NEXT:    v_subrev_i32_e32 v1, vcc, s13, v1
2286; GFX6-NEXT:    v_mul_lo_u32 v2, v2, s4
2287; GFX6-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v3
2288; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v3
2289; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s6, v2
2290; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s4, v2
2291; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s4, v2
2292; GFX6-NEXT:    s_sub_i32 s6, 0, s8
2293; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
2294; GFX6-NEXT:    v_mul_lo_u32 v4, s6, v3
2295; GFX6-NEXT:    s_ashr_i32 s6, s7, 31
2296; GFX6-NEXT:    s_add_i32 s7, s7, s6
2297; GFX6-NEXT:    s_xor_b32 s7, s7, s6
2298; GFX6-NEXT:    v_mul_hi_u32 v4, v3, v4
2299; GFX6-NEXT:    v_subrev_i32_e32 v5, vcc, s4, v2
2300; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
2301; GFX6-NEXT:    v_mul_hi_u32 v3, s7, v3
2302; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s4, v2
2303; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
2304; GFX6-NEXT:    v_xor_b32_e32 v2, s5, v2
2305; GFX6-NEXT:    v_mul_lo_u32 v3, v3, s8
2306; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s5, v2
2307; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s7, v3
2308; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s8, v3
2309; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s8, v3
2310; GFX6-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
2311; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s8, v3
2312; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s8, v3
2313; GFX6-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
2314; GFX6-NEXT:    v_xor_b32_e32 v3, s6, v3
2315; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s6, v3
2316; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
2317; GFX6-NEXT:    s_endpgm
2318;
2319; GFX9-LABEL: srem_v4i32:
2320; GFX9:       ; %bb.0:
2321; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x34
2322; GFX9-NEXT:    v_mov_b32_e32 v4, 0
2323; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2324; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2325; GFX9-NEXT:    s_ashr_i32 s2, s8, 31
2326; GFX9-NEXT:    s_add_i32 s8, s8, s2
2327; GFX9-NEXT:    s_xor_b32 s2, s8, s2
2328; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s2
2329; GFX9-NEXT:    s_ashr_i32 s3, s9, 31
2330; GFX9-NEXT:    s_add_i32 s8, s9, s3
2331; GFX9-NEXT:    s_sub_i32 s12, 0, s2
2332; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
2333; GFX9-NEXT:    s_xor_b32 s3, s8, s3
2334; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s3
2335; GFX9-NEXT:    s_ashr_i32 s8, s4, 31
2336; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
2337; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
2338; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v1
2339; GFX9-NEXT:    s_add_i32 s4, s4, s8
2340; GFX9-NEXT:    s_xor_b32 s4, s4, s8
2341; GFX9-NEXT:    v_mul_lo_u32 v2, s12, v0
2342; GFX9-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
2343; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
2344; GFX9-NEXT:    s_sub_i32 s12, 0, s3
2345; GFX9-NEXT:    v_mul_hi_u32 v2, v0, v2
2346; GFX9-NEXT:    s_ashr_i32 s9, s5, 31
2347; GFX9-NEXT:    v_mul_lo_u32 v3, s12, v1
2348; GFX9-NEXT:    s_ashr_i32 s12, s10, 31
2349; GFX9-NEXT:    v_add_u32_e32 v0, v0, v2
2350; GFX9-NEXT:    v_mul_hi_u32 v0, s4, v0
2351; GFX9-NEXT:    v_mul_hi_u32 v3, v1, v3
2352; GFX9-NEXT:    s_add_i32 s5, s5, s9
2353; GFX9-NEXT:    s_xor_b32 s5, s5, s9
2354; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s2
2355; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
2356; GFX9-NEXT:    v_mul_hi_u32 v1, s5, v1
2357; GFX9-NEXT:    v_sub_u32_e32 v0, s4, v0
2358; GFX9-NEXT:    v_subrev_u32_e32 v2, s2, v0
2359; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s2, v0
2360; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
2361; GFX9-NEXT:    v_subrev_u32_e32 v2, s2, v0
2362; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s2, v0
2363; GFX9-NEXT:    s_add_i32 s2, s10, s12
2364; GFX9-NEXT:    s_xor_b32 s2, s2, s12
2365; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
2366; GFX9-NEXT:    v_cvt_f32_u32_e32 v2, s2
2367; GFX9-NEXT:    v_mul_lo_u32 v1, v1, s3
2368; GFX9-NEXT:    v_xor_b32_e32 v0, s8, v0
2369; GFX9-NEXT:    v_subrev_u32_e32 v0, s8, v0
2370; GFX9-NEXT:    v_rcp_iflag_f32_e32 v2, v2
2371; GFX9-NEXT:    v_sub_u32_e32 v1, s5, v1
2372; GFX9-NEXT:    v_subrev_u32_e32 v3, s3, v1
2373; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v1
2374; GFX9-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
2375; GFX9-NEXT:    v_cvt_u32_f32_e32 v2, v2
2376; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
2377; GFX9-NEXT:    v_subrev_u32_e32 v3, s3, v1
2378; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v1
2379; GFX9-NEXT:    s_sub_i32 s3, 0, s2
2380; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
2381; GFX9-NEXT:    v_mul_lo_u32 v3, s3, v2
2382; GFX9-NEXT:    s_ashr_i32 s3, s11, 31
2383; GFX9-NEXT:    s_add_i32 s4, s11, s3
2384; GFX9-NEXT:    s_xor_b32 s3, s4, s3
2385; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, s3
2386; GFX9-NEXT:    v_mul_hi_u32 v3, v2, v3
2387; GFX9-NEXT:    s_ashr_i32 s4, s6, 31
2388; GFX9-NEXT:    s_add_i32 s5, s6, s4
2389; GFX9-NEXT:    v_rcp_iflag_f32_e32 v5, v5
2390; GFX9-NEXT:    v_add_u32_e32 v2, v2, v3
2391; GFX9-NEXT:    s_xor_b32 s5, s5, s4
2392; GFX9-NEXT:    v_mul_hi_u32 v2, s5, v2
2393; GFX9-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v5
2394; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v3
2395; GFX9-NEXT:    s_sub_i32 s6, 0, s3
2396; GFX9-NEXT:    v_mul_lo_u32 v2, v2, s2
2397; GFX9-NEXT:    v_xor_b32_e32 v1, s9, v1
2398; GFX9-NEXT:    v_mul_lo_u32 v5, s6, v3
2399; GFX9-NEXT:    v_subrev_u32_e32 v1, s9, v1
2400; GFX9-NEXT:    v_sub_u32_e32 v2, s5, v2
2401; GFX9-NEXT:    s_ashr_i32 s5, s7, 31
2402; GFX9-NEXT:    v_mul_hi_u32 v5, v3, v5
2403; GFX9-NEXT:    s_add_i32 s6, s7, s5
2404; GFX9-NEXT:    s_xor_b32 s6, s6, s5
2405; GFX9-NEXT:    v_subrev_u32_e32 v6, s2, v2
2406; GFX9-NEXT:    v_add_u32_e32 v3, v3, v5
2407; GFX9-NEXT:    v_mul_hi_u32 v3, s6, v3
2408; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s2, v2
2409; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
2410; GFX9-NEXT:    v_subrev_u32_e32 v5, s2, v2
2411; GFX9-NEXT:    v_mul_lo_u32 v3, v3, s3
2412; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s2, v2
2413; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
2414; GFX9-NEXT:    v_xor_b32_e32 v2, s4, v2
2415; GFX9-NEXT:    v_sub_u32_e32 v3, s6, v3
2416; GFX9-NEXT:    v_subrev_u32_e32 v5, s3, v3
2417; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v3
2418; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
2419; GFX9-NEXT:    v_subrev_u32_e32 v5, s3, v3
2420; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v3
2421; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
2422; GFX9-NEXT:    v_xor_b32_e32 v3, s5, v3
2423; GFX9-NEXT:    v_subrev_u32_e32 v2, s4, v2
2424; GFX9-NEXT:    v_subrev_u32_e32 v3, s5, v3
2425; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
2426; GFX9-NEXT:    s_endpgm
; Input IR under test: one vector srem whose result is stored to %out.
2427  %r = srem <4 x i32> %x, %y
2428  store <4 x i32> %r, <4 x i32> addrspace(1)* %out
2429  ret void
2430}
2431
; udiv_v4i16: unsigned division of two <4 x i16> kernel arguments; the quotient
; vector is stored to %out.
;
; The assertions below are autogenerated (see the NOTE at the top of the file),
; so regenerate them with the update script instead of editing them by hand.
; They come in three groups, one per RUN line:
;   * "CHECK" group (opt -amdgpu-codegenprepare): the vector udiv is expanded
;     lane by lane. Each lane is extracted, zero-extended to i32 and converted
;     to float; the quotient estimate is x * rcp(y) truncated toward zero; a
;     fmad.ftz of (-q, y, x) is compared (fcmp oge on the absolute values)
;     against the divisor to select a 0/1 correction that is added to the
;     integer quotient; the result is masked with 65535, truncated to i16 and
;     inserted back into the result vector.
;   * "GFX6" group (llc -mcpu=tahiti): final ISA; the four lanes are packed
;     with shift/and/or and written with buffer_store_dwordx2.
;   * "GFX9" group (llc -mcpu=gfx900): final ISA; the four lanes are packed
;     with v_lshl_or_b32 and written with global_store_dwordx2.
2432define amdgpu_kernel void @udiv_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x, <4 x i16> %y) {
2433; CHECK-LABEL: @udiv_v4i16(
2434; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i16> [[X:%.*]], i64 0
2435; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i16> [[Y:%.*]], i64 0
2436; CHECK-NEXT:    [[TMP3:%.*]] = zext i16 [[TMP1]] to i32
2437; CHECK-NEXT:    [[TMP4:%.*]] = zext i16 [[TMP2]] to i32
2438; CHECK-NEXT:    [[TMP5:%.*]] = uitofp i32 [[TMP3]] to float
2439; CHECK-NEXT:    [[TMP6:%.*]] = uitofp i32 [[TMP4]] to float
2440; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP6]])
2441; CHECK-NEXT:    [[TMP8:%.*]] = fmul fast float [[TMP5]], [[TMP7]]
2442; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.trunc.f32(float [[TMP8]])
2443; CHECK-NEXT:    [[TMP10:%.*]] = fneg fast float [[TMP9]]
2444; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP10]], float [[TMP6]], float [[TMP5]])
2445; CHECK-NEXT:    [[TMP12:%.*]] = fptoui float [[TMP9]] to i32
2446; CHECK-NEXT:    [[TMP13:%.*]] = call fast float @llvm.fabs.f32(float [[TMP11]])
2447; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP6]])
2448; CHECK-NEXT:    [[TMP15:%.*]] = fcmp fast oge float [[TMP13]], [[TMP14]]
2449; CHECK-NEXT:    [[TMP16:%.*]] = select i1 [[TMP15]], i32 1, i32 0
2450; CHECK-NEXT:    [[TMP17:%.*]] = add i32 [[TMP12]], [[TMP16]]
2451; CHECK-NEXT:    [[TMP18:%.*]] = and i32 [[TMP17]], 65535
2452; CHECK-NEXT:    [[TMP19:%.*]] = trunc i32 [[TMP18]] to i16
2453; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <4 x i16> undef, i16 [[TMP19]], i64 0
2454; CHECK-NEXT:    [[TMP21:%.*]] = extractelement <4 x i16> [[X]], i64 1
2455; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <4 x i16> [[Y]], i64 1
2456; CHECK-NEXT:    [[TMP23:%.*]] = zext i16 [[TMP21]] to i32
2457; CHECK-NEXT:    [[TMP24:%.*]] = zext i16 [[TMP22]] to i32
2458; CHECK-NEXT:    [[TMP25:%.*]] = uitofp i32 [[TMP23]] to float
2459; CHECK-NEXT:    [[TMP26:%.*]] = uitofp i32 [[TMP24]] to float
2460; CHECK-NEXT:    [[TMP27:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP26]])
2461; CHECK-NEXT:    [[TMP28:%.*]] = fmul fast float [[TMP25]], [[TMP27]]
2462; CHECK-NEXT:    [[TMP29:%.*]] = call fast float @llvm.trunc.f32(float [[TMP28]])
2463; CHECK-NEXT:    [[TMP30:%.*]] = fneg fast float [[TMP29]]
2464; CHECK-NEXT:    [[TMP31:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP30]], float [[TMP26]], float [[TMP25]])
2465; CHECK-NEXT:    [[TMP32:%.*]] = fptoui float [[TMP29]] to i32
2466; CHECK-NEXT:    [[TMP33:%.*]] = call fast float @llvm.fabs.f32(float [[TMP31]])
2467; CHECK-NEXT:    [[TMP34:%.*]] = call fast float @llvm.fabs.f32(float [[TMP26]])
2468; CHECK-NEXT:    [[TMP35:%.*]] = fcmp fast oge float [[TMP33]], [[TMP34]]
2469; CHECK-NEXT:    [[TMP36:%.*]] = select i1 [[TMP35]], i32 1, i32 0
2470; CHECK-NEXT:    [[TMP37:%.*]] = add i32 [[TMP32]], [[TMP36]]
2471; CHECK-NEXT:    [[TMP38:%.*]] = and i32 [[TMP37]], 65535
2472; CHECK-NEXT:    [[TMP39:%.*]] = trunc i32 [[TMP38]] to i16
2473; CHECK-NEXT:    [[TMP40:%.*]] = insertelement <4 x i16> [[TMP20]], i16 [[TMP39]], i64 1
2474; CHECK-NEXT:    [[TMP41:%.*]] = extractelement <4 x i16> [[X]], i64 2
2475; CHECK-NEXT:    [[TMP42:%.*]] = extractelement <4 x i16> [[Y]], i64 2
2476; CHECK-NEXT:    [[TMP43:%.*]] = zext i16 [[TMP41]] to i32
2477; CHECK-NEXT:    [[TMP44:%.*]] = zext i16 [[TMP42]] to i32
2478; CHECK-NEXT:    [[TMP45:%.*]] = uitofp i32 [[TMP43]] to float
2479; CHECK-NEXT:    [[TMP46:%.*]] = uitofp i32 [[TMP44]] to float
2480; CHECK-NEXT:    [[TMP47:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP46]])
2481; CHECK-NEXT:    [[TMP48:%.*]] = fmul fast float [[TMP45]], [[TMP47]]
2482; CHECK-NEXT:    [[TMP49:%.*]] = call fast float @llvm.trunc.f32(float [[TMP48]])
2483; CHECK-NEXT:    [[TMP50:%.*]] = fneg fast float [[TMP49]]
2484; CHECK-NEXT:    [[TMP51:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP50]], float [[TMP46]], float [[TMP45]])
2485; CHECK-NEXT:    [[TMP52:%.*]] = fptoui float [[TMP49]] to i32
2486; CHECK-NEXT:    [[TMP53:%.*]] = call fast float @llvm.fabs.f32(float [[TMP51]])
2487; CHECK-NEXT:    [[TMP54:%.*]] = call fast float @llvm.fabs.f32(float [[TMP46]])
2488; CHECK-NEXT:    [[TMP55:%.*]] = fcmp fast oge float [[TMP53]], [[TMP54]]
2489; CHECK-NEXT:    [[TMP56:%.*]] = select i1 [[TMP55]], i32 1, i32 0
2490; CHECK-NEXT:    [[TMP57:%.*]] = add i32 [[TMP52]], [[TMP56]]
2491; CHECK-NEXT:    [[TMP58:%.*]] = and i32 [[TMP57]], 65535
2492; CHECK-NEXT:    [[TMP59:%.*]] = trunc i32 [[TMP58]] to i16
2493; CHECK-NEXT:    [[TMP60:%.*]] = insertelement <4 x i16> [[TMP40]], i16 [[TMP59]], i64 2
2494; CHECK-NEXT:    [[TMP61:%.*]] = extractelement <4 x i16> [[X]], i64 3
2495; CHECK-NEXT:    [[TMP62:%.*]] = extractelement <4 x i16> [[Y]], i64 3
2496; CHECK-NEXT:    [[TMP63:%.*]] = zext i16 [[TMP61]] to i32
2497; CHECK-NEXT:    [[TMP64:%.*]] = zext i16 [[TMP62]] to i32
2498; CHECK-NEXT:    [[TMP65:%.*]] = uitofp i32 [[TMP63]] to float
2499; CHECK-NEXT:    [[TMP66:%.*]] = uitofp i32 [[TMP64]] to float
2500; CHECK-NEXT:    [[TMP67:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP66]])
2501; CHECK-NEXT:    [[TMP68:%.*]] = fmul fast float [[TMP65]], [[TMP67]]
2502; CHECK-NEXT:    [[TMP69:%.*]] = call fast float @llvm.trunc.f32(float [[TMP68]])
2503; CHECK-NEXT:    [[TMP70:%.*]] = fneg fast float [[TMP69]]
2504; CHECK-NEXT:    [[TMP71:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP70]], float [[TMP66]], float [[TMP65]])
2505; CHECK-NEXT:    [[TMP72:%.*]] = fptoui float [[TMP69]] to i32
2506; CHECK-NEXT:    [[TMP73:%.*]] = call fast float @llvm.fabs.f32(float [[TMP71]])
2507; CHECK-NEXT:    [[TMP74:%.*]] = call fast float @llvm.fabs.f32(float [[TMP66]])
2508; CHECK-NEXT:    [[TMP75:%.*]] = fcmp fast oge float [[TMP73]], [[TMP74]]
2509; CHECK-NEXT:    [[TMP76:%.*]] = select i1 [[TMP75]], i32 1, i32 0
2510; CHECK-NEXT:    [[TMP77:%.*]] = add i32 [[TMP72]], [[TMP76]]
2511; CHECK-NEXT:    [[TMP78:%.*]] = and i32 [[TMP77]], 65535
2512; CHECK-NEXT:    [[TMP79:%.*]] = trunc i32 [[TMP78]] to i16
2513; CHECK-NEXT:    [[TMP80:%.*]] = insertelement <4 x i16> [[TMP60]], i16 [[TMP79]], i64 3
2514; CHECK-NEXT:    store <4 x i16> [[TMP80]], <4 x i16> addrspace(1)* [[OUT:%.*]], align 8
2515; CHECK-NEXT:    ret void
2516;
2517; GFX6-LABEL: udiv_v4i16:
2518; GFX6:       ; %bb.0:
2519; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xb
2520; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
2521; GFX6-NEXT:    s_mov_b32 s3, 0xf000
2522; GFX6-NEXT:    s_mov_b32 s2, -1
2523; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2524; GFX6-NEXT:    s_and_b32 s9, s6, 0xffff
2525; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s9
2526; GFX6-NEXT:    s_lshr_b32 s6, s6, 16
2527; GFX6-NEXT:    s_and_b32 s8, s4, 0xffff
2528; GFX6-NEXT:    v_cvt_f32_u32_e32 v2, s6
2529; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s8
2530; GFX6-NEXT:    v_rcp_iflag_f32_e32 v3, v0
2531; GFX6-NEXT:    s_lshr_b32 s4, s4, 16
2532; GFX6-NEXT:    v_cvt_f32_u32_e32 v4, s4
2533; GFX6-NEXT:    v_rcp_iflag_f32_e32 v5, v2
2534; GFX6-NEXT:    v_mul_f32_e32 v3, v1, v3
2535; GFX6-NEXT:    v_trunc_f32_e32 v3, v3
2536; GFX6-NEXT:    v_mad_f32 v1, -v3, v0, v1
2537; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
2538; GFX6-NEXT:    v_mul_f32_e32 v1, v4, v5
2539; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
2540; GFX6-NEXT:    s_and_b32 s4, s7, 0xffff
2541; GFX6-NEXT:    v_cvt_u32_f32_e32 v6, v3
2542; GFX6-NEXT:    v_mad_f32 v3, -v1, v2, v4
2543; GFX6-NEXT:    v_cvt_f32_u32_e32 v4, s4
2544; GFX6-NEXT:    s_and_b32 s4, s5, 0xffff
2545; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, 0, v6, vcc
2546; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
2547; GFX6-NEXT:    v_cvt_f32_u32_e32 v5, s4
2548; GFX6-NEXT:    v_rcp_iflag_f32_e32 v6, v4
2549; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v2
2550; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, 0, v1, vcc
2551; GFX6-NEXT:    v_mul_f32_e32 v1, v5, v6
2552; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
2553; GFX6-NEXT:    s_lshr_b32 s4, s7, 16
2554; GFX6-NEXT:    v_mad_f32 v3, -v1, v4, v5
2555; GFX6-NEXT:    v_cvt_f32_u32_e32 v5, s4
2556; GFX6-NEXT:    s_lshr_b32 s4, s5, 16
2557; GFX6-NEXT:    v_cvt_f32_u32_e32 v6, s4
2558; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
2559; GFX6-NEXT:    v_rcp_iflag_f32_e32 v7, v5
2560; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v4
2561; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
2562; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2563; GFX6-NEXT:    v_mul_f32_e32 v3, v6, v7
2564; GFX6-NEXT:    v_trunc_f32_e32 v3, v3
2565; GFX6-NEXT:    v_cvt_u32_f32_e32 v4, v3
2566; GFX6-NEXT:    v_mad_f32 v3, -v3, v5, v6
2567; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v5
2568; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
2569; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
2570; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
2571; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
2572; GFX6-NEXT:    v_or_b32_e32 v1, v1, v3
2573; GFX6-NEXT:    v_or_b32_e32 v0, v0, v2
2574; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2575; GFX6-NEXT:    s_endpgm
2576;
2577; GFX9-LABEL: udiv_v4i16:
2578; GFX9:       ; %bb.0:
2579; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
2580; GFX9-NEXT:    v_mov_b32_e32 v6, 0
2581; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2582; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2583; GFX9-NEXT:    s_and_b32 s3, s6, 0xffff
2584; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s3
2585; GFX9-NEXT:    s_and_b32 s2, s4, 0xffff
2586; GFX9-NEXT:    s_lshr_b32 s6, s6, 16
2587; GFX9-NEXT:    v_cvt_f32_u32_e32 v2, s2
2588; GFX9-NEXT:    v_rcp_iflag_f32_e32 v4, v0
2589; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s6
2590; GFX9-NEXT:    s_lshr_b32 s4, s4, 16
2591; GFX9-NEXT:    v_cvt_f32_u32_e32 v3, s4
2592; GFX9-NEXT:    v_mul_f32_e32 v4, v2, v4
2593; GFX9-NEXT:    v_rcp_iflag_f32_e32 v5, v1
2594; GFX9-NEXT:    v_trunc_f32_e32 v4, v4
2595; GFX9-NEXT:    s_and_b32 s2, s7, 0xffff
2596; GFX9-NEXT:    v_cvt_u32_f32_e32 v7, v4
2597; GFX9-NEXT:    v_mad_f32 v2, -v4, v0, v2
2598; GFX9-NEXT:    v_cvt_f32_u32_e32 v4, s2
2599; GFX9-NEXT:    v_mul_f32_e32 v5, v3, v5
2600; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v0
2601; GFX9-NEXT:    s_and_b32 s2, s5, 0xffff
2602; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v7, vcc
2603; GFX9-NEXT:    v_trunc_f32_e32 v2, v5
2604; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, s2
2605; GFX9-NEXT:    v_rcp_iflag_f32_e32 v7, v4
2606; GFX9-NEXT:    v_mad_f32 v3, -v2, v1, v3
2607; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v1
2608; GFX9-NEXT:    s_lshr_b32 s2, s7, 16
2609; GFX9-NEXT:    v_mul_f32_e32 v1, v5, v7
2610; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
2611; GFX9-NEXT:    v_mad_f32 v3, -v1, v4, v5
2612; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, s2
2613; GFX9-NEXT:    v_cvt_u32_f32_e32 v2, v2
2614; GFX9-NEXT:    s_lshr_b32 s2, s5, 16
2615; GFX9-NEXT:    v_cvt_f32_u32_e32 v7, s2
2616; GFX9-NEXT:    v_rcp_iflag_f32_e32 v8, v5
2617; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, 0, v2, vcc
2618; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
2619; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v4
2620; GFX9-NEXT:    v_mul_f32_e32 v3, v7, v8
2621; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
2622; GFX9-NEXT:    v_cvt_u32_f32_e32 v4, v3
2623; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
2624; GFX9-NEXT:    v_mad_f32 v3, -v3, v5, v7
2625; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v5
2626; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
2627; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
2628; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
2629; GFX9-NEXT:    v_lshl_or_b32 v1, v3, 16, v1
2630; GFX9-NEXT:    v_lshl_or_b32 v0, v2, 16, v0
2631; GFX9-NEXT:    global_store_dwordx2 v6, v[0:1], s[0:1]
2632; GFX9-NEXT:    s_endpgm
; IR under test: a single vector udiv whose result is stored to %out.
2633  %r = udiv <4 x i16> %x, %y
2634  store <4 x i16> %r, <4 x i16> addrspace(1)* %out
2635  ret void
2636}
2637
; urem_v4i16: unsigned remainder of two <4 x i16> kernel arguments; the
; remainder vector is stored to %out.
;
; The assertions below are autogenerated (see the NOTE at the top of the file),
; so regenerate them with the update script instead of editing them by hand.
; They come in three groups, one per RUN line:
;   * "CHECK" group (opt -amdgpu-codegenprepare): the vector urem is expanded
;     lane by lane. Each lane is extracted, zero-extended to i32 and converted
;     to float; the quotient estimate is x * rcp(y) truncated toward zero; a
;     fmad.ftz of (-q, y, x) is compared (fcmp oge on the absolute values)
;     against the divisor to select a 0/1 correction that is added to the
;     integer quotient. The remainder is then formed as x - q * y in i32,
;     masked with 65535, truncated to i16 and inserted into the result vector.
;   * "GFX6" group (llc -mcpu=tahiti): final ISA; the remainder uses
;     v_mul_lo_u32 + v_sub_i32 and the result is written with
;     buffer_store_dwordx2.
;   * "GFX9" group (llc -mcpu=gfx900): final ISA; the remainder uses
;     v_mul_lo_u32 + v_sub_u32, lanes are packed with v_lshl_or_b32 and the
;     result is written with global_store_dwordx2.
2638define amdgpu_kernel void @urem_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x, <4 x i16> %y) {
2639; CHECK-LABEL: @urem_v4i16(
2640; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i16> [[X:%.*]], i64 0
2641; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i16> [[Y:%.*]], i64 0
2642; CHECK-NEXT:    [[TMP3:%.*]] = zext i16 [[TMP1]] to i32
2643; CHECK-NEXT:    [[TMP4:%.*]] = zext i16 [[TMP2]] to i32
2644; CHECK-NEXT:    [[TMP5:%.*]] = uitofp i32 [[TMP3]] to float
2645; CHECK-NEXT:    [[TMP6:%.*]] = uitofp i32 [[TMP4]] to float
2646; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP6]])
2647; CHECK-NEXT:    [[TMP8:%.*]] = fmul fast float [[TMP5]], [[TMP7]]
2648; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.trunc.f32(float [[TMP8]])
2649; CHECK-NEXT:    [[TMP10:%.*]] = fneg fast float [[TMP9]]
2650; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP10]], float [[TMP6]], float [[TMP5]])
2651; CHECK-NEXT:    [[TMP12:%.*]] = fptoui float [[TMP9]] to i32
2652; CHECK-NEXT:    [[TMP13:%.*]] = call fast float @llvm.fabs.f32(float [[TMP11]])
2653; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP6]])
2654; CHECK-NEXT:    [[TMP15:%.*]] = fcmp fast oge float [[TMP13]], [[TMP14]]
2655; CHECK-NEXT:    [[TMP16:%.*]] = select i1 [[TMP15]], i32 1, i32 0
2656; CHECK-NEXT:    [[TMP17:%.*]] = add i32 [[TMP12]], [[TMP16]]
2657; CHECK-NEXT:    [[TMP18:%.*]] = mul i32 [[TMP17]], [[TMP4]]
2658; CHECK-NEXT:    [[TMP19:%.*]] = sub i32 [[TMP3]], [[TMP18]]
2659; CHECK-NEXT:    [[TMP20:%.*]] = and i32 [[TMP19]], 65535
2660; CHECK-NEXT:    [[TMP21:%.*]] = trunc i32 [[TMP20]] to i16
2661; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <4 x i16> undef, i16 [[TMP21]], i64 0
2662; CHECK-NEXT:    [[TMP23:%.*]] = extractelement <4 x i16> [[X]], i64 1
2663; CHECK-NEXT:    [[TMP24:%.*]] = extractelement <4 x i16> [[Y]], i64 1
2664; CHECK-NEXT:    [[TMP25:%.*]] = zext i16 [[TMP23]] to i32
2665; CHECK-NEXT:    [[TMP26:%.*]] = zext i16 [[TMP24]] to i32
2666; CHECK-NEXT:    [[TMP27:%.*]] = uitofp i32 [[TMP25]] to float
2667; CHECK-NEXT:    [[TMP28:%.*]] = uitofp i32 [[TMP26]] to float
2668; CHECK-NEXT:    [[TMP29:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP28]])
2669; CHECK-NEXT:    [[TMP30:%.*]] = fmul fast float [[TMP27]], [[TMP29]]
2670; CHECK-NEXT:    [[TMP31:%.*]] = call fast float @llvm.trunc.f32(float [[TMP30]])
2671; CHECK-NEXT:    [[TMP32:%.*]] = fneg fast float [[TMP31]]
2672; CHECK-NEXT:    [[TMP33:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP32]], float [[TMP28]], float [[TMP27]])
2673; CHECK-NEXT:    [[TMP34:%.*]] = fptoui float [[TMP31]] to i32
2674; CHECK-NEXT:    [[TMP35:%.*]] = call fast float @llvm.fabs.f32(float [[TMP33]])
2675; CHECK-NEXT:    [[TMP36:%.*]] = call fast float @llvm.fabs.f32(float [[TMP28]])
2676; CHECK-NEXT:    [[TMP37:%.*]] = fcmp fast oge float [[TMP35]], [[TMP36]]
2677; CHECK-NEXT:    [[TMP38:%.*]] = select i1 [[TMP37]], i32 1, i32 0
2678; CHECK-NEXT:    [[TMP39:%.*]] = add i32 [[TMP34]], [[TMP38]]
2679; CHECK-NEXT:    [[TMP40:%.*]] = mul i32 [[TMP39]], [[TMP26]]
2680; CHECK-NEXT:    [[TMP41:%.*]] = sub i32 [[TMP25]], [[TMP40]]
2681; CHECK-NEXT:    [[TMP42:%.*]] = and i32 [[TMP41]], 65535
2682; CHECK-NEXT:    [[TMP43:%.*]] = trunc i32 [[TMP42]] to i16
2683; CHECK-NEXT:    [[TMP44:%.*]] = insertelement <4 x i16> [[TMP22]], i16 [[TMP43]], i64 1
2684; CHECK-NEXT:    [[TMP45:%.*]] = extractelement <4 x i16> [[X]], i64 2
2685; CHECK-NEXT:    [[TMP46:%.*]] = extractelement <4 x i16> [[Y]], i64 2
2686; CHECK-NEXT:    [[TMP47:%.*]] = zext i16 [[TMP45]] to i32
2687; CHECK-NEXT:    [[TMP48:%.*]] = zext i16 [[TMP46]] to i32
2688; CHECK-NEXT:    [[TMP49:%.*]] = uitofp i32 [[TMP47]] to float
2689; CHECK-NEXT:    [[TMP50:%.*]] = uitofp i32 [[TMP48]] to float
2690; CHECK-NEXT:    [[TMP51:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP50]])
2691; CHECK-NEXT:    [[TMP52:%.*]] = fmul fast float [[TMP49]], [[TMP51]]
2692; CHECK-NEXT:    [[TMP53:%.*]] = call fast float @llvm.trunc.f32(float [[TMP52]])
2693; CHECK-NEXT:    [[TMP54:%.*]] = fneg fast float [[TMP53]]
2694; CHECK-NEXT:    [[TMP55:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP54]], float [[TMP50]], float [[TMP49]])
2695; CHECK-NEXT:    [[TMP56:%.*]] = fptoui float [[TMP53]] to i32
2696; CHECK-NEXT:    [[TMP57:%.*]] = call fast float @llvm.fabs.f32(float [[TMP55]])
2697; CHECK-NEXT:    [[TMP58:%.*]] = call fast float @llvm.fabs.f32(float [[TMP50]])
2698; CHECK-NEXT:    [[TMP59:%.*]] = fcmp fast oge float [[TMP57]], [[TMP58]]
2699; CHECK-NEXT:    [[TMP60:%.*]] = select i1 [[TMP59]], i32 1, i32 0
2700; CHECK-NEXT:    [[TMP61:%.*]] = add i32 [[TMP56]], [[TMP60]]
2701; CHECK-NEXT:    [[TMP62:%.*]] = mul i32 [[TMP61]], [[TMP48]]
2702; CHECK-NEXT:    [[TMP63:%.*]] = sub i32 [[TMP47]], [[TMP62]]
2703; CHECK-NEXT:    [[TMP64:%.*]] = and i32 [[TMP63]], 65535
2704; CHECK-NEXT:    [[TMP65:%.*]] = trunc i32 [[TMP64]] to i16
2705; CHECK-NEXT:    [[TMP66:%.*]] = insertelement <4 x i16> [[TMP44]], i16 [[TMP65]], i64 2
2706; CHECK-NEXT:    [[TMP67:%.*]] = extractelement <4 x i16> [[X]], i64 3
2707; CHECK-NEXT:    [[TMP68:%.*]] = extractelement <4 x i16> [[Y]], i64 3
2708; CHECK-NEXT:    [[TMP69:%.*]] = zext i16 [[TMP67]] to i32
2709; CHECK-NEXT:    [[TMP70:%.*]] = zext i16 [[TMP68]] to i32
2710; CHECK-NEXT:    [[TMP71:%.*]] = uitofp i32 [[TMP69]] to float
2711; CHECK-NEXT:    [[TMP72:%.*]] = uitofp i32 [[TMP70]] to float
2712; CHECK-NEXT:    [[TMP73:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP72]])
2713; CHECK-NEXT:    [[TMP74:%.*]] = fmul fast float [[TMP71]], [[TMP73]]
2714; CHECK-NEXT:    [[TMP75:%.*]] = call fast float @llvm.trunc.f32(float [[TMP74]])
2715; CHECK-NEXT:    [[TMP76:%.*]] = fneg fast float [[TMP75]]
2716; CHECK-NEXT:    [[TMP77:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP76]], float [[TMP72]], float [[TMP71]])
2717; CHECK-NEXT:    [[TMP78:%.*]] = fptoui float [[TMP75]] to i32
2718; CHECK-NEXT:    [[TMP79:%.*]] = call fast float @llvm.fabs.f32(float [[TMP77]])
2719; CHECK-NEXT:    [[TMP80:%.*]] = call fast float @llvm.fabs.f32(float [[TMP72]])
2720; CHECK-NEXT:    [[TMP81:%.*]] = fcmp fast oge float [[TMP79]], [[TMP80]]
2721; CHECK-NEXT:    [[TMP82:%.*]] = select i1 [[TMP81]], i32 1, i32 0
2722; CHECK-NEXT:    [[TMP83:%.*]] = add i32 [[TMP78]], [[TMP82]]
2723; CHECK-NEXT:    [[TMP84:%.*]] = mul i32 [[TMP83]], [[TMP70]]
2724; CHECK-NEXT:    [[TMP85:%.*]] = sub i32 [[TMP69]], [[TMP84]]
2725; CHECK-NEXT:    [[TMP86:%.*]] = and i32 [[TMP85]], 65535
2726; CHECK-NEXT:    [[TMP87:%.*]] = trunc i32 [[TMP86]] to i16
2727; CHECK-NEXT:    [[TMP88:%.*]] = insertelement <4 x i16> [[TMP66]], i16 [[TMP87]], i64 3
2728; CHECK-NEXT:    store <4 x i16> [[TMP88]], <4 x i16> addrspace(1)* [[OUT:%.*]], align 8
2729; CHECK-NEXT:    ret void
2730;
2731; GFX6-LABEL: urem_v4i16:
2732; GFX6:       ; %bb.0:
2733; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xb
2734; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
2735; GFX6-NEXT:    s_mov_b32 s3, 0xf000
2736; GFX6-NEXT:    s_mov_b32 s2, -1
2737; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2738; GFX6-NEXT:    s_and_b32 s8, s6, 0xffff
2739; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s8
2740; GFX6-NEXT:    v_mov_b32_e32 v4, s6
2741; GFX6-NEXT:    v_alignbit_b32 v4, s7, v4, 16
2742; GFX6-NEXT:    s_and_b32 s8, s4, 0xffff
2743; GFX6-NEXT:    v_and_b32_e32 v5, 0xffff, v4
2744; GFX6-NEXT:    v_cvt_f32_u32_e32 v2, s8
2745; GFX6-NEXT:    v_rcp_iflag_f32_e32 v3, v0
2746; GFX6-NEXT:    v_cvt_f32_u32_e32 v5, v5
2747; GFX6-NEXT:    v_mov_b32_e32 v1, s4
2748; GFX6-NEXT:    v_alignbit_b32 v1, s5, v1, 16
2749; GFX6-NEXT:    v_and_b32_e32 v6, 0xffff, v1
2750; GFX6-NEXT:    v_mul_f32_e32 v3, v2, v3
2751; GFX6-NEXT:    v_cvt_f32_u32_e32 v6, v6
2752; GFX6-NEXT:    v_rcp_iflag_f32_e32 v7, v5
2753; GFX6-NEXT:    v_trunc_f32_e32 v3, v3
2754; GFX6-NEXT:    v_mad_f32 v2, -v3, v0, v2
2755; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v3
2756; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v0
2757; GFX6-NEXT:    v_mul_f32_e32 v2, v6, v7
2758; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
2759; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, 0, v3, vcc
2760; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v2
2761; GFX6-NEXT:    v_mad_f32 v2, -v2, v5, v6
2762; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s6
2763; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v5
2764; GFX6-NEXT:    s_and_b32 s6, s7, 0xffff
2765; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, 0, v3, vcc
2766; GFX6-NEXT:    v_cvt_f32_u32_e32 v3, s6
2767; GFX6-NEXT:    s_and_b32 s6, s5, 0xffff
2768; GFX6-NEXT:    v_mul_lo_u32 v2, v2, v4
2769; GFX6-NEXT:    v_cvt_f32_u32_e32 v4, s6
2770; GFX6-NEXT:    v_rcp_iflag_f32_e32 v5, v3
2771; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
2772; GFX6-NEXT:    s_lshr_b32 s4, s7, 16
2773; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v1, v2
2774; GFX6-NEXT:    v_mul_f32_e32 v1, v4, v5
2775; GFX6-NEXT:    v_cvt_f32_u32_e32 v5, s4
2776; GFX6-NEXT:    s_lshr_b32 s6, s5, 16
2777; GFX6-NEXT:    v_cvt_f32_u32_e32 v6, s6
2778; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
2779; GFX6-NEXT:    v_rcp_iflag_f32_e32 v7, v5
2780; GFX6-NEXT:    v_mad_f32 v4, -v1, v3, v4
2781; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
2782; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v4|, v3
2783; GFX6-NEXT:    v_mul_f32_e32 v3, v6, v7
2784; GFX6-NEXT:    v_trunc_f32_e32 v3, v3
2785; GFX6-NEXT:    v_cvt_u32_f32_e32 v4, v3
2786; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2787; GFX6-NEXT:    v_mad_f32 v3, -v3, v5, v6
2788; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v5
2789; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
2790; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s7
2791; GFX6-NEXT:    v_mul_lo_u32 v3, v3, s4
2792; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
2793; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
2794; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s5, v1
2795; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s6, v3
2796; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
2797; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
2798; GFX6-NEXT:    v_or_b32_e32 v1, v1, v3
2799; GFX6-NEXT:    v_or_b32_e32 v0, v0, v2
2800; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2801; GFX6-NEXT:    s_endpgm
2802;
2803; GFX9-LABEL: urem_v4i16:
2804; GFX9:       ; %bb.0:
2805; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
2806; GFX9-NEXT:    v_mov_b32_e32 v6, 0
2807; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2808; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2809; GFX9-NEXT:    s_and_b32 s3, s6, 0xffff
2810; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s3
2811; GFX9-NEXT:    s_and_b32 s2, s4, 0xffff
2812; GFX9-NEXT:    v_cvt_f32_u32_e32 v2, s2
2813; GFX9-NEXT:    s_lshr_b32 s6, s6, 16
2814; GFX9-NEXT:    v_rcp_iflag_f32_e32 v4, v0
2815; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s6
2816; GFX9-NEXT:    s_lshr_b32 s4, s4, 16
2817; GFX9-NEXT:    v_cvt_f32_u32_e32 v3, s4
2818; GFX9-NEXT:    v_mul_f32_e32 v4, v2, v4
2819; GFX9-NEXT:    v_trunc_f32_e32 v4, v4
2820; GFX9-NEXT:    v_cvt_u32_f32_e32 v7, v4
2821; GFX9-NEXT:    v_mad_f32 v2, -v4, v0, v2
2822; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v0
2823; GFX9-NEXT:    v_rcp_iflag_f32_e32 v5, v1
2824; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v7, vcc
2825; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s3
2826; GFX9-NEXT:    s_and_b32 s3, s7, 0xffff
2827; GFX9-NEXT:    v_cvt_f32_u32_e32 v4, s3
2828; GFX9-NEXT:    v_mul_f32_e32 v5, v3, v5
2829; GFX9-NEXT:    v_trunc_f32_e32 v2, v5
2830; GFX9-NEXT:    s_and_b32 s8, s5, 0xffff
2831; GFX9-NEXT:    v_mad_f32 v3, -v2, v1, v3
2832; GFX9-NEXT:    v_cvt_u32_f32_e32 v2, v2
2833; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, s8
2834; GFX9-NEXT:    v_rcp_iflag_f32_e32 v7, v4
2835; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v1
2836; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v2, vcc
2837; GFX9-NEXT:    v_mul_f32_e32 v2, v5, v7
2838; GFX9-NEXT:    v_mul_lo_u32 v1, v1, s6
2839; GFX9-NEXT:    v_trunc_f32_e32 v2, v2
2840; GFX9-NEXT:    s_lshr_b32 s6, s7, 16
2841; GFX9-NEXT:    v_mad_f32 v3, -v2, v4, v5
2842; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, s6
2843; GFX9-NEXT:    s_lshr_b32 s5, s5, 16
2844; GFX9-NEXT:    v_cvt_f32_u32_e32 v7, s5
2845; GFX9-NEXT:    v_cvt_u32_f32_e32 v2, v2
2846; GFX9-NEXT:    v_rcp_iflag_f32_e32 v8, v5
2847; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v4
2848; GFX9-NEXT:    v_sub_u32_e32 v0, s2, v0
2849; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, 0, v2, vcc
2850; GFX9-NEXT:    v_mul_f32_e32 v3, v7, v8
2851; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
2852; GFX9-NEXT:    v_cvt_u32_f32_e32 v4, v3
2853; GFX9-NEXT:    v_mad_f32 v3, -v3, v5, v7
2854; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v5
2855; GFX9-NEXT:    v_mul_lo_u32 v2, v2, s3
2856; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
2857; GFX9-NEXT:    v_mul_lo_u32 v3, v3, s6
2858; GFX9-NEXT:    v_sub_u32_e32 v4, s4, v1
2859; GFX9-NEXT:    v_sub_u32_e32 v1, s8, v2
2860; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
2861; GFX9-NEXT:    v_sub_u32_e32 v2, s5, v3
2862; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
2863; GFX9-NEXT:    v_lshl_or_b32 v1, v2, 16, v1
2864; GFX9-NEXT:    v_lshl_or_b32 v0, v4, 16, v0
2865; GFX9-NEXT:    global_store_dwordx2 v6, v[0:1], s[0:1]
2866; GFX9-NEXT:    s_endpgm
; IR under test: a single vector urem whose result is stored to %out.
2867  %r = urem <4 x i16> %x, %y
2868  store <4 x i16> %r, <4 x i16> addrspace(1)* %out
2869  ret void
2870}
2871
2872define amdgpu_kernel void @sdiv_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x, <4 x i16> %y) {
2873; CHECK-LABEL: @sdiv_v4i16(
2874; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i16> [[X:%.*]], i64 0
2875; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i16> [[Y:%.*]], i64 0
2876; CHECK-NEXT:    [[TMP3:%.*]] = sext i16 [[TMP1]] to i32
2877; CHECK-NEXT:    [[TMP4:%.*]] = sext i16 [[TMP2]] to i32
2878; CHECK-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]]
2879; CHECK-NEXT:    [[TMP6:%.*]] = ashr i32 [[TMP5]], 30
2880; CHECK-NEXT:    [[TMP7:%.*]] = or i32 [[TMP6]], 1
2881; CHECK-NEXT:    [[TMP8:%.*]] = sitofp i32 [[TMP3]] to float
2882; CHECK-NEXT:    [[TMP9:%.*]] = sitofp i32 [[TMP4]] to float
2883; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]])
2884; CHECK-NEXT:    [[TMP11:%.*]] = fmul fast float [[TMP8]], [[TMP10]]
2885; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.trunc.f32(float [[TMP11]])
2886; CHECK-NEXT:    [[TMP13:%.*]] = fneg fast float [[TMP12]]
2887; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP13]], float [[TMP9]], float [[TMP8]])
2888; CHECK-NEXT:    [[TMP15:%.*]] = fptosi float [[TMP12]] to i32
2889; CHECK-NEXT:    [[TMP16:%.*]] = call fast float @llvm.fabs.f32(float [[TMP14]])
2890; CHECK-NEXT:    [[TMP17:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
2891; CHECK-NEXT:    [[TMP18:%.*]] = fcmp fast oge float [[TMP16]], [[TMP17]]
2892; CHECK-NEXT:    [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP7]], i32 0
2893; CHECK-NEXT:    [[TMP20:%.*]] = add i32 [[TMP15]], [[TMP19]]
2894; CHECK-NEXT:    [[TMP21:%.*]] = shl i32 [[TMP20]], 16
2895; CHECK-NEXT:    [[TMP22:%.*]] = ashr i32 [[TMP21]], 16
2896; CHECK-NEXT:    [[TMP23:%.*]] = trunc i32 [[TMP22]] to i16
2897; CHECK-NEXT:    [[TMP24:%.*]] = insertelement <4 x i16> undef, i16 [[TMP23]], i64 0
2898; CHECK-NEXT:    [[TMP25:%.*]] = extractelement <4 x i16> [[X]], i64 1
2899; CHECK-NEXT:    [[TMP26:%.*]] = extractelement <4 x i16> [[Y]], i64 1
2900; CHECK-NEXT:    [[TMP27:%.*]] = sext i16 [[TMP25]] to i32
2901; CHECK-NEXT:    [[TMP28:%.*]] = sext i16 [[TMP26]] to i32
2902; CHECK-NEXT:    [[TMP29:%.*]] = xor i32 [[TMP27]], [[TMP28]]
2903; CHECK-NEXT:    [[TMP30:%.*]] = ashr i32 [[TMP29]], 30
2904; CHECK-NEXT:    [[TMP31:%.*]] = or i32 [[TMP30]], 1
2905; CHECK-NEXT:    [[TMP32:%.*]] = sitofp i32 [[TMP27]] to float
2906; CHECK-NEXT:    [[TMP33:%.*]] = sitofp i32 [[TMP28]] to float
2907; CHECK-NEXT:    [[TMP34:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP33]])
2908; CHECK-NEXT:    [[TMP35:%.*]] = fmul fast float [[TMP32]], [[TMP34]]
2909; CHECK-NEXT:    [[TMP36:%.*]] = call fast float @llvm.trunc.f32(float [[TMP35]])
2910; CHECK-NEXT:    [[TMP37:%.*]] = fneg fast float [[TMP36]]
2911; CHECK-NEXT:    [[TMP38:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP37]], float [[TMP33]], float [[TMP32]])
2912; CHECK-NEXT:    [[TMP39:%.*]] = fptosi float [[TMP36]] to i32
2913; CHECK-NEXT:    [[TMP40:%.*]] = call fast float @llvm.fabs.f32(float [[TMP38]])
2914; CHECK-NEXT:    [[TMP41:%.*]] = call fast float @llvm.fabs.f32(float [[TMP33]])
2915; CHECK-NEXT:    [[TMP42:%.*]] = fcmp fast oge float [[TMP40]], [[TMP41]]
2916; CHECK-NEXT:    [[TMP43:%.*]] = select i1 [[TMP42]], i32 [[TMP31]], i32 0
2917; CHECK-NEXT:    [[TMP44:%.*]] = add i32 [[TMP39]], [[TMP43]]
2918; CHECK-NEXT:    [[TMP45:%.*]] = shl i32 [[TMP44]], 16
2919; CHECK-NEXT:    [[TMP46:%.*]] = ashr i32 [[TMP45]], 16
2920; CHECK-NEXT:    [[TMP47:%.*]] = trunc i32 [[TMP46]] to i16
2921; CHECK-NEXT:    [[TMP48:%.*]] = insertelement <4 x i16> [[TMP24]], i16 [[TMP47]], i64 1
2922; CHECK-NEXT:    [[TMP49:%.*]] = extractelement <4 x i16> [[X]], i64 2
2923; CHECK-NEXT:    [[TMP50:%.*]] = extractelement <4 x i16> [[Y]], i64 2
2924; CHECK-NEXT:    [[TMP51:%.*]] = sext i16 [[TMP49]] to i32
2925; CHECK-NEXT:    [[TMP52:%.*]] = sext i16 [[TMP50]] to i32
2926; CHECK-NEXT:    [[TMP53:%.*]] = xor i32 [[TMP51]], [[TMP52]]
2927; CHECK-NEXT:    [[TMP54:%.*]] = ashr i32 [[TMP53]], 30
2928; CHECK-NEXT:    [[TMP55:%.*]] = or i32 [[TMP54]], 1
2929; CHECK-NEXT:    [[TMP56:%.*]] = sitofp i32 [[TMP51]] to float
2930; CHECK-NEXT:    [[TMP57:%.*]] = sitofp i32 [[TMP52]] to float
2931; CHECK-NEXT:    [[TMP58:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP57]])
2932; CHECK-NEXT:    [[TMP59:%.*]] = fmul fast float [[TMP56]], [[TMP58]]
2933; CHECK-NEXT:    [[TMP60:%.*]] = call fast float @llvm.trunc.f32(float [[TMP59]])
2934; CHECK-NEXT:    [[TMP61:%.*]] = fneg fast float [[TMP60]]
2935; CHECK-NEXT:    [[TMP62:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP61]], float [[TMP57]], float [[TMP56]])
2936; CHECK-NEXT:    [[TMP63:%.*]] = fptosi float [[TMP60]] to i32
2937; CHECK-NEXT:    [[TMP64:%.*]] = call fast float @llvm.fabs.f32(float [[TMP62]])
2938; CHECK-NEXT:    [[TMP65:%.*]] = call fast float @llvm.fabs.f32(float [[TMP57]])
2939; CHECK-NEXT:    [[TMP66:%.*]] = fcmp fast oge float [[TMP64]], [[TMP65]]
2940; CHECK-NEXT:    [[TMP67:%.*]] = select i1 [[TMP66]], i32 [[TMP55]], i32 0
2941; CHECK-NEXT:    [[TMP68:%.*]] = add i32 [[TMP63]], [[TMP67]]
2942; CHECK-NEXT:    [[TMP69:%.*]] = shl i32 [[TMP68]], 16
2943; CHECK-NEXT:    [[TMP70:%.*]] = ashr i32 [[TMP69]], 16
2944; CHECK-NEXT:    [[TMP71:%.*]] = trunc i32 [[TMP70]] to i16
2945; CHECK-NEXT:    [[TMP72:%.*]] = insertelement <4 x i16> [[TMP48]], i16 [[TMP71]], i64 2
2946; CHECK-NEXT:    [[TMP73:%.*]] = extractelement <4 x i16> [[X]], i64 3
2947; CHECK-NEXT:    [[TMP74:%.*]] = extractelement <4 x i16> [[Y]], i64 3
2948; CHECK-NEXT:    [[TMP75:%.*]] = sext i16 [[TMP73]] to i32
2949; CHECK-NEXT:    [[TMP76:%.*]] = sext i16 [[TMP74]] to i32
2950; CHECK-NEXT:    [[TMP77:%.*]] = xor i32 [[TMP75]], [[TMP76]]
2951; CHECK-NEXT:    [[TMP78:%.*]] = ashr i32 [[TMP77]], 30
2952; CHECK-NEXT:    [[TMP79:%.*]] = or i32 [[TMP78]], 1
2953; CHECK-NEXT:    [[TMP80:%.*]] = sitofp i32 [[TMP75]] to float
2954; CHECK-NEXT:    [[TMP81:%.*]] = sitofp i32 [[TMP76]] to float
2955; CHECK-NEXT:    [[TMP82:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP81]])
2956; CHECK-NEXT:    [[TMP83:%.*]] = fmul fast float [[TMP80]], [[TMP82]]
2957; CHECK-NEXT:    [[TMP84:%.*]] = call fast float @llvm.trunc.f32(float [[TMP83]])
2958; CHECK-NEXT:    [[TMP85:%.*]] = fneg fast float [[TMP84]]
2959; CHECK-NEXT:    [[TMP86:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP85]], float [[TMP81]], float [[TMP80]])
2960; CHECK-NEXT:    [[TMP87:%.*]] = fptosi float [[TMP84]] to i32
2961; CHECK-NEXT:    [[TMP88:%.*]] = call fast float @llvm.fabs.f32(float [[TMP86]])
2962; CHECK-NEXT:    [[TMP89:%.*]] = call fast float @llvm.fabs.f32(float [[TMP81]])
2963; CHECK-NEXT:    [[TMP90:%.*]] = fcmp fast oge float [[TMP88]], [[TMP89]]
2964; CHECK-NEXT:    [[TMP91:%.*]] = select i1 [[TMP90]], i32 [[TMP79]], i32 0
2965; CHECK-NEXT:    [[TMP92:%.*]] = add i32 [[TMP87]], [[TMP91]]
2966; CHECK-NEXT:    [[TMP93:%.*]] = shl i32 [[TMP92]], 16
2967; CHECK-NEXT:    [[TMP94:%.*]] = ashr i32 [[TMP93]], 16
2968; CHECK-NEXT:    [[TMP95:%.*]] = trunc i32 [[TMP94]] to i16
2969; CHECK-NEXT:    [[TMP96:%.*]] = insertelement <4 x i16> [[TMP72]], i16 [[TMP95]], i64 3
2970; CHECK-NEXT:    store <4 x i16> [[TMP96]], <4 x i16> addrspace(1)* [[OUT:%.*]], align 8
2971; CHECK-NEXT:    ret void
2972;
2973; GFX6-LABEL: sdiv_v4i16:
2974; GFX6:       ; %bb.0:
2975; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xb
2976; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
2977; GFX6-NEXT:    s_mov_b32 s3, 0xf000
2978; GFX6-NEXT:    s_mov_b32 s2, -1
2979; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2980; GFX6-NEXT:    s_sext_i32_i16 s8, s6
2981; GFX6-NEXT:    v_cvt_f32_i32_e32 v0, s8
2982; GFX6-NEXT:    s_sext_i32_i16 s9, s4
2983; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s9
2984; GFX6-NEXT:    s_xor_b32 s8, s9, s8
2985; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v0
2986; GFX6-NEXT:    s_ashr_i32 s6, s6, 16
2987; GFX6-NEXT:    s_ashr_i32 s8, s8, 30
2988; GFX6-NEXT:    s_or_b32 s8, s8, 1
2989; GFX6-NEXT:    v_mul_f32_e32 v2, v1, v2
2990; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
2991; GFX6-NEXT:    v_mad_f32 v1, -v2, v0, v1
2992; GFX6-NEXT:    v_cvt_i32_f32_e32 v2, v2
2993; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
2994; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s6
2995; GFX6-NEXT:    v_mov_b32_e32 v3, s8
2996; GFX6-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
2997; GFX6-NEXT:    s_ashr_i32 s4, s4, 16
2998; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
2999; GFX6-NEXT:    v_cvt_f32_i32_e32 v2, s4
3000; GFX6-NEXT:    v_rcp_iflag_f32_e32 v3, v1
3001; GFX6-NEXT:    s_xor_b32 s4, s4, s6
3002; GFX6-NEXT:    s_ashr_i32 s4, s4, 30
3003; GFX6-NEXT:    s_or_b32 s4, s4, 1
3004; GFX6-NEXT:    v_mul_f32_e32 v3, v2, v3
3005; GFX6-NEXT:    v_trunc_f32_e32 v3, v3
3006; GFX6-NEXT:    v_mad_f32 v2, -v3, v1, v2
3007; GFX6-NEXT:    v_mov_b32_e32 v4, s4
3008; GFX6-NEXT:    s_sext_i32_i16 s4, s7
3009; GFX6-NEXT:    v_cvt_i32_f32_e32 v3, v3
3010; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, |v1|
3011; GFX6-NEXT:    v_cvt_f32_i32_e32 v2, s4
3012; GFX6-NEXT:    v_cndmask_b32_e32 v1, 0, v4, vcc
3013; GFX6-NEXT:    s_sext_i32_i16 s6, s5
3014; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v1, v3
3015; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s6
3016; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v2
3017; GFX6-NEXT:    s_xor_b32 s4, s6, s4
3018; GFX6-NEXT:    s_ashr_i32 s4, s4, 30
3019; GFX6-NEXT:    s_or_b32 s4, s4, 1
3020; GFX6-NEXT:    v_mul_f32_e32 v4, v1, v4
3021; GFX6-NEXT:    v_trunc_f32_e32 v4, v4
3022; GFX6-NEXT:    v_mad_f32 v1, -v4, v2, v1
3023; GFX6-NEXT:    v_mov_b32_e32 v5, s4
3024; GFX6-NEXT:    s_ashr_i32 s4, s7, 16
3025; GFX6-NEXT:    v_cvt_i32_f32_e32 v4, v4
3026; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v2|
3027; GFX6-NEXT:    v_cvt_f32_i32_e32 v2, s4
3028; GFX6-NEXT:    v_cndmask_b32_e32 v1, 0, v5, vcc
3029; GFX6-NEXT:    s_ashr_i32 s5, s5, 16
3030; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v4
3031; GFX6-NEXT:    v_cvt_f32_i32_e32 v4, s5
3032; GFX6-NEXT:    v_rcp_iflag_f32_e32 v5, v2
3033; GFX6-NEXT:    s_xor_b32 s4, s5, s4
3034; GFX6-NEXT:    s_ashr_i32 s4, s4, 30
3035; GFX6-NEXT:    s_or_b32 s4, s4, 1
3036; GFX6-NEXT:    v_mul_f32_e32 v5, v4, v5
3037; GFX6-NEXT:    v_trunc_f32_e32 v5, v5
3038; GFX6-NEXT:    v_mad_f32 v4, -v5, v2, v4
3039; GFX6-NEXT:    v_cvt_i32_f32_e32 v5, v5
3040; GFX6-NEXT:    v_mov_b32_e32 v6, s4
3041; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v4|, |v2|
3042; GFX6-NEXT:    v_cndmask_b32_e32 v2, 0, v6, vcc
3043; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
3044; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
3045; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
3046; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
3047; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
3048; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
3049; GFX6-NEXT:    v_or_b32_e32 v0, v0, v2
3050; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
3051; GFX6-NEXT:    s_endpgm
3052;
3053; GFX9-LABEL: sdiv_v4i16:
3054; GFX9:       ; %bb.0:
3055; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
3056; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
3057; GFX9-NEXT:    v_mov_b32_e32 v2, 0
3058; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3059; GFX9-NEXT:    s_sext_i32_i16 s0, s6
3060; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s0
3061; GFX9-NEXT:    s_sext_i32_i16 s1, s4
3062; GFX9-NEXT:    v_cvt_f32_i32_e32 v1, s1
3063; GFX9-NEXT:    s_xor_b32 s0, s1, s0
3064; GFX9-NEXT:    v_rcp_iflag_f32_e32 v3, v0
3065; GFX9-NEXT:    s_ashr_i32 s0, s0, 30
3066; GFX9-NEXT:    s_or_b32 s8, s0, 1
3067; GFX9-NEXT:    v_mul_f32_e32 v3, v1, v3
3068; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
3069; GFX9-NEXT:    v_mad_f32 v1, -v3, v0, v1
3070; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v1|, |v0|
3071; GFX9-NEXT:    s_and_b64 s[0:1], s[0:1], exec
3072; GFX9-NEXT:    s_cselect_b32 s0, s8, 0
3073; GFX9-NEXT:    s_ashr_i32 s1, s6, 16
3074; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s1
3075; GFX9-NEXT:    s_ashr_i32 s4, s4, 16
3076; GFX9-NEXT:    v_cvt_f32_i32_e32 v1, s4
3077; GFX9-NEXT:    v_cvt_i32_f32_e32 v3, v3
3078; GFX9-NEXT:    v_rcp_iflag_f32_e32 v4, v0
3079; GFX9-NEXT:    v_add_u32_e32 v3, s0, v3
3080; GFX9-NEXT:    v_mul_f32_e32 v4, v1, v4
3081; GFX9-NEXT:    s_xor_b32 s0, s4, s1
3082; GFX9-NEXT:    v_trunc_f32_e32 v4, v4
3083; GFX9-NEXT:    s_ashr_i32 s0, s0, 30
3084; GFX9-NEXT:    v_mad_f32 v1, -v4, v0, v1
3085; GFX9-NEXT:    s_or_b32 s4, s0, 1
3086; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v1|, |v0|
3087; GFX9-NEXT:    s_and_b64 s[0:1], s[0:1], exec
3088; GFX9-NEXT:    v_cvt_i32_f32_e32 v4, v4
3089; GFX9-NEXT:    s_sext_i32_i16 s1, s7
3090; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s1
3091; GFX9-NEXT:    s_cselect_b32 s0, s4, 0
3092; GFX9-NEXT:    v_add_u32_e32 v4, s0, v4
3093; GFX9-NEXT:    s_sext_i32_i16 s0, s5
3094; GFX9-NEXT:    v_cvt_f32_i32_e32 v1, s0
3095; GFX9-NEXT:    v_rcp_iflag_f32_e32 v5, v0
3096; GFX9-NEXT:    s_xor_b32 s0, s0, s1
3097; GFX9-NEXT:    s_ashr_i32 s0, s0, 30
3098; GFX9-NEXT:    s_or_b32 s4, s0, 1
3099; GFX9-NEXT:    v_mul_f32_e32 v5, v1, v5
3100; GFX9-NEXT:    v_trunc_f32_e32 v5, v5
3101; GFX9-NEXT:    v_mad_f32 v1, -v5, v0, v1
3102; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v1|, |v0|
3103; GFX9-NEXT:    s_and_b64 s[0:1], s[0:1], exec
3104; GFX9-NEXT:    v_cvt_i32_f32_e32 v5, v5
3105; GFX9-NEXT:    s_cselect_b32 s0, s4, 0
3106; GFX9-NEXT:    s_ashr_i32 s1, s7, 16
3107; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s1
3108; GFX9-NEXT:    v_add_u32_e32 v1, s0, v5
3109; GFX9-NEXT:    s_ashr_i32 s0, s5, 16
3110; GFX9-NEXT:    v_cvt_f32_i32_e32 v5, s0
3111; GFX9-NEXT:    v_rcp_iflag_f32_e32 v6, v0
3112; GFX9-NEXT:    s_xor_b32 s0, s0, s1
3113; GFX9-NEXT:    s_ashr_i32 s0, s0, 30
3114; GFX9-NEXT:    s_or_b32 s4, s0, 1
3115; GFX9-NEXT:    v_mul_f32_e32 v6, v5, v6
3116; GFX9-NEXT:    v_trunc_f32_e32 v6, v6
3117; GFX9-NEXT:    v_mad_f32 v5, -v6, v0, v5
3118; GFX9-NEXT:    v_cvt_i32_f32_e32 v6, v6
3119; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v5|, |v0|
3120; GFX9-NEXT:    s_and_b64 s[0:1], s[0:1], exec
3121; GFX9-NEXT:    s_cselect_b32 s0, s4, 0
3122; GFX9-NEXT:    v_add_u32_e32 v0, s0, v6
3123; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
3124; GFX9-NEXT:    v_lshl_or_b32 v1, v0, 16, v1
3125; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v3
3126; GFX9-NEXT:    v_lshl_or_b32 v0, v4, 16, v0
3127; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
3128; GFX9-NEXT:    s_endpgm
3129  %r = sdiv <4 x i16> %x, %y
3130  store <4 x i16> %r, <4 x i16> addrspace(1)* %out
3131  ret void
3132}
3133
; srem_v4i16: signed remainder of two <4 x i16> kernel arguments, stored to %out.
; The opt-level checks below show each of the four lanes handled separately:
;   - both lane values are sign-extended to i32 and converted to float;
;   - a quotient estimate q is formed as trunc(x * rcp(y)) and converted back
;     with fptosi;
;   - fmad.ftz(-q, y, x) is compared by magnitude against y (presumably this is
;     the residual x - q*y; confirm against the intrinsic's definition); when
;     it is >= |y| the estimate is adjusted by or(ashr(xor(x, y), 30), 1),
;     which is +1 when the sign-extended operands have equal signs and -1 when
;     they differ, otherwise by 0;
;   - the remainder is x - q * y, followed by shl/ashr by 16 and a trunc to
;     i16 before it is inserted into the result vector.
; The two assembly check groups pin the llc output for tahiti and gfx900 (see
; the RUN lines at the top of the file).
3134define amdgpu_kernel void @srem_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x, <4 x i16> %y) {
3135; CHECK-LABEL: @srem_v4i16(
3136; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i16> [[X:%.*]], i64 0
3137; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i16> [[Y:%.*]], i64 0
3138; CHECK-NEXT:    [[TMP3:%.*]] = sext i16 [[TMP1]] to i32
3139; CHECK-NEXT:    [[TMP4:%.*]] = sext i16 [[TMP2]] to i32
3140; CHECK-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]]
3141; CHECK-NEXT:    [[TMP6:%.*]] = ashr i32 [[TMP5]], 30
3142; CHECK-NEXT:    [[TMP7:%.*]] = or i32 [[TMP6]], 1
3143; CHECK-NEXT:    [[TMP8:%.*]] = sitofp i32 [[TMP3]] to float
3144; CHECK-NEXT:    [[TMP9:%.*]] = sitofp i32 [[TMP4]] to float
3145; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]])
3146; CHECK-NEXT:    [[TMP11:%.*]] = fmul fast float [[TMP8]], [[TMP10]]
3147; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.trunc.f32(float [[TMP11]])
3148; CHECK-NEXT:    [[TMP13:%.*]] = fneg fast float [[TMP12]]
3149; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP13]], float [[TMP9]], float [[TMP8]])
3150; CHECK-NEXT:    [[TMP15:%.*]] = fptosi float [[TMP12]] to i32
3151; CHECK-NEXT:    [[TMP16:%.*]] = call fast float @llvm.fabs.f32(float [[TMP14]])
3152; CHECK-NEXT:    [[TMP17:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
3153; CHECK-NEXT:    [[TMP18:%.*]] = fcmp fast oge float [[TMP16]], [[TMP17]]
3154; CHECK-NEXT:    [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP7]], i32 0
3155; CHECK-NEXT:    [[TMP20:%.*]] = add i32 [[TMP15]], [[TMP19]]
3156; CHECK-NEXT:    [[TMP21:%.*]] = mul i32 [[TMP20]], [[TMP4]]
3157; CHECK-NEXT:    [[TMP22:%.*]] = sub i32 [[TMP3]], [[TMP21]]
3158; CHECK-NEXT:    [[TMP23:%.*]] = shl i32 [[TMP22]], 16
3159; CHECK-NEXT:    [[TMP24:%.*]] = ashr i32 [[TMP23]], 16
3160; CHECK-NEXT:    [[TMP25:%.*]] = trunc i32 [[TMP24]] to i16
3161; CHECK-NEXT:    [[TMP26:%.*]] = insertelement <4 x i16> undef, i16 [[TMP25]], i64 0
3162; CHECK-NEXT:    [[TMP27:%.*]] = extractelement <4 x i16> [[X]], i64 1
3163; CHECK-NEXT:    [[TMP28:%.*]] = extractelement <4 x i16> [[Y]], i64 1
3164; CHECK-NEXT:    [[TMP29:%.*]] = sext i16 [[TMP27]] to i32
3165; CHECK-NEXT:    [[TMP30:%.*]] = sext i16 [[TMP28]] to i32
3166; CHECK-NEXT:    [[TMP31:%.*]] = xor i32 [[TMP29]], [[TMP30]]
3167; CHECK-NEXT:    [[TMP32:%.*]] = ashr i32 [[TMP31]], 30
3168; CHECK-NEXT:    [[TMP33:%.*]] = or i32 [[TMP32]], 1
3169; CHECK-NEXT:    [[TMP34:%.*]] = sitofp i32 [[TMP29]] to float
3170; CHECK-NEXT:    [[TMP35:%.*]] = sitofp i32 [[TMP30]] to float
3171; CHECK-NEXT:    [[TMP36:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP35]])
3172; CHECK-NEXT:    [[TMP37:%.*]] = fmul fast float [[TMP34]], [[TMP36]]
3173; CHECK-NEXT:    [[TMP38:%.*]] = call fast float @llvm.trunc.f32(float [[TMP37]])
3174; CHECK-NEXT:    [[TMP39:%.*]] = fneg fast float [[TMP38]]
3175; CHECK-NEXT:    [[TMP40:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP39]], float [[TMP35]], float [[TMP34]])
3176; CHECK-NEXT:    [[TMP41:%.*]] = fptosi float [[TMP38]] to i32
3177; CHECK-NEXT:    [[TMP42:%.*]] = call fast float @llvm.fabs.f32(float [[TMP40]])
3178; CHECK-NEXT:    [[TMP43:%.*]] = call fast float @llvm.fabs.f32(float [[TMP35]])
3179; CHECK-NEXT:    [[TMP44:%.*]] = fcmp fast oge float [[TMP42]], [[TMP43]]
3180; CHECK-NEXT:    [[TMP45:%.*]] = select i1 [[TMP44]], i32 [[TMP33]], i32 0
3181; CHECK-NEXT:    [[TMP46:%.*]] = add i32 [[TMP41]], [[TMP45]]
3182; CHECK-NEXT:    [[TMP47:%.*]] = mul i32 [[TMP46]], [[TMP30]]
3183; CHECK-NEXT:    [[TMP48:%.*]] = sub i32 [[TMP29]], [[TMP47]]
3184; CHECK-NEXT:    [[TMP49:%.*]] = shl i32 [[TMP48]], 16
3185; CHECK-NEXT:    [[TMP50:%.*]] = ashr i32 [[TMP49]], 16
3186; CHECK-NEXT:    [[TMP51:%.*]] = trunc i32 [[TMP50]] to i16
3187; CHECK-NEXT:    [[TMP52:%.*]] = insertelement <4 x i16> [[TMP26]], i16 [[TMP51]], i64 1
3188; CHECK-NEXT:    [[TMP53:%.*]] = extractelement <4 x i16> [[X]], i64 2
3189; CHECK-NEXT:    [[TMP54:%.*]] = extractelement <4 x i16> [[Y]], i64 2
3190; CHECK-NEXT:    [[TMP55:%.*]] = sext i16 [[TMP53]] to i32
3191; CHECK-NEXT:    [[TMP56:%.*]] = sext i16 [[TMP54]] to i32
3192; CHECK-NEXT:    [[TMP57:%.*]] = xor i32 [[TMP55]], [[TMP56]]
3193; CHECK-NEXT:    [[TMP58:%.*]] = ashr i32 [[TMP57]], 30
3194; CHECK-NEXT:    [[TMP59:%.*]] = or i32 [[TMP58]], 1
3195; CHECK-NEXT:    [[TMP60:%.*]] = sitofp i32 [[TMP55]] to float
3196; CHECK-NEXT:    [[TMP61:%.*]] = sitofp i32 [[TMP56]] to float
3197; CHECK-NEXT:    [[TMP62:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP61]])
3198; CHECK-NEXT:    [[TMP63:%.*]] = fmul fast float [[TMP60]], [[TMP62]]
3199; CHECK-NEXT:    [[TMP64:%.*]] = call fast float @llvm.trunc.f32(float [[TMP63]])
3200; CHECK-NEXT:    [[TMP65:%.*]] = fneg fast float [[TMP64]]
3201; CHECK-NEXT:    [[TMP66:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP65]], float [[TMP61]], float [[TMP60]])
3202; CHECK-NEXT:    [[TMP67:%.*]] = fptosi float [[TMP64]] to i32
3203; CHECK-NEXT:    [[TMP68:%.*]] = call fast float @llvm.fabs.f32(float [[TMP66]])
3204; CHECK-NEXT:    [[TMP69:%.*]] = call fast float @llvm.fabs.f32(float [[TMP61]])
3205; CHECK-NEXT:    [[TMP70:%.*]] = fcmp fast oge float [[TMP68]], [[TMP69]]
3206; CHECK-NEXT:    [[TMP71:%.*]] = select i1 [[TMP70]], i32 [[TMP59]], i32 0
3207; CHECK-NEXT:    [[TMP72:%.*]] = add i32 [[TMP67]], [[TMP71]]
3208; CHECK-NEXT:    [[TMP73:%.*]] = mul i32 [[TMP72]], [[TMP56]]
3209; CHECK-NEXT:    [[TMP74:%.*]] = sub i32 [[TMP55]], [[TMP73]]
3210; CHECK-NEXT:    [[TMP75:%.*]] = shl i32 [[TMP74]], 16
3211; CHECK-NEXT:    [[TMP76:%.*]] = ashr i32 [[TMP75]], 16
3212; CHECK-NEXT:    [[TMP77:%.*]] = trunc i32 [[TMP76]] to i16
3213; CHECK-NEXT:    [[TMP78:%.*]] = insertelement <4 x i16> [[TMP52]], i16 [[TMP77]], i64 2
3214; CHECK-NEXT:    [[TMP79:%.*]] = extractelement <4 x i16> [[X]], i64 3
3215; CHECK-NEXT:    [[TMP80:%.*]] = extractelement <4 x i16> [[Y]], i64 3
3216; CHECK-NEXT:    [[TMP81:%.*]] = sext i16 [[TMP79]] to i32
3217; CHECK-NEXT:    [[TMP82:%.*]] = sext i16 [[TMP80]] to i32
3218; CHECK-NEXT:    [[TMP83:%.*]] = xor i32 [[TMP81]], [[TMP82]]
3219; CHECK-NEXT:    [[TMP84:%.*]] = ashr i32 [[TMP83]], 30
3220; CHECK-NEXT:    [[TMP85:%.*]] = or i32 [[TMP84]], 1
3221; CHECK-NEXT:    [[TMP86:%.*]] = sitofp i32 [[TMP81]] to float
3222; CHECK-NEXT:    [[TMP87:%.*]] = sitofp i32 [[TMP82]] to float
3223; CHECK-NEXT:    [[TMP88:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP87]])
3224; CHECK-NEXT:    [[TMP89:%.*]] = fmul fast float [[TMP86]], [[TMP88]]
3225; CHECK-NEXT:    [[TMP90:%.*]] = call fast float @llvm.trunc.f32(float [[TMP89]])
3226; CHECK-NEXT:    [[TMP91:%.*]] = fneg fast float [[TMP90]]
3227; CHECK-NEXT:    [[TMP92:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP91]], float [[TMP87]], float [[TMP86]])
3228; CHECK-NEXT:    [[TMP93:%.*]] = fptosi float [[TMP90]] to i32
3229; CHECK-NEXT:    [[TMP94:%.*]] = call fast float @llvm.fabs.f32(float [[TMP92]])
3230; CHECK-NEXT:    [[TMP95:%.*]] = call fast float @llvm.fabs.f32(float [[TMP87]])
3231; CHECK-NEXT:    [[TMP96:%.*]] = fcmp fast oge float [[TMP94]], [[TMP95]]
3232; CHECK-NEXT:    [[TMP97:%.*]] = select i1 [[TMP96]], i32 [[TMP85]], i32 0
3233; CHECK-NEXT:    [[TMP98:%.*]] = add i32 [[TMP93]], [[TMP97]]
3234; CHECK-NEXT:    [[TMP99:%.*]] = mul i32 [[TMP98]], [[TMP82]]
3235; CHECK-NEXT:    [[TMP100:%.*]] = sub i32 [[TMP81]], [[TMP99]]
3236; CHECK-NEXT:    [[TMP101:%.*]] = shl i32 [[TMP100]], 16
3237; CHECK-NEXT:    [[TMP102:%.*]] = ashr i32 [[TMP101]], 16
3238; CHECK-NEXT:    [[TMP103:%.*]] = trunc i32 [[TMP102]] to i16
3239; CHECK-NEXT:    [[TMP104:%.*]] = insertelement <4 x i16> [[TMP78]], i16 [[TMP103]], i64 3
3240; CHECK-NEXT:    store <4 x i16> [[TMP104]], <4 x i16> addrspace(1)* [[OUT:%.*]], align 8
3241; CHECK-NEXT:    ret void
3242;
3243; GFX6-LABEL: srem_v4i16:
3244; GFX6:       ; %bb.0:
3245; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xb
3246; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
3247; GFX6-NEXT:    s_mov_b32 s3, 0xf000
3248; GFX6-NEXT:    s_mov_b32 s2, -1
3249; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
3250; GFX6-NEXT:    s_sext_i32_i16 s8, s6
3251; GFX6-NEXT:    v_cvt_f32_i32_e32 v0, s8
3252; GFX6-NEXT:    s_sext_i32_i16 s9, s4
3253; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s9
3254; GFX6-NEXT:    s_xor_b32 s8, s9, s8
3255; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v0
3256; GFX6-NEXT:    s_ashr_i32 s8, s8, 30
3257; GFX6-NEXT:    s_or_b32 s8, s8, 1
3258; GFX6-NEXT:    v_mov_b32_e32 v3, s8
3259; GFX6-NEXT:    v_mul_f32_e32 v2, v1, v2
3260; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
3261; GFX6-NEXT:    v_mad_f32 v1, -v2, v0, v1
3262; GFX6-NEXT:    v_cvt_i32_f32_e32 v2, v2
3263; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
3264; GFX6-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
3265; GFX6-NEXT:    v_mov_b32_e32 v1, s4
3266; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
3267; GFX6-NEXT:    v_mov_b32_e32 v2, s6
3268; GFX6-NEXT:    v_alignbit_b32 v2, s7, v2, 16
3269; GFX6-NEXT:    v_bfe_i32 v3, v2, 0, 16
3270; GFX6-NEXT:    v_cvt_f32_i32_e32 v4, v3
3271; GFX6-NEXT:    v_alignbit_b32 v1, s5, v1, 16
3272; GFX6-NEXT:    v_bfe_i32 v5, v1, 0, 16
3273; GFX6-NEXT:    v_cvt_f32_i32_e32 v6, v5
3274; GFX6-NEXT:    v_rcp_iflag_f32_e32 v7, v4
3275; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s6
3276; GFX6-NEXT:    v_xor_b32_e32 v3, v5, v3
3277; GFX6-NEXT:    v_ashrrev_i32_e32 v3, 30, v3
3278; GFX6-NEXT:    v_mul_f32_e32 v5, v6, v7
3279; GFX6-NEXT:    v_trunc_f32_e32 v5, v5
3280; GFX6-NEXT:    v_mad_f32 v6, -v5, v4, v6
3281; GFX6-NEXT:    v_cvt_i32_f32_e32 v5, v5
3282; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
3283; GFX6-NEXT:    v_or_b32_e32 v3, 1, v3
3284; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v6|, |v4|
3285; GFX6-NEXT:    v_cndmask_b32_e32 v3, 0, v3, vcc
3286; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
3287; GFX6-NEXT:    s_sext_i32_i16 s4, s7
3288; GFX6-NEXT:    v_mul_lo_u32 v2, v3, v2
3289; GFX6-NEXT:    v_cvt_f32_i32_e32 v3, s4
3290; GFX6-NEXT:    s_sext_i32_i16 s6, s5
3291; GFX6-NEXT:    s_xor_b32 s4, s6, s4
3292; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v2
3293; GFX6-NEXT:    v_cvt_f32_i32_e32 v2, s6
3294; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v3
3295; GFX6-NEXT:    s_ashr_i32 s4, s4, 30
3296; GFX6-NEXT:    s_or_b32 s4, s4, 1
3297; GFX6-NEXT:    v_mov_b32_e32 v5, s4
3298; GFX6-NEXT:    v_mul_f32_e32 v4, v2, v4
3299; GFX6-NEXT:    v_trunc_f32_e32 v4, v4
3300; GFX6-NEXT:    v_mad_f32 v2, -v4, v3, v2
3301; GFX6-NEXT:    v_cvt_i32_f32_e32 v4, v4
3302; GFX6-NEXT:    s_ashr_i32 s4, s7, 16
3303; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, |v3|
3304; GFX6-NEXT:    v_cvt_f32_i32_e32 v3, s4
3305; GFX6-NEXT:    v_cndmask_b32_e32 v2, 0, v5, vcc
3306; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
3307; GFX6-NEXT:    v_mul_lo_u32 v2, v2, s7
3308; GFX6-NEXT:    s_lshr_b32 s6, s7, 16
3309; GFX6-NEXT:    s_ashr_i32 s7, s5, 16
3310; GFX6-NEXT:    v_cvt_f32_i32_e32 v4, s7
3311; GFX6-NEXT:    v_rcp_iflag_f32_e32 v5, v3
3312; GFX6-NEXT:    s_xor_b32 s4, s7, s4
3313; GFX6-NEXT:    s_ashr_i32 s4, s4, 30
3314; GFX6-NEXT:    s_or_b32 s4, s4, 1
3315; GFX6-NEXT:    v_mul_f32_e32 v5, v4, v5
3316; GFX6-NEXT:    v_trunc_f32_e32 v5, v5
3317; GFX6-NEXT:    v_mad_f32 v4, -v5, v3, v4
3318; GFX6-NEXT:    v_cvt_i32_f32_e32 v5, v5
3319; GFX6-NEXT:    v_mov_b32_e32 v6, s4
3320; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v4|, |v3|
3321; GFX6-NEXT:    v_cndmask_b32_e32 v3, 0, v6, vcc
3322; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
3323; GFX6-NEXT:    v_mul_lo_u32 v3, v3, s6
3324; GFX6-NEXT:    s_lshr_b32 s4, s5, 16
3325; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s5, v2
3326; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s4, v3
3327; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
3328; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
3329; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
3330; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
3331; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v2
3332; GFX6-NEXT:    v_or_b32_e32 v1, v2, v1
3333; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
3334; GFX6-NEXT:    s_endpgm
3335;
3336; GFX9-LABEL: srem_v4i16:
3337; GFX9:       ; %bb.0:
3338; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
3339; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
3340; GFX9-NEXT:    v_mov_b32_e32 v2, 0
3341; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3342; GFX9-NEXT:    s_sext_i32_i16 s8, s6
3343; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s8
3344; GFX9-NEXT:    s_sext_i32_i16 s9, s4
3345; GFX9-NEXT:    v_cvt_f32_i32_e32 v1, s9
3346; GFX9-NEXT:    s_xor_b32 s0, s9, s8
3347; GFX9-NEXT:    v_rcp_iflag_f32_e32 v3, v0
3348; GFX9-NEXT:    s_ashr_i32 s0, s0, 30
3349; GFX9-NEXT:    s_or_b32 s10, s0, 1
3350; GFX9-NEXT:    v_mul_f32_e32 v3, v1, v3
3351; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
3352; GFX9-NEXT:    v_mad_f32 v1, -v3, v0, v1
3353; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v1|, |v0|
3354; GFX9-NEXT:    s_and_b64 s[0:1], s[0:1], exec
3355; GFX9-NEXT:    s_cselect_b32 s0, s10, 0
3356; GFX9-NEXT:    s_ashr_i32 s6, s6, 16
3357; GFX9-NEXT:    v_cvt_i32_f32_e32 v3, v3
3358; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s6
3359; GFX9-NEXT:    s_ashr_i32 s4, s4, 16
3360; GFX9-NEXT:    v_add_u32_e32 v1, s0, v3
3361; GFX9-NEXT:    v_cvt_f32_i32_e32 v3, s4
3362; GFX9-NEXT:    v_rcp_iflag_f32_e32 v4, v0
3363; GFX9-NEXT:    s_xor_b32 s0, s4, s6
3364; GFX9-NEXT:    s_ashr_i32 s0, s0, 30
3365; GFX9-NEXT:    v_mul_lo_u32 v1, v1, s8
3366; GFX9-NEXT:    v_mul_f32_e32 v4, v3, v4
3367; GFX9-NEXT:    v_trunc_f32_e32 v4, v4
3368; GFX9-NEXT:    v_mad_f32 v3, -v4, v0, v3
3369; GFX9-NEXT:    s_or_b32 s8, s0, 1
3370; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v3|, |v0|
3371; GFX9-NEXT:    v_cvt_i32_f32_e32 v4, v4
3372; GFX9-NEXT:    s_and_b64 s[0:1], s[0:1], exec
3373; GFX9-NEXT:    s_cselect_b32 s0, s8, 0
3374; GFX9-NEXT:    s_sext_i32_i16 s8, s7
3375; GFX9-NEXT:    v_cvt_f32_i32_e32 v3, s8
3376; GFX9-NEXT:    v_add_u32_e32 v0, s0, v4
3377; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s6
3378; GFX9-NEXT:    s_sext_i32_i16 s6, s5
3379; GFX9-NEXT:    v_cvt_f32_i32_e32 v4, s6
3380; GFX9-NEXT:    v_rcp_iflag_f32_e32 v5, v3
3381; GFX9-NEXT:    s_xor_b32 s0, s6, s8
3382; GFX9-NEXT:    s_ashr_i32 s0, s0, 30
3383; GFX9-NEXT:    s_or_b32 s10, s0, 1
3384; GFX9-NEXT:    v_mul_f32_e32 v5, v4, v5
3385; GFX9-NEXT:    v_trunc_f32_e32 v5, v5
3386; GFX9-NEXT:    v_mad_f32 v4, -v5, v3, v4
3387; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v4|, |v3|
3388; GFX9-NEXT:    s_and_b64 s[0:1], s[0:1], exec
3389; GFX9-NEXT:    s_cselect_b32 s0, s10, 0
3390; GFX9-NEXT:    s_ashr_i32 s7, s7, 16
3391; GFX9-NEXT:    v_cvt_i32_f32_e32 v5, v5
3392; GFX9-NEXT:    v_cvt_f32_i32_e32 v4, s7
3393; GFX9-NEXT:    s_ashr_i32 s5, s5, 16
3394; GFX9-NEXT:    v_sub_u32_e32 v0, s4, v0
3395; GFX9-NEXT:    v_add_u32_e32 v3, s0, v5
3396; GFX9-NEXT:    v_cvt_f32_i32_e32 v5, s5
3397; GFX9-NEXT:    v_rcp_iflag_f32_e32 v6, v4
3398; GFX9-NEXT:    s_xor_b32 s0, s5, s7
3399; GFX9-NEXT:    s_ashr_i32 s0, s0, 30
3400; GFX9-NEXT:    v_mul_lo_u32 v3, v3, s8
3401; GFX9-NEXT:    v_mul_f32_e32 v6, v5, v6
3402; GFX9-NEXT:    v_trunc_f32_e32 v6, v6
3403; GFX9-NEXT:    v_mad_f32 v5, -v6, v4, v5
3404; GFX9-NEXT:    v_cvt_i32_f32_e32 v6, v6
3405; GFX9-NEXT:    s_or_b32 s8, s0, 1
3406; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v5|, |v4|
3407; GFX9-NEXT:    s_and_b64 s[0:1], s[0:1], exec
3408; GFX9-NEXT:    s_cselect_b32 s0, s8, 0
3409; GFX9-NEXT:    v_add_u32_e32 v4, s0, v6
3410; GFX9-NEXT:    v_mul_lo_u32 v4, v4, s7
3411; GFX9-NEXT:    v_sub_u32_e32 v5, s9, v1
3412; GFX9-NEXT:    v_sub_u32_e32 v1, s6, v3
3413; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
3414; GFX9-NEXT:    v_sub_u32_e32 v3, s5, v4
3415; GFX9-NEXT:    v_lshl_or_b32 v1, v3, 16, v1
3416; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff, v5
3417; GFX9-NEXT:    v_lshl_or_b32 v0, v0, 16, v3
3418; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
3419; GFX9-NEXT:    s_endpgm
; IR under test: a single vector srem whose result is stored to %out.
3420  %r = srem <4 x i16> %x, %y
3421  store <4 x i16> %r, <4 x i16> addrspace(1)* %out
3422  ret void
3423}
3424
; udiv_i3: unsigned division on the 3-bit integer type i3, stored to %out.
; The opt-level checks below show the expansion:
;   - both operands are zero-extended to i32 and converted with uitofp;
;   - a quotient estimate q is formed as trunc(x * rcp(y)) and converted back
;     with fptoui;
;   - 1 is added to q when |fmad.ftz(-q, y, x)| >= |y| (presumably the residual
;     x - q*y; confirm against the intrinsic's definition), otherwise 0;
;   - the sum is masked with 7 and truncated to i3 before the store.
; The two assembly check groups pin the llc output for tahiti and gfx900 (see
; the RUN lines at the top of the file). There both operands are taken from one
; loaded dword: x via an AND with 7 and y via s_bfe_u32 with 0x30008
; (NOTE(review): presumably a 3-bit field at bit offset 8 -- confirm against the
; ISA documentation).
3425define amdgpu_kernel void @udiv_i3(i3 addrspace(1)* %out, i3 %x, i3 %y) {
3426; CHECK-LABEL: @udiv_i3(
3427; CHECK-NEXT:    [[TMP1:%.*]] = zext i3 [[X:%.*]] to i32
3428; CHECK-NEXT:    [[TMP2:%.*]] = zext i3 [[Y:%.*]] to i32
3429; CHECK-NEXT:    [[TMP3:%.*]] = uitofp i32 [[TMP1]] to float
3430; CHECK-NEXT:    [[TMP4:%.*]] = uitofp i32 [[TMP2]] to float
3431; CHECK-NEXT:    [[TMP5:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP4]])
3432; CHECK-NEXT:    [[TMP6:%.*]] = fmul fast float [[TMP3]], [[TMP5]]
3433; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.trunc.f32(float [[TMP6]])
3434; CHECK-NEXT:    [[TMP8:%.*]] = fneg fast float [[TMP7]]
3435; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP8]], float [[TMP4]], float [[TMP3]])
3436; CHECK-NEXT:    [[TMP10:%.*]] = fptoui float [[TMP7]] to i32
3437; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
3438; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.fabs.f32(float [[TMP4]])
3439; CHECK-NEXT:    [[TMP13:%.*]] = fcmp fast oge float [[TMP11]], [[TMP12]]
3440; CHECK-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 1, i32 0
3441; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]]
3442; CHECK-NEXT:    [[TMP16:%.*]] = and i32 [[TMP15]], 7
3443; CHECK-NEXT:    [[TMP17:%.*]] = trunc i32 [[TMP16]] to i3
3444; CHECK-NEXT:    store i3 [[TMP17]], i3 addrspace(1)* [[OUT:%.*]], align 1
3445; CHECK-NEXT:    ret void
3446;
3447; GFX6-LABEL: udiv_i3:
3448; GFX6:       ; %bb.0:
3449; GFX6-NEXT:    s_load_dword s4, s[0:1], 0xb
3450; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
3451; GFX6-NEXT:    s_mov_b32 s3, 0xf000
3452; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
3453; GFX6-NEXT:    s_bfe_u32 s2, s4, 0x30008
3454; GFX6-NEXT:    v_cvt_f32_ubyte0_e32 v0, s2
3455; GFX6-NEXT:    v_rcp_iflag_f32_e32 v1, v0
3456; GFX6-NEXT:    s_and_b32 s4, s4, 7
3457; GFX6-NEXT:    v_cvt_f32_ubyte0_e32 v2, s4
3458; GFX6-NEXT:    s_mov_b32 s2, -1
3459; GFX6-NEXT:    v_mul_f32_e32 v1, v2, v1
3460; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
3461; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v1
3462; GFX6-NEXT:    v_mad_f32 v1, -v1, v0, v2
3463; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
3464; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, 0, v3, vcc
3465; GFX6-NEXT:    v_and_b32_e32 v0, 7, v0
3466; GFX6-NEXT:    buffer_store_byte v0, off, s[0:3], 0
3467; GFX6-NEXT:    s_endpgm
3468;
3469; GFX9-LABEL: udiv_i3:
3470; GFX9:       ; %bb.0:
3471; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x2c
3472; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
3473; GFX9-NEXT:    v_mov_b32_e32 v2, 0
3474; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3475; GFX9-NEXT:    s_bfe_u32 s0, s4, 0x30008
3476; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v0, s0
3477; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v0
3478; GFX9-NEXT:    s_and_b32 s0, s4, 7
3479; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v3, s0
3480; GFX9-NEXT:    v_mul_f32_e32 v1, v3, v1
3481; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
3482; GFX9-NEXT:    v_cvt_u32_f32_e32 v4, v1
3483; GFX9-NEXT:    v_mad_f32 v1, -v1, v0, v3
3484; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
3485; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v4, vcc
3486; GFX9-NEXT:    v_and_b32_e32 v0, 7, v0
3487; GFX9-NEXT:    global_store_byte v2, v0, s[2:3]
3488; GFX9-NEXT:    s_endpgm
; IR under test: a single scalar udiv on i3 whose result is stored to %out.
3489  %r = udiv i3 %x, %y
3490  store i3 %r, i3 addrspace(1)* %out
3491  ret void
3492}
3493
; urem_i3: unsigned remainder on the 3-bit integer type i3, stored to %out.
; The opt-level checks below show the expansion:
;   - both operands are zero-extended to i32 and converted with uitofp;
;   - a quotient estimate q is formed as trunc(x * rcp(y)) and converted back
;     with fptoui;
;   - 1 is added to q when |fmad.ftz(-q, y, x)| >= |y| (presumably the residual
;     x - q*y; confirm against the intrinsic's definition), otherwise 0;
;   - the remainder is then computed in integers as x - q * y, masked with 7
;     and truncated to i3 before the store.
; The two assembly check groups pin the llc output for tahiti and gfx900 (see
; the RUN lines at the top of the file).
3494define amdgpu_kernel void @urem_i3(i3 addrspace(1)* %out, i3 %x, i3 %y) {
3495; CHECK-LABEL: @urem_i3(
3496; CHECK-NEXT:    [[TMP1:%.*]] = zext i3 [[X:%.*]] to i32
3497; CHECK-NEXT:    [[TMP2:%.*]] = zext i3 [[Y:%.*]] to i32
3498; CHECK-NEXT:    [[TMP3:%.*]] = uitofp i32 [[TMP1]] to float
3499; CHECK-NEXT:    [[TMP4:%.*]] = uitofp i32 [[TMP2]] to float
3500; CHECK-NEXT:    [[TMP5:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP4]])
3501; CHECK-NEXT:    [[TMP6:%.*]] = fmul fast float [[TMP3]], [[TMP5]]
3502; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.trunc.f32(float [[TMP6]])
3503; CHECK-NEXT:    [[TMP8:%.*]] = fneg fast float [[TMP7]]
3504; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP8]], float [[TMP4]], float [[TMP3]])
3505; CHECK-NEXT:    [[TMP10:%.*]] = fptoui float [[TMP7]] to i32
3506; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
3507; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.fabs.f32(float [[TMP4]])
3508; CHECK-NEXT:    [[TMP13:%.*]] = fcmp fast oge float [[TMP11]], [[TMP12]]
3509; CHECK-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 1, i32 0
3510; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]]
3511; CHECK-NEXT:    [[TMP16:%.*]] = mul i32 [[TMP15]], [[TMP2]]
3512; CHECK-NEXT:    [[TMP17:%.*]] = sub i32 [[TMP1]], [[TMP16]]
3513; CHECK-NEXT:    [[TMP18:%.*]] = and i32 [[TMP17]], 7
3514; CHECK-NEXT:    [[TMP19:%.*]] = trunc i32 [[TMP18]] to i3
3515; CHECK-NEXT:    store i3 [[TMP19]], i3 addrspace(1)* [[OUT:%.*]], align 1
3516; CHECK-NEXT:    ret void
3517;
3518; GFX6-LABEL: urem_i3:
3519; GFX6:       ; %bb.0:
3520; GFX6-NEXT:    s_load_dword s4, s[0:1], 0xb
3521; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
3522; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
3523; GFX6-NEXT:    s_bfe_u32 s2, s4, 0x30008
3524; GFX6-NEXT:    v_cvt_f32_ubyte0_e32 v0, s2
3525; GFX6-NEXT:    v_rcp_iflag_f32_e32 v1, v0
3526; GFX6-NEXT:    s_and_b32 s3, s4, 7
3527; GFX6-NEXT:    v_cvt_f32_ubyte0_e32 v2, s3
3528; GFX6-NEXT:    s_lshr_b32 s2, s4, 8
3529; GFX6-NEXT:    v_mul_f32_e32 v1, v2, v1
3530; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
3531; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v1
3532; GFX6-NEXT:    v_mad_f32 v1, -v1, v0, v2
3533; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
3534; GFX6-NEXT:    s_mov_b32 s3, 0xf000
3535; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, 0, v3, vcc
3536; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s2
3537; GFX6-NEXT:    s_mov_b32 s2, -1
3538; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
3539; GFX6-NEXT:    v_and_b32_e32 v0, 7, v0
3540; GFX6-NEXT:    buffer_store_byte v0, off, s[0:3], 0
3541; GFX6-NEXT:    s_endpgm
3542;
3543; GFX9-LABEL: urem_i3:
3544; GFX9:       ; %bb.0:
3545; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x2c
3546; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3547; GFX9-NEXT:    s_bfe_u32 s3, s2, 0x30008
3548; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v0, s3
3549; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v0
3550; GFX9-NEXT:    s_and_b32 s4, s2, 7
3551; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v2, s4
3552; GFX9-NEXT:    s_lshr_b32 s3, s2, 8
3553; GFX9-NEXT:    v_mul_f32_e32 v1, v2, v1
3554; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
3555; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v1
3556; GFX9-NEXT:    v_mad_f32 v1, -v1, v0, v2
3557; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
3558; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3559; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v3, vcc
3560; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s3
3561; GFX9-NEXT:    v_mov_b32_e32 v1, 0
3562; GFX9-NEXT:    v_sub_u32_e32 v0, s2, v0
3563; GFX9-NEXT:    v_and_b32_e32 v0, 7, v0
3564; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3565; GFX9-NEXT:    global_store_byte v1, v0, s[0:1]
3566; GFX9-NEXT:    s_endpgm
; IR under test: a single scalar urem on i3 whose result is stored to %out.
3567  %r = urem i3 %x, %y
3568  store i3 %r, i3 addrspace(1)* %out
3569  ret void
3570}
3571
; Signed division on the odd-width integer type i3.
; The opt (IR) checks below show the expected expansion: both operands are
; sign-extended to i32, a quotient estimate is formed in f32 (rcp, fmul,
; trunc), the estimate is adjusted by a +1/-1 term derived from the operand
; signs when |x - q*y| >= |y|, and the result is sign-extended in-register
; from 3 bits (shl/ashr by 29) before being truncated back to i3.
; The llc (ISA) checks cover the tahiti and gfx900 run lines.
; All check lines are autogenerated by utils/update_llc_test_checks.py;
; regenerate them instead of editing by hand.
3572define amdgpu_kernel void @sdiv_i3(i3 addrspace(1)* %out, i3 %x, i3 %y) {
3573; CHECK-LABEL: @sdiv_i3(
3574; CHECK-NEXT:    [[TMP1:%.*]] = sext i3 [[X:%.*]] to i32
3575; CHECK-NEXT:    [[TMP2:%.*]] = sext i3 [[Y:%.*]] to i32
3576; CHECK-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
3577; CHECK-NEXT:    [[TMP4:%.*]] = ashr i32 [[TMP3]], 30
3578; CHECK-NEXT:    [[TMP5:%.*]] = or i32 [[TMP4]], 1
3579; CHECK-NEXT:    [[TMP6:%.*]] = sitofp i32 [[TMP1]] to float
3580; CHECK-NEXT:    [[TMP7:%.*]] = sitofp i32 [[TMP2]] to float
3581; CHECK-NEXT:    [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]])
3582; CHECK-NEXT:    [[TMP9:%.*]] = fmul fast float [[TMP6]], [[TMP8]]
3583; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.trunc.f32(float [[TMP9]])
3584; CHECK-NEXT:    [[TMP11:%.*]] = fneg fast float [[TMP10]]
3585; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP11]], float [[TMP7]], float [[TMP6]])
3586; CHECK-NEXT:    [[TMP13:%.*]] = fptosi float [[TMP10]] to i32
3587; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP12]])
3588; CHECK-NEXT:    [[TMP15:%.*]] = call fast float @llvm.fabs.f32(float [[TMP7]])
3589; CHECK-NEXT:    [[TMP16:%.*]] = fcmp fast oge float [[TMP14]], [[TMP15]]
3590; CHECK-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP5]], i32 0
3591; CHECK-NEXT:    [[TMP18:%.*]] = add i32 [[TMP13]], [[TMP17]]
3592; CHECK-NEXT:    [[TMP19:%.*]] = shl i32 [[TMP18]], 29
3593; CHECK-NEXT:    [[TMP20:%.*]] = ashr i32 [[TMP19]], 29
3594; CHECK-NEXT:    [[TMP21:%.*]] = trunc i32 [[TMP20]] to i3
3595; CHECK-NEXT:    store i3 [[TMP21]], i3 addrspace(1)* [[OUT:%.*]], align 1
3596; CHECK-NEXT:    ret void
3597;
3598; GFX6-LABEL: sdiv_i3:
3599; GFX6:       ; %bb.0:
3600; GFX6-NEXT:    s_load_dword s4, s[0:1], 0xb
3601; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
3602; GFX6-NEXT:    s_mov_b32 s3, 0xf000
3603; GFX6-NEXT:    s_mov_b32 s2, -1
3604; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
3605; GFX6-NEXT:    s_bfe_i32 s5, s4, 0x30008
3606; GFX6-NEXT:    v_cvt_f32_i32_e32 v0, s5
3607; GFX6-NEXT:    s_bfe_i32 s4, s4, 0x30000
3608; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s4
3609; GFX6-NEXT:    s_xor_b32 s4, s4, s5
3610; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v0
3611; GFX6-NEXT:    s_ashr_i32 s4, s4, 30
3612; GFX6-NEXT:    s_or_b32 s4, s4, 1
3613; GFX6-NEXT:    v_mov_b32_e32 v3, s4
3614; GFX6-NEXT:    v_mul_f32_e32 v2, v1, v2
3615; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
3616; GFX6-NEXT:    v_mad_f32 v1, -v2, v0, v1
3617; GFX6-NEXT:    v_cvt_i32_f32_e32 v2, v2
3618; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
3619; GFX6-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
3620; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
3621; GFX6-NEXT:    v_and_b32_e32 v0, 7, v0
3622; GFX6-NEXT:    buffer_store_byte v0, off, s[0:3], 0
3623; GFX6-NEXT:    s_endpgm
3624;
3625; GFX9-LABEL: sdiv_i3:
3626; GFX9:       ; %bb.0:
3627; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x2c
3628; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
3629; GFX9-NEXT:    v_mov_b32_e32 v1, 0
3630; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3631; GFX9-NEXT:    s_bfe_i32 s0, s4, 0x30008
3632; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s0
3633; GFX9-NEXT:    s_bfe_i32 s1, s4, 0x30000
3634; GFX9-NEXT:    v_cvt_f32_i32_e32 v2, s1
3635; GFX9-NEXT:    s_xor_b32 s0, s1, s0
3636; GFX9-NEXT:    v_rcp_iflag_f32_e32 v3, v0
3637; GFX9-NEXT:    s_ashr_i32 s0, s0, 30
3638; GFX9-NEXT:    s_or_b32 s4, s0, 1
3639; GFX9-NEXT:    v_mul_f32_e32 v3, v2, v3
3640; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
3641; GFX9-NEXT:    v_mad_f32 v2, -v3, v0, v2
3642; GFX9-NEXT:    v_cvt_i32_f32_e32 v3, v3
3643; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v2|, |v0|
3644; GFX9-NEXT:    s_and_b64 s[0:1], s[0:1], exec
3645; GFX9-NEXT:    s_cselect_b32 s0, s4, 0
3646; GFX9-NEXT:    v_add_u32_e32 v0, s0, v3
3647; GFX9-NEXT:    v_and_b32_e32 v0, 7, v0
3648; GFX9-NEXT:    global_store_byte v1, v0, s[2:3]
3649; GFX9-NEXT:    s_endpgm
3650  %r = sdiv i3 %x, %y
3651  store i3 %r, i3 addrspace(1)* %out
3652  ret void
3653}
3654
; Signed remainder on the odd-width integer type i3.
; The opt (IR) checks below show the expected expansion: the quotient is
; computed the same way as for the signed i3 division (sext to i32, f32
; reciprocal estimate, sign-derived +1/-1 correction), then multiplied by the
; sign-extended divisor and subtracted from the sign-extended dividend. The
; difference is sign-extended in-register from 3 bits (shl/ashr by 29) and
; truncated back to i3.
; The llc (ISA) checks cover the tahiti and gfx900 run lines.
; All check lines are autogenerated by utils/update_llc_test_checks.py;
; regenerate them instead of editing by hand.
3655define amdgpu_kernel void @srem_i3(i3 addrspace(1)* %out, i3 %x, i3 %y) {
3656; CHECK-LABEL: @srem_i3(
3657; CHECK-NEXT:    [[TMP1:%.*]] = sext i3 [[X:%.*]] to i32
3658; CHECK-NEXT:    [[TMP2:%.*]] = sext i3 [[Y:%.*]] to i32
3659; CHECK-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
3660; CHECK-NEXT:    [[TMP4:%.*]] = ashr i32 [[TMP3]], 30
3661; CHECK-NEXT:    [[TMP5:%.*]] = or i32 [[TMP4]], 1
3662; CHECK-NEXT:    [[TMP6:%.*]] = sitofp i32 [[TMP1]] to float
3663; CHECK-NEXT:    [[TMP7:%.*]] = sitofp i32 [[TMP2]] to float
3664; CHECK-NEXT:    [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]])
3665; CHECK-NEXT:    [[TMP9:%.*]] = fmul fast float [[TMP6]], [[TMP8]]
3666; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.trunc.f32(float [[TMP9]])
3667; CHECK-NEXT:    [[TMP11:%.*]] = fneg fast float [[TMP10]]
3668; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP11]], float [[TMP7]], float [[TMP6]])
3669; CHECK-NEXT:    [[TMP13:%.*]] = fptosi float [[TMP10]] to i32
3670; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP12]])
3671; CHECK-NEXT:    [[TMP15:%.*]] = call fast float @llvm.fabs.f32(float [[TMP7]])
3672; CHECK-NEXT:    [[TMP16:%.*]] = fcmp fast oge float [[TMP14]], [[TMP15]]
3673; CHECK-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP5]], i32 0
3674; CHECK-NEXT:    [[TMP18:%.*]] = add i32 [[TMP13]], [[TMP17]]
3675; CHECK-NEXT:    [[TMP19:%.*]] = mul i32 [[TMP18]], [[TMP2]]
3676; CHECK-NEXT:    [[TMP20:%.*]] = sub i32 [[TMP1]], [[TMP19]]
3677; CHECK-NEXT:    [[TMP21:%.*]] = shl i32 [[TMP20]], 29
3678; CHECK-NEXT:    [[TMP22:%.*]] = ashr i32 [[TMP21]], 29
3679; CHECK-NEXT:    [[TMP23:%.*]] = trunc i32 [[TMP22]] to i3
3680; CHECK-NEXT:    store i3 [[TMP23]], i3 addrspace(1)* [[OUT:%.*]], align 1
3681; CHECK-NEXT:    ret void
3682;
3683; GFX6-LABEL: srem_i3:
3684; GFX6:       ; %bb.0:
3685; GFX6-NEXT:    s_load_dword s4, s[0:1], 0xb
3686; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
3687; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
3688; GFX6-NEXT:    s_bfe_i32 s2, s4, 0x30008
3689; GFX6-NEXT:    v_cvt_f32_i32_e32 v0, s2
3690; GFX6-NEXT:    s_bfe_i32 s5, s4, 0x30000
3691; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s5
3692; GFX6-NEXT:    s_xor_b32 s2, s5, s2
3693; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v0
3694; GFX6-NEXT:    s_ashr_i32 s2, s2, 30
3695; GFX6-NEXT:    s_or_b32 s2, s2, 1
3696; GFX6-NEXT:    v_mov_b32_e32 v3, s2
3697; GFX6-NEXT:    v_mul_f32_e32 v2, v1, v2
3698; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
3699; GFX6-NEXT:    v_mad_f32 v1, -v2, v0, v1
3700; GFX6-NEXT:    v_cvt_i32_f32_e32 v2, v2
3701; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
3702; GFX6-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
3703; GFX6-NEXT:    s_lshr_b32 s3, s4, 8
3704; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
3705; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s3
3706; GFX6-NEXT:    s_mov_b32 s3, 0xf000
3707; GFX6-NEXT:    s_mov_b32 s2, -1
3708; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
3709; GFX6-NEXT:    v_and_b32_e32 v0, 7, v0
3710; GFX6-NEXT:    buffer_store_byte v0, off, s[0:3], 0
3711; GFX6-NEXT:    s_endpgm
3712;
3713; GFX9-LABEL: srem_i3:
3714; GFX9:       ; %bb.0:
3715; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x2c
3716; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3717; GFX9-NEXT:    s_bfe_i32 s2, s4, 0x30008
3718; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s2
3719; GFX9-NEXT:    s_bfe_i32 s3, s4, 0x30000
3720; GFX9-NEXT:    v_cvt_f32_i32_e32 v1, s3
3721; GFX9-NEXT:    s_xor_b32 s2, s3, s2
3722; GFX9-NEXT:    v_rcp_iflag_f32_e32 v2, v0
3723; GFX9-NEXT:    s_ashr_i32 s2, s2, 30
3724; GFX9-NEXT:    s_lshr_b32 s5, s4, 8
3725; GFX9-NEXT:    s_or_b32 s6, s2, 1
3726; GFX9-NEXT:    v_mul_f32_e32 v2, v1, v2
3727; GFX9-NEXT:    v_trunc_f32_e32 v2, v2
3728; GFX9-NEXT:    v_mad_f32 v1, -v2, v0, v1
3729; GFX9-NEXT:    v_cvt_i32_f32_e32 v2, v2
3730; GFX9-NEXT:    v_cmp_ge_f32_e64 s[2:3], |v1|, |v0|
3731; GFX9-NEXT:    s_and_b64 s[2:3], s[2:3], exec
3732; GFX9-NEXT:    s_cselect_b32 s2, s6, 0
3733; GFX9-NEXT:    v_add_u32_e32 v0, s2, v2
3734; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s5
3735; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3736; GFX9-NEXT:    v_mov_b32_e32 v1, 0
3737; GFX9-NEXT:    v_sub_u32_e32 v0, s4, v0
3738; GFX9-NEXT:    v_and_b32_e32 v0, 7, v0
3739; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3740; GFX9-NEXT:    global_store_byte v1, v0, s[0:1]
3741; GFX9-NEXT:    s_endpgm
3742  %r = srem i3 %x, %y
3743  store i3 %r, i3 addrspace(1)* %out
3744  ret void
3745}
3746
; Unsigned division on the three-element vector type <3 x i16>.
; The opt (IR) checks below show the vector being scalarized: for each of
; lanes 0, 1 and 2 the operands are extracted, zero-extended to i32, divided
; via an f32 reciprocal estimate (rcp, fmul, trunc) with a +1 correction when
; |x - q*y| >= |y|, masked with 65535, truncated to i16 and re-inserted into
; the result vector, which is stored with align 8.
; The llc (ISA) checks cover the tahiti and gfx900 run lines.
; All check lines are autogenerated by utils/update_llc_test_checks.py;
; regenerate them instead of editing by hand.
3747define amdgpu_kernel void @udiv_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x, <3 x i16> %y) {
3748; CHECK-LABEL: @udiv_v3i16(
3749; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <3 x i16> [[X:%.*]], i64 0
3750; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <3 x i16> [[Y:%.*]], i64 0
3751; CHECK-NEXT:    [[TMP3:%.*]] = zext i16 [[TMP1]] to i32
3752; CHECK-NEXT:    [[TMP4:%.*]] = zext i16 [[TMP2]] to i32
3753; CHECK-NEXT:    [[TMP5:%.*]] = uitofp i32 [[TMP3]] to float
3754; CHECK-NEXT:    [[TMP6:%.*]] = uitofp i32 [[TMP4]] to float
3755; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP6]])
3756; CHECK-NEXT:    [[TMP8:%.*]] = fmul fast float [[TMP5]], [[TMP7]]
3757; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.trunc.f32(float [[TMP8]])
3758; CHECK-NEXT:    [[TMP10:%.*]] = fneg fast float [[TMP9]]
3759; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP10]], float [[TMP6]], float [[TMP5]])
3760; CHECK-NEXT:    [[TMP12:%.*]] = fptoui float [[TMP9]] to i32
3761; CHECK-NEXT:    [[TMP13:%.*]] = call fast float @llvm.fabs.f32(float [[TMP11]])
3762; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP6]])
3763; CHECK-NEXT:    [[TMP15:%.*]] = fcmp fast oge float [[TMP13]], [[TMP14]]
3764; CHECK-NEXT:    [[TMP16:%.*]] = select i1 [[TMP15]], i32 1, i32 0
3765; CHECK-NEXT:    [[TMP17:%.*]] = add i32 [[TMP12]], [[TMP16]]
3766; CHECK-NEXT:    [[TMP18:%.*]] = and i32 [[TMP17]], 65535
3767; CHECK-NEXT:    [[TMP19:%.*]] = trunc i32 [[TMP18]] to i16
3768; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <3 x i16> undef, i16 [[TMP19]], i64 0
3769; CHECK-NEXT:    [[TMP21:%.*]] = extractelement <3 x i16> [[X]], i64 1
3770; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <3 x i16> [[Y]], i64 1
3771; CHECK-NEXT:    [[TMP23:%.*]] = zext i16 [[TMP21]] to i32
3772; CHECK-NEXT:    [[TMP24:%.*]] = zext i16 [[TMP22]] to i32
3773; CHECK-NEXT:    [[TMP25:%.*]] = uitofp i32 [[TMP23]] to float
3774; CHECK-NEXT:    [[TMP26:%.*]] = uitofp i32 [[TMP24]] to float
3775; CHECK-NEXT:    [[TMP27:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP26]])
3776; CHECK-NEXT:    [[TMP28:%.*]] = fmul fast float [[TMP25]], [[TMP27]]
3777; CHECK-NEXT:    [[TMP29:%.*]] = call fast float @llvm.trunc.f32(float [[TMP28]])
3778; CHECK-NEXT:    [[TMP30:%.*]] = fneg fast float [[TMP29]]
3779; CHECK-NEXT:    [[TMP31:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP30]], float [[TMP26]], float [[TMP25]])
3780; CHECK-NEXT:    [[TMP32:%.*]] = fptoui float [[TMP29]] to i32
3781; CHECK-NEXT:    [[TMP33:%.*]] = call fast float @llvm.fabs.f32(float [[TMP31]])
3782; CHECK-NEXT:    [[TMP34:%.*]] = call fast float @llvm.fabs.f32(float [[TMP26]])
3783; CHECK-NEXT:    [[TMP35:%.*]] = fcmp fast oge float [[TMP33]], [[TMP34]]
3784; CHECK-NEXT:    [[TMP36:%.*]] = select i1 [[TMP35]], i32 1, i32 0
3785; CHECK-NEXT:    [[TMP37:%.*]] = add i32 [[TMP32]], [[TMP36]]
3786; CHECK-NEXT:    [[TMP38:%.*]] = and i32 [[TMP37]], 65535
3787; CHECK-NEXT:    [[TMP39:%.*]] = trunc i32 [[TMP38]] to i16
3788; CHECK-NEXT:    [[TMP40:%.*]] = insertelement <3 x i16> [[TMP20]], i16 [[TMP39]], i64 1
3789; CHECK-NEXT:    [[TMP41:%.*]] = extractelement <3 x i16> [[X]], i64 2
3790; CHECK-NEXT:    [[TMP42:%.*]] = extractelement <3 x i16> [[Y]], i64 2
3791; CHECK-NEXT:    [[TMP43:%.*]] = zext i16 [[TMP41]] to i32
3792; CHECK-NEXT:    [[TMP44:%.*]] = zext i16 [[TMP42]] to i32
3793; CHECK-NEXT:    [[TMP45:%.*]] = uitofp i32 [[TMP43]] to float
3794; CHECK-NEXT:    [[TMP46:%.*]] = uitofp i32 [[TMP44]] to float
3795; CHECK-NEXT:    [[TMP47:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP46]])
3796; CHECK-NEXT:    [[TMP48:%.*]] = fmul fast float [[TMP45]], [[TMP47]]
3797; CHECK-NEXT:    [[TMP49:%.*]] = call fast float @llvm.trunc.f32(float [[TMP48]])
3798; CHECK-NEXT:    [[TMP50:%.*]] = fneg fast float [[TMP49]]
3799; CHECK-NEXT:    [[TMP51:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP50]], float [[TMP46]], float [[TMP45]])
3800; CHECK-NEXT:    [[TMP52:%.*]] = fptoui float [[TMP49]] to i32
3801; CHECK-NEXT:    [[TMP53:%.*]] = call fast float @llvm.fabs.f32(float [[TMP51]])
3802; CHECK-NEXT:    [[TMP54:%.*]] = call fast float @llvm.fabs.f32(float [[TMP46]])
3803; CHECK-NEXT:    [[TMP55:%.*]] = fcmp fast oge float [[TMP53]], [[TMP54]]
3804; CHECK-NEXT:    [[TMP56:%.*]] = select i1 [[TMP55]], i32 1, i32 0
3805; CHECK-NEXT:    [[TMP57:%.*]] = add i32 [[TMP52]], [[TMP56]]
3806; CHECK-NEXT:    [[TMP58:%.*]] = and i32 [[TMP57]], 65535
3807; CHECK-NEXT:    [[TMP59:%.*]] = trunc i32 [[TMP58]] to i16
3808; CHECK-NEXT:    [[TMP60:%.*]] = insertelement <3 x i16> [[TMP40]], i16 [[TMP59]], i64 2
3809; CHECK-NEXT:    store <3 x i16> [[TMP60]], <3 x i16> addrspace(1)* [[OUT:%.*]], align 8
3810; CHECK-NEXT:    ret void
3811;
3812; GFX6-LABEL: udiv_v3i16:
3813; GFX6:       ; %bb.0:
3814; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xb
3815; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
3816; GFX6-NEXT:    s_mov_b32 s3, 0xf000
3817; GFX6-NEXT:    s_mov_b32 s2, -1
3818; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
3819; GFX6-NEXT:    s_and_b32 s9, s6, 0xffff
3820; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s9
3821; GFX6-NEXT:    s_lshr_b32 s6, s6, 16
3822; GFX6-NEXT:    s_and_b32 s8, s4, 0xffff
3823; GFX6-NEXT:    v_cvt_f32_u32_e32 v2, s6
3824; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s8
3825; GFX6-NEXT:    v_rcp_iflag_f32_e32 v3, v0
3826; GFX6-NEXT:    s_lshr_b32 s4, s4, 16
3827; GFX6-NEXT:    v_cvt_f32_u32_e32 v4, s4
3828; GFX6-NEXT:    v_rcp_iflag_f32_e32 v5, v2
3829; GFX6-NEXT:    v_mul_f32_e32 v3, v1, v3
3830; GFX6-NEXT:    v_trunc_f32_e32 v3, v3
3831; GFX6-NEXT:    v_mad_f32 v1, -v3, v0, v1
3832; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
3833; GFX6-NEXT:    v_mul_f32_e32 v1, v4, v5
3834; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
3835; GFX6-NEXT:    s_and_b32 s4, s7, 0xffff
3836; GFX6-NEXT:    v_cvt_u32_f32_e32 v6, v3
3837; GFX6-NEXT:    v_mad_f32 v3, -v1, v2, v4
3838; GFX6-NEXT:    v_cvt_f32_u32_e32 v4, s4
3839; GFX6-NEXT:    s_and_b32 s4, s5, 0xffff
3840; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, 0, v6, vcc
3841; GFX6-NEXT:    v_cvt_f32_u32_e32 v5, s4
3842; GFX6-NEXT:    v_rcp_iflag_f32_e32 v6, v4
3843; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
3844; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v2
3845; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
3846; GFX6-NEXT:    v_mul_f32_e32 v2, v5, v6
3847; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
3848; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v2
3849; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
3850; GFX6-NEXT:    v_mad_f32 v2, -v2, v4, v5
3851; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v4
3852; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, 0, v3, vcc
3853; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
3854; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
3855; GFX6-NEXT:    buffer_store_short v2, off, s[0:3], 0 offset:4
3856; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3857; GFX6-NEXT:    s_endpgm
3858;
3859; GFX9-LABEL: udiv_v3i16:
3860; GFX9:       ; %bb.0:
3861; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
3862; GFX9-NEXT:    v_mov_b32_e32 v6, 0
3863; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3864; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3865; GFX9-NEXT:    s_and_b32 s3, s6, 0xffff
3866; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s3
3867; GFX9-NEXT:    s_and_b32 s2, s4, 0xffff
3868; GFX9-NEXT:    s_lshr_b32 s6, s6, 16
3869; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s6
3870; GFX9-NEXT:    v_cvt_f32_u32_e32 v2, s2
3871; GFX9-NEXT:    v_rcp_iflag_f32_e32 v4, v0
3872; GFX9-NEXT:    s_lshr_b32 s4, s4, 16
3873; GFX9-NEXT:    v_cvt_f32_u32_e32 v3, s4
3874; GFX9-NEXT:    v_rcp_iflag_f32_e32 v5, v1
3875; GFX9-NEXT:    v_mul_f32_e32 v4, v2, v4
3876; GFX9-NEXT:    v_trunc_f32_e32 v4, v4
3877; GFX9-NEXT:    s_and_b32 s2, s7, 0xffff
3878; GFX9-NEXT:    v_cvt_u32_f32_e32 v7, v4
3879; GFX9-NEXT:    v_mad_f32 v2, -v4, v0, v2
3880; GFX9-NEXT:    v_cvt_f32_u32_e32 v4, s2
3881; GFX9-NEXT:    v_mul_f32_e32 v5, v3, v5
3882; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v0
3883; GFX9-NEXT:    v_trunc_f32_e32 v2, v5
3884; GFX9-NEXT:    s_and_b32 s2, s5, 0xffff
3885; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v7, vcc
3886; GFX9-NEXT:    v_mad_f32 v3, -v2, v1, v3
3887; GFX9-NEXT:    v_cvt_u32_f32_e32 v2, v2
3888; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, s2
3889; GFX9-NEXT:    v_rcp_iflag_f32_e32 v7, v4
3890; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v1
3891; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v2, vcc
3892; GFX9-NEXT:    v_mul_f32_e32 v2, v5, v7
3893; GFX9-NEXT:    v_trunc_f32_e32 v2, v2
3894; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v2
3895; GFX9-NEXT:    v_mad_f32 v2, -v2, v4, v5
3896; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v4
3897; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
3898; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, 0, v3, vcc
3899; GFX9-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
3900; GFX9-NEXT:    global_store_short v6, v2, s[0:1] offset:4
3901; GFX9-NEXT:    global_store_dword v6, v0, s[0:1]
3902; GFX9-NEXT:    s_endpgm
3903  %r = udiv <3 x i16> %x, %y
3904  store <3 x i16> %r, <3 x i16> addrspace(1)* %out
3905  ret void
3906}
3907
; Unsigned remainder on the three-element vector type <3 x i16>.
; The opt (IR) checks below show the vector being scalarized: for each of
; lanes 0, 1 and 2 the operands are extracted and zero-extended to i32, a
; quotient is formed via an f32 reciprocal estimate with a +1 correction when
; |x - q*y| >= |y|, and the remainder is computed as x - q*y. Each remainder
; is masked with 65535, truncated to i16 and re-inserted into the result
; vector, which is stored with align 8.
; The llc (ISA) checks cover the tahiti and gfx900 run lines.
; All check lines are autogenerated by utils/update_llc_test_checks.py;
; regenerate them instead of editing by hand.
3908define amdgpu_kernel void @urem_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x, <3 x i16> %y) {
3909; CHECK-LABEL: @urem_v3i16(
3910; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <3 x i16> [[X:%.*]], i64 0
3911; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <3 x i16> [[Y:%.*]], i64 0
3912; CHECK-NEXT:    [[TMP3:%.*]] = zext i16 [[TMP1]] to i32
3913; CHECK-NEXT:    [[TMP4:%.*]] = zext i16 [[TMP2]] to i32
3914; CHECK-NEXT:    [[TMP5:%.*]] = uitofp i32 [[TMP3]] to float
3915; CHECK-NEXT:    [[TMP6:%.*]] = uitofp i32 [[TMP4]] to float
3916; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP6]])
3917; CHECK-NEXT:    [[TMP8:%.*]] = fmul fast float [[TMP5]], [[TMP7]]
3918; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.trunc.f32(float [[TMP8]])
3919; CHECK-NEXT:    [[TMP10:%.*]] = fneg fast float [[TMP9]]
3920; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP10]], float [[TMP6]], float [[TMP5]])
3921; CHECK-NEXT:    [[TMP12:%.*]] = fptoui float [[TMP9]] to i32
3922; CHECK-NEXT:    [[TMP13:%.*]] = call fast float @llvm.fabs.f32(float [[TMP11]])
3923; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP6]])
3924; CHECK-NEXT:    [[TMP15:%.*]] = fcmp fast oge float [[TMP13]], [[TMP14]]
3925; CHECK-NEXT:    [[TMP16:%.*]] = select i1 [[TMP15]], i32 1, i32 0
3926; CHECK-NEXT:    [[TMP17:%.*]] = add i32 [[TMP12]], [[TMP16]]
3927; CHECK-NEXT:    [[TMP18:%.*]] = mul i32 [[TMP17]], [[TMP4]]
3928; CHECK-NEXT:    [[TMP19:%.*]] = sub i32 [[TMP3]], [[TMP18]]
3929; CHECK-NEXT:    [[TMP20:%.*]] = and i32 [[TMP19]], 65535
3930; CHECK-NEXT:    [[TMP21:%.*]] = trunc i32 [[TMP20]] to i16
3931; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <3 x i16> undef, i16 [[TMP21]], i64 0
3932; CHECK-NEXT:    [[TMP23:%.*]] = extractelement <3 x i16> [[X]], i64 1
3933; CHECK-NEXT:    [[TMP24:%.*]] = extractelement <3 x i16> [[Y]], i64 1
3934; CHECK-NEXT:    [[TMP25:%.*]] = zext i16 [[TMP23]] to i32
3935; CHECK-NEXT:    [[TMP26:%.*]] = zext i16 [[TMP24]] to i32
3936; CHECK-NEXT:    [[TMP27:%.*]] = uitofp i32 [[TMP25]] to float
3937; CHECK-NEXT:    [[TMP28:%.*]] = uitofp i32 [[TMP26]] to float
3938; CHECK-NEXT:    [[TMP29:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP28]])
3939; CHECK-NEXT:    [[TMP30:%.*]] = fmul fast float [[TMP27]], [[TMP29]]
3940; CHECK-NEXT:    [[TMP31:%.*]] = call fast float @llvm.trunc.f32(float [[TMP30]])
3941; CHECK-NEXT:    [[TMP32:%.*]] = fneg fast float [[TMP31]]
3942; CHECK-NEXT:    [[TMP33:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP32]], float [[TMP28]], float [[TMP27]])
3943; CHECK-NEXT:    [[TMP34:%.*]] = fptoui float [[TMP31]] to i32
3944; CHECK-NEXT:    [[TMP35:%.*]] = call fast float @llvm.fabs.f32(float [[TMP33]])
3945; CHECK-NEXT:    [[TMP36:%.*]] = call fast float @llvm.fabs.f32(float [[TMP28]])
3946; CHECK-NEXT:    [[TMP37:%.*]] = fcmp fast oge float [[TMP35]], [[TMP36]]
3947; CHECK-NEXT:    [[TMP38:%.*]] = select i1 [[TMP37]], i32 1, i32 0
3948; CHECK-NEXT:    [[TMP39:%.*]] = add i32 [[TMP34]], [[TMP38]]
3949; CHECK-NEXT:    [[TMP40:%.*]] = mul i32 [[TMP39]], [[TMP26]]
3950; CHECK-NEXT:    [[TMP41:%.*]] = sub i32 [[TMP25]], [[TMP40]]
3951; CHECK-NEXT:    [[TMP42:%.*]] = and i32 [[TMP41]], 65535
3952; CHECK-NEXT:    [[TMP43:%.*]] = trunc i32 [[TMP42]] to i16
3953; CHECK-NEXT:    [[TMP44:%.*]] = insertelement <3 x i16> [[TMP22]], i16 [[TMP43]], i64 1
3954; CHECK-NEXT:    [[TMP45:%.*]] = extractelement <3 x i16> [[X]], i64 2
3955; CHECK-NEXT:    [[TMP46:%.*]] = extractelement <3 x i16> [[Y]], i64 2
3956; CHECK-NEXT:    [[TMP47:%.*]] = zext i16 [[TMP45]] to i32
3957; CHECK-NEXT:    [[TMP48:%.*]] = zext i16 [[TMP46]] to i32
3958; CHECK-NEXT:    [[TMP49:%.*]] = uitofp i32 [[TMP47]] to float
3959; CHECK-NEXT:    [[TMP50:%.*]] = uitofp i32 [[TMP48]] to float
3960; CHECK-NEXT:    [[TMP51:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP50]])
3961; CHECK-NEXT:    [[TMP52:%.*]] = fmul fast float [[TMP49]], [[TMP51]]
3962; CHECK-NEXT:    [[TMP53:%.*]] = call fast float @llvm.trunc.f32(float [[TMP52]])
3963; CHECK-NEXT:    [[TMP54:%.*]] = fneg fast float [[TMP53]]
3964; CHECK-NEXT:    [[TMP55:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP54]], float [[TMP50]], float [[TMP49]])
3965; CHECK-NEXT:    [[TMP56:%.*]] = fptoui float [[TMP53]] to i32
3966; CHECK-NEXT:    [[TMP57:%.*]] = call fast float @llvm.fabs.f32(float [[TMP55]])
3967; CHECK-NEXT:    [[TMP58:%.*]] = call fast float @llvm.fabs.f32(float [[TMP50]])
3968; CHECK-NEXT:    [[TMP59:%.*]] = fcmp fast oge float [[TMP57]], [[TMP58]]
3969; CHECK-NEXT:    [[TMP60:%.*]] = select i1 [[TMP59]], i32 1, i32 0
3970; CHECK-NEXT:    [[TMP61:%.*]] = add i32 [[TMP56]], [[TMP60]]
3971; CHECK-NEXT:    [[TMP62:%.*]] = mul i32 [[TMP61]], [[TMP48]]
3972; CHECK-NEXT:    [[TMP63:%.*]] = sub i32 [[TMP47]], [[TMP62]]
3973; CHECK-NEXT:    [[TMP64:%.*]] = and i32 [[TMP63]], 65535
3974; CHECK-NEXT:    [[TMP65:%.*]] = trunc i32 [[TMP64]] to i16
3975; CHECK-NEXT:    [[TMP66:%.*]] = insertelement <3 x i16> [[TMP44]], i16 [[TMP65]], i64 2
3976; CHECK-NEXT:    store <3 x i16> [[TMP66]], <3 x i16> addrspace(1)* [[OUT:%.*]], align 8
3977; CHECK-NEXT:    ret void
3978;
3979; GFX6-LABEL: urem_v3i16:
3980; GFX6:       ; %bb.0:
3981; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xb
3982; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
3983; GFX6-NEXT:    s_mov_b32 s3, 0xf000
3984; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
3985; GFX6-NEXT:    s_and_b32 s8, s6, 0xffff
3986; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s8
3987; GFX6-NEXT:    s_and_b32 s2, s4, 0xffff
3988; GFX6-NEXT:    v_cvt_f32_u32_e32 v3, s2
3989; GFX6-NEXT:    v_mov_b32_e32 v2, s6
3990; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v1
3991; GFX6-NEXT:    v_alignbit_b32 v2, s7, v2, 16
3992; GFX6-NEXT:    v_and_b32_e32 v5, 0xffff, v2
3993; GFX6-NEXT:    v_cvt_f32_u32_e32 v5, v5
3994; GFX6-NEXT:    v_mul_f32_e32 v4, v3, v4
3995; GFX6-NEXT:    v_trunc_f32_e32 v4, v4
3996; GFX6-NEXT:    v_cvt_u32_f32_e32 v6, v4
3997; GFX6-NEXT:    v_mad_f32 v3, -v4, v1, v3
3998; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v1
3999; GFX6-NEXT:    v_mov_b32_e32 v0, s4
4000; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v6, vcc
4001; GFX6-NEXT:    v_alignbit_b32 v0, s5, v0, 16
4002; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s6
4003; GFX6-NEXT:    v_and_b32_e32 v3, 0xffff, v0
4004; GFX6-NEXT:    v_cvt_f32_u32_e32 v3, v3
4005; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v5
4006; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s4, v1
4007; GFX6-NEXT:    s_and_b32 s4, s7, 0xffff
4008; GFX6-NEXT:    v_cvt_f32_u32_e32 v6, s4
4009; GFX6-NEXT:    v_mul_f32_e32 v4, v3, v4
4010; GFX6-NEXT:    v_trunc_f32_e32 v4, v4
4011; GFX6-NEXT:    v_mad_f32 v3, -v4, v5, v3
4012; GFX6-NEXT:    v_cvt_u32_f32_e32 v4, v4
4013; GFX6-NEXT:    s_and_b32 s4, s5, 0xffff
4014; GFX6-NEXT:    v_cvt_f32_u32_e32 v7, s4
4015; GFX6-NEXT:    v_rcp_iflag_f32_e32 v8, v6
4016; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v5
4017; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
4018; GFX6-NEXT:    v_mul_lo_u32 v2, v3, v2
4019; GFX6-NEXT:    v_mul_f32_e32 v3, v7, v8
4020; GFX6-NEXT:    v_trunc_f32_e32 v3, v3
4021; GFX6-NEXT:    v_cvt_u32_f32_e32 v4, v3
4022; GFX6-NEXT:    v_mad_f32 v3, -v3, v6, v7
4023; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v6
4024; GFX6-NEXT:    s_mov_b32 s2, -1
4025; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
4026; GFX6-NEXT:    v_mul_lo_u32 v3, v3, s7
4027; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
4028; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
4029; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s5, v3
4030; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
4031; GFX6-NEXT:    v_or_b32_e32 v0, v1, v0
4032; GFX6-NEXT:    buffer_store_short v2, off, s[0:3], 0 offset:4
4033; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4034; GFX6-NEXT:    s_endpgm
4035;
4036; GFX9-LABEL: urem_v3i16:
4037; GFX9:       ; %bb.0:
4038; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
4039; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4040; GFX9-NEXT:    s_and_b32 s3, s6, 0xffff
4041; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s3
4042; GFX9-NEXT:    s_and_b32 s2, s4, 0xffff
4043; GFX9-NEXT:    v_cvt_f32_u32_e32 v2, s2
4044; GFX9-NEXT:    s_lshr_b32 s6, s6, 16
4045; GFX9-NEXT:    v_rcp_iflag_f32_e32 v4, v0
4046; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s6
4047; GFX9-NEXT:    s_lshr_b32 s4, s4, 16
4048; GFX9-NEXT:    v_cvt_f32_u32_e32 v3, s4
4049; GFX9-NEXT:    v_mul_f32_e32 v4, v2, v4
4050; GFX9-NEXT:    v_trunc_f32_e32 v4, v4
4051; GFX9-NEXT:    v_rcp_iflag_f32_e32 v5, v1
4052; GFX9-NEXT:    v_cvt_u32_f32_e32 v6, v4
4053; GFX9-NEXT:    v_mad_f32 v2, -v4, v0, v2
4054; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v0
4055; GFX9-NEXT:    v_mul_f32_e32 v5, v3, v5
4056; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v6, vcc
4057; GFX9-NEXT:    v_trunc_f32_e32 v5, v5
4058; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s3
4059; GFX9-NEXT:    s_and_b32 s3, s7, 0xffff
4060; GFX9-NEXT:    v_mad_f32 v2, -v5, v1, v3
4061; GFX9-NEXT:    v_cvt_f32_u32_e32 v3, s3
4062; GFX9-NEXT:    s_and_b32 s5, s5, 0xffff
4063; GFX9-NEXT:    v_cvt_u32_f32_e32 v4, v5
4064; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, s5
4065; GFX9-NEXT:    v_rcp_iflag_f32_e32 v6, v3
4066; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v1
4067; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v4, vcc
4068; GFX9-NEXT:    v_mul_f32_e32 v2, v5, v6
4069; GFX9-NEXT:    v_trunc_f32_e32 v2, v2
4070; GFX9-NEXT:    v_cvt_u32_f32_e32 v4, v2
4071; GFX9-NEXT:    v_mad_f32 v2, -v2, v3, v5
4072; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v3
4073; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4074; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, 0, v4, vcc
4075; GFX9-NEXT:    v_mul_lo_u32 v1, v1, s6
4076; GFX9-NEXT:    v_mul_lo_u32 v2, v2, s3
4077; GFX9-NEXT:    v_sub_u32_e32 v0, s2, v0
4078; GFX9-NEXT:    v_mov_b32_e32 v3, 0
4079; GFX9-NEXT:    v_sub_u32_e32 v1, s4, v1
4080; GFX9-NEXT:    v_sub_u32_e32 v2, s5, v2
4081; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
4082; GFX9-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
4083; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4084; GFX9-NEXT:    global_store_short v3, v2, s[0:1] offset:4
4085; GFX9-NEXT:    global_store_dword v3, v0, s[0:1]
4086; GFX9-NEXT:    s_endpgm
4087  %r = urem <3 x i16> %x, %y
4088  store <3 x i16> %r, <3 x i16> addrspace(1)* %out
4089  ret void
4090}
4091
4092define amdgpu_kernel void @sdiv_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x, <3 x i16> %y) {
4093; CHECK-LABEL: @sdiv_v3i16(
4094; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <3 x i16> [[X:%.*]], i64 0
4095; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <3 x i16> [[Y:%.*]], i64 0
4096; CHECK-NEXT:    [[TMP3:%.*]] = sext i16 [[TMP1]] to i32
4097; CHECK-NEXT:    [[TMP4:%.*]] = sext i16 [[TMP2]] to i32
4098; CHECK-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]]
4099; CHECK-NEXT:    [[TMP6:%.*]] = ashr i32 [[TMP5]], 30
4100; CHECK-NEXT:    [[TMP7:%.*]] = or i32 [[TMP6]], 1
4101; CHECK-NEXT:    [[TMP8:%.*]] = sitofp i32 [[TMP3]] to float
4102; CHECK-NEXT:    [[TMP9:%.*]] = sitofp i32 [[TMP4]] to float
4103; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]])
4104; CHECK-NEXT:    [[TMP11:%.*]] = fmul fast float [[TMP8]], [[TMP10]]
4105; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.trunc.f32(float [[TMP11]])
4106; CHECK-NEXT:    [[TMP13:%.*]] = fneg fast float [[TMP12]]
4107; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP13]], float [[TMP9]], float [[TMP8]])
4108; CHECK-NEXT:    [[TMP15:%.*]] = fptosi float [[TMP12]] to i32
4109; CHECK-NEXT:    [[TMP16:%.*]] = call fast float @llvm.fabs.f32(float [[TMP14]])
4110; CHECK-NEXT:    [[TMP17:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
4111; CHECK-NEXT:    [[TMP18:%.*]] = fcmp fast oge float [[TMP16]], [[TMP17]]
4112; CHECK-NEXT:    [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP7]], i32 0
4113; CHECK-NEXT:    [[TMP20:%.*]] = add i32 [[TMP15]], [[TMP19]]
4114; CHECK-NEXT:    [[TMP21:%.*]] = shl i32 [[TMP20]], 16
4115; CHECK-NEXT:    [[TMP22:%.*]] = ashr i32 [[TMP21]], 16
4116; CHECK-NEXT:    [[TMP23:%.*]] = trunc i32 [[TMP22]] to i16
4117; CHECK-NEXT:    [[TMP24:%.*]] = insertelement <3 x i16> undef, i16 [[TMP23]], i64 0
4118; CHECK-NEXT:    [[TMP25:%.*]] = extractelement <3 x i16> [[X]], i64 1
4119; CHECK-NEXT:    [[TMP26:%.*]] = extractelement <3 x i16> [[Y]], i64 1
4120; CHECK-NEXT:    [[TMP27:%.*]] = sext i16 [[TMP25]] to i32
4121; CHECK-NEXT:    [[TMP28:%.*]] = sext i16 [[TMP26]] to i32
4122; CHECK-NEXT:    [[TMP29:%.*]] = xor i32 [[TMP27]], [[TMP28]]
4123; CHECK-NEXT:    [[TMP30:%.*]] = ashr i32 [[TMP29]], 30
4124; CHECK-NEXT:    [[TMP31:%.*]] = or i32 [[TMP30]], 1
4125; CHECK-NEXT:    [[TMP32:%.*]] = sitofp i32 [[TMP27]] to float
4126; CHECK-NEXT:    [[TMP33:%.*]] = sitofp i32 [[TMP28]] to float
4127; CHECK-NEXT:    [[TMP34:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP33]])
4128; CHECK-NEXT:    [[TMP35:%.*]] = fmul fast float [[TMP32]], [[TMP34]]
4129; CHECK-NEXT:    [[TMP36:%.*]] = call fast float @llvm.trunc.f32(float [[TMP35]])
4130; CHECK-NEXT:    [[TMP37:%.*]] = fneg fast float [[TMP36]]
4131; CHECK-NEXT:    [[TMP38:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP37]], float [[TMP33]], float [[TMP32]])
4132; CHECK-NEXT:    [[TMP39:%.*]] = fptosi float [[TMP36]] to i32
4133; CHECK-NEXT:    [[TMP40:%.*]] = call fast float @llvm.fabs.f32(float [[TMP38]])
4134; CHECK-NEXT:    [[TMP41:%.*]] = call fast float @llvm.fabs.f32(float [[TMP33]])
4135; CHECK-NEXT:    [[TMP42:%.*]] = fcmp fast oge float [[TMP40]], [[TMP41]]
4136; CHECK-NEXT:    [[TMP43:%.*]] = select i1 [[TMP42]], i32 [[TMP31]], i32 0
4137; CHECK-NEXT:    [[TMP44:%.*]] = add i32 [[TMP39]], [[TMP43]]
4138; CHECK-NEXT:    [[TMP45:%.*]] = shl i32 [[TMP44]], 16
4139; CHECK-NEXT:    [[TMP46:%.*]] = ashr i32 [[TMP45]], 16
4140; CHECK-NEXT:    [[TMP47:%.*]] = trunc i32 [[TMP46]] to i16
4141; CHECK-NEXT:    [[TMP48:%.*]] = insertelement <3 x i16> [[TMP24]], i16 [[TMP47]], i64 1
4142; CHECK-NEXT:    [[TMP49:%.*]] = extractelement <3 x i16> [[X]], i64 2
4143; CHECK-NEXT:    [[TMP50:%.*]] = extractelement <3 x i16> [[Y]], i64 2
4144; CHECK-NEXT:    [[TMP51:%.*]] = sext i16 [[TMP49]] to i32
4145; CHECK-NEXT:    [[TMP52:%.*]] = sext i16 [[TMP50]] to i32
4146; CHECK-NEXT:    [[TMP53:%.*]] = xor i32 [[TMP51]], [[TMP52]]
4147; CHECK-NEXT:    [[TMP54:%.*]] = ashr i32 [[TMP53]], 30
4148; CHECK-NEXT:    [[TMP55:%.*]] = or i32 [[TMP54]], 1
4149; CHECK-NEXT:    [[TMP56:%.*]] = sitofp i32 [[TMP51]] to float
4150; CHECK-NEXT:    [[TMP57:%.*]] = sitofp i32 [[TMP52]] to float
4151; CHECK-NEXT:    [[TMP58:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP57]])
4152; CHECK-NEXT:    [[TMP59:%.*]] = fmul fast float [[TMP56]], [[TMP58]]
4153; CHECK-NEXT:    [[TMP60:%.*]] = call fast float @llvm.trunc.f32(float [[TMP59]])
4154; CHECK-NEXT:    [[TMP61:%.*]] = fneg fast float [[TMP60]]
4155; CHECK-NEXT:    [[TMP62:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP61]], float [[TMP57]], float [[TMP56]])
4156; CHECK-NEXT:    [[TMP63:%.*]] = fptosi float [[TMP60]] to i32
4157; CHECK-NEXT:    [[TMP64:%.*]] = call fast float @llvm.fabs.f32(float [[TMP62]])
4158; CHECK-NEXT:    [[TMP65:%.*]] = call fast float @llvm.fabs.f32(float [[TMP57]])
4159; CHECK-NEXT:    [[TMP66:%.*]] = fcmp fast oge float [[TMP64]], [[TMP65]]
4160; CHECK-NEXT:    [[TMP67:%.*]] = select i1 [[TMP66]], i32 [[TMP55]], i32 0
4161; CHECK-NEXT:    [[TMP68:%.*]] = add i32 [[TMP63]], [[TMP67]]
4162; CHECK-NEXT:    [[TMP69:%.*]] = shl i32 [[TMP68]], 16
4163; CHECK-NEXT:    [[TMP70:%.*]] = ashr i32 [[TMP69]], 16
4164; CHECK-NEXT:    [[TMP71:%.*]] = trunc i32 [[TMP70]] to i16
4165; CHECK-NEXT:    [[TMP72:%.*]] = insertelement <3 x i16> [[TMP48]], i16 [[TMP71]], i64 2
4166; CHECK-NEXT:    store <3 x i16> [[TMP72]], <3 x i16> addrspace(1)* [[OUT:%.*]], align 8
4167; CHECK-NEXT:    ret void
4168;
4169; GFX6-LABEL: sdiv_v3i16:
4170; GFX6:       ; %bb.0:
4171; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xb
4172; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
4173; GFX6-NEXT:    s_mov_b32 s3, 0xf000
4174; GFX6-NEXT:    s_mov_b32 s2, -1
4175; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
4176; GFX6-NEXT:    s_sext_i32_i16 s8, s6
4177; GFX6-NEXT:    v_cvt_f32_i32_e32 v0, s8
4178; GFX6-NEXT:    s_sext_i32_i16 s9, s4
4179; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s9
4180; GFX6-NEXT:    s_xor_b32 s8, s9, s8
4181; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v0
4182; GFX6-NEXT:    s_ashr_i32 s6, s6, 16
4183; GFX6-NEXT:    s_ashr_i32 s8, s8, 30
4184; GFX6-NEXT:    s_or_b32 s8, s8, 1
4185; GFX6-NEXT:    v_mul_f32_e32 v2, v1, v2
4186; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
4187; GFX6-NEXT:    v_mad_f32 v1, -v2, v0, v1
4188; GFX6-NEXT:    v_cvt_i32_f32_e32 v2, v2
4189; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
4190; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s6
4191; GFX6-NEXT:    v_mov_b32_e32 v3, s8
4192; GFX6-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
4193; GFX6-NEXT:    s_ashr_i32 s4, s4, 16
4194; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
4195; GFX6-NEXT:    v_cvt_f32_i32_e32 v2, s4
4196; GFX6-NEXT:    v_rcp_iflag_f32_e32 v3, v1
4197; GFX6-NEXT:    s_xor_b32 s4, s4, s6
4198; GFX6-NEXT:    s_ashr_i32 s4, s4, 30
4199; GFX6-NEXT:    s_or_b32 s4, s4, 1
4200; GFX6-NEXT:    v_mul_f32_e32 v3, v2, v3
4201; GFX6-NEXT:    v_trunc_f32_e32 v3, v3
4202; GFX6-NEXT:    v_mad_f32 v2, -v3, v1, v2
4203; GFX6-NEXT:    v_mov_b32_e32 v4, s4
4204; GFX6-NEXT:    s_sext_i32_i16 s4, s7
4205; GFX6-NEXT:    v_cvt_i32_f32_e32 v3, v3
4206; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, |v1|
4207; GFX6-NEXT:    v_cvt_f32_i32_e32 v2, s4
4208; GFX6-NEXT:    v_cndmask_b32_e32 v1, 0, v4, vcc
4209; GFX6-NEXT:    s_sext_i32_i16 s5, s5
4210; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
4211; GFX6-NEXT:    v_cvt_f32_i32_e32 v3, s5
4212; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v2
4213; GFX6-NEXT:    s_xor_b32 s4, s5, s4
4214; GFX6-NEXT:    s_ashr_i32 s4, s4, 30
4215; GFX6-NEXT:    s_or_b32 s4, s4, 1
4216; GFX6-NEXT:    v_mul_f32_e32 v4, v3, v4
4217; GFX6-NEXT:    v_trunc_f32_e32 v4, v4
4218; GFX6-NEXT:    v_mad_f32 v3, -v4, v2, v3
4219; GFX6-NEXT:    v_cvt_i32_f32_e32 v4, v4
4220; GFX6-NEXT:    v_mov_b32_e32 v5, s4
4221; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, |v2|
4222; GFX6-NEXT:    v_cndmask_b32_e32 v2, 0, v5, vcc
4223; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
4224; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
4225; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
4226; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
4227; GFX6-NEXT:    buffer_store_short v2, off, s[0:3], 0 offset:4
4228; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4229; GFX6-NEXT:    s_endpgm
4230;
4231; GFX9-LABEL: sdiv_v3i16:
4232; GFX9:       ; %bb.0:
4233; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
4234; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
4235; GFX9-NEXT:    v_mov_b32_e32 v1, 0
4236; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4237; GFX9-NEXT:    s_sext_i32_i16 s0, s6
4238; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s0
4239; GFX9-NEXT:    s_sext_i32_i16 s1, s4
4240; GFX9-NEXT:    v_cvt_f32_i32_e32 v2, s1
4241; GFX9-NEXT:    s_xor_b32 s0, s1, s0
4242; GFX9-NEXT:    v_rcp_iflag_f32_e32 v3, v0
4243; GFX9-NEXT:    s_ashr_i32 s0, s0, 30
4244; GFX9-NEXT:    s_or_b32 s8, s0, 1
4245; GFX9-NEXT:    v_mul_f32_e32 v3, v2, v3
4246; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
4247; GFX9-NEXT:    v_mad_f32 v2, -v3, v0, v2
4248; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v2|, |v0|
4249; GFX9-NEXT:    s_and_b64 s[0:1], s[0:1], exec
4250; GFX9-NEXT:    s_cselect_b32 s0, s8, 0
4251; GFX9-NEXT:    s_ashr_i32 s1, s6, 16
4252; GFX9-NEXT:    v_cvt_i32_f32_e32 v3, v3
4253; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s1
4254; GFX9-NEXT:    s_ashr_i32 s4, s4, 16
4255; GFX9-NEXT:    v_add_u32_e32 v2, s0, v3
4256; GFX9-NEXT:    v_cvt_f32_i32_e32 v3, s4
4257; GFX9-NEXT:    v_rcp_iflag_f32_e32 v4, v0
4258; GFX9-NEXT:    s_xor_b32 s0, s4, s1
4259; GFX9-NEXT:    s_ashr_i32 s0, s0, 30
4260; GFX9-NEXT:    s_or_b32 s4, s0, 1
4261; GFX9-NEXT:    v_mul_f32_e32 v4, v3, v4
4262; GFX9-NEXT:    v_trunc_f32_e32 v4, v4
4263; GFX9-NEXT:    v_mad_f32 v3, -v4, v0, v3
4264; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v3|, |v0|
4265; GFX9-NEXT:    s_and_b64 s[0:1], s[0:1], exec
4266; GFX9-NEXT:    v_cvt_i32_f32_e32 v4, v4
4267; GFX9-NEXT:    s_sext_i32_i16 s1, s7
4268; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s1
4269; GFX9-NEXT:    s_cselect_b32 s0, s4, 0
4270; GFX9-NEXT:    v_add_u32_e32 v3, s0, v4
4271; GFX9-NEXT:    s_sext_i32_i16 s0, s5
4272; GFX9-NEXT:    v_cvt_f32_i32_e32 v4, s0
4273; GFX9-NEXT:    v_rcp_iflag_f32_e32 v5, v0
4274; GFX9-NEXT:    s_xor_b32 s0, s0, s1
4275; GFX9-NEXT:    s_ashr_i32 s0, s0, 30
4276; GFX9-NEXT:    s_or_b32 s4, s0, 1
4277; GFX9-NEXT:    v_mul_f32_e32 v5, v4, v5
4278; GFX9-NEXT:    v_trunc_f32_e32 v5, v5
4279; GFX9-NEXT:    v_mad_f32 v4, -v5, v0, v4
4280; GFX9-NEXT:    v_cvt_i32_f32_e32 v5, v5
4281; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v4|, |v0|
4282; GFX9-NEXT:    s_and_b64 s[0:1], s[0:1], exec
4283; GFX9-NEXT:    s_cselect_b32 s0, s4, 0
4284; GFX9-NEXT:    v_add_u32_e32 v0, s0, v5
4285; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff, v2
4286; GFX9-NEXT:    v_lshl_or_b32 v2, v3, 16, v2
4287; GFX9-NEXT:    global_store_short v1, v0, s[2:3] offset:4
4288; GFX9-NEXT:    global_store_dword v1, v2, s[2:3]
4289; GFX9-NEXT:    s_endpgm
4290  %r = sdiv <3 x i16> %x, %y
4291  store <3 x i16> %r, <3 x i16> addrspace(1)* %out
4292  ret void
4293}
4294
; Signed remainder of two <3 x i16> kernel arguments, with the result stored
; to %out.
;
; The opt-run assertions below show each of the three lanes being handled
; separately: the operands are sign-extended to i32, a quotient is computed
; through a float reciprocal (rcp, fmul, trunc), a sign-dependent correction
; term is added when |remainder| >= |divisor|, and the remainder is formed as
; x - q * y before being sign-extended in-register (shl/ashr by 16) and
; truncated back to i16.
;
; The GFX6 and GFX9 llc assertions cover the generated ISA; in both, the
; result is written as one dword plus one short at offset 4.
;
; All assertion lines are autogenerated (see the NOTE on the first line of the
; file); regenerate them with the update script instead of editing by hand.
4295define amdgpu_kernel void @srem_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x, <3 x i16> %y) {
4296; CHECK-LABEL: @srem_v3i16(
4297; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <3 x i16> [[X:%.*]], i64 0
4298; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <3 x i16> [[Y:%.*]], i64 0
4299; CHECK-NEXT:    [[TMP3:%.*]] = sext i16 [[TMP1]] to i32
4300; CHECK-NEXT:    [[TMP4:%.*]] = sext i16 [[TMP2]] to i32
4301; CHECK-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]]
4302; CHECK-NEXT:    [[TMP6:%.*]] = ashr i32 [[TMP5]], 30
4303; CHECK-NEXT:    [[TMP7:%.*]] = or i32 [[TMP6]], 1
4304; CHECK-NEXT:    [[TMP8:%.*]] = sitofp i32 [[TMP3]] to float
4305; CHECK-NEXT:    [[TMP9:%.*]] = sitofp i32 [[TMP4]] to float
4306; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]])
4307; CHECK-NEXT:    [[TMP11:%.*]] = fmul fast float [[TMP8]], [[TMP10]]
4308; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.trunc.f32(float [[TMP11]])
4309; CHECK-NEXT:    [[TMP13:%.*]] = fneg fast float [[TMP12]]
4310; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP13]], float [[TMP9]], float [[TMP8]])
4311; CHECK-NEXT:    [[TMP15:%.*]] = fptosi float [[TMP12]] to i32
4312; CHECK-NEXT:    [[TMP16:%.*]] = call fast float @llvm.fabs.f32(float [[TMP14]])
4313; CHECK-NEXT:    [[TMP17:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
4314; CHECK-NEXT:    [[TMP18:%.*]] = fcmp fast oge float [[TMP16]], [[TMP17]]
4315; CHECK-NEXT:    [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP7]], i32 0
4316; CHECK-NEXT:    [[TMP20:%.*]] = add i32 [[TMP15]], [[TMP19]]
4317; CHECK-NEXT:    [[TMP21:%.*]] = mul i32 [[TMP20]], [[TMP4]]
4318; CHECK-NEXT:    [[TMP22:%.*]] = sub i32 [[TMP3]], [[TMP21]]
4319; CHECK-NEXT:    [[TMP23:%.*]] = shl i32 [[TMP22]], 16
4320; CHECK-NEXT:    [[TMP24:%.*]] = ashr i32 [[TMP23]], 16
4321; CHECK-NEXT:    [[TMP25:%.*]] = trunc i32 [[TMP24]] to i16
4322; CHECK-NEXT:    [[TMP26:%.*]] = insertelement <3 x i16> undef, i16 [[TMP25]], i64 0
4323; CHECK-NEXT:    [[TMP27:%.*]] = extractelement <3 x i16> [[X]], i64 1
4324; CHECK-NEXT:    [[TMP28:%.*]] = extractelement <3 x i16> [[Y]], i64 1
4325; CHECK-NEXT:    [[TMP29:%.*]] = sext i16 [[TMP27]] to i32
4326; CHECK-NEXT:    [[TMP30:%.*]] = sext i16 [[TMP28]] to i32
4327; CHECK-NEXT:    [[TMP31:%.*]] = xor i32 [[TMP29]], [[TMP30]]
4328; CHECK-NEXT:    [[TMP32:%.*]] = ashr i32 [[TMP31]], 30
4329; CHECK-NEXT:    [[TMP33:%.*]] = or i32 [[TMP32]], 1
4330; CHECK-NEXT:    [[TMP34:%.*]] = sitofp i32 [[TMP29]] to float
4331; CHECK-NEXT:    [[TMP35:%.*]] = sitofp i32 [[TMP30]] to float
4332; CHECK-NEXT:    [[TMP36:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP35]])
4333; CHECK-NEXT:    [[TMP37:%.*]] = fmul fast float [[TMP34]], [[TMP36]]
4334; CHECK-NEXT:    [[TMP38:%.*]] = call fast float @llvm.trunc.f32(float [[TMP37]])
4335; CHECK-NEXT:    [[TMP39:%.*]] = fneg fast float [[TMP38]]
4336; CHECK-NEXT:    [[TMP40:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP39]], float [[TMP35]], float [[TMP34]])
4337; CHECK-NEXT:    [[TMP41:%.*]] = fptosi float [[TMP38]] to i32
4338; CHECK-NEXT:    [[TMP42:%.*]] = call fast float @llvm.fabs.f32(float [[TMP40]])
4339; CHECK-NEXT:    [[TMP43:%.*]] = call fast float @llvm.fabs.f32(float [[TMP35]])
4340; CHECK-NEXT:    [[TMP44:%.*]] = fcmp fast oge float [[TMP42]], [[TMP43]]
4341; CHECK-NEXT:    [[TMP45:%.*]] = select i1 [[TMP44]], i32 [[TMP33]], i32 0
4342; CHECK-NEXT:    [[TMP46:%.*]] = add i32 [[TMP41]], [[TMP45]]
4343; CHECK-NEXT:    [[TMP47:%.*]] = mul i32 [[TMP46]], [[TMP30]]
4344; CHECK-NEXT:    [[TMP48:%.*]] = sub i32 [[TMP29]], [[TMP47]]
4345; CHECK-NEXT:    [[TMP49:%.*]] = shl i32 [[TMP48]], 16
4346; CHECK-NEXT:    [[TMP50:%.*]] = ashr i32 [[TMP49]], 16
4347; CHECK-NEXT:    [[TMP51:%.*]] = trunc i32 [[TMP50]] to i16
4348; CHECK-NEXT:    [[TMP52:%.*]] = insertelement <3 x i16> [[TMP26]], i16 [[TMP51]], i64 1
4349; CHECK-NEXT:    [[TMP53:%.*]] = extractelement <3 x i16> [[X]], i64 2
4350; CHECK-NEXT:    [[TMP54:%.*]] = extractelement <3 x i16> [[Y]], i64 2
4351; CHECK-NEXT:    [[TMP55:%.*]] = sext i16 [[TMP53]] to i32
4352; CHECK-NEXT:    [[TMP56:%.*]] = sext i16 [[TMP54]] to i32
4353; CHECK-NEXT:    [[TMP57:%.*]] = xor i32 [[TMP55]], [[TMP56]]
4354; CHECK-NEXT:    [[TMP58:%.*]] = ashr i32 [[TMP57]], 30
4355; CHECK-NEXT:    [[TMP59:%.*]] = or i32 [[TMP58]], 1
4356; CHECK-NEXT:    [[TMP60:%.*]] = sitofp i32 [[TMP55]] to float
4357; CHECK-NEXT:    [[TMP61:%.*]] = sitofp i32 [[TMP56]] to float
4358; CHECK-NEXT:    [[TMP62:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP61]])
4359; CHECK-NEXT:    [[TMP63:%.*]] = fmul fast float [[TMP60]], [[TMP62]]
4360; CHECK-NEXT:    [[TMP64:%.*]] = call fast float @llvm.trunc.f32(float [[TMP63]])
4361; CHECK-NEXT:    [[TMP65:%.*]] = fneg fast float [[TMP64]]
4362; CHECK-NEXT:    [[TMP66:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP65]], float [[TMP61]], float [[TMP60]])
4363; CHECK-NEXT:    [[TMP67:%.*]] = fptosi float [[TMP64]] to i32
4364; CHECK-NEXT:    [[TMP68:%.*]] = call fast float @llvm.fabs.f32(float [[TMP66]])
4365; CHECK-NEXT:    [[TMP69:%.*]] = call fast float @llvm.fabs.f32(float [[TMP61]])
4366; CHECK-NEXT:    [[TMP70:%.*]] = fcmp fast oge float [[TMP68]], [[TMP69]]
4367; CHECK-NEXT:    [[TMP71:%.*]] = select i1 [[TMP70]], i32 [[TMP59]], i32 0
4368; CHECK-NEXT:    [[TMP72:%.*]] = add i32 [[TMP67]], [[TMP71]]
4369; CHECK-NEXT:    [[TMP73:%.*]] = mul i32 [[TMP72]], [[TMP56]]
4370; CHECK-NEXT:    [[TMP74:%.*]] = sub i32 [[TMP55]], [[TMP73]]
4371; CHECK-NEXT:    [[TMP75:%.*]] = shl i32 [[TMP74]], 16
4372; CHECK-NEXT:    [[TMP76:%.*]] = ashr i32 [[TMP75]], 16
4373; CHECK-NEXT:    [[TMP77:%.*]] = trunc i32 [[TMP76]] to i16
4374; CHECK-NEXT:    [[TMP78:%.*]] = insertelement <3 x i16> [[TMP52]], i16 [[TMP77]], i64 2
4375; CHECK-NEXT:    store <3 x i16> [[TMP78]], <3 x i16> addrspace(1)* [[OUT:%.*]], align 8
4376; CHECK-NEXT:    ret void
4377;
4378; GFX6-LABEL: srem_v3i16:
4379; GFX6:       ; %bb.0:
4380; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xb
4381; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
4382; GFX6-NEXT:    s_mov_b32 s3, 0xf000
4383; GFX6-NEXT:    s_mov_b32 s2, -1
4384; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
4385; GFX6-NEXT:    s_sext_i32_i16 s8, s6
4386; GFX6-NEXT:    v_cvt_f32_i32_e32 v0, s8
4387; GFX6-NEXT:    s_sext_i32_i16 s9, s4
4388; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s9
4389; GFX6-NEXT:    s_xor_b32 s8, s9, s8
4390; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v0
4391; GFX6-NEXT:    s_ashr_i32 s8, s8, 30
4392; GFX6-NEXT:    s_or_b32 s8, s8, 1
4393; GFX6-NEXT:    v_mov_b32_e32 v3, s8
4394; GFX6-NEXT:    v_mul_f32_e32 v2, v1, v2
4395; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
4396; GFX6-NEXT:    v_mad_f32 v1, -v2, v0, v1
4397; GFX6-NEXT:    v_cvt_i32_f32_e32 v2, v2
4398; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
4399; GFX6-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
4400; GFX6-NEXT:    v_mov_b32_e32 v1, s4
4401; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
4402; GFX6-NEXT:    v_mov_b32_e32 v2, s6
4403; GFX6-NEXT:    v_alignbit_b32 v2, s7, v2, 16
4404; GFX6-NEXT:    v_bfe_i32 v3, v2, 0, 16
4405; GFX6-NEXT:    v_cvt_f32_i32_e32 v4, v3
4406; GFX6-NEXT:    v_alignbit_b32 v1, s5, v1, 16
4407; GFX6-NEXT:    v_bfe_i32 v5, v1, 0, 16
4408; GFX6-NEXT:    v_cvt_f32_i32_e32 v6, v5
4409; GFX6-NEXT:    v_rcp_iflag_f32_e32 v7, v4
4410; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s6
4411; GFX6-NEXT:    v_xor_b32_e32 v3, v5, v3
4412; GFX6-NEXT:    v_ashrrev_i32_e32 v3, 30, v3
4413; GFX6-NEXT:    v_mul_f32_e32 v5, v6, v7
4414; GFX6-NEXT:    v_trunc_f32_e32 v5, v5
4415; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
4416; GFX6-NEXT:    v_mad_f32 v6, -v5, v4, v6
4417; GFX6-NEXT:    v_cvt_i32_f32_e32 v5, v5
4418; GFX6-NEXT:    s_sext_i32_i16 s4, s7
4419; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v6|, |v4|
4420; GFX6-NEXT:    v_cvt_f32_i32_e32 v4, s4
4421; GFX6-NEXT:    v_or_b32_e32 v3, 1, v3
4422; GFX6-NEXT:    v_cndmask_b32_e32 v3, 0, v3, vcc
4423; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
4424; GFX6-NEXT:    s_sext_i32_i16 s6, s5
4425; GFX6-NEXT:    v_mul_lo_u32 v2, v3, v2
4426; GFX6-NEXT:    v_cvt_f32_i32_e32 v3, s6
4427; GFX6-NEXT:    v_rcp_iflag_f32_e32 v5, v4
4428; GFX6-NEXT:    s_xor_b32 s4, s6, s4
4429; GFX6-NEXT:    s_ashr_i32 s4, s4, 30
4430; GFX6-NEXT:    s_or_b32 s4, s4, 1
4431; GFX6-NEXT:    v_mul_f32_e32 v5, v3, v5
4432; GFX6-NEXT:    v_trunc_f32_e32 v5, v5
4433; GFX6-NEXT:    v_mad_f32 v3, -v5, v4, v3
4434; GFX6-NEXT:    v_cvt_i32_f32_e32 v5, v5
4435; GFX6-NEXT:    v_mov_b32_e32 v6, s4
4436; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, |v4|
4437; GFX6-NEXT:    v_cndmask_b32_e32 v3, 0, v6, vcc
4438; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
4439; GFX6-NEXT:    v_mul_lo_u32 v3, v3, s7
4440; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v2
4441; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
4442; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s5, v3
4443; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
4444; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
4445; GFX6-NEXT:    buffer_store_short v2, off, s[0:3], 0 offset:4
4446; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4447; GFX6-NEXT:    s_endpgm
4448;
4449; GFX9-LABEL: srem_v3i16:
4450; GFX9:       ; %bb.0:
4451; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
4452; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4453; GFX9-NEXT:    s_sext_i32_i16 s8, s6
4454; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s8
4455; GFX9-NEXT:    s_sext_i32_i16 s9, s4
4456; GFX9-NEXT:    v_cvt_f32_i32_e32 v1, s9
4457; GFX9-NEXT:    s_xor_b32 s2, s9, s8
4458; GFX9-NEXT:    v_rcp_iflag_f32_e32 v2, v0
4459; GFX9-NEXT:    s_ashr_i32 s2, s2, 30
4460; GFX9-NEXT:    s_or_b32 s10, s2, 1
4461; GFX9-NEXT:    s_sext_i32_i16 s7, s7
4462; GFX9-NEXT:    v_mul_f32_e32 v2, v1, v2
4463; GFX9-NEXT:    v_trunc_f32_e32 v2, v2
4464; GFX9-NEXT:    v_mad_f32 v1, -v2, v0, v1
4465; GFX9-NEXT:    v_cmp_ge_f32_e64 s[2:3], |v1|, |v0|
4466; GFX9-NEXT:    s_and_b64 s[2:3], s[2:3], exec
4467; GFX9-NEXT:    s_cselect_b32 s2, s10, 0
4468; GFX9-NEXT:    s_ashr_i32 s6, s6, 16
4469; GFX9-NEXT:    v_cvt_i32_f32_e32 v2, v2
4470; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s6
4471; GFX9-NEXT:    s_ashr_i32 s4, s4, 16
4472; GFX9-NEXT:    s_sext_i32_i16 s5, s5
4473; GFX9-NEXT:    v_add_u32_e32 v1, s2, v2
4474; GFX9-NEXT:    v_cvt_f32_i32_e32 v2, s4
4475; GFX9-NEXT:    v_rcp_iflag_f32_e32 v3, v0
4476; GFX9-NEXT:    s_xor_b32 s2, s4, s6
4477; GFX9-NEXT:    s_ashr_i32 s2, s2, 30
4478; GFX9-NEXT:    v_mul_lo_u32 v1, v1, s8
4479; GFX9-NEXT:    v_mul_f32_e32 v3, v2, v3
4480; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
4481; GFX9-NEXT:    v_mad_f32 v2, -v3, v0, v2
4482; GFX9-NEXT:    s_or_b32 s8, s2, 1
4483; GFX9-NEXT:    v_cvt_i32_f32_e32 v3, v3
4484; GFX9-NEXT:    v_cmp_ge_f32_e64 s[2:3], |v2|, |v0|
4485; GFX9-NEXT:    v_cvt_f32_i32_e32 v2, s7
4486; GFX9-NEXT:    s_and_b64 s[2:3], s[2:3], exec
4487; GFX9-NEXT:    s_cselect_b32 s2, s8, 0
4488; GFX9-NEXT:    v_add_u32_e32 v0, s2, v3
4489; GFX9-NEXT:    v_cvt_f32_i32_e32 v3, s5
4490; GFX9-NEXT:    v_rcp_iflag_f32_e32 v4, v2
4491; GFX9-NEXT:    s_xor_b32 s2, s5, s7
4492; GFX9-NEXT:    s_ashr_i32 s2, s2, 30
4493; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s6
4494; GFX9-NEXT:    v_mul_f32_e32 v4, v3, v4
4495; GFX9-NEXT:    v_trunc_f32_e32 v4, v4
4496; GFX9-NEXT:    v_mad_f32 v3, -v4, v2, v3
4497; GFX9-NEXT:    v_cvt_i32_f32_e32 v4, v4
4498; GFX9-NEXT:    s_or_b32 s6, s2, 1
4499; GFX9-NEXT:    v_cmp_ge_f32_e64 s[2:3], |v3|, |v2|
4500; GFX9-NEXT:    s_and_b64 s[2:3], s[2:3], exec
4501; GFX9-NEXT:    s_cselect_b32 s2, s6, 0
4502; GFX9-NEXT:    v_add_u32_e32 v2, s2, v4
4503; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4504; GFX9-NEXT:    v_mul_lo_u32 v2, v2, s7
4505; GFX9-NEXT:    v_sub_u32_e32 v1, s9, v1
4506; GFX9-NEXT:    v_mov_b32_e32 v3, 0
4507; GFX9-NEXT:    v_sub_u32_e32 v0, s4, v0
4508; GFX9-NEXT:    v_sub_u32_e32 v2, s5, v2
4509; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
4510; GFX9-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
4511; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4512; GFX9-NEXT:    global_store_short v3, v2, s[0:1] offset:4
4513; GFX9-NEXT:    global_store_dword v3, v0, s[0:1]
4514; GFX9-NEXT:    s_endpgm
; Input under test: one vector srem whose result is stored through %out.
4515  %r = srem <3 x i16> %x, %y
4516  store <3 x i16> %r, <3 x i16> addrspace(1)* %out
4517  ret void
4518}
4519
; Unsigned division of two <3 x i15> kernel arguments, with the result stored
; to %out.
;
; The opt-run assertions below show each of the three lanes being handled
; separately: the operands are zero-extended to i32, a quotient is computed
; through a float reciprocal (rcp, fmul, trunc), 1 is added to it when
; |remainder| >= |divisor|, and the result is masked with 32767 and truncated
; back to i15.
;
; The GFX6 and GFX9 llc assertions cover the generated ISA; in both, lane 1 is
; shifted left by 15 and lane 2 by 30 before the lanes are OR-ed together, and
; the packed value is written as one dword plus one short at offset 4 (the
; upper part masked with 0x1fff).
;
; All assertion lines are autogenerated (see the NOTE on the first line of the
; file); regenerate them with the update script instead of editing by hand.
4520define amdgpu_kernel void @udiv_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x, <3 x i15> %y) {
4521; CHECK-LABEL: @udiv_v3i15(
4522; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <3 x i15> [[X:%.*]], i64 0
4523; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <3 x i15> [[Y:%.*]], i64 0
4524; CHECK-NEXT:    [[TMP3:%.*]] = zext i15 [[TMP1]] to i32
4525; CHECK-NEXT:    [[TMP4:%.*]] = zext i15 [[TMP2]] to i32
4526; CHECK-NEXT:    [[TMP5:%.*]] = uitofp i32 [[TMP3]] to float
4527; CHECK-NEXT:    [[TMP6:%.*]] = uitofp i32 [[TMP4]] to float
4528; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP6]])
4529; CHECK-NEXT:    [[TMP8:%.*]] = fmul fast float [[TMP5]], [[TMP7]]
4530; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.trunc.f32(float [[TMP8]])
4531; CHECK-NEXT:    [[TMP10:%.*]] = fneg fast float [[TMP9]]
4532; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP10]], float [[TMP6]], float [[TMP5]])
4533; CHECK-NEXT:    [[TMP12:%.*]] = fptoui float [[TMP9]] to i32
4534; CHECK-NEXT:    [[TMP13:%.*]] = call fast float @llvm.fabs.f32(float [[TMP11]])
4535; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP6]])
4536; CHECK-NEXT:    [[TMP15:%.*]] = fcmp fast oge float [[TMP13]], [[TMP14]]
4537; CHECK-NEXT:    [[TMP16:%.*]] = select i1 [[TMP15]], i32 1, i32 0
4538; CHECK-NEXT:    [[TMP17:%.*]] = add i32 [[TMP12]], [[TMP16]]
4539; CHECK-NEXT:    [[TMP18:%.*]] = and i32 [[TMP17]], 32767
4540; CHECK-NEXT:    [[TMP19:%.*]] = trunc i32 [[TMP18]] to i15
4541; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <3 x i15> undef, i15 [[TMP19]], i64 0
4542; CHECK-NEXT:    [[TMP21:%.*]] = extractelement <3 x i15> [[X]], i64 1
4543; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <3 x i15> [[Y]], i64 1
4544; CHECK-NEXT:    [[TMP23:%.*]] = zext i15 [[TMP21]] to i32
4545; CHECK-NEXT:    [[TMP24:%.*]] = zext i15 [[TMP22]] to i32
4546; CHECK-NEXT:    [[TMP25:%.*]] = uitofp i32 [[TMP23]] to float
4547; CHECK-NEXT:    [[TMP26:%.*]] = uitofp i32 [[TMP24]] to float
4548; CHECK-NEXT:    [[TMP27:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP26]])
4549; CHECK-NEXT:    [[TMP28:%.*]] = fmul fast float [[TMP25]], [[TMP27]]
4550; CHECK-NEXT:    [[TMP29:%.*]] = call fast float @llvm.trunc.f32(float [[TMP28]])
4551; CHECK-NEXT:    [[TMP30:%.*]] = fneg fast float [[TMP29]]
4552; CHECK-NEXT:    [[TMP31:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP30]], float [[TMP26]], float [[TMP25]])
4553; CHECK-NEXT:    [[TMP32:%.*]] = fptoui float [[TMP29]] to i32
4554; CHECK-NEXT:    [[TMP33:%.*]] = call fast float @llvm.fabs.f32(float [[TMP31]])
4555; CHECK-NEXT:    [[TMP34:%.*]] = call fast float @llvm.fabs.f32(float [[TMP26]])
4556; CHECK-NEXT:    [[TMP35:%.*]] = fcmp fast oge float [[TMP33]], [[TMP34]]
4557; CHECK-NEXT:    [[TMP36:%.*]] = select i1 [[TMP35]], i32 1, i32 0
4558; CHECK-NEXT:    [[TMP37:%.*]] = add i32 [[TMP32]], [[TMP36]]
4559; CHECK-NEXT:    [[TMP38:%.*]] = and i32 [[TMP37]], 32767
4560; CHECK-NEXT:    [[TMP39:%.*]] = trunc i32 [[TMP38]] to i15
4561; CHECK-NEXT:    [[TMP40:%.*]] = insertelement <3 x i15> [[TMP20]], i15 [[TMP39]], i64 1
4562; CHECK-NEXT:    [[TMP41:%.*]] = extractelement <3 x i15> [[X]], i64 2
4563; CHECK-NEXT:    [[TMP42:%.*]] = extractelement <3 x i15> [[Y]], i64 2
4564; CHECK-NEXT:    [[TMP43:%.*]] = zext i15 [[TMP41]] to i32
4565; CHECK-NEXT:    [[TMP44:%.*]] = zext i15 [[TMP42]] to i32
4566; CHECK-NEXT:    [[TMP45:%.*]] = uitofp i32 [[TMP43]] to float
4567; CHECK-NEXT:    [[TMP46:%.*]] = uitofp i32 [[TMP44]] to float
4568; CHECK-NEXT:    [[TMP47:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP46]])
4569; CHECK-NEXT:    [[TMP48:%.*]] = fmul fast float [[TMP45]], [[TMP47]]
4570; CHECK-NEXT:    [[TMP49:%.*]] = call fast float @llvm.trunc.f32(float [[TMP48]])
4571; CHECK-NEXT:    [[TMP50:%.*]] = fneg fast float [[TMP49]]
4572; CHECK-NEXT:    [[TMP51:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP50]], float [[TMP46]], float [[TMP45]])
4573; CHECK-NEXT:    [[TMP52:%.*]] = fptoui float [[TMP49]] to i32
4574; CHECK-NEXT:    [[TMP53:%.*]] = call fast float @llvm.fabs.f32(float [[TMP51]])
4575; CHECK-NEXT:    [[TMP54:%.*]] = call fast float @llvm.fabs.f32(float [[TMP46]])
4576; CHECK-NEXT:    [[TMP55:%.*]] = fcmp fast oge float [[TMP53]], [[TMP54]]
4577; CHECK-NEXT:    [[TMP56:%.*]] = select i1 [[TMP55]], i32 1, i32 0
4578; CHECK-NEXT:    [[TMP57:%.*]] = add i32 [[TMP52]], [[TMP56]]
4579; CHECK-NEXT:    [[TMP58:%.*]] = and i32 [[TMP57]], 32767
4580; CHECK-NEXT:    [[TMP59:%.*]] = trunc i32 [[TMP58]] to i15
4581; CHECK-NEXT:    [[TMP60:%.*]] = insertelement <3 x i15> [[TMP40]], i15 [[TMP59]], i64 2
4582; CHECK-NEXT:    store <3 x i15> [[TMP60]], <3 x i15> addrspace(1)* [[OUT:%.*]], align 8
4583; CHECK-NEXT:    ret void
4584;
4585; GFX6-LABEL: udiv_v3i15:
4586; GFX6:       ; %bb.0:
4587; GFX6-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
4588; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
4589; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
4590; GFX6-NEXT:    s_mov_b32 s7, 0xf000
4591; GFX6-NEXT:    s_mov_b32 s6, -1
4592; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
4593; GFX6-NEXT:    v_mov_b32_e32 v0, s2
4594; GFX6-NEXT:    v_alignbit_b32 v0, s3, v0, 30
4595; GFX6-NEXT:    s_and_b32 s8, s0, 0x7fff
4596; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s8
4597; GFX6-NEXT:    s_and_b32 s3, s2, 0x7fff
4598; GFX6-NEXT:    v_mov_b32_e32 v2, s0
4599; GFX6-NEXT:    s_bfe_u32 s0, s0, 0xf000f
4600; GFX6-NEXT:    v_cvt_f32_u32_e32 v3, s3
4601; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v1
4602; GFX6-NEXT:    v_cvt_f32_u32_e32 v5, s0
4603; GFX6-NEXT:    s_bfe_u32 s2, s2, 0xf000f
4604; GFX6-NEXT:    v_alignbit_b32 v2, s1, v2, 30
4605; GFX6-NEXT:    v_mul_f32_e32 v4, v3, v4
4606; GFX6-NEXT:    v_cvt_f32_u32_e32 v6, s2
4607; GFX6-NEXT:    v_rcp_iflag_f32_e32 v7, v5
4608; GFX6-NEXT:    v_and_b32_e32 v2, 0x7fff, v2
4609; GFX6-NEXT:    v_trunc_f32_e32 v4, v4
4610; GFX6-NEXT:    v_mad_f32 v3, -v4, v1, v3
4611; GFX6-NEXT:    v_cvt_u32_f32_e32 v4, v4
4612; GFX6-NEXT:    v_cvt_f32_u32_e32 v2, v2
4613; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v1
4614; GFX6-NEXT:    v_mul_f32_e32 v1, v6, v7
4615; GFX6-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
4616; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
4617; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
4618; GFX6-NEXT:    v_mad_f32 v4, -v1, v5, v6
4619; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
4620; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, v0
4621; GFX6-NEXT:    v_rcp_iflag_f32_e32 v6, v2
4622; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v4|, v5
4623; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
4624; GFX6-NEXT:    v_mul_f32_e32 v1, v0, v6
4625; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
4626; GFX6-NEXT:    v_cvt_u32_f32_e32 v5, v1
4627; GFX6-NEXT:    v_mad_f32 v0, -v1, v2, v0
4628; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v0|, v2
4629; GFX6-NEXT:    v_and_b32_e32 v2, 0x7fff, v3
4630; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, 0, v5, vcc
4631; GFX6-NEXT:    v_and_b32_e32 v3, 0x7fff, v4
4632; GFX6-NEXT:    v_lshl_b64 v[0:1], v[0:1], 30
4633; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 15, v3
4634; GFX6-NEXT:    v_or_b32_e32 v2, v2, v3
4635; GFX6-NEXT:    v_or_b32_e32 v0, v2, v0
4636; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
4637; GFX6-NEXT:    s_waitcnt expcnt(0)
4638; GFX6-NEXT:    v_and_b32_e32 v0, 0x1fff, v1
4639; GFX6-NEXT:    buffer_store_short v0, off, s[4:7], 0 offset:4
4640; GFX6-NEXT:    s_endpgm
4641;
4642; GFX9-LABEL: udiv_v3i15:
4643; GFX9:       ; %bb.0:
4644; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
4645; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
4646; GFX9-NEXT:    v_mov_b32_e32 v2, 0
4647; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
4648; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4649; GFX9-NEXT:    v_mov_b32_e32 v0, s2
4650; GFX9-NEXT:    v_alignbit_b32 v0, s3, v0, 30
4651; GFX9-NEXT:    s_and_b32 s6, s2, 0x7fff
4652; GFX9-NEXT:    s_and_b32 s3, s0, 0x7fff
4653; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s3
4654; GFX9-NEXT:    v_mov_b32_e32 v3, s0
4655; GFX9-NEXT:    s_bfe_u32 s0, s0, 0xf000f
4656; GFX9-NEXT:    v_cvt_f32_u32_e32 v4, s6
4657; GFX9-NEXT:    v_rcp_iflag_f32_e32 v5, v1
4658; GFX9-NEXT:    v_cvt_f32_u32_e32 v6, s0
4659; GFX9-NEXT:    s_bfe_u32 s2, s2, 0xf000f
4660; GFX9-NEXT:    v_alignbit_b32 v3, s1, v3, 30
4661; GFX9-NEXT:    v_mul_f32_e32 v5, v4, v5
4662; GFX9-NEXT:    v_cvt_f32_u32_e32 v7, s2
4663; GFX9-NEXT:    v_rcp_iflag_f32_e32 v8, v6
4664; GFX9-NEXT:    v_and_b32_e32 v3, 0x7fff, v3
4665; GFX9-NEXT:    v_trunc_f32_e32 v5, v5
4666; GFX9-NEXT:    v_mad_f32 v4, -v5, v1, v4
4667; GFX9-NEXT:    v_cvt_u32_f32_e32 v5, v5
4668; GFX9-NEXT:    v_cvt_f32_u32_e32 v3, v3
4669; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v4|, v1
4670; GFX9-NEXT:    v_mul_f32_e32 v1, v7, v8
4671; GFX9-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
4672; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
4673; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v5, vcc
4674; GFX9-NEXT:    v_mad_f32 v5, -v1, v6, v7
4675; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
4676; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, v0
4677; GFX9-NEXT:    v_rcp_iflag_f32_e32 v7, v3
4678; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v5|, v6
4679; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
4680; GFX9-NEXT:    v_mul_f32_e32 v1, v0, v7
4681; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
4682; GFX9-NEXT:    v_cvt_u32_f32_e32 v6, v1
4683; GFX9-NEXT:    v_mad_f32 v0, -v1, v3, v0
4684; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v0|, v3
4685; GFX9-NEXT:    v_and_b32_e32 v3, 0x7fff, v4
4686; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v6, vcc
4687; GFX9-NEXT:    v_and_b32_e32 v4, 0x7fff, v5
4688; GFX9-NEXT:    v_lshlrev_b64 v[0:1], 30, v[0:1]
4689; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 15, v4
4690; GFX9-NEXT:    v_or_b32_e32 v3, v3, v4
4691; GFX9-NEXT:    v_or_b32_e32 v0, v3, v0
4692; GFX9-NEXT:    global_store_dword v2, v0, s[4:5]
4693; GFX9-NEXT:    v_and_b32_e32 v0, 0x1fff, v1
4694; GFX9-NEXT:    global_store_short v2, v0, s[4:5] offset:4
4695; GFX9-NEXT:    s_endpgm
; Input under test: one vector udiv whose result is stored through %out.
4696  %r = udiv <3 x i15> %x, %y
4697  store <3 x i15> %r, <3 x i15> addrspace(1)* %out
4698  ret void
4699}
4700
4701define amdgpu_kernel void @urem_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x, <3 x i15> %y) {
4702; CHECK-LABEL: @urem_v3i15(
4703; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <3 x i15> [[X:%.*]], i64 0
4704; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <3 x i15> [[Y:%.*]], i64 0
4705; CHECK-NEXT:    [[TMP3:%.*]] = zext i15 [[TMP1]] to i32
4706; CHECK-NEXT:    [[TMP4:%.*]] = zext i15 [[TMP2]] to i32
4707; CHECK-NEXT:    [[TMP5:%.*]] = uitofp i32 [[TMP3]] to float
4708; CHECK-NEXT:    [[TMP6:%.*]] = uitofp i32 [[TMP4]] to float
4709; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP6]])
4710; CHECK-NEXT:    [[TMP8:%.*]] = fmul fast float [[TMP5]], [[TMP7]]
4711; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.trunc.f32(float [[TMP8]])
4712; CHECK-NEXT:    [[TMP10:%.*]] = fneg fast float [[TMP9]]
4713; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP10]], float [[TMP6]], float [[TMP5]])
4714; CHECK-NEXT:    [[TMP12:%.*]] = fptoui float [[TMP9]] to i32
4715; CHECK-NEXT:    [[TMP13:%.*]] = call fast float @llvm.fabs.f32(float [[TMP11]])
4716; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP6]])
4717; CHECK-NEXT:    [[TMP15:%.*]] = fcmp fast oge float [[TMP13]], [[TMP14]]
4718; CHECK-NEXT:    [[TMP16:%.*]] = select i1 [[TMP15]], i32 1, i32 0
4719; CHECK-NEXT:    [[TMP17:%.*]] = add i32 [[TMP12]], [[TMP16]]
4720; CHECK-NEXT:    [[TMP18:%.*]] = mul i32 [[TMP17]], [[TMP4]]
4721; CHECK-NEXT:    [[TMP19:%.*]] = sub i32 [[TMP3]], [[TMP18]]
4722; CHECK-NEXT:    [[TMP20:%.*]] = and i32 [[TMP19]], 32767
4723; CHECK-NEXT:    [[TMP21:%.*]] = trunc i32 [[TMP20]] to i15
4724; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <3 x i15> undef, i15 [[TMP21]], i64 0
4725; CHECK-NEXT:    [[TMP23:%.*]] = extractelement <3 x i15> [[X]], i64 1
4726; CHECK-NEXT:    [[TMP24:%.*]] = extractelement <3 x i15> [[Y]], i64 1
4727; CHECK-NEXT:    [[TMP25:%.*]] = zext i15 [[TMP23]] to i32
4728; CHECK-NEXT:    [[TMP26:%.*]] = zext i15 [[TMP24]] to i32
4729; CHECK-NEXT:    [[TMP27:%.*]] = uitofp i32 [[TMP25]] to float
4730; CHECK-NEXT:    [[TMP28:%.*]] = uitofp i32 [[TMP26]] to float
4731; CHECK-NEXT:    [[TMP29:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP28]])
4732; CHECK-NEXT:    [[TMP30:%.*]] = fmul fast float [[TMP27]], [[TMP29]]
4733; CHECK-NEXT:    [[TMP31:%.*]] = call fast float @llvm.trunc.f32(float [[TMP30]])
4734; CHECK-NEXT:    [[TMP32:%.*]] = fneg fast float [[TMP31]]
4735; CHECK-NEXT:    [[TMP33:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP32]], float [[TMP28]], float [[TMP27]])
4736; CHECK-NEXT:    [[TMP34:%.*]] = fptoui float [[TMP31]] to i32
4737; CHECK-NEXT:    [[TMP35:%.*]] = call fast float @llvm.fabs.f32(float [[TMP33]])
4738; CHECK-NEXT:    [[TMP36:%.*]] = call fast float @llvm.fabs.f32(float [[TMP28]])
4739; CHECK-NEXT:    [[TMP37:%.*]] = fcmp fast oge float [[TMP35]], [[TMP36]]
4740; CHECK-NEXT:    [[TMP38:%.*]] = select i1 [[TMP37]], i32 1, i32 0
4741; CHECK-NEXT:    [[TMP39:%.*]] = add i32 [[TMP34]], [[TMP38]]
4742; CHECK-NEXT:    [[TMP40:%.*]] = mul i32 [[TMP39]], [[TMP26]]
4743; CHECK-NEXT:    [[TMP41:%.*]] = sub i32 [[TMP25]], [[TMP40]]
4744; CHECK-NEXT:    [[TMP42:%.*]] = and i32 [[TMP41]], 32767
4745; CHECK-NEXT:    [[TMP43:%.*]] = trunc i32 [[TMP42]] to i15
4746; CHECK-NEXT:    [[TMP44:%.*]] = insertelement <3 x i15> [[TMP22]], i15 [[TMP43]], i64 1
4747; CHECK-NEXT:    [[TMP45:%.*]] = extractelement <3 x i15> [[X]], i64 2
4748; CHECK-NEXT:    [[TMP46:%.*]] = extractelement <3 x i15> [[Y]], i64 2
4749; CHECK-NEXT:    [[TMP47:%.*]] = zext i15 [[TMP45]] to i32
4750; CHECK-NEXT:    [[TMP48:%.*]] = zext i15 [[TMP46]] to i32
4751; CHECK-NEXT:    [[TMP49:%.*]] = uitofp i32 [[TMP47]] to float
4752; CHECK-NEXT:    [[TMP50:%.*]] = uitofp i32 [[TMP48]] to float
4753; CHECK-NEXT:    [[TMP51:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP50]])
4754; CHECK-NEXT:    [[TMP52:%.*]] = fmul fast float [[TMP49]], [[TMP51]]
4755; CHECK-NEXT:    [[TMP53:%.*]] = call fast float @llvm.trunc.f32(float [[TMP52]])
4756; CHECK-NEXT:    [[TMP54:%.*]] = fneg fast float [[TMP53]]
4757; CHECK-NEXT:    [[TMP55:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP54]], float [[TMP50]], float [[TMP49]])
4758; CHECK-NEXT:    [[TMP56:%.*]] = fptoui float [[TMP53]] to i32
4759; CHECK-NEXT:    [[TMP57:%.*]] = call fast float @llvm.fabs.f32(float [[TMP55]])
4760; CHECK-NEXT:    [[TMP58:%.*]] = call fast float @llvm.fabs.f32(float [[TMP50]])
4761; CHECK-NEXT:    [[TMP59:%.*]] = fcmp fast oge float [[TMP57]], [[TMP58]]
4762; CHECK-NEXT:    [[TMP60:%.*]] = select i1 [[TMP59]], i32 1, i32 0
4763; CHECK-NEXT:    [[TMP61:%.*]] = add i32 [[TMP56]], [[TMP60]]
4764; CHECK-NEXT:    [[TMP62:%.*]] = mul i32 [[TMP61]], [[TMP48]]
4765; CHECK-NEXT:    [[TMP63:%.*]] = sub i32 [[TMP47]], [[TMP62]]
4766; CHECK-NEXT:    [[TMP64:%.*]] = and i32 [[TMP63]], 32767
4767; CHECK-NEXT:    [[TMP65:%.*]] = trunc i32 [[TMP64]] to i15
4768; CHECK-NEXT:    [[TMP66:%.*]] = insertelement <3 x i15> [[TMP44]], i15 [[TMP65]], i64 2
4769; CHECK-NEXT:    store <3 x i15> [[TMP66]], <3 x i15> addrspace(1)* [[OUT:%.*]], align 8
4770; CHECK-NEXT:    ret void
4771;
4772; GFX6-LABEL: urem_v3i15:
4773; GFX6:       ; %bb.0:
4774; GFX6-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
4775; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
4776; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
4777; GFX6-NEXT:    s_mov_b32 s7, 0xf000
4778; GFX6-NEXT:    s_mov_b32 s6, -1
4779; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
4780; GFX6-NEXT:    s_and_b32 s8, s2, 0x7fff
4781; GFX6-NEXT:    v_cvt_f32_u32_e32 v3, s8
4782; GFX6-NEXT:    s_and_b32 s9, s0, 0x7fff
4783; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s9
4784; GFX6-NEXT:    v_mov_b32_e32 v2, s0
4785; GFX6-NEXT:    v_alignbit_b32 v2, s1, v2, 30
4786; GFX6-NEXT:    s_bfe_u32 s1, s0, 0xf000f
4787; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v1
4788; GFX6-NEXT:    v_cvt_f32_u32_e32 v5, s1
4789; GFX6-NEXT:    s_bfe_u32 s9, s2, 0xf000f
4790; GFX6-NEXT:    v_and_b32_e32 v2, 0x7fff, v2
4791; GFX6-NEXT:    v_mul_f32_e32 v4, v3, v4
4792; GFX6-NEXT:    v_trunc_f32_e32 v4, v4
4793; GFX6-NEXT:    v_mad_f32 v3, -v4, v1, v3
4794; GFX6-NEXT:    v_cvt_u32_f32_e32 v4, v4
4795; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v1
4796; GFX6-NEXT:    v_cvt_f32_u32_e32 v3, s9
4797; GFX6-NEXT:    v_mov_b32_e32 v0, s2
4798; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v4, vcc
4799; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s0
4800; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v5
4801; GFX6-NEXT:    v_alignbit_b32 v0, s3, v0, 30
4802; GFX6-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
4803; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, s2, v1
4804; GFX6-NEXT:    v_mul_f32_e32 v1, v3, v4
4805; GFX6-NEXT:    v_cvt_f32_u32_e32 v4, v2
4806; GFX6-NEXT:    v_cvt_f32_u32_e32 v7, v0
4807; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
4808; GFX6-NEXT:    v_mad_f32 v3, -v1, v5, v3
4809; GFX6-NEXT:    v_rcp_iflag_f32_e32 v8, v4
4810; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
4811; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v5
4812; GFX6-NEXT:    s_lshr_b32 s0, s0, 15
4813; GFX6-NEXT:    v_mul_f32_e32 v3, v7, v8
4814; GFX6-NEXT:    v_trunc_f32_e32 v3, v3
4815; GFX6-NEXT:    v_cvt_u32_f32_e32 v5, v3
4816; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
4817; GFX6-NEXT:    v_mad_f32 v3, -v3, v4, v7
4818; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v4
4819; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s0
4820; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
4821; GFX6-NEXT:    v_mul_lo_u32 v2, v3, v2
4822; GFX6-NEXT:    s_lshr_b32 s3, s2, 15
4823; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s3, v1
4824; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, v2, v0
4825; GFX6-NEXT:    v_and_b32_e32 v3, 0x7fff, v3
4826; GFX6-NEXT:    v_lshl_b64 v[0:1], v[0:1], 30
4827; GFX6-NEXT:    v_and_b32_e32 v2, 0x7fff, v6
4828; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 15, v3
4829; GFX6-NEXT:    v_or_b32_e32 v2, v2, v3
4830; GFX6-NEXT:    v_or_b32_e32 v0, v2, v0
4831; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
4832; GFX6-NEXT:    s_waitcnt expcnt(0)
4833; GFX6-NEXT:    v_and_b32_e32 v0, 0x1fff, v1
4834; GFX6-NEXT:    buffer_store_short v0, off, s[4:7], 0 offset:4
4835; GFX6-NEXT:    s_endpgm
4836;
4837; GFX9-LABEL: urem_v3i15:
4838; GFX9:       ; %bb.0:
4839; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
4840; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
4841; GFX9-NEXT:    v_mov_b32_e32 v2, 0
4842; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
4843; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4844; GFX9-NEXT:    s_and_b32 s6, s2, 0x7fff
4845; GFX9-NEXT:    v_cvt_f32_u32_e32 v4, s6
4846; GFX9-NEXT:    v_mov_b32_e32 v0, s2
4847; GFX9-NEXT:    s_and_b32 s7, s0, 0x7fff
4848; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s7
4849; GFX9-NEXT:    s_bfe_u32 s6, s0, 0xf000f
4850; GFX9-NEXT:    v_cvt_f32_u32_e32 v6, s6
4851; GFX9-NEXT:    v_mov_b32_e32 v3, s0
4852; GFX9-NEXT:    v_rcp_iflag_f32_e32 v5, v1
4853; GFX9-NEXT:    v_alignbit_b32 v3, s1, v3, 30
4854; GFX9-NEXT:    v_alignbit_b32 v0, s3, v0, 30
4855; GFX9-NEXT:    s_bfe_u32 s3, s2, 0xf000f
4856; GFX9-NEXT:    v_mul_f32_e32 v5, v4, v5
4857; GFX9-NEXT:    v_trunc_f32_e32 v5, v5
4858; GFX9-NEXT:    v_mad_f32 v4, -v5, v1, v4
4859; GFX9-NEXT:    v_cvt_u32_f32_e32 v5, v5
4860; GFX9-NEXT:    v_and_b32_e32 v3, 0x7fff, v3
4861; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v4|, v1
4862; GFX9-NEXT:    v_cvt_f32_u32_e32 v7, s3
4863; GFX9-NEXT:    v_rcp_iflag_f32_e32 v8, v6
4864; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v5, vcc
4865; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, v3
4866; GFX9-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
4867; GFX9-NEXT:    v_mul_f32_e32 v4, v7, v8
4868; GFX9-NEXT:    v_cvt_f32_u32_e32 v8, v0
4869; GFX9-NEXT:    v_rcp_iflag_f32_e32 v9, v5
4870; GFX9-NEXT:    v_trunc_f32_e32 v4, v4
4871; GFX9-NEXT:    v_mad_f32 v7, -v4, v6, v7
4872; GFX9-NEXT:    v_cvt_u32_f32_e32 v4, v4
4873; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v7|, v6
4874; GFX9-NEXT:    v_mul_f32_e32 v6, v8, v9
4875; GFX9-NEXT:    v_trunc_f32_e32 v6, v6
4876; GFX9-NEXT:    v_cvt_u32_f32_e32 v7, v6
4877; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
4878; GFX9-NEXT:    v_mad_f32 v6, -v6, v5, v8
4879; GFX9-NEXT:    s_lshr_b32 s1, s0, 15
4880; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v6|, v5
4881; GFX9-NEXT:    v_mul_lo_u32 v4, v4, s1
4882; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v7, vcc
4883; GFX9-NEXT:    v_mul_lo_u32 v1, v1, s0
4884; GFX9-NEXT:    v_mul_lo_u32 v3, v5, v3
4885; GFX9-NEXT:    s_lshr_b32 s0, s2, 15
4886; GFX9-NEXT:    v_sub_u32_e32 v4, s0, v4
4887; GFX9-NEXT:    v_sub_u32_e32 v5, s2, v1
4888; GFX9-NEXT:    v_sub_u32_e32 v0, v0, v3
4889; GFX9-NEXT:    v_and_b32_e32 v4, 0x7fff, v4
4890; GFX9-NEXT:    v_lshlrev_b64 v[0:1], 30, v[0:1]
4891; GFX9-NEXT:    v_and_b32_e32 v3, 0x7fff, v5
4892; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 15, v4
4893; GFX9-NEXT:    v_or_b32_e32 v3, v3, v4
4894; GFX9-NEXT:    v_or_b32_e32 v0, v3, v0
4895; GFX9-NEXT:    global_store_dword v2, v0, s[4:5]
4896; GFX9-NEXT:    v_and_b32_e32 v0, 0x1fff, v1
4897; GFX9-NEXT:    global_store_short v2, v0, s[4:5] offset:4
4898; GFX9-NEXT:    s_endpgm
4899  %r = urem <3 x i15> %x, %y
4900  store <3 x i15> %r, <3 x i15> addrspace(1)* %out
4901  ret void
4902}
4903
; Signed division of two <3 x i15> kernel arguments; the quotient vector is
; stored to %out.
; Expected opt output: each lane is extracted and sign-extended to i32, then
; divided through a float path (sitofp, amdgcn.rcp, fmul, trunc, and an
; fmad.ftz residual). When |residual| >= |divisor| the truncated quotient is
; adjusted by ((x xor y) ashr 30) | 1, otherwise by 0. The result is narrowed
; back to i15 with shl/ashr by 17 followed by trunc.
; Expected llc output (tahiti and gfx900): the three 15-bit lanes are masked
; with 0x7fff, shifted to bit offsets 0, 15 and 30, OR-ed together, and written
; as a dword store plus a short store at offset 4 masked with 0x1fff.
4904define amdgpu_kernel void @sdiv_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x, <3 x i15> %y) {
4905; CHECK-LABEL: @sdiv_v3i15(
4906; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <3 x i15> [[X:%.*]], i64 0
4907; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <3 x i15> [[Y:%.*]], i64 0
4908; CHECK-NEXT:    [[TMP3:%.*]] = sext i15 [[TMP1]] to i32
4909; CHECK-NEXT:    [[TMP4:%.*]] = sext i15 [[TMP2]] to i32
4910; CHECK-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]]
4911; CHECK-NEXT:    [[TMP6:%.*]] = ashr i32 [[TMP5]], 30
4912; CHECK-NEXT:    [[TMP7:%.*]] = or i32 [[TMP6]], 1
4913; CHECK-NEXT:    [[TMP8:%.*]] = sitofp i32 [[TMP3]] to float
4914; CHECK-NEXT:    [[TMP9:%.*]] = sitofp i32 [[TMP4]] to float
4915; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]])
4916; CHECK-NEXT:    [[TMP11:%.*]] = fmul fast float [[TMP8]], [[TMP10]]
4917; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.trunc.f32(float [[TMP11]])
4918; CHECK-NEXT:    [[TMP13:%.*]] = fneg fast float [[TMP12]]
4919; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP13]], float [[TMP9]], float [[TMP8]])
4920; CHECK-NEXT:    [[TMP15:%.*]] = fptosi float [[TMP12]] to i32
4921; CHECK-NEXT:    [[TMP16:%.*]] = call fast float @llvm.fabs.f32(float [[TMP14]])
4922; CHECK-NEXT:    [[TMP17:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
4923; CHECK-NEXT:    [[TMP18:%.*]] = fcmp fast oge float [[TMP16]], [[TMP17]]
4924; CHECK-NEXT:    [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP7]], i32 0
4925; CHECK-NEXT:    [[TMP20:%.*]] = add i32 [[TMP15]], [[TMP19]]
4926; CHECK-NEXT:    [[TMP21:%.*]] = shl i32 [[TMP20]], 17
4927; CHECK-NEXT:    [[TMP22:%.*]] = ashr i32 [[TMP21]], 17
4928; CHECK-NEXT:    [[TMP23:%.*]] = trunc i32 [[TMP22]] to i15
4929; CHECK-NEXT:    [[TMP24:%.*]] = insertelement <3 x i15> undef, i15 [[TMP23]], i64 0
4930; CHECK-NEXT:    [[TMP25:%.*]] = extractelement <3 x i15> [[X]], i64 1
4931; CHECK-NEXT:    [[TMP26:%.*]] = extractelement <3 x i15> [[Y]], i64 1
4932; CHECK-NEXT:    [[TMP27:%.*]] = sext i15 [[TMP25]] to i32
4933; CHECK-NEXT:    [[TMP28:%.*]] = sext i15 [[TMP26]] to i32
4934; CHECK-NEXT:    [[TMP29:%.*]] = xor i32 [[TMP27]], [[TMP28]]
4935; CHECK-NEXT:    [[TMP30:%.*]] = ashr i32 [[TMP29]], 30
4936; CHECK-NEXT:    [[TMP31:%.*]] = or i32 [[TMP30]], 1
4937; CHECK-NEXT:    [[TMP32:%.*]] = sitofp i32 [[TMP27]] to float
4938; CHECK-NEXT:    [[TMP33:%.*]] = sitofp i32 [[TMP28]] to float
4939; CHECK-NEXT:    [[TMP34:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP33]])
4940; CHECK-NEXT:    [[TMP35:%.*]] = fmul fast float [[TMP32]], [[TMP34]]
4941; CHECK-NEXT:    [[TMP36:%.*]] = call fast float @llvm.trunc.f32(float [[TMP35]])
4942; CHECK-NEXT:    [[TMP37:%.*]] = fneg fast float [[TMP36]]
4943; CHECK-NEXT:    [[TMP38:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP37]], float [[TMP33]], float [[TMP32]])
4944; CHECK-NEXT:    [[TMP39:%.*]] = fptosi float [[TMP36]] to i32
4945; CHECK-NEXT:    [[TMP40:%.*]] = call fast float @llvm.fabs.f32(float [[TMP38]])
4946; CHECK-NEXT:    [[TMP41:%.*]] = call fast float @llvm.fabs.f32(float [[TMP33]])
4947; CHECK-NEXT:    [[TMP42:%.*]] = fcmp fast oge float [[TMP40]], [[TMP41]]
4948; CHECK-NEXT:    [[TMP43:%.*]] = select i1 [[TMP42]], i32 [[TMP31]], i32 0
4949; CHECK-NEXT:    [[TMP44:%.*]] = add i32 [[TMP39]], [[TMP43]]
4950; CHECK-NEXT:    [[TMP45:%.*]] = shl i32 [[TMP44]], 17
4951; CHECK-NEXT:    [[TMP46:%.*]] = ashr i32 [[TMP45]], 17
4952; CHECK-NEXT:    [[TMP47:%.*]] = trunc i32 [[TMP46]] to i15
4953; CHECK-NEXT:    [[TMP48:%.*]] = insertelement <3 x i15> [[TMP24]], i15 [[TMP47]], i64 1
4954; CHECK-NEXT:    [[TMP49:%.*]] = extractelement <3 x i15> [[X]], i64 2
4955; CHECK-NEXT:    [[TMP50:%.*]] = extractelement <3 x i15> [[Y]], i64 2
4956; CHECK-NEXT:    [[TMP51:%.*]] = sext i15 [[TMP49]] to i32
4957; CHECK-NEXT:    [[TMP52:%.*]] = sext i15 [[TMP50]] to i32
4958; CHECK-NEXT:    [[TMP53:%.*]] = xor i32 [[TMP51]], [[TMP52]]
4959; CHECK-NEXT:    [[TMP54:%.*]] = ashr i32 [[TMP53]], 30
4960; CHECK-NEXT:    [[TMP55:%.*]] = or i32 [[TMP54]], 1
4961; CHECK-NEXT:    [[TMP56:%.*]] = sitofp i32 [[TMP51]] to float
4962; CHECK-NEXT:    [[TMP57:%.*]] = sitofp i32 [[TMP52]] to float
4963; CHECK-NEXT:    [[TMP58:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP57]])
4964; CHECK-NEXT:    [[TMP59:%.*]] = fmul fast float [[TMP56]], [[TMP58]]
4965; CHECK-NEXT:    [[TMP60:%.*]] = call fast float @llvm.trunc.f32(float [[TMP59]])
4966; CHECK-NEXT:    [[TMP61:%.*]] = fneg fast float [[TMP60]]
4967; CHECK-NEXT:    [[TMP62:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP61]], float [[TMP57]], float [[TMP56]])
4968; CHECK-NEXT:    [[TMP63:%.*]] = fptosi float [[TMP60]] to i32
4969; CHECK-NEXT:    [[TMP64:%.*]] = call fast float @llvm.fabs.f32(float [[TMP62]])
4970; CHECK-NEXT:    [[TMP65:%.*]] = call fast float @llvm.fabs.f32(float [[TMP57]])
4971; CHECK-NEXT:    [[TMP66:%.*]] = fcmp fast oge float [[TMP64]], [[TMP65]]
4972; CHECK-NEXT:    [[TMP67:%.*]] = select i1 [[TMP66]], i32 [[TMP55]], i32 0
4973; CHECK-NEXT:    [[TMP68:%.*]] = add i32 [[TMP63]], [[TMP67]]
4974; CHECK-NEXT:    [[TMP69:%.*]] = shl i32 [[TMP68]], 17
4975; CHECK-NEXT:    [[TMP70:%.*]] = ashr i32 [[TMP69]], 17
4976; CHECK-NEXT:    [[TMP71:%.*]] = trunc i32 [[TMP70]] to i15
4977; CHECK-NEXT:    [[TMP72:%.*]] = insertelement <3 x i15> [[TMP48]], i15 [[TMP71]], i64 2
4978; CHECK-NEXT:    store <3 x i15> [[TMP72]], <3 x i15> addrspace(1)* [[OUT:%.*]], align 8
4979; CHECK-NEXT:    ret void
4980;
4981; GFX6-LABEL: sdiv_v3i15:
4982; GFX6:       ; %bb.0:
4983; GFX6-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
4984; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
4985; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
4986; GFX6-NEXT:    s_mov_b32 s7, 0xf000
4987; GFX6-NEXT:    s_mov_b32 s6, -1
4988; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
4989; GFX6-NEXT:    v_mov_b32_e32 v0, s2
4990; GFX6-NEXT:    v_alignbit_b32 v0, s3, v0, 30
4991; GFX6-NEXT:    s_bfe_i32 s3, s0, 0xf0000
4992; GFX6-NEXT:    v_cvt_f32_i32_e32 v2, s3
4993; GFX6-NEXT:    v_mov_b32_e32 v1, s0
4994; GFX6-NEXT:    v_alignbit_b32 v1, s1, v1, 30
4995; GFX6-NEXT:    s_bfe_i32 s1, s2, 0xf0000
4996; GFX6-NEXT:    v_cvt_f32_i32_e32 v3, s1
4997; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v2
4998; GFX6-NEXT:    s_xor_b32 s1, s1, s3
4999; GFX6-NEXT:    s_bfe_i32 s0, s0, 0xf000f
5000; GFX6-NEXT:    s_ashr_i32 s1, s1, 30
5001; GFX6-NEXT:    v_mul_f32_e32 v4, v3, v4
5002; GFX6-NEXT:    v_trunc_f32_e32 v4, v4
5003; GFX6-NEXT:    v_mad_f32 v3, -v4, v2, v3
5004; GFX6-NEXT:    v_cvt_i32_f32_e32 v4, v4
5005; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, |v2|
5006; GFX6-NEXT:    v_cvt_f32_i32_e32 v3, s0
5007; GFX6-NEXT:    s_or_b32 s1, s1, 1
5008; GFX6-NEXT:    v_mov_b32_e32 v5, s1
5009; GFX6-NEXT:    v_cndmask_b32_e32 v2, 0, v5, vcc
5010; GFX6-NEXT:    s_bfe_i32 s1, s2, 0xf000f
5011; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
5012; GFX6-NEXT:    v_cvt_f32_i32_e32 v4, s1
5013; GFX6-NEXT:    v_rcp_iflag_f32_e32 v5, v3
5014; GFX6-NEXT:    s_xor_b32 s0, s1, s0
5015; GFX6-NEXT:    v_bfe_i32 v1, v1, 0, 15
5016; GFX6-NEXT:    s_ashr_i32 s0, s0, 30
5017; GFX6-NEXT:    v_mul_f32_e32 v5, v4, v5
5018; GFX6-NEXT:    v_trunc_f32_e32 v5, v5
5019; GFX6-NEXT:    v_mad_f32 v4, -v5, v3, v4
5020; GFX6-NEXT:    v_cvt_i32_f32_e32 v5, v5
5021; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v4|, |v3|
5022; GFX6-NEXT:    v_cvt_f32_i32_e32 v4, v1
5023; GFX6-NEXT:    s_or_b32 s0, s0, 1
5024; GFX6-NEXT:    v_mov_b32_e32 v6, s0
5025; GFX6-NEXT:    v_cndmask_b32_e32 v3, 0, v6, vcc
5026; GFX6-NEXT:    v_bfe_i32 v0, v0, 0, 15
5027; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
5028; GFX6-NEXT:    v_cvt_f32_i32_e32 v5, v0
5029; GFX6-NEXT:    v_rcp_iflag_f32_e32 v6, v4
5030; GFX6-NEXT:    v_xor_b32_e32 v0, v0, v1
5031; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 30, v0
5032; GFX6-NEXT:    v_or_b32_e32 v0, 1, v0
5033; GFX6-NEXT:    v_mul_f32_e32 v1, v5, v6
5034; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
5035; GFX6-NEXT:    v_mad_f32 v5, -v1, v4, v5
5036; GFX6-NEXT:    v_cvt_i32_f32_e32 v1, v1
5037; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v5|, |v4|
5038; GFX6-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
5039; GFX6-NEXT:    v_and_b32_e32 v3, 0x7fff, v3
5040; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
5041; GFX6-NEXT:    v_lshl_b64 v[0:1], v[0:1], 30
5042; GFX6-NEXT:    v_and_b32_e32 v2, 0x7fff, v2
5043; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 15, v3
5044; GFX6-NEXT:    v_or_b32_e32 v2, v2, v3
5045; GFX6-NEXT:    v_or_b32_e32 v0, v2, v0
5046; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
5047; GFX6-NEXT:    s_waitcnt expcnt(0)
5048; GFX6-NEXT:    v_and_b32_e32 v0, 0x1fff, v1
5049; GFX6-NEXT:    buffer_store_short v0, off, s[4:7], 0 offset:4
5050; GFX6-NEXT:    s_endpgm
5051;
5052; GFX9-LABEL: sdiv_v3i15:
5053; GFX9:       ; %bb.0:
5054; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
5055; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
5056; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x24
5057; GFX9-NEXT:    v_mov_b32_e32 v2, 0
5058; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5059; GFX9-NEXT:    s_bfe_i32 s1, s2, 0xf0000
5060; GFX9-NEXT:    s_bfe_i32 s0, s4, 0xf0000
5061; GFX9-NEXT:    v_cvt_f32_i32_e32 v3, s0
5062; GFX9-NEXT:    v_cvt_f32_i32_e32 v4, s1
5063; GFX9-NEXT:    s_xor_b32 s0, s1, s0
5064; GFX9-NEXT:    v_mov_b32_e32 v0, s2
5065; GFX9-NEXT:    v_rcp_iflag_f32_e32 v5, v3
5066; GFX9-NEXT:    s_ashr_i32 s0, s0, 30
5067; GFX9-NEXT:    v_alignbit_b32 v0, s3, v0, 30
5068; GFX9-NEXT:    s_or_b32 s3, s0, 1
5069; GFX9-NEXT:    v_mul_f32_e32 v5, v4, v5
5070; GFX9-NEXT:    v_trunc_f32_e32 v5, v5
5071; GFX9-NEXT:    v_mad_f32 v4, -v5, v3, v4
5072; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v4|, |v3|
5073; GFX9-NEXT:    s_and_b64 s[0:1], s[0:1], exec
5074; GFX9-NEXT:    v_cvt_i32_f32_e32 v5, v5
5075; GFX9-NEXT:    s_cselect_b32 s0, s3, 0
5076; GFX9-NEXT:    s_bfe_i32 s1, s4, 0xf000f
5077; GFX9-NEXT:    v_cvt_f32_i32_e32 v3, s1
5078; GFX9-NEXT:    v_add_u32_e32 v4, s0, v5
5079; GFX9-NEXT:    s_bfe_i32 s0, s2, 0xf000f
5080; GFX9-NEXT:    v_cvt_f32_i32_e32 v5, s0
5081; GFX9-NEXT:    v_rcp_iflag_f32_e32 v6, v3
5082; GFX9-NEXT:    v_mov_b32_e32 v1, s4
5083; GFX9-NEXT:    v_alignbit_b32 v1, s5, v1, 30
5084; GFX9-NEXT:    s_xor_b32 s0, s0, s1
5085; GFX9-NEXT:    v_mul_f32_e32 v6, v5, v6
5086; GFX9-NEXT:    v_trunc_f32_e32 v6, v6
5087; GFX9-NEXT:    s_ashr_i32 s0, s0, 30
5088; GFX9-NEXT:    v_mad_f32 v5, -v6, v3, v5
5089; GFX9-NEXT:    v_bfe_i32 v1, v1, 0, 15
5090; GFX9-NEXT:    s_or_b32 s2, s0, 1
5091; GFX9-NEXT:    v_cvt_i32_f32_e32 v6, v6
5092; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v5|, |v3|
5093; GFX9-NEXT:    v_cvt_f32_i32_e32 v3, v1
5094; GFX9-NEXT:    s_and_b64 s[0:1], s[0:1], exec
5095; GFX9-NEXT:    s_cselect_b32 s0, s2, 0
5096; GFX9-NEXT:    v_bfe_i32 v0, v0, 0, 15
5097; GFX9-NEXT:    v_add_u32_e32 v5, s0, v6
5098; GFX9-NEXT:    v_cvt_f32_i32_e32 v6, v0
5099; GFX9-NEXT:    v_rcp_iflag_f32_e32 v7, v3
5100; GFX9-NEXT:    v_xor_b32_e32 v0, v0, v1
5101; GFX9-NEXT:    v_ashrrev_i32_e32 v0, 30, v0
5102; GFX9-NEXT:    v_or_b32_e32 v0, 1, v0
5103; GFX9-NEXT:    v_mul_f32_e32 v1, v6, v7
5104; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
5105; GFX9-NEXT:    v_cvt_i32_f32_e32 v7, v1
5106; GFX9-NEXT:    v_mad_f32 v1, -v1, v3, v6
5107; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v3|
5108; GFX9-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
5109; GFX9-NEXT:    v_add_u32_e32 v0, v7, v0
5110; GFX9-NEXT:    v_and_b32_e32 v3, 0x7fff, v4
5111; GFX9-NEXT:    v_and_b32_e32 v4, 0x7fff, v5
5112; GFX9-NEXT:    v_lshlrev_b64 v[0:1], 30, v[0:1]
5113; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 15, v4
5114; GFX9-NEXT:    v_or_b32_e32 v3, v3, v4
5115; GFX9-NEXT:    v_or_b32_e32 v0, v3, v0
5116; GFX9-NEXT:    global_store_dword v2, v0, s[6:7]
5117; GFX9-NEXT:    v_and_b32_e32 v0, 0x1fff, v1
5118; GFX9-NEXT:    global_store_short v2, v0, s[6:7] offset:4
5119; GFX9-NEXT:    s_endpgm
5120  %r = sdiv <3 x i15> %x, %y
5121  store <3 x i15> %r, <3 x i15> addrspace(1)* %out
5122  ret void
5123}
5124
; Signed remainder of two <3 x i15> kernel arguments; the remainder vector is
; stored to %out.
; Expected opt output: each lane is extracted and sign-extended to i32, and a
; quotient is formed through a float path (sitofp, amdgcn.rcp, fmul, trunc, and
; an fmad.ftz residual). When |residual| >= |divisor| the quotient is adjusted
; by ((x xor y) ashr 30) | 1, otherwise by 0. The remainder is then computed
; as x - quotient * y and narrowed back to i15 with shl/ashr by 17 followed by
; trunc.
; Expected llc output (tahiti and gfx900): the three 15-bit lanes are masked
; with 0x7fff, shifted to bit offsets 0, 15 and 30, OR-ed together, and written
; as a dword store plus a short store at offset 4 masked with 0x1fff.
5125define amdgpu_kernel void @srem_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x, <3 x i15> %y) {
5126; CHECK-LABEL: @srem_v3i15(
5127; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <3 x i15> [[X:%.*]], i64 0
5128; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <3 x i15> [[Y:%.*]], i64 0
5129; CHECK-NEXT:    [[TMP3:%.*]] = sext i15 [[TMP1]] to i32
5130; CHECK-NEXT:    [[TMP4:%.*]] = sext i15 [[TMP2]] to i32
5131; CHECK-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]]
5132; CHECK-NEXT:    [[TMP6:%.*]] = ashr i32 [[TMP5]], 30
5133; CHECK-NEXT:    [[TMP7:%.*]] = or i32 [[TMP6]], 1
5134; CHECK-NEXT:    [[TMP8:%.*]] = sitofp i32 [[TMP3]] to float
5135; CHECK-NEXT:    [[TMP9:%.*]] = sitofp i32 [[TMP4]] to float
5136; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]])
5137; CHECK-NEXT:    [[TMP11:%.*]] = fmul fast float [[TMP8]], [[TMP10]]
5138; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.trunc.f32(float [[TMP11]])
5139; CHECK-NEXT:    [[TMP13:%.*]] = fneg fast float [[TMP12]]
5140; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP13]], float [[TMP9]], float [[TMP8]])
5141; CHECK-NEXT:    [[TMP15:%.*]] = fptosi float [[TMP12]] to i32
5142; CHECK-NEXT:    [[TMP16:%.*]] = call fast float @llvm.fabs.f32(float [[TMP14]])
5143; CHECK-NEXT:    [[TMP17:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
5144; CHECK-NEXT:    [[TMP18:%.*]] = fcmp fast oge float [[TMP16]], [[TMP17]]
5145; CHECK-NEXT:    [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP7]], i32 0
5146; CHECK-NEXT:    [[TMP20:%.*]] = add i32 [[TMP15]], [[TMP19]]
5147; CHECK-NEXT:    [[TMP21:%.*]] = mul i32 [[TMP20]], [[TMP4]]
5148; CHECK-NEXT:    [[TMP22:%.*]] = sub i32 [[TMP3]], [[TMP21]]
5149; CHECK-NEXT:    [[TMP23:%.*]] = shl i32 [[TMP22]], 17
5150; CHECK-NEXT:    [[TMP24:%.*]] = ashr i32 [[TMP23]], 17
5151; CHECK-NEXT:    [[TMP25:%.*]] = trunc i32 [[TMP24]] to i15
5152; CHECK-NEXT:    [[TMP26:%.*]] = insertelement <3 x i15> undef, i15 [[TMP25]], i64 0
5153; CHECK-NEXT:    [[TMP27:%.*]] = extractelement <3 x i15> [[X]], i64 1
5154; CHECK-NEXT:    [[TMP28:%.*]] = extractelement <3 x i15> [[Y]], i64 1
5155; CHECK-NEXT:    [[TMP29:%.*]] = sext i15 [[TMP27]] to i32
5156; CHECK-NEXT:    [[TMP30:%.*]] = sext i15 [[TMP28]] to i32
5157; CHECK-NEXT:    [[TMP31:%.*]] = xor i32 [[TMP29]], [[TMP30]]
5158; CHECK-NEXT:    [[TMP32:%.*]] = ashr i32 [[TMP31]], 30
5159; CHECK-NEXT:    [[TMP33:%.*]] = or i32 [[TMP32]], 1
5160; CHECK-NEXT:    [[TMP34:%.*]] = sitofp i32 [[TMP29]] to float
5161; CHECK-NEXT:    [[TMP35:%.*]] = sitofp i32 [[TMP30]] to float
5162; CHECK-NEXT:    [[TMP36:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP35]])
5163; CHECK-NEXT:    [[TMP37:%.*]] = fmul fast float [[TMP34]], [[TMP36]]
5164; CHECK-NEXT:    [[TMP38:%.*]] = call fast float @llvm.trunc.f32(float [[TMP37]])
5165; CHECK-NEXT:    [[TMP39:%.*]] = fneg fast float [[TMP38]]
5166; CHECK-NEXT:    [[TMP40:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP39]], float [[TMP35]], float [[TMP34]])
5167; CHECK-NEXT:    [[TMP41:%.*]] = fptosi float [[TMP38]] to i32
5168; CHECK-NEXT:    [[TMP42:%.*]] = call fast float @llvm.fabs.f32(float [[TMP40]])
5169; CHECK-NEXT:    [[TMP43:%.*]] = call fast float @llvm.fabs.f32(float [[TMP35]])
5170; CHECK-NEXT:    [[TMP44:%.*]] = fcmp fast oge float [[TMP42]], [[TMP43]]
5171; CHECK-NEXT:    [[TMP45:%.*]] = select i1 [[TMP44]], i32 [[TMP33]], i32 0
5172; CHECK-NEXT:    [[TMP46:%.*]] = add i32 [[TMP41]], [[TMP45]]
5173; CHECK-NEXT:    [[TMP47:%.*]] = mul i32 [[TMP46]], [[TMP30]]
5174; CHECK-NEXT:    [[TMP48:%.*]] = sub i32 [[TMP29]], [[TMP47]]
5175; CHECK-NEXT:    [[TMP49:%.*]] = shl i32 [[TMP48]], 17
5176; CHECK-NEXT:    [[TMP50:%.*]] = ashr i32 [[TMP49]], 17
5177; CHECK-NEXT:    [[TMP51:%.*]] = trunc i32 [[TMP50]] to i15
5178; CHECK-NEXT:    [[TMP52:%.*]] = insertelement <3 x i15> [[TMP26]], i15 [[TMP51]], i64 1
5179; CHECK-NEXT:    [[TMP53:%.*]] = extractelement <3 x i15> [[X]], i64 2
5180; CHECK-NEXT:    [[TMP54:%.*]] = extractelement <3 x i15> [[Y]], i64 2
5181; CHECK-NEXT:    [[TMP55:%.*]] = sext i15 [[TMP53]] to i32
5182; CHECK-NEXT:    [[TMP56:%.*]] = sext i15 [[TMP54]] to i32
5183; CHECK-NEXT:    [[TMP57:%.*]] = xor i32 [[TMP55]], [[TMP56]]
5184; CHECK-NEXT:    [[TMP58:%.*]] = ashr i32 [[TMP57]], 30
5185; CHECK-NEXT:    [[TMP59:%.*]] = or i32 [[TMP58]], 1
5186; CHECK-NEXT:    [[TMP60:%.*]] = sitofp i32 [[TMP55]] to float
5187; CHECK-NEXT:    [[TMP61:%.*]] = sitofp i32 [[TMP56]] to float
5188; CHECK-NEXT:    [[TMP62:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP61]])
5189; CHECK-NEXT:    [[TMP63:%.*]] = fmul fast float [[TMP60]], [[TMP62]]
5190; CHECK-NEXT:    [[TMP64:%.*]] = call fast float @llvm.trunc.f32(float [[TMP63]])
5191; CHECK-NEXT:    [[TMP65:%.*]] = fneg fast float [[TMP64]]
5192; CHECK-NEXT:    [[TMP66:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP65]], float [[TMP61]], float [[TMP60]])
5193; CHECK-NEXT:    [[TMP67:%.*]] = fptosi float [[TMP64]] to i32
5194; CHECK-NEXT:    [[TMP68:%.*]] = call fast float @llvm.fabs.f32(float [[TMP66]])
5195; CHECK-NEXT:    [[TMP69:%.*]] = call fast float @llvm.fabs.f32(float [[TMP61]])
5196; CHECK-NEXT:    [[TMP70:%.*]] = fcmp fast oge float [[TMP68]], [[TMP69]]
5197; CHECK-NEXT:    [[TMP71:%.*]] = select i1 [[TMP70]], i32 [[TMP59]], i32 0
5198; CHECK-NEXT:    [[TMP72:%.*]] = add i32 [[TMP67]], [[TMP71]]
5199; CHECK-NEXT:    [[TMP73:%.*]] = mul i32 [[TMP72]], [[TMP56]]
5200; CHECK-NEXT:    [[TMP74:%.*]] = sub i32 [[TMP55]], [[TMP73]]
5201; CHECK-NEXT:    [[TMP75:%.*]] = shl i32 [[TMP74]], 17
5202; CHECK-NEXT:    [[TMP76:%.*]] = ashr i32 [[TMP75]], 17
5203; CHECK-NEXT:    [[TMP77:%.*]] = trunc i32 [[TMP76]] to i15
5204; CHECK-NEXT:    [[TMP78:%.*]] = insertelement <3 x i15> [[TMP52]], i15 [[TMP77]], i64 2
5205; CHECK-NEXT:    store <3 x i15> [[TMP78]], <3 x i15> addrspace(1)* [[OUT:%.*]], align 8
5206; CHECK-NEXT:    ret void
5207;
5208; GFX6-LABEL: srem_v3i15:
5209; GFX6:       ; %bb.0:
5210; GFX6-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
5211; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
5212; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
5213; GFX6-NEXT:    s_mov_b32 s7, 0xf000
5214; GFX6-NEXT:    s_mov_b32 s6, -1
5215; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
5216; GFX6-NEXT:    s_bfe_i32 s9, s2, 0xf0000
5217; GFX6-NEXT:    v_cvt_f32_i32_e32 v5, s9
5218; GFX6-NEXT:    v_mov_b32_e32 v2, s0
5219; GFX6-NEXT:    v_alignbit_b32 v2, s1, v2, 30
5220; GFX6-NEXT:    s_bfe_i32 s1, s0, 0xf0000
5221; GFX6-NEXT:    v_cvt_f32_i32_e32 v4, s1
5222; GFX6-NEXT:    s_xor_b32 s1, s9, s1
5223; GFX6-NEXT:    s_ashr_i32 s1, s1, 30
5224; GFX6-NEXT:    s_or_b32 s1, s1, 1
5225; GFX6-NEXT:    v_rcp_iflag_f32_e32 v6, v4
5226; GFX6-NEXT:    v_mov_b32_e32 v7, s1
5227; GFX6-NEXT:    s_lshr_b32 s8, s0, 15
5228; GFX6-NEXT:    s_bfe_i32 s1, s2, 0xf000f
5229; GFX6-NEXT:    v_mul_f32_e32 v6, v5, v6
5230; GFX6-NEXT:    v_trunc_f32_e32 v6, v6
5231; GFX6-NEXT:    v_mad_f32 v5, -v6, v4, v5
5232; GFX6-NEXT:    v_cvt_i32_f32_e32 v6, v6
5233; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v5|, |v4|
5234; GFX6-NEXT:    v_cndmask_b32_e32 v4, 0, v7, vcc
5235; GFX6-NEXT:    v_and_b32_e32 v3, 0x7fff, v2
5236; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
5237; GFX6-NEXT:    v_mul_lo_u32 v4, v4, s0
5238; GFX6-NEXT:    s_bfe_i32 s0, s0, 0xf000f
5239; GFX6-NEXT:    v_cvt_f32_i32_e32 v5, s0
5240; GFX6-NEXT:    v_cvt_f32_i32_e32 v6, s1
5241; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, s2, v4
5242; GFX6-NEXT:    v_rcp_iflag_f32_e32 v7, v5
5243; GFX6-NEXT:    s_xor_b32 s0, s1, s0
5244; GFX6-NEXT:    v_bfe_i32 v2, v2, 0, 15
5245; GFX6-NEXT:    s_ashr_i32 s0, s0, 30
5246; GFX6-NEXT:    v_mul_f32_e32 v7, v6, v7
5247; GFX6-NEXT:    v_trunc_f32_e32 v7, v7
5248; GFX6-NEXT:    v_mad_f32 v6, -v7, v5, v6
5249; GFX6-NEXT:    v_cvt_i32_f32_e32 v7, v7
5250; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v6|, |v5|
5251; GFX6-NEXT:    v_cvt_f32_i32_e32 v6, v2
5252; GFX6-NEXT:    v_mov_b32_e32 v0, s2
5253; GFX6-NEXT:    s_or_b32 s0, s0, 1
5254; GFX6-NEXT:    v_alignbit_b32 v0, s3, v0, 30
5255; GFX6-NEXT:    v_mov_b32_e32 v8, s0
5256; GFX6-NEXT:    v_and_b32_e32 v1, 0x7fff, v0
5257; GFX6-NEXT:    v_cndmask_b32_e32 v5, 0, v8, vcc
5258; GFX6-NEXT:    v_bfe_i32 v0, v0, 0, 15
5259; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
5260; GFX6-NEXT:    v_cvt_f32_i32_e32 v7, v0
5261; GFX6-NEXT:    v_rcp_iflag_f32_e32 v8, v6
5262; GFX6-NEXT:    v_xor_b32_e32 v0, v0, v2
5263; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 30, v0
5264; GFX6-NEXT:    v_or_b32_e32 v0, 1, v0
5265; GFX6-NEXT:    v_mul_f32_e32 v2, v7, v8
5266; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
5267; GFX6-NEXT:    v_mad_f32 v7, -v2, v6, v7
5268; GFX6-NEXT:    v_cvt_i32_f32_e32 v2, v2
5269; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v7|, |v6|
5270; GFX6-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
5271; GFX6-NEXT:    v_mul_lo_u32 v5, v5, s8
5272; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
5273; GFX6-NEXT:    v_mul_lo_u32 v0, v0, v3
5274; GFX6-NEXT:    s_lshr_b32 s3, s2, 15
5275; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s3, v5
5276; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, v0, v1
5277; GFX6-NEXT:    v_and_b32_e32 v2, 0x7fff, v2
5278; GFX6-NEXT:    v_lshl_b64 v[0:1], v[0:1], 30
5279; GFX6-NEXT:    v_and_b32_e32 v3, 0x7fff, v4
5280; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 15, v2
5281; GFX6-NEXT:    v_or_b32_e32 v2, v3, v2
5282; GFX6-NEXT:    v_or_b32_e32 v0, v2, v0
5283; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
5284; GFX6-NEXT:    s_waitcnt expcnt(0)
5285; GFX6-NEXT:    v_and_b32_e32 v0, 0x1fff, v1
5286; GFX6-NEXT:    buffer_store_short v0, off, s[4:7], 0 offset:4
5287; GFX6-NEXT:    s_endpgm
5288;
5289; GFX9-LABEL: srem_v3i15:
5290; GFX9:       ; %bb.0:
5291; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
5292; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
5293; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x34
5294; GFX9-NEXT:    v_mov_b32_e32 v2, 0
5295; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5296; GFX9-NEXT:    s_bfe_i32 s1, s2, 0xf0000
5297; GFX9-NEXT:    v_cvt_f32_i32_e32 v5, s1
5298; GFX9-NEXT:    s_bfe_i32 s0, s6, 0xf0000
5299; GFX9-NEXT:    v_cvt_f32_i32_e32 v4, s0
5300; GFX9-NEXT:    s_xor_b32 s0, s1, s0
5301; GFX9-NEXT:    v_mov_b32_e32 v0, s2
5302; GFX9-NEXT:    v_mov_b32_e32 v1, s6
5303; GFX9-NEXT:    v_rcp_iflag_f32_e32 v6, v4
5304; GFX9-NEXT:    s_ashr_i32 s0, s0, 30
5305; GFX9-NEXT:    s_lshr_b32 s8, s2, 15
5306; GFX9-NEXT:    v_alignbit_b32 v0, s3, v0, 30
5307; GFX9-NEXT:    v_mul_f32_e32 v6, v5, v6
5308; GFX9-NEXT:    v_trunc_f32_e32 v6, v6
5309; GFX9-NEXT:    v_mad_f32 v5, -v6, v4, v5
5310; GFX9-NEXT:    v_cvt_i32_f32_e32 v6, v6
5311; GFX9-NEXT:    v_alignbit_b32 v1, s7, v1, 30
5312; GFX9-NEXT:    s_lshr_b32 s3, s6, 15
5313; GFX9-NEXT:    s_or_b32 s7, s0, 1
5314; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v5|, |v4|
5315; GFX9-NEXT:    s_and_b64 s[0:1], s[0:1], exec
5316; GFX9-NEXT:    s_cselect_b32 s0, s7, 0
5317; GFX9-NEXT:    v_add_u32_e32 v4, s0, v6
5318; GFX9-NEXT:    s_bfe_i32 s0, s6, 0xf000f
5319; GFX9-NEXT:    v_cvt_f32_i32_e32 v5, s0
5320; GFX9-NEXT:    s_bfe_i32 s1, s2, 0xf000f
5321; GFX9-NEXT:    v_cvt_f32_i32_e32 v6, s1
5322; GFX9-NEXT:    s_xor_b32 s0, s1, s0
5323; GFX9-NEXT:    v_rcp_iflag_f32_e32 v7, v5
5324; GFX9-NEXT:    v_and_b32_e32 v3, 0x7fff, v1
5325; GFX9-NEXT:    s_ashr_i32 s0, s0, 30
5326; GFX9-NEXT:    v_bfe_i32 v1, v1, 0, 15
5327; GFX9-NEXT:    v_mul_f32_e32 v7, v6, v7
5328; GFX9-NEXT:    v_trunc_f32_e32 v7, v7
5329; GFX9-NEXT:    v_mad_f32 v6, -v7, v5, v6
5330; GFX9-NEXT:    v_cvt_i32_f32_e32 v7, v7
5331; GFX9-NEXT:    v_mul_lo_u32 v4, v4, s6
5332; GFX9-NEXT:    s_or_b32 s6, s0, 1
5333; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v6|, |v5|
5334; GFX9-NEXT:    v_cvt_f32_i32_e32 v6, v1
5335; GFX9-NEXT:    s_and_b64 s[0:1], s[0:1], exec
5336; GFX9-NEXT:    s_cselect_b32 s0, s6, 0
5337; GFX9-NEXT:    v_add_u32_e32 v5, s0, v7
5338; GFX9-NEXT:    v_bfe_i32 v7, v0, 0, 15
5339; GFX9-NEXT:    v_cvt_f32_i32_e32 v8, v7
5340; GFX9-NEXT:    v_rcp_iflag_f32_e32 v9, v6
5341; GFX9-NEXT:    v_xor_b32_e32 v1, v7, v1
5342; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 30, v1
5343; GFX9-NEXT:    v_or_b32_e32 v1, 1, v1
5344; GFX9-NEXT:    v_mul_f32_e32 v7, v8, v9
5345; GFX9-NEXT:    v_trunc_f32_e32 v7, v7
5346; GFX9-NEXT:    v_cvt_i32_f32_e32 v9, v7
5347; GFX9-NEXT:    v_mad_f32 v7, -v7, v6, v8
5348; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v7|, |v6|
5349; GFX9-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
5350; GFX9-NEXT:    v_mul_lo_u32 v5, v5, s3
5351; GFX9-NEXT:    v_add_u32_e32 v1, v9, v1
5352; GFX9-NEXT:    v_mul_lo_u32 v1, v1, v3
5353; GFX9-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
5354; GFX9-NEXT:    v_sub_u32_e32 v3, s2, v4
5355; GFX9-NEXT:    v_sub_u32_e32 v4, s8, v5
5356; GFX9-NEXT:    v_sub_u32_e32 v0, v0, v1
5357; GFX9-NEXT:    v_and_b32_e32 v4, 0x7fff, v4
5358; GFX9-NEXT:    v_lshlrev_b64 v[0:1], 30, v[0:1]
5359; GFX9-NEXT:    v_and_b32_e32 v3, 0x7fff, v3
5360; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 15, v4
5361; GFX9-NEXT:    v_or_b32_e32 v3, v3, v4
5362; GFX9-NEXT:    v_or_b32_e32 v0, v3, v0
5363; GFX9-NEXT:    global_store_dword v2, v0, s[4:5]
5364; GFX9-NEXT:    v_and_b32_e32 v0, 0x1fff, v1
5365; GFX9-NEXT:    global_store_short v2, v0, s[4:5] offset:4
5366; GFX9-NEXT:    s_endpgm
5367  %r = srem <3 x i15> %x, %y
5368  store <3 x i15> %r, <3 x i15> addrspace(1)* %out
5369  ret void
5370}
5371
; Unsigned i32 division of a kernel argument by the odd constant 1235195; the
; quotient is stored to %out.
; Expected opt output: the udiv instruction is left unchanged.
; Expected llc output (tahiti and gfx900): a mul_hi by 0xb2a50881 followed by
; sub, logical shift right by 1, add, and logical shift right by 20. On gfx900
; the whole sequence uses scalar instructions; on tahiti it uses vector ones.
5372define amdgpu_kernel void @udiv_i32_oddk_denom(i32 addrspace(1)* %out, i32 %x) {
5373; CHECK-LABEL: @udiv_i32_oddk_denom(
5374; CHECK-NEXT:    [[R:%.*]] = udiv i32 [[X:%.*]], 1235195
5375; CHECK-NEXT:    store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4
5376; CHECK-NEXT:    ret void
5377;
5378; GFX6-LABEL: udiv_i32_oddk_denom:
5379; GFX6:       ; %bb.0:
5380; GFX6-NEXT:    s_load_dword s4, s[0:1], 0xb
5381; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
5382; GFX6-NEXT:    v_mov_b32_e32 v0, 0xb2a50881
5383; GFX6-NEXT:    s_mov_b32 s3, 0xf000
5384; GFX6-NEXT:    s_mov_b32 s2, -1
5385; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
5386; GFX6-NEXT:    v_mul_hi_u32 v0, s4, v0
5387; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s4, v0
5388; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 1, v1
5389; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
5390; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 20, v0
5391; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
5392; GFX6-NEXT:    s_endpgm
5393;
5394; GFX9-LABEL: udiv_i32_oddk_denom:
5395; GFX9:       ; %bb.0:
5396; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x2c
5397; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
5398; GFX9-NEXT:    v_mov_b32_e32 v0, 0
5399; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5400; GFX9-NEXT:    s_mul_hi_u32 s0, s4, 0xb2a50881
5401; GFX9-NEXT:    s_sub_i32 s1, s4, s0
5402; GFX9-NEXT:    s_lshr_b32 s1, s1, 1
5403; GFX9-NEXT:    s_add_i32 s1, s1, s0
5404; GFX9-NEXT:    s_lshr_b32 s0, s1, 20
5405; GFX9-NEXT:    v_mov_b32_e32 v1, s0
5406; GFX9-NEXT:    global_store_dword v0, v1, s[2:3]
5407; GFX9-NEXT:    s_endpgm
5408  %r = udiv i32 %x, 1235195
5409  store i32 %r, i32 addrspace(1)* %out
5410  ret void
5411}
5412
; udiv_i32_pow2k_denom - unsigned i32 division by the constant 4096.
; Expectations recorded below (autogenerated, do not edit by hand)
;   * opt run - the udiv instruction is left in place, unexpanded.
;   * llc runs (tahiti and gfx900) - the division becomes a single scalar
;     logical shift right by 12.
5413define amdgpu_kernel void @udiv_i32_pow2k_denom(i32 addrspace(1)* %out, i32 %x) {
5414; CHECK-LABEL: @udiv_i32_pow2k_denom(
5415; CHECK-NEXT:    [[R:%.*]] = udiv i32 [[X:%.*]], 4096
5416; CHECK-NEXT:    store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4
5417; CHECK-NEXT:    ret void
5418;
5419; GFX6-LABEL: udiv_i32_pow2k_denom:
5420; GFX6:       ; %bb.0:
5421; GFX6-NEXT:    s_load_dword s4, s[0:1], 0xb
5422; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
5423; GFX6-NEXT:    s_mov_b32 s3, 0xf000
5424; GFX6-NEXT:    s_mov_b32 s2, -1
5425; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
5426; GFX6-NEXT:    s_lshr_b32 s4, s4, 12
5427; GFX6-NEXT:    v_mov_b32_e32 v0, s4
5428; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
5429; GFX6-NEXT:    s_endpgm
5430;
5431; GFX9-LABEL: udiv_i32_pow2k_denom:
5432; GFX9:       ; %bb.0:
5433; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x2c
5434; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
5435; GFX9-NEXT:    v_mov_b32_e32 v0, 0
5436; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5437; GFX9-NEXT:    s_lshr_b32 s0, s4, 12
5438; GFX9-NEXT:    v_mov_b32_e32 v1, s0
5439; GFX9-NEXT:    global_store_dword v0, v1, s[2:3]
5440; GFX9-NEXT:    s_endpgm
5441  %r = udiv i32 %x, 4096
5442  store i32 %r, i32 addrspace(1)* %out
5443  ret void
5444}
5445
; udiv_i32_pow2_shl_denom - unsigned i32 division where the divisor is the
; non-constant value (4096 << %y).
; Expectations recorded below (autogenerated, do not edit by hand)
;   * opt run - both the shl and the udiv are left in place, unexpanded.
;   * llc runs (tahiti and gfx900) - no division is emitted; 12 is added to
;     the loaded %y argument and %x is shifted right by that sum.
5446define amdgpu_kernel void @udiv_i32_pow2_shl_denom(i32 addrspace(1)* %out, i32 %x, i32 %y) {
5447; CHECK-LABEL: @udiv_i32_pow2_shl_denom(
5448; CHECK-NEXT:    [[SHL_Y:%.*]] = shl i32 4096, [[Y:%.*]]
5449; CHECK-NEXT:    [[R:%.*]] = udiv i32 [[X:%.*]], [[SHL_Y]]
5450; CHECK-NEXT:    store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4
5451; CHECK-NEXT:    ret void
5452;
5453; GFX6-LABEL: udiv_i32_pow2_shl_denom:
5454; GFX6:       ; %bb.0:
5455; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
5456; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
5457; GFX6-NEXT:    s_mov_b32 s3, 0xf000
5458; GFX6-NEXT:    s_mov_b32 s2, -1
5459; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
5460; GFX6-NEXT:    s_add_i32 s5, s5, 12
5461; GFX6-NEXT:    s_lshr_b32 s4, s4, s5
5462; GFX6-NEXT:    v_mov_b32_e32 v0, s4
5463; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
5464; GFX6-NEXT:    s_endpgm
5465;
5466; GFX9-LABEL: udiv_i32_pow2_shl_denom:
5467; GFX9:       ; %bb.0:
5468; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
5469; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
5470; GFX9-NEXT:    v_mov_b32_e32 v0, 0
5471; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5472; GFX9-NEXT:    s_add_i32 s0, s3, 12
5473; GFX9-NEXT:    s_lshr_b32 s0, s2, s0
5474; GFX9-NEXT:    v_mov_b32_e32 v1, s0
5475; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
5476; GFX9-NEXT:    s_endpgm
5477  %shl.y = shl i32 4096, %y
5478  %r = udiv i32 %x, %shl.y
5479  store i32 %r, i32 addrspace(1)* %out
5480  ret void
5481}
5482
; udiv_v2i32_pow2k_denom - unsigned <2 x i32> division by the splat
; constant <4096, 4096>.
; Expectations recorded below (autogenerated, do not edit by hand)
;   * opt run - the vector udiv is scalarized into per-lane extractelement /
;     udiv-by-4096 / insertelement; the scalar udivs are not expanded further.
;   * llc runs (tahiti and gfx900) - each lane becomes a scalar logical shift
;     right by 12, and both lanes are written with one 64-bit store.
5483define amdgpu_kernel void @udiv_v2i32_pow2k_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x) {
5484; CHECK-LABEL: @udiv_v2i32_pow2k_denom(
5485; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0
5486; CHECK-NEXT:    [[TMP2:%.*]] = udiv i32 [[TMP1]], 4096
5487; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i64 0
5488; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1
5489; CHECK-NEXT:    [[TMP5:%.*]] = udiv i32 [[TMP4]], 4096
5490; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1
5491; CHECK-NEXT:    store <2 x i32> [[TMP6]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8
5492; CHECK-NEXT:    ret void
5493;
5494; GFX6-LABEL: udiv_v2i32_pow2k_denom:
5495; GFX6:       ; %bb.0:
5496; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
5497; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
5498; GFX6-NEXT:    s_mov_b32 s3, 0xf000
5499; GFX6-NEXT:    s_mov_b32 s2, -1
5500; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
5501; GFX6-NEXT:    s_lshr_b32 s4, s4, 12
5502; GFX6-NEXT:    s_lshr_b32 s5, s5, 12
5503; GFX6-NEXT:    v_mov_b32_e32 v0, s4
5504; GFX6-NEXT:    v_mov_b32_e32 v1, s5
5505; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
5506; GFX6-NEXT:    s_endpgm
5507;
5508; GFX9-LABEL: udiv_v2i32_pow2k_denom:
5509; GFX9:       ; %bb.0:
5510; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
5511; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
5512; GFX9-NEXT:    v_mov_b32_e32 v2, 0
5513; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5514; GFX9-NEXT:    s_lshr_b32 s0, s2, 12
5515; GFX9-NEXT:    s_lshr_b32 s1, s3, 12
5516; GFX9-NEXT:    v_mov_b32_e32 v0, s0
5517; GFX9-NEXT:    v_mov_b32_e32 v1, s1
5518; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
5519; GFX9-NEXT:    s_endpgm
5520  %r = udiv <2 x i32> %x, <i32 4096, i32 4096>
5521  store <2 x i32> %r, <2 x i32> addrspace(1)* %out
5522  ret void
5523}
5524
; udiv_v2i32_mixed_pow2k_denom - unsigned <2 x i32> division by the
; non-splat constant <4096, 4095>, i.e. one power-of-two lane and one lane
; that is not a power of two.
; Expectations recorded below (autogenerated, do not edit by hand)
;   * opt run - the vector udiv is scalarized into per-lane extractelement /
;     udiv / insertelement, keeping 4096 for lane 0 and 4095 for lane 1.
;   * llc runs (tahiti and gfx900) - lane 0 is a shift right by 12; lane 1 is
;     a multiply-high by 0x100101 followed by a subtract / shift-right-by-1 /
;     add fix-up and a final shift right by 11.
5525define amdgpu_kernel void @udiv_v2i32_mixed_pow2k_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x) {
5526; CHECK-LABEL: @udiv_v2i32_mixed_pow2k_denom(
5527; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0
5528; CHECK-NEXT:    [[TMP2:%.*]] = udiv i32 [[TMP1]], 4096
5529; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i64 0
5530; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1
5531; CHECK-NEXT:    [[TMP5:%.*]] = udiv i32 [[TMP4]], 4095
5532; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1
5533; CHECK-NEXT:    store <2 x i32> [[TMP6]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8
5534; CHECK-NEXT:    ret void
5535;
5536; GFX6-LABEL: udiv_v2i32_mixed_pow2k_denom:
5537; GFX6:       ; %bb.0:
5538; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
5539; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
5540; GFX6-NEXT:    v_mov_b32_e32 v0, 0x100101
5541; GFX6-NEXT:    s_mov_b32 s3, 0xf000
5542; GFX6-NEXT:    s_mov_b32 s2, -1
5543; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
5544; GFX6-NEXT:    v_mul_hi_u32 v0, s5, v0
5545; GFX6-NEXT:    s_lshr_b32 s4, s4, 12
5546; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s5, v0
5547; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 1, v1
5548; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
5549; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 11, v0
5550; GFX6-NEXT:    v_mov_b32_e32 v0, s4
5551; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
5552; GFX6-NEXT:    s_endpgm
5553;
5554; GFX9-LABEL: udiv_v2i32_mixed_pow2k_denom:
5555; GFX9:       ; %bb.0:
5556; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
5557; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
5558; GFX9-NEXT:    v_mov_b32_e32 v2, 0
5559; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5560; GFX9-NEXT:    s_mul_hi_u32 s1, s3, 0x100101
5561; GFX9-NEXT:    s_lshr_b32 s0, s2, 12
5562; GFX9-NEXT:    s_sub_i32 s2, s3, s1
5563; GFX9-NEXT:    s_lshr_b32 s2, s2, 1
5564; GFX9-NEXT:    s_add_i32 s2, s2, s1
5565; GFX9-NEXT:    s_lshr_b32 s1, s2, 11
5566; GFX9-NEXT:    v_mov_b32_e32 v0, s0
5567; GFX9-NEXT:    v_mov_b32_e32 v1, s1
5568; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
5569; GFX9-NEXT:    s_endpgm
5570  %r = udiv <2 x i32> %x, <i32 4096, i32 4095>
5571  store <2 x i32> %r, <2 x i32> addrspace(1)* %out
5572  ret void
5573}
5574
; udiv_v2i32_pow2_shl_denom - unsigned <2 x i32> division where the divisor
; is the non-constant vector (<4096, 4096> << %y).
; Expectations recorded below (autogenerated, do not edit by hand)
;   * opt run - the vector shl is kept, and the udiv is scalarized and fully
;     expanded per lane - uitofp of the divisor, llvm.amdgcn.rcp.f32, scale
;     by 0x41EFFFFFC0000000, fptoui, one multiply-high refinement of the
;     reciprocal estimate, a multiply-high with the dividend to get a trial
;     quotient, then two compare-and-select steps that each may add 1 to the
;     quotient.
;   * llc runs (tahiti and gfx900) - the same reciprocal-based sequence shows
;     up in the ISA (v_rcp_iflag_f32, multiply by 0x4f7ffffe, v_mul_hi_u32,
;     two v_cmp_le_u32 / v_cndmask_b32 rounds per lane).
; NOTE(review) the scalar i32 test with a (4096 << %y) divisor lowers to an
; add plus shift, whereas this vector form takes the full expansion;
; presumably this test pins down that difference - confirm intent.
5575define amdgpu_kernel void @udiv_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x, <2 x i32> %y) {
5576; CHECK-LABEL: @udiv_v2i32_pow2_shl_denom(
5577; CHECK-NEXT:    [[SHL_Y:%.*]] = shl <2 x i32> <i32 4096, i32 4096>, [[Y:%.*]]
5578; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0
5579; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 0
5580; CHECK-NEXT:    [[TMP3:%.*]] = uitofp i32 [[TMP2]] to float
5581; CHECK-NEXT:    [[TMP4:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP3]])
5582; CHECK-NEXT:    [[TMP5:%.*]] = fmul fast float [[TMP4]], 0x41EFFFFFC0000000
5583; CHECK-NEXT:    [[TMP6:%.*]] = fptoui float [[TMP5]] to i32
5584; CHECK-NEXT:    [[TMP7:%.*]] = sub i32 0, [[TMP2]]
5585; CHECK-NEXT:    [[TMP8:%.*]] = mul i32 [[TMP7]], [[TMP6]]
5586; CHECK-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP6]] to i64
5587; CHECK-NEXT:    [[TMP10:%.*]] = zext i32 [[TMP8]] to i64
5588; CHECK-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP9]], [[TMP10]]
5589; CHECK-NEXT:    [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32
5590; CHECK-NEXT:    [[TMP13:%.*]] = lshr i64 [[TMP11]], 32
5591; CHECK-NEXT:    [[TMP14:%.*]] = trunc i64 [[TMP13]] to i32
5592; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[TMP6]], [[TMP14]]
5593; CHECK-NEXT:    [[TMP16:%.*]] = zext i32 [[TMP1]] to i64
5594; CHECK-NEXT:    [[TMP17:%.*]] = zext i32 [[TMP15]] to i64
5595; CHECK-NEXT:    [[TMP18:%.*]] = mul i64 [[TMP16]], [[TMP17]]
5596; CHECK-NEXT:    [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
5597; CHECK-NEXT:    [[TMP20:%.*]] = lshr i64 [[TMP18]], 32
5598; CHECK-NEXT:    [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32
5599; CHECK-NEXT:    [[TMP22:%.*]] = mul i32 [[TMP21]], [[TMP2]]
5600; CHECK-NEXT:    [[TMP23:%.*]] = sub i32 [[TMP1]], [[TMP22]]
5601; CHECK-NEXT:    [[TMP24:%.*]] = icmp uge i32 [[TMP23]], [[TMP2]]
5602; CHECK-NEXT:    [[TMP25:%.*]] = add i32 [[TMP21]], 1
5603; CHECK-NEXT:    [[TMP26:%.*]] = select i1 [[TMP24]], i32 [[TMP25]], i32 [[TMP21]]
5604; CHECK-NEXT:    [[TMP27:%.*]] = sub i32 [[TMP23]], [[TMP2]]
5605; CHECK-NEXT:    [[TMP28:%.*]] = select i1 [[TMP24]], i32 [[TMP27]], i32 [[TMP23]]
5606; CHECK-NEXT:    [[TMP29:%.*]] = icmp uge i32 [[TMP28]], [[TMP2]]
5607; CHECK-NEXT:    [[TMP30:%.*]] = add i32 [[TMP26]], 1
5608; CHECK-NEXT:    [[TMP31:%.*]] = select i1 [[TMP29]], i32 [[TMP30]], i32 [[TMP26]]
5609; CHECK-NEXT:    [[TMP32:%.*]] = insertelement <2 x i32> undef, i32 [[TMP31]], i64 0
5610; CHECK-NEXT:    [[TMP33:%.*]] = extractelement <2 x i32> [[X]], i64 1
5611; CHECK-NEXT:    [[TMP34:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 1
5612; CHECK-NEXT:    [[TMP35:%.*]] = uitofp i32 [[TMP34]] to float
5613; CHECK-NEXT:    [[TMP36:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP35]])
5614; CHECK-NEXT:    [[TMP37:%.*]] = fmul fast float [[TMP36]], 0x41EFFFFFC0000000
5615; CHECK-NEXT:    [[TMP38:%.*]] = fptoui float [[TMP37]] to i32
5616; CHECK-NEXT:    [[TMP39:%.*]] = sub i32 0, [[TMP34]]
5617; CHECK-NEXT:    [[TMP40:%.*]] = mul i32 [[TMP39]], [[TMP38]]
5618; CHECK-NEXT:    [[TMP41:%.*]] = zext i32 [[TMP38]] to i64
5619; CHECK-NEXT:    [[TMP42:%.*]] = zext i32 [[TMP40]] to i64
5620; CHECK-NEXT:    [[TMP43:%.*]] = mul i64 [[TMP41]], [[TMP42]]
5621; CHECK-NEXT:    [[TMP44:%.*]] = trunc i64 [[TMP43]] to i32
5622; CHECK-NEXT:    [[TMP45:%.*]] = lshr i64 [[TMP43]], 32
5623; CHECK-NEXT:    [[TMP46:%.*]] = trunc i64 [[TMP45]] to i32
5624; CHECK-NEXT:    [[TMP47:%.*]] = add i32 [[TMP38]], [[TMP46]]
5625; CHECK-NEXT:    [[TMP48:%.*]] = zext i32 [[TMP33]] to i64
5626; CHECK-NEXT:    [[TMP49:%.*]] = zext i32 [[TMP47]] to i64
5627; CHECK-NEXT:    [[TMP50:%.*]] = mul i64 [[TMP48]], [[TMP49]]
5628; CHECK-NEXT:    [[TMP51:%.*]] = trunc i64 [[TMP50]] to i32
5629; CHECK-NEXT:    [[TMP52:%.*]] = lshr i64 [[TMP50]], 32
5630; CHECK-NEXT:    [[TMP53:%.*]] = trunc i64 [[TMP52]] to i32
5631; CHECK-NEXT:    [[TMP54:%.*]] = mul i32 [[TMP53]], [[TMP34]]
5632; CHECK-NEXT:    [[TMP55:%.*]] = sub i32 [[TMP33]], [[TMP54]]
5633; CHECK-NEXT:    [[TMP56:%.*]] = icmp uge i32 [[TMP55]], [[TMP34]]
5634; CHECK-NEXT:    [[TMP57:%.*]] = add i32 [[TMP53]], 1
5635; CHECK-NEXT:    [[TMP58:%.*]] = select i1 [[TMP56]], i32 [[TMP57]], i32 [[TMP53]]
5636; CHECK-NEXT:    [[TMP59:%.*]] = sub i32 [[TMP55]], [[TMP34]]
5637; CHECK-NEXT:    [[TMP60:%.*]] = select i1 [[TMP56]], i32 [[TMP59]], i32 [[TMP55]]
5638; CHECK-NEXT:    [[TMP61:%.*]] = icmp uge i32 [[TMP60]], [[TMP34]]
5639; CHECK-NEXT:    [[TMP62:%.*]] = add i32 [[TMP58]], 1
5640; CHECK-NEXT:    [[TMP63:%.*]] = select i1 [[TMP61]], i32 [[TMP62]], i32 [[TMP58]]
5641; CHECK-NEXT:    [[TMP64:%.*]] = insertelement <2 x i32> [[TMP32]], i32 [[TMP63]], i64 1
5642; CHECK-NEXT:    store <2 x i32> [[TMP64]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8
5643; CHECK-NEXT:    ret void
5644;
5645; GFX6-LABEL: udiv_v2i32_pow2_shl_denom:
5646; GFX6:       ; %bb.0:
5647; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xb
5648; GFX6-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x9
5649; GFX6-NEXT:    s_mov_b32 s11, 0xf000
5650; GFX6-NEXT:    s_mov_b32 s10, -1
5651; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
5652; GFX6-NEXT:    s_lshl_b32 s2, 0x1000, s6
5653; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s2
5654; GFX6-NEXT:    s_lshl_b32 s3, 0x1000, s7
5655; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s3
5656; GFX6-NEXT:    s_sub_i32 s0, 0, s2
5657; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
5658; GFX6-NEXT:    v_rcp_iflag_f32_e32 v1, v1
5659; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
5660; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
5661; GFX6-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
5662; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
5663; GFX6-NEXT:    v_mul_lo_u32 v2, s0, v0
5664; GFX6-NEXT:    s_sub_i32 s0, 0, s3
5665; GFX6-NEXT:    v_mul_lo_u32 v3, s0, v1
5666; GFX6-NEXT:    v_mul_hi_u32 v2, v0, v2
5667; GFX6-NEXT:    v_mul_hi_u32 v3, v1, v3
5668; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
5669; GFX6-NEXT:    v_mul_hi_u32 v0, s4, v0
5670; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v3, v1
5671; GFX6-NEXT:    v_mul_hi_u32 v1, s5, v1
5672; GFX6-NEXT:    v_mul_lo_u32 v2, v0, s2
5673; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 1, v0
5674; GFX6-NEXT:    v_mul_lo_u32 v4, v1, s3
5675; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s4, v2
5676; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s2, v2
5677; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v3, s[0:1]
5678; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s2, v2
5679; GFX6-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
5680; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 1, v0
5681; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s2, v2
5682; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
5683; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s5, v4
5684; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 1, v1
5685; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s3, v2
5686; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
5687; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s3, v2
5688; GFX6-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
5689; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 1, v1
5690; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s3, v2
5691; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
5692; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
5693; GFX6-NEXT:    s_endpgm
5694;
5695; GFX9-LABEL: udiv_v2i32_pow2_shl_denom:
5696; GFX9:       ; %bb.0:
5697; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
5698; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5699; GFX9-NEXT:    s_lshl_b32 s6, 0x1000, s6
5700; GFX9-NEXT:    s_lshl_b32 s7, 0x1000, s7
5701; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s6
5702; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s7
5703; GFX9-NEXT:    s_sub_i32 s2, 0, s6
5704; GFX9-NEXT:    s_sub_i32 s3, 0, s7
5705; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
5706; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v1
5707; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
5708; GFX9-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
5709; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
5710; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
5711; GFX9-NEXT:    v_mul_lo_u32 v2, s2, v0
5712; GFX9-NEXT:    v_mul_lo_u32 v3, s3, v1
5713; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
5714; GFX9-NEXT:    v_mul_hi_u32 v2, v0, v2
5715; GFX9-NEXT:    v_mul_hi_u32 v3, v1, v3
5716; GFX9-NEXT:    v_add_u32_e32 v0, v0, v2
5717; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
5718; GFX9-NEXT:    v_mul_hi_u32 v0, s4, v0
5719; GFX9-NEXT:    v_mul_hi_u32 v1, s5, v1
5720; GFX9-NEXT:    v_mov_b32_e32 v2, 0
5721; GFX9-NEXT:    v_mul_lo_u32 v3, v0, s6
5722; GFX9-NEXT:    v_mul_lo_u32 v4, v1, s7
5723; GFX9-NEXT:    v_add_u32_e32 v5, 1, v0
5724; GFX9-NEXT:    v_add_u32_e32 v6, 1, v1
5725; GFX9-NEXT:    v_sub_u32_e32 v3, s4, v3
5726; GFX9-NEXT:    v_sub_u32_e32 v4, s5, v4
5727; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s6, v3
5728; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
5729; GFX9-NEXT:    v_subrev_u32_e32 v5, s6, v3
5730; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s7, v4
5731; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v6, s[0:1]
5732; GFX9-NEXT:    v_subrev_u32_e32 v6, s7, v4
5733; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
5734; GFX9-NEXT:    v_add_u32_e32 v5, 1, v0
5735; GFX9-NEXT:    v_cndmask_b32_e64 v4, v4, v6, s[0:1]
5736; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s6, v3
5737; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
5738; GFX9-NEXT:    v_add_u32_e32 v3, 1, v1
5739; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s7, v4
5740; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
5741; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5742; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
5743; GFX9-NEXT:    s_endpgm
5744  %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y
5745  %r = udiv <2 x i32> %x, %shl.y
5746  store <2 x i32> %r, <2 x i32> addrspace(1)* %out
5747  ret void
5748}
5749
; urem_i32_oddk_denom - unsigned i32 remainder by the odd constant 1235195.
; Expectations recorded below (autogenerated, do not edit by hand)
;   * opt run - the urem instruction is left in place, unexpanded.
;   * llc runs (tahiti and gfx900) - the quotient is formed with a
;     multiply-high by 0xb2a50881, a subtract / shift-right-by-1 / add fix-up
;     and a shift right by 20; it is then multiplied by 0x12d8fb (1235195)
;     and subtracted from %x to give the remainder.
5750define amdgpu_kernel void @urem_i32_oddk_denom(i32 addrspace(1)* %out, i32 %x) {
5751; CHECK-LABEL: @urem_i32_oddk_denom(
5752; CHECK-NEXT:    [[R:%.*]] = urem i32 [[X:%.*]], 1235195
5753; CHECK-NEXT:    store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4
5754; CHECK-NEXT:    ret void
5755;
5756; GFX6-LABEL: urem_i32_oddk_denom:
5757; GFX6:       ; %bb.0:
5758; GFX6-NEXT:    s_load_dword s4, s[0:1], 0xb
5759; GFX6-NEXT:    v_mov_b32_e32 v0, 0xb2a50881
5760; GFX6-NEXT:    s_mov_b32 s2, 0x12d8fb
5761; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
5762; GFX6-NEXT:    s_mov_b32 s3, 0xf000
5763; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
5764; GFX6-NEXT:    v_mul_hi_u32 v0, s4, v0
5765; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s4, v0
5766; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 1, v1
5767; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
5768; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 20, v0
5769; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s2
5770; GFX6-NEXT:    s_mov_b32 s2, -1
5771; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
5772; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
5773; GFX6-NEXT:    s_endpgm
5774;
5775; GFX9-LABEL: urem_i32_oddk_denom:
5776; GFX9:       ; %bb.0:
5777; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x2c
5778; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
5779; GFX9-NEXT:    v_mov_b32_e32 v0, 0
5780; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5781; GFX9-NEXT:    s_mul_hi_u32 s0, s4, 0xb2a50881
5782; GFX9-NEXT:    s_sub_i32 s1, s4, s0
5783; GFX9-NEXT:    s_lshr_b32 s1, s1, 1
5784; GFX9-NEXT:    s_add_i32 s1, s1, s0
5785; GFX9-NEXT:    s_lshr_b32 s0, s1, 20
5786; GFX9-NEXT:    s_mul_i32 s0, s0, 0x12d8fb
5787; GFX9-NEXT:    s_sub_i32 s0, s4, s0
5788; GFX9-NEXT:    v_mov_b32_e32 v1, s0
5789; GFX9-NEXT:    global_store_dword v0, v1, s[2:3]
5790; GFX9-NEXT:    s_endpgm
5791  %r = urem i32 %x, 1235195
5792  store i32 %r, i32 addrspace(1)* %out
5793  ret void
5794}
5795
; urem_i32_pow2k_denom - unsigned i32 remainder by the constant 4096.
; Expectations recorded below (autogenerated, do not edit by hand)
;   * opt run - the urem instruction is left in place, unexpanded.
;   * llc runs (tahiti and gfx900) - the remainder becomes a single scalar
;     AND with the mask 0xfff.
5796define amdgpu_kernel void @urem_i32_pow2k_denom(i32 addrspace(1)* %out, i32 %x) {
5797; CHECK-LABEL: @urem_i32_pow2k_denom(
5798; CHECK-NEXT:    [[R:%.*]] = urem i32 [[X:%.*]], 4096
5799; CHECK-NEXT:    store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4
5800; CHECK-NEXT:    ret void
5801;
5802; GFX6-LABEL: urem_i32_pow2k_denom:
5803; GFX6:       ; %bb.0:
5804; GFX6-NEXT:    s_load_dword s4, s[0:1], 0xb
5805; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
5806; GFX6-NEXT:    s_mov_b32 s3, 0xf000
5807; GFX6-NEXT:    s_mov_b32 s2, -1
5808; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
5809; GFX6-NEXT:    s_and_b32 s4, s4, 0xfff
5810; GFX6-NEXT:    v_mov_b32_e32 v0, s4
5811; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
5812; GFX6-NEXT:    s_endpgm
5813;
5814; GFX9-LABEL: urem_i32_pow2k_denom:
5815; GFX9:       ; %bb.0:
5816; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x2c
5817; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
5818; GFX9-NEXT:    v_mov_b32_e32 v0, 0
5819; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5820; GFX9-NEXT:    s_and_b32 s0, s4, 0xfff
5821; GFX9-NEXT:    v_mov_b32_e32 v1, s0
5822; GFX9-NEXT:    global_store_dword v0, v1, s[2:3]
5823; GFX9-NEXT:    s_endpgm
5824  %r = urem i32 %x, 4096
5825  store i32 %r, i32 addrspace(1)* %out
5826  ret void
5827}
5828
; urem_i32_pow2_shl_denom - unsigned i32 remainder where the divisor is the
; non-constant value (4096 << %y).
; Expectations recorded below (autogenerated, do not edit by hand)
;   * opt run - both the shl and the urem are left in place, unexpanded.
;   * llc runs (tahiti and gfx900) - no division is emitted; the divisor is
;     materialized with a shift left of 0x1000, decremented by one to form a
;     mask, and ANDed with %x.
5829define amdgpu_kernel void @urem_i32_pow2_shl_denom(i32 addrspace(1)* %out, i32 %x, i32 %y) {
5830; CHECK-LABEL: @urem_i32_pow2_shl_denom(
5831; CHECK-NEXT:    [[SHL_Y:%.*]] = shl i32 4096, [[Y:%.*]]
5832; CHECK-NEXT:    [[R:%.*]] = urem i32 [[X:%.*]], [[SHL_Y]]
5833; CHECK-NEXT:    store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4
5834; CHECK-NEXT:    ret void
5835;
5836; GFX6-LABEL: urem_i32_pow2_shl_denom:
5837; GFX6:       ; %bb.0:
5838; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
5839; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
5840; GFX6-NEXT:    s_mov_b32 s3, 0xf000
5841; GFX6-NEXT:    s_mov_b32 s2, -1
5842; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
5843; GFX6-NEXT:    s_lshl_b32 s5, 0x1000, s5
5844; GFX6-NEXT:    s_add_i32 s5, s5, -1
5845; GFX6-NEXT:    s_and_b32 s4, s4, s5
5846; GFX6-NEXT:    v_mov_b32_e32 v0, s4
5847; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
5848; GFX6-NEXT:    s_endpgm
5849;
5850; GFX9-LABEL: urem_i32_pow2_shl_denom:
5851; GFX9:       ; %bb.0:
5852; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
5853; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
5854; GFX9-NEXT:    v_mov_b32_e32 v0, 0
5855; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5856; GFX9-NEXT:    s_lshl_b32 s0, 0x1000, s3
5857; GFX9-NEXT:    s_add_i32 s0, s0, -1
5858; GFX9-NEXT:    s_and_b32 s0, s2, s0
5859; GFX9-NEXT:    v_mov_b32_e32 v1, s0
5860; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
5861; GFX9-NEXT:    s_endpgm
5862  %shl.y = shl i32 4096, %y
5863  %r = urem i32 %x, %shl.y
5864  store i32 %r, i32 addrspace(1)* %out
5865  ret void
5866}
5867
; urem_v2i32_pow2k_denom - unsigned <2 x i32> remainder by the splat
; constant <4096, 4096>.
; Expectations recorded below (autogenerated, do not edit by hand)
;   * opt run - the vector urem is scalarized into per-lane extractelement /
;     urem-by-4096 / insertelement; the scalar urems are not expanded further.
;   * llc runs (tahiti and gfx900) - each lane becomes a scalar AND with the
;     mask 0xfff, and both lanes are written with one 64-bit store.
5868define amdgpu_kernel void @urem_v2i32_pow2k_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x) {
5869; CHECK-LABEL: @urem_v2i32_pow2k_denom(
5870; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0
5871; CHECK-NEXT:    [[TMP2:%.*]] = urem i32 [[TMP1]], 4096
5872; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i64 0
5873; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1
5874; CHECK-NEXT:    [[TMP5:%.*]] = urem i32 [[TMP4]], 4096
5875; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1
5876; CHECK-NEXT:    store <2 x i32> [[TMP6]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8
5877; CHECK-NEXT:    ret void
5878;
5879; GFX6-LABEL: urem_v2i32_pow2k_denom:
5880; GFX6:       ; %bb.0:
5881; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
5882; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
5883; GFX6-NEXT:    s_mov_b32 s3, 0xf000
5884; GFX6-NEXT:    s_mov_b32 s2, -1
5885; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
5886; GFX6-NEXT:    s_and_b32 s4, s4, 0xfff
5887; GFX6-NEXT:    s_and_b32 s5, s5, 0xfff
5888; GFX6-NEXT:    v_mov_b32_e32 v0, s4
5889; GFX6-NEXT:    v_mov_b32_e32 v1, s5
5890; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
5891; GFX6-NEXT:    s_endpgm
5892;
5893; GFX9-LABEL: urem_v2i32_pow2k_denom:
5894; GFX9:       ; %bb.0:
5895; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
5896; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
5897; GFX9-NEXT:    v_mov_b32_e32 v2, 0
5898; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5899; GFX9-NEXT:    s_and_b32 s0, s2, 0xfff
5900; GFX9-NEXT:    s_and_b32 s1, s3, 0xfff
5901; GFX9-NEXT:    v_mov_b32_e32 v0, s0
5902; GFX9-NEXT:    v_mov_b32_e32 v1, s1
5903; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
5904; GFX9-NEXT:    s_endpgm
5905  %r = urem <2 x i32> %x, <i32 4096, i32 4096>
5906  store <2 x i32> %r, <2 x i32> addrspace(1)* %out
5907  ret void
5908}
5909
; urem_v2i32_pow2_shl_denom - unsigned <2 x i32> remainder where the divisor
; is the non-constant vector (<4096, 4096> << %y).
; Expectations recorded below (autogenerated, do not edit by hand)
;   * opt run - the vector shl is kept, and the urem is scalarized and fully
;     expanded per lane - uitofp of the divisor, llvm.amdgcn.rcp.f32, scale
;     by 0x41EFFFFFC0000000, fptoui, one multiply-high refinement of the
;     reciprocal estimate, a multiply-high with the dividend to get a trial
;     quotient, a trial remainder (dividend - quotient * divisor), then two
;     compare-and-select steps that each may subtract the divisor once.
;   * llc runs (tahiti and gfx900) - the same reciprocal-based sequence shows
;     up in the ISA (v_rcp_iflag_f32, multiply by 0x4f7ffffe, v_mul_hi_u32,
;     two v_subrev / v_cmp_le_u32 / v_cndmask_b32 rounds per lane).
; NOTE(review) the scalar i32 test with a (4096 << %y) divisor lowers to a
; mask-and-AND, whereas this vector form takes the full expansion;
; presumably this test pins down that difference - confirm intent.
5910define amdgpu_kernel void @urem_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x, <2 x i32> %y) {
5911; CHECK-LABEL: @urem_v2i32_pow2_shl_denom(
5912; CHECK-NEXT:    [[SHL_Y:%.*]] = shl <2 x i32> <i32 4096, i32 4096>, [[Y:%.*]]
5913; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0
5914; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 0
5915; CHECK-NEXT:    [[TMP3:%.*]] = uitofp i32 [[TMP2]] to float
5916; CHECK-NEXT:    [[TMP4:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP3]])
5917; CHECK-NEXT:    [[TMP5:%.*]] = fmul fast float [[TMP4]], 0x41EFFFFFC0000000
5918; CHECK-NEXT:    [[TMP6:%.*]] = fptoui float [[TMP5]] to i32
5919; CHECK-NEXT:    [[TMP7:%.*]] = sub i32 0, [[TMP2]]
5920; CHECK-NEXT:    [[TMP8:%.*]] = mul i32 [[TMP7]], [[TMP6]]
5921; CHECK-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP6]] to i64
5922; CHECK-NEXT:    [[TMP10:%.*]] = zext i32 [[TMP8]] to i64
5923; CHECK-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP9]], [[TMP10]]
5924; CHECK-NEXT:    [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32
5925; CHECK-NEXT:    [[TMP13:%.*]] = lshr i64 [[TMP11]], 32
5926; CHECK-NEXT:    [[TMP14:%.*]] = trunc i64 [[TMP13]] to i32
5927; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[TMP6]], [[TMP14]]
5928; CHECK-NEXT:    [[TMP16:%.*]] = zext i32 [[TMP1]] to i64
5929; CHECK-NEXT:    [[TMP17:%.*]] = zext i32 [[TMP15]] to i64
5930; CHECK-NEXT:    [[TMP18:%.*]] = mul i64 [[TMP16]], [[TMP17]]
5931; CHECK-NEXT:    [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
5932; CHECK-NEXT:    [[TMP20:%.*]] = lshr i64 [[TMP18]], 32
5933; CHECK-NEXT:    [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32
5934; CHECK-NEXT:    [[TMP22:%.*]] = mul i32 [[TMP21]], [[TMP2]]
5935; CHECK-NEXT:    [[TMP23:%.*]] = sub i32 [[TMP1]], [[TMP22]]
5936; CHECK-NEXT:    [[TMP24:%.*]] = icmp uge i32 [[TMP23]], [[TMP2]]
5937; CHECK-NEXT:    [[TMP25:%.*]] = sub i32 [[TMP23]], [[TMP2]]
5938; CHECK-NEXT:    [[TMP26:%.*]] = select i1 [[TMP24]], i32 [[TMP25]], i32 [[TMP23]]
5939; CHECK-NEXT:    [[TMP27:%.*]] = icmp uge i32 [[TMP26]], [[TMP2]]
5940; CHECK-NEXT:    [[TMP28:%.*]] = sub i32 [[TMP26]], [[TMP2]]
5941; CHECK-NEXT:    [[TMP29:%.*]] = select i1 [[TMP27]], i32 [[TMP28]], i32 [[TMP26]]
5942; CHECK-NEXT:    [[TMP30:%.*]] = insertelement <2 x i32> undef, i32 [[TMP29]], i64 0
5943; CHECK-NEXT:    [[TMP31:%.*]] = extractelement <2 x i32> [[X]], i64 1
5944; CHECK-NEXT:    [[TMP32:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 1
5945; CHECK-NEXT:    [[TMP33:%.*]] = uitofp i32 [[TMP32]] to float
5946; CHECK-NEXT:    [[TMP34:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP33]])
5947; CHECK-NEXT:    [[TMP35:%.*]] = fmul fast float [[TMP34]], 0x41EFFFFFC0000000
5948; CHECK-NEXT:    [[TMP36:%.*]] = fptoui float [[TMP35]] to i32
5949; CHECK-NEXT:    [[TMP37:%.*]] = sub i32 0, [[TMP32]]
5950; CHECK-NEXT:    [[TMP38:%.*]] = mul i32 [[TMP37]], [[TMP36]]
5951; CHECK-NEXT:    [[TMP39:%.*]] = zext i32 [[TMP36]] to i64
5952; CHECK-NEXT:    [[TMP40:%.*]] = zext i32 [[TMP38]] to i64
5953; CHECK-NEXT:    [[TMP41:%.*]] = mul i64 [[TMP39]], [[TMP40]]
5954; CHECK-NEXT:    [[TMP42:%.*]] = trunc i64 [[TMP41]] to i32
5955; CHECK-NEXT:    [[TMP43:%.*]] = lshr i64 [[TMP41]], 32
5956; CHECK-NEXT:    [[TMP44:%.*]] = trunc i64 [[TMP43]] to i32
5957; CHECK-NEXT:    [[TMP45:%.*]] = add i32 [[TMP36]], [[TMP44]]
5958; CHECK-NEXT:    [[TMP46:%.*]] = zext i32 [[TMP31]] to i64
5959; CHECK-NEXT:    [[TMP47:%.*]] = zext i32 [[TMP45]] to i64
5960; CHECK-NEXT:    [[TMP48:%.*]] = mul i64 [[TMP46]], [[TMP47]]
5961; CHECK-NEXT:    [[TMP49:%.*]] = trunc i64 [[TMP48]] to i32
5962; CHECK-NEXT:    [[TMP50:%.*]] = lshr i64 [[TMP48]], 32
5963; CHECK-NEXT:    [[TMP51:%.*]] = trunc i64 [[TMP50]] to i32
5964; CHECK-NEXT:    [[TMP52:%.*]] = mul i32 [[TMP51]], [[TMP32]]
5965; CHECK-NEXT:    [[TMP53:%.*]] = sub i32 [[TMP31]], [[TMP52]]
5966; CHECK-NEXT:    [[TMP54:%.*]] = icmp uge i32 [[TMP53]], [[TMP32]]
5967; CHECK-NEXT:    [[TMP55:%.*]] = sub i32 [[TMP53]], [[TMP32]]
5968; CHECK-NEXT:    [[TMP56:%.*]] = select i1 [[TMP54]], i32 [[TMP55]], i32 [[TMP53]]
5969; CHECK-NEXT:    [[TMP57:%.*]] = icmp uge i32 [[TMP56]], [[TMP32]]
5970; CHECK-NEXT:    [[TMP58:%.*]] = sub i32 [[TMP56]], [[TMP32]]
5971; CHECK-NEXT:    [[TMP59:%.*]] = select i1 [[TMP57]], i32 [[TMP58]], i32 [[TMP56]]
5972; CHECK-NEXT:    [[TMP60:%.*]] = insertelement <2 x i32> [[TMP30]], i32 [[TMP59]], i64 1
5973; CHECK-NEXT:    store <2 x i32> [[TMP60]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8
5974; CHECK-NEXT:    ret void
5975;
5976; GFX6-LABEL: urem_v2i32_pow2_shl_denom:
5977; GFX6:       ; %bb.0:
5978; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xb
5979; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
5980; GFX6-NEXT:    s_mov_b32 s3, 0xf000
5981; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
5982; GFX6-NEXT:    s_lshl_b32 s6, 0x1000, s6
5983; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s6
5984; GFX6-NEXT:    s_lshl_b32 s7, 0x1000, s7
5985; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s7
5986; GFX6-NEXT:    s_sub_i32 s2, 0, s6
5987; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
5988; GFX6-NEXT:    v_rcp_iflag_f32_e32 v1, v1
5989; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
5990; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
5991; GFX6-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
5992; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
5993; GFX6-NEXT:    v_mul_lo_u32 v2, s2, v0
5994; GFX6-NEXT:    s_sub_i32 s2, 0, s7
5995; GFX6-NEXT:    v_mul_lo_u32 v3, s2, v1
5996; GFX6-NEXT:    s_mov_b32 s2, -1
5997; GFX6-NEXT:    v_mul_hi_u32 v2, v0, v2
5998; GFX6-NEXT:    v_mul_hi_u32 v3, v1, v3
5999; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
6000; GFX6-NEXT:    v_mul_hi_u32 v0, s4, v0
6001; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v3, v1
6002; GFX6-NEXT:    v_mul_hi_u32 v1, s5, v1
6003; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s6
6004; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s7
6005; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
6006; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s6, v0
6007; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s5, v1
6008; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s6, v0
6009; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
6010; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s6, v0
6011; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s6, v0
6012; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
6013; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s7, v1
6014; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s7, v1
6015; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
6016; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s7, v1
6017; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s7, v1
6018; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
6019; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
6020; GFX6-NEXT:    s_endpgm
6021;
6022; GFX9-LABEL: urem_v2i32_pow2_shl_denom:
6023; GFX9:       ; %bb.0:
6024; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
6025; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
6026; GFX9-NEXT:    s_lshl_b32 s3, 0x1000, s6
6027; GFX9-NEXT:    s_lshl_b32 s2, 0x1000, s7
6028; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s3
6029; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s2
6030; GFX9-NEXT:    s_sub_i32 s6, 0, s3
6031; GFX9-NEXT:    s_sub_i32 s7, 0, s2
6032; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
6033; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v1
6034; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
6035; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
6036; GFX9-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
6037; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
6038; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
6039; GFX9-NEXT:    v_mul_lo_u32 v2, s6, v0
6040; GFX9-NEXT:    v_mul_lo_u32 v3, s7, v1
6041; GFX9-NEXT:    v_mul_hi_u32 v2, v0, v2
6042; GFX9-NEXT:    v_mul_hi_u32 v3, v1, v3
6043; GFX9-NEXT:    v_add_u32_e32 v0, v0, v2
6044; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
6045; GFX9-NEXT:    v_mul_hi_u32 v0, s4, v0
6046; GFX9-NEXT:    v_mul_hi_u32 v1, s5, v1
6047; GFX9-NEXT:    v_mov_b32_e32 v2, 0
6048; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s3
6049; GFX9-NEXT:    v_mul_lo_u32 v1, v1, s2
6050; GFX9-NEXT:    v_sub_u32_e32 v0, s4, v0
6051; GFX9-NEXT:    v_sub_u32_e32 v1, s5, v1
6052; GFX9-NEXT:    v_subrev_u32_e32 v3, s3, v0
6053; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v0
6054; GFX9-NEXT:    v_subrev_u32_e32 v4, s2, v1
6055; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
6056; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s2, v1
6057; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
6058; GFX9-NEXT:    v_subrev_u32_e32 v3, s3, v0
6059; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v0
6060; GFX9-NEXT:    v_subrev_u32_e32 v4, s2, v1
6061; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
6062; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s2, v1
6063; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
6064; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
6065; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
6066; GFX9-NEXT:    s_endpgm
6067  %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y
6068  %r = urem <2 x i32> %x, %shl.y
6069  store <2 x i32> %r, <2 x i32> addrspace(1)* %out
6070  ret void
6071}
6072
; Scalar signed division of %x by the odd constant 1235195; the quotient is
; stored to %out.
; Expected output, per the assertions below:
;   - opt run (amdgpu-codegenprepare): the sdiv is left untouched.
;   - GFX6 and GFX9 llc runs: a signed multiply-high by 0xd9528441, an add of
;     the dividend, then a sign-bit extract (shift by 31), an arithmetic shift
;     by 20 and a final add.
; The assertions are autogenerated (see the NOTE on the first line of this
; file); regenerate them with the update script rather than editing by hand.
6073define amdgpu_kernel void @sdiv_i32_oddk_denom(i32 addrspace(1)* %out, i32 %x) {
6074; CHECK-LABEL: @sdiv_i32_oddk_denom(
6075; CHECK-NEXT:    [[R:%.*]] = sdiv i32 [[X:%.*]], 1235195
6076; CHECK-NEXT:    store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4
6077; CHECK-NEXT:    ret void
6078;
6079; GFX6-LABEL: sdiv_i32_oddk_denom:
6080; GFX6:       ; %bb.0:
6081; GFX6-NEXT:    s_load_dword s4, s[0:1], 0xb
6082; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
6083; GFX6-NEXT:    v_mov_b32_e32 v0, 0xd9528441
6084; GFX6-NEXT:    s_mov_b32 s3, 0xf000
6085; GFX6-NEXT:    s_mov_b32 s2, -1
6086; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
6087; GFX6-NEXT:    v_mul_hi_i32 v0, s4, v0
6088; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s4, v0
6089; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 31, v0
6090; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 20, v0
6091; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
6092; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
6093; GFX6-NEXT:    s_endpgm
6094;
6095; GFX9-LABEL: sdiv_i32_oddk_denom:
6096; GFX9:       ; %bb.0:
6097; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x2c
6098; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
6099; GFX9-NEXT:    v_mov_b32_e32 v0, 0
6100; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
6101; GFX9-NEXT:    s_mul_hi_i32 s0, s4, 0xd9528441
6102; GFX9-NEXT:    s_add_i32 s0, s0, s4
6103; GFX9-NEXT:    s_lshr_b32 s1, s0, 31
6104; GFX9-NEXT:    s_ashr_i32 s0, s0, 20
6105; GFX9-NEXT:    s_add_i32 s0, s0, s1
6106; GFX9-NEXT:    v_mov_b32_e32 v1, s0
6107; GFX9-NEXT:    global_store_dword v0, v1, s[2:3]
6108; GFX9-NEXT:    s_endpgm
6109  %r = sdiv i32 %x, 1235195
6110  store i32 %r, i32 addrspace(1)* %out
6111  ret void
6112}
6113
; Scalar signed division of %x by the constant power of two 4096; the quotient
; is stored to %out.
; Expected output, per the assertions below:
;   - opt run (amdgpu-codegenprepare): the sdiv is left untouched.
;   - GFX6 and GFX9 llc runs: a pure scalar shift sequence. The sign of the
;     dividend (arithmetic shift by 31) is logically shifted right by 20,
;     yielding 4095 for negative dividends and 0 otherwise; that bias is added
;     to the dividend before the arithmetic shift right by 12.
6114define amdgpu_kernel void @sdiv_i32_pow2k_denom(i32 addrspace(1)* %out, i32 %x) {
6115; CHECK-LABEL: @sdiv_i32_pow2k_denom(
6116; CHECK-NEXT:    [[R:%.*]] = sdiv i32 [[X:%.*]], 4096
6117; CHECK-NEXT:    store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4
6118; CHECK-NEXT:    ret void
6119;
6120; GFX6-LABEL: sdiv_i32_pow2k_denom:
6121; GFX6:       ; %bb.0:
6122; GFX6-NEXT:    s_load_dword s4, s[0:1], 0xb
6123; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
6124; GFX6-NEXT:    s_mov_b32 s3, 0xf000
6125; GFX6-NEXT:    s_mov_b32 s2, -1
6126; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
6127; GFX6-NEXT:    s_ashr_i32 s5, s4, 31
6128; GFX6-NEXT:    s_lshr_b32 s5, s5, 20
6129; GFX6-NEXT:    s_add_i32 s4, s4, s5
6130; GFX6-NEXT:    s_ashr_i32 s4, s4, 12
6131; GFX6-NEXT:    v_mov_b32_e32 v0, s4
6132; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
6133; GFX6-NEXT:    s_endpgm
6134;
6135; GFX9-LABEL: sdiv_i32_pow2k_denom:
6136; GFX9:       ; %bb.0:
6137; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x2c
6138; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
6139; GFX9-NEXT:    v_mov_b32_e32 v0, 0
6140; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
6141; GFX9-NEXT:    s_ashr_i32 s0, s4, 31
6142; GFX9-NEXT:    s_lshr_b32 s0, s0, 20
6143; GFX9-NEXT:    s_add_i32 s4, s4, s0
6144; GFX9-NEXT:    s_ashr_i32 s0, s4, 12
6145; GFX9-NEXT:    v_mov_b32_e32 v1, s0
6146; GFX9-NEXT:    global_store_dword v0, v1, s[2:3]
6147; GFX9-NEXT:    s_endpgm
6148  %r = sdiv i32 %x, 4096
6149  store i32 %r, i32 addrspace(1)* %out
6150  ret void
6151}
6152
; Scalar signed division of %x by (4096 << %y), i.e. a denominator that is only
; known at run time; the quotient is stored to %out.
; Expected output, per the assertions below:
;   - opt run (amdgpu-codegenprepare): the shl and the sdiv are left untouched.
;   - GFX6 and GFX9 llc runs: a full 32-bit division expansion. Both operands
;     are made non-negative (ashr 31 / add / xor), a float reciprocal of the
;     denominator (v_rcp_iflag_f32 scaled by 0x4f7ffffe) is converted back to
;     an integer and refined with a mul_lo / mul_hi / add step, the quotient
;     estimate receives two compare-and-select corrections, and the result
;     sign (xor of both operand signs) is re-applied with xor / subtract.
6153define amdgpu_kernel void @sdiv_i32_pow2_shl_denom(i32 addrspace(1)* %out, i32 %x, i32 %y) {
6154; CHECK-LABEL: @sdiv_i32_pow2_shl_denom(
6155; CHECK-NEXT:    [[SHL_Y:%.*]] = shl i32 4096, [[Y:%.*]]
6156; CHECK-NEXT:    [[R:%.*]] = sdiv i32 [[X:%.*]], [[SHL_Y]]
6157; CHECK-NEXT:    store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4
6158; CHECK-NEXT:    ret void
6159;
6160; GFX6-LABEL: sdiv_i32_pow2_shl_denom:
6161; GFX6:       ; %bb.0:
6162; GFX6-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
6163; GFX6-NEXT:    s_mov_b32 s7, 0xf000
6164; GFX6-NEXT:    s_mov_b32 s6, -1
6165; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
6166; GFX6-NEXT:    s_lshl_b32 s3, 0x1000, s3
6167; GFX6-NEXT:    s_ashr_i32 s8, s3, 31
6168; GFX6-NEXT:    s_add_i32 s3, s3, s8
6169; GFX6-NEXT:    s_xor_b32 s3, s3, s8
6170; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s3
6171; GFX6-NEXT:    s_sub_i32 s4, 0, s3
6172; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
6173; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
6174; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
6175; GFX6-NEXT:    v_mul_lo_u32 v1, s4, v0
6176; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
6177; GFX6-NEXT:    s_ashr_i32 s0, s2, 31
6178; GFX6-NEXT:    s_add_i32 s1, s2, s0
6179; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
6180; GFX6-NEXT:    s_xor_b32 s1, s1, s0
6181; GFX6-NEXT:    s_xor_b32 s2, s0, s8
6182; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
6183; GFX6-NEXT:    v_mul_hi_u32 v0, s1, v0
6184; GFX6-NEXT:    v_mul_lo_u32 v1, v0, s3
6185; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
6186; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s1, v1
6187; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s3, v1
6188; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
6189; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s3, v1
6190; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 1, v0
6191; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, v2, s[0:1]
6192; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s3, v1
6193; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
6194; GFX6-NEXT:    v_xor_b32_e32 v0, s2, v0
6195; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s2, v0
6196; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
6197; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
6198; GFX6-NEXT:    s_endpgm
6199;
6200; GFX9-LABEL: sdiv_i32_pow2_shl_denom:
6201; GFX9:       ; %bb.0:
6202; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
6203; GFX9-NEXT:    v_mov_b32_e32 v2, 0
6204; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
6205; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
6206; GFX9-NEXT:    s_lshl_b32 s3, 0x1000, s3
6207; GFX9-NEXT:    s_ashr_i32 s4, s3, 31
6208; GFX9-NEXT:    s_add_i32 s3, s3, s4
6209; GFX9-NEXT:    s_xor_b32 s3, s3, s4
6210; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s3
6211; GFX9-NEXT:    s_sub_i32 s5, 0, s3
6212; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
6213; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
6214; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
6215; GFX9-NEXT:    v_mul_lo_u32 v1, s5, v0
6216; GFX9-NEXT:    s_ashr_i32 s5, s2, 31
6217; GFX9-NEXT:    s_add_i32 s2, s2, s5
6218; GFX9-NEXT:    s_xor_b32 s2, s2, s5
6219; GFX9-NEXT:    v_mul_hi_u32 v1, v0, v1
6220; GFX9-NEXT:    v_add_u32_e32 v0, v0, v1
6221; GFX9-NEXT:    v_mul_hi_u32 v0, s2, v0
6222; GFX9-NEXT:    v_mul_lo_u32 v1, v0, s3
6223; GFX9-NEXT:    v_add_u32_e32 v3, 1, v0
6224; GFX9-NEXT:    v_sub_u32_e32 v1, s2, v1
6225; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v1
6226; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
6227; GFX9-NEXT:    v_subrev_u32_e32 v3, s3, v1
6228; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
6229; GFX9-NEXT:    v_add_u32_e32 v4, 1, v0
6230; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v1
6231; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
6232; GFX9-NEXT:    s_xor_b32 s2, s5, s4
6233; GFX9-NEXT:    v_xor_b32_e32 v0, s2, v0
6234; GFX9-NEXT:    v_subrev_u32_e32 v0, s2, v0
6235; GFX9-NEXT:    global_store_dword v2, v0, s[0:1]
6236; GFX9-NEXT:    s_endpgm
6237  %shl.y = shl i32 4096, %y
6238  %r = sdiv i32 %x, %shl.y
6239  store i32 %r, i32 addrspace(1)* %out
6240  ret void
6241}
6242
; <2 x i32> signed division of %x by the splat constant <4096, 4096>; the
; quotient vector is stored to %out.
; Expected output, per the assertions below:
;   - opt run (amdgpu-codegenprepare): the vector sdiv is scalarized into one
;     `sdiv i32 ..., 4096` per element (extractelement / insertelement), and
;     each scalar sdiv is otherwise left untouched.
;   - GFX6 and GFX9 llc runs: each lane uses the scalar power-of-two shift
;     sequence (ashr 31, lshr 20, add, ashr 12) and both lanes are written with
;     a single dwordx2 store.
6243define amdgpu_kernel void @sdiv_v2i32_pow2k_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x) {
6244; CHECK-LABEL: @sdiv_v2i32_pow2k_denom(
6245; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0
6246; CHECK-NEXT:    [[TMP2:%.*]] = sdiv i32 [[TMP1]], 4096
6247; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i64 0
6248; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1
6249; CHECK-NEXT:    [[TMP5:%.*]] = sdiv i32 [[TMP4]], 4096
6250; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1
6251; CHECK-NEXT:    store <2 x i32> [[TMP6]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8
6252; CHECK-NEXT:    ret void
6253;
6254; GFX6-LABEL: sdiv_v2i32_pow2k_denom:
6255; GFX6:       ; %bb.0:
6256; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
6257; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
6258; GFX6-NEXT:    s_mov_b32 s3, 0xf000
6259; GFX6-NEXT:    s_mov_b32 s2, -1
6260; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
6261; GFX6-NEXT:    s_ashr_i32 s6, s4, 31
6262; GFX6-NEXT:    s_ashr_i32 s7, s5, 31
6263; GFX6-NEXT:    s_lshr_b32 s6, s6, 20
6264; GFX6-NEXT:    s_add_i32 s4, s4, s6
6265; GFX6-NEXT:    s_lshr_b32 s6, s7, 20
6266; GFX6-NEXT:    s_add_i32 s5, s5, s6
6267; GFX6-NEXT:    s_ashr_i32 s4, s4, 12
6268; GFX6-NEXT:    s_ashr_i32 s5, s5, 12
6269; GFX6-NEXT:    v_mov_b32_e32 v0, s4
6270; GFX6-NEXT:    v_mov_b32_e32 v1, s5
6271; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
6272; GFX6-NEXT:    s_endpgm
6273;
6274; GFX9-LABEL: sdiv_v2i32_pow2k_denom:
6275; GFX9:       ; %bb.0:
6276; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
6277; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
6278; GFX9-NEXT:    v_mov_b32_e32 v2, 0
6279; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
6280; GFX9-NEXT:    s_ashr_i32 s0, s2, 31
6281; GFX9-NEXT:    s_ashr_i32 s1, s3, 31
6282; GFX9-NEXT:    s_lshr_b32 s0, s0, 20
6283; GFX9-NEXT:    s_lshr_b32 s1, s1, 20
6284; GFX9-NEXT:    s_add_i32 s0, s2, s0
6285; GFX9-NEXT:    s_add_i32 s1, s3, s1
6286; GFX9-NEXT:    s_ashr_i32 s0, s0, 12
6287; GFX9-NEXT:    s_ashr_i32 s1, s1, 12
6288; GFX9-NEXT:    v_mov_b32_e32 v0, s0
6289; GFX9-NEXT:    v_mov_b32_e32 v1, s1
6290; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
6291; GFX9-NEXT:    s_endpgm
6292  %r = sdiv <2 x i32> %x, <i32 4096, i32 4096>
6293  store <2 x i32> %r, <2 x i32> addrspace(1)* %out
6294  ret void
6295}
6296
; <2 x i32> signed division of %x by the mixed constant vector <4096, 4095>
; (element 0 is a power of two, element 1 is not); the quotient vector is
; stored to %out.
; Expected output, per the assertions below:
;   - opt run (amdgpu-codegenprepare): the vector sdiv is scalarized into
;     `sdiv i32 ..., 4096` for element 0 and `sdiv i32 ..., 4095` for element 1.
;   - GFX6 and GFX9 llc runs: element 0 uses the shift sequence (ashr 31,
;     lshr 20, add, ashr 12); element 1 uses a signed multiply-high by
;     0x80080081, an add of the dividend, a sign-bit extract (shift by 31), an
;     arithmetic shift by 11 and a final add.
; NOTE(review): the "ssdiv" spelling in the kernel name looks like a typo for
; "sdiv"; it is kept because the LABEL assertions below match on this name.
6297define amdgpu_kernel void @ssdiv_v2i32_mixed_pow2k_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x) {
6298; CHECK-LABEL: @ssdiv_v2i32_mixed_pow2k_denom(
6299; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0
6300; CHECK-NEXT:    [[TMP2:%.*]] = sdiv i32 [[TMP1]], 4096
6301; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i64 0
6302; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1
6303; CHECK-NEXT:    [[TMP5:%.*]] = sdiv i32 [[TMP4]], 4095
6304; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1
6305; CHECK-NEXT:    store <2 x i32> [[TMP6]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8
6306; CHECK-NEXT:    ret void
6307;
6308; GFX6-LABEL: ssdiv_v2i32_mixed_pow2k_denom:
6309; GFX6:       ; %bb.0:
6310; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
6311; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
6312; GFX6-NEXT:    v_mov_b32_e32 v0, 0x80080081
6313; GFX6-NEXT:    s_mov_b32 s3, 0xf000
6314; GFX6-NEXT:    s_mov_b32 s2, -1
6315; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
6316; GFX6-NEXT:    v_mul_hi_i32 v0, s5, v0
6317; GFX6-NEXT:    s_ashr_i32 s6, s4, 31
6318; GFX6-NEXT:    s_lshr_b32 s6, s6, 20
6319; GFX6-NEXT:    s_add_i32 s4, s4, s6
6320; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s5, v0
6321; GFX6-NEXT:    s_ashr_i32 s4, s4, 12
6322; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 31, v0
6323; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 11, v0
6324; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v0
6325; GFX6-NEXT:    v_mov_b32_e32 v0, s4
6326; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
6327; GFX6-NEXT:    s_endpgm
6328;
6329; GFX9-LABEL: ssdiv_v2i32_mixed_pow2k_denom:
6330; GFX9:       ; %bb.0:
6331; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
6332; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
6333; GFX9-NEXT:    v_mov_b32_e32 v2, 0
6334; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
6335; GFX9-NEXT:    s_ashr_i32 s0, s2, 31
6336; GFX9-NEXT:    s_mul_hi_i32 s1, s3, 0x80080081
6337; GFX9-NEXT:    s_lshr_b32 s0, s0, 20
6338; GFX9-NEXT:    s_add_i32 s1, s1, s3
6339; GFX9-NEXT:    s_add_i32 s0, s2, s0
6340; GFX9-NEXT:    s_lshr_b32 s2, s1, 31
6341; GFX9-NEXT:    s_ashr_i32 s1, s1, 11
6342; GFX9-NEXT:    s_ashr_i32 s0, s0, 12
6343; GFX9-NEXT:    s_add_i32 s1, s1, s2
6344; GFX9-NEXT:    v_mov_b32_e32 v0, s0
6345; GFX9-NEXT:    v_mov_b32_e32 v1, s1
6346; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
6347; GFX9-NEXT:    s_endpgm
6348  %r = sdiv <2 x i32> %x, <i32 4096, i32 4095>
6349  store <2 x i32> %r, <2 x i32> addrspace(1)* %out
6350  ret void
6351}
6352
; <2 x i32> signed division of %x by (<4096, 4096> << %y), i.e. per-lane
; denominators that are only known at run time; the quotient vector is stored
; to %out.
; Expected output, per the assertions below:
;   - opt run (amdgpu-codegenprepare): the vector sdiv is scalarized and each
;     element is expanded inline in IR. Per element: operand signs are taken
;     with ashr 31 and xor-ed to form the result sign, both operands are made
;     non-negative (add / xor with their sign), the denominator goes through
;     uitofp, llvm.amdgcn.rcp.f32, an fmul by 0x41EFFFFFC0000000 and fptoui,
;     the estimate is refined through 64-bit multiplies (zext / mul / lshr 32 /
;     trunc), the quotient gets two icmp uge + select corrections, and the
;     sign is re-applied with xor / sub.
;   - GFX6 and GFX9 llc runs: the same sequence as machine code, with the two
;     lanes interleaved and written with a single dwordx2 store.
6353define amdgpu_kernel void @sdiv_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x, <2 x i32> %y) {
6354; CHECK-LABEL: @sdiv_v2i32_pow2_shl_denom(
6355; CHECK-NEXT:    [[SHL_Y:%.*]] = shl <2 x i32> <i32 4096, i32 4096>, [[Y:%.*]]
6356; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0
6357; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 0
6358; CHECK-NEXT:    [[TMP3:%.*]] = ashr i32 [[TMP1]], 31
6359; CHECK-NEXT:    [[TMP4:%.*]] = ashr i32 [[TMP2]], 31
6360; CHECK-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]]
6361; CHECK-NEXT:    [[TMP6:%.*]] = add i32 [[TMP1]], [[TMP3]]
6362; CHECK-NEXT:    [[TMP7:%.*]] = add i32 [[TMP2]], [[TMP4]]
6363; CHECK-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP3]]
6364; CHECK-NEXT:    [[TMP9:%.*]] = xor i32 [[TMP7]], [[TMP4]]
6365; CHECK-NEXT:    [[TMP10:%.*]] = uitofp i32 [[TMP9]] to float
6366; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP10]])
6367; CHECK-NEXT:    [[TMP12:%.*]] = fmul fast float [[TMP11]], 0x41EFFFFFC0000000
6368; CHECK-NEXT:    [[TMP13:%.*]] = fptoui float [[TMP12]] to i32
6369; CHECK-NEXT:    [[TMP14:%.*]] = sub i32 0, [[TMP9]]
6370; CHECK-NEXT:    [[TMP15:%.*]] = mul i32 [[TMP14]], [[TMP13]]
6371; CHECK-NEXT:    [[TMP16:%.*]] = zext i32 [[TMP13]] to i64
6372; CHECK-NEXT:    [[TMP17:%.*]] = zext i32 [[TMP15]] to i64
6373; CHECK-NEXT:    [[TMP18:%.*]] = mul i64 [[TMP16]], [[TMP17]]
6374; CHECK-NEXT:    [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
6375; CHECK-NEXT:    [[TMP20:%.*]] = lshr i64 [[TMP18]], 32
6376; CHECK-NEXT:    [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32
6377; CHECK-NEXT:    [[TMP22:%.*]] = add i32 [[TMP13]], [[TMP21]]
6378; CHECK-NEXT:    [[TMP23:%.*]] = zext i32 [[TMP8]] to i64
6379; CHECK-NEXT:    [[TMP24:%.*]] = zext i32 [[TMP22]] to i64
6380; CHECK-NEXT:    [[TMP25:%.*]] = mul i64 [[TMP23]], [[TMP24]]
6381; CHECK-NEXT:    [[TMP26:%.*]] = trunc i64 [[TMP25]] to i32
6382; CHECK-NEXT:    [[TMP27:%.*]] = lshr i64 [[TMP25]], 32
6383; CHECK-NEXT:    [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32
6384; CHECK-NEXT:    [[TMP29:%.*]] = mul i32 [[TMP28]], [[TMP9]]
6385; CHECK-NEXT:    [[TMP30:%.*]] = sub i32 [[TMP8]], [[TMP29]]
6386; CHECK-NEXT:    [[TMP31:%.*]] = icmp uge i32 [[TMP30]], [[TMP9]]
6387; CHECK-NEXT:    [[TMP32:%.*]] = add i32 [[TMP28]], 1
6388; CHECK-NEXT:    [[TMP33:%.*]] = select i1 [[TMP31]], i32 [[TMP32]], i32 [[TMP28]]
6389; CHECK-NEXT:    [[TMP34:%.*]] = sub i32 [[TMP30]], [[TMP9]]
6390; CHECK-NEXT:    [[TMP35:%.*]] = select i1 [[TMP31]], i32 [[TMP34]], i32 [[TMP30]]
6391; CHECK-NEXT:    [[TMP36:%.*]] = icmp uge i32 [[TMP35]], [[TMP9]]
6392; CHECK-NEXT:    [[TMP37:%.*]] = add i32 [[TMP33]], 1
6393; CHECK-NEXT:    [[TMP38:%.*]] = select i1 [[TMP36]], i32 [[TMP37]], i32 [[TMP33]]
6394; CHECK-NEXT:    [[TMP39:%.*]] = xor i32 [[TMP38]], [[TMP5]]
6395; CHECK-NEXT:    [[TMP40:%.*]] = sub i32 [[TMP39]], [[TMP5]]
6396; CHECK-NEXT:    [[TMP41:%.*]] = insertelement <2 x i32> undef, i32 [[TMP40]], i64 0
6397; CHECK-NEXT:    [[TMP42:%.*]] = extractelement <2 x i32> [[X]], i64 1
6398; CHECK-NEXT:    [[TMP43:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 1
6399; CHECK-NEXT:    [[TMP44:%.*]] = ashr i32 [[TMP42]], 31
6400; CHECK-NEXT:    [[TMP45:%.*]] = ashr i32 [[TMP43]], 31
6401; CHECK-NEXT:    [[TMP46:%.*]] = xor i32 [[TMP44]], [[TMP45]]
6402; CHECK-NEXT:    [[TMP47:%.*]] = add i32 [[TMP42]], [[TMP44]]
6403; CHECK-NEXT:    [[TMP48:%.*]] = add i32 [[TMP43]], [[TMP45]]
6404; CHECK-NEXT:    [[TMP49:%.*]] = xor i32 [[TMP47]], [[TMP44]]
6405; CHECK-NEXT:    [[TMP50:%.*]] = xor i32 [[TMP48]], [[TMP45]]
6406; CHECK-NEXT:    [[TMP51:%.*]] = uitofp i32 [[TMP50]] to float
6407; CHECK-NEXT:    [[TMP52:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP51]])
6408; CHECK-NEXT:    [[TMP53:%.*]] = fmul fast float [[TMP52]], 0x41EFFFFFC0000000
6409; CHECK-NEXT:    [[TMP54:%.*]] = fptoui float [[TMP53]] to i32
6410; CHECK-NEXT:    [[TMP55:%.*]] = sub i32 0, [[TMP50]]
6411; CHECK-NEXT:    [[TMP56:%.*]] = mul i32 [[TMP55]], [[TMP54]]
6412; CHECK-NEXT:    [[TMP57:%.*]] = zext i32 [[TMP54]] to i64
6413; CHECK-NEXT:    [[TMP58:%.*]] = zext i32 [[TMP56]] to i64
6414; CHECK-NEXT:    [[TMP59:%.*]] = mul i64 [[TMP57]], [[TMP58]]
6415; CHECK-NEXT:    [[TMP60:%.*]] = trunc i64 [[TMP59]] to i32
6416; CHECK-NEXT:    [[TMP61:%.*]] = lshr i64 [[TMP59]], 32
6417; CHECK-NEXT:    [[TMP62:%.*]] = trunc i64 [[TMP61]] to i32
6418; CHECK-NEXT:    [[TMP63:%.*]] = add i32 [[TMP54]], [[TMP62]]
6419; CHECK-NEXT:    [[TMP64:%.*]] = zext i32 [[TMP49]] to i64
6420; CHECK-NEXT:    [[TMP65:%.*]] = zext i32 [[TMP63]] to i64
6421; CHECK-NEXT:    [[TMP66:%.*]] = mul i64 [[TMP64]], [[TMP65]]
6422; CHECK-NEXT:    [[TMP67:%.*]] = trunc i64 [[TMP66]] to i32
6423; CHECK-NEXT:    [[TMP68:%.*]] = lshr i64 [[TMP66]], 32
6424; CHECK-NEXT:    [[TMP69:%.*]] = trunc i64 [[TMP68]] to i32
6425; CHECK-NEXT:    [[TMP70:%.*]] = mul i32 [[TMP69]], [[TMP50]]
6426; CHECK-NEXT:    [[TMP71:%.*]] = sub i32 [[TMP49]], [[TMP70]]
6427; CHECK-NEXT:    [[TMP72:%.*]] = icmp uge i32 [[TMP71]], [[TMP50]]
6428; CHECK-NEXT:    [[TMP73:%.*]] = add i32 [[TMP69]], 1
6429; CHECK-NEXT:    [[TMP74:%.*]] = select i1 [[TMP72]], i32 [[TMP73]], i32 [[TMP69]]
6430; CHECK-NEXT:    [[TMP75:%.*]] = sub i32 [[TMP71]], [[TMP50]]
6431; CHECK-NEXT:    [[TMP76:%.*]] = select i1 [[TMP72]], i32 [[TMP75]], i32 [[TMP71]]
6432; CHECK-NEXT:    [[TMP77:%.*]] = icmp uge i32 [[TMP76]], [[TMP50]]
6433; CHECK-NEXT:    [[TMP78:%.*]] = add i32 [[TMP74]], 1
6434; CHECK-NEXT:    [[TMP79:%.*]] = select i1 [[TMP77]], i32 [[TMP78]], i32 [[TMP74]]
6435; CHECK-NEXT:    [[TMP80:%.*]] = xor i32 [[TMP79]], [[TMP46]]
6436; CHECK-NEXT:    [[TMP81:%.*]] = sub i32 [[TMP80]], [[TMP46]]
6437; CHECK-NEXT:    [[TMP82:%.*]] = insertelement <2 x i32> [[TMP41]], i32 [[TMP81]], i64 1
6438; CHECK-NEXT:    store <2 x i32> [[TMP82]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8
6439; CHECK-NEXT:    ret void
6440;
6441; GFX6-LABEL: sdiv_v2i32_pow2_shl_denom:
6442; GFX6:       ; %bb.0:
6443; GFX6-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0xb
6444; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
6445; GFX6-NEXT:    s_mov_b32 s7, 0xf000
6446; GFX6-NEXT:    s_mov_b32 s6, -1
6447; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
6448; GFX6-NEXT:    s_lshl_b32 s0, 0x1000, s10
6449; GFX6-NEXT:    s_ashr_i32 s1, s0, 31
6450; GFX6-NEXT:    s_add_i32 s0, s0, s1
6451; GFX6-NEXT:    s_xor_b32 s2, s0, s1
6452; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s2
6453; GFX6-NEXT:    s_lshl_b32 s0, 0x1000, s11
6454; GFX6-NEXT:    s_ashr_i32 s3, s0, 31
6455; GFX6-NEXT:    s_add_i32 s0, s0, s3
6456; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
6457; GFX6-NEXT:    s_sub_i32 s11, 0, s2
6458; GFX6-NEXT:    s_xor_b32 s10, s0, s3
6459; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s10
6460; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
6461; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
6462; GFX6-NEXT:    s_ashr_i32 s0, s8, 31
6463; GFX6-NEXT:    v_rcp_iflag_f32_e32 v1, v1
6464; GFX6-NEXT:    s_add_i32 s8, s8, s0
6465; GFX6-NEXT:    v_mul_lo_u32 v2, s11, v0
6466; GFX6-NEXT:    s_xor_b32 s8, s8, s0
6467; GFX6-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
6468; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
6469; GFX6-NEXT:    v_mul_hi_u32 v2, v0, v2
6470; GFX6-NEXT:    s_xor_b32 s11, s0, s1
6471; GFX6-NEXT:    s_sub_i32 s0, 0, s10
6472; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
6473; GFX6-NEXT:    v_mul_hi_u32 v0, s8, v0
6474; GFX6-NEXT:    v_mul_lo_u32 v2, s0, v1
6475; GFX6-NEXT:    v_mul_lo_u32 v3, v0, s2
6476; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 1, v0
6477; GFX6-NEXT:    v_mul_hi_u32 v2, v1, v2
6478; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s8, v3
6479; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s2, v3
6480; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[0:1]
6481; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s2, v3
6482; GFX6-NEXT:    v_cndmask_b32_e64 v3, v3, v4, s[0:1]
6483; GFX6-NEXT:    s_ashr_i32 s0, s9, 31
6484; GFX6-NEXT:    s_add_i32 s1, s9, s0
6485; GFX6-NEXT:    s_xor_b32 s1, s1, s0
6486; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
6487; GFX6-NEXT:    v_mul_hi_u32 v1, s1, v1
6488; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 1, v0
6489; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s2, v3
6490; GFX6-NEXT:    v_mul_lo_u32 v2, v1, s10
6491; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
6492; GFX6-NEXT:    s_xor_b32 s2, s0, s3
6493; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 1, v1
6494; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s1, v2
6495; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s10, v2
6496; GFX6-NEXT:    v_xor_b32_e32 v0, s11, v0
6497; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
6498; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s10, v2
6499; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s11, v0
6500; GFX6-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
6501; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 1, v1
6502; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s10, v2
6503; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
6504; GFX6-NEXT:    v_xor_b32_e32 v1, s2, v1
6505; GFX6-NEXT:    v_subrev_i32_e32 v1, vcc, s2, v1
6506; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
6507; GFX6-NEXT:    s_endpgm
6508;
6509; GFX9-LABEL: sdiv_v2i32_pow2_shl_denom:
6510; GFX9:       ; %bb.0:
6511; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
6512; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
6513; GFX9-NEXT:    v_mov_b32_e32 v2, 0
6514; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
6515; GFX9-NEXT:    s_lshl_b32 s0, 0x1000, s6
6516; GFX9-NEXT:    s_ashr_i32 s1, s0, 31
6517; GFX9-NEXT:    s_add_i32 s0, s0, s1
6518; GFX9-NEXT:    s_xor_b32 s0, s0, s1
6519; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s0
6520; GFX9-NEXT:    s_lshl_b32 s6, 0x1000, s7
6521; GFX9-NEXT:    s_ashr_i32 s8, s6, 31
6522; GFX9-NEXT:    s_add_i32 s6, s6, s8
6523; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
6524; GFX9-NEXT:    s_xor_b32 s6, s6, s8
6525; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s6
6526; GFX9-NEXT:    s_sub_i32 s10, 0, s0
6527; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
6528; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
6529; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v1
6530; GFX9-NEXT:    s_ashr_i32 s7, s4, 31
6531; GFX9-NEXT:    s_add_i32 s4, s4, s7
6532; GFX9-NEXT:    v_mul_lo_u32 v3, s10, v0
6533; GFX9-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
6534; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
6535; GFX9-NEXT:    s_sub_i32 s10, 0, s6
6536; GFX9-NEXT:    v_mul_hi_u32 v3, v0, v3
6537; GFX9-NEXT:    s_xor_b32 s4, s4, s7
6538; GFX9-NEXT:    v_mul_lo_u32 v4, s10, v1
6539; GFX9-NEXT:    s_ashr_i32 s9, s5, 31
6540; GFX9-NEXT:    v_add_u32_e32 v0, v0, v3
6541; GFX9-NEXT:    v_mul_hi_u32 v0, s4, v0
6542; GFX9-NEXT:    v_mul_hi_u32 v3, v1, v4
6543; GFX9-NEXT:    s_add_i32 s5, s5, s9
6544; GFX9-NEXT:    s_xor_b32 s5, s5, s9
6545; GFX9-NEXT:    v_mul_lo_u32 v4, v0, s0
6546; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
6547; GFX9-NEXT:    v_mul_hi_u32 v1, s5, v1
6548; GFX9-NEXT:    v_add_u32_e32 v3, 1, v0
6549; GFX9-NEXT:    v_sub_u32_e32 v4, s4, v4
6550; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s0, v4
6551; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
6552; GFX9-NEXT:    v_subrev_u32_e32 v3, s0, v4
6553; GFX9-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
6554; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s0, v3
6555; GFX9-NEXT:    v_mul_lo_u32 v3, v1, s6
6556; GFX9-NEXT:    v_add_u32_e32 v4, 1, v0
6557; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
6558; GFX9-NEXT:    v_add_u32_e32 v4, 1, v1
6559; GFX9-NEXT:    v_sub_u32_e32 v3, s5, v3
6560; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s6, v3
6561; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
6562; GFX9-NEXT:    v_subrev_u32_e32 v4, s6, v3
6563; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
6564; GFX9-NEXT:    v_add_u32_e32 v4, 1, v1
6565; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s6, v3
6566; GFX9-NEXT:    s_xor_b32 s1, s7, s1
6567; GFX9-NEXT:    s_xor_b32 s0, s9, s8
6568; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
6569; GFX9-NEXT:    v_xor_b32_e32 v0, s1, v0
6570; GFX9-NEXT:    v_xor_b32_e32 v1, s0, v1
6571; GFX9-NEXT:    v_subrev_u32_e32 v0, s1, v0
6572; GFX9-NEXT:    v_subrev_u32_e32 v1, s0, v1
6573; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
6574; GFX9-NEXT:    s_endpgm
6575  %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y
6576  %r = sdiv <2 x i32> %x, %shl.y
6577  store <2 x i32> %r, <2 x i32> addrspace(1)* %out
6578  ret void
6579}
6580
; Scalar signed remainder of %x by the odd constant 1235195 (0x12d8fb); the
; remainder is stored to %out.
; Expected output, per the assertions below:
;   - opt run (amdgpu-codegenprepare): the srem is left untouched.
;   - GFX6 and GFX9 llc runs: the quotient is formed first (signed
;     multiply-high by 0xd9528441, add of the dividend, sign-bit extract,
;     arithmetic shift by 20, add), then multiplied by 0x12d8fb and subtracted
;     from the dividend.
6581define amdgpu_kernel void @srem_i32_oddk_denom(i32 addrspace(1)* %out, i32 %x) {
6582; CHECK-LABEL: @srem_i32_oddk_denom(
6583; CHECK-NEXT:    [[R:%.*]] = srem i32 [[X:%.*]], 1235195
6584; CHECK-NEXT:    store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4
6585; CHECK-NEXT:    ret void
6586;
6587; GFX6-LABEL: srem_i32_oddk_denom:
6588; GFX6:       ; %bb.0:
6589; GFX6-NEXT:    s_load_dword s4, s[0:1], 0xb
6590; GFX6-NEXT:    v_mov_b32_e32 v0, 0xd9528441
6591; GFX6-NEXT:    s_mov_b32 s2, 0x12d8fb
6592; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
6593; GFX6-NEXT:    s_mov_b32 s3, 0xf000
6594; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
6595; GFX6-NEXT:    v_mul_hi_i32 v0, s4, v0
6596; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s4, v0
6597; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 31, v0
6598; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 20, v0
6599; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
6600; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s2
6601; GFX6-NEXT:    s_mov_b32 s2, -1
6602; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
6603; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
6604; GFX6-NEXT:    s_endpgm
6605;
6606; GFX9-LABEL: srem_i32_oddk_denom:
6607; GFX9:       ; %bb.0:
6608; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x2c
6609; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
6610; GFX9-NEXT:    v_mov_b32_e32 v0, 0
6611; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
6612; GFX9-NEXT:    s_mul_hi_i32 s0, s4, 0xd9528441
6613; GFX9-NEXT:    s_add_i32 s0, s0, s4
6614; GFX9-NEXT:    s_lshr_b32 s1, s0, 31
6615; GFX9-NEXT:    s_ashr_i32 s0, s0, 20
6616; GFX9-NEXT:    s_add_i32 s0, s0, s1
6617; GFX9-NEXT:    s_mul_i32 s0, s0, 0x12d8fb
6618; GFX9-NEXT:    s_sub_i32 s0, s4, s0
6619; GFX9-NEXT:    v_mov_b32_e32 v1, s0
6620; GFX9-NEXT:    global_store_dword v0, v1, s[2:3]
6621; GFX9-NEXT:    s_endpgm
6622  %r = srem i32 %x, 1235195
6623  store i32 %r, i32 addrspace(1)* %out
6624  ret void
6625}
6626
; Scalar signed remainder of %x by the constant power of two 4096; the
; remainder is stored to %out.
; Expected output, per the assertions below:
;   - opt run (amdgpu-codegenprepare): the srem is left untouched.
;   - GFX6 and GFX9 llc runs: a pure scalar sequence. The dividend is biased by
;     its sign shifted right by 20 (4095 for negative dividends, 0 otherwise),
;     the low 12 bits are cleared with an and by 0xfffff000, and that value is
;     subtracted from the original dividend.
6627define amdgpu_kernel void @srem_i32_pow2k_denom(i32 addrspace(1)* %out, i32 %x) {
6628; CHECK-LABEL: @srem_i32_pow2k_denom(
6629; CHECK-NEXT:    [[R:%.*]] = srem i32 [[X:%.*]], 4096
6630; CHECK-NEXT:    store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4
6631; CHECK-NEXT:    ret void
6632;
6633; GFX6-LABEL: srem_i32_pow2k_denom:
6634; GFX6:       ; %bb.0:
6635; GFX6-NEXT:    s_load_dword s4, s[0:1], 0xb
6636; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
6637; GFX6-NEXT:    s_mov_b32 s3, 0xf000
6638; GFX6-NEXT:    s_mov_b32 s2, -1
6639; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
6640; GFX6-NEXT:    s_ashr_i32 s5, s4, 31
6641; GFX6-NEXT:    s_lshr_b32 s5, s5, 20
6642; GFX6-NEXT:    s_add_i32 s5, s4, s5
6643; GFX6-NEXT:    s_and_b32 s5, s5, 0xfffff000
6644; GFX6-NEXT:    s_sub_i32 s4, s4, s5
6645; GFX6-NEXT:    v_mov_b32_e32 v0, s4
6646; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
6647; GFX6-NEXT:    s_endpgm
6648;
6649; GFX9-LABEL: srem_i32_pow2k_denom:
6650; GFX9:       ; %bb.0:
6651; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x2c
6652; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
6653; GFX9-NEXT:    v_mov_b32_e32 v0, 0
6654; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
6655; GFX9-NEXT:    s_ashr_i32 s0, s4, 31
6656; GFX9-NEXT:    s_lshr_b32 s0, s0, 20
6657; GFX9-NEXT:    s_add_i32 s0, s4, s0
6658; GFX9-NEXT:    s_and_b32 s0, s0, 0xfffff000
6659; GFX9-NEXT:    s_sub_i32 s0, s4, s0
6660; GFX9-NEXT:    v_mov_b32_e32 v1, s0
6661; GFX9-NEXT:    global_store_dword v0, v1, s[2:3]
6662; GFX9-NEXT:    s_endpgm
6663  %r = srem i32 %x, 4096
6664  store i32 %r, i32 addrspace(1)* %out
6665  ret void
6666}
6667
; Scalar signed remainder of %x by (4096 << %y), i.e. a denominator that is
; only known at run time; the remainder is stored to %out.
; Expected output, per the assertions below:
;   - opt run (amdgpu-codegenprepare): the shl and the srem are left untouched.
;   - GFX6 and GFX9 llc runs: a full 32-bit remainder expansion. Both operands
;     are made non-negative (ashr 31 / add / xor), a float reciprocal of the
;     denominator (v_rcp_iflag_f32 scaled by 0x4f7ffffe) is converted back to
;     an integer and refined with a mul_lo / mul_hi / add step, the quotient
;     estimate times the denominator is subtracted from the dividend, the
;     remainder gets two compare-and-select conditional subtractions of the
;     denominator, and the dividend's sign is re-applied with xor / subtract.
6668define amdgpu_kernel void @srem_i32_pow2_shl_denom(i32 addrspace(1)* %out, i32 %x, i32 %y) {
6669; CHECK-LABEL: @srem_i32_pow2_shl_denom(
6670; CHECK-NEXT:    [[SHL_Y:%.*]] = shl i32 4096, [[Y:%.*]]
6671; CHECK-NEXT:    [[R:%.*]] = srem i32 [[X:%.*]], [[SHL_Y]]
6672; CHECK-NEXT:    store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4
6673; CHECK-NEXT:    ret void
6674;
6675; GFX6-LABEL: srem_i32_pow2_shl_denom:
6676; GFX6:       ; %bb.0:
6677; GFX6-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
6678; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
6679; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
6680; GFX6-NEXT:    s_lshl_b32 s3, 0x1000, s3
6681; GFX6-NEXT:    s_ashr_i32 s4, s3, 31
6682; GFX6-NEXT:    s_add_i32 s3, s3, s4
6683; GFX6-NEXT:    s_xor_b32 s4, s3, s4
6684; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s4
6685; GFX6-NEXT:    s_sub_i32 s3, 0, s4
6686; GFX6-NEXT:    s_ashr_i32 s5, s2, 31
6687; GFX6-NEXT:    s_add_i32 s2, s2, s5
6688; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
6689; GFX6-NEXT:    s_xor_b32 s6, s2, s5
6690; GFX6-NEXT:    s_mov_b32 s2, -1
6691; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
6692; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
6693; GFX6-NEXT:    v_mul_lo_u32 v1, s3, v0
6694; GFX6-NEXT:    s_mov_b32 s3, 0xf000
6695; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
6696; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
6697; GFX6-NEXT:    v_mul_hi_u32 v0, s6, v0
6698; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s4
6699; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s6, v0
6700; GFX6-NEXT:    v_subrev_i32_e32 v1, vcc, s4, v0
6701; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s4, v0
6702; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
6703; GFX6-NEXT:    v_subrev_i32_e32 v1, vcc, s4, v0
6704; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s4, v0
6705; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
6706; GFX6-NEXT:    v_xor_b32_e32 v0, s5, v0
6707; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s5, v0
6708; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
6709; GFX6-NEXT:    s_endpgm
6710;
6711; GFX9-LABEL: srem_i32_pow2_shl_denom:
6712; GFX9:       ; %bb.0:
6713; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
6714; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
6715; GFX9-NEXT:    s_lshl_b32 s3, 0x1000, s3
6716; GFX9-NEXT:    s_ashr_i32 s4, s3, 31
6717; GFX9-NEXT:    s_add_i32 s3, s3, s4
6718; GFX9-NEXT:    s_xor_b32 s3, s3, s4
6719; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s3
6720; GFX9-NEXT:    s_sub_i32 s4, 0, s3
6721; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
6722; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
6723; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
6724; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
6725; GFX9-NEXT:    v_mul_lo_u32 v1, s4, v0
6726; GFX9-NEXT:    s_ashr_i32 s4, s2, 31
6727; GFX9-NEXT:    s_add_i32 s2, s2, s4
6728; GFX9-NEXT:    s_xor_b32 s2, s2, s4
6729; GFX9-NEXT:    v_mul_hi_u32 v1, v0, v1
6730; GFX9-NEXT:    v_add_u32_e32 v0, v0, v1
6731; GFX9-NEXT:    v_mul_hi_u32 v0, s2, v0
6732; GFX9-NEXT:    v_mov_b32_e32 v1, 0
6733; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s3
6734; GFX9-NEXT:    v_sub_u32_e32 v0, s2, v0
6735; GFX9-NEXT:    v_subrev_u32_e32 v2, s3, v0
6736; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v0
6737; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
6738; GFX9-NEXT:    v_subrev_u32_e32 v2, s3, v0
6739; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v0
6740; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
6741; GFX9-NEXT:    v_xor_b32_e32 v0, s4, v0
6742; GFX9-NEXT:    v_subrev_u32_e32 v0, s4, v0
6743; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
6744; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
6745; GFX9-NEXT:    s_endpgm
6746  %shl.y = shl i32 4096, %y
6747  %r = srem i32 %x, %shl.y
6748  store i32 %r, i32 addrspace(1)* %out
6749  ret void
6750}
6751
; Signed remainder of a <2 x i32> by the splat constant 4096.
; The opt (CHECK) lines below expect the vector op to be split into one
; `srem i32 ..., 4096` per element, with no reciprocal-based expansion.
; The GFX6 and GFX9 lines expect each element to be lowered to a scalar
; ashr/lshr/add/and/sub sequence (mask 0xfffff000) instead of a division.
6752define amdgpu_kernel void @srem_v2i32_pow2k_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x) {
6753; CHECK-LABEL: @srem_v2i32_pow2k_denom(
6754; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0
6755; CHECK-NEXT:    [[TMP2:%.*]] = srem i32 [[TMP1]], 4096
6756; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i64 0
6757; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1
6758; CHECK-NEXT:    [[TMP5:%.*]] = srem i32 [[TMP4]], 4096
6759; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1
6760; CHECK-NEXT:    store <2 x i32> [[TMP6]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8
6761; CHECK-NEXT:    ret void
6762;
6763; GFX6-LABEL: srem_v2i32_pow2k_denom:
6764; GFX6:       ; %bb.0:
6765; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
6766; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
6767; GFX6-NEXT:    s_mov_b32 s3, 0xf000
6768; GFX6-NEXT:    s_mov_b32 s2, -1
6769; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
6770; GFX6-NEXT:    s_ashr_i32 s6, s4, 31
6771; GFX6-NEXT:    s_lshr_b32 s6, s6, 20
6772; GFX6-NEXT:    s_add_i32 s6, s4, s6
6773; GFX6-NEXT:    s_ashr_i32 s7, s5, 31
6774; GFX6-NEXT:    s_and_b32 s6, s6, 0xfffff000
6775; GFX6-NEXT:    s_sub_i32 s4, s4, s6
6776; GFX6-NEXT:    s_lshr_b32 s6, s7, 20
6777; GFX6-NEXT:    s_add_i32 s6, s5, s6
6778; GFX6-NEXT:    s_and_b32 s6, s6, 0xfffff000
6779; GFX6-NEXT:    s_sub_i32 s5, s5, s6
6780; GFX6-NEXT:    v_mov_b32_e32 v0, s4
6781; GFX6-NEXT:    v_mov_b32_e32 v1, s5
6782; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
6783; GFX6-NEXT:    s_endpgm
6784;
6785; GFX9-LABEL: srem_v2i32_pow2k_denom:
6786; GFX9:       ; %bb.0:
6787; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
6788; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
6789; GFX9-NEXT:    v_mov_b32_e32 v2, 0
6790; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
6791; GFX9-NEXT:    s_ashr_i32 s0, s2, 31
6792; GFX9-NEXT:    s_ashr_i32 s1, s3, 31
6793; GFX9-NEXT:    s_lshr_b32 s0, s0, 20
6794; GFX9-NEXT:    s_lshr_b32 s1, s1, 20
6795; GFX9-NEXT:    s_add_i32 s0, s2, s0
6796; GFX9-NEXT:    s_add_i32 s1, s3, s1
6797; GFX9-NEXT:    s_and_b32 s0, s0, 0xfffff000
6798; GFX9-NEXT:    s_and_b32 s1, s1, 0xfffff000
6799; GFX9-NEXT:    s_sub_i32 s0, s2, s0
6800; GFX9-NEXT:    s_sub_i32 s1, s3, s1
6801; GFX9-NEXT:    v_mov_b32_e32 v0, s0
6802; GFX9-NEXT:    v_mov_b32_e32 v1, s1
6803; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
6804; GFX9-NEXT:    s_endpgm
6805  %r = srem <2 x i32> %x, <i32 4096, i32 4096>
6806  store <2 x i32> %r, <2 x i32> addrspace(1)* %out
6807  ret void
6808}
6809
; Signed remainder of a <2 x i32> by a non-constant denominator computed as
; `shl <4096, 4096>, %y`.
; The opt (CHECK) lines below expect each element to be expanded separately:
; sign masks via `ashr 31`, absolute values via add/xor, a float reciprocal
; estimate (uitofp / llvm.amdgcn.rcp.f32 / fmul / fptoui) refined by one
; mul + high-half-multiply + add step, remainder = x - q * y, two
; compare/subtract/select corrections, and a final xor/sub with the sign mask
; of the numerator.
; The GFX6 and GFX9 lines expect the matching per-lane ISA (v_rcp_iflag_f32,
; v_mul_hi_u32, and two v_cmp_le_u32 + v_cndmask_b32 steps per element).
6810define amdgpu_kernel void @srem_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x, <2 x i32> %y) {
6811; CHECK-LABEL: @srem_v2i32_pow2_shl_denom(
6812; CHECK-NEXT:    [[SHL_Y:%.*]] = shl <2 x i32> <i32 4096, i32 4096>, [[Y:%.*]]
6813; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0
6814; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 0
6815; CHECK-NEXT:    [[TMP3:%.*]] = ashr i32 [[TMP1]], 31
6816; CHECK-NEXT:    [[TMP4:%.*]] = ashr i32 [[TMP2]], 31
6817; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[TMP1]], [[TMP3]]
6818; CHECK-NEXT:    [[TMP6:%.*]] = add i32 [[TMP2]], [[TMP4]]
6819; CHECK-NEXT:    [[TMP7:%.*]] = xor i32 [[TMP5]], [[TMP3]]
6820; CHECK-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP4]]
6821; CHECK-NEXT:    [[TMP9:%.*]] = uitofp i32 [[TMP8]] to float
6822; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]])
6823; CHECK-NEXT:    [[TMP11:%.*]] = fmul fast float [[TMP10]], 0x41EFFFFFC0000000
6824; CHECK-NEXT:    [[TMP12:%.*]] = fptoui float [[TMP11]] to i32
6825; CHECK-NEXT:    [[TMP13:%.*]] = sub i32 0, [[TMP8]]
6826; CHECK-NEXT:    [[TMP14:%.*]] = mul i32 [[TMP13]], [[TMP12]]
6827; CHECK-NEXT:    [[TMP15:%.*]] = zext i32 [[TMP12]] to i64
6828; CHECK-NEXT:    [[TMP16:%.*]] = zext i32 [[TMP14]] to i64
6829; CHECK-NEXT:    [[TMP17:%.*]] = mul i64 [[TMP15]], [[TMP16]]
6830; CHECK-NEXT:    [[TMP18:%.*]] = trunc i64 [[TMP17]] to i32
6831; CHECK-NEXT:    [[TMP19:%.*]] = lshr i64 [[TMP17]], 32
6832; CHECK-NEXT:    [[TMP20:%.*]] = trunc i64 [[TMP19]] to i32
6833; CHECK-NEXT:    [[TMP21:%.*]] = add i32 [[TMP12]], [[TMP20]]
6834; CHECK-NEXT:    [[TMP22:%.*]] = zext i32 [[TMP7]] to i64
6835; CHECK-NEXT:    [[TMP23:%.*]] = zext i32 [[TMP21]] to i64
6836; CHECK-NEXT:    [[TMP24:%.*]] = mul i64 [[TMP22]], [[TMP23]]
6837; CHECK-NEXT:    [[TMP25:%.*]] = trunc i64 [[TMP24]] to i32
6838; CHECK-NEXT:    [[TMP26:%.*]] = lshr i64 [[TMP24]], 32
6839; CHECK-NEXT:    [[TMP27:%.*]] = trunc i64 [[TMP26]] to i32
6840; CHECK-NEXT:    [[TMP28:%.*]] = mul i32 [[TMP27]], [[TMP8]]
6841; CHECK-NEXT:    [[TMP29:%.*]] = sub i32 [[TMP7]], [[TMP28]]
6842; CHECK-NEXT:    [[TMP30:%.*]] = icmp uge i32 [[TMP29]], [[TMP8]]
6843; CHECK-NEXT:    [[TMP31:%.*]] = sub i32 [[TMP29]], [[TMP8]]
6844; CHECK-NEXT:    [[TMP32:%.*]] = select i1 [[TMP30]], i32 [[TMP31]], i32 [[TMP29]]
6845; CHECK-NEXT:    [[TMP33:%.*]] = icmp uge i32 [[TMP32]], [[TMP8]]
6846; CHECK-NEXT:    [[TMP34:%.*]] = sub i32 [[TMP32]], [[TMP8]]
6847; CHECK-NEXT:    [[TMP35:%.*]] = select i1 [[TMP33]], i32 [[TMP34]], i32 [[TMP32]]
6848; CHECK-NEXT:    [[TMP36:%.*]] = xor i32 [[TMP35]], [[TMP3]]
6849; CHECK-NEXT:    [[TMP37:%.*]] = sub i32 [[TMP36]], [[TMP3]]
6850; CHECK-NEXT:    [[TMP38:%.*]] = insertelement <2 x i32> undef, i32 [[TMP37]], i64 0
6851; CHECK-NEXT:    [[TMP39:%.*]] = extractelement <2 x i32> [[X]], i64 1
6852; CHECK-NEXT:    [[TMP40:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 1
6853; CHECK-NEXT:    [[TMP41:%.*]] = ashr i32 [[TMP39]], 31
6854; CHECK-NEXT:    [[TMP42:%.*]] = ashr i32 [[TMP40]], 31
6855; CHECK-NEXT:    [[TMP43:%.*]] = add i32 [[TMP39]], [[TMP41]]
6856; CHECK-NEXT:    [[TMP44:%.*]] = add i32 [[TMP40]], [[TMP42]]
6857; CHECK-NEXT:    [[TMP45:%.*]] = xor i32 [[TMP43]], [[TMP41]]
6858; CHECK-NEXT:    [[TMP46:%.*]] = xor i32 [[TMP44]], [[TMP42]]
6859; CHECK-NEXT:    [[TMP47:%.*]] = uitofp i32 [[TMP46]] to float
6860; CHECK-NEXT:    [[TMP48:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP47]])
6861; CHECK-NEXT:    [[TMP49:%.*]] = fmul fast float [[TMP48]], 0x41EFFFFFC0000000
6862; CHECK-NEXT:    [[TMP50:%.*]] = fptoui float [[TMP49]] to i32
6863; CHECK-NEXT:    [[TMP51:%.*]] = sub i32 0, [[TMP46]]
6864; CHECK-NEXT:    [[TMP52:%.*]] = mul i32 [[TMP51]], [[TMP50]]
6865; CHECK-NEXT:    [[TMP53:%.*]] = zext i32 [[TMP50]] to i64
6866; CHECK-NEXT:    [[TMP54:%.*]] = zext i32 [[TMP52]] to i64
6867; CHECK-NEXT:    [[TMP55:%.*]] = mul i64 [[TMP53]], [[TMP54]]
6868; CHECK-NEXT:    [[TMP56:%.*]] = trunc i64 [[TMP55]] to i32
6869; CHECK-NEXT:    [[TMP57:%.*]] = lshr i64 [[TMP55]], 32
6870; CHECK-NEXT:    [[TMP58:%.*]] = trunc i64 [[TMP57]] to i32
6871; CHECK-NEXT:    [[TMP59:%.*]] = add i32 [[TMP50]], [[TMP58]]
6872; CHECK-NEXT:    [[TMP60:%.*]] = zext i32 [[TMP45]] to i64
6873; CHECK-NEXT:    [[TMP61:%.*]] = zext i32 [[TMP59]] to i64
6874; CHECK-NEXT:    [[TMP62:%.*]] = mul i64 [[TMP60]], [[TMP61]]
6875; CHECK-NEXT:    [[TMP63:%.*]] = trunc i64 [[TMP62]] to i32
6876; CHECK-NEXT:    [[TMP64:%.*]] = lshr i64 [[TMP62]], 32
6877; CHECK-NEXT:    [[TMP65:%.*]] = trunc i64 [[TMP64]] to i32
6878; CHECK-NEXT:    [[TMP66:%.*]] = mul i32 [[TMP65]], [[TMP46]]
6879; CHECK-NEXT:    [[TMP67:%.*]] = sub i32 [[TMP45]], [[TMP66]]
6880; CHECK-NEXT:    [[TMP68:%.*]] = icmp uge i32 [[TMP67]], [[TMP46]]
6881; CHECK-NEXT:    [[TMP69:%.*]] = sub i32 [[TMP67]], [[TMP46]]
6882; CHECK-NEXT:    [[TMP70:%.*]] = select i1 [[TMP68]], i32 [[TMP69]], i32 [[TMP67]]
6883; CHECK-NEXT:    [[TMP71:%.*]] = icmp uge i32 [[TMP70]], [[TMP46]]
6884; CHECK-NEXT:    [[TMP72:%.*]] = sub i32 [[TMP70]], [[TMP46]]
6885; CHECK-NEXT:    [[TMP73:%.*]] = select i1 [[TMP71]], i32 [[TMP72]], i32 [[TMP70]]
6886; CHECK-NEXT:    [[TMP74:%.*]] = xor i32 [[TMP73]], [[TMP41]]
6887; CHECK-NEXT:    [[TMP75:%.*]] = sub i32 [[TMP74]], [[TMP41]]
6888; CHECK-NEXT:    [[TMP76:%.*]] = insertelement <2 x i32> [[TMP38]], i32 [[TMP75]], i64 1
6889; CHECK-NEXT:    store <2 x i32> [[TMP76]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8
6890; CHECK-NEXT:    ret void
6891;
6892; GFX6-LABEL: srem_v2i32_pow2_shl_denom:
6893; GFX6:       ; %bb.0:
6894; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xb
6895; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
6896; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
6897; GFX6-NEXT:    s_lshl_b32 s2, 0x1000, s6
6898; GFX6-NEXT:    s_ashr_i32 s3, s2, 31
6899; GFX6-NEXT:    s_add_i32 s2, s2, s3
6900; GFX6-NEXT:    s_xor_b32 s6, s2, s3
6901; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s6
6902; GFX6-NEXT:    s_lshl_b32 s7, 0x1000, s7
6903; GFX6-NEXT:    s_ashr_i32 s8, s7, 31
6904; GFX6-NEXT:    s_add_i32 s7, s7, s8
6905; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
6906; GFX6-NEXT:    s_xor_b32 s7, s7, s8
6907; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s7
6908; GFX6-NEXT:    s_sub_i32 s9, 0, s6
6909; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
6910; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
6911; GFX6-NEXT:    v_rcp_iflag_f32_e32 v1, v1
6912; GFX6-NEXT:    s_ashr_i32 s8, s4, 31
6913; GFX6-NEXT:    s_add_i32 s4, s4, s8
6914; GFX6-NEXT:    v_mul_lo_u32 v2, s9, v0
6915; GFX6-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
6916; GFX6-NEXT:    s_xor_b32 s4, s4, s8
6917; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
6918; GFX6-NEXT:    v_mul_hi_u32 v2, v0, v2
6919; GFX6-NEXT:    s_sub_i32 s9, 0, s7
6920; GFX6-NEXT:    s_mov_b32 s3, 0xf000
6921; GFX6-NEXT:    s_mov_b32 s2, -1
6922; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
6923; GFX6-NEXT:    v_mul_hi_u32 v0, s4, v0
6924; GFX6-NEXT:    v_mul_lo_u32 v2, s9, v1
6925; GFX6-NEXT:    s_ashr_i32 s9, s5, 31
6926; GFX6-NEXT:    s_add_i32 s5, s5, s9
6927; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s6
6928; GFX6-NEXT:    v_mul_hi_u32 v2, v1, v2
6929; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
6930; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s6, v0
6931; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s6, v0
6932; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
6933; GFX6-NEXT:    s_xor_b32 s4, s5, s9
6934; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
6935; GFX6-NEXT:    v_mul_hi_u32 v1, s4, v1
6936; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s6, v0
6937; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s6, v0
6938; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s7
6939; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
6940; GFX6-NEXT:    v_xor_b32_e32 v0, s8, v0
6941; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s8, v0
6942; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s4, v1
6943; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s7, v1
6944; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s7, v1
6945; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
6946; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s7, v1
6947; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s7, v1
6948; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
6949; GFX6-NEXT:    v_xor_b32_e32 v1, s9, v1
6950; GFX6-NEXT:    v_subrev_i32_e32 v1, vcc, s9, v1
6951; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
6952; GFX6-NEXT:    s_endpgm
6953;
6954; GFX9-LABEL: srem_v2i32_pow2_shl_denom:
6955; GFX9:       ; %bb.0:
6956; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
6957; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
6958; GFX9-NEXT:    s_lshl_b32 s3, 0x1000, s6
6959; GFX9-NEXT:    s_ashr_i32 s6, s3, 31
6960; GFX9-NEXT:    s_add_i32 s3, s3, s6
6961; GFX9-NEXT:    s_lshl_b32 s2, 0x1000, s7
6962; GFX9-NEXT:    s_xor_b32 s3, s3, s6
6963; GFX9-NEXT:    s_ashr_i32 s7, s2, 31
6964; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s3
6965; GFX9-NEXT:    s_add_i32 s2, s2, s7
6966; GFX9-NEXT:    s_xor_b32 s2, s2, s7
6967; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s2
6968; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
6969; GFX9-NEXT:    s_sub_i32 s8, 0, s3
6970; GFX9-NEXT:    s_ashr_i32 s6, s4, 31
6971; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v1
6972; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
6973; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
6974; GFX9-NEXT:    s_add_i32 s4, s4, s6
6975; GFX9-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
6976; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
6977; GFX9-NEXT:    v_mul_lo_u32 v2, s8, v0
6978; GFX9-NEXT:    s_sub_i32 s8, 0, s2
6979; GFX9-NEXT:    s_xor_b32 s4, s4, s6
6980; GFX9-NEXT:    v_mul_lo_u32 v3, s8, v1
6981; GFX9-NEXT:    v_mul_hi_u32 v2, v0, v2
6982; GFX9-NEXT:    s_ashr_i32 s7, s5, 31
6983; GFX9-NEXT:    s_add_i32 s5, s5, s7
6984; GFX9-NEXT:    v_mul_hi_u32 v3, v1, v3
6985; GFX9-NEXT:    v_add_u32_e32 v0, v0, v2
6986; GFX9-NEXT:    v_mul_hi_u32 v0, s4, v0
6987; GFX9-NEXT:    s_xor_b32 s5, s5, s7
6988; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
6989; GFX9-NEXT:    v_mul_hi_u32 v1, s5, v1
6990; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s3
6991; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
6992; GFX9-NEXT:    v_mov_b32_e32 v2, 0
6993; GFX9-NEXT:    v_mul_lo_u32 v1, v1, s2
6994; GFX9-NEXT:    v_sub_u32_e32 v0, s4, v0
6995; GFX9-NEXT:    v_subrev_u32_e32 v3, s3, v0
6996; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v0
6997; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
6998; GFX9-NEXT:    v_subrev_u32_e32 v3, s3, v0
6999; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v0
7000; GFX9-NEXT:    v_sub_u32_e32 v1, s5, v1
7001; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
7002; GFX9-NEXT:    v_subrev_u32_e32 v3, s2, v1
7003; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s2, v1
7004; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
7005; GFX9-NEXT:    v_subrev_u32_e32 v3, s2, v1
7006; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s2, v1
7007; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
7008; GFX9-NEXT:    v_xor_b32_e32 v0, s6, v0
7009; GFX9-NEXT:    v_xor_b32_e32 v1, s7, v1
7010; GFX9-NEXT:    v_subrev_u32_e32 v0, s6, v0
7011; GFX9-NEXT:    v_subrev_u32_e32 v1, s7, v1
7012; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
7013; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
7014; GFX9-NEXT:    s_endpgm
7015  %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y
7016  %r = srem <2 x i32> %x, %shl.y
7017  store <2 x i32> %r, <2 x i32> addrspace(1)* %out
7018  ret void
7019}
7020
; Unsigned 64-bit divide by the odd constant 1235195949943 (0x11f976a7377).
; The opt (CHECK) lines below expect the `udiv i64` to be left untouched in IR.
; The GFX6 and GFX9 lines expect llc to emit an inline expansion: a
; v_rcp_f32-based reciprocal estimate, two rounds of 32-bit
; v_mul_lo/v_mul_hi products with carry chains, a multiply of the numerator
; by the refined estimate, and a final compare/select stage that picks among
; the quotient estimate, estimate + 1 and estimate + 2.
; The 0x11f and 0x976a7377 immediates in the expected ISA are the high and low
; 32-bit halves of the divisor.
7021define amdgpu_kernel void @udiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
7022; CHECK-LABEL: @udiv_i64_oddk_denom(
7023; CHECK-NEXT:    [[R:%.*]] = udiv i64 [[X:%.*]], 1235195949943
7024; CHECK-NEXT:    store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4
7025; CHECK-NEXT:    ret void
7026;
7027; GFX6-LABEL: udiv_i64_oddk_denom:
7028; GFX6:       ; %bb.0:
7029; GFX6-NEXT:    v_mov_b32_e32 v0, 0x4f176a73
7030; GFX6-NEXT:    v_mov_b32_e32 v1, 0x4f800000
7031; GFX6-NEXT:    v_madmk_f32 v0, v1, 0x438f8000, v0
7032; GFX6-NEXT:    v_rcp_f32_e32 v0, v0
7033; GFX6-NEXT:    s_movk_i32 s4, 0xfee0
7034; GFX6-NEXT:    s_mov_b32 s5, 0x68958c89
7035; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
7036; GFX6-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
7037; GFX6-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
7038; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
7039; GFX6-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
7040; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
7041; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
7042; GFX6-NEXT:    s_movk_i32 s8, 0x11f
7043; GFX6-NEXT:    s_mov_b32 s9, 0x976a7377
7044; GFX6-NEXT:    v_mul_lo_u32 v2, v0, s4
7045; GFX6-NEXT:    v_mul_hi_u32 v3, v0, s5
7046; GFX6-NEXT:    v_mul_lo_u32 v4, v1, s5
7047; GFX6-NEXT:    v_mul_lo_u32 v5, v0, s5
7048; GFX6-NEXT:    s_mov_b32 s7, 0xf000
7049; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
7050; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
7051; GFX6-NEXT:    v_mul_lo_u32 v3, v0, v2
7052; GFX6-NEXT:    v_mul_hi_u32 v4, v0, v5
7053; GFX6-NEXT:    v_mul_hi_u32 v6, v0, v2
7054; GFX6-NEXT:    v_mul_hi_u32 v7, v1, v2
7055; GFX6-NEXT:    v_mul_lo_u32 v2, v1, v2
7056; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
7057; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, 0, v6, vcc
7058; GFX6-NEXT:    v_mul_lo_u32 v6, v1, v5
7059; GFX6-NEXT:    v_mul_hi_u32 v5, v1, v5
7060; GFX6-NEXT:    s_mov_b32 s6, -1
7061; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
7062; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v4, v5, vcc
7063; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, 0, v7, vcc
7064; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
7065; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
7066; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
7067; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
7068; GFX6-NEXT:    v_mul_lo_u32 v2, v0, s4
7069; GFX6-NEXT:    v_mul_hi_u32 v3, v0, s5
7070; GFX6-NEXT:    v_mul_lo_u32 v4, v1, s5
7071; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
7072; GFX6-NEXT:    s_mov_b32 s4, s0
7073; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
7074; GFX6-NEXT:    v_mul_lo_u32 v3, v0, s5
7075; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
7076; GFX6-NEXT:    v_mul_lo_u32 v4, v0, v2
7077; GFX6-NEXT:    v_mul_hi_u32 v5, v0, v3
7078; GFX6-NEXT:    v_mul_hi_u32 v6, v0, v2
7079; GFX6-NEXT:    v_mul_hi_u32 v7, v1, v2
7080; GFX6-NEXT:    v_mul_lo_u32 v2, v1, v2
7081; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
7082; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, 0, v6, vcc
7083; GFX6-NEXT:    v_mul_lo_u32 v6, v1, v3
7084; GFX6-NEXT:    v_mul_hi_u32 v3, v1, v3
7085; GFX6-NEXT:    s_mov_b32 s5, s1
7086; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
7087; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v5, v3, vcc
7088; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, 0, v7, vcc
7089; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
7090; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
7091; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
7092; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
7093; GFX6-NEXT:    v_mul_lo_u32 v2, s2, v1
7094; GFX6-NEXT:    v_mul_hi_u32 v3, s2, v0
7095; GFX6-NEXT:    v_mul_hi_u32 v4, s2, v1
7096; GFX6-NEXT:    v_mul_hi_u32 v5, s3, v1
7097; GFX6-NEXT:    v_mul_lo_u32 v1, s3, v1
7098; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
7099; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
7100; GFX6-NEXT:    v_mul_lo_u32 v4, s3, v0
7101; GFX6-NEXT:    v_mul_hi_u32 v0, s3, v0
7102; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
7103; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, v3, v0, vcc
7104; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, 0, v5, vcc
7105; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
7106; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v2, vcc
7107; GFX6-NEXT:    v_mul_lo_u32 v2, v0, s8
7108; GFX6-NEXT:    v_mul_hi_u32 v3, v0, s9
7109; GFX6-NEXT:    v_mul_lo_u32 v4, v1, s9
7110; GFX6-NEXT:    v_mov_b32_e32 v5, 0x11f
7111; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
7112; GFX6-NEXT:    v_mul_lo_u32 v3, v0, s9
7113; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
7114; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, s3, v2
7115; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s2, v3
7116; GFX6-NEXT:    v_subb_u32_e64 v4, s[0:1], v4, v5, vcc
7117; GFX6-NEXT:    v_subrev_i32_e64 v5, s[0:1], s9, v3
7118; GFX6-NEXT:    v_subbrev_u32_e64 v4, s[0:1], 0, v4, s[0:1]
7119; GFX6-NEXT:    s_movk_i32 s2, 0x11e
7120; GFX6-NEXT:    v_cmp_lt_u32_e64 s[0:1], s2, v4
7121; GFX6-NEXT:    s_mov_b32 s9, 0x976a7376
7122; GFX6-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[0:1]
7123; GFX6-NEXT:    v_cmp_lt_u32_e64 s[0:1], s9, v5
7124; GFX6-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[0:1]
7125; GFX6-NEXT:    v_cmp_eq_u32_e64 s[0:1], s8, v4
7126; GFX6-NEXT:    v_cndmask_b32_e64 v4, v6, v5, s[0:1]
7127; GFX6-NEXT:    v_add_i32_e64 v5, s[0:1], 2, v0
7128; GFX6-NEXT:    v_addc_u32_e64 v6, s[0:1], 0, v1, s[0:1]
7129; GFX6-NEXT:    v_add_i32_e64 v7, s[0:1], 1, v0
7130; GFX6-NEXT:    v_addc_u32_e64 v8, s[0:1], 0, v1, s[0:1]
7131; GFX6-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v4
7132; GFX6-NEXT:    v_cndmask_b32_e64 v4, v8, v6, s[0:1]
7133; GFX6-NEXT:    v_mov_b32_e32 v6, s3
7134; GFX6-NEXT:    v_subb_u32_e32 v2, vcc, v6, v2, vcc
7135; GFX6-NEXT:    v_cmp_lt_u32_e32 vcc, s2, v2
7136; GFX6-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
7137; GFX6-NEXT:    v_cmp_lt_u32_e32 vcc, s9, v3
7138; GFX6-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
7139; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, s8, v2
7140; GFX6-NEXT:    v_cndmask_b32_e32 v2, v6, v3, vcc
7141; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
7142; GFX6-NEXT:    v_cndmask_b32_e64 v2, v7, v5, s[0:1]
7143; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
7144; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
7145; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
7146; GFX6-NEXT:    s_endpgm
7147;
7148; GFX9-LABEL: udiv_i64_oddk_denom:
7149; GFX9:       ; %bb.0:
7150; GFX9-NEXT:    v_mov_b32_e32 v0, 0x4f176a73
7151; GFX9-NEXT:    v_mov_b32_e32 v1, 0x4f800000
7152; GFX9-NEXT:    v_madmk_f32 v0, v1, 0x438f8000, v0
7153; GFX9-NEXT:    v_rcp_f32_e32 v0, v0
7154; GFX9-NEXT:    s_movk_i32 s2, 0xfee0
7155; GFX9-NEXT:    s_mov_b32 s3, 0x68958c89
7156; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
7157; GFX9-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
7158; GFX9-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
7159; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
7160; GFX9-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
7161; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
7162; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
7163; GFX9-NEXT:    v_mul_lo_u32 v2, v0, s2
7164; GFX9-NEXT:    v_mul_hi_u32 v3, v0, s3
7165; GFX9-NEXT:    v_mul_lo_u32 v5, v1, s3
7166; GFX9-NEXT:    v_mul_lo_u32 v4, v0, s3
7167; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
7168; GFX9-NEXT:    v_add_u32_e32 v2, v2, v5
7169; GFX9-NEXT:    v_mul_hi_u32 v3, v0, v4
7170; GFX9-NEXT:    v_mul_lo_u32 v5, v0, v2
7171; GFX9-NEXT:    v_mul_hi_u32 v6, v0, v2
7172; GFX9-NEXT:    v_mul_hi_u32 v7, v1, v2
7173; GFX9-NEXT:    v_mul_lo_u32 v2, v1, v2
7174; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v5
7175; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v6, vcc
7176; GFX9-NEXT:    v_mul_lo_u32 v6, v1, v4
7177; GFX9-NEXT:    v_mul_hi_u32 v4, v1, v4
7178; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v6
7179; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v5, v4, vcc
7180; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v7, vcc
7181; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
7182; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
7183; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
7184; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
7185; GFX9-NEXT:    v_mul_lo_u32 v2, v0, s2
7186; GFX9-NEXT:    v_mul_hi_u32 v3, v0, s3
7187; GFX9-NEXT:    v_mul_lo_u32 v4, v1, s3
7188; GFX9-NEXT:    v_mul_lo_u32 v5, v0, s3
7189; GFX9-NEXT:    s_movk_i32 s2, 0x11f
7190; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
7191; GFX9-NEXT:    v_add_u32_e32 v2, v2, v4
7192; GFX9-NEXT:    v_mul_lo_u32 v3, v0, v2
7193; GFX9-NEXT:    v_mul_hi_u32 v4, v0, v5
7194; GFX9-NEXT:    v_mul_hi_u32 v6, v0, v2
7195; GFX9-NEXT:    v_mul_hi_u32 v7, v1, v2
7196; GFX9-NEXT:    v_mul_lo_u32 v2, v1, v2
7197; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v4, v3
7198; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v6, vcc
7199; GFX9-NEXT:    v_mul_lo_u32 v6, v1, v5
7200; GFX9-NEXT:    v_mul_hi_u32 v5, v1, v5
7201; GFX9-NEXT:    s_mov_b32 s3, 0x976a7377
7202; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v6
7203; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v4, v5, vcc
7204; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v7, vcc
7205; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
7206; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
7207; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
7208; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
7209; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
7210; GFX9-NEXT:    v_mul_lo_u32 v2, s6, v1
7211; GFX9-NEXT:    v_mul_hi_u32 v3, s6, v0
7212; GFX9-NEXT:    v_mul_hi_u32 v4, s6, v1
7213; GFX9-NEXT:    v_mul_hi_u32 v5, s7, v1
7214; GFX9-NEXT:    v_mul_lo_u32 v1, s7, v1
7215; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
7216; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
7217; GFX9-NEXT:    v_mul_lo_u32 v4, s7, v0
7218; GFX9-NEXT:    v_mul_hi_u32 v0, s7, v0
7219; GFX9-NEXT:    v_mov_b32_e32 v6, 0x11f
7220; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
7221; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, v3, v0, vcc
7222; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, 0, v5, vcc
7223; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v1
7224; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v2, vcc
7225; GFX9-NEXT:    v_mul_lo_u32 v2, v0, s2
7226; GFX9-NEXT:    v_mul_hi_u32 v3, v0, s3
7227; GFX9-NEXT:    v_mul_lo_u32 v4, v1, s3
7228; GFX9-NEXT:    v_mov_b32_e32 v5, 0
7229; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
7230; GFX9-NEXT:    v_mul_lo_u32 v3, v0, s3
7231; GFX9-NEXT:    v_add_u32_e32 v2, v2, v4
7232; GFX9-NEXT:    v_sub_u32_e32 v4, s7, v2
7233; GFX9-NEXT:    v_sub_co_u32_e32 v3, vcc, s6, v3
7234; GFX9-NEXT:    v_subb_co_u32_e64 v4, s[0:1], v4, v6, vcc
7235; GFX9-NEXT:    v_subrev_co_u32_e64 v6, s[0:1], s3, v3
7236; GFX9-NEXT:    v_subbrev_co_u32_e64 v4, s[0:1], 0, v4, s[0:1]
7237; GFX9-NEXT:    s_movk_i32 s3, 0x11e
7238; GFX9-NEXT:    v_cmp_lt_u32_e64 s[0:1], s3, v4
7239; GFX9-NEXT:    s_mov_b32 s6, 0x976a7376
7240; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[0:1]
7241; GFX9-NEXT:    v_cmp_lt_u32_e64 s[0:1], s6, v6
7242; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[0:1]
7243; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s2, v4
7244; GFX9-NEXT:    v_cndmask_b32_e64 v4, v7, v6, s[0:1]
7245; GFX9-NEXT:    v_add_co_u32_e64 v6, s[0:1], 2, v0
7246; GFX9-NEXT:    v_addc_co_u32_e64 v7, s[0:1], 0, v1, s[0:1]
7247; GFX9-NEXT:    v_add_co_u32_e64 v8, s[0:1], 1, v0
7248; GFX9-NEXT:    v_addc_co_u32_e64 v9, s[0:1], 0, v1, s[0:1]
7249; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v4
7250; GFX9-NEXT:    v_cndmask_b32_e64 v4, v9, v7, s[0:1]
7251; GFX9-NEXT:    v_mov_b32_e32 v7, s7
7252; GFX9-NEXT:    v_subb_co_u32_e32 v2, vcc, v7, v2, vcc
7253; GFX9-NEXT:    v_cmp_lt_u32_e32 vcc, s3, v2
7254; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
7255; GFX9-NEXT:    v_cmp_lt_u32_e32 vcc, s6, v3
7256; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
7257; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, s2, v2
7258; GFX9-NEXT:    v_cndmask_b32_e32 v2, v7, v3, vcc
7259; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
7260; GFX9-NEXT:    v_cndmask_b32_e64 v2, v8, v6, s[0:1]
7261; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
7262; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
7263; GFX9-NEXT:    global_store_dwordx2 v5, v[0:1], s[4:5]
7264; GFX9-NEXT:    s_endpgm
7265  %r = udiv i64 %x, 1235195949943
7266  store i64 %r, i64 addrspace(1)* %out
7267  ret void
7268}
7269
; Unsigned 64-bit divide by the power-of-two constant 4096.
; The opt (CHECK) lines below expect the `udiv i64` to be left untouched in IR.
; The GFX6 and GFX9 lines expect llc to emit a single `s_lshr_b64 ..., 12`
; for the division, followed by the store of the 64-bit result.
7270define amdgpu_kernel void @udiv_i64_pow2k_denom(i64 addrspace(1)* %out, i64 %x) {
7271; CHECK-LABEL: @udiv_i64_pow2k_denom(
7272; CHECK-NEXT:    [[R:%.*]] = udiv i64 [[X:%.*]], 4096
7273; CHECK-NEXT:    store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4
7274; CHECK-NEXT:    ret void
7275;
7276; GFX6-LABEL: udiv_i64_pow2k_denom:
7277; GFX6:       ; %bb.0:
7278; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
7279; GFX6-NEXT:    s_mov_b32 s7, 0xf000
7280; GFX6-NEXT:    s_mov_b32 s6, -1
7281; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
7282; GFX6-NEXT:    s_mov_b32 s4, s0
7283; GFX6-NEXT:    s_mov_b32 s5, s1
7284; GFX6-NEXT:    s_lshr_b64 s[0:1], s[2:3], 12
7285; GFX6-NEXT:    v_mov_b32_e32 v0, s0
7286; GFX6-NEXT:    v_mov_b32_e32 v1, s1
7287; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
7288; GFX6-NEXT:    s_endpgm
7289;
7290; GFX9-LABEL: udiv_i64_pow2k_denom:
7291; GFX9:       ; %bb.0:
7292; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
7293; GFX9-NEXT:    v_mov_b32_e32 v2, 0
7294; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
7295; GFX9-NEXT:    s_lshr_b64 s[2:3], s[2:3], 12
7296; GFX9-NEXT:    v_mov_b32_e32 v0, s2
7297; GFX9-NEXT:    v_mov_b32_e32 v1, s3
7298; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
7299; GFX9-NEXT:    s_endpgm
7300  %r = udiv i64 %x, 4096
7301  store i64 %r, i64 addrspace(1)* %out
7302  ret void
7303}
7304
; Unsigned 64-bit divide by a non-constant denominator computed as
; `shl i64 4096, %y`.
; The opt (CHECK) lines below expect both the shl and the `udiv i64` to be
; left untouched in IR.
; The GFX6 and GFX9 lines expect llc to fold the division into a variable
; right shift: `s_add_i32 <y>, 12` followed by `s_lshr_b64` of %x by that sum.
7305define amdgpu_kernel void @udiv_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %x, i64 %y) {
7306; CHECK-LABEL: @udiv_i64_pow2_shl_denom(
7307; CHECK-NEXT:    [[SHL_Y:%.*]] = shl i64 4096, [[Y:%.*]]
7308; CHECK-NEXT:    [[R:%.*]] = udiv i64 [[X:%.*]], [[SHL_Y]]
7309; CHECK-NEXT:    store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4
7310; CHECK-NEXT:    ret void
7311;
7312; GFX6-LABEL: udiv_i64_pow2_shl_denom:
7313; GFX6:       ; %bb.0:
7314; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
7315; GFX6-NEXT:    s_load_dword s8, s[0:1], 0xd
7316; GFX6-NEXT:    s_mov_b32 s3, 0xf000
7317; GFX6-NEXT:    s_mov_b32 s2, -1
7318; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
7319; GFX6-NEXT:    s_mov_b32 s0, s4
7320; GFX6-NEXT:    s_add_i32 s8, s8, 12
7321; GFX6-NEXT:    s_mov_b32 s1, s5
7322; GFX6-NEXT:    s_lshr_b64 s[4:5], s[6:7], s8
7323; GFX6-NEXT:    v_mov_b32_e32 v0, s4
7324; GFX6-NEXT:    v_mov_b32_e32 v1, s5
7325; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
7326; GFX6-NEXT:    s_endpgm
7327;
7328; GFX9-LABEL: udiv_i64_pow2_shl_denom:
7329; GFX9:       ; %bb.0:
7330; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x34
7331; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
7332; GFX9-NEXT:    v_mov_b32_e32 v2, 0
7333; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
7334; GFX9-NEXT:    s_add_i32 s2, s2, 12
7335; GFX9-NEXT:    s_lshr_b64 s[0:1], s[6:7], s2
7336; GFX9-NEXT:    v_mov_b32_e32 v0, s0
7337; GFX9-NEXT:    v_mov_b32_e32 v1, s1
7338; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
7339; GFX9-NEXT:    s_endpgm
7340  %shl.y = shl i64 4096, %y
7341  %r = udiv i64 %x, %shl.y
7342  store i64 %r, i64 addrspace(1)* %out
7343  ret void
7344}
7345
; Unsigned divide of a <2 x i64> by the splat constant 4096.
; The opt (CHECK) lines below expect the vector op to be split into one
; `udiv i64 ..., 4096` per element, reassembled with insertelement.
; The GFX6 and GFX9 lines expect llc to emit one `s_lshr_b64 ..., 12` per
; element and a single 128-bit store of the result.
7346define amdgpu_kernel void @udiv_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) {
7347; CHECK-LABEL: @udiv_v2i64_pow2k_denom(
7348; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0
7349; CHECK-NEXT:    [[TMP2:%.*]] = udiv i64 [[TMP1]], 4096
7350; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i64 0
7351; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1
7352; CHECK-NEXT:    [[TMP5:%.*]] = udiv i64 [[TMP4]], 4096
7353; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1
7354; CHECK-NEXT:    store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16
7355; CHECK-NEXT:    ret void
7356;
7357; GFX6-LABEL: udiv_v2i64_pow2k_denom:
7358; GFX6:       ; %bb.0:
7359; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xd
7360; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
7361; GFX6-NEXT:    s_mov_b32 s3, 0xf000
7362; GFX6-NEXT:    s_mov_b32 s2, -1
7363; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
7364; GFX6-NEXT:    s_lshr_b64 s[4:5], s[4:5], 12
7365; GFX6-NEXT:    s_lshr_b64 s[6:7], s[6:7], 12
7366; GFX6-NEXT:    v_mov_b32_e32 v0, s4
7367; GFX6-NEXT:    v_mov_b32_e32 v1, s5
7368; GFX6-NEXT:    v_mov_b32_e32 v2, s6
7369; GFX6-NEXT:    v_mov_b32_e32 v3, s7
7370; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
7371; GFX6-NEXT:    s_endpgm
7372;
7373; GFX9-LABEL: udiv_v2i64_pow2k_denom:
7374; GFX9:       ; %bb.0:
7375; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
7376; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
7377; GFX9-NEXT:    v_mov_b32_e32 v4, 0
7378; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
7379; GFX9-NEXT:    s_lshr_b64 s[0:1], s[4:5], 12
7380; GFX9-NEXT:    s_lshr_b64 s[4:5], s[6:7], 12
7381; GFX9-NEXT:    v_mov_b32_e32 v0, s0
7382; GFX9-NEXT:    v_mov_b32_e32 v1, s1
7383; GFX9-NEXT:    v_mov_b32_e32 v2, s4
7384; GFX9-NEXT:    v_mov_b32_e32 v3, s5
7385; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
7386; GFX9-NEXT:    s_endpgm
7387  %r = udiv <2 x i64> %x, <i64 4096, i64 4096>
7388  store <2 x i64> %r, <2 x i64> addrspace(1)* %out
7389  ret void
7390}
7391
; Unsigned division of a <2 x i64> value where lane 0 is divided by 4096 and
; lane 1 by 4095.
; The IR checks expect the vector udiv to be split into one scalar udiv per lane.
; In the GFX6 and GFX9 checks lane 0 is a single s_lshr_b64 by 12, while lane 1
; is a long sequence that opens with v_rcp_f32 and continues with 32-bit
; multiplies, carry chains, and conditional selects.
7392define amdgpu_kernel void @udiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) {
7393; CHECK-LABEL: @udiv_v2i64_mixed_pow2k_denom(
7394; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0
7395; CHECK-NEXT:    [[TMP2:%.*]] = udiv i64 [[TMP1]], 4096
7396; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i64 0
7397; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1
7398; CHECK-NEXT:    [[TMP5:%.*]] = udiv i64 [[TMP4]], 4095
7399; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1
7400; CHECK-NEXT:    store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16
7401; CHECK-NEXT:    ret void
7402;
7403; GFX6-LABEL: udiv_v2i64_mixed_pow2k_denom:
7404; GFX6:       ; %bb.0:
7405; GFX6-NEXT:    v_mov_b32_e32 v0, 0x4f800000
7406; GFX6-NEXT:    v_madak_f32 v0, 0, v0, 0x457ff000
7407; GFX6-NEXT:    v_rcp_f32_e32 v0, v0
7408; GFX6-NEXT:    s_movk_i32 s6, 0xf001
7409; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
7410; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xd
7411; GFX6-NEXT:    s_mov_b32 s7, 0xf000
7412; GFX6-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
7413; GFX6-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
7414; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
7415; GFX6-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
7416; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
7417; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
7418; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
7419; GFX6-NEXT:    s_lshr_b64 s[8:9], s[0:1], 12
7420; GFX6-NEXT:    s_movk_i32 s0, 0xfff
7421; GFX6-NEXT:    v_mul_hi_u32 v2, v0, s6
7422; GFX6-NEXT:    v_mul_lo_u32 v4, v1, s6
7423; GFX6-NEXT:    v_mul_lo_u32 v3, v0, s6
7424; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, v0, v2
7425; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
7426; GFX6-NEXT:    v_mul_hi_u32 v5, v0, v3
7427; GFX6-NEXT:    v_mul_lo_u32 v4, v0, v2
7428; GFX6-NEXT:    v_mul_hi_u32 v6, v0, v2
7429; GFX6-NEXT:    v_mul_hi_u32 v7, v1, v2
7430; GFX6-NEXT:    v_mul_lo_u32 v2, v1, v2
7431; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
7432; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, 0, v6, vcc
7433; GFX6-NEXT:    v_mul_lo_u32 v6, v1, v3
7434; GFX6-NEXT:    v_mul_hi_u32 v3, v1, v3
7435; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
7436; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v5, v3, vcc
7437; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, 0, v7, vcc
7438; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
7439; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
7440; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
7441; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
7442; GFX6-NEXT:    v_mul_hi_u32 v2, v0, s6
7443; GFX6-NEXT:    v_mul_lo_u32 v3, v1, s6
7444; GFX6-NEXT:    v_mul_lo_u32 v4, v0, s6
7445; GFX6-NEXT:    s_mov_b32 s6, -1
7446; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, v0, v2
7447; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
7448; GFX6-NEXT:    v_mul_lo_u32 v3, v0, v2
7449; GFX6-NEXT:    v_mul_hi_u32 v5, v0, v4
7450; GFX6-NEXT:    v_mul_hi_u32 v6, v0, v2
7451; GFX6-NEXT:    v_mul_hi_u32 v7, v1, v2
7452; GFX6-NEXT:    v_mul_lo_u32 v2, v1, v2
7453; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v5, v3
7454; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, 0, v6, vcc
7455; GFX6-NEXT:    v_mul_lo_u32 v6, v1, v4
7456; GFX6-NEXT:    v_mul_hi_u32 v4, v1, v4
7457; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
7458; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v5, v4, vcc
7459; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, 0, v7, vcc
7460; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
7461; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
7462; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
7463; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
7464; GFX6-NEXT:    v_mul_lo_u32 v2, s2, v1
7465; GFX6-NEXT:    v_mul_hi_u32 v3, s2, v0
7466; GFX6-NEXT:    v_mul_hi_u32 v4, s2, v1
7467; GFX6-NEXT:    v_mul_hi_u32 v5, s3, v1
7468; GFX6-NEXT:    v_mul_lo_u32 v1, s3, v1
7469; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
7470; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
7471; GFX6-NEXT:    v_mul_lo_u32 v4, s3, v0
7472; GFX6-NEXT:    v_mul_hi_u32 v0, s3, v0
7473; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
7474; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, v3, v0, vcc
7475; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, 0, v5, vcc
7476; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
7477; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v2, vcc
7478; GFX6-NEXT:    v_mul_lo_u32 v4, v1, s0
7479; GFX6-NEXT:    v_mul_hi_u32 v5, v0, s0
7480; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 2, v0
7481; GFX6-NEXT:    v_mul_lo_u32 v8, v0, s0
7482; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
7483; GFX6-NEXT:    v_add_i32_e32 v6, vcc, 1, v0
7484; GFX6-NEXT:    v_addc_u32_e32 v7, vcc, 0, v1, vcc
7485; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
7486; GFX6-NEXT:    v_mov_b32_e32 v5, s3
7487; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, s2, v8
7488; GFX6-NEXT:    v_subb_u32_e32 v4, vcc, v5, v4, vcc
7489; GFX6-NEXT:    v_subrev_i32_e32 v5, vcc, s0, v8
7490; GFX6-NEXT:    v_subbrev_u32_e32 v9, vcc, 0, v4, vcc
7491; GFX6-NEXT:    s_movk_i32 s0, 0xffe
7492; GFX6-NEXT:    v_cmp_lt_u32_e32 vcc, s0, v5
7493; GFX6-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
7494; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v9
7495; GFX6-NEXT:    v_cndmask_b32_e32 v5, -1, v5, vcc
7496; GFX6-NEXT:    v_cmp_lt_u32_e64 s[0:1], s0, v8
7497; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
7498; GFX6-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[0:1]
7499; GFX6-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v4
7500; GFX6-NEXT:    v_cndmask_b32_e64 v4, -1, v5, s[0:1]
7501; GFX6-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
7502; GFX6-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v4
7503; GFX6-NEXT:    v_cndmask_b32_e64 v3, v1, v3, s[0:1]
7504; GFX6-NEXT:    v_cndmask_b32_e32 v1, v6, v2, vcc
7505; GFX6-NEXT:    v_cndmask_b32_e64 v2, v0, v1, s[0:1]
7506; GFX6-NEXT:    v_mov_b32_e32 v0, s8
7507; GFX6-NEXT:    v_mov_b32_e32 v1, s9
7508; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
7509; GFX6-NEXT:    s_endpgm
7510;
7511; GFX9-LABEL: udiv_v2i64_mixed_pow2k_denom:
7512; GFX9:       ; %bb.0:
7513; GFX9-NEXT:    v_mov_b32_e32 v0, 0x4f800000
7514; GFX9-NEXT:    v_madak_f32 v0, 0, v0, 0x457ff000
7515; GFX9-NEXT:    v_rcp_f32_e32 v0, v0
7516; GFX9-NEXT:    s_movk_i32 s2, 0xf001
7517; GFX9-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
7518; GFX9-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
7519; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
7520; GFX9-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
7521; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
7522; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
7523; GFX9-NEXT:    v_mul_hi_u32 v2, v0, s2
7524; GFX9-NEXT:    v_mul_lo_u32 v4, v1, s2
7525; GFX9-NEXT:    v_mul_lo_u32 v3, v0, s2
7526; GFX9-NEXT:    v_sub_u32_e32 v2, v2, v0
7527; GFX9-NEXT:    v_add_u32_e32 v2, v2, v4
7528; GFX9-NEXT:    v_mul_hi_u32 v5, v0, v3
7529; GFX9-NEXT:    v_mul_lo_u32 v4, v0, v2
7530; GFX9-NEXT:    v_mul_hi_u32 v7, v0, v2
7531; GFX9-NEXT:    v_mul_lo_u32 v6, v1, v3
7532; GFX9-NEXT:    v_mul_hi_u32 v3, v1, v3
7533; GFX9-NEXT:    v_mul_hi_u32 v8, v1, v2
7534; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v5, v4
7535; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v7, vcc
7536; GFX9-NEXT:    v_mul_lo_u32 v2, v1, v2
7537; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v6
7538; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v5, v3, vcc
7539; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v8, vcc
7540; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
7541; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
7542; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
7543; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
7544; GFX9-NEXT:    v_mul_hi_u32 v2, v0, s2
7545; GFX9-NEXT:    v_mul_lo_u32 v3, v1, s2
7546; GFX9-NEXT:    v_mul_lo_u32 v5, v0, s2
7547; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
7548; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
7549; GFX9-NEXT:    v_sub_u32_e32 v2, v2, v0
7550; GFX9-NEXT:    v_add_u32_e32 v2, v2, v3
7551; GFX9-NEXT:    v_mul_lo_u32 v3, v0, v2
7552; GFX9-NEXT:    v_mul_hi_u32 v6, v0, v5
7553; GFX9-NEXT:    v_mul_hi_u32 v7, v0, v2
7554; GFX9-NEXT:    v_mul_hi_u32 v8, v1, v2
7555; GFX9-NEXT:    v_mul_lo_u32 v2, v1, v2
7556; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v6, v3
7557; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, 0, v7, vcc
7558; GFX9-NEXT:    v_mul_lo_u32 v7, v1, v5
7559; GFX9-NEXT:    v_mul_hi_u32 v5, v1, v5
7560; GFX9-NEXT:    s_movk_i32 s0, 0xfff
7561; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
7562; GFX9-NEXT:    s_lshr_b64 s[4:5], s[4:5], 12
7563; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v7
7564; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v6, v5, vcc
7565; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v8, vcc
7566; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
7567; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v5, vcc
7568; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
7569; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
7570; GFX9-NEXT:    v_mul_lo_u32 v2, s6, v1
7571; GFX9-NEXT:    v_mul_hi_u32 v3, s6, v0
7572; GFX9-NEXT:    v_mul_hi_u32 v5, s6, v1
7573; GFX9-NEXT:    v_mul_hi_u32 v6, s7, v1
7574; GFX9-NEXT:    v_mul_lo_u32 v1, s7, v1
7575; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
7576; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v5, vcc
7577; GFX9-NEXT:    v_mul_lo_u32 v5, s7, v0
7578; GFX9-NEXT:    v_mul_hi_u32 v0, s7, v0
7579; GFX9-NEXT:    v_mov_b32_e32 v4, 0
7580; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v5
7581; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, v3, v0, vcc
7582; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, 0, v6, vcc
7583; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v1
7584; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v2, vcc
7585; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, 2, v0
7586; GFX9-NEXT:    v_mul_lo_u32 v5, v1, s0
7587; GFX9-NEXT:    v_mul_hi_u32 v6, v0, s0
7588; GFX9-NEXT:    v_mul_lo_u32 v9, v0, s0
7589; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
7590; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, 1, v0
7591; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, 0, v1, vcc
7592; GFX9-NEXT:    v_add_u32_e32 v5, v6, v5
7593; GFX9-NEXT:    v_mov_b32_e32 v6, s7
7594; GFX9-NEXT:    v_sub_co_u32_e32 v9, vcc, s6, v9
7595; GFX9-NEXT:    v_subb_co_u32_e32 v5, vcc, v6, v5, vcc
7596; GFX9-NEXT:    v_subrev_co_u32_e32 v6, vcc, s0, v9
7597; GFX9-NEXT:    v_subbrev_co_u32_e32 v10, vcc, 0, v5, vcc
7598; GFX9-NEXT:    s_movk_i32 s0, 0xffe
7599; GFX9-NEXT:    v_cmp_lt_u32_e32 vcc, s0, v6
7600; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
7601; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v10
7602; GFX9-NEXT:    v_cndmask_b32_e32 v6, -1, v6, vcc
7603; GFX9-NEXT:    v_cmp_lt_u32_e64 s[0:1], s0, v9
7604; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
7605; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[0:1]
7606; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v5
7607; GFX9-NEXT:    v_cndmask_b32_e64 v5, -1, v6, s[0:1]
7608; GFX9-NEXT:    v_cndmask_b32_e32 v3, v8, v3, vcc
7609; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v5
7610; GFX9-NEXT:    v_cndmask_b32_e64 v3, v1, v3, s[0:1]
7611; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v2, vcc
7612; GFX9-NEXT:    v_cndmask_b32_e64 v2, v0, v1, s[0:1]
7613; GFX9-NEXT:    v_mov_b32_e32 v0, s4
7614; GFX9-NEXT:    v_mov_b32_e32 v1, s5
7615; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
7616; GFX9-NEXT:    s_endpgm
  ; Input IR: lane 0 divisor is a power of two (4096), lane 1 divisor is not (4095).
7617  %r = udiv <2 x i64> %x, <i64 4096, i64 4095>
7618  store <2 x i64> %r, <2 x i64> addrspace(1)* %out
7619  ret void
7620}
7621
; Unsigned division of a <2 x i64> value by a per-lane denominator computed as
; 4096 shifted left by %y.
; The IR checks expect the shl to stay a vector operation and the udiv to be
; split into one scalar udiv per lane. The GFX6 and GFX9 checks expect, per
; lane, an s_add_i32 of 12 followed by an s_lshr_b64 by that sum.
7622define amdgpu_kernel void @udiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x, <2 x i64> %y) {
7623; CHECK-LABEL: @udiv_v2i64_pow2_shl_denom(
7624; CHECK-NEXT:    [[SHL_Y:%.*]] = shl <2 x i64> <i64 4096, i64 4096>, [[Y:%.*]]
7625; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0
7626; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 0
7627; CHECK-NEXT:    [[TMP3:%.*]] = udiv i64 [[TMP1]], [[TMP2]]
7628; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x i64> undef, i64 [[TMP3]], i64 0
7629; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[X]], i64 1
7630; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 1
7631; CHECK-NEXT:    [[TMP7:%.*]] = udiv i64 [[TMP5]], [[TMP6]]
7632; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[TMP7]], i64 1
7633; CHECK-NEXT:    store <2 x i64> [[TMP8]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16
7634; CHECK-NEXT:    ret void
7635;
7636; GFX6-LABEL: udiv_v2i64_pow2_shl_denom:
7637; GFX6:       ; %bb.0:
7638; GFX6-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0xd
7639; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
7640; GFX6-NEXT:    s_mov_b32 s3, 0xf000
7641; GFX6-NEXT:    s_mov_b32 s2, -1
7642; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
7643; GFX6-NEXT:    s_add_i32 s8, s8, 12
7644; GFX6-NEXT:    s_add_i32 s9, s10, 12
7645; GFX6-NEXT:    s_lshr_b64 s[4:5], s[4:5], s8
7646; GFX6-NEXT:    s_lshr_b64 s[6:7], s[6:7], s9
7647; GFX6-NEXT:    v_mov_b32_e32 v0, s4
7648; GFX6-NEXT:    v_mov_b32_e32 v1, s5
7649; GFX6-NEXT:    v_mov_b32_e32 v2, s6
7650; GFX6-NEXT:    v_mov_b32_e32 v3, s7
7651; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
7652; GFX6-NEXT:    s_endpgm
7653;
7654; GFX9-LABEL: udiv_v2i64_pow2_shl_denom:
7655; GFX9:       ; %bb.0:
7656; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x34
7657; GFX9-NEXT:    v_mov_b32_e32 v4, 0
7658; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
7659; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
7660; GFX9-NEXT:    s_add_i32 s2, s8, 12
7661; GFX9-NEXT:    s_add_i32 s8, s10, 12
7662; GFX9-NEXT:    s_lshr_b64 s[2:3], s[4:5], s2
7663; GFX9-NEXT:    s_lshr_b64 s[4:5], s[6:7], s8
7664; GFX9-NEXT:    v_mov_b32_e32 v0, s2
7665; GFX9-NEXT:    v_mov_b32_e32 v1, s3
7666; GFX9-NEXT:    v_mov_b32_e32 v2, s4
7667; GFX9-NEXT:    v_mov_b32_e32 v3, s5
7668; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
7669; GFX9-NEXT:    s_endpgm
  ; Input IR: the denominator is 4096 << %y in each lane, so it is not a compile-time constant.
7670  %shl.y = shl <2 x i64> <i64 4096, i64 4096>, %y
7671  %r = udiv <2 x i64> %x, %shl.y
7672  store <2 x i64> %r, <2 x i64> addrspace(1)* %out
7673  ret void
7674}
7675
; Unsigned remainder of a scalar i64 by the odd constant 1235195393993
; (0x11f9761f7c9).
; The IR checks expect the urem to pass through the opt run unchanged.
; The GFX6 and GFX9 checks expect a long inline sequence that opens with
; v_rcp_f32 and ends in conditional selects; the immediates 0x11f and
; 0x9761f7c9 that appear in it are the high and low 32-bit halves of the divisor.
7676define amdgpu_kernel void @urem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
7677; CHECK-LABEL: @urem_i64_oddk_denom(
7678; CHECK-NEXT:    [[R:%.*]] = urem i64 [[X:%.*]], 1235195393993
7679; CHECK-NEXT:    store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4
7680; CHECK-NEXT:    ret void
7681;
7682; GFX6-LABEL: urem_i64_oddk_denom:
7683; GFX6:       ; %bb.0:
7684; GFX6-NEXT:    v_mov_b32_e32 v0, 0x4f1761f8
7685; GFX6-NEXT:    v_mov_b32_e32 v1, 0x4f800000
7686; GFX6-NEXT:    v_madmk_f32 v0, v1, 0x438f8000, v0
7687; GFX6-NEXT:    v_rcp_f32_e32 v0, v0
7688; GFX6-NEXT:    s_movk_i32 s2, 0xfee0
7689; GFX6-NEXT:    s_mov_b32 s3, 0x689e0837
7690; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
7691; GFX6-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
7692; GFX6-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
7693; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
7694; GFX6-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
7695; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
7696; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
7697; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
7698; GFX6-NEXT:    s_mov_b32 s8, s4
7699; GFX6-NEXT:    s_movk_i32 s4, 0x11f
7700; GFX6-NEXT:    v_mul_lo_u32 v2, v0, s2
7701; GFX6-NEXT:    v_mul_hi_u32 v3, v0, s3
7702; GFX6-NEXT:    v_mul_lo_u32 v4, v1, s3
7703; GFX6-NEXT:    v_mul_lo_u32 v5, v0, s3
7704; GFX6-NEXT:    s_mov_b32 s12, 0x9761f7c9
7705; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
7706; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
7707; GFX6-NEXT:    v_mul_lo_u32 v3, v0, v2
7708; GFX6-NEXT:    v_mul_hi_u32 v4, v0, v5
7709; GFX6-NEXT:    v_mul_hi_u32 v6, v0, v2
7710; GFX6-NEXT:    v_mul_hi_u32 v7, v1, v2
7711; GFX6-NEXT:    v_mul_lo_u32 v2, v1, v2
7712; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
7713; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, 0, v6, vcc
7714; GFX6-NEXT:    v_mul_lo_u32 v6, v1, v5
7715; GFX6-NEXT:    v_mul_hi_u32 v5, v1, v5
7716; GFX6-NEXT:    s_mov_b32 s9, s5
7717; GFX6-NEXT:    s_movk_i32 s5, 0x11e
7718; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
7719; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v4, v5, vcc
7720; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, 0, v7, vcc
7721; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
7722; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
7723; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
7724; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
7725; GFX6-NEXT:    v_mul_lo_u32 v2, v0, s2
7726; GFX6-NEXT:    v_mul_hi_u32 v3, v0, s3
7727; GFX6-NEXT:    v_mul_lo_u32 v4, v1, s3
7728; GFX6-NEXT:    s_mov_b32 s11, 0xf000
7729; GFX6-NEXT:    s_mov_b32 s10, -1
7730; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
7731; GFX6-NEXT:    v_mul_lo_u32 v3, v0, s3
7732; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
7733; GFX6-NEXT:    v_mul_lo_u32 v4, v0, v2
7734; GFX6-NEXT:    v_mul_hi_u32 v5, v0, v3
7735; GFX6-NEXT:    v_mul_hi_u32 v6, v0, v2
7736; GFX6-NEXT:    v_mul_hi_u32 v7, v1, v2
7737; GFX6-NEXT:    v_mul_lo_u32 v2, v1, v2
7738; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
7739; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, 0, v6, vcc
7740; GFX6-NEXT:    v_mul_lo_u32 v6, v1, v3
7741; GFX6-NEXT:    v_mul_hi_u32 v3, v1, v3
7742; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
7743; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v5, v3, vcc
7744; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, 0, v7, vcc
7745; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
7746; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
7747; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
7748; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
7749; GFX6-NEXT:    v_mul_lo_u32 v2, s6, v1
7750; GFX6-NEXT:    v_mul_hi_u32 v3, s6, v0
7751; GFX6-NEXT:    v_mul_hi_u32 v4, s6, v1
7752; GFX6-NEXT:    v_mul_hi_u32 v5, s7, v1
7753; GFX6-NEXT:    v_mul_lo_u32 v1, s7, v1
7754; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
7755; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
7756; GFX6-NEXT:    v_mul_lo_u32 v4, s7, v0
7757; GFX6-NEXT:    v_mul_hi_u32 v0, s7, v0
7758; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
7759; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, v3, v0, vcc
7760; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, 0, v5, vcc
7761; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
7762; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v2, vcc
7763; GFX6-NEXT:    v_mul_lo_u32 v2, v0, s4
7764; GFX6-NEXT:    v_mul_hi_u32 v3, v0, s12
7765; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s12
7766; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s12
7767; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
7768; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
7769; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s7, v1
7770; GFX6-NEXT:    v_mov_b32_e32 v3, 0x11f
7771; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s6, v0
7772; GFX6-NEXT:    v_subb_u32_e64 v2, s[0:1], v2, v3, vcc
7773; GFX6-NEXT:    v_subrev_i32_e64 v4, s[0:1], s12, v0
7774; GFX6-NEXT:    v_subbrev_u32_e64 v5, s[2:3], 0, v2, s[0:1]
7775; GFX6-NEXT:    v_cmp_lt_u32_e64 s[2:3], s5, v5
7776; GFX6-NEXT:    s_mov_b32 s6, 0x9761f7c8
7777; GFX6-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[2:3]
7778; GFX6-NEXT:    v_cmp_lt_u32_e64 s[2:3], s6, v4
7779; GFX6-NEXT:    v_subb_u32_e64 v2, s[0:1], v2, v3, s[0:1]
7780; GFX6-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[2:3]
7781; GFX6-NEXT:    v_cmp_eq_u32_e64 s[2:3], s4, v5
7782; GFX6-NEXT:    v_subrev_i32_e64 v3, s[0:1], s12, v4
7783; GFX6-NEXT:    v_cndmask_b32_e64 v6, v6, v7, s[2:3]
7784; GFX6-NEXT:    v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1]
7785; GFX6-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v6
7786; GFX6-NEXT:    v_cndmask_b32_e64 v2, v5, v2, s[0:1]
7787; GFX6-NEXT:    v_mov_b32_e32 v5, s7
7788; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v5, v1, vcc
7789; GFX6-NEXT:    v_cmp_lt_u32_e32 vcc, s5, v1
7790; GFX6-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
7791; GFX6-NEXT:    v_cmp_lt_u32_e32 vcc, s6, v0
7792; GFX6-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
7793; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, s4, v1
7794; GFX6-NEXT:    v_cndmask_b32_e32 v5, v5, v6, vcc
7795; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
7796; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
7797; GFX6-NEXT:    v_cndmask_b32_e64 v2, v4, v3, s[0:1]
7798; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
7799; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
7800; GFX6-NEXT:    s_endpgm
7801;
7802; GFX9-LABEL: urem_i64_oddk_denom:
7803; GFX9:       ; %bb.0:
7804; GFX9-NEXT:    v_mov_b32_e32 v0, 0x4f1761f8
7805; GFX9-NEXT:    v_mov_b32_e32 v1, 0x4f800000
7806; GFX9-NEXT:    v_madmk_f32 v0, v1, 0x438f8000, v0
7807; GFX9-NEXT:    v_rcp_f32_e32 v0, v0
7808; GFX9-NEXT:    s_movk_i32 s2, 0xfee0
7809; GFX9-NEXT:    s_mov_b32 s3, 0x689e0837
7810; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
7811; GFX9-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
7812; GFX9-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
7813; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
7814; GFX9-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
7815; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
7816; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
7817; GFX9-NEXT:    s_movk_i32 s8, 0x11f
7818; GFX9-NEXT:    s_mov_b32 s9, 0x9761f7c9
7819; GFX9-NEXT:    v_mul_lo_u32 v2, v0, s2
7820; GFX9-NEXT:    v_mul_hi_u32 v3, v0, s3
7821; GFX9-NEXT:    v_mul_lo_u32 v5, v1, s3
7822; GFX9-NEXT:    v_mul_lo_u32 v4, v0, s3
7823; GFX9-NEXT:    s_mov_b32 s10, 0x9761f7c8
7824; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
7825; GFX9-NEXT:    v_add_u32_e32 v2, v2, v5
7826; GFX9-NEXT:    v_mul_hi_u32 v3, v0, v4
7827; GFX9-NEXT:    v_mul_lo_u32 v5, v0, v2
7828; GFX9-NEXT:    v_mul_hi_u32 v6, v0, v2
7829; GFX9-NEXT:    v_mul_hi_u32 v7, v1, v2
7830; GFX9-NEXT:    v_mul_lo_u32 v2, v1, v2
7831; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v5
7832; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v6, vcc
7833; GFX9-NEXT:    v_mul_lo_u32 v6, v1, v4
7834; GFX9-NEXT:    v_mul_hi_u32 v4, v1, v4
7835; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v6
7836; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v5, v4, vcc
7837; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v7, vcc
7838; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
7839; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
7840; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
7841; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
7842; GFX9-NEXT:    v_mul_lo_u32 v2, v0, s2
7843; GFX9-NEXT:    v_mul_hi_u32 v3, v0, s3
7844; GFX9-NEXT:    v_mul_lo_u32 v4, v1, s3
7845; GFX9-NEXT:    v_mul_lo_u32 v5, v0, s3
7846; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
7847; GFX9-NEXT:    v_add_u32_e32 v2, v2, v4
7848; GFX9-NEXT:    v_mul_lo_u32 v3, v0, v2
7849; GFX9-NEXT:    v_mul_hi_u32 v4, v0, v5
7850; GFX9-NEXT:    v_mul_hi_u32 v6, v0, v2
7851; GFX9-NEXT:    v_mul_hi_u32 v7, v1, v2
7852; GFX9-NEXT:    v_mul_lo_u32 v2, v1, v2
7853; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v4, v3
7854; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v6, vcc
7855; GFX9-NEXT:    v_mul_lo_u32 v6, v1, v5
7856; GFX9-NEXT:    v_mul_hi_u32 v5, v1, v5
7857; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v6
7858; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v4, v5, vcc
7859; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v7, vcc
7860; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
7861; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
7862; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
7863; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
7864; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
7865; GFX9-NEXT:    v_mul_lo_u32 v2, s6, v1
7866; GFX9-NEXT:    v_mul_hi_u32 v3, s6, v0
7867; GFX9-NEXT:    v_mul_hi_u32 v4, s6, v1
7868; GFX9-NEXT:    v_mul_hi_u32 v5, s7, v1
7869; GFX9-NEXT:    v_mul_lo_u32 v1, s7, v1
7870; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
7871; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
7872; GFX9-NEXT:    v_mul_lo_u32 v4, s7, v0
7873; GFX9-NEXT:    v_mul_hi_u32 v0, s7, v0
7874; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
7875; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, v3, v0, vcc
7876; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, 0, v5, vcc
7877; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v1
7878; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v2, vcc
7879; GFX9-NEXT:    v_mul_lo_u32 v2, v0, s8
7880; GFX9-NEXT:    v_mul_hi_u32 v3, v0, s9
7881; GFX9-NEXT:    v_mul_lo_u32 v1, v1, s9
7882; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s9
7883; GFX9-NEXT:    v_mov_b32_e32 v4, 0
7884; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
7885; GFX9-NEXT:    v_add_u32_e32 v1, v2, v1
7886; GFX9-NEXT:    v_sub_u32_e32 v2, s7, v1
7887; GFX9-NEXT:    v_mov_b32_e32 v3, 0x11f
7888; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s6, v0
7889; GFX9-NEXT:    v_subb_co_u32_e64 v2, s[0:1], v2, v3, vcc
7890; GFX9-NEXT:    v_subrev_co_u32_e64 v5, s[0:1], s9, v0
7891; GFX9-NEXT:    v_subbrev_co_u32_e64 v6, s[2:3], 0, v2, s[0:1]
7892; GFX9-NEXT:    s_movk_i32 s6, 0x11e
7893; GFX9-NEXT:    v_cmp_lt_u32_e64 s[2:3], s6, v6
7894; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[2:3]
7895; GFX9-NEXT:    v_cmp_lt_u32_e64 s[2:3], s10, v5
7896; GFX9-NEXT:    v_subb_co_u32_e64 v2, s[0:1], v2, v3, s[0:1]
7897; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[2:3]
7898; GFX9-NEXT:    v_cmp_eq_u32_e64 s[2:3], s8, v6
7899; GFX9-NEXT:    v_subrev_co_u32_e64 v3, s[0:1], s9, v5
7900; GFX9-NEXT:    v_cndmask_b32_e64 v7, v7, v8, s[2:3]
7901; GFX9-NEXT:    v_subbrev_co_u32_e64 v2, s[0:1], 0, v2, s[0:1]
7902; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v7
7903; GFX9-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s[0:1]
7904; GFX9-NEXT:    v_mov_b32_e32 v6, s7
7905; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v6, v1, vcc
7906; GFX9-NEXT:    v_cmp_lt_u32_e32 vcc, s6, v1
7907; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
7908; GFX9-NEXT:    v_cmp_lt_u32_e32 vcc, s10, v0
7909; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
7910; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, s8, v1
7911; GFX9-NEXT:    v_cndmask_b32_e32 v6, v6, v7, vcc
7912; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
7913; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
7914; GFX9-NEXT:    v_cndmask_b32_e64 v2, v5, v3, s[0:1]
7915; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
7916; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[4:5]
7917; GFX9-NEXT:    s_endpgm
  ; Input IR: the divisor is an odd constant wider than 32 bits.
7918  %r = urem i64 %x, 1235195393993
7919  store i64 %r, i64 addrspace(1)* %out
7920  ret void
7921}
7922
; Unsigned remainder of a scalar i64 by the constant 4096.
; The IR checks expect the urem to pass through the opt run unchanged.
; The GFX6 and GFX9 checks expect a single s_and_b32 with 0xfff for one stored
; dword and a zero (v1) for the other, with no division sequence.
7923define amdgpu_kernel void @urem_i64_pow2k_denom(i64 addrspace(1)* %out, i64 %x) {
7924; CHECK-LABEL: @urem_i64_pow2k_denom(
7925; CHECK-NEXT:    [[R:%.*]] = urem i64 [[X:%.*]], 4096
7926; CHECK-NEXT:    store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4
7927; CHECK-NEXT:    ret void
7928;
7929; GFX6-LABEL: urem_i64_pow2k_denom:
7930; GFX6:       ; %bb.0:
7931; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
7932; GFX6-NEXT:    s_mov_b32 s7, 0xf000
7933; GFX6-NEXT:    s_mov_b32 s6, -1
7934; GFX6-NEXT:    v_mov_b32_e32 v1, 0
7935; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
7936; GFX6-NEXT:    s_mov_b32 s4, s0
7937; GFX6-NEXT:    s_and_b32 s0, s2, 0xfff
7938; GFX6-NEXT:    s_mov_b32 s5, s1
7939; GFX6-NEXT:    v_mov_b32_e32 v0, s0
7940; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
7941; GFX6-NEXT:    s_endpgm
7942;
7943; GFX9-LABEL: urem_i64_pow2k_denom:
7944; GFX9:       ; %bb.0:
7945; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
7946; GFX9-NEXT:    v_mov_b32_e32 v1, 0
7947; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
7948; GFX9-NEXT:    s_and_b32 s2, s2, 0xfff
7949; GFX9-NEXT:    v_mov_b32_e32 v0, s2
7950; GFX9-NEXT:    global_store_dwordx2 v1, v[0:1], s[0:1]
7951; GFX9-NEXT:    s_endpgm
  ; Input IR: the divisor is the power-of-two constant 4096.
7952  %r = urem i64 %x, 4096
7953  store i64 %r, i64 addrspace(1)* %out
7954  ret void
7955}
7956
; Unsigned remainder of a scalar i64 by a denominator computed as 4096 shifted
; left by %y.
; The IR checks expect both the shl and the urem to pass through the opt run
; unchanged. The GFX6 and GFX9 checks expect the value 0x1000 to be shifted
; left with s_lshl_b64, decremented across both halves (s_add_u32 /
; s_addc_u32 with -1), and combined with a loaded value using s_and_b64.
7957define amdgpu_kernel void @urem_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %x, i64 %y) {
7958; CHECK-LABEL: @urem_i64_pow2_shl_denom(
7959; CHECK-NEXT:    [[SHL_Y:%.*]] = shl i64 4096, [[Y:%.*]]
7960; CHECK-NEXT:    [[R:%.*]] = urem i64 [[X:%.*]], [[SHL_Y]]
7961; CHECK-NEXT:    store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4
7962; CHECK-NEXT:    ret void
7963;
7964; GFX6-LABEL: urem_i64_pow2_shl_denom:
7965; GFX6:       ; %bb.0:
7966; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
7967; GFX6-NEXT:    s_load_dword s8, s[0:1], 0xd
7968; GFX6-NEXT:    s_mov_b32 s3, 0xf000
7969; GFX6-NEXT:    s_mov_b32 s2, -1
7970; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
7971; GFX6-NEXT:    s_mov_b32 s0, s4
7972; GFX6-NEXT:    s_mov_b32 s1, s5
7973; GFX6-NEXT:    s_mov_b64 s[4:5], 0x1000
7974; GFX6-NEXT:    s_lshl_b64 s[4:5], s[4:5], s8
7975; GFX6-NEXT:    s_add_u32 s4, s4, -1
7976; GFX6-NEXT:    s_addc_u32 s5, s5, -1
7977; GFX6-NEXT:    s_and_b64 s[4:5], s[6:7], s[4:5]
7978; GFX6-NEXT:    v_mov_b32_e32 v0, s4
7979; GFX6-NEXT:    v_mov_b32_e32 v1, s5
7980; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
7981; GFX6-NEXT:    s_endpgm
7982;
7983; GFX9-LABEL: urem_i64_pow2_shl_denom:
7984; GFX9:       ; %bb.0:
7985; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x34
7986; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
7987; GFX9-NEXT:    s_mov_b64 s[0:1], 0x1000
7988; GFX9-NEXT:    v_mov_b32_e32 v2, 0
7989; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
7990; GFX9-NEXT:    s_lshl_b64 s[0:1], s[0:1], s2
7991; GFX9-NEXT:    s_add_u32 s0, s0, -1
7992; GFX9-NEXT:    s_addc_u32 s1, s1, -1
7993; GFX9-NEXT:    s_and_b64 s[0:1], s[6:7], s[0:1]
7994; GFX9-NEXT:    v_mov_b32_e32 v0, s0
7995; GFX9-NEXT:    v_mov_b32_e32 v1, s1
7996; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
7997; GFX9-NEXT:    s_endpgm
  ; Input IR: the denominator is 4096 << %y, so it is not a compile-time constant.
7998  %shl.y = shl i64 4096, %y
7999  %r = urem i64 %x, %shl.y
8000  store i64 %r, i64 addrspace(1)* %out
8001  ret void
8002}
8003
; Unsigned remainder of a <2 x i64> value by the splat constant 4096.
; The IR checks expect each lane to be extracted, reduced with a scalar
; urem by 4096, and reinserted. The GFX6 and GFX9 checks expect one s_and_b32
; with 0xfff per lane and zeros (v1, copied to v3) for the remaining stored
; dwords.
8004define amdgpu_kernel void @urem_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) {
8005; CHECK-LABEL: @urem_v2i64_pow2k_denom(
8006; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0
8007; CHECK-NEXT:    [[TMP2:%.*]] = urem i64 [[TMP1]], 4096
8008; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i64 0
8009; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1
8010; CHECK-NEXT:    [[TMP5:%.*]] = urem i64 [[TMP4]], 4096
8011; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1
8012; CHECK-NEXT:    store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16
8013; CHECK-NEXT:    ret void
8014;
8015; GFX6-LABEL: urem_v2i64_pow2k_denom:
8016; GFX6:       ; %bb.0:
8017; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xd
8018; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
8019; GFX6-NEXT:    v_mov_b32_e32 v1, 0
8020; GFX6-NEXT:    s_mov_b32 s3, 0xf000
8021; GFX6-NEXT:    s_mov_b32 s2, -1
8022; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
8023; GFX6-NEXT:    s_and_b32 s4, s4, 0xfff
8024; GFX6-NEXT:    s_and_b32 s5, s6, 0xfff
8025; GFX6-NEXT:    v_mov_b32_e32 v0, s4
8026; GFX6-NEXT:    v_mov_b32_e32 v2, s5
8027; GFX6-NEXT:    v_mov_b32_e32 v3, v1
8028; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
8029; GFX6-NEXT:    s_endpgm
8030;
8031; GFX9-LABEL: urem_v2i64_pow2k_denom:
8032; GFX9:       ; %bb.0:
8033; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
8034; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
8035; GFX9-NEXT:    v_mov_b32_e32 v1, 0
8036; GFX9-NEXT:    v_mov_b32_e32 v3, v1
8037; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
8038; GFX9-NEXT:    s_and_b32 s0, s4, 0xfff
8039; GFX9-NEXT:    s_and_b32 s1, s6, 0xfff
8040; GFX9-NEXT:    v_mov_b32_e32 v0, s0
8041; GFX9-NEXT:    v_mov_b32_e32 v2, s1
8042; GFX9-NEXT:    global_store_dwordx4 v1, v[0:3], s[2:3]
8043; GFX9-NEXT:    s_endpgm
  ; Input IR: both lanes use the same power-of-two divisor.
8044  %r = urem <2 x i64> %x, <i64 4096, i64 4096>
8045  store <2 x i64> %r, <2 x i64> addrspace(1)* %out
8046  ret void
8047}
8048
; Test: unsigned remainder of a <2 x i64> value by a per-lane divisor of the
; form (4096 << %y), i.e. a shifted power of two that is not a compile-time
; constant.
; - The opt expectations show the vector urem split into one scalar i64 urem
;   per lane (extractelement / urem / insertelement).
; - The GFX6 and GFX9 llc expectations show each lane computed without a
;   division expansion: the shifted divisor has 1 subtracted from it
;   (s_add_u32 / s_addc_u32 with -1) and the result is used as an AND mask on
;   the dividend (s_and_b64).
; The assertion lines below are autogenerated (see the note on the first line
; of this file); regenerate them with utils/update_llc_test_checks.py rather
; than editing them by hand.
8049define amdgpu_kernel void @urem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x, <2 x i64> %y) {
8050; CHECK-LABEL: @urem_v2i64_pow2_shl_denom(
8051; CHECK-NEXT:    [[SHL_Y:%.*]] = shl <2 x i64> <i64 4096, i64 4096>, [[Y:%.*]]
8052; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0
8053; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 0
8054; CHECK-NEXT:    [[TMP3:%.*]] = urem i64 [[TMP1]], [[TMP2]]
8055; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x i64> undef, i64 [[TMP3]], i64 0
8056; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[X]], i64 1
8057; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 1
8058; CHECK-NEXT:    [[TMP7:%.*]] = urem i64 [[TMP5]], [[TMP6]]
8059; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[TMP7]], i64 1
8060; CHECK-NEXT:    store <2 x i64> [[TMP8]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16
8061; CHECK-NEXT:    ret void
8062;
8063; GFX6-LABEL: urem_v2i64_pow2_shl_denom:
8064; GFX6:       ; %bb.0:
8065; GFX6-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x9
8066; GFX6-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0xd
8067; GFX6-NEXT:    s_mov_b64 s[12:13], 0x1000
8068; GFX6-NEXT:    s_mov_b32 s11, 0xf000
8069; GFX6-NEXT:    s_mov_b32 s10, -1
8070; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
8071; GFX6-NEXT:    s_lshl_b64 s[6:7], s[12:13], s6
8072; GFX6-NEXT:    s_lshl_b64 s[4:5], s[12:13], s4
8073; GFX6-NEXT:    s_add_u32 s4, s4, -1
8074; GFX6-NEXT:    s_addc_u32 s5, s5, -1
8075; GFX6-NEXT:    s_and_b64 s[0:1], s[0:1], s[4:5]
8076; GFX6-NEXT:    s_add_u32 s4, s6, -1
8077; GFX6-NEXT:    s_addc_u32 s5, s7, -1
8078; GFX6-NEXT:    s_and_b64 s[2:3], s[2:3], s[4:5]
8079; GFX6-NEXT:    v_mov_b32_e32 v0, s0
8080; GFX6-NEXT:    v_mov_b32_e32 v1, s1
8081; GFX6-NEXT:    v_mov_b32_e32 v2, s2
8082; GFX6-NEXT:    v_mov_b32_e32 v3, s3
8083; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
8084; GFX6-NEXT:    s_endpgm
8085;
8086; GFX9-LABEL: urem_v2i64_pow2_shl_denom:
8087; GFX9:       ; %bb.0:
8088; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x34
8089; GFX9-NEXT:    s_mov_b64 s[2:3], 0x1000
8090; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
8091; GFX9-NEXT:    v_mov_b32_e32 v4, 0
8092; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
8093; GFX9-NEXT:    s_lshl_b64 s[10:11], s[2:3], s10
8094; GFX9-NEXT:    s_lshl_b64 s[2:3], s[2:3], s8
8095; GFX9-NEXT:    s_add_u32 s2, s2, -1
8096; GFX9-NEXT:    s_addc_u32 s3, s3, -1
8097; GFX9-NEXT:    s_and_b64 s[2:3], s[4:5], s[2:3]
8098; GFX9-NEXT:    s_add_u32 s4, s10, -1
8099; GFX9-NEXT:    s_addc_u32 s5, s11, -1
8100; GFX9-NEXT:    s_and_b64 s[4:5], s[6:7], s[4:5]
8101; GFX9-NEXT:    v_mov_b32_e32 v0, s2
8102; GFX9-NEXT:    v_mov_b32_e32 v1, s3
8103; GFX9-NEXT:    v_mov_b32_e32 v2, s4
8104; GFX9-NEXT:    v_mov_b32_e32 v3, s5
8105; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
8106; GFX9-NEXT:    s_endpgm
; Input IR: each lane's divisor is 4096 shifted left by the matching lane of %y.
8107  %shl.y = shl <2 x i64> <i64 4096, i64 4096>, %y
8108  %r = urem <2 x i64> %x, %shl.y
8109  store <2 x i64> %r, <2 x i64> addrspace(1)* %out
8110  ret void
8111}
8112
; Test: signed i64 division by the odd constant 1235195 (0x12d8fb).
; - The opt expectations show the sdiv left unchanged by the IR pass.
; - The GFX6 and GFX9 llc expectations show a long inline expansion: a float
;   reciprocal (v_rcp_f32) is converted back to integers and combined with
;   32-bit mul_lo / mul_hi and add-with-carry sequences, the candidate quotient
;   is adjusted through compare + v_cndmask selects against 0x12d8fa, and the
;   sign of %x (s_ashr_i32 ..., 31) is applied with an xor / subtract pair
;   before the store.
; The assertion lines below are autogenerated (see the note on the first line
; of this file); regenerate them with utils/update_llc_test_checks.py rather
; than editing them by hand.
8113define amdgpu_kernel void @sdiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
8114; CHECK-LABEL: @sdiv_i64_oddk_denom(
8115; CHECK-NEXT:    [[R:%.*]] = sdiv i64 [[X:%.*]], 1235195
8116; CHECK-NEXT:    store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4
8117; CHECK-NEXT:    ret void
8118;
8119; GFX6-LABEL: sdiv_i64_oddk_denom:
8120; GFX6:       ; %bb.0:
8121; GFX6-NEXT:    v_mov_b32_e32 v0, 0x4f800000
8122; GFX6-NEXT:    v_madak_f32 v0, 0, v0, 0x4996c7d8
8123; GFX6-NEXT:    v_rcp_f32_e32 v0, v0
8124; GFX6-NEXT:    s_mov_b32 s5, 0xffed2705
8125; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
8126; GFX6-NEXT:    s_mov_b32 s7, 0xf000
8127; GFX6-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
8128; GFX6-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
8129; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
8130; GFX6-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
8131; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
8132; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
8133; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
8134; GFX6-NEXT:    s_ashr_i32 s8, s3, 31
8135; GFX6-NEXT:    s_add_u32 s2, s2, s8
8136; GFX6-NEXT:    v_mul_lo_u32 v2, v1, s5
8137; GFX6-NEXT:    v_mul_hi_u32 v3, v0, s5
8138; GFX6-NEXT:    v_mul_lo_u32 v4, v0, s5
8139; GFX6-NEXT:    s_mov_b32 s9, s8
8140; GFX6-NEXT:    s_addc_u32 s3, s3, s8
8141; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
8142; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, v0, v2
8143; GFX6-NEXT:    v_mul_hi_u32 v3, v0, v4
8144; GFX6-NEXT:    v_mul_lo_u32 v5, v0, v2
8145; GFX6-NEXT:    v_mul_hi_u32 v6, v0, v2
8146; GFX6-NEXT:    v_mul_hi_u32 v7, v1, v2
8147; GFX6-NEXT:    v_mul_lo_u32 v2, v1, v2
8148; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
8149; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, 0, v6, vcc
8150; GFX6-NEXT:    v_mul_lo_u32 v6, v1, v4
8151; GFX6-NEXT:    v_mul_hi_u32 v4, v1, v4
8152; GFX6-NEXT:    s_xor_b64 s[2:3], s[2:3], s[8:9]
8153; GFX6-NEXT:    s_mov_b32 s4, s0
8154; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
8155; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v5, v4, vcc
8156; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, 0, v7, vcc
8157; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
8158; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
8159; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
8160; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
8161; GFX6-NEXT:    v_mul_lo_u32 v2, v1, s5
8162; GFX6-NEXT:    v_mul_hi_u32 v3, v0, s5
8163; GFX6-NEXT:    s_mov_b32 s0, 0x12d8fb
8164; GFX6-NEXT:    s_mov_b32 s6, -1
8165; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
8166; GFX6-NEXT:    v_mul_lo_u32 v3, v0, s5
8167; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, v0, v2
8168; GFX6-NEXT:    v_mul_lo_u32 v6, v0, v2
8169; GFX6-NEXT:    v_mul_hi_u32 v7, v0, v3
8170; GFX6-NEXT:    v_mul_hi_u32 v8, v0, v2
8171; GFX6-NEXT:    v_mul_hi_u32 v5, v1, v3
8172; GFX6-NEXT:    v_mul_lo_u32 v3, v1, v3
8173; GFX6-NEXT:    v_mul_hi_u32 v4, v1, v2
8174; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
8175; GFX6-NEXT:    v_addc_u32_e32 v7, vcc, 0, v8, vcc
8176; GFX6-NEXT:    v_mul_lo_u32 v2, v1, v2
8177; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v6, v3
8178; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v7, v5, vcc
8179; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc
8180; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
8181; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
8182; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
8183; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
8184; GFX6-NEXT:    v_mul_lo_u32 v2, s2, v1
8185; GFX6-NEXT:    v_mul_hi_u32 v3, s2, v0
8186; GFX6-NEXT:    v_mul_hi_u32 v4, s2, v1
8187; GFX6-NEXT:    v_mul_hi_u32 v5, s3, v1
8188; GFX6-NEXT:    v_mul_lo_u32 v1, s3, v1
8189; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
8190; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
8191; GFX6-NEXT:    v_mul_lo_u32 v4, s3, v0
8192; GFX6-NEXT:    v_mul_hi_u32 v0, s3, v0
8193; GFX6-NEXT:    s_mov_b32 s5, s1
8194; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
8195; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, v3, v0, vcc
8196; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, 0, v5, vcc
8197; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
8198; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v2, vcc
8199; GFX6-NEXT:    v_mul_lo_u32 v4, v1, s0
8200; GFX6-NEXT:    v_mul_hi_u32 v5, v0, s0
8201; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 2, v0
8202; GFX6-NEXT:    v_mul_lo_u32 v8, v0, s0
8203; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
8204; GFX6-NEXT:    v_add_i32_e32 v6, vcc, 1, v0
8205; GFX6-NEXT:    v_addc_u32_e32 v7, vcc, 0, v1, vcc
8206; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
8207; GFX6-NEXT:    v_mov_b32_e32 v5, s3
8208; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, s2, v8
8209; GFX6-NEXT:    v_subb_u32_e32 v4, vcc, v5, v4, vcc
8210; GFX6-NEXT:    v_subrev_i32_e32 v5, vcc, s0, v8
8211; GFX6-NEXT:    v_subbrev_u32_e32 v9, vcc, 0, v4, vcc
8212; GFX6-NEXT:    s_mov_b32 s0, 0x12d8fa
8213; GFX6-NEXT:    v_cmp_lt_u32_e32 vcc, s0, v5
8214; GFX6-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
8215; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v9
8216; GFX6-NEXT:    v_cndmask_b32_e32 v5, -1, v5, vcc
8217; GFX6-NEXT:    v_cmp_lt_u32_e64 s[0:1], s0, v8
8218; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
8219; GFX6-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[0:1]
8220; GFX6-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v4
8221; GFX6-NEXT:    v_cndmask_b32_e64 v4, -1, v5, s[0:1]
8222; GFX6-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v4
8223; GFX6-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc
8224; GFX6-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
8225; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
8226; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
8227; GFX6-NEXT:    v_xor_b32_e32 v0, s8, v0
8228; GFX6-NEXT:    v_xor_b32_e32 v1, s8, v1
8229; GFX6-NEXT:    v_mov_b32_e32 v2, s8
8230; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s8, v0
8231; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v1, v2, vcc
8232; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
8233; GFX6-NEXT:    s_endpgm
8234;
8235; GFX9-LABEL: sdiv_i64_oddk_denom:
8236; GFX9:       ; %bb.0:
8237; GFX9-NEXT:    v_mov_b32_e32 v0, 0x4f800000
8238; GFX9-NEXT:    v_madak_f32 v0, 0, v0, 0x4996c7d8
8239; GFX9-NEXT:    v_rcp_f32_e32 v0, v0
8240; GFX9-NEXT:    s_mov_b32 s2, 0xffed2705
8241; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
8242; GFX9-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
8243; GFX9-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
8244; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
8245; GFX9-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
8246; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
8247; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
8248; GFX9-NEXT:    v_mul_lo_u32 v2, v1, s2
8249; GFX9-NEXT:    v_mul_hi_u32 v3, v0, s2
8250; GFX9-NEXT:    v_mul_lo_u32 v4, v0, s2
8251; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
8252; GFX9-NEXT:    v_sub_u32_e32 v2, v2, v0
8253; GFX9-NEXT:    v_mul_hi_u32 v3, v0, v4
8254; GFX9-NEXT:    v_mul_lo_u32 v6, v0, v2
8255; GFX9-NEXT:    v_mul_hi_u32 v7, v0, v2
8256; GFX9-NEXT:    v_mul_lo_u32 v5, v1, v4
8257; GFX9-NEXT:    v_mul_hi_u32 v4, v1, v4
8258; GFX9-NEXT:    v_mul_hi_u32 v8, v1, v2
8259; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v6
8260; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, 0, v7, vcc
8261; GFX9-NEXT:    v_mul_lo_u32 v2, v1, v2
8262; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v5
8263; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v6, v4, vcc
8264; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v8, vcc
8265; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
8266; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
8267; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
8268; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
8269; GFX9-NEXT:    v_mul_lo_u32 v2, v1, s2
8270; GFX9-NEXT:    v_mul_hi_u32 v3, v0, s2
8271; GFX9-NEXT:    v_mul_lo_u32 v4, v0, s2
8272; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
8273; GFX9-NEXT:    s_ashr_i32 s2, s7, 31
8274; GFX9-NEXT:    s_add_u32 s0, s6, s2
8275; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
8276; GFX9-NEXT:    v_sub_u32_e32 v2, v2, v0
8277; GFX9-NEXT:    v_mul_lo_u32 v6, v0, v2
8278; GFX9-NEXT:    v_mul_hi_u32 v7, v0, v4
8279; GFX9-NEXT:    v_mul_hi_u32 v8, v0, v2
8280; GFX9-NEXT:    v_mul_hi_u32 v5, v1, v4
8281; GFX9-NEXT:    v_mul_lo_u32 v4, v1, v4
8282; GFX9-NEXT:    v_mul_hi_u32 v3, v1, v2
8283; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, v7, v6
8284; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v8, vcc
8285; GFX9-NEXT:    v_mul_lo_u32 v2, v1, v2
8286; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v6, v4
8287; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v7, v5, vcc
8288; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
8289; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v4, v2
8290; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
8291; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
8292; GFX9-NEXT:    s_mov_b32 s3, s2
8293; GFX9-NEXT:    s_addc_u32 s1, s7, s2
8294; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
8295; GFX9-NEXT:    s_xor_b64 s[0:1], s[0:1], s[2:3]
8296; GFX9-NEXT:    v_mul_lo_u32 v2, s0, v1
8297; GFX9-NEXT:    v_mul_hi_u32 v3, s0, v0
8298; GFX9-NEXT:    v_mul_hi_u32 v5, s0, v1
8299; GFX9-NEXT:    v_mul_hi_u32 v6, s1, v1
8300; GFX9-NEXT:    v_mul_lo_u32 v1, s1, v1
8301; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
8302; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v5, vcc
8303; GFX9-NEXT:    v_mul_lo_u32 v5, s1, v0
8304; GFX9-NEXT:    v_mul_hi_u32 v0, s1, v0
8305; GFX9-NEXT:    s_mov_b32 s3, 0x12d8fb
8306; GFX9-NEXT:    v_mov_b32_e32 v4, 0
8307; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v5
8308; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, v3, v0, vcc
8309; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, 0, v6, vcc
8310; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v1
8311; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v2, vcc
8312; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, 2, v0
8313; GFX9-NEXT:    v_mul_lo_u32 v5, v1, s3
8314; GFX9-NEXT:    v_mul_hi_u32 v6, v0, s3
8315; GFX9-NEXT:    v_mul_lo_u32 v9, v0, s3
8316; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
8317; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, 1, v0
8318; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, 0, v1, vcc
8319; GFX9-NEXT:    v_add_u32_e32 v5, v6, v5
8320; GFX9-NEXT:    v_mov_b32_e32 v6, s1
8321; GFX9-NEXT:    v_sub_co_u32_e32 v9, vcc, s0, v9
8322; GFX9-NEXT:    v_subb_co_u32_e32 v5, vcc, v6, v5, vcc
8323; GFX9-NEXT:    v_subrev_co_u32_e32 v6, vcc, s3, v9
8324; GFX9-NEXT:    v_subbrev_co_u32_e32 v10, vcc, 0, v5, vcc
8325; GFX9-NEXT:    s_mov_b32 s0, 0x12d8fa
8326; GFX9-NEXT:    v_cmp_lt_u32_e32 vcc, s0, v6
8327; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
8328; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v10
8329; GFX9-NEXT:    v_cndmask_b32_e32 v6, -1, v6, vcc
8330; GFX9-NEXT:    v_cmp_lt_u32_e64 s[0:1], s0, v9
8331; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
8332; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[0:1]
8333; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v5
8334; GFX9-NEXT:    v_cndmask_b32_e64 v5, -1, v6, s[0:1]
8335; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v5
8336; GFX9-NEXT:    v_cndmask_b32_e32 v2, v7, v2, vcc
8337; GFX9-NEXT:    v_cndmask_b32_e32 v3, v8, v3, vcc
8338; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
8339; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
8340; GFX9-NEXT:    v_xor_b32_e32 v0, s2, v0
8341; GFX9-NEXT:    v_xor_b32_e32 v1, s2, v1
8342; GFX9-NEXT:    v_mov_b32_e32 v2, s2
8343; GFX9-NEXT:    v_subrev_co_u32_e32 v0, vcc, s2, v0
8344; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v2, vcc
8345; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[4:5]
8346; GFX9-NEXT:    s_endpgm
; Input IR: a single scalar sdiv by the odd constant, stored to %out.
8347  %r = sdiv i64 %x, 1235195
8348  store i64 %r, i64 addrspace(1)* %out
8349  ret void
8350}
8351
; Test: signed i64 division by the constant power of two 4096.
; - The opt expectations show the sdiv left unchanged by the IR pass.
; - The GFX6 and GFX9 llc expectations show a shift-based sequence instead of
;   a division expansion: the sign of the high dword is spread with
;   s_ashr_i32 ..., 31 and shifted right logically by 20 (giving 0xfff for a
;   negative dividend and 0 otherwise), that bias is added to the 64-bit
;   dividend with carry, and the sum is shifted right arithmetically by 12.
; The assertion lines below are autogenerated (see the note on the first line
; of this file); regenerate them with utils/update_llc_test_checks.py rather
; than editing them by hand.
8352define amdgpu_kernel void @sdiv_i64_pow2k_denom(i64 addrspace(1)* %out, i64 %x) {
8353; CHECK-LABEL: @sdiv_i64_pow2k_denom(
8354; CHECK-NEXT:    [[R:%.*]] = sdiv i64 [[X:%.*]], 4096
8355; CHECK-NEXT:    store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4
8356; CHECK-NEXT:    ret void
8357;
8358; GFX6-LABEL: sdiv_i64_pow2k_denom:
8359; GFX6:       ; %bb.0:
8360; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
8361; GFX6-NEXT:    s_mov_b32 s7, 0xf000
8362; GFX6-NEXT:    s_mov_b32 s6, -1
8363; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
8364; GFX6-NEXT:    s_mov_b32 s4, s0
8365; GFX6-NEXT:    s_ashr_i32 s0, s3, 31
8366; GFX6-NEXT:    s_lshr_b32 s0, s0, 20
8367; GFX6-NEXT:    s_add_u32 s0, s2, s0
8368; GFX6-NEXT:    s_mov_b32 s5, s1
8369; GFX6-NEXT:    s_addc_u32 s1, s3, 0
8370; GFX6-NEXT:    s_ashr_i64 s[0:1], s[0:1], 12
8371; GFX6-NEXT:    v_mov_b32_e32 v0, s0
8372; GFX6-NEXT:    v_mov_b32_e32 v1, s1
8373; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
8374; GFX6-NEXT:    s_endpgm
8375;
8376; GFX9-LABEL: sdiv_i64_pow2k_denom:
8377; GFX9:       ; %bb.0:
8378; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
8379; GFX9-NEXT:    v_mov_b32_e32 v2, 0
8380; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
8381; GFX9-NEXT:    s_ashr_i32 s4, s3, 31
8382; GFX9-NEXT:    s_lshr_b32 s4, s4, 20
8383; GFX9-NEXT:    s_add_u32 s2, s2, s4
8384; GFX9-NEXT:    s_addc_u32 s3, s3, 0
8385; GFX9-NEXT:    s_ashr_i64 s[2:3], s[2:3], 12
8386; GFX9-NEXT:    v_mov_b32_e32 v0, s2
8387; GFX9-NEXT:    v_mov_b32_e32 v1, s3
8388; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
8389; GFX9-NEXT:    s_endpgm
; Input IR: a single scalar sdiv by 4096, stored to %out.
8390  %r = sdiv i64 %x, 4096
8391  store i64 %r, i64 addrspace(1)* %out
8392  ret void
8393}
8394
; Test: signed i64 division where the divisor is (4096 << %y), i.e. a shifted
; power of two that is not a compile-time constant.
; - The opt expectations show the shl and sdiv left unchanged by the IR pass.
; - The GFX6 and GFX9 llc expectations show a full 64-bit division expansion
;   rather than a shift: divisor and dividend each get a sign mask
;   (s_ashr_i32 ..., 31) that is added and xor-ed in, a float reciprocal
;   (v_rcp_f32) feeds 32-bit mul_lo / mul_hi and add-with-carry sequences, the
;   candidate quotient is adjusted through compare + v_cndmask selects, and the
;   xor of the two sign masks is applied to the result before the store.
; The assertion lines below are autogenerated (see the note on the first line
; of this file); regenerate them with utils/update_llc_test_checks.py rather
; than editing them by hand.
8395define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %x, i64 %y) {
8396; CHECK-LABEL: @sdiv_i64_pow2_shl_denom(
8397; CHECK-NEXT:    [[SHL_Y:%.*]] = shl i64 4096, [[Y:%.*]]
8398; CHECK-NEXT:    [[R:%.*]] = sdiv i64 [[X:%.*]], [[SHL_Y]]
8399; CHECK-NEXT:    store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4
8400; CHECK-NEXT:    ret void
8401;
8402; GFX6-LABEL: sdiv_i64_pow2_shl_denom:
8403; GFX6:       ; %bb.0:
8404; GFX6-NEXT:    s_load_dword s4, s[0:1], 0xd
8405; GFX6-NEXT:    s_mov_b64 s[2:3], 0x1000
8406; GFX6-NEXT:    s_mov_b32 s7, 0xf000
8407; GFX6-NEXT:    s_mov_b32 s6, -1
8408; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
8409; GFX6-NEXT:    s_lshl_b64 s[2:3], s[2:3], s4
8410; GFX6-NEXT:    s_ashr_i32 s8, s3, 31
8411; GFX6-NEXT:    s_add_u32 s2, s2, s8
8412; GFX6-NEXT:    s_mov_b32 s9, s8
8413; GFX6-NEXT:    s_addc_u32 s3, s3, s8
8414; GFX6-NEXT:    s_xor_b64 s[10:11], s[2:3], s[8:9]
8415; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s10
8416; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s11
8417; GFX6-NEXT:    s_sub_u32 s4, 0, s10
8418; GFX6-NEXT:    s_subb_u32 s5, 0, s11
8419; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
8420; GFX6-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
8421; GFX6-NEXT:    v_rcp_f32_e32 v0, v0
8422; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
8423; GFX6-NEXT:    s_ashr_i32 s12, s3, 31
8424; GFX6-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
8425; GFX6-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
8426; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
8427; GFX6-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
8428; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
8429; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
8430; GFX6-NEXT:    s_add_u32 s2, s2, s12
8431; GFX6-NEXT:    s_mov_b32 s13, s12
8432; GFX6-NEXT:    v_mul_lo_u32 v2, s4, v1
8433; GFX6-NEXT:    v_mul_hi_u32 v3, s4, v0
8434; GFX6-NEXT:    v_mul_lo_u32 v5, s5, v0
8435; GFX6-NEXT:    v_mul_lo_u32 v4, s4, v0
8436; GFX6-NEXT:    s_addc_u32 s3, s3, s12
8437; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
8438; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
8439; GFX6-NEXT:    v_mul_hi_u32 v3, v0, v4
8440; GFX6-NEXT:    v_mul_lo_u32 v5, v0, v2
8441; GFX6-NEXT:    v_mul_hi_u32 v6, v0, v2
8442; GFX6-NEXT:    v_mul_hi_u32 v7, v1, v2
8443; GFX6-NEXT:    v_mul_lo_u32 v2, v1, v2
8444; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
8445; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, 0, v6, vcc
8446; GFX6-NEXT:    v_mul_lo_u32 v6, v1, v4
8447; GFX6-NEXT:    v_mul_hi_u32 v4, v1, v4
8448; GFX6-NEXT:    s_xor_b64 s[2:3], s[2:3], s[12:13]
8449; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
8450; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v5, v4, vcc
8451; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, 0, v7, vcc
8452; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
8453; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
8454; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
8455; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
8456; GFX6-NEXT:    v_mul_lo_u32 v2, s4, v1
8457; GFX6-NEXT:    v_mul_hi_u32 v3, s4, v0
8458; GFX6-NEXT:    v_mul_lo_u32 v4, s5, v0
8459; GFX6-NEXT:    s_mov_b32 s5, s1
8460; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
8461; GFX6-NEXT:    v_mul_lo_u32 v3, s4, v0
8462; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
8463; GFX6-NEXT:    v_mul_lo_u32 v6, v0, v2
8464; GFX6-NEXT:    v_mul_hi_u32 v7, v0, v3
8465; GFX6-NEXT:    v_mul_hi_u32 v8, v0, v2
8466; GFX6-NEXT:    v_mul_hi_u32 v5, v1, v3
8467; GFX6-NEXT:    v_mul_lo_u32 v3, v1, v3
8468; GFX6-NEXT:    v_mul_hi_u32 v4, v1, v2
8469; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
8470; GFX6-NEXT:    v_addc_u32_e32 v7, vcc, 0, v8, vcc
8471; GFX6-NEXT:    v_mul_lo_u32 v2, v1, v2
8472; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v6, v3
8473; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v7, v5, vcc
8474; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc
8475; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
8476; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
8477; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
8478; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
8479; GFX6-NEXT:    v_mul_lo_u32 v2, s2, v1
8480; GFX6-NEXT:    v_mul_hi_u32 v3, s2, v0
8481; GFX6-NEXT:    v_mul_hi_u32 v4, s2, v1
8482; GFX6-NEXT:    v_mul_hi_u32 v5, s3, v1
8483; GFX6-NEXT:    v_mul_lo_u32 v1, s3, v1
8484; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
8485; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
8486; GFX6-NEXT:    v_mul_lo_u32 v4, s3, v0
8487; GFX6-NEXT:    v_mul_hi_u32 v0, s3, v0
8488; GFX6-NEXT:    s_mov_b32 s4, s0
8489; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
8490; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, v3, v0, vcc
8491; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, 0, v5, vcc
8492; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
8493; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v2, vcc
8494; GFX6-NEXT:    v_mul_lo_u32 v2, s10, v1
8495; GFX6-NEXT:    v_mul_hi_u32 v3, s10, v0
8496; GFX6-NEXT:    v_mul_lo_u32 v4, s11, v0
8497; GFX6-NEXT:    v_mov_b32_e32 v5, s11
8498; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
8499; GFX6-NEXT:    v_mul_lo_u32 v3, s10, v0
8500; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
8501; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, s3, v2
8502; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s2, v3
8503; GFX6-NEXT:    v_subb_u32_e64 v4, s[0:1], v4, v5, vcc
8504; GFX6-NEXT:    v_subrev_i32_e64 v5, s[0:1], s10, v3
8505; GFX6-NEXT:    v_subbrev_u32_e64 v4, s[0:1], 0, v4, s[0:1]
8506; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s11, v4
8507; GFX6-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[0:1]
8508; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s10, v5
8509; GFX6-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[0:1]
8510; GFX6-NEXT:    v_cmp_eq_u32_e64 s[0:1], s11, v4
8511; GFX6-NEXT:    v_cndmask_b32_e64 v4, v6, v5, s[0:1]
8512; GFX6-NEXT:    v_add_i32_e64 v5, s[0:1], 2, v0
8513; GFX6-NEXT:    v_addc_u32_e64 v6, s[0:1], 0, v1, s[0:1]
8514; GFX6-NEXT:    v_add_i32_e64 v7, s[0:1], 1, v0
8515; GFX6-NEXT:    v_addc_u32_e64 v8, s[0:1], 0, v1, s[0:1]
8516; GFX6-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v4
8517; GFX6-NEXT:    v_cndmask_b32_e64 v4, v8, v6, s[0:1]
8518; GFX6-NEXT:    v_mov_b32_e32 v6, s3
8519; GFX6-NEXT:    v_subb_u32_e32 v2, vcc, v6, v2, vcc
8520; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s11, v2
8521; GFX6-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
8522; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s10, v3
8523; GFX6-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
8524; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, s11, v2
8525; GFX6-NEXT:    v_cndmask_b32_e32 v2, v6, v3, vcc
8526; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
8527; GFX6-NEXT:    v_cndmask_b32_e64 v2, v7, v5, s[0:1]
8528; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
8529; GFX6-NEXT:    s_xor_b64 s[0:1], s[12:13], s[8:9]
8530; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
8531; GFX6-NEXT:    v_xor_b32_e32 v0, s0, v0
8532; GFX6-NEXT:    v_xor_b32_e32 v1, s1, v1
8533; GFX6-NEXT:    v_mov_b32_e32 v2, s1
8534; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s0, v0
8535; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v1, v2, vcc
8536; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
8537; GFX6-NEXT:    s_endpgm
8538;
8539; GFX9-LABEL: sdiv_i64_pow2_shl_denom:
8540; GFX9:       ; %bb.0:
8541; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x34
8542; GFX9-NEXT:    s_mov_b64 s[2:3], 0x1000
8543; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
8544; GFX9-NEXT:    s_lshl_b64 s[4:5], s[2:3], s4
8545; GFX9-NEXT:    s_ashr_i32 s2, s5, 31
8546; GFX9-NEXT:    s_add_u32 s4, s4, s2
8547; GFX9-NEXT:    s_mov_b32 s3, s2
8548; GFX9-NEXT:    s_addc_u32 s5, s5, s2
8549; GFX9-NEXT:    s_xor_b64 s[8:9], s[4:5], s[2:3]
8550; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s8
8551; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s9
8552; GFX9-NEXT:    s_sub_u32 s10, 0, s8
8553; GFX9-NEXT:    s_subb_u32 s4, 0, s9
8554; GFX9-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
8555; GFX9-NEXT:    v_rcp_f32_e32 v0, v0
8556; GFX9-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
8557; GFX9-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
8558; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
8559; GFX9-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
8560; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
8561; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
8562; GFX9-NEXT:    v_mul_lo_u32 v2, s10, v1
8563; GFX9-NEXT:    v_mul_hi_u32 v3, s10, v0
8564; GFX9-NEXT:    v_mul_lo_u32 v5, s4, v0
8565; GFX9-NEXT:    v_mul_lo_u32 v4, s10, v0
8566; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
8567; GFX9-NEXT:    v_add_u32_e32 v2, v2, v5
8568; GFX9-NEXT:    v_mul_hi_u32 v3, v0, v4
8569; GFX9-NEXT:    v_mul_lo_u32 v5, v0, v2
8570; GFX9-NEXT:    v_mul_hi_u32 v7, v0, v2
8571; GFX9-NEXT:    v_mul_hi_u32 v6, v1, v4
8572; GFX9-NEXT:    v_mul_lo_u32 v4, v1, v4
8573; GFX9-NEXT:    v_mul_hi_u32 v8, v1, v2
8574; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v5
8575; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v7, vcc
8576; GFX9-NEXT:    v_mul_lo_u32 v2, v1, v2
8577; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v4
8578; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v5, v6, vcc
8579; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v8, vcc
8580; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
8581; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
8582; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
8583; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
8584; GFX9-NEXT:    v_mul_lo_u32 v2, s10, v1
8585; GFX9-NEXT:    v_mul_hi_u32 v3, s10, v0
8586; GFX9-NEXT:    v_mul_lo_u32 v4, s4, v0
8587; GFX9-NEXT:    v_mul_lo_u32 v5, s10, v0
8588; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
8589; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
8590; GFX9-NEXT:    v_add_u32_e32 v2, v2, v4
8591; GFX9-NEXT:    v_mul_lo_u32 v6, v0, v2
8592; GFX9-NEXT:    v_mul_hi_u32 v7, v0, v5
8593; GFX9-NEXT:    v_mul_hi_u32 v8, v0, v2
8594; GFX9-NEXT:    v_mul_hi_u32 v4, v1, v5
8595; GFX9-NEXT:    v_mul_lo_u32 v5, v1, v5
8596; GFX9-NEXT:    v_mul_hi_u32 v3, v1, v2
8597; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, v7, v6
8598; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v8, vcc
8599; GFX9-NEXT:    v_mul_lo_u32 v2, v1, v2
8600; GFX9-NEXT:    v_add_co_u32_e32 v5, vcc, v6, v5
8601; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v7, v4, vcc
8602; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
8603; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v4, v2
8604; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
8605; GFX9-NEXT:    s_ashr_i32 s10, s7, 31
8606; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
8607; GFX9-NEXT:    s_add_u32 s0, s6, s10
8608; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
8609; GFX9-NEXT:    s_mov_b32 s11, s10
8610; GFX9-NEXT:    s_addc_u32 s1, s7, s10
8611; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
8612; GFX9-NEXT:    s_xor_b64 s[6:7], s[0:1], s[10:11]
8613; GFX9-NEXT:    v_mul_lo_u32 v2, s6, v1
8614; GFX9-NEXT:    v_mul_hi_u32 v3, s6, v0
8615; GFX9-NEXT:    v_mul_hi_u32 v4, s6, v1
8616; GFX9-NEXT:    v_mul_hi_u32 v5, s7, v1
8617; GFX9-NEXT:    v_mul_lo_u32 v1, s7, v1
8618; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
8619; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
8620; GFX9-NEXT:    v_mul_lo_u32 v4, s7, v0
8621; GFX9-NEXT:    v_mul_hi_u32 v0, s7, v0
8622; GFX9-NEXT:    v_mov_b32_e32 v6, s9
8623; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
8624; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, v3, v0, vcc
8625; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, 0, v5, vcc
8626; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v1
8627; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v2, vcc
8628; GFX9-NEXT:    v_mul_lo_u32 v2, s8, v1
8629; GFX9-NEXT:    v_mul_hi_u32 v3, s8, v0
8630; GFX9-NEXT:    v_mul_lo_u32 v4, s9, v0
8631; GFX9-NEXT:    v_mov_b32_e32 v5, 0
8632; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
8633; GFX9-NEXT:    v_mul_lo_u32 v3, s8, v0
8634; GFX9-NEXT:    v_add_u32_e32 v2, v2, v4
8635; GFX9-NEXT:    v_sub_u32_e32 v4, s7, v2
8636; GFX9-NEXT:    v_sub_co_u32_e32 v3, vcc, s6, v3
8637; GFX9-NEXT:    v_subb_co_u32_e64 v4, s[0:1], v4, v6, vcc
8638; GFX9-NEXT:    v_subrev_co_u32_e64 v6, s[0:1], s8, v3
8639; GFX9-NEXT:    v_subbrev_co_u32_e64 v4, s[0:1], 0, v4, s[0:1]
8640; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s9, v4
8641; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[0:1]
8642; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s8, v6
8643; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[0:1]
8644; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s9, v4
8645; GFX9-NEXT:    v_cndmask_b32_e64 v4, v7, v6, s[0:1]
8646; GFX9-NEXT:    v_add_co_u32_e64 v6, s[0:1], 2, v0
8647; GFX9-NEXT:    v_addc_co_u32_e64 v7, s[0:1], 0, v1, s[0:1]
8648; GFX9-NEXT:    v_add_co_u32_e64 v8, s[0:1], 1, v0
8649; GFX9-NEXT:    v_addc_co_u32_e64 v9, s[0:1], 0, v1, s[0:1]
8650; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v4
8651; GFX9-NEXT:    v_cndmask_b32_e64 v4, v9, v7, s[0:1]
8652; GFX9-NEXT:    v_mov_b32_e32 v7, s7
8653; GFX9-NEXT:    v_subb_co_u32_e32 v2, vcc, v7, v2, vcc
8654; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s9, v2
8655; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
8656; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s8, v3
8657; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
8658; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, s9, v2
8659; GFX9-NEXT:    v_cndmask_b32_e32 v2, v7, v3, vcc
8660; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
8661; GFX9-NEXT:    v_cndmask_b32_e64 v2, v8, v6, s[0:1]
8662; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
8663; GFX9-NEXT:    s_xor_b64 s[0:1], s[10:11], s[2:3]
8664; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
8665; GFX9-NEXT:    v_xor_b32_e32 v0, s0, v0
8666; GFX9-NEXT:    v_xor_b32_e32 v1, s1, v1
8667; GFX9-NEXT:    v_mov_b32_e32 v2, s1
8668; GFX9-NEXT:    v_subrev_co_u32_e32 v0, vcc, s0, v0
8669; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v2, vcc
8670; GFX9-NEXT:    global_store_dwordx2 v5, v[0:1], s[4:5]
8671; GFX9-NEXT:    s_endpgm
; Input IR: the divisor is 4096 shifted left by the runtime value %y.
8672  %shl.y = shl i64 4096, %y
8673  %r = sdiv i64 %x, %shl.y
8674  store i64 %r, i64 addrspace(1)* %out
8675  ret void
8676}
8677
; Test: signed division of a <2 x i64> value by the constant power of two 4096
; in both lanes.
; - The opt expectations show the vector sdiv split into one scalar
;   sdiv i64 ..., 4096 per lane (extractelement / sdiv / insertelement).
; - The GFX6 and GFX9 llc expectations show, for each lane, a shift-based
;   sequence instead of a division expansion (s_ashr_i32 ..., 31;
;   s_lshr_b32 ..., 20; 64-bit add with carry; s_ashr_i64 ..., 12), with both
;   lanes written by a single dwordx4 store.
; The assertion lines below are autogenerated (see the note on the first line
; of this file); regenerate them with utils/update_llc_test_checks.py rather
; than editing them by hand.
8678define amdgpu_kernel void @sdiv_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) {
8679; CHECK-LABEL: @sdiv_v2i64_pow2k_denom(
8680; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0
8681; CHECK-NEXT:    [[TMP2:%.*]] = sdiv i64 [[TMP1]], 4096
8682; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i64 0
8683; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1
8684; CHECK-NEXT:    [[TMP5:%.*]] = sdiv i64 [[TMP4]], 4096
8685; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1
8686; CHECK-NEXT:    store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16
8687; CHECK-NEXT:    ret void
8688;
8689; GFX6-LABEL: sdiv_v2i64_pow2k_denom:
8690; GFX6:       ; %bb.0:
8691; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xd
8692; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
8693; GFX6-NEXT:    s_mov_b32 s3, 0xf000
8694; GFX6-NEXT:    s_mov_b32 s2, -1
8695; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
8696; GFX6-NEXT:    s_ashr_i32 s8, s5, 31
8697; GFX6-NEXT:    s_lshr_b32 s8, s8, 20
8698; GFX6-NEXT:    s_add_u32 s4, s4, s8
8699; GFX6-NEXT:    s_addc_u32 s5, s5, 0
8700; GFX6-NEXT:    s_ashr_i32 s8, s7, 31
8701; GFX6-NEXT:    s_ashr_i64 s[4:5], s[4:5], 12
8702; GFX6-NEXT:    s_lshr_b32 s8, s8, 20
8703; GFX6-NEXT:    s_add_u32 s6, s6, s8
8704; GFX6-NEXT:    s_addc_u32 s7, s7, 0
8705; GFX6-NEXT:    s_ashr_i64 s[6:7], s[6:7], 12
8706; GFX6-NEXT:    v_mov_b32_e32 v0, s4
8707; GFX6-NEXT:    v_mov_b32_e32 v1, s5
8708; GFX6-NEXT:    v_mov_b32_e32 v2, s6
8709; GFX6-NEXT:    v_mov_b32_e32 v3, s7
8710; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
8711; GFX6-NEXT:    s_endpgm
8712;
8713; GFX9-LABEL: sdiv_v2i64_pow2k_denom:
8714; GFX9:       ; %bb.0:
8715; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
8716; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
8717; GFX9-NEXT:    v_mov_b32_e32 v4, 0
8718; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
8719; GFX9-NEXT:    s_ashr_i32 s0, s5, 31
8720; GFX9-NEXT:    s_lshr_b32 s0, s0, 20
8721; GFX9-NEXT:    s_add_u32 s0, s4, s0
8722; GFX9-NEXT:    s_addc_u32 s1, s5, 0
8723; GFX9-NEXT:    s_ashr_i32 s4, s7, 31
8724; GFX9-NEXT:    s_ashr_i64 s[0:1], s[0:1], 12
8725; GFX9-NEXT:    s_lshr_b32 s4, s4, 20
8726; GFX9-NEXT:    s_add_u32 s4, s6, s4
8727; GFX9-NEXT:    s_addc_u32 s5, s7, 0
8728; GFX9-NEXT:    s_ashr_i64 s[4:5], s[4:5], 12
8729; GFX9-NEXT:    v_mov_b32_e32 v0, s0
8730; GFX9-NEXT:    v_mov_b32_e32 v1, s1
8731; GFX9-NEXT:    v_mov_b32_e32 v2, s4
8732; GFX9-NEXT:    v_mov_b32_e32 v3, s5
8733; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
8734; GFX9-NEXT:    s_endpgm
; Input IR: a single vector sdiv by a splat of 4096, stored to %out.
8735  %r = sdiv <2 x i64> %x, <i64 4096, i64 4096>
8736  store <2 x i64> %r, <2 x i64> addrspace(1)* %out
8737  ret void
8738}
8739
; Signed <2 x i64> division by a constant vector whose two lanes differ:
; element 0 is divided by 4096 (a power of two) and element 1 by 4095 (not a
; power of two). The quotient vector is stored to %out.
;
; What the assertions below pin down:
;   * CHECK (opt -amdgpu-codegenprepare): the vector sdiv is split into one
;     scalar `sdiv i64` per element (by 4096 and by 4095), and the results are
;     reassembled with insertelement before the store.
;   * GFX6 / GFX9 (llc): the 4096 lane is expected as a short scalar sequence
;     ending in `s_ashr_i64 ..., 12`, while the 4095 lane is expected as a long
;     v_rcp_f32 / v_mul_hi_u32 / v_mul_lo_u32 expansion followed by compare and
;     select fix-ups and a final xor/subtract with the sign mask of the dividend.
;
; The assertions are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
;
; NOTE(review): the `ssdiv` prefix looks like a typo for `sdiv` (the
; neighbouring tests are named sdiv_*). It is left unchanged here because the
; CHECK-LABEL/GFX6-LABEL/GFX9-LABEL lines match on the function name.
8740define amdgpu_kernel void @ssdiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) {
8741; CHECK-LABEL: @ssdiv_v2i64_mixed_pow2k_denom(
8742; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0
8743; CHECK-NEXT:    [[TMP2:%.*]] = sdiv i64 [[TMP1]], 4096
8744; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i64 0
8745; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1
8746; CHECK-NEXT:    [[TMP5:%.*]] = sdiv i64 [[TMP4]], 4095
8747; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1
8748; CHECK-NEXT:    store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16
8749; CHECK-NEXT:    ret void
8750;
8751; GFX6-LABEL: ssdiv_v2i64_mixed_pow2k_denom:
8752; GFX6:       ; %bb.0:
8753; GFX6-NEXT:    v_mov_b32_e32 v0, 0x457ff000
8754; GFX6-NEXT:    v_mov_b32_e32 v1, 0x4f800000
8755; GFX6-NEXT:    v_mac_f32_e32 v0, 0, v1
8756; GFX6-NEXT:    v_rcp_f32_e32 v0, v0
8757; GFX6-NEXT:    s_movk_i32 s6, 0xf001
8758; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
8759; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xd
8760; GFX6-NEXT:    s_mov_b32 s7, 0xf000
8761; GFX6-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
8762; GFX6-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
8763; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
8764; GFX6-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
8765; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
8766; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
8767; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
8768; GFX6-NEXT:    s_ashr_i32 s8, s1, 31
8769; GFX6-NEXT:    s_lshr_b32 s8, s8, 20
8770; GFX6-NEXT:    v_mul_hi_u32 v2, v0, s6
8771; GFX6-NEXT:    v_mul_lo_u32 v3, v1, s6
8772; GFX6-NEXT:    s_add_u32 s0, s0, s8
8773; GFX6-NEXT:    s_addc_u32 s1, s1, 0
8774; GFX6-NEXT:    s_ashr_i64 s[8:9], s[0:1], 12
8775; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
8776; GFX6-NEXT:    v_mul_lo_u32 v3, v0, s6
8777; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, v0, v2
8778; GFX6-NEXT:    v_mul_lo_u32 v4, v0, v2
8779; GFX6-NEXT:    v_mul_hi_u32 v5, v0, v3
8780; GFX6-NEXT:    v_mul_hi_u32 v6, v0, v2
8781; GFX6-NEXT:    v_mul_hi_u32 v7, v1, v2
8782; GFX6-NEXT:    v_mul_lo_u32 v2, v1, v2
8783; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
8784; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, 0, v6, vcc
8785; GFX6-NEXT:    v_mul_lo_u32 v6, v1, v3
8786; GFX6-NEXT:    v_mul_hi_u32 v3, v1, v3
8787; GFX6-NEXT:    s_ashr_i32 s10, s3, 31
8788; GFX6-NEXT:    s_add_u32 s0, s2, s10
8789; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
8790; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v5, v3, vcc
8791; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, 0, v7, vcc
8792; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
8793; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
8794; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
8795; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
8796; GFX6-NEXT:    v_mul_lo_u32 v2, v1, s6
8797; GFX6-NEXT:    v_mul_hi_u32 v3, v0, s6
8798; GFX6-NEXT:    s_mov_b32 s11, s10
8799; GFX6-NEXT:    s_addc_u32 s1, s3, s10
8800; GFX6-NEXT:    s_xor_b64 s[0:1], s[0:1], s[10:11]
8801; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
8802; GFX6-NEXT:    v_mul_lo_u32 v3, v0, s6
8803; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, v0, v2
8804; GFX6-NEXT:    v_mul_lo_u32 v6, v0, v2
8805; GFX6-NEXT:    v_mul_hi_u32 v7, v0, v3
8806; GFX6-NEXT:    v_mul_hi_u32 v8, v0, v2
8807; GFX6-NEXT:    v_mul_hi_u32 v5, v1, v3
8808; GFX6-NEXT:    v_mul_lo_u32 v3, v1, v3
8809; GFX6-NEXT:    v_mul_hi_u32 v4, v1, v2
8810; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
8811; GFX6-NEXT:    v_addc_u32_e32 v7, vcc, 0, v8, vcc
8812; GFX6-NEXT:    v_mul_lo_u32 v2, v1, v2
8813; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v6, v3
8814; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v7, v5, vcc
8815; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc
8816; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
8817; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
8818; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
8819; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
8820; GFX6-NEXT:    v_mul_lo_u32 v2, s0, v1
8821; GFX6-NEXT:    v_mul_hi_u32 v3, s0, v0
8822; GFX6-NEXT:    v_mul_hi_u32 v4, s0, v1
8823; GFX6-NEXT:    v_mul_hi_u32 v5, s1, v1
8824; GFX6-NEXT:    v_mul_lo_u32 v1, s1, v1
8825; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
8826; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
8827; GFX6-NEXT:    v_mul_lo_u32 v4, s1, v0
8828; GFX6-NEXT:    v_mul_hi_u32 v0, s1, v0
8829; GFX6-NEXT:    s_movk_i32 s2, 0xfff
8830; GFX6-NEXT:    s_mov_b32 s6, -1
8831; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
8832; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, v3, v0, vcc
8833; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, 0, v5, vcc
8834; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
8835; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v2, vcc
8836; GFX6-NEXT:    v_mul_lo_u32 v4, v1, s2
8837; GFX6-NEXT:    v_mul_hi_u32 v5, v0, s2
8838; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 2, v0
8839; GFX6-NEXT:    v_mul_lo_u32 v8, v0, s2
8840; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
8841; GFX6-NEXT:    v_add_i32_e32 v6, vcc, 1, v0
8842; GFX6-NEXT:    v_addc_u32_e32 v7, vcc, 0, v1, vcc
8843; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
8844; GFX6-NEXT:    v_mov_b32_e32 v5, s1
8845; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, s0, v8
8846; GFX6-NEXT:    v_subb_u32_e32 v4, vcc, v5, v4, vcc
8847; GFX6-NEXT:    v_subrev_i32_e32 v5, vcc, s2, v8
8848; GFX6-NEXT:    v_subbrev_u32_e32 v9, vcc, 0, v4, vcc
8849; GFX6-NEXT:    s_movk_i32 s0, 0xffe
8850; GFX6-NEXT:    v_cmp_lt_u32_e32 vcc, s0, v5
8851; GFX6-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
8852; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v9
8853; GFX6-NEXT:    v_cndmask_b32_e32 v5, -1, v5, vcc
8854; GFX6-NEXT:    v_cmp_lt_u32_e64 s[0:1], s0, v8
8855; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
8856; GFX6-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[0:1]
8857; GFX6-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v4
8858; GFX6-NEXT:    v_cndmask_b32_e64 v4, -1, v5, s[0:1]
8859; GFX6-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v4
8860; GFX6-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc
8861; GFX6-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
8862; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
8863; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
8864; GFX6-NEXT:    v_xor_b32_e32 v0, s10, v0
8865; GFX6-NEXT:    v_xor_b32_e32 v1, s10, v1
8866; GFX6-NEXT:    v_mov_b32_e32 v3, s10
8867; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s10, v0
8868; GFX6-NEXT:    v_subb_u32_e32 v3, vcc, v1, v3, vcc
8869; GFX6-NEXT:    v_mov_b32_e32 v0, s8
8870; GFX6-NEXT:    v_mov_b32_e32 v1, s9
8871; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
8872; GFX6-NEXT:    s_endpgm
8873;
8874; GFX9-LABEL: ssdiv_v2i64_mixed_pow2k_denom:
8875; GFX9:       ; %bb.0:
8876; GFX9-NEXT:    v_mov_b32_e32 v0, 0x457ff000
8877; GFX9-NEXT:    v_mov_b32_e32 v1, 0x4f800000
8878; GFX9-NEXT:    v_mac_f32_e32 v0, 0, v1
8879; GFX9-NEXT:    v_rcp_f32_e32 v0, v0
8880; GFX9-NEXT:    s_movk_i32 s8, 0xf001
8881; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
8882; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
8883; GFX9-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
8884; GFX9-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
8885; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
8886; GFX9-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
8887; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
8888; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
8889; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
8890; GFX9-NEXT:    s_ashr_i32 s0, s5, 31
8891; GFX9-NEXT:    s_lshr_b32 s0, s0, 20
8892; GFX9-NEXT:    v_mul_hi_u32 v2, v0, s8
8893; GFX9-NEXT:    v_mul_lo_u32 v3, v1, s8
8894; GFX9-NEXT:    v_mul_lo_u32 v4, v0, s8
8895; GFX9-NEXT:    s_add_u32 s0, s4, s0
8896; GFX9-NEXT:    s_addc_u32 s1, s5, 0
8897; GFX9-NEXT:    v_add_u32_e32 v2, v2, v3
8898; GFX9-NEXT:    v_sub_u32_e32 v2, v2, v0
8899; GFX9-NEXT:    v_mul_lo_u32 v3, v0, v2
8900; GFX9-NEXT:    v_mul_hi_u32 v5, v0, v4
8901; GFX9-NEXT:    v_mul_hi_u32 v6, v0, v2
8902; GFX9-NEXT:    v_mul_hi_u32 v7, v1, v2
8903; GFX9-NEXT:    v_mul_lo_u32 v2, v1, v2
8904; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v5, v3
8905; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v6, vcc
8906; GFX9-NEXT:    v_mul_lo_u32 v6, v1, v4
8907; GFX9-NEXT:    v_mul_hi_u32 v4, v1, v4
8908; GFX9-NEXT:    s_ashr_i64 s[4:5], s[0:1], 12
8909; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v6
8910; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v5, v4, vcc
8911; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v7, vcc
8912; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
8913; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
8914; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
8915; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
8916; GFX9-NEXT:    v_mul_lo_u32 v2, v1, s8
8917; GFX9-NEXT:    v_mul_hi_u32 v3, v0, s8
8918; GFX9-NEXT:    v_mul_lo_u32 v4, v0, s8
8919; GFX9-NEXT:    s_ashr_i32 s8, s7, 31
8920; GFX9-NEXT:    s_add_u32 s0, s6, s8
8921; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
8922; GFX9-NEXT:    v_sub_u32_e32 v2, v2, v0
8923; GFX9-NEXT:    v_mul_lo_u32 v6, v0, v2
8924; GFX9-NEXT:    v_mul_hi_u32 v7, v0, v4
8925; GFX9-NEXT:    v_mul_hi_u32 v8, v0, v2
8926; GFX9-NEXT:    v_mul_hi_u32 v5, v1, v4
8927; GFX9-NEXT:    v_mul_lo_u32 v4, v1, v4
8928; GFX9-NEXT:    v_mul_hi_u32 v3, v1, v2
8929; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, v7, v6
8930; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v8, vcc
8931; GFX9-NEXT:    v_mul_lo_u32 v2, v1, v2
8932; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v6, v4
8933; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v7, v5, vcc
8934; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
8935; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v4, v2
8936; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
8937; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
8938; GFX9-NEXT:    s_mov_b32 s9, s8
8939; GFX9-NEXT:    s_addc_u32 s1, s7, s8
8940; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
8941; GFX9-NEXT:    s_xor_b64 s[0:1], s[0:1], s[8:9]
8942; GFX9-NEXT:    v_mul_lo_u32 v2, s0, v1
8943; GFX9-NEXT:    v_mul_hi_u32 v3, s0, v0
8944; GFX9-NEXT:    v_mul_hi_u32 v5, s0, v1
8945; GFX9-NEXT:    v_mul_hi_u32 v6, s1, v1
8946; GFX9-NEXT:    v_mul_lo_u32 v1, s1, v1
8947; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
8948; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v5, vcc
8949; GFX9-NEXT:    v_mul_lo_u32 v5, s1, v0
8950; GFX9-NEXT:    v_mul_hi_u32 v0, s1, v0
8951; GFX9-NEXT:    s_movk_i32 s6, 0xfff
8952; GFX9-NEXT:    v_mov_b32_e32 v4, 0
8953; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v5
8954; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, v3, v0, vcc
8955; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, 0, v6, vcc
8956; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v1
8957; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v2, vcc
8958; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, 2, v0
8959; GFX9-NEXT:    v_mul_lo_u32 v5, v1, s6
8960; GFX9-NEXT:    v_mul_hi_u32 v6, v0, s6
8961; GFX9-NEXT:    v_mul_lo_u32 v9, v0, s6
8962; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
8963; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, 1, v0
8964; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, 0, v1, vcc
8965; GFX9-NEXT:    v_add_u32_e32 v5, v6, v5
8966; GFX9-NEXT:    v_mov_b32_e32 v6, s1
8967; GFX9-NEXT:    v_sub_co_u32_e32 v9, vcc, s0, v9
8968; GFX9-NEXT:    v_subb_co_u32_e32 v5, vcc, v6, v5, vcc
8969; GFX9-NEXT:    v_subrev_co_u32_e32 v6, vcc, s6, v9
8970; GFX9-NEXT:    v_subbrev_co_u32_e32 v10, vcc, 0, v5, vcc
8971; GFX9-NEXT:    s_movk_i32 s0, 0xffe
8972; GFX9-NEXT:    v_cmp_lt_u32_e32 vcc, s0, v6
8973; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
8974; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v10
8975; GFX9-NEXT:    v_cndmask_b32_e32 v6, -1, v6, vcc
8976; GFX9-NEXT:    v_cmp_lt_u32_e64 s[0:1], s0, v9
8977; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
8978; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[0:1]
8979; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v5
8980; GFX9-NEXT:    v_cndmask_b32_e64 v5, -1, v6, s[0:1]
8981; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v5
8982; GFX9-NEXT:    v_cndmask_b32_e32 v2, v7, v2, vcc
8983; GFX9-NEXT:    v_cndmask_b32_e32 v3, v8, v3, vcc
8984; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
8985; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
8986; GFX9-NEXT:    v_xor_b32_e32 v0, s8, v0
8987; GFX9-NEXT:    v_xor_b32_e32 v1, s8, v1
8988; GFX9-NEXT:    v_mov_b32_e32 v3, s8
8989; GFX9-NEXT:    v_subrev_co_u32_e32 v2, vcc, s8, v0
8990; GFX9-NEXT:    v_subb_co_u32_e32 v3, vcc, v1, v3, vcc
8991; GFX9-NEXT:    v_mov_b32_e32 v0, s4
8992; GFX9-NEXT:    v_mov_b32_e32 v1, s5
8993; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
8994; GFX9-NEXT:    s_endpgm
8995  %r = sdiv <2 x i64> %x, <i64 4096, i64 4095>
8996  store <2 x i64> %r, <2 x i64> addrspace(1)* %out
8997  ret void
8998}
8999
9000define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x, <2 x i64> %y) {
9001; CHECK-LABEL: @sdiv_v2i64_pow2_shl_denom(
9002; CHECK-NEXT:    [[SHL_Y:%.*]] = shl <2 x i64> <i64 4096, i64 4096>, [[Y:%.*]]
9003; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0
9004; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 0
9005; CHECK-NEXT:    [[TMP3:%.*]] = sdiv i64 [[TMP1]], [[TMP2]]
9006; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x i64> undef, i64 [[TMP3]], i64 0
9007; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[X]], i64 1
9008; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 1
9009; CHECK-NEXT:    [[TMP7:%.*]] = sdiv i64 [[TMP5]], [[TMP6]]
9010; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[TMP7]], i64 1
9011; CHECK-NEXT:    store <2 x i64> [[TMP8]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16
9012; CHECK-NEXT:    ret void
9013;
9014; GFX6-LABEL: sdiv_v2i64_pow2_shl_denom:
9015; GFX6:       ; %bb.0:
9016; GFX6-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0xd
9017; GFX6-NEXT:    s_mov_b64 s[12:13], 0x1000
9018; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
9019; GFX6-NEXT:    s_lshl_b64 s[8:9], s[12:13], s8
9020; GFX6-NEXT:    s_lshl_b64 s[2:3], s[12:13], s10
9021; GFX6-NEXT:    s_ashr_i32 s14, s9, 31
9022; GFX6-NEXT:    s_add_u32 s8, s8, s14
9023; GFX6-NEXT:    s_mov_b32 s15, s14
9024; GFX6-NEXT:    s_addc_u32 s9, s9, s14
9025; GFX6-NEXT:    s_xor_b64 s[12:13], s[8:9], s[14:15]
9026; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s12
9027; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s13
9028; GFX6-NEXT:    s_sub_u32 s10, 0, s12
9029; GFX6-NEXT:    s_subb_u32 s11, 0, s13
9030; GFX6-NEXT:    s_ashr_i32 s16, s5, 31
9031; GFX6-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
9032; GFX6-NEXT:    v_rcp_f32_e32 v0, v0
9033; GFX6-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x9
9034; GFX6-NEXT:    s_add_u32 s0, s4, s16
9035; GFX6-NEXT:    s_mov_b32 s17, s16
9036; GFX6-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
9037; GFX6-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
9038; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
9039; GFX6-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
9040; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
9041; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
9042; GFX6-NEXT:    s_addc_u32 s1, s5, s16
9043; GFX6-NEXT:    s_xor_b64 s[4:5], s[0:1], s[16:17]
9044; GFX6-NEXT:    v_mul_lo_u32 v2, s10, v1
9045; GFX6-NEXT:    v_mul_hi_u32 v3, s10, v0
9046; GFX6-NEXT:    v_mul_lo_u32 v5, s11, v0
9047; GFX6-NEXT:    v_mul_lo_u32 v4, s10, v0
9048; GFX6-NEXT:    s_xor_b64 s[14:15], s[16:17], s[14:15]
9049; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
9050; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
9051; GFX6-NEXT:    v_mul_hi_u32 v3, v0, v4
9052; GFX6-NEXT:    v_mul_lo_u32 v5, v0, v2
9053; GFX6-NEXT:    v_mul_hi_u32 v7, v0, v2
9054; GFX6-NEXT:    v_mul_lo_u32 v6, v1, v4
9055; GFX6-NEXT:    v_mul_hi_u32 v4, v1, v4
9056; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
9057; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, 0, v7, vcc
9058; GFX6-NEXT:    v_mul_hi_u32 v7, v1, v2
9059; GFX6-NEXT:    v_mul_lo_u32 v2, v1, v2
9060; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
9061; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v5, v4, vcc
9062; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, 0, v7, vcc
9063; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
9064; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
9065; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
9066; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
9067; GFX6-NEXT:    v_mul_lo_u32 v2, s10, v1
9068; GFX6-NEXT:    v_mul_hi_u32 v3, s10, v0
9069; GFX6-NEXT:    v_mul_lo_u32 v4, s11, v0
9070; GFX6-NEXT:    s_mov_b32 s11, 0xf000
9071; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
9072; GFX6-NEXT:    v_mul_lo_u32 v3, s10, v0
9073; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
9074; GFX6-NEXT:    v_mul_lo_u32 v6, v0, v2
9075; GFX6-NEXT:    v_mul_hi_u32 v7, v0, v3
9076; GFX6-NEXT:    v_mul_hi_u32 v8, v0, v2
9077; GFX6-NEXT:    v_mul_hi_u32 v5, v1, v3
9078; GFX6-NEXT:    v_mul_lo_u32 v3, v1, v3
9079; GFX6-NEXT:    v_mul_hi_u32 v4, v1, v2
9080; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
9081; GFX6-NEXT:    v_addc_u32_e32 v7, vcc, 0, v8, vcc
9082; GFX6-NEXT:    v_mul_lo_u32 v2, v1, v2
9083; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v6, v3
9084; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v7, v5, vcc
9085; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc
9086; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
9087; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
9088; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
9089; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
9090; GFX6-NEXT:    v_mul_lo_u32 v2, s4, v1
9091; GFX6-NEXT:    v_mul_hi_u32 v3, s4, v0
9092; GFX6-NEXT:    v_mul_hi_u32 v4, s4, v1
9093; GFX6-NEXT:    v_mul_hi_u32 v5, s5, v1
9094; GFX6-NEXT:    v_mul_lo_u32 v1, s5, v1
9095; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
9096; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
9097; GFX6-NEXT:    v_mul_lo_u32 v4, s5, v0
9098; GFX6-NEXT:    v_mul_hi_u32 v0, s5, v0
9099; GFX6-NEXT:    s_mov_b32 s10, -1
9100; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
9101; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, v3, v0, vcc
9102; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, 0, v5, vcc
9103; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
9104; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v2, vcc
9105; GFX6-NEXT:    v_mul_lo_u32 v2, s12, v1
9106; GFX6-NEXT:    v_mul_hi_u32 v3, s12, v0
9107; GFX6-NEXT:    v_mul_lo_u32 v4, s13, v0
9108; GFX6-NEXT:    v_mov_b32_e32 v5, s13
9109; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
9110; GFX6-NEXT:    v_mul_lo_u32 v3, s12, v0
9111; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
9112; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, s5, v2
9113; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s4, v3
9114; GFX6-NEXT:    v_subb_u32_e64 v4, s[0:1], v4, v5, vcc
9115; GFX6-NEXT:    v_subrev_i32_e64 v5, s[0:1], s12, v3
9116; GFX6-NEXT:    v_subbrev_u32_e64 v4, s[0:1], 0, v4, s[0:1]
9117; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s13, v4
9118; GFX6-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[0:1]
9119; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s12, v5
9120; GFX6-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[0:1]
9121; GFX6-NEXT:    v_cmp_eq_u32_e64 s[0:1], s13, v4
9122; GFX6-NEXT:    v_cndmask_b32_e64 v4, v6, v5, s[0:1]
9123; GFX6-NEXT:    v_add_i32_e64 v5, s[0:1], 2, v0
9124; GFX6-NEXT:    v_addc_u32_e64 v6, s[0:1], 0, v1, s[0:1]
9125; GFX6-NEXT:    v_add_i32_e64 v7, s[0:1], 1, v0
9126; GFX6-NEXT:    v_addc_u32_e64 v8, s[0:1], 0, v1, s[0:1]
9127; GFX6-NEXT:    s_ashr_i32 s4, s3, 31
9128; GFX6-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v4
9129; GFX6-NEXT:    s_add_u32 s2, s2, s4
9130; GFX6-NEXT:    v_cndmask_b32_e64 v4, v8, v6, s[0:1]
9131; GFX6-NEXT:    v_mov_b32_e32 v6, s5
9132; GFX6-NEXT:    s_mov_b32 s5, s4
9133; GFX6-NEXT:    s_addc_u32 s3, s3, s4
9134; GFX6-NEXT:    s_xor_b64 s[2:3], s[2:3], s[4:5]
9135; GFX6-NEXT:    v_cvt_f32_u32_e32 v8, s2
9136; GFX6-NEXT:    v_cvt_f32_u32_e32 v9, s3
9137; GFX6-NEXT:    v_subb_u32_e32 v2, vcc, v6, v2, vcc
9138; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s13, v2
9139; GFX6-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
9140; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s12, v3
9141; GFX6-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
9142; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, s13, v2
9143; GFX6-NEXT:    v_mac_f32_e32 v8, 0x4f800000, v9
9144; GFX6-NEXT:    v_cndmask_b32_e32 v2, v6, v3, vcc
9145; GFX6-NEXT:    v_rcp_f32_e32 v3, v8
9146; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
9147; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
9148; GFX6-NEXT:    v_cndmask_b32_e64 v2, v7, v5, s[0:1]
9149; GFX6-NEXT:    v_mul_f32_e32 v3, 0x5f7ffffc, v3
9150; GFX6-NEXT:    v_mul_f32_e32 v4, 0x2f800000, v3
9151; GFX6-NEXT:    v_trunc_f32_e32 v4, v4
9152; GFX6-NEXT:    v_mac_f32_e32 v3, 0xcf800000, v4
9153; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v3
9154; GFX6-NEXT:    v_cvt_u32_f32_e32 v4, v4
9155; GFX6-NEXT:    s_sub_u32 s0, 0, s2
9156; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
9157; GFX6-NEXT:    v_mul_hi_u32 v2, s0, v3
9158; GFX6-NEXT:    v_mul_lo_u32 v5, s0, v4
9159; GFX6-NEXT:    s_subb_u32 s1, 0, s3
9160; GFX6-NEXT:    v_mul_lo_u32 v6, s1, v3
9161; GFX6-NEXT:    s_ashr_i32 s12, s7, 31
9162; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
9163; GFX6-NEXT:    v_mul_lo_u32 v5, s0, v3
9164; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v6
9165; GFX6-NEXT:    v_mul_lo_u32 v6, v3, v2
9166; GFX6-NEXT:    v_mul_hi_u32 v7, v3, v5
9167; GFX6-NEXT:    v_mul_hi_u32 v8, v3, v2
9168; GFX6-NEXT:    v_mul_hi_u32 v9, v4, v2
9169; GFX6-NEXT:    v_mul_lo_u32 v2, v4, v2
9170; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
9171; GFX6-NEXT:    v_addc_u32_e32 v7, vcc, 0, v8, vcc
9172; GFX6-NEXT:    v_mul_lo_u32 v8, v4, v5
9173; GFX6-NEXT:    v_mul_hi_u32 v5, v4, v5
9174; GFX6-NEXT:    s_mov_b32 s13, s12
9175; GFX6-NEXT:    v_xor_b32_e32 v0, s14, v0
9176; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v6, v8
9177; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v7, v5, vcc
9178; GFX6-NEXT:    v_addc_u32_e32 v6, vcc, 0, v9, vcc
9179; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
9180; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, 0, v6, vcc
9181; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
9182; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v4, v5, vcc
9183; GFX6-NEXT:    v_mul_lo_u32 v4, s0, v3
9184; GFX6-NEXT:    v_mul_hi_u32 v5, s0, v2
9185; GFX6-NEXT:    v_mul_lo_u32 v6, s1, v2
9186; GFX6-NEXT:    v_xor_b32_e32 v1, s15, v1
9187; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
9188; GFX6-NEXT:    v_mul_lo_u32 v5, s0, v2
9189; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
9190; GFX6-NEXT:    v_mul_lo_u32 v8, v2, v4
9191; GFX6-NEXT:    v_mul_hi_u32 v9, v2, v5
9192; GFX6-NEXT:    v_mul_hi_u32 v10, v2, v4
9193; GFX6-NEXT:    v_mul_hi_u32 v7, v3, v5
9194; GFX6-NEXT:    v_mul_lo_u32 v5, v3, v5
9195; GFX6-NEXT:    v_mul_hi_u32 v6, v3, v4
9196; GFX6-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
9197; GFX6-NEXT:    v_addc_u32_e32 v9, vcc, 0, v10, vcc
9198; GFX6-NEXT:    v_mul_lo_u32 v4, v3, v4
9199; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
9200; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v9, v7, vcc
9201; GFX6-NEXT:    v_addc_u32_e32 v6, vcc, 0, v6, vcc
9202; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
9203; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, 0, v6, vcc
9204; GFX6-NEXT:    s_add_u32 s0, s6, s12
9205; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
9206; GFX6-NEXT:    s_addc_u32 s1, s7, s12
9207; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v3, v5, vcc
9208; GFX6-NEXT:    s_xor_b64 s[6:7], s[0:1], s[12:13]
9209; GFX6-NEXT:    v_mul_lo_u32 v4, s6, v3
9210; GFX6-NEXT:    v_mul_hi_u32 v5, s6, v2
9211; GFX6-NEXT:    v_mul_hi_u32 v7, s6, v3
9212; GFX6-NEXT:    v_mul_hi_u32 v8, s7, v3
9213; GFX6-NEXT:    v_mul_lo_u32 v3, s7, v3
9214; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
9215; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, 0, v7, vcc
9216; GFX6-NEXT:    v_mul_lo_u32 v7, s7, v2
9217; GFX6-NEXT:    v_mul_hi_u32 v2, s7, v2
9218; GFX6-NEXT:    v_mov_b32_e32 v6, s15
9219; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v7
9220; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, v5, v2, vcc
9221; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, 0, v8, vcc
9222; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
9223; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
9224; GFX6-NEXT:    v_mul_lo_u32 v4, s2, v3
9225; GFX6-NEXT:    v_mul_hi_u32 v5, s2, v2
9226; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s14, v0
9227; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v1, v6, vcc
9228; GFX6-NEXT:    v_mul_lo_u32 v6, s3, v2
9229; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
9230; GFX6-NEXT:    v_mul_lo_u32 v5, s2, v2
9231; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
9232; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, s7, v4
9233; GFX6-NEXT:    v_mov_b32_e32 v7, s3
9234; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, s6, v5
9235; GFX6-NEXT:    v_subb_u32_e64 v6, s[0:1], v6, v7, vcc
9236; GFX6-NEXT:    v_subrev_i32_e64 v7, s[0:1], s2, v5
9237; GFX6-NEXT:    v_subbrev_u32_e64 v6, s[0:1], 0, v6, s[0:1]
9238; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s3, v6
9239; GFX6-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[0:1]
9240; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s2, v7
9241; GFX6-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[0:1]
9242; GFX6-NEXT:    v_cmp_eq_u32_e64 s[0:1], s3, v6
9243; GFX6-NEXT:    v_cndmask_b32_e64 v6, v8, v7, s[0:1]
9244; GFX6-NEXT:    v_add_i32_e64 v7, s[0:1], 2, v2
9245; GFX6-NEXT:    v_addc_u32_e64 v8, s[0:1], 0, v3, s[0:1]
9246; GFX6-NEXT:    v_add_i32_e64 v9, s[0:1], 1, v2
9247; GFX6-NEXT:    v_addc_u32_e64 v10, s[0:1], 0, v3, s[0:1]
9248; GFX6-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v6
9249; GFX6-NEXT:    v_cndmask_b32_e64 v6, v10, v8, s[0:1]
9250; GFX6-NEXT:    v_mov_b32_e32 v8, s7
9251; GFX6-NEXT:    v_subb_u32_e32 v4, vcc, v8, v4, vcc
9252; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s3, v4
9253; GFX6-NEXT:    v_cndmask_b32_e64 v8, 0, -1, vcc
9254; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s2, v5
9255; GFX6-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
9256; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, s3, v4
9257; GFX6-NEXT:    v_cndmask_b32_e32 v4, v8, v5, vcc
9258; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
9259; GFX6-NEXT:    v_cndmask_b32_e64 v4, v9, v7, s[0:1]
9260; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
9261; GFX6-NEXT:    s_xor_b64 s[0:1], s[12:13], s[4:5]
9262; GFX6-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
9263; GFX6-NEXT:    v_xor_b32_e32 v2, s0, v2
9264; GFX6-NEXT:    v_xor_b32_e32 v3, s1, v3
9265; GFX6-NEXT:    v_mov_b32_e32 v4, s1
9266; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s0, v2
9267; GFX6-NEXT:    v_subb_u32_e32 v3, vcc, v3, v4, vcc
9268; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
9269; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
9270; GFX6-NEXT:    s_endpgm
9271;
9272; GFX9-LABEL: sdiv_v2i64_pow2_shl_denom:
9273; GFX9:       ; %bb.0:
9274; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x34
9275; GFX9-NEXT:    s_mov_b64 s[2:3], 0x1000
9276; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
9277; GFX9-NEXT:    s_lshl_b64 s[10:11], s[2:3], s10
9278; GFX9-NEXT:    s_lshl_b64 s[2:3], s[2:3], s8
9279; GFX9-NEXT:    s_ashr_i32 s12, s3, 31
9280; GFX9-NEXT:    s_add_u32 s2, s2, s12
9281; GFX9-NEXT:    s_mov_b32 s13, s12
9282; GFX9-NEXT:    s_addc_u32 s3, s3, s12
9283; GFX9-NEXT:    s_xor_b64 s[8:9], s[2:3], s[12:13]
9284; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s8
9285; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s9
9286; GFX9-NEXT:    s_sub_u32 s2, 0, s8
9287; GFX9-NEXT:    s_subb_u32 s3, 0, s9
9288; GFX9-NEXT:    s_ashr_i32 s14, s5, 31
9289; GFX9-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
9290; GFX9-NEXT:    v_rcp_f32_e32 v0, v0
9291; GFX9-NEXT:    s_mov_b32 s15, s14
9292; GFX9-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
9293; GFX9-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
9294; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
9295; GFX9-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
9296; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
9297; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
9298; GFX9-NEXT:    v_mul_lo_u32 v2, s2, v1
9299; GFX9-NEXT:    v_mul_hi_u32 v3, s2, v0
9300; GFX9-NEXT:    v_mul_lo_u32 v5, s3, v0
9301; GFX9-NEXT:    v_mul_lo_u32 v4, s2, v0
9302; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
9303; GFX9-NEXT:    v_add_u32_e32 v2, v2, v5
9304; GFX9-NEXT:    v_mul_hi_u32 v3, v0, v4
9305; GFX9-NEXT:    v_mul_lo_u32 v5, v0, v2
9306; GFX9-NEXT:    v_mul_hi_u32 v7, v0, v2
9307; GFX9-NEXT:    v_mul_hi_u32 v6, v1, v4
9308; GFX9-NEXT:    v_mul_lo_u32 v4, v1, v4
9309; GFX9-NEXT:    v_mul_hi_u32 v8, v1, v2
9310; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v5
9311; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v7, vcc
9312; GFX9-NEXT:    v_mul_lo_u32 v2, v1, v2
9313; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v4
9314; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v5, v6, vcc
9315; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v8, vcc
9316; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
9317; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
9318; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
9319; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
9320; GFX9-NEXT:    v_mul_lo_u32 v2, s2, v1
9321; GFX9-NEXT:    v_mul_hi_u32 v3, s2, v0
9322; GFX9-NEXT:    v_mul_lo_u32 v4, s3, v0
9323; GFX9-NEXT:    v_mul_lo_u32 v5, s2, v0
9324; GFX9-NEXT:    s_add_u32 s2, s4, s14
9325; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
9326; GFX9-NEXT:    v_add_u32_e32 v2, v2, v4
9327; GFX9-NEXT:    v_mul_lo_u32 v6, v0, v2
9328; GFX9-NEXT:    v_mul_hi_u32 v7, v0, v5
9329; GFX9-NEXT:    v_mul_hi_u32 v8, v0, v2
9330; GFX9-NEXT:    v_mul_hi_u32 v4, v1, v5
9331; GFX9-NEXT:    v_mul_lo_u32 v5, v1, v5
9332; GFX9-NEXT:    v_mul_hi_u32 v3, v1, v2
9333; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, v7, v6
9334; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v8, vcc
9335; GFX9-NEXT:    v_mul_lo_u32 v2, v1, v2
9336; GFX9-NEXT:    v_add_co_u32_e32 v5, vcc, v6, v5
9337; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v7, v4, vcc
9338; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
9339; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v4, v2
9340; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
9341; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
9342; GFX9-NEXT:    s_addc_u32 s3, s5, s14
9343; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
9344; GFX9-NEXT:    s_xor_b64 s[4:5], s[2:3], s[14:15]
9345; GFX9-NEXT:    v_mul_lo_u32 v2, s4, v1
9346; GFX9-NEXT:    v_mul_hi_u32 v3, s4, v0
9347; GFX9-NEXT:    v_mul_hi_u32 v4, s4, v1
9348; GFX9-NEXT:    v_mul_hi_u32 v5, s5, v1
9349; GFX9-NEXT:    v_mul_lo_u32 v1, s5, v1
9350; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
9351; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
9352; GFX9-NEXT:    v_mul_lo_u32 v4, s5, v0
9353; GFX9-NEXT:    v_mul_hi_u32 v0, s5, v0
9354; GFX9-NEXT:    v_mov_b32_e32 v6, s9
9355; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
9356; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
9357; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, v3, v0, vcc
9358; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, 0, v5, vcc
9359; GFX9-NEXT:    v_add_co_u32_e32 v1, vcc, v0, v1
9360; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, 0, v2, vcc
9361; GFX9-NEXT:    v_mul_lo_u32 v3, s8, v2
9362; GFX9-NEXT:    v_mul_hi_u32 v4, s8, v1
9363; GFX9-NEXT:    v_mul_lo_u32 v5, s9, v1
9364; GFX9-NEXT:    s_xor_b64 s[12:13], s[14:15], s[12:13]
9365; GFX9-NEXT:    v_mov_b32_e32 v0, 0
9366; GFX9-NEXT:    v_add_u32_e32 v3, v4, v3
9367; GFX9-NEXT:    v_mul_lo_u32 v4, s8, v1
9368; GFX9-NEXT:    v_add_u32_e32 v3, v3, v5
9369; GFX9-NEXT:    v_sub_u32_e32 v5, s5, v3
9370; GFX9-NEXT:    v_sub_co_u32_e32 v4, vcc, s4, v4
9371; GFX9-NEXT:    v_subb_co_u32_e64 v5, s[0:1], v5, v6, vcc
9372; GFX9-NEXT:    v_subrev_co_u32_e64 v6, s[0:1], s8, v4
9373; GFX9-NEXT:    v_subbrev_co_u32_e64 v5, s[0:1], 0, v5, s[0:1]
9374; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s9, v5
9375; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[0:1]
9376; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s8, v6
9377; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[0:1]
9378; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s9, v5
9379; GFX9-NEXT:    v_cndmask_b32_e64 v5, v7, v6, s[0:1]
9380; GFX9-NEXT:    v_add_co_u32_e64 v6, s[0:1], 2, v1
9381; GFX9-NEXT:    v_addc_co_u32_e64 v7, s[0:1], 0, v2, s[0:1]
9382; GFX9-NEXT:    v_add_co_u32_e64 v8, s[0:1], 1, v1
9383; GFX9-NEXT:    v_addc_co_u32_e64 v9, s[0:1], 0, v2, s[0:1]
9384; GFX9-NEXT:    s_ashr_i32 s4, s11, 31
9385; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v5
9386; GFX9-NEXT:    s_add_u32 s10, s10, s4
9387; GFX9-NEXT:    v_cndmask_b32_e64 v5, v9, v7, s[0:1]
9388; GFX9-NEXT:    v_mov_b32_e32 v7, s5
9389; GFX9-NEXT:    s_mov_b32 s5, s4
9390; GFX9-NEXT:    s_addc_u32 s11, s11, s4
9391; GFX9-NEXT:    s_xor_b64 s[10:11], s[10:11], s[4:5]
9392; GFX9-NEXT:    v_cvt_f32_u32_e32 v9, s10
9393; GFX9-NEXT:    v_cvt_f32_u32_e32 v10, s11
9394; GFX9-NEXT:    v_subb_co_u32_e32 v3, vcc, v7, v3, vcc
9395; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s9, v3
9396; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
9397; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s8, v4
9398; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, -1, vcc
9399; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, s9, v3
9400; GFX9-NEXT:    v_mac_f32_e32 v9, 0x4f800000, v10
9401; GFX9-NEXT:    v_cndmask_b32_e32 v3, v7, v4, vcc
9402; GFX9-NEXT:    v_rcp_f32_e32 v4, v9
9403; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
9404; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
9405; GFX9-NEXT:    v_cndmask_b32_e64 v3, v8, v6, s[0:1]
9406; GFX9-NEXT:    v_mul_f32_e32 v4, 0x5f7ffffc, v4
9407; GFX9-NEXT:    v_mul_f32_e32 v5, 0x2f800000, v4
9408; GFX9-NEXT:    v_trunc_f32_e32 v5, v5
9409; GFX9-NEXT:    v_mac_f32_e32 v4, 0xcf800000, v5
9410; GFX9-NEXT:    v_cvt_u32_f32_e32 v4, v4
9411; GFX9-NEXT:    v_cvt_u32_f32_e32 v5, v5
9412; GFX9-NEXT:    s_sub_u32 s0, 0, s10
9413; GFX9-NEXT:    s_subb_u32 s1, 0, s11
9414; GFX9-NEXT:    v_mul_hi_u32 v6, s0, v4
9415; GFX9-NEXT:    v_mul_lo_u32 v7, s0, v5
9416; GFX9-NEXT:    v_mul_lo_u32 v8, s1, v4
9417; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
9418; GFX9-NEXT:    v_mul_lo_u32 v3, s0, v4
9419; GFX9-NEXT:    v_add_u32_e32 v6, v6, v7
9420; GFX9-NEXT:    v_add_u32_e32 v6, v6, v8
9421; GFX9-NEXT:    v_mul_lo_u32 v7, v4, v6
9422; GFX9-NEXT:    v_mul_hi_u32 v8, v4, v3
9423; GFX9-NEXT:    v_mul_hi_u32 v9, v4, v6
9424; GFX9-NEXT:    v_mul_hi_u32 v10, v5, v6
9425; GFX9-NEXT:    v_mul_lo_u32 v6, v5, v6
9426; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, v8, v7
9427; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, 0, v9, vcc
9428; GFX9-NEXT:    v_mul_lo_u32 v9, v5, v3
9429; GFX9-NEXT:    v_mul_hi_u32 v3, v5, v3
9430; GFX9-NEXT:    s_ashr_i32 s8, s7, 31
9431; GFX9-NEXT:    s_mov_b32 s9, s8
9432; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, v7, v9
9433; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v8, v3, vcc
9434; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v10, vcc
9435; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v6
9436; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, 0, v7, vcc
9437; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v4, v3
9438; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v5, v6, vcc
9439; GFX9-NEXT:    v_mul_lo_u32 v5, s0, v4
9440; GFX9-NEXT:    v_mul_hi_u32 v6, s0, v3
9441; GFX9-NEXT:    v_mul_lo_u32 v7, s1, v3
9442; GFX9-NEXT:    v_mul_lo_u32 v8, s0, v3
9443; GFX9-NEXT:    s_add_u32 s0, s6, s8
9444; GFX9-NEXT:    v_add_u32_e32 v5, v6, v5
9445; GFX9-NEXT:    v_add_u32_e32 v5, v5, v7
9446; GFX9-NEXT:    v_mul_lo_u32 v9, v3, v5
9447; GFX9-NEXT:    v_mul_hi_u32 v10, v3, v8
9448; GFX9-NEXT:    v_mul_hi_u32 v11, v3, v5
9449; GFX9-NEXT:    v_mul_hi_u32 v7, v4, v8
9450; GFX9-NEXT:    v_mul_lo_u32 v8, v4, v8
9451; GFX9-NEXT:    v_mul_hi_u32 v6, v4, v5
9452; GFX9-NEXT:    v_add_co_u32_e32 v9, vcc, v10, v9
9453; GFX9-NEXT:    v_addc_co_u32_e32 v10, vcc, 0, v11, vcc
9454; GFX9-NEXT:    v_mul_lo_u32 v5, v4, v5
9455; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, v9, v8
9456; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, v10, v7, vcc
9457; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
9458; GFX9-NEXT:    v_add_co_u32_e32 v5, vcc, v7, v5
9459; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
9460; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v5
9461; GFX9-NEXT:    s_addc_u32 s1, s7, s8
9462; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
9463; GFX9-NEXT:    s_xor_b64 s[6:7], s[0:1], s[8:9]
9464; GFX9-NEXT:    v_mul_lo_u32 v5, s6, v4
9465; GFX9-NEXT:    v_mul_hi_u32 v6, s6, v3
9466; GFX9-NEXT:    v_mul_hi_u32 v8, s6, v4
9467; GFX9-NEXT:    v_mul_hi_u32 v9, s7, v4
9468; GFX9-NEXT:    v_mul_lo_u32 v4, s7, v4
9469; GFX9-NEXT:    v_add_co_u32_e32 v5, vcc, v6, v5
9470; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, 0, v8, vcc
9471; GFX9-NEXT:    v_mul_lo_u32 v8, s7, v3
9472; GFX9-NEXT:    v_mul_hi_u32 v3, s7, v3
9473; GFX9-NEXT:    v_xor_b32_e32 v1, s12, v1
9474; GFX9-NEXT:    v_xor_b32_e32 v2, s13, v2
9475; GFX9-NEXT:    v_add_co_u32_e32 v5, vcc, v5, v8
9476; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v6, v3, vcc
9477; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v9, vcc
9478; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v4
9479; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v5, vcc
9480; GFX9-NEXT:    v_mul_lo_u32 v5, s10, v4
9481; GFX9-NEXT:    v_mul_hi_u32 v6, s10, v3
9482; GFX9-NEXT:    v_mul_lo_u32 v8, s11, v3
9483; GFX9-NEXT:    v_mov_b32_e32 v7, s13
9484; GFX9-NEXT:    v_subrev_co_u32_e32 v1, vcc, s12, v1
9485; GFX9-NEXT:    v_add_u32_e32 v5, v6, v5
9486; GFX9-NEXT:    v_mul_lo_u32 v6, s10, v3
9487; GFX9-NEXT:    v_subb_co_u32_e32 v2, vcc, v2, v7, vcc
9488; GFX9-NEXT:    v_add_u32_e32 v5, v5, v8
9489; GFX9-NEXT:    v_sub_u32_e32 v7, s7, v5
9490; GFX9-NEXT:    v_mov_b32_e32 v8, s11
9491; GFX9-NEXT:    v_sub_co_u32_e32 v6, vcc, s6, v6
9492; GFX9-NEXT:    v_subb_co_u32_e64 v7, s[0:1], v7, v8, vcc
9493; GFX9-NEXT:    v_subrev_co_u32_e64 v8, s[0:1], s10, v6
9494; GFX9-NEXT:    v_subbrev_co_u32_e64 v7, s[0:1], 0, v7, s[0:1]
9495; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s11, v7
9496; GFX9-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[0:1]
9497; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s10, v8
9498; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[0:1]
9499; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s11, v7
9500; GFX9-NEXT:    v_cndmask_b32_e64 v7, v9, v8, s[0:1]
9501; GFX9-NEXT:    v_add_co_u32_e64 v8, s[0:1], 2, v3
9502; GFX9-NEXT:    v_addc_co_u32_e64 v9, s[0:1], 0, v4, s[0:1]
9503; GFX9-NEXT:    v_add_co_u32_e64 v10, s[0:1], 1, v3
9504; GFX9-NEXT:    v_addc_co_u32_e64 v11, s[0:1], 0, v4, s[0:1]
9505; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v7
9506; GFX9-NEXT:    v_cndmask_b32_e64 v7, v11, v9, s[0:1]
9507; GFX9-NEXT:    v_mov_b32_e32 v9, s7
9508; GFX9-NEXT:    v_subb_co_u32_e32 v5, vcc, v9, v5, vcc
9509; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s11, v5
9510; GFX9-NEXT:    v_cndmask_b32_e64 v9, 0, -1, vcc
9511; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s10, v6
9512; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
9513; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, s11, v5
9514; GFX9-NEXT:    v_cndmask_b32_e32 v5, v9, v6, vcc
9515; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
9516; GFX9-NEXT:    v_cndmask_b32_e64 v5, v10, v8, s[0:1]
9517; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
9518; GFX9-NEXT:    s_xor_b64 s[0:1], s[8:9], s[4:5]
9519; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v7, vcc
9520; GFX9-NEXT:    v_xor_b32_e32 v3, s0, v3
9521; GFX9-NEXT:    v_xor_b32_e32 v4, s1, v4
9522; GFX9-NEXT:    v_mov_b32_e32 v5, s1
9523; GFX9-NEXT:    v_subrev_co_u32_e32 v3, vcc, s0, v3
9524; GFX9-NEXT:    v_subb_co_u32_e32 v4, vcc, v4, v5, vcc
9525; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
9526; GFX9-NEXT:    global_store_dwordx4 v0, v[1:4], s[2:3]
9527; GFX9-NEXT:    s_endpgm
9528  %shl.y = shl <2 x i64> <i64 4096, i64 4096>, %y
9529  %r = sdiv <2 x i64> %x, %shl.y
9530  store <2 x i64> %r, <2 x i64> addrspace(1)* %out
9531  ret void
9532}
9533
; srem_i64_oddk_denom: signed 64-bit remainder of %x by the odd constant
; 1235195 (0x12d8fb), stored through %out.
;
; The IR-level expectations show the srem left untouched by the IR pass, so
; the whole expansion is produced by llc. In the expected ISA for both targets:
;   * the dividend is made non-negative with a sign mask (ashr 31, add, xor);
;   * a reciprocal estimate (v_rcp_f32) is followed by two rounds of
;     mul_lo/mul_hi/add-with-carry using 0xffed2705, which is the 32-bit
;     two's complement of 0x12d8fb;
;   * the quotient estimate is multiplied back by 0x12d8fb and subtracted
;     from the dividend;
;   * up to two conditional subtractions of 0x12d8fb fix up the result, with
;     the unsigned compares done against 0x12d8fa (= 1235195 - 1);
;   * the sign mask is re-applied with xor + sub before the store.
; No hardware divide instruction appears in the expected output.
9534define amdgpu_kernel void @srem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
9535; CHECK-LABEL: @srem_i64_oddk_denom(
9536; CHECK-NEXT:    [[R:%.*]] = srem i64 [[X:%.*]], 1235195
9537; CHECK-NEXT:    store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4
9538; CHECK-NEXT:    ret void
9539;
9540; GFX6-LABEL: srem_i64_oddk_denom:
9541; GFX6:       ; %bb.0:
9542; GFX6-NEXT:    v_mov_b32_e32 v0, 0x4f800000
9543; GFX6-NEXT:    v_madak_f32 v0, 0, v0, 0x4996c7d8
9544; GFX6-NEXT:    v_rcp_f32_e32 v0, v0
9545; GFX6-NEXT:    s_mov_b32 s4, 0xffed2705
9546; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
9547; GFX6-NEXT:    s_mov_b32 s7, 0xf000
9548; GFX6-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
9549; GFX6-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
9550; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
9551; GFX6-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
9552; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
9553; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
9554; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
9555; GFX6-NEXT:    s_ashr_i32 s8, s3, 31
9556; GFX6-NEXT:    s_add_u32 s2, s2, s8
9557; GFX6-NEXT:    v_mul_lo_u32 v2, v1, s4
9558; GFX6-NEXT:    v_mul_hi_u32 v3, v0, s4
9559; GFX6-NEXT:    v_mul_lo_u32 v4, v0, s4
9560; GFX6-NEXT:    s_mov_b32 s9, s8
9561; GFX6-NEXT:    s_addc_u32 s3, s3, s8
9562; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
9563; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, v0, v2
9564; GFX6-NEXT:    v_mul_hi_u32 v3, v0, v4
9565; GFX6-NEXT:    v_mul_lo_u32 v5, v0, v2
9566; GFX6-NEXT:    v_mul_hi_u32 v6, v0, v2
9567; GFX6-NEXT:    v_mul_hi_u32 v7, v1, v2
9568; GFX6-NEXT:    v_mul_lo_u32 v2, v1, v2
9569; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
9570; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, 0, v6, vcc
9571; GFX6-NEXT:    v_mul_lo_u32 v6, v1, v4
9572; GFX6-NEXT:    v_mul_hi_u32 v4, v1, v4
9573; GFX6-NEXT:    s_xor_b64 s[2:3], s[2:3], s[8:9]
9574; GFX6-NEXT:    s_mov_b32 s5, s1
9575; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
9576; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v5, v4, vcc
9577; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, 0, v7, vcc
9578; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
9579; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
9580; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
9581; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
9582; GFX6-NEXT:    v_mul_lo_u32 v2, v1, s4
9583; GFX6-NEXT:    v_mul_hi_u32 v3, v0, s4
9584; GFX6-NEXT:    s_mov_b32 s6, -1
9585; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
9586; GFX6-NEXT:    v_mul_lo_u32 v3, v0, s4
9587; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, v0, v2
9588; GFX6-NEXT:    v_mul_lo_u32 v6, v0, v2
9589; GFX6-NEXT:    v_mul_hi_u32 v7, v0, v3
9590; GFX6-NEXT:    v_mul_hi_u32 v8, v0, v2
9591; GFX6-NEXT:    v_mul_hi_u32 v5, v1, v3
9592; GFX6-NEXT:    v_mul_lo_u32 v3, v1, v3
9593; GFX6-NEXT:    v_mul_hi_u32 v4, v1, v2
9594; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
9595; GFX6-NEXT:    v_addc_u32_e32 v7, vcc, 0, v8, vcc
9596; GFX6-NEXT:    v_mul_lo_u32 v2, v1, v2
9597; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v6, v3
9598; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v7, v5, vcc
9599; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc
9600; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
9601; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
9602; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
9603; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
9604; GFX6-NEXT:    v_mul_lo_u32 v2, s2, v1
9605; GFX6-NEXT:    v_mul_hi_u32 v3, s2, v0
9606; GFX6-NEXT:    v_mul_hi_u32 v4, s2, v1
9607; GFX6-NEXT:    v_mul_hi_u32 v5, s3, v1
9608; GFX6-NEXT:    v_mul_lo_u32 v1, s3, v1
9609; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
9610; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
9611; GFX6-NEXT:    v_mul_lo_u32 v4, s3, v0
9612; GFX6-NEXT:    v_mul_hi_u32 v0, s3, v0
9613; GFX6-NEXT:    s_mov_b32 s4, s0
9614; GFX6-NEXT:    s_mov_b32 s0, 0x12d8fb
9615; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
9616; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, v3, v0, vcc
9617; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, 0, v5, vcc
9618; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
9619; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v2, vcc
9620; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s0
9621; GFX6-NEXT:    v_mul_hi_u32 v2, v0, s0
9622; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s0
9623; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
9624; GFX6-NEXT:    v_mov_b32_e32 v2, s3
9625; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s2, v0
9626; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v2, v1, vcc
9627; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s0, v0
9628; GFX6-NEXT:    v_subbrev_u32_e32 v3, vcc, 0, v1, vcc
9629; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s0, v2
9630; GFX6-NEXT:    v_subbrev_u32_e32 v5, vcc, 0, v3, vcc
9631; GFX6-NEXT:    s_mov_b32 s0, 0x12d8fa
9632; GFX6-NEXT:    v_cmp_lt_u32_e32 vcc, s0, v2
9633; GFX6-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
9634; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
9635; GFX6-NEXT:    v_cndmask_b32_e32 v6, -1, v6, vcc
9636; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
9637; GFX6-NEXT:    v_cmp_lt_u32_e64 s[0:1], s0, v0
9638; GFX6-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
9639; GFX6-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[0:1]
9640; GFX6-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v1
9641; GFX6-NEXT:    v_cndmask_b32_e64 v5, -1, v5, s[0:1]
9642; GFX6-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v5
9643; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
9644; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
9645; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
9646; GFX6-NEXT:    v_xor_b32_e32 v0, s8, v0
9647; GFX6-NEXT:    v_xor_b32_e32 v1, s8, v1
9648; GFX6-NEXT:    v_mov_b32_e32 v2, s8
9649; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s8, v0
9650; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v1, v2, vcc
9651; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
9652; GFX6-NEXT:    s_endpgm
9653;
9654; GFX9-LABEL: srem_i64_oddk_denom:
9655; GFX9:       ; %bb.0:
9656; GFX9-NEXT:    v_mov_b32_e32 v0, 0x4f800000
9657; GFX9-NEXT:    v_madak_f32 v0, 0, v0, 0x4996c7d8
9658; GFX9-NEXT:    v_rcp_f32_e32 v0, v0
9659; GFX9-NEXT:    s_mov_b32 s2, 0xffed2705
9660; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
9661; GFX9-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
9662; GFX9-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
9663; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
9664; GFX9-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
9665; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
9666; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
9667; GFX9-NEXT:    v_mul_lo_u32 v2, v1, s2
9668; GFX9-NEXT:    v_mul_hi_u32 v3, v0, s2
9669; GFX9-NEXT:    v_mul_lo_u32 v4, v0, s2
9670; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
9671; GFX9-NEXT:    v_sub_u32_e32 v2, v2, v0
9672; GFX9-NEXT:    v_mul_hi_u32 v3, v0, v4
9673; GFX9-NEXT:    v_mul_lo_u32 v6, v0, v2
9674; GFX9-NEXT:    v_mul_hi_u32 v7, v0, v2
9675; GFX9-NEXT:    v_mul_lo_u32 v5, v1, v4
9676; GFX9-NEXT:    v_mul_hi_u32 v4, v1, v4
9677; GFX9-NEXT:    v_mul_hi_u32 v8, v1, v2
9678; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v6
9679; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, 0, v7, vcc
9680; GFX9-NEXT:    v_mul_lo_u32 v2, v1, v2
9681; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v5
9682; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v6, v4, vcc
9683; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v8, vcc
9684; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
9685; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
9686; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
9687; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
9688; GFX9-NEXT:    v_mul_lo_u32 v2, v1, s2
9689; GFX9-NEXT:    v_mul_hi_u32 v3, v0, s2
9690; GFX9-NEXT:    v_mul_lo_u32 v4, v0, s2
9691; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
9692; GFX9-NEXT:    s_ashr_i32 s2, s7, 31
9693; GFX9-NEXT:    s_add_u32 s0, s6, s2
9694; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
9695; GFX9-NEXT:    v_sub_u32_e32 v2, v2, v0
9696; GFX9-NEXT:    v_mul_lo_u32 v6, v0, v2
9697; GFX9-NEXT:    v_mul_hi_u32 v7, v0, v4
9698; GFX9-NEXT:    v_mul_hi_u32 v8, v0, v2
9699; GFX9-NEXT:    v_mul_hi_u32 v5, v1, v4
9700; GFX9-NEXT:    v_mul_lo_u32 v4, v1, v4
9701; GFX9-NEXT:    v_mul_hi_u32 v3, v1, v2
9702; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, v7, v6
9703; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v8, vcc
9704; GFX9-NEXT:    v_mul_lo_u32 v2, v1, v2
9705; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v6, v4
9706; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v7, v5, vcc
9707; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
9708; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v4, v2
9709; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
9710; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
9711; GFX9-NEXT:    s_mov_b32 s3, s2
9712; GFX9-NEXT:    s_addc_u32 s1, s7, s2
9713; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
9714; GFX9-NEXT:    s_xor_b64 s[0:1], s[0:1], s[2:3]
9715; GFX9-NEXT:    v_mul_lo_u32 v2, s0, v1
9716; GFX9-NEXT:    v_mul_hi_u32 v3, s0, v0
9717; GFX9-NEXT:    v_mul_hi_u32 v4, s0, v1
9718; GFX9-NEXT:    v_mul_hi_u32 v5, s1, v1
9719; GFX9-NEXT:    v_mul_lo_u32 v1, s1, v1
9720; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
9721; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
9722; GFX9-NEXT:    v_mul_lo_u32 v4, s1, v0
9723; GFX9-NEXT:    v_mul_hi_u32 v0, s1, v0
9724; GFX9-NEXT:    s_mov_b32 s3, 0x12d8fb
9725; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
9726; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, v3, v0, vcc
9727; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, 0, v5, vcc
9728; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v1
9729; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v2, vcc
9730; GFX9-NEXT:    v_mul_lo_u32 v1, v1, s3
9731; GFX9-NEXT:    v_mul_hi_u32 v2, v0, s3
9732; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s3
9733; GFX9-NEXT:    v_mov_b32_e32 v3, 0
9734; GFX9-NEXT:    v_add_u32_e32 v1, v2, v1
9735; GFX9-NEXT:    v_mov_b32_e32 v2, s1
9736; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s0, v0
9737; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v2, v1, vcc
9738; GFX9-NEXT:    v_subrev_co_u32_e32 v2, vcc, s3, v0
9739; GFX9-NEXT:    v_subbrev_co_u32_e32 v4, vcc, 0, v1, vcc
9740; GFX9-NEXT:    v_subrev_co_u32_e32 v5, vcc, s3, v2
9741; GFX9-NEXT:    v_subbrev_co_u32_e32 v6, vcc, 0, v4, vcc
9742; GFX9-NEXT:    s_mov_b32 s0, 0x12d8fa
9743; GFX9-NEXT:    v_cmp_lt_u32_e32 vcc, s0, v2
9744; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
9745; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
9746; GFX9-NEXT:    v_cndmask_b32_e32 v7, -1, v7, vcc
9747; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v7
9748; GFX9-NEXT:    v_cmp_lt_u32_e64 s[0:1], s0, v0
9749; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
9750; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[0:1]
9751; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v1
9752; GFX9-NEXT:    v_cndmask_b32_e64 v6, -1, v6, s[0:1]
9753; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v6
9754; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
9755; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
9756; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v4, s[0:1]
9757; GFX9-NEXT:    v_xor_b32_e32 v0, s2, v0
9758; GFX9-NEXT:    v_xor_b32_e32 v1, s2, v1
9759; GFX9-NEXT:    v_mov_b32_e32 v2, s2
9760; GFX9-NEXT:    v_subrev_co_u32_e32 v0, vcc, s2, v0
9761; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v2, vcc
9762; GFX9-NEXT:    global_store_dwordx2 v3, v[0:1], s[4:5]
9763; GFX9-NEXT:    s_endpgm
; IR under test: a single srem by the constant, then a store of the result.
9764  %r = srem i64 %x, 1235195
9765  store i64 %r, i64 addrspace(1)* %out
9766  ret void
9767}
9768
; srem_i64_pow2k_denom: signed 64-bit remainder of %x by the constant 4096
; (2^12), stored through %out.
;
; The IR-level expectations show the srem left untouched by the IR pass. The
; expected ISA for both targets is a short scalar-only sequence with no
; reciprocal and no multiply:
;   bias = (hi(x) ashr 31) lshr 20   ; 0xfff when x is negative, otherwise 0
;   t    = x + bias (64-bit add with carry), low word masked with 0xfffff000
;   r    = x - t    (64-bit sub with borrow)
; The two targets differ only in register assignment and in the store
; instruction used (buffer_store_dwordx2 vs. global_store_dwordx2).
9769define amdgpu_kernel void @srem_i64_pow2k_denom(i64 addrspace(1)* %out, i64 %x) {
9770; CHECK-LABEL: @srem_i64_pow2k_denom(
9771; CHECK-NEXT:    [[R:%.*]] = srem i64 [[X:%.*]], 4096
9772; CHECK-NEXT:    store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4
9773; CHECK-NEXT:    ret void
9774;
9775; GFX6-LABEL: srem_i64_pow2k_denom:
9776; GFX6:       ; %bb.0:
9777; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
9778; GFX6-NEXT:    s_mov_b32 s7, 0xf000
9779; GFX6-NEXT:    s_mov_b32 s6, -1
9780; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
9781; GFX6-NEXT:    s_mov_b32 s4, s0
9782; GFX6-NEXT:    s_ashr_i32 s0, s3, 31
9783; GFX6-NEXT:    s_lshr_b32 s0, s0, 20
9784; GFX6-NEXT:    s_add_u32 s0, s2, s0
9785; GFX6-NEXT:    s_mov_b32 s5, s1
9786; GFX6-NEXT:    s_addc_u32 s1, s3, 0
9787; GFX6-NEXT:    s_and_b32 s0, s0, 0xfffff000
9788; GFX6-NEXT:    s_sub_u32 s0, s2, s0
9789; GFX6-NEXT:    s_subb_u32 s1, s3, s1
9790; GFX6-NEXT:    v_mov_b32_e32 v0, s0
9791; GFX6-NEXT:    v_mov_b32_e32 v1, s1
9792; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
9793; GFX6-NEXT:    s_endpgm
9794;
9795; GFX9-LABEL: srem_i64_pow2k_denom:
9796; GFX9:       ; %bb.0:
9797; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
9798; GFX9-NEXT:    v_mov_b32_e32 v2, 0
9799; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
9800; GFX9-NEXT:    s_ashr_i32 s4, s3, 31
9801; GFX9-NEXT:    s_lshr_b32 s4, s4, 20
9802; GFX9-NEXT:    s_add_u32 s4, s2, s4
9803; GFX9-NEXT:    s_addc_u32 s5, s3, 0
9804; GFX9-NEXT:    s_and_b32 s4, s4, 0xfffff000
9805; GFX9-NEXT:    s_sub_u32 s2, s2, s4
9806; GFX9-NEXT:    s_subb_u32 s3, s3, s5
9807; GFX9-NEXT:    v_mov_b32_e32 v0, s2
9808; GFX9-NEXT:    v_mov_b32_e32 v1, s3
9809; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
9810; GFX9-NEXT:    s_endpgm
; IR under test: a single srem by the power-of-two constant, then a store.
9811  %r = srem i64 %x, 4096
9812  store i64 %r, i64 addrspace(1)* %out
9813  ret void
9814}
9815
; srem_i64_pow2_shl_denom: signed 64-bit remainder of %x by (4096 << %y),
; i.e. a divisor that is not a compile-time constant; result stored via %out.
;
; The IR-level expectations show both the shl and the srem left untouched by
; the IR pass. In the expected ISA for both targets:
;   * the divisor is materialized with s_mov_b64 0x1000 + s_lshl_b64;
;   * divisor and dividend each go through a sign-mask sequence
;     (ashr 31, add with carry, xor);
;   * a reciprocal estimate (v_rcp_f32) is followed by two rounds of
;     mul_lo/mul_hi/add-with-carry against the negated divisor;
;   * the quotient estimate is multiplied back by the divisor and subtracted
;     from the dividend, then conditional subtractions of the divisor correct
;     the remainder;
;   * the dividend's sign mask (s10) is re-applied with xor + sub before the
;     store.
; NOTE(review): the power-of-two shape of the divisor does not appear to be
; exploited here; the sequence looks like the generic 64-bit expansion.
9816define amdgpu_kernel void @srem_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %x, i64 %y) {
9817; CHECK-LABEL: @srem_i64_pow2_shl_denom(
9818; CHECK-NEXT:    [[SHL_Y:%.*]] = shl i64 4096, [[Y:%.*]]
9819; CHECK-NEXT:    [[R:%.*]] = srem i64 [[X:%.*]], [[SHL_Y]]
9820; CHECK-NEXT:    store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4
9821; CHECK-NEXT:    ret void
9822;
9823; GFX6-LABEL: srem_i64_pow2_shl_denom:
9824; GFX6:       ; %bb.0:
9825; GFX6-NEXT:    s_load_dword s4, s[0:1], 0xd
9826; GFX6-NEXT:    s_mov_b64 s[2:3], 0x1000
9827; GFX6-NEXT:    s_mov_b32 s7, 0xf000
9828; GFX6-NEXT:    s_mov_b32 s6, -1
9829; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
9830; GFX6-NEXT:    s_lshl_b64 s[2:3], s[2:3], s4
9831; GFX6-NEXT:    s_ashr_i32 s4, s3, 31
9832; GFX6-NEXT:    s_add_u32 s2, s2, s4
9833; GFX6-NEXT:    s_mov_b32 s5, s4
9834; GFX6-NEXT:    s_addc_u32 s3, s3, s4
9835; GFX6-NEXT:    s_xor_b64 s[8:9], s[2:3], s[4:5]
9836; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s8
9837; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s9
9838; GFX6-NEXT:    s_sub_u32 s4, 0, s8
9839; GFX6-NEXT:    s_subb_u32 s5, 0, s9
9840; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
9841; GFX6-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
9842; GFX6-NEXT:    v_rcp_f32_e32 v0, v0
9843; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
9844; GFX6-NEXT:    s_ashr_i32 s10, s3, 31
9845; GFX6-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
9846; GFX6-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
9847; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
9848; GFX6-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
9849; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
9850; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
9851; GFX6-NEXT:    s_add_u32 s2, s2, s10
9852; GFX6-NEXT:    s_mov_b32 s11, s10
9853; GFX6-NEXT:    v_mul_lo_u32 v2, s4, v1
9854; GFX6-NEXT:    v_mul_hi_u32 v3, s4, v0
9855; GFX6-NEXT:    v_mul_lo_u32 v5, s5, v0
9856; GFX6-NEXT:    v_mul_lo_u32 v4, s4, v0
9857; GFX6-NEXT:    s_addc_u32 s3, s3, s10
9858; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
9859; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
9860; GFX6-NEXT:    v_mul_hi_u32 v3, v0, v4
9861; GFX6-NEXT:    v_mul_lo_u32 v5, v0, v2
9862; GFX6-NEXT:    v_mul_hi_u32 v6, v0, v2
9863; GFX6-NEXT:    v_mul_hi_u32 v7, v1, v2
9864; GFX6-NEXT:    v_mul_lo_u32 v2, v1, v2
9865; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
9866; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, 0, v6, vcc
9867; GFX6-NEXT:    v_mul_lo_u32 v6, v1, v4
9868; GFX6-NEXT:    v_mul_hi_u32 v4, v1, v4
9869; GFX6-NEXT:    s_xor_b64 s[12:13], s[2:3], s[10:11]
9870; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
9871; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v5, v4, vcc
9872; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, 0, v7, vcc
9873; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
9874; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
9875; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
9876; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
9877; GFX6-NEXT:    v_mul_lo_u32 v2, s4, v1
9878; GFX6-NEXT:    v_mul_hi_u32 v3, s4, v0
9879; GFX6-NEXT:    v_mul_lo_u32 v4, s5, v0
9880; GFX6-NEXT:    s_mov_b32 s5, s1
9881; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
9882; GFX6-NEXT:    v_mul_lo_u32 v3, s4, v0
9883; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
9884; GFX6-NEXT:    v_mul_lo_u32 v6, v0, v2
9885; GFX6-NEXT:    v_mul_hi_u32 v7, v0, v3
9886; GFX6-NEXT:    v_mul_hi_u32 v8, v0, v2
9887; GFX6-NEXT:    v_mul_hi_u32 v5, v1, v3
9888; GFX6-NEXT:    v_mul_lo_u32 v3, v1, v3
9889; GFX6-NEXT:    v_mul_hi_u32 v4, v1, v2
9890; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
9891; GFX6-NEXT:    v_addc_u32_e32 v7, vcc, 0, v8, vcc
9892; GFX6-NEXT:    v_mul_lo_u32 v2, v1, v2
9893; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v6, v3
9894; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v7, v5, vcc
9895; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc
9896; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
9897; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
9898; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
9899; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
9900; GFX6-NEXT:    v_mul_lo_u32 v2, s12, v1
9901; GFX6-NEXT:    v_mul_hi_u32 v3, s12, v0
9902; GFX6-NEXT:    v_mul_hi_u32 v4, s12, v1
9903; GFX6-NEXT:    v_mul_hi_u32 v5, s13, v1
9904; GFX6-NEXT:    v_mul_lo_u32 v1, s13, v1
9905; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
9906; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
9907; GFX6-NEXT:    v_mul_lo_u32 v4, s13, v0
9908; GFX6-NEXT:    v_mul_hi_u32 v0, s13, v0
9909; GFX6-NEXT:    s_mov_b32 s4, s0
9910; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
9911; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, v3, v0, vcc
9912; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, 0, v5, vcc
9913; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
9914; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v2, vcc
9915; GFX6-NEXT:    v_mul_lo_u32 v1, s8, v1
9916; GFX6-NEXT:    v_mul_hi_u32 v2, s8, v0
9917; GFX6-NEXT:    v_mul_lo_u32 v3, s9, v0
9918; GFX6-NEXT:    v_mul_lo_u32 v0, s8, v0
9919; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
9920; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
9921; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s13, v1
9922; GFX6-NEXT:    v_mov_b32_e32 v3, s9
9923; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s12, v0
9924; GFX6-NEXT:    v_subb_u32_e64 v2, s[0:1], v2, v3, vcc
9925; GFX6-NEXT:    v_subrev_i32_e64 v4, s[0:1], s8, v0
9926; GFX6-NEXT:    v_subbrev_u32_e64 v5, s[2:3], 0, v2, s[0:1]
9927; GFX6-NEXT:    v_cmp_le_u32_e64 s[2:3], s9, v5
9928; GFX6-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[2:3]
9929; GFX6-NEXT:    v_cmp_le_u32_e64 s[2:3], s8, v4
9930; GFX6-NEXT:    v_subb_u32_e64 v2, s[0:1], v2, v3, s[0:1]
9931; GFX6-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[2:3]
9932; GFX6-NEXT:    v_cmp_eq_u32_e64 s[2:3], s9, v5
9933; GFX6-NEXT:    v_subrev_i32_e64 v3, s[0:1], s8, v4
9934; GFX6-NEXT:    v_cndmask_b32_e64 v6, v6, v7, s[2:3]
9935; GFX6-NEXT:    v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1]
9936; GFX6-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v6
9937; GFX6-NEXT:    v_cndmask_b32_e64 v2, v5, v2, s[0:1]
9938; GFX6-NEXT:    v_mov_b32_e32 v5, s13
9939; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v5, v1, vcc
9940; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s9, v1
9941; GFX6-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
9942; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s8, v0
9943; GFX6-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
9944; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, s9, v1
9945; GFX6-NEXT:    v_cndmask_b32_e32 v5, v5, v6, vcc
9946; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
9947; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
9948; GFX6-NEXT:    v_cndmask_b32_e64 v2, v4, v3, s[0:1]
9949; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
9950; GFX6-NEXT:    v_xor_b32_e32 v0, s10, v0
9951; GFX6-NEXT:    v_xor_b32_e32 v1, s10, v1
9952; GFX6-NEXT:    v_mov_b32_e32 v2, s10
9953; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s10, v0
9954; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v1, v2, vcc
9955; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
9956; GFX6-NEXT:    s_endpgm
9957;
9958; GFX9-LABEL: srem_i64_pow2_shl_denom:
9959; GFX9:       ; %bb.0:
9960; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x34
9961; GFX9-NEXT:    s_mov_b64 s[2:3], 0x1000
9962; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
9963; GFX9-NEXT:    s_lshl_b64 s[2:3], s[2:3], s4
9964; GFX9-NEXT:    s_ashr_i32 s4, s3, 31
9965; GFX9-NEXT:    s_add_u32 s2, s2, s4
9966; GFX9-NEXT:    s_mov_b32 s5, s4
9967; GFX9-NEXT:    s_addc_u32 s3, s3, s4
9968; GFX9-NEXT:    s_xor_b64 s[8:9], s[2:3], s[4:5]
9969; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s8
9970; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s9
9971; GFX9-NEXT:    s_sub_u32 s2, 0, s8
9972; GFX9-NEXT:    s_subb_u32 s3, 0, s9
9973; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
9974; GFX9-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
9975; GFX9-NEXT:    v_rcp_f32_e32 v0, v0
9976; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
9977; GFX9-NEXT:    s_ashr_i32 s10, s7, 31
9978; GFX9-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
9979; GFX9-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
9980; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
9981; GFX9-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
9982; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
9983; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
9984; GFX9-NEXT:    s_add_u32 s0, s6, s10
9985; GFX9-NEXT:    s_mov_b32 s11, s10
9986; GFX9-NEXT:    v_mul_lo_u32 v2, s2, v1
9987; GFX9-NEXT:    v_mul_hi_u32 v3, s2, v0
9988; GFX9-NEXT:    v_mul_lo_u32 v5, s3, v0
9989; GFX9-NEXT:    v_mul_lo_u32 v4, s2, v0
9990; GFX9-NEXT:    s_addc_u32 s1, s7, s10
9991; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
9992; GFX9-NEXT:    v_add_u32_e32 v2, v2, v5
9993; GFX9-NEXT:    v_mul_hi_u32 v3, v0, v4
9994; GFX9-NEXT:    v_mul_lo_u32 v5, v0, v2
9995; GFX9-NEXT:    v_mul_hi_u32 v7, v0, v2
9996; GFX9-NEXT:    v_mul_hi_u32 v6, v1, v4
9997; GFX9-NEXT:    v_mul_lo_u32 v4, v1, v4
9998; GFX9-NEXT:    v_mul_hi_u32 v8, v1, v2
9999; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v5
10000; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v7, vcc
10001; GFX9-NEXT:    v_mul_lo_u32 v2, v1, v2
10002; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v4
10003; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v5, v6, vcc
10004; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v8, vcc
10005; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
10006; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
10007; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
10008; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
10009; GFX9-NEXT:    v_mul_lo_u32 v2, s2, v1
10010; GFX9-NEXT:    v_mul_hi_u32 v3, s2, v0
10011; GFX9-NEXT:    v_mul_lo_u32 v4, s3, v0
10012; GFX9-NEXT:    v_mul_lo_u32 v5, s2, v0
10013; GFX9-NEXT:    s_xor_b64 s[6:7], s[0:1], s[10:11]
10014; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
10015; GFX9-NEXT:    v_add_u32_e32 v2, v2, v4
10016; GFX9-NEXT:    v_mul_lo_u32 v6, v0, v2
10017; GFX9-NEXT:    v_mul_hi_u32 v7, v0, v5
10018; GFX9-NEXT:    v_mul_hi_u32 v8, v0, v2
10019; GFX9-NEXT:    v_mul_hi_u32 v4, v1, v5
10020; GFX9-NEXT:    v_mul_lo_u32 v5, v1, v5
10021; GFX9-NEXT:    v_mul_hi_u32 v3, v1, v2
10022; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, v7, v6
10023; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v8, vcc
10024; GFX9-NEXT:    v_mul_lo_u32 v2, v1, v2
10025; GFX9-NEXT:    v_add_co_u32_e32 v5, vcc, v6, v5
10026; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v7, v4, vcc
10027; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
10028; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v4, v2
10029; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
10030; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
10031; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
10032; GFX9-NEXT:    v_mul_lo_u32 v2, s6, v1
10033; GFX9-NEXT:    v_mul_hi_u32 v3, s6, v0
10034; GFX9-NEXT:    v_mul_hi_u32 v4, s6, v1
10035; GFX9-NEXT:    v_mul_hi_u32 v5, s7, v1
10036; GFX9-NEXT:    v_mul_lo_u32 v1, s7, v1
10037; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
10038; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
10039; GFX9-NEXT:    v_mul_lo_u32 v4, s7, v0
10040; GFX9-NEXT:    v_mul_hi_u32 v0, s7, v0
10041; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
10042; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, v3, v0, vcc
10043; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, 0, v5, vcc
10044; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v1
10045; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v2, vcc
10046; GFX9-NEXT:    v_mul_lo_u32 v1, s8, v1
10047; GFX9-NEXT:    v_mul_hi_u32 v2, s8, v0
10048; GFX9-NEXT:    v_mul_lo_u32 v3, s9, v0
10049; GFX9-NEXT:    v_mul_lo_u32 v0, s8, v0
10050; GFX9-NEXT:    v_mov_b32_e32 v4, 0
10051; GFX9-NEXT:    v_add_u32_e32 v1, v2, v1
10052; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
10053; GFX9-NEXT:    v_sub_u32_e32 v2, s7, v1
10054; GFX9-NEXT:    v_mov_b32_e32 v3, s9
10055; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s6, v0
10056; GFX9-NEXT:    v_subb_co_u32_e64 v2, s[0:1], v2, v3, vcc
10057; GFX9-NEXT:    v_subrev_co_u32_e64 v5, s[0:1], s8, v0
10058; GFX9-NEXT:    v_subbrev_co_u32_e64 v6, s[2:3], 0, v2, s[0:1]
10059; GFX9-NEXT:    v_cmp_le_u32_e64 s[2:3], s9, v6
10060; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[2:3]
10061; GFX9-NEXT:    v_cmp_le_u32_e64 s[2:3], s8, v5
10062; GFX9-NEXT:    v_subb_co_u32_e64 v2, s[0:1], v2, v3, s[0:1]
10063; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[2:3]
10064; GFX9-NEXT:    v_cmp_eq_u32_e64 s[2:3], s9, v6
10065; GFX9-NEXT:    v_subrev_co_u32_e64 v3, s[0:1], s8, v5
10066; GFX9-NEXT:    v_cndmask_b32_e64 v7, v7, v8, s[2:3]
10067; GFX9-NEXT:    v_subbrev_co_u32_e64 v2, s[0:1], 0, v2, s[0:1]
10068; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v7
10069; GFX9-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s[0:1]
10070; GFX9-NEXT:    v_mov_b32_e32 v6, s7
10071; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v6, v1, vcc
10072; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s9, v1
10073; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
10074; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s8, v0
10075; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
10076; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, s9, v1
10077; GFX9-NEXT:    v_cndmask_b32_e32 v6, v6, v7, vcc
10078; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
10079; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
10080; GFX9-NEXT:    v_cndmask_b32_e64 v2, v5, v3, s[0:1]
10081; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
10082; GFX9-NEXT:    v_xor_b32_e32 v0, s10, v0
10083; GFX9-NEXT:    v_xor_b32_e32 v1, s10, v1
10084; GFX9-NEXT:    v_mov_b32_e32 v2, s10
10085; GFX9-NEXT:    v_subrev_co_u32_e32 v0, vcc, s10, v0
10086; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v2, vcc
10087; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[4:5]
10088; GFX9-NEXT:    s_endpgm
; IR under test: build the divisor as 4096 << %y, take the signed remainder of
; %x by it, and store the result.
10089  %shl.y = shl i64 4096, %y
10090  %r = srem i64 %x, %shl.y
10091  store i64 %r, i64 addrspace(1)* %out
10092  ret void
10093}
10094
; Signed remainder of a <2 x i64> vector by the splat constant 4096.
; Expected results visible in the assertions below:
;  - the opt run (prefix CHECK) splits the vector srem into one
;    `srem i64 ..., 4096` per lane and rebuilds the vector with insertelement;
;  - both llc runs (prefixes GFX6 and GFX9) do the arithmetic with s_*
;    instructions only: per lane, s_ashr_i32 31 followed by s_lshr_b32 20 forms
;    a bias from the high dword of x, the bias is added to x, the low dword of
;    the sum is masked with 0xfffff000, and the result is subtracted from x
;    with an s_sub_u32/s_subb_u32 pair. No reciprocal or multiply sequence is
;    expected.
; The assertions are autogenerated (see the NOTE on the first line of the
; file); regenerate them with that script instead of editing them by hand.
10095define amdgpu_kernel void @srem_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) {
10096; CHECK-LABEL: @srem_v2i64_pow2k_denom(
10097; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0
10098; CHECK-NEXT:    [[TMP2:%.*]] = srem i64 [[TMP1]], 4096
10099; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i64 0
10100; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1
10101; CHECK-NEXT:    [[TMP5:%.*]] = srem i64 [[TMP4]], 4096
10102; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1
10103; CHECK-NEXT:    store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16
10104; CHECK-NEXT:    ret void
10105;
10106; GFX6-LABEL: srem_v2i64_pow2k_denom:
10107; GFX6:       ; %bb.0:
10108; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xd
10109; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
10110; GFX6-NEXT:    s_mov_b32 s3, 0xf000
10111; GFX6-NEXT:    s_mov_b32 s2, -1
10112; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
10113; GFX6-NEXT:    s_ashr_i32 s8, s5, 31
10114; GFX6-NEXT:    s_lshr_b32 s8, s8, 20
10115; GFX6-NEXT:    s_add_u32 s8, s4, s8
10116; GFX6-NEXT:    s_addc_u32 s9, s5, 0
10117; GFX6-NEXT:    s_and_b32 s8, s8, 0xfffff000
10118; GFX6-NEXT:    s_sub_u32 s4, s4, s8
10119; GFX6-NEXT:    s_subb_u32 s5, s5, s9
10120; GFX6-NEXT:    s_ashr_i32 s8, s7, 31
10121; GFX6-NEXT:    s_lshr_b32 s8, s8, 20
10122; GFX6-NEXT:    s_add_u32 s8, s6, s8
10123; GFX6-NEXT:    s_addc_u32 s9, s7, 0
10124; GFX6-NEXT:    s_and_b32 s8, s8, 0xfffff000
10125; GFX6-NEXT:    s_sub_u32 s6, s6, s8
10126; GFX6-NEXT:    s_subb_u32 s7, s7, s9
10127; GFX6-NEXT:    v_mov_b32_e32 v0, s4
10128; GFX6-NEXT:    v_mov_b32_e32 v1, s5
10129; GFX6-NEXT:    v_mov_b32_e32 v2, s6
10130; GFX6-NEXT:    v_mov_b32_e32 v3, s7
10131; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
10132; GFX6-NEXT:    s_endpgm
10133;
10134; GFX9-LABEL: srem_v2i64_pow2k_denom:
10135; GFX9:       ; %bb.0:
10136; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
10137; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
10138; GFX9-NEXT:    v_mov_b32_e32 v4, 0
10139; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
10140; GFX9-NEXT:    s_ashr_i32 s0, s5, 31
10141; GFX9-NEXT:    s_lshr_b32 s0, s0, 20
10142; GFX9-NEXT:    s_add_u32 s0, s4, s0
10143; GFX9-NEXT:    s_addc_u32 s1, s5, 0
10144; GFX9-NEXT:    s_and_b32 s0, s0, 0xfffff000
10145; GFX9-NEXT:    s_sub_u32 s0, s4, s0
10146; GFX9-NEXT:    s_subb_u32 s1, s5, s1
10147; GFX9-NEXT:    s_ashr_i32 s4, s7, 31
10148; GFX9-NEXT:    s_lshr_b32 s4, s4, 20
10149; GFX9-NEXT:    s_add_u32 s4, s6, s4
10150; GFX9-NEXT:    s_addc_u32 s5, s7, 0
10151; GFX9-NEXT:    s_and_b32 s4, s4, 0xfffff000
10152; GFX9-NEXT:    s_sub_u32 s4, s6, s4
10153; GFX9-NEXT:    s_subb_u32 s5, s7, s5
10154; GFX9-NEXT:    v_mov_b32_e32 v0, s0
10155; GFX9-NEXT:    v_mov_b32_e32 v1, s1
10156; GFX9-NEXT:    v_mov_b32_e32 v2, s4
10157; GFX9-NEXT:    v_mov_b32_e32 v3, s5
10158; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
10159; GFX9-NEXT:    s_endpgm
; Input IR under test: a single vector srem by <4096, 4096>, stored to %out.
10160  %r = srem <2 x i64> %x, <i64 4096, i64 4096>
10161  store <2 x i64> %r, <2 x i64> addrspace(1)* %out
10162  ret void
10163}
10164
; Signed remainder of a <2 x i64> vector where each divisor lane is 4096
; shifted left by the matching lane of %y, so the divisor is not a constant.
; Expected results visible in the assertions below:
;  - the opt run (prefix CHECK) keeps the vector shl and splits the srem into
;    two `srem i64` operations on extracted lanes, rebuilt with insertelement;
;  - both llc runs (prefixes GFX6 and GFX9) contain, once per lane, a
;    v_rcp_f32-based reciprocal estimate, v_mul_lo_u32/v_mul_hi_u32 refinement
;    steps, a compare/v_cndmask correction of the remainder, and a final
;    v_xor/v_subrev pair using a sign word produced by s_ashr_i32 ..., 31.
;    No mask-based sequence like the one asserted for
;    @srem_v2i64_pow2k_denom appears here.
; The assertions are autogenerated (see the NOTE on the first line of the
; file); regenerate them with that script instead of editing them by hand.
10165define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x, <2 x i64> %y) {
10166; CHECK-LABEL: @srem_v2i64_pow2_shl_denom(
10167; CHECK-NEXT:    [[SHL_Y:%.*]] = shl <2 x i64> <i64 4096, i64 4096>, [[Y:%.*]]
10168; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0
10169; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 0
10170; CHECK-NEXT:    [[TMP3:%.*]] = srem i64 [[TMP1]], [[TMP2]]
10171; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x i64> undef, i64 [[TMP3]], i64 0
10172; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[X]], i64 1
10173; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 1
10174; CHECK-NEXT:    [[TMP7:%.*]] = srem i64 [[TMP5]], [[TMP6]]
10175; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[TMP7]], i64 1
10176; CHECK-NEXT:    store <2 x i64> [[TMP8]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16
10177; CHECK-NEXT:    ret void
10178;
10179; GFX6-LABEL: srem_v2i64_pow2_shl_denom:
10180; GFX6:       ; %bb.0:
10181; GFX6-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0xd
10182; GFX6-NEXT:    s_mov_b64 s[2:3], 0x1000
10183; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
10184; GFX6-NEXT:    s_mov_b32 s11, 0xf000
10185; GFX6-NEXT:    s_lshl_b64 s[14:15], s[2:3], s10
10186; GFX6-NEXT:    s_lshl_b64 s[2:3], s[2:3], s8
10187; GFX6-NEXT:    s_ashr_i32 s8, s3, 31
10188; GFX6-NEXT:    s_add_u32 s2, s2, s8
10189; GFX6-NEXT:    s_mov_b32 s9, s8
10190; GFX6-NEXT:    s_addc_u32 s3, s3, s8
10191; GFX6-NEXT:    s_xor_b64 s[16:17], s[2:3], s[8:9]
10192; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s16
10193; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s17
10194; GFX6-NEXT:    s_sub_u32 s2, 0, s16
10195; GFX6-NEXT:    s_subb_u32 s3, 0, s17
10196; GFX6-NEXT:    s_ashr_i32 s12, s5, 31
10197; GFX6-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
10198; GFX6-NEXT:    v_rcp_f32_e32 v0, v0
10199; GFX6-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x9
10200; GFX6-NEXT:    s_add_u32 s0, s4, s12
10201; GFX6-NEXT:    s_mov_b32 s13, s12
10202; GFX6-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
10203; GFX6-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
10204; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
10205; GFX6-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
10206; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
10207; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
10208; GFX6-NEXT:    s_addc_u32 s1, s5, s12
10209; GFX6-NEXT:    s_xor_b64 s[4:5], s[0:1], s[12:13]
10210; GFX6-NEXT:    v_mul_lo_u32 v2, s2, v1
10211; GFX6-NEXT:    v_mul_hi_u32 v3, s2, v0
10212; GFX6-NEXT:    v_mul_lo_u32 v5, s3, v0
10213; GFX6-NEXT:    v_mul_lo_u32 v4, s2, v0
10214; GFX6-NEXT:    s_mov_b32 s10, -1
10215; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
10216; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
10217; GFX6-NEXT:    v_mul_hi_u32 v3, v0, v4
10218; GFX6-NEXT:    v_mul_lo_u32 v5, v0, v2
10219; GFX6-NEXT:    v_mul_hi_u32 v7, v0, v2
10220; GFX6-NEXT:    v_mul_lo_u32 v6, v1, v4
10221; GFX6-NEXT:    v_mul_hi_u32 v4, v1, v4
10222; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
10223; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, 0, v7, vcc
10224; GFX6-NEXT:    v_mul_hi_u32 v7, v1, v2
10225; GFX6-NEXT:    v_mul_lo_u32 v2, v1, v2
10226; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
10227; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v5, v4, vcc
10228; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, 0, v7, vcc
10229; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
10230; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
10231; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
10232; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
10233; GFX6-NEXT:    v_mul_lo_u32 v2, s2, v1
10234; GFX6-NEXT:    v_mul_hi_u32 v3, s2, v0
10235; GFX6-NEXT:    v_mul_lo_u32 v4, s3, v0
10236; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
10237; GFX6-NEXT:    v_mul_lo_u32 v3, s2, v0
10238; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
10239; GFX6-NEXT:    v_mul_lo_u32 v6, v0, v2
10240; GFX6-NEXT:    v_mul_hi_u32 v7, v0, v3
10241; GFX6-NEXT:    v_mul_hi_u32 v8, v0, v2
10242; GFX6-NEXT:    v_mul_hi_u32 v5, v1, v3
10243; GFX6-NEXT:    v_mul_lo_u32 v3, v1, v3
10244; GFX6-NEXT:    v_mul_hi_u32 v4, v1, v2
10245; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
10246; GFX6-NEXT:    v_addc_u32_e32 v7, vcc, 0, v8, vcc
10247; GFX6-NEXT:    v_mul_lo_u32 v2, v1, v2
10248; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v6, v3
10249; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v7, v5, vcc
10250; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc
10251; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
10252; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
10253; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
10254; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
10255; GFX6-NEXT:    v_mul_lo_u32 v2, s4, v1
10256; GFX6-NEXT:    v_mul_hi_u32 v3, s4, v0
10257; GFX6-NEXT:    v_mul_hi_u32 v4, s4, v1
10258; GFX6-NEXT:    v_mul_hi_u32 v5, s5, v1
10259; GFX6-NEXT:    v_mul_lo_u32 v1, s5, v1
10260; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
10261; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
10262; GFX6-NEXT:    v_mul_lo_u32 v4, s5, v0
10263; GFX6-NEXT:    v_mul_hi_u32 v0, s5, v0
10264; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
10265; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, v3, v0, vcc
10266; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, 0, v5, vcc
10267; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
10268; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v2, vcc
10269; GFX6-NEXT:    v_mul_lo_u32 v1, s16, v1
10270; GFX6-NEXT:    v_mul_hi_u32 v2, s16, v0
10271; GFX6-NEXT:    v_mul_lo_u32 v3, s17, v0
10272; GFX6-NEXT:    v_mul_lo_u32 v0, s16, v0
10273; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
10274; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
10275; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s5, v1
10276; GFX6-NEXT:    v_mov_b32_e32 v3, s17
10277; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
10278; GFX6-NEXT:    v_subb_u32_e64 v2, s[0:1], v2, v3, vcc
10279; GFX6-NEXT:    v_subrev_i32_e64 v4, s[0:1], s16, v0
10280; GFX6-NEXT:    v_subbrev_u32_e64 v5, s[2:3], 0, v2, s[0:1]
10281; GFX6-NEXT:    v_cmp_le_u32_e64 s[2:3], s17, v5
10282; GFX6-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[2:3]
10283; GFX6-NEXT:    v_cmp_le_u32_e64 s[2:3], s16, v4
10284; GFX6-NEXT:    v_subb_u32_e64 v2, s[0:1], v2, v3, s[0:1]
10285; GFX6-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[2:3]
10286; GFX6-NEXT:    v_cmp_eq_u32_e64 s[2:3], s17, v5
10287; GFX6-NEXT:    v_subrev_i32_e64 v3, s[0:1], s16, v4
10288; GFX6-NEXT:    v_cndmask_b32_e64 v6, v6, v7, s[2:3]
10289; GFX6-NEXT:    v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1]
10290; GFX6-NEXT:    s_ashr_i32 s2, s15, 31
10291; GFX6-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v6
10292; GFX6-NEXT:    s_add_u32 s4, s14, s2
10293; GFX6-NEXT:    v_cndmask_b32_e64 v2, v5, v2, s[0:1]
10294; GFX6-NEXT:    v_mov_b32_e32 v5, s5
10295; GFX6-NEXT:    s_mov_b32 s3, s2
10296; GFX6-NEXT:    s_addc_u32 s5, s15, s2
10297; GFX6-NEXT:    s_xor_b64 s[4:5], s[4:5], s[2:3]
10298; GFX6-NEXT:    v_cvt_f32_u32_e32 v6, s4
10299; GFX6-NEXT:    v_cvt_f32_u32_e32 v7, s5
10300; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v5, v1, vcc
10301; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s17, v1
10302; GFX6-NEXT:    v_mac_f32_e32 v6, 0x4f800000, v7
10303; GFX6-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
10304; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s16, v0
10305; GFX6-NEXT:    v_rcp_f32_e32 v6, v6
10306; GFX6-NEXT:    v_cndmask_b32_e64 v8, 0, -1, vcc
10307; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, s17, v1
10308; GFX6-NEXT:    v_cndmask_b32_e32 v5, v5, v8, vcc
10309; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
10310; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
10311; GFX6-NEXT:    v_cndmask_b32_e64 v2, v4, v3, s[0:1]
10312; GFX6-NEXT:    v_mul_f32_e32 v3, 0x5f7ffffc, v6
10313; GFX6-NEXT:    v_mul_f32_e32 v4, 0x2f800000, v3
10314; GFX6-NEXT:    v_trunc_f32_e32 v4, v4
10315; GFX6-NEXT:    v_mac_f32_e32 v3, 0xcf800000, v4
10316; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v3
10317; GFX6-NEXT:    v_cvt_u32_f32_e32 v4, v4
10318; GFX6-NEXT:    s_sub_u32 s0, 0, s4
10319; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
10320; GFX6-NEXT:    v_mul_hi_u32 v2, s0, v3
10321; GFX6-NEXT:    v_mul_lo_u32 v5, s0, v4
10322; GFX6-NEXT:    s_subb_u32 s1, 0, s5
10323; GFX6-NEXT:    v_mul_lo_u32 v6, s1, v3
10324; GFX6-NEXT:    s_ashr_i32 s14, s7, 31
10325; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
10326; GFX6-NEXT:    v_mul_lo_u32 v5, s0, v3
10327; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v6
10328; GFX6-NEXT:    v_mul_lo_u32 v6, v3, v2
10329; GFX6-NEXT:    v_mul_hi_u32 v7, v3, v5
10330; GFX6-NEXT:    v_mul_hi_u32 v8, v3, v2
10331; GFX6-NEXT:    v_mul_hi_u32 v9, v4, v2
10332; GFX6-NEXT:    v_mul_lo_u32 v2, v4, v2
10333; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
10334; GFX6-NEXT:    v_addc_u32_e32 v7, vcc, 0, v8, vcc
10335; GFX6-NEXT:    v_mul_lo_u32 v8, v4, v5
10336; GFX6-NEXT:    v_mul_hi_u32 v5, v4, v5
10337; GFX6-NEXT:    s_mov_b32 s15, s14
10338; GFX6-NEXT:    v_xor_b32_e32 v0, s12, v0
10339; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v6, v8
10340; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v7, v5, vcc
10341; GFX6-NEXT:    v_addc_u32_e32 v6, vcc, 0, v9, vcc
10342; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
10343; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, 0, v6, vcc
10344; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
10345; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v4, v5, vcc
10346; GFX6-NEXT:    v_mul_lo_u32 v4, s0, v3
10347; GFX6-NEXT:    v_mul_hi_u32 v5, s0, v2
10348; GFX6-NEXT:    v_mul_lo_u32 v6, s1, v2
10349; GFX6-NEXT:    v_xor_b32_e32 v1, s12, v1
10350; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
10351; GFX6-NEXT:    v_mul_lo_u32 v5, s0, v2
10352; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
10353; GFX6-NEXT:    v_mul_lo_u32 v8, v2, v4
10354; GFX6-NEXT:    v_mul_hi_u32 v9, v2, v5
10355; GFX6-NEXT:    v_mul_hi_u32 v10, v2, v4
10356; GFX6-NEXT:    v_mul_hi_u32 v7, v3, v5
10357; GFX6-NEXT:    v_mul_lo_u32 v5, v3, v5
10358; GFX6-NEXT:    v_mul_hi_u32 v6, v3, v4
10359; GFX6-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
10360; GFX6-NEXT:    v_addc_u32_e32 v9, vcc, 0, v10, vcc
10361; GFX6-NEXT:    v_mul_lo_u32 v4, v3, v4
10362; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
10363; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v9, v7, vcc
10364; GFX6-NEXT:    v_addc_u32_e32 v6, vcc, 0, v6, vcc
10365; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
10366; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, 0, v6, vcc
10367; GFX6-NEXT:    s_add_u32 s0, s6, s14
10368; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
10369; GFX6-NEXT:    s_addc_u32 s1, s7, s14
10370; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v3, v5, vcc
10371; GFX6-NEXT:    s_xor_b64 s[6:7], s[0:1], s[14:15]
10372; GFX6-NEXT:    v_mul_lo_u32 v4, s6, v3
10373; GFX6-NEXT:    v_mul_hi_u32 v5, s6, v2
10374; GFX6-NEXT:    v_mul_hi_u32 v7, s6, v3
10375; GFX6-NEXT:    v_mul_hi_u32 v8, s7, v3
10376; GFX6-NEXT:    v_mul_lo_u32 v3, s7, v3
10377; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
10378; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, 0, v7, vcc
10379; GFX6-NEXT:    v_mul_lo_u32 v7, s7, v2
10380; GFX6-NEXT:    v_mul_hi_u32 v2, s7, v2
10381; GFX6-NEXT:    v_mov_b32_e32 v6, s12
10382; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v7
10383; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, v5, v2, vcc
10384; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, 0, v8, vcc
10385; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
10386; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
10387; GFX6-NEXT:    v_mul_lo_u32 v3, s4, v3
10388; GFX6-NEXT:    v_mul_hi_u32 v4, s4, v2
10389; GFX6-NEXT:    v_mul_lo_u32 v5, s5, v2
10390; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s12, v0
10391; GFX6-NEXT:    v_mul_lo_u32 v2, s4, v2
10392; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v1, v6, vcc
10393; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
10394; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
10395; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, s7, v3
10396; GFX6-NEXT:    v_mov_b32_e32 v5, s5
10397; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s6, v2
10398; GFX6-NEXT:    v_subb_u32_e64 v4, s[0:1], v4, v5, vcc
10399; GFX6-NEXT:    v_subrev_i32_e64 v6, s[0:1], s4, v2
10400; GFX6-NEXT:    v_subbrev_u32_e64 v7, s[2:3], 0, v4, s[0:1]
10401; GFX6-NEXT:    v_cmp_le_u32_e64 s[2:3], s5, v7
10402; GFX6-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[2:3]
10403; GFX6-NEXT:    v_cmp_le_u32_e64 s[2:3], s4, v6
10404; GFX6-NEXT:    v_subb_u32_e64 v4, s[0:1], v4, v5, s[0:1]
10405; GFX6-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[2:3]
10406; GFX6-NEXT:    v_cmp_eq_u32_e64 s[2:3], s5, v7
10407; GFX6-NEXT:    v_subrev_i32_e64 v5, s[0:1], s4, v6
10408; GFX6-NEXT:    v_cndmask_b32_e64 v8, v8, v9, s[2:3]
10409; GFX6-NEXT:    v_subbrev_u32_e64 v4, s[0:1], 0, v4, s[0:1]
10410; GFX6-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v8
10411; GFX6-NEXT:    v_cndmask_b32_e64 v4, v7, v4, s[0:1]
10412; GFX6-NEXT:    v_mov_b32_e32 v7, s7
10413; GFX6-NEXT:    v_subb_u32_e32 v3, vcc, v7, v3, vcc
10414; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s5, v3
10415; GFX6-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
10416; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s4, v2
10417; GFX6-NEXT:    v_cndmask_b32_e64 v8, 0, -1, vcc
10418; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, s5, v3
10419; GFX6-NEXT:    v_cndmask_b32_e32 v7, v7, v8, vcc
10420; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v7
10421; GFX6-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
10422; GFX6-NEXT:    v_cndmask_b32_e64 v4, v6, v5, s[0:1]
10423; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
10424; GFX6-NEXT:    v_xor_b32_e32 v2, s14, v2
10425; GFX6-NEXT:    v_xor_b32_e32 v3, s14, v3
10426; GFX6-NEXT:    v_mov_b32_e32 v4, s14
10427; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s14, v2
10428; GFX6-NEXT:    v_subb_u32_e32 v3, vcc, v3, v4, vcc
10429; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
10430; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
10431; GFX6-NEXT:    s_endpgm
10432;
10433; GFX9-LABEL: srem_v2i64_pow2_shl_denom:
10434; GFX9:       ; %bb.0:
10435; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x34
10436; GFX9-NEXT:    s_mov_b64 s[2:3], 0x1000
10437; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
10438; GFX9-NEXT:    s_lshl_b64 s[10:11], s[2:3], s10
10439; GFX9-NEXT:    s_lshl_b64 s[2:3], s[2:3], s8
10440; GFX9-NEXT:    s_ashr_i32 s8, s3, 31
10441; GFX9-NEXT:    s_add_u32 s2, s2, s8
10442; GFX9-NEXT:    s_mov_b32 s9, s8
10443; GFX9-NEXT:    s_addc_u32 s3, s3, s8
10444; GFX9-NEXT:    s_xor_b64 s[12:13], s[2:3], s[8:9]
10445; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s12
10446; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s13
10447; GFX9-NEXT:    s_sub_u32 s2, 0, s12
10448; GFX9-NEXT:    s_subb_u32 s3, 0, s13
10449; GFX9-NEXT:    s_ashr_i32 s8, s5, 31
10450; GFX9-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
10451; GFX9-NEXT:    v_rcp_f32_e32 v0, v0
10452; GFX9-NEXT:    s_mov_b32 s9, s8
10453; GFX9-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
10454; GFX9-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
10455; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
10456; GFX9-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
10457; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
10458; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
10459; GFX9-NEXT:    v_mul_lo_u32 v2, s2, v1
10460; GFX9-NEXT:    v_mul_hi_u32 v3, s2, v0
10461; GFX9-NEXT:    v_mul_lo_u32 v5, s3, v0
10462; GFX9-NEXT:    v_mul_lo_u32 v4, s2, v0
10463; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
10464; GFX9-NEXT:    v_add_u32_e32 v2, v2, v5
10465; GFX9-NEXT:    v_mul_hi_u32 v3, v0, v4
10466; GFX9-NEXT:    v_mul_lo_u32 v5, v0, v2
10467; GFX9-NEXT:    v_mul_hi_u32 v7, v0, v2
10468; GFX9-NEXT:    v_mul_hi_u32 v6, v1, v4
10469; GFX9-NEXT:    v_mul_lo_u32 v4, v1, v4
10470; GFX9-NEXT:    v_mul_hi_u32 v8, v1, v2
10471; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v5
10472; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v7, vcc
10473; GFX9-NEXT:    v_mul_lo_u32 v2, v1, v2
10474; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v4
10475; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v5, v6, vcc
10476; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v8, vcc
10477; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
10478; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
10479; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
10480; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
10481; GFX9-NEXT:    v_mul_lo_u32 v2, s2, v1
10482; GFX9-NEXT:    v_mul_hi_u32 v3, s2, v0
10483; GFX9-NEXT:    v_mul_lo_u32 v4, s3, v0
10484; GFX9-NEXT:    v_mul_lo_u32 v5, s2, v0
10485; GFX9-NEXT:    s_add_u32 s2, s4, s8
10486; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
10487; GFX9-NEXT:    v_add_u32_e32 v2, v2, v4
10488; GFX9-NEXT:    v_mul_lo_u32 v6, v0, v2
10489; GFX9-NEXT:    v_mul_hi_u32 v7, v0, v5
10490; GFX9-NEXT:    v_mul_hi_u32 v8, v0, v2
10491; GFX9-NEXT:    v_mul_hi_u32 v4, v1, v5
10492; GFX9-NEXT:    v_mul_lo_u32 v5, v1, v5
10493; GFX9-NEXT:    v_mul_hi_u32 v3, v1, v2
10494; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, v7, v6
10495; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v8, vcc
10496; GFX9-NEXT:    v_mul_lo_u32 v2, v1, v2
10497; GFX9-NEXT:    v_add_co_u32_e32 v5, vcc, v6, v5
10498; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v7, v4, vcc
10499; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
10500; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v4, v2
10501; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
10502; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
10503; GFX9-NEXT:    s_addc_u32 s3, s5, s8
10504; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
10505; GFX9-NEXT:    s_xor_b64 s[14:15], s[2:3], s[8:9]
10506; GFX9-NEXT:    v_mul_lo_u32 v2, s14, v1
10507; GFX9-NEXT:    v_mul_hi_u32 v3, s14, v0
10508; GFX9-NEXT:    v_mul_hi_u32 v4, s14, v1
10509; GFX9-NEXT:    v_mul_hi_u32 v5, s15, v1
10510; GFX9-NEXT:    v_mul_lo_u32 v1, s15, v1
10511; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
10512; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
10513; GFX9-NEXT:    v_mul_lo_u32 v4, s15, v0
10514; GFX9-NEXT:    v_mul_hi_u32 v0, s15, v0
10515; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
10516; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
10517; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, v3, v0, vcc
10518; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, 0, v5, vcc
10519; GFX9-NEXT:    v_add_co_u32_e32 v1, vcc, v0, v1
10520; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v2, vcc
10521; GFX9-NEXT:    v_mul_lo_u32 v2, s12, v0
10522; GFX9-NEXT:    v_mul_hi_u32 v3, s12, v1
10523; GFX9-NEXT:    v_mul_lo_u32 v4, s13, v1
10524; GFX9-NEXT:    v_mul_lo_u32 v1, s12, v1
10525; GFX9-NEXT:    v_mov_b32_e32 v0, 0
10526; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
10527; GFX9-NEXT:    v_add_u32_e32 v2, v2, v4
10528; GFX9-NEXT:    v_sub_u32_e32 v3, s15, v2
10529; GFX9-NEXT:    v_mov_b32_e32 v4, s13
10530; GFX9-NEXT:    v_sub_co_u32_e32 v1, vcc, s14, v1
10531; GFX9-NEXT:    v_subb_co_u32_e64 v3, s[0:1], v3, v4, vcc
10532; GFX9-NEXT:    v_subrev_co_u32_e64 v5, s[0:1], s12, v1
10533; GFX9-NEXT:    v_subbrev_co_u32_e64 v6, s[2:3], 0, v3, s[0:1]
10534; GFX9-NEXT:    v_cmp_le_u32_e64 s[2:3], s13, v6
10535; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[2:3]
10536; GFX9-NEXT:    v_cmp_le_u32_e64 s[2:3], s12, v5
10537; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[2:3]
10538; GFX9-NEXT:    v_cmp_eq_u32_e64 s[2:3], s13, v6
10539; GFX9-NEXT:    v_cndmask_b32_e64 v7, v7, v8, s[2:3]
10540; GFX9-NEXT:    s_ashr_i32 s2, s11, 31
10541; GFX9-NEXT:    v_subb_co_u32_e64 v3, s[0:1], v3, v4, s[0:1]
10542; GFX9-NEXT:    s_add_u32 s10, s10, s2
10543; GFX9-NEXT:    v_subrev_co_u32_e64 v4, s[0:1], s12, v5
10544; GFX9-NEXT:    s_mov_b32 s3, s2
10545; GFX9-NEXT:    s_addc_u32 s11, s11, s2
10546; GFX9-NEXT:    v_subbrev_co_u32_e64 v3, s[0:1], 0, v3, s[0:1]
10547; GFX9-NEXT:    s_xor_b64 s[10:11], s[10:11], s[2:3]
10548; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v7
10549; GFX9-NEXT:    v_cvt_f32_u32_e32 v7, s10
10550; GFX9-NEXT:    v_cvt_f32_u32_e32 v8, s11
10551; GFX9-NEXT:    v_cndmask_b32_e64 v3, v6, v3, s[0:1]
10552; GFX9-NEXT:    v_mov_b32_e32 v6, s15
10553; GFX9-NEXT:    v_subb_co_u32_e32 v2, vcc, v6, v2, vcc
10554; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s13, v2
10555; GFX9-NEXT:    v_mac_f32_e32 v7, 0x4f800000, v8
10556; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
10557; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s12, v1
10558; GFX9-NEXT:    v_rcp_f32_e32 v7, v7
10559; GFX9-NEXT:    v_cndmask_b32_e64 v9, 0, -1, vcc
10560; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, s13, v2
10561; GFX9-NEXT:    v_cndmask_b32_e32 v6, v6, v9, vcc
10562; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
10563; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
10564; GFX9-NEXT:    v_cndmask_b32_e64 v3, v5, v4, s[0:1]
10565; GFX9-NEXT:    v_mul_f32_e32 v4, 0x5f7ffffc, v7
10566; GFX9-NEXT:    v_mul_f32_e32 v5, 0x2f800000, v4
10567; GFX9-NEXT:    v_trunc_f32_e32 v5, v5
10568; GFX9-NEXT:    v_mac_f32_e32 v4, 0xcf800000, v5
10569; GFX9-NEXT:    v_cvt_u32_f32_e32 v4, v4
10570; GFX9-NEXT:    v_cvt_u32_f32_e32 v5, v5
10571; GFX9-NEXT:    s_sub_u32 s0, 0, s10
10572; GFX9-NEXT:    s_subb_u32 s1, 0, s11
10573; GFX9-NEXT:    v_mul_hi_u32 v6, s0, v4
10574; GFX9-NEXT:    v_mul_lo_u32 v7, s0, v5
10575; GFX9-NEXT:    v_mul_lo_u32 v8, s1, v4
10576; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
10577; GFX9-NEXT:    v_mul_lo_u32 v3, s0, v4
10578; GFX9-NEXT:    v_add_u32_e32 v6, v6, v7
10579; GFX9-NEXT:    v_add_u32_e32 v6, v6, v8
10580; GFX9-NEXT:    v_mul_lo_u32 v7, v4, v6
10581; GFX9-NEXT:    v_mul_hi_u32 v8, v4, v3
10582; GFX9-NEXT:    v_mul_hi_u32 v9, v4, v6
10583; GFX9-NEXT:    v_mul_hi_u32 v10, v5, v6
10584; GFX9-NEXT:    v_mul_lo_u32 v6, v5, v6
10585; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, v8, v7
10586; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, 0, v9, vcc
10587; GFX9-NEXT:    v_mul_lo_u32 v9, v5, v3
10588; GFX9-NEXT:    v_mul_hi_u32 v3, v5, v3
10589; GFX9-NEXT:    s_ashr_i32 s12, s7, 31
10590; GFX9-NEXT:    s_mov_b32 s13, s12
10591; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, v7, v9
10592; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v8, v3, vcc
10593; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v10, vcc
10594; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v6
10595; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, 0, v7, vcc
10596; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v4, v3
10597; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v5, v6, vcc
10598; GFX9-NEXT:    v_mul_lo_u32 v5, s0, v4
10599; GFX9-NEXT:    v_mul_hi_u32 v6, s0, v3
10600; GFX9-NEXT:    v_mul_lo_u32 v7, s1, v3
10601; GFX9-NEXT:    v_mul_lo_u32 v8, s0, v3
10602; GFX9-NEXT:    s_add_u32 s0, s6, s12
10603; GFX9-NEXT:    v_add_u32_e32 v5, v6, v5
10604; GFX9-NEXT:    v_add_u32_e32 v5, v5, v7
10605; GFX9-NEXT:    v_mul_lo_u32 v9, v3, v5
10606; GFX9-NEXT:    v_mul_hi_u32 v10, v3, v8
10607; GFX9-NEXT:    v_mul_hi_u32 v11, v3, v5
10608; GFX9-NEXT:    v_mul_hi_u32 v7, v4, v8
10609; GFX9-NEXT:    v_mul_lo_u32 v8, v4, v8
10610; GFX9-NEXT:    v_mul_hi_u32 v6, v4, v5
10611; GFX9-NEXT:    v_add_co_u32_e32 v9, vcc, v10, v9
10612; GFX9-NEXT:    v_addc_co_u32_e32 v10, vcc, 0, v11, vcc
10613; GFX9-NEXT:    v_mul_lo_u32 v5, v4, v5
10614; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, v9, v8
10615; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, v10, v7, vcc
10616; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
10617; GFX9-NEXT:    v_add_co_u32_e32 v5, vcc, v7, v5
10618; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
10619; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v5
10620; GFX9-NEXT:    s_addc_u32 s1, s7, s12
10621; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
10622; GFX9-NEXT:    s_xor_b64 s[6:7], s[0:1], s[12:13]
10623; GFX9-NEXT:    v_mul_lo_u32 v5, s6, v4
10624; GFX9-NEXT:    v_mul_hi_u32 v6, s6, v3
10625; GFX9-NEXT:    v_mul_hi_u32 v8, s6, v4
10626; GFX9-NEXT:    v_mul_hi_u32 v9, s7, v4
10627; GFX9-NEXT:    v_mul_lo_u32 v4, s7, v4
10628; GFX9-NEXT:    v_add_co_u32_e32 v5, vcc, v6, v5
10629; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, 0, v8, vcc
10630; GFX9-NEXT:    v_mul_lo_u32 v8, s7, v3
10631; GFX9-NEXT:    v_mul_hi_u32 v3, s7, v3
10632; GFX9-NEXT:    v_xor_b32_e32 v1, s8, v1
10633; GFX9-NEXT:    v_xor_b32_e32 v2, s8, v2
10634; GFX9-NEXT:    v_add_co_u32_e32 v5, vcc, v5, v8
10635; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v6, v3, vcc
10636; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v9, vcc
10637; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v4
10638; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v5, vcc
10639; GFX9-NEXT:    v_mul_lo_u32 v4, s10, v4
10640; GFX9-NEXT:    v_mul_hi_u32 v5, s10, v3
10641; GFX9-NEXT:    v_mul_lo_u32 v6, s11, v3
10642; GFX9-NEXT:    v_mul_lo_u32 v3, s10, v3
10643; GFX9-NEXT:    v_mov_b32_e32 v7, s8
10644; GFX9-NEXT:    v_subrev_co_u32_e32 v1, vcc, s8, v1
10645; GFX9-NEXT:    v_add_u32_e32 v4, v5, v4
10646; GFX9-NEXT:    v_subb_co_u32_e32 v2, vcc, v2, v7, vcc
10647; GFX9-NEXT:    v_add_u32_e32 v4, v4, v6
10648; GFX9-NEXT:    v_sub_u32_e32 v5, s7, v4
10649; GFX9-NEXT:    v_mov_b32_e32 v6, s11
10650; GFX9-NEXT:    v_sub_co_u32_e32 v3, vcc, s6, v3
10651; GFX9-NEXT:    v_subb_co_u32_e64 v5, s[0:1], v5, v6, vcc
10652; GFX9-NEXT:    v_subrev_co_u32_e64 v7, s[0:1], s10, v3
10653; GFX9-NEXT:    v_subbrev_co_u32_e64 v8, s[2:3], 0, v5, s[0:1]
10654; GFX9-NEXT:    v_cmp_le_u32_e64 s[2:3], s11, v8
10655; GFX9-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[2:3]
10656; GFX9-NEXT:    v_cmp_le_u32_e64 s[2:3], s10, v7
10657; GFX9-NEXT:    v_subb_co_u32_e64 v5, s[0:1], v5, v6, s[0:1]
10658; GFX9-NEXT:    v_cndmask_b32_e64 v10, 0, -1, s[2:3]
10659; GFX9-NEXT:    v_cmp_eq_u32_e64 s[2:3], s11, v8
10660; GFX9-NEXT:    v_subrev_co_u32_e64 v6, s[0:1], s10, v7
10661; GFX9-NEXT:    v_cndmask_b32_e64 v9, v9, v10, s[2:3]
10662; GFX9-NEXT:    v_subbrev_co_u32_e64 v5, s[0:1], 0, v5, s[0:1]
10663; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v9
10664; GFX9-NEXT:    v_cndmask_b32_e64 v5, v8, v5, s[0:1]
10665; GFX9-NEXT:    v_mov_b32_e32 v8, s7
10666; GFX9-NEXT:    v_subb_co_u32_e32 v4, vcc, v8, v4, vcc
10667; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s11, v4
10668; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, -1, vcc
10669; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s10, v3
10670; GFX9-NEXT:    v_cndmask_b32_e64 v9, 0, -1, vcc
10671; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, s11, v4
10672; GFX9-NEXT:    v_cndmask_b32_e32 v8, v8, v9, vcc
10673; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v8
10674; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
10675; GFX9-NEXT:    v_cndmask_b32_e64 v5, v7, v6, s[0:1]
10676; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
10677; GFX9-NEXT:    v_xor_b32_e32 v3, s12, v3
10678; GFX9-NEXT:    v_xor_b32_e32 v4, s12, v4
10679; GFX9-NEXT:    v_mov_b32_e32 v5, s12
10680; GFX9-NEXT:    v_subrev_co_u32_e32 v3, vcc, s12, v3
10681; GFX9-NEXT:    v_subb_co_u32_e32 v4, vcc, v4, v5, vcc
10682; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
10683; GFX9-NEXT:    global_store_dwordx4 v0, v[1:4], s[4:5]
10684; GFX9-NEXT:    s_endpgm
; Input IR under test: the divisor vector is built at run time by shifting
; <4096, 4096> left by %y; the vector srem result is stored to %out.
10685  %shl.y = shl <2 x i64> <i64 4096, i64 4096>, %y
10686  %r = srem <2 x i64> %x, %shl.y
10687  store <2 x i64> %r, <2 x i64> addrspace(1)* %out
10688  ret void
10689}
10690