1; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
2; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
3; RUN: opt -S -mtriple=amdgcn-- -mcpu=tahiti -amdgpu-codegenprepare -amdgpu-bypass-slow-div=0 %s | FileCheck %s
4; RUN: llc -mtriple=amdgcn-- -mcpu=tahiti -amdgpu-bypass-slow-div=0 < %s | FileCheck -check-prefix=GFX6 %s
5; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -amdgpu-bypass-slow-div=0 < %s | FileCheck -check-prefix=GFX9 %s
6
; Test: unsigned 32-bit division of two kernel arguments; the quotient is
; stored through %out (addrspace(1)).
; IR assertions (opt run with -amdgpu-codegenprepare): the udiv is expected to
; be replaced by a scaled float reciprocal of %y converted back to i32, an
; adjustment using the high 32 bits of a 64-bit product, a multiply-high with
; %x, and two icmp-uge/select correction steps on quotient and remainder.
; ISA assertions (llc runs for tahiti and gfx900) pin the emitted machine code.
7define amdgpu_kernel void @udiv_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) {
8; CHECK-LABEL: @udiv_i32(
9; CHECK-NEXT:    [[TMP1:%.*]] = uitofp i32 [[Y:%.*]] to float
10; CHECK-NEXT:    [[TMP2:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP1]])
11; CHECK-NEXT:    [[TMP3:%.*]] = fmul fast float [[TMP2]], 0x41EFFFFFC0000000
12; CHECK-NEXT:    [[TMP4:%.*]] = fptoui float [[TMP3]] to i32
13; CHECK-NEXT:    [[TMP5:%.*]] = sub i32 0, [[Y]]
14; CHECK-NEXT:    [[TMP6:%.*]] = mul i32 [[TMP5]], [[TMP4]]
15; CHECK-NEXT:    [[TMP7:%.*]] = zext i32 [[TMP4]] to i64
16; CHECK-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP6]] to i64
17; CHECK-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP7]], [[TMP8]]
18; CHECK-NEXT:    [[TMP10:%.*]] = trunc i64 [[TMP9]] to i32
19; CHECK-NEXT:    [[TMP11:%.*]] = lshr i64 [[TMP9]], 32
20; CHECK-NEXT:    [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32
21; CHECK-NEXT:    [[TMP13:%.*]] = add i32 [[TMP4]], [[TMP12]]
22; CHECK-NEXT:    [[TMP14:%.*]] = zext i32 [[X:%.*]] to i64
23; CHECK-NEXT:    [[TMP15:%.*]] = zext i32 [[TMP13]] to i64
24; CHECK-NEXT:    [[TMP16:%.*]] = mul i64 [[TMP14]], [[TMP15]]
25; CHECK-NEXT:    [[TMP17:%.*]] = trunc i64 [[TMP16]] to i32
26; CHECK-NEXT:    [[TMP18:%.*]] = lshr i64 [[TMP16]], 32
27; CHECK-NEXT:    [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
28; CHECK-NEXT:    [[TMP20:%.*]] = mul i32 [[TMP19]], [[Y]]
29; CHECK-NEXT:    [[TMP21:%.*]] = sub i32 [[X]], [[TMP20]]
30; CHECK-NEXT:    [[TMP22:%.*]] = icmp uge i32 [[TMP21]], [[Y]]
31; CHECK-NEXT:    [[TMP23:%.*]] = add i32 [[TMP19]], 1
32; CHECK-NEXT:    [[TMP24:%.*]] = select i1 [[TMP22]], i32 [[TMP23]], i32 [[TMP19]]
33; CHECK-NEXT:    [[TMP25:%.*]] = sub i32 [[TMP21]], [[Y]]
34; CHECK-NEXT:    [[TMP26:%.*]] = select i1 [[TMP22]], i32 [[TMP25]], i32 [[TMP21]]
35; CHECK-NEXT:    [[TMP27:%.*]] = icmp uge i32 [[TMP26]], [[Y]]
36; CHECK-NEXT:    [[TMP28:%.*]] = add i32 [[TMP24]], 1
37; CHECK-NEXT:    [[TMP29:%.*]] = select i1 [[TMP27]], i32 [[TMP28]], i32 [[TMP24]]
38; CHECK-NEXT:    store i32 [[TMP29]], i32 addrspace(1)* [[OUT:%.*]], align 4
39; CHECK-NEXT:    ret void
40;
41; GFX6-LABEL: udiv_i32:
42; GFX6:       ; %bb.0:
43; GFX6-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
44; GFX6-NEXT:    s_mov_b32 s7, 0xf000
45; GFX6-NEXT:    s_mov_b32 s6, -1
46; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
47; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s3
48; GFX6-NEXT:    s_sub_i32 s4, 0, s3
49; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
50; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
51; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
52; GFX6-NEXT:    v_mul_lo_u32 v1, s4, v0
53; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
54; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
55; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
56; GFX6-NEXT:    v_mul_hi_u32 v0, s2, v0
57; GFX6-NEXT:    v_mul_lo_u32 v1, v0, s3
58; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
59; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s2, v1
60; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s3, v1
61; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
62; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s3, v1
63; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, v2, s[0:1]
64; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
65; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s3, v1
66; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
67; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
68; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
69; GFX6-NEXT:    s_endpgm
70;
71; GFX9-LABEL: udiv_i32:
72; GFX9:       ; %bb.0:
73; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
74; GFX9-NEXT:    v_mov_b32_e32 v2, 0
75; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
76; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
77; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s3
78; GFX9-NEXT:    s_sub_i32 s4, 0, s3
79; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
80; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
81; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
82; GFX9-NEXT:    v_mul_lo_u32 v1, s4, v0
83; GFX9-NEXT:    v_mul_hi_u32 v1, v0, v1
84; GFX9-NEXT:    v_add_u32_e32 v0, v0, v1
85; GFX9-NEXT:    v_mul_hi_u32 v0, s2, v0
86; GFX9-NEXT:    v_mul_lo_u32 v1, v0, s3
87; GFX9-NEXT:    v_add_u32_e32 v3, 1, v0
88; GFX9-NEXT:    v_sub_u32_e32 v1, s2, v1
89; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v1
90; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
91; GFX9-NEXT:    v_subrev_u32_e32 v3, s3, v1
92; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
93; GFX9-NEXT:    v_add_u32_e32 v3, 1, v0
94; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v1
95; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
96; GFX9-NEXT:    global_store_dword v2, v0, s[0:1]
97; GFX9-NEXT:    s_endpgm
  ; Operation under test: a single i32 udiv whose result is written to %out.
98  %r = udiv i32 %x, %y
99  store i32 %r, i32 addrspace(1)* %out
100  ret void
101}
102
; Test: unsigned 32-bit remainder of two kernel arguments; the remainder is
; stored through %out (addrspace(1)).
; IR assertions (opt run with -amdgpu-codegenprepare): the urem is expected to
; be replaced by a scaled float reciprocal of %y converted back to i32, an
; adjustment using the high 32 bits of a 64-bit product, a multiply-high with
; %x, and two icmp-uge/select steps that each conditionally subtract %y from
; the running remainder.
; ISA assertions (llc runs for tahiti and gfx900) pin the emitted machine code.
103define amdgpu_kernel void @urem_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) {
104; CHECK-LABEL: @urem_i32(
105; CHECK-NEXT:    [[TMP1:%.*]] = uitofp i32 [[Y:%.*]] to float
106; CHECK-NEXT:    [[TMP2:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP1]])
107; CHECK-NEXT:    [[TMP3:%.*]] = fmul fast float [[TMP2]], 0x41EFFFFFC0000000
108; CHECK-NEXT:    [[TMP4:%.*]] = fptoui float [[TMP3]] to i32
109; CHECK-NEXT:    [[TMP5:%.*]] = sub i32 0, [[Y]]
110; CHECK-NEXT:    [[TMP6:%.*]] = mul i32 [[TMP5]], [[TMP4]]
111; CHECK-NEXT:    [[TMP7:%.*]] = zext i32 [[TMP4]] to i64
112; CHECK-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP6]] to i64
113; CHECK-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP7]], [[TMP8]]
114; CHECK-NEXT:    [[TMP10:%.*]] = trunc i64 [[TMP9]] to i32
115; CHECK-NEXT:    [[TMP11:%.*]] = lshr i64 [[TMP9]], 32
116; CHECK-NEXT:    [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32
117; CHECK-NEXT:    [[TMP13:%.*]] = add i32 [[TMP4]], [[TMP12]]
118; CHECK-NEXT:    [[TMP14:%.*]] = zext i32 [[X:%.*]] to i64
119; CHECK-NEXT:    [[TMP15:%.*]] = zext i32 [[TMP13]] to i64
120; CHECK-NEXT:    [[TMP16:%.*]] = mul i64 [[TMP14]], [[TMP15]]
121; CHECK-NEXT:    [[TMP17:%.*]] = trunc i64 [[TMP16]] to i32
122; CHECK-NEXT:    [[TMP18:%.*]] = lshr i64 [[TMP16]], 32
123; CHECK-NEXT:    [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
124; CHECK-NEXT:    [[TMP20:%.*]] = mul i32 [[TMP19]], [[Y]]
125; CHECK-NEXT:    [[TMP21:%.*]] = sub i32 [[X]], [[TMP20]]
126; CHECK-NEXT:    [[TMP22:%.*]] = icmp uge i32 [[TMP21]], [[Y]]
127; CHECK-NEXT:    [[TMP23:%.*]] = sub i32 [[TMP21]], [[Y]]
128; CHECK-NEXT:    [[TMP24:%.*]] = select i1 [[TMP22]], i32 [[TMP23]], i32 [[TMP21]]
129; CHECK-NEXT:    [[TMP25:%.*]] = icmp uge i32 [[TMP24]], [[Y]]
130; CHECK-NEXT:    [[TMP26:%.*]] = sub i32 [[TMP24]], [[Y]]
131; CHECK-NEXT:    [[TMP27:%.*]] = select i1 [[TMP25]], i32 [[TMP26]], i32 [[TMP24]]
132; CHECK-NEXT:    store i32 [[TMP27]], i32 addrspace(1)* [[OUT:%.*]], align 4
133; CHECK-NEXT:    ret void
134;
135; GFX6-LABEL: urem_i32:
136; GFX6:       ; %bb.0:
137; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
138; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
139; GFX6-NEXT:    s_mov_b32 s3, 0xf000
140; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
141; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s5
142; GFX6-NEXT:    s_sub_i32 s2, 0, s5
143; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
144; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
145; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
146; GFX6-NEXT:    v_mul_lo_u32 v1, s2, v0
147; GFX6-NEXT:    s_mov_b32 s2, -1
148; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
149; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
150; GFX6-NEXT:    v_mul_hi_u32 v0, s4, v0
151; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s5
152; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
153; GFX6-NEXT:    v_subrev_i32_e32 v1, vcc, s5, v0
154; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s5, v0
155; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
156; GFX6-NEXT:    v_subrev_i32_e32 v1, vcc, s5, v0
157; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s5, v0
158; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
159; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
160; GFX6-NEXT:    s_endpgm
161;
162; GFX9-LABEL: urem_i32:
163; GFX9:       ; %bb.0:
164; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
165; GFX9-NEXT:    s_nop 0
166; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
167; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
168; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s3
169; GFX9-NEXT:    s_sub_i32 s4, 0, s3
170; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
171; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
172; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
173; GFX9-NEXT:    v_mul_lo_u32 v1, s4, v0
174; GFX9-NEXT:    v_mul_hi_u32 v1, v0, v1
175; GFX9-NEXT:    v_add_u32_e32 v0, v0, v1
176; GFX9-NEXT:    v_mul_hi_u32 v0, s2, v0
177; GFX9-NEXT:    v_mov_b32_e32 v1, 0
178; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s3
179; GFX9-NEXT:    v_sub_u32_e32 v0, s2, v0
180; GFX9-NEXT:    v_subrev_u32_e32 v2, s3, v0
181; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v0
182; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
183; GFX9-NEXT:    v_subrev_u32_e32 v2, s3, v0
184; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v0
185; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
186; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
187; GFX9-NEXT:    s_endpgm
  ; Operation under test: a single i32 urem whose result is written to %out.
188  %r = urem i32 %x, %y
189  store i32 %r, i32 addrspace(1)* %out
190  ret void
191}
192
; Test: signed 32-bit division of two kernel arguments; the quotient is stored
; through %out (addrspace(1)).
; IR assertions (opt run with -amdgpu-codegenprepare): each operand is first
; combined with its own sign mask (ashr 31, add, xor), the same reciprocal /
; multiply-high / two-step correction sequence as the unsigned case runs on
; those values, and the quotient is finally xor'ed with and reduced by the xor
; of the two sign masks.
; ISA assertions (llc runs for tahiti and gfx900) pin the emitted machine code.
193define amdgpu_kernel void @sdiv_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) {
194; CHECK-LABEL: @sdiv_i32(
195; CHECK-NEXT:    [[TMP1:%.*]] = ashr i32 [[X:%.*]], 31
196; CHECK-NEXT:    [[TMP2:%.*]] = ashr i32 [[Y:%.*]], 31
197; CHECK-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
198; CHECK-NEXT:    [[TMP4:%.*]] = add i32 [[X]], [[TMP1]]
199; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[Y]], [[TMP2]]
200; CHECK-NEXT:    [[TMP6:%.*]] = xor i32 [[TMP4]], [[TMP1]]
201; CHECK-NEXT:    [[TMP7:%.*]] = xor i32 [[TMP5]], [[TMP2]]
202; CHECK-NEXT:    [[TMP8:%.*]] = uitofp i32 [[TMP7]] to float
203; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP8]])
204; CHECK-NEXT:    [[TMP10:%.*]] = fmul fast float [[TMP9]], 0x41EFFFFFC0000000
205; CHECK-NEXT:    [[TMP11:%.*]] = fptoui float [[TMP10]] to i32
206; CHECK-NEXT:    [[TMP12:%.*]] = sub i32 0, [[TMP7]]
207; CHECK-NEXT:    [[TMP13:%.*]] = mul i32 [[TMP12]], [[TMP11]]
208; CHECK-NEXT:    [[TMP14:%.*]] = zext i32 [[TMP11]] to i64
209; CHECK-NEXT:    [[TMP15:%.*]] = zext i32 [[TMP13]] to i64
210; CHECK-NEXT:    [[TMP16:%.*]] = mul i64 [[TMP14]], [[TMP15]]
211; CHECK-NEXT:    [[TMP17:%.*]] = trunc i64 [[TMP16]] to i32
212; CHECK-NEXT:    [[TMP18:%.*]] = lshr i64 [[TMP16]], 32
213; CHECK-NEXT:    [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
214; CHECK-NEXT:    [[TMP20:%.*]] = add i32 [[TMP11]], [[TMP19]]
215; CHECK-NEXT:    [[TMP21:%.*]] = zext i32 [[TMP6]] to i64
216; CHECK-NEXT:    [[TMP22:%.*]] = zext i32 [[TMP20]] to i64
217; CHECK-NEXT:    [[TMP23:%.*]] = mul i64 [[TMP21]], [[TMP22]]
218; CHECK-NEXT:    [[TMP24:%.*]] = trunc i64 [[TMP23]] to i32
219; CHECK-NEXT:    [[TMP25:%.*]] = lshr i64 [[TMP23]], 32
220; CHECK-NEXT:    [[TMP26:%.*]] = trunc i64 [[TMP25]] to i32
221; CHECK-NEXT:    [[TMP27:%.*]] = mul i32 [[TMP26]], [[TMP7]]
222; CHECK-NEXT:    [[TMP28:%.*]] = sub i32 [[TMP6]], [[TMP27]]
223; CHECK-NEXT:    [[TMP29:%.*]] = icmp uge i32 [[TMP28]], [[TMP7]]
224; CHECK-NEXT:    [[TMP30:%.*]] = add i32 [[TMP26]], 1
225; CHECK-NEXT:    [[TMP31:%.*]] = select i1 [[TMP29]], i32 [[TMP30]], i32 [[TMP26]]
226; CHECK-NEXT:    [[TMP32:%.*]] = sub i32 [[TMP28]], [[TMP7]]
227; CHECK-NEXT:    [[TMP33:%.*]] = select i1 [[TMP29]], i32 [[TMP32]], i32 [[TMP28]]
228; CHECK-NEXT:    [[TMP34:%.*]] = icmp uge i32 [[TMP33]], [[TMP7]]
229; CHECK-NEXT:    [[TMP35:%.*]] = add i32 [[TMP31]], 1
230; CHECK-NEXT:    [[TMP36:%.*]] = select i1 [[TMP34]], i32 [[TMP35]], i32 [[TMP31]]
231; CHECK-NEXT:    [[TMP37:%.*]] = xor i32 [[TMP36]], [[TMP3]]
232; CHECK-NEXT:    [[TMP38:%.*]] = sub i32 [[TMP37]], [[TMP3]]
233; CHECK-NEXT:    store i32 [[TMP38]], i32 addrspace(1)* [[OUT:%.*]], align 4
234; CHECK-NEXT:    ret void
235;
236; GFX6-LABEL: sdiv_i32:
237; GFX6:       ; %bb.0:
238; GFX6-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
239; GFX6-NEXT:    s_mov_b32 s7, 0xf000
240; GFX6-NEXT:    s_mov_b32 s6, -1
241; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
242; GFX6-NEXT:    s_ashr_i32 s8, s3, 31
243; GFX6-NEXT:    s_add_i32 s3, s3, s8
244; GFX6-NEXT:    s_xor_b32 s3, s3, s8
245; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s3
246; GFX6-NEXT:    s_sub_i32 s4, 0, s3
247; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
248; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
249; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
250; GFX6-NEXT:    v_mul_lo_u32 v1, s4, v0
251; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
252; GFX6-NEXT:    s_ashr_i32 s0, s2, 31
253; GFX6-NEXT:    s_add_i32 s1, s2, s0
254; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
255; GFX6-NEXT:    s_xor_b32 s1, s1, s0
256; GFX6-NEXT:    s_xor_b32 s2, s0, s8
257; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
258; GFX6-NEXT:    v_mul_hi_u32 v0, s1, v0
259; GFX6-NEXT:    v_mul_lo_u32 v1, v0, s3
260; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
261; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s1, v1
262; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s3, v1
263; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
264; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s3, v1
265; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, v2, s[0:1]
266; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
267; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s3, v1
268; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
269; GFX6-NEXT:    v_xor_b32_e32 v0, s2, v0
270; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s2, v0
271; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
272; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
273; GFX6-NEXT:    s_endpgm
274;
275; GFX9-LABEL: sdiv_i32:
276; GFX9:       ; %bb.0:
277; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
278; GFX9-NEXT:    v_mov_b32_e32 v2, 0
279; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
280; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
281; GFX9-NEXT:    s_ashr_i32 s4, s3, 31
282; GFX9-NEXT:    s_add_i32 s3, s3, s4
283; GFX9-NEXT:    s_xor_b32 s3, s3, s4
284; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s3
285; GFX9-NEXT:    s_sub_i32 s5, 0, s3
286; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
287; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
288; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
289; GFX9-NEXT:    v_mul_lo_u32 v1, s5, v0
290; GFX9-NEXT:    s_ashr_i32 s5, s2, 31
291; GFX9-NEXT:    s_add_i32 s2, s2, s5
292; GFX9-NEXT:    s_xor_b32 s2, s2, s5
293; GFX9-NEXT:    v_mul_hi_u32 v1, v0, v1
294; GFX9-NEXT:    s_xor_b32 s4, s5, s4
295; GFX9-NEXT:    v_add_u32_e32 v0, v0, v1
296; GFX9-NEXT:    v_mul_hi_u32 v0, s2, v0
297; GFX9-NEXT:    v_mul_lo_u32 v1, v0, s3
298; GFX9-NEXT:    v_add_u32_e32 v3, 1, v0
299; GFX9-NEXT:    v_sub_u32_e32 v1, s2, v1
300; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v1
301; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
302; GFX9-NEXT:    v_subrev_u32_e32 v3, s3, v1
303; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
304; GFX9-NEXT:    v_add_u32_e32 v3, 1, v0
305; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v1
306; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
307; GFX9-NEXT:    v_xor_b32_e32 v0, s4, v0
308; GFX9-NEXT:    v_subrev_u32_e32 v0, s4, v0
309; GFX9-NEXT:    global_store_dword v2, v0, s[0:1]
310; GFX9-NEXT:    s_endpgm
  ; Operation under test: a single i32 sdiv whose result is written to %out.
311  %r = sdiv i32 %x, %y
312  store i32 %r, i32 addrspace(1)* %out
313  ret void
314}
315
; Test: signed 32-bit remainder of two kernel arguments; the remainder is
; stored through %out (addrspace(1)).
; IR assertions (opt run with -amdgpu-codegenprepare): each operand is first
; combined with its own sign mask (ashr 31, add, xor), the reciprocal /
; multiply-high sequence and two conditional subtractions of the divisor run
; on those values, and the remainder is finally xor'ed with and reduced by the
; sign mask of %x.
; ISA assertions (llc runs for tahiti and gfx900) pin the emitted machine code.
316define amdgpu_kernel void @srem_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) {
317; CHECK-LABEL: @srem_i32(
318; CHECK-NEXT:    [[TMP1:%.*]] = ashr i32 [[X:%.*]], 31
319; CHECK-NEXT:    [[TMP2:%.*]] = ashr i32 [[Y:%.*]], 31
320; CHECK-NEXT:    [[TMP3:%.*]] = add i32 [[X]], [[TMP1]]
321; CHECK-NEXT:    [[TMP4:%.*]] = add i32 [[Y]], [[TMP2]]
322; CHECK-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP1]]
323; CHECK-NEXT:    [[TMP6:%.*]] = xor i32 [[TMP4]], [[TMP2]]
324; CHECK-NEXT:    [[TMP7:%.*]] = uitofp i32 [[TMP6]] to float
325; CHECK-NEXT:    [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]])
326; CHECK-NEXT:    [[TMP9:%.*]] = fmul fast float [[TMP8]], 0x41EFFFFFC0000000
327; CHECK-NEXT:    [[TMP10:%.*]] = fptoui float [[TMP9]] to i32
328; CHECK-NEXT:    [[TMP11:%.*]] = sub i32 0, [[TMP6]]
329; CHECK-NEXT:    [[TMP12:%.*]] = mul i32 [[TMP11]], [[TMP10]]
330; CHECK-NEXT:    [[TMP13:%.*]] = zext i32 [[TMP10]] to i64
331; CHECK-NEXT:    [[TMP14:%.*]] = zext i32 [[TMP12]] to i64
332; CHECK-NEXT:    [[TMP15:%.*]] = mul i64 [[TMP13]], [[TMP14]]
333; CHECK-NEXT:    [[TMP16:%.*]] = trunc i64 [[TMP15]] to i32
334; CHECK-NEXT:    [[TMP17:%.*]] = lshr i64 [[TMP15]], 32
335; CHECK-NEXT:    [[TMP18:%.*]] = trunc i64 [[TMP17]] to i32
336; CHECK-NEXT:    [[TMP19:%.*]] = add i32 [[TMP10]], [[TMP18]]
337; CHECK-NEXT:    [[TMP20:%.*]] = zext i32 [[TMP5]] to i64
338; CHECK-NEXT:    [[TMP21:%.*]] = zext i32 [[TMP19]] to i64
339; CHECK-NEXT:    [[TMP22:%.*]] = mul i64 [[TMP20]], [[TMP21]]
340; CHECK-NEXT:    [[TMP23:%.*]] = trunc i64 [[TMP22]] to i32
341; CHECK-NEXT:    [[TMP24:%.*]] = lshr i64 [[TMP22]], 32
342; CHECK-NEXT:    [[TMP25:%.*]] = trunc i64 [[TMP24]] to i32
343; CHECK-NEXT:    [[TMP26:%.*]] = mul i32 [[TMP25]], [[TMP6]]
344; CHECK-NEXT:    [[TMP27:%.*]] = sub i32 [[TMP5]], [[TMP26]]
345; CHECK-NEXT:    [[TMP28:%.*]] = icmp uge i32 [[TMP27]], [[TMP6]]
346; CHECK-NEXT:    [[TMP29:%.*]] = sub i32 [[TMP27]], [[TMP6]]
347; CHECK-NEXT:    [[TMP30:%.*]] = select i1 [[TMP28]], i32 [[TMP29]], i32 [[TMP27]]
348; CHECK-NEXT:    [[TMP31:%.*]] = icmp uge i32 [[TMP30]], [[TMP6]]
349; CHECK-NEXT:    [[TMP32:%.*]] = sub i32 [[TMP30]], [[TMP6]]
350; CHECK-NEXT:    [[TMP33:%.*]] = select i1 [[TMP31]], i32 [[TMP32]], i32 [[TMP30]]
351; CHECK-NEXT:    [[TMP34:%.*]] = xor i32 [[TMP33]], [[TMP1]]
352; CHECK-NEXT:    [[TMP35:%.*]] = sub i32 [[TMP34]], [[TMP1]]
353; CHECK-NEXT:    store i32 [[TMP35]], i32 addrspace(1)* [[OUT:%.*]], align 4
354; CHECK-NEXT:    ret void
355;
356; GFX6-LABEL: srem_i32:
357; GFX6:       ; %bb.0:
358; GFX6-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
359; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
360; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
361; GFX6-NEXT:    s_ashr_i32 s4, s3, 31
362; GFX6-NEXT:    s_add_i32 s3, s3, s4
363; GFX6-NEXT:    s_xor_b32 s4, s3, s4
364; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s4
365; GFX6-NEXT:    s_sub_i32 s3, 0, s4
366; GFX6-NEXT:    s_ashr_i32 s5, s2, 31
367; GFX6-NEXT:    s_add_i32 s2, s2, s5
368; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
369; GFX6-NEXT:    s_xor_b32 s6, s2, s5
370; GFX6-NEXT:    s_mov_b32 s2, -1
371; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
372; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
373; GFX6-NEXT:    v_mul_lo_u32 v1, s3, v0
374; GFX6-NEXT:    s_mov_b32 s3, 0xf000
375; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
376; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
377; GFX6-NEXT:    v_mul_hi_u32 v0, s6, v0
378; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s4
379; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s6, v0
380; GFX6-NEXT:    v_subrev_i32_e32 v1, vcc, s4, v0
381; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s4, v0
382; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
383; GFX6-NEXT:    v_subrev_i32_e32 v1, vcc, s4, v0
384; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s4, v0
385; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
386; GFX6-NEXT:    v_xor_b32_e32 v0, s5, v0
387; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s5, v0
388; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
389; GFX6-NEXT:    s_endpgm
390;
391; GFX9-LABEL: srem_i32:
392; GFX9:       ; %bb.0:
393; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
394; GFX9-NEXT:    s_nop 0
395; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
396; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
397; GFX9-NEXT:    s_ashr_i32 s4, s3, 31
398; GFX9-NEXT:    s_add_i32 s3, s3, s4
399; GFX9-NEXT:    s_xor_b32 s3, s3, s4
400; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s3
401; GFX9-NEXT:    s_sub_i32 s4, 0, s3
402; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
403; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
404; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
405; GFX9-NEXT:    v_mul_lo_u32 v1, s4, v0
406; GFX9-NEXT:    s_ashr_i32 s4, s2, 31
407; GFX9-NEXT:    s_add_i32 s2, s2, s4
408; GFX9-NEXT:    s_xor_b32 s2, s2, s4
409; GFX9-NEXT:    v_mul_hi_u32 v1, v0, v1
410; GFX9-NEXT:    v_add_u32_e32 v0, v0, v1
411; GFX9-NEXT:    v_mul_hi_u32 v0, s2, v0
412; GFX9-NEXT:    v_mov_b32_e32 v1, 0
413; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s3
414; GFX9-NEXT:    v_sub_u32_e32 v0, s2, v0
415; GFX9-NEXT:    v_subrev_u32_e32 v2, s3, v0
416; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v0
417; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
418; GFX9-NEXT:    v_subrev_u32_e32 v2, s3, v0
419; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v0
420; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
421; GFX9-NEXT:    v_xor_b32_e32 v0, s4, v0
422; GFX9-NEXT:    v_subrev_u32_e32 v0, s4, v0
423; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
424; GFX9-NEXT:    s_endpgm
  ; Operation under test: a single i32 srem whose result is written to %out.
425  %r = srem i32 %x, %y
426  store i32 %r, i32 addrspace(1)* %out
427  ret void
428}
429
; Test: unsigned 16-bit division of two kernel arguments; the quotient is
; stored through %out (addrspace(1)).
; IR assertions (opt run with -amdgpu-codegenprepare): both operands are
; zero-extended to i32 and converted to float, the quotient is formed as
; trunc(x * rcp(y)), a fused multiply-add recomputes the residual, and 1 is
; added when |residual| >= |y|. The result is masked to 16 bits and truncated.
; ISA assertions (llc runs for tahiti and gfx900) pin the emitted machine code.
430define amdgpu_kernel void @udiv_i16(i16 addrspace(1)* %out, i16 %x, i16 %y) {
431; CHECK-LABEL: @udiv_i16(
432; CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[X:%.*]] to i32
433; CHECK-NEXT:    [[TMP2:%.*]] = zext i16 [[Y:%.*]] to i32
434; CHECK-NEXT:    [[TMP3:%.*]] = uitofp i32 [[TMP1]] to float
435; CHECK-NEXT:    [[TMP4:%.*]] = uitofp i32 [[TMP2]] to float
436; CHECK-NEXT:    [[TMP5:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP4]])
437; CHECK-NEXT:    [[TMP6:%.*]] = fmul fast float [[TMP3]], [[TMP5]]
438; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.trunc.f32(float [[TMP6]])
439; CHECK-NEXT:    [[TMP8:%.*]] = fneg fast float [[TMP7]]
440; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP8]], float [[TMP4]], float [[TMP3]])
441; CHECK-NEXT:    [[TMP10:%.*]] = fptoui float [[TMP7]] to i32
442; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
443; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.fabs.f32(float [[TMP4]])
444; CHECK-NEXT:    [[TMP13:%.*]] = fcmp fast oge float [[TMP11]], [[TMP12]]
445; CHECK-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 1, i32 0
446; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]]
447; CHECK-NEXT:    [[TMP16:%.*]] = and i32 [[TMP15]], 65535
448; CHECK-NEXT:    [[TMP17:%.*]] = trunc i32 [[TMP16]] to i16
449; CHECK-NEXT:    store i16 [[TMP17]], i16 addrspace(1)* [[OUT:%.*]], align 2
450; CHECK-NEXT:    ret void
451;
452; GFX6-LABEL: udiv_i16:
453; GFX6:       ; %bb.0:
454; GFX6-NEXT:    s_load_dword s2, s[0:1], 0xb
455; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
456; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
457; GFX6-NEXT:    s_lshr_b32 s3, s2, 16
458; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s3
459; GFX6-NEXT:    s_and_b32 s2, s2, 0xffff
460; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s2
461; GFX6-NEXT:    s_mov_b32 s3, 0xf000
462; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v0
463; GFX6-NEXT:    s_mov_b32 s2, -1
464; GFX6-NEXT:    v_mul_f32_e32 v2, v1, v2
465; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
466; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v2
467; GFX6-NEXT:    v_mad_f32 v1, -v2, v0, v1
468; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
469; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, 0, v3, vcc
470; GFX6-NEXT:    buffer_store_short v0, off, s[0:3], 0
471; GFX6-NEXT:    s_endpgm
472;
473; GFX9-LABEL: udiv_i16:
474; GFX9:       ; %bb.0:
475; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x2c
476; GFX9-NEXT:    v_mov_b32_e32 v3, 0
477; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
478; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
479; GFX9-NEXT:    s_lshr_b32 s3, s2, 16
480; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s3
481; GFX9-NEXT:    s_and_b32 s2, s2, 0xffff
482; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s2
483; GFX9-NEXT:    v_rcp_iflag_f32_e32 v2, v0
484; GFX9-NEXT:    v_mul_f32_e32 v2, v1, v2
485; GFX9-NEXT:    v_trunc_f32_e32 v2, v2
486; GFX9-NEXT:    v_cvt_u32_f32_e32 v4, v2
487; GFX9-NEXT:    v_mad_f32 v1, -v2, v0, v1
488; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
489; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v4, vcc
490; GFX9-NEXT:    global_store_short v3, v0, s[0:1]
491; GFX9-NEXT:    s_endpgm
  ; Operation under test: a single i16 udiv whose result is written to %out.
492  %r = udiv i16 %x, %y
493  store i16 %r, i16 addrspace(1)* %out
494  ret void
495}
496
; Test: unsigned 16-bit remainder of two kernel arguments; the remainder is
; stored through %out (addrspace(1)).
; IR assertions (opt run with -amdgpu-codegenprepare): the quotient is built
; the same float way as for the 16-bit unsigned division (zext, uitofp, rcp,
; fmul, trunc, fmad-based +1 correction). The remainder is then x - quotient*y
; in i32, masked to 16 bits and truncated.
; ISA assertions (llc runs for tahiti and gfx900) pin the emitted machine code.
497define amdgpu_kernel void @urem_i16(i16 addrspace(1)* %out, i16 %x, i16 %y) {
498; CHECK-LABEL: @urem_i16(
499; CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[X:%.*]] to i32
500; CHECK-NEXT:    [[TMP2:%.*]] = zext i16 [[Y:%.*]] to i32
501; CHECK-NEXT:    [[TMP3:%.*]] = uitofp i32 [[TMP1]] to float
502; CHECK-NEXT:    [[TMP4:%.*]] = uitofp i32 [[TMP2]] to float
503; CHECK-NEXT:    [[TMP5:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP4]])
504; CHECK-NEXT:    [[TMP6:%.*]] = fmul fast float [[TMP3]], [[TMP5]]
505; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.trunc.f32(float [[TMP6]])
506; CHECK-NEXT:    [[TMP8:%.*]] = fneg fast float [[TMP7]]
507; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP8]], float [[TMP4]], float [[TMP3]])
508; CHECK-NEXT:    [[TMP10:%.*]] = fptoui float [[TMP7]] to i32
509; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
510; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.fabs.f32(float [[TMP4]])
511; CHECK-NEXT:    [[TMP13:%.*]] = fcmp fast oge float [[TMP11]], [[TMP12]]
512; CHECK-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 1, i32 0
513; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]]
514; CHECK-NEXT:    [[TMP16:%.*]] = mul i32 [[TMP15]], [[TMP2]]
515; CHECK-NEXT:    [[TMP17:%.*]] = sub i32 [[TMP1]], [[TMP16]]
516; CHECK-NEXT:    [[TMP18:%.*]] = and i32 [[TMP17]], 65535
517; CHECK-NEXT:    [[TMP19:%.*]] = trunc i32 [[TMP18]] to i16
518; CHECK-NEXT:    store i16 [[TMP19]], i16 addrspace(1)* [[OUT:%.*]], align 2
519; CHECK-NEXT:    ret void
520;
521; GFX6-LABEL: urem_i16:
522; GFX6:       ; %bb.0:
523; GFX6-NEXT:    s_load_dword s4, s[0:1], 0xb
524; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
525; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
526; GFX6-NEXT:    s_lshr_b32 s2, s4, 16
527; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s2
528; GFX6-NEXT:    s_and_b32 s3, s4, 0xffff
529; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s3
530; GFX6-NEXT:    s_mov_b32 s3, 0xf000
531; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v0
532; GFX6-NEXT:    v_mul_f32_e32 v2, v1, v2
533; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
534; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v2
535; GFX6-NEXT:    v_mad_f32 v1, -v2, v0, v1
536; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
537; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, 0, v3, vcc
538; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s2
539; GFX6-NEXT:    s_mov_b32 s2, -1
540; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
541; GFX6-NEXT:    buffer_store_short v0, off, s[0:3], 0
542; GFX6-NEXT:    s_endpgm
543;
544; GFX9-LABEL: urem_i16:
545; GFX9:       ; %bb.0:
546; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x2c
547; GFX9-NEXT:    s_nop 0
548; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
549; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
550; GFX9-NEXT:    s_lshr_b32 s3, s2, 16
551; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s3
552; GFX9-NEXT:    s_and_b32 s4, s2, 0xffff
553; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s4
554; GFX9-NEXT:    v_rcp_iflag_f32_e32 v2, v0
555; GFX9-NEXT:    v_mul_f32_e32 v2, v1, v2
556; GFX9-NEXT:    v_trunc_f32_e32 v2, v2
557; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v2
558; GFX9-NEXT:    v_mad_f32 v1, -v2, v0, v1
559; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
560; GFX9-NEXT:    v_mov_b32_e32 v1, 0
561; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v3, vcc
562; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s3
563; GFX9-NEXT:    v_sub_u32_e32 v0, s2, v0
564; GFX9-NEXT:    global_store_short v1, v0, s[0:1]
565; GFX9-NEXT:    s_endpgm
  ; Operation under test: a single i16 urem whose result is written to %out.
566  %r = urem i16 %x, %y
567  store i16 %r, i16 addrspace(1)* %out
568  ret void
569}
570
; Test: signed 16-bit division of two kernel arguments; the quotient is stored
; through %out (addrspace(1)).
; IR assertions (opt run with -amdgpu-codegenprepare): both operands are
; sign-extended to i32 and converted to float, the quotient is formed as
; trunc(x * rcp(y)), and a fused multiply-add recomputes the residual. When
; |residual| >= |y| the value ((x xor y) ashr 30) or 1 is added to the
; quotient. The result is sign-extended in place (shl 16, ashr 16) and truncated.
; ISA assertions (llc runs for tahiti and gfx900) pin the emitted machine code.
571define amdgpu_kernel void @sdiv_i16(i16 addrspace(1)* %out, i16 %x, i16 %y) {
572; CHECK-LABEL: @sdiv_i16(
573; CHECK-NEXT:    [[TMP1:%.*]] = sext i16 [[X:%.*]] to i32
574; CHECK-NEXT:    [[TMP2:%.*]] = sext i16 [[Y:%.*]] to i32
575; CHECK-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
576; CHECK-NEXT:    [[TMP4:%.*]] = ashr i32 [[TMP3]], 30
577; CHECK-NEXT:    [[TMP5:%.*]] = or i32 [[TMP4]], 1
578; CHECK-NEXT:    [[TMP6:%.*]] = sitofp i32 [[TMP1]] to float
579; CHECK-NEXT:    [[TMP7:%.*]] = sitofp i32 [[TMP2]] to float
580; CHECK-NEXT:    [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]])
581; CHECK-NEXT:    [[TMP9:%.*]] = fmul fast float [[TMP6]], [[TMP8]]
582; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.trunc.f32(float [[TMP9]])
583; CHECK-NEXT:    [[TMP11:%.*]] = fneg fast float [[TMP10]]
584; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP11]], float [[TMP7]], float [[TMP6]])
585; CHECK-NEXT:    [[TMP13:%.*]] = fptosi float [[TMP10]] to i32
586; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP12]])
587; CHECK-NEXT:    [[TMP15:%.*]] = call fast float @llvm.fabs.f32(float [[TMP7]])
588; CHECK-NEXT:    [[TMP16:%.*]] = fcmp fast oge float [[TMP14]], [[TMP15]]
589; CHECK-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP5]], i32 0
590; CHECK-NEXT:    [[TMP18:%.*]] = add i32 [[TMP13]], [[TMP17]]
591; CHECK-NEXT:    [[TMP19:%.*]] = shl i32 [[TMP18]], 16
592; CHECK-NEXT:    [[TMP20:%.*]] = ashr i32 [[TMP19]], 16
593; CHECK-NEXT:    [[TMP21:%.*]] = trunc i32 [[TMP20]] to i16
594; CHECK-NEXT:    store i16 [[TMP21]], i16 addrspace(1)* [[OUT:%.*]], align 2
595; CHECK-NEXT:    ret void
596;
597; GFX6-LABEL: sdiv_i16:
598; GFX6:       ; %bb.0:
599; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
600; GFX6-NEXT:    s_load_dword s0, s[0:1], 0xb
601; GFX6-NEXT:    s_mov_b32 s7, 0xf000
602; GFX6-NEXT:    s_mov_b32 s6, -1
603; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
604; GFX6-NEXT:    s_ashr_i32 s1, s0, 16
605; GFX6-NEXT:    v_cvt_f32_i32_e32 v0, s1
606; GFX6-NEXT:    s_sext_i32_i16 s0, s0
607; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s0
608; GFX6-NEXT:    s_xor_b32 s0, s0, s1
609; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v0
610; GFX6-NEXT:    s_ashr_i32 s0, s0, 30
611; GFX6-NEXT:    s_or_b32 s0, s0, 1
612; GFX6-NEXT:    v_mov_b32_e32 v3, s0
613; GFX6-NEXT:    v_mul_f32_e32 v2, v1, v2
614; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
615; GFX6-NEXT:    v_mad_f32 v1, -v2, v0, v1
616; GFX6-NEXT:    v_cvt_i32_f32_e32 v2, v2
617; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
618; GFX6-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
619; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
620; GFX6-NEXT:    buffer_store_short v0, off, s[4:7], 0
621; GFX6-NEXT:    s_endpgm
622;
623; GFX9-LABEL: sdiv_i16:
624; GFX9:       ; %bb.0:
625; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
626; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x2c
627; GFX9-NEXT:    v_mov_b32_e32 v1, 0
628; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
629; GFX9-NEXT:    s_ashr_i32 s0, s4, 16
630; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s0
631; GFX9-NEXT:    s_sext_i32_i16 s1, s4
632; GFX9-NEXT:    v_cvt_f32_i32_e32 v2, s1
633; GFX9-NEXT:    s_xor_b32 s0, s1, s0
634; GFX9-NEXT:    v_rcp_iflag_f32_e32 v3, v0
635; GFX9-NEXT:    s_ashr_i32 s0, s0, 30
636; GFX9-NEXT:    s_or_b32 s4, s0, 1
637; GFX9-NEXT:    v_mul_f32_e32 v3, v2, v3
638; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
639; GFX9-NEXT:    v_mad_f32 v2, -v3, v0, v2
640; GFX9-NEXT:    v_cvt_i32_f32_e32 v3, v3
641; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v2|, |v0|
642; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
643; GFX9-NEXT:    s_cselect_b32 s0, s4, 0
644; GFX9-NEXT:    v_add_u32_e32 v0, s0, v3
645; GFX9-NEXT:    global_store_short v1, v0, s[2:3]
646; GFX9-NEXT:    s_endpgm
  ; Operation under test: a single i16 sdiv whose result is written to %out.
647  %r = sdiv i16 %x, %y
648  store i16 %r, i16 addrspace(1)* %out
649  ret void
650}
651
; Signed 16-bit remainder of two kernel arguments, stored through %out.
; The opt assertions expect both operands sign-extended to i32 and the
; quotient formed in float (sitofp, rcp, fmul, trunc, fmad.ftz), then adjusted
; by either 0 or the term `or (ashr (xor x, y), 30), 1` depending on the fabs
; compare. The remainder is computed as x - q * y and re-sign-extended from
; 16 bits by the shl/ashr pair before the final trunc.
; The llc assertions pin the instruction sequences for the tahiti and gfx900
; run lines at the top of the file.
652define amdgpu_kernel void @srem_i16(i16 addrspace(1)* %out, i16 %x, i16 %y) {
653; CHECK-LABEL: @srem_i16(
654; CHECK-NEXT:    [[TMP1:%.*]] = sext i16 [[X:%.*]] to i32
655; CHECK-NEXT:    [[TMP2:%.*]] = sext i16 [[Y:%.*]] to i32
656; CHECK-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
657; CHECK-NEXT:    [[TMP4:%.*]] = ashr i32 [[TMP3]], 30
658; CHECK-NEXT:    [[TMP5:%.*]] = or i32 [[TMP4]], 1
659; CHECK-NEXT:    [[TMP6:%.*]] = sitofp i32 [[TMP1]] to float
660; CHECK-NEXT:    [[TMP7:%.*]] = sitofp i32 [[TMP2]] to float
661; CHECK-NEXT:    [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]])
662; CHECK-NEXT:    [[TMP9:%.*]] = fmul fast float [[TMP6]], [[TMP8]]
663; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.trunc.f32(float [[TMP9]])
664; CHECK-NEXT:    [[TMP11:%.*]] = fneg fast float [[TMP10]]
665; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP11]], float [[TMP7]], float [[TMP6]])
666; CHECK-NEXT:    [[TMP13:%.*]] = fptosi float [[TMP10]] to i32
667; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP12]])
668; CHECK-NEXT:    [[TMP15:%.*]] = call fast float @llvm.fabs.f32(float [[TMP7]])
669; CHECK-NEXT:    [[TMP16:%.*]] = fcmp fast oge float [[TMP14]], [[TMP15]]
670; CHECK-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP5]], i32 0
671; CHECK-NEXT:    [[TMP18:%.*]] = add i32 [[TMP13]], [[TMP17]]
672; CHECK-NEXT:    [[TMP19:%.*]] = mul i32 [[TMP18]], [[TMP2]]
673; CHECK-NEXT:    [[TMP20:%.*]] = sub i32 [[TMP1]], [[TMP19]]
674; CHECK-NEXT:    [[TMP21:%.*]] = shl i32 [[TMP20]], 16
675; CHECK-NEXT:    [[TMP22:%.*]] = ashr i32 [[TMP21]], 16
676; CHECK-NEXT:    [[TMP23:%.*]] = trunc i32 [[TMP22]] to i16
677; CHECK-NEXT:    store i16 [[TMP23]], i16 addrspace(1)* [[OUT:%.*]], align 2
678; CHECK-NEXT:    ret void
679;
680; GFX6-LABEL: srem_i16:
681; GFX6:       ; %bb.0:
682; GFX6-NEXT:    s_load_dword s4, s[0:1], 0xb
683; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
684; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
685; GFX6-NEXT:    s_ashr_i32 s2, s4, 16
686; GFX6-NEXT:    v_cvt_f32_i32_e32 v0, s2
687; GFX6-NEXT:    s_sext_i32_i16 s3, s4
688; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s3
689; GFX6-NEXT:    s_xor_b32 s3, s3, s2
690; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v0
691; GFX6-NEXT:    s_ashr_i32 s3, s3, 30
692; GFX6-NEXT:    s_or_b32 s3, s3, 1
693; GFX6-NEXT:    v_mov_b32_e32 v3, s3
694; GFX6-NEXT:    v_mul_f32_e32 v2, v1, v2
695; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
696; GFX6-NEXT:    v_mad_f32 v1, -v2, v0, v1
697; GFX6-NEXT:    v_cvt_i32_f32_e32 v2, v2
698; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
699; GFX6-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
700; GFX6-NEXT:    s_mov_b32 s3, 0xf000
701; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
702; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s2
703; GFX6-NEXT:    s_mov_b32 s2, -1
704; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
705; GFX6-NEXT:    buffer_store_short v0, off, s[0:3], 0
706; GFX6-NEXT:    s_endpgm
707;
708; GFX9-LABEL: srem_i16:
709; GFX9:       ; %bb.0:
710; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x2c
711; GFX9-NEXT:    s_nop 0
712; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
713; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
714; GFX9-NEXT:    s_ashr_i32 s5, s4, 16
715; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s5
716; GFX9-NEXT:    s_sext_i32_i16 s2, s4
717; GFX9-NEXT:    v_cvt_f32_i32_e32 v1, s2
718; GFX9-NEXT:    s_xor_b32 s2, s2, s5
719; GFX9-NEXT:    v_rcp_iflag_f32_e32 v2, v0
720; GFX9-NEXT:    s_ashr_i32 s2, s2, 30
721; GFX9-NEXT:    s_or_b32 s6, s2, 1
722; GFX9-NEXT:    v_mul_f32_e32 v2, v1, v2
723; GFX9-NEXT:    v_trunc_f32_e32 v2, v2
724; GFX9-NEXT:    v_mad_f32 v1, -v2, v0, v1
725; GFX9-NEXT:    v_cvt_i32_f32_e32 v2, v2
726; GFX9-NEXT:    v_cmp_ge_f32_e64 s[2:3], |v1|, |v0|
727; GFX9-NEXT:    s_cmp_lg_u64 s[2:3], 0
728; GFX9-NEXT:    s_cselect_b32 s2, s6, 0
729; GFX9-NEXT:    v_add_u32_e32 v0, s2, v2
730; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s5
731; GFX9-NEXT:    v_mov_b32_e32 v1, 0
732; GFX9-NEXT:    v_sub_u32_e32 v0, s4, v0
733; GFX9-NEXT:    global_store_short v1, v0, s[0:1]
734; GFX9-NEXT:    s_endpgm
735  %r = srem i16 %x, %y
736  store i16 %r, i16 addrspace(1)* %out
737  ret void
738}
739
; Unsigned 8-bit division of two kernel arguments, stored through %out.
; The opt assertions expect both operands zero-extended to i32 and the
; quotient formed in float (uitofp, rcp, fmul, trunc, fmad.ftz), then bumped
; by 1 or 0 depending on the fabs compare; the result is masked with 255
; before the final trunc to i8.
; The llc assertions (tahiti and gfx900 run lines) expect the operands to be
; converted directly from bytes of the loaded dword via v_cvt_f32_ubyte0 /
; v_cvt_f32_ubyte1, and the correction folded into an add-with-carry.
740define amdgpu_kernel void @udiv_i8(i8 addrspace(1)* %out, i8 %x, i8 %y) {
741; CHECK-LABEL: @udiv_i8(
742; CHECK-NEXT:    [[TMP1:%.*]] = zext i8 [[X:%.*]] to i32
743; CHECK-NEXT:    [[TMP2:%.*]] = zext i8 [[Y:%.*]] to i32
744; CHECK-NEXT:    [[TMP3:%.*]] = uitofp i32 [[TMP1]] to float
745; CHECK-NEXT:    [[TMP4:%.*]] = uitofp i32 [[TMP2]] to float
746; CHECK-NEXT:    [[TMP5:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP4]])
747; CHECK-NEXT:    [[TMP6:%.*]] = fmul fast float [[TMP3]], [[TMP5]]
748; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.trunc.f32(float [[TMP6]])
749; CHECK-NEXT:    [[TMP8:%.*]] = fneg fast float [[TMP7]]
750; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP8]], float [[TMP4]], float [[TMP3]])
751; CHECK-NEXT:    [[TMP10:%.*]] = fptoui float [[TMP7]] to i32
752; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
753; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.fabs.f32(float [[TMP4]])
754; CHECK-NEXT:    [[TMP13:%.*]] = fcmp fast oge float [[TMP11]], [[TMP12]]
755; CHECK-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 1, i32 0
756; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]]
757; CHECK-NEXT:    [[TMP16:%.*]] = and i32 [[TMP15]], 255
758; CHECK-NEXT:    [[TMP17:%.*]] = trunc i32 [[TMP16]] to i8
759; CHECK-NEXT:    store i8 [[TMP17]], i8 addrspace(1)* [[OUT:%.*]], align 1
760; CHECK-NEXT:    ret void
761;
762; GFX6-LABEL: udiv_i8:
763; GFX6:       ; %bb.0:
764; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
765; GFX6-NEXT:    s_load_dword s0, s[0:1], 0xb
766; GFX6-NEXT:    s_mov_b32 s7, 0xf000
767; GFX6-NEXT:    s_mov_b32 s6, -1
768; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
769; GFX6-NEXT:    v_cvt_f32_ubyte1_e32 v0, s0
770; GFX6-NEXT:    v_rcp_iflag_f32_e32 v1, v0
771; GFX6-NEXT:    v_cvt_f32_ubyte0_e32 v2, s0
772; GFX6-NEXT:    v_mul_f32_e32 v1, v2, v1
773; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
774; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v1
775; GFX6-NEXT:    v_mad_f32 v1, -v1, v0, v2
776; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
777; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, 0, v3, vcc
778; GFX6-NEXT:    buffer_store_byte v0, off, s[4:7], 0
779; GFX6-NEXT:    s_endpgm
780;
781; GFX9-LABEL: udiv_i8:
782; GFX9:       ; %bb.0:
783; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x2c
784; GFX9-NEXT:    v_mov_b32_e32 v2, 0
785; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
786; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
787; GFX9-NEXT:    v_cvt_f32_ubyte1_e32 v0, s2
788; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v0
789; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v3, s2
790; GFX9-NEXT:    v_mul_f32_e32 v1, v3, v1
791; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
792; GFX9-NEXT:    v_cvt_u32_f32_e32 v4, v1
793; GFX9-NEXT:    v_mad_f32 v1, -v1, v0, v3
794; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
795; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v4, vcc
796; GFX9-NEXT:    global_store_byte v2, v0, s[0:1]
797; GFX9-NEXT:    s_endpgm
798  %r = udiv i8 %x, %y
799  store i8 %r, i8 addrspace(1)* %out
800  ret void
801}
802
; Unsigned 8-bit remainder of two kernel arguments, stored through %out.
; The opt assertions expect the same float-based quotient sequence as the
; 8-bit unsigned division (zext, uitofp, rcp, fmul, trunc, fmad.ftz, plus a
; 1-or-0 correction from the fabs compare), followed by x - q * y; the result
; is masked with 255 before the final trunc to i8.
; The llc assertions (tahiti and gfx900 run lines) additionally expect an
; s_lshr_b32 by 8 to extract the divisor for the integer multiply.
803define amdgpu_kernel void @urem_i8(i8 addrspace(1)* %out, i8 %x, i8 %y) {
804; CHECK-LABEL: @urem_i8(
805; CHECK-NEXT:    [[TMP1:%.*]] = zext i8 [[X:%.*]] to i32
806; CHECK-NEXT:    [[TMP2:%.*]] = zext i8 [[Y:%.*]] to i32
807; CHECK-NEXT:    [[TMP3:%.*]] = uitofp i32 [[TMP1]] to float
808; CHECK-NEXT:    [[TMP4:%.*]] = uitofp i32 [[TMP2]] to float
809; CHECK-NEXT:    [[TMP5:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP4]])
810; CHECK-NEXT:    [[TMP6:%.*]] = fmul fast float [[TMP3]], [[TMP5]]
811; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.trunc.f32(float [[TMP6]])
812; CHECK-NEXT:    [[TMP8:%.*]] = fneg fast float [[TMP7]]
813; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP8]], float [[TMP4]], float [[TMP3]])
814; CHECK-NEXT:    [[TMP10:%.*]] = fptoui float [[TMP7]] to i32
815; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
816; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.fabs.f32(float [[TMP4]])
817; CHECK-NEXT:    [[TMP13:%.*]] = fcmp fast oge float [[TMP11]], [[TMP12]]
818; CHECK-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 1, i32 0
819; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]]
820; CHECK-NEXT:    [[TMP16:%.*]] = mul i32 [[TMP15]], [[TMP2]]
821; CHECK-NEXT:    [[TMP17:%.*]] = sub i32 [[TMP1]], [[TMP16]]
822; CHECK-NEXT:    [[TMP18:%.*]] = and i32 [[TMP17]], 255
823; CHECK-NEXT:    [[TMP19:%.*]] = trunc i32 [[TMP18]] to i8
824; CHECK-NEXT:    store i8 [[TMP19]], i8 addrspace(1)* [[OUT:%.*]], align 1
825; CHECK-NEXT:    ret void
826;
827; GFX6-LABEL: urem_i8:
828; GFX6:       ; %bb.0:
829; GFX6-NEXT:    s_load_dword s4, s[0:1], 0xb
830; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
831; GFX6-NEXT:    s_mov_b32 s3, 0xf000
832; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
833; GFX6-NEXT:    v_cvt_f32_ubyte1_e32 v0, s4
834; GFX6-NEXT:    v_rcp_iflag_f32_e32 v1, v0
835; GFX6-NEXT:    v_cvt_f32_ubyte0_e32 v2, s4
836; GFX6-NEXT:    s_lshr_b32 s2, s4, 8
837; GFX6-NEXT:    v_mul_f32_e32 v1, v2, v1
838; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
839; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v1
840; GFX6-NEXT:    v_mad_f32 v1, -v1, v0, v2
841; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
842; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, 0, v3, vcc
843; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s2
844; GFX6-NEXT:    s_mov_b32 s2, -1
845; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
846; GFX6-NEXT:    buffer_store_byte v0, off, s[0:3], 0
847; GFX6-NEXT:    s_endpgm
848;
849; GFX9-LABEL: urem_i8:
850; GFX9:       ; %bb.0:
851; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x2c
852; GFX9-NEXT:    s_nop 0
853; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
854; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
855; GFX9-NEXT:    v_cvt_f32_ubyte1_e32 v0, s2
856; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v0
857; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v2, s2
858; GFX9-NEXT:    s_lshr_b32 s3, s2, 8
859; GFX9-NEXT:    v_mul_f32_e32 v1, v2, v1
860; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
861; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v1
862; GFX9-NEXT:    v_mad_f32 v1, -v1, v0, v2
863; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
864; GFX9-NEXT:    v_mov_b32_e32 v1, 0
865; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v3, vcc
866; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s3
867; GFX9-NEXT:    v_sub_u32_e32 v0, s2, v0
868; GFX9-NEXT:    global_store_byte v1, v0, s[0:1]
869; GFX9-NEXT:    s_endpgm
870  %r = urem i8 %x, %y
871  store i8 %r, i8 addrspace(1)* %out
872  ret void
873}
874
; Signed 8-bit division of two kernel arguments, stored through %out.
; The opt assertions expect both operands sign-extended to i32 and the
; quotient formed in float (sitofp, rcp, fmul, trunc, fmad.ftz), then adjusted
; by either 0 or the term `or (ashr (xor x, y), 30), 1` depending on the fabs
; compare. The result is re-sign-extended from 8 bits by the shl/ashr pair
; (shift amount 24) before the final trunc to i8.
; The llc assertions pin the instruction sequences for the tahiti and gfx900
; run lines at the top of the file.
875define amdgpu_kernel void @sdiv_i8(i8 addrspace(1)* %out, i8 %x, i8 %y) {
876; CHECK-LABEL: @sdiv_i8(
877; CHECK-NEXT:    [[TMP1:%.*]] = sext i8 [[X:%.*]] to i32
878; CHECK-NEXT:    [[TMP2:%.*]] = sext i8 [[Y:%.*]] to i32
879; CHECK-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
880; CHECK-NEXT:    [[TMP4:%.*]] = ashr i32 [[TMP3]], 30
881; CHECK-NEXT:    [[TMP5:%.*]] = or i32 [[TMP4]], 1
882; CHECK-NEXT:    [[TMP6:%.*]] = sitofp i32 [[TMP1]] to float
883; CHECK-NEXT:    [[TMP7:%.*]] = sitofp i32 [[TMP2]] to float
884; CHECK-NEXT:    [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]])
885; CHECK-NEXT:    [[TMP9:%.*]] = fmul fast float [[TMP6]], [[TMP8]]
886; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.trunc.f32(float [[TMP9]])
887; CHECK-NEXT:    [[TMP11:%.*]] = fneg fast float [[TMP10]]
888; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP11]], float [[TMP7]], float [[TMP6]])
889; CHECK-NEXT:    [[TMP13:%.*]] = fptosi float [[TMP10]] to i32
890; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP12]])
891; CHECK-NEXT:    [[TMP15:%.*]] = call fast float @llvm.fabs.f32(float [[TMP7]])
892; CHECK-NEXT:    [[TMP16:%.*]] = fcmp fast oge float [[TMP14]], [[TMP15]]
893; CHECK-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP5]], i32 0
894; CHECK-NEXT:    [[TMP18:%.*]] = add i32 [[TMP13]], [[TMP17]]
895; CHECK-NEXT:    [[TMP19:%.*]] = shl i32 [[TMP18]], 24
896; CHECK-NEXT:    [[TMP20:%.*]] = ashr i32 [[TMP19]], 24
897; CHECK-NEXT:    [[TMP21:%.*]] = trunc i32 [[TMP20]] to i8
898; CHECK-NEXT:    store i8 [[TMP21]], i8 addrspace(1)* [[OUT:%.*]], align 1
899; CHECK-NEXT:    ret void
900;
901; GFX6-LABEL: sdiv_i8:
902; GFX6:       ; %bb.0:
903; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
904; GFX6-NEXT:    s_load_dword s0, s[0:1], 0xb
905; GFX6-NEXT:    s_mov_b32 s7, 0xf000
906; GFX6-NEXT:    s_mov_b32 s6, -1
907; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
908; GFX6-NEXT:    s_bfe_i32 s1, s0, 0x80008
909; GFX6-NEXT:    v_cvt_f32_i32_e32 v0, s1
910; GFX6-NEXT:    s_sext_i32_i8 s0, s0
911; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s0
912; GFX6-NEXT:    s_xor_b32 s0, s0, s1
913; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v0
914; GFX6-NEXT:    s_ashr_i32 s0, s0, 30
915; GFX6-NEXT:    s_or_b32 s0, s0, 1
916; GFX6-NEXT:    v_mov_b32_e32 v3, s0
917; GFX6-NEXT:    v_mul_f32_e32 v2, v1, v2
918; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
919; GFX6-NEXT:    v_mad_f32 v1, -v2, v0, v1
920; GFX6-NEXT:    v_cvt_i32_f32_e32 v2, v2
921; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
922; GFX6-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
923; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
924; GFX6-NEXT:    buffer_store_byte v0, off, s[4:7], 0
925; GFX6-NEXT:    s_endpgm
926;
927; GFX9-LABEL: sdiv_i8:
928; GFX9:       ; %bb.0:
929; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
930; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x2c
931; GFX9-NEXT:    v_mov_b32_e32 v1, 0
932; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
933; GFX9-NEXT:    s_bfe_i32 s0, s4, 0x80008
934; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s0
935; GFX9-NEXT:    s_sext_i32_i8 s1, s4
936; GFX9-NEXT:    v_cvt_f32_i32_e32 v2, s1
937; GFX9-NEXT:    s_xor_b32 s0, s1, s0
938; GFX9-NEXT:    v_rcp_iflag_f32_e32 v3, v0
939; GFX9-NEXT:    s_ashr_i32 s0, s0, 30
940; GFX9-NEXT:    s_or_b32 s4, s0, 1
941; GFX9-NEXT:    v_mul_f32_e32 v3, v2, v3
942; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
943; GFX9-NEXT:    v_mad_f32 v2, -v3, v0, v2
944; GFX9-NEXT:    v_cvt_i32_f32_e32 v3, v3
945; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v2|, |v0|
946; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
947; GFX9-NEXT:    s_cselect_b32 s0, s4, 0
948; GFX9-NEXT:    v_add_u32_e32 v0, s0, v3
949; GFX9-NEXT:    global_store_byte v1, v0, s[2:3]
950; GFX9-NEXT:    s_endpgm
951  %r = sdiv i8 %x, %y
952  store i8 %r, i8 addrspace(1)* %out
953  ret void
954}
955
; Signed 8-bit remainder of two kernel arguments, stored through %out.
; The opt assertions expect the same float-based quotient sequence as the
; 8-bit signed division (sext, sitofp, rcp, fmul, trunc, fmad.ftz, plus a
; correction of 0 or `or (ashr (xor x, y), 30), 1` from the fabs compare),
; followed by x - q * y. The result is re-sign-extended from 8 bits by the
; shl/ashr pair (shift amount 24) before the final trunc to i8.
; The llc assertions pin the instruction sequences for the tahiti and gfx900
; run lines at the top of the file.
956define amdgpu_kernel void @srem_i8(i8 addrspace(1)* %out, i8 %x, i8 %y) {
957; CHECK-LABEL: @srem_i8(
958; CHECK-NEXT:    [[TMP1:%.*]] = sext i8 [[X:%.*]] to i32
959; CHECK-NEXT:    [[TMP2:%.*]] = sext i8 [[Y:%.*]] to i32
960; CHECK-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
961; CHECK-NEXT:    [[TMP4:%.*]] = ashr i32 [[TMP3]], 30
962; CHECK-NEXT:    [[TMP5:%.*]] = or i32 [[TMP4]], 1
963; CHECK-NEXT:    [[TMP6:%.*]] = sitofp i32 [[TMP1]] to float
964; CHECK-NEXT:    [[TMP7:%.*]] = sitofp i32 [[TMP2]] to float
965; CHECK-NEXT:    [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]])
966; CHECK-NEXT:    [[TMP9:%.*]] = fmul fast float [[TMP6]], [[TMP8]]
967; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.trunc.f32(float [[TMP9]])
968; CHECK-NEXT:    [[TMP11:%.*]] = fneg fast float [[TMP10]]
969; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP11]], float [[TMP7]], float [[TMP6]])
970; CHECK-NEXT:    [[TMP13:%.*]] = fptosi float [[TMP10]] to i32
971; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP12]])
972; CHECK-NEXT:    [[TMP15:%.*]] = call fast float @llvm.fabs.f32(float [[TMP7]])
973; CHECK-NEXT:    [[TMP16:%.*]] = fcmp fast oge float [[TMP14]], [[TMP15]]
974; CHECK-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP5]], i32 0
975; CHECK-NEXT:    [[TMP18:%.*]] = add i32 [[TMP13]], [[TMP17]]
976; CHECK-NEXT:    [[TMP19:%.*]] = mul i32 [[TMP18]], [[TMP2]]
977; CHECK-NEXT:    [[TMP20:%.*]] = sub i32 [[TMP1]], [[TMP19]]
978; CHECK-NEXT:    [[TMP21:%.*]] = shl i32 [[TMP20]], 24
979; CHECK-NEXT:    [[TMP22:%.*]] = ashr i32 [[TMP21]], 24
980; CHECK-NEXT:    [[TMP23:%.*]] = trunc i32 [[TMP22]] to i8
981; CHECK-NEXT:    store i8 [[TMP23]], i8 addrspace(1)* [[OUT:%.*]], align 1
982; CHECK-NEXT:    ret void
983;
984; GFX6-LABEL: srem_i8:
985; GFX6:       ; %bb.0:
986; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
987; GFX6-NEXT:    s_load_dword s0, s[0:1], 0xb
988; GFX6-NEXT:    s_mov_b32 s7, 0xf000
989; GFX6-NEXT:    s_mov_b32 s6, -1
990; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
991; GFX6-NEXT:    s_bfe_i32 s1, s0, 0x80008
992; GFX6-NEXT:    v_cvt_f32_i32_e32 v0, s1
993; GFX6-NEXT:    s_sext_i32_i8 s3, s0
994; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s3
995; GFX6-NEXT:    s_xor_b32 s1, s3, s1
996; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v0
997; GFX6-NEXT:    s_ashr_i32 s1, s1, 30
998; GFX6-NEXT:    s_or_b32 s1, s1, 1
999; GFX6-NEXT:    v_mov_b32_e32 v3, s1
1000; GFX6-NEXT:    v_mul_f32_e32 v2, v1, v2
1001; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
1002; GFX6-NEXT:    v_mad_f32 v1, -v2, v0, v1
1003; GFX6-NEXT:    v_cvt_i32_f32_e32 v2, v2
1004; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
1005; GFX6-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
1006; GFX6-NEXT:    s_lshr_b32 s2, s0, 8
1007; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
1008; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s2
1009; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
1010; GFX6-NEXT:    buffer_store_byte v0, off, s[4:7], 0
1011; GFX6-NEXT:    s_endpgm
1012;
1013; GFX9-LABEL: srem_i8:
1014; GFX9:       ; %bb.0:
1015; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x2c
1016; GFX9-NEXT:    s_nop 0
1017; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1018; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1019; GFX9-NEXT:    s_bfe_i32 s2, s4, 0x80008
1020; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s2
1021; GFX9-NEXT:    s_sext_i32_i8 s3, s4
1022; GFX9-NEXT:    v_cvt_f32_i32_e32 v1, s3
1023; GFX9-NEXT:    s_xor_b32 s2, s3, s2
1024; GFX9-NEXT:    v_rcp_iflag_f32_e32 v2, v0
1025; GFX9-NEXT:    s_ashr_i32 s2, s2, 30
1026; GFX9-NEXT:    s_lshr_b32 s5, s4, 8
1027; GFX9-NEXT:    s_or_b32 s6, s2, 1
1028; GFX9-NEXT:    v_mul_f32_e32 v2, v1, v2
1029; GFX9-NEXT:    v_trunc_f32_e32 v2, v2
1030; GFX9-NEXT:    v_mad_f32 v1, -v2, v0, v1
1031; GFX9-NEXT:    v_cvt_i32_f32_e32 v2, v2
1032; GFX9-NEXT:    v_cmp_ge_f32_e64 s[2:3], |v1|, |v0|
1033; GFX9-NEXT:    s_cmp_lg_u64 s[2:3], 0
1034; GFX9-NEXT:    s_cselect_b32 s2, s6, 0
1035; GFX9-NEXT:    v_add_u32_e32 v0, s2, v2
1036; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s5
1037; GFX9-NEXT:    v_mov_b32_e32 v1, 0
1038; GFX9-NEXT:    v_sub_u32_e32 v0, s4, v0
1039; GFX9-NEXT:    global_store_byte v1, v0, s[0:1]
1040; GFX9-NEXT:    s_endpgm
1041  %r = srem i8 %x, %y
1042  store i8 %r, i8 addrspace(1)* %out
1043  ret void
1044}
1045
1046define amdgpu_kernel void @udiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x, <4 x i32> %y) {
1047; CHECK-LABEL: @udiv_v4i32(
1048; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i32> [[X:%.*]], i64 0
1049; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i32> [[Y:%.*]], i64 0
1050; CHECK-NEXT:    [[TMP3:%.*]] = uitofp i32 [[TMP2]] to float
1051; CHECK-NEXT:    [[TMP4:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP3]])
1052; CHECK-NEXT:    [[TMP5:%.*]] = fmul fast float [[TMP4]], 0x41EFFFFFC0000000
1053; CHECK-NEXT:    [[TMP6:%.*]] = fptoui float [[TMP5]] to i32
1054; CHECK-NEXT:    [[TMP7:%.*]] = sub i32 0, [[TMP2]]
1055; CHECK-NEXT:    [[TMP8:%.*]] = mul i32 [[TMP7]], [[TMP6]]
1056; CHECK-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP6]] to i64
1057; CHECK-NEXT:    [[TMP10:%.*]] = zext i32 [[TMP8]] to i64
1058; CHECK-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP9]], [[TMP10]]
1059; CHECK-NEXT:    [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32
1060; CHECK-NEXT:    [[TMP13:%.*]] = lshr i64 [[TMP11]], 32
1061; CHECK-NEXT:    [[TMP14:%.*]] = trunc i64 [[TMP13]] to i32
1062; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[TMP6]], [[TMP14]]
1063; CHECK-NEXT:    [[TMP16:%.*]] = zext i32 [[TMP1]] to i64
1064; CHECK-NEXT:    [[TMP17:%.*]] = zext i32 [[TMP15]] to i64
1065; CHECK-NEXT:    [[TMP18:%.*]] = mul i64 [[TMP16]], [[TMP17]]
1066; CHECK-NEXT:    [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
1067; CHECK-NEXT:    [[TMP20:%.*]] = lshr i64 [[TMP18]], 32
1068; CHECK-NEXT:    [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32
1069; CHECK-NEXT:    [[TMP22:%.*]] = mul i32 [[TMP21]], [[TMP2]]
1070; CHECK-NEXT:    [[TMP23:%.*]] = sub i32 [[TMP1]], [[TMP22]]
1071; CHECK-NEXT:    [[TMP24:%.*]] = icmp uge i32 [[TMP23]], [[TMP2]]
1072; CHECK-NEXT:    [[TMP25:%.*]] = add i32 [[TMP21]], 1
1073; CHECK-NEXT:    [[TMP26:%.*]] = select i1 [[TMP24]], i32 [[TMP25]], i32 [[TMP21]]
1074; CHECK-NEXT:    [[TMP27:%.*]] = sub i32 [[TMP23]], [[TMP2]]
1075; CHECK-NEXT:    [[TMP28:%.*]] = select i1 [[TMP24]], i32 [[TMP27]], i32 [[TMP23]]
1076; CHECK-NEXT:    [[TMP29:%.*]] = icmp uge i32 [[TMP28]], [[TMP2]]
1077; CHECK-NEXT:    [[TMP30:%.*]] = add i32 [[TMP26]], 1
1078; CHECK-NEXT:    [[TMP31:%.*]] = select i1 [[TMP29]], i32 [[TMP30]], i32 [[TMP26]]
1079; CHECK-NEXT:    [[TMP32:%.*]] = insertelement <4 x i32> undef, i32 [[TMP31]], i64 0
1080; CHECK-NEXT:    [[TMP33:%.*]] = extractelement <4 x i32> [[X]], i64 1
1081; CHECK-NEXT:    [[TMP34:%.*]] = extractelement <4 x i32> [[Y]], i64 1
1082; CHECK-NEXT:    [[TMP35:%.*]] = uitofp i32 [[TMP34]] to float
1083; CHECK-NEXT:    [[TMP36:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP35]])
1084; CHECK-NEXT:    [[TMP37:%.*]] = fmul fast float [[TMP36]], 0x41EFFFFFC0000000
1085; CHECK-NEXT:    [[TMP38:%.*]] = fptoui float [[TMP37]] to i32
1086; CHECK-NEXT:    [[TMP39:%.*]] = sub i32 0, [[TMP34]]
1087; CHECK-NEXT:    [[TMP40:%.*]] = mul i32 [[TMP39]], [[TMP38]]
1088; CHECK-NEXT:    [[TMP41:%.*]] = zext i32 [[TMP38]] to i64
1089; CHECK-NEXT:    [[TMP42:%.*]] = zext i32 [[TMP40]] to i64
1090; CHECK-NEXT:    [[TMP43:%.*]] = mul i64 [[TMP41]], [[TMP42]]
1091; CHECK-NEXT:    [[TMP44:%.*]] = trunc i64 [[TMP43]] to i32
1092; CHECK-NEXT:    [[TMP45:%.*]] = lshr i64 [[TMP43]], 32
1093; CHECK-NEXT:    [[TMP46:%.*]] = trunc i64 [[TMP45]] to i32
1094; CHECK-NEXT:    [[TMP47:%.*]] = add i32 [[TMP38]], [[TMP46]]
1095; CHECK-NEXT:    [[TMP48:%.*]] = zext i32 [[TMP33]] to i64
1096; CHECK-NEXT:    [[TMP49:%.*]] = zext i32 [[TMP47]] to i64
1097; CHECK-NEXT:    [[TMP50:%.*]] = mul i64 [[TMP48]], [[TMP49]]
1098; CHECK-NEXT:    [[TMP51:%.*]] = trunc i64 [[TMP50]] to i32
1099; CHECK-NEXT:    [[TMP52:%.*]] = lshr i64 [[TMP50]], 32
1100; CHECK-NEXT:    [[TMP53:%.*]] = trunc i64 [[TMP52]] to i32
1101; CHECK-NEXT:    [[TMP54:%.*]] = mul i32 [[TMP53]], [[TMP34]]
1102; CHECK-NEXT:    [[TMP55:%.*]] = sub i32 [[TMP33]], [[TMP54]]
1103; CHECK-NEXT:    [[TMP56:%.*]] = icmp uge i32 [[TMP55]], [[TMP34]]
1104; CHECK-NEXT:    [[TMP57:%.*]] = add i32 [[TMP53]], 1
1105; CHECK-NEXT:    [[TMP58:%.*]] = select i1 [[TMP56]], i32 [[TMP57]], i32 [[TMP53]]
1106; CHECK-NEXT:    [[TMP59:%.*]] = sub i32 [[TMP55]], [[TMP34]]
1107; CHECK-NEXT:    [[TMP60:%.*]] = select i1 [[TMP56]], i32 [[TMP59]], i32 [[TMP55]]
1108; CHECK-NEXT:    [[TMP61:%.*]] = icmp uge i32 [[TMP60]], [[TMP34]]
1109; CHECK-NEXT:    [[TMP62:%.*]] = add i32 [[TMP58]], 1
1110; CHECK-NEXT:    [[TMP63:%.*]] = select i1 [[TMP61]], i32 [[TMP62]], i32 [[TMP58]]
1111; CHECK-NEXT:    [[TMP64:%.*]] = insertelement <4 x i32> [[TMP32]], i32 [[TMP63]], i64 1
1112; CHECK-NEXT:    [[TMP65:%.*]] = extractelement <4 x i32> [[X]], i64 2
1113; CHECK-NEXT:    [[TMP66:%.*]] = extractelement <4 x i32> [[Y]], i64 2
1114; CHECK-NEXT:    [[TMP67:%.*]] = uitofp i32 [[TMP66]] to float
1115; CHECK-NEXT:    [[TMP68:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP67]])
1116; CHECK-NEXT:    [[TMP69:%.*]] = fmul fast float [[TMP68]], 0x41EFFFFFC0000000
1117; CHECK-NEXT:    [[TMP70:%.*]] = fptoui float [[TMP69]] to i32
1118; CHECK-NEXT:    [[TMP71:%.*]] = sub i32 0, [[TMP66]]
1119; CHECK-NEXT:    [[TMP72:%.*]] = mul i32 [[TMP71]], [[TMP70]]
1120; CHECK-NEXT:    [[TMP73:%.*]] = zext i32 [[TMP70]] to i64
1121; CHECK-NEXT:    [[TMP74:%.*]] = zext i32 [[TMP72]] to i64
1122; CHECK-NEXT:    [[TMP75:%.*]] = mul i64 [[TMP73]], [[TMP74]]
1123; CHECK-NEXT:    [[TMP76:%.*]] = trunc i64 [[TMP75]] to i32
1124; CHECK-NEXT:    [[TMP77:%.*]] = lshr i64 [[TMP75]], 32
1125; CHECK-NEXT:    [[TMP78:%.*]] = trunc i64 [[TMP77]] to i32
1126; CHECK-NEXT:    [[TMP79:%.*]] = add i32 [[TMP70]], [[TMP78]]
1127; CHECK-NEXT:    [[TMP80:%.*]] = zext i32 [[TMP65]] to i64
1128; CHECK-NEXT:    [[TMP81:%.*]] = zext i32 [[TMP79]] to i64
1129; CHECK-NEXT:    [[TMP82:%.*]] = mul i64 [[TMP80]], [[TMP81]]
1130; CHECK-NEXT:    [[TMP83:%.*]] = trunc i64 [[TMP82]] to i32
1131; CHECK-NEXT:    [[TMP84:%.*]] = lshr i64 [[TMP82]], 32
1132; CHECK-NEXT:    [[TMP85:%.*]] = trunc i64 [[TMP84]] to i32
1133; CHECK-NEXT:    [[TMP86:%.*]] = mul i32 [[TMP85]], [[TMP66]]
1134; CHECK-NEXT:    [[TMP87:%.*]] = sub i32 [[TMP65]], [[TMP86]]
1135; CHECK-NEXT:    [[TMP88:%.*]] = icmp uge i32 [[TMP87]], [[TMP66]]
1136; CHECK-NEXT:    [[TMP89:%.*]] = add i32 [[TMP85]], 1
1137; CHECK-NEXT:    [[TMP90:%.*]] = select i1 [[TMP88]], i32 [[TMP89]], i32 [[TMP85]]
1138; CHECK-NEXT:    [[TMP91:%.*]] = sub i32 [[TMP87]], [[TMP66]]
1139; CHECK-NEXT:    [[TMP92:%.*]] = select i1 [[TMP88]], i32 [[TMP91]], i32 [[TMP87]]
1140; CHECK-NEXT:    [[TMP93:%.*]] = icmp uge i32 [[TMP92]], [[TMP66]]
1141; CHECK-NEXT:    [[TMP94:%.*]] = add i32 [[TMP90]], 1
1142; CHECK-NEXT:    [[TMP95:%.*]] = select i1 [[TMP93]], i32 [[TMP94]], i32 [[TMP90]]
1143; CHECK-NEXT:    [[TMP96:%.*]] = insertelement <4 x i32> [[TMP64]], i32 [[TMP95]], i64 2
1144; CHECK-NEXT:    [[TMP97:%.*]] = extractelement <4 x i32> [[X]], i64 3
1145; CHECK-NEXT:    [[TMP98:%.*]] = extractelement <4 x i32> [[Y]], i64 3
1146; CHECK-NEXT:    [[TMP99:%.*]] = uitofp i32 [[TMP98]] to float
1147; CHECK-NEXT:    [[TMP100:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP99]])
1148; CHECK-NEXT:    [[TMP101:%.*]] = fmul fast float [[TMP100]], 0x41EFFFFFC0000000
1149; CHECK-NEXT:    [[TMP102:%.*]] = fptoui float [[TMP101]] to i32
1150; CHECK-NEXT:    [[TMP103:%.*]] = sub i32 0, [[TMP98]]
1151; CHECK-NEXT:    [[TMP104:%.*]] = mul i32 [[TMP103]], [[TMP102]]
1152; CHECK-NEXT:    [[TMP105:%.*]] = zext i32 [[TMP102]] to i64
1153; CHECK-NEXT:    [[TMP106:%.*]] = zext i32 [[TMP104]] to i64
1154; CHECK-NEXT:    [[TMP107:%.*]] = mul i64 [[TMP105]], [[TMP106]]
1155; CHECK-NEXT:    [[TMP108:%.*]] = trunc i64 [[TMP107]] to i32
1156; CHECK-NEXT:    [[TMP109:%.*]] = lshr i64 [[TMP107]], 32
1157; CHECK-NEXT:    [[TMP110:%.*]] = trunc i64 [[TMP109]] to i32
1158; CHECK-NEXT:    [[TMP111:%.*]] = add i32 [[TMP102]], [[TMP110]]
1159; CHECK-NEXT:    [[TMP112:%.*]] = zext i32 [[TMP97]] to i64
1160; CHECK-NEXT:    [[TMP113:%.*]] = zext i32 [[TMP111]] to i64
1161; CHECK-NEXT:    [[TMP114:%.*]] = mul i64 [[TMP112]], [[TMP113]]
1162; CHECK-NEXT:    [[TMP115:%.*]] = trunc i64 [[TMP114]] to i32
1163; CHECK-NEXT:    [[TMP116:%.*]] = lshr i64 [[TMP114]], 32
1164; CHECK-NEXT:    [[TMP117:%.*]] = trunc i64 [[TMP116]] to i32
1165; CHECK-NEXT:    [[TMP118:%.*]] = mul i32 [[TMP117]], [[TMP98]]
1166; CHECK-NEXT:    [[TMP119:%.*]] = sub i32 [[TMP97]], [[TMP118]]
1167; CHECK-NEXT:    [[TMP120:%.*]] = icmp uge i32 [[TMP119]], [[TMP98]]
1168; CHECK-NEXT:    [[TMP121:%.*]] = add i32 [[TMP117]], 1
1169; CHECK-NEXT:    [[TMP122:%.*]] = select i1 [[TMP120]], i32 [[TMP121]], i32 [[TMP117]]
1170; CHECK-NEXT:    [[TMP123:%.*]] = sub i32 [[TMP119]], [[TMP98]]
1171; CHECK-NEXT:    [[TMP124:%.*]] = select i1 [[TMP120]], i32 [[TMP123]], i32 [[TMP119]]
1172; CHECK-NEXT:    [[TMP125:%.*]] = icmp uge i32 [[TMP124]], [[TMP98]]
1173; CHECK-NEXT:    [[TMP126:%.*]] = add i32 [[TMP122]], 1
1174; CHECK-NEXT:    [[TMP127:%.*]] = select i1 [[TMP125]], i32 [[TMP126]], i32 [[TMP122]]
1175; CHECK-NEXT:    [[TMP128:%.*]] = insertelement <4 x i32> [[TMP96]], i32 [[TMP127]], i64 3
1176; CHECK-NEXT:    store <4 x i32> [[TMP128]], <4 x i32> addrspace(1)* [[OUT:%.*]], align 16
1177; CHECK-NEXT:    ret void
1178;
1179; GFX6-LABEL: udiv_v4i32:
1180; GFX6:       ; %bb.0:
1181; GFX6-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0xd
1182; GFX6-NEXT:    s_mov_b32 s3, 0x4f7ffffe
1183; GFX6-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x9
1184; GFX6-NEXT:    s_mov_b32 s15, 0xf000
1185; GFX6-NEXT:    s_mov_b32 s14, -1
1186; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1187; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s8
1188; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s9
1189; GFX6-NEXT:    s_sub_i32 s2, 0, s8
1190; GFX6-NEXT:    v_cvt_f32_u32_e32 v4, s10
1191; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
1192; GFX6-NEXT:    v_rcp_iflag_f32_e32 v1, v1
1193; GFX6-NEXT:    v_cvt_f32_u32_e32 v6, s11
1194; GFX6-NEXT:    v_mul_f32_e32 v0, s3, v0
1195; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
1196; GFX6-NEXT:    v_mul_f32_e32 v1, s3, v1
1197; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
1198; GFX6-NEXT:    v_mul_lo_u32 v2, s2, v0
1199; GFX6-NEXT:    s_sub_i32 s2, 0, s9
1200; GFX6-NEXT:    v_mul_lo_u32 v3, s2, v1
1201; GFX6-NEXT:    s_sub_i32 s2, 0, s10
1202; GFX6-NEXT:    v_mul_hi_u32 v2, v0, v2
1203; GFX6-NEXT:    v_mul_hi_u32 v3, v1, v3
1204; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
1205; GFX6-NEXT:    v_mul_hi_u32 v0, s4, v0
1206; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v3, v1
1207; GFX6-NEXT:    v_mul_hi_u32 v1, s5, v1
1208; GFX6-NEXT:    v_mul_lo_u32 v2, v0, s8
1209; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 1, v0
1210; GFX6-NEXT:    v_mul_lo_u32 v5, v1, s9
1211; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s4, v2
1212; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s8, v2
1213; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v3, s[0:1]
1214; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s8, v2
1215; GFX6-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
1216; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 1, v0
1217; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s8, v2
1218; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v4
1219; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
1220; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s5, v5
1221; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 1, v1
1222; GFX6-NEXT:    v_mul_f32_e32 v2, s3, v2
1223; GFX6-NEXT:    v_cvt_u32_f32_e32 v2, v2
1224; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s9, v3
1225; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, v4, s[0:1]
1226; GFX6-NEXT:    v_subrev_i32_e32 v5, vcc, s9, v3
1227; GFX6-NEXT:    v_mul_lo_u32 v4, s2, v2
1228; GFX6-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s[0:1]
1229; GFX6-NEXT:    v_add_i32_e32 v5, vcc, 1, v1
1230; GFX6-NEXT:    s_sub_i32 s0, 0, s11
1231; GFX6-NEXT:    v_mul_hi_u32 v4, v2, v4
1232; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
1233; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v6
1234; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s9, v3
1235; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
1236; GFX6-NEXT:    v_mul_hi_u32 v2, s6, v2
1237; GFX6-NEXT:    v_mul_f32_e32 v4, s3, v4
1238; GFX6-NEXT:    v_cvt_u32_f32_e32 v4, v4
1239; GFX6-NEXT:    v_mul_lo_u32 v3, v2, s10
1240; GFX6-NEXT:    v_add_i32_e32 v6, vcc, 1, v2
1241; GFX6-NEXT:    v_mul_lo_u32 v5, s0, v4
1242; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s6, v3
1243; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s10, v3
1244; GFX6-NEXT:    v_mul_hi_u32 v5, v4, v5
1245; GFX6-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[0:1]
1246; GFX6-NEXT:    v_subrev_i32_e32 v6, vcc, s10, v3
1247; GFX6-NEXT:    v_cndmask_b32_e64 v3, v3, v6, s[0:1]
1248; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
1249; GFX6-NEXT:    v_mul_hi_u32 v4, s7, v4
1250; GFX6-NEXT:    v_add_i32_e32 v5, vcc, 1, v2
1251; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s10, v3
1252; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
1253; GFX6-NEXT:    v_mul_lo_u32 v6, v4, s11
1254; GFX6-NEXT:    v_add_i32_e32 v5, vcc, 1, v4
1255; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s7, v6
1256; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s11, v3
1257; GFX6-NEXT:    v_cndmask_b32_e64 v4, v4, v5, s[0:1]
1258; GFX6-NEXT:    v_subrev_i32_e32 v5, vcc, s11, v3
1259; GFX6-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s[0:1]
1260; GFX6-NEXT:    v_add_i32_e32 v5, vcc, 1, v4
1261; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s11, v3
1262; GFX6-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
1263; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[12:15], 0
1264; GFX6-NEXT:    s_endpgm
1265;
1266; GFX9-LABEL: udiv_v4i32:
1267; GFX9:       ; %bb.0:
1268; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x34
1269; GFX9-NEXT:    s_mov_b32 s12, 0x4f7ffffe
1270; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1271; GFX9-NEXT:    v_mov_b32_e32 v4, 0
1272; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1273; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s8
1274; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s9
1275; GFX9-NEXT:    s_sub_i32 s2, 0, s8
1276; GFX9-NEXT:    s_sub_i32 s3, 0, s9
1277; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
1278; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v1
1279; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, s10
1280; GFX9-NEXT:    v_mul_f32_e32 v0, s12, v0
1281; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
1282; GFX9-NEXT:    v_mul_f32_e32 v1, s12, v1
1283; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
1284; GFX9-NEXT:    v_rcp_iflag_f32_e32 v5, v5
1285; GFX9-NEXT:    v_mul_lo_u32 v2, s2, v0
1286; GFX9-NEXT:    s_sub_i32 s2, 0, s10
1287; GFX9-NEXT:    v_mul_lo_u32 v3, s3, v1
1288; GFX9-NEXT:    v_mul_hi_u32 v2, v0, v2
1289; GFX9-NEXT:    v_mul_hi_u32 v3, v1, v3
1290; GFX9-NEXT:    v_add_u32_e32 v0, v0, v2
1291; GFX9-NEXT:    v_mul_hi_u32 v0, s4, v0
1292; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
1293; GFX9-NEXT:    v_mul_f32_e32 v3, s12, v5
1294; GFX9-NEXT:    v_mul_hi_u32 v1, s5, v1
1295; GFX9-NEXT:    v_mul_lo_u32 v5, v0, s8
1296; GFX9-NEXT:    v_cvt_f32_u32_e32 v2, s11
1297; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v3
1298; GFX9-NEXT:    v_add_u32_e32 v7, 1, v0
1299; GFX9-NEXT:    v_sub_u32_e32 v5, s4, v5
1300; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s8, v5
1301; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v7, vcc
1302; GFX9-NEXT:    v_subrev_u32_e32 v7, s8, v5
1303; GFX9-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc
1304; GFX9-NEXT:    v_mul_lo_u32 v6, v1, s9
1305; GFX9-NEXT:    v_rcp_iflag_f32_e32 v2, v2
1306; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s8, v5
1307; GFX9-NEXT:    v_add_u32_e32 v7, 1, v0
1308; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v7, vcc
1309; GFX9-NEXT:    v_mul_lo_u32 v7, s2, v3
1310; GFX9-NEXT:    v_sub_u32_e32 v6, s5, v6
1311; GFX9-NEXT:    v_mul_f32_e32 v2, s12, v2
1312; GFX9-NEXT:    v_add_u32_e32 v5, 1, v1
1313; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s9, v6
1314; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
1315; GFX9-NEXT:    v_mul_hi_u32 v5, v3, v7
1316; GFX9-NEXT:    v_cvt_u32_f32_e32 v2, v2
1317; GFX9-NEXT:    s_sub_i32 s2, 0, s11
1318; GFX9-NEXT:    v_subrev_u32_e32 v7, s9, v6
1319; GFX9-NEXT:    v_add_u32_e32 v3, v3, v5
1320; GFX9-NEXT:    v_mul_lo_u32 v5, s2, v2
1321; GFX9-NEXT:    v_mul_hi_u32 v3, s6, v3
1322; GFX9-NEXT:    v_cndmask_b32_e32 v6, v6, v7, vcc
1323; GFX9-NEXT:    v_add_u32_e32 v7, 1, v1
1324; GFX9-NEXT:    v_mul_hi_u32 v5, v2, v5
1325; GFX9-NEXT:    v_mul_lo_u32 v8, v3, s10
1326; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s9, v6
1327; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v7, vcc
1328; GFX9-NEXT:    v_add_u32_e32 v2, v2, v5
1329; GFX9-NEXT:    v_mul_hi_u32 v5, s7, v2
1330; GFX9-NEXT:    v_sub_u32_e32 v6, s6, v8
1331; GFX9-NEXT:    v_add_u32_e32 v7, 1, v3
1332; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s10, v6
1333; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v7, vcc
1334; GFX9-NEXT:    v_subrev_u32_e32 v3, s10, v6
1335; GFX9-NEXT:    v_cndmask_b32_e32 v3, v6, v3, vcc
1336; GFX9-NEXT:    v_mul_lo_u32 v6, v5, s11
1337; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s10, v3
1338; GFX9-NEXT:    v_add_u32_e32 v7, 1, v2
1339; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v7, vcc
1340; GFX9-NEXT:    v_sub_u32_e32 v3, s7, v6
1341; GFX9-NEXT:    v_add_u32_e32 v6, 1, v5
1342; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s11, v3
1343; GFX9-NEXT:    v_cndmask_b32_e32 v5, v5, v6, vcc
1344; GFX9-NEXT:    v_subrev_u32_e32 v6, s11, v3
1345; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
1346; GFX9-NEXT:    v_add_u32_e32 v6, 1, v5
1347; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s11, v3
1348; GFX9-NEXT:    v_cndmask_b32_e32 v3, v5, v6, vcc
1349; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
1350; GFX9-NEXT:    s_endpgm
1351  %r = udiv <4 x i32> %x, %y
1352  store <4 x i32> %r, <4 x i32> addrspace(1)* %out
1353  ret void
1354}
1355
; urem_v4i32: kernel that computes `urem <4 x i32> %x, %y` and stores the
; result to %out.
;
; The opt check lines expect -amdgpu-codegenprepare to scalarize the vector
; urem: each of the four lanes is extracted, expanded inline, and reinserted.
; Per lane the expected expansion is: convert the divisor to float, take
; llvm.amdgcn.rcp.f32, scale by a float constant just below 2^32
; (0x41EFFFFFC0000000, which appears as 0x4f7ffffe in the llc output), refine
; the estimate with one mul-hi step, form the remainder x - q*y, and then apply
; two conditional subtractions of the divisor.
;
; The llc check lines (GFX6 prefix for tahiti, GFX9 prefix for gfx900) pin the
; final instruction sequence for the same expansion.
;
; All assertions below are autogenerated (see the NOTE lines at the top of the
; file); regenerate them with the update scripts instead of editing by hand.
1356define amdgpu_kernel void @urem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x, <4 x i32> %y) {
1357; CHECK-LABEL: @urem_v4i32(
1358; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i32> [[X:%.*]], i64 0
1359; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i32> [[Y:%.*]], i64 0
1360; CHECK-NEXT:    [[TMP3:%.*]] = uitofp i32 [[TMP2]] to float
1361; CHECK-NEXT:    [[TMP4:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP3]])
1362; CHECK-NEXT:    [[TMP5:%.*]] = fmul fast float [[TMP4]], 0x41EFFFFFC0000000
1363; CHECK-NEXT:    [[TMP6:%.*]] = fptoui float [[TMP5]] to i32
1364; CHECK-NEXT:    [[TMP7:%.*]] = sub i32 0, [[TMP2]]
1365; CHECK-NEXT:    [[TMP8:%.*]] = mul i32 [[TMP7]], [[TMP6]]
1366; CHECK-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP6]] to i64
1367; CHECK-NEXT:    [[TMP10:%.*]] = zext i32 [[TMP8]] to i64
1368; CHECK-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP9]], [[TMP10]]
1369; CHECK-NEXT:    [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32
1370; CHECK-NEXT:    [[TMP13:%.*]] = lshr i64 [[TMP11]], 32
1371; CHECK-NEXT:    [[TMP14:%.*]] = trunc i64 [[TMP13]] to i32
1372; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[TMP6]], [[TMP14]]
1373; CHECK-NEXT:    [[TMP16:%.*]] = zext i32 [[TMP1]] to i64
1374; CHECK-NEXT:    [[TMP17:%.*]] = zext i32 [[TMP15]] to i64
1375; CHECK-NEXT:    [[TMP18:%.*]] = mul i64 [[TMP16]], [[TMP17]]
1376; CHECK-NEXT:    [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
1377; CHECK-NEXT:    [[TMP20:%.*]] = lshr i64 [[TMP18]], 32
1378; CHECK-NEXT:    [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32
1379; CHECK-NEXT:    [[TMP22:%.*]] = mul i32 [[TMP21]], [[TMP2]]
1380; CHECK-NEXT:    [[TMP23:%.*]] = sub i32 [[TMP1]], [[TMP22]]
1381; CHECK-NEXT:    [[TMP24:%.*]] = icmp uge i32 [[TMP23]], [[TMP2]]
1382; CHECK-NEXT:    [[TMP25:%.*]] = sub i32 [[TMP23]], [[TMP2]]
1383; CHECK-NEXT:    [[TMP26:%.*]] = select i1 [[TMP24]], i32 [[TMP25]], i32 [[TMP23]]
1384; CHECK-NEXT:    [[TMP27:%.*]] = icmp uge i32 [[TMP26]], [[TMP2]]
1385; CHECK-NEXT:    [[TMP28:%.*]] = sub i32 [[TMP26]], [[TMP2]]
1386; CHECK-NEXT:    [[TMP29:%.*]] = select i1 [[TMP27]], i32 [[TMP28]], i32 [[TMP26]]
1387; CHECK-NEXT:    [[TMP30:%.*]] = insertelement <4 x i32> undef, i32 [[TMP29]], i64 0
1388; CHECK-NEXT:    [[TMP31:%.*]] = extractelement <4 x i32> [[X]], i64 1
1389; CHECK-NEXT:    [[TMP32:%.*]] = extractelement <4 x i32> [[Y]], i64 1
1390; CHECK-NEXT:    [[TMP33:%.*]] = uitofp i32 [[TMP32]] to float
1391; CHECK-NEXT:    [[TMP34:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP33]])
1392; CHECK-NEXT:    [[TMP35:%.*]] = fmul fast float [[TMP34]], 0x41EFFFFFC0000000
1393; CHECK-NEXT:    [[TMP36:%.*]] = fptoui float [[TMP35]] to i32
1394; CHECK-NEXT:    [[TMP37:%.*]] = sub i32 0, [[TMP32]]
1395; CHECK-NEXT:    [[TMP38:%.*]] = mul i32 [[TMP37]], [[TMP36]]
1396; CHECK-NEXT:    [[TMP39:%.*]] = zext i32 [[TMP36]] to i64
1397; CHECK-NEXT:    [[TMP40:%.*]] = zext i32 [[TMP38]] to i64
1398; CHECK-NEXT:    [[TMP41:%.*]] = mul i64 [[TMP39]], [[TMP40]]
1399; CHECK-NEXT:    [[TMP42:%.*]] = trunc i64 [[TMP41]] to i32
1400; CHECK-NEXT:    [[TMP43:%.*]] = lshr i64 [[TMP41]], 32
1401; CHECK-NEXT:    [[TMP44:%.*]] = trunc i64 [[TMP43]] to i32
1402; CHECK-NEXT:    [[TMP45:%.*]] = add i32 [[TMP36]], [[TMP44]]
1403; CHECK-NEXT:    [[TMP46:%.*]] = zext i32 [[TMP31]] to i64
1404; CHECK-NEXT:    [[TMP47:%.*]] = zext i32 [[TMP45]] to i64
1405; CHECK-NEXT:    [[TMP48:%.*]] = mul i64 [[TMP46]], [[TMP47]]
1406; CHECK-NEXT:    [[TMP49:%.*]] = trunc i64 [[TMP48]] to i32
1407; CHECK-NEXT:    [[TMP50:%.*]] = lshr i64 [[TMP48]], 32
1408; CHECK-NEXT:    [[TMP51:%.*]] = trunc i64 [[TMP50]] to i32
1409; CHECK-NEXT:    [[TMP52:%.*]] = mul i32 [[TMP51]], [[TMP32]]
1410; CHECK-NEXT:    [[TMP53:%.*]] = sub i32 [[TMP31]], [[TMP52]]
1411; CHECK-NEXT:    [[TMP54:%.*]] = icmp uge i32 [[TMP53]], [[TMP32]]
1412; CHECK-NEXT:    [[TMP55:%.*]] = sub i32 [[TMP53]], [[TMP32]]
1413; CHECK-NEXT:    [[TMP56:%.*]] = select i1 [[TMP54]], i32 [[TMP55]], i32 [[TMP53]]
1414; CHECK-NEXT:    [[TMP57:%.*]] = icmp uge i32 [[TMP56]], [[TMP32]]
1415; CHECK-NEXT:    [[TMP58:%.*]] = sub i32 [[TMP56]], [[TMP32]]
1416; CHECK-NEXT:    [[TMP59:%.*]] = select i1 [[TMP57]], i32 [[TMP58]], i32 [[TMP56]]
1417; CHECK-NEXT:    [[TMP60:%.*]] = insertelement <4 x i32> [[TMP30]], i32 [[TMP59]], i64 1
1418; CHECK-NEXT:    [[TMP61:%.*]] = extractelement <4 x i32> [[X]], i64 2
1419; CHECK-NEXT:    [[TMP62:%.*]] = extractelement <4 x i32> [[Y]], i64 2
1420; CHECK-NEXT:    [[TMP63:%.*]] = uitofp i32 [[TMP62]] to float
1421; CHECK-NEXT:    [[TMP64:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP63]])
1422; CHECK-NEXT:    [[TMP65:%.*]] = fmul fast float [[TMP64]], 0x41EFFFFFC0000000
1423; CHECK-NEXT:    [[TMP66:%.*]] = fptoui float [[TMP65]] to i32
1424; CHECK-NEXT:    [[TMP67:%.*]] = sub i32 0, [[TMP62]]
1425; CHECK-NEXT:    [[TMP68:%.*]] = mul i32 [[TMP67]], [[TMP66]]
1426; CHECK-NEXT:    [[TMP69:%.*]] = zext i32 [[TMP66]] to i64
1427; CHECK-NEXT:    [[TMP70:%.*]] = zext i32 [[TMP68]] to i64
1428; CHECK-NEXT:    [[TMP71:%.*]] = mul i64 [[TMP69]], [[TMP70]]
1429; CHECK-NEXT:    [[TMP72:%.*]] = trunc i64 [[TMP71]] to i32
1430; CHECK-NEXT:    [[TMP73:%.*]] = lshr i64 [[TMP71]], 32
1431; CHECK-NEXT:    [[TMP74:%.*]] = trunc i64 [[TMP73]] to i32
1432; CHECK-NEXT:    [[TMP75:%.*]] = add i32 [[TMP66]], [[TMP74]]
1433; CHECK-NEXT:    [[TMP76:%.*]] = zext i32 [[TMP61]] to i64
1434; CHECK-NEXT:    [[TMP77:%.*]] = zext i32 [[TMP75]] to i64
1435; CHECK-NEXT:    [[TMP78:%.*]] = mul i64 [[TMP76]], [[TMP77]]
1436; CHECK-NEXT:    [[TMP79:%.*]] = trunc i64 [[TMP78]] to i32
1437; CHECK-NEXT:    [[TMP80:%.*]] = lshr i64 [[TMP78]], 32
1438; CHECK-NEXT:    [[TMP81:%.*]] = trunc i64 [[TMP80]] to i32
1439; CHECK-NEXT:    [[TMP82:%.*]] = mul i32 [[TMP81]], [[TMP62]]
1440; CHECK-NEXT:    [[TMP83:%.*]] = sub i32 [[TMP61]], [[TMP82]]
1441; CHECK-NEXT:    [[TMP84:%.*]] = icmp uge i32 [[TMP83]], [[TMP62]]
1442; CHECK-NEXT:    [[TMP85:%.*]] = sub i32 [[TMP83]], [[TMP62]]
1443; CHECK-NEXT:    [[TMP86:%.*]] = select i1 [[TMP84]], i32 [[TMP85]], i32 [[TMP83]]
1444; CHECK-NEXT:    [[TMP87:%.*]] = icmp uge i32 [[TMP86]], [[TMP62]]
1445; CHECK-NEXT:    [[TMP88:%.*]] = sub i32 [[TMP86]], [[TMP62]]
1446; CHECK-NEXT:    [[TMP89:%.*]] = select i1 [[TMP87]], i32 [[TMP88]], i32 [[TMP86]]
1447; CHECK-NEXT:    [[TMP90:%.*]] = insertelement <4 x i32> [[TMP60]], i32 [[TMP89]], i64 2
1448; CHECK-NEXT:    [[TMP91:%.*]] = extractelement <4 x i32> [[X]], i64 3
1449; CHECK-NEXT:    [[TMP92:%.*]] = extractelement <4 x i32> [[Y]], i64 3
1450; CHECK-NEXT:    [[TMP93:%.*]] = uitofp i32 [[TMP92]] to float
1451; CHECK-NEXT:    [[TMP94:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP93]])
1452; CHECK-NEXT:    [[TMP95:%.*]] = fmul fast float [[TMP94]], 0x41EFFFFFC0000000
1453; CHECK-NEXT:    [[TMP96:%.*]] = fptoui float [[TMP95]] to i32
1454; CHECK-NEXT:    [[TMP97:%.*]] = sub i32 0, [[TMP92]]
1455; CHECK-NEXT:    [[TMP98:%.*]] = mul i32 [[TMP97]], [[TMP96]]
1456; CHECK-NEXT:    [[TMP99:%.*]] = zext i32 [[TMP96]] to i64
1457; CHECK-NEXT:    [[TMP100:%.*]] = zext i32 [[TMP98]] to i64
1458; CHECK-NEXT:    [[TMP101:%.*]] = mul i64 [[TMP99]], [[TMP100]]
1459; CHECK-NEXT:    [[TMP102:%.*]] = trunc i64 [[TMP101]] to i32
1460; CHECK-NEXT:    [[TMP103:%.*]] = lshr i64 [[TMP101]], 32
1461; CHECK-NEXT:    [[TMP104:%.*]] = trunc i64 [[TMP103]] to i32
1462; CHECK-NEXT:    [[TMP105:%.*]] = add i32 [[TMP96]], [[TMP104]]
1463; CHECK-NEXT:    [[TMP106:%.*]] = zext i32 [[TMP91]] to i64
1464; CHECK-NEXT:    [[TMP107:%.*]] = zext i32 [[TMP105]] to i64
1465; CHECK-NEXT:    [[TMP108:%.*]] = mul i64 [[TMP106]], [[TMP107]]
1466; CHECK-NEXT:    [[TMP109:%.*]] = trunc i64 [[TMP108]] to i32
1467; CHECK-NEXT:    [[TMP110:%.*]] = lshr i64 [[TMP108]], 32
1468; CHECK-NEXT:    [[TMP111:%.*]] = trunc i64 [[TMP110]] to i32
1469; CHECK-NEXT:    [[TMP112:%.*]] = mul i32 [[TMP111]], [[TMP92]]
1470; CHECK-NEXT:    [[TMP113:%.*]] = sub i32 [[TMP91]], [[TMP112]]
1471; CHECK-NEXT:    [[TMP114:%.*]] = icmp uge i32 [[TMP113]], [[TMP92]]
1472; CHECK-NEXT:    [[TMP115:%.*]] = sub i32 [[TMP113]], [[TMP92]]
1473; CHECK-NEXT:    [[TMP116:%.*]] = select i1 [[TMP114]], i32 [[TMP115]], i32 [[TMP113]]
1474; CHECK-NEXT:    [[TMP117:%.*]] = icmp uge i32 [[TMP116]], [[TMP92]]
1475; CHECK-NEXT:    [[TMP118:%.*]] = sub i32 [[TMP116]], [[TMP92]]
1476; CHECK-NEXT:    [[TMP119:%.*]] = select i1 [[TMP117]], i32 [[TMP118]], i32 [[TMP116]]
1477; CHECK-NEXT:    [[TMP120:%.*]] = insertelement <4 x i32> [[TMP90]], i32 [[TMP119]], i64 3
1478; CHECK-NEXT:    store <4 x i32> [[TMP120]], <4 x i32> addrspace(1)* [[OUT:%.*]], align 16
1479; CHECK-NEXT:    ret void
1480;
1481; GFX6-LABEL: urem_v4i32:
1482; GFX6:       ; %bb.0:
1483; GFX6-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0xd
1484; GFX6-NEXT:    s_mov_b32 s13, 0x4f7ffffe
1485; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1486; GFX6-NEXT:    s_mov_b32 s3, 0xf000
1487; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1488; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s8
1489; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s9
1490; GFX6-NEXT:    s_sub_i32 s2, 0, s8
1491; GFX6-NEXT:    s_sub_i32 s12, 0, s9
1492; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
1493; GFX6-NEXT:    v_rcp_iflag_f32_e32 v1, v1
1494; GFX6-NEXT:    v_cvt_f32_u32_e32 v3, s10
1495; GFX6-NEXT:    v_cvt_f32_u32_e32 v5, s11
1496; GFX6-NEXT:    v_mul_f32_e32 v0, s13, v0
1497; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
1498; GFX6-NEXT:    v_mul_f32_e32 v1, s13, v1
1499; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
1500; GFX6-NEXT:    v_rcp_iflag_f32_e32 v3, v3
1501; GFX6-NEXT:    v_mul_lo_u32 v2, s2, v0
1502; GFX6-NEXT:    s_mov_b32 s2, -1
1503; GFX6-NEXT:    v_mul_lo_u32 v4, s12, v1
1504; GFX6-NEXT:    v_mul_hi_u32 v2, v0, v2
1505; GFX6-NEXT:    v_mul_hi_u32 v4, v1, v4
1506; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
1507; GFX6-NEXT:    v_mul_hi_u32 v0, s4, v0
1508; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v4, v1
1509; GFX6-NEXT:    v_mul_hi_u32 v1, s5, v1
1510; GFX6-NEXT:    v_mul_f32_e32 v2, s13, v3
1511; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s8
1512; GFX6-NEXT:    v_cvt_u32_f32_e32 v2, v2
1513; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s9
1514; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
1515; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s8, v0
1516; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s8, v0
1517; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
1518; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s8, v0
1519; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s8, v0
1520; GFX6-NEXT:    s_sub_i32 s4, 0, s10
1521; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
1522; GFX6-NEXT:    v_mul_lo_u32 v3, s4, v2
1523; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s5, v1
1524; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s9, v1
1525; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s9, v1
1526; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
1527; GFX6-NEXT:    v_mul_hi_u32 v3, v2, v3
1528; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v5
1529; GFX6-NEXT:    s_sub_i32 s4, 0, s11
1530; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
1531; GFX6-NEXT:    v_mul_f32_e32 v3, s13, v4
1532; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v3
1533; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s9, v1
1534; GFX6-NEXT:    v_mul_hi_u32 v2, s6, v2
1535; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s9, v1
1536; GFX6-NEXT:    v_mul_lo_u32 v5, s4, v3
1537; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
1538; GFX6-NEXT:    v_mul_lo_u32 v2, v2, s10
1539; GFX6-NEXT:    v_mul_hi_u32 v4, v3, v5
1540; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s6, v2
1541; GFX6-NEXT:    v_subrev_i32_e32 v5, vcc, s10, v2
1542; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
1543; GFX6-NEXT:    v_mul_hi_u32 v3, s7, v3
1544; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s10, v2
1545; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
1546; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s10, v2
1547; GFX6-NEXT:    v_mul_lo_u32 v3, v3, s11
1548; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s10, v2
1549; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
1550; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s7, v3
1551; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s11, v3
1552; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s11, v3
1553; GFX6-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
1554; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s11, v3
1555; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s11, v3
1556; GFX6-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
1557; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
1558; GFX6-NEXT:    s_endpgm
1559;
1560; GFX9-LABEL: urem_v4i32:
1561; GFX9:       ; %bb.0:
1562; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x34
1563; GFX9-NEXT:    s_mov_b32 s12, 0x4f7ffffe
1564; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1565; GFX9-NEXT:    v_mov_b32_e32 v4, 0
1566; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1567; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s8
1568; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s9
1569; GFX9-NEXT:    s_sub_i32 s2, 0, s8
1570; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, s10
1571; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
1572; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v1
1573; GFX9-NEXT:    s_sub_i32 s3, 0, s9
1574; GFX9-NEXT:    v_rcp_iflag_f32_e32 v5, v5
1575; GFX9-NEXT:    v_mul_f32_e32 v0, s12, v0
1576; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
1577; GFX9-NEXT:    v_mul_f32_e32 v1, s12, v1
1578; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
1579; GFX9-NEXT:    v_cvt_f32_u32_e32 v6, s11
1580; GFX9-NEXT:    v_mul_lo_u32 v2, s2, v0
1581; GFX9-NEXT:    s_sub_i32 s2, 0, s10
1582; GFX9-NEXT:    v_mul_lo_u32 v3, s3, v1
1583; GFX9-NEXT:    v_mul_hi_u32 v2, v0, v2
1584; GFX9-NEXT:    v_mul_hi_u32 v3, v1, v3
1585; GFX9-NEXT:    v_add_u32_e32 v0, v0, v2
1586; GFX9-NEXT:    v_mul_f32_e32 v2, s12, v5
1587; GFX9-NEXT:    v_cvt_u32_f32_e32 v2, v2
1588; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
1589; GFX9-NEXT:    v_rcp_iflag_f32_e32 v3, v6
1590; GFX9-NEXT:    v_mul_hi_u32 v0, s4, v0
1591; GFX9-NEXT:    v_mul_lo_u32 v5, s2, v2
1592; GFX9-NEXT:    s_sub_i32 s2, 0, s11
1593; GFX9-NEXT:    v_mul_f32_e32 v3, s12, v3
1594; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v3
1595; GFX9-NEXT:    v_mul_hi_u32 v5, v2, v5
1596; GFX9-NEXT:    v_mul_hi_u32 v1, s5, v1
1597; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s8
1598; GFX9-NEXT:    v_add_u32_e32 v2, v2, v5
1599; GFX9-NEXT:    v_mul_lo_u32 v5, s2, v3
1600; GFX9-NEXT:    v_mul_hi_u32 v2, s6, v2
1601; GFX9-NEXT:    v_mul_lo_u32 v1, v1, s9
1602; GFX9-NEXT:    v_sub_u32_e32 v0, s4, v0
1603; GFX9-NEXT:    v_mul_hi_u32 v5, v3, v5
1604; GFX9-NEXT:    v_subrev_u32_e32 v6, s8, v0
1605; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s8, v0
1606; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc
1607; GFX9-NEXT:    v_add_u32_e32 v3, v3, v5
1608; GFX9-NEXT:    v_mul_hi_u32 v3, s7, v3
1609; GFX9-NEXT:    v_mul_lo_u32 v2, v2, s10
1610; GFX9-NEXT:    v_sub_u32_e32 v1, s5, v1
1611; GFX9-NEXT:    v_subrev_u32_e32 v6, s8, v0
1612; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s8, v0
1613; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc
1614; GFX9-NEXT:    v_subrev_u32_e32 v6, s9, v1
1615; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s9, v1
1616; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
1617; GFX9-NEXT:    v_mul_lo_u32 v3, v3, s11
1618; GFX9-NEXT:    v_sub_u32_e32 v2, s6, v2
1619; GFX9-NEXT:    v_subrev_u32_e32 v6, s9, v1
1620; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s9, v1
1621; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
1622; GFX9-NEXT:    v_subrev_u32_e32 v5, s10, v2
1623; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s10, v2
1624; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
1625; GFX9-NEXT:    v_subrev_u32_e32 v5, s10, v2
1626; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s10, v2
1627; GFX9-NEXT:    v_sub_u32_e32 v3, s7, v3
1628; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
1629; GFX9-NEXT:    v_subrev_u32_e32 v5, s11, v3
1630; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s11, v3
1631; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
1632; GFX9-NEXT:    v_subrev_u32_e32 v5, s11, v3
1633; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s11, v3
1634; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
1635; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
1636; GFX9-NEXT:    s_endpgm
1637  %r = urem <4 x i32> %x, %y
1638  store <4 x i32> %r, <4 x i32> addrspace(1)* %out
1639  ret void
1640}
1641
1642define amdgpu_kernel void @sdiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x, <4 x i32> %y) {
1643; CHECK-LABEL: @sdiv_v4i32(
1644; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i32> [[X:%.*]], i64 0
1645; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i32> [[Y:%.*]], i64 0
1646; CHECK-NEXT:    [[TMP3:%.*]] = ashr i32 [[TMP1]], 31
1647; CHECK-NEXT:    [[TMP4:%.*]] = ashr i32 [[TMP2]], 31
1648; CHECK-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]]
1649; CHECK-NEXT:    [[TMP6:%.*]] = add i32 [[TMP1]], [[TMP3]]
1650; CHECK-NEXT:    [[TMP7:%.*]] = add i32 [[TMP2]], [[TMP4]]
1651; CHECK-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP3]]
1652; CHECK-NEXT:    [[TMP9:%.*]] = xor i32 [[TMP7]], [[TMP4]]
1653; CHECK-NEXT:    [[TMP10:%.*]] = uitofp i32 [[TMP9]] to float
1654; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP10]])
1655; CHECK-NEXT:    [[TMP12:%.*]] = fmul fast float [[TMP11]], 0x41EFFFFFC0000000
1656; CHECK-NEXT:    [[TMP13:%.*]] = fptoui float [[TMP12]] to i32
1657; CHECK-NEXT:    [[TMP14:%.*]] = sub i32 0, [[TMP9]]
1658; CHECK-NEXT:    [[TMP15:%.*]] = mul i32 [[TMP14]], [[TMP13]]
1659; CHECK-NEXT:    [[TMP16:%.*]] = zext i32 [[TMP13]] to i64
1660; CHECK-NEXT:    [[TMP17:%.*]] = zext i32 [[TMP15]] to i64
1661; CHECK-NEXT:    [[TMP18:%.*]] = mul i64 [[TMP16]], [[TMP17]]
1662; CHECK-NEXT:    [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
1663; CHECK-NEXT:    [[TMP20:%.*]] = lshr i64 [[TMP18]], 32
1664; CHECK-NEXT:    [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32
1665; CHECK-NEXT:    [[TMP22:%.*]] = add i32 [[TMP13]], [[TMP21]]
1666; CHECK-NEXT:    [[TMP23:%.*]] = zext i32 [[TMP8]] to i64
1667; CHECK-NEXT:    [[TMP24:%.*]] = zext i32 [[TMP22]] to i64
1668; CHECK-NEXT:    [[TMP25:%.*]] = mul i64 [[TMP23]], [[TMP24]]
1669; CHECK-NEXT:    [[TMP26:%.*]] = trunc i64 [[TMP25]] to i32
1670; CHECK-NEXT:    [[TMP27:%.*]] = lshr i64 [[TMP25]], 32
1671; CHECK-NEXT:    [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32
1672; CHECK-NEXT:    [[TMP29:%.*]] = mul i32 [[TMP28]], [[TMP9]]
1673; CHECK-NEXT:    [[TMP30:%.*]] = sub i32 [[TMP8]], [[TMP29]]
1674; CHECK-NEXT:    [[TMP31:%.*]] = icmp uge i32 [[TMP30]], [[TMP9]]
1675; CHECK-NEXT:    [[TMP32:%.*]] = add i32 [[TMP28]], 1
1676; CHECK-NEXT:    [[TMP33:%.*]] = select i1 [[TMP31]], i32 [[TMP32]], i32 [[TMP28]]
1677; CHECK-NEXT:    [[TMP34:%.*]] = sub i32 [[TMP30]], [[TMP9]]
1678; CHECK-NEXT:    [[TMP35:%.*]] = select i1 [[TMP31]], i32 [[TMP34]], i32 [[TMP30]]
1679; CHECK-NEXT:    [[TMP36:%.*]] = icmp uge i32 [[TMP35]], [[TMP9]]
1680; CHECK-NEXT:    [[TMP37:%.*]] = add i32 [[TMP33]], 1
1681; CHECK-NEXT:    [[TMP38:%.*]] = select i1 [[TMP36]], i32 [[TMP37]], i32 [[TMP33]]
1682; CHECK-NEXT:    [[TMP39:%.*]] = xor i32 [[TMP38]], [[TMP5]]
1683; CHECK-NEXT:    [[TMP40:%.*]] = sub i32 [[TMP39]], [[TMP5]]
1684; CHECK-NEXT:    [[TMP41:%.*]] = insertelement <4 x i32> undef, i32 [[TMP40]], i64 0
1685; CHECK-NEXT:    [[TMP42:%.*]] = extractelement <4 x i32> [[X]], i64 1
1686; CHECK-NEXT:    [[TMP43:%.*]] = extractelement <4 x i32> [[Y]], i64 1
1687; CHECK-NEXT:    [[TMP44:%.*]] = ashr i32 [[TMP42]], 31
1688; CHECK-NEXT:    [[TMP45:%.*]] = ashr i32 [[TMP43]], 31
1689; CHECK-NEXT:    [[TMP46:%.*]] = xor i32 [[TMP44]], [[TMP45]]
1690; CHECK-NEXT:    [[TMP47:%.*]] = add i32 [[TMP42]], [[TMP44]]
1691; CHECK-NEXT:    [[TMP48:%.*]] = add i32 [[TMP43]], [[TMP45]]
1692; CHECK-NEXT:    [[TMP49:%.*]] = xor i32 [[TMP47]], [[TMP44]]
1693; CHECK-NEXT:    [[TMP50:%.*]] = xor i32 [[TMP48]], [[TMP45]]
1694; CHECK-NEXT:    [[TMP51:%.*]] = uitofp i32 [[TMP50]] to float
1695; CHECK-NEXT:    [[TMP52:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP51]])
1696; CHECK-NEXT:    [[TMP53:%.*]] = fmul fast float [[TMP52]], 0x41EFFFFFC0000000
1697; CHECK-NEXT:    [[TMP54:%.*]] = fptoui float [[TMP53]] to i32
1698; CHECK-NEXT:    [[TMP55:%.*]] = sub i32 0, [[TMP50]]
1699; CHECK-NEXT:    [[TMP56:%.*]] = mul i32 [[TMP55]], [[TMP54]]
1700; CHECK-NEXT:    [[TMP57:%.*]] = zext i32 [[TMP54]] to i64
1701; CHECK-NEXT:    [[TMP58:%.*]] = zext i32 [[TMP56]] to i64
1702; CHECK-NEXT:    [[TMP59:%.*]] = mul i64 [[TMP57]], [[TMP58]]
1703; CHECK-NEXT:    [[TMP60:%.*]] = trunc i64 [[TMP59]] to i32
1704; CHECK-NEXT:    [[TMP61:%.*]] = lshr i64 [[TMP59]], 32
1705; CHECK-NEXT:    [[TMP62:%.*]] = trunc i64 [[TMP61]] to i32
1706; CHECK-NEXT:    [[TMP63:%.*]] = add i32 [[TMP54]], [[TMP62]]
1707; CHECK-NEXT:    [[TMP64:%.*]] = zext i32 [[TMP49]] to i64
1708; CHECK-NEXT:    [[TMP65:%.*]] = zext i32 [[TMP63]] to i64
1709; CHECK-NEXT:    [[TMP66:%.*]] = mul i64 [[TMP64]], [[TMP65]]
1710; CHECK-NEXT:    [[TMP67:%.*]] = trunc i64 [[TMP66]] to i32
1711; CHECK-NEXT:    [[TMP68:%.*]] = lshr i64 [[TMP66]], 32
1712; CHECK-NEXT:    [[TMP69:%.*]] = trunc i64 [[TMP68]] to i32
1713; CHECK-NEXT:    [[TMP70:%.*]] = mul i32 [[TMP69]], [[TMP50]]
1714; CHECK-NEXT:    [[TMP71:%.*]] = sub i32 [[TMP49]], [[TMP70]]
1715; CHECK-NEXT:    [[TMP72:%.*]] = icmp uge i32 [[TMP71]], [[TMP50]]
1716; CHECK-NEXT:    [[TMP73:%.*]] = add i32 [[TMP69]], 1
1717; CHECK-NEXT:    [[TMP74:%.*]] = select i1 [[TMP72]], i32 [[TMP73]], i32 [[TMP69]]
1718; CHECK-NEXT:    [[TMP75:%.*]] = sub i32 [[TMP71]], [[TMP50]]
1719; CHECK-NEXT:    [[TMP76:%.*]] = select i1 [[TMP72]], i32 [[TMP75]], i32 [[TMP71]]
1720; CHECK-NEXT:    [[TMP77:%.*]] = icmp uge i32 [[TMP76]], [[TMP50]]
1721; CHECK-NEXT:    [[TMP78:%.*]] = add i32 [[TMP74]], 1
1722; CHECK-NEXT:    [[TMP79:%.*]] = select i1 [[TMP77]], i32 [[TMP78]], i32 [[TMP74]]
1723; CHECK-NEXT:    [[TMP80:%.*]] = xor i32 [[TMP79]], [[TMP46]]
1724; CHECK-NEXT:    [[TMP81:%.*]] = sub i32 [[TMP80]], [[TMP46]]
1725; CHECK-NEXT:    [[TMP82:%.*]] = insertelement <4 x i32> [[TMP41]], i32 [[TMP81]], i64 1
1726; CHECK-NEXT:    [[TMP83:%.*]] = extractelement <4 x i32> [[X]], i64 2
1727; CHECK-NEXT:    [[TMP84:%.*]] = extractelement <4 x i32> [[Y]], i64 2
1728; CHECK-NEXT:    [[TMP85:%.*]] = ashr i32 [[TMP83]], 31
1729; CHECK-NEXT:    [[TMP86:%.*]] = ashr i32 [[TMP84]], 31
1730; CHECK-NEXT:    [[TMP87:%.*]] = xor i32 [[TMP85]], [[TMP86]]
1731; CHECK-NEXT:    [[TMP88:%.*]] = add i32 [[TMP83]], [[TMP85]]
1732; CHECK-NEXT:    [[TMP89:%.*]] = add i32 [[TMP84]], [[TMP86]]
1733; CHECK-NEXT:    [[TMP90:%.*]] = xor i32 [[TMP88]], [[TMP85]]
1734; CHECK-NEXT:    [[TMP91:%.*]] = xor i32 [[TMP89]], [[TMP86]]
1735; CHECK-NEXT:    [[TMP92:%.*]] = uitofp i32 [[TMP91]] to float
1736; CHECK-NEXT:    [[TMP93:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP92]])
1737; CHECK-NEXT:    [[TMP94:%.*]] = fmul fast float [[TMP93]], 0x41EFFFFFC0000000
1738; CHECK-NEXT:    [[TMP95:%.*]] = fptoui float [[TMP94]] to i32
1739; CHECK-NEXT:    [[TMP96:%.*]] = sub i32 0, [[TMP91]]
1740; CHECK-NEXT:    [[TMP97:%.*]] = mul i32 [[TMP96]], [[TMP95]]
1741; CHECK-NEXT:    [[TMP98:%.*]] = zext i32 [[TMP95]] to i64
1742; CHECK-NEXT:    [[TMP99:%.*]] = zext i32 [[TMP97]] to i64
1743; CHECK-NEXT:    [[TMP100:%.*]] = mul i64 [[TMP98]], [[TMP99]]
1744; CHECK-NEXT:    [[TMP101:%.*]] = trunc i64 [[TMP100]] to i32
1745; CHECK-NEXT:    [[TMP102:%.*]] = lshr i64 [[TMP100]], 32
1746; CHECK-NEXT:    [[TMP103:%.*]] = trunc i64 [[TMP102]] to i32
1747; CHECK-NEXT:    [[TMP104:%.*]] = add i32 [[TMP95]], [[TMP103]]
1748; CHECK-NEXT:    [[TMP105:%.*]] = zext i32 [[TMP90]] to i64
1749; CHECK-NEXT:    [[TMP106:%.*]] = zext i32 [[TMP104]] to i64
1750; CHECK-NEXT:    [[TMP107:%.*]] = mul i64 [[TMP105]], [[TMP106]]
1751; CHECK-NEXT:    [[TMP108:%.*]] = trunc i64 [[TMP107]] to i32
1752; CHECK-NEXT:    [[TMP109:%.*]] = lshr i64 [[TMP107]], 32
1753; CHECK-NEXT:    [[TMP110:%.*]] = trunc i64 [[TMP109]] to i32
1754; CHECK-NEXT:    [[TMP111:%.*]] = mul i32 [[TMP110]], [[TMP91]]
1755; CHECK-NEXT:    [[TMP112:%.*]] = sub i32 [[TMP90]], [[TMP111]]
1756; CHECK-NEXT:    [[TMP113:%.*]] = icmp uge i32 [[TMP112]], [[TMP91]]
1757; CHECK-NEXT:    [[TMP114:%.*]] = add i32 [[TMP110]], 1
1758; CHECK-NEXT:    [[TMP115:%.*]] = select i1 [[TMP113]], i32 [[TMP114]], i32 [[TMP110]]
1759; CHECK-NEXT:    [[TMP116:%.*]] = sub i32 [[TMP112]], [[TMP91]]
1760; CHECK-NEXT:    [[TMP117:%.*]] = select i1 [[TMP113]], i32 [[TMP116]], i32 [[TMP112]]
1761; CHECK-NEXT:    [[TMP118:%.*]] = icmp uge i32 [[TMP117]], [[TMP91]]
1762; CHECK-NEXT:    [[TMP119:%.*]] = add i32 [[TMP115]], 1
1763; CHECK-NEXT:    [[TMP120:%.*]] = select i1 [[TMP118]], i32 [[TMP119]], i32 [[TMP115]]
1764; CHECK-NEXT:    [[TMP121:%.*]] = xor i32 [[TMP120]], [[TMP87]]
1765; CHECK-NEXT:    [[TMP122:%.*]] = sub i32 [[TMP121]], [[TMP87]]
1766; CHECK-NEXT:    [[TMP123:%.*]] = insertelement <4 x i32> [[TMP82]], i32 [[TMP122]], i64 2
1767; CHECK-NEXT:    [[TMP124:%.*]] = extractelement <4 x i32> [[X]], i64 3
1768; CHECK-NEXT:    [[TMP125:%.*]] = extractelement <4 x i32> [[Y]], i64 3
1769; CHECK-NEXT:    [[TMP126:%.*]] = ashr i32 [[TMP124]], 31
1770; CHECK-NEXT:    [[TMP127:%.*]] = ashr i32 [[TMP125]], 31
1771; CHECK-NEXT:    [[TMP128:%.*]] = xor i32 [[TMP126]], [[TMP127]]
1772; CHECK-NEXT:    [[TMP129:%.*]] = add i32 [[TMP124]], [[TMP126]]
1773; CHECK-NEXT:    [[TMP130:%.*]] = add i32 [[TMP125]], [[TMP127]]
1774; CHECK-NEXT:    [[TMP131:%.*]] = xor i32 [[TMP129]], [[TMP126]]
1775; CHECK-NEXT:    [[TMP132:%.*]] = xor i32 [[TMP130]], [[TMP127]]
1776; CHECK-NEXT:    [[TMP133:%.*]] = uitofp i32 [[TMP132]] to float
1777; CHECK-NEXT:    [[TMP134:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP133]])
1778; CHECK-NEXT:    [[TMP135:%.*]] = fmul fast float [[TMP134]], 0x41EFFFFFC0000000
1779; CHECK-NEXT:    [[TMP136:%.*]] = fptoui float [[TMP135]] to i32
1780; CHECK-NEXT:    [[TMP137:%.*]] = sub i32 0, [[TMP132]]
1781; CHECK-NEXT:    [[TMP138:%.*]] = mul i32 [[TMP137]], [[TMP136]]
1782; CHECK-NEXT:    [[TMP139:%.*]] = zext i32 [[TMP136]] to i64
1783; CHECK-NEXT:    [[TMP140:%.*]] = zext i32 [[TMP138]] to i64
1784; CHECK-NEXT:    [[TMP141:%.*]] = mul i64 [[TMP139]], [[TMP140]]
1785; CHECK-NEXT:    [[TMP142:%.*]] = trunc i64 [[TMP141]] to i32
1786; CHECK-NEXT:    [[TMP143:%.*]] = lshr i64 [[TMP141]], 32
1787; CHECK-NEXT:    [[TMP144:%.*]] = trunc i64 [[TMP143]] to i32
1788; CHECK-NEXT:    [[TMP145:%.*]] = add i32 [[TMP136]], [[TMP144]]
1789; CHECK-NEXT:    [[TMP146:%.*]] = zext i32 [[TMP131]] to i64
1790; CHECK-NEXT:    [[TMP147:%.*]] = zext i32 [[TMP145]] to i64
1791; CHECK-NEXT:    [[TMP148:%.*]] = mul i64 [[TMP146]], [[TMP147]]
1792; CHECK-NEXT:    [[TMP149:%.*]] = trunc i64 [[TMP148]] to i32
1793; CHECK-NEXT:    [[TMP150:%.*]] = lshr i64 [[TMP148]], 32
1794; CHECK-NEXT:    [[TMP151:%.*]] = trunc i64 [[TMP150]] to i32
1795; CHECK-NEXT:    [[TMP152:%.*]] = mul i32 [[TMP151]], [[TMP132]]
1796; CHECK-NEXT:    [[TMP153:%.*]] = sub i32 [[TMP131]], [[TMP152]]
1797; CHECK-NEXT:    [[TMP154:%.*]] = icmp uge i32 [[TMP153]], [[TMP132]]
1798; CHECK-NEXT:    [[TMP155:%.*]] = add i32 [[TMP151]], 1
1799; CHECK-NEXT:    [[TMP156:%.*]] = select i1 [[TMP154]], i32 [[TMP155]], i32 [[TMP151]]
1800; CHECK-NEXT:    [[TMP157:%.*]] = sub i32 [[TMP153]], [[TMP132]]
1801; CHECK-NEXT:    [[TMP158:%.*]] = select i1 [[TMP154]], i32 [[TMP157]], i32 [[TMP153]]
1802; CHECK-NEXT:    [[TMP159:%.*]] = icmp uge i32 [[TMP158]], [[TMP132]]
1803; CHECK-NEXT:    [[TMP160:%.*]] = add i32 [[TMP156]], 1
1804; CHECK-NEXT:    [[TMP161:%.*]] = select i1 [[TMP159]], i32 [[TMP160]], i32 [[TMP156]]
1805; CHECK-NEXT:    [[TMP162:%.*]] = xor i32 [[TMP161]], [[TMP128]]
1806; CHECK-NEXT:    [[TMP163:%.*]] = sub i32 [[TMP162]], [[TMP128]]
1807; CHECK-NEXT:    [[TMP164:%.*]] = insertelement <4 x i32> [[TMP123]], i32 [[TMP163]], i64 3
1808; CHECK-NEXT:    store <4 x i32> [[TMP164]], <4 x i32> addrspace(1)* [[OUT:%.*]], align 16
1809; CHECK-NEXT:    ret void
1810;
1811; GFX6-LABEL: sdiv_v4i32:
1812; GFX6:       ; %bb.0:
1813; GFX6-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0xd
1814; GFX6-NEXT:    s_mov_b32 s16, 0x4f7ffffe
1815; GFX6-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x9
1816; GFX6-NEXT:    s_mov_b32 s15, 0xf000
1817; GFX6-NEXT:    s_mov_b32 s14, -1
1818; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1819; GFX6-NEXT:    s_ashr_i32 s2, s8, 31
1820; GFX6-NEXT:    s_add_i32 s3, s8, s2
1821; GFX6-NEXT:    s_xor_b32 s3, s3, s2
1822; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s3
1823; GFX6-NEXT:    s_ashr_i32 s8, s9, 31
1824; GFX6-NEXT:    s_add_i32 s0, s9, s8
1825; GFX6-NEXT:    s_xor_b32 s9, s0, s8
1826; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
1827; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s9
1828; GFX6-NEXT:    s_sub_i32 s1, 0, s3
1829; GFX6-NEXT:    s_ashr_i32 s0, s4, 31
1830; GFX6-NEXT:    v_mul_f32_e32 v0, s16, v0
1831; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
1832; GFX6-NEXT:    v_rcp_iflag_f32_e32 v1, v1
1833; GFX6-NEXT:    s_xor_b32 s2, s0, s2
1834; GFX6-NEXT:    v_mul_lo_u32 v2, s1, v0
1835; GFX6-NEXT:    s_add_i32 s1, s4, s0
1836; GFX6-NEXT:    v_mul_f32_e32 v1, s16, v1
1837; GFX6-NEXT:    s_xor_b32 s1, s1, s0
1838; GFX6-NEXT:    v_mul_hi_u32 v2, v0, v2
1839; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
1840; GFX6-NEXT:    s_sub_i32 s0, 0, s9
1841; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
1842; GFX6-NEXT:    v_mul_hi_u32 v0, s1, v0
1843; GFX6-NEXT:    v_mul_lo_u32 v2, s0, v1
1844; GFX6-NEXT:    v_mul_lo_u32 v3, v0, s3
1845; GFX6-NEXT:    v_mul_hi_u32 v2, v1, v2
1846; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 1, v0
1847; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s1, v3
1848; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s3, v3
1849; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[0:1]
1850; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s3, v3
1851; GFX6-NEXT:    v_cndmask_b32_e64 v3, v3, v4, s[0:1]
1852; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 1, v0
1853; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
1854; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s3, v3
1855; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
1856; GFX6-NEXT:    s_ashr_i32 s0, s5, 31
1857; GFX6-NEXT:    s_add_i32 s1, s5, s0
1858; GFX6-NEXT:    v_xor_b32_e32 v0, s2, v0
1859; GFX6-NEXT:    s_ashr_i32 s3, s10, 31
1860; GFX6-NEXT:    s_xor_b32 s1, s1, s0
1861; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s2, v0
1862; GFX6-NEXT:    s_xor_b32 s2, s0, s8
1863; GFX6-NEXT:    s_add_i32 s0, s10, s3
1864; GFX6-NEXT:    s_xor_b32 s4, s0, s3
1865; GFX6-NEXT:    v_cvt_f32_u32_e32 v3, s4
1866; GFX6-NEXT:    v_mul_hi_u32 v1, s1, v1
1867; GFX6-NEXT:    v_rcp_iflag_f32_e32 v3, v3
1868; GFX6-NEXT:    v_mul_lo_u32 v2, v1, s9
1869; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 1, v1
1870; GFX6-NEXT:    v_mul_f32_e32 v3, s16, v3
1871; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s1, v2
1872; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v3
1873; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s9, v2
1874; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, v4, s[0:1]
1875; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s9, v2
1876; GFX6-NEXT:    v_cndmask_b32_e64 v2, v2, v4, s[0:1]
1877; GFX6-NEXT:    s_sub_i32 s0, 0, s4
1878; GFX6-NEXT:    v_mul_lo_u32 v5, s0, v3
1879; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 1, v1
1880; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s9, v2
1881; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
1882; GFX6-NEXT:    v_mul_hi_u32 v2, v3, v5
1883; GFX6-NEXT:    v_xor_b32_e32 v1, s2, v1
1884; GFX6-NEXT:    v_subrev_i32_e32 v1, vcc, s2, v1
1885; GFX6-NEXT:    s_ashr_i32 s2, s11, 31
1886; GFX6-NEXT:    s_ashr_i32 s0, s6, 31
1887; GFX6-NEXT:    s_add_i32 s5, s11, s2
1888; GFX6-NEXT:    s_add_i32 s1, s6, s0
1889; GFX6-NEXT:    s_xor_b32 s5, s5, s2
1890; GFX6-NEXT:    v_cvt_f32_u32_e32 v4, s5
1891; GFX6-NEXT:    s_xor_b32 s1, s1, s0
1892; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
1893; GFX6-NEXT:    v_mul_hi_u32 v2, s1, v2
1894; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v4
1895; GFX6-NEXT:    s_xor_b32 s3, s0, s3
1896; GFX6-NEXT:    v_mul_lo_u32 v3, v2, s4
1897; GFX6-NEXT:    v_mul_f32_e32 v4, s16, v4
1898; GFX6-NEXT:    v_cvt_u32_f32_e32 v4, v4
1899; GFX6-NEXT:    v_add_i32_e32 v5, vcc, 1, v2
1900; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s1, v3
1901; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s4, v3
1902; GFX6-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s[0:1]
1903; GFX6-NEXT:    v_subrev_i32_e32 v5, vcc, s4, v3
1904; GFX6-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s[0:1]
1905; GFX6-NEXT:    s_sub_i32 s0, 0, s5
1906; GFX6-NEXT:    v_mul_lo_u32 v5, s0, v4
1907; GFX6-NEXT:    s_ashr_i32 s0, s7, 31
1908; GFX6-NEXT:    s_add_i32 s1, s7, s0
1909; GFX6-NEXT:    s_xor_b32 s1, s1, s0
1910; GFX6-NEXT:    v_mul_hi_u32 v5, v4, v5
1911; GFX6-NEXT:    v_add_i32_e32 v6, vcc, 1, v2
1912; GFX6-NEXT:    s_xor_b32 s2, s0, s2
1913; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
1914; GFX6-NEXT:    v_mul_hi_u32 v4, s1, v4
1915; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s4, v3
1916; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
1917; GFX6-NEXT:    v_xor_b32_e32 v2, s3, v2
1918; GFX6-NEXT:    v_mul_lo_u32 v3, v4, s5
1919; GFX6-NEXT:    v_add_i32_e32 v5, vcc, 1, v4
1920; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s3, v2
1921; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s1, v3
1922; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s5, v3
1923; GFX6-NEXT:    v_cndmask_b32_e64 v4, v4, v5, s[0:1]
1924; GFX6-NEXT:    v_subrev_i32_e32 v5, vcc, s5, v3
1925; GFX6-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s[0:1]
1926; GFX6-NEXT:    v_add_i32_e32 v5, vcc, 1, v4
1927; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s5, v3
1928; GFX6-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
1929; GFX6-NEXT:    v_xor_b32_e32 v3, s2, v3
1930; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s2, v3
1931; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[12:15], 0
1932; GFX6-NEXT:    s_endpgm
1933;
1934; GFX9-LABEL: sdiv_v4i32:
1935; GFX9:       ; %bb.0:
1936; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x34
1937; GFX9-NEXT:    s_mov_b32 s15, 0x4f7ffffe
1938; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1939; GFX9-NEXT:    v_mov_b32_e32 v4, 0
1940; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1941; GFX9-NEXT:    s_ashr_i32 s2, s8, 31
1942; GFX9-NEXT:    s_add_i32 s3, s8, s2
1943; GFX9-NEXT:    s_xor_b32 s3, s3, s2
1944; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s3
1945; GFX9-NEXT:    s_ashr_i32 s12, s9, 31
1946; GFX9-NEXT:    s_add_i32 s9, s9, s12
1947; GFX9-NEXT:    s_xor_b32 s9, s9, s12
1948; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
1949; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s9
1950; GFX9-NEXT:    s_sub_i32 s14, 0, s3
1951; GFX9-NEXT:    s_ashr_i32 s8, s4, 31
1952; GFX9-NEXT:    v_mul_f32_e32 v0, s15, v0
1953; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
1954; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v1
1955; GFX9-NEXT:    s_add_i32 s4, s4, s8
1956; GFX9-NEXT:    s_xor_b32 s4, s4, s8
1957; GFX9-NEXT:    v_mul_lo_u32 v2, s14, v0
1958; GFX9-NEXT:    v_mul_f32_e32 v1, s15, v1
1959; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
1960; GFX9-NEXT:    s_sub_i32 s14, 0, s9
1961; GFX9-NEXT:    v_mul_hi_u32 v2, v0, v2
1962; GFX9-NEXT:    s_ashr_i32 s13, s5, 31
1963; GFX9-NEXT:    v_mul_lo_u32 v3, s14, v1
1964; GFX9-NEXT:    s_add_i32 s5, s5, s13
1965; GFX9-NEXT:    v_add_u32_e32 v0, v0, v2
1966; GFX9-NEXT:    v_mul_hi_u32 v0, s4, v0
1967; GFX9-NEXT:    v_mul_hi_u32 v2, v1, v3
1968; GFX9-NEXT:    s_xor_b32 s5, s5, s13
1969; GFX9-NEXT:    s_xor_b32 s2, s8, s2
1970; GFX9-NEXT:    v_mul_lo_u32 v3, v0, s3
1971; GFX9-NEXT:    v_add_u32_e32 v1, v1, v2
1972; GFX9-NEXT:    v_add_u32_e32 v2, 1, v0
1973; GFX9-NEXT:    v_mul_hi_u32 v1, s5, v1
1974; GFX9-NEXT:    v_sub_u32_e32 v3, s4, v3
1975; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v3
1976; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
1977; GFX9-NEXT:    v_subrev_u32_e32 v2, s3, v3
1978; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
1979; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v2
1980; GFX9-NEXT:    s_ashr_i32 s3, s10, 31
1981; GFX9-NEXT:    s_add_i32 s4, s10, s3
1982; GFX9-NEXT:    v_add_u32_e32 v3, 1, v0
1983; GFX9-NEXT:    s_xor_b32 s4, s4, s3
1984; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
1985; GFX9-NEXT:    v_cvt_f32_u32_e32 v3, s4
1986; GFX9-NEXT:    v_mul_lo_u32 v2, v1, s9
1987; GFX9-NEXT:    v_add_u32_e32 v5, 1, v1
1988; GFX9-NEXT:    s_ashr_i32 s8, s11, 31
1989; GFX9-NEXT:    v_rcp_iflag_f32_e32 v3, v3
1990; GFX9-NEXT:    v_sub_u32_e32 v2, s5, v2
1991; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s9, v2
1992; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
1993; GFX9-NEXT:    v_mul_f32_e32 v3, s15, v3
1994; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v3
1995; GFX9-NEXT:    v_subrev_u32_e32 v5, s9, v2
1996; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
1997; GFX9-NEXT:    s_sub_i32 s5, 0, s4
1998; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s9, v2
1999; GFX9-NEXT:    v_mul_lo_u32 v2, s5, v3
2000; GFX9-NEXT:    s_add_i32 s9, s11, s8
2001; GFX9-NEXT:    v_add_u32_e32 v5, 1, v1
2002; GFX9-NEXT:    s_xor_b32 s9, s9, s8
2003; GFX9-NEXT:    v_mul_hi_u32 v2, v3, v2
2004; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
2005; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, s9
2006; GFX9-NEXT:    s_ashr_i32 s5, s6, 31
2007; GFX9-NEXT:    s_add_i32 s6, s6, s5
2008; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
2009; GFX9-NEXT:    v_rcp_iflag_f32_e32 v3, v5
2010; GFX9-NEXT:    s_xor_b32 s6, s6, s5
2011; GFX9-NEXT:    v_mul_hi_u32 v2, s6, v2
2012; GFX9-NEXT:    v_xor_b32_e32 v0, s2, v0
2013; GFX9-NEXT:    v_mul_f32_e32 v3, s15, v3
2014; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v3
2015; GFX9-NEXT:    v_mul_lo_u32 v5, v2, s4
2016; GFX9-NEXT:    v_subrev_u32_e32 v0, s2, v0
2017; GFX9-NEXT:    s_xor_b32 s2, s13, s12
2018; GFX9-NEXT:    v_xor_b32_e32 v1, s2, v1
2019; GFX9-NEXT:    v_subrev_u32_e32 v1, s2, v1
2020; GFX9-NEXT:    s_xor_b32 s2, s5, s3
2021; GFX9-NEXT:    s_sub_i32 s3, 0, s9
2022; GFX9-NEXT:    v_mul_lo_u32 v7, s3, v3
2023; GFX9-NEXT:    v_sub_u32_e32 v5, s6, v5
2024; GFX9-NEXT:    v_add_u32_e32 v6, 1, v2
2025; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s4, v5
2026; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
2027; GFX9-NEXT:    v_subrev_u32_e32 v6, s4, v5
2028; GFX9-NEXT:    v_cndmask_b32_e32 v5, v5, v6, vcc
2029; GFX9-NEXT:    v_mul_hi_u32 v6, v3, v7
2030; GFX9-NEXT:    s_ashr_i32 s3, s7, 31
2031; GFX9-NEXT:    s_add_i32 s5, s7, s3
2032; GFX9-NEXT:    s_xor_b32 s5, s5, s3
2033; GFX9-NEXT:    v_add_u32_e32 v3, v3, v6
2034; GFX9-NEXT:    v_mul_hi_u32 v3, s5, v3
2035; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s4, v5
2036; GFX9-NEXT:    v_add_u32_e32 v6, 1, v2
2037; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
2038; GFX9-NEXT:    v_mul_lo_u32 v5, v3, s9
2039; GFX9-NEXT:    v_add_u32_e32 v6, 1, v3
2040; GFX9-NEXT:    v_xor_b32_e32 v2, s2, v2
2041; GFX9-NEXT:    v_subrev_u32_e32 v2, s2, v2
2042; GFX9-NEXT:    v_sub_u32_e32 v5, s5, v5
2043; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s9, v5
2044; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
2045; GFX9-NEXT:    v_subrev_u32_e32 v6, s9, v5
2046; GFX9-NEXT:    v_cndmask_b32_e32 v5, v5, v6, vcc
2047; GFX9-NEXT:    v_add_u32_e32 v6, 1, v3
2048; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s9, v5
2049; GFX9-NEXT:    s_xor_b32 s2, s3, s8
2050; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
2051; GFX9-NEXT:    v_xor_b32_e32 v3, s2, v3
2052; GFX9-NEXT:    v_subrev_u32_e32 v3, s2, v3
2053; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
2054; GFX9-NEXT:    s_endpgm
2055  %r = sdiv <4 x i32> %x, %y
2056  store <4 x i32> %r, <4 x i32> addrspace(1)* %out
2057  ret void
2058}
2059
; srem_v4i32: signed remainder of two <4 x i32> kernel arguments, stored to %out.
;
; CHECK lines (output of the `opt -amdgpu-codegenprepare` RUN line at the top of
; this file): the vector srem is scalarized into four per-lane expansions. Each
; lane:
;   * takes |x| and |y| with the ashr-31 / add / xor idiom,
;   * builds a reciprocal estimate of |y| from llvm.amdgcn.rcp.f32 scaled by
;     0x41EFFFFFC0000000 and refines it with one multiply-high step,
;   * multiplies |x| by that estimate (high 32 bits) to get a quotient estimate
;     and computes remainder = |x| - q * |y|,
;   * applies two conditional "subtract |y|" corrections to the remainder,
;   * restores the sign using the dividend's sign mask (xor then sub), and
;   * inserts the lane result into the output vector.
; GFX6 / GFX9 lines: the llc ISA for the tahiti and gfx900 RUN lines respectively.
;
; All assertions below are autogenerated (see the NOTE lines at the top of the
; file); regenerate them with the update scripts instead of editing by hand.
2060define amdgpu_kernel void @srem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x, <4 x i32> %y) {
2061; CHECK-LABEL: @srem_v4i32(
2062; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i32> [[X:%.*]], i64 0
2063; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i32> [[Y:%.*]], i64 0
2064; CHECK-NEXT:    [[TMP3:%.*]] = ashr i32 [[TMP1]], 31
2065; CHECK-NEXT:    [[TMP4:%.*]] = ashr i32 [[TMP2]], 31
2066; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[TMP1]], [[TMP3]]
2067; CHECK-NEXT:    [[TMP6:%.*]] = add i32 [[TMP2]], [[TMP4]]
2068; CHECK-NEXT:    [[TMP7:%.*]] = xor i32 [[TMP5]], [[TMP3]]
2069; CHECK-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP4]]
2070; CHECK-NEXT:    [[TMP9:%.*]] = uitofp i32 [[TMP8]] to float
2071; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]])
2072; CHECK-NEXT:    [[TMP11:%.*]] = fmul fast float [[TMP10]], 0x41EFFFFFC0000000
2073; CHECK-NEXT:    [[TMP12:%.*]] = fptoui float [[TMP11]] to i32
2074; CHECK-NEXT:    [[TMP13:%.*]] = sub i32 0, [[TMP8]]
2075; CHECK-NEXT:    [[TMP14:%.*]] = mul i32 [[TMP13]], [[TMP12]]
2076; CHECK-NEXT:    [[TMP15:%.*]] = zext i32 [[TMP12]] to i64
2077; CHECK-NEXT:    [[TMP16:%.*]] = zext i32 [[TMP14]] to i64
2078; CHECK-NEXT:    [[TMP17:%.*]] = mul i64 [[TMP15]], [[TMP16]]
2079; CHECK-NEXT:    [[TMP18:%.*]] = trunc i64 [[TMP17]] to i32
2080; CHECK-NEXT:    [[TMP19:%.*]] = lshr i64 [[TMP17]], 32
2081; CHECK-NEXT:    [[TMP20:%.*]] = trunc i64 [[TMP19]] to i32
2082; CHECK-NEXT:    [[TMP21:%.*]] = add i32 [[TMP12]], [[TMP20]]
2083; CHECK-NEXT:    [[TMP22:%.*]] = zext i32 [[TMP7]] to i64
2084; CHECK-NEXT:    [[TMP23:%.*]] = zext i32 [[TMP21]] to i64
2085; CHECK-NEXT:    [[TMP24:%.*]] = mul i64 [[TMP22]], [[TMP23]]
2086; CHECK-NEXT:    [[TMP25:%.*]] = trunc i64 [[TMP24]] to i32
2087; CHECK-NEXT:    [[TMP26:%.*]] = lshr i64 [[TMP24]], 32
2088; CHECK-NEXT:    [[TMP27:%.*]] = trunc i64 [[TMP26]] to i32
2089; CHECK-NEXT:    [[TMP28:%.*]] = mul i32 [[TMP27]], [[TMP8]]
2090; CHECK-NEXT:    [[TMP29:%.*]] = sub i32 [[TMP7]], [[TMP28]]
2091; CHECK-NEXT:    [[TMP30:%.*]] = icmp uge i32 [[TMP29]], [[TMP8]]
2092; CHECK-NEXT:    [[TMP31:%.*]] = sub i32 [[TMP29]], [[TMP8]]
2093; CHECK-NEXT:    [[TMP32:%.*]] = select i1 [[TMP30]], i32 [[TMP31]], i32 [[TMP29]]
2094; CHECK-NEXT:    [[TMP33:%.*]] = icmp uge i32 [[TMP32]], [[TMP8]]
2095; CHECK-NEXT:    [[TMP34:%.*]] = sub i32 [[TMP32]], [[TMP8]]
2096; CHECK-NEXT:    [[TMP35:%.*]] = select i1 [[TMP33]], i32 [[TMP34]], i32 [[TMP32]]
2097; CHECK-NEXT:    [[TMP36:%.*]] = xor i32 [[TMP35]], [[TMP3]]
2098; CHECK-NEXT:    [[TMP37:%.*]] = sub i32 [[TMP36]], [[TMP3]]
2099; CHECK-NEXT:    [[TMP38:%.*]] = insertelement <4 x i32> undef, i32 [[TMP37]], i64 0
2100; CHECK-NEXT:    [[TMP39:%.*]] = extractelement <4 x i32> [[X]], i64 1
2101; CHECK-NEXT:    [[TMP40:%.*]] = extractelement <4 x i32> [[Y]], i64 1
2102; CHECK-NEXT:    [[TMP41:%.*]] = ashr i32 [[TMP39]], 31
2103; CHECK-NEXT:    [[TMP42:%.*]] = ashr i32 [[TMP40]], 31
2104; CHECK-NEXT:    [[TMP43:%.*]] = add i32 [[TMP39]], [[TMP41]]
2105; CHECK-NEXT:    [[TMP44:%.*]] = add i32 [[TMP40]], [[TMP42]]
2106; CHECK-NEXT:    [[TMP45:%.*]] = xor i32 [[TMP43]], [[TMP41]]
2107; CHECK-NEXT:    [[TMP46:%.*]] = xor i32 [[TMP44]], [[TMP42]]
2108; CHECK-NEXT:    [[TMP47:%.*]] = uitofp i32 [[TMP46]] to float
2109; CHECK-NEXT:    [[TMP48:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP47]])
2110; CHECK-NEXT:    [[TMP49:%.*]] = fmul fast float [[TMP48]], 0x41EFFFFFC0000000
2111; CHECK-NEXT:    [[TMP50:%.*]] = fptoui float [[TMP49]] to i32
2112; CHECK-NEXT:    [[TMP51:%.*]] = sub i32 0, [[TMP46]]
2113; CHECK-NEXT:    [[TMP52:%.*]] = mul i32 [[TMP51]], [[TMP50]]
2114; CHECK-NEXT:    [[TMP53:%.*]] = zext i32 [[TMP50]] to i64
2115; CHECK-NEXT:    [[TMP54:%.*]] = zext i32 [[TMP52]] to i64
2116; CHECK-NEXT:    [[TMP55:%.*]] = mul i64 [[TMP53]], [[TMP54]]
2117; CHECK-NEXT:    [[TMP56:%.*]] = trunc i64 [[TMP55]] to i32
2118; CHECK-NEXT:    [[TMP57:%.*]] = lshr i64 [[TMP55]], 32
2119; CHECK-NEXT:    [[TMP58:%.*]] = trunc i64 [[TMP57]] to i32
2120; CHECK-NEXT:    [[TMP59:%.*]] = add i32 [[TMP50]], [[TMP58]]
2121; CHECK-NEXT:    [[TMP60:%.*]] = zext i32 [[TMP45]] to i64
2122; CHECK-NEXT:    [[TMP61:%.*]] = zext i32 [[TMP59]] to i64
2123; CHECK-NEXT:    [[TMP62:%.*]] = mul i64 [[TMP60]], [[TMP61]]
2124; CHECK-NEXT:    [[TMP63:%.*]] = trunc i64 [[TMP62]] to i32
2125; CHECK-NEXT:    [[TMP64:%.*]] = lshr i64 [[TMP62]], 32
2126; CHECK-NEXT:    [[TMP65:%.*]] = trunc i64 [[TMP64]] to i32
2127; CHECK-NEXT:    [[TMP66:%.*]] = mul i32 [[TMP65]], [[TMP46]]
2128; CHECK-NEXT:    [[TMP67:%.*]] = sub i32 [[TMP45]], [[TMP66]]
2129; CHECK-NEXT:    [[TMP68:%.*]] = icmp uge i32 [[TMP67]], [[TMP46]]
2130; CHECK-NEXT:    [[TMP69:%.*]] = sub i32 [[TMP67]], [[TMP46]]
2131; CHECK-NEXT:    [[TMP70:%.*]] = select i1 [[TMP68]], i32 [[TMP69]], i32 [[TMP67]]
2132; CHECK-NEXT:    [[TMP71:%.*]] = icmp uge i32 [[TMP70]], [[TMP46]]
2133; CHECK-NEXT:    [[TMP72:%.*]] = sub i32 [[TMP70]], [[TMP46]]
2134; CHECK-NEXT:    [[TMP73:%.*]] = select i1 [[TMP71]], i32 [[TMP72]], i32 [[TMP70]]
2135; CHECK-NEXT:    [[TMP74:%.*]] = xor i32 [[TMP73]], [[TMP41]]
2136; CHECK-NEXT:    [[TMP75:%.*]] = sub i32 [[TMP74]], [[TMP41]]
2137; CHECK-NEXT:    [[TMP76:%.*]] = insertelement <4 x i32> [[TMP38]], i32 [[TMP75]], i64 1
2138; CHECK-NEXT:    [[TMP77:%.*]] = extractelement <4 x i32> [[X]], i64 2
2139; CHECK-NEXT:    [[TMP78:%.*]] = extractelement <4 x i32> [[Y]], i64 2
2140; CHECK-NEXT:    [[TMP79:%.*]] = ashr i32 [[TMP77]], 31
2141; CHECK-NEXT:    [[TMP80:%.*]] = ashr i32 [[TMP78]], 31
2142; CHECK-NEXT:    [[TMP81:%.*]] = add i32 [[TMP77]], [[TMP79]]
2143; CHECK-NEXT:    [[TMP82:%.*]] = add i32 [[TMP78]], [[TMP80]]
2144; CHECK-NEXT:    [[TMP83:%.*]] = xor i32 [[TMP81]], [[TMP79]]
2145; CHECK-NEXT:    [[TMP84:%.*]] = xor i32 [[TMP82]], [[TMP80]]
2146; CHECK-NEXT:    [[TMP85:%.*]] = uitofp i32 [[TMP84]] to float
2147; CHECK-NEXT:    [[TMP86:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP85]])
2148; CHECK-NEXT:    [[TMP87:%.*]] = fmul fast float [[TMP86]], 0x41EFFFFFC0000000
2149; CHECK-NEXT:    [[TMP88:%.*]] = fptoui float [[TMP87]] to i32
2150; CHECK-NEXT:    [[TMP89:%.*]] = sub i32 0, [[TMP84]]
2151; CHECK-NEXT:    [[TMP90:%.*]] = mul i32 [[TMP89]], [[TMP88]]
2152; CHECK-NEXT:    [[TMP91:%.*]] = zext i32 [[TMP88]] to i64
2153; CHECK-NEXT:    [[TMP92:%.*]] = zext i32 [[TMP90]] to i64
2154; CHECK-NEXT:    [[TMP93:%.*]] = mul i64 [[TMP91]], [[TMP92]]
2155; CHECK-NEXT:    [[TMP94:%.*]] = trunc i64 [[TMP93]] to i32
2156; CHECK-NEXT:    [[TMP95:%.*]] = lshr i64 [[TMP93]], 32
2157; CHECK-NEXT:    [[TMP96:%.*]] = trunc i64 [[TMP95]] to i32
2158; CHECK-NEXT:    [[TMP97:%.*]] = add i32 [[TMP88]], [[TMP96]]
2159; CHECK-NEXT:    [[TMP98:%.*]] = zext i32 [[TMP83]] to i64
2160; CHECK-NEXT:    [[TMP99:%.*]] = zext i32 [[TMP97]] to i64
2161; CHECK-NEXT:    [[TMP100:%.*]] = mul i64 [[TMP98]], [[TMP99]]
2162; CHECK-NEXT:    [[TMP101:%.*]] = trunc i64 [[TMP100]] to i32
2163; CHECK-NEXT:    [[TMP102:%.*]] = lshr i64 [[TMP100]], 32
2164; CHECK-NEXT:    [[TMP103:%.*]] = trunc i64 [[TMP102]] to i32
2165; CHECK-NEXT:    [[TMP104:%.*]] = mul i32 [[TMP103]], [[TMP84]]
2166; CHECK-NEXT:    [[TMP105:%.*]] = sub i32 [[TMP83]], [[TMP104]]
2167; CHECK-NEXT:    [[TMP106:%.*]] = icmp uge i32 [[TMP105]], [[TMP84]]
2168; CHECK-NEXT:    [[TMP107:%.*]] = sub i32 [[TMP105]], [[TMP84]]
2169; CHECK-NEXT:    [[TMP108:%.*]] = select i1 [[TMP106]], i32 [[TMP107]], i32 [[TMP105]]
2170; CHECK-NEXT:    [[TMP109:%.*]] = icmp uge i32 [[TMP108]], [[TMP84]]
2171; CHECK-NEXT:    [[TMP110:%.*]] = sub i32 [[TMP108]], [[TMP84]]
2172; CHECK-NEXT:    [[TMP111:%.*]] = select i1 [[TMP109]], i32 [[TMP110]], i32 [[TMP108]]
2173; CHECK-NEXT:    [[TMP112:%.*]] = xor i32 [[TMP111]], [[TMP79]]
2174; CHECK-NEXT:    [[TMP113:%.*]] = sub i32 [[TMP112]], [[TMP79]]
2175; CHECK-NEXT:    [[TMP114:%.*]] = insertelement <4 x i32> [[TMP76]], i32 [[TMP113]], i64 2
2176; CHECK-NEXT:    [[TMP115:%.*]] = extractelement <4 x i32> [[X]], i64 3
2177; CHECK-NEXT:    [[TMP116:%.*]] = extractelement <4 x i32> [[Y]], i64 3
2178; CHECK-NEXT:    [[TMP117:%.*]] = ashr i32 [[TMP115]], 31
2179; CHECK-NEXT:    [[TMP118:%.*]] = ashr i32 [[TMP116]], 31
2180; CHECK-NEXT:    [[TMP119:%.*]] = add i32 [[TMP115]], [[TMP117]]
2181; CHECK-NEXT:    [[TMP120:%.*]] = add i32 [[TMP116]], [[TMP118]]
2182; CHECK-NEXT:    [[TMP121:%.*]] = xor i32 [[TMP119]], [[TMP117]]
2183; CHECK-NEXT:    [[TMP122:%.*]] = xor i32 [[TMP120]], [[TMP118]]
2184; CHECK-NEXT:    [[TMP123:%.*]] = uitofp i32 [[TMP122]] to float
2185; CHECK-NEXT:    [[TMP124:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP123]])
2186; CHECK-NEXT:    [[TMP125:%.*]] = fmul fast float [[TMP124]], 0x41EFFFFFC0000000
2187; CHECK-NEXT:    [[TMP126:%.*]] = fptoui float [[TMP125]] to i32
2188; CHECK-NEXT:    [[TMP127:%.*]] = sub i32 0, [[TMP122]]
2189; CHECK-NEXT:    [[TMP128:%.*]] = mul i32 [[TMP127]], [[TMP126]]
2190; CHECK-NEXT:    [[TMP129:%.*]] = zext i32 [[TMP126]] to i64
2191; CHECK-NEXT:    [[TMP130:%.*]] = zext i32 [[TMP128]] to i64
2192; CHECK-NEXT:    [[TMP131:%.*]] = mul i64 [[TMP129]], [[TMP130]]
2193; CHECK-NEXT:    [[TMP132:%.*]] = trunc i64 [[TMP131]] to i32
2194; CHECK-NEXT:    [[TMP133:%.*]] = lshr i64 [[TMP131]], 32
2195; CHECK-NEXT:    [[TMP134:%.*]] = trunc i64 [[TMP133]] to i32
2196; CHECK-NEXT:    [[TMP135:%.*]] = add i32 [[TMP126]], [[TMP134]]
2197; CHECK-NEXT:    [[TMP136:%.*]] = zext i32 [[TMP121]] to i64
2198; CHECK-NEXT:    [[TMP137:%.*]] = zext i32 [[TMP135]] to i64
2199; CHECK-NEXT:    [[TMP138:%.*]] = mul i64 [[TMP136]], [[TMP137]]
2200; CHECK-NEXT:    [[TMP139:%.*]] = trunc i64 [[TMP138]] to i32
2201; CHECK-NEXT:    [[TMP140:%.*]] = lshr i64 [[TMP138]], 32
2202; CHECK-NEXT:    [[TMP141:%.*]] = trunc i64 [[TMP140]] to i32
2203; CHECK-NEXT:    [[TMP142:%.*]] = mul i32 [[TMP141]], [[TMP122]]
2204; CHECK-NEXT:    [[TMP143:%.*]] = sub i32 [[TMP121]], [[TMP142]]
2205; CHECK-NEXT:    [[TMP144:%.*]] = icmp uge i32 [[TMP143]], [[TMP122]]
2206; CHECK-NEXT:    [[TMP145:%.*]] = sub i32 [[TMP143]], [[TMP122]]
2207; CHECK-NEXT:    [[TMP146:%.*]] = select i1 [[TMP144]], i32 [[TMP145]], i32 [[TMP143]]
2208; CHECK-NEXT:    [[TMP147:%.*]] = icmp uge i32 [[TMP146]], [[TMP122]]
2209; CHECK-NEXT:    [[TMP148:%.*]] = sub i32 [[TMP146]], [[TMP122]]
2210; CHECK-NEXT:    [[TMP149:%.*]] = select i1 [[TMP147]], i32 [[TMP148]], i32 [[TMP146]]
2211; CHECK-NEXT:    [[TMP150:%.*]] = xor i32 [[TMP149]], [[TMP117]]
2212; CHECK-NEXT:    [[TMP151:%.*]] = sub i32 [[TMP150]], [[TMP117]]
2213; CHECK-NEXT:    [[TMP152:%.*]] = insertelement <4 x i32> [[TMP114]], i32 [[TMP151]], i64 3
2214; CHECK-NEXT:    store <4 x i32> [[TMP152]], <4 x i32> addrspace(1)* [[OUT:%.*]], align 16
2215; CHECK-NEXT:    ret void
2216;
2217; GFX6-LABEL: srem_v4i32:
2218; GFX6:       ; %bb.0:
2219; GFX6-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0xd
2220; GFX6-NEXT:    s_mov_b32 s14, 0x4f7ffffe
2221; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
2222; GFX6-NEXT:    s_mov_b32 s3, 0xf000
2223; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2224; GFX6-NEXT:    s_ashr_i32 s2, s8, 31
2225; GFX6-NEXT:    s_add_i32 s8, s8, s2
2226; GFX6-NEXT:    s_xor_b32 s8, s8, s2
2227; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s8
2228; GFX6-NEXT:    s_ashr_i32 s12, s9, 31
2229; GFX6-NEXT:    s_add_i32 s9, s9, s12
2230; GFX6-NEXT:    s_xor_b32 s9, s9, s12
2231; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
2232; GFX6-NEXT:    s_sub_i32 s13, 0, s8
2233; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s9
2234; GFX6-NEXT:    s_ashr_i32 s12, s4, 31
2235; GFX6-NEXT:    v_mul_f32_e32 v0, s14, v0
2236; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
2237; GFX6-NEXT:    v_rcp_iflag_f32_e32 v1, v1
2238; GFX6-NEXT:    s_add_i32 s4, s4, s12
2239; GFX6-NEXT:    s_xor_b32 s4, s4, s12
2240; GFX6-NEXT:    v_mul_lo_u32 v2, s13, v0
2241; GFX6-NEXT:    v_mul_f32_e32 v1, s14, v1
2242; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
2243; GFX6-NEXT:    s_sub_i32 s13, 0, s9
2244; GFX6-NEXT:    v_mul_hi_u32 v2, v0, v2
2245; GFX6-NEXT:    s_mov_b32 s2, -1
2246; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
2247; GFX6-NEXT:    v_mul_hi_u32 v0, s4, v0
2248; GFX6-NEXT:    v_mul_lo_u32 v2, s13, v1
2249; GFX6-NEXT:    s_ashr_i32 s13, s5, 31
2250; GFX6-NEXT:    s_add_i32 s5, s5, s13
2251; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s8
2252; GFX6-NEXT:    v_mul_hi_u32 v2, v1, v2
2253; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
2254; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s8, v0
2255; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s8, v0
2256; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
2257; GFX6-NEXT:    s_xor_b32 s4, s5, s13
2258; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
2259; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s8, v0
2260; GFX6-NEXT:    s_ashr_i32 s5, s10, 31
2261; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s8, v0
2262; GFX6-NEXT:    s_add_i32 s8, s10, s5
2263; GFX6-NEXT:    s_xor_b32 s5, s8, s5
2264; GFX6-NEXT:    v_cvt_f32_u32_e32 v2, s5
2265; GFX6-NEXT:    v_mul_hi_u32 v1, s4, v1
2266; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
2267; GFX6-NEXT:    v_xor_b32_e32 v0, s12, v0
2268; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v2
2269; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s9
2270; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s12, v0
2271; GFX6-NEXT:    s_ashr_i32 s8, s11, 31
2272; GFX6-NEXT:    v_mul_f32_e32 v2, s14, v2
2273; GFX6-NEXT:    v_cvt_u32_f32_e32 v2, v2
2274; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s4, v1
2275; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s9, v1
2276; GFX6-NEXT:    s_sub_i32 s4, 0, s5
2277; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s9, v1
2278; GFX6-NEXT:    v_mul_lo_u32 v4, s4, v2
2279; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
2280; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s9, v1
2281; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s9, v1
2282; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
2283; GFX6-NEXT:    v_mul_hi_u32 v3, v2, v4
2284; GFX6-NEXT:    s_add_i32 s9, s11, s8
2285; GFX6-NEXT:    s_ashr_i32 s4, s6, 31
2286; GFX6-NEXT:    s_xor_b32 s8, s9, s8
2287; GFX6-NEXT:    s_add_i32 s6, s6, s4
2288; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
2289; GFX6-NEXT:    v_cvt_f32_u32_e32 v3, s8
2290; GFX6-NEXT:    s_xor_b32 s6, s6, s4
2291; GFX6-NEXT:    v_mul_hi_u32 v2, s6, v2
2292; GFX6-NEXT:    v_xor_b32_e32 v1, s13, v1
2293; GFX6-NEXT:    v_rcp_iflag_f32_e32 v3, v3
2294; GFX6-NEXT:    v_subrev_i32_e32 v1, vcc, s13, v1
2295; GFX6-NEXT:    v_mul_lo_u32 v2, v2, s5
2296; GFX6-NEXT:    v_mul_f32_e32 v3, s14, v3
2297; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v3
2298; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s6, v2
2299; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s5, v2
2300; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s5, v2
2301; GFX6-NEXT:    s_sub_i32 s6, 0, s8
2302; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
2303; GFX6-NEXT:    v_mul_lo_u32 v4, s6, v3
2304; GFX6-NEXT:    s_ashr_i32 s6, s7, 31
2305; GFX6-NEXT:    s_add_i32 s7, s7, s6
2306; GFX6-NEXT:    s_xor_b32 s7, s7, s6
2307; GFX6-NEXT:    v_mul_hi_u32 v4, v3, v4
2308; GFX6-NEXT:    v_subrev_i32_e32 v5, vcc, s5, v2
2309; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
2310; GFX6-NEXT:    v_mul_hi_u32 v3, s7, v3
2311; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s5, v2
2312; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
2313; GFX6-NEXT:    v_xor_b32_e32 v2, s4, v2
2314; GFX6-NEXT:    v_mul_lo_u32 v3, v3, s8
2315; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s4, v2
2316; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s7, v3
2317; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s8, v3
2318; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s8, v3
2319; GFX6-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
2320; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s8, v3
2321; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s8, v3
2322; GFX6-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
2323; GFX6-NEXT:    v_xor_b32_e32 v3, s6, v3
2324; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s6, v3
2325; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
2326; GFX6-NEXT:    s_endpgm
2327;
2328; GFX9-LABEL: srem_v4i32:
2329; GFX9:       ; %bb.0:
2330; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x34
2331; GFX9-NEXT:    s_mov_b32 s13, 0x4f7ffffe
2332; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2333; GFX9-NEXT:    v_mov_b32_e32 v4, 0
2334; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2335; GFX9-NEXT:    s_ashr_i32 s2, s8, 31
2336; GFX9-NEXT:    s_add_i32 s8, s8, s2
2337; GFX9-NEXT:    s_xor_b32 s2, s8, s2
2338; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s2
2339; GFX9-NEXT:    s_ashr_i32 s3, s9, 31
2340; GFX9-NEXT:    s_sub_i32 s12, 0, s2
2341; GFX9-NEXT:    s_add_i32 s8, s9, s3
2342; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
2343; GFX9-NEXT:    s_xor_b32 s3, s8, s3
2344; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s3
2345; GFX9-NEXT:    s_ashr_i32 s8, s4, 31
2346; GFX9-NEXT:    v_mul_f32_e32 v0, s13, v0
2347; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
2348; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v1
2349; GFX9-NEXT:    s_add_i32 s4, s4, s8
2350; GFX9-NEXT:    s_xor_b32 s4, s4, s8
2351; GFX9-NEXT:    v_mul_lo_u32 v2, s12, v0
2352; GFX9-NEXT:    v_mul_f32_e32 v1, s13, v1
2353; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
2354; GFX9-NEXT:    s_sub_i32 s12, 0, s3
2355; GFX9-NEXT:    v_mul_hi_u32 v2, v0, v2
2356; GFX9-NEXT:    s_ashr_i32 s9, s5, 31
2357; GFX9-NEXT:    v_mul_lo_u32 v3, s12, v1
2358; GFX9-NEXT:    s_add_i32 s5, s5, s9
2359; GFX9-NEXT:    v_add_u32_e32 v0, v0, v2
2360; GFX9-NEXT:    v_mul_hi_u32 v0, s4, v0
2361; GFX9-NEXT:    v_mul_hi_u32 v3, v1, v3
2362; GFX9-NEXT:    s_xor_b32 s5, s5, s9
2363; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s2
2364; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
2365; GFX9-NEXT:    v_mul_hi_u32 v1, s5, v1
2366; GFX9-NEXT:    v_sub_u32_e32 v0, s4, v0
2367; GFX9-NEXT:    v_subrev_u32_e32 v2, s2, v0
2368; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s2, v0
2369; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
2370; GFX9-NEXT:    v_subrev_u32_e32 v2, s2, v0
2371; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s2, v0
2372; GFX9-NEXT:    s_ashr_i32 s2, s10, 31
2373; GFX9-NEXT:    s_add_i32 s4, s10, s2
2374; GFX9-NEXT:    s_xor_b32 s2, s4, s2
2375; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
2376; GFX9-NEXT:    v_cvt_f32_u32_e32 v2, s2
2377; GFX9-NEXT:    v_mul_lo_u32 v1, v1, s3
2378; GFX9-NEXT:    v_xor_b32_e32 v0, s8, v0
2379; GFX9-NEXT:    v_subrev_u32_e32 v0, s8, v0
2380; GFX9-NEXT:    v_rcp_iflag_f32_e32 v2, v2
2381; GFX9-NEXT:    v_sub_u32_e32 v1, s5, v1
2382; GFX9-NEXT:    v_subrev_u32_e32 v3, s3, v1
2383; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v1
2384; GFX9-NEXT:    v_mul_f32_e32 v2, s13, v2
2385; GFX9-NEXT:    v_cvt_u32_f32_e32 v2, v2
2386; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
2387; GFX9-NEXT:    v_subrev_u32_e32 v3, s3, v1
2388; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v1
2389; GFX9-NEXT:    s_sub_i32 s3, 0, s2
2390; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
2391; GFX9-NEXT:    v_mul_lo_u32 v3, s3, v2
2392; GFX9-NEXT:    s_ashr_i32 s3, s11, 31
2393; GFX9-NEXT:    s_add_i32 s4, s11, s3
2394; GFX9-NEXT:    s_xor_b32 s3, s4, s3
2395; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, s3
2396; GFX9-NEXT:    v_mul_hi_u32 v3, v2, v3
2397; GFX9-NEXT:    s_ashr_i32 s4, s6, 31
2398; GFX9-NEXT:    s_add_i32 s5, s6, s4
2399; GFX9-NEXT:    v_rcp_iflag_f32_e32 v5, v5
2400; GFX9-NEXT:    v_add_u32_e32 v2, v2, v3
2401; GFX9-NEXT:    s_xor_b32 s5, s5, s4
2402; GFX9-NEXT:    v_mul_hi_u32 v2, s5, v2
2403; GFX9-NEXT:    v_mul_f32_e32 v3, s13, v5
2404; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v3
2405; GFX9-NEXT:    s_sub_i32 s6, 0, s3
2406; GFX9-NEXT:    v_mul_lo_u32 v2, v2, s2
2407; GFX9-NEXT:    v_xor_b32_e32 v1, s9, v1
2408; GFX9-NEXT:    v_mul_lo_u32 v5, s6, v3
2409; GFX9-NEXT:    v_subrev_u32_e32 v1, s9, v1
2410; GFX9-NEXT:    v_sub_u32_e32 v2, s5, v2
2411; GFX9-NEXT:    s_ashr_i32 s5, s7, 31
2412; GFX9-NEXT:    v_mul_hi_u32 v5, v3, v5
2413; GFX9-NEXT:    s_add_i32 s6, s7, s5
2414; GFX9-NEXT:    s_xor_b32 s6, s6, s5
2415; GFX9-NEXT:    v_subrev_u32_e32 v6, s2, v2
2416; GFX9-NEXT:    v_add_u32_e32 v3, v3, v5
2417; GFX9-NEXT:    v_mul_hi_u32 v3, s6, v3
2418; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s2, v2
2419; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
2420; GFX9-NEXT:    v_subrev_u32_e32 v5, s2, v2
2421; GFX9-NEXT:    v_mul_lo_u32 v3, v3, s3
2422; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s2, v2
2423; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
2424; GFX9-NEXT:    v_xor_b32_e32 v2, s4, v2
2425; GFX9-NEXT:    v_sub_u32_e32 v3, s6, v3
2426; GFX9-NEXT:    v_subrev_u32_e32 v5, s3, v3
2427; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v3
2428; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
2429; GFX9-NEXT:    v_subrev_u32_e32 v5, s3, v3
2430; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v3
2431; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
2432; GFX9-NEXT:    v_xor_b32_e32 v3, s5, v3
2433; GFX9-NEXT:    v_subrev_u32_e32 v2, s4, v2
2434; GFX9-NEXT:    v_subrev_u32_e32 v3, s5, v3
2435; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
2436; GFX9-NEXT:    s_endpgm
2437  %r = srem <4 x i32> %x, %y
2438  store <4 x i32> %r, <4 x i32> addrspace(1)* %out
2439  ret void
2440}
2441
; Test kernel for unsigned division of two <4 x i16> kernel arguments. The
; quotient vector is stored through %out.
;
; The assertion lines in this function are autogenerated (see the NOTE lines at
; the top of the file) and come in three groups, one per RUN line.
;   * opt with -amdgpu-codegenprepare - the vector udiv is expanded lane by
;     lane. Each lane is extracted, zero-extended to i32 and converted to
;     float. The quotient is estimated as trunc(x * rcp(y)) and bumped by one
;     when |fmad.ftz(-q, y, x)| is >= |y|. The sum is masked with 65535,
;     truncated back to i16 and inserted into the result vector.
;   * llc for tahiti (GFX6 prefix) - final ISA, stored with
;     buffer_store_dwordx2.
;   * llc for gfx900 (GFX9 prefix) - final ISA, stored with
;     global_store_dwordx2.
2442define amdgpu_kernel void @udiv_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x, <4 x i16> %y) {
2443; CHECK-LABEL: @udiv_v4i16(
2444; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i16> [[X:%.*]], i64 0
2445; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i16> [[Y:%.*]], i64 0
2446; CHECK-NEXT:    [[TMP3:%.*]] = zext i16 [[TMP1]] to i32
2447; CHECK-NEXT:    [[TMP4:%.*]] = zext i16 [[TMP2]] to i32
2448; CHECK-NEXT:    [[TMP5:%.*]] = uitofp i32 [[TMP3]] to float
2449; CHECK-NEXT:    [[TMP6:%.*]] = uitofp i32 [[TMP4]] to float
2450; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP6]])
2451; CHECK-NEXT:    [[TMP8:%.*]] = fmul fast float [[TMP5]], [[TMP7]]
2452; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.trunc.f32(float [[TMP8]])
2453; CHECK-NEXT:    [[TMP10:%.*]] = fneg fast float [[TMP9]]
2454; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP10]], float [[TMP6]], float [[TMP5]])
2455; CHECK-NEXT:    [[TMP12:%.*]] = fptoui float [[TMP9]] to i32
2456; CHECK-NEXT:    [[TMP13:%.*]] = call fast float @llvm.fabs.f32(float [[TMP11]])
2457; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP6]])
2458; CHECK-NEXT:    [[TMP15:%.*]] = fcmp fast oge float [[TMP13]], [[TMP14]]
2459; CHECK-NEXT:    [[TMP16:%.*]] = select i1 [[TMP15]], i32 1, i32 0
2460; CHECK-NEXT:    [[TMP17:%.*]] = add i32 [[TMP12]], [[TMP16]]
2461; CHECK-NEXT:    [[TMP18:%.*]] = and i32 [[TMP17]], 65535
2462; CHECK-NEXT:    [[TMP19:%.*]] = trunc i32 [[TMP18]] to i16
2463; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <4 x i16> undef, i16 [[TMP19]], i64 0
2464; CHECK-NEXT:    [[TMP21:%.*]] = extractelement <4 x i16> [[X]], i64 1
2465; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <4 x i16> [[Y]], i64 1
2466; CHECK-NEXT:    [[TMP23:%.*]] = zext i16 [[TMP21]] to i32
2467; CHECK-NEXT:    [[TMP24:%.*]] = zext i16 [[TMP22]] to i32
2468; CHECK-NEXT:    [[TMP25:%.*]] = uitofp i32 [[TMP23]] to float
2469; CHECK-NEXT:    [[TMP26:%.*]] = uitofp i32 [[TMP24]] to float
2470; CHECK-NEXT:    [[TMP27:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP26]])
2471; CHECK-NEXT:    [[TMP28:%.*]] = fmul fast float [[TMP25]], [[TMP27]]
2472; CHECK-NEXT:    [[TMP29:%.*]] = call fast float @llvm.trunc.f32(float [[TMP28]])
2473; CHECK-NEXT:    [[TMP30:%.*]] = fneg fast float [[TMP29]]
2474; CHECK-NEXT:    [[TMP31:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP30]], float [[TMP26]], float [[TMP25]])
2475; CHECK-NEXT:    [[TMP32:%.*]] = fptoui float [[TMP29]] to i32
2476; CHECK-NEXT:    [[TMP33:%.*]] = call fast float @llvm.fabs.f32(float [[TMP31]])
2477; CHECK-NEXT:    [[TMP34:%.*]] = call fast float @llvm.fabs.f32(float [[TMP26]])
2478; CHECK-NEXT:    [[TMP35:%.*]] = fcmp fast oge float [[TMP33]], [[TMP34]]
2479; CHECK-NEXT:    [[TMP36:%.*]] = select i1 [[TMP35]], i32 1, i32 0
2480; CHECK-NEXT:    [[TMP37:%.*]] = add i32 [[TMP32]], [[TMP36]]
2481; CHECK-NEXT:    [[TMP38:%.*]] = and i32 [[TMP37]], 65535
2482; CHECK-NEXT:    [[TMP39:%.*]] = trunc i32 [[TMP38]] to i16
2483; CHECK-NEXT:    [[TMP40:%.*]] = insertelement <4 x i16> [[TMP20]], i16 [[TMP39]], i64 1
2484; CHECK-NEXT:    [[TMP41:%.*]] = extractelement <4 x i16> [[X]], i64 2
2485; CHECK-NEXT:    [[TMP42:%.*]] = extractelement <4 x i16> [[Y]], i64 2
2486; CHECK-NEXT:    [[TMP43:%.*]] = zext i16 [[TMP41]] to i32
2487; CHECK-NEXT:    [[TMP44:%.*]] = zext i16 [[TMP42]] to i32
2488; CHECK-NEXT:    [[TMP45:%.*]] = uitofp i32 [[TMP43]] to float
2489; CHECK-NEXT:    [[TMP46:%.*]] = uitofp i32 [[TMP44]] to float
2490; CHECK-NEXT:    [[TMP47:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP46]])
2491; CHECK-NEXT:    [[TMP48:%.*]] = fmul fast float [[TMP45]], [[TMP47]]
2492; CHECK-NEXT:    [[TMP49:%.*]] = call fast float @llvm.trunc.f32(float [[TMP48]])
2493; CHECK-NEXT:    [[TMP50:%.*]] = fneg fast float [[TMP49]]
2494; CHECK-NEXT:    [[TMP51:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP50]], float [[TMP46]], float [[TMP45]])
2495; CHECK-NEXT:    [[TMP52:%.*]] = fptoui float [[TMP49]] to i32
2496; CHECK-NEXT:    [[TMP53:%.*]] = call fast float @llvm.fabs.f32(float [[TMP51]])
2497; CHECK-NEXT:    [[TMP54:%.*]] = call fast float @llvm.fabs.f32(float [[TMP46]])
2498; CHECK-NEXT:    [[TMP55:%.*]] = fcmp fast oge float [[TMP53]], [[TMP54]]
2499; CHECK-NEXT:    [[TMP56:%.*]] = select i1 [[TMP55]], i32 1, i32 0
2500; CHECK-NEXT:    [[TMP57:%.*]] = add i32 [[TMP52]], [[TMP56]]
2501; CHECK-NEXT:    [[TMP58:%.*]] = and i32 [[TMP57]], 65535
2502; CHECK-NEXT:    [[TMP59:%.*]] = trunc i32 [[TMP58]] to i16
2503; CHECK-NEXT:    [[TMP60:%.*]] = insertelement <4 x i16> [[TMP40]], i16 [[TMP59]], i64 2
2504; CHECK-NEXT:    [[TMP61:%.*]] = extractelement <4 x i16> [[X]], i64 3
2505; CHECK-NEXT:    [[TMP62:%.*]] = extractelement <4 x i16> [[Y]], i64 3
2506; CHECK-NEXT:    [[TMP63:%.*]] = zext i16 [[TMP61]] to i32
2507; CHECK-NEXT:    [[TMP64:%.*]] = zext i16 [[TMP62]] to i32
2508; CHECK-NEXT:    [[TMP65:%.*]] = uitofp i32 [[TMP63]] to float
2509; CHECK-NEXT:    [[TMP66:%.*]] = uitofp i32 [[TMP64]] to float
2510; CHECK-NEXT:    [[TMP67:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP66]])
2511; CHECK-NEXT:    [[TMP68:%.*]] = fmul fast float [[TMP65]], [[TMP67]]
2512; CHECK-NEXT:    [[TMP69:%.*]] = call fast float @llvm.trunc.f32(float [[TMP68]])
2513; CHECK-NEXT:    [[TMP70:%.*]] = fneg fast float [[TMP69]]
2514; CHECK-NEXT:    [[TMP71:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP70]], float [[TMP66]], float [[TMP65]])
2515; CHECK-NEXT:    [[TMP72:%.*]] = fptoui float [[TMP69]] to i32
2516; CHECK-NEXT:    [[TMP73:%.*]] = call fast float @llvm.fabs.f32(float [[TMP71]])
2517; CHECK-NEXT:    [[TMP74:%.*]] = call fast float @llvm.fabs.f32(float [[TMP66]])
2518; CHECK-NEXT:    [[TMP75:%.*]] = fcmp fast oge float [[TMP73]], [[TMP74]]
2519; CHECK-NEXT:    [[TMP76:%.*]] = select i1 [[TMP75]], i32 1, i32 0
2520; CHECK-NEXT:    [[TMP77:%.*]] = add i32 [[TMP72]], [[TMP76]]
2521; CHECK-NEXT:    [[TMP78:%.*]] = and i32 [[TMP77]], 65535
2522; CHECK-NEXT:    [[TMP79:%.*]] = trunc i32 [[TMP78]] to i16
2523; CHECK-NEXT:    [[TMP80:%.*]] = insertelement <4 x i16> [[TMP60]], i16 [[TMP79]], i64 3
2524; CHECK-NEXT:    store <4 x i16> [[TMP80]], <4 x i16> addrspace(1)* [[OUT:%.*]], align 8
2525; CHECK-NEXT:    ret void
2526;
2527; GFX6-LABEL: udiv_v4i16:
2528; GFX6:       ; %bb.0:
2529; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
2530; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xb
2531; GFX6-NEXT:    s_mov_b32 s8, 0xffff
2532; GFX6-NEXT:    s_mov_b32 s7, 0xf000
2533; GFX6-NEXT:    s_mov_b32 s6, -1
2534; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2535; GFX6-NEXT:    s_and_b32 s9, s2, s8
2536; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s9
2537; GFX6-NEXT:    s_lshr_b32 s9, s0, 16
2538; GFX6-NEXT:    s_and_b32 s0, s0, s8
2539; GFX6-NEXT:    s_lshr_b32 s2, s2, 16
2540; GFX6-NEXT:    v_cvt_f32_u32_e32 v3, s2
2541; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s0
2542; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v0
2543; GFX6-NEXT:    v_cvt_f32_u32_e32 v4, s9
2544; GFX6-NEXT:    v_rcp_iflag_f32_e32 v5, v3
2545; GFX6-NEXT:    s_and_b32 s2, s3, s8
2546; GFX6-NEXT:    v_mul_f32_e32 v2, v1, v2
2547; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
2548; GFX6-NEXT:    v_mad_f32 v1, -v2, v0, v1
2549; GFX6-NEXT:    v_cvt_u32_f32_e32 v2, v2
2550; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
2551; GFX6-NEXT:    v_mul_f32_e32 v1, v4, v5
2552; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
2553; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, 0, v2, vcc
2554; GFX6-NEXT:    v_mad_f32 v2, -v1, v3, v4
2555; GFX6-NEXT:    v_cvt_f32_u32_e32 v4, s2
2556; GFX6-NEXT:    s_lshr_b32 s0, s1, 16
2557; GFX6-NEXT:    s_and_b32 s1, s1, s8
2558; GFX6-NEXT:    s_lshr_b32 s10, s3, 16
2559; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v3
2560; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
2561; GFX6-NEXT:    v_cvt_f32_u32_e32 v3, s10
2562; GFX6-NEXT:    v_cvt_f32_u32_e32 v5, s1
2563; GFX6-NEXT:    v_rcp_iflag_f32_e32 v6, v4
2564; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, 0, v1, vcc
2565; GFX6-NEXT:    v_rcp_iflag_f32_e32 v7, v3
2566; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
2567; GFX6-NEXT:    v_mul_f32_e32 v1, v5, v6
2568; GFX6-NEXT:    v_cvt_f32_u32_e32 v6, s0
2569; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
2570; GFX6-NEXT:    v_mad_f32 v5, -v1, v4, v5
2571; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v5|, v4
2572; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
2573; GFX6-NEXT:    v_mul_f32_e32 v4, v6, v7
2574; GFX6-NEXT:    v_trunc_f32_e32 v4, v4
2575; GFX6-NEXT:    v_cvt_u32_f32_e32 v5, v4
2576; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2577; GFX6-NEXT:    v_mad_f32 v4, -v4, v3, v6
2578; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v4|, v3
2579; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
2580; GFX6-NEXT:    v_and_b32_e32 v0, s8, v0
2581; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
2582; GFX6-NEXT:    v_and_b32_e32 v1, s8, v1
2583; GFX6-NEXT:    v_or_b32_e32 v1, v1, v3
2584; GFX6-NEXT:    v_or_b32_e32 v0, v0, v2
2585; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
2586; GFX6-NEXT:    s_endpgm
2587;
2588; GFX9-LABEL: udiv_v4i16:
2589; GFX9:       ; %bb.0:
2590; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
2591; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
2592; GFX9-NEXT:    s_mov_b32 s8, 0xffff
2593; GFX9-NEXT:    v_mov_b32_e32 v2, 0
2594; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2595; GFX9-NEXT:    s_and_b32 s1, s6, s8
2596; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s1
2597; GFX9-NEXT:    s_lshr_b32 s0, s4, 16
2598; GFX9-NEXT:    s_and_b32 s4, s4, s8
2599; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s4
2600; GFX9-NEXT:    s_lshr_b32 s4, s6, 16
2601; GFX9-NEXT:    v_rcp_iflag_f32_e32 v3, v0
2602; GFX9-NEXT:    v_cvt_f32_u32_e32 v4, s4
2603; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, s0
2604; GFX9-NEXT:    s_and_b32 s0, s7, s8
2605; GFX9-NEXT:    v_mul_f32_e32 v3, v1, v3
2606; GFX9-NEXT:    v_rcp_iflag_f32_e32 v6, v4
2607; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
2608; GFX9-NEXT:    v_mad_f32 v1, -v3, v0, v1
2609; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v3
2610; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
2611; GFX9-NEXT:    v_mul_f32_e32 v1, v5, v6
2612; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
2613; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v3, vcc
2614; GFX9-NEXT:    v_mad_f32 v3, -v1, v4, v5
2615; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, s0
2616; GFX9-NEXT:    s_and_b32 s0, s5, s8
2617; GFX9-NEXT:    s_lshr_b32 s6, s7, 16
2618; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v4
2619; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
2620; GFX9-NEXT:    v_cvt_f32_u32_e32 v4, s6
2621; GFX9-NEXT:    v_cvt_f32_u32_e32 v6, s0
2622; GFX9-NEXT:    v_rcp_iflag_f32_e32 v7, v5
2623; GFX9-NEXT:    s_lshr_b32 s1, s5, 16
2624; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
2625; GFX9-NEXT:    v_rcp_iflag_f32_e32 v8, v4
2626; GFX9-NEXT:    v_mul_f32_e32 v1, v6, v7
2627; GFX9-NEXT:    v_cvt_f32_u32_e32 v7, s1
2628; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
2629; GFX9-NEXT:    v_mad_f32 v6, -v1, v5, v6
2630; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v6|, v5
2631; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
2632; GFX9-NEXT:    v_mul_f32_e32 v5, v7, v8
2633; GFX9-NEXT:    v_trunc_f32_e32 v5, v5
2634; GFX9-NEXT:    v_cvt_u32_f32_e32 v6, v5
2635; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
2636; GFX9-NEXT:    v_mad_f32 v5, -v5, v4, v7
2637; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v5|, v4
2638; GFX9-NEXT:    v_mov_b32_e32 v5, 0xffff
2639; GFX9-NEXT:    v_and_b32_e32 v0, v5, v0
2640; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v6, vcc
2641; GFX9-NEXT:    v_and_b32_e32 v1, v5, v1
2642; GFX9-NEXT:    v_lshl_or_b32 v1, v4, 16, v1
2643; GFX9-NEXT:    v_lshl_or_b32 v0, v3, 16, v0
2644; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
2645; GFX9-NEXT:    s_endpgm
  ; Input IR under test - a single vector udiv whose result is stored to %out.
2646  %r = udiv <4 x i16> %x, %y
2647  store <4 x i16> %r, <4 x i16> addrspace(1)* %out
2648  ret void
2649}
2650
; Test kernel for unsigned remainder of two <4 x i16> kernel arguments. The
; remainder vector is stored through %out.
;
; The assertion lines in this function are autogenerated (see the NOTE lines at
; the top of the file) and come in three groups, one per RUN line.
;   * opt with -amdgpu-codegenprepare - the vector urem is expanded lane by
;     lane. Each lane is extracted, zero-extended to i32 and converted to
;     float. The quotient is estimated as trunc(x * rcp(y)) and bumped by one
;     when |fmad.ftz(-q, y, x)| is >= |y|. The remainder is then formed in
;     integer arithmetic as x - q * y, masked with 65535, truncated back to
;     i16 and inserted into the result vector.
;   * llc for tahiti (GFX6 prefix) - final ISA, using v_mul_lo_u32 and
;     v_sub_i32 for the remainder, stored with buffer_store_dwordx2.
;   * llc for gfx900 (GFX9 prefix) - final ISA, using v_mul_lo_u32 and
;     v_sub_u32 for the remainder, stored with global_store_dwordx2.
2651define amdgpu_kernel void @urem_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x, <4 x i16> %y) {
2652; CHECK-LABEL: @urem_v4i16(
2653; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i16> [[X:%.*]], i64 0
2654; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i16> [[Y:%.*]], i64 0
2655; CHECK-NEXT:    [[TMP3:%.*]] = zext i16 [[TMP1]] to i32
2656; CHECK-NEXT:    [[TMP4:%.*]] = zext i16 [[TMP2]] to i32
2657; CHECK-NEXT:    [[TMP5:%.*]] = uitofp i32 [[TMP3]] to float
2658; CHECK-NEXT:    [[TMP6:%.*]] = uitofp i32 [[TMP4]] to float
2659; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP6]])
2660; CHECK-NEXT:    [[TMP8:%.*]] = fmul fast float [[TMP5]], [[TMP7]]
2661; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.trunc.f32(float [[TMP8]])
2662; CHECK-NEXT:    [[TMP10:%.*]] = fneg fast float [[TMP9]]
2663; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP10]], float [[TMP6]], float [[TMP5]])
2664; CHECK-NEXT:    [[TMP12:%.*]] = fptoui float [[TMP9]] to i32
2665; CHECK-NEXT:    [[TMP13:%.*]] = call fast float @llvm.fabs.f32(float [[TMP11]])
2666; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP6]])
2667; CHECK-NEXT:    [[TMP15:%.*]] = fcmp fast oge float [[TMP13]], [[TMP14]]
2668; CHECK-NEXT:    [[TMP16:%.*]] = select i1 [[TMP15]], i32 1, i32 0
2669; CHECK-NEXT:    [[TMP17:%.*]] = add i32 [[TMP12]], [[TMP16]]
2670; CHECK-NEXT:    [[TMP18:%.*]] = mul i32 [[TMP17]], [[TMP4]]
2671; CHECK-NEXT:    [[TMP19:%.*]] = sub i32 [[TMP3]], [[TMP18]]
2672; CHECK-NEXT:    [[TMP20:%.*]] = and i32 [[TMP19]], 65535
2673; CHECK-NEXT:    [[TMP21:%.*]] = trunc i32 [[TMP20]] to i16
2674; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <4 x i16> undef, i16 [[TMP21]], i64 0
2675; CHECK-NEXT:    [[TMP23:%.*]] = extractelement <4 x i16> [[X]], i64 1
2676; CHECK-NEXT:    [[TMP24:%.*]] = extractelement <4 x i16> [[Y]], i64 1
2677; CHECK-NEXT:    [[TMP25:%.*]] = zext i16 [[TMP23]] to i32
2678; CHECK-NEXT:    [[TMP26:%.*]] = zext i16 [[TMP24]] to i32
2679; CHECK-NEXT:    [[TMP27:%.*]] = uitofp i32 [[TMP25]] to float
2680; CHECK-NEXT:    [[TMP28:%.*]] = uitofp i32 [[TMP26]] to float
2681; CHECK-NEXT:    [[TMP29:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP28]])
2682; CHECK-NEXT:    [[TMP30:%.*]] = fmul fast float [[TMP27]], [[TMP29]]
2683; CHECK-NEXT:    [[TMP31:%.*]] = call fast float @llvm.trunc.f32(float [[TMP30]])
2684; CHECK-NEXT:    [[TMP32:%.*]] = fneg fast float [[TMP31]]
2685; CHECK-NEXT:    [[TMP33:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP32]], float [[TMP28]], float [[TMP27]])
2686; CHECK-NEXT:    [[TMP34:%.*]] = fptoui float [[TMP31]] to i32
2687; CHECK-NEXT:    [[TMP35:%.*]] = call fast float @llvm.fabs.f32(float [[TMP33]])
2688; CHECK-NEXT:    [[TMP36:%.*]] = call fast float @llvm.fabs.f32(float [[TMP28]])
2689; CHECK-NEXT:    [[TMP37:%.*]] = fcmp fast oge float [[TMP35]], [[TMP36]]
2690; CHECK-NEXT:    [[TMP38:%.*]] = select i1 [[TMP37]], i32 1, i32 0
2691; CHECK-NEXT:    [[TMP39:%.*]] = add i32 [[TMP34]], [[TMP38]]
2692; CHECK-NEXT:    [[TMP40:%.*]] = mul i32 [[TMP39]], [[TMP26]]
2693; CHECK-NEXT:    [[TMP41:%.*]] = sub i32 [[TMP25]], [[TMP40]]
2694; CHECK-NEXT:    [[TMP42:%.*]] = and i32 [[TMP41]], 65535
2695; CHECK-NEXT:    [[TMP43:%.*]] = trunc i32 [[TMP42]] to i16
2696; CHECK-NEXT:    [[TMP44:%.*]] = insertelement <4 x i16> [[TMP22]], i16 [[TMP43]], i64 1
2697; CHECK-NEXT:    [[TMP45:%.*]] = extractelement <4 x i16> [[X]], i64 2
2698; CHECK-NEXT:    [[TMP46:%.*]] = extractelement <4 x i16> [[Y]], i64 2
2699; CHECK-NEXT:    [[TMP47:%.*]] = zext i16 [[TMP45]] to i32
2700; CHECK-NEXT:    [[TMP48:%.*]] = zext i16 [[TMP46]] to i32
2701; CHECK-NEXT:    [[TMP49:%.*]] = uitofp i32 [[TMP47]] to float
2702; CHECK-NEXT:    [[TMP50:%.*]] = uitofp i32 [[TMP48]] to float
2703; CHECK-NEXT:    [[TMP51:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP50]])
2704; CHECK-NEXT:    [[TMP52:%.*]] = fmul fast float [[TMP49]], [[TMP51]]
2705; CHECK-NEXT:    [[TMP53:%.*]] = call fast float @llvm.trunc.f32(float [[TMP52]])
2706; CHECK-NEXT:    [[TMP54:%.*]] = fneg fast float [[TMP53]]
2707; CHECK-NEXT:    [[TMP55:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP54]], float [[TMP50]], float [[TMP49]])
2708; CHECK-NEXT:    [[TMP56:%.*]] = fptoui float [[TMP53]] to i32
2709; CHECK-NEXT:    [[TMP57:%.*]] = call fast float @llvm.fabs.f32(float [[TMP55]])
2710; CHECK-NEXT:    [[TMP58:%.*]] = call fast float @llvm.fabs.f32(float [[TMP50]])
2711; CHECK-NEXT:    [[TMP59:%.*]] = fcmp fast oge float [[TMP57]], [[TMP58]]
2712; CHECK-NEXT:    [[TMP60:%.*]] = select i1 [[TMP59]], i32 1, i32 0
2713; CHECK-NEXT:    [[TMP61:%.*]] = add i32 [[TMP56]], [[TMP60]]
2714; CHECK-NEXT:    [[TMP62:%.*]] = mul i32 [[TMP61]], [[TMP48]]
2715; CHECK-NEXT:    [[TMP63:%.*]] = sub i32 [[TMP47]], [[TMP62]]
2716; CHECK-NEXT:    [[TMP64:%.*]] = and i32 [[TMP63]], 65535
2717; CHECK-NEXT:    [[TMP65:%.*]] = trunc i32 [[TMP64]] to i16
2718; CHECK-NEXT:    [[TMP66:%.*]] = insertelement <4 x i16> [[TMP44]], i16 [[TMP65]], i64 2
2719; CHECK-NEXT:    [[TMP67:%.*]] = extractelement <4 x i16> [[X]], i64 3
2720; CHECK-NEXT:    [[TMP68:%.*]] = extractelement <4 x i16> [[Y]], i64 3
2721; CHECK-NEXT:    [[TMP69:%.*]] = zext i16 [[TMP67]] to i32
2722; CHECK-NEXT:    [[TMP70:%.*]] = zext i16 [[TMP68]] to i32
2723; CHECK-NEXT:    [[TMP71:%.*]] = uitofp i32 [[TMP69]] to float
2724; CHECK-NEXT:    [[TMP72:%.*]] = uitofp i32 [[TMP70]] to float
2725; CHECK-NEXT:    [[TMP73:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP72]])
2726; CHECK-NEXT:    [[TMP74:%.*]] = fmul fast float [[TMP71]], [[TMP73]]
2727; CHECK-NEXT:    [[TMP75:%.*]] = call fast float @llvm.trunc.f32(float [[TMP74]])
2728; CHECK-NEXT:    [[TMP76:%.*]] = fneg fast float [[TMP75]]
2729; CHECK-NEXT:    [[TMP77:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP76]], float [[TMP72]], float [[TMP71]])
2730; CHECK-NEXT:    [[TMP78:%.*]] = fptoui float [[TMP75]] to i32
2731; CHECK-NEXT:    [[TMP79:%.*]] = call fast float @llvm.fabs.f32(float [[TMP77]])
2732; CHECK-NEXT:    [[TMP80:%.*]] = call fast float @llvm.fabs.f32(float [[TMP72]])
2733; CHECK-NEXT:    [[TMP81:%.*]] = fcmp fast oge float [[TMP79]], [[TMP80]]
2734; CHECK-NEXT:    [[TMP82:%.*]] = select i1 [[TMP81]], i32 1, i32 0
2735; CHECK-NEXT:    [[TMP83:%.*]] = add i32 [[TMP78]], [[TMP82]]
2736; CHECK-NEXT:    [[TMP84:%.*]] = mul i32 [[TMP83]], [[TMP70]]
2737; CHECK-NEXT:    [[TMP85:%.*]] = sub i32 [[TMP69]], [[TMP84]]
2738; CHECK-NEXT:    [[TMP86:%.*]] = and i32 [[TMP85]], 65535
2739; CHECK-NEXT:    [[TMP87:%.*]] = trunc i32 [[TMP86]] to i16
2740; CHECK-NEXT:    [[TMP88:%.*]] = insertelement <4 x i16> [[TMP66]], i16 [[TMP87]], i64 3
2741; CHECK-NEXT:    store <4 x i16> [[TMP88]], <4 x i16> addrspace(1)* [[OUT:%.*]], align 8
2742; CHECK-NEXT:    ret void
2743;
2744; GFX6-LABEL: urem_v4i16:
2745; GFX6:       ; %bb.0:
2746; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
2747; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xb
2748; GFX6-NEXT:    s_mov_b32 s8, 0xffff
2749; GFX6-NEXT:    s_mov_b32 s7, 0xf000
2750; GFX6-NEXT:    s_mov_b32 s6, -1
2751; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2752; GFX6-NEXT:    s_and_b32 s9, s2, s8
2753; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s9
2754; GFX6-NEXT:    s_and_b32 s10, s0, s8
2755; GFX6-NEXT:    s_lshr_b32 s11, s2, 16
2756; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s10
2757; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v0
2758; GFX6-NEXT:    v_cvt_f32_u32_e32 v3, s11
2759; GFX6-NEXT:    s_lshr_b32 s9, s0, 16
2760; GFX6-NEXT:    v_cvt_f32_u32_e32 v4, s9
2761; GFX6-NEXT:    v_mul_f32_e32 v2, v1, v2
2762; GFX6-NEXT:    v_rcp_iflag_f32_e32 v5, v3
2763; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
2764; GFX6-NEXT:    v_mad_f32 v1, -v2, v0, v1
2765; GFX6-NEXT:    v_cvt_u32_f32_e32 v2, v2
2766; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
2767; GFX6-NEXT:    v_mul_f32_e32 v1, v4, v5
2768; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
2769; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, 0, v2, vcc
2770; GFX6-NEXT:    v_cvt_u32_f32_e32 v2, v1
2771; GFX6-NEXT:    v_mad_f32 v1, -v1, v3, v4
2772; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v3
2773; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s2
2774; GFX6-NEXT:    s_and_b32 s2, s3, s8
2775; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v2, vcc
2776; GFX6-NEXT:    v_cvt_f32_u32_e32 v2, s2
2777; GFX6-NEXT:    s_and_b32 s2, s1, s8
2778; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s11
2779; GFX6-NEXT:    v_cvt_f32_u32_e32 v3, s2
2780; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v2
2781; GFX6-NEXT:    s_lshr_b32 s12, s3, 16
2782; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, s9, v1
2783; GFX6-NEXT:    s_lshr_b32 s10, s1, 16
2784; GFX6-NEXT:    v_mul_f32_e32 v1, v3, v4
2785; GFX6-NEXT:    v_cvt_f32_u32_e32 v4, s12
2786; GFX6-NEXT:    v_cvt_f32_u32_e32 v6, s10
2787; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
2788; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
2789; GFX6-NEXT:    v_rcp_iflag_f32_e32 v7, v4
2790; GFX6-NEXT:    v_mad_f32 v3, -v1, v2, v3
2791; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v2
2792; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
2793; GFX6-NEXT:    v_mul_f32_e32 v2, v6, v7
2794; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
2795; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v2
2796; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2797; GFX6-NEXT:    v_mad_f32 v2, -v2, v4, v6
2798; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v4
2799; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, 0, v3, vcc
2800; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s3
2801; GFX6-NEXT:    v_mul_lo_u32 v2, v2, s12
2802; GFX6-NEXT:    v_and_b32_e32 v0, s8, v0
2803; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s1, v1
2804; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s10, v2
2805; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
2806; GFX6-NEXT:    v_and_b32_e32 v1, s8, v1
2807; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
2808; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
2809; GFX6-NEXT:    v_or_b32_e32 v0, v0, v2
2810; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
2811; GFX6-NEXT:    s_endpgm
2812;
2813; GFX9-LABEL: urem_v4i16:
2814; GFX9:       ; %bb.0:
2815; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
2816; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
2817; GFX9-NEXT:    s_mov_b32 s8, 0xffff
2818; GFX9-NEXT:    v_mov_b32_e32 v2, 0
2819; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2820; GFX9-NEXT:    s_and_b32 s1, s6, s8
2821; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s1
2822; GFX9-NEXT:    s_and_b32 s9, s4, s8
2823; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s9
2824; GFX9-NEXT:    s_lshr_b32 s9, s6, 16
2825; GFX9-NEXT:    v_rcp_iflag_f32_e32 v3, v0
2826; GFX9-NEXT:    v_cvt_f32_u32_e32 v4, s9
2827; GFX9-NEXT:    s_lshr_b32 s0, s4, 16
2828; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, s0
2829; GFX9-NEXT:    v_mul_f32_e32 v3, v1, v3
2830; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
2831; GFX9-NEXT:    v_mad_f32 v1, -v3, v0, v1
2832; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v3
2833; GFX9-NEXT:    v_rcp_iflag_f32_e32 v6, v4
2834; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
2835; GFX9-NEXT:    s_lshr_b32 s10, s7, 16
2836; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v3, vcc
2837; GFX9-NEXT:    v_mul_f32_e32 v1, v5, v6
2838; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s6
2839; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
2840; GFX9-NEXT:    s_and_b32 s6, s7, s8
2841; GFX9-NEXT:    v_mad_f32 v3, -v1, v4, v5
2842; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, s6
2843; GFX9-NEXT:    s_and_b32 s6, s5, s8
2844; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v4
2845; GFX9-NEXT:    v_cvt_f32_u32_e32 v4, s10
2846; GFX9-NEXT:    v_cvt_f32_u32_e32 v6, s6
2847; GFX9-NEXT:    v_rcp_iflag_f32_e32 v7, v5
2848; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
2849; GFX9-NEXT:    s_lshr_b32 s1, s5, 16
2850; GFX9-NEXT:    v_rcp_iflag_f32_e32 v8, v4
2851; GFX9-NEXT:    v_mul_f32_e32 v3, v6, v7
2852; GFX9-NEXT:    v_cvt_f32_u32_e32 v7, s1
2853; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
2854; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
2855; GFX9-NEXT:    v_mad_f32 v6, -v3, v5, v6
2856; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v6|, v5
2857; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v3
2858; GFX9-NEXT:    v_mul_f32_e32 v5, v7, v8
2859; GFX9-NEXT:    v_trunc_f32_e32 v5, v5
2860; GFX9-NEXT:    v_cvt_u32_f32_e32 v6, v5
2861; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
2862; GFX9-NEXT:    v_mad_f32 v5, -v5, v4, v7
2863; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v5|, v4
2864; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v6, vcc
2865; GFX9-NEXT:    v_mul_lo_u32 v1, v1, s9
2866; GFX9-NEXT:    v_mul_lo_u32 v3, v3, s7
2867; GFX9-NEXT:    v_mul_lo_u32 v4, v4, s10
2868; GFX9-NEXT:    v_sub_u32_e32 v0, s4, v0
2869; GFX9-NEXT:    v_sub_u32_e32 v5, s0, v1
2870; GFX9-NEXT:    v_sub_u32_e32 v1, s5, v3
2871; GFX9-NEXT:    v_sub_u32_e32 v3, s1, v4
2872; GFX9-NEXT:    v_mov_b32_e32 v4, 0xffff
2873; GFX9-NEXT:    v_and_b32_e32 v1, v4, v1
2874; GFX9-NEXT:    v_and_b32_e32 v0, v4, v0
2875; GFX9-NEXT:    v_lshl_or_b32 v1, v3, 16, v1
2876; GFX9-NEXT:    v_lshl_or_b32 v0, v5, 16, v0
2877; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
2878; GFX9-NEXT:    s_endpgm
  ; Input IR under test - a single vector urem whose result is stored to %out.
2879  %r = urem <4 x i16> %x, %y
2880  store <4 x i16> %r, <4 x i16> addrspace(1)* %out
2881  ret void
2882}
2883
2884define amdgpu_kernel void @sdiv_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x, <4 x i16> %y) {
2885; CHECK-LABEL: @sdiv_v4i16(
2886; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i16> [[X:%.*]], i64 0
2887; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i16> [[Y:%.*]], i64 0
2888; CHECK-NEXT:    [[TMP3:%.*]] = sext i16 [[TMP1]] to i32
2889; CHECK-NEXT:    [[TMP4:%.*]] = sext i16 [[TMP2]] to i32
2890; CHECK-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]]
2891; CHECK-NEXT:    [[TMP6:%.*]] = ashr i32 [[TMP5]], 30
2892; CHECK-NEXT:    [[TMP7:%.*]] = or i32 [[TMP6]], 1
2893; CHECK-NEXT:    [[TMP8:%.*]] = sitofp i32 [[TMP3]] to float
2894; CHECK-NEXT:    [[TMP9:%.*]] = sitofp i32 [[TMP4]] to float
2895; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]])
2896; CHECK-NEXT:    [[TMP11:%.*]] = fmul fast float [[TMP8]], [[TMP10]]
2897; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.trunc.f32(float [[TMP11]])
2898; CHECK-NEXT:    [[TMP13:%.*]] = fneg fast float [[TMP12]]
2899; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP13]], float [[TMP9]], float [[TMP8]])
2900; CHECK-NEXT:    [[TMP15:%.*]] = fptosi float [[TMP12]] to i32
2901; CHECK-NEXT:    [[TMP16:%.*]] = call fast float @llvm.fabs.f32(float [[TMP14]])
2902; CHECK-NEXT:    [[TMP17:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
2903; CHECK-NEXT:    [[TMP18:%.*]] = fcmp fast oge float [[TMP16]], [[TMP17]]
2904; CHECK-NEXT:    [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP7]], i32 0
2905; CHECK-NEXT:    [[TMP20:%.*]] = add i32 [[TMP15]], [[TMP19]]
2906; CHECK-NEXT:    [[TMP21:%.*]] = shl i32 [[TMP20]], 16
2907; CHECK-NEXT:    [[TMP22:%.*]] = ashr i32 [[TMP21]], 16
2908; CHECK-NEXT:    [[TMP23:%.*]] = trunc i32 [[TMP22]] to i16
2909; CHECK-NEXT:    [[TMP24:%.*]] = insertelement <4 x i16> undef, i16 [[TMP23]], i64 0
2910; CHECK-NEXT:    [[TMP25:%.*]] = extractelement <4 x i16> [[X]], i64 1
2911; CHECK-NEXT:    [[TMP26:%.*]] = extractelement <4 x i16> [[Y]], i64 1
2912; CHECK-NEXT:    [[TMP27:%.*]] = sext i16 [[TMP25]] to i32
2913; CHECK-NEXT:    [[TMP28:%.*]] = sext i16 [[TMP26]] to i32
2914; CHECK-NEXT:    [[TMP29:%.*]] = xor i32 [[TMP27]], [[TMP28]]
2915; CHECK-NEXT:    [[TMP30:%.*]] = ashr i32 [[TMP29]], 30
2916; CHECK-NEXT:    [[TMP31:%.*]] = or i32 [[TMP30]], 1
2917; CHECK-NEXT:    [[TMP32:%.*]] = sitofp i32 [[TMP27]] to float
2918; CHECK-NEXT:    [[TMP33:%.*]] = sitofp i32 [[TMP28]] to float
2919; CHECK-NEXT:    [[TMP34:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP33]])
2920; CHECK-NEXT:    [[TMP35:%.*]] = fmul fast float [[TMP32]], [[TMP34]]
2921; CHECK-NEXT:    [[TMP36:%.*]] = call fast float @llvm.trunc.f32(float [[TMP35]])
2922; CHECK-NEXT:    [[TMP37:%.*]] = fneg fast float [[TMP36]]
2923; CHECK-NEXT:    [[TMP38:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP37]], float [[TMP33]], float [[TMP32]])
2924; CHECK-NEXT:    [[TMP39:%.*]] = fptosi float [[TMP36]] to i32
2925; CHECK-NEXT:    [[TMP40:%.*]] = call fast float @llvm.fabs.f32(float [[TMP38]])
2926; CHECK-NEXT:    [[TMP41:%.*]] = call fast float @llvm.fabs.f32(float [[TMP33]])
2927; CHECK-NEXT:    [[TMP42:%.*]] = fcmp fast oge float [[TMP40]], [[TMP41]]
2928; CHECK-NEXT:    [[TMP43:%.*]] = select i1 [[TMP42]], i32 [[TMP31]], i32 0
2929; CHECK-NEXT:    [[TMP44:%.*]] = add i32 [[TMP39]], [[TMP43]]
2930; CHECK-NEXT:    [[TMP45:%.*]] = shl i32 [[TMP44]], 16
2931; CHECK-NEXT:    [[TMP46:%.*]] = ashr i32 [[TMP45]], 16
2932; CHECK-NEXT:    [[TMP47:%.*]] = trunc i32 [[TMP46]] to i16
2933; CHECK-NEXT:    [[TMP48:%.*]] = insertelement <4 x i16> [[TMP24]], i16 [[TMP47]], i64 1
2934; CHECK-NEXT:    [[TMP49:%.*]] = extractelement <4 x i16> [[X]], i64 2
2935; CHECK-NEXT:    [[TMP50:%.*]] = extractelement <4 x i16> [[Y]], i64 2
2936; CHECK-NEXT:    [[TMP51:%.*]] = sext i16 [[TMP49]] to i32
2937; CHECK-NEXT:    [[TMP52:%.*]] = sext i16 [[TMP50]] to i32
2938; CHECK-NEXT:    [[TMP53:%.*]] = xor i32 [[TMP51]], [[TMP52]]
2939; CHECK-NEXT:    [[TMP54:%.*]] = ashr i32 [[TMP53]], 30
2940; CHECK-NEXT:    [[TMP55:%.*]] = or i32 [[TMP54]], 1
2941; CHECK-NEXT:    [[TMP56:%.*]] = sitofp i32 [[TMP51]] to float
2942; CHECK-NEXT:    [[TMP57:%.*]] = sitofp i32 [[TMP52]] to float
2943; CHECK-NEXT:    [[TMP58:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP57]])
2944; CHECK-NEXT:    [[TMP59:%.*]] = fmul fast float [[TMP56]], [[TMP58]]
2945; CHECK-NEXT:    [[TMP60:%.*]] = call fast float @llvm.trunc.f32(float [[TMP59]])
2946; CHECK-NEXT:    [[TMP61:%.*]] = fneg fast float [[TMP60]]
2947; CHECK-NEXT:    [[TMP62:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP61]], float [[TMP57]], float [[TMP56]])
2948; CHECK-NEXT:    [[TMP63:%.*]] = fptosi float [[TMP60]] to i32
2949; CHECK-NEXT:    [[TMP64:%.*]] = call fast float @llvm.fabs.f32(float [[TMP62]])
2950; CHECK-NEXT:    [[TMP65:%.*]] = call fast float @llvm.fabs.f32(float [[TMP57]])
2951; CHECK-NEXT:    [[TMP66:%.*]] = fcmp fast oge float [[TMP64]], [[TMP65]]
2952; CHECK-NEXT:    [[TMP67:%.*]] = select i1 [[TMP66]], i32 [[TMP55]], i32 0
2953; CHECK-NEXT:    [[TMP68:%.*]] = add i32 [[TMP63]], [[TMP67]]
2954; CHECK-NEXT:    [[TMP69:%.*]] = shl i32 [[TMP68]], 16
2955; CHECK-NEXT:    [[TMP70:%.*]] = ashr i32 [[TMP69]], 16
2956; CHECK-NEXT:    [[TMP71:%.*]] = trunc i32 [[TMP70]] to i16
2957; CHECK-NEXT:    [[TMP72:%.*]] = insertelement <4 x i16> [[TMP48]], i16 [[TMP71]], i64 2
2958; CHECK-NEXT:    [[TMP73:%.*]] = extractelement <4 x i16> [[X]], i64 3
2959; CHECK-NEXT:    [[TMP74:%.*]] = extractelement <4 x i16> [[Y]], i64 3
2960; CHECK-NEXT:    [[TMP75:%.*]] = sext i16 [[TMP73]] to i32
2961; CHECK-NEXT:    [[TMP76:%.*]] = sext i16 [[TMP74]] to i32
2962; CHECK-NEXT:    [[TMP77:%.*]] = xor i32 [[TMP75]], [[TMP76]]
2963; CHECK-NEXT:    [[TMP78:%.*]] = ashr i32 [[TMP77]], 30
2964; CHECK-NEXT:    [[TMP79:%.*]] = or i32 [[TMP78]], 1
2965; CHECK-NEXT:    [[TMP80:%.*]] = sitofp i32 [[TMP75]] to float
2966; CHECK-NEXT:    [[TMP81:%.*]] = sitofp i32 [[TMP76]] to float
2967; CHECK-NEXT:    [[TMP82:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP81]])
2968; CHECK-NEXT:    [[TMP83:%.*]] = fmul fast float [[TMP80]], [[TMP82]]
2969; CHECK-NEXT:    [[TMP84:%.*]] = call fast float @llvm.trunc.f32(float [[TMP83]])
2970; CHECK-NEXT:    [[TMP85:%.*]] = fneg fast float [[TMP84]]
2971; CHECK-NEXT:    [[TMP86:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP85]], float [[TMP81]], float [[TMP80]])
2972; CHECK-NEXT:    [[TMP87:%.*]] = fptosi float [[TMP84]] to i32
2973; CHECK-NEXT:    [[TMP88:%.*]] = call fast float @llvm.fabs.f32(float [[TMP86]])
2974; CHECK-NEXT:    [[TMP89:%.*]] = call fast float @llvm.fabs.f32(float [[TMP81]])
2975; CHECK-NEXT:    [[TMP90:%.*]] = fcmp fast oge float [[TMP88]], [[TMP89]]
2976; CHECK-NEXT:    [[TMP91:%.*]] = select i1 [[TMP90]], i32 [[TMP79]], i32 0
2977; CHECK-NEXT:    [[TMP92:%.*]] = add i32 [[TMP87]], [[TMP91]]
2978; CHECK-NEXT:    [[TMP93:%.*]] = shl i32 [[TMP92]], 16
2979; CHECK-NEXT:    [[TMP94:%.*]] = ashr i32 [[TMP93]], 16
2980; CHECK-NEXT:    [[TMP95:%.*]] = trunc i32 [[TMP94]] to i16
2981; CHECK-NEXT:    [[TMP96:%.*]] = insertelement <4 x i16> [[TMP72]], i16 [[TMP95]], i64 3
2982; CHECK-NEXT:    store <4 x i16> [[TMP96]], <4 x i16> addrspace(1)* [[OUT:%.*]], align 8
2983; CHECK-NEXT:    ret void
2984;
2985; GFX6-LABEL: sdiv_v4i16:
2986; GFX6:       ; %bb.0:
2987; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
2988; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xb
2989; GFX6-NEXT:    s_mov_b32 s7, 0xf000
2990; GFX6-NEXT:    s_mov_b32 s6, -1
2991; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2992; GFX6-NEXT:    s_sext_i32_i16 s8, s2
2993; GFX6-NEXT:    v_cvt_f32_i32_e32 v0, s8
2994; GFX6-NEXT:    s_sext_i32_i16 s9, s0
2995; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s9
2996; GFX6-NEXT:    s_xor_b32 s8, s9, s8
2997; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v0
2998; GFX6-NEXT:    s_ashr_i32 s2, s2, 16
2999; GFX6-NEXT:    s_ashr_i32 s8, s8, 30
3000; GFX6-NEXT:    s_or_b32 s8, s8, 1
3001; GFX6-NEXT:    v_mul_f32_e32 v2, v1, v2
3002; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
3003; GFX6-NEXT:    v_mad_f32 v1, -v2, v0, v1
3004; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
3005; GFX6-NEXT:    v_cvt_i32_f32_e32 v2, v2
3006; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s2
3007; GFX6-NEXT:    v_mov_b32_e32 v3, s8
3008; GFX6-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
3009; GFX6-NEXT:    s_ashr_i32 s0, s0, 16
3010; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
3011; GFX6-NEXT:    v_cvt_f32_i32_e32 v2, s0
3012; GFX6-NEXT:    v_rcp_iflag_f32_e32 v3, v1
3013; GFX6-NEXT:    s_xor_b32 s0, s0, s2
3014; GFX6-NEXT:    s_ashr_i32 s0, s0, 30
3015; GFX6-NEXT:    s_or_b32 s0, s0, 1
3016; GFX6-NEXT:    v_mul_f32_e32 v3, v2, v3
3017; GFX6-NEXT:    v_trunc_f32_e32 v3, v3
3018; GFX6-NEXT:    v_mad_f32 v2, -v3, v1, v2
3019; GFX6-NEXT:    v_mov_b32_e32 v4, s0
3020; GFX6-NEXT:    s_sext_i32_i16 s0, s3
3021; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, |v1|
3022; GFX6-NEXT:    v_cvt_i32_f32_e32 v3, v3
3023; GFX6-NEXT:    v_cvt_f32_i32_e32 v2, s0
3024; GFX6-NEXT:    v_cndmask_b32_e32 v1, 0, v4, vcc
3025; GFX6-NEXT:    s_sext_i32_i16 s2, s1
3026; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v1, v3
3027; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s2
3028; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v2
3029; GFX6-NEXT:    s_xor_b32 s0, s2, s0
3030; GFX6-NEXT:    s_ashr_i32 s0, s0, 30
3031; GFX6-NEXT:    s_or_b32 s0, s0, 1
3032; GFX6-NEXT:    v_mul_f32_e32 v4, v1, v4
3033; GFX6-NEXT:    v_trunc_f32_e32 v4, v4
3034; GFX6-NEXT:    v_mad_f32 v1, -v4, v2, v1
3035; GFX6-NEXT:    v_mov_b32_e32 v5, s0
3036; GFX6-NEXT:    s_ashr_i32 s0, s3, 16
3037; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v2|
3038; GFX6-NEXT:    v_cvt_i32_f32_e32 v4, v4
3039; GFX6-NEXT:    v_cvt_f32_i32_e32 v2, s0
3040; GFX6-NEXT:    v_cndmask_b32_e32 v1, 0, v5, vcc
3041; GFX6-NEXT:    s_ashr_i32 s1, s1, 16
3042; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v4
3043; GFX6-NEXT:    v_cvt_f32_i32_e32 v4, s1
3044; GFX6-NEXT:    v_rcp_iflag_f32_e32 v5, v2
3045; GFX6-NEXT:    s_xor_b32 s0, s1, s0
3046; GFX6-NEXT:    s_ashr_i32 s0, s0, 30
3047; GFX6-NEXT:    s_or_b32 s0, s0, 1
3048; GFX6-NEXT:    v_mul_f32_e32 v5, v4, v5
3049; GFX6-NEXT:    v_trunc_f32_e32 v5, v5
3050; GFX6-NEXT:    v_mad_f32 v4, -v5, v2, v4
3051; GFX6-NEXT:    v_cvt_i32_f32_e32 v5, v5
3052; GFX6-NEXT:    v_mov_b32_e32 v6, s0
3053; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v4|, |v2|
3054; GFX6-NEXT:    v_cndmask_b32_e32 v2, 0, v6, vcc
3055; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
3056; GFX6-NEXT:    s_mov_b32 s0, 0xffff
3057; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
3058; GFX6-NEXT:    v_and_b32_e32 v1, s0, v1
3059; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
3060; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
3061; GFX6-NEXT:    v_and_b32_e32 v0, s0, v0
3062; GFX6-NEXT:    v_or_b32_e32 v0, v0, v2
3063; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
3064; GFX6-NEXT:    s_endpgm
3065;
3066; GFX9-LABEL: sdiv_v4i16:
3067; GFX9:       ; %bb.0:
3068; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
3069; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
3070; GFX9-NEXT:    v_mov_b32_e32 v2, 0
3071; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3072; GFX9-NEXT:    s_sext_i32_i16 s0, s6
3073; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s0
3074; GFX9-NEXT:    s_sext_i32_i16 s1, s4
3075; GFX9-NEXT:    v_cvt_f32_i32_e32 v1, s1
3076; GFX9-NEXT:    s_xor_b32 s0, s1, s0
3077; GFX9-NEXT:    v_rcp_iflag_f32_e32 v3, v0
3078; GFX9-NEXT:    s_ashr_i32 s0, s0, 30
3079; GFX9-NEXT:    s_or_b32 s8, s0, 1
3080; GFX9-NEXT:    v_mul_f32_e32 v3, v1, v3
3081; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
3082; GFX9-NEXT:    v_mad_f32 v1, -v3, v0, v1
3083; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v1|, |v0|
3084; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
3085; GFX9-NEXT:    s_cselect_b32 s0, s8, 0
3086; GFX9-NEXT:    s_ashr_i32 s1, s6, 16
3087; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s1
3088; GFX9-NEXT:    s_ashr_i32 s4, s4, 16
3089; GFX9-NEXT:    v_cvt_f32_i32_e32 v1, s4
3090; GFX9-NEXT:    v_cvt_i32_f32_e32 v3, v3
3091; GFX9-NEXT:    v_rcp_iflag_f32_e32 v4, v0
3092; GFX9-NEXT:    v_add_u32_e32 v3, s0, v3
3093; GFX9-NEXT:    v_mul_f32_e32 v4, v1, v4
3094; GFX9-NEXT:    s_xor_b32 s0, s4, s1
3095; GFX9-NEXT:    v_trunc_f32_e32 v4, v4
3096; GFX9-NEXT:    s_ashr_i32 s0, s0, 30
3097; GFX9-NEXT:    v_mad_f32 v1, -v4, v0, v1
3098; GFX9-NEXT:    s_or_b32 s4, s0, 1
3099; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v1|, |v0|
3100; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
3101; GFX9-NEXT:    s_sext_i32_i16 s1, s7
3102; GFX9-NEXT:    v_cvt_i32_f32_e32 v4, v4
3103; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s1
3104; GFX9-NEXT:    s_cselect_b32 s0, s4, 0
3105; GFX9-NEXT:    v_add_u32_e32 v4, s0, v4
3106; GFX9-NEXT:    s_sext_i32_i16 s0, s5
3107; GFX9-NEXT:    v_cvt_f32_i32_e32 v1, s0
3108; GFX9-NEXT:    v_rcp_iflag_f32_e32 v5, v0
3109; GFX9-NEXT:    s_xor_b32 s0, s0, s1
3110; GFX9-NEXT:    s_ashr_i32 s0, s0, 30
3111; GFX9-NEXT:    s_or_b32 s4, s0, 1
3112; GFX9-NEXT:    v_mul_f32_e32 v5, v1, v5
3113; GFX9-NEXT:    v_trunc_f32_e32 v5, v5
3114; GFX9-NEXT:    v_mad_f32 v1, -v5, v0, v1
3115; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v1|, |v0|
3116; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
3117; GFX9-NEXT:    s_cselect_b32 s0, s4, 0
3118; GFX9-NEXT:    v_cvt_i32_f32_e32 v5, v5
3119; GFX9-NEXT:    s_ashr_i32 s1, s7, 16
3120; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s1
3121; GFX9-NEXT:    v_add_u32_e32 v1, s0, v5
3122; GFX9-NEXT:    s_ashr_i32 s0, s5, 16
3123; GFX9-NEXT:    v_cvt_f32_i32_e32 v5, s0
3124; GFX9-NEXT:    v_rcp_iflag_f32_e32 v6, v0
3125; GFX9-NEXT:    s_xor_b32 s0, s0, s1
3126; GFX9-NEXT:    s_ashr_i32 s0, s0, 30
3127; GFX9-NEXT:    s_or_b32 s4, s0, 1
3128; GFX9-NEXT:    v_mul_f32_e32 v6, v5, v6
3129; GFX9-NEXT:    v_trunc_f32_e32 v6, v6
3130; GFX9-NEXT:    v_mad_f32 v5, -v6, v0, v5
3131; GFX9-NEXT:    v_cvt_i32_f32_e32 v6, v6
3132; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v5|, |v0|
3133; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
3134; GFX9-NEXT:    s_cselect_b32 s0, s4, 0
3135; GFX9-NEXT:    v_mov_b32_e32 v5, 0xffff
3136; GFX9-NEXT:    v_add_u32_e32 v0, s0, v6
3137; GFX9-NEXT:    v_and_b32_e32 v1, v5, v1
3138; GFX9-NEXT:    v_lshl_or_b32 v1, v0, 16, v1
3139; GFX9-NEXT:    v_and_b32_e32 v0, v5, v3
3140; GFX9-NEXT:    v_lshl_or_b32 v0, v4, 16, v0
3141; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
3142; GFX9-NEXT:    s_endpgm
3143  %r = sdiv <4 x i16> %x, %y
3144  store <4 x i16> %r, <4 x i16> addrspace(1)* %out
3145  ret void
3146}
3147
; Signed remainder on <4 x i16> kernel arguments; the result is stored to %out.
; The IR assertions below show each lane being handled separately: both
; operands are sign-extended to i32, a quotient is estimated in float as
; trunc(x * rcp(y)), +1 or -1 (sign taken from x xor y) is added when
; |x - q*y| >= |y|, and the remainder x - q*y is re-sign-extended from 16 bits
; before being inserted into the result vector.
; The assertion lines are autogenerated (see the NOTE lines at the top of this
; file); regenerate them with those scripts rather than editing them by hand.
3148define amdgpu_kernel void @srem_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x, <4 x i16> %y) {
3149; CHECK-LABEL: @srem_v4i16(
3150; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i16> [[X:%.*]], i64 0
3151; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i16> [[Y:%.*]], i64 0
3152; CHECK-NEXT:    [[TMP3:%.*]] = sext i16 [[TMP1]] to i32
3153; CHECK-NEXT:    [[TMP4:%.*]] = sext i16 [[TMP2]] to i32
3154; CHECK-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]]
3155; CHECK-NEXT:    [[TMP6:%.*]] = ashr i32 [[TMP5]], 30
3156; CHECK-NEXT:    [[TMP7:%.*]] = or i32 [[TMP6]], 1
3157; CHECK-NEXT:    [[TMP8:%.*]] = sitofp i32 [[TMP3]] to float
3158; CHECK-NEXT:    [[TMP9:%.*]] = sitofp i32 [[TMP4]] to float
3159; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]])
3160; CHECK-NEXT:    [[TMP11:%.*]] = fmul fast float [[TMP8]], [[TMP10]]
3161; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.trunc.f32(float [[TMP11]])
3162; CHECK-NEXT:    [[TMP13:%.*]] = fneg fast float [[TMP12]]
3163; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP13]], float [[TMP9]], float [[TMP8]])
3164; CHECK-NEXT:    [[TMP15:%.*]] = fptosi float [[TMP12]] to i32
3165; CHECK-NEXT:    [[TMP16:%.*]] = call fast float @llvm.fabs.f32(float [[TMP14]])
3166; CHECK-NEXT:    [[TMP17:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
3167; CHECK-NEXT:    [[TMP18:%.*]] = fcmp fast oge float [[TMP16]], [[TMP17]]
3168; CHECK-NEXT:    [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP7]], i32 0
3169; CHECK-NEXT:    [[TMP20:%.*]] = add i32 [[TMP15]], [[TMP19]]
3170; CHECK-NEXT:    [[TMP21:%.*]] = mul i32 [[TMP20]], [[TMP4]]
3171; CHECK-NEXT:    [[TMP22:%.*]] = sub i32 [[TMP3]], [[TMP21]]
3172; CHECK-NEXT:    [[TMP23:%.*]] = shl i32 [[TMP22]], 16
3173; CHECK-NEXT:    [[TMP24:%.*]] = ashr i32 [[TMP23]], 16
3174; CHECK-NEXT:    [[TMP25:%.*]] = trunc i32 [[TMP24]] to i16
3175; CHECK-NEXT:    [[TMP26:%.*]] = insertelement <4 x i16> undef, i16 [[TMP25]], i64 0
3176; CHECK-NEXT:    [[TMP27:%.*]] = extractelement <4 x i16> [[X]], i64 1
3177; CHECK-NEXT:    [[TMP28:%.*]] = extractelement <4 x i16> [[Y]], i64 1
3178; CHECK-NEXT:    [[TMP29:%.*]] = sext i16 [[TMP27]] to i32
3179; CHECK-NEXT:    [[TMP30:%.*]] = sext i16 [[TMP28]] to i32
3180; CHECK-NEXT:    [[TMP31:%.*]] = xor i32 [[TMP29]], [[TMP30]]
3181; CHECK-NEXT:    [[TMP32:%.*]] = ashr i32 [[TMP31]], 30
3182; CHECK-NEXT:    [[TMP33:%.*]] = or i32 [[TMP32]], 1
3183; CHECK-NEXT:    [[TMP34:%.*]] = sitofp i32 [[TMP29]] to float
3184; CHECK-NEXT:    [[TMP35:%.*]] = sitofp i32 [[TMP30]] to float
3185; CHECK-NEXT:    [[TMP36:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP35]])
3186; CHECK-NEXT:    [[TMP37:%.*]] = fmul fast float [[TMP34]], [[TMP36]]
3187; CHECK-NEXT:    [[TMP38:%.*]] = call fast float @llvm.trunc.f32(float [[TMP37]])
3188; CHECK-NEXT:    [[TMP39:%.*]] = fneg fast float [[TMP38]]
3189; CHECK-NEXT:    [[TMP40:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP39]], float [[TMP35]], float [[TMP34]])
3190; CHECK-NEXT:    [[TMP41:%.*]] = fptosi float [[TMP38]] to i32
3191; CHECK-NEXT:    [[TMP42:%.*]] = call fast float @llvm.fabs.f32(float [[TMP40]])
3192; CHECK-NEXT:    [[TMP43:%.*]] = call fast float @llvm.fabs.f32(float [[TMP35]])
3193; CHECK-NEXT:    [[TMP44:%.*]] = fcmp fast oge float [[TMP42]], [[TMP43]]
3194; CHECK-NEXT:    [[TMP45:%.*]] = select i1 [[TMP44]], i32 [[TMP33]], i32 0
3195; CHECK-NEXT:    [[TMP46:%.*]] = add i32 [[TMP41]], [[TMP45]]
3196; CHECK-NEXT:    [[TMP47:%.*]] = mul i32 [[TMP46]], [[TMP30]]
3197; CHECK-NEXT:    [[TMP48:%.*]] = sub i32 [[TMP29]], [[TMP47]]
3198; CHECK-NEXT:    [[TMP49:%.*]] = shl i32 [[TMP48]], 16
3199; CHECK-NEXT:    [[TMP50:%.*]] = ashr i32 [[TMP49]], 16
3200; CHECK-NEXT:    [[TMP51:%.*]] = trunc i32 [[TMP50]] to i16
3201; CHECK-NEXT:    [[TMP52:%.*]] = insertelement <4 x i16> [[TMP26]], i16 [[TMP51]], i64 1
3202; CHECK-NEXT:    [[TMP53:%.*]] = extractelement <4 x i16> [[X]], i64 2
3203; CHECK-NEXT:    [[TMP54:%.*]] = extractelement <4 x i16> [[Y]], i64 2
3204; CHECK-NEXT:    [[TMP55:%.*]] = sext i16 [[TMP53]] to i32
3205; CHECK-NEXT:    [[TMP56:%.*]] = sext i16 [[TMP54]] to i32
3206; CHECK-NEXT:    [[TMP57:%.*]] = xor i32 [[TMP55]], [[TMP56]]
3207; CHECK-NEXT:    [[TMP58:%.*]] = ashr i32 [[TMP57]], 30
3208; CHECK-NEXT:    [[TMP59:%.*]] = or i32 [[TMP58]], 1
3209; CHECK-NEXT:    [[TMP60:%.*]] = sitofp i32 [[TMP55]] to float
3210; CHECK-NEXT:    [[TMP61:%.*]] = sitofp i32 [[TMP56]] to float
3211; CHECK-NEXT:    [[TMP62:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP61]])
3212; CHECK-NEXT:    [[TMP63:%.*]] = fmul fast float [[TMP60]], [[TMP62]]
3213; CHECK-NEXT:    [[TMP64:%.*]] = call fast float @llvm.trunc.f32(float [[TMP63]])
3214; CHECK-NEXT:    [[TMP65:%.*]] = fneg fast float [[TMP64]]
3215; CHECK-NEXT:    [[TMP66:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP65]], float [[TMP61]], float [[TMP60]])
3216; CHECK-NEXT:    [[TMP67:%.*]] = fptosi float [[TMP64]] to i32
3217; CHECK-NEXT:    [[TMP68:%.*]] = call fast float @llvm.fabs.f32(float [[TMP66]])
3218; CHECK-NEXT:    [[TMP69:%.*]] = call fast float @llvm.fabs.f32(float [[TMP61]])
3219; CHECK-NEXT:    [[TMP70:%.*]] = fcmp fast oge float [[TMP68]], [[TMP69]]
3220; CHECK-NEXT:    [[TMP71:%.*]] = select i1 [[TMP70]], i32 [[TMP59]], i32 0
3221; CHECK-NEXT:    [[TMP72:%.*]] = add i32 [[TMP67]], [[TMP71]]
3222; CHECK-NEXT:    [[TMP73:%.*]] = mul i32 [[TMP72]], [[TMP56]]
3223; CHECK-NEXT:    [[TMP74:%.*]] = sub i32 [[TMP55]], [[TMP73]]
3224; CHECK-NEXT:    [[TMP75:%.*]] = shl i32 [[TMP74]], 16
3225; CHECK-NEXT:    [[TMP76:%.*]] = ashr i32 [[TMP75]], 16
3226; CHECK-NEXT:    [[TMP77:%.*]] = trunc i32 [[TMP76]] to i16
3227; CHECK-NEXT:    [[TMP78:%.*]] = insertelement <4 x i16> [[TMP52]], i16 [[TMP77]], i64 2
3228; CHECK-NEXT:    [[TMP79:%.*]] = extractelement <4 x i16> [[X]], i64 3
3229; CHECK-NEXT:    [[TMP80:%.*]] = extractelement <4 x i16> [[Y]], i64 3
3230; CHECK-NEXT:    [[TMP81:%.*]] = sext i16 [[TMP79]] to i32
3231; CHECK-NEXT:    [[TMP82:%.*]] = sext i16 [[TMP80]] to i32
3232; CHECK-NEXT:    [[TMP83:%.*]] = xor i32 [[TMP81]], [[TMP82]]
3233; CHECK-NEXT:    [[TMP84:%.*]] = ashr i32 [[TMP83]], 30
3234; CHECK-NEXT:    [[TMP85:%.*]] = or i32 [[TMP84]], 1
3235; CHECK-NEXT:    [[TMP86:%.*]] = sitofp i32 [[TMP81]] to float
3236; CHECK-NEXT:    [[TMP87:%.*]] = sitofp i32 [[TMP82]] to float
3237; CHECK-NEXT:    [[TMP88:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP87]])
3238; CHECK-NEXT:    [[TMP89:%.*]] = fmul fast float [[TMP86]], [[TMP88]]
3239; CHECK-NEXT:    [[TMP90:%.*]] = call fast float @llvm.trunc.f32(float [[TMP89]])
3240; CHECK-NEXT:    [[TMP91:%.*]] = fneg fast float [[TMP90]]
3241; CHECK-NEXT:    [[TMP92:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP91]], float [[TMP87]], float [[TMP86]])
3242; CHECK-NEXT:    [[TMP93:%.*]] = fptosi float [[TMP90]] to i32
3243; CHECK-NEXT:    [[TMP94:%.*]] = call fast float @llvm.fabs.f32(float [[TMP92]])
3244; CHECK-NEXT:    [[TMP95:%.*]] = call fast float @llvm.fabs.f32(float [[TMP87]])
3245; CHECK-NEXT:    [[TMP96:%.*]] = fcmp fast oge float [[TMP94]], [[TMP95]]
3246; CHECK-NEXT:    [[TMP97:%.*]] = select i1 [[TMP96]], i32 [[TMP85]], i32 0
3247; CHECK-NEXT:    [[TMP98:%.*]] = add i32 [[TMP93]], [[TMP97]]
3248; CHECK-NEXT:    [[TMP99:%.*]] = mul i32 [[TMP98]], [[TMP82]]
3249; CHECK-NEXT:    [[TMP100:%.*]] = sub i32 [[TMP81]], [[TMP99]]
3250; CHECK-NEXT:    [[TMP101:%.*]] = shl i32 [[TMP100]], 16
3251; CHECK-NEXT:    [[TMP102:%.*]] = ashr i32 [[TMP101]], 16
3252; CHECK-NEXT:    [[TMP103:%.*]] = trunc i32 [[TMP102]] to i16
3253; CHECK-NEXT:    [[TMP104:%.*]] = insertelement <4 x i16> [[TMP78]], i16 [[TMP103]], i64 3
3254; CHECK-NEXT:    store <4 x i16> [[TMP104]], <4 x i16> addrspace(1)* [[OUT:%.*]], align 8
3255; CHECK-NEXT:    ret void
3256;
3257; GFX6-LABEL: srem_v4i16:
3258; GFX6:       ; %bb.0:
3259; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
3260; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xb
3261; GFX6-NEXT:    s_mov_b32 s7, 0xf000
3262; GFX6-NEXT:    s_mov_b32 s6, -1
3263; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
3264; GFX6-NEXT:    s_sext_i32_i16 s8, s2
3265; GFX6-NEXT:    v_cvt_f32_i32_e32 v0, s8
3266; GFX6-NEXT:    s_sext_i32_i16 s9, s0
3267; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s9
3268; GFX6-NEXT:    s_xor_b32 s8, s9, s8
3269; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v0
3270; GFX6-NEXT:    s_ashr_i32 s8, s8, 30
3271; GFX6-NEXT:    s_or_b32 s8, s8, 1
3272; GFX6-NEXT:    v_mov_b32_e32 v3, s8
3273; GFX6-NEXT:    v_mul_f32_e32 v2, v1, v2
3274; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
3275; GFX6-NEXT:    v_mad_f32 v1, -v2, v0, v1
3276; GFX6-NEXT:    v_cvt_i32_f32_e32 v2, v2
3277; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
3278; GFX6-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
3279; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
3280; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s2
3281; GFX6-NEXT:    s_ashr_i32 s2, s2, 16
3282; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s2
3283; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
3284; GFX6-NEXT:    s_ashr_i32 s0, s0, 16
3285; GFX6-NEXT:    v_cvt_f32_i32_e32 v2, s0
3286; GFX6-NEXT:    v_rcp_iflag_f32_e32 v3, v1
3287; GFX6-NEXT:    s_xor_b32 s8, s0, s2
3288; GFX6-NEXT:    s_ashr_i32 s8, s8, 30
3289; GFX6-NEXT:    s_or_b32 s8, s8, 1
3290; GFX6-NEXT:    v_mul_f32_e32 v3, v2, v3
3291; GFX6-NEXT:    v_trunc_f32_e32 v3, v3
3292; GFX6-NEXT:    v_mad_f32 v2, -v3, v1, v2
3293; GFX6-NEXT:    v_cvt_i32_f32_e32 v3, v3
3294; GFX6-NEXT:    v_mov_b32_e32 v4, s8
3295; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, |v1|
3296; GFX6-NEXT:    v_cndmask_b32_e32 v1, 0, v4, vcc
3297; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
3298; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s2
3299; GFX6-NEXT:    s_sext_i32_i16 s2, s3
3300; GFX6-NEXT:    v_cvt_f32_i32_e32 v2, s2
3301; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s0, v1
3302; GFX6-NEXT:    s_sext_i32_i16 s0, s1
3303; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s0
3304; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v2
3305; GFX6-NEXT:    s_xor_b32 s0, s0, s2
3306; GFX6-NEXT:    s_ashr_i32 s0, s0, 30
3307; GFX6-NEXT:    s_or_b32 s0, s0, 1
3308; GFX6-NEXT:    v_mul_f32_e32 v4, v1, v4
3309; GFX6-NEXT:    v_trunc_f32_e32 v4, v4
3310; GFX6-NEXT:    v_mad_f32 v1, -v4, v2, v1
3311; GFX6-NEXT:    v_mov_b32_e32 v5, s0
3312; GFX6-NEXT:    s_ashr_i32 s0, s3, 16
3313; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v2|
3314; GFX6-NEXT:    v_cvt_i32_f32_e32 v4, v4
3315; GFX6-NEXT:    v_cvt_f32_i32_e32 v2, s0
3316; GFX6-NEXT:    v_cndmask_b32_e32 v1, 0, v5, vcc
3317; GFX6-NEXT:    s_ashr_i32 s2, s1, 16
3318; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v4
3319; GFX6-NEXT:    v_cvt_f32_i32_e32 v4, s2
3320; GFX6-NEXT:    v_rcp_iflag_f32_e32 v5, v2
3321; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s3
3322; GFX6-NEXT:    s_xor_b32 s3, s2, s0
3323; GFX6-NEXT:    s_ashr_i32 s3, s3, 30
3324; GFX6-NEXT:    v_mul_f32_e32 v5, v4, v5
3325; GFX6-NEXT:    v_trunc_f32_e32 v5, v5
3326; GFX6-NEXT:    v_mad_f32 v4, -v5, v2, v4
3327; GFX6-NEXT:    v_cvt_i32_f32_e32 v5, v5
3328; GFX6-NEXT:    s_or_b32 s3, s3, 1
3329; GFX6-NEXT:    v_mov_b32_e32 v6, s3
3330; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v4|, |v2|
3331; GFX6-NEXT:    v_cndmask_b32_e32 v2, 0, v6, vcc
3332; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
3333; GFX6-NEXT:    v_mul_lo_u32 v2, v2, s0
3334; GFX6-NEXT:    s_mov_b32 s0, 0xffff
3335; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s1, v1
3336; GFX6-NEXT:    v_and_b32_e32 v1, s0, v1
3337; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s2, v2
3338; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
3339; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
3340; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
3341; GFX6-NEXT:    v_and_b32_e32 v0, s0, v0
3342; GFX6-NEXT:    v_or_b32_e32 v0, v0, v2
3343; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
3344; GFX6-NEXT:    s_endpgm
3345;
3346; GFX9-LABEL: srem_v4i16:
3347; GFX9:       ; %bb.0:
3348; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
3349; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
3350; GFX9-NEXT:    v_mov_b32_e32 v2, 0
3351; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3352; GFX9-NEXT:    s_sext_i32_i16 s0, s6
3353; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s0
3354; GFX9-NEXT:    s_sext_i32_i16 s1, s4
3355; GFX9-NEXT:    v_cvt_f32_i32_e32 v1, s1
3356; GFX9-NEXT:    s_xor_b32 s0, s1, s0
3357; GFX9-NEXT:    v_rcp_iflag_f32_e32 v3, v0
3358; GFX9-NEXT:    s_ashr_i32 s0, s0, 30
3359; GFX9-NEXT:    s_or_b32 s8, s0, 1
3360; GFX9-NEXT:    v_mul_f32_e32 v3, v1, v3
3361; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
3362; GFX9-NEXT:    v_mad_f32 v1, -v3, v0, v1
3363; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v1|, |v0|
3364; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
3365; GFX9-NEXT:    s_cselect_b32 s0, s8, 0
3366; GFX9-NEXT:    s_ashr_i32 s9, s6, 16
3367; GFX9-NEXT:    v_cvt_i32_f32_e32 v3, v3
3368; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s9
3369; GFX9-NEXT:    s_ashr_i32 s8, s4, 16
3370; GFX9-NEXT:    v_add_u32_e32 v1, s0, v3
3371; GFX9-NEXT:    v_cvt_f32_i32_e32 v3, s8
3372; GFX9-NEXT:    v_rcp_iflag_f32_e32 v4, v0
3373; GFX9-NEXT:    s_xor_b32 s0, s8, s9
3374; GFX9-NEXT:    s_ashr_i32 s0, s0, 30
3375; GFX9-NEXT:    v_mul_lo_u32 v1, v1, s6
3376; GFX9-NEXT:    v_mul_f32_e32 v4, v3, v4
3377; GFX9-NEXT:    v_trunc_f32_e32 v4, v4
3378; GFX9-NEXT:    v_mad_f32 v3, -v4, v0, v3
3379; GFX9-NEXT:    v_cvt_i32_f32_e32 v4, v4
3380; GFX9-NEXT:    s_or_b32 s6, s0, 1
3381; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v3|, |v0|
3382; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
3383; GFX9-NEXT:    s_cselect_b32 s0, s6, 0
3384; GFX9-NEXT:    v_add_u32_e32 v0, s0, v4
3385; GFX9-NEXT:    s_sext_i32_i16 s0, s7
3386; GFX9-NEXT:    v_cvt_f32_i32_e32 v3, s0
3387; GFX9-NEXT:    s_sext_i32_i16 s1, s5
3388; GFX9-NEXT:    v_cvt_f32_i32_e32 v4, s1
3389; GFX9-NEXT:    s_xor_b32 s0, s1, s0
3390; GFX9-NEXT:    v_rcp_iflag_f32_e32 v5, v3
3391; GFX9-NEXT:    s_ashr_i32 s0, s0, 30
3392; GFX9-NEXT:    s_or_b32 s6, s0, 1
3393; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s9
3394; GFX9-NEXT:    v_mul_f32_e32 v5, v4, v5
3395; GFX9-NEXT:    v_trunc_f32_e32 v5, v5
3396; GFX9-NEXT:    v_mad_f32 v4, -v5, v3, v4
3397; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v4|, |v3|
3398; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
3399; GFX9-NEXT:    v_cvt_i32_f32_e32 v5, v5
3400; GFX9-NEXT:    s_cselect_b32 s0, s6, 0
3401; GFX9-NEXT:    s_ashr_i32 s6, s7, 16
3402; GFX9-NEXT:    v_cvt_f32_i32_e32 v4, s6
3403; GFX9-NEXT:    v_add_u32_e32 v3, s0, v5
3404; GFX9-NEXT:    v_mul_lo_u32 v3, v3, s7
3405; GFX9-NEXT:    s_ashr_i32 s7, s5, 16
3406; GFX9-NEXT:    v_cvt_f32_i32_e32 v5, s7
3407; GFX9-NEXT:    v_rcp_iflag_f32_e32 v6, v4
3408; GFX9-NEXT:    s_xor_b32 s0, s7, s6
3409; GFX9-NEXT:    s_ashr_i32 s0, s0, 30
3410; GFX9-NEXT:    s_or_b32 s9, s0, 1
3411; GFX9-NEXT:    v_mul_f32_e32 v6, v5, v6
3412; GFX9-NEXT:    v_trunc_f32_e32 v6, v6
3413; GFX9-NEXT:    v_mad_f32 v5, -v6, v4, v5
3414; GFX9-NEXT:    v_cvt_i32_f32_e32 v6, v6
3415; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v5|, |v4|
3416; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
3417; GFX9-NEXT:    s_cselect_b32 s0, s9, 0
3418; GFX9-NEXT:    v_add_u32_e32 v4, s0, v6
3419; GFX9-NEXT:    v_mul_lo_u32 v4, v4, s6
3420; GFX9-NEXT:    v_sub_u32_e32 v5, s4, v1
3421; GFX9-NEXT:    v_sub_u32_e32 v1, s5, v3
3422; GFX9-NEXT:    v_sub_u32_e32 v0, s8, v0
3423; GFX9-NEXT:    v_sub_u32_e32 v3, s7, v4
3424; GFX9-NEXT:    v_mov_b32_e32 v4, 0xffff
3425; GFX9-NEXT:    v_and_b32_e32 v1, v4, v1
3426; GFX9-NEXT:    v_lshl_or_b32 v1, v3, 16, v1
3427; GFX9-NEXT:    v_and_b32_e32 v3, v4, v5
3428; GFX9-NEXT:    v_lshl_or_b32 v0, v0, 16, v3
3429; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
3430; GFX9-NEXT:    s_endpgm
3431  %r = srem <4 x i16> %x, %y
3432  store <4 x i16> %r, <4 x i16> addrspace(1)* %out
3433  ret void
3434}
3435
; Unsigned division on i3 kernel arguments; the result is stored to %out.
; The IR assertions below show both operands zero-extended to i32, the
; quotient estimated in float as trunc(x * rcp(y)) and incremented by 1 when
; |x - q*y| >= |y|, then masked with 7 and truncated back to i3.
; The assertion lines are autogenerated (see the NOTE lines at the top of this
; file); regenerate them with those scripts rather than editing them by hand.
3436define amdgpu_kernel void @udiv_i3(i3 addrspace(1)* %out, i3 %x, i3 %y) {
3437; CHECK-LABEL: @udiv_i3(
3438; CHECK-NEXT:    [[TMP1:%.*]] = zext i3 [[X:%.*]] to i32
3439; CHECK-NEXT:    [[TMP2:%.*]] = zext i3 [[Y:%.*]] to i32
3440; CHECK-NEXT:    [[TMP3:%.*]] = uitofp i32 [[TMP1]] to float
3441; CHECK-NEXT:    [[TMP4:%.*]] = uitofp i32 [[TMP2]] to float
3442; CHECK-NEXT:    [[TMP5:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP4]])
3443; CHECK-NEXT:    [[TMP6:%.*]] = fmul fast float [[TMP3]], [[TMP5]]
3444; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.trunc.f32(float [[TMP6]])
3445; CHECK-NEXT:    [[TMP8:%.*]] = fneg fast float [[TMP7]]
3446; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP8]], float [[TMP4]], float [[TMP3]])
3447; CHECK-NEXT:    [[TMP10:%.*]] = fptoui float [[TMP7]] to i32
3448; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
3449; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.fabs.f32(float [[TMP4]])
3450; CHECK-NEXT:    [[TMP13:%.*]] = fcmp fast oge float [[TMP11]], [[TMP12]]
3451; CHECK-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 1, i32 0
3452; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]]
3453; CHECK-NEXT:    [[TMP16:%.*]] = and i32 [[TMP15]], 7
3454; CHECK-NEXT:    [[TMP17:%.*]] = trunc i32 [[TMP16]] to i3
3455; CHECK-NEXT:    store i3 [[TMP17]], i3 addrspace(1)* [[OUT:%.*]], align 1
3456; CHECK-NEXT:    ret void
3457;
3458; GFX6-LABEL: udiv_i3:
3459; GFX6:       ; %bb.0:
3460; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
3461; GFX6-NEXT:    s_load_dword s0, s[0:1], 0xb
3462; GFX6-NEXT:    s_mov_b32 s7, 0xf000
3463; GFX6-NEXT:    s_mov_b32 s6, -1
3464; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
3465; GFX6-NEXT:    s_bfe_u32 s1, s0, 0x30008
3466; GFX6-NEXT:    v_cvt_f32_ubyte0_e32 v0, s1
3467; GFX6-NEXT:    v_rcp_iflag_f32_e32 v1, v0
3468; GFX6-NEXT:    s_and_b32 s0, s0, 7
3469; GFX6-NEXT:    v_cvt_f32_ubyte0_e32 v2, s0
3470; GFX6-NEXT:    v_mul_f32_e32 v1, v2, v1
3471; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
3472; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v1
3473; GFX6-NEXT:    v_mad_f32 v1, -v1, v0, v2
3474; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
3475; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, 0, v3, vcc
3476; GFX6-NEXT:    v_and_b32_e32 v0, 7, v0
3477; GFX6-NEXT:    buffer_store_byte v0, off, s[4:7], 0
3478; GFX6-NEXT:    s_endpgm
3479;
3480; GFX9-LABEL: udiv_i3:
3481; GFX9:       ; %bb.0:
3482; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
3483; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x2c
3484; GFX9-NEXT:    v_mov_b32_e32 v2, 0
3485; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3486; GFX9-NEXT:    s_bfe_u32 s0, s4, 0x30008
3487; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v0, s0
3488; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v0
3489; GFX9-NEXT:    s_and_b32 s0, s4, 7
3490; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v3, s0
3491; GFX9-NEXT:    v_mul_f32_e32 v1, v3, v1
3492; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
3493; GFX9-NEXT:    v_cvt_u32_f32_e32 v4, v1
3494; GFX9-NEXT:    v_mad_f32 v1, -v1, v0, v3
3495; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
3496; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v4, vcc
3497; GFX9-NEXT:    v_and_b32_e32 v0, 7, v0
3498; GFX9-NEXT:    global_store_byte v2, v0, s[2:3]
3499; GFX9-NEXT:    s_endpgm
3500  %r = udiv i3 %x, %y
3501  store i3 %r, i3 addrspace(1)* %out
3502  ret void
3503}
3504
; Unsigned remainder on i3 kernel arguments; the result is stored to %out.
; The IR assertions below show both operands zero-extended to i32, the
; quotient estimated in float as trunc(x * rcp(y)) and incremented by 1 when
; |x - q*y| >= |y|; the remainder is then x - q*y, masked with 7 and truncated
; back to i3.
; The assertion lines are autogenerated (see the NOTE lines at the top of this
; file); regenerate them with those scripts rather than editing them by hand.
3505define amdgpu_kernel void @urem_i3(i3 addrspace(1)* %out, i3 %x, i3 %y) {
3506; CHECK-LABEL: @urem_i3(
3507; CHECK-NEXT:    [[TMP1:%.*]] = zext i3 [[X:%.*]] to i32
3508; CHECK-NEXT:    [[TMP2:%.*]] = zext i3 [[Y:%.*]] to i32
3509; CHECK-NEXT:    [[TMP3:%.*]] = uitofp i32 [[TMP1]] to float
3510; CHECK-NEXT:    [[TMP4:%.*]] = uitofp i32 [[TMP2]] to float
3511; CHECK-NEXT:    [[TMP5:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP4]])
3512; CHECK-NEXT:    [[TMP6:%.*]] = fmul fast float [[TMP3]], [[TMP5]]
3513; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.trunc.f32(float [[TMP6]])
3514; CHECK-NEXT:    [[TMP8:%.*]] = fneg fast float [[TMP7]]
3515; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP8]], float [[TMP4]], float [[TMP3]])
3516; CHECK-NEXT:    [[TMP10:%.*]] = fptoui float [[TMP7]] to i32
3517; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
3518; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.fabs.f32(float [[TMP4]])
3519; CHECK-NEXT:    [[TMP13:%.*]] = fcmp fast oge float [[TMP11]], [[TMP12]]
3520; CHECK-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 1, i32 0
3521; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]]
3522; CHECK-NEXT:    [[TMP16:%.*]] = mul i32 [[TMP15]], [[TMP2]]
3523; CHECK-NEXT:    [[TMP17:%.*]] = sub i32 [[TMP1]], [[TMP16]]
3524; CHECK-NEXT:    [[TMP18:%.*]] = and i32 [[TMP17]], 7
3525; CHECK-NEXT:    [[TMP19:%.*]] = trunc i32 [[TMP18]] to i3
3526; CHECK-NEXT:    store i3 [[TMP19]], i3 addrspace(1)* [[OUT:%.*]], align 1
3527; CHECK-NEXT:    ret void
3528;
3529; GFX6-LABEL: urem_i3:
3530; GFX6:       ; %bb.0:
3531; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
3532; GFX6-NEXT:    s_load_dword s0, s[0:1], 0xb
3533; GFX6-NEXT:    s_mov_b32 s7, 0xf000
3534; GFX6-NEXT:    s_mov_b32 s6, -1
3535; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
3536; GFX6-NEXT:    s_bfe_u32 s1, s0, 0x30008
3537; GFX6-NEXT:    v_cvt_f32_ubyte0_e32 v0, s1
3538; GFX6-NEXT:    v_rcp_iflag_f32_e32 v1, v0
3539; GFX6-NEXT:    s_and_b32 s2, s0, 7
3540; GFX6-NEXT:    v_cvt_f32_ubyte0_e32 v2, s2
3541; GFX6-NEXT:    s_lshr_b32 s1, s0, 8
3542; GFX6-NEXT:    v_mul_f32_e32 v1, v2, v1
3543; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
3544; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v1
3545; GFX6-NEXT:    v_mad_f32 v1, -v1, v0, v2
3546; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
3547; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, 0, v3, vcc
3548; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s1
3549; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
3550; GFX6-NEXT:    v_and_b32_e32 v0, 7, v0
3551; GFX6-NEXT:    buffer_store_byte v0, off, s[4:7], 0
3552; GFX6-NEXT:    s_endpgm
3553;
3554; GFX9-LABEL: urem_i3:
3555; GFX9:       ; %bb.0:
3556; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x2c
3557; GFX9-NEXT:    s_nop 0
3558; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3559; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3560; GFX9-NEXT:    s_bfe_u32 s3, s2, 0x30008
3561; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v0, s3
3562; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v0
3563; GFX9-NEXT:    s_and_b32 s4, s2, 7
3564; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v2, s4
3565; GFX9-NEXT:    s_lshr_b32 s3, s2, 8
3566; GFX9-NEXT:    v_mul_f32_e32 v1, v2, v1
3567; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
3568; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v1
3569; GFX9-NEXT:    v_mad_f32 v1, -v1, v0, v2
3570; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
3571; GFX9-NEXT:    v_mov_b32_e32 v1, 0
3572; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v3, vcc
3573; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s3
3574; GFX9-NEXT:    v_sub_u32_e32 v0, s2, v0
3575; GFX9-NEXT:    v_and_b32_e32 v0, 7, v0
3576; GFX9-NEXT:    global_store_byte v1, v0, s[0:1]
3577; GFX9-NEXT:    s_endpgm
3578  %r = urem i3 %x, %y
3579  store i3 %r, i3 addrspace(1)* %out
3580  ret void
3581}
3582
; Signed division on i3 kernel arguments; the result is stored to %out.
; The IR assertions below show both operands sign-extended to i32, the
; quotient estimated in float as trunc(x * rcp(y)), +1 or -1 (sign taken from
; x xor y) added when |x - q*y| >= |y|, and the result sign-extended from its
; low 3 bits (shl/ashr by 29) before being truncated back to i3.
; The assertion lines are autogenerated (see the NOTE lines at the top of this
; file); regenerate them with those scripts rather than editing them by hand.
3583define amdgpu_kernel void @sdiv_i3(i3 addrspace(1)* %out, i3 %x, i3 %y) {
3584; CHECK-LABEL: @sdiv_i3(
3585; CHECK-NEXT:    [[TMP1:%.*]] = sext i3 [[X:%.*]] to i32
3586; CHECK-NEXT:    [[TMP2:%.*]] = sext i3 [[Y:%.*]] to i32
3587; CHECK-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
3588; CHECK-NEXT:    [[TMP4:%.*]] = ashr i32 [[TMP3]], 30
3589; CHECK-NEXT:    [[TMP5:%.*]] = or i32 [[TMP4]], 1
3590; CHECK-NEXT:    [[TMP6:%.*]] = sitofp i32 [[TMP1]] to float
3591; CHECK-NEXT:    [[TMP7:%.*]] = sitofp i32 [[TMP2]] to float
3592; CHECK-NEXT:    [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]])
3593; CHECK-NEXT:    [[TMP9:%.*]] = fmul fast float [[TMP6]], [[TMP8]]
3594; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.trunc.f32(float [[TMP9]])
3595; CHECK-NEXT:    [[TMP11:%.*]] = fneg fast float [[TMP10]]
3596; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP11]], float [[TMP7]], float [[TMP6]])
3597; CHECK-NEXT:    [[TMP13:%.*]] = fptosi float [[TMP10]] to i32
3598; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP12]])
3599; CHECK-NEXT:    [[TMP15:%.*]] = call fast float @llvm.fabs.f32(float [[TMP7]])
3600; CHECK-NEXT:    [[TMP16:%.*]] = fcmp fast oge float [[TMP14]], [[TMP15]]
3601; CHECK-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP5]], i32 0
3602; CHECK-NEXT:    [[TMP18:%.*]] = add i32 [[TMP13]], [[TMP17]]
3603; CHECK-NEXT:    [[TMP19:%.*]] = shl i32 [[TMP18]], 29
3604; CHECK-NEXT:    [[TMP20:%.*]] = ashr i32 [[TMP19]], 29
3605; CHECK-NEXT:    [[TMP21:%.*]] = trunc i32 [[TMP20]] to i3
3606; CHECK-NEXT:    store i3 [[TMP21]], i3 addrspace(1)* [[OUT:%.*]], align 1
3607; CHECK-NEXT:    ret void
3608;
3609; GFX6-LABEL: sdiv_i3:
3610; GFX6:       ; %bb.0:
3611; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
3612; GFX6-NEXT:    s_load_dword s0, s[0:1], 0xb
3613; GFX6-NEXT:    s_mov_b32 s7, 0xf000
3614; GFX6-NEXT:    s_mov_b32 s6, -1
3615; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
3616; GFX6-NEXT:    s_bfe_i32 s1, s0, 0x30008
3617; GFX6-NEXT:    v_cvt_f32_i32_e32 v0, s1
3618; GFX6-NEXT:    s_bfe_i32 s0, s0, 0x30000
3619; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s0
3620; GFX6-NEXT:    s_xor_b32 s0, s0, s1
3621; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v0
3622; GFX6-NEXT:    s_ashr_i32 s0, s0, 30
3623; GFX6-NEXT:    s_or_b32 s0, s0, 1
3624; GFX6-NEXT:    v_mov_b32_e32 v3, s0
3625; GFX6-NEXT:    v_mul_f32_e32 v2, v1, v2
3626; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
3627; GFX6-NEXT:    v_mad_f32 v1, -v2, v0, v1
3628; GFX6-NEXT:    v_cvt_i32_f32_e32 v2, v2
3629; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
3630; GFX6-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
3631; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
3632; GFX6-NEXT:    v_and_b32_e32 v0, 7, v0
3633; GFX6-NEXT:    buffer_store_byte v0, off, s[4:7], 0
3634; GFX6-NEXT:    s_endpgm
3635;
3636; GFX9-LABEL: sdiv_i3:
3637; GFX9:       ; %bb.0:
3638; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
3639; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x2c
3640; GFX9-NEXT:    v_mov_b32_e32 v1, 0
3641; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3642; GFX9-NEXT:    s_bfe_i32 s0, s4, 0x30008
3643; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s0
3644; GFX9-NEXT:    s_bfe_i32 s1, s4, 0x30000
3645; GFX9-NEXT:    v_cvt_f32_i32_e32 v2, s1
3646; GFX9-NEXT:    s_xor_b32 s0, s1, s0
3647; GFX9-NEXT:    v_rcp_iflag_f32_e32 v3, v0
3648; GFX9-NEXT:    s_ashr_i32 s0, s0, 30
3649; GFX9-NEXT:    s_or_b32 s4, s0, 1
3650; GFX9-NEXT:    v_mul_f32_e32 v3, v2, v3
3651; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
3652; GFX9-NEXT:    v_mad_f32 v2, -v3, v0, v2
3653; GFX9-NEXT:    v_cvt_i32_f32_e32 v3, v3
3654; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v2|, |v0|
3655; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
3656; GFX9-NEXT:    s_cselect_b32 s0, s4, 0
3657; GFX9-NEXT:    v_add_u32_e32 v0, s0, v3
3658; GFX9-NEXT:    v_and_b32_e32 v0, 7, v0
3659; GFX9-NEXT:    global_store_byte v1, v0, s[2:3]
3660; GFX9-NEXT:    s_endpgm
3661  %r = sdiv i3 %x, %y
3662  store i3 %r, i3 addrspace(1)* %out
3663  ret void
3664}
3665
; Test srem_i3 - signed remainder of two i3 kernel arguments, stored to %out.
;
; The CHECK-prefixed assertions (opt run with -amdgpu-codegenprepare) expect the
; srem to be expanded in i32: both operands are sign-extended, a quotient is
; estimated with sitofp + llvm.amdgcn.rcp.f32 + llvm.trunc.f32, and it is then
; adjusted by the value "(x ^ y) >> 30 | 1" (i.e. +1 or -1) when the
; fmad/fabs/fcmp sequence reports |residual| >= |y|. The remainder is formed as
; x - q * y, sign-extended from 3 bits via shl/ashr by 29, and truncated to i3.
;
; The GFX6- and GFX9-prefixed assertions pin the llc output for tahiti and
; gfx900 respectively. All assertions are autogenerated; regenerate them with
; the update scripts named at the top of the file instead of editing by hand.
3666define amdgpu_kernel void @srem_i3(i3 addrspace(1)* %out, i3 %x, i3 %y) {
3667; CHECK-LABEL: @srem_i3(
3668; CHECK-NEXT:    [[TMP1:%.*]] = sext i3 [[X:%.*]] to i32
3669; CHECK-NEXT:    [[TMP2:%.*]] = sext i3 [[Y:%.*]] to i32
3670; CHECK-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
3671; CHECK-NEXT:    [[TMP4:%.*]] = ashr i32 [[TMP3]], 30
3672; CHECK-NEXT:    [[TMP5:%.*]] = or i32 [[TMP4]], 1
3673; CHECK-NEXT:    [[TMP6:%.*]] = sitofp i32 [[TMP1]] to float
3674; CHECK-NEXT:    [[TMP7:%.*]] = sitofp i32 [[TMP2]] to float
3675; CHECK-NEXT:    [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]])
3676; CHECK-NEXT:    [[TMP9:%.*]] = fmul fast float [[TMP6]], [[TMP8]]
3677; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.trunc.f32(float [[TMP9]])
3678; CHECK-NEXT:    [[TMP11:%.*]] = fneg fast float [[TMP10]]
3679; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP11]], float [[TMP7]], float [[TMP6]])
3680; CHECK-NEXT:    [[TMP13:%.*]] = fptosi float [[TMP10]] to i32
3681; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP12]])
3682; CHECK-NEXT:    [[TMP15:%.*]] = call fast float @llvm.fabs.f32(float [[TMP7]])
3683; CHECK-NEXT:    [[TMP16:%.*]] = fcmp fast oge float [[TMP14]], [[TMP15]]
3684; CHECK-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP5]], i32 0
3685; CHECK-NEXT:    [[TMP18:%.*]] = add i32 [[TMP13]], [[TMP17]]
3686; CHECK-NEXT:    [[TMP19:%.*]] = mul i32 [[TMP18]], [[TMP2]]
3687; CHECK-NEXT:    [[TMP20:%.*]] = sub i32 [[TMP1]], [[TMP19]]
3688; CHECK-NEXT:    [[TMP21:%.*]] = shl i32 [[TMP20]], 29
3689; CHECK-NEXT:    [[TMP22:%.*]] = ashr i32 [[TMP21]], 29
3690; CHECK-NEXT:    [[TMP23:%.*]] = trunc i32 [[TMP22]] to i3
3691; CHECK-NEXT:    store i3 [[TMP23]], i3 addrspace(1)* [[OUT:%.*]], align 1
3692; CHECK-NEXT:    ret void
3693;
3694; GFX6-LABEL: srem_i3:
3695; GFX6:       ; %bb.0:
3696; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
3697; GFX6-NEXT:    s_load_dword s0, s[0:1], 0xb
3698; GFX6-NEXT:    s_mov_b32 s7, 0xf000
3699; GFX6-NEXT:    s_mov_b32 s6, -1
3700; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
3701; GFX6-NEXT:    s_bfe_i32 s1, s0, 0x30008
3702; GFX6-NEXT:    v_cvt_f32_i32_e32 v0, s1
3703; GFX6-NEXT:    s_bfe_i32 s3, s0, 0x30000
3704; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s3
3705; GFX6-NEXT:    s_xor_b32 s1, s3, s1
3706; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v0
3707; GFX6-NEXT:    s_ashr_i32 s1, s1, 30
3708; GFX6-NEXT:    s_or_b32 s1, s1, 1
3709; GFX6-NEXT:    v_mov_b32_e32 v3, s1
3710; GFX6-NEXT:    v_mul_f32_e32 v2, v1, v2
3711; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
3712; GFX6-NEXT:    v_mad_f32 v1, -v2, v0, v1
3713; GFX6-NEXT:    v_cvt_i32_f32_e32 v2, v2
3714; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
3715; GFX6-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
3716; GFX6-NEXT:    s_lshr_b32 s2, s0, 8
3717; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
3718; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s2
3719; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
3720; GFX6-NEXT:    v_and_b32_e32 v0, 7, v0
3721; GFX6-NEXT:    buffer_store_byte v0, off, s[4:7], 0
3722; GFX6-NEXT:    s_endpgm
3723;
3724; GFX9-LABEL: srem_i3:
3725; GFX9:       ; %bb.0:
3726; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x2c
3727; GFX9-NEXT:    s_nop 0
3728; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3729; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3730; GFX9-NEXT:    s_bfe_i32 s2, s4, 0x30008
3731; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s2
3732; GFX9-NEXT:    s_bfe_i32 s3, s4, 0x30000
3733; GFX9-NEXT:    v_cvt_f32_i32_e32 v1, s3
3734; GFX9-NEXT:    s_xor_b32 s2, s3, s2
3735; GFX9-NEXT:    v_rcp_iflag_f32_e32 v2, v0
3736; GFX9-NEXT:    s_ashr_i32 s2, s2, 30
3737; GFX9-NEXT:    s_lshr_b32 s5, s4, 8
3738; GFX9-NEXT:    s_or_b32 s6, s2, 1
3739; GFX9-NEXT:    v_mul_f32_e32 v2, v1, v2
3740; GFX9-NEXT:    v_trunc_f32_e32 v2, v2
3741; GFX9-NEXT:    v_mad_f32 v1, -v2, v0, v1
3742; GFX9-NEXT:    v_cvt_i32_f32_e32 v2, v2
3743; GFX9-NEXT:    v_cmp_ge_f32_e64 s[2:3], |v1|, |v0|
3744; GFX9-NEXT:    s_cmp_lg_u64 s[2:3], 0
3745; GFX9-NEXT:    s_cselect_b32 s2, s6, 0
3746; GFX9-NEXT:    v_add_u32_e32 v0, s2, v2
3747; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s5
3748; GFX9-NEXT:    v_mov_b32_e32 v1, 0
3749; GFX9-NEXT:    v_sub_u32_e32 v0, s4, v0
3750; GFX9-NEXT:    v_and_b32_e32 v0, 7, v0
3751; GFX9-NEXT:    global_store_byte v1, v0, s[0:1]
3752; GFX9-NEXT:    s_endpgm
3753  %r = srem i3 %x, %y
3754  store i3 %r, i3 addrspace(1)* %out
3755  ret void
3756}
3757
; Test udiv_v3i16 - unsigned division of two <3 x i16> kernel arguments,
; stored to %out.
;
; The CHECK-prefixed assertions (opt run with -amdgpu-codegenprepare) expect the
; vector udiv to be scalarized: for each of the three lanes the operands are
; extracted, zero-extended to i32 and converted with uitofp; a quotient is
; estimated with llvm.amdgcn.rcp.f32 + fmul + llvm.trunc.f32 and incremented by
; 1 when the fmad/fabs/fcmp sequence reports |residual| >= |y|. Each lane result
; is masked with 65535, truncated to i16 and inserted back into the result
; vector, which is stored with align 8.
;
; The GFX6- and GFX9-prefixed assertions pin the llc output for tahiti and
; gfx900 respectively. All assertions are autogenerated; regenerate them with
; the update scripts named at the top of the file instead of editing by hand.
3758define amdgpu_kernel void @udiv_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x, <3 x i16> %y) {
3759; CHECK-LABEL: @udiv_v3i16(
3760; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <3 x i16> [[X:%.*]], i64 0
3761; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <3 x i16> [[Y:%.*]], i64 0
3762; CHECK-NEXT:    [[TMP3:%.*]] = zext i16 [[TMP1]] to i32
3763; CHECK-NEXT:    [[TMP4:%.*]] = zext i16 [[TMP2]] to i32
3764; CHECK-NEXT:    [[TMP5:%.*]] = uitofp i32 [[TMP3]] to float
3765; CHECK-NEXT:    [[TMP6:%.*]] = uitofp i32 [[TMP4]] to float
3766; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP6]])
3767; CHECK-NEXT:    [[TMP8:%.*]] = fmul fast float [[TMP5]], [[TMP7]]
3768; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.trunc.f32(float [[TMP8]])
3769; CHECK-NEXT:    [[TMP10:%.*]] = fneg fast float [[TMP9]]
3770; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP10]], float [[TMP6]], float [[TMP5]])
3771; CHECK-NEXT:    [[TMP12:%.*]] = fptoui float [[TMP9]] to i32
3772; CHECK-NEXT:    [[TMP13:%.*]] = call fast float @llvm.fabs.f32(float [[TMP11]])
3773; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP6]])
3774; CHECK-NEXT:    [[TMP15:%.*]] = fcmp fast oge float [[TMP13]], [[TMP14]]
3775; CHECK-NEXT:    [[TMP16:%.*]] = select i1 [[TMP15]], i32 1, i32 0
3776; CHECK-NEXT:    [[TMP17:%.*]] = add i32 [[TMP12]], [[TMP16]]
3777; CHECK-NEXT:    [[TMP18:%.*]] = and i32 [[TMP17]], 65535
3778; CHECK-NEXT:    [[TMP19:%.*]] = trunc i32 [[TMP18]] to i16
3779; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <3 x i16> undef, i16 [[TMP19]], i64 0
3780; CHECK-NEXT:    [[TMP21:%.*]] = extractelement <3 x i16> [[X]], i64 1
3781; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <3 x i16> [[Y]], i64 1
3782; CHECK-NEXT:    [[TMP23:%.*]] = zext i16 [[TMP21]] to i32
3783; CHECK-NEXT:    [[TMP24:%.*]] = zext i16 [[TMP22]] to i32
3784; CHECK-NEXT:    [[TMP25:%.*]] = uitofp i32 [[TMP23]] to float
3785; CHECK-NEXT:    [[TMP26:%.*]] = uitofp i32 [[TMP24]] to float
3786; CHECK-NEXT:    [[TMP27:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP26]])
3787; CHECK-NEXT:    [[TMP28:%.*]] = fmul fast float [[TMP25]], [[TMP27]]
3788; CHECK-NEXT:    [[TMP29:%.*]] = call fast float @llvm.trunc.f32(float [[TMP28]])
3789; CHECK-NEXT:    [[TMP30:%.*]] = fneg fast float [[TMP29]]
3790; CHECK-NEXT:    [[TMP31:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP30]], float [[TMP26]], float [[TMP25]])
3791; CHECK-NEXT:    [[TMP32:%.*]] = fptoui float [[TMP29]] to i32
3792; CHECK-NEXT:    [[TMP33:%.*]] = call fast float @llvm.fabs.f32(float [[TMP31]])
3793; CHECK-NEXT:    [[TMP34:%.*]] = call fast float @llvm.fabs.f32(float [[TMP26]])
3794; CHECK-NEXT:    [[TMP35:%.*]] = fcmp fast oge float [[TMP33]], [[TMP34]]
3795; CHECK-NEXT:    [[TMP36:%.*]] = select i1 [[TMP35]], i32 1, i32 0
3796; CHECK-NEXT:    [[TMP37:%.*]] = add i32 [[TMP32]], [[TMP36]]
3797; CHECK-NEXT:    [[TMP38:%.*]] = and i32 [[TMP37]], 65535
3798; CHECK-NEXT:    [[TMP39:%.*]] = trunc i32 [[TMP38]] to i16
3799; CHECK-NEXT:    [[TMP40:%.*]] = insertelement <3 x i16> [[TMP20]], i16 [[TMP39]], i64 1
3800; CHECK-NEXT:    [[TMP41:%.*]] = extractelement <3 x i16> [[X]], i64 2
3801; CHECK-NEXT:    [[TMP42:%.*]] = extractelement <3 x i16> [[Y]], i64 2
3802; CHECK-NEXT:    [[TMP43:%.*]] = zext i16 [[TMP41]] to i32
3803; CHECK-NEXT:    [[TMP44:%.*]] = zext i16 [[TMP42]] to i32
3804; CHECK-NEXT:    [[TMP45:%.*]] = uitofp i32 [[TMP43]] to float
3805; CHECK-NEXT:    [[TMP46:%.*]] = uitofp i32 [[TMP44]] to float
3806; CHECK-NEXT:    [[TMP47:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP46]])
3807; CHECK-NEXT:    [[TMP48:%.*]] = fmul fast float [[TMP45]], [[TMP47]]
3808; CHECK-NEXT:    [[TMP49:%.*]] = call fast float @llvm.trunc.f32(float [[TMP48]])
3809; CHECK-NEXT:    [[TMP50:%.*]] = fneg fast float [[TMP49]]
3810; CHECK-NEXT:    [[TMP51:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP50]], float [[TMP46]], float [[TMP45]])
3811; CHECK-NEXT:    [[TMP52:%.*]] = fptoui float [[TMP49]] to i32
3812; CHECK-NEXT:    [[TMP53:%.*]] = call fast float @llvm.fabs.f32(float [[TMP51]])
3813; CHECK-NEXT:    [[TMP54:%.*]] = call fast float @llvm.fabs.f32(float [[TMP46]])
3814; CHECK-NEXT:    [[TMP55:%.*]] = fcmp fast oge float [[TMP53]], [[TMP54]]
3815; CHECK-NEXT:    [[TMP56:%.*]] = select i1 [[TMP55]], i32 1, i32 0
3816; CHECK-NEXT:    [[TMP57:%.*]] = add i32 [[TMP52]], [[TMP56]]
3817; CHECK-NEXT:    [[TMP58:%.*]] = and i32 [[TMP57]], 65535
3818; CHECK-NEXT:    [[TMP59:%.*]] = trunc i32 [[TMP58]] to i16
3819; CHECK-NEXT:    [[TMP60:%.*]] = insertelement <3 x i16> [[TMP40]], i16 [[TMP59]], i64 2
3820; CHECK-NEXT:    store <3 x i16> [[TMP60]], <3 x i16> addrspace(1)* [[OUT:%.*]], align 8
3821; CHECK-NEXT:    ret void
3822;
3823; GFX6-LABEL: udiv_v3i16:
3824; GFX6:       ; %bb.0:
3825; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
3826; GFX6-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
3827; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
3828; GFX6-NEXT:    s_mov_b32 s8, 0xffff
3829; GFX6-NEXT:    s_mov_b32 s7, 0xf000
3830; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
3831; GFX6-NEXT:    s_and_b32 s6, s0, s8
3832; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s6
3833; GFX6-NEXT:    s_and_b32 s6, s2, s8
3834; GFX6-NEXT:    s_lshr_b32 s0, s0, 16
3835; GFX6-NEXT:    v_cvt_f32_u32_e32 v3, s0
3836; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s6
3837; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v0
3838; GFX6-NEXT:    s_lshr_b32 s0, s2, 16
3839; GFX6-NEXT:    v_cvt_f32_u32_e32 v4, s0
3840; GFX6-NEXT:    v_rcp_iflag_f32_e32 v5, v3
3841; GFX6-NEXT:    v_mul_f32_e32 v2, v1, v2
3842; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
3843; GFX6-NEXT:    v_mad_f32 v1, -v2, v0, v1
3844; GFX6-NEXT:    v_cvt_u32_f32_e32 v2, v2
3845; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
3846; GFX6-NEXT:    v_mul_f32_e32 v1, v4, v5
3847; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
3848; GFX6-NEXT:    s_and_b32 s0, s1, s8
3849; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, 0, v2, vcc
3850; GFX6-NEXT:    v_mad_f32 v2, -v1, v3, v4
3851; GFX6-NEXT:    v_cvt_f32_u32_e32 v4, s0
3852; GFX6-NEXT:    s_and_b32 s0, s3, s8
3853; GFX6-NEXT:    v_cvt_f32_u32_e32 v5, s0
3854; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
3855; GFX6-NEXT:    v_rcp_iflag_f32_e32 v6, v4
3856; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v3
3857; GFX6-NEXT:    s_mov_b32 s6, -1
3858; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
3859; GFX6-NEXT:    v_mul_f32_e32 v2, v5, v6
3860; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
3861; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v2
3862; GFX6-NEXT:    v_mad_f32 v2, -v2, v4, v5
3863; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v4
3864; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
3865; GFX6-NEXT:    v_and_b32_e32 v0, s8, v0
3866; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, 0, v3, vcc
3867; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
3868; GFX6-NEXT:    buffer_store_short v2, off, s[4:7], 0 offset:4
3869; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
3870; GFX6-NEXT:    s_endpgm
3871;
3872; GFX9-LABEL: udiv_v3i16:
3873; GFX9:       ; %bb.0:
3874; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
3875; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
3876; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x34
3877; GFX9-NEXT:    s_mov_b32 s8, 0xffff
3878; GFX9-NEXT:    v_mov_b32_e32 v1, 0
3879; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3880; GFX9-NEXT:    s_and_b32 s0, s6, s8
3881; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s0
3882; GFX9-NEXT:    s_and_b32 s0, s4, s8
3883; GFX9-NEXT:    s_lshr_b32 s1, s6, 16
3884; GFX9-NEXT:    v_cvt_f32_u32_e32 v2, s0
3885; GFX9-NEXT:    v_rcp_iflag_f32_e32 v3, v0
3886; GFX9-NEXT:    v_cvt_f32_u32_e32 v4, s1
3887; GFX9-NEXT:    s_lshr_b32 s0, s4, 16
3888; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, s0
3889; GFX9-NEXT:    v_mul_f32_e32 v3, v2, v3
3890; GFX9-NEXT:    v_rcp_iflag_f32_e32 v6, v4
3891; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
3892; GFX9-NEXT:    v_mad_f32 v2, -v3, v0, v2
3893; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v3
3894; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v0
3895; GFX9-NEXT:    v_mul_f32_e32 v2, v5, v6
3896; GFX9-NEXT:    v_trunc_f32_e32 v2, v2
3897; GFX9-NEXT:    s_and_b32 s0, s7, s8
3898; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v3, vcc
3899; GFX9-NEXT:    v_mad_f32 v3, -v2, v4, v5
3900; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, s0
3901; GFX9-NEXT:    s_and_b32 s0, s5, s8
3902; GFX9-NEXT:    v_cvt_f32_u32_e32 v6, s0
3903; GFX9-NEXT:    v_cvt_u32_f32_e32 v2, v2
3904; GFX9-NEXT:    v_rcp_iflag_f32_e32 v7, v5
3905; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v4
3906; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
3907; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, 0, v2, vcc
3908; GFX9-NEXT:    v_mul_f32_e32 v3, v6, v7
3909; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
3910; GFX9-NEXT:    v_cvt_u32_f32_e32 v4, v3
3911; GFX9-NEXT:    v_mad_f32 v3, -v3, v5, v6
3912; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v5
3913; GFX9-NEXT:    v_lshl_or_b32 v0, v2, 16, v0
3914; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
3915; GFX9-NEXT:    global_store_short v1, v3, s[2:3] offset:4
3916; GFX9-NEXT:    global_store_dword v1, v0, s[2:3]
3917; GFX9-NEXT:    s_endpgm
3918  %r = udiv <3 x i16> %x, %y
3919  store <3 x i16> %r, <3 x i16> addrspace(1)* %out
3920  ret void
3921}
3922
; Test urem_v3i16 - unsigned remainder of two <3 x i16> kernel arguments,
; stored to %out.
;
; The CHECK-prefixed assertions (opt run with -amdgpu-codegenprepare) expect the
; vector urem to be scalarized: for each of the three lanes the operands are
; extracted, zero-extended to i32 and converted with uitofp; a quotient is
; estimated with llvm.amdgcn.rcp.f32 + fmul + llvm.trunc.f32 and incremented by
; 1 when the fmad/fabs/fcmp sequence reports |residual| >= |y|. The lane
; remainder is then formed as x - q * y, masked with 65535, truncated to i16
; and inserted back into the result vector, which is stored with align 8.
;
; The GFX6- and GFX9-prefixed assertions pin the llc output for tahiti and
; gfx900 respectively. All assertions are autogenerated; regenerate them with
; the update scripts named at the top of the file instead of editing by hand.
3923define amdgpu_kernel void @urem_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x, <3 x i16> %y) {
3924; CHECK-LABEL: @urem_v3i16(
3925; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <3 x i16> [[X:%.*]], i64 0
3926; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <3 x i16> [[Y:%.*]], i64 0
3927; CHECK-NEXT:    [[TMP3:%.*]] = zext i16 [[TMP1]] to i32
3928; CHECK-NEXT:    [[TMP4:%.*]] = zext i16 [[TMP2]] to i32
3929; CHECK-NEXT:    [[TMP5:%.*]] = uitofp i32 [[TMP3]] to float
3930; CHECK-NEXT:    [[TMP6:%.*]] = uitofp i32 [[TMP4]] to float
3931; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP6]])
3932; CHECK-NEXT:    [[TMP8:%.*]] = fmul fast float [[TMP5]], [[TMP7]]
3933; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.trunc.f32(float [[TMP8]])
3934; CHECK-NEXT:    [[TMP10:%.*]] = fneg fast float [[TMP9]]
3935; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP10]], float [[TMP6]], float [[TMP5]])
3936; CHECK-NEXT:    [[TMP12:%.*]] = fptoui float [[TMP9]] to i32
3937; CHECK-NEXT:    [[TMP13:%.*]] = call fast float @llvm.fabs.f32(float [[TMP11]])
3938; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP6]])
3939; CHECK-NEXT:    [[TMP15:%.*]] = fcmp fast oge float [[TMP13]], [[TMP14]]
3940; CHECK-NEXT:    [[TMP16:%.*]] = select i1 [[TMP15]], i32 1, i32 0
3941; CHECK-NEXT:    [[TMP17:%.*]] = add i32 [[TMP12]], [[TMP16]]
3942; CHECK-NEXT:    [[TMP18:%.*]] = mul i32 [[TMP17]], [[TMP4]]
3943; CHECK-NEXT:    [[TMP19:%.*]] = sub i32 [[TMP3]], [[TMP18]]
3944; CHECK-NEXT:    [[TMP20:%.*]] = and i32 [[TMP19]], 65535
3945; CHECK-NEXT:    [[TMP21:%.*]] = trunc i32 [[TMP20]] to i16
3946; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <3 x i16> undef, i16 [[TMP21]], i64 0
3947; CHECK-NEXT:    [[TMP23:%.*]] = extractelement <3 x i16> [[X]], i64 1
3948; CHECK-NEXT:    [[TMP24:%.*]] = extractelement <3 x i16> [[Y]], i64 1
3949; CHECK-NEXT:    [[TMP25:%.*]] = zext i16 [[TMP23]] to i32
3950; CHECK-NEXT:    [[TMP26:%.*]] = zext i16 [[TMP24]] to i32
3951; CHECK-NEXT:    [[TMP27:%.*]] = uitofp i32 [[TMP25]] to float
3952; CHECK-NEXT:    [[TMP28:%.*]] = uitofp i32 [[TMP26]] to float
3953; CHECK-NEXT:    [[TMP29:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP28]])
3954; CHECK-NEXT:    [[TMP30:%.*]] = fmul fast float [[TMP27]], [[TMP29]]
3955; CHECK-NEXT:    [[TMP31:%.*]] = call fast float @llvm.trunc.f32(float [[TMP30]])
3956; CHECK-NEXT:    [[TMP32:%.*]] = fneg fast float [[TMP31]]
3957; CHECK-NEXT:    [[TMP33:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP32]], float [[TMP28]], float [[TMP27]])
3958; CHECK-NEXT:    [[TMP34:%.*]] = fptoui float [[TMP31]] to i32
3959; CHECK-NEXT:    [[TMP35:%.*]] = call fast float @llvm.fabs.f32(float [[TMP33]])
3960; CHECK-NEXT:    [[TMP36:%.*]] = call fast float @llvm.fabs.f32(float [[TMP28]])
3961; CHECK-NEXT:    [[TMP37:%.*]] = fcmp fast oge float [[TMP35]], [[TMP36]]
3962; CHECK-NEXT:    [[TMP38:%.*]] = select i1 [[TMP37]], i32 1, i32 0
3963; CHECK-NEXT:    [[TMP39:%.*]] = add i32 [[TMP34]], [[TMP38]]
3964; CHECK-NEXT:    [[TMP40:%.*]] = mul i32 [[TMP39]], [[TMP26]]
3965; CHECK-NEXT:    [[TMP41:%.*]] = sub i32 [[TMP25]], [[TMP40]]
3966; CHECK-NEXT:    [[TMP42:%.*]] = and i32 [[TMP41]], 65535
3967; CHECK-NEXT:    [[TMP43:%.*]] = trunc i32 [[TMP42]] to i16
3968; CHECK-NEXT:    [[TMP44:%.*]] = insertelement <3 x i16> [[TMP22]], i16 [[TMP43]], i64 1
3969; CHECK-NEXT:    [[TMP45:%.*]] = extractelement <3 x i16> [[X]], i64 2
3970; CHECK-NEXT:    [[TMP46:%.*]] = extractelement <3 x i16> [[Y]], i64 2
3971; CHECK-NEXT:    [[TMP47:%.*]] = zext i16 [[TMP45]] to i32
3972; CHECK-NEXT:    [[TMP48:%.*]] = zext i16 [[TMP46]] to i32
3973; CHECK-NEXT:    [[TMP49:%.*]] = uitofp i32 [[TMP47]] to float
3974; CHECK-NEXT:    [[TMP50:%.*]] = uitofp i32 [[TMP48]] to float
3975; CHECK-NEXT:    [[TMP51:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP50]])
3976; CHECK-NEXT:    [[TMP52:%.*]] = fmul fast float [[TMP49]], [[TMP51]]
3977; CHECK-NEXT:    [[TMP53:%.*]] = call fast float @llvm.trunc.f32(float [[TMP52]])
3978; CHECK-NEXT:    [[TMP54:%.*]] = fneg fast float [[TMP53]]
3979; CHECK-NEXT:    [[TMP55:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP54]], float [[TMP50]], float [[TMP49]])
3980; CHECK-NEXT:    [[TMP56:%.*]] = fptoui float [[TMP53]] to i32
3981; CHECK-NEXT:    [[TMP57:%.*]] = call fast float @llvm.fabs.f32(float [[TMP55]])
3982; CHECK-NEXT:    [[TMP58:%.*]] = call fast float @llvm.fabs.f32(float [[TMP50]])
3983; CHECK-NEXT:    [[TMP59:%.*]] = fcmp fast oge float [[TMP57]], [[TMP58]]
3984; CHECK-NEXT:    [[TMP60:%.*]] = select i1 [[TMP59]], i32 1, i32 0
3985; CHECK-NEXT:    [[TMP61:%.*]] = add i32 [[TMP56]], [[TMP60]]
3986; CHECK-NEXT:    [[TMP62:%.*]] = mul i32 [[TMP61]], [[TMP48]]
3987; CHECK-NEXT:    [[TMP63:%.*]] = sub i32 [[TMP47]], [[TMP62]]
3988; CHECK-NEXT:    [[TMP64:%.*]] = and i32 [[TMP63]], 65535
3989; CHECK-NEXT:    [[TMP65:%.*]] = trunc i32 [[TMP64]] to i16
3990; CHECK-NEXT:    [[TMP66:%.*]] = insertelement <3 x i16> [[TMP44]], i16 [[TMP65]], i64 2
3991; CHECK-NEXT:    store <3 x i16> [[TMP66]], <3 x i16> addrspace(1)* [[OUT:%.*]], align 8
3992; CHECK-NEXT:    ret void
3993;
3994; GFX6-LABEL: urem_v3i16:
3995; GFX6:       ; %bb.0:
3996; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
3997; GFX6-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
3998; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
3999; GFX6-NEXT:    s_mov_b32 s8, 0xffff
4000; GFX6-NEXT:    s_mov_b32 s7, 0xf000
4001; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
4002; GFX6-NEXT:    v_mov_b32_e32 v1, s2
4003; GFX6-NEXT:    s_and_b32 s6, s0, s8
4004; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s6
4005; GFX6-NEXT:    s_and_b32 s6, s2, s8
4006; GFX6-NEXT:    v_cvt_f32_u32_e32 v2, s6
4007; GFX6-NEXT:    v_mov_b32_e32 v4, s0
4008; GFX6-NEXT:    v_rcp_iflag_f32_e32 v3, v0
4009; GFX6-NEXT:    v_alignbit_b32 v4, s1, v4, 16
4010; GFX6-NEXT:    v_and_b32_e32 v5, s8, v4
4011; GFX6-NEXT:    v_alignbit_b32 v1, s3, v1, 16
4012; GFX6-NEXT:    v_mul_f32_e32 v3, v2, v3
4013; GFX6-NEXT:    v_trunc_f32_e32 v3, v3
4014; GFX6-NEXT:    v_mad_f32 v2, -v3, v0, v2
4015; GFX6-NEXT:    v_cvt_u32_f32_e32 v6, v3
4016; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v0
4017; GFX6-NEXT:    v_cvt_f32_u32_e32 v2, v5
4018; GFX6-NEXT:    v_and_b32_e32 v3, s8, v1
4019; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, 0, v6, vcc
4020; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s0
4021; GFX6-NEXT:    s_and_b32 s0, s1, s8
4022; GFX6-NEXT:    v_cvt_f32_u32_e32 v3, v3
4023; GFX6-NEXT:    v_rcp_iflag_f32_e32 v5, v2
4024; GFX6-NEXT:    v_cvt_f32_u32_e32 v6, s0
4025; GFX6-NEXT:    s_and_b32 s0, s3, s8
4026; GFX6-NEXT:    v_cvt_f32_u32_e32 v7, s0
4027; GFX6-NEXT:    v_mul_f32_e32 v5, v3, v5
4028; GFX6-NEXT:    v_trunc_f32_e32 v5, v5
4029; GFX6-NEXT:    v_rcp_iflag_f32_e32 v8, v6
4030; GFX6-NEXT:    v_mad_f32 v3, -v5, v2, v3
4031; GFX6-NEXT:    v_cvt_u32_f32_e32 v5, v5
4032; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s2, v0
4033; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v2
4034; GFX6-NEXT:    v_mul_f32_e32 v3, v7, v8
4035; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, 0, v5, vcc
4036; GFX6-NEXT:    v_trunc_f32_e32 v3, v3
4037; GFX6-NEXT:    v_mul_lo_u32 v2, v2, v4
4038; GFX6-NEXT:    v_cvt_u32_f32_e32 v4, v3
4039; GFX6-NEXT:    v_mad_f32 v3, -v3, v6, v7
4040; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v6
4041; GFX6-NEXT:    s_mov_b32 s6, -1
4042; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
4043; GFX6-NEXT:    v_mul_lo_u32 v3, v3, s1
4044; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v2
4045; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
4046; GFX6-NEXT:    v_and_b32_e32 v0, s8, v0
4047; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s3, v3
4048; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
4049; GFX6-NEXT:    buffer_store_short v2, off, s[4:7], 0 offset:4
4050; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
4051; GFX6-NEXT:    s_endpgm
4052;
4053; GFX9-LABEL: urem_v3i16:
4054; GFX9:       ; %bb.0:
4055; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
4056; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
4057; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x34
4058; GFX9-NEXT:    s_mov_b32 s8, 0xffff
4059; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4060; GFX9-NEXT:    s_and_b32 s0, s4, s8
4061; GFX9-NEXT:    s_and_b32 s1, s6, s8
4062; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s1
4063; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s0
4064; GFX9-NEXT:    s_lshr_b32 s6, s6, 16
4065; GFX9-NEXT:    v_cvt_f32_u32_e32 v2, s6
4066; GFX9-NEXT:    v_rcp_iflag_f32_e32 v3, v0
4067; GFX9-NEXT:    s_lshr_b32 s4, s4, 16
4068; GFX9-NEXT:    v_cvt_f32_u32_e32 v4, s4
4069; GFX9-NEXT:    v_rcp_iflag_f32_e32 v5, v2
4070; GFX9-NEXT:    v_mul_f32_e32 v3, v1, v3
4071; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
4072; GFX9-NEXT:    v_cvt_u32_f32_e32 v6, v3
4073; GFX9-NEXT:    v_mad_f32 v1, -v3, v0, v1
4074; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
4075; GFX9-NEXT:    v_mul_f32_e32 v1, v4, v5
4076; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v6, vcc
4077; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s1
4078; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
4079; GFX9-NEXT:    s_and_b32 s1, s7, s8
4080; GFX9-NEXT:    v_mad_f32 v3, -v1, v2, v4
4081; GFX9-NEXT:    v_cvt_f32_u32_e32 v4, s1
4082; GFX9-NEXT:    s_and_b32 s5, s5, s8
4083; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, s5
4084; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
4085; GFX9-NEXT:    v_rcp_iflag_f32_e32 v6, v4
4086; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v2
4087; GFX9-NEXT:    v_sub_u32_e32 v0, s0, v0
4088; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
4089; GFX9-NEXT:    v_mul_f32_e32 v2, v5, v6
4090; GFX9-NEXT:    v_trunc_f32_e32 v2, v2
4091; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v2
4092; GFX9-NEXT:    v_mad_f32 v2, -v2, v4, v5
4093; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v4
4094; GFX9-NEXT:    v_mul_lo_u32 v1, v1, s6
4095; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, 0, v3, vcc
4096; GFX9-NEXT:    v_mul_lo_u32 v2, v2, s1
4097; GFX9-NEXT:    v_mov_b32_e32 v3, 0
4098; GFX9-NEXT:    v_sub_u32_e32 v1, s4, v1
4099; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
4100; GFX9-NEXT:    v_sub_u32_e32 v2, s5, v2
4101; GFX9-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
4102; GFX9-NEXT:    global_store_short v3, v2, s[2:3] offset:4
4103; GFX9-NEXT:    global_store_dword v3, v0, s[2:3]
4104; GFX9-NEXT:    s_endpgm
4105  %r = urem <3 x i16> %x, %y
4106  store <3 x i16> %r, <3 x i16> addrspace(1)* %out
4107  ret void
4108}
4109
4110define amdgpu_kernel void @sdiv_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x, <3 x i16> %y) {
4111; CHECK-LABEL: @sdiv_v3i16(
4112; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <3 x i16> [[X:%.*]], i64 0
4113; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <3 x i16> [[Y:%.*]], i64 0
4114; CHECK-NEXT:    [[TMP3:%.*]] = sext i16 [[TMP1]] to i32
4115; CHECK-NEXT:    [[TMP4:%.*]] = sext i16 [[TMP2]] to i32
4116; CHECK-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]]
4117; CHECK-NEXT:    [[TMP6:%.*]] = ashr i32 [[TMP5]], 30
4118; CHECK-NEXT:    [[TMP7:%.*]] = or i32 [[TMP6]], 1
4119; CHECK-NEXT:    [[TMP8:%.*]] = sitofp i32 [[TMP3]] to float
4120; CHECK-NEXT:    [[TMP9:%.*]] = sitofp i32 [[TMP4]] to float
4121; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]])
4122; CHECK-NEXT:    [[TMP11:%.*]] = fmul fast float [[TMP8]], [[TMP10]]
4123; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.trunc.f32(float [[TMP11]])
4124; CHECK-NEXT:    [[TMP13:%.*]] = fneg fast float [[TMP12]]
4125; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP13]], float [[TMP9]], float [[TMP8]])
4126; CHECK-NEXT:    [[TMP15:%.*]] = fptosi float [[TMP12]] to i32
4127; CHECK-NEXT:    [[TMP16:%.*]] = call fast float @llvm.fabs.f32(float [[TMP14]])
4128; CHECK-NEXT:    [[TMP17:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
4129; CHECK-NEXT:    [[TMP18:%.*]] = fcmp fast oge float [[TMP16]], [[TMP17]]
4130; CHECK-NEXT:    [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP7]], i32 0
4131; CHECK-NEXT:    [[TMP20:%.*]] = add i32 [[TMP15]], [[TMP19]]
4132; CHECK-NEXT:    [[TMP21:%.*]] = shl i32 [[TMP20]], 16
4133; CHECK-NEXT:    [[TMP22:%.*]] = ashr i32 [[TMP21]], 16
4134; CHECK-NEXT:    [[TMP23:%.*]] = trunc i32 [[TMP22]] to i16
4135; CHECK-NEXT:    [[TMP24:%.*]] = insertelement <3 x i16> undef, i16 [[TMP23]], i64 0
4136; CHECK-NEXT:    [[TMP25:%.*]] = extractelement <3 x i16> [[X]], i64 1
4137; CHECK-NEXT:    [[TMP26:%.*]] = extractelement <3 x i16> [[Y]], i64 1
4138; CHECK-NEXT:    [[TMP27:%.*]] = sext i16 [[TMP25]] to i32
4139; CHECK-NEXT:    [[TMP28:%.*]] = sext i16 [[TMP26]] to i32
4140; CHECK-NEXT:    [[TMP29:%.*]] = xor i32 [[TMP27]], [[TMP28]]
4141; CHECK-NEXT:    [[TMP30:%.*]] = ashr i32 [[TMP29]], 30
4142; CHECK-NEXT:    [[TMP31:%.*]] = or i32 [[TMP30]], 1
4143; CHECK-NEXT:    [[TMP32:%.*]] = sitofp i32 [[TMP27]] to float
4144; CHECK-NEXT:    [[TMP33:%.*]] = sitofp i32 [[TMP28]] to float
4145; CHECK-NEXT:    [[TMP34:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP33]])
4146; CHECK-NEXT:    [[TMP35:%.*]] = fmul fast float [[TMP32]], [[TMP34]]
4147; CHECK-NEXT:    [[TMP36:%.*]] = call fast float @llvm.trunc.f32(float [[TMP35]])
4148; CHECK-NEXT:    [[TMP37:%.*]] = fneg fast float [[TMP36]]
4149; CHECK-NEXT:    [[TMP38:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP37]], float [[TMP33]], float [[TMP32]])
4150; CHECK-NEXT:    [[TMP39:%.*]] = fptosi float [[TMP36]] to i32
4151; CHECK-NEXT:    [[TMP40:%.*]] = call fast float @llvm.fabs.f32(float [[TMP38]])
4152; CHECK-NEXT:    [[TMP41:%.*]] = call fast float @llvm.fabs.f32(float [[TMP33]])
4153; CHECK-NEXT:    [[TMP42:%.*]] = fcmp fast oge float [[TMP40]], [[TMP41]]
4154; CHECK-NEXT:    [[TMP43:%.*]] = select i1 [[TMP42]], i32 [[TMP31]], i32 0
4155; CHECK-NEXT:    [[TMP44:%.*]] = add i32 [[TMP39]], [[TMP43]]
4156; CHECK-NEXT:    [[TMP45:%.*]] = shl i32 [[TMP44]], 16
4157; CHECK-NEXT:    [[TMP46:%.*]] = ashr i32 [[TMP45]], 16
4158; CHECK-NEXT:    [[TMP47:%.*]] = trunc i32 [[TMP46]] to i16
4159; CHECK-NEXT:    [[TMP48:%.*]] = insertelement <3 x i16> [[TMP24]], i16 [[TMP47]], i64 1
4160; CHECK-NEXT:    [[TMP49:%.*]] = extractelement <3 x i16> [[X]], i64 2
4161; CHECK-NEXT:    [[TMP50:%.*]] = extractelement <3 x i16> [[Y]], i64 2
4162; CHECK-NEXT:    [[TMP51:%.*]] = sext i16 [[TMP49]] to i32
4163; CHECK-NEXT:    [[TMP52:%.*]] = sext i16 [[TMP50]] to i32
4164; CHECK-NEXT:    [[TMP53:%.*]] = xor i32 [[TMP51]], [[TMP52]]
4165; CHECK-NEXT:    [[TMP54:%.*]] = ashr i32 [[TMP53]], 30
4166; CHECK-NEXT:    [[TMP55:%.*]] = or i32 [[TMP54]], 1
4167; CHECK-NEXT:    [[TMP56:%.*]] = sitofp i32 [[TMP51]] to float
4168; CHECK-NEXT:    [[TMP57:%.*]] = sitofp i32 [[TMP52]] to float
4169; CHECK-NEXT:    [[TMP58:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP57]])
4170; CHECK-NEXT:    [[TMP59:%.*]] = fmul fast float [[TMP56]], [[TMP58]]
4171; CHECK-NEXT:    [[TMP60:%.*]] = call fast float @llvm.trunc.f32(float [[TMP59]])
4172; CHECK-NEXT:    [[TMP61:%.*]] = fneg fast float [[TMP60]]
4173; CHECK-NEXT:    [[TMP62:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP61]], float [[TMP57]], float [[TMP56]])
4174; CHECK-NEXT:    [[TMP63:%.*]] = fptosi float [[TMP60]] to i32
4175; CHECK-NEXT:    [[TMP64:%.*]] = call fast float @llvm.fabs.f32(float [[TMP62]])
4176; CHECK-NEXT:    [[TMP65:%.*]] = call fast float @llvm.fabs.f32(float [[TMP57]])
4177; CHECK-NEXT:    [[TMP66:%.*]] = fcmp fast oge float [[TMP64]], [[TMP65]]
4178; CHECK-NEXT:    [[TMP67:%.*]] = select i1 [[TMP66]], i32 [[TMP55]], i32 0
4179; CHECK-NEXT:    [[TMP68:%.*]] = add i32 [[TMP63]], [[TMP67]]
4180; CHECK-NEXT:    [[TMP69:%.*]] = shl i32 [[TMP68]], 16
4181; CHECK-NEXT:    [[TMP70:%.*]] = ashr i32 [[TMP69]], 16
4182; CHECK-NEXT:    [[TMP71:%.*]] = trunc i32 [[TMP70]] to i16
4183; CHECK-NEXT:    [[TMP72:%.*]] = insertelement <3 x i16> [[TMP48]], i16 [[TMP71]], i64 2
4184; CHECK-NEXT:    store <3 x i16> [[TMP72]], <3 x i16> addrspace(1)* [[OUT:%.*]], align 8
4185; CHECK-NEXT:    ret void
4186;
4187; GFX6-LABEL: sdiv_v3i16:
4188; GFX6:       ; %bb.0:
4189; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
4190; GFX6-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
4191; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
4192; GFX6-NEXT:    s_mov_b32 s7, 0xf000
4193; GFX6-NEXT:    s_mov_b32 s6, -1
4194; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
4195; GFX6-NEXT:    s_sext_i32_i16 s9, s2
4196; GFX6-NEXT:    s_sext_i32_i16 s8, s0
4197; GFX6-NEXT:    v_cvt_f32_i32_e32 v0, s8
4198; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s9
4199; GFX6-NEXT:    s_xor_b32 s8, s9, s8
4200; GFX6-NEXT:    s_ashr_i32 s0, s0, 16
4201; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v0
4202; GFX6-NEXT:    s_ashr_i32 s8, s8, 30
4203; GFX6-NEXT:    s_or_b32 s8, s8, 1
4204; GFX6-NEXT:    v_mov_b32_e32 v3, s8
4205; GFX6-NEXT:    v_mul_f32_e32 v2, v1, v2
4206; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
4207; GFX6-NEXT:    v_mad_f32 v1, -v2, v0, v1
4208; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
4209; GFX6-NEXT:    v_cvt_i32_f32_e32 v2, v2
4210; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s0
4211; GFX6-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
4212; GFX6-NEXT:    s_ashr_i32 s2, s2, 16
4213; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
4214; GFX6-NEXT:    v_cvt_f32_i32_e32 v2, s2
4215; GFX6-NEXT:    v_rcp_iflag_f32_e32 v3, v1
4216; GFX6-NEXT:    s_xor_b32 s0, s2, s0
4217; GFX6-NEXT:    s_ashr_i32 s0, s0, 30
4218; GFX6-NEXT:    s_or_b32 s0, s0, 1
4219; GFX6-NEXT:    v_mul_f32_e32 v3, v2, v3
4220; GFX6-NEXT:    v_trunc_f32_e32 v3, v3
4221; GFX6-NEXT:    v_mad_f32 v2, -v3, v1, v2
4222; GFX6-NEXT:    v_mov_b32_e32 v4, s0
4223; GFX6-NEXT:    s_sext_i32_i16 s0, s1
4224; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, |v1|
4225; GFX6-NEXT:    v_cvt_i32_f32_e32 v3, v3
4226; GFX6-NEXT:    v_cvt_f32_i32_e32 v2, s0
4227; GFX6-NEXT:    v_cndmask_b32_e32 v1, 0, v4, vcc
4228; GFX6-NEXT:    s_sext_i32_i16 s1, s3
4229; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
4230; GFX6-NEXT:    v_cvt_f32_i32_e32 v3, s1
4231; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v2
4232; GFX6-NEXT:    s_xor_b32 s0, s1, s0
4233; GFX6-NEXT:    s_ashr_i32 s0, s0, 30
4234; GFX6-NEXT:    s_or_b32 s0, s0, 1
4235; GFX6-NEXT:    v_mul_f32_e32 v4, v3, v4
4236; GFX6-NEXT:    v_trunc_f32_e32 v4, v4
4237; GFX6-NEXT:    v_mad_f32 v3, -v4, v2, v3
4238; GFX6-NEXT:    v_cvt_i32_f32_e32 v4, v4
4239; GFX6-NEXT:    v_mov_b32_e32 v5, s0
4240; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, |v2|
4241; GFX6-NEXT:    v_cndmask_b32_e32 v2, 0, v5, vcc
4242; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
4243; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
4244; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
4245; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
4246; GFX6-NEXT:    buffer_store_short v2, off, s[4:7], 0 offset:4
4247; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
4248; GFX6-NEXT:    s_endpgm
4249;
4250; GFX9-LABEL: sdiv_v3i16:
4251; GFX9:       ; %bb.0:
4252; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
4253; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
4254; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x34
4255; GFX9-NEXT:    v_mov_b32_e32 v1, 0
4256; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4257; GFX9-NEXT:    s_sext_i32_i16 s1, s4
4258; GFX9-NEXT:    s_sext_i32_i16 s0, s6
4259; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s0
4260; GFX9-NEXT:    v_cvt_f32_i32_e32 v2, s1
4261; GFX9-NEXT:    s_xor_b32 s0, s1, s0
4262; GFX9-NEXT:    s_ashr_i32 s0, s0, 30
4263; GFX9-NEXT:    v_rcp_iflag_f32_e32 v3, v0
4264; GFX9-NEXT:    s_or_b32 s8, s0, 1
4265; GFX9-NEXT:    v_mul_f32_e32 v3, v2, v3
4266; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
4267; GFX9-NEXT:    v_mad_f32 v2, -v3, v0, v2
4268; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v2|, |v0|
4269; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
4270; GFX9-NEXT:    s_cselect_b32 s0, s8, 0
4271; GFX9-NEXT:    s_ashr_i32 s1, s6, 16
4272; GFX9-NEXT:    v_cvt_i32_f32_e32 v3, v3
4273; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s1
4274; GFX9-NEXT:    s_ashr_i32 s4, s4, 16
4275; GFX9-NEXT:    v_add_u32_e32 v2, s0, v3
4276; GFX9-NEXT:    v_cvt_f32_i32_e32 v3, s4
4277; GFX9-NEXT:    v_rcp_iflag_f32_e32 v4, v0
4278; GFX9-NEXT:    s_xor_b32 s0, s4, s1
4279; GFX9-NEXT:    s_ashr_i32 s0, s0, 30
4280; GFX9-NEXT:    s_or_b32 s4, s0, 1
4281; GFX9-NEXT:    v_mul_f32_e32 v4, v3, v4
4282; GFX9-NEXT:    v_trunc_f32_e32 v4, v4
4283; GFX9-NEXT:    v_mad_f32 v3, -v4, v0, v3
4284; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v3|, |v0|
4285; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
4286; GFX9-NEXT:    s_sext_i32_i16 s1, s7
4287; GFX9-NEXT:    v_cvt_i32_f32_e32 v4, v4
4288; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s1
4289; GFX9-NEXT:    s_cselect_b32 s0, s4, 0
4290; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff, v2
4291; GFX9-NEXT:    v_add_u32_e32 v3, s0, v4
4292; GFX9-NEXT:    s_sext_i32_i16 s0, s5
4293; GFX9-NEXT:    v_cvt_f32_i32_e32 v4, s0
4294; GFX9-NEXT:    v_rcp_iflag_f32_e32 v5, v0
4295; GFX9-NEXT:    s_xor_b32 s0, s0, s1
4296; GFX9-NEXT:    s_ashr_i32 s0, s0, 30
4297; GFX9-NEXT:    s_or_b32 s4, s0, 1
4298; GFX9-NEXT:    v_mul_f32_e32 v5, v4, v5
4299; GFX9-NEXT:    v_trunc_f32_e32 v5, v5
4300; GFX9-NEXT:    v_mad_f32 v4, -v5, v0, v4
4301; GFX9-NEXT:    v_cvt_i32_f32_e32 v5, v5
4302; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v4|, |v0|
4303; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
4304; GFX9-NEXT:    s_cselect_b32 s0, s4, 0
4305; GFX9-NEXT:    v_add_u32_e32 v0, s0, v5
4306; GFX9-NEXT:    v_lshl_or_b32 v2, v3, 16, v2
4307; GFX9-NEXT:    global_store_short v1, v0, s[2:3] offset:4
4308; GFX9-NEXT:    global_store_dword v1, v2, s[2:3]
4309; GFX9-NEXT:    s_endpgm
4310  %r = sdiv <3 x i16> %x, %y
4311  store <3 x i16> %r, <3 x i16> addrspace(1)* %out
4312  ret void
4313}
4314
; srem_v3i16: signed remainder of two <3 x i16> kernel arguments, stored to %out.
;
; The IR assertions (default prefix) show each of the three lanes expanded as a
; scalar: extractelement, sext to i32, sitofp, amdgcn.rcp + fmul + trunc to
; estimate the quotient, then an fmad.ftz / fabs / fcmp step that selects a
; correction of ((x ^ y) >> 30) | 1 or 0. The remainder is formed with mul/sub,
; re-sign-extended with shl/ashr by 16, truncated to i16 and inserted back into
; the result vector.
;
; The GFX6 and GFX9 prefixes hold the llc output for tahiti and gfx900 (see the
; RUN lines at the top of the file). All assertions are autogenerated (see the
; NOTE lines at the top of the file), so regenerate them with the update
; scripts rather than editing them by hand.
4315define amdgpu_kernel void @srem_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x, <3 x i16> %y) {
4316; CHECK-LABEL: @srem_v3i16(
4317; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <3 x i16> [[X:%.*]], i64 0
4318; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <3 x i16> [[Y:%.*]], i64 0
4319; CHECK-NEXT:    [[TMP3:%.*]] = sext i16 [[TMP1]] to i32
4320; CHECK-NEXT:    [[TMP4:%.*]] = sext i16 [[TMP2]] to i32
4321; CHECK-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]]
4322; CHECK-NEXT:    [[TMP6:%.*]] = ashr i32 [[TMP5]], 30
4323; CHECK-NEXT:    [[TMP7:%.*]] = or i32 [[TMP6]], 1
4324; CHECK-NEXT:    [[TMP8:%.*]] = sitofp i32 [[TMP3]] to float
4325; CHECK-NEXT:    [[TMP9:%.*]] = sitofp i32 [[TMP4]] to float
4326; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]])
4327; CHECK-NEXT:    [[TMP11:%.*]] = fmul fast float [[TMP8]], [[TMP10]]
4328; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.trunc.f32(float [[TMP11]])
4329; CHECK-NEXT:    [[TMP13:%.*]] = fneg fast float [[TMP12]]
4330; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP13]], float [[TMP9]], float [[TMP8]])
4331; CHECK-NEXT:    [[TMP15:%.*]] = fptosi float [[TMP12]] to i32
4332; CHECK-NEXT:    [[TMP16:%.*]] = call fast float @llvm.fabs.f32(float [[TMP14]])
4333; CHECK-NEXT:    [[TMP17:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
4334; CHECK-NEXT:    [[TMP18:%.*]] = fcmp fast oge float [[TMP16]], [[TMP17]]
4335; CHECK-NEXT:    [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP7]], i32 0
4336; CHECK-NEXT:    [[TMP20:%.*]] = add i32 [[TMP15]], [[TMP19]]
4337; CHECK-NEXT:    [[TMP21:%.*]] = mul i32 [[TMP20]], [[TMP4]]
4338; CHECK-NEXT:    [[TMP22:%.*]] = sub i32 [[TMP3]], [[TMP21]]
4339; CHECK-NEXT:    [[TMP23:%.*]] = shl i32 [[TMP22]], 16
4340; CHECK-NEXT:    [[TMP24:%.*]] = ashr i32 [[TMP23]], 16
4341; CHECK-NEXT:    [[TMP25:%.*]] = trunc i32 [[TMP24]] to i16
4342; CHECK-NEXT:    [[TMP26:%.*]] = insertelement <3 x i16> undef, i16 [[TMP25]], i64 0
4343; CHECK-NEXT:    [[TMP27:%.*]] = extractelement <3 x i16> [[X]], i64 1
4344; CHECK-NEXT:    [[TMP28:%.*]] = extractelement <3 x i16> [[Y]], i64 1
4345; CHECK-NEXT:    [[TMP29:%.*]] = sext i16 [[TMP27]] to i32
4346; CHECK-NEXT:    [[TMP30:%.*]] = sext i16 [[TMP28]] to i32
4347; CHECK-NEXT:    [[TMP31:%.*]] = xor i32 [[TMP29]], [[TMP30]]
4348; CHECK-NEXT:    [[TMP32:%.*]] = ashr i32 [[TMP31]], 30
4349; CHECK-NEXT:    [[TMP33:%.*]] = or i32 [[TMP32]], 1
4350; CHECK-NEXT:    [[TMP34:%.*]] = sitofp i32 [[TMP29]] to float
4351; CHECK-NEXT:    [[TMP35:%.*]] = sitofp i32 [[TMP30]] to float
4352; CHECK-NEXT:    [[TMP36:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP35]])
4353; CHECK-NEXT:    [[TMP37:%.*]] = fmul fast float [[TMP34]], [[TMP36]]
4354; CHECK-NEXT:    [[TMP38:%.*]] = call fast float @llvm.trunc.f32(float [[TMP37]])
4355; CHECK-NEXT:    [[TMP39:%.*]] = fneg fast float [[TMP38]]
4356; CHECK-NEXT:    [[TMP40:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP39]], float [[TMP35]], float [[TMP34]])
4357; CHECK-NEXT:    [[TMP41:%.*]] = fptosi float [[TMP38]] to i32
4358; CHECK-NEXT:    [[TMP42:%.*]] = call fast float @llvm.fabs.f32(float [[TMP40]])
4359; CHECK-NEXT:    [[TMP43:%.*]] = call fast float @llvm.fabs.f32(float [[TMP35]])
4360; CHECK-NEXT:    [[TMP44:%.*]] = fcmp fast oge float [[TMP42]], [[TMP43]]
4361; CHECK-NEXT:    [[TMP45:%.*]] = select i1 [[TMP44]], i32 [[TMP33]], i32 0
4362; CHECK-NEXT:    [[TMP46:%.*]] = add i32 [[TMP41]], [[TMP45]]
4363; CHECK-NEXT:    [[TMP47:%.*]] = mul i32 [[TMP46]], [[TMP30]]
4364; CHECK-NEXT:    [[TMP48:%.*]] = sub i32 [[TMP29]], [[TMP47]]
4365; CHECK-NEXT:    [[TMP49:%.*]] = shl i32 [[TMP48]], 16
4366; CHECK-NEXT:    [[TMP50:%.*]] = ashr i32 [[TMP49]], 16
4367; CHECK-NEXT:    [[TMP51:%.*]] = trunc i32 [[TMP50]] to i16
4368; CHECK-NEXT:    [[TMP52:%.*]] = insertelement <3 x i16> [[TMP26]], i16 [[TMP51]], i64 1
4369; CHECK-NEXT:    [[TMP53:%.*]] = extractelement <3 x i16> [[X]], i64 2
4370; CHECK-NEXT:    [[TMP54:%.*]] = extractelement <3 x i16> [[Y]], i64 2
4371; CHECK-NEXT:    [[TMP55:%.*]] = sext i16 [[TMP53]] to i32
4372; CHECK-NEXT:    [[TMP56:%.*]] = sext i16 [[TMP54]] to i32
4373; CHECK-NEXT:    [[TMP57:%.*]] = xor i32 [[TMP55]], [[TMP56]]
4374; CHECK-NEXT:    [[TMP58:%.*]] = ashr i32 [[TMP57]], 30
4375; CHECK-NEXT:    [[TMP59:%.*]] = or i32 [[TMP58]], 1
4376; CHECK-NEXT:    [[TMP60:%.*]] = sitofp i32 [[TMP55]] to float
4377; CHECK-NEXT:    [[TMP61:%.*]] = sitofp i32 [[TMP56]] to float
4378; CHECK-NEXT:    [[TMP62:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP61]])
4379; CHECK-NEXT:    [[TMP63:%.*]] = fmul fast float [[TMP60]], [[TMP62]]
4380; CHECK-NEXT:    [[TMP64:%.*]] = call fast float @llvm.trunc.f32(float [[TMP63]])
4381; CHECK-NEXT:    [[TMP65:%.*]] = fneg fast float [[TMP64]]
4382; CHECK-NEXT:    [[TMP66:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP65]], float [[TMP61]], float [[TMP60]])
4383; CHECK-NEXT:    [[TMP67:%.*]] = fptosi float [[TMP64]] to i32
4384; CHECK-NEXT:    [[TMP68:%.*]] = call fast float @llvm.fabs.f32(float [[TMP66]])
4385; CHECK-NEXT:    [[TMP69:%.*]] = call fast float @llvm.fabs.f32(float [[TMP61]])
4386; CHECK-NEXT:    [[TMP70:%.*]] = fcmp fast oge float [[TMP68]], [[TMP69]]
4387; CHECK-NEXT:    [[TMP71:%.*]] = select i1 [[TMP70]], i32 [[TMP59]], i32 0
4388; CHECK-NEXT:    [[TMP72:%.*]] = add i32 [[TMP67]], [[TMP71]]
4389; CHECK-NEXT:    [[TMP73:%.*]] = mul i32 [[TMP72]], [[TMP56]]
4390; CHECK-NEXT:    [[TMP74:%.*]] = sub i32 [[TMP55]], [[TMP73]]
4391; CHECK-NEXT:    [[TMP75:%.*]] = shl i32 [[TMP74]], 16
4392; CHECK-NEXT:    [[TMP76:%.*]] = ashr i32 [[TMP75]], 16
4393; CHECK-NEXT:    [[TMP77:%.*]] = trunc i32 [[TMP76]] to i16
4394; CHECK-NEXT:    [[TMP78:%.*]] = insertelement <3 x i16> [[TMP52]], i16 [[TMP77]], i64 2
4395; CHECK-NEXT:    store <3 x i16> [[TMP78]], <3 x i16> addrspace(1)* [[OUT:%.*]], align 8
4396; CHECK-NEXT:    ret void
4397;
4398; GFX6-LABEL: srem_v3i16:
4399; GFX6:       ; %bb.0:
4400; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
4401; GFX6-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
4402; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
4403; GFX6-NEXT:    s_mov_b32 s7, 0xf000
4404; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
4405; GFX6-NEXT:    s_sext_i32_i16 s8, s2
4406; GFX6-NEXT:    s_sext_i32_i16 s6, s0
4407; GFX6-NEXT:    v_cvt_f32_i32_e32 v0, s6
4408; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s8
4409; GFX6-NEXT:    s_xor_b32 s6, s8, s6
4410; GFX6-NEXT:    s_ashr_i32 s6, s6, 30
4411; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v0
4412; GFX6-NEXT:    s_or_b32 s6, s6, 1
4413; GFX6-NEXT:    v_mov_b32_e32 v3, s6
4414; GFX6-NEXT:    s_mov_b32 s6, -1
4415; GFX6-NEXT:    v_mul_f32_e32 v2, v1, v2
4416; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
4417; GFX6-NEXT:    v_mad_f32 v1, -v2, v0, v1
4418; GFX6-NEXT:    v_cvt_i32_f32_e32 v2, v2
4419; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
4420; GFX6-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
4421; GFX6-NEXT:    v_mov_b32_e32 v1, s2
4422; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
4423; GFX6-NEXT:    v_mov_b32_e32 v2, s0
4424; GFX6-NEXT:    v_alignbit_b32 v2, s1, v2, 16
4425; GFX6-NEXT:    v_bfe_i32 v3, v2, 0, 16
4426; GFX6-NEXT:    v_cvt_f32_i32_e32 v4, v3
4427; GFX6-NEXT:    v_alignbit_b32 v1, s3, v1, 16
4428; GFX6-NEXT:    v_bfe_i32 v5, v1, 0, 16
4429; GFX6-NEXT:    v_cvt_f32_i32_e32 v6, v5
4430; GFX6-NEXT:    v_rcp_iflag_f32_e32 v7, v4
4431; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s0
4432; GFX6-NEXT:    v_xor_b32_e32 v3, v5, v3
4433; GFX6-NEXT:    s_sext_i32_i16 s0, s1
4434; GFX6-NEXT:    v_mul_f32_e32 v5, v6, v7
4435; GFX6-NEXT:    v_trunc_f32_e32 v5, v5
4436; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s2, v0
4437; GFX6-NEXT:    v_mad_f32 v6, -v5, v4, v6
4438; GFX6-NEXT:    v_cvt_i32_f32_e32 v5, v5
4439; GFX6-NEXT:    v_ashrrev_i32_e32 v3, 30, v3
4440; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v6|, |v4|
4441; GFX6-NEXT:    v_cvt_f32_i32_e32 v4, s0
4442; GFX6-NEXT:    v_or_b32_e32 v3, 1, v3
4443; GFX6-NEXT:    v_cndmask_b32_e32 v3, 0, v3, vcc
4444; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
4445; GFX6-NEXT:    s_sext_i32_i16 s2, s3
4446; GFX6-NEXT:    v_mul_lo_u32 v2, v3, v2
4447; GFX6-NEXT:    v_cvt_f32_i32_e32 v3, s2
4448; GFX6-NEXT:    v_rcp_iflag_f32_e32 v5, v4
4449; GFX6-NEXT:    s_xor_b32 s0, s2, s0
4450; GFX6-NEXT:    s_ashr_i32 s0, s0, 30
4451; GFX6-NEXT:    s_or_b32 s0, s0, 1
4452; GFX6-NEXT:    v_mul_f32_e32 v5, v3, v5
4453; GFX6-NEXT:    v_trunc_f32_e32 v5, v5
4454; GFX6-NEXT:    v_mad_f32 v3, -v5, v4, v3
4455; GFX6-NEXT:    v_cvt_i32_f32_e32 v5, v5
4456; GFX6-NEXT:    v_mov_b32_e32 v6, s0
4457; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, |v4|
4458; GFX6-NEXT:    v_cndmask_b32_e32 v3, 0, v6, vcc
4459; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
4460; GFX6-NEXT:    v_mul_lo_u32 v3, v3, s1
4461; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v2
4462; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
4463; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
4464; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s3, v3
4465; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
4466; GFX6-NEXT:    buffer_store_short v2, off, s[4:7], 0 offset:4
4467; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
4468; GFX6-NEXT:    s_endpgm
4469;
4470; GFX9-LABEL: srem_v3i16:
4471; GFX9:       ; %bb.0:
4472; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
4473; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
4474; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x2c
4475; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4476; GFX9-NEXT:    s_sext_i32_i16 s8, s2
4477; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s8
4478; GFX9-NEXT:    s_sext_i32_i16 s9, s6
4479; GFX9-NEXT:    v_cvt_f32_i32_e32 v1, s9
4480; GFX9-NEXT:    s_xor_b32 s0, s9, s8
4481; GFX9-NEXT:    v_rcp_iflag_f32_e32 v2, v0
4482; GFX9-NEXT:    s_ashr_i32 s0, s0, 30
4483; GFX9-NEXT:    s_or_b32 s10, s0, 1
4484; GFX9-NEXT:    s_sext_i32_i16 s3, s3
4485; GFX9-NEXT:    v_mul_f32_e32 v2, v1, v2
4486; GFX9-NEXT:    v_trunc_f32_e32 v2, v2
4487; GFX9-NEXT:    v_mad_f32 v1, -v2, v0, v1
4488; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v1|, |v0|
4489; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
4490; GFX9-NEXT:    s_cselect_b32 s0, s10, 0
4491; GFX9-NEXT:    s_ashr_i32 s2, s2, 16
4492; GFX9-NEXT:    v_cvt_i32_f32_e32 v2, v2
4493; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s2
4494; GFX9-NEXT:    s_ashr_i32 s6, s6, 16
4495; GFX9-NEXT:    v_add_u32_e32 v1, s0, v2
4496; GFX9-NEXT:    v_cvt_f32_i32_e32 v2, s6
4497; GFX9-NEXT:    v_rcp_iflag_f32_e32 v3, v0
4498; GFX9-NEXT:    s_xor_b32 s0, s6, s2
4499; GFX9-NEXT:    s_ashr_i32 s0, s0, 30
4500; GFX9-NEXT:    v_mul_lo_u32 v1, v1, s8
4501; GFX9-NEXT:    v_mul_f32_e32 v3, v2, v3
4502; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
4503; GFX9-NEXT:    v_mad_f32 v2, -v3, v0, v2
4504; GFX9-NEXT:    v_cvt_i32_f32_e32 v3, v3
4505; GFX9-NEXT:    s_or_b32 s8, s0, 1
4506; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v2|, |v0|
4507; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
4508; GFX9-NEXT:    v_cvt_f32_i32_e32 v2, s3
4509; GFX9-NEXT:    s_cselect_b32 s0, s8, 0
4510; GFX9-NEXT:    v_add_u32_e32 v0, s0, v3
4511; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s2
4512; GFX9-NEXT:    s_sext_i32_i16 s2, s7
4513; GFX9-NEXT:    v_cvt_f32_i32_e32 v3, s2
4514; GFX9-NEXT:    v_rcp_iflag_f32_e32 v4, v2
4515; GFX9-NEXT:    s_xor_b32 s0, s2, s3
4516; GFX9-NEXT:    s_ashr_i32 s0, s0, 30
4517; GFX9-NEXT:    s_or_b32 s7, s0, 1
4518; GFX9-NEXT:    v_mul_f32_e32 v4, v3, v4
4519; GFX9-NEXT:    v_trunc_f32_e32 v4, v4
4520; GFX9-NEXT:    v_mad_f32 v3, -v4, v2, v3
4521; GFX9-NEXT:    v_cvt_i32_f32_e32 v4, v4
4522; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v3|, |v2|
4523; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
4524; GFX9-NEXT:    s_cselect_b32 s0, s7, 0
4525; GFX9-NEXT:    v_add_u32_e32 v2, s0, v4
4526; GFX9-NEXT:    v_mul_lo_u32 v2, v2, s3
4527; GFX9-NEXT:    v_sub_u32_e32 v1, s9, v1
4528; GFX9-NEXT:    v_mov_b32_e32 v3, 0
4529; GFX9-NEXT:    v_sub_u32_e32 v0, s6, v0
4530; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
4531; GFX9-NEXT:    v_sub_u32_e32 v2, s2, v2
4532; GFX9-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
4533; GFX9-NEXT:    global_store_short v3, v2, s[4:5] offset:4
4534; GFX9-NEXT:    global_store_dword v3, v0, s[4:5]
4535; GFX9-NEXT:    s_endpgm
; Input under test: a single vector srem whose result is stored to %out.
4536  %r = srem <3 x i16> %x, %y
4537  store <3 x i16> %r, <3 x i16> addrspace(1)* %out
4538  ret void
4539}
4540
; udiv_v3i15: unsigned division of two <3 x i15> kernel arguments, stored to
; %out. Exercises a non-power-of-two element width.
;
; The IR assertions (default prefix) show each of the three lanes expanded as a
; scalar: extractelement, zext i15 to i32, uitofp, amdgcn.rcp + fmul + trunc to
; estimate the quotient, then an fmad.ftz / fabs / fcmp step that selects a
; correction of 1 or 0. The sum is masked with 32767, truncated to i15 and
; inserted back into the result vector.
;
; The GFX6 and GFX9 prefixes hold the llc output for tahiti and gfx900 (see the
; RUN lines at the top of the file). In both, the three 15-bit results are
; combined at bit offsets 0, 15 and 30 and written as one dword plus one short
; at offset 4. All assertions are autogenerated (see the NOTE lines at the top
; of the file), so regenerate them with the update scripts rather than editing
; them by hand.
4541define amdgpu_kernel void @udiv_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x, <3 x i15> %y) {
4542; CHECK-LABEL: @udiv_v3i15(
4543; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <3 x i15> [[X:%.*]], i64 0
4544; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <3 x i15> [[Y:%.*]], i64 0
4545; CHECK-NEXT:    [[TMP3:%.*]] = zext i15 [[TMP1]] to i32
4546; CHECK-NEXT:    [[TMP4:%.*]] = zext i15 [[TMP2]] to i32
4547; CHECK-NEXT:    [[TMP5:%.*]] = uitofp i32 [[TMP3]] to float
4548; CHECK-NEXT:    [[TMP6:%.*]] = uitofp i32 [[TMP4]] to float
4549; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP6]])
4550; CHECK-NEXT:    [[TMP8:%.*]] = fmul fast float [[TMP5]], [[TMP7]]
4551; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.trunc.f32(float [[TMP8]])
4552; CHECK-NEXT:    [[TMP10:%.*]] = fneg fast float [[TMP9]]
4553; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP10]], float [[TMP6]], float [[TMP5]])
4554; CHECK-NEXT:    [[TMP12:%.*]] = fptoui float [[TMP9]] to i32
4555; CHECK-NEXT:    [[TMP13:%.*]] = call fast float @llvm.fabs.f32(float [[TMP11]])
4556; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP6]])
4557; CHECK-NEXT:    [[TMP15:%.*]] = fcmp fast oge float [[TMP13]], [[TMP14]]
4558; CHECK-NEXT:    [[TMP16:%.*]] = select i1 [[TMP15]], i32 1, i32 0
4559; CHECK-NEXT:    [[TMP17:%.*]] = add i32 [[TMP12]], [[TMP16]]
4560; CHECK-NEXT:    [[TMP18:%.*]] = and i32 [[TMP17]], 32767
4561; CHECK-NEXT:    [[TMP19:%.*]] = trunc i32 [[TMP18]] to i15
4562; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <3 x i15> undef, i15 [[TMP19]], i64 0
4563; CHECK-NEXT:    [[TMP21:%.*]] = extractelement <3 x i15> [[X]], i64 1
4564; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <3 x i15> [[Y]], i64 1
4565; CHECK-NEXT:    [[TMP23:%.*]] = zext i15 [[TMP21]] to i32
4566; CHECK-NEXT:    [[TMP24:%.*]] = zext i15 [[TMP22]] to i32
4567; CHECK-NEXT:    [[TMP25:%.*]] = uitofp i32 [[TMP23]] to float
4568; CHECK-NEXT:    [[TMP26:%.*]] = uitofp i32 [[TMP24]] to float
4569; CHECK-NEXT:    [[TMP27:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP26]])
4570; CHECK-NEXT:    [[TMP28:%.*]] = fmul fast float [[TMP25]], [[TMP27]]
4571; CHECK-NEXT:    [[TMP29:%.*]] = call fast float @llvm.trunc.f32(float [[TMP28]])
4572; CHECK-NEXT:    [[TMP30:%.*]] = fneg fast float [[TMP29]]
4573; CHECK-NEXT:    [[TMP31:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP30]], float [[TMP26]], float [[TMP25]])
4574; CHECK-NEXT:    [[TMP32:%.*]] = fptoui float [[TMP29]] to i32
4575; CHECK-NEXT:    [[TMP33:%.*]] = call fast float @llvm.fabs.f32(float [[TMP31]])
4576; CHECK-NEXT:    [[TMP34:%.*]] = call fast float @llvm.fabs.f32(float [[TMP26]])
4577; CHECK-NEXT:    [[TMP35:%.*]] = fcmp fast oge float [[TMP33]], [[TMP34]]
4578; CHECK-NEXT:    [[TMP36:%.*]] = select i1 [[TMP35]], i32 1, i32 0
4579; CHECK-NEXT:    [[TMP37:%.*]] = add i32 [[TMP32]], [[TMP36]]
4580; CHECK-NEXT:    [[TMP38:%.*]] = and i32 [[TMP37]], 32767
4581; CHECK-NEXT:    [[TMP39:%.*]] = trunc i32 [[TMP38]] to i15
4582; CHECK-NEXT:    [[TMP40:%.*]] = insertelement <3 x i15> [[TMP20]], i15 [[TMP39]], i64 1
4583; CHECK-NEXT:    [[TMP41:%.*]] = extractelement <3 x i15> [[X]], i64 2
4584; CHECK-NEXT:    [[TMP42:%.*]] = extractelement <3 x i15> [[Y]], i64 2
4585; CHECK-NEXT:    [[TMP43:%.*]] = zext i15 [[TMP41]] to i32
4586; CHECK-NEXT:    [[TMP44:%.*]] = zext i15 [[TMP42]] to i32
4587; CHECK-NEXT:    [[TMP45:%.*]] = uitofp i32 [[TMP43]] to float
4588; CHECK-NEXT:    [[TMP46:%.*]] = uitofp i32 [[TMP44]] to float
4589; CHECK-NEXT:    [[TMP47:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP46]])
4590; CHECK-NEXT:    [[TMP48:%.*]] = fmul fast float [[TMP45]], [[TMP47]]
4591; CHECK-NEXT:    [[TMP49:%.*]] = call fast float @llvm.trunc.f32(float [[TMP48]])
4592; CHECK-NEXT:    [[TMP50:%.*]] = fneg fast float [[TMP49]]
4593; CHECK-NEXT:    [[TMP51:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP50]], float [[TMP46]], float [[TMP45]])
4594; CHECK-NEXT:    [[TMP52:%.*]] = fptoui float [[TMP49]] to i32
4595; CHECK-NEXT:    [[TMP53:%.*]] = call fast float @llvm.fabs.f32(float [[TMP51]])
4596; CHECK-NEXT:    [[TMP54:%.*]] = call fast float @llvm.fabs.f32(float [[TMP46]])
4597; CHECK-NEXT:    [[TMP55:%.*]] = fcmp fast oge float [[TMP53]], [[TMP54]]
4598; CHECK-NEXT:    [[TMP56:%.*]] = select i1 [[TMP55]], i32 1, i32 0
4599; CHECK-NEXT:    [[TMP57:%.*]] = add i32 [[TMP52]], [[TMP56]]
4600; CHECK-NEXT:    [[TMP58:%.*]] = and i32 [[TMP57]], 32767
4601; CHECK-NEXT:    [[TMP59:%.*]] = trunc i32 [[TMP58]] to i15
4602; CHECK-NEXT:    [[TMP60:%.*]] = insertelement <3 x i15> [[TMP40]], i15 [[TMP59]], i64 2
4603; CHECK-NEXT:    store <3 x i15> [[TMP60]], <3 x i15> addrspace(1)* [[OUT:%.*]], align 8
4604; CHECK-NEXT:    ret void
4605;
4606; GFX6-LABEL: udiv_v3i15:
4607; GFX6:       ; %bb.0:
4608; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
4609; GFX6-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
4610; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
4611; GFX6-NEXT:    s_mov_b32 s7, 0xf000
4612; GFX6-NEXT:    s_mov_b32 s6, -1
4613; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
4614; GFX6-NEXT:    v_mov_b32_e32 v0, s2
4615; GFX6-NEXT:    v_alignbit_b32 v0, s3, v0, 30
4616; GFX6-NEXT:    s_movk_i32 s3, 0x7fff
4617; GFX6-NEXT:    s_and_b32 s9, s0, s3
4618; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s9
4619; GFX6-NEXT:    v_mov_b32_e32 v2, s0
4620; GFX6-NEXT:    s_and_b32 s8, s2, s3
4621; GFX6-NEXT:    s_bfe_u32 s0, s0, 0xf000f
4622; GFX6-NEXT:    v_cvt_f32_u32_e32 v5, s0
4623; GFX6-NEXT:    v_cvt_f32_u32_e32 v3, s8
4624; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v1
4625; GFX6-NEXT:    s_bfe_u32 s2, s2, 0xf000f
4626; GFX6-NEXT:    v_alignbit_b32 v2, s1, v2, 30
4627; GFX6-NEXT:    v_cvt_f32_u32_e32 v6, s2
4628; GFX6-NEXT:    v_mul_f32_e32 v4, v3, v4
4629; GFX6-NEXT:    v_rcp_iflag_f32_e32 v7, v5
4630; GFX6-NEXT:    v_and_b32_e32 v2, s3, v2
4631; GFX6-NEXT:    v_trunc_f32_e32 v4, v4
4632; GFX6-NEXT:    v_mad_f32 v3, -v4, v1, v3
4633; GFX6-NEXT:    v_cvt_u32_f32_e32 v4, v4
4634; GFX6-NEXT:    v_cvt_f32_u32_e32 v2, v2
4635; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v1
4636; GFX6-NEXT:    v_mul_f32_e32 v1, v6, v7
4637; GFX6-NEXT:    v_and_b32_e32 v0, s3, v0
4638; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
4639; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
4640; GFX6-NEXT:    v_mad_f32 v4, -v1, v5, v6
4641; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
4642; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, v0
4643; GFX6-NEXT:    v_rcp_iflag_f32_e32 v6, v2
4644; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v4|, v5
4645; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
4646; GFX6-NEXT:    v_mul_f32_e32 v1, v0, v6
4647; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
4648; GFX6-NEXT:    v_cvt_u32_f32_e32 v5, v1
4649; GFX6-NEXT:    v_mad_f32 v0, -v1, v2, v0
4650; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v0|, v2
4651; GFX6-NEXT:    v_and_b32_e32 v2, s3, v3
4652; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, 0, v5, vcc
4653; GFX6-NEXT:    v_and_b32_e32 v3, s3, v4
4654; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 15, v3
4655; GFX6-NEXT:    v_lshl_b64 v[0:1], v[0:1], 30
4656; GFX6-NEXT:    v_or_b32_e32 v2, v2, v3
4657; GFX6-NEXT:    v_or_b32_e32 v0, v2, v0
4658; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
4659; GFX6-NEXT:    s_waitcnt expcnt(0)
4660; GFX6-NEXT:    v_and_b32_e32 v0, 0x1fff, v1
4661; GFX6-NEXT:    buffer_store_short v0, off, s[4:7], 0 offset:4
4662; GFX6-NEXT:    s_endpgm
4663;
4664; GFX9-LABEL: udiv_v3i15:
4665; GFX9:       ; %bb.0:
4666; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
4667; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
4668; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x34
4669; GFX9-NEXT:    s_movk_i32 s8, 0x7fff
4670; GFX9-NEXT:    v_mov_b32_e32 v2, 0
4671; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4672; GFX9-NEXT:    s_and_b32 s0, s4, s8
4673; GFX9-NEXT:    s_and_b32 s1, s6, s8
4674; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s1
4675; GFX9-NEXT:    v_cvt_f32_u32_e32 v4, s0
4676; GFX9-NEXT:    s_bfe_u32 s0, s6, 0xf000f
4677; GFX9-NEXT:    v_cvt_f32_u32_e32 v6, s0
4678; GFX9-NEXT:    v_rcp_iflag_f32_e32 v5, v1
4679; GFX9-NEXT:    v_mov_b32_e32 v3, s6
4680; GFX9-NEXT:    s_bfe_u32 s1, s4, 0xf000f
4681; GFX9-NEXT:    v_alignbit_b32 v3, s7, v3, 30
4682; GFX9-NEXT:    v_mul_f32_e32 v5, v4, v5
4683; GFX9-NEXT:    v_cvt_f32_u32_e32 v7, s1
4684; GFX9-NEXT:    v_rcp_iflag_f32_e32 v8, v6
4685; GFX9-NEXT:    v_and_b32_e32 v3, s8, v3
4686; GFX9-NEXT:    v_trunc_f32_e32 v5, v5
4687; GFX9-NEXT:    v_mad_f32 v4, -v5, v1, v4
4688; GFX9-NEXT:    v_cvt_u32_f32_e32 v5, v5
4689; GFX9-NEXT:    v_cvt_f32_u32_e32 v3, v3
4690; GFX9-NEXT:    v_mov_b32_e32 v0, s4
4691; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v4|, v1
4692; GFX9-NEXT:    v_alignbit_b32 v0, s5, v0, 30
4693; GFX9-NEXT:    v_mul_f32_e32 v1, v7, v8
4694; GFX9-NEXT:    v_and_b32_e32 v0, s8, v0
4695; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
4696; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v5, vcc
4697; GFX9-NEXT:    v_mad_f32 v5, -v1, v6, v7
4698; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
4699; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, v0
4700; GFX9-NEXT:    v_rcp_iflag_f32_e32 v7, v3
4701; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v5|, v6
4702; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
4703; GFX9-NEXT:    v_mul_f32_e32 v1, v0, v7
4704; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
4705; GFX9-NEXT:    v_cvt_u32_f32_e32 v6, v1
4706; GFX9-NEXT:    v_mad_f32 v0, -v1, v3, v0
4707; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v0|, v3
4708; GFX9-NEXT:    v_and_b32_e32 v3, s8, v4
4709; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v6, vcc
4710; GFX9-NEXT:    v_and_b32_e32 v4, s8, v5
4711; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 15, v4
4712; GFX9-NEXT:    v_lshlrev_b64 v[0:1], 30, v[0:1]
4713; GFX9-NEXT:    v_or_b32_e32 v3, v3, v4
4714; GFX9-NEXT:    v_or_b32_e32 v0, v3, v0
4715; GFX9-NEXT:    global_store_dword v2, v0, s[2:3]
4716; GFX9-NEXT:    v_and_b32_e32 v0, 0x1fff, v1
4717; GFX9-NEXT:    global_store_short v2, v0, s[2:3] offset:4
4718; GFX9-NEXT:    s_endpgm
; Input under test: a single vector udiv whose result is stored to %out.
4719  %r = udiv <3 x i15> %x, %y
4720  store <3 x i15> %r, <3 x i15> addrspace(1)* %out
4721  ret void
4722}
4723
4724define amdgpu_kernel void @urem_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x, <3 x i15> %y) {
4725; CHECK-LABEL: @urem_v3i15(
4726; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <3 x i15> [[X:%.*]], i64 0
4727; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <3 x i15> [[Y:%.*]], i64 0
4728; CHECK-NEXT:    [[TMP3:%.*]] = zext i15 [[TMP1]] to i32
4729; CHECK-NEXT:    [[TMP4:%.*]] = zext i15 [[TMP2]] to i32
4730; CHECK-NEXT:    [[TMP5:%.*]] = uitofp i32 [[TMP3]] to float
4731; CHECK-NEXT:    [[TMP6:%.*]] = uitofp i32 [[TMP4]] to float
4732; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP6]])
4733; CHECK-NEXT:    [[TMP8:%.*]] = fmul fast float [[TMP5]], [[TMP7]]
4734; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.trunc.f32(float [[TMP8]])
4735; CHECK-NEXT:    [[TMP10:%.*]] = fneg fast float [[TMP9]]
4736; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP10]], float [[TMP6]], float [[TMP5]])
4737; CHECK-NEXT:    [[TMP12:%.*]] = fptoui float [[TMP9]] to i32
4738; CHECK-NEXT:    [[TMP13:%.*]] = call fast float @llvm.fabs.f32(float [[TMP11]])
4739; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP6]])
4740; CHECK-NEXT:    [[TMP15:%.*]] = fcmp fast oge float [[TMP13]], [[TMP14]]
4741; CHECK-NEXT:    [[TMP16:%.*]] = select i1 [[TMP15]], i32 1, i32 0
4742; CHECK-NEXT:    [[TMP17:%.*]] = add i32 [[TMP12]], [[TMP16]]
4743; CHECK-NEXT:    [[TMP18:%.*]] = mul i32 [[TMP17]], [[TMP4]]
4744; CHECK-NEXT:    [[TMP19:%.*]] = sub i32 [[TMP3]], [[TMP18]]
4745; CHECK-NEXT:    [[TMP20:%.*]] = and i32 [[TMP19]], 32767
4746; CHECK-NEXT:    [[TMP21:%.*]] = trunc i32 [[TMP20]] to i15
4747; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <3 x i15> undef, i15 [[TMP21]], i64 0
4748; CHECK-NEXT:    [[TMP23:%.*]] = extractelement <3 x i15> [[X]], i64 1
4749; CHECK-NEXT:    [[TMP24:%.*]] = extractelement <3 x i15> [[Y]], i64 1
4750; CHECK-NEXT:    [[TMP25:%.*]] = zext i15 [[TMP23]] to i32
4751; CHECK-NEXT:    [[TMP26:%.*]] = zext i15 [[TMP24]] to i32
4752; CHECK-NEXT:    [[TMP27:%.*]] = uitofp i32 [[TMP25]] to float
4753; CHECK-NEXT:    [[TMP28:%.*]] = uitofp i32 [[TMP26]] to float
4754; CHECK-NEXT:    [[TMP29:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP28]])
4755; CHECK-NEXT:    [[TMP30:%.*]] = fmul fast float [[TMP27]], [[TMP29]]
4756; CHECK-NEXT:    [[TMP31:%.*]] = call fast float @llvm.trunc.f32(float [[TMP30]])
4757; CHECK-NEXT:    [[TMP32:%.*]] = fneg fast float [[TMP31]]
4758; CHECK-NEXT:    [[TMP33:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP32]], float [[TMP28]], float [[TMP27]])
4759; CHECK-NEXT:    [[TMP34:%.*]] = fptoui float [[TMP31]] to i32
4760; CHECK-NEXT:    [[TMP35:%.*]] = call fast float @llvm.fabs.f32(float [[TMP33]])
4761; CHECK-NEXT:    [[TMP36:%.*]] = call fast float @llvm.fabs.f32(float [[TMP28]])
4762; CHECK-NEXT:    [[TMP37:%.*]] = fcmp fast oge float [[TMP35]], [[TMP36]]
4763; CHECK-NEXT:    [[TMP38:%.*]] = select i1 [[TMP37]], i32 1, i32 0
4764; CHECK-NEXT:    [[TMP39:%.*]] = add i32 [[TMP34]], [[TMP38]]
4765; CHECK-NEXT:    [[TMP40:%.*]] = mul i32 [[TMP39]], [[TMP26]]
4766; CHECK-NEXT:    [[TMP41:%.*]] = sub i32 [[TMP25]], [[TMP40]]
4767; CHECK-NEXT:    [[TMP42:%.*]] = and i32 [[TMP41]], 32767
4768; CHECK-NEXT:    [[TMP43:%.*]] = trunc i32 [[TMP42]] to i15
4769; CHECK-NEXT:    [[TMP44:%.*]] = insertelement <3 x i15> [[TMP22]], i15 [[TMP43]], i64 1
4770; CHECK-NEXT:    [[TMP45:%.*]] = extractelement <3 x i15> [[X]], i64 2
4771; CHECK-NEXT:    [[TMP46:%.*]] = extractelement <3 x i15> [[Y]], i64 2
4772; CHECK-NEXT:    [[TMP47:%.*]] = zext i15 [[TMP45]] to i32
4773; CHECK-NEXT:    [[TMP48:%.*]] = zext i15 [[TMP46]] to i32
4774; CHECK-NEXT:    [[TMP49:%.*]] = uitofp i32 [[TMP47]] to float
4775; CHECK-NEXT:    [[TMP50:%.*]] = uitofp i32 [[TMP48]] to float
4776; CHECK-NEXT:    [[TMP51:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP50]])
4777; CHECK-NEXT:    [[TMP52:%.*]] = fmul fast float [[TMP49]], [[TMP51]]
4778; CHECK-NEXT:    [[TMP53:%.*]] = call fast float @llvm.trunc.f32(float [[TMP52]])
4779; CHECK-NEXT:    [[TMP54:%.*]] = fneg fast float [[TMP53]]
4780; CHECK-NEXT:    [[TMP55:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP54]], float [[TMP50]], float [[TMP49]])
4781; CHECK-NEXT:    [[TMP56:%.*]] = fptoui float [[TMP53]] to i32
4782; CHECK-NEXT:    [[TMP57:%.*]] = call fast float @llvm.fabs.f32(float [[TMP55]])
4783; CHECK-NEXT:    [[TMP58:%.*]] = call fast float @llvm.fabs.f32(float [[TMP50]])
4784; CHECK-NEXT:    [[TMP59:%.*]] = fcmp fast oge float [[TMP57]], [[TMP58]]
4785; CHECK-NEXT:    [[TMP60:%.*]] = select i1 [[TMP59]], i32 1, i32 0
4786; CHECK-NEXT:    [[TMP61:%.*]] = add i32 [[TMP56]], [[TMP60]]
4787; CHECK-NEXT:    [[TMP62:%.*]] = mul i32 [[TMP61]], [[TMP48]]
4788; CHECK-NEXT:    [[TMP63:%.*]] = sub i32 [[TMP47]], [[TMP62]]
4789; CHECK-NEXT:    [[TMP64:%.*]] = and i32 [[TMP63]], 32767
4790; CHECK-NEXT:    [[TMP65:%.*]] = trunc i32 [[TMP64]] to i15
4791; CHECK-NEXT:    [[TMP66:%.*]] = insertelement <3 x i15> [[TMP44]], i15 [[TMP65]], i64 2
4792; CHECK-NEXT:    store <3 x i15> [[TMP66]], <3 x i15> addrspace(1)* [[OUT:%.*]], align 8
4793; CHECK-NEXT:    ret void
4794;
4795; GFX6-LABEL: urem_v3i15:
4796; GFX6:       ; %bb.0:
4797; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
4798; GFX6-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
4799; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
4800; GFX6-NEXT:    s_mov_b32 s7, 0xf000
4801; GFX6-NEXT:    s_mov_b32 s6, -1
4802; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
4803; GFX6-NEXT:    v_mov_b32_e32 v0, s2
4804; GFX6-NEXT:    v_alignbit_b32 v0, s3, v0, 30
4805; GFX6-NEXT:    s_movk_i32 s3, 0x7fff
4806; GFX6-NEXT:    s_and_b32 s10, s0, s3
4807; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s10
4808; GFX6-NEXT:    s_and_b32 s9, s2, s3
4809; GFX6-NEXT:    v_cvt_f32_u32_e32 v3, s9
4810; GFX6-NEXT:    v_mov_b32_e32 v2, s0
4811; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v1
4812; GFX6-NEXT:    v_alignbit_b32 v2, s1, v2, 30
4813; GFX6-NEXT:    s_bfe_u32 s1, s0, 0xf000f
4814; GFX6-NEXT:    v_cvt_f32_u32_e32 v5, s1
4815; GFX6-NEXT:    v_mul_f32_e32 v4, v3, v4
4816; GFX6-NEXT:    v_trunc_f32_e32 v4, v4
4817; GFX6-NEXT:    v_mad_f32 v3, -v4, v1, v3
4818; GFX6-NEXT:    v_cvt_u32_f32_e32 v4, v4
4819; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v1
4820; GFX6-NEXT:    s_bfe_u32 s10, s2, 0xf000f
4821; GFX6-NEXT:    v_cvt_f32_u32_e32 v3, s10
4822; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v4, vcc
4823; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s0
4824; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v5
4825; GFX6-NEXT:    v_and_b32_e32 v2, s3, v2
4826; GFX6-NEXT:    v_and_b32_e32 v0, s3, v0
4827; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, s2, v1
4828; GFX6-NEXT:    v_mul_f32_e32 v1, v3, v4
4829; GFX6-NEXT:    v_cvt_f32_u32_e32 v4, v2
4830; GFX6-NEXT:    v_cvt_f32_u32_e32 v7, v0
4831; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
4832; GFX6-NEXT:    v_mad_f32 v3, -v1, v5, v3
4833; GFX6-NEXT:    v_rcp_iflag_f32_e32 v8, v4
4834; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v5
4835; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
4836; GFX6-NEXT:    s_lshr_b32 s0, s0, 15
4837; GFX6-NEXT:    v_mul_f32_e32 v3, v7, v8
4838; GFX6-NEXT:    v_trunc_f32_e32 v3, v3
4839; GFX6-NEXT:    v_cvt_u32_f32_e32 v5, v3
4840; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
4841; GFX6-NEXT:    v_mad_f32 v3, -v3, v4, v7
4842; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v4
4843; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
4844; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s0
4845; GFX6-NEXT:    v_mul_lo_u32 v2, v3, v2
4846; GFX6-NEXT:    s_lshr_b32 s8, s2, 15
4847; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s8, v1
4848; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, v2, v0
4849; GFX6-NEXT:    v_and_b32_e32 v3, s3, v3
4850; GFX6-NEXT:    v_lshl_b64 v[0:1], v[0:1], 30
4851; GFX6-NEXT:    v_and_b32_e32 v2, s3, v6
4852; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 15, v3
4853; GFX6-NEXT:    v_or_b32_e32 v2, v2, v3
4854; GFX6-NEXT:    v_or_b32_e32 v0, v2, v0
4855; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
4856; GFX6-NEXT:    s_waitcnt expcnt(0)
4857; GFX6-NEXT:    v_and_b32_e32 v0, 0x1fff, v1
4858; GFX6-NEXT:    buffer_store_short v0, off, s[4:7], 0 offset:4
4859; GFX6-NEXT:    s_endpgm
4860;
4861; GFX9-LABEL: urem_v3i15:
4862; GFX9:       ; %bb.0:
4863; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
4864; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
4865; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x34
4866; GFX9-NEXT:    s_movk_i32 s8, 0x7fff
4867; GFX9-NEXT:    v_mov_b32_e32 v2, 0
4868; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4869; GFX9-NEXT:    v_mov_b32_e32 v0, s4
4870; GFX9-NEXT:    v_alignbit_b32 v0, s5, v0, 30
4871; GFX9-NEXT:    s_and_b32 s5, s6, s8
4872; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s5
4873; GFX9-NEXT:    s_and_b32 s0, s4, s8
4874; GFX9-NEXT:    v_cvt_f32_u32_e32 v4, s0
4875; GFX9-NEXT:    s_bfe_u32 s5, s6, 0xf000f
4876; GFX9-NEXT:    v_rcp_iflag_f32_e32 v5, v1
4877; GFX9-NEXT:    v_cvt_f32_u32_e32 v6, s5
4878; GFX9-NEXT:    v_mov_b32_e32 v3, s6
4879; GFX9-NEXT:    v_alignbit_b32 v3, s7, v3, 30
4880; GFX9-NEXT:    v_mul_f32_e32 v5, v4, v5
4881; GFX9-NEXT:    v_trunc_f32_e32 v5, v5
4882; GFX9-NEXT:    v_mad_f32 v4, -v5, v1, v4
4883; GFX9-NEXT:    v_cvt_u32_f32_e32 v5, v5
4884; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v4|, v1
4885; GFX9-NEXT:    s_bfe_u32 s1, s4, 0xf000f
4886; GFX9-NEXT:    v_and_b32_e32 v3, s8, v3
4887; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v5, vcc
4888; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, v3
4889; GFX9-NEXT:    v_cvt_f32_u32_e32 v7, s1
4890; GFX9-NEXT:    v_rcp_iflag_f32_e32 v8, v6
4891; GFX9-NEXT:    v_and_b32_e32 v0, s8, v0
4892; GFX9-NEXT:    v_rcp_iflag_f32_e32 v9, v5
4893; GFX9-NEXT:    s_lshr_b32 s0, s6, 15
4894; GFX9-NEXT:    v_mul_f32_e32 v4, v7, v8
4895; GFX9-NEXT:    v_cvt_f32_u32_e32 v8, v0
4896; GFX9-NEXT:    v_trunc_f32_e32 v4, v4
4897; GFX9-NEXT:    v_mad_f32 v7, -v4, v6, v7
4898; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v7|, v6
4899; GFX9-NEXT:    v_cvt_u32_f32_e32 v4, v4
4900; GFX9-NEXT:    v_mul_f32_e32 v6, v8, v9
4901; GFX9-NEXT:    v_trunc_f32_e32 v6, v6
4902; GFX9-NEXT:    v_cvt_u32_f32_e32 v7, v6
4903; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
4904; GFX9-NEXT:    v_mad_f32 v6, -v6, v5, v8
4905; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v6|, v5
4906; GFX9-NEXT:    v_mul_lo_u32 v4, v4, s0
4907; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v7, vcc
4908; GFX9-NEXT:    v_mul_lo_u32 v3, v5, v3
4909; GFX9-NEXT:    v_mul_lo_u32 v1, v1, s6
4910; GFX9-NEXT:    s_lshr_b32 s0, s4, 15
4911; GFX9-NEXT:    v_sub_u32_e32 v4, s0, v4
4912; GFX9-NEXT:    v_and_b32_e32 v4, s8, v4
4913; GFX9-NEXT:    v_sub_u32_e32 v5, s4, v1
4914; GFX9-NEXT:    v_sub_u32_e32 v0, v0, v3
4915; GFX9-NEXT:    v_lshlrev_b64 v[0:1], 30, v[0:1]
4916; GFX9-NEXT:    v_and_b32_e32 v3, s8, v5
4917; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 15, v4
4918; GFX9-NEXT:    v_or_b32_e32 v3, v3, v4
4919; GFX9-NEXT:    v_or_b32_e32 v0, v3, v0
4920; GFX9-NEXT:    global_store_dword v2, v0, s[2:3]
4921; GFX9-NEXT:    v_and_b32_e32 v0, 0x1fff, v1
4922; GFX9-NEXT:    global_store_short v2, v0, s[2:3] offset:4
4923; GFX9-NEXT:    s_endpgm
4924  %r = urem <3 x i15> %x, %y
4925  store <3 x i15> %r, <3 x i15> addrspace(1)* %out
4926  ret void
4927}
4928
; sdiv_v3i15: signed division of two <3 x i15> kernel arguments, with the
; quotient vector stored to %out.
;
; The opt assertions below (amdgpu-codegenprepare on tahiti) expect the vector
; sdiv to be scalarized per lane: each i15 element is sign-extended to i32,
; converted to float, and divided through rcp/fmul/trunc. The remainder of that
; estimate is formed with fmad.ftz, and when its magnitude is >= |divisor| the
; quotient is adjusted by ((x ^ y) ashr 30) | 1. The lane result is then
; sign-extended in-register (shl/ashr by 17) and truncated back to i15.
;
; The llc assertions (tahiti and gfx900) pin the resulting ISA, including the
; packing of the three 15-bit lanes into one dword store plus one short store
; at offset 4.
;
; All assertion lines are autogenerated; regenerate them with the update
; scripts named at the top of the file rather than editing them by hand.
4929define amdgpu_kernel void @sdiv_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x, <3 x i15> %y) {
4930; CHECK-LABEL: @sdiv_v3i15(
4931; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <3 x i15> [[X:%.*]], i64 0
4932; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <3 x i15> [[Y:%.*]], i64 0
4933; CHECK-NEXT:    [[TMP3:%.*]] = sext i15 [[TMP1]] to i32
4934; CHECK-NEXT:    [[TMP4:%.*]] = sext i15 [[TMP2]] to i32
4935; CHECK-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]]
4936; CHECK-NEXT:    [[TMP6:%.*]] = ashr i32 [[TMP5]], 30
4937; CHECK-NEXT:    [[TMP7:%.*]] = or i32 [[TMP6]], 1
4938; CHECK-NEXT:    [[TMP8:%.*]] = sitofp i32 [[TMP3]] to float
4939; CHECK-NEXT:    [[TMP9:%.*]] = sitofp i32 [[TMP4]] to float
4940; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]])
4941; CHECK-NEXT:    [[TMP11:%.*]] = fmul fast float [[TMP8]], [[TMP10]]
4942; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.trunc.f32(float [[TMP11]])
4943; CHECK-NEXT:    [[TMP13:%.*]] = fneg fast float [[TMP12]]
4944; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP13]], float [[TMP9]], float [[TMP8]])
4945; CHECK-NEXT:    [[TMP15:%.*]] = fptosi float [[TMP12]] to i32
4946; CHECK-NEXT:    [[TMP16:%.*]] = call fast float @llvm.fabs.f32(float [[TMP14]])
4947; CHECK-NEXT:    [[TMP17:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
4948; CHECK-NEXT:    [[TMP18:%.*]] = fcmp fast oge float [[TMP16]], [[TMP17]]
4949; CHECK-NEXT:    [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP7]], i32 0
4950; CHECK-NEXT:    [[TMP20:%.*]] = add i32 [[TMP15]], [[TMP19]]
4951; CHECK-NEXT:    [[TMP21:%.*]] = shl i32 [[TMP20]], 17
4952; CHECK-NEXT:    [[TMP22:%.*]] = ashr i32 [[TMP21]], 17
4953; CHECK-NEXT:    [[TMP23:%.*]] = trunc i32 [[TMP22]] to i15
4954; CHECK-NEXT:    [[TMP24:%.*]] = insertelement <3 x i15> undef, i15 [[TMP23]], i64 0
4955; CHECK-NEXT:    [[TMP25:%.*]] = extractelement <3 x i15> [[X]], i64 1
4956; CHECK-NEXT:    [[TMP26:%.*]] = extractelement <3 x i15> [[Y]], i64 1
4957; CHECK-NEXT:    [[TMP27:%.*]] = sext i15 [[TMP25]] to i32
4958; CHECK-NEXT:    [[TMP28:%.*]] = sext i15 [[TMP26]] to i32
4959; CHECK-NEXT:    [[TMP29:%.*]] = xor i32 [[TMP27]], [[TMP28]]
4960; CHECK-NEXT:    [[TMP30:%.*]] = ashr i32 [[TMP29]], 30
4961; CHECK-NEXT:    [[TMP31:%.*]] = or i32 [[TMP30]], 1
4962; CHECK-NEXT:    [[TMP32:%.*]] = sitofp i32 [[TMP27]] to float
4963; CHECK-NEXT:    [[TMP33:%.*]] = sitofp i32 [[TMP28]] to float
4964; CHECK-NEXT:    [[TMP34:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP33]])
4965; CHECK-NEXT:    [[TMP35:%.*]] = fmul fast float [[TMP32]], [[TMP34]]
4966; CHECK-NEXT:    [[TMP36:%.*]] = call fast float @llvm.trunc.f32(float [[TMP35]])
4967; CHECK-NEXT:    [[TMP37:%.*]] = fneg fast float [[TMP36]]
4968; CHECK-NEXT:    [[TMP38:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP37]], float [[TMP33]], float [[TMP32]])
4969; CHECK-NEXT:    [[TMP39:%.*]] = fptosi float [[TMP36]] to i32
4970; CHECK-NEXT:    [[TMP40:%.*]] = call fast float @llvm.fabs.f32(float [[TMP38]])
4971; CHECK-NEXT:    [[TMP41:%.*]] = call fast float @llvm.fabs.f32(float [[TMP33]])
4972; CHECK-NEXT:    [[TMP42:%.*]] = fcmp fast oge float [[TMP40]], [[TMP41]]
4973; CHECK-NEXT:    [[TMP43:%.*]] = select i1 [[TMP42]], i32 [[TMP31]], i32 0
4974; CHECK-NEXT:    [[TMP44:%.*]] = add i32 [[TMP39]], [[TMP43]]
4975; CHECK-NEXT:    [[TMP45:%.*]] = shl i32 [[TMP44]], 17
4976; CHECK-NEXT:    [[TMP46:%.*]] = ashr i32 [[TMP45]], 17
4977; CHECK-NEXT:    [[TMP47:%.*]] = trunc i32 [[TMP46]] to i15
4978; CHECK-NEXT:    [[TMP48:%.*]] = insertelement <3 x i15> [[TMP24]], i15 [[TMP47]], i64 1
4979; CHECK-NEXT:    [[TMP49:%.*]] = extractelement <3 x i15> [[X]], i64 2
4980; CHECK-NEXT:    [[TMP50:%.*]] = extractelement <3 x i15> [[Y]], i64 2
4981; CHECK-NEXT:    [[TMP51:%.*]] = sext i15 [[TMP49]] to i32
4982; CHECK-NEXT:    [[TMP52:%.*]] = sext i15 [[TMP50]] to i32
4983; CHECK-NEXT:    [[TMP53:%.*]] = xor i32 [[TMP51]], [[TMP52]]
4984; CHECK-NEXT:    [[TMP54:%.*]] = ashr i32 [[TMP53]], 30
4985; CHECK-NEXT:    [[TMP55:%.*]] = or i32 [[TMP54]], 1
4986; CHECK-NEXT:    [[TMP56:%.*]] = sitofp i32 [[TMP51]] to float
4987; CHECK-NEXT:    [[TMP57:%.*]] = sitofp i32 [[TMP52]] to float
4988; CHECK-NEXT:    [[TMP58:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP57]])
4989; CHECK-NEXT:    [[TMP59:%.*]] = fmul fast float [[TMP56]], [[TMP58]]
4990; CHECK-NEXT:    [[TMP60:%.*]] = call fast float @llvm.trunc.f32(float [[TMP59]])
4991; CHECK-NEXT:    [[TMP61:%.*]] = fneg fast float [[TMP60]]
4992; CHECK-NEXT:    [[TMP62:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP61]], float [[TMP57]], float [[TMP56]])
4993; CHECK-NEXT:    [[TMP63:%.*]] = fptosi float [[TMP60]] to i32
4994; CHECK-NEXT:    [[TMP64:%.*]] = call fast float @llvm.fabs.f32(float [[TMP62]])
4995; CHECK-NEXT:    [[TMP65:%.*]] = call fast float @llvm.fabs.f32(float [[TMP57]])
4996; CHECK-NEXT:    [[TMP66:%.*]] = fcmp fast oge float [[TMP64]], [[TMP65]]
4997; CHECK-NEXT:    [[TMP67:%.*]] = select i1 [[TMP66]], i32 [[TMP55]], i32 0
4998; CHECK-NEXT:    [[TMP68:%.*]] = add i32 [[TMP63]], [[TMP67]]
4999; CHECK-NEXT:    [[TMP69:%.*]] = shl i32 [[TMP68]], 17
5000; CHECK-NEXT:    [[TMP70:%.*]] = ashr i32 [[TMP69]], 17
5001; CHECK-NEXT:    [[TMP71:%.*]] = trunc i32 [[TMP70]] to i15
5002; CHECK-NEXT:    [[TMP72:%.*]] = insertelement <3 x i15> [[TMP48]], i15 [[TMP71]], i64 2
5003; CHECK-NEXT:    store <3 x i15> [[TMP72]], <3 x i15> addrspace(1)* [[OUT:%.*]], align 8
5004; CHECK-NEXT:    ret void
5005;
5006; GFX6-LABEL: sdiv_v3i15:
5007; GFX6:       ; %bb.0:
5008; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
5009; GFX6-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
5010; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
5011; GFX6-NEXT:    s_mov_b32 s7, 0xf000
5012; GFX6-NEXT:    s_mov_b32 s6, -1
5013; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
5014; GFX6-NEXT:    v_mov_b32_e32 v0, s2
5015; GFX6-NEXT:    v_alignbit_b32 v0, s3, v0, 30
5016; GFX6-NEXT:    s_bfe_i32 s3, s0, 0xf0000
5017; GFX6-NEXT:    v_cvt_f32_i32_e32 v2, s3
5018; GFX6-NEXT:    v_mov_b32_e32 v1, s0
5019; GFX6-NEXT:    v_alignbit_b32 v1, s1, v1, 30
5020; GFX6-NEXT:    s_bfe_i32 s1, s2, 0xf0000
5021; GFX6-NEXT:    v_cvt_f32_i32_e32 v3, s1
5022; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v2
5023; GFX6-NEXT:    s_xor_b32 s1, s1, s3
5024; GFX6-NEXT:    s_bfe_i32 s0, s0, 0xf000f
5025; GFX6-NEXT:    s_ashr_i32 s1, s1, 30
5026; GFX6-NEXT:    v_mul_f32_e32 v4, v3, v4
5027; GFX6-NEXT:    v_trunc_f32_e32 v4, v4
5028; GFX6-NEXT:    v_mad_f32 v3, -v4, v2, v3
5029; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, |v2|
5030; GFX6-NEXT:    v_cvt_i32_f32_e32 v4, v4
5031; GFX6-NEXT:    v_cvt_f32_i32_e32 v3, s0
5032; GFX6-NEXT:    s_or_b32 s1, s1, 1
5033; GFX6-NEXT:    v_mov_b32_e32 v5, s1
5034; GFX6-NEXT:    v_cndmask_b32_e32 v2, 0, v5, vcc
5035; GFX6-NEXT:    s_bfe_i32 s1, s2, 0xf000f
5036; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
5037; GFX6-NEXT:    v_cvt_f32_i32_e32 v4, s1
5038; GFX6-NEXT:    v_rcp_iflag_f32_e32 v5, v3
5039; GFX6-NEXT:    s_xor_b32 s0, s1, s0
5040; GFX6-NEXT:    v_bfe_i32 v1, v1, 0, 15
5041; GFX6-NEXT:    s_ashr_i32 s0, s0, 30
5042; GFX6-NEXT:    v_mul_f32_e32 v5, v4, v5
5043; GFX6-NEXT:    v_trunc_f32_e32 v5, v5
5044; GFX6-NEXT:    v_mad_f32 v4, -v5, v3, v4
5045; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v4|, |v3|
5046; GFX6-NEXT:    v_cvt_i32_f32_e32 v5, v5
5047; GFX6-NEXT:    v_cvt_f32_i32_e32 v4, v1
5048; GFX6-NEXT:    s_or_b32 s0, s0, 1
5049; GFX6-NEXT:    v_mov_b32_e32 v6, s0
5050; GFX6-NEXT:    v_cndmask_b32_e32 v3, 0, v6, vcc
5051; GFX6-NEXT:    v_bfe_i32 v0, v0, 0, 15
5052; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
5053; GFX6-NEXT:    v_cvt_f32_i32_e32 v5, v0
5054; GFX6-NEXT:    v_rcp_iflag_f32_e32 v6, v4
5055; GFX6-NEXT:    v_xor_b32_e32 v0, v0, v1
5056; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 30, v0
5057; GFX6-NEXT:    v_or_b32_e32 v0, 1, v0
5058; GFX6-NEXT:    v_mul_f32_e32 v1, v5, v6
5059; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
5060; GFX6-NEXT:    v_mad_f32 v5, -v1, v4, v5
5061; GFX6-NEXT:    v_cvt_i32_f32_e32 v1, v1
5062; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v5|, |v4|
5063; GFX6-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
5064; GFX6-NEXT:    s_movk_i32 s0, 0x7fff
5065; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
5066; GFX6-NEXT:    v_and_b32_e32 v3, s0, v3
5067; GFX6-NEXT:    v_lshl_b64 v[0:1], v[0:1], 30
5068; GFX6-NEXT:    v_and_b32_e32 v2, s0, v2
5069; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 15, v3
5070; GFX6-NEXT:    v_or_b32_e32 v2, v2, v3
5071; GFX6-NEXT:    v_or_b32_e32 v0, v2, v0
5072; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
5073; GFX6-NEXT:    s_waitcnt expcnt(0)
5074; GFX6-NEXT:    v_and_b32_e32 v0, 0x1fff, v1
5075; GFX6-NEXT:    buffer_store_short v0, off, s[4:7], 0 offset:4
5076; GFX6-NEXT:    s_endpgm
5077;
5078; GFX9-LABEL: sdiv_v3i15:
5079; GFX9:       ; %bb.0:
5080; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
5081; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
5082; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x34
5083; GFX9-NEXT:    v_mov_b32_e32 v2, 0
5084; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5085; GFX9-NEXT:    s_bfe_i32 s1, s4, 0xf0000
5086; GFX9-NEXT:    s_bfe_i32 s0, s6, 0xf0000
5087; GFX9-NEXT:    v_cvt_f32_i32_e32 v3, s0
5088; GFX9-NEXT:    v_cvt_f32_i32_e32 v4, s1
5089; GFX9-NEXT:    s_xor_b32 s0, s1, s0
5090; GFX9-NEXT:    v_mov_b32_e32 v0, s4
5091; GFX9-NEXT:    v_rcp_iflag_f32_e32 v5, v3
5092; GFX9-NEXT:    s_ashr_i32 s0, s0, 30
5093; GFX9-NEXT:    v_alignbit_b32 v0, s5, v0, 30
5094; GFX9-NEXT:    s_or_b32 s5, s0, 1
5095; GFX9-NEXT:    v_mul_f32_e32 v5, v4, v5
5096; GFX9-NEXT:    v_trunc_f32_e32 v5, v5
5097; GFX9-NEXT:    v_mad_f32 v4, -v5, v3, v4
5098; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v4|, |v3|
5099; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
5100; GFX9-NEXT:    s_cselect_b32 s0, s5, 0
5101; GFX9-NEXT:    v_cvt_i32_f32_e32 v5, v5
5102; GFX9-NEXT:    s_bfe_i32 s1, s6, 0xf000f
5103; GFX9-NEXT:    v_cvt_f32_i32_e32 v3, s1
5104; GFX9-NEXT:    v_mov_b32_e32 v1, s6
5105; GFX9-NEXT:    v_add_u32_e32 v4, s0, v5
5106; GFX9-NEXT:    s_bfe_i32 s0, s4, 0xf000f
5107; GFX9-NEXT:    v_cvt_f32_i32_e32 v5, s0
5108; GFX9-NEXT:    v_rcp_iflag_f32_e32 v6, v3
5109; GFX9-NEXT:    v_alignbit_b32 v1, s7, v1, 30
5110; GFX9-NEXT:    s_xor_b32 s0, s0, s1
5111; GFX9-NEXT:    s_ashr_i32 s0, s0, 30
5112; GFX9-NEXT:    v_mul_f32_e32 v6, v5, v6
5113; GFX9-NEXT:    v_trunc_f32_e32 v6, v6
5114; GFX9-NEXT:    v_mad_f32 v5, -v6, v3, v5
5115; GFX9-NEXT:    v_bfe_i32 v1, v1, 0, 15
5116; GFX9-NEXT:    s_or_b32 s4, s0, 1
5117; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v5|, |v3|
5118; GFX9-NEXT:    v_cvt_f32_i32_e32 v3, v1
5119; GFX9-NEXT:    v_cvt_i32_f32_e32 v6, v6
5120; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
5121; GFX9-NEXT:    s_cselect_b32 s0, s4, 0
5122; GFX9-NEXT:    v_bfe_i32 v0, v0, 0, 15
5123; GFX9-NEXT:    v_add_u32_e32 v5, s0, v6
5124; GFX9-NEXT:    v_cvt_f32_i32_e32 v6, v0
5125; GFX9-NEXT:    v_rcp_iflag_f32_e32 v7, v3
5126; GFX9-NEXT:    v_xor_b32_e32 v0, v0, v1
5127; GFX9-NEXT:    v_ashrrev_i32_e32 v0, 30, v0
5128; GFX9-NEXT:    v_or_b32_e32 v0, 1, v0
5129; GFX9-NEXT:    v_mul_f32_e32 v1, v6, v7
5130; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
5131; GFX9-NEXT:    v_cvt_i32_f32_e32 v7, v1
5132; GFX9-NEXT:    v_mad_f32 v1, -v1, v3, v6
5133; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v3|
5134; GFX9-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
5135; GFX9-NEXT:    s_movk_i32 s0, 0x7fff
5136; GFX9-NEXT:    v_add_u32_e32 v0, v7, v0
5137; GFX9-NEXT:    v_and_b32_e32 v3, s0, v4
5138; GFX9-NEXT:    v_and_b32_e32 v4, s0, v5
5139; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 15, v4
5140; GFX9-NEXT:    v_lshlrev_b64 v[0:1], 30, v[0:1]
5141; GFX9-NEXT:    v_or_b32_e32 v3, v3, v4
5142; GFX9-NEXT:    v_or_b32_e32 v0, v3, v0
5143; GFX9-NEXT:    global_store_dword v2, v0, s[2:3]
5144; GFX9-NEXT:    v_and_b32_e32 v0, 0x1fff, v1
5145; GFX9-NEXT:    global_store_short v2, v0, s[2:3] offset:4
5146; GFX9-NEXT:    s_endpgm
5147  %r = sdiv <3 x i15> %x, %y
5148  store <3 x i15> %r, <3 x i15> addrspace(1)* %out
5149  ret void
5150}
5151
; srem_v3i15: signed remainder of two <3 x i15> kernel arguments, with the
; remainder vector stored to %out.
;
; The opt assertions below (amdgpu-codegenprepare on tahiti) expect the vector
; srem to be scalarized per lane. Each i15 element is sign-extended to i32 and
; a quotient is computed through the float path (sitofp, rcp, fmul, trunc,
; fmad.ftz) with a correction of ((x ^ y) ashr 30) | 1 applied when the
; estimated remainder's magnitude is >= |divisor|. The remainder is then
; x - quotient * y, sign-extended in-register (shl/ashr by 17) and truncated
; back to i15.
;
; The llc assertions (tahiti and gfx900) pin the resulting ISA, including the
; packing of the three 15-bit lanes into one dword store plus one short store
; at offset 4.
;
; All assertion lines are autogenerated; regenerate them with the update
; scripts named at the top of the file rather than editing them by hand.
5152define amdgpu_kernel void @srem_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x, <3 x i15> %y) {
5153; CHECK-LABEL: @srem_v3i15(
5154; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <3 x i15> [[X:%.*]], i64 0
5155; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <3 x i15> [[Y:%.*]], i64 0
5156; CHECK-NEXT:    [[TMP3:%.*]] = sext i15 [[TMP1]] to i32
5157; CHECK-NEXT:    [[TMP4:%.*]] = sext i15 [[TMP2]] to i32
5158; CHECK-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]]
5159; CHECK-NEXT:    [[TMP6:%.*]] = ashr i32 [[TMP5]], 30
5160; CHECK-NEXT:    [[TMP7:%.*]] = or i32 [[TMP6]], 1
5161; CHECK-NEXT:    [[TMP8:%.*]] = sitofp i32 [[TMP3]] to float
5162; CHECK-NEXT:    [[TMP9:%.*]] = sitofp i32 [[TMP4]] to float
5163; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]])
5164; CHECK-NEXT:    [[TMP11:%.*]] = fmul fast float [[TMP8]], [[TMP10]]
5165; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.trunc.f32(float [[TMP11]])
5166; CHECK-NEXT:    [[TMP13:%.*]] = fneg fast float [[TMP12]]
5167; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP13]], float [[TMP9]], float [[TMP8]])
5168; CHECK-NEXT:    [[TMP15:%.*]] = fptosi float [[TMP12]] to i32
5169; CHECK-NEXT:    [[TMP16:%.*]] = call fast float @llvm.fabs.f32(float [[TMP14]])
5170; CHECK-NEXT:    [[TMP17:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
5171; CHECK-NEXT:    [[TMP18:%.*]] = fcmp fast oge float [[TMP16]], [[TMP17]]
5172; CHECK-NEXT:    [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP7]], i32 0
5173; CHECK-NEXT:    [[TMP20:%.*]] = add i32 [[TMP15]], [[TMP19]]
5174; CHECK-NEXT:    [[TMP21:%.*]] = mul i32 [[TMP20]], [[TMP4]]
5175; CHECK-NEXT:    [[TMP22:%.*]] = sub i32 [[TMP3]], [[TMP21]]
5176; CHECK-NEXT:    [[TMP23:%.*]] = shl i32 [[TMP22]], 17
5177; CHECK-NEXT:    [[TMP24:%.*]] = ashr i32 [[TMP23]], 17
5178; CHECK-NEXT:    [[TMP25:%.*]] = trunc i32 [[TMP24]] to i15
5179; CHECK-NEXT:    [[TMP26:%.*]] = insertelement <3 x i15> undef, i15 [[TMP25]], i64 0
5180; CHECK-NEXT:    [[TMP27:%.*]] = extractelement <3 x i15> [[X]], i64 1
5181; CHECK-NEXT:    [[TMP28:%.*]] = extractelement <3 x i15> [[Y]], i64 1
5182; CHECK-NEXT:    [[TMP29:%.*]] = sext i15 [[TMP27]] to i32
5183; CHECK-NEXT:    [[TMP30:%.*]] = sext i15 [[TMP28]] to i32
5184; CHECK-NEXT:    [[TMP31:%.*]] = xor i32 [[TMP29]], [[TMP30]]
5185; CHECK-NEXT:    [[TMP32:%.*]] = ashr i32 [[TMP31]], 30
5186; CHECK-NEXT:    [[TMP33:%.*]] = or i32 [[TMP32]], 1
5187; CHECK-NEXT:    [[TMP34:%.*]] = sitofp i32 [[TMP29]] to float
5188; CHECK-NEXT:    [[TMP35:%.*]] = sitofp i32 [[TMP30]] to float
5189; CHECK-NEXT:    [[TMP36:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP35]])
5190; CHECK-NEXT:    [[TMP37:%.*]] = fmul fast float [[TMP34]], [[TMP36]]
5191; CHECK-NEXT:    [[TMP38:%.*]] = call fast float @llvm.trunc.f32(float [[TMP37]])
5192; CHECK-NEXT:    [[TMP39:%.*]] = fneg fast float [[TMP38]]
5193; CHECK-NEXT:    [[TMP40:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP39]], float [[TMP35]], float [[TMP34]])
5194; CHECK-NEXT:    [[TMP41:%.*]] = fptosi float [[TMP38]] to i32
5195; CHECK-NEXT:    [[TMP42:%.*]] = call fast float @llvm.fabs.f32(float [[TMP40]])
5196; CHECK-NEXT:    [[TMP43:%.*]] = call fast float @llvm.fabs.f32(float [[TMP35]])
5197; CHECK-NEXT:    [[TMP44:%.*]] = fcmp fast oge float [[TMP42]], [[TMP43]]
5198; CHECK-NEXT:    [[TMP45:%.*]] = select i1 [[TMP44]], i32 [[TMP33]], i32 0
5199; CHECK-NEXT:    [[TMP46:%.*]] = add i32 [[TMP41]], [[TMP45]]
5200; CHECK-NEXT:    [[TMP47:%.*]] = mul i32 [[TMP46]], [[TMP30]]
5201; CHECK-NEXT:    [[TMP48:%.*]] = sub i32 [[TMP29]], [[TMP47]]
5202; CHECK-NEXT:    [[TMP49:%.*]] = shl i32 [[TMP48]], 17
5203; CHECK-NEXT:    [[TMP50:%.*]] = ashr i32 [[TMP49]], 17
5204; CHECK-NEXT:    [[TMP51:%.*]] = trunc i32 [[TMP50]] to i15
5205; CHECK-NEXT:    [[TMP52:%.*]] = insertelement <3 x i15> [[TMP26]], i15 [[TMP51]], i64 1
5206; CHECK-NEXT:    [[TMP53:%.*]] = extractelement <3 x i15> [[X]], i64 2
5207; CHECK-NEXT:    [[TMP54:%.*]] = extractelement <3 x i15> [[Y]], i64 2
5208; CHECK-NEXT:    [[TMP55:%.*]] = sext i15 [[TMP53]] to i32
5209; CHECK-NEXT:    [[TMP56:%.*]] = sext i15 [[TMP54]] to i32
5210; CHECK-NEXT:    [[TMP57:%.*]] = xor i32 [[TMP55]], [[TMP56]]
5211; CHECK-NEXT:    [[TMP58:%.*]] = ashr i32 [[TMP57]], 30
5212; CHECK-NEXT:    [[TMP59:%.*]] = or i32 [[TMP58]], 1
5213; CHECK-NEXT:    [[TMP60:%.*]] = sitofp i32 [[TMP55]] to float
5214; CHECK-NEXT:    [[TMP61:%.*]] = sitofp i32 [[TMP56]] to float
5215; CHECK-NEXT:    [[TMP62:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP61]])
5216; CHECK-NEXT:    [[TMP63:%.*]] = fmul fast float [[TMP60]], [[TMP62]]
5217; CHECK-NEXT:    [[TMP64:%.*]] = call fast float @llvm.trunc.f32(float [[TMP63]])
5218; CHECK-NEXT:    [[TMP65:%.*]] = fneg fast float [[TMP64]]
5219; CHECK-NEXT:    [[TMP66:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP65]], float [[TMP61]], float [[TMP60]])
5220; CHECK-NEXT:    [[TMP67:%.*]] = fptosi float [[TMP64]] to i32
5221; CHECK-NEXT:    [[TMP68:%.*]] = call fast float @llvm.fabs.f32(float [[TMP66]])
5222; CHECK-NEXT:    [[TMP69:%.*]] = call fast float @llvm.fabs.f32(float [[TMP61]])
5223; CHECK-NEXT:    [[TMP70:%.*]] = fcmp fast oge float [[TMP68]], [[TMP69]]
5224; CHECK-NEXT:    [[TMP71:%.*]] = select i1 [[TMP70]], i32 [[TMP59]], i32 0
5225; CHECK-NEXT:    [[TMP72:%.*]] = add i32 [[TMP67]], [[TMP71]]
5226; CHECK-NEXT:    [[TMP73:%.*]] = mul i32 [[TMP72]], [[TMP56]]
5227; CHECK-NEXT:    [[TMP74:%.*]] = sub i32 [[TMP55]], [[TMP73]]
5228; CHECK-NEXT:    [[TMP75:%.*]] = shl i32 [[TMP74]], 17
5229; CHECK-NEXT:    [[TMP76:%.*]] = ashr i32 [[TMP75]], 17
5230; CHECK-NEXT:    [[TMP77:%.*]] = trunc i32 [[TMP76]] to i15
5231; CHECK-NEXT:    [[TMP78:%.*]] = insertelement <3 x i15> [[TMP52]], i15 [[TMP77]], i64 2
5232; CHECK-NEXT:    store <3 x i15> [[TMP78]], <3 x i15> addrspace(1)* [[OUT:%.*]], align 8
5233; CHECK-NEXT:    ret void
5234;
5235; GFX6-LABEL: srem_v3i15:
5236; GFX6:       ; %bb.0:
5237; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
5238; GFX6-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
5239; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
5240; GFX6-NEXT:    s_mov_b32 s7, 0xf000
5241; GFX6-NEXT:    s_mov_b32 s6, -1
5242; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
5243; GFX6-NEXT:    v_mov_b32_e32 v0, s2
5244; GFX6-NEXT:    v_alignbit_b32 v0, s3, v0, 30
5245; GFX6-NEXT:    s_movk_i32 s3, 0x7fff
5246; GFX6-NEXT:    s_and_b32 s11, s0, s3
5247; GFX6-NEXT:    s_bfe_i32 s11, s11, 0xf0000
5248; GFX6-NEXT:    v_cvt_f32_i32_e32 v2, s11
5249; GFX6-NEXT:    s_and_b32 s9, s2, s3
5250; GFX6-NEXT:    s_bfe_i32 s9, s9, 0xf0000
5251; GFX6-NEXT:    v_cvt_f32_i32_e32 v3, s9
5252; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v2
5253; GFX6-NEXT:    s_xor_b32 s9, s9, s11
5254; GFX6-NEXT:    s_ashr_i32 s9, s9, 30
5255; GFX6-NEXT:    s_or_b32 s9, s9, 1
5256; GFX6-NEXT:    v_mul_f32_e32 v4, v3, v4
5257; GFX6-NEXT:    v_trunc_f32_e32 v4, v4
5258; GFX6-NEXT:    v_mad_f32 v3, -v4, v2, v3
5259; GFX6-NEXT:    v_cvt_i32_f32_e32 v4, v4
5260; GFX6-NEXT:    v_mov_b32_e32 v5, s9
5261; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, |v2|
5262; GFX6-NEXT:    v_cndmask_b32_e32 v2, 0, v5, vcc
5263; GFX6-NEXT:    v_mov_b32_e32 v1, s0
5264; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
5265; GFX6-NEXT:    s_bfe_u32 s12, s0, 0xf000f
5266; GFX6-NEXT:    v_alignbit_b32 v1, s1, v1, 30
5267; GFX6-NEXT:    v_mul_lo_u32 v2, v2, s0
5268; GFX6-NEXT:    s_lshr_b32 s1, s0, 15
5269; GFX6-NEXT:    s_bfe_i32 s0, s12, 0xf0000
5270; GFX6-NEXT:    v_cvt_f32_i32_e32 v3, s0
5271; GFX6-NEXT:    s_bfe_u32 s10, s2, 0xf000f
5272; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s2, v2
5273; GFX6-NEXT:    s_lshr_b32 s8, s2, 15
5274; GFX6-NEXT:    s_bfe_i32 s2, s10, 0xf0000
5275; GFX6-NEXT:    v_cvt_f32_i32_e32 v4, s2
5276; GFX6-NEXT:    v_rcp_iflag_f32_e32 v5, v3
5277; GFX6-NEXT:    s_xor_b32 s0, s2, s0
5278; GFX6-NEXT:    s_ashr_i32 s0, s0, 30
5279; GFX6-NEXT:    s_or_b32 s0, s0, 1
5280; GFX6-NEXT:    v_mul_f32_e32 v5, v4, v5
5281; GFX6-NEXT:    v_trunc_f32_e32 v5, v5
5282; GFX6-NEXT:    v_mad_f32 v4, -v5, v3, v4
5283; GFX6-NEXT:    v_cvt_i32_f32_e32 v5, v5
5284; GFX6-NEXT:    v_and_b32_e32 v1, s3, v1
5285; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v4|, |v3|
5286; GFX6-NEXT:    v_mov_b32_e32 v6, s0
5287; GFX6-NEXT:    v_cndmask_b32_e32 v3, 0, v6, vcc
5288; GFX6-NEXT:    v_bfe_i32 v4, v1, 0, 15
5289; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
5290; GFX6-NEXT:    v_cvt_f32_i32_e32 v5, v4
5291; GFX6-NEXT:    v_and_b32_e32 v0, s3, v0
5292; GFX6-NEXT:    v_bfe_i32 v6, v0, 0, 15
5293; GFX6-NEXT:    v_cvt_f32_i32_e32 v7, v6
5294; GFX6-NEXT:    v_rcp_iflag_f32_e32 v8, v5
5295; GFX6-NEXT:    v_xor_b32_e32 v4, v6, v4
5296; GFX6-NEXT:    v_ashrrev_i32_e32 v4, 30, v4
5297; GFX6-NEXT:    v_or_b32_e32 v4, 1, v4
5298; GFX6-NEXT:    v_mul_f32_e32 v6, v7, v8
5299; GFX6-NEXT:    v_trunc_f32_e32 v6, v6
5300; GFX6-NEXT:    v_mad_f32 v7, -v6, v5, v7
5301; GFX6-NEXT:    v_cvt_i32_f32_e32 v6, v6
5302; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v7|, |v5|
5303; GFX6-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
5304; GFX6-NEXT:    v_mul_lo_u32 v3, v3, s1
5305; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
5306; GFX6-NEXT:    v_mul_lo_u32 v1, v4, v1
5307; GFX6-NEXT:    v_and_b32_e32 v2, s3, v2
5308; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s8, v3
5309; GFX6-NEXT:    v_and_b32_e32 v3, s3, v3
5310; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, v1, v0
5311; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 15, v3
5312; GFX6-NEXT:    v_lshl_b64 v[0:1], v[0:1], 30
5313; GFX6-NEXT:    v_or_b32_e32 v2, v2, v3
5314; GFX6-NEXT:    v_or_b32_e32 v0, v2, v0
5315; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
5316; GFX6-NEXT:    s_waitcnt expcnt(0)
5317; GFX6-NEXT:    v_and_b32_e32 v0, 0x1fff, v1
5318; GFX6-NEXT:    buffer_store_short v0, off, s[4:7], 0 offset:4
5319; GFX6-NEXT:    s_endpgm
5320;
5321; GFX9-LABEL: srem_v3i15:
5322; GFX9:       ; %bb.0:
5323; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
5324; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
5325; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x34
5326; GFX9-NEXT:    s_movk_i32 s8, 0x7fff
5327; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5328; GFX9-NEXT:    s_and_b32 s0, s4, s8
5329; GFX9-NEXT:    s_and_b32 s1, s6, s8
5330; GFX9-NEXT:    s_bfe_i32 s1, s1, 0xf0000
5331; GFX9-NEXT:    v_cvt_f32_i32_e32 v2, s1
5332; GFX9-NEXT:    s_bfe_i32 s0, s0, 0xf0000
5333; GFX9-NEXT:    v_cvt_f32_i32_e32 v3, s0
5334; GFX9-NEXT:    s_xor_b32 s0, s0, s1
5335; GFX9-NEXT:    v_rcp_iflag_f32_e32 v4, v2
5336; GFX9-NEXT:    v_mov_b32_e32 v0, s4
5337; GFX9-NEXT:    v_mov_b32_e32 v1, s6
5338; GFX9-NEXT:    s_ashr_i32 s0, s0, 30
5339; GFX9-NEXT:    v_mul_f32_e32 v4, v3, v4
5340; GFX9-NEXT:    v_trunc_f32_e32 v4, v4
5341; GFX9-NEXT:    v_mad_f32 v3, -v4, v2, v3
5342; GFX9-NEXT:    v_cvt_i32_f32_e32 v4, v4
5343; GFX9-NEXT:    v_alignbit_b32 v0, s5, v0, 30
5344; GFX9-NEXT:    v_alignbit_b32 v1, s7, v1, 30
5345; GFX9-NEXT:    s_or_b32 s11, s0, 1
5346; GFX9-NEXT:    s_lshr_b32 s9, s4, 15
5347; GFX9-NEXT:    s_bfe_u32 s5, s4, 0xf000f
5348; GFX9-NEXT:    s_lshr_b32 s7, s6, 15
5349; GFX9-NEXT:    s_bfe_u32 s10, s6, 0xf000f
5350; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v3|, |v2|
5351; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
5352; GFX9-NEXT:    s_cselect_b32 s0, s11, 0
5353; GFX9-NEXT:    v_add_u32_e32 v2, s0, v4
5354; GFX9-NEXT:    s_bfe_i32 s0, s10, 0xf0000
5355; GFX9-NEXT:    v_cvt_f32_i32_e32 v3, s0
5356; GFX9-NEXT:    s_bfe_i32 s1, s5, 0xf0000
5357; GFX9-NEXT:    v_cvt_f32_i32_e32 v4, s1
5358; GFX9-NEXT:    s_xor_b32 s0, s1, s0
5359; GFX9-NEXT:    v_rcp_iflag_f32_e32 v5, v3
5360; GFX9-NEXT:    s_ashr_i32 s0, s0, 30
5361; GFX9-NEXT:    s_or_b32 s5, s0, 1
5362; GFX9-NEXT:    v_and_b32_e32 v1, s8, v1
5363; GFX9-NEXT:    v_mul_f32_e32 v5, v4, v5
5364; GFX9-NEXT:    v_trunc_f32_e32 v5, v5
5365; GFX9-NEXT:    v_mad_f32 v4, -v5, v3, v4
5366; GFX9-NEXT:    v_cvt_i32_f32_e32 v5, v5
5367; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v4|, |v3|
5368; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
5369; GFX9-NEXT:    s_cselect_b32 s0, s5, 0
5370; GFX9-NEXT:    v_bfe_i32 v4, v1, 0, 15
5371; GFX9-NEXT:    v_add_u32_e32 v3, s0, v5
5372; GFX9-NEXT:    v_cvt_f32_i32_e32 v5, v4
5373; GFX9-NEXT:    v_and_b32_e32 v0, s8, v0
5374; GFX9-NEXT:    v_bfe_i32 v6, v0, 0, 15
5375; GFX9-NEXT:    v_cvt_f32_i32_e32 v7, v6
5376; GFX9-NEXT:    v_rcp_iflag_f32_e32 v8, v5
5377; GFX9-NEXT:    v_xor_b32_e32 v4, v6, v4
5378; GFX9-NEXT:    v_ashrrev_i32_e32 v4, 30, v4
5379; GFX9-NEXT:    v_or_b32_e32 v4, 1, v4
5380; GFX9-NEXT:    v_mul_f32_e32 v6, v7, v8
5381; GFX9-NEXT:    v_trunc_f32_e32 v6, v6
5382; GFX9-NEXT:    v_cvt_i32_f32_e32 v8, v6
5383; GFX9-NEXT:    v_mad_f32 v6, -v6, v5, v7
5384; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v6|, |v5|
5385; GFX9-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
5386; GFX9-NEXT:    v_mul_lo_u32 v3, v3, s7
5387; GFX9-NEXT:    v_add_u32_e32 v4, v8, v4
5388; GFX9-NEXT:    v_mul_lo_u32 v2, v2, s6
5389; GFX9-NEXT:    v_mul_lo_u32 v1, v4, v1
5390; GFX9-NEXT:    v_sub_u32_e32 v3, s9, v3
5391; GFX9-NEXT:    v_and_b32_e32 v3, s8, v3
5392; GFX9-NEXT:    v_sub_u32_e32 v2, s4, v2
5393; GFX9-NEXT:    v_sub_u32_e32 v0, v0, v1
5394; GFX9-NEXT:    v_lshlrev_b64 v[0:1], 30, v[0:1]
5395; GFX9-NEXT:    v_and_b32_e32 v2, s8, v2
5396; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 15, v3
5397; GFX9-NEXT:    v_or_b32_e32 v2, v2, v3
5398; GFX9-NEXT:    v_mov_b32_e32 v4, 0
5399; GFX9-NEXT:    v_or_b32_e32 v0, v2, v0
5400; GFX9-NEXT:    global_store_dword v4, v0, s[2:3]
5401; GFX9-NEXT:    v_and_b32_e32 v0, 0x1fff, v1
5402; GFX9-NEXT:    global_store_short v4, v0, s[2:3] offset:4
5403; GFX9-NEXT:    s_endpgm
5404  %r = srem <3 x i15> %x, %y
5405  store <3 x i15> %r, <3 x i15> addrspace(1)* %out
5406  ret void
5407}
5408
; Scalar i32 unsigned division by the odd constant 1235195.
; The opt checks expect the udiv to be left untouched in the IR. The llc checks
; for GFX6 and GFX9 expect a multiply-high by 0xb2a50881 followed by a
; subtract, shift right by 1, add, and shift right by 20.
5409define amdgpu_kernel void @udiv_i32_oddk_denom(i32 addrspace(1)* %out, i32 %x) {
5410; CHECK-LABEL: @udiv_i32_oddk_denom(
5411; CHECK-NEXT:    [[R:%.*]] = udiv i32 [[X:%.*]], 1235195
5412; CHECK-NEXT:    store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4
5413; CHECK-NEXT:    ret void
5414;
5415; GFX6-LABEL: udiv_i32_oddk_denom:
5416; GFX6:       ; %bb.0:
5417; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
5418; GFX6-NEXT:    s_load_dword s0, s[0:1], 0xb
5419; GFX6-NEXT:    v_mov_b32_e32 v0, 0xb2a50881
5420; GFX6-NEXT:    s_mov_b32 s7, 0xf000
5421; GFX6-NEXT:    s_mov_b32 s6, -1
5422; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
5423; GFX6-NEXT:    v_mul_hi_u32 v0, s0, v0
5424; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s0, v0
5425; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 1, v1
5426; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
5427; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 20, v0
5428; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
5429; GFX6-NEXT:    s_endpgm
5430;
5431; GFX9-LABEL: udiv_i32_oddk_denom:
5432; GFX9:       ; %bb.0:
5433; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
5434; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x2c
5435; GFX9-NEXT:    v_mov_b32_e32 v0, 0
5436; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5437; GFX9-NEXT:    s_mul_hi_u32 s0, s4, 0xb2a50881
5438; GFX9-NEXT:    s_sub_i32 s1, s4, s0
5439; GFX9-NEXT:    s_lshr_b32 s1, s1, 1
5440; GFX9-NEXT:    s_add_i32 s1, s1, s0
5441; GFX9-NEXT:    s_lshr_b32 s0, s1, 20
5442; GFX9-NEXT:    v_mov_b32_e32 v1, s0
5443; GFX9-NEXT:    global_store_dword v0, v1, s[2:3]
5444; GFX9-NEXT:    s_endpgm
5445  %r = udiv i32 %x, 1235195
5446  store i32 %r, i32 addrspace(1)* %out
5447  ret void
5448}
5449
; Scalar i32 unsigned division by the power-of-two constant 4096.
; The opt checks expect the udiv to be left untouched in the IR. The llc checks
; for GFX6 and GFX9 expect a single s_lshr_b32 by 12.
5450define amdgpu_kernel void @udiv_i32_pow2k_denom(i32 addrspace(1)* %out, i32 %x) {
5451; CHECK-LABEL: @udiv_i32_pow2k_denom(
5452; CHECK-NEXT:    [[R:%.*]] = udiv i32 [[X:%.*]], 4096
5453; CHECK-NEXT:    store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4
5454; CHECK-NEXT:    ret void
5455;
5456; GFX6-LABEL: udiv_i32_pow2k_denom:
5457; GFX6:       ; %bb.0:
5458; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
5459; GFX6-NEXT:    s_load_dword s0, s[0:1], 0xb
5460; GFX6-NEXT:    s_mov_b32 s7, 0xf000
5461; GFX6-NEXT:    s_mov_b32 s6, -1
5462; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
5463; GFX6-NEXT:    s_lshr_b32 s0, s0, 12
5464; GFX6-NEXT:    v_mov_b32_e32 v0, s0
5465; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
5466; GFX6-NEXT:    s_endpgm
5467;
5468; GFX9-LABEL: udiv_i32_pow2k_denom:
5469; GFX9:       ; %bb.0:
5470; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
5471; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x2c
5472; GFX9-NEXT:    v_mov_b32_e32 v0, 0
5473; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5474; GFX9-NEXT:    s_lshr_b32 s0, s4, 12
5475; GFX9-NEXT:    v_mov_b32_e32 v1, s0
5476; GFX9-NEXT:    global_store_dword v0, v1, s[2:3]
5477; GFX9-NEXT:    s_endpgm
5478  %r = udiv i32 %x, 4096
5479  store i32 %r, i32 addrspace(1)* %out
5480  ret void
5481}
5482
; Scalar i32 unsigned division where the divisor is (4096 << %y).
; The opt checks expect the shl and udiv to be left untouched in the IR. The
; llc checks for GFX6 and GFX9 expect 12 to be added to %y and %x to be
; shifted right by that sum with s_lshr_b32.
5483define amdgpu_kernel void @udiv_i32_pow2_shl_denom(i32 addrspace(1)* %out, i32 %x, i32 %y) {
5484; CHECK-LABEL: @udiv_i32_pow2_shl_denom(
5485; CHECK-NEXT:    [[SHL_Y:%.*]] = shl i32 4096, [[Y:%.*]]
5486; CHECK-NEXT:    [[R:%.*]] = udiv i32 [[X:%.*]], [[SHL_Y]]
5487; CHECK-NEXT:    store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4
5488; CHECK-NEXT:    ret void
5489;
5490; GFX6-LABEL: udiv_i32_pow2_shl_denom:
5491; GFX6:       ; %bb.0:
5492; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
5493; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
5494; GFX6-NEXT:    s_mov_b32 s7, 0xf000
5495; GFX6-NEXT:    s_mov_b32 s6, -1
5496; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
5497; GFX6-NEXT:    s_add_i32 s1, s1, 12
5498; GFX6-NEXT:    s_lshr_b32 s0, s0, s1
5499; GFX6-NEXT:    v_mov_b32_e32 v0, s0
5500; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
5501; GFX6-NEXT:    s_endpgm
5502;
5503; GFX9-LABEL: udiv_i32_pow2_shl_denom:
5504; GFX9:       ; %bb.0:
5505; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
5506; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
5507; GFX9-NEXT:    v_mov_b32_e32 v0, 0
5508; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5509; GFX9-NEXT:    s_add_i32 s0, s5, 12
5510; GFX9-NEXT:    s_lshr_b32 s0, s4, s0
5511; GFX9-NEXT:    v_mov_b32_e32 v1, s0
5512; GFX9-NEXT:    global_store_dword v0, v1, s[2:3]
5513; GFX9-NEXT:    s_endpgm
5514  %shl.y = shl i32 4096, %y
5515  %r = udiv i32 %x, %shl.y
5516  store i32 %r, i32 addrspace(1)* %out
5517  ret void
5518}
5519
; <2 x i32> unsigned division by the splat constant 4096.
; The opt checks expect the vector udiv to be scalarized into one
; extractelement / udiv / insertelement triple per lane. The llc checks for
; GFX6 and GFX9 expect one s_lshr_b32 by 12 per lane.
5520define amdgpu_kernel void @udiv_v2i32_pow2k_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x) {
5521; CHECK-LABEL: @udiv_v2i32_pow2k_denom(
5522; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0
5523; CHECK-NEXT:    [[TMP2:%.*]] = udiv i32 [[TMP1]], 4096
5524; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i64 0
5525; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1
5526; CHECK-NEXT:    [[TMP5:%.*]] = udiv i32 [[TMP4]], 4096
5527; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1
5528; CHECK-NEXT:    store <2 x i32> [[TMP6]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8
5529; CHECK-NEXT:    ret void
5530;
5531; GFX6-LABEL: udiv_v2i32_pow2k_denom:
5532; GFX6:       ; %bb.0:
5533; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
5534; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
5535; GFX6-NEXT:    s_mov_b32 s7, 0xf000
5536; GFX6-NEXT:    s_mov_b32 s6, -1
5537; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
5538; GFX6-NEXT:    s_lshr_b32 s0, s0, 12
5539; GFX6-NEXT:    s_lshr_b32 s1, s1, 12
5540; GFX6-NEXT:    v_mov_b32_e32 v0, s0
5541; GFX6-NEXT:    v_mov_b32_e32 v1, s1
5542; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
5543; GFX6-NEXT:    s_endpgm
5544;
5545; GFX9-LABEL: udiv_v2i32_pow2k_denom:
5546; GFX9:       ; %bb.0:
5547; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
5548; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
5549; GFX9-NEXT:    v_mov_b32_e32 v2, 0
5550; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5551; GFX9-NEXT:    s_lshr_b32 s0, s4, 12
5552; GFX9-NEXT:    s_lshr_b32 s1, s5, 12
5553; GFX9-NEXT:    v_mov_b32_e32 v0, s0
5554; GFX9-NEXT:    v_mov_b32_e32 v1, s1
5555; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
5556; GFX9-NEXT:    s_endpgm
5557  %r = udiv <2 x i32> %x, <i32 4096, i32 4096>
5558  store <2 x i32> %r, <2 x i32> addrspace(1)* %out
5559  ret void
5560}
5561
; <2 x i32> unsigned division by the constant vector <4096, 4095>, i.e. one
; power-of-two lane and one non-power-of-two lane.
; The opt checks expect the vector udiv to be scalarized into per-lane udivs.
; The llc checks for GFX6 and GFX9 expect lane 0 to become a shift right by 12
; and lane 1 to become a multiply-high by 0x100101 followed by a subtract,
; shift right by 1, add, and shift right by 11.
5562define amdgpu_kernel void @udiv_v2i32_mixed_pow2k_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x) {
5563; CHECK-LABEL: @udiv_v2i32_mixed_pow2k_denom(
5564; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0
5565; CHECK-NEXT:    [[TMP2:%.*]] = udiv i32 [[TMP1]], 4096
5566; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i64 0
5567; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1
5568; CHECK-NEXT:    [[TMP5:%.*]] = udiv i32 [[TMP4]], 4095
5569; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1
5570; CHECK-NEXT:    store <2 x i32> [[TMP6]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8
5571; CHECK-NEXT:    ret void
5572;
5573; GFX6-LABEL: udiv_v2i32_mixed_pow2k_denom:
5574; GFX6:       ; %bb.0:
5575; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
5576; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
5577; GFX6-NEXT:    v_mov_b32_e32 v0, 0x100101
5578; GFX6-NEXT:    s_mov_b32 s7, 0xf000
5579; GFX6-NEXT:    s_mov_b32 s6, -1
5580; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
5581; GFX6-NEXT:    v_mul_hi_u32 v0, s1, v0
5582; GFX6-NEXT:    s_lshr_b32 s0, s0, 12
5583; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s1, v0
5584; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 1, v1
5585; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
5586; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 11, v0
5587; GFX6-NEXT:    v_mov_b32_e32 v0, s0
5588; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
5589; GFX6-NEXT:    s_endpgm
5590;
5591; GFX9-LABEL: udiv_v2i32_mixed_pow2k_denom:
5592; GFX9:       ; %bb.0:
5593; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
5594; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
5595; GFX9-NEXT:    v_mov_b32_e32 v2, 0
5596; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5597; GFX9-NEXT:    s_mul_hi_u32 s1, s5, 0x100101
5598; GFX9-NEXT:    s_lshr_b32 s0, s4, 12
5599; GFX9-NEXT:    s_sub_i32 s4, s5, s1
5600; GFX9-NEXT:    s_lshr_b32 s4, s4, 1
5601; GFX9-NEXT:    s_add_i32 s4, s4, s1
5602; GFX9-NEXT:    s_lshr_b32 s1, s4, 11
5603; GFX9-NEXT:    v_mov_b32_e32 v0, s0
5604; GFX9-NEXT:    v_mov_b32_e32 v1, s1
5605; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
5606; GFX9-NEXT:    s_endpgm
5607  %r = udiv <2 x i32> %x, <i32 4096, i32 4095>
5608  store <2 x i32> %r, <2 x i32> addrspace(1)* %out
5609  ret void
5610}
5611
; <2 x i32> unsigned division where the divisor is (<4096, 4096> << %y).
; The opt checks expect each lane to be expanded in IR into the
; llvm.amdgcn.rcp.f32 based sequence: uitofp, rcp, fmul, fptoui, a refinement
; multiply, the quotient estimate, and two icmp uge / select correction steps.
; The llc checks for GFX6 and GFX9 likewise expect v_rcp_iflag_f32 plus
; multiply, compare, and v_cndmask sequences for both lanes; no shift-based
; lowering of the division is expected here.
5612define amdgpu_kernel void @udiv_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x, <2 x i32> %y) {
5613; CHECK-LABEL: @udiv_v2i32_pow2_shl_denom(
5614; CHECK-NEXT:    [[SHL_Y:%.*]] = shl <2 x i32> <i32 4096, i32 4096>, [[Y:%.*]]
5615; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0
5616; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 0
5617; CHECK-NEXT:    [[TMP3:%.*]] = uitofp i32 [[TMP2]] to float
5618; CHECK-NEXT:    [[TMP4:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP3]])
5619; CHECK-NEXT:    [[TMP5:%.*]] = fmul fast float [[TMP4]], 0x41EFFFFFC0000000
5620; CHECK-NEXT:    [[TMP6:%.*]] = fptoui float [[TMP5]] to i32
5621; CHECK-NEXT:    [[TMP7:%.*]] = sub i32 0, [[TMP2]]
5622; CHECK-NEXT:    [[TMP8:%.*]] = mul i32 [[TMP7]], [[TMP6]]
5623; CHECK-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP6]] to i64
5624; CHECK-NEXT:    [[TMP10:%.*]] = zext i32 [[TMP8]] to i64
5625; CHECK-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP9]], [[TMP10]]
5626; CHECK-NEXT:    [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32
5627; CHECK-NEXT:    [[TMP13:%.*]] = lshr i64 [[TMP11]], 32
5628; CHECK-NEXT:    [[TMP14:%.*]] = trunc i64 [[TMP13]] to i32
5629; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[TMP6]], [[TMP14]]
5630; CHECK-NEXT:    [[TMP16:%.*]] = zext i32 [[TMP1]] to i64
5631; CHECK-NEXT:    [[TMP17:%.*]] = zext i32 [[TMP15]] to i64
5632; CHECK-NEXT:    [[TMP18:%.*]] = mul i64 [[TMP16]], [[TMP17]]
5633; CHECK-NEXT:    [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
5634; CHECK-NEXT:    [[TMP20:%.*]] = lshr i64 [[TMP18]], 32
5635; CHECK-NEXT:    [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32
5636; CHECK-NEXT:    [[TMP22:%.*]] = mul i32 [[TMP21]], [[TMP2]]
5637; CHECK-NEXT:    [[TMP23:%.*]] = sub i32 [[TMP1]], [[TMP22]]
5638; CHECK-NEXT:    [[TMP24:%.*]] = icmp uge i32 [[TMP23]], [[TMP2]]
5639; CHECK-NEXT:    [[TMP25:%.*]] = add i32 [[TMP21]], 1
5640; CHECK-NEXT:    [[TMP26:%.*]] = select i1 [[TMP24]], i32 [[TMP25]], i32 [[TMP21]]
5641; CHECK-NEXT:    [[TMP27:%.*]] = sub i32 [[TMP23]], [[TMP2]]
5642; CHECK-NEXT:    [[TMP28:%.*]] = select i1 [[TMP24]], i32 [[TMP27]], i32 [[TMP23]]
5643; CHECK-NEXT:    [[TMP29:%.*]] = icmp uge i32 [[TMP28]], [[TMP2]]
5644; CHECK-NEXT:    [[TMP30:%.*]] = add i32 [[TMP26]], 1
5645; CHECK-NEXT:    [[TMP31:%.*]] = select i1 [[TMP29]], i32 [[TMP30]], i32 [[TMP26]]
5646; CHECK-NEXT:    [[TMP32:%.*]] = insertelement <2 x i32> undef, i32 [[TMP31]], i64 0
5647; CHECK-NEXT:    [[TMP33:%.*]] = extractelement <2 x i32> [[X]], i64 1
5648; CHECK-NEXT:    [[TMP34:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 1
5649; CHECK-NEXT:    [[TMP35:%.*]] = uitofp i32 [[TMP34]] to float
5650; CHECK-NEXT:    [[TMP36:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP35]])
5651; CHECK-NEXT:    [[TMP37:%.*]] = fmul fast float [[TMP36]], 0x41EFFFFFC0000000
5652; CHECK-NEXT:    [[TMP38:%.*]] = fptoui float [[TMP37]] to i32
5653; CHECK-NEXT:    [[TMP39:%.*]] = sub i32 0, [[TMP34]]
5654; CHECK-NEXT:    [[TMP40:%.*]] = mul i32 [[TMP39]], [[TMP38]]
5655; CHECK-NEXT:    [[TMP41:%.*]] = zext i32 [[TMP38]] to i64
5656; CHECK-NEXT:    [[TMP42:%.*]] = zext i32 [[TMP40]] to i64
5657; CHECK-NEXT:    [[TMP43:%.*]] = mul i64 [[TMP41]], [[TMP42]]
5658; CHECK-NEXT:    [[TMP44:%.*]] = trunc i64 [[TMP43]] to i32
5659; CHECK-NEXT:    [[TMP45:%.*]] = lshr i64 [[TMP43]], 32
5660; CHECK-NEXT:    [[TMP46:%.*]] = trunc i64 [[TMP45]] to i32
5661; CHECK-NEXT:    [[TMP47:%.*]] = add i32 [[TMP38]], [[TMP46]]
5662; CHECK-NEXT:    [[TMP48:%.*]] = zext i32 [[TMP33]] to i64
5663; CHECK-NEXT:    [[TMP49:%.*]] = zext i32 [[TMP47]] to i64
5664; CHECK-NEXT:    [[TMP50:%.*]] = mul i64 [[TMP48]], [[TMP49]]
5665; CHECK-NEXT:    [[TMP51:%.*]] = trunc i64 [[TMP50]] to i32
5666; CHECK-NEXT:    [[TMP52:%.*]] = lshr i64 [[TMP50]], 32
5667; CHECK-NEXT:    [[TMP53:%.*]] = trunc i64 [[TMP52]] to i32
5668; CHECK-NEXT:    [[TMP54:%.*]] = mul i32 [[TMP53]], [[TMP34]]
5669; CHECK-NEXT:    [[TMP55:%.*]] = sub i32 [[TMP33]], [[TMP54]]
5670; CHECK-NEXT:    [[TMP56:%.*]] = icmp uge i32 [[TMP55]], [[TMP34]]
5671; CHECK-NEXT:    [[TMP57:%.*]] = add i32 [[TMP53]], 1
5672; CHECK-NEXT:    [[TMP58:%.*]] = select i1 [[TMP56]], i32 [[TMP57]], i32 [[TMP53]]
5673; CHECK-NEXT:    [[TMP59:%.*]] = sub i32 [[TMP55]], [[TMP34]]
5674; CHECK-NEXT:    [[TMP60:%.*]] = select i1 [[TMP56]], i32 [[TMP59]], i32 [[TMP55]]
5675; CHECK-NEXT:    [[TMP61:%.*]] = icmp uge i32 [[TMP60]], [[TMP34]]
5676; CHECK-NEXT:    [[TMP62:%.*]] = add i32 [[TMP58]], 1
5677; CHECK-NEXT:    [[TMP63:%.*]] = select i1 [[TMP61]], i32 [[TMP62]], i32 [[TMP58]]
5678; CHECK-NEXT:    [[TMP64:%.*]] = insertelement <2 x i32> [[TMP32]], i32 [[TMP63]], i64 1
5679; CHECK-NEXT:    store <2 x i32> [[TMP64]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8
5680; CHECK-NEXT:    ret void
5681;
5682; GFX6-LABEL: udiv_v2i32_pow2_shl_denom:
5683; GFX6:       ; %bb.0:
5684; GFX6-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
5685; GFX6-NEXT:    s_movk_i32 s4, 0x1000
5686; GFX6-NEXT:    s_mov_b32 s7, 0xf000
5687; GFX6-NEXT:    s_mov_b32 s6, -1
5688; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
5689; GFX6-NEXT:    s_lshl_b32 s8, s4, s2
5690; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s8
5691; GFX6-NEXT:    s_lshl_b32 s9, s4, s3
5692; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s9
5693; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
5694; GFX6-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
5695; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
5696; GFX6-NEXT:    s_mov_b32 s0, 0x4f7ffffe
5697; GFX6-NEXT:    v_rcp_iflag_f32_e32 v1, v1
5698; GFX6-NEXT:    v_mul_f32_e32 v0, s0, v0
5699; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
5700; GFX6-NEXT:    v_mul_f32_e32 v1, s0, v1
5701; GFX6-NEXT:    s_sub_i32 s0, 0, s8
5702; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
5703; GFX6-NEXT:    v_mul_lo_u32 v2, s0, v0
5704; GFX6-NEXT:    s_sub_i32 s0, 0, s9
5705; GFX6-NEXT:    v_mul_lo_u32 v3, s0, v1
5706; GFX6-NEXT:    v_mul_hi_u32 v2, v0, v2
5707; GFX6-NEXT:    v_mul_hi_u32 v3, v1, v3
5708; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
5709; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
5710; GFX6-NEXT:    v_mul_hi_u32 v0, s2, v0
5711; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v3, v1
5712; GFX6-NEXT:    v_mul_hi_u32 v1, s3, v1
5713; GFX6-NEXT:    v_mul_lo_u32 v2, v0, s8
5714; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 1, v0
5715; GFX6-NEXT:    v_mul_lo_u32 v4, v1, s9
5716; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s2, v2
5717; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s8, v2
5718; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v3, s[0:1]
5719; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s8, v2
5720; GFX6-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
5721; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 1, v0
5722; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s8, v2
5723; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
5724; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s3, v4
5725; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 1, v1
5726; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s9, v2
5727; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
5728; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s9, v2
5729; GFX6-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
5730; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 1, v1
5731; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s9, v2
5732; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
5733; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
5734; GFX6-NEXT:    s_endpgm
5735;
5736; GFX9-LABEL: udiv_v2i32_pow2_shl_denom:
5737; GFX9:       ; %bb.0:
5738; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
5739; GFX9-NEXT:    s_movk_i32 s4, 0x1000
5740; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5741; GFX9-NEXT:    s_lshl_b32 s5, s4, s3
5742; GFX9-NEXT:    s_lshl_b32 s4, s4, s2
5743; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s4
5744; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s5
5745; GFX9-NEXT:    s_mov_b32 s2, 0x4f7ffffe
5746; GFX9-NEXT:    s_sub_i32 s3, 0, s5
5747; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
5748; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v1
5749; GFX9-NEXT:    v_mul_f32_e32 v0, s2, v0
5750; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
5751; GFX9-NEXT:    v_mul_f32_e32 v1, s2, v1
5752; GFX9-NEXT:    s_sub_i32 s2, 0, s4
5753; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
5754; GFX9-NEXT:    v_mul_lo_u32 v2, s2, v0
5755; GFX9-NEXT:    v_mul_lo_u32 v3, s3, v1
5756; GFX9-NEXT:    v_mul_hi_u32 v2, v0, v2
5757; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
5758; GFX9-NEXT:    v_mul_hi_u32 v3, v1, v3
5759; GFX9-NEXT:    v_add_u32_e32 v0, v0, v2
5760; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
5761; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5762; GFX9-NEXT:    v_mul_hi_u32 v0, s2, v0
5763; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
5764; GFX9-NEXT:    v_mul_hi_u32 v1, s3, v1
5765; GFX9-NEXT:    v_mov_b32_e32 v2, 0
5766; GFX9-NEXT:    v_mul_lo_u32 v3, v0, s4
5767; GFX9-NEXT:    v_add_u32_e32 v5, 1, v0
5768; GFX9-NEXT:    v_mul_lo_u32 v4, v1, s5
5769; GFX9-NEXT:    v_add_u32_e32 v6, 1, v1
5770; GFX9-NEXT:    v_sub_u32_e32 v3, s2, v3
5771; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s4, v3
5772; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
5773; GFX9-NEXT:    v_subrev_u32_e32 v5, s4, v3
5774; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
5775; GFX9-NEXT:    v_sub_u32_e32 v4, s3, v4
5776; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s4, v3
5777; GFX9-NEXT:    v_add_u32_e32 v5, 1, v0
5778; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
5779; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s5, v4
5780; GFX9-NEXT:    v_subrev_u32_e32 v3, s5, v4
5781; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
5782; GFX9-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
5783; GFX9-NEXT:    v_add_u32_e32 v4, 1, v1
5784; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s5, v3
5785; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
5786; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
5787; GFX9-NEXT:    s_endpgm
5788  %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y
5789  %r = udiv <2 x i32> %x, %shl.y
5790  store <2 x i32> %r, <2 x i32> addrspace(1)* %out
5791  ret void
5792}
5793
; Scalar i32 unsigned remainder by the odd constant 1235195.
; The opt checks expect the urem to be left untouched in the IR. The llc checks
; for GFX6 and GFX9 expect the quotient to be formed with a multiply-high by
; 0xb2a50881 plus subtract / shift / add / shift, then multiplied by 0x12d8fb
; (1235195) and subtracted from %x.
5794define amdgpu_kernel void @urem_i32_oddk_denom(i32 addrspace(1)* %out, i32 %x) {
5795; CHECK-LABEL: @urem_i32_oddk_denom(
5796; CHECK-NEXT:    [[R:%.*]] = urem i32 [[X:%.*]], 1235195
5797; CHECK-NEXT:    store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4
5798; CHECK-NEXT:    ret void
5799;
5800; GFX6-LABEL: urem_i32_oddk_denom:
5801; GFX6:       ; %bb.0:
5802; GFX6-NEXT:    s_load_dword s4, s[0:1], 0xb
5803; GFX6-NEXT:    v_mov_b32_e32 v0, 0xb2a50881
5804; GFX6-NEXT:    s_mov_b32 s2, 0x12d8fb
5805; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
5806; GFX6-NEXT:    s_mov_b32 s3, 0xf000
5807; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
5808; GFX6-NEXT:    v_mul_hi_u32 v0, s4, v0
5809; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s4, v0
5810; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 1, v1
5811; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
5812; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 20, v0
5813; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s2
5814; GFX6-NEXT:    s_mov_b32 s2, -1
5815; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
5816; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
5817; GFX6-NEXT:    s_endpgm
5818;
5819; GFX9-LABEL: urem_i32_oddk_denom:
5820; GFX9:       ; %bb.0:
5821; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
5822; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x2c
5823; GFX9-NEXT:    v_mov_b32_e32 v0, 0
5824; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5825; GFX9-NEXT:    s_mul_hi_u32 s0, s4, 0xb2a50881
5826; GFX9-NEXT:    s_sub_i32 s1, s4, s0
5827; GFX9-NEXT:    s_lshr_b32 s1, s1, 1
5828; GFX9-NEXT:    s_add_i32 s1, s1, s0
5829; GFX9-NEXT:    s_lshr_b32 s0, s1, 20
5830; GFX9-NEXT:    s_mul_i32 s0, s0, 0x12d8fb
5831; GFX9-NEXT:    s_sub_i32 s0, s4, s0
5832; GFX9-NEXT:    v_mov_b32_e32 v1, s0
5833; GFX9-NEXT:    global_store_dword v0, v1, s[2:3]
5834; GFX9-NEXT:    s_endpgm
5835  %r = urem i32 %x, 1235195
5836  store i32 %r, i32 addrspace(1)* %out
5837  ret void
5838}
5839
; Scalar i32 unsigned remainder by the power-of-two constant 4096.
; The opt checks expect the urem to be left untouched in the IR. The llc checks
; for GFX6 and GFX9 expect a single s_and_b32 with the mask 0xfff.
5840define amdgpu_kernel void @urem_i32_pow2k_denom(i32 addrspace(1)* %out, i32 %x) {
5841; CHECK-LABEL: @urem_i32_pow2k_denom(
5842; CHECK-NEXT:    [[R:%.*]] = urem i32 [[X:%.*]], 4096
5843; CHECK-NEXT:    store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4
5844; CHECK-NEXT:    ret void
5845;
5846; GFX6-LABEL: urem_i32_pow2k_denom:
5847; GFX6:       ; %bb.0:
5848; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
5849; GFX6-NEXT:    s_load_dword s0, s[0:1], 0xb
5850; GFX6-NEXT:    s_mov_b32 s7, 0xf000
5851; GFX6-NEXT:    s_mov_b32 s6, -1
5852; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
5853; GFX6-NEXT:    s_and_b32 s0, s0, 0xfff
5854; GFX6-NEXT:    v_mov_b32_e32 v0, s0
5855; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
5856; GFX6-NEXT:    s_endpgm
5857;
5858; GFX9-LABEL: urem_i32_pow2k_denom:
5859; GFX9:       ; %bb.0:
5860; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
5861; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x2c
5862; GFX9-NEXT:    v_mov_b32_e32 v0, 0
5863; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5864; GFX9-NEXT:    s_and_b32 s0, s4, 0xfff
5865; GFX9-NEXT:    v_mov_b32_e32 v1, s0
5866; GFX9-NEXT:    global_store_dword v0, v1, s[2:3]
5867; GFX9-NEXT:    s_endpgm
5868  %r = urem i32 %x, 4096
5869  store i32 %r, i32 addrspace(1)* %out
5870  ret void
5871}
5872
; Scalar i32 unsigned remainder where the divisor is (4096 << %y).
; The opt checks expect the shl and urem to be left untouched in the IR. The
; llc checks for GFX6 and GFX9 expect 0x1000 to be shifted left by %y,
; decremented by one to form a mask, and ANDed with %x.
5873define amdgpu_kernel void @urem_i32_pow2_shl_denom(i32 addrspace(1)* %out, i32 %x, i32 %y) {
5874; CHECK-LABEL: @urem_i32_pow2_shl_denom(
5875; CHECK-NEXT:    [[SHL_Y:%.*]] = shl i32 4096, [[Y:%.*]]
5876; CHECK-NEXT:    [[R:%.*]] = urem i32 [[X:%.*]], [[SHL_Y]]
5877; CHECK-NEXT:    store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4
5878; CHECK-NEXT:    ret void
5879;
5880; GFX6-LABEL: urem_i32_pow2_shl_denom:
5881; GFX6:       ; %bb.0:
5882; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
5883; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
5884; GFX6-NEXT:    s_mov_b32 s7, 0xf000
5885; GFX6-NEXT:    s_mov_b32 s6, -1
5886; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
5887; GFX6-NEXT:    s_lshl_b32 s1, 0x1000, s1
5888; GFX6-NEXT:    s_add_i32 s1, s1, -1
5889; GFX6-NEXT:    s_and_b32 s0, s0, s1
5890; GFX6-NEXT:    v_mov_b32_e32 v0, s0
5891; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
5892; GFX6-NEXT:    s_endpgm
5893;
5894; GFX9-LABEL: urem_i32_pow2_shl_denom:
5895; GFX9:       ; %bb.0:
5896; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
5897; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
5898; GFX9-NEXT:    v_mov_b32_e32 v0, 0
5899; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5900; GFX9-NEXT:    s_lshl_b32 s0, 0x1000, s5
5901; GFX9-NEXT:    s_add_i32 s0, s0, -1
5902; GFX9-NEXT:    s_and_b32 s0, s4, s0
5903; GFX9-NEXT:    v_mov_b32_e32 v1, s0
5904; GFX9-NEXT:    global_store_dword v0, v1, s[2:3]
5905; GFX9-NEXT:    s_endpgm
5906  %shl.y = shl i32 4096, %y
5907  %r = urem i32 %x, %shl.y
5908  store i32 %r, i32 addrspace(1)* %out
5909  ret void
5910}
5911
; <2 x i32> unsigned remainder by the splat constant 4096.
; The opt checks expect the vector urem to be scalarized into one
; extractelement / urem / insertelement triple per lane. The llc checks for
; GFX6 and GFX9 expect the mask 0xfff to be materialized once with s_movk_i32
; and applied to each lane with s_and_b32.
5912define amdgpu_kernel void @urem_v2i32_pow2k_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x) {
5913; CHECK-LABEL: @urem_v2i32_pow2k_denom(
5914; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0
5915; CHECK-NEXT:    [[TMP2:%.*]] = urem i32 [[TMP1]], 4096
5916; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i64 0
5917; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1
5918; CHECK-NEXT:    [[TMP5:%.*]] = urem i32 [[TMP4]], 4096
5919; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1
5920; CHECK-NEXT:    store <2 x i32> [[TMP6]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8
5921; CHECK-NEXT:    ret void
5922;
5923; GFX6-LABEL: urem_v2i32_pow2k_denom:
5924; GFX6:       ; %bb.0:
5925; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
5926; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
5927; GFX6-NEXT:    s_movk_i32 s2, 0xfff
5928; GFX6-NEXT:    s_mov_b32 s7, 0xf000
5929; GFX6-NEXT:    s_mov_b32 s6, -1
5930; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
5931; GFX6-NEXT:    s_and_b32 s0, s0, s2
5932; GFX6-NEXT:    s_and_b32 s1, s1, s2
5933; GFX6-NEXT:    v_mov_b32_e32 v0, s0
5934; GFX6-NEXT:    v_mov_b32_e32 v1, s1
5935; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
5936; GFX6-NEXT:    s_endpgm
5937;
5938; GFX9-LABEL: urem_v2i32_pow2k_denom:
5939; GFX9:       ; %bb.0:
5940; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
5941; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
5942; GFX9-NEXT:    s_movk_i32 s0, 0xfff
5943; GFX9-NEXT:    v_mov_b32_e32 v2, 0
5944; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5945; GFX9-NEXT:    s_and_b32 s1, s4, s0
5946; GFX9-NEXT:    s_and_b32 s0, s5, s0
5947; GFX9-NEXT:    v_mov_b32_e32 v0, s1
5948; GFX9-NEXT:    v_mov_b32_e32 v1, s0
5949; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
5950; GFX9-NEXT:    s_endpgm
5951  %r = urem <2 x i32> %x, <i32 4096, i32 4096>
5952  store <2 x i32> %r, <2 x i32> addrspace(1)* %out
5953  ret void
5954}
5955
5956define amdgpu_kernel void @urem_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x, <2 x i32> %y) {
5957; CHECK-LABEL: @urem_v2i32_pow2_shl_denom(
5958; CHECK-NEXT:    [[SHL_Y:%.*]] = shl <2 x i32> <i32 4096, i32 4096>, [[Y:%.*]]
5959; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0
5960; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 0
5961; CHECK-NEXT:    [[TMP3:%.*]] = uitofp i32 [[TMP2]] to float
5962; CHECK-NEXT:    [[TMP4:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP3]])
5963; CHECK-NEXT:    [[TMP5:%.*]] = fmul fast float [[TMP4]], 0x41EFFFFFC0000000
5964; CHECK-NEXT:    [[TMP6:%.*]] = fptoui float [[TMP5]] to i32
5965; CHECK-NEXT:    [[TMP7:%.*]] = sub i32 0, [[TMP2]]
5966; CHECK-NEXT:    [[TMP8:%.*]] = mul i32 [[TMP7]], [[TMP6]]
5967; CHECK-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP6]] to i64
5968; CHECK-NEXT:    [[TMP10:%.*]] = zext i32 [[TMP8]] to i64
5969; CHECK-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP9]], [[TMP10]]
5970; CHECK-NEXT:    [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32
5971; CHECK-NEXT:    [[TMP13:%.*]] = lshr i64 [[TMP11]], 32
5972; CHECK-NEXT:    [[TMP14:%.*]] = trunc i64 [[TMP13]] to i32
5973; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[TMP6]], [[TMP14]]
5974; CHECK-NEXT:    [[TMP16:%.*]] = zext i32 [[TMP1]] to i64
5975; CHECK-NEXT:    [[TMP17:%.*]] = zext i32 [[TMP15]] to i64
5976; CHECK-NEXT:    [[TMP18:%.*]] = mul i64 [[TMP16]], [[TMP17]]
5977; CHECK-NEXT:    [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
5978; CHECK-NEXT:    [[TMP20:%.*]] = lshr i64 [[TMP18]], 32
5979; CHECK-NEXT:    [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32
5980; CHECK-NEXT:    [[TMP22:%.*]] = mul i32 [[TMP21]], [[TMP2]]
5981; CHECK-NEXT:    [[TMP23:%.*]] = sub i32 [[TMP1]], [[TMP22]]
5982; CHECK-NEXT:    [[TMP24:%.*]] = icmp uge i32 [[TMP23]], [[TMP2]]
5983; CHECK-NEXT:    [[TMP25:%.*]] = sub i32 [[TMP23]], [[TMP2]]
5984; CHECK-NEXT:    [[TMP26:%.*]] = select i1 [[TMP24]], i32 [[TMP25]], i32 [[TMP23]]
5985; CHECK-NEXT:    [[TMP27:%.*]] = icmp uge i32 [[TMP26]], [[TMP2]]
5986; CHECK-NEXT:    [[TMP28:%.*]] = sub i32 [[TMP26]], [[TMP2]]
5987; CHECK-NEXT:    [[TMP29:%.*]] = select i1 [[TMP27]], i32 [[TMP28]], i32 [[TMP26]]
5988; CHECK-NEXT:    [[TMP30:%.*]] = insertelement <2 x i32> undef, i32 [[TMP29]], i64 0
5989; CHECK-NEXT:    [[TMP31:%.*]] = extractelement <2 x i32> [[X]], i64 1
5990; CHECK-NEXT:    [[TMP32:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 1
5991; CHECK-NEXT:    [[TMP33:%.*]] = uitofp i32 [[TMP32]] to float
5992; CHECK-NEXT:    [[TMP34:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP33]])
5993; CHECK-NEXT:    [[TMP35:%.*]] = fmul fast float [[TMP34]], 0x41EFFFFFC0000000
5994; CHECK-NEXT:    [[TMP36:%.*]] = fptoui float [[TMP35]] to i32
5995; CHECK-NEXT:    [[TMP37:%.*]] = sub i32 0, [[TMP32]]
5996; CHECK-NEXT:    [[TMP38:%.*]] = mul i32 [[TMP37]], [[TMP36]]
5997; CHECK-NEXT:    [[TMP39:%.*]] = zext i32 [[TMP36]] to i64
5998; CHECK-NEXT:    [[TMP40:%.*]] = zext i32 [[TMP38]] to i64
5999; CHECK-NEXT:    [[TMP41:%.*]] = mul i64 [[TMP39]], [[TMP40]]
6000; CHECK-NEXT:    [[TMP42:%.*]] = trunc i64 [[TMP41]] to i32
6001; CHECK-NEXT:    [[TMP43:%.*]] = lshr i64 [[TMP41]], 32
6002; CHECK-NEXT:    [[TMP44:%.*]] = trunc i64 [[TMP43]] to i32
6003; CHECK-NEXT:    [[TMP45:%.*]] = add i32 [[TMP36]], [[TMP44]]
6004; CHECK-NEXT:    [[TMP46:%.*]] = zext i32 [[TMP31]] to i64
6005; CHECK-NEXT:    [[TMP47:%.*]] = zext i32 [[TMP45]] to i64
6006; CHECK-NEXT:    [[TMP48:%.*]] = mul i64 [[TMP46]], [[TMP47]]
6007; CHECK-NEXT:    [[TMP49:%.*]] = trunc i64 [[TMP48]] to i32
6008; CHECK-NEXT:    [[TMP50:%.*]] = lshr i64 [[TMP48]], 32
6009; CHECK-NEXT:    [[TMP51:%.*]] = trunc i64 [[TMP50]] to i32
6010; CHECK-NEXT:    [[TMP52:%.*]] = mul i32 [[TMP51]], [[TMP32]]
6011; CHECK-NEXT:    [[TMP53:%.*]] = sub i32 [[TMP31]], [[TMP52]]
6012; CHECK-NEXT:    [[TMP54:%.*]] = icmp uge i32 [[TMP53]], [[TMP32]]
6013; CHECK-NEXT:    [[TMP55:%.*]] = sub i32 [[TMP53]], [[TMP32]]
6014; CHECK-NEXT:    [[TMP56:%.*]] = select i1 [[TMP54]], i32 [[TMP55]], i32 [[TMP53]]
6015; CHECK-NEXT:    [[TMP57:%.*]] = icmp uge i32 [[TMP56]], [[TMP32]]
6016; CHECK-NEXT:    [[TMP58:%.*]] = sub i32 [[TMP56]], [[TMP32]]
6017; CHECK-NEXT:    [[TMP59:%.*]] = select i1 [[TMP57]], i32 [[TMP58]], i32 [[TMP56]]
6018; CHECK-NEXT:    [[TMP60:%.*]] = insertelement <2 x i32> [[TMP30]], i32 [[TMP59]], i64 1
6019; CHECK-NEXT:    store <2 x i32> [[TMP60]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8
6020; CHECK-NEXT:    ret void
6021;
6022; GFX6-LABEL: urem_v2i32_pow2_shl_denom:
6023; GFX6:       ; %bb.0:
6024; GFX6-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
6025; GFX6-NEXT:    s_movk_i32 s4, 0x1000
6026; GFX6-NEXT:    s_mov_b32 s5, 0x4f7ffffe
6027; GFX6-NEXT:    s_mov_b32 s7, 0xf000
6028; GFX6-NEXT:    s_mov_b32 s6, -1
6029; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
6030; GFX6-NEXT:    s_lshl_b32 s2, s4, s2
6031; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s2
6032; GFX6-NEXT:    s_lshl_b32 s3, s4, s3
6033; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s3
6034; GFX6-NEXT:    s_sub_i32 s4, 0, s2
6035; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
6036; GFX6-NEXT:    v_rcp_iflag_f32_e32 v1, v1
6037; GFX6-NEXT:    v_mul_f32_e32 v0, s5, v0
6038; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
6039; GFX6-NEXT:    v_mul_f32_e32 v1, s5, v1
6040; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
6041; GFX6-NEXT:    v_mul_lo_u32 v2, s4, v0
6042; GFX6-NEXT:    s_sub_i32 s4, 0, s3
6043; GFX6-NEXT:    v_mul_lo_u32 v3, s4, v1
6044; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
6045; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
6046; GFX6-NEXT:    v_mul_hi_u32 v2, v0, v2
6047; GFX6-NEXT:    v_mul_hi_u32 v3, v1, v3
6048; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
6049; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
6050; GFX6-NEXT:    v_mul_hi_u32 v0, s0, v0
6051; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v3, v1
6052; GFX6-NEXT:    v_mul_hi_u32 v1, s1, v1
6053; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s2
6054; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s3
6055; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
6056; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s2, v0
6057; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s2, v0
6058; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
6059; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s2, v0
6060; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s2, v0
6061; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
6062; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s1, v1
6063; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s3, v1
6064; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s3, v1
6065; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
6066; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s3, v1
6067; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s3, v1
6068; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
6069; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
6070; GFX6-NEXT:    s_endpgm
6071;
6072; GFX9-LABEL: urem_v2i32_pow2_shl_denom:
6073; GFX9:       ; %bb.0:
6074; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
6075; GFX9-NEXT:    s_movk_i32 s4, 0x1000
6076; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
6077; GFX9-NEXT:    s_lshl_b32 s5, s4, s3
6078; GFX9-NEXT:    s_lshl_b32 s4, s4, s2
6079; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s4
6080; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s5
6081; GFX9-NEXT:    s_mov_b32 s2, 0x4f7ffffe
6082; GFX9-NEXT:    s_sub_i32 s3, 0, s5
6083; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
6084; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v1
6085; GFX9-NEXT:    v_mul_f32_e32 v0, s2, v0
6086; GFX9-NEXT:    v_mul_f32_e32 v1, s2, v1
6087; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
6088; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
6089; GFX9-NEXT:    s_sub_i32 s2, 0, s4
6090; GFX9-NEXT:    v_mul_lo_u32 v2, s2, v0
6091; GFX9-NEXT:    v_mul_lo_u32 v3, s3, v1
6092; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
6093; GFX9-NEXT:    v_mul_hi_u32 v2, v0, v2
6094; GFX9-NEXT:    v_mul_hi_u32 v3, v1, v3
6095; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
6096; GFX9-NEXT:    v_add_u32_e32 v0, v0, v2
6097; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
6098; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
6099; GFX9-NEXT:    v_mul_hi_u32 v0, s2, v0
6100; GFX9-NEXT:    v_mul_hi_u32 v1, s3, v1
6101; GFX9-NEXT:    v_mov_b32_e32 v2, 0
6102; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s4
6103; GFX9-NEXT:    v_mul_lo_u32 v1, v1, s5
6104; GFX9-NEXT:    v_sub_u32_e32 v0, s2, v0
6105; GFX9-NEXT:    v_sub_u32_e32 v1, s3, v1
6106; GFX9-NEXT:    v_subrev_u32_e32 v3, s4, v0
6107; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s4, v0
6108; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
6109; GFX9-NEXT:    v_subrev_u32_e32 v4, s5, v1
6110; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s5, v1
6111; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
6112; GFX9-NEXT:    v_subrev_u32_e32 v3, s4, v0
6113; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s4, v0
6114; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
6115; GFX9-NEXT:    v_subrev_u32_e32 v4, s5, v1
6116; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s5, v1
6117; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
6118; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
6119; GFX9-NEXT:    s_endpgm
6120  %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y
6121  %r = urem <2 x i32> %x, %shl.y
6122  store <2 x i32> %r, <2 x i32> addrspace(1)* %out
6123  ret void
6124}
6125
; Signed i32 division of the kernel argument %x by the odd constant 1235195;
; the quotient is stored to %out.
; Expected opt output: the sdiv is left unchanged.
; Expected llc output (tahiti and gfx900): no generic division expansion. The
; quotient is formed from a signed multiply-high by 0xd9528441, an add of the
; dividend, an arithmetic shift right by 20, and a final add of the
; intermediate value's sign bit (logical shift right by 31).
6126define amdgpu_kernel void @sdiv_i32_oddk_denom(i32 addrspace(1)* %out, i32 %x) {
6127; CHECK-LABEL: @sdiv_i32_oddk_denom(
6128; CHECK-NEXT:    [[R:%.*]] = sdiv i32 [[X:%.*]], 1235195
6129; CHECK-NEXT:    store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4
6130; CHECK-NEXT:    ret void
6131;
6132; GFX6-LABEL: sdiv_i32_oddk_denom:
6133; GFX6:       ; %bb.0:
6134; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
6135; GFX6-NEXT:    s_load_dword s0, s[0:1], 0xb
6136; GFX6-NEXT:    v_mov_b32_e32 v0, 0xd9528441
6137; GFX6-NEXT:    s_mov_b32 s7, 0xf000
6138; GFX6-NEXT:    s_mov_b32 s6, -1
6139; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
6140; GFX6-NEXT:    v_mul_hi_i32 v0, s0, v0
6141; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
6142; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 31, v0
6143; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 20, v0
6144; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
6145; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
6146; GFX6-NEXT:    s_endpgm
6147;
6148; GFX9-LABEL: sdiv_i32_oddk_denom:
6149; GFX9:       ; %bb.0:
6150; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
6151; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x2c
6152; GFX9-NEXT:    v_mov_b32_e32 v0, 0
6153; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
6154; GFX9-NEXT:    s_mul_hi_i32 s0, s4, 0xd9528441
6155; GFX9-NEXT:    s_add_i32 s0, s0, s4
6156; GFX9-NEXT:    s_lshr_b32 s1, s0, 31
6157; GFX9-NEXT:    s_ashr_i32 s0, s0, 20
6158; GFX9-NEXT:    s_add_i32 s0, s0, s1
6159; GFX9-NEXT:    v_mov_b32_e32 v1, s0
6160; GFX9-NEXT:    global_store_dword v0, v1, s[2:3]
6161; GFX9-NEXT:    s_endpgm
6162  %r = sdiv i32 %x, 1235195
6163  store i32 %r, i32 addrspace(1)* %out
6164  ret void
6165}
6166
; Signed i32 division of the kernel argument %x by the power-of-two constant
; 4096; the quotient is stored to %out.
; Expected opt output: the sdiv is left unchanged.
; Expected llc output (tahiti and gfx900): a shift-only sequence. The sign
; mask of %x (arithmetic shift right by 31) is shifted right logically by 20,
; which yields 4095 for negative dividends and 0 otherwise; that bias is added
; to %x and the sum is shifted right arithmetically by 12.
6167define amdgpu_kernel void @sdiv_i32_pow2k_denom(i32 addrspace(1)* %out, i32 %x) {
6168; CHECK-LABEL: @sdiv_i32_pow2k_denom(
6169; CHECK-NEXT:    [[R:%.*]] = sdiv i32 [[X:%.*]], 4096
6170; CHECK-NEXT:    store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4
6171; CHECK-NEXT:    ret void
6172;
6173; GFX6-LABEL: sdiv_i32_pow2k_denom:
6174; GFX6:       ; %bb.0:
6175; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
6176; GFX6-NEXT:    s_load_dword s0, s[0:1], 0xb
6177; GFX6-NEXT:    s_mov_b32 s7, 0xf000
6178; GFX6-NEXT:    s_mov_b32 s6, -1
6179; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
6180; GFX6-NEXT:    s_ashr_i32 s1, s0, 31
6181; GFX6-NEXT:    s_lshr_b32 s1, s1, 20
6182; GFX6-NEXT:    s_add_i32 s0, s0, s1
6183; GFX6-NEXT:    s_ashr_i32 s0, s0, 12
6184; GFX6-NEXT:    v_mov_b32_e32 v0, s0
6185; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
6186; GFX6-NEXT:    s_endpgm
6187;
6188; GFX9-LABEL: sdiv_i32_pow2k_denom:
6189; GFX9:       ; %bb.0:
6190; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
6191; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x2c
6192; GFX9-NEXT:    v_mov_b32_e32 v0, 0
6193; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
6194; GFX9-NEXT:    s_ashr_i32 s0, s4, 31
6195; GFX9-NEXT:    s_lshr_b32 s0, s0, 20
6196; GFX9-NEXT:    s_add_i32 s4, s4, s0
6197; GFX9-NEXT:    s_ashr_i32 s0, s4, 12
6198; GFX9-NEXT:    v_mov_b32_e32 v1, s0
6199; GFX9-NEXT:    global_store_dword v0, v1, s[2:3]
6200; GFX9-NEXT:    s_endpgm
6201  %r = sdiv i32 %x, 4096
6202  store i32 %r, i32 addrspace(1)* %out
6203  ret void
6204}
6205
; Signed i32 division of the kernel argument %x by (4096 << %y), i.e. a
; denominator that is not a compile-time constant; the quotient is stored to
; %out.
; Expected opt output: the shl and the sdiv are both left unchanged.
; Expected llc output (tahiti and gfx900): a full expansion. Both operands are
; made non-negative with an ashr-31 / add / xor sequence, a reciprocal
; estimate is built from v_cvt_f32_u32 + v_rcp_iflag_f32 scaled by 0x4f7ffffe
; and adjusted with a mul_lo / mul_hi / add sequence, the quotient candidate
; is corrected by two compare-and-select steps, and the result's sign is
; restored with an xor / subtract using the xor of the two operand sign masks.
6206define amdgpu_kernel void @sdiv_i32_pow2_shl_denom(i32 addrspace(1)* %out, i32 %x, i32 %y) {
6207; CHECK-LABEL: @sdiv_i32_pow2_shl_denom(
6208; CHECK-NEXT:    [[SHL_Y:%.*]] = shl i32 4096, [[Y:%.*]]
6209; CHECK-NEXT:    [[R:%.*]] = sdiv i32 [[X:%.*]], [[SHL_Y]]
6210; CHECK-NEXT:    store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4
6211; CHECK-NEXT:    ret void
6212;
6213; GFX6-LABEL: sdiv_i32_pow2_shl_denom:
6214; GFX6:       ; %bb.0:
6215; GFX6-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
6216; GFX6-NEXT:    s_mov_b32 s7, 0xf000
6217; GFX6-NEXT:    s_mov_b32 s6, -1
6218; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
6219; GFX6-NEXT:    s_lshl_b32 s3, 0x1000, s3
6220; GFX6-NEXT:    s_ashr_i32 s8, s3, 31
6221; GFX6-NEXT:    s_add_i32 s3, s3, s8
6222; GFX6-NEXT:    s_xor_b32 s3, s3, s8
6223; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s3
6224; GFX6-NEXT:    s_sub_i32 s4, 0, s3
6225; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
6226; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
6227; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
6228; GFX6-NEXT:    v_mul_lo_u32 v1, s4, v0
6229; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
6230; GFX6-NEXT:    s_ashr_i32 s0, s2, 31
6231; GFX6-NEXT:    s_add_i32 s1, s2, s0
6232; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
6233; GFX6-NEXT:    s_xor_b32 s1, s1, s0
6234; GFX6-NEXT:    s_xor_b32 s2, s0, s8
6235; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
6236; GFX6-NEXT:    v_mul_hi_u32 v0, s1, v0
6237; GFX6-NEXT:    v_mul_lo_u32 v1, v0, s3
6238; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
6239; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s1, v1
6240; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s3, v1
6241; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
6242; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s3, v1
6243; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 1, v0
6244; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, v2, s[0:1]
6245; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s3, v1
6246; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
6247; GFX6-NEXT:    v_xor_b32_e32 v0, s2, v0
6248; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s2, v0
6249; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
6250; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
6251; GFX6-NEXT:    s_endpgm
6252;
6253; GFX9-LABEL: sdiv_i32_pow2_shl_denom:
6254; GFX9:       ; %bb.0:
6255; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
6256; GFX9-NEXT:    v_mov_b32_e32 v2, 0
6257; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
6258; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
6259; GFX9-NEXT:    s_lshl_b32 s3, 0x1000, s3
6260; GFX9-NEXT:    s_ashr_i32 s4, s3, 31
6261; GFX9-NEXT:    s_add_i32 s3, s3, s4
6262; GFX9-NEXT:    s_xor_b32 s3, s3, s4
6263; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s3
6264; GFX9-NEXT:    s_sub_i32 s5, 0, s3
6265; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
6266; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
6267; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
6268; GFX9-NEXT:    v_mul_lo_u32 v1, s5, v0
6269; GFX9-NEXT:    s_ashr_i32 s5, s2, 31
6270; GFX9-NEXT:    s_add_i32 s2, s2, s5
6271; GFX9-NEXT:    s_xor_b32 s2, s2, s5
6272; GFX9-NEXT:    v_mul_hi_u32 v1, v0, v1
6273; GFX9-NEXT:    v_add_u32_e32 v0, v0, v1
6274; GFX9-NEXT:    v_mul_hi_u32 v0, s2, v0
6275; GFX9-NEXT:    v_mul_lo_u32 v1, v0, s3
6276; GFX9-NEXT:    v_add_u32_e32 v3, 1, v0
6277; GFX9-NEXT:    v_sub_u32_e32 v1, s2, v1
6278; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v1
6279; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
6280; GFX9-NEXT:    v_subrev_u32_e32 v3, s3, v1
6281; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
6282; GFX9-NEXT:    v_add_u32_e32 v4, 1, v0
6283; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v1
6284; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
6285; GFX9-NEXT:    s_xor_b32 s2, s5, s4
6286; GFX9-NEXT:    v_xor_b32_e32 v0, s2, v0
6287; GFX9-NEXT:    v_subrev_u32_e32 v0, s2, v0
6288; GFX9-NEXT:    global_store_dword v2, v0, s[0:1]
6289; GFX9-NEXT:    s_endpgm
6290  %shl.y = shl i32 4096, %y
6291  %r = sdiv i32 %x, %shl.y
6292  store i32 %r, i32 addrspace(1)* %out
6293  ret void
6294}
6295
; Signed <2 x i32> division of the kernel argument %x by the splat constant
; <4096, 4096>; the quotient vector is stored to %out.
; Expected opt output: the vector sdiv is scalarized into two scalar
; "sdiv ..., 4096" instructions joined by extractelement / insertelement.
; Expected llc output (tahiti and gfx900): each lane uses the shift-only
; sequence (ashr 31, lshr 20, add, ashr 12) and both lanes are written with a
; single dwordx2 store.
6296define amdgpu_kernel void @sdiv_v2i32_pow2k_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x) {
6297; CHECK-LABEL: @sdiv_v2i32_pow2k_denom(
6298; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0
6299; CHECK-NEXT:    [[TMP2:%.*]] = sdiv i32 [[TMP1]], 4096
6300; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i64 0
6301; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1
6302; CHECK-NEXT:    [[TMP5:%.*]] = sdiv i32 [[TMP4]], 4096
6303; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1
6304; CHECK-NEXT:    store <2 x i32> [[TMP6]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8
6305; CHECK-NEXT:    ret void
6306;
6307; GFX6-LABEL: sdiv_v2i32_pow2k_denom:
6308; GFX6:       ; %bb.0:
6309; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
6310; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
6311; GFX6-NEXT:    s_mov_b32 s7, 0xf000
6312; GFX6-NEXT:    s_mov_b32 s6, -1
6313; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
6314; GFX6-NEXT:    s_ashr_i32 s2, s0, 31
6315; GFX6-NEXT:    s_lshr_b32 s2, s2, 20
6316; GFX6-NEXT:    s_ashr_i32 s3, s1, 31
6317; GFX6-NEXT:    s_add_i32 s0, s0, s2
6318; GFX6-NEXT:    s_lshr_b32 s2, s3, 20
6319; GFX6-NEXT:    s_add_i32 s1, s1, s2
6320; GFX6-NEXT:    s_ashr_i32 s0, s0, 12
6321; GFX6-NEXT:    s_ashr_i32 s1, s1, 12
6322; GFX6-NEXT:    v_mov_b32_e32 v0, s0
6323; GFX6-NEXT:    v_mov_b32_e32 v1, s1
6324; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
6325; GFX6-NEXT:    s_endpgm
6326;
6327; GFX9-LABEL: sdiv_v2i32_pow2k_denom:
6328; GFX9:       ; %bb.0:
6329; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
6330; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
6331; GFX9-NEXT:    v_mov_b32_e32 v2, 0
6332; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
6333; GFX9-NEXT:    s_ashr_i32 s0, s4, 31
6334; GFX9-NEXT:    s_ashr_i32 s1, s5, 31
6335; GFX9-NEXT:    s_lshr_b32 s0, s0, 20
6336; GFX9-NEXT:    s_lshr_b32 s1, s1, 20
6337; GFX9-NEXT:    s_add_i32 s0, s4, s0
6338; GFX9-NEXT:    s_add_i32 s1, s5, s1
6339; GFX9-NEXT:    s_ashr_i32 s0, s0, 12
6340; GFX9-NEXT:    s_ashr_i32 s1, s1, 12
6341; GFX9-NEXT:    v_mov_b32_e32 v0, s0
6342; GFX9-NEXT:    v_mov_b32_e32 v1, s1
6343; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
6344; GFX9-NEXT:    s_endpgm
6345  %r = sdiv <2 x i32> %x, <i32 4096, i32 4096>
6346  store <2 x i32> %r, <2 x i32> addrspace(1)* %out
6347  ret void
6348}
6349
; Signed <2 x i32> division of the kernel argument %x by the constant vector
; <4096, 4095> (lane 0 a power of two, lane 1 not); the quotient vector is
; stored to %out.
; Expected opt output: the vector sdiv is scalarized into "sdiv ..., 4096" for
; lane 0 and "sdiv ..., 4095" for lane 1.
; Expected llc output (tahiti and gfx900): lane 0 uses the shift-only sequence
; (ashr 31, lshr 20, add, ashr 12); lane 1 uses a signed multiply-high by
; 0x80080081, an add of the dividend, an arithmetic shift right by 11, and an
; add of the intermediate value's sign bit.
; NOTE(review): the doubled "s" in the function name looks like a typo for
; "sdiv"; it is kept because the label assertions below depend on the name.
6350define amdgpu_kernel void @ssdiv_v2i32_mixed_pow2k_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x) {
6351; CHECK-LABEL: @ssdiv_v2i32_mixed_pow2k_denom(
6352; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0
6353; CHECK-NEXT:    [[TMP2:%.*]] = sdiv i32 [[TMP1]], 4096
6354; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i64 0
6355; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1
6356; CHECK-NEXT:    [[TMP5:%.*]] = sdiv i32 [[TMP4]], 4095
6357; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1
6358; CHECK-NEXT:    store <2 x i32> [[TMP6]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8
6359; CHECK-NEXT:    ret void
6360;
6361; GFX6-LABEL: ssdiv_v2i32_mixed_pow2k_denom:
6362; GFX6:       ; %bb.0:
6363; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
6364; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
6365; GFX6-NEXT:    v_mov_b32_e32 v0, 0x80080081
6366; GFX6-NEXT:    s_mov_b32 s7, 0xf000
6367; GFX6-NEXT:    s_mov_b32 s6, -1
6368; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
6369; GFX6-NEXT:    v_mul_hi_i32 v0, s1, v0
6370; GFX6-NEXT:    s_ashr_i32 s2, s0, 31
6371; GFX6-NEXT:    s_lshr_b32 s2, s2, 20
6372; GFX6-NEXT:    s_add_i32 s0, s0, s2
6373; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s1, v0
6374; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 31, v0
6375; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 11, v0
6376; GFX6-NEXT:    s_ashr_i32 s0, s0, 12
6377; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v0
6378; GFX6-NEXT:    v_mov_b32_e32 v0, s0
6379; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
6380; GFX6-NEXT:    s_endpgm
6381;
6382; GFX9-LABEL: ssdiv_v2i32_mixed_pow2k_denom:
6383; GFX9:       ; %bb.0:
6384; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
6385; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
6386; GFX9-NEXT:    v_mov_b32_e32 v2, 0
6387; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
6388; GFX9-NEXT:    s_ashr_i32 s0, s4, 31
6389; GFX9-NEXT:    s_mul_hi_i32 s1, s5, 0x80080081
6390; GFX9-NEXT:    s_lshr_b32 s0, s0, 20
6391; GFX9-NEXT:    s_add_i32 s1, s1, s5
6392; GFX9-NEXT:    s_add_i32 s0, s4, s0
6393; GFX9-NEXT:    s_lshr_b32 s4, s1, 31
6394; GFX9-NEXT:    s_ashr_i32 s1, s1, 11
6395; GFX9-NEXT:    s_ashr_i32 s0, s0, 12
6396; GFX9-NEXT:    s_add_i32 s1, s1, s4
6397; GFX9-NEXT:    v_mov_b32_e32 v0, s0
6398; GFX9-NEXT:    v_mov_b32_e32 v1, s1
6399; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
6400; GFX9-NEXT:    s_endpgm
6401  %r = sdiv <2 x i32> %x, <i32 4096, i32 4095>
6402  store <2 x i32> %r, <2 x i32> addrspace(1)* %out
6403  ret void
6404}
6405
; Signed <2 x i32> division of the kernel argument %x by
; (<4096, 4096> << %y), i.e. a per-lane denominator that is not a compile-time
; constant; the quotient vector is stored to %out.
; Expected opt output: the vector sdiv is scalarized and each lane is fully
; expanded in IR. Per lane: both operands are made non-negative with an
; ashr-31 / add / xor sequence; a reciprocal estimate comes from uitofp,
; llvm.amdgcn.rcp.f32, an fmul by 0x41EFFFFFC0000000 and fptoui; the high
; 32 bits of two products are taken through zext-to-i64 / mul / lshr 32 /
; trunc; the quotient candidate gets two icmp-uge / select "+1" corrections;
; and the sign is restored with xor / sub using the xor of the operand sign
; masks.
; Expected llc output (tahiti and gfx900): the same per-lane structure in
; machine code (v_rcp_iflag_f32 scaled by 0x4f7ffffe, mul_lo / mul_hi,
; compare-and-select corrections, xor / subtract sign fix-up), with both lanes
; written by a single dwordx2 store.
6406define amdgpu_kernel void @sdiv_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x, <2 x i32> %y) {
6407; CHECK-LABEL: @sdiv_v2i32_pow2_shl_denom(
6408; CHECK-NEXT:    [[SHL_Y:%.*]] = shl <2 x i32> <i32 4096, i32 4096>, [[Y:%.*]]
6409; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0
6410; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 0
6411; CHECK-NEXT:    [[TMP3:%.*]] = ashr i32 [[TMP1]], 31
6412; CHECK-NEXT:    [[TMP4:%.*]] = ashr i32 [[TMP2]], 31
6413; CHECK-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]]
6414; CHECK-NEXT:    [[TMP6:%.*]] = add i32 [[TMP1]], [[TMP3]]
6415; CHECK-NEXT:    [[TMP7:%.*]] = add i32 [[TMP2]], [[TMP4]]
6416; CHECK-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP3]]
6417; CHECK-NEXT:    [[TMP9:%.*]] = xor i32 [[TMP7]], [[TMP4]]
6418; CHECK-NEXT:    [[TMP10:%.*]] = uitofp i32 [[TMP9]] to float
6419; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP10]])
6420; CHECK-NEXT:    [[TMP12:%.*]] = fmul fast float [[TMP11]], 0x41EFFFFFC0000000
6421; CHECK-NEXT:    [[TMP13:%.*]] = fptoui float [[TMP12]] to i32
6422; CHECK-NEXT:    [[TMP14:%.*]] = sub i32 0, [[TMP9]]
6423; CHECK-NEXT:    [[TMP15:%.*]] = mul i32 [[TMP14]], [[TMP13]]
6424; CHECK-NEXT:    [[TMP16:%.*]] = zext i32 [[TMP13]] to i64
6425; CHECK-NEXT:    [[TMP17:%.*]] = zext i32 [[TMP15]] to i64
6426; CHECK-NEXT:    [[TMP18:%.*]] = mul i64 [[TMP16]], [[TMP17]]
6427; CHECK-NEXT:    [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
6428; CHECK-NEXT:    [[TMP20:%.*]] = lshr i64 [[TMP18]], 32
6429; CHECK-NEXT:    [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32
6430; CHECK-NEXT:    [[TMP22:%.*]] = add i32 [[TMP13]], [[TMP21]]
6431; CHECK-NEXT:    [[TMP23:%.*]] = zext i32 [[TMP8]] to i64
6432; CHECK-NEXT:    [[TMP24:%.*]] = zext i32 [[TMP22]] to i64
6433; CHECK-NEXT:    [[TMP25:%.*]] = mul i64 [[TMP23]], [[TMP24]]
6434; CHECK-NEXT:    [[TMP26:%.*]] = trunc i64 [[TMP25]] to i32
6435; CHECK-NEXT:    [[TMP27:%.*]] = lshr i64 [[TMP25]], 32
6436; CHECK-NEXT:    [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32
6437; CHECK-NEXT:    [[TMP29:%.*]] = mul i32 [[TMP28]], [[TMP9]]
6438; CHECK-NEXT:    [[TMP30:%.*]] = sub i32 [[TMP8]], [[TMP29]]
6439; CHECK-NEXT:    [[TMP31:%.*]] = icmp uge i32 [[TMP30]], [[TMP9]]
6440; CHECK-NEXT:    [[TMP32:%.*]] = add i32 [[TMP28]], 1
6441; CHECK-NEXT:    [[TMP33:%.*]] = select i1 [[TMP31]], i32 [[TMP32]], i32 [[TMP28]]
6442; CHECK-NEXT:    [[TMP34:%.*]] = sub i32 [[TMP30]], [[TMP9]]
6443; CHECK-NEXT:    [[TMP35:%.*]] = select i1 [[TMP31]], i32 [[TMP34]], i32 [[TMP30]]
6444; CHECK-NEXT:    [[TMP36:%.*]] = icmp uge i32 [[TMP35]], [[TMP9]]
6445; CHECK-NEXT:    [[TMP37:%.*]] = add i32 [[TMP33]], 1
6446; CHECK-NEXT:    [[TMP38:%.*]] = select i1 [[TMP36]], i32 [[TMP37]], i32 [[TMP33]]
6447; CHECK-NEXT:    [[TMP39:%.*]] = xor i32 [[TMP38]], [[TMP5]]
6448; CHECK-NEXT:    [[TMP40:%.*]] = sub i32 [[TMP39]], [[TMP5]]
6449; CHECK-NEXT:    [[TMP41:%.*]] = insertelement <2 x i32> undef, i32 [[TMP40]], i64 0
6450; CHECK-NEXT:    [[TMP42:%.*]] = extractelement <2 x i32> [[X]], i64 1
6451; CHECK-NEXT:    [[TMP43:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 1
6452; CHECK-NEXT:    [[TMP44:%.*]] = ashr i32 [[TMP42]], 31
6453; CHECK-NEXT:    [[TMP45:%.*]] = ashr i32 [[TMP43]], 31
6454; CHECK-NEXT:    [[TMP46:%.*]] = xor i32 [[TMP44]], [[TMP45]]
6455; CHECK-NEXT:    [[TMP47:%.*]] = add i32 [[TMP42]], [[TMP44]]
6456; CHECK-NEXT:    [[TMP48:%.*]] = add i32 [[TMP43]], [[TMP45]]
6457; CHECK-NEXT:    [[TMP49:%.*]] = xor i32 [[TMP47]], [[TMP44]]
6458; CHECK-NEXT:    [[TMP50:%.*]] = xor i32 [[TMP48]], [[TMP45]]
6459; CHECK-NEXT:    [[TMP51:%.*]] = uitofp i32 [[TMP50]] to float
6460; CHECK-NEXT:    [[TMP52:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP51]])
6461; CHECK-NEXT:    [[TMP53:%.*]] = fmul fast float [[TMP52]], 0x41EFFFFFC0000000
6462; CHECK-NEXT:    [[TMP54:%.*]] = fptoui float [[TMP53]] to i32
6463; CHECK-NEXT:    [[TMP55:%.*]] = sub i32 0, [[TMP50]]
6464; CHECK-NEXT:    [[TMP56:%.*]] = mul i32 [[TMP55]], [[TMP54]]
6465; CHECK-NEXT:    [[TMP57:%.*]] = zext i32 [[TMP54]] to i64
6466; CHECK-NEXT:    [[TMP58:%.*]] = zext i32 [[TMP56]] to i64
6467; CHECK-NEXT:    [[TMP59:%.*]] = mul i64 [[TMP57]], [[TMP58]]
6468; CHECK-NEXT:    [[TMP60:%.*]] = trunc i64 [[TMP59]] to i32
6469; CHECK-NEXT:    [[TMP61:%.*]] = lshr i64 [[TMP59]], 32
6470; CHECK-NEXT:    [[TMP62:%.*]] = trunc i64 [[TMP61]] to i32
6471; CHECK-NEXT:    [[TMP63:%.*]] = add i32 [[TMP54]], [[TMP62]]
6472; CHECK-NEXT:    [[TMP64:%.*]] = zext i32 [[TMP49]] to i64
6473; CHECK-NEXT:    [[TMP65:%.*]] = zext i32 [[TMP63]] to i64
6474; CHECK-NEXT:    [[TMP66:%.*]] = mul i64 [[TMP64]], [[TMP65]]
6475; CHECK-NEXT:    [[TMP67:%.*]] = trunc i64 [[TMP66]] to i32
6476; CHECK-NEXT:    [[TMP68:%.*]] = lshr i64 [[TMP66]], 32
6477; CHECK-NEXT:    [[TMP69:%.*]] = trunc i64 [[TMP68]] to i32
6478; CHECK-NEXT:    [[TMP70:%.*]] = mul i32 [[TMP69]], [[TMP50]]
6479; CHECK-NEXT:    [[TMP71:%.*]] = sub i32 [[TMP49]], [[TMP70]]
6480; CHECK-NEXT:    [[TMP72:%.*]] = icmp uge i32 [[TMP71]], [[TMP50]]
6481; CHECK-NEXT:    [[TMP73:%.*]] = add i32 [[TMP69]], 1
6482; CHECK-NEXT:    [[TMP74:%.*]] = select i1 [[TMP72]], i32 [[TMP73]], i32 [[TMP69]]
6483; CHECK-NEXT:    [[TMP75:%.*]] = sub i32 [[TMP71]], [[TMP50]]
6484; CHECK-NEXT:    [[TMP76:%.*]] = select i1 [[TMP72]], i32 [[TMP75]], i32 [[TMP71]]
6485; CHECK-NEXT:    [[TMP77:%.*]] = icmp uge i32 [[TMP76]], [[TMP50]]
6486; CHECK-NEXT:    [[TMP78:%.*]] = add i32 [[TMP74]], 1
6487; CHECK-NEXT:    [[TMP79:%.*]] = select i1 [[TMP77]], i32 [[TMP78]], i32 [[TMP74]]
6488; CHECK-NEXT:    [[TMP80:%.*]] = xor i32 [[TMP79]], [[TMP46]]
6489; CHECK-NEXT:    [[TMP81:%.*]] = sub i32 [[TMP80]], [[TMP46]]
6490; CHECK-NEXT:    [[TMP82:%.*]] = insertelement <2 x i32> [[TMP41]], i32 [[TMP81]], i64 1
6491; CHECK-NEXT:    store <2 x i32> [[TMP82]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8
6492; CHECK-NEXT:    ret void
6493;
6494; GFX6-LABEL: sdiv_v2i32_pow2_shl_denom:
6495; GFX6:       ; %bb.0:
6496; GFX6-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
6497; GFX6-NEXT:    s_movk_i32 s10, 0x1000
6498; GFX6-NEXT:    s_mov_b32 s12, 0x4f7ffffe
6499; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
6500; GFX6-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xb
6501; GFX6-NEXT:    s_mov_b32 s7, 0xf000
6502; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
6503; GFX6-NEXT:    s_lshl_b32 s2, s10, s2
6504; GFX6-NEXT:    s_ashr_i32 s11, s2, 31
6505; GFX6-NEXT:    s_add_i32 s2, s2, s11
6506; GFX6-NEXT:    s_xor_b32 s2, s2, s11
6507; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s2
6508; GFX6-NEXT:    s_lshl_b32 s0, s10, s3
6509; GFX6-NEXT:    s_sub_i32 s10, 0, s2
6510; GFX6-NEXT:    s_ashr_i32 s3, s0, 31
6511; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
6512; GFX6-NEXT:    s_add_i32 s0, s0, s3
6513; GFX6-NEXT:    s_ashr_i32 s1, s8, 31
6514; GFX6-NEXT:    s_mov_b32 s6, -1
6515; GFX6-NEXT:    v_mul_f32_e32 v0, s12, v0
6516; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
6517; GFX6-NEXT:    v_mul_lo_u32 v1, s10, v0
6518; GFX6-NEXT:    s_xor_b32 s10, s0, s3
6519; GFX6-NEXT:    v_cvt_f32_u32_e32 v2, s10
6520; GFX6-NEXT:    s_add_i32 s0, s8, s1
6521; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
6522; GFX6-NEXT:    s_xor_b32 s0, s0, s1
6523; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v2
6524; GFX6-NEXT:    s_xor_b32 s8, s1, s11
6525; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
6526; GFX6-NEXT:    v_mul_hi_u32 v0, s0, v0
6527; GFX6-NEXT:    v_mul_f32_e32 v1, s12, v2
6528; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
6529; GFX6-NEXT:    v_mul_lo_u32 v2, v0, s2
6530; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 1, v0
6531; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s0, v2
6532; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s2, v2
6533; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v3, s[0:1]
6534; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s2, v2
6535; GFX6-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
6536; GFX6-NEXT:    s_sub_i32 s0, 0, s10
6537; GFX6-NEXT:    v_mul_lo_u32 v3, s0, v1
6538; GFX6-NEXT:    s_ashr_i32 s0, s9, 31
6539; GFX6-NEXT:    s_add_i32 s1, s9, s0
6540; GFX6-NEXT:    s_xor_b32 s1, s1, s0
6541; GFX6-NEXT:    v_mul_hi_u32 v3, v1, v3
6542; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 1, v0
6543; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v3, v1
6544; GFX6-NEXT:    v_mul_hi_u32 v1, s1, v1
6545; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s2, v2
6546; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
6547; GFX6-NEXT:    s_xor_b32 s2, s0, s3
6548; GFX6-NEXT:    v_mul_lo_u32 v2, v1, s10
6549; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 1, v1
6550; GFX6-NEXT:    v_xor_b32_e32 v0, s8, v0
6551; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s8, v0
6552; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s1, v2
6553; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s10, v2
6554; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
6555; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s10, v2
6556; GFX6-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
6557; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 1, v1
6558; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s10, v2
6559; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
6560; GFX6-NEXT:    v_xor_b32_e32 v1, s2, v1
6561; GFX6-NEXT:    v_subrev_i32_e32 v1, vcc, s2, v1
6562; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
6563; GFX6-NEXT:    s_endpgm
6564;
6565; GFX9-LABEL: sdiv_v2i32_pow2_shl_denom:
6566; GFX9:       ; %bb.0:
6567; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
6568; GFX9-NEXT:    s_movk_i32 s8, 0x1000
6569; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
6570; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x2c
6571; GFX9-NEXT:    s_mov_b32 s10, 0x4f7ffffe
6572; GFX9-NEXT:    v_mov_b32_e32 v2, 0
6573; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
6574; GFX9-NEXT:    s_lshl_b32 s2, s8, s2
6575; GFX9-NEXT:    s_ashr_i32 s9, s2, 31
6576; GFX9-NEXT:    s_add_i32 s2, s2, s9
6577; GFX9-NEXT:    s_xor_b32 s2, s2, s9
6578; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s2
6579; GFX9-NEXT:    s_lshl_b32 s0, s8, s3
6580; GFX9-NEXT:    s_ashr_i32 s1, s0, 31
6581; GFX9-NEXT:    s_add_i32 s0, s0, s1
6582; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
6583; GFX9-NEXT:    s_xor_b32 s0, s0, s1
6584; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s0
6585; GFX9-NEXT:    s_sub_i32 s3, 0, s2
6586; GFX9-NEXT:    v_mul_f32_e32 v0, s10, v0
6587; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
6588; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v1
6589; GFX9-NEXT:    s_sub_i32 s8, 0, s0
6590; GFX9-NEXT:    v_mul_lo_u32 v3, s3, v0
6591; GFX9-NEXT:    v_mul_f32_e32 v1, s10, v1
6592; GFX9-NEXT:    s_ashr_i32 s3, s6, 31
6593; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
6594; GFX9-NEXT:    v_mul_hi_u32 v3, v0, v3
6595; GFX9-NEXT:    s_add_i32 s6, s6, s3
6596; GFX9-NEXT:    s_xor_b32 s6, s6, s3
6597; GFX9-NEXT:    s_xor_b32 s3, s3, s9
6598; GFX9-NEXT:    v_add_u32_e32 v0, v0, v3
6599; GFX9-NEXT:    v_mul_hi_u32 v0, s6, v0
6600; GFX9-NEXT:    v_mul_lo_u32 v3, s8, v1
6601; GFX9-NEXT:    s_ashr_i32 s8, s7, 31
6602; GFX9-NEXT:    s_xor_b32 s1, s8, s1
6603; GFX9-NEXT:    v_mul_lo_u32 v4, v0, s2
6604; GFX9-NEXT:    v_mul_hi_u32 v3, v1, v3
6605; GFX9-NEXT:    v_add_u32_e32 v5, 1, v0
6606; GFX9-NEXT:    v_sub_u32_e32 v4, s6, v4
6607; GFX9-NEXT:    s_add_i32 s6, s7, s8
6608; GFX9-NEXT:    s_xor_b32 s6, s6, s8
6609; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
6610; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s2, v4
6611; GFX9-NEXT:    v_mul_hi_u32 v1, s6, v1
6612; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
6613; GFX9-NEXT:    v_subrev_u32_e32 v5, s2, v4
6614; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
6615; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s2, v4
6616; GFX9-NEXT:    v_add_u32_e32 v3, 1, v0
6617; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
6618; GFX9-NEXT:    v_mul_lo_u32 v3, v1, s0
6619; GFX9-NEXT:    v_add_u32_e32 v4, 1, v1
6620; GFX9-NEXT:    v_xor_b32_e32 v0, s3, v0
6621; GFX9-NEXT:    v_subrev_u32_e32 v0, s3, v0
6622; GFX9-NEXT:    v_sub_u32_e32 v3, s6, v3
6623; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s0, v3
6624; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
6625; GFX9-NEXT:    v_subrev_u32_e32 v4, s0, v3
6626; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
6627; GFX9-NEXT:    v_add_u32_e32 v4, 1, v1
6628; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s0, v3
6629; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
6630; GFX9-NEXT:    v_xor_b32_e32 v1, s1, v1
6631; GFX9-NEXT:    v_subrev_u32_e32 v1, s1, v1
6632; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
6633; GFX9-NEXT:    s_endpgm
6634  %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y
6635  %r = sdiv <2 x i32> %x, %shl.y
6636  store <2 x i32> %r, <2 x i32> addrspace(1)* %out
6637  ret void
6638}
6639
; Signed i32 remainder of the kernel argument %x by the odd constant 1235195
; (0x12d8fb); the remainder is stored to %out.
; Expected opt output: the srem is left unchanged.
; Expected llc output (tahiti and gfx900): no generic division expansion. A
; quotient is formed from a signed multiply-high by 0xd9528441, an add of the
; dividend, an arithmetic shift right by 20 and an add of the intermediate's
; sign bit; that quotient is multiplied by 0x12d8fb and the product is
; subtracted from %x.
6640define amdgpu_kernel void @srem_i32_oddk_denom(i32 addrspace(1)* %out, i32 %x) {
6641; CHECK-LABEL: @srem_i32_oddk_denom(
6642; CHECK-NEXT:    [[R:%.*]] = srem i32 [[X:%.*]], 1235195
6643; CHECK-NEXT:    store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4
6644; CHECK-NEXT:    ret void
6645;
6646; GFX6-LABEL: srem_i32_oddk_denom:
6647; GFX6:       ; %bb.0:
6648; GFX6-NEXT:    s_load_dword s4, s[0:1], 0xb
6649; GFX6-NEXT:    v_mov_b32_e32 v0, 0xd9528441
6650; GFX6-NEXT:    s_mov_b32 s2, 0x12d8fb
6651; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
6652; GFX6-NEXT:    s_mov_b32 s3, 0xf000
6653; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
6654; GFX6-NEXT:    v_mul_hi_i32 v0, s4, v0
6655; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s4, v0
6656; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 31, v0
6657; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 20, v0
6658; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
6659; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s2
6660; GFX6-NEXT:    s_mov_b32 s2, -1
6661; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
6662; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
6663; GFX6-NEXT:    s_endpgm
6664;
6665; GFX9-LABEL: srem_i32_oddk_denom:
6666; GFX9:       ; %bb.0:
6667; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
6668; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x2c
6669; GFX9-NEXT:    v_mov_b32_e32 v0, 0
6670; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
6671; GFX9-NEXT:    s_mul_hi_i32 s0, s4, 0xd9528441
6672; GFX9-NEXT:    s_add_i32 s0, s0, s4
6673; GFX9-NEXT:    s_lshr_b32 s1, s0, 31
6674; GFX9-NEXT:    s_ashr_i32 s0, s0, 20
6675; GFX9-NEXT:    s_add_i32 s0, s0, s1
6676; GFX9-NEXT:    s_mul_i32 s0, s0, 0x12d8fb
6677; GFX9-NEXT:    s_sub_i32 s0, s4, s0
6678; GFX9-NEXT:    v_mov_b32_e32 v1, s0
6679; GFX9-NEXT:    global_store_dword v0, v1, s[2:3]
6680; GFX9-NEXT:    s_endpgm
6681  %r = srem i32 %x, 1235195
6682  store i32 %r, i32 addrspace(1)* %out
6683  ret void
6684}
6685
; Signed i32 remainder of the kernel argument %x by the power-of-two constant
; 4096; the remainder is stored to %out.
; Expected opt output: the srem is left unchanged.
; Expected llc output (tahiti and gfx900): a shift/mask sequence. The sign
; mask of %x (arithmetic shift right by 31) is shifted right logically by 20
; and added to %x, the low 12 bits of that sum are cleared with an AND against
; 0xfffff000, and the masked value is subtracted from %x.
6686define amdgpu_kernel void @srem_i32_pow2k_denom(i32 addrspace(1)* %out, i32 %x) {
6687; CHECK-LABEL: @srem_i32_pow2k_denom(
6688; CHECK-NEXT:    [[R:%.*]] = srem i32 [[X:%.*]], 4096
6689; CHECK-NEXT:    store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4
6690; CHECK-NEXT:    ret void
6691;
6692; GFX6-LABEL: srem_i32_pow2k_denom:
6693; GFX6:       ; %bb.0:
6694; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
6695; GFX6-NEXT:    s_load_dword s0, s[0:1], 0xb
6696; GFX6-NEXT:    s_mov_b32 s7, 0xf000
6697; GFX6-NEXT:    s_mov_b32 s6, -1
6698; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
6699; GFX6-NEXT:    s_ashr_i32 s1, s0, 31
6700; GFX6-NEXT:    s_lshr_b32 s1, s1, 20
6701; GFX6-NEXT:    s_add_i32 s1, s0, s1
6702; GFX6-NEXT:    s_and_b32 s1, s1, 0xfffff000
6703; GFX6-NEXT:    s_sub_i32 s0, s0, s1
6704; GFX6-NEXT:    v_mov_b32_e32 v0, s0
6705; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
6706; GFX6-NEXT:    s_endpgm
6707;
6708; GFX9-LABEL: srem_i32_pow2k_denom:
6709; GFX9:       ; %bb.0:
6710; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
6711; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x2c
6712; GFX9-NEXT:    v_mov_b32_e32 v0, 0
6713; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
6714; GFX9-NEXT:    s_ashr_i32 s0, s4, 31
6715; GFX9-NEXT:    s_lshr_b32 s0, s0, 20
6716; GFX9-NEXT:    s_add_i32 s0, s4, s0
6717; GFX9-NEXT:    s_and_b32 s0, s0, 0xfffff000
6718; GFX9-NEXT:    s_sub_i32 s0, s4, s0
6719; GFX9-NEXT:    v_mov_b32_e32 v1, s0
6720; GFX9-NEXT:    global_store_dword v0, v1, s[2:3]
6721; GFX9-NEXT:    s_endpgm
6722  %r = srem i32 %x, 4096
6723  store i32 %r, i32 addrspace(1)* %out
6724  ret void
6725}
6726
; Signed 32-bit remainder of %x by a variable denominator built as (4096 << %y);
; the result is stored to %out.
; The IR-level assertions below show the shl and the srem left in place by the
; opt run, while the GFX6 and GFX9 assertions show the llc output: a
; sign-adjusted reciprocal-based sequence (v_rcp_iflag_f32, v_mul_hi_u32, two
; compare/select correction steps) followed by the xor/sub that restores the
; sign taken from the numerator.
6727define amdgpu_kernel void @srem_i32_pow2_shl_denom(i32 addrspace(1)* %out, i32 %x, i32 %y) {
6728; CHECK-LABEL: @srem_i32_pow2_shl_denom(
6729; CHECK-NEXT:    [[SHL_Y:%.*]] = shl i32 4096, [[Y:%.*]]
6730; CHECK-NEXT:    [[R:%.*]] = srem i32 [[X:%.*]], [[SHL_Y]]
6731; CHECK-NEXT:    store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4
6732; CHECK-NEXT:    ret void
6733;
6734; GFX6-LABEL: srem_i32_pow2_shl_denom:
6735; GFX6:       ; %bb.0:
6736; GFX6-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
6737; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
6738; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
6739; GFX6-NEXT:    s_lshl_b32 s3, 0x1000, s3
6740; GFX6-NEXT:    s_ashr_i32 s4, s3, 31
6741; GFX6-NEXT:    s_add_i32 s3, s3, s4
6742; GFX6-NEXT:    s_xor_b32 s4, s3, s4
6743; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s4
6744; GFX6-NEXT:    s_sub_i32 s3, 0, s4
6745; GFX6-NEXT:    s_ashr_i32 s5, s2, 31
6746; GFX6-NEXT:    s_add_i32 s2, s2, s5
6747; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
6748; GFX6-NEXT:    s_xor_b32 s6, s2, s5
6749; GFX6-NEXT:    s_mov_b32 s2, -1
6750; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
6751; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
6752; GFX6-NEXT:    v_mul_lo_u32 v1, s3, v0
6753; GFX6-NEXT:    s_mov_b32 s3, 0xf000
6754; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
6755; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
6756; GFX6-NEXT:    v_mul_hi_u32 v0, s6, v0
6757; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s4
6758; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s6, v0
6759; GFX6-NEXT:    v_subrev_i32_e32 v1, vcc, s4, v0
6760; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s4, v0
6761; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
6762; GFX6-NEXT:    v_subrev_i32_e32 v1, vcc, s4, v0
6763; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s4, v0
6764; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
6765; GFX6-NEXT:    v_xor_b32_e32 v0, s5, v0
6766; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s5, v0
6767; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
6768; GFX6-NEXT:    s_endpgm
6769;
6770; GFX9-LABEL: srem_i32_pow2_shl_denom:
6771; GFX9:       ; %bb.0:
6772; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
6773; GFX9-NEXT:    s_nop 0
6774; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
6775; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
6776; GFX9-NEXT:    s_lshl_b32 s3, 0x1000, s3
6777; GFX9-NEXT:    s_ashr_i32 s4, s3, 31
6778; GFX9-NEXT:    s_add_i32 s3, s3, s4
6779; GFX9-NEXT:    s_xor_b32 s3, s3, s4
6780; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s3
6781; GFX9-NEXT:    s_sub_i32 s4, 0, s3
6782; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
6783; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
6784; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
6785; GFX9-NEXT:    v_mul_lo_u32 v1, s4, v0
6786; GFX9-NEXT:    s_ashr_i32 s4, s2, 31
6787; GFX9-NEXT:    s_add_i32 s2, s2, s4
6788; GFX9-NEXT:    s_xor_b32 s2, s2, s4
6789; GFX9-NEXT:    v_mul_hi_u32 v1, v0, v1
6790; GFX9-NEXT:    v_add_u32_e32 v0, v0, v1
6791; GFX9-NEXT:    v_mul_hi_u32 v0, s2, v0
6792; GFX9-NEXT:    v_mov_b32_e32 v1, 0
6793; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s3
6794; GFX9-NEXT:    v_sub_u32_e32 v0, s2, v0
6795; GFX9-NEXT:    v_subrev_u32_e32 v2, s3, v0
6796; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v0
6797; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
6798; GFX9-NEXT:    v_subrev_u32_e32 v2, s3, v0
6799; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v0
6800; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
6801; GFX9-NEXT:    v_xor_b32_e32 v0, s4, v0
6802; GFX9-NEXT:    v_subrev_u32_e32 v0, s4, v0
6803; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
6804; GFX9-NEXT:    s_endpgm
6805  %shl.y = shl i32 4096, %y
6806  %r = srem i32 %x, %shl.y
6807  store i32 %r, i32 addrspace(1)* %out
6808  ret void
6809}
6810
; Signed remainder of a <2 x i32> vector %x by the constant splat <4096, 4096>;
; the result vector is stored to %out.
; The IR-level assertions below show the vector srem split into one scalar
; "srem i32 ..., 4096" per element (extractelement / srem / insertelement).
; The GFX6 and GFX9 assertions show each element lowered without any division
; sequence: ashr 31, lshr 20, add, and with the 0xf000 mask constant, then sub.
6811define amdgpu_kernel void @srem_v2i32_pow2k_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x) {
6812; CHECK-LABEL: @srem_v2i32_pow2k_denom(
6813; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0
6814; CHECK-NEXT:    [[TMP2:%.*]] = srem i32 [[TMP1]], 4096
6815; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i64 0
6816; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1
6817; CHECK-NEXT:    [[TMP5:%.*]] = srem i32 [[TMP4]], 4096
6818; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1
6819; CHECK-NEXT:    store <2 x i32> [[TMP6]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8
6820; CHECK-NEXT:    ret void
6821;
6822; GFX6-LABEL: srem_v2i32_pow2k_denom:
6823; GFX6:       ; %bb.0:
6824; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
6825; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
6826; GFX6-NEXT:    s_movk_i32 s2, 0xf000
6827; GFX6-NEXT:    s_mov_b32 s7, 0xf000
6828; GFX6-NEXT:    s_mov_b32 s6, -1
6829; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
6830; GFX6-NEXT:    s_ashr_i32 s3, s0, 31
6831; GFX6-NEXT:    s_lshr_b32 s3, s3, 20
6832; GFX6-NEXT:    s_add_i32 s3, s0, s3
6833; GFX6-NEXT:    s_and_b32 s3, s3, s2
6834; GFX6-NEXT:    s_sub_i32 s0, s0, s3
6835; GFX6-NEXT:    s_ashr_i32 s3, s1, 31
6836; GFX6-NEXT:    s_lshr_b32 s3, s3, 20
6837; GFX6-NEXT:    s_add_i32 s3, s1, s3
6838; GFX6-NEXT:    s_and_b32 s2, s3, s2
6839; GFX6-NEXT:    s_sub_i32 s1, s1, s2
6840; GFX6-NEXT:    v_mov_b32_e32 v0, s0
6841; GFX6-NEXT:    v_mov_b32_e32 v1, s1
6842; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
6843; GFX6-NEXT:    s_endpgm
6844;
6845; GFX9-LABEL: srem_v2i32_pow2k_denom:
6846; GFX9:       ; %bb.0:
6847; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
6848; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
6849; GFX9-NEXT:    s_movk_i32 s6, 0xf000
6850; GFX9-NEXT:    v_mov_b32_e32 v2, 0
6851; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
6852; GFX9-NEXT:    s_ashr_i32 s0, s4, 31
6853; GFX9-NEXT:    s_ashr_i32 s1, s5, 31
6854; GFX9-NEXT:    s_lshr_b32 s0, s0, 20
6855; GFX9-NEXT:    s_lshr_b32 s1, s1, 20
6856; GFX9-NEXT:    s_add_i32 s0, s4, s0
6857; GFX9-NEXT:    s_add_i32 s1, s5, s1
6858; GFX9-NEXT:    s_and_b32 s0, s0, s6
6859; GFX9-NEXT:    s_and_b32 s1, s1, s6
6860; GFX9-NEXT:    s_sub_i32 s0, s4, s0
6861; GFX9-NEXT:    s_sub_i32 s1, s5, s1
6862; GFX9-NEXT:    v_mov_b32_e32 v0, s0
6863; GFX9-NEXT:    v_mov_b32_e32 v1, s1
6864; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
6865; GFX9-NEXT:    s_endpgm
6866  %r = srem <2 x i32> %x, <i32 4096, i32 4096>
6867  store <2 x i32> %r, <2 x i32> addrspace(1)* %out
6868  ret void
6869}
6870
; Signed remainder of a <2 x i32> vector %x by a per-element variable
; denominator built as (<4096, 4096> << %y); the result is stored to %out.
; Unlike the constant-denominator vector test, the IR-level assertions below
; show each element fully expanded in IR: sign masks via ashr 31, absolute
; values via add/xor, a reciprocal estimate through llvm.amdgcn.rcp.f32, a
; 64-bit multiply-high refinement, two conditional-subtract correction steps,
; and a final xor/sub with the numerator's sign mask.
; The GFX6 and GFX9 assertions show the two element sequences interleaved in
; the llc output.
6871define amdgpu_kernel void @srem_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x, <2 x i32> %y) {
6872; CHECK-LABEL: @srem_v2i32_pow2_shl_denom(
6873; CHECK-NEXT:    [[SHL_Y:%.*]] = shl <2 x i32> <i32 4096, i32 4096>, [[Y:%.*]]
6874; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0
6875; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 0
6876; CHECK-NEXT:    [[TMP3:%.*]] = ashr i32 [[TMP1]], 31
6877; CHECK-NEXT:    [[TMP4:%.*]] = ashr i32 [[TMP2]], 31
6878; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[TMP1]], [[TMP3]]
6879; CHECK-NEXT:    [[TMP6:%.*]] = add i32 [[TMP2]], [[TMP4]]
6880; CHECK-NEXT:    [[TMP7:%.*]] = xor i32 [[TMP5]], [[TMP3]]
6881; CHECK-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP4]]
6882; CHECK-NEXT:    [[TMP9:%.*]] = uitofp i32 [[TMP8]] to float
6883; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]])
6884; CHECK-NEXT:    [[TMP11:%.*]] = fmul fast float [[TMP10]], 0x41EFFFFFC0000000
6885; CHECK-NEXT:    [[TMP12:%.*]] = fptoui float [[TMP11]] to i32
6886; CHECK-NEXT:    [[TMP13:%.*]] = sub i32 0, [[TMP8]]
6887; CHECK-NEXT:    [[TMP14:%.*]] = mul i32 [[TMP13]], [[TMP12]]
6888; CHECK-NEXT:    [[TMP15:%.*]] = zext i32 [[TMP12]] to i64
6889; CHECK-NEXT:    [[TMP16:%.*]] = zext i32 [[TMP14]] to i64
6890; CHECK-NEXT:    [[TMP17:%.*]] = mul i64 [[TMP15]], [[TMP16]]
6891; CHECK-NEXT:    [[TMP18:%.*]] = trunc i64 [[TMP17]] to i32
6892; CHECK-NEXT:    [[TMP19:%.*]] = lshr i64 [[TMP17]], 32
6893; CHECK-NEXT:    [[TMP20:%.*]] = trunc i64 [[TMP19]] to i32
6894; CHECK-NEXT:    [[TMP21:%.*]] = add i32 [[TMP12]], [[TMP20]]
6895; CHECK-NEXT:    [[TMP22:%.*]] = zext i32 [[TMP7]] to i64
6896; CHECK-NEXT:    [[TMP23:%.*]] = zext i32 [[TMP21]] to i64
6897; CHECK-NEXT:    [[TMP24:%.*]] = mul i64 [[TMP22]], [[TMP23]]
6898; CHECK-NEXT:    [[TMP25:%.*]] = trunc i64 [[TMP24]] to i32
6899; CHECK-NEXT:    [[TMP26:%.*]] = lshr i64 [[TMP24]], 32
6900; CHECK-NEXT:    [[TMP27:%.*]] = trunc i64 [[TMP26]] to i32
6901; CHECK-NEXT:    [[TMP28:%.*]] = mul i32 [[TMP27]], [[TMP8]]
6902; CHECK-NEXT:    [[TMP29:%.*]] = sub i32 [[TMP7]], [[TMP28]]
6903; CHECK-NEXT:    [[TMP30:%.*]] = icmp uge i32 [[TMP29]], [[TMP8]]
6904; CHECK-NEXT:    [[TMP31:%.*]] = sub i32 [[TMP29]], [[TMP8]]
6905; CHECK-NEXT:    [[TMP32:%.*]] = select i1 [[TMP30]], i32 [[TMP31]], i32 [[TMP29]]
6906; CHECK-NEXT:    [[TMP33:%.*]] = icmp uge i32 [[TMP32]], [[TMP8]]
6907; CHECK-NEXT:    [[TMP34:%.*]] = sub i32 [[TMP32]], [[TMP8]]
6908; CHECK-NEXT:    [[TMP35:%.*]] = select i1 [[TMP33]], i32 [[TMP34]], i32 [[TMP32]]
6909; CHECK-NEXT:    [[TMP36:%.*]] = xor i32 [[TMP35]], [[TMP3]]
6910; CHECK-NEXT:    [[TMP37:%.*]] = sub i32 [[TMP36]], [[TMP3]]
6911; CHECK-NEXT:    [[TMP38:%.*]] = insertelement <2 x i32> undef, i32 [[TMP37]], i64 0
6912; CHECK-NEXT:    [[TMP39:%.*]] = extractelement <2 x i32> [[X]], i64 1
6913; CHECK-NEXT:    [[TMP40:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 1
6914; CHECK-NEXT:    [[TMP41:%.*]] = ashr i32 [[TMP39]], 31
6915; CHECK-NEXT:    [[TMP42:%.*]] = ashr i32 [[TMP40]], 31
6916; CHECK-NEXT:    [[TMP43:%.*]] = add i32 [[TMP39]], [[TMP41]]
6917; CHECK-NEXT:    [[TMP44:%.*]] = add i32 [[TMP40]], [[TMP42]]
6918; CHECK-NEXT:    [[TMP45:%.*]] = xor i32 [[TMP43]], [[TMP41]]
6919; CHECK-NEXT:    [[TMP46:%.*]] = xor i32 [[TMP44]], [[TMP42]]
6920; CHECK-NEXT:    [[TMP47:%.*]] = uitofp i32 [[TMP46]] to float
6921; CHECK-NEXT:    [[TMP48:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP47]])
6922; CHECK-NEXT:    [[TMP49:%.*]] = fmul fast float [[TMP48]], 0x41EFFFFFC0000000
6923; CHECK-NEXT:    [[TMP50:%.*]] = fptoui float [[TMP49]] to i32
6924; CHECK-NEXT:    [[TMP51:%.*]] = sub i32 0, [[TMP46]]
6925; CHECK-NEXT:    [[TMP52:%.*]] = mul i32 [[TMP51]], [[TMP50]]
6926; CHECK-NEXT:    [[TMP53:%.*]] = zext i32 [[TMP50]] to i64
6927; CHECK-NEXT:    [[TMP54:%.*]] = zext i32 [[TMP52]] to i64
6928; CHECK-NEXT:    [[TMP55:%.*]] = mul i64 [[TMP53]], [[TMP54]]
6929; CHECK-NEXT:    [[TMP56:%.*]] = trunc i64 [[TMP55]] to i32
6930; CHECK-NEXT:    [[TMP57:%.*]] = lshr i64 [[TMP55]], 32
6931; CHECK-NEXT:    [[TMP58:%.*]] = trunc i64 [[TMP57]] to i32
6932; CHECK-NEXT:    [[TMP59:%.*]] = add i32 [[TMP50]], [[TMP58]]
6933; CHECK-NEXT:    [[TMP60:%.*]] = zext i32 [[TMP45]] to i64
6934; CHECK-NEXT:    [[TMP61:%.*]] = zext i32 [[TMP59]] to i64
6935; CHECK-NEXT:    [[TMP62:%.*]] = mul i64 [[TMP60]], [[TMP61]]
6936; CHECK-NEXT:    [[TMP63:%.*]] = trunc i64 [[TMP62]] to i32
6937; CHECK-NEXT:    [[TMP64:%.*]] = lshr i64 [[TMP62]], 32
6938; CHECK-NEXT:    [[TMP65:%.*]] = trunc i64 [[TMP64]] to i32
6939; CHECK-NEXT:    [[TMP66:%.*]] = mul i32 [[TMP65]], [[TMP46]]
6940; CHECK-NEXT:    [[TMP67:%.*]] = sub i32 [[TMP45]], [[TMP66]]
6941; CHECK-NEXT:    [[TMP68:%.*]] = icmp uge i32 [[TMP67]], [[TMP46]]
6942; CHECK-NEXT:    [[TMP69:%.*]] = sub i32 [[TMP67]], [[TMP46]]
6943; CHECK-NEXT:    [[TMP70:%.*]] = select i1 [[TMP68]], i32 [[TMP69]], i32 [[TMP67]]
6944; CHECK-NEXT:    [[TMP71:%.*]] = icmp uge i32 [[TMP70]], [[TMP46]]
6945; CHECK-NEXT:    [[TMP72:%.*]] = sub i32 [[TMP70]], [[TMP46]]
6946; CHECK-NEXT:    [[TMP73:%.*]] = select i1 [[TMP71]], i32 [[TMP72]], i32 [[TMP70]]
6947; CHECK-NEXT:    [[TMP74:%.*]] = xor i32 [[TMP73]], [[TMP41]]
6948; CHECK-NEXT:    [[TMP75:%.*]] = sub i32 [[TMP74]], [[TMP41]]
6949; CHECK-NEXT:    [[TMP76:%.*]] = insertelement <2 x i32> [[TMP38]], i32 [[TMP75]], i64 1
6950; CHECK-NEXT:    store <2 x i32> [[TMP76]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8
6951; CHECK-NEXT:    ret void
6952;
6953; GFX6-LABEL: srem_v2i32_pow2_shl_denom:
6954; GFX6:       ; %bb.0:
6955; GFX6-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
6956; GFX6-NEXT:    s_movk_i32 s6, 0x1000
6957; GFX6-NEXT:    s_mov_b32 s10, 0x4f7ffffe
6958; GFX6-NEXT:    s_mov_b32 s7, 0xf000
6959; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
6960; GFX6-NEXT:    s_lshl_b32 s2, s6, s2
6961; GFX6-NEXT:    s_ashr_i32 s4, s2, 31
6962; GFX6-NEXT:    s_add_i32 s2, s2, s4
6963; GFX6-NEXT:    s_xor_b32 s2, s2, s4
6964; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s2
6965; GFX6-NEXT:    s_lshl_b32 s3, s6, s3
6966; GFX6-NEXT:    s_ashr_i32 s6, s3, 31
6967; GFX6-NEXT:    s_add_i32 s3, s3, s6
6968; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
6969; GFX6-NEXT:    s_xor_b32 s3, s3, s6
6970; GFX6-NEXT:    s_sub_i32 s9, 0, s2
6971; GFX6-NEXT:    v_cvt_f32_u32_e32 v2, s3
6972; GFX6-NEXT:    v_mul_f32_e32 v0, s10, v0
6973; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
6974; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
6975; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
6976; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v2
6977; GFX6-NEXT:    s_mov_b32 s6, -1
6978; GFX6-NEXT:    v_mul_lo_u32 v1, s9, v0
6979; GFX6-NEXT:    s_sub_i32 s9, 0, s3
6980; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
6981; GFX6-NEXT:    s_ashr_i32 s8, s0, 31
6982; GFX6-NEXT:    s_add_i32 s0, s0, s8
6983; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
6984; GFX6-NEXT:    s_xor_b32 s0, s0, s8
6985; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
6986; GFX6-NEXT:    v_mul_f32_e32 v1, s10, v2
6987; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
6988; GFX6-NEXT:    v_mul_hi_u32 v0, s0, v0
6989; GFX6-NEXT:    v_mul_lo_u32 v2, s9, v1
6990; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s2
6991; GFX6-NEXT:    v_mul_hi_u32 v2, v1, v2
6992; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
6993; GFX6-NEXT:    s_ashr_i32 s0, s1, 31
6994; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s2, v0
6995; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s2, v0
6996; GFX6-NEXT:    s_add_i32 s1, s1, s0
6997; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
6998; GFX6-NEXT:    s_xor_b32 s1, s1, s0
6999; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
7000; GFX6-NEXT:    v_mul_hi_u32 v1, s1, v1
7001; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s2, v0
7002; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s2, v0
7003; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
7004; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s3
7005; GFX6-NEXT:    v_xor_b32_e32 v0, s8, v0
7006; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s8, v0
7007; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s1, v1
7008; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s3, v1
7009; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s3, v1
7010; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
7011; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s3, v1
7012; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s3, v1
7013; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
7014; GFX6-NEXT:    v_xor_b32_e32 v1, s0, v1
7015; GFX6-NEXT:    v_subrev_i32_e32 v1, vcc, s0, v1
7016; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
7017; GFX6-NEXT:    s_endpgm
7018;
7019; GFX9-LABEL: srem_v2i32_pow2_shl_denom:
7020; GFX9:       ; %bb.0:
7021; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
7022; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
7023; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x34
7024; GFX9-NEXT:    s_movk_i32 s8, 0x1000
7025; GFX9-NEXT:    s_mov_b32 s9, 0x4f7ffffe
7026; GFX9-NEXT:    v_mov_b32_e32 v2, 0
7027; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
7028; GFX9-NEXT:    s_lshl_b32 s0, s8, s6
7029; GFX9-NEXT:    s_ashr_i32 s1, s0, 31
7030; GFX9-NEXT:    s_add_i32 s0, s0, s1
7031; GFX9-NEXT:    s_xor_b32 s0, s0, s1
7032; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s0
7033; GFX9-NEXT:    s_lshl_b32 s1, s8, s7
7034; GFX9-NEXT:    s_ashr_i32 s6, s1, 31
7035; GFX9-NEXT:    s_add_i32 s1, s1, s6
7036; GFX9-NEXT:    s_xor_b32 s1, s1, s6
7037; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
7038; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s1
7039; GFX9-NEXT:    s_sub_i32 s7, 0, s0
7040; GFX9-NEXT:    s_ashr_i32 s6, s4, 31
7041; GFX9-NEXT:    v_mul_f32_e32 v0, s9, v0
7042; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v1
7043; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
7044; GFX9-NEXT:    s_add_i32 s4, s4, s6
7045; GFX9-NEXT:    s_xor_b32 s4, s4, s6
7046; GFX9-NEXT:    v_mul_f32_e32 v1, s9, v1
7047; GFX9-NEXT:    v_mul_lo_u32 v3, s7, v0
7048; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
7049; GFX9-NEXT:    s_sub_i32 s7, 0, s1
7050; GFX9-NEXT:    v_mul_hi_u32 v3, v0, v3
7051; GFX9-NEXT:    v_mul_lo_u32 v4, s7, v1
7052; GFX9-NEXT:    s_ashr_i32 s7, s5, 31
7053; GFX9-NEXT:    s_add_i32 s5, s5, s7
7054; GFX9-NEXT:    v_add_u32_e32 v0, v0, v3
7055; GFX9-NEXT:    v_mul_hi_u32 v3, v1, v4
7056; GFX9-NEXT:    v_mul_hi_u32 v0, s4, v0
7057; GFX9-NEXT:    s_xor_b32 s5, s5, s7
7058; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
7059; GFX9-NEXT:    v_mul_hi_u32 v1, s5, v1
7060; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s0
7061; GFX9-NEXT:    v_mul_lo_u32 v1, v1, s1
7062; GFX9-NEXT:    v_sub_u32_e32 v0, s4, v0
7063; GFX9-NEXT:    v_subrev_u32_e32 v3, s0, v0
7064; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s0, v0
7065; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
7066; GFX9-NEXT:    v_subrev_u32_e32 v3, s0, v0
7067; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s0, v0
7068; GFX9-NEXT:    v_sub_u32_e32 v1, s5, v1
7069; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
7070; GFX9-NEXT:    v_subrev_u32_e32 v3, s1, v1
7071; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s1, v1
7072; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
7073; GFX9-NEXT:    v_subrev_u32_e32 v3, s1, v1
7074; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s1, v1
7075; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
7076; GFX9-NEXT:    v_xor_b32_e32 v0, s6, v0
7077; GFX9-NEXT:    v_xor_b32_e32 v1, s7, v1
7078; GFX9-NEXT:    v_subrev_u32_e32 v0, s6, v0
7079; GFX9-NEXT:    v_subrev_u32_e32 v1, s7, v1
7080; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
7081; GFX9-NEXT:    s_endpgm
7082  %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y
7083  %r = srem <2 x i32> %x, %shl.y
7084  store <2 x i32> %r, <2 x i32> addrspace(1)* %out
7085  ret void
7086}
7087
; Unsigned 64-bit division of %x by the odd constant 1235195949943
; (0x11f976a7377); the quotient is stored to %out.
; The IR-level assertions below show the i64 udiv left untouched by the opt
; run. The GFX6 and GFX9 assertions show the llc output: a long 32-bit
; multiply / add-with-carry sequence seeded by v_rcp_f32, in which the two
; halves of the denominator appear as the immediates 0x11f and 0x976a7377,
; ending in compare/select steps that choose between the quotient estimate,
; estimate + 1 and estimate + 2.
7088define amdgpu_kernel void @udiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
7089; CHECK-LABEL: @udiv_i64_oddk_denom(
7090; CHECK-NEXT:    [[R:%.*]] = udiv i64 [[X:%.*]], 1235195949943
7091; CHECK-NEXT:    store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4
7092; CHECK-NEXT:    ret void
7093;
7094; GFX6-LABEL: udiv_i64_oddk_denom:
7095; GFX6:       ; %bb.0:
7096; GFX6-NEXT:    v_mov_b32_e32 v0, 0x4f176a73
7097; GFX6-NEXT:    v_mov_b32_e32 v1, 0x4f800000
7098; GFX6-NEXT:    v_madmk_f32 v0, v1, 0x438f8000, v0
7099; GFX6-NEXT:    v_rcp_f32_e32 v0, v0
7100; GFX6-NEXT:    s_movk_i32 s2, 0xfee0
7101; GFX6-NEXT:    s_mov_b32 s3, 0x68958c89
7102; GFX6-NEXT:    v_mov_b32_e32 v8, 0
7103; GFX6-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
7104; GFX6-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
7105; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
7106; GFX6-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
7107; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
7108; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
7109; GFX6-NEXT:    v_mov_b32_e32 v7, 0
7110; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
7111; GFX6-NEXT:    v_mul_lo_u32 v2, v0, s2
7112; GFX6-NEXT:    v_mul_hi_u32 v3, v0, s3
7113; GFX6-NEXT:    v_mul_lo_u32 v4, v1, s3
7114; GFX6-NEXT:    s_mov_b32 s11, 0xf000
7115; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
7116; GFX6-NEXT:    s_mov_b32 s8, s4
7117; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
7118; GFX6-NEXT:    v_mul_lo_u32 v3, v0, s3
7119; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
7120; GFX6-NEXT:    v_mul_lo_u32 v5, v0, v2
7121; GFX6-NEXT:    v_mul_hi_u32 v4, v0, v2
7122; GFX6-NEXT:    v_mul_hi_u32 v6, v0, v3
7123; GFX6-NEXT:    v_mul_hi_u32 v9, v1, v2
7124; GFX6-NEXT:    v_mul_lo_u32 v2, v1, v2
7125; GFX6-NEXT:    s_mov_b32 s4, 0x976a7376
7126; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
7127; GFX6-NEXT:    v_mul_lo_u32 v6, v1, v3
7128; GFX6-NEXT:    v_mul_hi_u32 v3, v1, v3
7129; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, v8, v4, vcc
7130; GFX6-NEXT:    s_mov_b32 s10, -1
7131; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
7132; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v4, v3, vcc
7133; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, v9, v7, vcc
7134; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
7135; GFX6-NEXT:    v_add_i32_e64 v0, s[0:1], v0, v2
7136; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v8, v4, vcc
7137; GFX6-NEXT:    v_mul_lo_u32 v4, v0, s2
7138; GFX6-NEXT:    v_mul_hi_u32 v5, v0, s3
7139; GFX6-NEXT:    v_addc_u32_e64 v2, vcc, v1, v3, s[0:1]
7140; GFX6-NEXT:    v_mul_lo_u32 v6, v2, s3
7141; GFX6-NEXT:    s_movk_i32 s2, 0x11f
7142; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
7143; GFX6-NEXT:    v_mul_lo_u32 v5, v0, s3
7144; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
7145; GFX6-NEXT:    v_mul_lo_u32 v6, v0, v4
7146; GFX6-NEXT:    v_mul_hi_u32 v10, v0, v4
7147; GFX6-NEXT:    v_mul_hi_u32 v9, v0, v5
7148; GFX6-NEXT:    v_mul_hi_u32 v11, v2, v4
7149; GFX6-NEXT:    s_mov_b32 s3, 0x976a7377
7150; GFX6-NEXT:    s_mov_b32 s9, s5
7151; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v9, v6
7152; GFX6-NEXT:    v_addc_u32_e32 v9, vcc, v8, v10, vcc
7153; GFX6-NEXT:    v_mul_lo_u32 v10, v2, v5
7154; GFX6-NEXT:    v_mul_hi_u32 v5, v2, v5
7155; GFX6-NEXT:    v_mul_lo_u32 v2, v2, v4
7156; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v6, v10
7157; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v9, v5, vcc
7158; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, v11, v7, vcc
7159; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
7160; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, v8, v4, vcc
7161; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
7162; GFX6-NEXT:    v_addc_u32_e64 v1, vcc, v1, v4, s[0:1]
7163; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
7164; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
7165; GFX6-NEXT:    v_mul_lo_u32 v2, s6, v1
7166; GFX6-NEXT:    v_mul_hi_u32 v3, s6, v0
7167; GFX6-NEXT:    v_mul_hi_u32 v4, s6, v1
7168; GFX6-NEXT:    v_mul_hi_u32 v5, s7, v1
7169; GFX6-NEXT:    v_mul_lo_u32 v1, s7, v1
7170; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
7171; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v8, v4, vcc
7172; GFX6-NEXT:    v_mul_lo_u32 v4, s7, v0
7173; GFX6-NEXT:    v_mul_hi_u32 v0, s7, v0
7174; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
7175; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, v3, v0, vcc
7176; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, v5, v7, vcc
7177; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
7178; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v8, v2, vcc
7179; GFX6-NEXT:    v_mul_lo_u32 v2, v0, s2
7180; GFX6-NEXT:    v_mul_hi_u32 v3, v0, s3
7181; GFX6-NEXT:    v_mul_lo_u32 v4, v1, s3
7182; GFX6-NEXT:    v_mov_b32_e32 v5, s2
7183; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
7184; GFX6-NEXT:    v_mul_lo_u32 v3, v0, s3
7185; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
7186; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, s7, v2
7187; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s6, v3
7188; GFX6-NEXT:    v_subb_u32_e64 v4, s[0:1], v4, v5, vcc
7189; GFX6-NEXT:    v_subrev_i32_e64 v5, s[0:1], s3, v3
7190; GFX6-NEXT:    v_subbrev_u32_e64 v4, s[0:1], 0, v4, s[0:1]
7191; GFX6-NEXT:    s_movk_i32 s3, 0x11e
7192; GFX6-NEXT:    v_cmp_lt_u32_e64 s[0:1], s3, v4
7193; GFX6-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[0:1]
7194; GFX6-NEXT:    v_cmp_lt_u32_e64 s[0:1], s4, v5
7195; GFX6-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[0:1]
7196; GFX6-NEXT:    v_cmp_eq_u32_e64 s[0:1], s2, v4
7197; GFX6-NEXT:    v_cndmask_b32_e64 v4, v6, v5, s[0:1]
7198; GFX6-NEXT:    v_add_i32_e64 v5, s[0:1], 2, v0
7199; GFX6-NEXT:    v_addc_u32_e64 v6, s[0:1], 0, v1, s[0:1]
7200; GFX6-NEXT:    v_add_i32_e64 v7, s[0:1], 1, v0
7201; GFX6-NEXT:    v_addc_u32_e64 v8, s[0:1], 0, v1, s[0:1]
7202; GFX6-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v4
7203; GFX6-NEXT:    v_cndmask_b32_e64 v4, v8, v6, s[0:1]
7204; GFX6-NEXT:    v_mov_b32_e32 v6, s7
7205; GFX6-NEXT:    v_subb_u32_e32 v2, vcc, v6, v2, vcc
7206; GFX6-NEXT:    v_cmp_lt_u32_e32 vcc, s3, v2
7207; GFX6-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
7208; GFX6-NEXT:    v_cmp_lt_u32_e32 vcc, s4, v3
7209; GFX6-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
7210; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, s2, v2
7211; GFX6-NEXT:    v_cndmask_b32_e32 v2, v6, v3, vcc
7212; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
7213; GFX6-NEXT:    v_cndmask_b32_e64 v2, v7, v5, s[0:1]
7214; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
7215; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
7216; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
7217; GFX6-NEXT:    s_endpgm
7218;
7219; GFX9-LABEL: udiv_i64_oddk_denom:
7220; GFX9:       ; %bb.0:
7221; GFX9-NEXT:    v_mov_b32_e32 v0, 0x4f176a73
7222; GFX9-NEXT:    v_mov_b32_e32 v1, 0x4f800000
7223; GFX9-NEXT:    v_madmk_f32 v0, v1, 0x438f8000, v0
7224; GFX9-NEXT:    v_rcp_f32_e32 v0, v0
7225; GFX9-NEXT:    s_movk_i32 s4, 0xfee0
7226; GFX9-NEXT:    s_mov_b32 s5, 0x68958c89
7227; GFX9-NEXT:    v_mov_b32_e32 v8, 0
7228; GFX9-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
7229; GFX9-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
7230; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
7231; GFX9-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
7232; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
7233; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
7234; GFX9-NEXT:    v_mov_b32_e32 v5, 0
7235; GFX9-NEXT:    v_mul_lo_u32 v2, v0, s4
7236; GFX9-NEXT:    v_mul_hi_u32 v3, v0, s5
7237; GFX9-NEXT:    v_mul_lo_u32 v4, v1, s5
7238; GFX9-NEXT:    v_mul_lo_u32 v6, v0, s5
7239; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
7240; GFX9-NEXT:    v_add_u32_e32 v2, v2, v4
7241; GFX9-NEXT:    v_mul_lo_u32 v3, v0, v2
7242; GFX9-NEXT:    v_mul_hi_u32 v4, v0, v6
7243; GFX9-NEXT:    v_mul_hi_u32 v7, v0, v2
7244; GFX9-NEXT:    v_mul_hi_u32 v9, v1, v2
7245; GFX9-NEXT:    v_mul_lo_u32 v2, v1, v2
7246; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v4, v3
7247; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v8, v7, vcc
7248; GFX9-NEXT:    v_mul_lo_u32 v7, v1, v6
7249; GFX9-NEXT:    v_mul_hi_u32 v6, v1, v6
7250; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v7
7251; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v4, v6, vcc
7252; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v9, v5, vcc
7253; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
7254; GFX9-NEXT:    v_add_co_u32_e64 v0, s[2:3], v0, v2
7255; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v8, v4, vcc
7256; GFX9-NEXT:    v_addc_co_u32_e64 v2, vcc, v1, v3, s[2:3]
7257; GFX9-NEXT:    v_mul_lo_u32 v4, v0, s4
7258; GFX9-NEXT:    v_mul_hi_u32 v6, v0, s5
7259; GFX9-NEXT:    v_mul_lo_u32 v7, v2, s5
7260; GFX9-NEXT:    v_mul_lo_u32 v9, v0, s5
7261; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
7262; GFX9-NEXT:    v_add_u32_e32 v4, v6, v4
7263; GFX9-NEXT:    v_add_u32_e32 v4, v4, v7
7264; GFX9-NEXT:    v_mul_lo_u32 v6, v0, v4
7265; GFX9-NEXT:    v_mul_hi_u32 v7, v0, v9
7266; GFX9-NEXT:    v_mul_hi_u32 v10, v0, v4
7267; GFX9-NEXT:    v_mul_hi_u32 v11, v2, v4
7268; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
7269; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, v7, v6
7270; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, v8, v10, vcc
7271; GFX9-NEXT:    v_mul_lo_u32 v10, v2, v9
7272; GFX9-NEXT:    v_mul_hi_u32 v9, v2, v9
7273; GFX9-NEXT:    v_mul_lo_u32 v2, v2, v4
7274; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, v6, v10
7275; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, v7, v9, vcc
7276; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v11, v5, vcc
7277; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v6, v2
7278; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v8, v4, vcc
7279; GFX9-NEXT:    v_addc_co_u32_e64 v1, vcc, v1, v4, s[2:3]
7280; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
7281; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
7282; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
7283; GFX9-NEXT:    v_mul_lo_u32 v2, s6, v1
7284; GFX9-NEXT:    v_mul_hi_u32 v3, s6, v0
7285; GFX9-NEXT:    v_mul_hi_u32 v4, s6, v1
7286; GFX9-NEXT:    v_mul_hi_u32 v6, s7, v1
7287; GFX9-NEXT:    v_mul_lo_u32 v1, s7, v1
7288; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
7289; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v8, v4, vcc
7290; GFX9-NEXT:    v_mul_lo_u32 v4, s7, v0
7291; GFX9-NEXT:    v_mul_hi_u32 v0, s7, v0
7292; GFX9-NEXT:    s_movk_i32 s2, 0x11f
7293; GFX9-NEXT:    s_mov_b32 s3, 0x976a7377
7294; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
7295; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, v3, v0, vcc
7296; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, v6, v5, vcc
7297; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v1
7298; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v8, v2, vcc
7299; GFX9-NEXT:    v_mul_lo_u32 v2, v0, s2
7300; GFX9-NEXT:    v_mul_hi_u32 v3, v0, s3
7301; GFX9-NEXT:    v_mul_lo_u32 v4, v1, s3
7302; GFX9-NEXT:    v_mov_b32_e32 v6, s2
7303; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
7304; GFX9-NEXT:    v_mul_lo_u32 v3, v0, s3
7305; GFX9-NEXT:    v_add_u32_e32 v2, v2, v4
7306; GFX9-NEXT:    v_sub_u32_e32 v4, s7, v2
7307; GFX9-NEXT:    v_sub_co_u32_e32 v3, vcc, s6, v3
7308; GFX9-NEXT:    v_subb_co_u32_e64 v4, s[0:1], v4, v6, vcc
7309; GFX9-NEXT:    v_subrev_co_u32_e64 v6, s[0:1], s3, v3
7310; GFX9-NEXT:    v_subbrev_co_u32_e64 v4, s[0:1], 0, v4, s[0:1]
7311; GFX9-NEXT:    s_movk_i32 s3, 0x11e
7312; GFX9-NEXT:    v_cmp_lt_u32_e64 s[0:1], s3, v4
7313; GFX9-NEXT:    s_mov_b32 s6, 0x976a7376
7314; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[0:1]
7315; GFX9-NEXT:    v_cmp_lt_u32_e64 s[0:1], s6, v6
7316; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[0:1]
7317; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s2, v4
7318; GFX9-NEXT:    v_cndmask_b32_e64 v4, v7, v6, s[0:1]
7319; GFX9-NEXT:    v_add_co_u32_e64 v6, s[0:1], 2, v0
7320; GFX9-NEXT:    v_addc_co_u32_e64 v7, s[0:1], 0, v1, s[0:1]
7321; GFX9-NEXT:    v_add_co_u32_e64 v8, s[0:1], 1, v0
7322; GFX9-NEXT:    v_addc_co_u32_e64 v9, s[0:1], 0, v1, s[0:1]
7323; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v4
7324; GFX9-NEXT:    v_cndmask_b32_e64 v4, v9, v7, s[0:1]
7325; GFX9-NEXT:    v_mov_b32_e32 v7, s7
7326; GFX9-NEXT:    v_subb_co_u32_e32 v2, vcc, v7, v2, vcc
7327; GFX9-NEXT:    v_cmp_lt_u32_e32 vcc, s3, v2
7328; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
7329; GFX9-NEXT:    v_cmp_lt_u32_e32 vcc, s6, v3
7330; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
7331; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, s2, v2
7332; GFX9-NEXT:    v_cndmask_b32_e32 v2, v7, v3, vcc
7333; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
7334; GFX9-NEXT:    v_cndmask_b32_e64 v2, v8, v6, s[0:1]
7335; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
7336; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
7337; GFX9-NEXT:    global_store_dwordx2 v5, v[0:1], s[4:5]
7338; GFX9-NEXT:    s_endpgm
7339  %r = udiv i64 %x, 1235195949943
7340  store i64 %r, i64 addrspace(1)* %out
7341  ret void
7342}
7343
; Unsigned 64-bit division of %x by the constant 4096 (2^12); the quotient is
; stored to %out.
; The IR-level assertions below show the udiv left untouched by the opt run.
; The GFX6 and GFX9 assertions show llc emitting a single 64-bit logical right
; shift by 12 (s_lshr_b64) in place of any division sequence.
7344define amdgpu_kernel void @udiv_i64_pow2k_denom(i64 addrspace(1)* %out, i64 %x) {
7345; CHECK-LABEL: @udiv_i64_pow2k_denom(
7346; CHECK-NEXT:    [[R:%.*]] = udiv i64 [[X:%.*]], 4096
7347; CHECK-NEXT:    store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4
7348; CHECK-NEXT:    ret void
7349;
7350; GFX6-LABEL: udiv_i64_pow2k_denom:
7351; GFX6:       ; %bb.0:
7352; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
7353; GFX6-NEXT:    s_mov_b32 s7, 0xf000
7354; GFX6-NEXT:    s_mov_b32 s6, -1
7355; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
7356; GFX6-NEXT:    s_mov_b32 s4, s0
7357; GFX6-NEXT:    s_mov_b32 s5, s1
7358; GFX6-NEXT:    s_lshr_b64 s[0:1], s[2:3], 12
7359; GFX6-NEXT:    v_mov_b32_e32 v0, s0
7360; GFX6-NEXT:    v_mov_b32_e32 v1, s1
7361; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
7362; GFX6-NEXT:    s_endpgm
7363;
7364; GFX9-LABEL: udiv_i64_pow2k_denom:
7365; GFX9:       ; %bb.0:
7366; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
7367; GFX9-NEXT:    v_mov_b32_e32 v2, 0
7368; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
7369; GFX9-NEXT:    s_lshr_b64 s[2:3], s[2:3], 12
7370; GFX9-NEXT:    v_mov_b32_e32 v0, s2
7371; GFX9-NEXT:    v_mov_b32_e32 v1, s3
7372; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
7373; GFX9-NEXT:    s_endpgm
7374  %r = udiv i64 %x, 4096
7375  store i64 %r, i64 addrspace(1)* %out
7376  ret void
7377}
7378
; Unsigned 64-bit division of %x by a variable denominator built as
; (4096 << %y); the quotient is stored to %out.
; The IR-level assertions below show the shl and the udiv left untouched by
; the opt run. The GFX6 and GFX9 assertions show llc folding the division into
; a shift: 12 is added to the loaded shift amount and %x is shifted right by
; the sum with s_lshr_b64.
7379define amdgpu_kernel void @udiv_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %x, i64 %y) {
7380; CHECK-LABEL: @udiv_i64_pow2_shl_denom(
7381; CHECK-NEXT:    [[SHL_Y:%.*]] = shl i64 4096, [[Y:%.*]]
7382; CHECK-NEXT:    [[R:%.*]] = udiv i64 [[X:%.*]], [[SHL_Y]]
7383; CHECK-NEXT:    store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4
7384; CHECK-NEXT:    ret void
7385;
7386; GFX6-LABEL: udiv_i64_pow2_shl_denom:
7387; GFX6:       ; %bb.0:
7388; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
7389; GFX6-NEXT:    s_load_dword s8, s[0:1], 0xd
7390; GFX6-NEXT:    s_mov_b32 s3, 0xf000
7391; GFX6-NEXT:    s_mov_b32 s2, -1
7392; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
7393; GFX6-NEXT:    s_mov_b32 s0, s4
7394; GFX6-NEXT:    s_add_i32 s8, s8, 12
7395; GFX6-NEXT:    s_mov_b32 s1, s5
7396; GFX6-NEXT:    s_lshr_b64 s[4:5], s[6:7], s8
7397; GFX6-NEXT:    v_mov_b32_e32 v0, s4
7398; GFX6-NEXT:    v_mov_b32_e32 v1, s5
7399; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
7400; GFX6-NEXT:    s_endpgm
7401;
7402; GFX9-LABEL: udiv_i64_pow2_shl_denom:
7403; GFX9:       ; %bb.0:
7404; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
7405; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x34
7406; GFX9-NEXT:    v_mov_b32_e32 v2, 0
7407; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
7408; GFX9-NEXT:    s_add_i32 s2, s2, 12
7409; GFX9-NEXT:    s_lshr_b64 s[0:1], s[6:7], s2
7410; GFX9-NEXT:    v_mov_b32_e32 v0, s0
7411; GFX9-NEXT:    v_mov_b32_e32 v1, s1
7412; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
7413; GFX9-NEXT:    s_endpgm
7414  %shl.y = shl i64 4096, %y
7415  %r = udiv i64 %x, %shl.y
7416  store i64 %r, i64 addrspace(1)* %out
7417  ret void
7418}
7419
; <2 x i64> udiv by the splat constant 4096.
; The opt assertions show the vector udiv scalarized into one i64 udiv by
; 4096 per element, reassembled with insertelement. The llc assertions for
; both targets show each element lowered to an s_lshr_b64 by 12 and the
; result written with a single dwordx4 store.
7420define amdgpu_kernel void @udiv_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) {
7421; CHECK-LABEL: @udiv_v2i64_pow2k_denom(
7422; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0
7423; CHECK-NEXT:    [[TMP2:%.*]] = udiv i64 [[TMP1]], 4096
7424; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i64 0
7425; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1
7426; CHECK-NEXT:    [[TMP5:%.*]] = udiv i64 [[TMP4]], 4096
7427; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1
7428; CHECK-NEXT:    store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16
7429; CHECK-NEXT:    ret void
7430;
7431; GFX6-LABEL: udiv_v2i64_pow2k_denom:
7432; GFX6:       ; %bb.0:
7433; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
7434; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xd
7435; GFX6-NEXT:    s_mov_b32 s7, 0xf000
7436; GFX6-NEXT:    s_mov_b32 s6, -1
7437; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
7438; GFX6-NEXT:    s_lshr_b64 s[0:1], s[0:1], 12
7439; GFX6-NEXT:    s_lshr_b64 s[2:3], s[2:3], 12
7440; GFX6-NEXT:    v_mov_b32_e32 v0, s0
7441; GFX6-NEXT:    v_mov_b32_e32 v1, s1
7442; GFX6-NEXT:    v_mov_b32_e32 v2, s2
7443; GFX6-NEXT:    v_mov_b32_e32 v3, s3
7444; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
7445; GFX6-NEXT:    s_endpgm
7446;
7447; GFX9-LABEL: udiv_v2i64_pow2k_denom:
7448; GFX9:       ; %bb.0:
7449; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
7450; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
7451; GFX9-NEXT:    v_mov_b32_e32 v4, 0
7452; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
7453; GFX9-NEXT:    s_lshr_b64 s[0:1], s[4:5], 12
7454; GFX9-NEXT:    s_lshr_b64 s[4:5], s[6:7], 12
7455; GFX9-NEXT:    v_mov_b32_e32 v0, s0
7456; GFX9-NEXT:    v_mov_b32_e32 v1, s1
7457; GFX9-NEXT:    v_mov_b32_e32 v2, s4
7458; GFX9-NEXT:    v_mov_b32_e32 v3, s5
7459; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
7460; GFX9-NEXT:    s_endpgm
; Input IR under test; the assertions above are autogenerated from it.
7461  %r = udiv <2 x i64> %x, <i64 4096, i64 4096>
7462  store <2 x i64> %r, <2 x i64> addrspace(1)* %out
7463  ret void
7464}
7465
; <2 x i64> udiv by the constant vector <4096, 4095>: element 0 has a
; power-of-two divisor, element 1 does not.
; The opt assertions show the vector udiv scalarized into two i64 udivs that
; keep their respective constants. In the llc assertions for both targets,
; element 0 is a single s_lshr_b64 by 12, while element 1 is expanded into a
; long VALU sequence that starts from v_rcp_f32, refines through
; v_mul_hi_u32 / v_mul_lo_u32 and add-with-carry chains, and ends with
; compare/select steps against 0xfff (4095) and 0xffe.
; NOTE(review): that sequence is presumably the backend's generic 64-bit
; unsigned division expansion - confirm against the AMDGPU lowering code.
7466define amdgpu_kernel void @udiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) {
7467; CHECK-LABEL: @udiv_v2i64_mixed_pow2k_denom(
7468; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0
7469; CHECK-NEXT:    [[TMP2:%.*]] = udiv i64 [[TMP1]], 4096
7470; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i64 0
7471; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1
7472; CHECK-NEXT:    [[TMP5:%.*]] = udiv i64 [[TMP4]], 4095
7473; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1
7474; CHECK-NEXT:    store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16
7475; CHECK-NEXT:    ret void
7476;
7477; GFX6-LABEL: udiv_v2i64_mixed_pow2k_denom:
7478; GFX6:       ; %bb.0:
7479; GFX6-NEXT:    v_mov_b32_e32 v0, 0x4f800000
7480; GFX6-NEXT:    v_madak_f32 v0, 0, v0, 0x457ff000
7481; GFX6-NEXT:    v_rcp_f32_e32 v0, v0
7482; GFX6-NEXT:    s_movk_i32 s2, 0xf001
7483; GFX6-NEXT:    v_mov_b32_e32 v8, 0
7484; GFX6-NEXT:    v_mov_b32_e32 v7, 0
7485; GFX6-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
7486; GFX6-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
7487; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
7488; GFX6-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
7489; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
7490; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
7491; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
7492; GFX6-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0xd
7493; GFX6-NEXT:    s_mov_b32 s7, 0xf000
7494; GFX6-NEXT:    v_mul_hi_u32 v2, v0, s2
7495; GFX6-NEXT:    v_mul_lo_u32 v3, v1, s2
7496; GFX6-NEXT:    v_mul_lo_u32 v4, v0, s2
7497; GFX6-NEXT:    s_mov_b32 s6, -1
7498; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, v0, v2
7499; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
7500; GFX6-NEXT:    v_mul_lo_u32 v5, v0, v2
7501; GFX6-NEXT:    v_mul_hi_u32 v6, v0, v4
7502; GFX6-NEXT:    v_mul_hi_u32 v3, v0, v2
7503; GFX6-NEXT:    v_mul_hi_u32 v9, v1, v2
7504; GFX6-NEXT:    v_mul_lo_u32 v2, v1, v2
7505; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
7506; GFX6-NEXT:    v_mul_lo_u32 v6, v1, v4
7507; GFX6-NEXT:    v_mul_hi_u32 v4, v1, v4
7508; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v8, v3, vcc
7509; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
7510; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v3, v4, vcc
7511; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, v9, v7, vcc
7512; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
7513; GFX6-NEXT:    v_add_i32_e64 v0, s[0:1], v0, v2
7514; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v8, v4, vcc
7515; GFX6-NEXT:    v_mul_hi_u32 v4, v0, s2
7516; GFX6-NEXT:    v_addc_u32_e64 v2, vcc, v1, v3, s[0:1]
7517; GFX6-NEXT:    v_mul_lo_u32 v5, v2, s2
7518; GFX6-NEXT:    v_mul_lo_u32 v6, v0, s2
7519; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, v0, v4
7520; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
7521; GFX6-NEXT:    s_lshr_b64 s[2:3], s[8:9], 12
7522; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
7523; GFX6-NEXT:    v_mul_lo_u32 v5, v0, v4
7524; GFX6-NEXT:    v_mul_hi_u32 v9, v0, v6
7525; GFX6-NEXT:    v_mul_hi_u32 v10, v0, v4
7526; GFX6-NEXT:    v_mul_hi_u32 v11, v2, v4
7527; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v9, v5
7528; GFX6-NEXT:    v_addc_u32_e32 v9, vcc, v8, v10, vcc
7529; GFX6-NEXT:    v_mul_lo_u32 v10, v2, v6
7530; GFX6-NEXT:    v_mul_hi_u32 v6, v2, v6
7531; GFX6-NEXT:    v_mul_lo_u32 v2, v2, v4
7532; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v5, v10
7533; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v9, v6, vcc
7534; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, v11, v7, vcc
7535; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
7536; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, v8, v4, vcc
7537; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
7538; GFX6-NEXT:    v_addc_u32_e64 v1, vcc, v1, v4, s[0:1]
7539; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
7540; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
7541; GFX6-NEXT:    v_mul_lo_u32 v2, s10, v1
7542; GFX6-NEXT:    v_mul_hi_u32 v3, s10, v0
7543; GFX6-NEXT:    v_mul_hi_u32 v4, s10, v1
7544; GFX6-NEXT:    v_mul_hi_u32 v5, s11, v1
7545; GFX6-NEXT:    v_mul_lo_u32 v1, s11, v1
7546; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
7547; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v8, v4, vcc
7548; GFX6-NEXT:    v_mul_lo_u32 v4, s11, v0
7549; GFX6-NEXT:    v_mul_hi_u32 v0, s11, v0
7550; GFX6-NEXT:    s_movk_i32 s0, 0xfff
7551; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
7552; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, v3, v0, vcc
7553; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, v5, v7, vcc
7554; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
7555; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v8, v2, vcc
7556; GFX6-NEXT:    v_mul_lo_u32 v4, v1, s0
7557; GFX6-NEXT:    v_mul_hi_u32 v5, v0, s0
7558; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 2, v0
7559; GFX6-NEXT:    v_mul_lo_u32 v8, v0, s0
7560; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
7561; GFX6-NEXT:    v_add_i32_e32 v6, vcc, 1, v0
7562; GFX6-NEXT:    v_addc_u32_e32 v7, vcc, 0, v1, vcc
7563; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
7564; GFX6-NEXT:    v_mov_b32_e32 v5, s11
7565; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, s10, v8
7566; GFX6-NEXT:    v_subb_u32_e32 v4, vcc, v5, v4, vcc
7567; GFX6-NEXT:    v_subrev_i32_e32 v5, vcc, s0, v8
7568; GFX6-NEXT:    v_subbrev_u32_e32 v9, vcc, 0, v4, vcc
7569; GFX6-NEXT:    s_movk_i32 s0, 0xffe
7570; GFX6-NEXT:    v_cmp_lt_u32_e32 vcc, s0, v5
7571; GFX6-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
7572; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v9
7573; GFX6-NEXT:    v_cndmask_b32_e32 v5, -1, v5, vcc
7574; GFX6-NEXT:    v_cmp_lt_u32_e64 s[0:1], s0, v8
7575; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
7576; GFX6-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[0:1]
7577; GFX6-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v4
7578; GFX6-NEXT:    v_cndmask_b32_e64 v4, -1, v5, s[0:1]
7579; GFX6-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
7580; GFX6-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v4
7581; GFX6-NEXT:    v_cndmask_b32_e64 v3, v1, v3, s[0:1]
7582; GFX6-NEXT:    v_cndmask_b32_e32 v1, v6, v2, vcc
7583; GFX6-NEXT:    v_cndmask_b32_e64 v2, v0, v1, s[0:1]
7584; GFX6-NEXT:    v_mov_b32_e32 v0, s2
7585; GFX6-NEXT:    v_mov_b32_e32 v1, s3
7586; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
7587; GFX6-NEXT:    s_endpgm
7588;
7589; GFX9-LABEL: udiv_v2i64_mixed_pow2k_denom:
7590; GFX9:       ; %bb.0:
7591; GFX9-NEXT:    v_mov_b32_e32 v0, 0x4f800000
7592; GFX9-NEXT:    v_madak_f32 v0, 0, v0, 0x457ff000
7593; GFX9-NEXT:    v_rcp_f32_e32 v0, v0
7594; GFX9-NEXT:    s_movk_i32 s4, 0xf001
7595; GFX9-NEXT:    v_mov_b32_e32 v7, 0
7596; GFX9-NEXT:    v_mov_b32_e32 v5, 0
7597; GFX9-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
7598; GFX9-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
7599; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
7600; GFX9-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
7601; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
7602; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
7603; GFX9-NEXT:    v_mul_hi_u32 v2, v0, s4
7604; GFX9-NEXT:    v_mul_lo_u32 v4, v1, s4
7605; GFX9-NEXT:    v_mul_lo_u32 v3, v0, s4
7606; GFX9-NEXT:    v_sub_u32_e32 v2, v2, v0
7607; GFX9-NEXT:    v_add_u32_e32 v2, v2, v4
7608; GFX9-NEXT:    v_mul_hi_u32 v6, v0, v3
7609; GFX9-NEXT:    v_mul_lo_u32 v4, v0, v2
7610; GFX9-NEXT:    v_mul_hi_u32 v8, v0, v2
7611; GFX9-NEXT:    v_mul_hi_u32 v9, v1, v2
7612; GFX9-NEXT:    v_mul_lo_u32 v2, v1, v2
7613; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v6, v4
7614; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, v7, v8, vcc
7615; GFX9-NEXT:    v_mul_lo_u32 v8, v1, v3
7616; GFX9-NEXT:    v_mul_hi_u32 v3, v1, v3
7617; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v8
7618; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v6, v3, vcc
7619; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v9, v5, vcc
7620; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
7621; GFX9-NEXT:    v_add_co_u32_e64 v0, s[2:3], v0, v2
7622; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v7, v4, vcc
7623; GFX9-NEXT:    v_addc_co_u32_e64 v2, vcc, v1, v3, s[2:3]
7624; GFX9-NEXT:    v_mul_hi_u32 v4, v0, s4
7625; GFX9-NEXT:    v_mul_lo_u32 v6, v2, s4
7626; GFX9-NEXT:    v_mul_lo_u32 v8, v0, s4
7627; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x24
7628; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
7629; GFX9-NEXT:    v_sub_u32_e32 v4, v4, v0
7630; GFX9-NEXT:    v_add_u32_e32 v4, v4, v6
7631; GFX9-NEXT:    v_mul_lo_u32 v6, v0, v4
7632; GFX9-NEXT:    v_mul_hi_u32 v9, v0, v8
7633; GFX9-NEXT:    v_mul_hi_u32 v10, v0, v4
7634; GFX9-NEXT:    v_mul_hi_u32 v11, v2, v4
7635; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
7636; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, v9, v6
7637; GFX9-NEXT:    v_addc_co_u32_e32 v9, vcc, v7, v10, vcc
7638; GFX9-NEXT:    v_mul_lo_u32 v10, v2, v8
7639; GFX9-NEXT:    v_mul_hi_u32 v8, v2, v8
7640; GFX9-NEXT:    v_mul_lo_u32 v2, v2, v4
7641; GFX9-NEXT:    s_movk_i32 s0, 0xfff
7642; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, v6, v10
7643; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, v9, v8, vcc
7644; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v11, v5, vcc
7645; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v6, v2
7646; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v7, v4, vcc
7647; GFX9-NEXT:    v_addc_co_u32_e64 v1, vcc, v1, v4, s[2:3]
7648; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
7649; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
7650; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
7651; GFX9-NEXT:    v_mul_lo_u32 v2, s6, v1
7652; GFX9-NEXT:    v_mul_hi_u32 v3, s6, v0
7653; GFX9-NEXT:    v_mul_hi_u32 v4, s6, v1
7654; GFX9-NEXT:    v_mul_hi_u32 v6, s7, v1
7655; GFX9-NEXT:    v_mul_lo_u32 v1, s7, v1
7656; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
7657; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v7, v4, vcc
7658; GFX9-NEXT:    v_mul_lo_u32 v4, s7, v0
7659; GFX9-NEXT:    v_mul_hi_u32 v0, s7, v0
7660; GFX9-NEXT:    s_lshr_b64 s[2:3], s[4:5], 12
7661; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
7662; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, v3, v0, vcc
7663; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, v6, v5, vcc
7664; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v1
7665; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v7, v2, vcc
7666; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, 2, v0
7667; GFX9-NEXT:    v_mul_lo_u32 v4, v1, s0
7668; GFX9-NEXT:    v_mul_hi_u32 v6, v0, s0
7669; GFX9-NEXT:    v_mul_lo_u32 v9, v0, s0
7670; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
7671; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, 1, v0
7672; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, 0, v1, vcc
7673; GFX9-NEXT:    v_add_u32_e32 v4, v6, v4
7674; GFX9-NEXT:    v_mov_b32_e32 v6, s7
7675; GFX9-NEXT:    v_sub_co_u32_e32 v9, vcc, s6, v9
7676; GFX9-NEXT:    v_subb_co_u32_e32 v4, vcc, v6, v4, vcc
7677; GFX9-NEXT:    v_subrev_co_u32_e32 v6, vcc, s0, v9
7678; GFX9-NEXT:    v_subbrev_co_u32_e32 v10, vcc, 0, v4, vcc
7679; GFX9-NEXT:    s_movk_i32 s0, 0xffe
7680; GFX9-NEXT:    v_cmp_lt_u32_e32 vcc, s0, v6
7681; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
7682; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v10
7683; GFX9-NEXT:    v_cndmask_b32_e32 v6, -1, v6, vcc
7684; GFX9-NEXT:    v_cmp_lt_u32_e64 s[0:1], s0, v9
7685; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
7686; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[0:1]
7687; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v4
7688; GFX9-NEXT:    v_cndmask_b32_e64 v4, -1, v6, s[0:1]
7689; GFX9-NEXT:    v_cndmask_b32_e32 v3, v8, v3, vcc
7690; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v4
7691; GFX9-NEXT:    v_cndmask_b32_e64 v3, v1, v3, s[0:1]
7692; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v2, vcc
7693; GFX9-NEXT:    v_cndmask_b32_e64 v2, v0, v1, s[0:1]
7694; GFX9-NEXT:    v_mov_b32_e32 v0, s2
7695; GFX9-NEXT:    v_mov_b32_e32 v1, s3
7696; GFX9-NEXT:    global_store_dwordx4 v5, v[0:3], s[8:9]
7697; GFX9-NEXT:    s_endpgm
; Input IR under test; the assertions above are autogenerated from it.
7698  %r = udiv <2 x i64> %x, <i64 4096, i64 4095>
7699  store <2 x i64> %r, <2 x i64> addrspace(1)* %out
7700  ret void
7701}
7702
; <2 x i64> udiv whose divisor is a per-element variable power of two
; (<4096, 4096> << %y).
; The opt assertions show the vector shl kept intact and the udiv scalarized
; into one i64 udiv per element. The llc assertions for both targets show
; each element lowered to an s_add_i32 of 12 onto the shift amount followed
; by an s_lshr_b64, with a single dwordx4 store of the result.
7703define amdgpu_kernel void @udiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x, <2 x i64> %y) {
7704; CHECK-LABEL: @udiv_v2i64_pow2_shl_denom(
7705; CHECK-NEXT:    [[SHL_Y:%.*]] = shl <2 x i64> <i64 4096, i64 4096>, [[Y:%.*]]
7706; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0
7707; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 0
7708; CHECK-NEXT:    [[TMP3:%.*]] = udiv i64 [[TMP1]], [[TMP2]]
7709; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x i64> undef, i64 [[TMP3]], i64 0
7710; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[X]], i64 1
7711; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 1
7712; CHECK-NEXT:    [[TMP7:%.*]] = udiv i64 [[TMP5]], [[TMP6]]
7713; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[TMP7]], i64 1
7714; CHECK-NEXT:    store <2 x i64> [[TMP8]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16
7715; CHECK-NEXT:    ret void
7716;
7717; GFX6-LABEL: udiv_v2i64_pow2_shl_denom:
7718; GFX6:       ; %bb.0:
7719; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
7720; GFX6-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0xd
7721; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x11
7722; GFX6-NEXT:    s_mov_b32 s7, 0xf000
7723; GFX6-NEXT:    s_mov_b32 s6, -1
7724; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
7725; GFX6-NEXT:    s_add_i32 s0, s0, 12
7726; GFX6-NEXT:    s_add_i32 s2, s2, 12
7727; GFX6-NEXT:    s_lshr_b64 s[0:1], s[8:9], s0
7728; GFX6-NEXT:    s_lshr_b64 s[2:3], s[10:11], s2
7729; GFX6-NEXT:    v_mov_b32_e32 v0, s0
7730; GFX6-NEXT:    v_mov_b32_e32 v1, s1
7731; GFX6-NEXT:    v_mov_b32_e32 v2, s2
7732; GFX6-NEXT:    v_mov_b32_e32 v3, s3
7733; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
7734; GFX6-NEXT:    s_endpgm
7735;
7736; GFX9-LABEL: udiv_v2i64_pow2_shl_denom:
7737; GFX9:       ; %bb.0:
7738; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
7739; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
7740; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x44
7741; GFX9-NEXT:    v_mov_b32_e32 v4, 0
7742; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
7743; GFX9-NEXT:    s_add_i32 s0, s8, 12
7744; GFX9-NEXT:    s_add_i32 s8, s10, 12
7745; GFX9-NEXT:    s_lshr_b64 s[0:1], s[4:5], s0
7746; GFX9-NEXT:    s_lshr_b64 s[4:5], s[6:7], s8
7747; GFX9-NEXT:    v_mov_b32_e32 v0, s0
7748; GFX9-NEXT:    v_mov_b32_e32 v1, s1
7749; GFX9-NEXT:    v_mov_b32_e32 v2, s4
7750; GFX9-NEXT:    v_mov_b32_e32 v3, s5
7751; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
7752; GFX9-NEXT:    s_endpgm
; Input IR under test; the assertions above are autogenerated from it.
7753  %shl.y = shl <2 x i64> <i64 4096, i64 4096>, %y
7754  %r = udiv <2 x i64> %x, %shl.y
7755  store <2 x i64> %r, <2 x i64> addrspace(1)* %out
7756  ret void
7757}
7758
; Scalar i64 urem by the odd, non-power-of-two constant 1235195393993.
; That constant is 0x11f in its high dword and 0x9761f7c9 in its low dword,
; which are the immediates materialized by the s_movk_i32 / s_mov_b32
; instructions in the expected output.
; The opt assertions show the urem passing through -amdgpu-codegenprepare
; unchanged. The llc assertions for both targets show a long VALU expansion
; that starts from v_rcp_f32, refines through v_mul_hi_u32 / v_mul_lo_u32
; and add-with-carry chains, multiplies the result by the divisor dwords,
; subtracts from %x, and ends with compare/select steps.
; NOTE(review): that sequence is presumably the backend's generic 64-bit
; unsigned remainder expansion - confirm against the AMDGPU lowering code.
7759define amdgpu_kernel void @urem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
7760; CHECK-LABEL: @urem_i64_oddk_denom(
7761; CHECK-NEXT:    [[R:%.*]] = urem i64 [[X:%.*]], 1235195393993
7762; CHECK-NEXT:    store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4
7763; CHECK-NEXT:    ret void
7764;
7765; GFX6-LABEL: urem_i64_oddk_denom:
7766; GFX6:       ; %bb.0:
7767; GFX6-NEXT:    v_mov_b32_e32 v0, 0x4f1761f8
7768; GFX6-NEXT:    v_mov_b32_e32 v1, 0x4f800000
7769; GFX6-NEXT:    v_madmk_f32 v0, v1, 0x438f8000, v0
7770; GFX6-NEXT:    v_rcp_f32_e32 v0, v0
7771; GFX6-NEXT:    s_movk_i32 s2, 0xfee0
7772; GFX6-NEXT:    s_mov_b32 s3, 0x689e0837
7773; GFX6-NEXT:    v_mov_b32_e32 v8, 0
7774; GFX6-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
7775; GFX6-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
7776; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
7777; GFX6-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
7778; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
7779; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
7780; GFX6-NEXT:    v_mov_b32_e32 v7, 0
7781; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
7782; GFX6-NEXT:    v_mul_lo_u32 v2, v0, s2
7783; GFX6-NEXT:    v_mul_hi_u32 v3, v0, s3
7784; GFX6-NEXT:    v_mul_lo_u32 v4, v1, s3
7785; GFX6-NEXT:    s_mov_b32 s12, 0x9761f7c9
7786; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
7787; GFX6-NEXT:    s_mov_b32 s8, s4
7788; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
7789; GFX6-NEXT:    v_mul_lo_u32 v3, v0, s3
7790; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
7791; GFX6-NEXT:    v_mul_lo_u32 v5, v0, v2
7792; GFX6-NEXT:    v_mul_hi_u32 v4, v0, v2
7793; GFX6-NEXT:    v_mul_hi_u32 v6, v0, v3
7794; GFX6-NEXT:    v_mul_hi_u32 v9, v1, v2
7795; GFX6-NEXT:    v_mul_lo_u32 v2, v1, v2
7796; GFX6-NEXT:    s_movk_i32 s4, 0x11f
7797; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
7798; GFX6-NEXT:    v_mul_lo_u32 v6, v1, v3
7799; GFX6-NEXT:    v_mul_hi_u32 v3, v1, v3
7800; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, v8, v4, vcc
7801; GFX6-NEXT:    s_mov_b32 s9, s5
7802; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
7803; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v4, v3, vcc
7804; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, v9, v7, vcc
7805; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
7806; GFX6-NEXT:    v_add_i32_e64 v0, s[0:1], v0, v2
7807; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v8, v4, vcc
7808; GFX6-NEXT:    v_mul_lo_u32 v4, v0, s2
7809; GFX6-NEXT:    v_mul_hi_u32 v5, v0, s3
7810; GFX6-NEXT:    v_addc_u32_e64 v2, vcc, v1, v3, s[0:1]
7811; GFX6-NEXT:    v_mul_lo_u32 v6, v2, s3
7812; GFX6-NEXT:    s_movk_i32 s5, 0x11e
7813; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
7814; GFX6-NEXT:    v_mul_lo_u32 v5, v0, s3
7815; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
7816; GFX6-NEXT:    v_mul_lo_u32 v6, v0, v4
7817; GFX6-NEXT:    v_mul_hi_u32 v10, v0, v4
7818; GFX6-NEXT:    v_mul_hi_u32 v9, v0, v5
7819; GFX6-NEXT:    v_mul_hi_u32 v11, v2, v4
7820; GFX6-NEXT:    s_mov_b32 s11, 0xf000
7821; GFX6-NEXT:    s_mov_b32 s10, -1
7822; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v9, v6
7823; GFX6-NEXT:    v_addc_u32_e32 v9, vcc, v8, v10, vcc
7824; GFX6-NEXT:    v_mul_lo_u32 v10, v2, v5
7825; GFX6-NEXT:    v_mul_hi_u32 v5, v2, v5
7826; GFX6-NEXT:    v_mul_lo_u32 v2, v2, v4
7827; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v6, v10
7828; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v9, v5, vcc
7829; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, v11, v7, vcc
7830; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
7831; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, v8, v4, vcc
7832; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
7833; GFX6-NEXT:    v_addc_u32_e64 v1, vcc, v1, v4, s[0:1]
7834; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
7835; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
7836; GFX6-NEXT:    v_mul_lo_u32 v2, s6, v1
7837; GFX6-NEXT:    v_mul_hi_u32 v3, s6, v0
7838; GFX6-NEXT:    v_mul_hi_u32 v4, s6, v1
7839; GFX6-NEXT:    v_mul_hi_u32 v5, s7, v1
7840; GFX6-NEXT:    v_mul_lo_u32 v1, s7, v1
7841; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
7842; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v8, v4, vcc
7843; GFX6-NEXT:    v_mul_lo_u32 v4, s7, v0
7844; GFX6-NEXT:    v_mul_hi_u32 v0, s7, v0
7845; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
7846; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, v3, v0, vcc
7847; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, v5, v7, vcc
7848; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
7849; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v8, v2, vcc
7850; GFX6-NEXT:    v_mul_lo_u32 v2, v0, s4
7851; GFX6-NEXT:    v_mul_hi_u32 v3, v0, s12
7852; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s12
7853; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s12
7854; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
7855; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
7856; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s7, v1
7857; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s6, v0
7858; GFX6-NEXT:    v_mov_b32_e32 v3, s4
7859; GFX6-NEXT:    v_subb_u32_e64 v2, s[0:1], v2, v3, vcc
7860; GFX6-NEXT:    v_subrev_i32_e64 v4, s[0:1], s12, v0
7861; GFX6-NEXT:    v_subbrev_u32_e64 v5, s[2:3], 0, v2, s[0:1]
7862; GFX6-NEXT:    v_cmp_lt_u32_e64 s[2:3], s5, v5
7863; GFX6-NEXT:    s_mov_b32 s6, 0x9761f7c8
7864; GFX6-NEXT:    v_subb_u32_e64 v2, s[0:1], v2, v3, s[0:1]
7865; GFX6-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[2:3]
7866; GFX6-NEXT:    v_cmp_lt_u32_e64 s[2:3], s6, v4
7867; GFX6-NEXT:    v_subrev_i32_e64 v3, s[0:1], s12, v4
7868; GFX6-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[2:3]
7869; GFX6-NEXT:    v_cmp_eq_u32_e64 s[2:3], s4, v5
7870; GFX6-NEXT:    v_cndmask_b32_e64 v6, v6, v7, s[2:3]
7871; GFX6-NEXT:    v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1]
7872; GFX6-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v6
7873; GFX6-NEXT:    v_cndmask_b32_e64 v2, v5, v2, s[0:1]
7874; GFX6-NEXT:    v_mov_b32_e32 v5, s7
7875; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v5, v1, vcc
7876; GFX6-NEXT:    v_cmp_lt_u32_e32 vcc, s5, v1
7877; GFX6-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
7878; GFX6-NEXT:    v_cmp_lt_u32_e32 vcc, s6, v0
7879; GFX6-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
7880; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, s4, v1
7881; GFX6-NEXT:    v_cndmask_b32_e32 v5, v5, v6, vcc
7882; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
7883; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
7884; GFX6-NEXT:    v_cndmask_b32_e64 v2, v4, v3, s[0:1]
7885; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
7886; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
7887; GFX6-NEXT:    s_endpgm
7888;
7889; GFX9-LABEL: urem_i64_oddk_denom:
7890; GFX9:       ; %bb.0:
7891; GFX9-NEXT:    v_mov_b32_e32 v0, 0x4f1761f8
7892; GFX9-NEXT:    v_mov_b32_e32 v1, 0x4f800000
7893; GFX9-NEXT:    v_madmk_f32 v0, v1, 0x438f8000, v0
7894; GFX9-NEXT:    v_rcp_f32_e32 v0, v0
7895; GFX9-NEXT:    s_movk_i32 s4, 0xfee0
7896; GFX9-NEXT:    s_mov_b32 s5, 0x689e0837
7897; GFX9-NEXT:    v_mov_b32_e32 v8, 0
7898; GFX9-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
7899; GFX9-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
7900; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
7901; GFX9-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
7902; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
7903; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
7904; GFX9-NEXT:    v_mov_b32_e32 v5, 0
7905; GFX9-NEXT:    s_movk_i32 s8, 0x11f
7906; GFX9-NEXT:    v_mul_lo_u32 v2, v0, s4
7907; GFX9-NEXT:    v_mul_hi_u32 v3, v0, s5
7908; GFX9-NEXT:    v_mul_lo_u32 v4, v1, s5
7909; GFX9-NEXT:    v_mul_lo_u32 v6, v0, s5
7910; GFX9-NEXT:    s_mov_b32 s9, 0x9761f7c9
7911; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
7912; GFX9-NEXT:    v_add_u32_e32 v2, v2, v4
7913; GFX9-NEXT:    v_mul_lo_u32 v3, v0, v2
7914; GFX9-NEXT:    v_mul_hi_u32 v4, v0, v6
7915; GFX9-NEXT:    v_mul_hi_u32 v7, v0, v2
7916; GFX9-NEXT:    v_mul_hi_u32 v9, v1, v2
7917; GFX9-NEXT:    v_mul_lo_u32 v2, v1, v2
7918; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v4, v3
7919; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v8, v7, vcc
7920; GFX9-NEXT:    v_mul_lo_u32 v7, v1, v6
7921; GFX9-NEXT:    v_mul_hi_u32 v6, v1, v6
7922; GFX9-NEXT:    s_mov_b32 s10, 0x9761f7c8
7923; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v7
7924; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v4, v6, vcc
7925; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v9, v5, vcc
7926; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
7927; GFX9-NEXT:    v_add_co_u32_e64 v0, s[2:3], v0, v2
7928; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v8, v4, vcc
7929; GFX9-NEXT:    v_addc_co_u32_e64 v2, vcc, v1, v3, s[2:3]
7930; GFX9-NEXT:    v_mul_lo_u32 v4, v0, s4
7931; GFX9-NEXT:    v_mul_hi_u32 v6, v0, s5
7932; GFX9-NEXT:    v_mul_lo_u32 v7, v2, s5
7933; GFX9-NEXT:    v_mul_lo_u32 v9, v0, s5
7934; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
7935; GFX9-NEXT:    v_add_u32_e32 v4, v6, v4
7936; GFX9-NEXT:    v_add_u32_e32 v4, v4, v7
7937; GFX9-NEXT:    v_mul_lo_u32 v6, v0, v4
7938; GFX9-NEXT:    v_mul_hi_u32 v7, v0, v9
7939; GFX9-NEXT:    v_mul_hi_u32 v10, v0, v4
7940; GFX9-NEXT:    v_mul_hi_u32 v11, v2, v4
7941; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
7942; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, v7, v6
7943; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, v8, v10, vcc
7944; GFX9-NEXT:    v_mul_lo_u32 v10, v2, v9
7945; GFX9-NEXT:    v_mul_hi_u32 v9, v2, v9
7946; GFX9-NEXT:    v_mul_lo_u32 v2, v2, v4
7947; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, v6, v10
7948; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, v7, v9, vcc
7949; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v11, v5, vcc
7950; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v6, v2
7951; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v8, v4, vcc
7952; GFX9-NEXT:    v_addc_co_u32_e64 v1, vcc, v1, v4, s[2:3]
7953; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
7954; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
7955; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
7956; GFX9-NEXT:    v_mul_lo_u32 v2, s6, v1
7957; GFX9-NEXT:    v_mul_hi_u32 v3, s6, v0
7958; GFX9-NEXT:    v_mul_hi_u32 v4, s6, v1
7959; GFX9-NEXT:    v_mul_hi_u32 v6, s7, v1
7960; GFX9-NEXT:    v_mul_lo_u32 v1, s7, v1
7961; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
7962; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v8, v4, vcc
7963; GFX9-NEXT:    v_mul_lo_u32 v4, s7, v0
7964; GFX9-NEXT:    v_mul_hi_u32 v0, s7, v0
7965; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
7966; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, v3, v0, vcc
7967; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, v6, v5, vcc
7968; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v1
7969; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v8, v2, vcc
7970; GFX9-NEXT:    v_mul_lo_u32 v2, v0, s8
7971; GFX9-NEXT:    v_mul_hi_u32 v3, v0, s9
7972; GFX9-NEXT:    v_mul_lo_u32 v1, v1, s9
7973; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s9
7974; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
7975; GFX9-NEXT:    v_add_u32_e32 v1, v2, v1
7976; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s6, v0
7977; GFX9-NEXT:    v_sub_u32_e32 v2, s7, v1
7978; GFX9-NEXT:    v_mov_b32_e32 v3, s8
7979; GFX9-NEXT:    v_subb_co_u32_e64 v2, s[0:1], v2, v3, vcc
7980; GFX9-NEXT:    v_subrev_co_u32_e64 v4, s[0:1], s9, v0
7981; GFX9-NEXT:    v_subbrev_co_u32_e64 v6, s[2:3], 0, v2, s[0:1]
7982; GFX9-NEXT:    s_movk_i32 s6, 0x11e
7983; GFX9-NEXT:    v_cmp_lt_u32_e64 s[2:3], s6, v6
7984; GFX9-NEXT:    v_subb_co_u32_e64 v2, s[0:1], v2, v3, s[0:1]
7985; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[2:3]
7986; GFX9-NEXT:    v_cmp_lt_u32_e64 s[2:3], s10, v4
7987; GFX9-NEXT:    v_subrev_co_u32_e64 v3, s[0:1], s9, v4
7988; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[2:3]
7989; GFX9-NEXT:    v_cmp_eq_u32_e64 s[2:3], s8, v6
7990; GFX9-NEXT:    v_cndmask_b32_e64 v7, v7, v8, s[2:3]
7991; GFX9-NEXT:    v_subbrev_co_u32_e64 v2, s[0:1], 0, v2, s[0:1]
7992; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v7
7993; GFX9-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s[0:1]
7994; GFX9-NEXT:    v_mov_b32_e32 v6, s7
7995; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v6, v1, vcc
7996; GFX9-NEXT:    v_cmp_lt_u32_e32 vcc, s6, v1
7997; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
7998; GFX9-NEXT:    v_cmp_lt_u32_e32 vcc, s10, v0
7999; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
8000; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, s8, v1
8001; GFX9-NEXT:    v_cndmask_b32_e32 v6, v6, v7, vcc
8002; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
8003; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
8004; GFX9-NEXT:    v_cndmask_b32_e64 v2, v4, v3, s[0:1]
8005; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
8006; GFX9-NEXT:    global_store_dwordx2 v5, v[0:1], s[4:5]
8007; GFX9-NEXT:    s_endpgm
; Input IR under test; the assertions above are autogenerated from it.
8008  %r = urem i64 %x, 1235195393993
8009  store i64 %r, i64 addrspace(1)* %out
8010  ret void
8011}
8012
; Scalar i64 urem by the power-of-two constant 4096.
; The opt assertions show the urem passing through -amdgpu-codegenprepare
; unchanged. The llc assertions for both targets show the remainder lowered
; to a single s_and_b32 of one loaded dword with 0xfff (4096 - 1); the other
; dword of the stored 64-bit result is the constant 0 held in v1.
8013define amdgpu_kernel void @urem_i64_pow2k_denom(i64 addrspace(1)* %out, i64 %x) {
8014; CHECK-LABEL: @urem_i64_pow2k_denom(
8015; CHECK-NEXT:    [[R:%.*]] = urem i64 [[X:%.*]], 4096
8016; CHECK-NEXT:    store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4
8017; CHECK-NEXT:    ret void
8018;
8019; GFX6-LABEL: urem_i64_pow2k_denom:
8020; GFX6:       ; %bb.0:
8021; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
8022; GFX6-NEXT:    s_mov_b32 s3, 0xf000
8023; GFX6-NEXT:    s_mov_b32 s2, -1
8024; GFX6-NEXT:    v_mov_b32_e32 v1, 0
8025; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
8026; GFX6-NEXT:    s_mov_b32 s0, s4
8027; GFX6-NEXT:    s_and_b32 s4, s6, 0xfff
8028; GFX6-NEXT:    s_mov_b32 s1, s5
8029; GFX6-NEXT:    v_mov_b32_e32 v0, s4
8030; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
8031; GFX6-NEXT:    s_endpgm
8032;
8033; GFX9-LABEL: urem_i64_pow2k_denom:
8034; GFX9:       ; %bb.0:
8035; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
8036; GFX9-NEXT:    v_mov_b32_e32 v1, 0
8037; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
8038; GFX9-NEXT:    s_and_b32 s2, s2, 0xfff
8039; GFX9-NEXT:    v_mov_b32_e32 v0, s2
8040; GFX9-NEXT:    global_store_dwordx2 v1, v[0:1], s[0:1]
8041; GFX9-NEXT:    s_endpgm
; Input IR under test; the assertions above are autogenerated from it.
8042  %r = urem i64 %x, 4096
8043  store i64 %r, i64 addrspace(1)* %out
8044  ret void
8045}
8046
; Scalar i64 urem whose divisor is a variable power of two (4096 << %y).
; The opt assertions show the urem passing through -amdgpu-codegenprepare
; unchanged. The llc assertions for both targets show the remainder lowered
; to a mask: 0x1000 is shifted left by a loaded dword of %y (s_lshl_b64),
; decremented as a 64-bit value (s_add_u32 / s_addc_u32 with -1), and then
; ANDed with the loaded %x (s_and_b64).
8047define amdgpu_kernel void @urem_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %x, i64 %y) {
8048; CHECK-LABEL: @urem_i64_pow2_shl_denom(
8049; CHECK-NEXT:    [[SHL_Y:%.*]] = shl i64 4096, [[Y:%.*]]
8050; CHECK-NEXT:    [[R:%.*]] = urem i64 [[X:%.*]], [[SHL_Y]]
8051; CHECK-NEXT:    store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4
8052; CHECK-NEXT:    ret void
8053;
8054; GFX6-LABEL: urem_i64_pow2_shl_denom:
8055; GFX6:       ; %bb.0:
8056; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
8057; GFX6-NEXT:    s_load_dword s8, s[0:1], 0xd
8058; GFX6-NEXT:    s_mov_b32 s3, 0xf000
8059; GFX6-NEXT:    s_mov_b32 s2, -1
8060; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
8061; GFX6-NEXT:    s_mov_b32 s0, s4
8062; GFX6-NEXT:    s_mov_b32 s1, s5
8063; GFX6-NEXT:    s_mov_b64 s[4:5], 0x1000
8064; GFX6-NEXT:    s_lshl_b64 s[4:5], s[4:5], s8
8065; GFX6-NEXT:    s_add_u32 s4, s4, -1
8066; GFX6-NEXT:    s_addc_u32 s5, s5, -1
8067; GFX6-NEXT:    s_and_b64 s[4:5], s[6:7], s[4:5]
8068; GFX6-NEXT:    v_mov_b32_e32 v0, s4
8069; GFX6-NEXT:    v_mov_b32_e32 v1, s5
8070; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
8071; GFX6-NEXT:    s_endpgm
8072;
8073; GFX9-LABEL: urem_i64_pow2_shl_denom:
8074; GFX9:       ; %bb.0:
8075; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
8076; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x34
8077; GFX9-NEXT:    s_mov_b64 s[0:1], 0x1000
8078; GFX9-NEXT:    v_mov_b32_e32 v2, 0
8079; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
8080; GFX9-NEXT:    s_lshl_b64 s[0:1], s[0:1], s2
8081; GFX9-NEXT:    s_add_u32 s0, s0, -1
8082; GFX9-NEXT:    s_addc_u32 s1, s1, -1
8083; GFX9-NEXT:    s_and_b64 s[0:1], s[6:7], s[0:1]
8084; GFX9-NEXT:    v_mov_b32_e32 v0, s0
8085; GFX9-NEXT:    v_mov_b32_e32 v1, s1
8086; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
8087; GFX9-NEXT:    s_endpgm
; Input IR under test; the assertions above are autogenerated from it.
8088  %shl.y = shl i64 4096, %y
8089  %r = urem i64 %x, %shl.y
8090  store i64 %r, i64 addrspace(1)* %out
8091  ret void
8092}
8093
; Test: <2 x i64> unsigned remainder by the splat constant 4096.
;  * CHECK (opt -amdgpu-codegenprepare): the vector urem is expected to be
;    scalarized into one `urem i64 ..., 4096` per element
;    (extractelement / urem / insertelement).
;  * GFX6/GFX9 (llc): each element's low dword is ANDed with 0xfff and the
;    high dwords of the stored result are the zero held in v1 (v3 is copied
;    from v1).
8094define amdgpu_kernel void @urem_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) {
8095; CHECK-LABEL: @urem_v2i64_pow2k_denom(
8096; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0
8097; CHECK-NEXT:    [[TMP2:%.*]] = urem i64 [[TMP1]], 4096
8098; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i64 0
8099; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1
8100; CHECK-NEXT:    [[TMP5:%.*]] = urem i64 [[TMP4]], 4096
8101; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1
8102; CHECK-NEXT:    store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16
8103; CHECK-NEXT:    ret void
8104;
8105; GFX6-LABEL: urem_v2i64_pow2k_denom:
8106; GFX6:       ; %bb.0:
8107; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
8108; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xd
8109; GFX6-NEXT:    s_movk_i32 s8, 0xfff
8110; GFX6-NEXT:    v_mov_b32_e32 v1, 0
8111; GFX6-NEXT:    s_mov_b32 s7, 0xf000
8112; GFX6-NEXT:    s_mov_b32 s6, -1
8113; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
8114; GFX6-NEXT:    s_and_b32 s0, s0, s8
8115; GFX6-NEXT:    s_and_b32 s1, s2, s8
8116; GFX6-NEXT:    v_mov_b32_e32 v0, s0
8117; GFX6-NEXT:    v_mov_b32_e32 v2, s1
8118; GFX6-NEXT:    v_mov_b32_e32 v3, v1
8119; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
8120; GFX6-NEXT:    s_endpgm
8121;
8122; GFX9-LABEL: urem_v2i64_pow2k_denom:
8123; GFX9:       ; %bb.0:
8124; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
8125; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
8126; GFX9-NEXT:    s_movk_i32 s0, 0xfff
8127; GFX9-NEXT:    v_mov_b32_e32 v1, 0
8128; GFX9-NEXT:    v_mov_b32_e32 v3, v1
8129; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
8130; GFX9-NEXT:    s_and_b32 s1, s4, s0
8131; GFX9-NEXT:    s_and_b32 s0, s6, s0
8132; GFX9-NEXT:    v_mov_b32_e32 v0, s1
8133; GFX9-NEXT:    v_mov_b32_e32 v2, s0
8134; GFX9-NEXT:    global_store_dwordx4 v1, v[0:3], s[2:3]
8135; GFX9-NEXT:    s_endpgm
; Input IR under test: element-wise x urem 4096 on a <2 x i64>, stored to %out.
8136  %r = urem <2 x i64> %x, <i64 4096, i64 4096>
8137  store <2 x i64> %r, <2 x i64> addrspace(1)* %out
8138  ret void
8139}
8140
; Test: <2 x i64> unsigned remainder where each divisor lane is 4096 shifted
; left by the matching lane of %y.
;  * CHECK (opt -amdgpu-codegenprepare): the vector shl is kept, and the
;    vector urem is expected to be scalarized into one i64 urem per element.
;  * GFX6/GFX9 (llc): per element, the expected ISA shifts 0x1000 with
;    s_lshl_b64, subtracts one across both halves and ANDs with the
;    corresponding element of %x; no division expansion appears.
8141define amdgpu_kernel void @urem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x, <2 x i64> %y) {
8142; CHECK-LABEL: @urem_v2i64_pow2_shl_denom(
8143; CHECK-NEXT:    [[SHL_Y:%.*]] = shl <2 x i64> <i64 4096, i64 4096>, [[Y:%.*]]
8144; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0
8145; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 0
8146; CHECK-NEXT:    [[TMP3:%.*]] = urem i64 [[TMP1]], [[TMP2]]
8147; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x i64> undef, i64 [[TMP3]], i64 0
8148; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[X]], i64 1
8149; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 1
8150; CHECK-NEXT:    [[TMP7:%.*]] = urem i64 [[TMP5]], [[TMP6]]
8151; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[TMP7]], i64 1
8152; CHECK-NEXT:    store <2 x i64> [[TMP8]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16
8153; CHECK-NEXT:    ret void
8154;
8155; GFX6-LABEL: urem_v2i64_pow2_shl_denom:
8156; GFX6:       ; %bb.0:
8157; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
8158; GFX6-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0xd
8159; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x11
8160; GFX6-NEXT:    s_mov_b64 s[12:13], 0x1000
8161; GFX6-NEXT:    s_mov_b32 s7, 0xf000
8162; GFX6-NEXT:    s_mov_b32 s6, -1
8163; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
8164; GFX6-NEXT:    s_lshl_b64 s[2:3], s[12:13], s2
8165; GFX6-NEXT:    s_lshl_b64 s[0:1], s[12:13], s0
8166; GFX6-NEXT:    s_add_u32 s0, s0, -1
8167; GFX6-NEXT:    s_addc_u32 s1, s1, -1
8168; GFX6-NEXT:    s_and_b64 s[0:1], s[8:9], s[0:1]
8169; GFX6-NEXT:    s_add_u32 s2, s2, -1
8170; GFX6-NEXT:    s_addc_u32 s3, s3, -1
8171; GFX6-NEXT:    s_and_b64 s[2:3], s[10:11], s[2:3]
8172; GFX6-NEXT:    v_mov_b32_e32 v0, s0
8173; GFX6-NEXT:    v_mov_b32_e32 v1, s1
8174; GFX6-NEXT:    v_mov_b32_e32 v2, s2
8175; GFX6-NEXT:    v_mov_b32_e32 v3, s3
8176; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
8177; GFX6-NEXT:    s_endpgm
8178;
8179; GFX9-LABEL: urem_v2i64_pow2_shl_denom:
8180; GFX9:       ; %bb.0:
8181; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
8182; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
8183; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x44
8184; GFX9-NEXT:    s_mov_b64 s[0:1], 0x1000
8185; GFX9-NEXT:    v_mov_b32_e32 v4, 0
8186; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
8187; GFX9-NEXT:    s_lshl_b64 s[10:11], s[0:1], s10
8188; GFX9-NEXT:    s_lshl_b64 s[0:1], s[0:1], s8
8189; GFX9-NEXT:    s_add_u32 s0, s0, -1
8190; GFX9-NEXT:    s_addc_u32 s1, s1, -1
8191; GFX9-NEXT:    s_and_b64 s[0:1], s[4:5], s[0:1]
8192; GFX9-NEXT:    s_add_u32 s4, s10, -1
8193; GFX9-NEXT:    s_addc_u32 s5, s11, -1
8194; GFX9-NEXT:    s_and_b64 s[4:5], s[6:7], s[4:5]
8195; GFX9-NEXT:    v_mov_b32_e32 v0, s0
8196; GFX9-NEXT:    v_mov_b32_e32 v1, s1
8197; GFX9-NEXT:    v_mov_b32_e32 v2, s4
8198; GFX9-NEXT:    v_mov_b32_e32 v3, s5
8199; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
8200; GFX9-NEXT:    s_endpgm
; Input IR under test: element-wise x urem (4096 << y) on <2 x i64>, stored to %out.
8201  %shl.y = shl <2 x i64> <i64 4096, i64 4096>, %y
8202  %r = urem <2 x i64> %x, %shl.y
8203  store <2 x i64> %r, <2 x i64> addrspace(1)* %out
8204  ret void
8205}
8206
; Test: 64-bit signed division by the odd constant 1235195 (0x12d8fb).
;  * CHECK (opt -amdgpu-codegenprepare): the sdiv is expected to be left in
;    the IR unchanged.
;  * GFX6/GFX9 (llc): the expected ISA is a long inline expansion. It starts
;    from a v_rcp_f32 estimate built from float constants, refines it with
;    v_mul_hi_u32/v_mul_lo_u32 and carry chains against 0xffed2705 (the
;    32-bit two's-complement negation of 0x12d8fb), takes the sign of %x with
;    s_ashr_i32 ..., 31 and applies it via add/xor before the multiply, then
;    selects between q, q+1 and q+2 with v_cndmask and re-applies the sign
;    with xor/sub before the store.
8207define amdgpu_kernel void @sdiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
8208; CHECK-LABEL: @sdiv_i64_oddk_denom(
8209; CHECK-NEXT:    [[R:%.*]] = sdiv i64 [[X:%.*]], 1235195
8210; CHECK-NEXT:    store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4
8211; CHECK-NEXT:    ret void
8212;
8213; GFX6-LABEL: sdiv_i64_oddk_denom:
8214; GFX6:       ; %bb.0:
8215; GFX6-NEXT:    v_mov_b32_e32 v0, 0x4f800000
8216; GFX6-NEXT:    v_madak_f32 v0, 0, v0, 0x4996c7d8
8217; GFX6-NEXT:    v_rcp_f32_e32 v0, v0
8218; GFX6-NEXT:    s_mov_b32 s2, 0xffed2705
8219; GFX6-NEXT:    v_mov_b32_e32 v8, 0
8220; GFX6-NEXT:    v_mov_b32_e32 v7, 0
8221; GFX6-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
8222; GFX6-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
8223; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
8224; GFX6-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
8225; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
8226; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
8227; GFX6-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x9
8228; GFX6-NEXT:    s_mov_b32 s7, 0xf000
8229; GFX6-NEXT:    v_mul_hi_u32 v3, v0, s2
8230; GFX6-NEXT:    v_mul_lo_u32 v2, v1, s2
8231; GFX6-NEXT:    v_mul_lo_u32 v4, v0, s2
8232; GFX6-NEXT:    s_mov_b32 s6, -1
8233; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
8234; GFX6-NEXT:    s_mov_b32 s4, s8
8235; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
8236; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, v0, v2
8237; GFX6-NEXT:    v_mul_lo_u32 v5, v0, v2
8238; GFX6-NEXT:    v_mul_hi_u32 v6, v0, v4
8239; GFX6-NEXT:    v_mul_hi_u32 v3, v0, v2
8240; GFX6-NEXT:    v_mul_hi_u32 v9, v1, v2
8241; GFX6-NEXT:    v_mul_lo_u32 v2, v1, v2
8242; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
8243; GFX6-NEXT:    v_mul_lo_u32 v6, v1, v4
8244; GFX6-NEXT:    v_mul_hi_u32 v4, v1, v4
8245; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v8, v3, vcc
8246; GFX6-NEXT:    s_mov_b32 s5, s9
8247; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
8248; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v3, v4, vcc
8249; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, v9, v7, vcc
8250; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
8251; GFX6-NEXT:    v_add_i32_e64 v0, s[0:1], v0, v2
8252; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v8, v4, vcc
8253; GFX6-NEXT:    v_addc_u32_e64 v2, vcc, v1, v3, s[0:1]
8254; GFX6-NEXT:    v_mul_lo_u32 v4, v2, s2
8255; GFX6-NEXT:    v_mul_hi_u32 v5, v0, s2
8256; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
8257; GFX6-NEXT:    v_mul_lo_u32 v5, v0, s2
8258; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, v0, v4
8259; GFX6-NEXT:    v_mul_lo_u32 v10, v0, v4
8260; GFX6-NEXT:    v_mul_hi_u32 v12, v0, v4
8261; GFX6-NEXT:    v_mul_hi_u32 v11, v0, v5
8262; GFX6-NEXT:    v_mul_hi_u32 v9, v2, v5
8263; GFX6-NEXT:    v_mul_lo_u32 v5, v2, v5
8264; GFX6-NEXT:    v_mul_hi_u32 v6, v2, v4
8265; GFX6-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
8266; GFX6-NEXT:    v_addc_u32_e32 v11, vcc, v8, v12, vcc
8267; GFX6-NEXT:    v_mul_lo_u32 v2, v2, v4
8268; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v10, v5
8269; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v11, v9, vcc
8270; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, v6, v7, vcc
8271; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
8272; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, v8, v4, vcc
8273; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
8274; GFX6-NEXT:    s_ashr_i32 s2, s11, 31
8275; GFX6-NEXT:    v_addc_u32_e64 v1, vcc, v1, v4, s[0:1]
8276; GFX6-NEXT:    s_add_u32 s0, s10, s2
8277; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
8278; GFX6-NEXT:    s_mov_b32 s3, s2
8279; GFX6-NEXT:    s_addc_u32 s1, s11, s2
8280; GFX6-NEXT:    s_xor_b64 s[0:1], s[0:1], s[2:3]
8281; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
8282; GFX6-NEXT:    v_mul_lo_u32 v2, s0, v1
8283; GFX6-NEXT:    v_mul_hi_u32 v3, s0, v0
8284; GFX6-NEXT:    v_mul_hi_u32 v4, s0, v1
8285; GFX6-NEXT:    v_mul_hi_u32 v5, s1, v1
8286; GFX6-NEXT:    v_mul_lo_u32 v1, s1, v1
8287; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
8288; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v8, v4, vcc
8289; GFX6-NEXT:    v_mul_lo_u32 v4, s1, v0
8290; GFX6-NEXT:    v_mul_hi_u32 v0, s1, v0
8291; GFX6-NEXT:    s_mov_b32 s3, 0x12d8fb
8292; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
8293; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, v3, v0, vcc
8294; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, v5, v7, vcc
8295; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
8296; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v8, v2, vcc
8297; GFX6-NEXT:    v_mul_lo_u32 v4, v1, s3
8298; GFX6-NEXT:    v_mul_hi_u32 v5, v0, s3
8299; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 2, v0
8300; GFX6-NEXT:    v_mul_lo_u32 v8, v0, s3
8301; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
8302; GFX6-NEXT:    v_add_i32_e32 v6, vcc, 1, v0
8303; GFX6-NEXT:    v_addc_u32_e32 v7, vcc, 0, v1, vcc
8304; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
8305; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, s0, v8
8306; GFX6-NEXT:    v_mov_b32_e32 v5, s1
8307; GFX6-NEXT:    v_subb_u32_e32 v4, vcc, v5, v4, vcc
8308; GFX6-NEXT:    v_subrev_i32_e32 v5, vcc, s3, v8
8309; GFX6-NEXT:    v_subbrev_u32_e32 v9, vcc, 0, v4, vcc
8310; GFX6-NEXT:    s_mov_b32 s0, 0x12d8fa
8311; GFX6-NEXT:    v_cmp_lt_u32_e32 vcc, s0, v5
8312; GFX6-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
8313; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v9
8314; GFX6-NEXT:    v_cndmask_b32_e32 v5, -1, v5, vcc
8315; GFX6-NEXT:    v_cmp_lt_u32_e64 s[0:1], s0, v8
8316; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
8317; GFX6-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[0:1]
8318; GFX6-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v4
8319; GFX6-NEXT:    v_cndmask_b32_e64 v4, -1, v5, s[0:1]
8320; GFX6-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v4
8321; GFX6-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc
8322; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
8323; GFX6-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
8324; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
8325; GFX6-NEXT:    v_xor_b32_e32 v0, s2, v0
8326; GFX6-NEXT:    v_xor_b32_e32 v1, s2, v1
8327; GFX6-NEXT:    v_mov_b32_e32 v2, s2
8328; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s2, v0
8329; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v1, v2, vcc
8330; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
8331; GFX6-NEXT:    s_endpgm
8332;
8333; GFX9-LABEL: sdiv_i64_oddk_denom:
8334; GFX9:       ; %bb.0:
8335; GFX9-NEXT:    v_mov_b32_e32 v0, 0x4f800000
8336; GFX9-NEXT:    v_madak_f32 v0, 0, v0, 0x4996c7d8
8337; GFX9-NEXT:    v_rcp_f32_e32 v0, v0
8338; GFX9-NEXT:    s_mov_b32 s8, 0xffed2705
8339; GFX9-NEXT:    v_mov_b32_e32 v7, 0
8340; GFX9-NEXT:    v_mov_b32_e32 v5, 0
8341; GFX9-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
8342; GFX9-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
8343; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
8344; GFX9-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
8345; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
8346; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
8347; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
8348; GFX9-NEXT:    v_mul_hi_u32 v3, v0, s8
8349; GFX9-NEXT:    v_mul_lo_u32 v2, v1, s8
8350; GFX9-NEXT:    v_mul_lo_u32 v4, v0, s8
8351; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
8352; GFX9-NEXT:    v_sub_u32_e32 v2, v2, v0
8353; GFX9-NEXT:    v_mul_hi_u32 v3, v0, v4
8354; GFX9-NEXT:    v_mul_lo_u32 v6, v0, v2
8355; GFX9-NEXT:    v_mul_hi_u32 v8, v0, v2
8356; GFX9-NEXT:    v_mul_hi_u32 v9, v1, v2
8357; GFX9-NEXT:    v_mul_lo_u32 v2, v1, v2
8358; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v6
8359; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, v7, v8, vcc
8360; GFX9-NEXT:    v_mul_lo_u32 v8, v1, v4
8361; GFX9-NEXT:    v_mul_hi_u32 v4, v1, v4
8362; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v8
8363; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v6, v4, vcc
8364; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v9, v5, vcc
8365; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
8366; GFX9-NEXT:    v_add_co_u32_e64 v0, s[2:3], v0, v2
8367; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v7, v4, vcc
8368; GFX9-NEXT:    v_addc_co_u32_e64 v2, vcc, v1, v3, s[2:3]
8369; GFX9-NEXT:    v_mul_lo_u32 v4, v2, s8
8370; GFX9-NEXT:    v_mul_hi_u32 v6, v0, s8
8371; GFX9-NEXT:    v_mul_lo_u32 v8, v0, s8
8372; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
8373; GFX9-NEXT:    v_add_u32_e32 v4, v6, v4
8374; GFX9-NEXT:    v_sub_u32_e32 v4, v4, v0
8375; GFX9-NEXT:    v_mul_lo_u32 v10, v0, v4
8376; GFX9-NEXT:    v_mul_hi_u32 v11, v0, v8
8377; GFX9-NEXT:    v_mul_hi_u32 v12, v0, v4
8378; GFX9-NEXT:    v_mul_hi_u32 v9, v2, v8
8379; GFX9-NEXT:    v_mul_lo_u32 v8, v2, v8
8380; GFX9-NEXT:    v_add_co_u32_e32 v10, vcc, v11, v10
8381; GFX9-NEXT:    v_mul_hi_u32 v6, v2, v4
8382; GFX9-NEXT:    v_addc_co_u32_e32 v11, vcc, v7, v12, vcc
8383; GFX9-NEXT:    v_mul_lo_u32 v2, v2, v4
8384; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, v10, v8
8385; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, v11, v9, vcc
8386; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v6, v5, vcc
8387; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v8, v2
8388; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v7, v4, vcc
8389; GFX9-NEXT:    v_addc_co_u32_e64 v1, vcc, v1, v4, s[2:3]
8390; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
8391; GFX9-NEXT:    s_ashr_i32 s2, s7, 31
8392; GFX9-NEXT:    s_add_u32 s0, s6, s2
8393; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
8394; GFX9-NEXT:    s_mov_b32 s3, s2
8395; GFX9-NEXT:    s_addc_u32 s1, s7, s2
8396; GFX9-NEXT:    s_xor_b64 s[0:1], s[0:1], s[2:3]
8397; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
8398; GFX9-NEXT:    v_mul_lo_u32 v2, s0, v1
8399; GFX9-NEXT:    v_mul_hi_u32 v3, s0, v0
8400; GFX9-NEXT:    v_mul_hi_u32 v4, s0, v1
8401; GFX9-NEXT:    v_mul_hi_u32 v6, s1, v1
8402; GFX9-NEXT:    v_mul_lo_u32 v1, s1, v1
8403; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
8404; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v7, v4, vcc
8405; GFX9-NEXT:    v_mul_lo_u32 v4, s1, v0
8406; GFX9-NEXT:    v_mul_hi_u32 v0, s1, v0
8407; GFX9-NEXT:    s_mov_b32 s3, 0x12d8fb
8408; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
8409; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, v3, v0, vcc
8410; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, v6, v5, vcc
8411; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v1
8412; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v7, v2, vcc
8413; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, 2, v0
8414; GFX9-NEXT:    v_mul_lo_u32 v4, v1, s3
8415; GFX9-NEXT:    v_mul_hi_u32 v6, v0, s3
8416; GFX9-NEXT:    v_mul_lo_u32 v9, v0, s3
8417; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
8418; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, 1, v0
8419; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, 0, v1, vcc
8420; GFX9-NEXT:    v_add_u32_e32 v4, v6, v4
8421; GFX9-NEXT:    v_sub_co_u32_e32 v9, vcc, s0, v9
8422; GFX9-NEXT:    v_mov_b32_e32 v6, s1
8423; GFX9-NEXT:    v_subb_co_u32_e32 v4, vcc, v6, v4, vcc
8424; GFX9-NEXT:    v_subrev_co_u32_e32 v6, vcc, s3, v9
8425; GFX9-NEXT:    v_subbrev_co_u32_e32 v10, vcc, 0, v4, vcc
8426; GFX9-NEXT:    s_mov_b32 s0, 0x12d8fa
8427; GFX9-NEXT:    v_cmp_lt_u32_e32 vcc, s0, v6
8428; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
8429; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v10
8430; GFX9-NEXT:    v_cndmask_b32_e32 v6, -1, v6, vcc
8431; GFX9-NEXT:    v_cmp_lt_u32_e64 s[0:1], s0, v9
8432; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
8433; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[0:1]
8434; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v4
8435; GFX9-NEXT:    v_cndmask_b32_e64 v4, -1, v6, s[0:1]
8436; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v4
8437; GFX9-NEXT:    v_cndmask_b32_e32 v2, v7, v2, vcc
8438; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
8439; GFX9-NEXT:    v_cndmask_b32_e32 v3, v8, v3, vcc
8440; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
8441; GFX9-NEXT:    v_xor_b32_e32 v0, s2, v0
8442; GFX9-NEXT:    v_xor_b32_e32 v1, s2, v1
8443; GFX9-NEXT:    v_mov_b32_e32 v2, s2
8444; GFX9-NEXT:    v_subrev_co_u32_e32 v0, vcc, s2, v0
8445; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v2, vcc
8446; GFX9-NEXT:    global_store_dwordx2 v5, v[0:1], s[4:5]
8447; GFX9-NEXT:    s_endpgm
; Input IR under test: r = x sdiv 1235195, stored to %out.
8448  %r = sdiv i64 %x, 1235195
8449  store i64 %r, i64 addrspace(1)* %out
8450  ret void
8451}
8452
; Test: 64-bit signed division by the power-of-two constant 4096.
;  * CHECK (opt -amdgpu-codegenprepare): the sdiv is expected to be left in
;    the IR unchanged.
;  * GFX6/GFX9 (llc): the expected ISA is a short shift sequence. The high
;    dword of %x is arithmetically shifted right by 31 and then logically
;    shifted right by 20 (giving 0xfff for negative inputs, 0 otherwise);
;    that bias is added to %x with carry, and the sum is arithmetically
;    shifted right by 12 (s_ashr_i64).
8453define amdgpu_kernel void @sdiv_i64_pow2k_denom(i64 addrspace(1)* %out, i64 %x) {
8454; CHECK-LABEL: @sdiv_i64_pow2k_denom(
8455; CHECK-NEXT:    [[R:%.*]] = sdiv i64 [[X:%.*]], 4096
8456; CHECK-NEXT:    store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4
8457; CHECK-NEXT:    ret void
8458;
8459; GFX6-LABEL: sdiv_i64_pow2k_denom:
8460; GFX6:       ; %bb.0:
8461; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
8462; GFX6-NEXT:    s_mov_b32 s7, 0xf000
8463; GFX6-NEXT:    s_mov_b32 s6, -1
8464; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
8465; GFX6-NEXT:    s_mov_b32 s4, s0
8466; GFX6-NEXT:    s_ashr_i32 s0, s3, 31
8467; GFX6-NEXT:    s_lshr_b32 s0, s0, 20
8468; GFX6-NEXT:    s_add_u32 s0, s2, s0
8469; GFX6-NEXT:    s_mov_b32 s5, s1
8470; GFX6-NEXT:    s_addc_u32 s1, s3, 0
8471; GFX6-NEXT:    s_ashr_i64 s[0:1], s[0:1], 12
8472; GFX6-NEXT:    v_mov_b32_e32 v0, s0
8473; GFX6-NEXT:    v_mov_b32_e32 v1, s1
8474; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
8475; GFX6-NEXT:    s_endpgm
8476;
8477; GFX9-LABEL: sdiv_i64_pow2k_denom:
8478; GFX9:       ; %bb.0:
8479; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
8480; GFX9-NEXT:    v_mov_b32_e32 v2, 0
8481; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
8482; GFX9-NEXT:    s_ashr_i32 s4, s3, 31
8483; GFX9-NEXT:    s_lshr_b32 s4, s4, 20
8484; GFX9-NEXT:    s_add_u32 s2, s2, s4
8485; GFX9-NEXT:    s_addc_u32 s3, s3, 0
8486; GFX9-NEXT:    s_ashr_i64 s[2:3], s[2:3], 12
8487; GFX9-NEXT:    v_mov_b32_e32 v0, s2
8488; GFX9-NEXT:    v_mov_b32_e32 v1, s3
8489; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
8490; GFX9-NEXT:    s_endpgm
; Input IR under test: r = x sdiv 4096, stored to %out.
8491  %r = sdiv i64 %x, 4096
8492  store i64 %r, i64 addrspace(1)* %out
8493  ret void
8494}
8495
; Test: 64-bit signed division where the divisor is the constant 4096
; shifted left by a runtime amount (%y).
;  * CHECK (opt -amdgpu-codegenprepare): the shl/sdiv pair is expected to be
;    left in the IR unchanged.
;  * GFX6/GFX9 (llc): the expected ISA is the full inline 64-bit division
;    expansion rather than a shift sequence. It forms the divisor with
;    s_lshl_b64, strips its sign with ashr-31/add/xor, builds a v_rcp_f32
;    estimate from it, refines that with v_mul_hi_u32/v_mul_lo_u32 carry
;    chains, strips the sign of %x the same way, corrects the quotient via
;    v_cndmask selects between q, q+1 and q+2, and finally applies the
;    combined sign (xor of the two sign masks) with xor/sub before the store.
8496define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %x, i64 %y) {
8497; CHECK-LABEL: @sdiv_i64_pow2_shl_denom(
8498; CHECK-NEXT:    [[SHL_Y:%.*]] = shl i64 4096, [[Y:%.*]]
8499; CHECK-NEXT:    [[R:%.*]] = sdiv i64 [[X:%.*]], [[SHL_Y]]
8500; CHECK-NEXT:    store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4
8501; CHECK-NEXT:    ret void
8502;
8503; GFX6-LABEL: sdiv_i64_pow2_shl_denom:
8504; GFX6:       ; %bb.0:
8505; GFX6-NEXT:    s_load_dword s4, s[0:1], 0xd
8506; GFX6-NEXT:    s_mov_b64 s[2:3], 0x1000
8507; GFX6-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x9
8508; GFX6-NEXT:    s_mov_b32 s7, 0xf000
8509; GFX6-NEXT:    s_mov_b32 s6, -1
8510; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
8511; GFX6-NEXT:    s_lshl_b64 s[2:3], s[2:3], s4
8512; GFX6-NEXT:    s_ashr_i32 s12, s3, 31
8513; GFX6-NEXT:    s_add_u32 s2, s2, s12
8514; GFX6-NEXT:    s_mov_b32 s13, s12
8515; GFX6-NEXT:    s_addc_u32 s3, s3, s12
8516; GFX6-NEXT:    s_xor_b64 s[2:3], s[2:3], s[12:13]
8517; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s2
8518; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s3
8519; GFX6-NEXT:    s_sub_u32 s4, 0, s2
8520; GFX6-NEXT:    s_subb_u32 s5, 0, s3
8521; GFX6-NEXT:    s_ashr_i32 s14, s11, 31
8522; GFX6-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
8523; GFX6-NEXT:    v_rcp_f32_e32 v0, v0
8524; GFX6-NEXT:    s_mov_b32 s15, s14
8525; GFX6-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
8526; GFX6-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
8527; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
8528; GFX6-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
8529; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
8530; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
8531; GFX6-NEXT:    v_mul_hi_u32 v3, s4, v0
8532; GFX6-NEXT:    v_mul_lo_u32 v2, s4, v1
8533; GFX6-NEXT:    v_mul_lo_u32 v5, s5, v0
8534; GFX6-NEXT:    v_mul_lo_u32 v4, s4, v0
8535; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
8536; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
8537; GFX6-NEXT:    v_mul_hi_u32 v3, v0, v4
8538; GFX6-NEXT:    v_mul_lo_u32 v5, v0, v2
8539; GFX6-NEXT:    v_mul_hi_u32 v6, v0, v2
8540; GFX6-NEXT:    v_mul_hi_u32 v7, v1, v2
8541; GFX6-NEXT:    v_mul_lo_u32 v2, v1, v2
8542; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
8543; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, 0, v6, vcc
8544; GFX6-NEXT:    v_mul_lo_u32 v6, v1, v4
8545; GFX6-NEXT:    v_mul_hi_u32 v4, v1, v4
8546; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
8547; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v5, v4, vcc
8548; GFX6-NEXT:    v_mov_b32_e32 v4, 0
8549; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v7, v4, vcc
8550; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
8551; GFX6-NEXT:    v_mov_b32_e32 v6, 0
8552; GFX6-NEXT:    v_add_i32_e64 v0, s[0:1], v0, v2
8553; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v6, v5, vcc
8554; GFX6-NEXT:    v_addc_u32_e64 v2, vcc, v1, v3, s[0:1]
8555; GFX6-NEXT:    v_mul_lo_u32 v5, s4, v2
8556; GFX6-NEXT:    v_mul_hi_u32 v7, s4, v0
8557; GFX6-NEXT:    v_mul_lo_u32 v8, s5, v0
8558; GFX6-NEXT:    s_mov_b32 s5, s9
8559; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
8560; GFX6-NEXT:    v_mul_lo_u32 v7, s4, v0
8561; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
8562; GFX6-NEXT:    v_mul_lo_u32 v10, v0, v5
8563; GFX6-NEXT:    v_mul_hi_u32 v12, v0, v5
8564; GFX6-NEXT:    v_mul_hi_u32 v11, v0, v7
8565; GFX6-NEXT:    v_mul_hi_u32 v9, v2, v7
8566; GFX6-NEXT:    v_mul_lo_u32 v7, v2, v7
8567; GFX6-NEXT:    v_mul_hi_u32 v8, v2, v5
8568; GFX6-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
8569; GFX6-NEXT:    v_addc_u32_e32 v11, vcc, 0, v12, vcc
8570; GFX6-NEXT:    v_mul_lo_u32 v2, v2, v5
8571; GFX6-NEXT:    v_add_i32_e32 v7, vcc, v10, v7
8572; GFX6-NEXT:    v_addc_u32_e32 v7, vcc, v11, v9, vcc
8573; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v8, v4, vcc
8574; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v7, v2
8575; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v6, v5, vcc
8576; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
8577; GFX6-NEXT:    v_addc_u32_e64 v1, vcc, v1, v5, s[0:1]
8578; GFX6-NEXT:    s_add_u32 s0, s10, s14
8579; GFX6-NEXT:    s_addc_u32 s1, s11, s14
8580; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
8581; GFX6-NEXT:    s_xor_b64 s[10:11], s[0:1], s[14:15]
8582; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
8583; GFX6-NEXT:    v_mul_lo_u32 v2, s10, v1
8584; GFX6-NEXT:    v_mul_hi_u32 v3, s10, v0
8585; GFX6-NEXT:    v_mul_hi_u32 v5, s10, v1
8586; GFX6-NEXT:    v_mul_hi_u32 v7, s11, v1
8587; GFX6-NEXT:    v_mul_lo_u32 v1, s11, v1
8588; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
8589; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
8590; GFX6-NEXT:    v_mul_lo_u32 v5, s11, v0
8591; GFX6-NEXT:    v_mul_hi_u32 v0, s11, v0
8592; GFX6-NEXT:    s_mov_b32 s4, s8
8593; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
8594; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, v3, v0, vcc
8595; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, v7, v4, vcc
8596; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
8597; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v6, v2, vcc
8598; GFX6-NEXT:    v_mul_lo_u32 v2, s2, v1
8599; GFX6-NEXT:    v_mul_hi_u32 v3, s2, v0
8600; GFX6-NEXT:    v_mul_lo_u32 v4, s3, v0
8601; GFX6-NEXT:    v_mov_b32_e32 v5, s3
8602; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
8603; GFX6-NEXT:    v_mul_lo_u32 v3, s2, v0
8604; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
8605; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, s11, v2
8606; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s10, v3
8607; GFX6-NEXT:    v_subb_u32_e64 v4, s[0:1], v4, v5, vcc
8608; GFX6-NEXT:    v_subrev_i32_e64 v5, s[0:1], s2, v3
8609; GFX6-NEXT:    v_subbrev_u32_e64 v4, s[0:1], 0, v4, s[0:1]
8610; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s3, v4
8611; GFX6-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[0:1]
8612; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s2, v5
8613; GFX6-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[0:1]
8614; GFX6-NEXT:    v_cmp_eq_u32_e64 s[0:1], s3, v4
8615; GFX6-NEXT:    v_cndmask_b32_e64 v4, v6, v5, s[0:1]
8616; GFX6-NEXT:    v_add_i32_e64 v5, s[0:1], 2, v0
8617; GFX6-NEXT:    v_addc_u32_e64 v6, s[0:1], 0, v1, s[0:1]
8618; GFX6-NEXT:    v_add_i32_e64 v7, s[0:1], 1, v0
8619; GFX6-NEXT:    v_addc_u32_e64 v8, s[0:1], 0, v1, s[0:1]
8620; GFX6-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v4
8621; GFX6-NEXT:    v_cndmask_b32_e64 v4, v8, v6, s[0:1]
8622; GFX6-NEXT:    v_mov_b32_e32 v6, s11
8623; GFX6-NEXT:    v_subb_u32_e32 v2, vcc, v6, v2, vcc
8624; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s3, v2
8625; GFX6-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
8626; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s2, v3
8627; GFX6-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
8628; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, s3, v2
8629; GFX6-NEXT:    v_cndmask_b32_e32 v2, v6, v3, vcc
8630; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
8631; GFX6-NEXT:    v_cndmask_b32_e64 v2, v7, v5, s[0:1]
8632; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
8633; GFX6-NEXT:    s_xor_b64 s[0:1], s[14:15], s[12:13]
8634; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
8635; GFX6-NEXT:    v_xor_b32_e32 v0, s0, v0
8636; GFX6-NEXT:    v_xor_b32_e32 v1, s1, v1
8637; GFX6-NEXT:    v_mov_b32_e32 v2, s1
8638; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s0, v0
8639; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v1, v2, vcc
8640; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
8641; GFX6-NEXT:    s_endpgm
8642;
8643; GFX9-LABEL: sdiv_i64_pow2_shl_denom:
8644; GFX9:       ; %bb.0:
8645; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x34
8646; GFX9-NEXT:    s_mov_b64 s[2:3], 0x1000
8647; GFX9-NEXT:    v_mov_b32_e32 v2, 0
8648; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
8649; GFX9-NEXT:    s_lshl_b64 s[2:3], s[2:3], s4
8650; GFX9-NEXT:    s_ashr_i32 s8, s3, 31
8651; GFX9-NEXT:    s_add_u32 s2, s2, s8
8652; GFX9-NEXT:    s_mov_b32 s9, s8
8653; GFX9-NEXT:    s_addc_u32 s3, s3, s8
8654; GFX9-NEXT:    s_xor_b64 s[10:11], s[2:3], s[8:9]
8655; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s10
8656; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s11
8657; GFX9-NEXT:    s_sub_u32 s12, 0, s10
8658; GFX9-NEXT:    s_subb_u32 s4, 0, s11
8659; GFX9-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
8660; GFX9-NEXT:    v_rcp_f32_e32 v0, v0
8661; GFX9-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
8662; GFX9-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
8663; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
8664; GFX9-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
8665; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
8666; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
8667; GFX9-NEXT:    v_mul_hi_u32 v4, s12, v0
8668; GFX9-NEXT:    v_mul_lo_u32 v3, s12, v1
8669; GFX9-NEXT:    v_mul_lo_u32 v6, s4, v0
8670; GFX9-NEXT:    v_mul_lo_u32 v5, s12, v0
8671; GFX9-NEXT:    v_add_u32_e32 v3, v4, v3
8672; GFX9-NEXT:    v_add_u32_e32 v3, v3, v6
8673; GFX9-NEXT:    v_mul_hi_u32 v4, v0, v5
8674; GFX9-NEXT:    v_mul_lo_u32 v6, v0, v3
8675; GFX9-NEXT:    v_mul_hi_u32 v8, v0, v3
8676; GFX9-NEXT:    v_mul_hi_u32 v7, v1, v5
8677; GFX9-NEXT:    v_mul_lo_u32 v5, v1, v5
8678; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v6
8679; GFX9-NEXT:    v_mul_hi_u32 v9, v1, v3
8680; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, 0, v8, vcc
8681; GFX9-NEXT:    v_mul_lo_u32 v3, v1, v3
8682; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v5
8683; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v6, v7, vcc
8684; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v9, v2, vcc
8685; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v4, v3
8686; GFX9-NEXT:    v_mov_b32_e32 v6, 0
8687; GFX9-NEXT:    v_add_co_u32_e64 v0, s[2:3], v0, v3
8688; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v6, v5, vcc
8689; GFX9-NEXT:    v_addc_co_u32_e64 v3, vcc, v1, v4, s[2:3]
8690; GFX9-NEXT:    v_mul_lo_u32 v5, s12, v3
8691; GFX9-NEXT:    v_mul_hi_u32 v7, s12, v0
8692; GFX9-NEXT:    v_mul_lo_u32 v8, s4, v0
8693; GFX9-NEXT:    v_mul_lo_u32 v9, s12, v0
8694; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
8695; GFX9-NEXT:    v_add_u32_e32 v5, v7, v5
8696; GFX9-NEXT:    v_add_u32_e32 v5, v5, v8
8697; GFX9-NEXT:    v_mul_lo_u32 v10, v0, v5
8698; GFX9-NEXT:    v_mul_hi_u32 v11, v0, v9
8699; GFX9-NEXT:    v_mul_hi_u32 v12, v0, v5
8700; GFX9-NEXT:    v_mul_hi_u32 v8, v3, v9
8701; GFX9-NEXT:    v_mul_lo_u32 v9, v3, v9
8702; GFX9-NEXT:    v_add_co_u32_e32 v10, vcc, v11, v10
8703; GFX9-NEXT:    v_mul_hi_u32 v7, v3, v5
8704; GFX9-NEXT:    v_addc_co_u32_e32 v11, vcc, 0, v12, vcc
8705; GFX9-NEXT:    v_mul_lo_u32 v3, v3, v5
8706; GFX9-NEXT:    v_add_co_u32_e32 v9, vcc, v10, v9
8707; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, v11, v8, vcc
8708; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v7, v2, vcc
8709; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v8, v3
8710; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v6, v5, vcc
8711; GFX9-NEXT:    v_add_u32_e32 v1, v1, v4
8712; GFX9-NEXT:    v_addc_co_u32_e64 v1, vcc, v1, v5, s[2:3]
8713; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
8714; GFX9-NEXT:    s_ashr_i32 s2, s7, 31
8715; GFX9-NEXT:    s_add_u32 s0, s6, s2
8716; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v3
8717; GFX9-NEXT:    s_mov_b32 s3, s2
8718; GFX9-NEXT:    s_addc_u32 s1, s7, s2
8719; GFX9-NEXT:    s_xor_b64 s[6:7], s[0:1], s[2:3]
8720; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
8721; GFX9-NEXT:    v_mul_lo_u32 v3, s6, v1
8722; GFX9-NEXT:    v_mul_hi_u32 v4, s6, v0
8723; GFX9-NEXT:    v_mul_hi_u32 v5, s6, v1
8724; GFX9-NEXT:    v_mul_hi_u32 v7, s7, v1
8725; GFX9-NEXT:    v_mul_lo_u32 v1, s7, v1
8726; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v4, v3
8727; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v5, vcc
8728; GFX9-NEXT:    v_mul_lo_u32 v5, s7, v0
8729; GFX9-NEXT:    v_mul_hi_u32 v0, s7, v0
8730; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v5
8731; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, v4, v0, vcc
8732; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v7, v2, vcc
8733; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v1
8734; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v6, v3, vcc
8735; GFX9-NEXT:    v_mul_lo_u32 v3, s10, v1
8736; GFX9-NEXT:    v_mul_hi_u32 v4, s10, v0
8737; GFX9-NEXT:    v_mul_lo_u32 v5, s11, v0
8738; GFX9-NEXT:    v_mov_b32_e32 v6, s11
8739; GFX9-NEXT:    v_add_u32_e32 v3, v4, v3
8740; GFX9-NEXT:    v_mul_lo_u32 v4, s10, v0
8741; GFX9-NEXT:    v_add_u32_e32 v3, v3, v5
8742; GFX9-NEXT:    v_sub_u32_e32 v5, s7, v3
8743; GFX9-NEXT:    v_sub_co_u32_e32 v4, vcc, s6, v4
8744; GFX9-NEXT:    v_subb_co_u32_e64 v5, s[0:1], v5, v6, vcc
8745; GFX9-NEXT:    v_subrev_co_u32_e64 v6, s[0:1], s10, v4
8746; GFX9-NEXT:    v_subbrev_co_u32_e64 v5, s[0:1], 0, v5, s[0:1]
8747; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s11, v5
8748; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[0:1]
8749; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s10, v6
8750; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[0:1]
8751; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s11, v5
8752; GFX9-NEXT:    v_cndmask_b32_e64 v5, v7, v6, s[0:1]
8753; GFX9-NEXT:    v_add_co_u32_e64 v6, s[0:1], 2, v0
8754; GFX9-NEXT:    v_addc_co_u32_e64 v7, s[0:1], 0, v1, s[0:1]
8755; GFX9-NEXT:    v_add_co_u32_e64 v8, s[0:1], 1, v0
8756; GFX9-NEXT:    v_addc_co_u32_e64 v9, s[0:1], 0, v1, s[0:1]
8757; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v5
8758; GFX9-NEXT:    v_cndmask_b32_e64 v5, v9, v7, s[0:1]
8759; GFX9-NEXT:    v_mov_b32_e32 v7, s7
8760; GFX9-NEXT:    v_subb_co_u32_e32 v3, vcc, v7, v3, vcc
8761; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s11, v3
8762; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
8763; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s10, v4
8764; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, -1, vcc
8765; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, s11, v3
8766; GFX9-NEXT:    v_cndmask_b32_e32 v3, v7, v4, vcc
8767; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
8768; GFX9-NEXT:    v_cndmask_b32_e64 v3, v8, v6, s[0:1]
8769; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
8770; GFX9-NEXT:    s_xor_b64 s[0:1], s[2:3], s[8:9]
8771; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
8772; GFX9-NEXT:    v_xor_b32_e32 v0, s0, v0
8773; GFX9-NEXT:    v_xor_b32_e32 v1, s1, v1
8774; GFX9-NEXT:    v_mov_b32_e32 v3, s1
8775; GFX9-NEXT:    v_subrev_co_u32_e32 v0, vcc, s0, v0
8776; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
8777; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
8778; GFX9-NEXT:    s_endpgm
; Input IR under test: r = x sdiv (4096 << y), stored to %out.
8779  %shl.y = shl i64 4096, %y
8780  %r = sdiv i64 %x, %shl.y
8781  store i64 %r, i64 addrspace(1)* %out
8782  ret void
8783}
8784
; Signed division of a <2 x i64> kernel argument by the splat constant 4096
; (2^12); the quotient vector is stored to %out.
; The assertions for the opt run expect the vector sdiv to be split into one
; scalar `sdiv i64 ..., 4096` per element, reassembled with insertelement.
; The assertions for both llc runs expect each element to be lowered to a
; sign-derived bias (s_ashr_i32 by 31, s_lshr_b32 by 20), an add with carry,
; and a 64-bit arithmetic shift right by 12.
8785define amdgpu_kernel void @sdiv_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) {
8786; CHECK-LABEL: @sdiv_v2i64_pow2k_denom(
8787; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0
8788; CHECK-NEXT:    [[TMP2:%.*]] = sdiv i64 [[TMP1]], 4096
8789; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i64 0
8790; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1
8791; CHECK-NEXT:    [[TMP5:%.*]] = sdiv i64 [[TMP4]], 4096
8792; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1
8793; CHECK-NEXT:    store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16
8794; CHECK-NEXT:    ret void
8795;
8796; GFX6-LABEL: sdiv_v2i64_pow2k_denom:
8797; GFX6:       ; %bb.0:
8798; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
8799; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xd
8800; GFX6-NEXT:    s_mov_b32 s7, 0xf000
8801; GFX6-NEXT:    s_mov_b32 s6, -1
8802; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
8803; GFX6-NEXT:    s_ashr_i32 s8, s1, 31
8804; GFX6-NEXT:    s_lshr_b32 s8, s8, 20
8805; GFX6-NEXT:    s_add_u32 s0, s0, s8
8806; GFX6-NEXT:    s_addc_u32 s1, s1, 0
8807; GFX6-NEXT:    s_ashr_i32 s8, s3, 31
8808; GFX6-NEXT:    s_ashr_i64 s[0:1], s[0:1], 12
8809; GFX6-NEXT:    s_lshr_b32 s8, s8, 20
8810; GFX6-NEXT:    s_add_u32 s2, s2, s8
8811; GFX6-NEXT:    s_addc_u32 s3, s3, 0
8812; GFX6-NEXT:    s_ashr_i64 s[2:3], s[2:3], 12
8813; GFX6-NEXT:    v_mov_b32_e32 v0, s0
8814; GFX6-NEXT:    v_mov_b32_e32 v1, s1
8815; GFX6-NEXT:    v_mov_b32_e32 v2, s2
8816; GFX6-NEXT:    v_mov_b32_e32 v3, s3
8817; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
8818; GFX6-NEXT:    s_endpgm
8819;
8820; GFX9-LABEL: sdiv_v2i64_pow2k_denom:
8821; GFX9:       ; %bb.0:
8822; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
8823; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
8824; GFX9-NEXT:    v_mov_b32_e32 v4, 0
8825; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
8826; GFX9-NEXT:    s_ashr_i32 s0, s5, 31
8827; GFX9-NEXT:    s_lshr_b32 s0, s0, 20
8828; GFX9-NEXT:    s_add_u32 s0, s4, s0
8829; GFX9-NEXT:    s_addc_u32 s1, s5, 0
8830; GFX9-NEXT:    s_ashr_i32 s4, s7, 31
8831; GFX9-NEXT:    s_ashr_i64 s[0:1], s[0:1], 12
8832; GFX9-NEXT:    s_lshr_b32 s4, s4, 20
8833; GFX9-NEXT:    s_add_u32 s4, s6, s4
8834; GFX9-NEXT:    s_addc_u32 s5, s7, 0
8835; GFX9-NEXT:    s_ashr_i64 s[4:5], s[4:5], 12
8836; GFX9-NEXT:    v_mov_b32_e32 v0, s0
8837; GFX9-NEXT:    v_mov_b32_e32 v1, s1
8838; GFX9-NEXT:    v_mov_b32_e32 v2, s4
8839; GFX9-NEXT:    v_mov_b32_e32 v3, s5
8840; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
8841; GFX9-NEXT:    s_endpgm
  ; Both lanes use the same power-of-two divisor (4096 == 1 << 12).
8842  %r = sdiv <2 x i64> %x, <i64 4096, i64 4096>
8843  store <2 x i64> %r, <2 x i64> addrspace(1)* %out
8844  ret void
8845}
8846
; Signed division of a <2 x i64> kernel argument by the constant vector
; <4096, 4095>; the quotient vector is stored to %out.
; Lane 0 divides by a power of two (4096 == 1 << 12) and lane 1 does not.
; The assertions for the opt run expect the vector sdiv to be split into
; `sdiv i64 ..., 4096` and `sdiv i64 ..., 4095`, which remain as scalar sdiv
; instructions in the IR.
; The assertions for both llc runs expect lane 0 to use the shift sequence
; (s_ashr_i32 by 31, s_lshr_b32 by 20, add with carry, s_ashr_i64 by 12),
; and lane 1 to go through the long expansion built from v_rcp_f32 and
; 32-bit mul_lo/mul_hi chains, with 0xfff (4095) materialized as the divisor.
; NOTE(review): the doubled "s" in the function name looks like a typo for
; "sdiv"; it is left unchanged because the label assertions match on it.
8847define amdgpu_kernel void @ssdiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) {
8848; CHECK-LABEL: @ssdiv_v2i64_mixed_pow2k_denom(
8849; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0
8850; CHECK-NEXT:    [[TMP2:%.*]] = sdiv i64 [[TMP1]], 4096
8851; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i64 0
8852; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1
8853; CHECK-NEXT:    [[TMP5:%.*]] = sdiv i64 [[TMP4]], 4095
8854; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1
8855; CHECK-NEXT:    store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16
8856; CHECK-NEXT:    ret void
8857;
8858; GFX6-LABEL: ssdiv_v2i64_mixed_pow2k_denom:
8859; GFX6:       ; %bb.0:
8860; GFX6-NEXT:    v_mov_b32_e32 v0, 0x457ff000
8861; GFX6-NEXT:    v_mov_b32_e32 v1, 0x4f800000
8862; GFX6-NEXT:    v_mac_f32_e32 v0, 0, v1
8863; GFX6-NEXT:    v_rcp_f32_e32 v0, v0
8864; GFX6-NEXT:    s_movk_i32 s6, 0xf001
8865; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
8866; GFX6-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0xd
8867; GFX6-NEXT:    s_mov_b32 s7, 0xf000
8868; GFX6-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
8869; GFX6-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
8870; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
8871; GFX6-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
8872; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
8873; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
8874; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
8875; GFX6-NEXT:    s_ashr_i32 s0, s9, 31
8876; GFX6-NEXT:    s_lshr_b32 s0, s0, 20
8877; GFX6-NEXT:    v_mul_hi_u32 v2, v0, s6
8878; GFX6-NEXT:    v_mul_lo_u32 v3, v1, s6
8879; GFX6-NEXT:    s_add_u32 s2, s8, s0
8880; GFX6-NEXT:    s_addc_u32 s3, s9, 0
8881; GFX6-NEXT:    s_ashr_i32 s8, s11, 31
8882; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
8883; GFX6-NEXT:    v_mul_lo_u32 v3, v0, s6
8884; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, v0, v2
8885; GFX6-NEXT:    v_mul_lo_u32 v4, v0, v2
8886; GFX6-NEXT:    v_mul_hi_u32 v6, v0, v2
8887; GFX6-NEXT:    v_mul_hi_u32 v5, v0, v3
8888; GFX6-NEXT:    v_mul_hi_u32 v7, v1, v2
8889; GFX6-NEXT:    v_mul_lo_u32 v2, v1, v2
8890; GFX6-NEXT:    s_ashr_i64 s[2:3], s[2:3], 12
8891; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
8892; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, 0, v6, vcc
8893; GFX6-NEXT:    v_mul_lo_u32 v6, v1, v3
8894; GFX6-NEXT:    v_mul_hi_u32 v3, v1, v3
8895; GFX6-NEXT:    s_mov_b32 s9, s8
8896; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
8897; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v5, v3, vcc
8898; GFX6-NEXT:    v_mov_b32_e32 v4, 0
8899; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v7, v4, vcc
8900; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
8901; GFX6-NEXT:    v_mov_b32_e32 v6, 0
8902; GFX6-NEXT:    v_add_i32_e64 v0, s[0:1], v0, v2
8903; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v6, v5, vcc
8904; GFX6-NEXT:    v_addc_u32_e64 v2, vcc, v1, v3, s[0:1]
8905; GFX6-NEXT:    v_mul_lo_u32 v5, v2, s6
8906; GFX6-NEXT:    v_mul_hi_u32 v7, v0, s6
8907; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
8908; GFX6-NEXT:    v_mul_lo_u32 v7, v0, s6
8909; GFX6-NEXT:    v_subrev_i32_e32 v5, vcc, v0, v5
8910; GFX6-NEXT:    v_mul_lo_u32 v10, v0, v5
8911; GFX6-NEXT:    v_mul_hi_u32 v12, v0, v5
8912; GFX6-NEXT:    v_mul_hi_u32 v11, v0, v7
8913; GFX6-NEXT:    v_mul_hi_u32 v9, v2, v7
8914; GFX6-NEXT:    v_mul_lo_u32 v7, v2, v7
8915; GFX6-NEXT:    v_mul_hi_u32 v8, v2, v5
8916; GFX6-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
8917; GFX6-NEXT:    v_addc_u32_e32 v11, vcc, 0, v12, vcc
8918; GFX6-NEXT:    v_mul_lo_u32 v2, v2, v5
8919; GFX6-NEXT:    v_add_i32_e32 v7, vcc, v10, v7
8920; GFX6-NEXT:    v_addc_u32_e32 v7, vcc, v11, v9, vcc
8921; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v8, v4, vcc
8922; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v7, v2
8923; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v6, v5, vcc
8924; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
8925; GFX6-NEXT:    v_addc_u32_e64 v1, vcc, v1, v5, s[0:1]
8926; GFX6-NEXT:    s_add_u32 s0, s10, s8
8927; GFX6-NEXT:    s_addc_u32 s1, s11, s8
8928; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
8929; GFX6-NEXT:    s_xor_b64 s[0:1], s[0:1], s[8:9]
8930; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
8931; GFX6-NEXT:    v_mul_lo_u32 v2, s0, v1
8932; GFX6-NEXT:    v_mul_hi_u32 v3, s0, v0
8933; GFX6-NEXT:    v_mul_hi_u32 v5, s0, v1
8934; GFX6-NEXT:    v_mul_hi_u32 v7, s1, v1
8935; GFX6-NEXT:    v_mul_lo_u32 v1, s1, v1
8936; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
8937; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
8938; GFX6-NEXT:    v_mul_lo_u32 v5, s1, v0
8939; GFX6-NEXT:    v_mul_hi_u32 v0, s1, v0
8940; GFX6-NEXT:    s_movk_i32 s9, 0xfff
8941; GFX6-NEXT:    s_mov_b32 s6, -1
8942; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
8943; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, v3, v0, vcc
8944; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, v7, v4, vcc
8945; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
8946; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v6, v2, vcc
8947; GFX6-NEXT:    v_mul_lo_u32 v4, v1, s9
8948; GFX6-NEXT:    v_mul_hi_u32 v5, v0, s9
8949; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 2, v0
8950; GFX6-NEXT:    v_mul_lo_u32 v8, v0, s9
8951; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
8952; GFX6-NEXT:    v_add_i32_e32 v6, vcc, 1, v0
8953; GFX6-NEXT:    v_addc_u32_e32 v7, vcc, 0, v1, vcc
8954; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
8955; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, s0, v8
8956; GFX6-NEXT:    v_mov_b32_e32 v5, s1
8957; GFX6-NEXT:    v_subb_u32_e32 v4, vcc, v5, v4, vcc
8958; GFX6-NEXT:    v_subrev_i32_e32 v5, vcc, s9, v8
8959; GFX6-NEXT:    v_subbrev_u32_e32 v9, vcc, 0, v4, vcc
8960; GFX6-NEXT:    s_movk_i32 s0, 0xffe
8961; GFX6-NEXT:    v_cmp_lt_u32_e32 vcc, s0, v5
8962; GFX6-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
8963; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v9
8964; GFX6-NEXT:    v_cndmask_b32_e32 v5, -1, v5, vcc
8965; GFX6-NEXT:    v_cmp_lt_u32_e64 s[0:1], s0, v8
8966; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
8967; GFX6-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[0:1]
8968; GFX6-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v4
8969; GFX6-NEXT:    v_cndmask_b32_e64 v4, -1, v5, s[0:1]
8970; GFX6-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v4
8971; GFX6-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc
8972; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
8973; GFX6-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
8974; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
8975; GFX6-NEXT:    v_xor_b32_e32 v0, s8, v0
8976; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s8, v0
8977; GFX6-NEXT:    v_xor_b32_e32 v1, s8, v1
8978; GFX6-NEXT:    v_mov_b32_e32 v3, s8
8979; GFX6-NEXT:    v_subb_u32_e32 v3, vcc, v1, v3, vcc
8980; GFX6-NEXT:    v_mov_b32_e32 v0, s2
8981; GFX6-NEXT:    v_mov_b32_e32 v1, s3
8982; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
8983; GFX6-NEXT:    s_endpgm
8984;
8985; GFX9-LABEL: ssdiv_v2i64_mixed_pow2k_denom:
8986; GFX9:       ; %bb.0:
8987; GFX9-NEXT:    v_mov_b32_e32 v0, 0x457ff000
8988; GFX9-NEXT:    v_mov_b32_e32 v1, 0x4f800000
8989; GFX9-NEXT:    v_mac_f32_e32 v0, 0, v1
8990; GFX9-NEXT:    v_rcp_f32_e32 v0, v0
8991; GFX9-NEXT:    s_movk_i32 s8, 0xf001
8992; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
8993; GFX9-NEXT:    v_mov_b32_e32 v4, 0
8994; GFX9-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
8995; GFX9-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
8996; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
8997; GFX9-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
8998; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
8999; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
9000; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
9001; GFX9-NEXT:    s_ashr_i32 s2, s5, 31
9002; GFX9-NEXT:    s_lshr_b32 s2, s2, 20
9003; GFX9-NEXT:    v_mul_hi_u32 v2, v0, s8
9004; GFX9-NEXT:    v_mul_lo_u32 v3, v1, s8
9005; GFX9-NEXT:    v_mul_lo_u32 v5, v0, s8
9006; GFX9-NEXT:    s_add_u32 s4, s4, s2
9007; GFX9-NEXT:    s_addc_u32 s5, s5, 0
9008; GFX9-NEXT:    v_add_u32_e32 v2, v2, v3
9009; GFX9-NEXT:    v_sub_u32_e32 v2, v2, v0
9010; GFX9-NEXT:    v_mul_lo_u32 v3, v0, v2
9011; GFX9-NEXT:    v_mul_hi_u32 v6, v0, v5
9012; GFX9-NEXT:    v_mul_hi_u32 v7, v0, v2
9013; GFX9-NEXT:    v_mul_hi_u32 v8, v1, v2
9014; GFX9-NEXT:    v_mul_lo_u32 v2, v1, v2
9015; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v6, v3
9016; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, 0, v7, vcc
9017; GFX9-NEXT:    v_mul_lo_u32 v7, v1, v5
9018; GFX9-NEXT:    v_mul_hi_u32 v5, v1, v5
9019; GFX9-NEXT:    s_ashr_i64 s[4:5], s[4:5], 12
9020; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v7
9021; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v6, v5, vcc
9022; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v8, v4, vcc
9023; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
9024; GFX9-NEXT:    v_mov_b32_e32 v6, 0
9025; GFX9-NEXT:    v_add_co_u32_e64 v0, s[2:3], v0, v2
9026; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v6, v5, vcc
9027; GFX9-NEXT:    v_addc_co_u32_e64 v2, vcc, v1, v3, s[2:3]
9028; GFX9-NEXT:    v_mul_lo_u32 v5, v2, s8
9029; GFX9-NEXT:    v_mul_hi_u32 v7, v0, s8
9030; GFX9-NEXT:    v_mul_lo_u32 v8, v0, s8
9031; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
9032; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x24
9033; GFX9-NEXT:    v_add_u32_e32 v5, v7, v5
9034; GFX9-NEXT:    v_sub_u32_e32 v5, v5, v0
9035; GFX9-NEXT:    v_mul_lo_u32 v10, v0, v5
9036; GFX9-NEXT:    v_mul_hi_u32 v11, v0, v8
9037; GFX9-NEXT:    v_mul_hi_u32 v12, v0, v5
9038; GFX9-NEXT:    v_mul_hi_u32 v9, v2, v8
9039; GFX9-NEXT:    v_mul_lo_u32 v8, v2, v8
9040; GFX9-NEXT:    v_add_co_u32_e32 v10, vcc, v11, v10
9041; GFX9-NEXT:    v_mul_hi_u32 v7, v2, v5
9042; GFX9-NEXT:    v_addc_co_u32_e32 v11, vcc, 0, v12, vcc
9043; GFX9-NEXT:    v_mul_lo_u32 v2, v2, v5
9044; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, v10, v8
9045; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, v11, v9, vcc
9046; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v7, v4, vcc
9047; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v8, v2
9048; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v6, v5, vcc
9049; GFX9-NEXT:    v_addc_co_u32_e64 v1, vcc, v1, v5, s[2:3]
9050; GFX9-NEXT:    s_ashr_i32 s2, s7, 31
9051; GFX9-NEXT:    s_add_u32 s6, s6, s2
9052; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
9053; GFX9-NEXT:    s_mov_b32 s3, s2
9054; GFX9-NEXT:    s_addc_u32 s7, s7, s2
9055; GFX9-NEXT:    s_xor_b64 s[6:7], s[6:7], s[2:3]
9056; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
9057; GFX9-NEXT:    v_mul_lo_u32 v2, s6, v1
9058; GFX9-NEXT:    v_mul_hi_u32 v3, s6, v0
9059; GFX9-NEXT:    v_mul_hi_u32 v5, s6, v1
9060; GFX9-NEXT:    v_mul_hi_u32 v7, s7, v1
9061; GFX9-NEXT:    v_mul_lo_u32 v1, s7, v1
9062; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
9063; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v5, vcc
9064; GFX9-NEXT:    v_mul_lo_u32 v5, s7, v0
9065; GFX9-NEXT:    v_mul_hi_u32 v0, s7, v0
9066; GFX9-NEXT:    s_movk_i32 s0, 0xfff
9067; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v5
9068; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, v3, v0, vcc
9069; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, v7, v4, vcc
9070; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v1
9071; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v6, v2, vcc
9072; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, 2, v0
9073; GFX9-NEXT:    v_mul_lo_u32 v5, v1, s0
9074; GFX9-NEXT:    v_mul_hi_u32 v6, v0, s0
9075; GFX9-NEXT:    v_mul_lo_u32 v9, v0, s0
9076; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
9077; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, 1, v0
9078; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, 0, v1, vcc
9079; GFX9-NEXT:    v_add_u32_e32 v5, v6, v5
9080; GFX9-NEXT:    v_mov_b32_e32 v6, s7
9081; GFX9-NEXT:    v_sub_co_u32_e32 v9, vcc, s6, v9
9082; GFX9-NEXT:    v_subb_co_u32_e32 v5, vcc, v6, v5, vcc
9083; GFX9-NEXT:    v_subrev_co_u32_e32 v6, vcc, s0, v9
9084; GFX9-NEXT:    v_subbrev_co_u32_e32 v10, vcc, 0, v5, vcc
9085; GFX9-NEXT:    s_movk_i32 s0, 0xffe
9086; GFX9-NEXT:    v_cmp_lt_u32_e32 vcc, s0, v6
9087; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
9088; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v10
9089; GFX9-NEXT:    v_cndmask_b32_e32 v6, -1, v6, vcc
9090; GFX9-NEXT:    v_cmp_lt_u32_e64 s[0:1], s0, v9
9091; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
9092; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[0:1]
9093; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v5
9094; GFX9-NEXT:    v_cndmask_b32_e64 v5, -1, v6, s[0:1]
9095; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v5
9096; GFX9-NEXT:    v_cndmask_b32_e32 v2, v7, v2, vcc
9097; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
9098; GFX9-NEXT:    v_cndmask_b32_e32 v3, v8, v3, vcc
9099; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
9100; GFX9-NEXT:    v_xor_b32_e32 v0, s2, v0
9101; GFX9-NEXT:    v_subrev_co_u32_e32 v2, vcc, s2, v0
9102; GFX9-NEXT:    v_xor_b32_e32 v1, s2, v1
9103; GFX9-NEXT:    v_mov_b32_e32 v3, s2
9104; GFX9-NEXT:    v_subb_co_u32_e32 v3, vcc, v1, v3, vcc
9105; GFX9-NEXT:    v_mov_b32_e32 v0, s4
9106; GFX9-NEXT:    v_mov_b32_e32 v1, s5
9107; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
9108; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[8:9]
9109; GFX9-NEXT:    s_endpgm
  ; Lane 0: power-of-two divisor (4096); lane 1: non-power-of-two divisor (4095).
9110  %r = sdiv <2 x i64> %x, <i64 4096, i64 4095>
9111  store <2 x i64> %r, <2 x i64> addrspace(1)* %out
9112  ret void
9113}
9114
9115define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x, <2 x i64> %y) {
9116; CHECK-LABEL: @sdiv_v2i64_pow2_shl_denom(
9117; CHECK-NEXT:    [[SHL_Y:%.*]] = shl <2 x i64> <i64 4096, i64 4096>, [[Y:%.*]]
9118; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0
9119; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 0
9120; CHECK-NEXT:    [[TMP3:%.*]] = sdiv i64 [[TMP1]], [[TMP2]]
9121; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x i64> undef, i64 [[TMP3]], i64 0
9122; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[X]], i64 1
9123; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 1
9124; CHECK-NEXT:    [[TMP7:%.*]] = sdiv i64 [[TMP5]], [[TMP6]]
9125; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[TMP7]], i64 1
9126; CHECK-NEXT:    store <2 x i64> [[TMP8]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16
9127; CHECK-NEXT:    ret void
9128;
9129; GFX6-LABEL: sdiv_v2i64_pow2_shl_denom:
9130; GFX6:       ; %bb.0:
9131; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x11
9132; GFX6-NEXT:    s_mov_b64 s[2:3], 0x1000
9133; GFX6-NEXT:    s_mov_b32 s18, 0x4f800000
9134; GFX6-NEXT:    s_mov_b32 s19, 0x5f7ffffc
9135; GFX6-NEXT:    s_mov_b32 s20, 0x2f800000
9136; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
9137; GFX6-NEXT:    s_lshl_b64 s[12:13], s[2:3], s6
9138; GFX6-NEXT:    s_lshl_b64 s[2:3], s[2:3], s4
9139; GFX6-NEXT:    s_ashr_i32 s16, s3, 31
9140; GFX6-NEXT:    s_add_u32 s2, s2, s16
9141; GFX6-NEXT:    s_mov_b32 s17, s16
9142; GFX6-NEXT:    s_addc_u32 s3, s3, s16
9143; GFX6-NEXT:    s_xor_b64 s[14:15], s[2:3], s[16:17]
9144; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s14
9145; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s15
9146; GFX6-NEXT:    s_mov_b32 s21, 0xcf800000
9147; GFX6-NEXT:    s_sub_u32 s6, 0, s14
9148; GFX6-NEXT:    s_subb_u32 s7, 0, s15
9149; GFX6-NEXT:    v_mac_f32_e32 v0, s18, v1
9150; GFX6-NEXT:    v_rcp_f32_e32 v0, v0
9151; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
9152; GFX6-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0xd
9153; GFX6-NEXT:    v_mul_f32_e32 v0, s19, v0
9154; GFX6-NEXT:    v_mul_f32_e32 v1, s20, v0
9155; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
9156; GFX6-NEXT:    v_mac_f32_e32 v0, s21, v1
9157; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
9158; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
9159; GFX6-NEXT:    v_mul_hi_u32 v3, s6, v0
9160; GFX6-NEXT:    v_mul_lo_u32 v2, s6, v1
9161; GFX6-NEXT:    v_mul_lo_u32 v4, s7, v0
9162; GFX6-NEXT:    v_mul_lo_u32 v5, s6, v0
9163; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
9164; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
9165; GFX6-NEXT:    v_mul_lo_u32 v3, v0, v2
9166; GFX6-NEXT:    v_mul_hi_u32 v4, v0, v5
9167; GFX6-NEXT:    v_mul_hi_u32 v6, v0, v2
9168; GFX6-NEXT:    v_mul_hi_u32 v7, v1, v2
9169; GFX6-NEXT:    v_mul_lo_u32 v2, v1, v2
9170; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
9171; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, 0, v6, vcc
9172; GFX6-NEXT:    v_mul_lo_u32 v6, v1, v5
9173; GFX6-NEXT:    v_mul_hi_u32 v5, v1, v5
9174; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
9175; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v4, v5, vcc
9176; GFX6-NEXT:    v_mov_b32_e32 v4, 0
9177; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v7, v4, vcc
9178; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
9179; GFX6-NEXT:    v_mov_b32_e32 v6, 0
9180; GFX6-NEXT:    v_add_i32_e64 v0, s[2:3], v0, v2
9181; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v6, v5, vcc
9182; GFX6-NEXT:    v_addc_u32_e64 v2, vcc, v1, v3, s[2:3]
9183; GFX6-NEXT:    v_mul_lo_u32 v5, s6, v2
9184; GFX6-NEXT:    v_mul_hi_u32 v7, s6, v0
9185; GFX6-NEXT:    v_mul_lo_u32 v8, s7, v0
9186; GFX6-NEXT:    s_mov_b32 s7, 0xf000
9187; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
9188; GFX6-NEXT:    v_mul_lo_u32 v7, s6, v0
9189; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
9190; GFX6-NEXT:    v_mul_lo_u32 v10, v0, v5
9191; GFX6-NEXT:    v_mul_hi_u32 v12, v0, v5
9192; GFX6-NEXT:    v_mul_hi_u32 v11, v0, v7
9193; GFX6-NEXT:    v_mul_hi_u32 v9, v2, v7
9194; GFX6-NEXT:    v_mul_lo_u32 v7, v2, v7
9195; GFX6-NEXT:    v_mul_hi_u32 v8, v2, v5
9196; GFX6-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
9197; GFX6-NEXT:    v_addc_u32_e32 v11, vcc, 0, v12, vcc
9198; GFX6-NEXT:    v_mul_lo_u32 v2, v2, v5
9199; GFX6-NEXT:    v_add_i32_e32 v7, vcc, v10, v7
9200; GFX6-NEXT:    v_addc_u32_e32 v7, vcc, v11, v9, vcc
9201; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v8, v4, vcc
9202; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v7, v2
9203; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v6, v5, vcc
9204; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
9205; GFX6-NEXT:    v_addc_u32_e64 v1, vcc, v1, v5, s[2:3]
9206; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
9207; GFX6-NEXT:    s_ashr_i32 s2, s9, 31
9208; GFX6-NEXT:    s_add_u32 s0, s8, s2
9209; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
9210; GFX6-NEXT:    s_mov_b32 s3, s2
9211; GFX6-NEXT:    s_addc_u32 s1, s9, s2
9212; GFX6-NEXT:    s_xor_b64 s[8:9], s[0:1], s[2:3]
9213; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
9214; GFX6-NEXT:    v_mul_lo_u32 v2, s8, v1
9215; GFX6-NEXT:    v_mul_hi_u32 v3, s8, v0
9216; GFX6-NEXT:    v_mul_hi_u32 v5, s8, v1
9217; GFX6-NEXT:    v_mul_hi_u32 v7, s9, v1
9218; GFX6-NEXT:    v_mul_lo_u32 v1, s9, v1
9219; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
9220; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
9221; GFX6-NEXT:    v_mul_lo_u32 v5, s9, v0
9222; GFX6-NEXT:    v_mul_hi_u32 v0, s9, v0
9223; GFX6-NEXT:    s_xor_b64 s[2:3], s[2:3], s[16:17]
9224; GFX6-NEXT:    s_mov_b32 s6, -1
9225; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
9226; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, v3, v0, vcc
9227; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, v7, v4, vcc
9228; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
9229; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v6, v2, vcc
9230; GFX6-NEXT:    v_mul_lo_u32 v2, s14, v1
9231; GFX6-NEXT:    v_mul_hi_u32 v3, s14, v0
9232; GFX6-NEXT:    v_mul_lo_u32 v5, s15, v0
9233; GFX6-NEXT:    v_mov_b32_e32 v7, s15
9234; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
9235; GFX6-NEXT:    v_mul_lo_u32 v3, s14, v0
9236; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
9237; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, s9, v2
9238; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s8, v3
9239; GFX6-NEXT:    v_subb_u32_e64 v5, s[0:1], v5, v7, vcc
9240; GFX6-NEXT:    v_subrev_i32_e64 v7, s[0:1], s14, v3
9241; GFX6-NEXT:    v_subbrev_u32_e64 v5, s[0:1], 0, v5, s[0:1]
9242; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s15, v5
9243; GFX6-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[0:1]
9244; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s14, v7
9245; GFX6-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[0:1]
9246; GFX6-NEXT:    v_cmp_eq_u32_e64 s[0:1], s15, v5
9247; GFX6-NEXT:    v_cndmask_b32_e64 v5, v8, v7, s[0:1]
9248; GFX6-NEXT:    v_add_i32_e64 v7, s[0:1], 2, v0
9249; GFX6-NEXT:    v_addc_u32_e64 v8, s[0:1], 0, v1, s[0:1]
9250; GFX6-NEXT:    v_add_i32_e64 v9, s[0:1], 1, v0
9251; GFX6-NEXT:    v_addc_u32_e64 v10, s[0:1], 0, v1, s[0:1]
9252; GFX6-NEXT:    s_ashr_i32 s8, s13, 31
9253; GFX6-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v5
9254; GFX6-NEXT:    s_add_u32 s12, s12, s8
9255; GFX6-NEXT:    v_cndmask_b32_e64 v5, v10, v8, s[0:1]
9256; GFX6-NEXT:    v_mov_b32_e32 v8, s9
9257; GFX6-NEXT:    s_mov_b32 s9, s8
9258; GFX6-NEXT:    s_addc_u32 s13, s13, s8
9259; GFX6-NEXT:    s_xor_b64 s[12:13], s[12:13], s[8:9]
9260; GFX6-NEXT:    v_cvt_f32_u32_e32 v10, s12
9261; GFX6-NEXT:    v_cvt_f32_u32_e32 v11, s13
9262; GFX6-NEXT:    v_subb_u32_e32 v2, vcc, v8, v2, vcc
9263; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s15, v2
9264; GFX6-NEXT:    v_cndmask_b32_e64 v8, 0, -1, vcc
9265; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s14, v3
9266; GFX6-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
9267; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, s15, v2
9268; GFX6-NEXT:    v_mac_f32_e32 v10, s18, v11
9269; GFX6-NEXT:    v_cndmask_b32_e32 v2, v8, v3, vcc
9270; GFX6-NEXT:    v_rcp_f32_e32 v3, v10
9271; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
9272; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
9273; GFX6-NEXT:    s_sub_u32 s14, 0, s12
9274; GFX6-NEXT:    v_mul_f32_e32 v3, s19, v3
9275; GFX6-NEXT:    v_mul_f32_e32 v5, s20, v3
9276; GFX6-NEXT:    v_trunc_f32_e32 v5, v5
9277; GFX6-NEXT:    v_mac_f32_e32 v3, s21, v5
9278; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v3
9279; GFX6-NEXT:    v_cvt_u32_f32_e32 v5, v5
9280; GFX6-NEXT:    v_cndmask_b32_e64 v2, v9, v7, s[0:1]
9281; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
9282; GFX6-NEXT:    v_mul_hi_u32 v2, s14, v3
9283; GFX6-NEXT:    v_mul_lo_u32 v7, s14, v5
9284; GFX6-NEXT:    s_subb_u32 s15, 0, s13
9285; GFX6-NEXT:    v_mul_lo_u32 v8, s15, v3
9286; GFX6-NEXT:    v_xor_b32_e32 v0, s2, v0
9287; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v7
9288; GFX6-NEXT:    v_mul_lo_u32 v7, s14, v3
9289; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v8
9290; GFX6-NEXT:    v_mul_lo_u32 v8, v3, v2
9291; GFX6-NEXT:    v_mul_hi_u32 v10, v3, v2
9292; GFX6-NEXT:    v_mul_hi_u32 v9, v3, v7
9293; GFX6-NEXT:    v_mul_hi_u32 v11, v5, v2
9294; GFX6-NEXT:    v_mul_lo_u32 v2, v5, v2
9295; GFX6-NEXT:    v_xor_b32_e32 v1, s3, v1
9296; GFX6-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
9297; GFX6-NEXT:    v_addc_u32_e32 v9, vcc, 0, v10, vcc
9298; GFX6-NEXT:    v_mul_lo_u32 v10, v5, v7
9299; GFX6-NEXT:    v_mul_hi_u32 v7, v5, v7
9300; GFX6-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
9301; GFX6-NEXT:    v_addc_u32_e32 v7, vcc, v9, v7, vcc
9302; GFX6-NEXT:    v_addc_u32_e32 v8, vcc, v11, v4, vcc
9303; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v7, v2
9304; GFX6-NEXT:    v_add_i32_e64 v2, s[0:1], v3, v2
9305; GFX6-NEXT:    v_addc_u32_e32 v7, vcc, v6, v8, vcc
9306; GFX6-NEXT:    v_addc_u32_e64 v3, vcc, v5, v7, s[0:1]
9307; GFX6-NEXT:    v_mul_lo_u32 v8, s14, v3
9308; GFX6-NEXT:    v_mul_hi_u32 v9, s14, v2
9309; GFX6-NEXT:    v_mul_lo_u32 v10, s15, v2
9310; GFX6-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
9311; GFX6-NEXT:    v_mul_lo_u32 v9, s14, v2
9312; GFX6-NEXT:    v_add_i32_e32 v8, vcc, v10, v8
9313; GFX6-NEXT:    v_mul_lo_u32 v12, v2, v8
9314; GFX6-NEXT:    v_mul_hi_u32 v14, v2, v8
9315; GFX6-NEXT:    v_mul_hi_u32 v13, v2, v9
9316; GFX6-NEXT:    v_mul_hi_u32 v11, v3, v9
9317; GFX6-NEXT:    v_mul_lo_u32 v9, v3, v9
9318; GFX6-NEXT:    v_mul_hi_u32 v10, v3, v8
9319; GFX6-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
9320; GFX6-NEXT:    v_addc_u32_e32 v13, vcc, 0, v14, vcc
9321; GFX6-NEXT:    v_mul_lo_u32 v3, v3, v8
9322; GFX6-NEXT:    v_add_i32_e32 v9, vcc, v12, v9
9323; GFX6-NEXT:    v_addc_u32_e32 v9, vcc, v13, v11, vcc
9324; GFX6-NEXT:    v_addc_u32_e32 v8, vcc, v10, v4, vcc
9325; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v9, v3
9326; GFX6-NEXT:    v_addc_u32_e32 v8, vcc, v6, v8, vcc
9327; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
9328; GFX6-NEXT:    s_ashr_i32 s14, s11, 31
9329; GFX6-NEXT:    v_addc_u32_e64 v5, vcc, v5, v8, s[0:1]
9330; GFX6-NEXT:    s_add_u32 s0, s10, s14
9331; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
9332; GFX6-NEXT:    s_mov_b32 s15, s14
9333; GFX6-NEXT:    s_addc_u32 s1, s11, s14
9334; GFX6-NEXT:    s_xor_b64 s[10:11], s[0:1], s[14:15]
9335; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
9336; GFX6-NEXT:    v_mul_lo_u32 v5, s10, v3
9337; GFX6-NEXT:    v_mul_hi_u32 v7, s10, v2
9338; GFX6-NEXT:    v_mul_hi_u32 v9, s10, v3
9339; GFX6-NEXT:    v_mul_hi_u32 v10, s11, v3
9340; GFX6-NEXT:    v_mul_lo_u32 v3, s11, v3
9341; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
9342; GFX6-NEXT:    v_addc_u32_e32 v7, vcc, 0, v9, vcc
9343; GFX6-NEXT:    v_mul_lo_u32 v9, s11, v2
9344; GFX6-NEXT:    v_mul_hi_u32 v2, s11, v2
9345; GFX6-NEXT:    v_mov_b32_e32 v8, s3
9346; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v5, v9
9347; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, v7, v2, vcc
9348; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, v10, v4, vcc
9349; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
9350; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v6, v4, vcc
9351; GFX6-NEXT:    v_mul_lo_u32 v4, s12, v3
9352; GFX6-NEXT:    v_mul_hi_u32 v5, s12, v2
9353; GFX6-NEXT:    v_mul_lo_u32 v6, s13, v2
9354; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s2, v0
9355; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v1, v8, vcc
9356; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
9357; GFX6-NEXT:    v_mul_lo_u32 v5, s12, v2
9358; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
9359; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, s11, v4
9360; GFX6-NEXT:    v_mov_b32_e32 v7, s13
9361; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, s10, v5
9362; GFX6-NEXT:    v_subb_u32_e64 v6, s[0:1], v6, v7, vcc
9363; GFX6-NEXT:    v_subrev_i32_e64 v7, s[0:1], s12, v5
9364; GFX6-NEXT:    v_subbrev_u32_e64 v6, s[0:1], 0, v6, s[0:1]
9365; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s13, v6
9366; GFX6-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[0:1]
9367; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s12, v7
9368; GFX6-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[0:1]
9369; GFX6-NEXT:    v_cmp_eq_u32_e64 s[0:1], s13, v6
9370; GFX6-NEXT:    v_cndmask_b32_e64 v6, v8, v7, s[0:1]
9371; GFX6-NEXT:    v_add_i32_e64 v7, s[0:1], 2, v2
9372; GFX6-NEXT:    v_addc_u32_e64 v8, s[0:1], 0, v3, s[0:1]
9373; GFX6-NEXT:    v_add_i32_e64 v9, s[0:1], 1, v2
9374; GFX6-NEXT:    v_addc_u32_e64 v10, s[0:1], 0, v3, s[0:1]
9375; GFX6-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v6
9376; GFX6-NEXT:    v_cndmask_b32_e64 v6, v10, v8, s[0:1]
9377; GFX6-NEXT:    v_mov_b32_e32 v8, s11
9378; GFX6-NEXT:    v_subb_u32_e32 v4, vcc, v8, v4, vcc
9379; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s13, v4
9380; GFX6-NEXT:    v_cndmask_b32_e64 v8, 0, -1, vcc
9381; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s12, v5
9382; GFX6-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
9383; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, s13, v4
9384; GFX6-NEXT:    v_cndmask_b32_e32 v4, v8, v5, vcc
9385; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
9386; GFX6-NEXT:    v_cndmask_b32_e64 v4, v9, v7, s[0:1]
9387; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
9388; GFX6-NEXT:    s_xor_b64 s[0:1], s[14:15], s[8:9]
9389; GFX6-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
9390; GFX6-NEXT:    v_xor_b32_e32 v2, s0, v2
9391; GFX6-NEXT:    v_xor_b32_e32 v3, s1, v3
9392; GFX6-NEXT:    v_mov_b32_e32 v4, s1
9393; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s0, v2
9394; GFX6-NEXT:    v_subb_u32_e32 v3, vcc, v3, v4, vcc
9395; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
9396; GFX6-NEXT:    s_endpgm
9397;
9398; GFX9-LABEL: sdiv_v2i64_pow2_shl_denom:
9399; GFX9:       ; %bb.0:
9400; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x44
9401; GFX9-NEXT:    s_mov_b64 s[2:3], 0x1000
9402; GFX9-NEXT:    s_mov_b32 s16, 0x4f800000
9403; GFX9-NEXT:    s_mov_b32 s17, 0x5f7ffffc
9404; GFX9-NEXT:    s_mov_b32 s18, 0x2f800000
9405; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
9406; GFX9-NEXT:    s_lshl_b64 s[8:9], s[2:3], s6
9407; GFX9-NEXT:    s_lshl_b64 s[2:3], s[2:3], s4
9408; GFX9-NEXT:    s_ashr_i32 s12, s3, 31
9409; GFX9-NEXT:    s_add_u32 s2, s2, s12
9410; GFX9-NEXT:    s_mov_b32 s13, s12
9411; GFX9-NEXT:    s_addc_u32 s3, s3, s12
9412; GFX9-NEXT:    s_xor_b64 s[10:11], s[2:3], s[12:13]
9413; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s10
9414; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s11
9415; GFX9-NEXT:    s_mov_b32 s19, 0xcf800000
9416; GFX9-NEXT:    s_sub_u32 s14, 0, s10
9417; GFX9-NEXT:    s_subb_u32 s4, 0, s11
9418; GFX9-NEXT:    v_mac_f32_e32 v0, s16, v1
9419; GFX9-NEXT:    v_rcp_f32_e32 v0, v0
9420; GFX9-NEXT:    v_mov_b32_e32 v6, 0
9421; GFX9-NEXT:    v_mul_f32_e32 v0, s17, v0
9422; GFX9-NEXT:    v_mul_f32_e32 v1, s18, v0
9423; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
9424; GFX9-NEXT:    v_mac_f32_e32 v0, s19, v1
9425; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
9426; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
9427; GFX9-NEXT:    v_mul_hi_u32 v3, s14, v0
9428; GFX9-NEXT:    v_mul_lo_u32 v2, s14, v1
9429; GFX9-NEXT:    v_mul_lo_u32 v5, s4, v0
9430; GFX9-NEXT:    v_mul_lo_u32 v4, s14, v0
9431; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
9432; GFX9-NEXT:    v_add_u32_e32 v2, v2, v5
9433; GFX9-NEXT:    v_mul_hi_u32 v3, v0, v4
9434; GFX9-NEXT:    v_mul_lo_u32 v5, v0, v2
9435; GFX9-NEXT:    v_mul_hi_u32 v7, v0, v2
9436; GFX9-NEXT:    v_mul_hi_u32 v8, v1, v2
9437; GFX9-NEXT:    v_mul_lo_u32 v2, v1, v2
9438; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v5
9439; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v7, vcc
9440; GFX9-NEXT:    v_mul_lo_u32 v7, v1, v4
9441; GFX9-NEXT:    v_mul_hi_u32 v4, v1, v4
9442; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v7
9443; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v5, v4, vcc
9444; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v8, v6, vcc
9445; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
9446; GFX9-NEXT:    v_mov_b32_e32 v5, 0
9447; GFX9-NEXT:    v_add_co_u32_e64 v0, s[2:3], v0, v2
9448; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v5, v4, vcc
9449; GFX9-NEXT:    v_addc_co_u32_e64 v2, vcc, v1, v3, s[2:3]
9450; GFX9-NEXT:    v_mul_lo_u32 v4, s14, v2
9451; GFX9-NEXT:    v_mul_hi_u32 v7, s14, v0
9452; GFX9-NEXT:    v_mul_lo_u32 v8, s4, v0
9453; GFX9-NEXT:    v_mul_lo_u32 v9, s14, v0
9454; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
9455; GFX9-NEXT:    v_add_u32_e32 v4, v7, v4
9456; GFX9-NEXT:    v_add_u32_e32 v4, v4, v8
9457; GFX9-NEXT:    v_mul_lo_u32 v10, v0, v4
9458; GFX9-NEXT:    v_mul_hi_u32 v11, v0, v9
9459; GFX9-NEXT:    v_mul_hi_u32 v12, v0, v4
9460; GFX9-NEXT:    v_mul_hi_u32 v8, v2, v9
9461; GFX9-NEXT:    v_mul_lo_u32 v9, v2, v9
9462; GFX9-NEXT:    v_add_co_u32_e32 v10, vcc, v11, v10
9463; GFX9-NEXT:    v_mul_hi_u32 v7, v2, v4
9464; GFX9-NEXT:    v_addc_co_u32_e32 v11, vcc, 0, v12, vcc
9465; GFX9-NEXT:    v_mul_lo_u32 v2, v2, v4
9466; GFX9-NEXT:    v_add_co_u32_e32 v9, vcc, v10, v9
9467; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, v11, v8, vcc
9468; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v7, v6, vcc
9469; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v8, v2
9470; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
9471; GFX9-NEXT:    s_ashr_i32 s14, s5, 31
9472; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v5, v4, vcc
9473; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
9474; GFX9-NEXT:    v_addc_co_u32_e64 v1, vcc, v1, v4, s[2:3]
9475; GFX9-NEXT:    s_add_u32 s2, s4, s14
9476; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
9477; GFX9-NEXT:    s_mov_b32 s15, s14
9478; GFX9-NEXT:    s_addc_u32 s3, s5, s14
9479; GFX9-NEXT:    s_xor_b64 s[4:5], s[2:3], s[14:15]
9480; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
9481; GFX9-NEXT:    v_mul_lo_u32 v2, s4, v1
9482; GFX9-NEXT:    v_mul_hi_u32 v3, s4, v0
9483; GFX9-NEXT:    v_mul_hi_u32 v4, s4, v1
9484; GFX9-NEXT:    v_mul_hi_u32 v7, s5, v1
9485; GFX9-NEXT:    v_mul_lo_u32 v1, s5, v1
9486; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
9487; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
9488; GFX9-NEXT:    v_mul_lo_u32 v4, s5, v0
9489; GFX9-NEXT:    v_mul_hi_u32 v0, s5, v0
9490; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
9491; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
9492; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, v3, v0, vcc
9493; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, v7, v6, vcc
9494; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v1
9495; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v5, v2, vcc
9496; GFX9-NEXT:    v_mul_lo_u32 v2, s10, v1
9497; GFX9-NEXT:    v_mul_hi_u32 v3, s10, v0
9498; GFX9-NEXT:    v_mul_lo_u32 v4, s11, v0
9499; GFX9-NEXT:    v_mov_b32_e32 v7, s11
9500; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
9501; GFX9-NEXT:    v_mul_lo_u32 v3, s10, v0
9502; GFX9-NEXT:    v_add_u32_e32 v2, v2, v4
9503; GFX9-NEXT:    v_sub_u32_e32 v4, s5, v2
9504; GFX9-NEXT:    v_sub_co_u32_e32 v3, vcc, s4, v3
9505; GFX9-NEXT:    v_subb_co_u32_e64 v4, s[0:1], v4, v7, vcc
9506; GFX9-NEXT:    v_subrev_co_u32_e64 v7, s[0:1], s10, v3
9507; GFX9-NEXT:    v_subbrev_co_u32_e64 v4, s[0:1], 0, v4, s[0:1]
9508; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s11, v4
9509; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[0:1]
9510; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s10, v7
9511; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[0:1]
9512; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s11, v4
9513; GFX9-NEXT:    v_cndmask_b32_e64 v4, v8, v7, s[0:1]
9514; GFX9-NEXT:    v_add_co_u32_e64 v7, s[0:1], 2, v0
9515; GFX9-NEXT:    v_addc_co_u32_e64 v8, s[0:1], 0, v1, s[0:1]
9516; GFX9-NEXT:    v_add_co_u32_e64 v9, s[0:1], 1, v0
9517; GFX9-NEXT:    v_addc_co_u32_e64 v10, s[0:1], 0, v1, s[0:1]
9518; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v4
9519; GFX9-NEXT:    v_cndmask_b32_e64 v4, v10, v8, s[0:1]
9520; GFX9-NEXT:    v_mov_b32_e32 v8, s5
9521; GFX9-NEXT:    s_xor_b64 s[4:5], s[14:15], s[12:13]
9522; GFX9-NEXT:    s_ashr_i32 s12, s9, 31
9523; GFX9-NEXT:    s_add_u32 s8, s8, s12
9524; GFX9-NEXT:    s_mov_b32 s13, s12
9525; GFX9-NEXT:    s_addc_u32 s9, s9, s12
9526; GFX9-NEXT:    s_xor_b64 s[8:9], s[8:9], s[12:13]
9527; GFX9-NEXT:    v_cvt_f32_u32_e32 v10, s8
9528; GFX9-NEXT:    v_cvt_f32_u32_e32 v11, s9
9529; GFX9-NEXT:    v_subb_co_u32_e32 v2, vcc, v8, v2, vcc
9530; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s11, v2
9531; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, -1, vcc
9532; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s10, v3
9533; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
9534; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, s11, v2
9535; GFX9-NEXT:    v_mac_f32_e32 v10, s16, v11
9536; GFX9-NEXT:    v_cndmask_b32_e32 v2, v8, v3, vcc
9537; GFX9-NEXT:    v_rcp_f32_e32 v3, v10
9538; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
9539; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
9540; GFX9-NEXT:    s_sub_u32 s10, 0, s8
9541; GFX9-NEXT:    v_mul_f32_e32 v3, s17, v3
9542; GFX9-NEXT:    v_mul_f32_e32 v4, s18, v3
9543; GFX9-NEXT:    v_trunc_f32_e32 v4, v4
9544; GFX9-NEXT:    v_mac_f32_e32 v3, s19, v4
9545; GFX9-NEXT:    v_cvt_u32_f32_e32 v4, v4
9546; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v3
9547; GFX9-NEXT:    v_cndmask_b32_e64 v2, v9, v7, s[0:1]
9548; GFX9-NEXT:    s_subb_u32 s11, 0, s9
9549; GFX9-NEXT:    v_mul_lo_u32 v8, s10, v4
9550; GFX9-NEXT:    v_mul_hi_u32 v7, s10, v3
9551; GFX9-NEXT:    v_mul_lo_u32 v9, s11, v3
9552; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
9553; GFX9-NEXT:    v_mul_lo_u32 v2, s10, v3
9554; GFX9-NEXT:    v_add_u32_e32 v7, v7, v8
9555; GFX9-NEXT:    v_add_u32_e32 v7, v7, v9
9556; GFX9-NEXT:    v_mul_lo_u32 v8, v3, v7
9557; GFX9-NEXT:    v_mul_hi_u32 v9, v3, v2
9558; GFX9-NEXT:    v_mul_hi_u32 v10, v3, v7
9559; GFX9-NEXT:    v_mul_hi_u32 v11, v4, v7
9560; GFX9-NEXT:    v_mul_lo_u32 v7, v4, v7
9561; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, v9, v8
9562; GFX9-NEXT:    v_addc_co_u32_e32 v9, vcc, 0, v10, vcc
9563; GFX9-NEXT:    v_mul_lo_u32 v10, v4, v2
9564; GFX9-NEXT:    v_mul_hi_u32 v2, v4, v2
9565; GFX9-NEXT:    v_xor_b32_e32 v0, s4, v0
9566; GFX9-NEXT:    v_xor_b32_e32 v1, s5, v1
9567; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, v8, v10
9568; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, v9, v2, vcc
9569; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, v11, v6, vcc
9570; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v7
9571; GFX9-NEXT:    v_add_co_u32_e64 v2, s[0:1], v3, v2
9572; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, v5, v8, vcc
9573; GFX9-NEXT:    v_addc_co_u32_e64 v3, vcc, v4, v7, s[0:1]
9574; GFX9-NEXT:    v_mul_lo_u32 v8, s10, v3
9575; GFX9-NEXT:    v_mul_hi_u32 v9, s10, v2
9576; GFX9-NEXT:    v_mul_lo_u32 v10, s11, v2
9577; GFX9-NEXT:    v_mul_lo_u32 v11, s10, v2
9578; GFX9-NEXT:    s_ashr_i32 s10, s7, 31
9579; GFX9-NEXT:    v_add_u32_e32 v8, v9, v8
9580; GFX9-NEXT:    v_add_u32_e32 v8, v8, v10
9581; GFX9-NEXT:    v_mul_lo_u32 v12, v2, v8
9582; GFX9-NEXT:    v_mul_hi_u32 v13, v2, v11
9583; GFX9-NEXT:    v_mul_hi_u32 v14, v2, v8
9584; GFX9-NEXT:    v_mul_hi_u32 v10, v3, v11
9585; GFX9-NEXT:    v_mul_lo_u32 v11, v3, v11
9586; GFX9-NEXT:    v_add_co_u32_e32 v12, vcc, v13, v12
9587; GFX9-NEXT:    v_mul_hi_u32 v9, v3, v8
9588; GFX9-NEXT:    v_addc_co_u32_e32 v13, vcc, 0, v14, vcc
9589; GFX9-NEXT:    v_mul_lo_u32 v3, v3, v8
9590; GFX9-NEXT:    v_add_co_u32_e32 v11, vcc, v12, v11
9591; GFX9-NEXT:    v_addc_co_u32_e32 v10, vcc, v13, v10, vcc
9592; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, v9, v6, vcc
9593; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v10, v3
9594; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, v5, v8, vcc
9595; GFX9-NEXT:    v_add_u32_e32 v4, v4, v7
9596; GFX9-NEXT:    v_addc_co_u32_e64 v4, vcc, v4, v8, s[0:1]
9597; GFX9-NEXT:    s_add_u32 s0, s6, s10
9598; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v3
9599; GFX9-NEXT:    s_mov_b32 s11, s10
9600; GFX9-NEXT:    s_addc_u32 s1, s7, s10
9601; GFX9-NEXT:    s_xor_b64 s[6:7], s[0:1], s[10:11]
9602; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
9603; GFX9-NEXT:    v_mul_lo_u32 v4, s6, v3
9604; GFX9-NEXT:    v_mul_hi_u32 v7, s6, v2
9605; GFX9-NEXT:    v_mul_hi_u32 v9, s6, v3
9606; GFX9-NEXT:    v_mul_hi_u32 v10, s7, v3
9607; GFX9-NEXT:    v_mul_lo_u32 v3, s7, v3
9608; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v7, v4
9609; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v9, vcc
9610; GFX9-NEXT:    v_mul_lo_u32 v9, s7, v2
9611; GFX9-NEXT:    v_mul_hi_u32 v2, s7, v2
9612; GFX9-NEXT:    v_mov_b32_e32 v8, s5
9613; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v9
9614; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, v7, v2, vcc
9615; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v10, v6, vcc
9616; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v3
9617; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v5, v4, vcc
9618; GFX9-NEXT:    v_mul_lo_u32 v4, s8, v3
9619; GFX9-NEXT:    v_mul_hi_u32 v5, s8, v2
9620; GFX9-NEXT:    v_mul_lo_u32 v7, s9, v2
9621; GFX9-NEXT:    v_subrev_co_u32_e32 v0, vcc, s4, v0
9622; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v8, vcc
9623; GFX9-NEXT:    v_add_u32_e32 v4, v5, v4
9624; GFX9-NEXT:    v_mul_lo_u32 v5, s8, v2
9625; GFX9-NEXT:    v_add_u32_e32 v4, v4, v7
9626; GFX9-NEXT:    v_sub_u32_e32 v7, s7, v4
9627; GFX9-NEXT:    v_mov_b32_e32 v8, s9
9628; GFX9-NEXT:    v_sub_co_u32_e32 v5, vcc, s6, v5
9629; GFX9-NEXT:    v_subb_co_u32_e64 v7, s[0:1], v7, v8, vcc
9630; GFX9-NEXT:    v_subrev_co_u32_e64 v8, s[0:1], s8, v5
9631; GFX9-NEXT:    v_subbrev_co_u32_e64 v7, s[0:1], 0, v7, s[0:1]
9632; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s9, v7
9633; GFX9-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[0:1]
9634; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s8, v8
9635; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[0:1]
9636; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s9, v7
9637; GFX9-NEXT:    v_cndmask_b32_e64 v7, v9, v8, s[0:1]
9638; GFX9-NEXT:    v_add_co_u32_e64 v8, s[0:1], 2, v2
9639; GFX9-NEXT:    v_addc_co_u32_e64 v9, s[0:1], 0, v3, s[0:1]
9640; GFX9-NEXT:    v_add_co_u32_e64 v10, s[0:1], 1, v2
9641; GFX9-NEXT:    v_addc_co_u32_e64 v11, s[0:1], 0, v3, s[0:1]
9642; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v7
9643; GFX9-NEXT:    v_cndmask_b32_e64 v7, v11, v9, s[0:1]
9644; GFX9-NEXT:    v_mov_b32_e32 v9, s7
9645; GFX9-NEXT:    v_subb_co_u32_e32 v4, vcc, v9, v4, vcc
9646; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s9, v4
9647; GFX9-NEXT:    v_cndmask_b32_e64 v9, 0, -1, vcc
9648; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s8, v5
9649; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
9650; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, s9, v4
9651; GFX9-NEXT:    v_cndmask_b32_e32 v4, v9, v5, vcc
9652; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
9653; GFX9-NEXT:    v_cndmask_b32_e64 v4, v10, v8, s[0:1]
9654; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
9655; GFX9-NEXT:    s_xor_b64 s[0:1], s[10:11], s[12:13]
9656; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc
9657; GFX9-NEXT:    v_xor_b32_e32 v2, s0, v2
9658; GFX9-NEXT:    v_xor_b32_e32 v3, s1, v3
9659; GFX9-NEXT:    v_mov_b32_e32 v4, s1
9660; GFX9-NEXT:    v_subrev_co_u32_e32 v2, vcc, s0, v2
9661; GFX9-NEXT:    v_subb_co_u32_e32 v3, vcc, v3, v4, vcc
9662; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
9663; GFX9-NEXT:    global_store_dwordx4 v6, v[0:3], s[2:3]
9664; GFX9-NEXT:    s_endpgm
9665  %shl.y = shl <2 x i64> <i64 4096, i64 4096>, %y
9666  %r = sdiv <2 x i64> %x, %shl.y
9667  store <2 x i64> %r, <2 x i64> addrspace(1)* %out
9668  ret void
9669}
9670
; srem_i64_oddk_denom: signed 64-bit remainder of the kernel argument %x by the
; odd constant 1235195 (0x12d8fb); the result is stored through %out.
; The IR-level checks below expect the srem instruction to be left unchanged,
; while the GFX6 and GFX9 assembly checks pin the exact instruction sequence
; emitted for it on each target.
; Constants that appear in the assembly checks: 0x4996c7d8 is the f32 bit
; pattern of 1235195.0, 0xffed2705 is -1235195 as a 32-bit value, and
; 0x12d8fa is 1235195 - 1.
9671define amdgpu_kernel void @srem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
9672; CHECK-LABEL: @srem_i64_oddk_denom(
9673; CHECK-NEXT:    [[R:%.*]] = srem i64 [[X:%.*]], 1235195
9674; CHECK-NEXT:    store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4
9675; CHECK-NEXT:    ret void
9676;
9677; GFX6-LABEL: srem_i64_oddk_denom:
9678; GFX6:       ; %bb.0:
9679; GFX6-NEXT:    v_mov_b32_e32 v0, 0x4f800000
9680; GFX6-NEXT:    v_madak_f32 v0, 0, v0, 0x4996c7d8
9681; GFX6-NEXT:    v_rcp_f32_e32 v0, v0
9682; GFX6-NEXT:    s_mov_b32 s2, 0xffed2705
9683; GFX6-NEXT:    v_mov_b32_e32 v8, 0
9684; GFX6-NEXT:    v_mov_b32_e32 v7, 0
9685; GFX6-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
9686; GFX6-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
9687; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
9688; GFX6-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
9689; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
9690; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
9691; GFX6-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x9
9692; GFX6-NEXT:    s_mov_b32 s7, 0xf000
9693; GFX6-NEXT:    v_mul_hi_u32 v3, v0, s2
9694; GFX6-NEXT:    v_mul_lo_u32 v2, v1, s2
9695; GFX6-NEXT:    v_mul_lo_u32 v4, v0, s2
9696; GFX6-NEXT:    s_mov_b32 s6, -1
9697; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
9698; GFX6-NEXT:    s_mov_b32 s4, s8
9699; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
9700; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, v0, v2
9701; GFX6-NEXT:    v_mul_lo_u32 v5, v0, v2
9702; GFX6-NEXT:    v_mul_hi_u32 v6, v0, v4
9703; GFX6-NEXT:    v_mul_hi_u32 v3, v0, v2
9704; GFX6-NEXT:    v_mul_hi_u32 v9, v1, v2
9705; GFX6-NEXT:    v_mul_lo_u32 v2, v1, v2
9706; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
9707; GFX6-NEXT:    v_mul_lo_u32 v6, v1, v4
9708; GFX6-NEXT:    v_mul_hi_u32 v4, v1, v4
9709; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v8, v3, vcc
9710; GFX6-NEXT:    s_mov_b32 s5, s9
9711; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
9712; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v3, v4, vcc
9713; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, v9, v7, vcc
9714; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
9715; GFX6-NEXT:    v_add_i32_e64 v0, s[0:1], v0, v2
9716; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v8, v4, vcc
9717; GFX6-NEXT:    v_addc_u32_e64 v2, vcc, v1, v3, s[0:1]
9718; GFX6-NEXT:    v_mul_lo_u32 v4, v2, s2
9719; GFX6-NEXT:    v_mul_hi_u32 v5, v0, s2
9720; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
9721; GFX6-NEXT:    v_mul_lo_u32 v5, v0, s2
9722; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, v0, v4
9723; GFX6-NEXT:    v_mul_lo_u32 v10, v0, v4
9724; GFX6-NEXT:    v_mul_hi_u32 v12, v0, v4
9725; GFX6-NEXT:    v_mul_hi_u32 v11, v0, v5
9726; GFX6-NEXT:    v_mul_hi_u32 v9, v2, v5
9727; GFX6-NEXT:    v_mul_lo_u32 v5, v2, v5
9728; GFX6-NEXT:    v_mul_hi_u32 v6, v2, v4
9729; GFX6-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
9730; GFX6-NEXT:    v_addc_u32_e32 v11, vcc, v8, v12, vcc
9731; GFX6-NEXT:    v_mul_lo_u32 v2, v2, v4
9732; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v10, v5
9733; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v11, v9, vcc
9734; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, v6, v7, vcc
9735; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
9736; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, v8, v4, vcc
9737; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
9738; GFX6-NEXT:    s_ashr_i32 s2, s11, 31
9739; GFX6-NEXT:    v_addc_u32_e64 v1, vcc, v1, v4, s[0:1]
9740; GFX6-NEXT:    s_add_u32 s0, s10, s2
9741; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
9742; GFX6-NEXT:    s_mov_b32 s3, s2
9743; GFX6-NEXT:    s_addc_u32 s1, s11, s2
9744; GFX6-NEXT:    s_xor_b64 s[0:1], s[0:1], s[2:3]
9745; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
9746; GFX6-NEXT:    v_mul_lo_u32 v2, s0, v1
9747; GFX6-NEXT:    v_mul_hi_u32 v3, s0, v0
9748; GFX6-NEXT:    v_mul_hi_u32 v4, s0, v1
9749; GFX6-NEXT:    v_mul_hi_u32 v5, s1, v1
9750; GFX6-NEXT:    v_mul_lo_u32 v1, s1, v1
9751; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
9752; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v8, v4, vcc
9753; GFX6-NEXT:    v_mul_lo_u32 v4, s1, v0
9754; GFX6-NEXT:    v_mul_hi_u32 v0, s1, v0
9755; GFX6-NEXT:    s_mov_b32 s3, 0x12d8fb
9756; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
9757; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, v3, v0, vcc
9758; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, v5, v7, vcc
9759; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
9760; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v8, v2, vcc
9761; GFX6-NEXT:    v_mul_hi_u32 v2, v0, s3
9762; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s3
9763; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s3
9764; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
9765; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
9766; GFX6-NEXT:    v_mov_b32_e32 v2, s1
9767; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v2, v1, vcc
9768; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s3, v0
9769; GFX6-NEXT:    v_subbrev_u32_e32 v3, vcc, 0, v1, vcc
9770; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s3, v2
9771; GFX6-NEXT:    v_subbrev_u32_e32 v5, vcc, 0, v3, vcc
9772; GFX6-NEXT:    s_mov_b32 s0, 0x12d8fa
9773; GFX6-NEXT:    v_cmp_lt_u32_e32 vcc, s0, v2
9774; GFX6-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
9775; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
9776; GFX6-NEXT:    v_cndmask_b32_e32 v6, -1, v6, vcc
9777; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
9778; GFX6-NEXT:    v_cmp_lt_u32_e64 s[0:1], s0, v0
9779; GFX6-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
9780; GFX6-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[0:1]
9781; GFX6-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v1
9782; GFX6-NEXT:    v_cndmask_b32_e64 v5, -1, v5, s[0:1]
9783; GFX6-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v5
9784; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
9785; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
9786; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
9787; GFX6-NEXT:    v_xor_b32_e32 v0, s2, v0
9788; GFX6-NEXT:    v_xor_b32_e32 v1, s2, v1
9789; GFX6-NEXT:    v_mov_b32_e32 v2, s2
9790; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s2, v0
9791; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v1, v2, vcc
9792; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
9793; GFX6-NEXT:    s_endpgm
9794;
9795; GFX9-LABEL: srem_i64_oddk_denom:
9796; GFX9:       ; %bb.0:
9797; GFX9-NEXT:    v_mov_b32_e32 v0, 0x4f800000
9798; GFX9-NEXT:    v_madak_f32 v0, 0, v0, 0x4996c7d8
9799; GFX9-NEXT:    v_rcp_f32_e32 v0, v0
9800; GFX9-NEXT:    s_mov_b32 s8, 0xffed2705
9801; GFX9-NEXT:    v_mov_b32_e32 v7, 0
9802; GFX9-NEXT:    v_mov_b32_e32 v5, 0
9803; GFX9-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
9804; GFX9-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
9805; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
9806; GFX9-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
9807; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
9808; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
9809; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
9810; GFX9-NEXT:    v_mul_hi_u32 v3, v0, s8
9811; GFX9-NEXT:    v_mul_lo_u32 v2, v1, s8
9812; GFX9-NEXT:    v_mul_lo_u32 v4, v0, s8
9813; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
9814; GFX9-NEXT:    v_sub_u32_e32 v2, v2, v0
9815; GFX9-NEXT:    v_mul_hi_u32 v3, v0, v4
9816; GFX9-NEXT:    v_mul_lo_u32 v6, v0, v2
9817; GFX9-NEXT:    v_mul_hi_u32 v8, v0, v2
9818; GFX9-NEXT:    v_mul_hi_u32 v9, v1, v2
9819; GFX9-NEXT:    v_mul_lo_u32 v2, v1, v2
9820; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v6
9821; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, v7, v8, vcc
9822; GFX9-NEXT:    v_mul_lo_u32 v8, v1, v4
9823; GFX9-NEXT:    v_mul_hi_u32 v4, v1, v4
9824; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v8
9825; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v6, v4, vcc
9826; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v9, v5, vcc
9827; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
9828; GFX9-NEXT:    v_add_co_u32_e64 v0, s[2:3], v0, v2
9829; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v7, v4, vcc
9830; GFX9-NEXT:    v_addc_co_u32_e64 v2, vcc, v1, v3, s[2:3]
9831; GFX9-NEXT:    v_mul_lo_u32 v4, v2, s8
9832; GFX9-NEXT:    v_mul_hi_u32 v6, v0, s8
9833; GFX9-NEXT:    v_mul_lo_u32 v8, v0, s8
9834; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
9835; GFX9-NEXT:    v_add_u32_e32 v4, v6, v4
9836; GFX9-NEXT:    v_sub_u32_e32 v4, v4, v0
9837; GFX9-NEXT:    v_mul_lo_u32 v10, v0, v4
9838; GFX9-NEXT:    v_mul_hi_u32 v11, v0, v8
9839; GFX9-NEXT:    v_mul_hi_u32 v12, v0, v4
9840; GFX9-NEXT:    v_mul_hi_u32 v9, v2, v8
9841; GFX9-NEXT:    v_mul_lo_u32 v8, v2, v8
9842; GFX9-NEXT:    v_add_co_u32_e32 v10, vcc, v11, v10
9843; GFX9-NEXT:    v_mul_hi_u32 v6, v2, v4
9844; GFX9-NEXT:    v_addc_co_u32_e32 v11, vcc, v7, v12, vcc
9845; GFX9-NEXT:    v_mul_lo_u32 v2, v2, v4
9846; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, v10, v8
9847; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, v11, v9, vcc
9848; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v6, v5, vcc
9849; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v8, v2
9850; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v7, v4, vcc
9851; GFX9-NEXT:    v_addc_co_u32_e64 v1, vcc, v1, v4, s[2:3]
9852; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
9853; GFX9-NEXT:    s_ashr_i32 s2, s7, 31
9854; GFX9-NEXT:    s_add_u32 s0, s6, s2
9855; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
9856; GFX9-NEXT:    s_mov_b32 s3, s2
9857; GFX9-NEXT:    s_addc_u32 s1, s7, s2
9858; GFX9-NEXT:    s_xor_b64 s[0:1], s[0:1], s[2:3]
9859; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
9860; GFX9-NEXT:    v_mul_lo_u32 v2, s0, v1
9861; GFX9-NEXT:    v_mul_hi_u32 v3, s0, v0
9862; GFX9-NEXT:    v_mul_hi_u32 v4, s0, v1
9863; GFX9-NEXT:    v_mul_hi_u32 v6, s1, v1
9864; GFX9-NEXT:    v_mul_lo_u32 v1, s1, v1
9865; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
9866; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v7, v4, vcc
9867; GFX9-NEXT:    v_mul_lo_u32 v4, s1, v0
9868; GFX9-NEXT:    v_mul_hi_u32 v0, s1, v0
9869; GFX9-NEXT:    s_mov_b32 s3, 0x12d8fb
9870; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
9871; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, v3, v0, vcc
9872; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, v6, v5, vcc
9873; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v1
9874; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v7, v2, vcc
9875; GFX9-NEXT:    v_mul_hi_u32 v2, v0, s3
9876; GFX9-NEXT:    v_mul_lo_u32 v1, v1, s3
9877; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s3
9878; GFX9-NEXT:    v_add_u32_e32 v1, v2, v1
9879; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s0, v0
9880; GFX9-NEXT:    v_mov_b32_e32 v2, s1
9881; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v2, v1, vcc
9882; GFX9-NEXT:    v_subrev_co_u32_e32 v2, vcc, s3, v0
9883; GFX9-NEXT:    v_subbrev_co_u32_e32 v3, vcc, 0, v1, vcc
9884; GFX9-NEXT:    v_subrev_co_u32_e32 v4, vcc, s3, v2
9885; GFX9-NEXT:    v_subbrev_co_u32_e32 v6, vcc, 0, v3, vcc
9886; GFX9-NEXT:    s_mov_b32 s0, 0x12d8fa
9887; GFX9-NEXT:    v_cmp_lt_u32_e32 vcc, s0, v2
9888; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
9889; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
9890; GFX9-NEXT:    v_cndmask_b32_e32 v7, -1, v7, vcc
9891; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v7
9892; GFX9-NEXT:    v_cmp_lt_u32_e64 s[0:1], s0, v0
9893; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
9894; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[0:1]
9895; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v1
9896; GFX9-NEXT:    v_cndmask_b32_e64 v6, -1, v6, s[0:1]
9897; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v6
9898; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
9899; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
9900; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
9901; GFX9-NEXT:    v_xor_b32_e32 v0, s2, v0
9902; GFX9-NEXT:    v_xor_b32_e32 v1, s2, v1
9903; GFX9-NEXT:    v_mov_b32_e32 v2, s2
9904; GFX9-NEXT:    v_subrev_co_u32_e32 v0, vcc, s2, v0
9905; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v2, vcc
9906; GFX9-NEXT:    global_store_dwordx2 v5, v[0:1], s[4:5]
9907; GFX9-NEXT:    s_endpgm
; Operation under test: srem by a non-power-of-two constant.
9908  %r = srem i64 %x, 1235195
9909  store i64 %r, i64 addrspace(1)* %out
9910  ret void
9911}
9912
; srem_i64_pow2k_denom: signed 64-bit remainder of the kernel argument %x by the
; power-of-two constant 4096; the result is stored through %out.
; The IR-level checks below expect the srem instruction to be left unchanged.
; In the GFX6 and GFX9 assembly checks the remainder itself is formed with
; scalar instructions only: an arithmetic shift right by 31 of the high dword,
; a logical shift right by 20, a 64-bit add to x, a mask of the low dword with
; 0xfffff000, and a 64-bit subtract from x.
9913define amdgpu_kernel void @srem_i64_pow2k_denom(i64 addrspace(1)* %out, i64 %x) {
9914; CHECK-LABEL: @srem_i64_pow2k_denom(
9915; CHECK-NEXT:    [[R:%.*]] = srem i64 [[X:%.*]], 4096
9916; CHECK-NEXT:    store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4
9917; CHECK-NEXT:    ret void
9918;
9919; GFX6-LABEL: srem_i64_pow2k_denom:
9920; GFX6:       ; %bb.0:
9921; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
9922; GFX6-NEXT:    s_mov_b32 s3, 0xf000
9923; GFX6-NEXT:    s_mov_b32 s2, -1
9924; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
9925; GFX6-NEXT:    s_mov_b32 s0, s4
9926; GFX6-NEXT:    s_ashr_i32 s4, s7, 31
9927; GFX6-NEXT:    s_lshr_b32 s4, s4, 20
9928; GFX6-NEXT:    s_add_u32 s4, s6, s4
9929; GFX6-NEXT:    s_mov_b32 s1, s5
9930; GFX6-NEXT:    s_addc_u32 s5, s7, 0
9931; GFX6-NEXT:    s_and_b32 s4, s4, 0xfffff000
9932; GFX6-NEXT:    s_sub_u32 s4, s6, s4
9933; GFX6-NEXT:    s_subb_u32 s5, s7, s5
9934; GFX6-NEXT:    v_mov_b32_e32 v0, s4
9935; GFX6-NEXT:    v_mov_b32_e32 v1, s5
9936; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
9937; GFX6-NEXT:    s_endpgm
9938;
9939; GFX9-LABEL: srem_i64_pow2k_denom:
9940; GFX9:       ; %bb.0:
9941; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
9942; GFX9-NEXT:    v_mov_b32_e32 v2, 0
9943; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
9944; GFX9-NEXT:    s_ashr_i32 s4, s3, 31
9945; GFX9-NEXT:    s_lshr_b32 s4, s4, 20
9946; GFX9-NEXT:    s_add_u32 s4, s2, s4
9947; GFX9-NEXT:    s_addc_u32 s5, s3, 0
9948; GFX9-NEXT:    s_and_b32 s4, s4, 0xfffff000
9949; GFX9-NEXT:    s_sub_u32 s2, s2, s4
9950; GFX9-NEXT:    s_subb_u32 s3, s3, s5
9951; GFX9-NEXT:    v_mov_b32_e32 v0, s2
9952; GFX9-NEXT:    v_mov_b32_e32 v1, s3
9953; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
9954; GFX9-NEXT:    s_endpgm
; Operation under test: srem by a power-of-two constant.
9955  %r = srem i64 %x, 4096
9956  store i64 %r, i64 addrspace(1)* %out
9957  ret void
9958}
9959
9960define amdgpu_kernel void @srem_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %x, i64 %y) {
9961; CHECK-LABEL: @srem_i64_pow2_shl_denom(
9962; CHECK-NEXT:    [[SHL_Y:%.*]] = shl i64 4096, [[Y:%.*]]
9963; CHECK-NEXT:    [[R:%.*]] = srem i64 [[X:%.*]], [[SHL_Y]]
9964; CHECK-NEXT:    store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4
9965; CHECK-NEXT:    ret void
9966;
9967; GFX6-LABEL: srem_i64_pow2_shl_denom:
9968; GFX6:       ; %bb.0:
9969; GFX6-NEXT:    s_load_dword s4, s[0:1], 0xd
9970; GFX6-NEXT:    s_mov_b64 s[2:3], 0x1000
9971; GFX6-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x9
9972; GFX6-NEXT:    s_mov_b32 s7, 0xf000
9973; GFX6-NEXT:    s_mov_b32 s6, -1
9974; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
9975; GFX6-NEXT:    s_lshl_b64 s[2:3], s[2:3], s4
9976; GFX6-NEXT:    s_ashr_i32 s4, s3, 31
9977; GFX6-NEXT:    s_add_u32 s2, s2, s4
9978; GFX6-NEXT:    s_mov_b32 s5, s4
9979; GFX6-NEXT:    s_addc_u32 s3, s3, s4
9980; GFX6-NEXT:    s_xor_b64 s[12:13], s[2:3], s[4:5]
9981; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s12
9982; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s13
9983; GFX6-NEXT:    s_sub_u32 s2, 0, s12
9984; GFX6-NEXT:    s_subb_u32 s3, 0, s13
9985; GFX6-NEXT:    s_ashr_i32 s14, s11, 31
9986; GFX6-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
9987; GFX6-NEXT:    v_rcp_f32_e32 v0, v0
9988; GFX6-NEXT:    s_mov_b32 s15, s14
9989; GFX6-NEXT:    s_mov_b32 s4, s8
9990; GFX6-NEXT:    s_mov_b32 s5, s9
9991; GFX6-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
9992; GFX6-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
9993; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
9994; GFX6-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
9995; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
9996; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
9997; GFX6-NEXT:    v_mul_hi_u32 v3, s2, v0
9998; GFX6-NEXT:    v_mul_lo_u32 v2, s2, v1
9999; GFX6-NEXT:    v_mul_lo_u32 v5, s3, v0
10000; GFX6-NEXT:    v_mul_lo_u32 v4, s2, v0
10001; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
10002; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
10003; GFX6-NEXT:    v_mul_hi_u32 v3, v0, v4
10004; GFX6-NEXT:    v_mul_lo_u32 v5, v0, v2
10005; GFX6-NEXT:    v_mul_hi_u32 v6, v0, v2
10006; GFX6-NEXT:    v_mul_hi_u32 v7, v1, v2
10007; GFX6-NEXT:    v_mul_lo_u32 v2, v1, v2
10008; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
10009; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, 0, v6, vcc
10010; GFX6-NEXT:    v_mul_lo_u32 v6, v1, v4
10011; GFX6-NEXT:    v_mul_hi_u32 v4, v1, v4
10012; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
10013; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v5, v4, vcc
10014; GFX6-NEXT:    v_mov_b32_e32 v4, 0
10015; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v7, v4, vcc
10016; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
10017; GFX6-NEXT:    v_mov_b32_e32 v6, 0
10018; GFX6-NEXT:    v_add_i32_e64 v0, s[0:1], v0, v2
10019; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v6, v5, vcc
10020; GFX6-NEXT:    v_addc_u32_e64 v2, vcc, v1, v3, s[0:1]
10021; GFX6-NEXT:    v_mul_lo_u32 v5, s2, v2
10022; GFX6-NEXT:    v_mul_hi_u32 v7, s2, v0
10023; GFX6-NEXT:    v_mul_lo_u32 v8, s3, v0
10024; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
10025; GFX6-NEXT:    v_mul_lo_u32 v7, s2, v0
10026; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
10027; GFX6-NEXT:    v_mul_lo_u32 v10, v0, v5
10028; GFX6-NEXT:    v_mul_hi_u32 v12, v0, v5
10029; GFX6-NEXT:    v_mul_hi_u32 v11, v0, v7
10030; GFX6-NEXT:    v_mul_hi_u32 v9, v2, v7
10031; GFX6-NEXT:    v_mul_lo_u32 v7, v2, v7
10032; GFX6-NEXT:    v_mul_hi_u32 v8, v2, v5
10033; GFX6-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
10034; GFX6-NEXT:    v_addc_u32_e32 v11, vcc, 0, v12, vcc
10035; GFX6-NEXT:    v_mul_lo_u32 v2, v2, v5
10036; GFX6-NEXT:    v_add_i32_e32 v7, vcc, v10, v7
10037; GFX6-NEXT:    v_addc_u32_e32 v7, vcc, v11, v9, vcc
10038; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v8, v4, vcc
10039; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v7, v2
10040; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v6, v5, vcc
10041; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
10042; GFX6-NEXT:    v_addc_u32_e64 v1, vcc, v1, v5, s[0:1]
10043; GFX6-NEXT:    s_add_u32 s0, s10, s14
10044; GFX6-NEXT:    s_addc_u32 s1, s11, s14
10045; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
10046; GFX6-NEXT:    s_xor_b64 s[10:11], s[0:1], s[14:15]
10047; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
10048; GFX6-NEXT:    v_mul_lo_u32 v2, s10, v1
10049; GFX6-NEXT:    v_mul_hi_u32 v3, s10, v0
10050; GFX6-NEXT:    v_mul_hi_u32 v5, s10, v1
10051; GFX6-NEXT:    v_mul_hi_u32 v7, s11, v1
10052; GFX6-NEXT:    v_mul_lo_u32 v1, s11, v1
10053; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
10054; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
10055; GFX6-NEXT:    v_mul_lo_u32 v5, s11, v0
10056; GFX6-NEXT:    v_mul_hi_u32 v0, s11, v0
10057; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
10058; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, v3, v0, vcc
10059; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, v7, v4, vcc
10060; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
10061; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v6, v2, vcc
10062; GFX6-NEXT:    v_mul_lo_u32 v1, s12, v1
10063; GFX6-NEXT:    v_mul_hi_u32 v2, s12, v0
10064; GFX6-NEXT:    v_mul_lo_u32 v3, s13, v0
10065; GFX6-NEXT:    v_mul_lo_u32 v0, s12, v0
10066; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
10067; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
10068; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s11, v1
10069; GFX6-NEXT:    v_mov_b32_e32 v3, s13
10070; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s10, v0
10071; GFX6-NEXT:    v_subb_u32_e64 v2, s[0:1], v2, v3, vcc
10072; GFX6-NEXT:    v_subrev_i32_e64 v4, s[0:1], s12, v0
10073; GFX6-NEXT:    v_subbrev_u32_e64 v5, s[2:3], 0, v2, s[0:1]
10074; GFX6-NEXT:    v_cmp_le_u32_e64 s[2:3], s13, v5
10075; GFX6-NEXT:    v_subb_u32_e64 v2, s[0:1], v2, v3, s[0:1]
10076; GFX6-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[2:3]
10077; GFX6-NEXT:    v_cmp_le_u32_e64 s[2:3], s12, v4
10078; GFX6-NEXT:    v_subrev_i32_e64 v3, s[0:1], s12, v4
10079; GFX6-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[2:3]
10080; GFX6-NEXT:    v_cmp_eq_u32_e64 s[2:3], s13, v5
10081; GFX6-NEXT:    v_cndmask_b32_e64 v6, v6, v7, s[2:3]
10082; GFX6-NEXT:    v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1]
10083; GFX6-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v6
10084; GFX6-NEXT:    v_cndmask_b32_e64 v2, v5, v2, s[0:1]
10085; GFX6-NEXT:    v_mov_b32_e32 v5, s11
10086; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v5, v1, vcc
10087; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s13, v1
10088; GFX6-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
10089; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s12, v0
10090; GFX6-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
10091; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, s13, v1
10092; GFX6-NEXT:    v_cndmask_b32_e32 v5, v5, v6, vcc
10093; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
10094; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
10095; GFX6-NEXT:    v_cndmask_b32_e64 v2, v4, v3, s[0:1]
10096; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
10097; GFX6-NEXT:    v_xor_b32_e32 v0, s14, v0
10098; GFX6-NEXT:    v_xor_b32_e32 v1, s14, v1
10099; GFX6-NEXT:    v_mov_b32_e32 v2, s14
10100; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s14, v0
10101; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v1, v2, vcc
10102; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
10103; GFX6-NEXT:    s_endpgm
10104;
10105; GFX9-LABEL: srem_i64_pow2_shl_denom:
10106; GFX9:       ; %bb.0:
10107; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x34
10108; GFX9-NEXT:    s_mov_b64 s[2:3], 0x1000
10109; GFX9-NEXT:    v_mov_b32_e32 v2, 0
10110; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
10111; GFX9-NEXT:    s_lshl_b64 s[2:3], s[2:3], s4
10112; GFX9-NEXT:    s_ashr_i32 s4, s3, 31
10113; GFX9-NEXT:    s_add_u32 s2, s2, s4
10114; GFX9-NEXT:    s_mov_b32 s5, s4
10115; GFX9-NEXT:    s_addc_u32 s3, s3, s4
10116; GFX9-NEXT:    s_xor_b64 s[8:9], s[2:3], s[4:5]
10117; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s8
10118; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s9
10119; GFX9-NEXT:    s_sub_u32 s10, 0, s8
10120; GFX9-NEXT:    s_subb_u32 s4, 0, s9
10121; GFX9-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
10122; GFX9-NEXT:    v_rcp_f32_e32 v0, v0
10123; GFX9-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
10124; GFX9-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
10125; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
10126; GFX9-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
10127; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
10128; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
10129; GFX9-NEXT:    v_mul_hi_u32 v4, s10, v0
10130; GFX9-NEXT:    v_mul_lo_u32 v3, s10, v1
10131; GFX9-NEXT:    v_mul_lo_u32 v6, s4, v0
10132; GFX9-NEXT:    v_mul_lo_u32 v5, s10, v0
10133; GFX9-NEXT:    v_add_u32_e32 v3, v4, v3
10134; GFX9-NEXT:    v_add_u32_e32 v3, v3, v6
10135; GFX9-NEXT:    v_mul_hi_u32 v4, v0, v5
10136; GFX9-NEXT:    v_mul_lo_u32 v6, v0, v3
10137; GFX9-NEXT:    v_mul_hi_u32 v8, v0, v3
10138; GFX9-NEXT:    v_mul_hi_u32 v7, v1, v5
10139; GFX9-NEXT:    v_mul_lo_u32 v5, v1, v5
10140; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v6
10141; GFX9-NEXT:    v_mul_hi_u32 v9, v1, v3
10142; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, 0, v8, vcc
10143; GFX9-NEXT:    v_mul_lo_u32 v3, v1, v3
10144; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v5
10145; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v6, v7, vcc
10146; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v9, v2, vcc
10147; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v4, v3
10148; GFX9-NEXT:    v_mov_b32_e32 v6, 0
10149; GFX9-NEXT:    v_add_co_u32_e64 v0, s[2:3], v0, v3
10150; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v6, v5, vcc
10151; GFX9-NEXT:    v_addc_co_u32_e64 v3, vcc, v1, v4, s[2:3]
10152; GFX9-NEXT:    v_mul_lo_u32 v5, s10, v3
10153; GFX9-NEXT:    v_mul_hi_u32 v7, s10, v0
10154; GFX9-NEXT:    v_mul_lo_u32 v8, s4, v0
10155; GFX9-NEXT:    v_mul_lo_u32 v9, s10, v0
10156; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
10157; GFX9-NEXT:    v_add_u32_e32 v5, v7, v5
10158; GFX9-NEXT:    v_add_u32_e32 v5, v5, v8
10159; GFX9-NEXT:    v_mul_lo_u32 v10, v0, v5
10160; GFX9-NEXT:    v_mul_hi_u32 v11, v0, v9
10161; GFX9-NEXT:    v_mul_hi_u32 v12, v0, v5
10162; GFX9-NEXT:    v_mul_hi_u32 v8, v3, v9
10163; GFX9-NEXT:    v_mul_lo_u32 v9, v3, v9
10164; GFX9-NEXT:    v_add_co_u32_e32 v10, vcc, v11, v10
10165; GFX9-NEXT:    v_mul_hi_u32 v7, v3, v5
10166; GFX9-NEXT:    v_addc_co_u32_e32 v11, vcc, 0, v12, vcc
10167; GFX9-NEXT:    v_mul_lo_u32 v3, v3, v5
10168; GFX9-NEXT:    v_add_co_u32_e32 v9, vcc, v10, v9
10169; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, v11, v8, vcc
10170; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v7, v2, vcc
10171; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v8, v3
10172; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
10173; GFX9-NEXT:    s_ashr_i32 s10, s7, 31
10174; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v6, v5, vcc
10175; GFX9-NEXT:    v_add_u32_e32 v1, v1, v4
10176; GFX9-NEXT:    s_add_u32 s0, s6, s10
10177; GFX9-NEXT:    v_addc_co_u32_e64 v1, vcc, v1, v5, s[2:3]
10178; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v3
10179; GFX9-NEXT:    s_mov_b32 s11, s10
10180; GFX9-NEXT:    s_addc_u32 s1, s7, s10
10181; GFX9-NEXT:    s_xor_b64 s[6:7], s[0:1], s[10:11]
10182; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
10183; GFX9-NEXT:    v_mul_lo_u32 v3, s6, v1
10184; GFX9-NEXT:    v_mul_hi_u32 v4, s6, v0
10185; GFX9-NEXT:    v_mul_hi_u32 v5, s6, v1
10186; GFX9-NEXT:    v_mul_hi_u32 v7, s7, v1
10187; GFX9-NEXT:    v_mul_lo_u32 v1, s7, v1
10188; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v4, v3
10189; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v5, vcc
10190; GFX9-NEXT:    v_mul_lo_u32 v5, s7, v0
10191; GFX9-NEXT:    v_mul_hi_u32 v0, s7, v0
10192; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v5
10193; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, v4, v0, vcc
10194; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v7, v2, vcc
10195; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v1
10196; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v6, v3, vcc
10197; GFX9-NEXT:    v_mul_lo_u32 v1, s8, v1
10198; GFX9-NEXT:    v_mul_hi_u32 v3, s8, v0
10199; GFX9-NEXT:    v_mul_lo_u32 v4, s9, v0
10200; GFX9-NEXT:    v_mul_lo_u32 v0, s8, v0
10201; GFX9-NEXT:    v_add_u32_e32 v1, v3, v1
10202; GFX9-NEXT:    v_add_u32_e32 v1, v1, v4
10203; GFX9-NEXT:    v_sub_u32_e32 v3, s7, v1
10204; GFX9-NEXT:    v_mov_b32_e32 v4, s9
10205; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s6, v0
10206; GFX9-NEXT:    v_subb_co_u32_e64 v3, s[0:1], v3, v4, vcc
10207; GFX9-NEXT:    v_subrev_co_u32_e64 v5, s[0:1], s8, v0
10208; GFX9-NEXT:    v_subbrev_co_u32_e64 v6, s[2:3], 0, v3, s[0:1]
10209; GFX9-NEXT:    v_cmp_le_u32_e64 s[2:3], s9, v6
10210; GFX9-NEXT:    v_subb_co_u32_e64 v3, s[0:1], v3, v4, s[0:1]
10211; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[2:3]
10212; GFX9-NEXT:    v_cmp_le_u32_e64 s[2:3], s8, v5
10213; GFX9-NEXT:    v_subrev_co_u32_e64 v4, s[0:1], s8, v5
10214; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[2:3]
10215; GFX9-NEXT:    v_cmp_eq_u32_e64 s[2:3], s9, v6
10216; GFX9-NEXT:    v_cndmask_b32_e64 v7, v7, v8, s[2:3]
10217; GFX9-NEXT:    v_subbrev_co_u32_e64 v3, s[0:1], 0, v3, s[0:1]
10218; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v7
10219; GFX9-NEXT:    v_cndmask_b32_e64 v3, v6, v3, s[0:1]
10220; GFX9-NEXT:    v_mov_b32_e32 v6, s7
10221; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v6, v1, vcc
10222; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s9, v1
10223; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
10224; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s8, v0
10225; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
10226; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, s9, v1
10227; GFX9-NEXT:    v_cndmask_b32_e32 v6, v6, v7, vcc
10228; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
10229; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
10230; GFX9-NEXT:    v_cndmask_b32_e64 v3, v5, v4, s[0:1]
10231; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
10232; GFX9-NEXT:    v_xor_b32_e32 v0, s10, v0
10233; GFX9-NEXT:    v_xor_b32_e32 v1, s10, v1
10234; GFX9-NEXT:    v_mov_b32_e32 v3, s10
10235; GFX9-NEXT:    v_subrev_co_u32_e32 v0, vcc, s10, v0
10236; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
10237; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
10238; GFX9-NEXT:    s_endpgm
10239  %shl.y = shl i64 4096, %y
10240  %r = srem i64 %x, %shl.y
10241  store i64 %r, i64 addrspace(1)* %out
10242  ret void
10243}
10244
; Test: signed remainder of a <2 x i64> kernel argument by the splat constant
; <4096, 4096>, with the result stored to %out.
;
; Expected output recorded below:
;  * opt run (CHECK prefix): the vector srem is split into two scalar
;    "srem i64 ..., 4096" operations, one per extracted element, and the
;    results are reassembled with insertelement before the store.
;  * llc runs (GFX6 / GFX9 prefixes): no division expansion is emitted. For
;    each element the code derives a bias from the sign of the high dword
;    (s_ashr_i32 by 31, then s_lshr_b32 by 20), adds it to the 64-bit value
;    with carry, masks the low dword with the constant loaded by
;    "s_movk_i32 s8, 0xf000", and subtracts that from the original value
;    (s_sub_u32 / s_subb_u32).
;
; The assertion lines are autogenerated (see the header of this file);
; regenerate them with the update scripts rather than editing by hand.
10245define amdgpu_kernel void @srem_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) {
10246; CHECK-LABEL: @srem_v2i64_pow2k_denom(
10247; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0
10248; CHECK-NEXT:    [[TMP2:%.*]] = srem i64 [[TMP1]], 4096
10249; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i64 0
10250; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1
10251; CHECK-NEXT:    [[TMP5:%.*]] = srem i64 [[TMP4]], 4096
10252; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1
10253; CHECK-NEXT:    store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16
10254; CHECK-NEXT:    ret void
10255;
10256; GFX6-LABEL: srem_v2i64_pow2k_denom:
10257; GFX6:       ; %bb.0:
10258; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
10259; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xd
10260; GFX6-NEXT:    s_movk_i32 s8, 0xf000
10261; GFX6-NEXT:    s_mov_b32 s7, 0xf000
10262; GFX6-NEXT:    s_mov_b32 s6, -1
10263; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
10264; GFX6-NEXT:    s_ashr_i32 s9, s1, 31
10265; GFX6-NEXT:    s_lshr_b32 s9, s9, 20
10266; GFX6-NEXT:    s_add_u32 s9, s0, s9
10267; GFX6-NEXT:    s_addc_u32 s10, s1, 0
10268; GFX6-NEXT:    s_and_b32 s9, s9, s8
10269; GFX6-NEXT:    s_sub_u32 s0, s0, s9
10270; GFX6-NEXT:    s_subb_u32 s1, s1, s10
10271; GFX6-NEXT:    s_ashr_i32 s9, s3, 31
10272; GFX6-NEXT:    s_lshr_b32 s9, s9, 20
10273; GFX6-NEXT:    s_add_u32 s9, s2, s9
10274; GFX6-NEXT:    s_addc_u32 s10, s3, 0
10275; GFX6-NEXT:    s_and_b32 s8, s9, s8
10276; GFX6-NEXT:    s_sub_u32 s2, s2, s8
10277; GFX6-NEXT:    s_subb_u32 s3, s3, s10
10278; GFX6-NEXT:    v_mov_b32_e32 v0, s0
10279; GFX6-NEXT:    v_mov_b32_e32 v1, s1
10280; GFX6-NEXT:    v_mov_b32_e32 v2, s2
10281; GFX6-NEXT:    v_mov_b32_e32 v3, s3
10282; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
10283; GFX6-NEXT:    s_endpgm
10284;
10285; GFX9-LABEL: srem_v2i64_pow2k_denom:
10286; GFX9:       ; %bb.0:
10287; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
10288; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
10289; GFX9-NEXT:    s_movk_i32 s8, 0xf000
10290; GFX9-NEXT:    v_mov_b32_e32 v4, 0
10291; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
10292; GFX9-NEXT:    s_ashr_i32 s0, s5, 31
10293; GFX9-NEXT:    s_lshr_b32 s0, s0, 20
10294; GFX9-NEXT:    s_add_u32 s0, s4, s0
10295; GFX9-NEXT:    s_addc_u32 s1, s5, 0
10296; GFX9-NEXT:    s_and_b32 s0, s0, s8
10297; GFX9-NEXT:    s_sub_u32 s0, s4, s0
10298; GFX9-NEXT:    s_subb_u32 s1, s5, s1
10299; GFX9-NEXT:    s_ashr_i32 s4, s7, 31
10300; GFX9-NEXT:    s_lshr_b32 s4, s4, 20
10301; GFX9-NEXT:    s_add_u32 s4, s6, s4
10302; GFX9-NEXT:    s_addc_u32 s5, s7, 0
10303; GFX9-NEXT:    s_and_b32 s4, s4, s8
10304; GFX9-NEXT:    s_sub_u32 s4, s6, s4
10305; GFX9-NEXT:    s_subb_u32 s5, s7, s5
10306; GFX9-NEXT:    v_mov_b32_e32 v0, s0
10307; GFX9-NEXT:    v_mov_b32_e32 v1, s1
10308; GFX9-NEXT:    v_mov_b32_e32 v2, s4
10309; GFX9-NEXT:    v_mov_b32_e32 v3, s5
10310; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
10311; GFX9-NEXT:    s_endpgm
10312  %r = srem <2 x i64> %x, <i64 4096, i64 4096>
10313  store <2 x i64> %r, <2 x i64> addrspace(1)* %out
10314  ret void
10315}
10316
10317define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x, <2 x i64> %y) {
10318; CHECK-LABEL: @srem_v2i64_pow2_shl_denom(
10319; CHECK-NEXT:    [[SHL_Y:%.*]] = shl <2 x i64> <i64 4096, i64 4096>, [[Y:%.*]]
10320; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0
10321; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 0
10322; CHECK-NEXT:    [[TMP3:%.*]] = srem i64 [[TMP1]], [[TMP2]]
10323; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x i64> undef, i64 [[TMP3]], i64 0
10324; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[X]], i64 1
10325; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 1
10326; CHECK-NEXT:    [[TMP7:%.*]] = srem i64 [[TMP5]], [[TMP6]]
10327; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[TMP7]], i64 1
10328; CHECK-NEXT:    store <2 x i64> [[TMP8]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16
10329; CHECK-NEXT:    ret void
10330;
10331; GFX6-LABEL: srem_v2i64_pow2_shl_denom:
10332; GFX6:       ; %bb.0:
10333; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x11
10334; GFX6-NEXT:    s_mov_b64 s[2:3], 0x1000
10335; GFX6-NEXT:    s_mov_b32 s18, 0x4f800000
10336; GFX6-NEXT:    s_mov_b32 s19, 0x5f7ffffc
10337; GFX6-NEXT:    s_mov_b32 s20, 0x2f800000
10338; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
10339; GFX6-NEXT:    s_lshl_b64 s[14:15], s[2:3], s6
10340; GFX6-NEXT:    s_lshl_b64 s[2:3], s[2:3], s4
10341; GFX6-NEXT:    s_ashr_i32 s4, s3, 31
10342; GFX6-NEXT:    s_add_u32 s2, s2, s4
10343; GFX6-NEXT:    s_mov_b32 s5, s4
10344; GFX6-NEXT:    s_addc_u32 s3, s3, s4
10345; GFX6-NEXT:    s_xor_b64 s[16:17], s[2:3], s[4:5]
10346; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s16
10347; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s17
10348; GFX6-NEXT:    s_mov_b32 s21, 0xcf800000
10349; GFX6-NEXT:    s_sub_u32 s6, 0, s16
10350; GFX6-NEXT:    s_subb_u32 s7, 0, s17
10351; GFX6-NEXT:    v_mac_f32_e32 v0, s18, v1
10352; GFX6-NEXT:    v_rcp_f32_e32 v0, v0
10353; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
10354; GFX6-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0xd
10355; GFX6-NEXT:    v_mul_f32_e32 v0, s19, v0
10356; GFX6-NEXT:    v_mul_f32_e32 v1, s20, v0
10357; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
10358; GFX6-NEXT:    v_mac_f32_e32 v0, s21, v1
10359; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
10360; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
10361; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
10362; GFX6-NEXT:    s_ashr_i32 s12, s9, 31
10363; GFX6-NEXT:    s_add_u32 s0, s8, s12
10364; GFX6-NEXT:    v_mul_hi_u32 v3, s6, v0
10365; GFX6-NEXT:    v_mul_lo_u32 v2, s6, v1
10366; GFX6-NEXT:    v_mul_lo_u32 v4, s7, v0
10367; GFX6-NEXT:    v_mul_lo_u32 v5, s6, v0
10368; GFX6-NEXT:    s_mov_b32 s13, s12
10369; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
10370; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
10371; GFX6-NEXT:    v_mul_lo_u32 v3, v0, v2
10372; GFX6-NEXT:    v_mul_hi_u32 v4, v0, v5
10373; GFX6-NEXT:    v_mul_hi_u32 v6, v0, v2
10374; GFX6-NEXT:    v_mul_hi_u32 v7, v1, v2
10375; GFX6-NEXT:    v_mul_lo_u32 v2, v1, v2
10376; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
10377; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, 0, v6, vcc
10378; GFX6-NEXT:    v_mul_lo_u32 v6, v1, v5
10379; GFX6-NEXT:    v_mul_hi_u32 v5, v1, v5
10380; GFX6-NEXT:    s_addc_u32 s1, s9, s12
10381; GFX6-NEXT:    s_xor_b64 s[8:9], s[0:1], s[12:13]
10382; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
10383; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v4, v5, vcc
10384; GFX6-NEXT:    v_mov_b32_e32 v4, 0
10385; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v7, v4, vcc
10386; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
10387; GFX6-NEXT:    v_mov_b32_e32 v6, 0
10388; GFX6-NEXT:    v_add_i32_e64 v0, s[2:3], v0, v2
10389; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v6, v5, vcc
10390; GFX6-NEXT:    v_addc_u32_e64 v2, vcc, v1, v3, s[2:3]
10391; GFX6-NEXT:    v_mul_lo_u32 v5, s6, v2
10392; GFX6-NEXT:    v_mul_hi_u32 v7, s6, v0
10393; GFX6-NEXT:    v_mul_lo_u32 v8, s7, v0
10394; GFX6-NEXT:    s_mov_b32 s7, 0xf000
10395; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
10396; GFX6-NEXT:    v_mul_lo_u32 v7, s6, v0
10397; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
10398; GFX6-NEXT:    v_mul_lo_u32 v10, v0, v5
10399; GFX6-NEXT:    v_mul_hi_u32 v12, v0, v5
10400; GFX6-NEXT:    v_mul_hi_u32 v11, v0, v7
10401; GFX6-NEXT:    v_mul_hi_u32 v9, v2, v7
10402; GFX6-NEXT:    v_mul_lo_u32 v7, v2, v7
10403; GFX6-NEXT:    v_mul_hi_u32 v8, v2, v5
10404; GFX6-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
10405; GFX6-NEXT:    v_addc_u32_e32 v11, vcc, 0, v12, vcc
10406; GFX6-NEXT:    v_mul_lo_u32 v2, v2, v5
10407; GFX6-NEXT:    v_add_i32_e32 v7, vcc, v10, v7
10408; GFX6-NEXT:    v_addc_u32_e32 v7, vcc, v11, v9, vcc
10409; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v8, v4, vcc
10410; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v7, v2
10411; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v6, v5, vcc
10412; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
10413; GFX6-NEXT:    v_addc_u32_e64 v1, vcc, v1, v5, s[2:3]
10414; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
10415; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
10416; GFX6-NEXT:    v_mul_lo_u32 v2, s8, v1
10417; GFX6-NEXT:    v_mul_hi_u32 v3, s8, v0
10418; GFX6-NEXT:    v_mul_hi_u32 v5, s8, v1
10419; GFX6-NEXT:    v_mul_hi_u32 v7, s9, v1
10420; GFX6-NEXT:    v_mul_lo_u32 v1, s9, v1
10421; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
10422; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
10423; GFX6-NEXT:    v_mul_lo_u32 v5, s9, v0
10424; GFX6-NEXT:    v_mul_hi_u32 v0, s9, v0
10425; GFX6-NEXT:    s_mov_b32 s6, -1
10426; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
10427; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, v3, v0, vcc
10428; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, v7, v4, vcc
10429; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
10430; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v6, v2, vcc
10431; GFX6-NEXT:    v_mul_lo_u32 v1, s16, v1
10432; GFX6-NEXT:    v_mul_hi_u32 v2, s16, v0
10433; GFX6-NEXT:    v_mul_lo_u32 v3, s17, v0
10434; GFX6-NEXT:    v_mul_lo_u32 v0, s16, v0
10435; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
10436; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
10437; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s9, v1
10438; GFX6-NEXT:    v_mov_b32_e32 v3, s17
10439; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s8, v0
10440; GFX6-NEXT:    v_subb_u32_e64 v2, s[0:1], v2, v3, vcc
10441; GFX6-NEXT:    v_subrev_i32_e64 v5, s[0:1], s16, v0
10442; GFX6-NEXT:    v_subbrev_u32_e64 v7, s[2:3], 0, v2, s[0:1]
10443; GFX6-NEXT:    v_cmp_le_u32_e64 s[2:3], s17, v7
10444; GFX6-NEXT:    v_subb_u32_e64 v2, s[0:1], v2, v3, s[0:1]
10445; GFX6-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[2:3]
10446; GFX6-NEXT:    v_cmp_le_u32_e64 s[2:3], s16, v5
10447; GFX6-NEXT:    v_subrev_i32_e64 v3, s[0:1], s16, v5
10448; GFX6-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[2:3]
10449; GFX6-NEXT:    v_cmp_eq_u32_e64 s[2:3], s17, v7
10450; GFX6-NEXT:    v_cndmask_b32_e64 v8, v8, v9, s[2:3]
10451; GFX6-NEXT:    s_ashr_i32 s2, s15, 31
10452; GFX6-NEXT:    v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1]
10453; GFX6-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v8
10454; GFX6-NEXT:    s_add_u32 s8, s14, s2
10455; GFX6-NEXT:    v_cndmask_b32_e64 v2, v7, v2, s[0:1]
10456; GFX6-NEXT:    v_mov_b32_e32 v7, s9
10457; GFX6-NEXT:    s_mov_b32 s3, s2
10458; GFX6-NEXT:    s_addc_u32 s9, s15, s2
10459; GFX6-NEXT:    s_xor_b64 s[8:9], s[8:9], s[2:3]
10460; GFX6-NEXT:    v_cvt_f32_u32_e32 v8, s8
10461; GFX6-NEXT:    v_cvt_f32_u32_e32 v9, s9
10462; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v7, v1, vcc
10463; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s17, v1
10464; GFX6-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
10465; GFX6-NEXT:    v_mac_f32_e32 v8, s18, v9
10466; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s16, v0
10467; GFX6-NEXT:    v_rcp_f32_e32 v8, v8
10468; GFX6-NEXT:    v_cndmask_b32_e64 v10, 0, -1, vcc
10469; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, s17, v1
10470; GFX6-NEXT:    v_cndmask_b32_e32 v7, v7, v10, vcc
10471; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v7
10472; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
10473; GFX6-NEXT:    v_cndmask_b32_e64 v2, v5, v3, s[0:1]
10474; GFX6-NEXT:    v_mul_f32_e32 v3, s19, v8
10475; GFX6-NEXT:    v_mul_f32_e32 v5, s20, v3
10476; GFX6-NEXT:    v_trunc_f32_e32 v5, v5
10477; GFX6-NEXT:    v_mac_f32_e32 v3, s21, v5
10478; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v3
10479; GFX6-NEXT:    v_cvt_u32_f32_e32 v5, v5
10480; GFX6-NEXT:    s_sub_u32 s2, 0, s8
10481; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
10482; GFX6-NEXT:    v_mul_hi_u32 v2, s2, v3
10483; GFX6-NEXT:    v_mul_lo_u32 v7, s2, v5
10484; GFX6-NEXT:    s_subb_u32 s3, 0, s9
10485; GFX6-NEXT:    v_mul_lo_u32 v8, s3, v3
10486; GFX6-NEXT:    s_ashr_i32 s14, s11, 31
10487; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v7
10488; GFX6-NEXT:    v_mul_lo_u32 v7, s2, v3
10489; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v8
10490; GFX6-NEXT:    v_mul_lo_u32 v8, v3, v2
10491; GFX6-NEXT:    v_mul_hi_u32 v10, v3, v2
10492; GFX6-NEXT:    v_mul_hi_u32 v9, v3, v7
10493; GFX6-NEXT:    v_mul_hi_u32 v11, v5, v2
10494; GFX6-NEXT:    v_mul_lo_u32 v2, v5, v2
10495; GFX6-NEXT:    s_mov_b32 s15, s14
10496; GFX6-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
10497; GFX6-NEXT:    v_addc_u32_e32 v9, vcc, 0, v10, vcc
10498; GFX6-NEXT:    v_mul_lo_u32 v10, v5, v7
10499; GFX6-NEXT:    v_mul_hi_u32 v7, v5, v7
10500; GFX6-NEXT:    v_xor_b32_e32 v0, s12, v0
10501; GFX6-NEXT:    v_xor_b32_e32 v1, s12, v1
10502; GFX6-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
10503; GFX6-NEXT:    v_addc_u32_e32 v7, vcc, v9, v7, vcc
10504; GFX6-NEXT:    v_addc_u32_e32 v8, vcc, v11, v4, vcc
10505; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v7, v2
10506; GFX6-NEXT:    v_add_i32_e64 v2, s[0:1], v3, v2
10507; GFX6-NEXT:    v_addc_u32_e32 v7, vcc, v6, v8, vcc
10508; GFX6-NEXT:    v_addc_u32_e64 v3, vcc, v5, v7, s[0:1]
10509; GFX6-NEXT:    v_mul_lo_u32 v8, s2, v3
10510; GFX6-NEXT:    v_mul_hi_u32 v9, s2, v2
10511; GFX6-NEXT:    v_mul_lo_u32 v10, s3, v2
10512; GFX6-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
10513; GFX6-NEXT:    v_mul_lo_u32 v9, s2, v2
10514; GFX6-NEXT:    v_add_i32_e32 v8, vcc, v10, v8
10515; GFX6-NEXT:    v_mul_lo_u32 v12, v2, v8
10516; GFX6-NEXT:    v_mul_hi_u32 v14, v2, v8
10517; GFX6-NEXT:    v_mul_hi_u32 v13, v2, v9
10518; GFX6-NEXT:    v_mul_hi_u32 v11, v3, v9
10519; GFX6-NEXT:    v_mul_lo_u32 v9, v3, v9
10520; GFX6-NEXT:    v_mul_hi_u32 v10, v3, v8
10521; GFX6-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
10522; GFX6-NEXT:    v_addc_u32_e32 v13, vcc, 0, v14, vcc
10523; GFX6-NEXT:    v_mul_lo_u32 v3, v3, v8
10524; GFX6-NEXT:    v_add_i32_e32 v9, vcc, v12, v9
10525; GFX6-NEXT:    v_addc_u32_e32 v9, vcc, v13, v11, vcc
10526; GFX6-NEXT:    v_addc_u32_e32 v8, vcc, v10, v4, vcc
10527; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v9, v3
10528; GFX6-NEXT:    v_addc_u32_e32 v8, vcc, v6, v8, vcc
10529; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
10530; GFX6-NEXT:    v_addc_u32_e64 v5, vcc, v5, v8, s[0:1]
10531; GFX6-NEXT:    s_add_u32 s0, s10, s14
10532; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
10533; GFX6-NEXT:    s_addc_u32 s1, s11, s14
10534; GFX6-NEXT:    s_xor_b64 s[10:11], s[0:1], s[14:15]
10535; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
10536; GFX6-NEXT:    v_mul_lo_u32 v5, s10, v3
10537; GFX6-NEXT:    v_mul_hi_u32 v7, s10, v2
10538; GFX6-NEXT:    v_mul_hi_u32 v9, s10, v3
10539; GFX6-NEXT:    v_mul_hi_u32 v10, s11, v3
10540; GFX6-NEXT:    v_mul_lo_u32 v3, s11, v3
10541; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
10542; GFX6-NEXT:    v_addc_u32_e32 v7, vcc, 0, v9, vcc
10543; GFX6-NEXT:    v_mul_lo_u32 v9, s11, v2
10544; GFX6-NEXT:    v_mul_hi_u32 v2, s11, v2
10545; GFX6-NEXT:    v_mov_b32_e32 v8, s12
10546; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v5, v9
10547; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, v7, v2, vcc
10548; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, v10, v4, vcc
10549; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
10550; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v6, v4, vcc
10551; GFX6-NEXT:    v_mul_lo_u32 v3, s8, v3
10552; GFX6-NEXT:    v_mul_hi_u32 v4, s8, v2
10553; GFX6-NEXT:    v_mul_lo_u32 v5, s9, v2
10554; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s12, v0
10555; GFX6-NEXT:    v_mul_lo_u32 v2, s8, v2
10556; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v1, v8, vcc
10557; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
10558; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
10559; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, s11, v3
10560; GFX6-NEXT:    v_mov_b32_e32 v5, s9
10561; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s10, v2
10562; GFX6-NEXT:    v_subb_u32_e64 v4, s[0:1], v4, v5, vcc
10563; GFX6-NEXT:    v_subrev_i32_e64 v6, s[0:1], s8, v2
10564; GFX6-NEXT:    v_subbrev_u32_e64 v7, s[2:3], 0, v4, s[0:1]
10565; GFX6-NEXT:    v_cmp_le_u32_e64 s[2:3], s9, v7
10566; GFX6-NEXT:    v_subb_u32_e64 v4, s[0:1], v4, v5, s[0:1]
10567; GFX6-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[2:3]
10568; GFX6-NEXT:    v_cmp_le_u32_e64 s[2:3], s8, v6
10569; GFX6-NEXT:    v_subrev_i32_e64 v5, s[0:1], s8, v6
10570; GFX6-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[2:3]
10571; GFX6-NEXT:    v_cmp_eq_u32_e64 s[2:3], s9, v7
10572; GFX6-NEXT:    v_cndmask_b32_e64 v8, v8, v9, s[2:3]
10573; GFX6-NEXT:    v_subbrev_u32_e64 v4, s[0:1], 0, v4, s[0:1]
10574; GFX6-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v8
10575; GFX6-NEXT:    v_cndmask_b32_e64 v4, v7, v4, s[0:1]
10576; GFX6-NEXT:    v_mov_b32_e32 v7, s11
10577; GFX6-NEXT:    v_subb_u32_e32 v3, vcc, v7, v3, vcc
10578; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s9, v3
10579; GFX6-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
10580; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s8, v2
10581; GFX6-NEXT:    v_cndmask_b32_e64 v8, 0, -1, vcc
10582; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, s9, v3
10583; GFX6-NEXT:    v_cndmask_b32_e32 v7, v7, v8, vcc
10584; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v7
10585; GFX6-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
10586; GFX6-NEXT:    v_cndmask_b32_e64 v4, v6, v5, s[0:1]
10587; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
10588; GFX6-NEXT:    v_xor_b32_e32 v2, s14, v2
10589; GFX6-NEXT:    v_xor_b32_e32 v3, s14, v3
10590; GFX6-NEXT:    v_mov_b32_e32 v4, s14
10591; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s14, v2
10592; GFX6-NEXT:    v_subb_u32_e32 v3, vcc, v3, v4, vcc
10593; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
10594; GFX6-NEXT:    s_endpgm
10595;
10596; GFX9-LABEL: srem_v2i64_pow2_shl_denom:
10597; GFX9:       ; %bb.0:
10598; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x44
10599; GFX9-NEXT:    s_mov_b64 s[2:3], 0x1000
10600; GFX9-NEXT:    s_mov_b32 s16, 0x4f800000
10601; GFX9-NEXT:    s_mov_b32 s17, 0x5f7ffffc
10602; GFX9-NEXT:    s_mov_b32 s18, 0x2f800000
10603; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
10604; GFX9-NEXT:    s_lshl_b64 s[10:11], s[2:3], s6
10605; GFX9-NEXT:    s_lshl_b64 s[2:3], s[2:3], s4
10606; GFX9-NEXT:    s_ashr_i32 s4, s3, 31
10607; GFX9-NEXT:    s_add_u32 s2, s2, s4
10608; GFX9-NEXT:    s_mov_b32 s5, s4
10609; GFX9-NEXT:    s_addc_u32 s3, s3, s4
10610; GFX9-NEXT:    s_xor_b64 s[12:13], s[2:3], s[4:5]
10611; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s12
10612; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s13
10613; GFX9-NEXT:    s_mov_b32 s19, 0xcf800000
10614; GFX9-NEXT:    s_sub_u32 s8, 0, s12
10615; GFX9-NEXT:    s_subb_u32 s4, 0, s13
10616; GFX9-NEXT:    v_mac_f32_e32 v0, s16, v1
10617; GFX9-NEXT:    v_rcp_f32_e32 v0, v0
10618; GFX9-NEXT:    v_mov_b32_e32 v6, 0
10619; GFX9-NEXT:    v_mul_f32_e32 v0, s17, v0
10620; GFX9-NEXT:    v_mul_f32_e32 v1, s18, v0
10621; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
10622; GFX9-NEXT:    v_mac_f32_e32 v0, s19, v1
10623; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
10624; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
10625; GFX9-NEXT:    v_mul_hi_u32 v3, s8, v0
10626; GFX9-NEXT:    v_mul_lo_u32 v2, s8, v1
10627; GFX9-NEXT:    v_mul_lo_u32 v5, s4, v0
10628; GFX9-NEXT:    v_mul_lo_u32 v4, s8, v0
10629; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
10630; GFX9-NEXT:    v_add_u32_e32 v2, v2, v5
10631; GFX9-NEXT:    v_mul_hi_u32 v3, v0, v4
10632; GFX9-NEXT:    v_mul_lo_u32 v5, v0, v2
10633; GFX9-NEXT:    v_mul_hi_u32 v7, v0, v2
10634; GFX9-NEXT:    v_mul_hi_u32 v8, v1, v2
10635; GFX9-NEXT:    v_mul_lo_u32 v2, v1, v2
10636; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v5
10637; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v7, vcc
10638; GFX9-NEXT:    v_mul_lo_u32 v7, v1, v4
10639; GFX9-NEXT:    v_mul_hi_u32 v4, v1, v4
10640; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v7
10641; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v5, v4, vcc
10642; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v8, v6, vcc
10643; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
10644; GFX9-NEXT:    v_mov_b32_e32 v5, 0
10645; GFX9-NEXT:    v_add_co_u32_e64 v0, s[2:3], v0, v2
10646; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v5, v4, vcc
10647; GFX9-NEXT:    v_addc_co_u32_e64 v2, vcc, v1, v3, s[2:3]
10648; GFX9-NEXT:    v_mul_lo_u32 v4, s8, v2
10649; GFX9-NEXT:    v_mul_hi_u32 v7, s8, v0
10650; GFX9-NEXT:    v_mul_lo_u32 v8, s4, v0
10651; GFX9-NEXT:    v_mul_lo_u32 v9, s8, v0
10652; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
10653; GFX9-NEXT:    v_add_u32_e32 v4, v7, v4
10654; GFX9-NEXT:    v_add_u32_e32 v4, v4, v8
10655; GFX9-NEXT:    v_mul_lo_u32 v10, v0, v4
10656; GFX9-NEXT:    v_mul_hi_u32 v11, v0, v9
10657; GFX9-NEXT:    v_mul_hi_u32 v12, v0, v4
10658; GFX9-NEXT:    v_mul_hi_u32 v8, v2, v9
10659; GFX9-NEXT:    v_mul_lo_u32 v9, v2, v9
10660; GFX9-NEXT:    v_add_co_u32_e32 v10, vcc, v11, v10
10661; GFX9-NEXT:    v_mul_hi_u32 v7, v2, v4
10662; GFX9-NEXT:    v_addc_co_u32_e32 v11, vcc, 0, v12, vcc
10663; GFX9-NEXT:    v_mul_lo_u32 v2, v2, v4
10664; GFX9-NEXT:    v_add_co_u32_e32 v9, vcc, v10, v9
10665; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, v11, v8, vcc
10666; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v7, v6, vcc
10667; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v8, v2
10668; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
10669; GFX9-NEXT:    s_ashr_i32 s8, s5, 31
10670; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v5, v4, vcc
10671; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
10672; GFX9-NEXT:    v_addc_co_u32_e64 v1, vcc, v1, v4, s[2:3]
10673; GFX9-NEXT:    s_add_u32 s2, s4, s8
10674; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
10675; GFX9-NEXT:    s_addc_u32 s3, s5, s8
10676; GFX9-NEXT:    s_mov_b32 s9, s8
10677; GFX9-NEXT:    s_xor_b64 s[14:15], s[2:3], s[8:9]
10678; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
10679; GFX9-NEXT:    v_mul_lo_u32 v2, s14, v1
10680; GFX9-NEXT:    v_mul_hi_u32 v3, s14, v0
10681; GFX9-NEXT:    v_mul_hi_u32 v4, s14, v1
10682; GFX9-NEXT:    v_mul_hi_u32 v7, s15, v1
10683; GFX9-NEXT:    v_mul_lo_u32 v1, s15, v1
10684; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
10685; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
10686; GFX9-NEXT:    v_mul_lo_u32 v4, s15, v0
10687; GFX9-NEXT:    v_mul_hi_u32 v0, s15, v0
10688; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
10689; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
10690; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, v3, v0, vcc
10691; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, v7, v6, vcc
10692; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v1
10693; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v5, v2, vcc
10694; GFX9-NEXT:    v_mul_lo_u32 v1, s12, v1
10695; GFX9-NEXT:    v_mul_hi_u32 v2, s12, v0
10696; GFX9-NEXT:    v_mul_lo_u32 v3, s13, v0
10697; GFX9-NEXT:    v_mul_lo_u32 v0, s12, v0
10698; GFX9-NEXT:    v_add_u32_e32 v1, v2, v1
10699; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
10700; GFX9-NEXT:    v_sub_u32_e32 v2, s15, v1
10701; GFX9-NEXT:    v_mov_b32_e32 v3, s13
10702; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s14, v0
10703; GFX9-NEXT:    v_subb_co_u32_e64 v2, s[0:1], v2, v3, vcc
10704; GFX9-NEXT:    v_subrev_co_u32_e64 v4, s[0:1], s12, v0
10705; GFX9-NEXT:    v_subbrev_co_u32_e64 v7, s[2:3], 0, v2, s[0:1]
10706; GFX9-NEXT:    v_cmp_le_u32_e64 s[2:3], s13, v7
10707; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[2:3]
10708; GFX9-NEXT:    v_cmp_le_u32_e64 s[2:3], s12, v4
10709; GFX9-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[2:3]
10710; GFX9-NEXT:    v_cmp_eq_u32_e64 s[2:3], s13, v7
10711; GFX9-NEXT:    v_cndmask_b32_e64 v8, v8, v9, s[2:3]
10712; GFX9-NEXT:    s_ashr_i32 s2, s11, 31
10713; GFX9-NEXT:    v_subb_co_u32_e64 v2, s[0:1], v2, v3, s[0:1]
10714; GFX9-NEXT:    s_add_u32 s10, s10, s2
10715; GFX9-NEXT:    v_subrev_co_u32_e64 v3, s[0:1], s12, v4
10716; GFX9-NEXT:    s_mov_b32 s3, s2
10717; GFX9-NEXT:    s_addc_u32 s11, s11, s2
10718; GFX9-NEXT:    s_xor_b64 s[10:11], s[10:11], s[2:3]
10719; GFX9-NEXT:    v_subbrev_co_u32_e64 v2, s[0:1], 0, v2, s[0:1]
10720; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v8
10721; GFX9-NEXT:    v_cvt_f32_u32_e32 v8, s10
10722; GFX9-NEXT:    v_cvt_f32_u32_e32 v9, s11
10723; GFX9-NEXT:    v_cndmask_b32_e64 v2, v7, v2, s[0:1]
10724; GFX9-NEXT:    v_mov_b32_e32 v7, s15
10725; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v7, v1, vcc
10726; GFX9-NEXT:    v_mac_f32_e32 v8, s16, v9
10727; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s13, v1
10728; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
10729; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s12, v0
10730; GFX9-NEXT:    v_rcp_f32_e32 v8, v8
10731; GFX9-NEXT:    v_cndmask_b32_e64 v10, 0, -1, vcc
10732; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, s13, v1
10733; GFX9-NEXT:    v_cndmask_b32_e32 v7, v7, v10, vcc
10734; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v7
10735; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
10736; GFX9-NEXT:    v_cndmask_b32_e64 v2, v4, v3, s[0:1]
10737; GFX9-NEXT:    v_mul_f32_e32 v3, s17, v8
10738; GFX9-NEXT:    v_mul_f32_e32 v4, s18, v3
10739; GFX9-NEXT:    v_trunc_f32_e32 v4, v4
10740; GFX9-NEXT:    v_mac_f32_e32 v3, s19, v4
10741; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v3
10742; GFX9-NEXT:    v_cvt_u32_f32_e32 v4, v4
10743; GFX9-NEXT:    s_sub_u32 s2, 0, s10
10744; GFX9-NEXT:    s_subb_u32 s3, 0, s11
10745; GFX9-NEXT:    v_mul_hi_u32 v7, s2, v3
10746; GFX9-NEXT:    v_mul_lo_u32 v8, s2, v4
10747; GFX9-NEXT:    v_mul_lo_u32 v9, s3, v3
10748; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
10749; GFX9-NEXT:    v_mul_lo_u32 v2, s2, v3
10750; GFX9-NEXT:    v_add_u32_e32 v7, v7, v8
10751; GFX9-NEXT:    v_add_u32_e32 v7, v7, v9
10752; GFX9-NEXT:    v_mul_lo_u32 v8, v3, v7
10753; GFX9-NEXT:    v_mul_hi_u32 v9, v3, v2
10754; GFX9-NEXT:    v_mul_hi_u32 v10, v3, v7
10755; GFX9-NEXT:    v_mul_hi_u32 v11, v4, v7
10756; GFX9-NEXT:    v_mul_lo_u32 v7, v4, v7
10757; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, v9, v8
10758; GFX9-NEXT:    v_addc_co_u32_e32 v9, vcc, 0, v10, vcc
10759; GFX9-NEXT:    v_mul_lo_u32 v10, v4, v2
10760; GFX9-NEXT:    v_mul_hi_u32 v2, v4, v2
10761; GFX9-NEXT:    s_ashr_i32 s12, s7, 31
10762; GFX9-NEXT:    s_mov_b32 s13, s12
10763; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, v8, v10
10764; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, v9, v2, vcc
10765; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, v11, v6, vcc
10766; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v7
10767; GFX9-NEXT:    v_add_co_u32_e64 v2, s[0:1], v3, v2
10768; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, v5, v8, vcc
10769; GFX9-NEXT:    v_addc_co_u32_e64 v3, vcc, v4, v7, s[0:1]
10770; GFX9-NEXT:    v_mul_lo_u32 v8, s2, v3
10771; GFX9-NEXT:    v_mul_hi_u32 v9, s2, v2
10772; GFX9-NEXT:    v_mul_lo_u32 v10, s3, v2
10773; GFX9-NEXT:    v_mul_lo_u32 v11, s2, v2
10774; GFX9-NEXT:    v_add_u32_e32 v4, v4, v7
10775; GFX9-NEXT:    v_add_u32_e32 v8, v9, v8
10776; GFX9-NEXT:    v_add_u32_e32 v8, v8, v10
10777; GFX9-NEXT:    v_mul_lo_u32 v12, v2, v8
10778; GFX9-NEXT:    v_mul_hi_u32 v13, v2, v11
10779; GFX9-NEXT:    v_mul_hi_u32 v14, v2, v8
10780; GFX9-NEXT:    v_mul_hi_u32 v10, v3, v11
10781; GFX9-NEXT:    v_mul_lo_u32 v11, v3, v11
10782; GFX9-NEXT:    v_add_co_u32_e32 v12, vcc, v13, v12
10783; GFX9-NEXT:    v_mul_hi_u32 v9, v3, v8
10784; GFX9-NEXT:    v_addc_co_u32_e32 v13, vcc, 0, v14, vcc
10785; GFX9-NEXT:    v_mul_lo_u32 v3, v3, v8
10786; GFX9-NEXT:    v_add_co_u32_e32 v11, vcc, v12, v11
10787; GFX9-NEXT:    v_addc_co_u32_e32 v10, vcc, v13, v10, vcc
10788; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, v9, v6, vcc
10789; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v10, v3
10790; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, v5, v8, vcc
10791; GFX9-NEXT:    v_addc_co_u32_e64 v4, vcc, v4, v8, s[0:1]
10792; GFX9-NEXT:    s_add_u32 s0, s6, s12
10793; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v3
10794; GFX9-NEXT:    s_addc_u32 s1, s7, s12
10795; GFX9-NEXT:    s_xor_b64 s[6:7], s[0:1], s[12:13]
10796; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
10797; GFX9-NEXT:    v_mul_lo_u32 v4, s6, v3
10798; GFX9-NEXT:    v_mul_hi_u32 v7, s6, v2
10799; GFX9-NEXT:    v_mul_hi_u32 v9, s6, v3
10800; GFX9-NEXT:    v_mul_hi_u32 v10, s7, v3
10801; GFX9-NEXT:    v_mul_lo_u32 v3, s7, v3
10802; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v7, v4
10803; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v9, vcc
10804; GFX9-NEXT:    v_mul_lo_u32 v9, s7, v2
10805; GFX9-NEXT:    v_mul_hi_u32 v2, s7, v2
10806; GFX9-NEXT:    v_xor_b32_e32 v0, s8, v0
10807; GFX9-NEXT:    v_xor_b32_e32 v1, s8, v1
10808; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v9
10809; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, v7, v2, vcc
10810; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v10, v6, vcc
10811; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v3
10812; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v5, v4, vcc
10813; GFX9-NEXT:    v_mul_lo_u32 v3, s10, v3
10814; GFX9-NEXT:    v_mul_hi_u32 v4, s10, v2
10815; GFX9-NEXT:    v_mul_lo_u32 v5, s11, v2
10816; GFX9-NEXT:    v_mul_lo_u32 v2, s10, v2
10817; GFX9-NEXT:    v_mov_b32_e32 v8, s8
10818; GFX9-NEXT:    v_subrev_co_u32_e32 v0, vcc, s8, v0
10819; GFX9-NEXT:    v_add_u32_e32 v3, v4, v3
10820; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v8, vcc
10821; GFX9-NEXT:    v_add_u32_e32 v3, v3, v5
10822; GFX9-NEXT:    v_sub_u32_e32 v4, s7, v3
10823; GFX9-NEXT:    v_mov_b32_e32 v5, s11
10824; GFX9-NEXT:    v_sub_co_u32_e32 v2, vcc, s6, v2
10825; GFX9-NEXT:    v_subb_co_u32_e64 v4, s[0:1], v4, v5, vcc
10826; GFX9-NEXT:    v_subrev_co_u32_e64 v7, s[0:1], s10, v2
10827; GFX9-NEXT:    v_subbrev_co_u32_e64 v8, s[2:3], 0, v4, s[0:1]
10828; GFX9-NEXT:    v_cmp_le_u32_e64 s[2:3], s11, v8
10829; GFX9-NEXT:    v_subb_co_u32_e64 v4, s[0:1], v4, v5, s[0:1]
10830; GFX9-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[2:3]
10831; GFX9-NEXT:    v_cmp_le_u32_e64 s[2:3], s10, v7
10832; GFX9-NEXT:    v_subrev_co_u32_e64 v5, s[0:1], s10, v7
10833; GFX9-NEXT:    v_cndmask_b32_e64 v10, 0, -1, s[2:3]
10834; GFX9-NEXT:    v_cmp_eq_u32_e64 s[2:3], s11, v8
10835; GFX9-NEXT:    v_cndmask_b32_e64 v9, v9, v10, s[2:3]
10836; GFX9-NEXT:    v_subbrev_co_u32_e64 v4, s[0:1], 0, v4, s[0:1]
10837; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v9
10838; GFX9-NEXT:    v_cndmask_b32_e64 v4, v8, v4, s[0:1]
10839; GFX9-NEXT:    v_mov_b32_e32 v8, s7
10840; GFX9-NEXT:    v_subb_co_u32_e32 v3, vcc, v8, v3, vcc
10841; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s11, v3
10842; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, -1, vcc
10843; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s10, v2
10844; GFX9-NEXT:    v_cndmask_b32_e64 v9, 0, -1, vcc
10845; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, s11, v3
10846; GFX9-NEXT:    v_cndmask_b32_e32 v8, v8, v9, vcc
10847; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v8
10848; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
10849; GFX9-NEXT:    v_cndmask_b32_e64 v4, v7, v5, s[0:1]
10850; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
10851; GFX9-NEXT:    v_xor_b32_e32 v2, s12, v2
10852; GFX9-NEXT:    v_xor_b32_e32 v3, s12, v3
10853; GFX9-NEXT:    v_mov_b32_e32 v4, s12
10854; GFX9-NEXT:    v_subrev_co_u32_e32 v2, vcc, s12, v2
10855; GFX9-NEXT:    v_subb_co_u32_e32 v3, vcc, v3, v4, vcc
10856; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
10857; GFX9-NEXT:    global_store_dwordx4 v6, v[0:3], s[4:5]
10858; GFX9-NEXT:    s_endpgm
10859  %shl.y = shl <2 x i64> <i64 4096, i64 4096>, %y
10860  %r = srem <2 x i64> %x, %shl.y
10861  store <2 x i64> %r, <2 x i64> addrspace(1)* %out
10862  ret void
10863}
10864